import cPickle

import numpy as np
import pandas as pd
import scipy as sp
import theano.tensor as T
from scipy import optimize
from scipy.io import loadmat
from theano import function, config, shared

from autodiff.optimize import fmin_l_bfgs_b
%pylab inline --no-import-all
pylab.gray()
Populating the interactive namespace from numpy and matplotlib
<matplotlib.figure.Figure at 0x10fd0c1d0>
## Sparse Autoencoder Exercise
## hyper-parameters for the 8x8 natural-image-patch sparse autoencoder
n_visible = 8 * 8 ## number of input units
n_hidden = 25 ## number of hidden units
rho = 0.01 ## desired sparse parameter (average activation of hidden units)
lamb = 0.0001 ## weight decay parameter (L2 penalty on W1, W2)
beta = 3 ## weight of sparsity penalty term
n_patches = 10000 ## number of patches to sample for training
def display_imgs(imgs, layout, plotsize = 1.1):
    """
    Show a grid of images with axes hidden.

    imgs: iterable of 2-D arrays
    layout: (nrows, ncols) of the subplot grid
    plotsize: size in inches of each cell
    """
    nrows, ncols = layout
    fig, axes = pylab.subplots(nrows, ncols, figsize = (plotsize * ncols, plotsize * nrows))
    fig.subplots_adjust(hspace = 0, wspace = 0)
    ## np.ravel also handles the (1, 1) layout, where subplots returns a
    ## bare Axes object that has no .flatten()
    axes = np.ravel(axes)
    ## zip stops at the shorter sequence, so extra imgs (or axes) are
    ## silently ignored instead of raising IndexError
    for ax, img in zip(axes, imgs):
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.imshow(img, interpolation = 'nearest')
## load data
images = loadmat('data/starter/IMAGES.mat')['IMAGES'].transpose((2, 0, 1))
print images.shape
display_imgs(images, (2, 5), 2.5)
(10, 512, 512)
digits = cPickle.load(open('../ml-tutorials/data/mnist.pkl'))
train_data, valid_data, test_data = digits
digits = train_data[0].reshape((-1, 28, 28))
print digits.shape
(50000, 28, 28)
## sampling patches
def sample_patches(images, n_patches = 10000, patch_size = 8):
    """
    Randomly crop square patches from a stack of images.

    images: array of shape (n_images, n_rows, n_cols)
    n_patches: how many patches to draw
    patch_size: side length of each square patch
    returns: float array of shape (n_patches, patch_size, patch_size)
    """
    n_images, n_rows, n_cols = images.shape
    ## for every patch, pick a source image and a top-left corner
    which_img = np.random.randint(0, n_images, n_patches)
    top = np.random.randint(0, n_rows - patch_size, n_patches)
    left = np.random.randint(0, n_cols - patch_size, n_patches)
    patches = np.empty((n_patches, patch_size, patch_size))
    for k in range(n_patches):
        r, c = top[k], left[k]
        patches[k] = images[which_img[k], r:r + patch_size, c:c + patch_size]
    return patches
patches = sample_patches(images, n_patches)
print patches.shape
display_imgs(patches[:10], (2, 5))
## converting patches into a feature matrix n_patches x n_feats (= patch_size^2)
feats = patches.reshape((n_patches, -1))
print feats.shape
(10000, 8, 8) (10000, 64)
## normalization
## For the autoencoder to work well we need to normalize the data
## Specifically, since the output of the network is bounded between [0,1]
## (due to the sigmoid activation function), we have to make sure
## the range of pixel values is also bounded between [0,1]
## Squash data to [0.1, 0.9] since we use sigmoid as the activation function
## in output layer
## IMPLEMENTATION: truncate to +/- 3 std and scale to -1 to 1
def normalize(data):
    """
    Squash patch data into [0.1, 0.9] so it matches the range of the
    sigmoid output layer.

    The normalization happens by
    calculating (1) mean PER IMAGE (2) std of all images
    data: n_patches x n_feats
    returns: array of the same shape, values in [0.1, 0.9]
    """
    ## 1. remove patch mean - mean is specific to each image
    ## which depends on the exposure
    ## (np.* replaces the sp.* NumPy aliases, which were deprecated and
    ## removed from the scipy namespace)
    data = data - data.mean(axis = 1)[:, np.newaxis]
    ## 2. truncate to +/- 3 std and scale to -1 to 1
    ## based on the assumption all natural images should have
    ## similar stds
    data_std = 3. * np.std(data)
    data = np.clip(data, -data_std, data_std) / data_std
    ## 3. rescale from [-1, 1] to [0.1, 0.9]
    data = (data + 1) * 0.4 + 0.1
    return data
norm_feats = normalize(feats)
print norm_feats.shape
print norm_feats.min(), norm_feats.max()
display_imgs(norm_feats[:10].reshape(10, 8, 8), (2, 5))
(10000, 64) 0.1 0.9
## sparse autoencoder computation model by theano
def share_data(data, return_type = None):
    """
    Wrap data in a theano shared variable (GPU friendly).

    data: array-like, copied into a floatX-typed shared variable
    return_type: optional dtype name; when given, return a symbolic cast
        of the shared variable to that dtype instead of the variable itself
    """
    shared_data = shared(value = np.asarray(data, dtype = config.floatX), borrow = True)
    if return_type:
        ## BUG FIX: the original cast `share_data` -- the function object
        ## itself -- instead of the `shared_data` variable
        return T.cast(shared_data, dtype = return_type)
    else:
        return shared_data
class SparseAutoEncoder(object):
    """
    Theano sparse autoencoder with one sigmoid hidden layer.

    All parameters live in ONE flat shared vector ``theta`` so the model
    can be optimized directly with scipy.optimize.fmin_l_bfgs_b.  Layout:
        W1 : n_vis x n_hid   (encoder weights)
        W2 : n_hid x n_vis   (decoder weights)
        b1 : n_hid           (hidden biases)
        b2 : n_vis           (output biases)
    """
    def __init__(self, n_vis, n_hid, rho,
                 X = None,
                 theta = None):
        self.n_vis = n_vis
        self.n_hid = n_hid
        ## truth-testing a theano variable (``X or ...``) is unsafe;
        ## compare against None explicitly
        self.X = X if X is not None else T.matrix('X')
        if theta is not None:
            self.theta = theta
        else:
            ## W1: n_vis x n_hid
            ## W2: n_hid x n_vis
            ## b1: n_hid
            ## b2: n_vis
            ## uniform init for the weights (4x the tanh bound, the usual
            ## scaling for sigmoid units), zeros for the biases
            theta_value = np.empty(2*n_vis*n_hid+n_hid+n_vis,
                                   dtype = 'float64')
            theta_value[:2*n_vis*n_hid] = np.random.uniform(low = -4.*np.sqrt(6./(n_vis+n_hid)),
                                                            high = 4.*np.sqrt(6./(n_vis+n_hid)),
                                                            size = 2*n_vis*n_hid).astype(config.floatX)
            theta_value[2*n_vis*n_hid:] = np.zeros(n_hid+n_vis, dtype = 'float64')
            self.theta = shared(theta_value, name = 'theta')
        self.W1 = self.theta[:n_vis*n_hid].reshape((n_vis, n_hid))
        self.W2 = self.theta[n_vis*n_hid:2*n_vis*n_hid].reshape((n_hid, n_vis))
        self.b1 = self.theta[2*n_vis*n_hid:2*n_vis*n_hid+n_hid]
        self.b2 = self.theta[2*n_vis*n_hid+n_hid:]
        ## forward pass: X -> hidden activation Y -> reconstruction Z
        self.Y = T.nnet.sigmoid(T.dot(self.X, self.W1) + self.b1)
        self.Z = T.nnet.sigmoid(T.dot(self.Y, self.W2) + self.b2)
        ## mean activation of each hidden unit over the batch
        self.rho_hat = T.mean(self.Y, axis = 0)
        ## mean squared reconstruction error
        self.likelihood = .5 * T.mean(T.sum((self.X - self.Z)**2, axis = 1))
        ## BUG FIX: the 1/2 factor must cover BOTH weight matrices; the
        ## original computed .5*sum(W1**2) + sum(W2**2), inconsistent with
        ## the NativeSparseAutoEncoder and the UFLDL weight-decay term
        self.l2_term = .5 * (T.sum(self.W1**2) + T.sum(self.W2**2))
        ## KL(rho || rho_hat) summed over hidden units
        self.sparsity_term = T.sum(rho * T.log(rho/self.rho_hat)
                                   + (1-rho) * T.log((1-rho)/(1-self.rho_hat)))
    def bind_data(self, data_X):
        ## NOTE(review): the graph built in __init__ keeps referencing the
        ## X it was built with; rebinding self.X here does not rewire that
        ## graph -- verify callers construct with X instead of relying on this
        shared_X = share_data(data_X)
        self.X = shared_X
    def get_fg_fn(self, lamb, beta):
        """
        Return (objective, gradient) callables over the flat parameter
        vector, suitable for scipy.optimize.fmin_l_bfgs_b.

        l-bfgs-b works in float64, so each call casts the incoming
        parameters to float32 for the shared theta and the result back
        to float64.
        """
        def _objective(theta_value):
            self.theta.set_value(theta_value.astype('float32'))
            cost = self.likelihood + lamb * self.l2_term + beta * self.sparsity_term
            return cost.eval().astype('float64')
        def _gradient(theta_value):
            self.theta.set_value(theta_value.astype('float32'))
            cost = self.likelihood + lamb * self.l2_term + beta * self.sparsity_term
            gradient = T.grad(cost, wrt = self.theta)
            return gradient.eval().astype('float64')
        return _objective, _gradient
def initialize_theta(n_vis, n_hid):
    """
    Build a flat initial parameter vector [W1, W2, b1, b2].

    Weights ~ Uniform(+/- 4*sqrt(6/(n_vis+n_hid))) -- the sigmoid-scaled
    Glorot bound -- biases start at zero.
    returns: float64 array of length 2*n_vis*n_hid + n_hid + n_vis
    """
    n_weights = 2 * n_vis * n_hid
    bound = 4. * np.sqrt(6. / (n_vis + n_hid))
    theta_value = np.empty(n_weights + n_hid + n_vis, dtype = 'float64')
    ## the original round-tripped the draws through config.floatX before
    ## storing into a float64 buffer -- a pointless precision loss that
    ## also tied this pure-numpy helper to theano; draw in float64 directly
    theta_value[:n_weights] = np.random.uniform(low = -bound, high = bound,
                                                size = n_weights)
    theta_value[n_weights:] = 0.
    return theta_value
def theta2params(theta, n_vis, n_hid):
    """
    Split a flat parameter vector into (W1, W2, b1, b2) views.

    Layout: [W1 (n_vis x n_hid), W2 (n_hid x n_vis), b1 (n_hid), b2 (n_vis)]
    """
    n_wts = n_vis * n_hid
    W1 = theta[:n_wts].reshape((n_vis, n_hid))
    W2 = theta[n_wts:2 * n_wts].reshape((n_hid, n_vis))
    b1 = theta[2 * n_wts:2 * n_wts + n_hid]
    b2 = theta[2 * n_wts + n_hid:]
    return W1, W2, b1, b2
sae = SparseAutoEncoder(n_visible, n_hidden, rho, X = share_data(norm_feats))
f, g = sae.get_fg_fn(lamb, beta)
theta0 = initialize_theta(n_visible, n_hidden)#sae.theta.get_value().astype(config.floatX)
print theta0.dtype
float64
## fmin_l_bfgs_b does NOT work with float64
## MY SOLUTION IS TO hijack the parameter control in objective/gradient function
## : first cast parameter from float64 to float32, then cast it back when return
## the objective/gradient value
r = optimize.fmin_l_bfgs_b(f, theta0, fprime = g, m = 10, maxfun = 200, iprint=1, )
optimal_theta = r[0]
W1, W2, b1, b2 = theta2params(optimal_theta, n_visible, n_hidden)
print W1.shape
display_imgs(W1.T.reshape(-1, 8, 8), (5, 5))
(64, 25)
## try sparse autoencoder with digits
n_visible = 28 * 28
n_hidden = 196
rho = 0.1
lamb = 3e-3
beta = 3
#digit_patchs = sample_patches(digits, n_patches=100000)
#digit_feats = normalize(digit_patchs.reshape((100000, -1)))
#digit_feats = normalize(digits[:10000].reshape((-1, 28*28)))
digit_feats = digits[:10000].reshape((-1, 28*28))
print digit_feats.shape
asae = SparseAutoEncoder(n_visible, n_hidden, rho, X = share_data(digit_feats))
f, g = asae.get_fg_fn(lamb, beta)
theta0 = initialize_theta(n_visible, n_hidden)#sae.theta.get_value().astype(config.floatX)
print theta0.dtype
print f(theta0)
r = optimize.fmin_l_bfgs_b(f, theta0, g, m = 10, maxfun = 400, iprint=1, )
optimal_theta = r[0]
W1, W2, b1, b2 = theta2params(optimal_theta, n_visible, n_hidden)
print W1.shape
display_imgs(W1.T.reshape(-1, 28, 28), (14, 14))
(10000, 784) float64 418.31687318 (784, 196)
digit_feats = normalize(digits[:10000].reshape((-1, 28*28)))
digit_feats.min(), digit_feats.max()
(0.34917679, 0.90000004)
## Alternative sparse encoder implementation with pyautodiff package
## DOES NOT WORK WITH FLOAT32 AT ALL - BUT IT DOES RUN FASTER THAN MY OWN THEANO IMPLEMENTATION
def sigmoid(u):
    """
    Elementwise logistic function 1 / (1 + exp(-u)).

    The argument is clipped to +/-500 before exponentiation so that large
    negative inputs no longer trigger an overflow RuntimeWarning; within
    double precision the returned values are unchanged (they saturate at
    0 and 1 well before |u| = 500).
    """
    return 1. / (1. + np.exp(-np.clip(u, -500, 500)))
class NativeSparseAutoEncoder(object):
    """
    Pure-numpy sparse autoencoder objective (no theano graph), intended
    for pyautodiff's fmin_l_bfgs_b, which differentiates the objective
    function itself.

    Parameter vector layout (matches theta2params):
        W1: n_vis x n_hid, W2: n_hid x n_vis, b1: n_hid, b2: n_vis
    """
    def __init__(self, n_vis, n_hid, rho,
                 X = None,
                 theta = None):
        self.n_vis = n_vis
        self.n_hid = n_hid
        ## BUG FIX: rho was accepted but never stored, so the objective
        ## silently read the *global* rho instead of this argument
        self.rho = rho
        self.X = X
    @staticmethod
    def _sigmoid(u):
        ## elementwise logistic function; kept inside the class so the
        ## objective has no module-level dependencies
        return 1. / (1. + np.exp(-u))
    def bind_data(self, data_X):
        ## attach (or replace) the training matrix, n_samples x n_vis
        self.X = data_X
    def get_f_fn(self, lamb, beta):
        """
        Return objective(theta) -> scalar cost:
            reconstruction + lamb * L2(weights) + beta * KL-sparsity
        """
        def _objective(theta_value):
            nv, nh, rho = self.n_vis, self.n_hid, self.rho
            n_wts = nv * nh
            self.theta = theta_value
            self.W1 = self.theta[:n_wts].reshape((nv, nh))
            self.W2 = self.theta[n_wts:2 * n_wts].reshape((nh, nv))
            self.b1 = self.theta[2 * n_wts:2 * n_wts + nh]
            self.b2 = self.theta[2 * n_wts + nh:]
            ## forward pass: X -> hidden Y -> reconstruction Z
            self.Y = self._sigmoid(np.dot(self.X, self.W1) + self.b1)
            self.Z = self._sigmoid(np.dot(self.Y, self.W2) + self.b2)
            ## mean activation of each hidden unit over the batch
            self.rho_hat = np.mean(self.Y, axis = 0)
            self.likelihood = .5 * np.mean(np.sum((self.X - self.Z)**2, axis = 1))
            self.l2_term = .5 * (np.sum(self.W1**2) + np.sum(self.W2**2))
            ## KL(rho || rho_hat) summed over hidden units
            self.sparsity_term = np.sum(rho * np.log(rho / self.rho_hat)
                                        + (1 - rho) * np.log((1 - rho) / (1 - self.rho_hat)))
            return self.likelihood + lamb * self.l2_term + beta * self.sparsity_term
        return _objective
def initialize_theta(n_vis, n_hid):
    """
    Build a flat initial parameter vector [W1, W2, b1, b2].

    Weights ~ Uniform(+/- sqrt(6/(n_vis+n_hid))), biases start at zero.
    returns: float64 array of length 2*n_vis*n_hid + n_hid + n_vis
    """
    n_weights = 2 * n_vis * n_hid
    ## initialization with coefficient 1 instead of 4 is surprisingly good;
    ## usually 1 is for tanh, and 4 is for sigmoid
    bound = 1. * np.sqrt(6. / (n_vis + n_hid))
    theta_value = np.empty(n_weights + n_hid + n_vis, dtype = 'float64')
    ## draw directly in float64 -- the original's .astype(config.floatX)
    ## round-trip only lost precision and pulled in a theano dependency
    theta_value[:n_weights] = np.random.uniform(low = -bound, high = bound,
                                                size = n_weights)
    theta_value[n_weights:] = 0.
    return theta_value
n_visible = 8 * 8 ## number of input units
n_hidden = 25 ## number of hidden units
rho = 0.01 ## desired sparse parameter (average activation of hidden units)
lamb = 0.0001 # weight decay parameter
beta = 3 ## weight of sparsity penalty term
asae = NativeSparseAutoEncoder(n_visible, n_hidden, rho, X = norm_feats)
f = asae.get_f_fn(lamb, beta)
theta0 = initialize_theta(n_visible, n_hidden)#sae.theta.get_value().astype(config.floatX)
print theta0.dtype
print f(theta0)
optimal_theta = fmin_l_bfgs_b(f, theta0, m = 10, maxfun = 200, iprint=1, )
W1, W2, b1, b2 = theta2params(optimal_theta, n_visible, n_hidden)
print W1.shape
display_imgs(W1.T.reshape(-1, 8, 8), (5, 5))
float64 54.3803304364 (64, 25)
## try sparse autoencoder with digits
## THE FEATURES EXTRACTED CAN BE INTERPRETED AS PEN STROKES/SEGMENTS
## THEY CAN ONLY BE EXTRACTED WITHOUT NORMALIZING (THE DIGITS ARE ALREADY NORMALIZED TO [0, 1] BY ANOTHER PROCESS)
## THE REASON IS THAT THE NORMALIZATION INTRODUCED ABOVE ASSUMES A CONSTANT STD ACROSS ALL NATURAL IMAGES,
## WHICH IS NOT TRUE HERE IN DIGIT RECOGNITION
n_visible = 28 * 28
n_hidden = 196
rho = 0.1
lamb = 3e-3
beta = 3
#digit_patchs = sample_patches(digits, n_patches=100000)
#digit_feats = normalize(digit_patchs.reshape((100000, -1)))
#digit_feats = normalize(digits[:10000].reshape((-1, 28*28)))
digit_feats = digits[:10000].reshape((-1, 28*28))
print digit_feats.shape
asae = NativeSparseAutoEncoder(n_visible, n_hidden, rho, X = digit_feats)
f = asae.get_f_fn(lamb, beta)
theta0 = initialize_theta(n_visible, n_hidden)#sae.theta.get_value().astype(config.floatX)
print theta0.dtype
print f(theta0)
optimal_theta = fmin_l_bfgs_b(f, theta0, m = 10, maxfun = 400, iprint=1, )
W1, W2, b1, b2 = theta2params(optimal_theta, n_visible, n_hidden)
print W1.shape
display_imgs(W1.T.reshape(-1, 28, 28), (14, 14))
(10000, 784) float64 312.254098869 (784, 196)
$\Sigma$ is the covariance matrix iff x has zero mean
Similar to using PCA alone, PCA with whitening also results in processed data that has a diagonal covariance matrix. However, unlike PCA alone, whitening additionally ensures that the diagonal entries are equal to 1, i.e. that the covariance matrix is the identity matrix. That would be the case if you were doing whitening alone with no regularization. However, in this case you are whitening with regularization, to avoid numerical/etc. problems associated with small eigenvalues. As a result of this, some of the diagonal entries of the covariance of your xPCAwhite will be smaller than 1.
Compared to PCA whitening, ZCA-whitening has the property that the transformed features have the as close as possible values to the original input. And unlike PCA whitening, which is usually done with a dimension reduction, ZCA whitening is usually used while keeping all n dimensions of data
When $x$ takes values around $[-1,1]$, a value of $\epsilon \approx 10^{-5}$ might be typical. For the case of images, adding $\epsilon$ here also has the effect of slightly smoothing (or low-pass filtering) the input image. This also has a desirable effect of removing aliasing artifacts caused by the way pixels are laid out in an image, and can improve the features learned (details are beyond the scope of these notes). ZCA whitening is a form of pre-processing of the data that maps it from $x$ to $x_{\rm ZCAwhite}$. It turns out that this is also a rough model of how the biological eye (the retina) processes images. Specifically, as your eye perceives images, most adjacent "pixels" in your eye will perceive very similar values, since adjacent parts of an image tend to be highly correlated in intensity. It is thus wasteful for your eye to have to transmit every pixel separately (via your optic nerve) to your brain. Instead, your retina performs a decorrelation operation (this is done via retinal neurons that compute a function called "on center, off surround/off center, on surround") which is similar to that performed by ZCA. This results in a less redundant representation of the input image, which is then transmitted to your brain.
implementation of ZCA whitening in Python: https://gist.github.com/duschendestroyer/5170087
X.shape = nsample, nfeats
X0 = X - X.mean(axis = 1)[:, np.newaxis]
cov = np.dot(X0.T, X0) / X0.shape[0]
U, S, V = np.linalg.svd(cov)
X_rot = np.dot(X0, U)
rotate back:
(1) full dimension
X_back = np.dot(X_rot, U.T)
(2) reduction to k pcs
X_pca_k = np.dot(X_rot[:, :k+1], U[:, :k+1].T)
#X_pca_k = np.dot(X_rot[:, :k+1], U[:k+1, :k+1].T)
PCA whitening
epsilon = 0.1
U_white = np.dot(U, np.diag(1. / np.sqrt(S + epsilon)))
X_PCA_white = np.dot(X0, U_white)
epsilon = 0.1
U_white = np.dot(U, np.diag(1. / np.sqrt(S + epsilon)))
X_PCA_white = np.dot(X0, U_white)
## NOT U_white.T, so ZCA-whitened data's cov should be identity without
## regularization
X_ZCA_white = np.dot(X_PCA_white, U.T)
## Practise on 2D data
## load raw data
raw_data = sp.asarray([map(float, line.strip().split())
for line in open('data/pcaData.txt', 'r').readlines()]).T
print raw_data.shape
print raw_data.mean(axis = 0), raw_data.std(axis = 0)
pylab.scatter(raw_data[:, 0], raw_data[:, 1], s=50,
edgecolors='b', facecolors = 'none')
pylab.ylim([-0.8, 0.8])
## calculate pca -- without preprocessing
raw_data = raw_data - sp.mean(raw_data, axis = 0)
cov = sp.dot(raw_data.T, raw_data) / raw_data.shape[0]
U, D, V = sp.linalg.svd(cov, )
pc1, pc2 = U.T
print pc1, pc2
pylab.plot((0, pc1[0]), (0, pc1[1]), 'r', hold=True)
pylab.plot((0, pc2[0]), (0, pc2[1]), 'r', hold=True)
pylab.title('raw data with PCs')
print sp.dot(U, U.T)
## calculate x_rot
f = figure()
x_rot = sp.dot(raw_data, U)
pylab.scatter(x_rot[:, 0], x_rot[:, 1], s = 50,
edgecolors = 'b', facecolors = 'none')
pylab.xlim([-1, 1])
title('x_rot')
## dimension reduction and replot
figure()
x_hat = sp.dot(x_rot[:, :1], U[:, :1].T)
pylab.scatter(x_hat[:, 0], x_hat[:, 1], s = 50,
edgecolors = 'b', facecolors = 'none')
#pylab.xlim([-0.8, 0.65])
pylab.ylim([-0.8, 0.8])
pylab.title('xhat')
## pca whitening from PCA rotation
figure()
x_pca_white = x_rot * (1./ sp.sqrt(D + 1e-5))[sp.newaxis, :]
pylab.scatter(x_pca_white[:, 0], x_pca_white[:, 1], s = 50,
edgecolors = 'b', facecolors = 'none')
pylab.title('pca white')
## zca whitening from PCA white
figure()
x_zca_white = sp.dot(x_pca_white, U.T)
pylab.scatter(x_zca_white[:, 0], x_zca_white[:, 1], s = 50,
edgecolors = 'b', facecolors = 'none')
pylab.title('zca white')
(45, 2) [ 0.01851256 0.03179579] [ 0.29664794 0.29664794] [-0.70710678 -0.70710678] [-0.70710678 0.70710678] [[ 1.00000000e+00 -2.22044605e-16] [ -2.22044605e-16 1.00000000e+00]]
<matplotlib.text.Text at 0x11a376490>
## PCA, ZCA and Whitening on Images
## load image
raw_images = sp.io.loadmat('data/pca_exercise/IMAGES_RAW.mat')['IMAGESr']
raw_images = raw_images.transpose((2, 0, 1))
print raw_images.shape
(10, 512, 512)
patches = sample_patches(raw_images, n_patches=10000, patch_size=12)
print patches.shape
display_imgs(patches[:36], (6, 6), plotsize=0.7)
(10000, 12, 12)
## zero mean the data - PER IMAGE
X = patches.reshape((10000, -1))
X_mean = sp.mean(X, axis = 1)
X0 = X - X_mean[:, sp.newaxis]
display_imgs(X0[:36].reshape((36, 12, 12)), (6, 6), plotsize=0.7)
## find PCA matrix
## the covariance of X_rot should be a diagonal
cov = sp.dot(X0.T, X0) / X0.shape[0]
U, S, V = sp.linalg.svd(cov)
X_rot = sp.dot(X0, U)
pylab.imshow(sp.dot(X_rot.T, X_rot) / X_rot.shape[0], cmap=cm.Blues)
<matplotlib.image.AxesImage at 0x11a8a7e90>
## find number of compoents to retain
k99 = sp.where(sp.cumsum(S) / sp.sum(S) > 0.99)[0][0] + 1
k90 = sp.where(sp.cumsum(S) / sp.sum(S) > 0.90)[0][0] + 1
print k90, k99
42 116
## PCA Dimension reduction
X_pca90 = sp.dot(X_rot[:, :k90], U[:, :k90].T)
X_pca99 = sp.dot(X_rot[:, :k99], U[:, :k99].T)
print X_pca90.shape
f = figure()
display_imgs(X0[:36].reshape((36, 12, 12)), (6, 6), plotsize=0.7)
f = figure()
display_imgs(X_pca90[:36].reshape((36, 12, 12)), (6, 6), plotsize=0.7)
f = figure()
display_imgs(X_pca99[:36].reshape((36, 12, 12)), (6, 6), plotsize=0.7)
(10000, 144)
<matplotlib.figure.Figure at 0x11b3dd9d0>
<matplotlib.figure.Figure at 0x11cf6c7d0>
<matplotlib.figure.Figure at 0x10d953690>
## PCA whitening
epsilon = 0.001
U_white = np.dot(U, np.diag(1. / np.sqrt(S + epsilon)))
X_PCA_white = np.dot(X0, U_white)
pylab.imshow(sp.dot(X_PCA_white.T, X_PCA_white)/X_PCA_white.shape[0], cmap=cm.ocean)
figure()
display_imgs(X_PCA_white[:36].reshape((36, 12, 12)), (6, 6), plotsize=0.7)
<matplotlib.figure.Figure at 0x11a773dd0>
## ZCA whitening
epsilon = 0.01
U_white = np.dot(U, np.diag(1. / np.sqrt(S + epsilon)))
X_PCA_white = np.dot(X0, U_white)
X_ZCA_white = np.dot(X_PCA_white, U.T) ## NOT U_white.T
pylab.imshow(sp.dot(X_ZCA_white.T, X_ZCA_white)/X_ZCA_white.shape[0], cmap=cm.ocean)
figure()
display_imgs(X_ZCA_white[:36].reshape((36, 12, 12)), (6, 6), plotsize=0.7)
<matplotlib.figure.Figure at 0x118e91690>