## Digesting the code, step by step
import numpy as np
from IPython.core.display import display, HTML
def logistic(x):
    return 1. / (1 + np.exp(-x))
data = np.asarray([
[1, 1, 1, 0],
[1, 0, 1, 0],
[1, 1, 1, 0],
[0, 0, 1, 1],
[0, 0, 1, 1],
[0, 0, 1, 1]])
nsample, nvis = data.shape
nhid = 2
print "\n>> nsample = %i, nvis = %i, nhid = %i" % (nsample, nvis, nhid)
print "\n>> Original data design matrix (nsample x nvis)"
display(data)
print "\n>> augument data input vectors (first all-one column)"
data = np.insert(data, 0, 1, axis = 1)
display(data)
weights = np.random.randn(nvis, nhid)
print "\n>> initial weights, shape = collections of column weight vectors"
print "weight vectors shape-match input vectors"
print "number of weight vectors = number of hidden nodes"
display(weights)
weights = np.insert(weights, 0, 0, axis = 0)
weights = np.insert(weights, 0, 0, axis = 1)
print "\n>> padding the weight matrix with 0s on 1st row and column"
print "adding 1st row because now the weight vector should MATCH AUGUMENTED input vector"
print "adding 1st col because we need another vector ESSENTIALLY another hidden node"
print "IN SUMMARY: we augument both weight and input vector, and always add another hidden node"
display(weights)
print "\n" + "=" * 30 + "FORWARD PHASE" + "=" * 30
print "where data flows from visible layer to hidden layer"
print "\n>> Given the shapes of weight and design matrices"
print "the activation of hidden nodes are actually done as dot of design x weights"
print "and the shape of the activations will be nsample x (nhid + 1) - one bias hidden node"
print "and the activation of the bias hidden unit is always off initially "
activations = np.dot(data, weights)
display(activations)
print "\n>> The probability of hidden nodes will be the elementwise logistic transformation of activations"
print "NOTE it is NOT a softmax setting so probs of all hidden nodes for certain input dont sum to 1"
print "In other words, different nodes are not EXCLUSIVE of each other"
probs = logistic(activations)
display(probs)
print "\n>> The hidden states (still nsample x nhidden+1) will be binary"
print "and it is calculated based on hidden probs compared to a uniform random threshold (0 ~ 1)"
print "by this, we add the flavor of uncertainty,"
print "and higher prob values should be more likely to be on than lower values"
states = probs > np.random.rand(nsample, nhid + 1, )
display(states)
print "\n>> Association (ninput+1 x nhidden+1) - OUTER PRODUCT of INPUTS and ACTIVATIONS"
print "Here is calcluated based on input and probs (instead of states)"
print "So that the association is a smooth one instead of binary, since it is an outer product"
print "it serves as a similiarity matrix (similiar to covariance matrix)"
print """Intuitively it can be understood the LINK between an INPUT node and an HIDDEN node
(on or off at the same time) by aggregatin all samples"""
associations = np.dot(data.T, probs)
display(associations)
print "\n" + "=" * 30 + "BACKWARD PHASE" + "=" * 30
print "where data flows from hidden layer back to visible layer"
print "\n>> Visible activation (nsample x nvis+1)"
print "assume the Hidden States (not probs) as new design matrix (BINARY AGAIN)"
print "and transpose of forward weights as new weights"
visible_activations = np.dot(states, weights.T)
display(visible_activations)
print "\n>> Calculate the firing chance of visible nodes (nsample x nvis+1) in the same way (logistic transf.)"
visible_probs = logistic(visible_activations)
display(visible_probs)
print "But this time fix the bias unit by setting all its (1st col) firing chance as 1"
print "So that the REFLECTED visible node values are directly comparable with AUGUMENTED inputs"
visible_probs[:, 0] = 1
display(visible_probs)
print "\n>> But it wont stop here when learning the weights"
print "We need use the FAKED(REFLECTED) visible inputs and put it forward again"
print "just to get the association between the FAKED inputs and hidden nodes again"
print "And remember the faked inputs this time is the fixed visible_probs"
faked_activations = np.dot(visible_probs, weights)
display(faked_activations)
print "\n>> And then the faked probs of activations"
faked_probs = logistic(faked_activations)
display(faked_probs)
print "\n>> And then the association between faked inputs and faked hidden probs"
faked_associations = np.dot(visible_probs.T, faked_probs)
display(faked_associations)
print "\n" + "=" * 30 + "LEARNING PHASE" + "=" * 30
print "where the weights are updated based on forward association between input and hidden probs"
print "and backward association between REFLECTED input and FAKED hidden probs"
learning_rate = 0.1
weights += learning_rate * ((associations - faked_associations) / nsample)
print ">> The new weights - NOTE the bias unit got updated as well"
display(weights)
>> nsample = 6, nvis = 4, nhid = 2

>> Original data design matrix (nsample x nvis):
array([[1, 1, 1, 0], [1, 0, 1, 0], [1, 1, 1, 0], [0, 0, 1, 1], [0, 0, 1, 1], [0, 0, 1, 1]])

>> Augmented data, with an all-ones bias column prepended:
array([[1, 1, 1, 1, 0], [1, 1, 0, 1, 0], [1, 1, 1, 1, 0], [1, 0, 0, 1, 1], [1, 0, 0, 1, 1], [1, 0, 0, 1, 1]])

>> Initial weights (nvis x nhid), one column per hidden node:
array([[ 1.06162553, 0.89301881], [ 0.03644388, 1.16875729], [ 1.16552839, -1.00166981], [ 0.97883434, -0.47188823]])

>> Weights padded with 0s on the 1st row and 1st column (bias units):
array([[ 0. , 0. , 0. ], [ 0. , 1.06162553, 0.89301881], [ 0. , 0.03644388, 1.16875729], [ 0. , 1.16552839, -1.00166981], [ 0. , 0.97883434, -0.47188823]])

==============================FORWARD PHASE==============================

>> Hidden activations = data . weights, shape nsample x (nhid + 1):
array([[ 0. , 2.2635978 , 1.06010629], [ 0. , 2.22715393, -0.10865099], [ 0. , 2.2635978 , 1.06010629], [ 0. , 2.14436273, -1.47355803], [ 0. , 2.14436273, -1.47355803], [ 0. , 2.14436273, -1.47355803]])

>> Hidden probs = logistic(activations):
array([[ 0.5 , 0.90581702, 0.74271086], [ 0.5 , 0.90266158, 0.47286394], [ 0.5 , 0.90581702, 0.74271086], [ 0.5 , 0.89514082, 0.18640241], [ 0.5 , 0.89514082, 0.18640241], [ 0.5 , 0.89514082, 0.18640241]])

>> Hidden states = probs compared against uniform random thresholds:
array([[ True, True, True], [False, True, False], [ True, True, True], [ True, False, False], [ True, True, False], [False, True, True]], dtype=bool)

>> Associations = data.T . probs, shape (nvis + 1) x (nhid + 1):
array([[ 3. , 5.39971807, 2.5174929 ], [ 1.5 , 2.71429561, 1.95828566], [ 1. , 1.81163403, 1.48542172], [ 3. , 5.39971807, 2.5174929 ], [ 1.5 , 2.68542246, 0.55920724]])

==============================BACKWARD PHASE==============================

>> Visible activations = states . weights.T, shape nsample x (nvis + 1):
array([[ 0. , 1.95464435, 1.20520116, 0.16385859, 0.50694611], [ 0. , 1.06162553, 0.03644388, 1.16552839, 0.97883434], [ 0. , 1.95464435, 1.20520116, 0.16385859, 0.50694611], [ 0. , 0. , 0. , 0. , 0. ], [ 0. , 1.06162553, 0.03644388, 1.16552839, 0.97883434], [ 0. , 1.95464435, 1.20520116, 0.16385859, 0.50694611]])

>> Visible probs = logistic(visible activations):
array([[ 0.5 , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ], [ 0.5 , 0.74300106, 0.50910996, 0.7623358 , 0.72687686], [ 0.5 , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ], [ 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ], [ 0.5 , 0.74300106, 0.50910996, 0.7623358 , 0.72687686], [ 0.5 , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ]])

>> Visible probs with the bias unit (1st col) fixed to 1:
array([[ 1. , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ], [ 1. , 0.74300106, 0.50910996, 0.7623358 , 0.72687686], [ 1. , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ], [ 1. , 0.5 , 0.5 , 0.5 , 0.5 ], [ 1. , 0.74300106, 0.50910996, 0.7623358 , 0.72687686], [ 1. , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ]])

>> Faked hidden activations = visible_probs . weights:
array([[ 0. , 2.19925902, 0.84526335], [ 0. , 2.4073589 , 0.15192652], [ 0. , 2.19925902, 0.84526335], [ 0. , 1.62121607, 0.29410903], [ 0. , 2.4073589 , 0.15192652], [ 0. , 2.19925902, 0.84526335]])

>> Faked hidden probs:
array([[ 0.5 , 0.90018295, 0.69957258], [ 0.5 , 0.91738674, 0.53790874], [ 0.5 , 0.90018295, 0.69957258], [ 0.5 , 0.83496277, 0.57300179], [ 0.5 , 0.91738674, 0.53790874], [ 0.5 , 0.90018295, 0.69957258]])

>> Faked associations = visible_probs.T . faked_probs:
array([[ 3. , 5.3702851 , 3.74753702], [ 2.30692933, 4.14627168, 2.92421081], [ 1.91328308, 3.42951677, 2.44906603], [ 1.82364565, 3.27684949, 2.24177533], [ 1.91301231, 3.43652212, 2.37827712]])

==============================LEARNING PHASE==============================

>> New weights after one update (note the bias units were updated as well):
array([[ 0.00000000e+00, 4.90549500e-04, -2.05007353e-02], [ -1.34488222e-02, 1.03775927e+00, 8.76920061e-01], [ -1.52213847e-02, 9.47916361e-03, 1.15269655e+00], [ 1.96059058e-02, 1.20090953e+00, -9.97074513e-01], [ -6.88353854e-03, 9.66316013e-01, -5.02206058e-01]])
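Written as a formula, the single contrastive-divergence (CD-1) update the walkthrough performs is (my restatement of `weights += learning_rate * ((associations - faked_associations) / nsample)`, not notation from the original code):

$$
W \;\leftarrow\; W + \frac{\epsilon}{n}\left( X^{\top}\sigma(XW) \;-\; \hat{V}^{\top}\sigma(\hat{V}W) \right)
$$

where $X$ is the augmented design matrix, $\sigma$ the elementwise logistic function, $\hat{V} = \sigma(H W^{\top})$ the reconstructed visible matrix computed from the sampled binary hidden states $H$ (with its bias column clamped to 1), $\epsilon$ the learning rate, and $n$ the number of samples.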
## RBM Class
import numpy as np
class RBM(object):
def __init__(self, num_visible, num_hidden, learning_rate = 0.1):
self.num_visible = num_visible
self.num_hidden = num_hidden
self.learning_rate = learning_rate
## initialize the weight matrix, of nvis x nhid, using
## a Gaussian distribution with mean = 0, std = 0.1
self.weights = 0.1 * np.random.randn(self.num_visible, self.num_hidden)
## add bias units into the weights (first row and first column)
self.weights = np.insert(self.weights, 0, 0.0, axis = 0)
self.weights = np.insert(self.weights, 0, 0.0, axis = 1, )
    def train(self, data, max_epochs = 1000):
        """data : design matrix"""
        num_examples = data.shape[0]
        ## augment the design matrix with a leading all-ones bias column
        data = np.insert(data, 0, 1.0, axis = 1)
        for epoch in range(max_epochs):
            ## Clamp to the data and sample from the hidden units
            pos_hidden_activations = np.dot(data, self.weights)
            pos_hidden_probs = self._logistic(pos_hidden_activations)
            pos_hidden_states = pos_hidden_probs > np.random.rand(num_examples, self.num_hidden + 1)
            ## Note: use the activation probabilities of the hidden units, not the hidden states
            ## themselves, to compute the associations. We could also use the states; see section 3
            ## of "A Practical Guide to Training Restricted Boltzmann Machines" for details
            pos_associations = np.dot(data.T, pos_hidden_probs)
            ## Reconstruct the visible units and sample again from the hidden units
            neg_visible_activations = np.dot(pos_hidden_states, self.weights.T)
            neg_visible_probs = self._logistic(neg_visible_activations)
            neg_visible_probs[:, 0] = 1  ## fix the bias unit
            neg_hidden_activations = np.dot(neg_visible_probs, self.weights)
            neg_hidden_probs = self._logistic(neg_hidden_activations)
            neg_associations = np.dot(neg_visible_probs.T, neg_hidden_probs)
            ## update weights based on positive and negative associations
            self.weights += self.learning_rate * (pos_associations - neg_associations) / num_examples
            ## reconstruction error - not the training objective, just a rough progress measure
            error = np.sum((data - neg_visible_probs) ** 2)
            #print 'Epoch %s: error is %s' % (epoch, error)
    def run_visible(self, data):
        ## not implemented yet: would clamp data to the visible units and return sampled hidden states
        pass
    def run_hidden(self, data):
        ## not implemented yet: would clamp data to the hidden units and return sampled visible states
        pass
    def daydream(self, data):
        ## not implemented yet: would run alternating Gibbs sampling to generate fantasy samples
        pass
def _logistic(self, x):
return 1.0 / (1 + np.exp(-x))
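The three stub methods above are left as `pass`, which is why `print r.run_visible(user)` in the next section prints `None`. A minimal sketch (mine, not part of the original class) of how `run_visible` could be filled in by reusing the forward pass from `train`:

## sketch only - reuses the np import above and the class's own _logistic and weights
def run_visible(self, data):
    """Clamp a (num_examples x num_visible) binary matrix to the visible units
    and return sampled binary hidden states (bias unit dropped)."""
    num_examples = data.shape[0]
    data = np.insert(data, 0, 1, axis = 1)            ## augment with the all-ones bias column
    hidden_probs = self._logistic(np.dot(data, self.weights))
    hidden_states = hidden_probs > np.random.rand(num_examples, self.num_hidden + 1)
    return hidden_states[:, 1:]                       ## drop the always-on bias unit

## to try it out, attach it to the class: RBM.run_visible = run_visible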
## Use it
import pandas as pd
def print_weights(wt_values):
wts = pd.DataFrame(wt_values,
index = ["Bias Unit",
"Harry Potter",
"Avatar",
"LOTR 3",
"Gladiator",
"Titanic",
"Glitter"],
columns = ["Bias Unit",
"Hidden 1",
"Hidden 2",
"Hidden 3"])
display(wts)
if __name__ == '__main__':
r = RBM(num_visible = 6, num_hidden = 3)
training_data = np.asarray([
[1, 1, 1, 0, 0, 0],
[1, 0, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 1, 1, 1, 0],
[0, 0, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0]
])
"""
training_data = np.asarray([
[1, 1, 1, 0, 0, 1],
[1, 0, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 1, 1, 1, 1],
[0, 0, 1, 1, 0, 1],
[0, 0, 1, 1, 1, 1]
])
"""
r.train(training_data, max_epochs = 10000)
print_weights(r.weights)
user = np.asarray([[0, 0, 0, 1, 1, 0]])
print r.run_visible(user)
| | Bias Unit | Hidden 1 | Hidden 2 | Hidden 3 |
|---|---|---|---|---|
| Bias Unit | 0.458353 | -0.798174 | 1.268165 | -0.394543 |
| Harry Potter | -0.667807 | -7.777213 | 2.204765 | 4.605650 |
| Avatar | -8.755023 | -4.857809 | 1.970422 | 2.654181 |
| LOTR 3 | 4.113024 | 2.564658 | 4.080586 | 2.267659 |
| Gladiator | 0.800234 | 7.249336 | -1.222727 | -5.582715 |
| Titanic | 1.005019 | 3.959994 | -9.195495 | -3.453264 |
| Glitter | -3.630298 | -3.191309 | -3.269856 | -3.059602 |
None
## Trying sklearn's BernoulliRBM on the same data
from sklearn.neural_network import BernoulliRBM
rbm = BernoulliRBM(n_components=2, n_iter=10000)
rbm.fit(training_data)
display(rbm.components_.T)
print rbm.transform(training_data)
array([[-0.25611777, -0.30033446],
       [-0.33464182, -0.38160064],
       [-0.65902283, -0.66300143],
       [-0.56078935, -0.51404117],
       [-0.59151242, -0.55471126],
       [-0.71049034, -0.69816864]])
[[ 0.00049334  0.00044759]
 [ 0.00068928  0.00065541]
 [ 0.00049334  0.00044759]
 [ 0.00028143  0.00030405]
 [ 0.00050834  0.00052937]
 [ 0.00028143  0.00030405]]
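sklearn's `BernoulliRBM` also provides a `gibbs(v)` method that performs one full step of alternating Gibbs sampling, which can serve the same purpose as the unimplemented `daydream` above. A quick sketch (mine, not from the original notebook), assuming the fitted `rbm` and the `training_data` array from the cells above:

## "daydream" with sklearn: repeatedly resample the visible units via Gibbs sampling
v = training_data[0:1].astype(float)   ## start the chain from a real example (shape 1 x 6)
for _ in range(5):
    v = rbm.gibbs(v)                   ## resample hidden units from v, then visible units from those
print v.astype(int)                    ## a binary fantasy sample drawn from the model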
For greyscale image data where pixel values can be interpreted as degrees of blackness on a white background, like handwritten digit recognition, the Bernoulli Restricted Boltzmann machine model (BernoulliRBM) can perform effective non-linear feature extraction.
In order to learn good latent representations from a small dataset, we artificially generate more labeled data by perturbing the training data with linear shifts of 1 pixel in each direction.
To understand the example, it helps to first see what `convolve` does to an image.
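A minimal illustration (mine, not from the original notebook): convolving with a one-hot 3x3 kernel in mode="constant" shifts an image by one pixel, which is exactly what `nudge_dataset` below relies on.

## sketch: how a one-hot kernel shifts an image by one pixel
import numpy as np
from scipy.ndimage import convolve

img = np.arange(9).reshape(3, 3)          ## a tiny 3x3 "image": [[0,1,2],[3,4,5],[6,7,8]]
kernel = np.array([[0, 1, 0],
                   [0, 0, 0],
                   [0, 0, 0]])            ## one-hot 3x3 kernel
print img
print convolve(img, weights = kernel, mode = "constant")
## each output pixel picks up the input pixel one row below it, so the image content
## shifts UP by one pixel and the vacated bottom row is filled with the constant 0:
## [[3 4 5]
##  [6 7 8]
##  [0 0 0]]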
import numpy as np
from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
## setup
def nudge_dataset(X, Y):
"""
This produces a dataset 5 times bigger than the original one,
by moving the 8x8 images in X around by 1px to left, right, down, up
This encodes translation invariance in essence
"""
direction_vectors = [
[[0, 1, 0],
[0, 0, 0],
[0, 0, 0]],
[[0, 0, 0],
[1, 0, 0],
[0, 0, 0]],
[[0, 0, 0],
[0, 0, 1],
[0, 0, 0]],
[[0, 0, 0],
[0, 0, 0],
[0, 1, 0]]
]
shift = lambda x, w: convolve(x.reshape((8, 8)), mode="constant",
weights = w).ravel()
X = np.concatenate([X] + [np.apply_along_axis(shift, 1, X, vector)
for vector in direction_vectors])
Y = np.concatenate([Y for _ in xrange(5)], axis = 0)
return X, Y
digits = datasets.load_digits()
X = np.asarray(digits.data, 'float32')
X, Y = nudge_dataset(X, digits.target)
## 0 - 1 scaling
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size = 0.2,
random_state = 0)
## hyper-parameters (set by hand here; a grid or random search could tune them further)
## build model and training
print "training rbm -> logistic"
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
classifier = Pipeline(steps = [('rbm', rbm),
('logistic', logistic)])
rbm.set_params(learning_rate = 0.06, n_iter = 20, n_components=100)
logistic.set_params(C = 6000)
classifier.fit(X_train, Y_train)
print "training logistic"
logistic_alone = linear_model.LogisticRegression(C = 100)
logistic_alone.fit(X_train, Y_train)
training rbm -> logistic
Iteration 0, pseudo-likelihood = -28.84, time = 0.82s
Iteration 1, pseudo-likelihood = -25.92, time = 0.74s
Iteration 2, pseudo-likelihood = -24.82, time = 0.75s
Iteration 3, pseudo-likelihood = -23.71, time = 0.75s
Iteration 4, pseudo-likelihood = -23.03, time = 0.76s
Iteration 5, pseudo-likelihood = -22.44, time = 0.75s
Iteration 6, pseudo-likelihood = -21.91, time = 0.75s
Iteration 7, pseudo-likelihood = -21.66, time = 0.88s
Iteration 8, pseudo-likelihood = -21.39, time = 0.75s
Iteration 9, pseudo-likelihood = -21.07, time = 0.75s
Iteration 10, pseudo-likelihood = -20.85, time = 0.74s
Iteration 11, pseudo-likelihood = -20.74, time = 0.75s
Iteration 12, pseudo-likelihood = -20.57, time = 0.74s
Iteration 13, pseudo-likelihood = -20.44, time = 0.74s
Iteration 14, pseudo-likelihood = -20.29, time = 0.74s
Iteration 15, pseudo-likelihood = -20.20, time = 0.74s
Iteration 16, pseudo-likelihood = -19.98, time = 0.74s
Iteration 17, pseudo-likelihood = -19.75, time = 0.74s
Iteration 18, pseudo-likelihood = -19.78, time = 0.75s
Iteration 19, pseudo-likelihood = -19.67, time = 0.74s
training logistic
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
## Evaluation
print "Result for rbm -> logistic"
print metrics.classification_report(Y_test,
classifier.predict(X_test))
print "Result for logistic alone"
print metrics.classification_report(Y_test,
logistic_alone.predict(X_test))
Result for rbm -> logistic
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       174
          1       0.92      0.95      0.93       184
          2       0.95      0.98      0.97       166
          3       0.97      0.91      0.94       194
          4       0.97      0.95      0.96       186
          5       0.93      0.93      0.93       181
          6       0.98      0.97      0.97       207
          7       0.95      1.00      0.97       154
          8       0.90      0.88      0.89       182
          9       0.91      0.93      0.92       169

avg / total       0.95      0.95      0.95      1797

Result for logistic alone
             precision    recall  f1-score   support

          0       0.85      0.94      0.89       174
          1       0.57      0.55      0.56       184
          2       0.72      0.85      0.78       166
          3       0.76      0.74      0.75       194
          4       0.85      0.82      0.84       186
          5       0.74      0.75      0.75       181
          6       0.93      0.88      0.91       207
          7       0.86      0.90      0.88       154
          8       0.68      0.55      0.61       182
          9       0.71      0.74      0.72       169

avg / total       0.77      0.77      0.77      1797
## plotting
fig, axes = subplots(nrows = 10, ncols = 10, figsize = (24, 24))
axes = axes.flatten()
gray()
for i, comp in enumerate(rbm.components_):
axes[i].imshow(comp.reshape((8, 8)), interpolation="nearest")
fig.suptitle("100 componets extracted by RBM")
<matplotlib.text.Text at 0x31ae14d0>
## load data
from scipy.io import loadmat
images = loadmat('../ml-tutorials/data/starter/IMAGES.mat')['IMAGES']
print images.shape
(512, 512, 10)