%matplotlib inline
Disclaimer/Context: This is the material for a practical given at LSCP (ENS, Paris). Some stuff can be missing from my whiteboard explanations there. Tell me on Github, at https://github.com/SnippyHolloW/DL4H/, or at gabriel.synnaeve@gmail.com or @syhw on Twitter, I'll try and update it accordingly. Pull requests welcomed.
How to read this (for students): go ahead and run things, change things, break things, and fix them!
Important concepts (I'll explain them on demand during this practical):
Central concepts in machine learning that are not covered here:
Practical 2/2 will be more about how to work with Theano and how to extend my own code
(in particular https://github.com/SnippyHolloW/abnet or https://github.com/bootphon/crossitlearn)
In general: $X$ is for data/samples (n_samples, n_features), $x_i$ is one data point (n_features,), $y$ is for labels (n_samples,), $y_i$ is the label of $x_i$, $W$ is for weights, $\alpha$ is for regularization, $\lambda$ is for learning rates
from __future__ import print_function
import numpy as np
from sklearn.datasets import load_digits
digits = load_digits()
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 5)
plt.gray()
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
data = np.asarray(digits.data, dtype='float32')
target = np.asarray(digits.target, dtype='int32')
x = data
y = target
from sklearn import preprocessing
x = preprocessing.scale(x)
from sklearn import cross_validation
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.15, random_state=42)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
assert (len(set(y_train)) == len(set(y_test))) # should work with random_state=42 ;-)
(1527, 64) (1527,) (270, 64) (270,)
Logistic regression is linear regression ($y = X^T\cdot W + \epsilon$) to which we apply a logistic function ($\frac{1}{1+e^{-z}}$). Why? To transform the $y$ into something more categorical (binary if only two classes). In particular, when we are doing logistic regression, we are making the assumption that the $\log$ of the odds (of e.g. a $X \sim Bernoulli(P)$) is a linear regression:
$$\log ( \frac{P}{1-P} ) = X^T \cdot W \Longleftrightarrow P = \frac{\exp(X^T \cdot W)}{1 + \exp(X^T \cdot W)}$$We want to maximize the likelihood of our model (actually, if we have a prior on the parameters, we want the maximum a posteriori, and in this case we need to add a term to the negative log-likelihood below; if that prior is Gaussian, that's like doing L2 regularization).
If we think in term of loss function, we want to minimize the negative log-likelihood ($\log$ is monotonically increasing). I'm writing it in the general case where $y$ is not just binary:
$$NLL(W) = -\frac{1}{n} \sum_{i=1}^n \sum_{j=1}^m \mathbb{1}_{y_i=j} \log P(y_i=j|x_i; W)$$with $$P(y_i=j|x_i; W) = softmax(W, x_i)[j]$$ and $$softmax(W, x)[j] = \frac{\exp(x \cdot W_j)}{\sum_{k} \exp(x \cdot W_k)}$$ We can derive the gradient: $$\nabla_{W_j} = -\frac{1}{n} \sum_{i=1}^n x_i (\mathbb{1}_{y_i=j} - P(y_i=j|x_i, W))$$
By no means is this minimal, "smart", or efficient; it aims at being easy to understand — it fills only a pedagogical role. That said, it can always be improved. All pull requests making it easier to grasp are welcomed!
def softmax(W, x):
    """ P(y=j | X) = exp(<X,W_j>) / (sum_k(exp(<X,W_k>)))
    W: (n_features, n_outputs)
    X: (n_features,)
    Returns the (n_outputs,) vector of class probabilities.
    """
    z = np.dot(x, W)
    # Subtract the max before exponentiating: softmax is invariant to a
    # constant shift of the scores, and this avoids overflow in np.exp
    # for large values of <x, W_j>.
    e = np.exp(z - np.max(z))
    return e / np.sum(e)
from scipy.optimize.optimize import check_grad
from scipy.optimize import fmin_l_bfgs_b
class LogisticRegression:
    """ Multinomial logistic regression trained by minimizing the negative
    log-likelihood with scipy's L-BFGS.

    Attributes set by train():
      x         -- training data with a constant 1 column appended (bias)
      y         -- integer labels
      n_classes -- number of distinct labels
      W         -- (n_features + 1, n_classes) weight matrix

    scipy's optimizer works on flat 1-D parameter vectors, hence the
    reshape back and forth between (n_features * n_classes,) and
    (n_features, n_classes) in loss() and loss_derivative().
    """
    def __init__(self):
        self.name = "Logistic Regression"
    def loss(self, W):
        """ Negative log-likelihood as described above.
        W may be flat (coming from L-BFGS); it is reshaped to a matrix. """
        l = 0.
        tmp_W = W.reshape((self.x.shape[1], self.n_classes))
        for i, xi in enumerate(self.x):
            p_i = softmax(tmp_W, xi)
            # accumulate log P(y_i | x_i; W) for the true class only
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]
    def loss_derivative(self, W):
        """ flattened (1-D) gradient vector
        grad_{W_j} = -1/n * sum_i x_i (1{y_i=j} - P(y_i=j|x_i;W)) """
        d = np.zeros((self.x.shape[1], self.n_classes)) # that's the Jacobian
        tmp_W = W.reshape((self.x.shape[1], self.n_classes))
        for i, xi in enumerate(self.x):
            p_i = softmax(tmp_W, xi)
            for j, v in enumerate(p_i):
                if j == self.y[i]: # because we don't have a "one-hot" encoding
                    d[:,j] += (1 - p_i[j]) * xi
                else:
                    d[:,j] += - p_i[j] * xi
        d = d.reshape(W.shape) # and we flatten it
        return - d / self.x.shape[0]
    def train(self, x_train, y_train, debug=False):
        """ Fit W by L-BFGS on (x_train, y_train); debug=True additionally
        checks the analytic gradient against finite differences. """
        # append a constant 1 feature so the last row of W acts as biases
        self.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))
        self.y = y_train
        self.n_classes = len(set(y_train))
        self.W = np.ones((x_train.shape[1] + 1, self.n_classes))
        print("W shape:", self.W.shape)
        print("x shape:", self.x.shape)
        print("loss before training:", self.loss(self.W))
        print("prediction on x[0] before training:", softmax(self.W, self.x[0]))
        if debug == True:
            tmp = self.loss_derivative(self.W.reshape((self.W.shape[0]*self.W.shape[1],)))
            print("loss derivative shape:", tmp.shape)
            print("loss derivative value:", tmp)
            print("checking the gradient with finite differences, show be low (something E-6):",
                check_grad(self.loss, self.loss_derivative, self.W.reshape((self.W.shape[0]*self.W.shape[1],))))
            # that's nothing more than checking that f'(x) ~= (f(x+h)-f(x))/h with h->0
        # L-BFGS needs a flat parameter vector; it returns (x_opt, f_opt, info)
        tmp = fmin_l_bfgs_b(self.loss, self.W.reshape((self.W.shape[0]*self.W.shape[1],)), fprime=self.loss_derivative)
        self.W = tmp[0].reshape((self.x.shape[1], self.n_classes))
        print("loss after training:", self.loss(self.W))
        print("prediction on x[0] after training:", softmax(self.W, self.x[0]))
        print("true x[0] label:", self.y[0])
    def score(self, x_test, y_test):
        """ Classification accuracy (fraction of argmax predictions that
        match the labels) on (x_test, y_test). """
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(softmax(self.W, xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]
# Train the from-scratch logistic regression and visualize the learned weights.
lr = LogisticRegression()
lr.train(x_train, y_train)
# Each row of W.T is one class's 65-dim weight vector (64 pixels + bias).
plt.imshow(lr.W.transpose(), interpolation='none')
plt.show()
print("weights for 0:")
# Reshape the 64 pixel weights of a class back to the 8x8 image layout.
plt.imshow(lr.W.transpose()[0][:64].reshape((8,8)), interpolation='none')
plt.show()
print("weights for 5:")
plt.imshow(lr.W.transpose()[5][:64].reshape((8,8)), interpolation='none')
plt.show()
print("weights for 8 minus weights for 3:")
plt.imshow(lr.W.transpose()[8][:64].reshape((8,8)) - lr.W.transpose()[3][:64].reshape((8,8)), interpolation='none')
plt.show()
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print("We should overfit, i.e. test score << train score, you can add an L2 regularization")
W shape: (65, 10) x shape: (1527, 65) loss before training: 2.30258509299 prediction on x[0] before training: [ 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1] loss after training: 1.84669284921e-05 prediction on x[0] after training: [ 1.08822355e-31 9.98300553e-01 1.77810355e-28 3.23091472e-47 9.64927736e-20 2.79499404e-36 1.13639647e-16 6.26997256e-36 1.69944705e-03 1.81750303e-29] true x[0] label: 1
weights for 0:
weights for 5:
weights for 8 minus weights for 3:
training accuracy: 1.0 test accuracy: 0.955555555556 We should overfit, i.e. test score << train score, you can add an L2 regularization
Bonus: you can add regularization (order to try: L2, L1, ElasticNet), start with adding the square of the weights magnitudes to the loss function (weight this sum by an $\alpha$ as "L2 regularization parameter") http://en.wikipedia.org/wiki/Tikhonov_regularization // http://en.wikipedia.org/wiki/Least_squares#Lasso_method
from sklearn import svm
# Baseline comparison: scikit-learn SVMs on the same train/test split,
# first with a linear kernel, then with an RBF kernel.
for title, kernel_name in (("Linear SVM", 'linear'), ("with and RBF kernel", 'rbf')):
    clf = svm.SVC(kernel=kernel_name)
    clf.fit(x_train, y_train)
    print(title)
    print("training accuracy:", clf.score(x_train, y_train))
    print("test accuracy:", clf.score(x_test, y_test))
Linear SVM training accuracy: 1.0 test accuracy: 0.97037037037 with and RBF kernel training accuracy: 0.996725605763 test accuracy: 0.981481481481
Bonus: change the loss of our logistic regression for a hinge loss and get a linear SVM. With regularization, you should approach scores above.
We used an L-BFGS optimization procedure, which is quasi-Newton (Newton algorithm to find the 0s of f: $x_{n+1} = x_n - \frac{f(x_n)}{f'(x_n)}$, and we search the 0s of $\nabla(loss)$). It uses an approximation of the Hessian matrix (matrix of second-order partial derivatives), but it's still more costly (in CPU time, but particularly in RAM) than just using the Jacobian (matrix of the first-order partial derivatives). Let's optimize this $loss$ by just taking small steps in the direction of the gradient, each datapoint at a time.
def init(lr, x_train, y_train):
    """ Attach training state to the classifier object `lr`: the data with a
    constant-1 bias column appended, the labels, the number of classes, and
    a zero-initialized weight matrix of shape (n_features + 1, n_classes). """
    bias_column = np.ones((x_train.shape[0], 1))
    lr.x = np.concatenate((x_train, bias_column), axis=1)
    lr.y = y_train
    lr.n_classes = len(np.unique(y_train))
    lr.W = np.zeros((x_train.shape[1] + 1, lr.n_classes))
def one_epoch_of_SGD(lr, learning_rate):
    """ an "epoch" is a full pass/iteration on the dataset.
    Takes one gradient step per sample and returns the updated weights.
    NOTE: the original used `lr.W[:]`, which on a numpy array is a *view*,
    not a copy, so lr.W was silently updated in place; an explicit copy
    makes the intent (return new weights, caller assigns) real.
    """
    tmp_W = lr.W.copy()
    for i, xi in enumerate(lr.x):
        p_i = softmax(tmp_W, xi)
        # per-sample NLL gradient w.r.t. W is outer(x_i, 1{y_i=j} - p_j),
        # which replaces the explicit loop over classes
        target = np.zeros_like(p_i)
        target[lr.y[i]] = 1.
        tmp_W += learning_rate * np.outer(xi, target - p_i)
    return tmp_W
class LogisticRegression2:
    """ Same model as LogisticRegression, but meant to be trained with plain
    SGD (see init/one_epoch_of_SGD), so loss() works directly on the weight
    matrix without any flattening/reshaping for scipy's L-BFGS. """
    def __init__(self):
        self.name = "Logistic Regression"
    def loss(self): # we redefine it so that we dont have to do all the reshaping, as we don't use scipy's L-BFGS anymore
        """ Negative log-likelihood as described above """
        log_likelihood = 0.
        for i, xi in enumerate(self.x):
            probs = softmax(self.W, xi)
            log_likelihood += np.log(probs[self.y[i]])
        return - log_likelihood / self.x.shape[0]
    def score(self, x_test, y_test):
        """ Classification accuracy of argmax predictions on the test set. """
        augmented = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        hits = 0
        for i, xi in enumerate(augmented):
            if np.argmax(softmax(self.W, xi)) == y_test[i]:
                hits += 1
        return hits * 1. / y_test.shape[0]
def stupid_train_with_SGD(lr, x_train, y_train):
    """ Epoch-wise SGD that stops as soon as one epoch decreases the loss
    by less than 1.E-5. Bad criterion: SGD's loss is noisy, so a single
    small decrease (or bump) ends training prematurely. """
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
    old_loss = 1.E31  # sentinel so the first comparison always passes
    new_loss = lr.loss()
    losses = [new_loss]
    print("loss after one epoch of training:", new_loss)
    lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
    new_loss = lr.loss()
    losses.append(new_loss)
    print("loss after two epochs of training:", new_loss)
    while old_loss - new_loss > 1.E-5: # we want to optimize up to decreasing in the loss of 1.E-5 ==> STUPIDELY BAD
        old_loss = new_loss
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_loss = lr.loss()
        losses.append(new_loss)
    print("best loss:", new_loss)
    plt.plot(np.arange(len(losses)), np.log(losses), label="stupid", linewidth=2, linestyle="--")
def slightly_less_stupid_train_with_SGD(lr, x_train, y_train):
    """ Epoch-wise SGD that stops when the loss stops *changing* (by more
    than 1.E-5 in absolute value) -- still a bad criterion, since a flat
    loss does not mean good generalization. """
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    previous = 1.E31
    history = []
    current = 1.E30
    while abs(previous - current) > 1.E-5: # we want to optimize up to changes in the loss of 1.E-5 ==> BAD
        previous = current
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        current = lr.loss()
        history.append(current)
    print("best loss:", current)
    plt.plot(np.arange(len(history)), np.log(history), label="slightly less stupid", linewidth=2, linestyle=":")
def early_stopping_train_with_SGD(lr, x_train, y_train):
    """ using the training set as validation set,
    DO NOT DO THIS IN PRACTICE! USE A SEPARATE VALIDATION SET """
    init(lr, x_train, y_train)
    print("loss before training:", lr.loss())
    old_score = -1.
    new_score = lr.score(x_train, y_train)
    losses = []
    patience = 3 # number of epochs of patience
    # keep training while accuracy improves; once it stops improving, allow
    # up to `patience` consecutive non-improving epochs before stopping
    while new_score > old_score or patience:
        if new_score > old_score:
            patience = 3  # reset the budget on any improvement
        else:
            patience -= 1
        old_score = new_score
        lr.W = one_epoch_of_SGD(lr, learning_rate=0.01)
        new_score = lr.score(x_train, y_train)
        losses.append(lr.loss())
    print("best loss:", losses[-1])
    plt.plot(np.arange(len(losses)), np.log(losses), label="crude early stopping\n3 epochs of patience", linewidth=1.5)
# Compare the three stopping criteria on the same model and data;
# all three loss curves end up on the same plot.
lr = LogisticRegression2()
print("stupid stopping condition [will stop at the first bump in the loss function]")
stupid_train_with_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print()
print("slightly less stupid stopping condition [will stop when the loss function is flat]")
slightly_less_stupid_train_with_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
print()
print("(very) early stopping (crude, with 3 epochs of patience)")
print("[optimizes for the right thing, but do it on a _separate_ validation set!]")
early_stopping_train_with_SGD(lr, x_train, y_train) # using the training set as validation set,
                                                    # DO NOT DO THIS IN PRACTICE!
                                                    # USE A SEPARATE VALIDATION SET
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
plt.ylabel("log of the loss function")
plt.xlabel("#epochs")
plt.legend()
stupid stopping condition [will stop at the first bump in the loss function] loss before training: 2.30258509299 loss after one epoch of training: 0.229635078515 loss after two epochs of training: 0.157913366763 best loss: 0.00421204401221 training accuracy: 1.0 test accuracy: 0.962962962963 slightly less stupid stopping condition [will stop when the loss function is flat] loss before training: 2.30258509299 best loss: 0.00421204401221 training accuracy: 1.0 test accuracy: 0.962962962963 (very) early stopping (crude, with 3 epochs of patience) [optimizes for the right thing, but do it on a _separate_ validation set!] loss before training: 2.30258509299 best loss: 0.029649373213 training accuracy: 0.998035363458 test accuracy: 0.966666666667
<matplotlib.legend.Legend at 0x110dffa10>
Bonus: plot the train set and test set accuracies (scores) over epochs, see overfitting.
Can we decrease the learning rate smartly? Why, yes! Check out below with $\lambda_t = \frac{\lambda_0}{1 + \lambda_0 \sqrt t}$, where we also get rid of epochs to do pure SGD. Read SGD tricks for more good stuff (e.g. you should try $\lambda_t = \frac{\lambda_0}{(1 + \lambda_0 \alpha t)^p}$ with powers $p=0.5$ and $p=1$).
from random import randint
def train_with_pure_SGD(lr, x_train, y_train, init_learning_rate=0.1):
    """ Pure (one random sample at a time) SGD with a decreasing learning
    rate lambda_t = lambda_0 / (1 + lambda_0 * sqrt(t)), for a fixed budget
    of 1000 samples. Plots the log-loss over samples seen, then the first
    3 PCA directions of the per-sample gradients.
    Fixes vs the original: xrange (Python-2-only) replaced by range, and
    the unused old_loss / learning_rate locals removed. """
    init(lr, x_train, y_train)
    new_loss = lr.loss()
    print("loss before training:", new_loss)
    def get_learning_rate(t):
        return init_learning_rate / (1 + init_learning_rate*np.sqrt(t))
        #return init_learning_rate / (1 + init_learning_rate*t)
    losses = [new_loss]
    gradients = []
    # here we do purely stochastic gradient descent
    t = 0
    while t < 1000: # here we see a fixed number of samples
        t += 1
        # pick a random sample
        i = randint(0, lr.x.shape[0]-1)
        xi = lr.x[i]
        p_i = softmax(lr.W, xi)
        # error = one-hot(y_i) - p_i, i.e. the NLL gradient w.r.t. the scores
        error = np.zeros(p_i.shape)
        error[lr.y[i]] = 1
        error = error - p_i
        grad = np.outer(xi, error)
        lr.W += get_learning_rate(t) * grad
        new_loss = lr.loss()
        losses.append(new_loss)
        gradients.append(grad)
    plt.plot(np.arange(len(losses)), np.log(losses))
    plt.ylabel("log of the loss function")
    plt.xlabel("#samples seen")
    plt.show()
    # project the 1000 flattened gradient matrices on their 3 principal axes
    from sklearn.decomposition import PCA
    pca = PCA(n_components=3)
    gradients = np.array(gradients)
    gradients = gradients.reshape(gradients.shape[0], gradients.shape[1]*gradients.shape[2])
    grad = pca.fit_transform(gradients).transpose()
    plt.plot(np.arange(grad.shape[1]), grad[0], label="#1 direction of the gradient")
    plt.plot(np.arange(grad.shape[1]), grad[1], label="#2 direction of the gradient")
    plt.plot(np.arange(grad.shape[1]), grad[2], label="#3 direction of the gradient", linestyle=":")
    m = np.mean(np.abs(gradients), axis=-1)
    # sliding-window (width 50) sum of the mean absolute gradient, scaled x10
    plt.plot(np.arange(grad.shape[1]-50), [10*np.sum(m[i:i+50]) for i in range(grad.shape[1]-50)],
             label="smoothed $\propto$ mean(abs(gradients))", linestyle="--")
    plt.ylabel("gradient values")
    plt.xlabel("#samples seen")
    plt.legend(loc=4)
# Run pure SGD on the LogisticRegression2 instance and report accuracies.
train_with_pure_SGD(lr, x_train, y_train)
print("training accuracy:", lr.score(x_train, y_train))
print("test accuracy:", lr.score(x_test, y_test))
loss before training: 2.30258509299
training accuracy: 0.956777996071 test accuracy: 0.933333333333
Can you think about an even more "stochastic" gradient descent? (yes, 2 answers at least: 1) just care about the sign, not the magnitude, 2) just update one of the weights per datapoint)
You can accumulate gradient in directions that are consistent through samples by using a momentum update. Instead of doing: $$W_{t+1} = W_t + \lambda \nabla f(W_t)$$ add a momentum $v$ (with a parameter $\mu$ and $v_0=0$, $\mu$ being a "friction", it makes sense to take it $\in [0\dots1]$, I would advise trying $0.9$) s.t.: $$v_{t+1} = \mu v_t + \lambda \nabla f(W_t)$$ $$W_{t+1} = W_t + v_{t+1}$$
# Plot the usual activation functions on [-3, 3] for comparison.
a = np.arange(-3, 3, 0.01)
plt.plot(a, a, label="linear", linewidth=2, linestyle='--')
plt.plot(a, 1./(1+np.exp(-4*a)), label="sigmoid") # I multiply by 4 so that's it's less flat
plt.plot(a, np.tanh(4*a), label="tanh") # I multiply by 4 so that's it's less flat
plt.plot(a, (a+abs(a))/2, label="ReLU", linewidth=1.5)
plt.legend(loc=4)
<matplotlib.legend.Legend at 0x11036a610>
# Random 20x20 matrix used to visualize how each non-linearity squashes values.
a = np.random.standard_normal((20,20))
def non_linearities_imshows(a):
    """ Show the matrix `a` mapped through each non-linearity, one figure
    (with its own colorbar) per activation. The sigmoid/tanh inputs are
    multiplied by 4 so that they are less flat. The original repeated the
    same imshow/print/colorbar/show sequence four times; this drives it
    from a table instead. """
    panels = (
        ("linear", "linear", a),
        ("sigmoid", "sigmoid", 1./(1+np.exp(-4*a))),  # /!\ mult by 4
        ("tanh", "tanh", np.tanh(4*a)),  # /!\ mult by 4
        ("ReLU", "Rectified Linear Unit", (a+abs(a))/2),
    )
    for label, title, values in panels:
        plt.imshow(values, label=label, cmap=plt.cm.gray_r, interpolation="none")
        print(title)
        plt.colorbar()
        plt.show()
# Visualize the non-linearities on the random matrix (or try a digit image).
non_linearities_imshows(a)
#non_linearities_imshows(digits.images[0])
linear
sigmoid
tanh
Rectified Linear Unit
Let's say we want to maximize $f = g*z$ and we also have $g = x + y$
Previously, we saw the case for one layer (logistic regression), and we know that we can maximize $f$ by following the gradient in both these directions: $\frac{\partial f(g,z)}{\partial g} = z$ and $\frac{\partial f(g,z)}{\partial z} = g$
Now, what does it mean to follow $g$? We have $\frac{\partial g(x,y)}{\partial x} = 1$ and $\frac{\partial g(x,y)}{\partial y} = 1$
We want to know how we have to change $x, y, z$ so that $f$ increases. Just as previously, we will take a gradient step $x = x + \lambda \frac{\partial f}{\partial x}$; $y = y + \lambda \frac{\partial f}{\partial y}$; $z = z + \lambda \frac{\partial f}{\partial z}$. For $z$, we already know the gradient, it's $g$, so it's $x+y$. So we update $z$ with $z = z + \lambda \times (x+y)$. What about for $x$ and for $y$? We just apply the chain rule: $\frac{\partial f(g,z)}{\partial x} = \frac{\partial f(g,z)}{\partial g}\frac{\partial g(x,y)}{\partial x}$, so for $x$ we update it with $x = x + \lambda \times \frac{\partial f(g,z)}{\partial g} \times \frac{\partial g(x,y)}{\partial x} = x + \lambda \times z \times 1$, and for $y$ we update it with $y = y + \lambda \times z \times 1$ too.
So if you have not understood the above, you can try reading more details about backprop here. Otherwise, let's apply backprop to our logistic regression to which we add a hidden layer:
The forward pass of our logistic regression is currently: $$X \rightarrow W_{logistic} \cdot X \rightarrow softmax$$
Let's consider that we add a hidden layer with a non-linearity $\sigma$ to our logistic regression, it becomes: $$X \rightarrow W_{hidden} \cdot X \rightarrow \sigma(W_{hidden} \cdot X) \rightarrow W_{logistic} \cdot \sigma(W_{hidden} \cdot X) \rightarrow softmax$$
The intuition behind back-propagation is that we keep doing what we did for optimising our previous loss function but for all the layers. The cool thing is that thanks to the chain rule of derivatives, this is an efficient process! Previously our loss (error) was the sum of the errors for all points, and it's a function of our weights: $$E(W) = \sum_{n=1}^N E_n(W)$$
Now if we add a hidden layer with a non-linearity $\sigma$ our predictions become $$P(y_i=j|x_n; W) = softmax(W_{logistic}, \sigma(W_{hidden}, x_n))[j]$$
Our loss function code changes to take this formula into account. And that's it for the forward pass. Let's note $a_j = \sum_i w_{ji}z_i$ for the pre-activation units, also called pre-synaptic, and $z_i = \sigma(a_i)$ are the activations (or the layer below in the previous $a_j$ formula), also called post-synaptic. Now if we keep using the gradient updates that we saw before, we can only apply them to the activated hidden units, and we don't know how we should change $W_{hidden}$. We do that through the chain rule: $$\frac{\partial E_n}{\partial w_{ji}} = \frac{\partial E_n}{\partial a_j}\frac{\partial a_j}{\partial w_{ji}}$$
That means, we can back-propagate the errors through the partial derivatives: we can compute how much each of the hidden units contributes to the error, and have a gradient w.r.t. its weights. In the code, I called err_h the errors at the hidden units, and err_o the errors at the output, act_xi the activations of the hidden units, act_d the activation derivatives, and backprop_o the backpropagation of the output error. The algorithm looks like:
Everything I told you about back-prop now generalizes from 1 to K hidden layers, because the chain rule $\frac{\partial E}{\partial W} = \frac{\partial E}{\partial a_j}\frac{\partial a_j}{\dots }\frac{\dots }{\partial W}$ stays true. So if we had more hidden layers, we would repeat steps 2 and 3 as much as needed to get back to the input features ($z_i == x_{ni}$).
def relu_u(W, x, clip=6):
    """
    Clipped ReLU unit function: min(clip, max(0, W.x)), with a constant 1
    appended at the end for the biases of the next layer.
    Timing note kept from the original (in-place masking beats arithmetic):
    In [1]: a = np.random.standard_normal((1000,1000))
    In [2]: %timeit (a+abs(a))/2
    100 loops, best of 3: 4.27 ms per loop
    In [3]: %timeit a[a<0]=0
    1000 loops, best of 3: 736 µs per loop
    """
    # np.clip does both the max(0, .) and the upper clipping in one pass
    return np.append(np.clip(np.dot(x, W), 0, clip), 1)
def relu_d(W, x, clip=4):
    """ Derivative of the clipped ReLU w.r.t. its pre-activation W.x:
    1 where 0 <= W.x < clip, 0 elsewhere (no appended bias term here).
    NOTE(review): the forward pass relu_u clips at 6 while this derivative
    uses clip=4 -- looks inconsistent, confirm whether it is intentional. """
    pre_act = np.dot(x, W)
    return np.asarray((pre_act >= 0) & (pre_act < clip), dtype='float')
def tanh_u(W, x):
    """ tanh unit function: tanh(W.x), with a constant 1 appended at the
    end for the biases of the next layer. """
    activated = np.tanh(np.dot(x, W))
    return np.append(activated, 1) # for the biases of the next layer
def tanh_d(W, x):
    """ Derivative of tanh w.r.t. its pre-activation: 1 - tanh(W.x)**2.
    Computed directly, instead of the original round-trip through tanh_u
    that appended a bias term only to strip it off again. """
    t = np.tanh(np.dot(x, W))
    return 1 - t**2
class NeuralNet:
    """ One-hidden-layer neural network: x -> act(W_hid . x) -> softmax(W_log . h),
    trained by stochastic gradient descent with manual backpropagation.
    The hidden activation (tanh by default, ReLU optional) is applied
    element-wise; both layers carry an extra bias input.
    Fix vs the original: xrange (Python-2-only) replaced by range. """
    def __init__(self):
        self.name = "Neural Net"
        #self.act_u = relu_u # try ReLUs, but with a small learning rate
        #self.act_d = relu_d # (think about why)
        self.act_u = tanh_u   # hidden-unit activation
        self.act_d = tanh_d   # its derivative, used by backprop
    def loss(self):
        """ Negative log-likelihood of logistic regression using the hidden layer """
        l = 0.
        for i, xi in enumerate(self.x):
            p_i = self.pred(xi)
            l += np.log(p_i[self.y[i]])
        return - l / self.x.shape[0]
    def one_epoch_of_backprop_SGD(self, learning_rate=0.01):
        """ applying the backprop equations explained above """
        for i, xi in enumerate(self.x):
            # forward pass
            act_xi = self.act_u(self.W_hid, xi)
            p_i = softmax(self.W_log, act_xi)
            # output error: one-hot(y_i) - p_i
            err_o = np.zeros(p_i.shape)
            err_o[self.y[i]] = 1
            err_o = err_o - p_i
            # backpropagate through W_log ([:-1,:] drops the bias row,
            # which receives no signal from below)
            backprop_o = np.dot(err_o, self.W_log[:-1,:].T)
            act_d = self.act_d(self.W_hid, xi)
            err_h = act_d * backprop_o  # chain rule: activation derivative times backprop'd error
            # gradient steps on both layers
            self.W_hid += learning_rate * np.outer(xi, err_h)
            self.W_log += learning_rate * np.outer(act_xi, err_o)
    def pred(self, xi):
        """ class-probability vector for one (bias-augmented) sample """
        return softmax(self.W_log, self.act_u(self.W_hid, xi))
    def train(self, x_train, y_train, n_hidden=42): # try different numbers of hidden units, e.g. 4, 64, 128, 200
        self.x = np.hstack((x_train, np.ones((x_train.shape[0], 1))))  # bias column
        self.y = y_train
        self.n_classes = len(set(y_train))
        self.n_hidden = n_hidden
        # hidden weights drawn uniformly in +/- 6/(fan_in + fan_out)
        self.W_hid = np.random.random((x_train.shape[1] + 1, self.n_hidden)) - 0.5
        self.W_hid *= 2 * 6. / (x_train.shape[1] + self.n_hidden)
        self.W_log = np.zeros((self.n_hidden + 1, self.n_classes)) # +1 for biases
        print("x shape:", self.x.shape)
        print("W_hid shape:", self.W_hid.shape)
        print("W_log shape:", self.W_log.shape)
        tmp_loss = self.loss()
        print("training loss before training:", tmp_loss)
        print("prediction on x[0] before training:", self.pred(self.x[0]))
        # training with SGD
        losses = [tmp_loss]
        l_r = 0.01
        for _ in range(10): # 10 epochs (range: xrange is Python-2-only)
            self.one_epoch_of_backprop_SGD(learning_rate=l_r)
            losses.append(self.loss())
            # play with the learning rate schemes
        print("best loss:", losses[-1])
        print("training loss after training:", self.loss())
        print("prediction on x[0] after training:", self.pred(self.x[0]))
        print("true x[0] label:", self.y[0])
        plt.plot(np.arange(len(losses)), np.log(losses), linewidth=1)
        plt.ylabel("log of the loss function")
        plt.xlabel("#epochs")
        plt.show()
        plt.imshow(self.W_hid, interpolation='none')
        plt.colorbar()
        plt.show()
        #plt.imshow(self.W_log, interpolation='none')
        #plt.colorbar()
        #plt.show()
    def score(self, x_test, y_test):
        """ classification accuracy of argmax predictions on (x_test, y_test) """
        x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
        y = y_test
        c = 0
        for i, xi in enumerate(x):
            a = np.argmax(self.pred(xi))
            if a == y[i]:
                c += 1
        return c * 1. / y.shape[0]
# Train the one-hidden-layer net and report train/test accuracies.
nn = NeuralNet()
nn.train(x_train, y_train)
print("Train set accuracy:", nn.score(x_train, y_train))
print("Test set accuracy:", nn.score(x_test, y_test))
#nn.train(x_train, y_train, n_hidden=10)
#print "Train set accuracy:", nn.score(x_train, y_train)
#print "Test set accuracy:", nn.score(x_test, y_test)
print("you can also regularize a neural net, just change the loss function appropriately")
x shape: (1527, 65) W_hid shape: (65, 42) W_log shape: (43, 10) training loss before training: 2.30258509299 prediction on x[0] before training: [ 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1] best loss: 0.0253889805665 training loss after training: 0.0253889805665 prediction on x[0] after training: [ 3.83913679e-05 8.68270986e-01 3.32674656e-04 1.51426304e-06 5.77964865e-03 1.86531340e-05 2.65499370e-03 2.51086136e-05 1.22848931e-01 2.90992410e-05] true x[0] label: 1
Train set accuracy: 0.998035363458 Test set accuracy: 0.97037037037 you can also regularize a neural net, just change the loss function appropriately
Bonus: With more weights, this neural net is even more prone to overfitting. So you could regularize it.
Bonus: You can also add another hidden layer.
Theano works by compiling symbolic expressions into native code (on CPUs or GPUs). There is a nice "getting started" Theano tutorial and a more thorough tutorial here http://deeplearning.net/software/theano/tutorial/ Everything you should need for the moment is here http://deeplearning.net/software/theano/library/tensor/basic.html
# Symbolic Theano graph: A and B are symbolic float32 matrices and C is
# their (not-yet-computed) symbolic matrix product.
from theano import tensor as T
A = T.fmatrix()
B = T.fmatrix()
C = T.dot(A, B)
C
Using gpu device 0: GeForce GT 650M
dot.0
# With concrete numpy inputs, C.eval() compiles and runs the graph.
A = np.random.random((10,10))
B = np.random.random((10,10))
C = T.dot(A, B)
C.eval()
array([[ 2.49987768, 3.46830327, 3.07533679, 3.28263693, 3.52006918, 2.81629526, 3.50309921, 2.48141154, 4.19651885, 3.27780272], [ 1.70199903, 2.00788938, 1.74255066, 1.61437016, 1.83195921, 1.17366047, 1.54721088, 1.0543538 , 1.85787448, 2.13366925], [ 2.57912841, 2.53388569, 2.52165203, 3.13951882, 2.59604248, 2.49420427, 2.86811637, 1.95585705, 3.64772433, 2.71894285], [ 1.65627613, 1.69303027, 1.29031643, 1.75713726, 1.45599642, 1.6585249 , 1.23043086, 1.07044738, 1.84521083, 1.72626196], [ 1.61517609, 2.49009226, 2.00571427, 2.26128717, 2.07083152, 1.6539112 , 1.97799148, 1.57657089, 2.51517353, 2.21145721], [ 1.65610118, 1.61789874, 1.7242132 , 1.90119529, 1.96550927, 1.46273452, 1.55532116, 0.9811569 , 2.17289324, 1.58394601], [ 1.94265607, 3.00453935, 1.91916286, 2.93620479, 2.81513856, 2.02763359, 2.90217799, 2.03910296, 2.92258548, 3.04995599], [ 1.32328403, 2.54435891, 1.09419405, 1.52412963, 1.90384414, 1.41099655, 1.27805053, 1.15345173, 1.67611008, 2.30179732], [ 1.74330342, 3.39876145, 2.35130411, 2.57335844, 2.4635886 , 2.55897828, 2.49926516, 2.11455371, 3.71868673, 2.75911756], [ 2.13621873, 2.85752605, 2.03129205, 2.87908882, 2.70363845, 2.1858482 , 2.81973067, 2.07834169, 2.94183379, 2.96044698]])
# Shared variables persist on the compute device (e.g. the GPU) across calls.
from theano import shared
A = shared(A)
B = shared(B)
A
<TensorType(float64, matrix)>
# Same product, now between shared variables.
C = T.dot(A, B)
C.eval()
array([[ 2.49987768, 3.46830327, 3.07533679, 3.28263693, 3.52006918, 2.81629526, 3.50309921, 2.48141154, 4.19651885, 3.27780272], [ 1.70199903, 2.00788938, 1.74255066, 1.61437016, 1.83195921, 1.17366047, 1.54721088, 1.0543538 , 1.85787448, 2.13366925], [ 2.57912841, 2.53388569, 2.52165203, 3.13951882, 2.59604248, 2.49420427, 2.86811637, 1.95585705, 3.64772433, 2.71894285], [ 1.65627613, 1.69303027, 1.29031643, 1.75713726, 1.45599642, 1.6585249 , 1.23043086, 1.07044738, 1.84521083, 1.72626196], [ 1.61517609, 2.49009226, 2.00571427, 2.26128717, 2.07083152, 1.6539112 , 1.97799148, 1.57657089, 2.51517353, 2.21145721], [ 1.65610118, 1.61789874, 1.7242132 , 1.90119529, 1.96550927, 1.46273452, 1.55532116, 0.9811569 , 2.17289324, 1.58394601], [ 1.94265607, 3.00453935, 1.91916286, 2.93620479, 2.81513856, 2.02763359, 2.90217799, 2.03910296, 2.92258548, 3.04995599], [ 1.32328403, 2.54435891, 1.09419405, 1.52412963, 1.90384414, 1.41099655, 1.27805053, 1.15345173, 1.67611008, 2.30179732], [ 1.74330342, 3.39876145, 2.35130411, 2.57335844, 2.4635886 , 2.55897828, 2.49926516, 2.11455371, 3.71868673, 2.75911756], [ 2.13621873, 2.85752605, 2.03129205, 2.87908882, 2.70363845, 2.1858482 , 2.81973067, 2.07834169, 2.94183379, 2.96044698]])
If you want to know more about "shared-correctness", you should read: http://deeplearning.net/software/theano/tutorial/aliasing.html
Use python dnn.py
from this repo https://github.com/SnippyHolloW/DL4H
Use python lucid.py
from this repo https://github.com/SnippyHolloW/DL4H
# solution for the first bonus question
# L2 ("ridge") regularization: add alpha * ||W||^2 to the loss, which
# adds 2 * alpha * W to the gradient. These are snippets meant to be
# pasted in as methods of LogisticRegression (hence the `self`).
alpha = 0.001
def loss_l2(self, W):
    """ NLL plus alpha times the sum of squared weights (Frobenius norm^2) """
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    return self.loss(W) + alpha*np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])
def loss_derivative_l2(self, W):
    """ gradient of loss_l2: the L2 term differentiates to 2*alpha*W """
    return self.loss_derivative(W) + 2*alpha*W
# in train:
tmp = fmin_l_bfgs_b(self.loss_l2, self.W.reshape((self.W.shape[0]*self.W.shape[1],)), fprime=self.loss_derivative_l2)
# solution for the second bonus question
# Hinge-loss ("linear SVM") variants, meant to be pasted in as methods of
# LogisticRegression (hence the `self`). Note: xrange is Python-2-only.
def loss_hinge_l2(self, W):
    """ Multiclass hinge loss with L2 regularization.
    NOTE(review): tmp_max ends up as max_j (s_y - s_j), so this returns
    -mean(max(0, 1 + s_y - min_j s_j)) + L2; the standard multiclass hinge
    is mean(max(0, 1 + max_{j!=y} s_j - s_y)). The direction/sign here
    looks inverted -- verify before relying on it. """
    l = 0.
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    for i, xi in enumerate(self.x):
        tmp_max = -1.
        good = np.dot(tmp_W[:,self.y[i]], xi)  # score of the correct class
        for j in xrange(self.n_classes):
            if j == self.y[i]:
                continue
            delta = good - np.dot(tmp_W[:,j], xi)  # margin over class j
            if delta > tmp_max:
                tmp_max = delta
        l += max(0, 1 + tmp_max)
    return - l / self.x.shape[0] + alpha*np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])
def loss_derivative_hinge_l2(self, W):
    """ (sub)gradient for loss_hinge_l2.
    NOTE(review): the correct class's column is never updated, and the
    regularization term adds the *scalar* alpha*||W||^2 to every entry of
    the gradient instead of the matrix 2*alpha*W -- both look like bugs
    to confirm against the loss above. """
    d = np.zeros((self.x.shape[1], self.n_classes)) # that's the Jacobian
    tmp_W = W.reshape((self.x.shape[1], self.n_classes))
    for i, xi in enumerate(self.x):
        for j in xrange(self.n_classes):
            if j == self.y[i]:
                pass
            else:
                d[:,j] += -np.dot(tmp_W[:,j], xi) * xi
    d = d.reshape(W.shape) # and we flatten it
    return - d / self.x.shape[0] + alpha*np.sum([np.linalg.norm(tmp)**2 for tmp in tmp_W])
def score(self, x_test, y_test):
    """ accuracy from raw scores W.T x (argmax is unchanged by softmax) """
    x = np.hstack((x_test, np.ones((x_test.shape[0], 1))))
    y = y_test
    c = 0
    for i, xi in enumerate(x):
        #a = np.argmax(softmax(self.W, xi))
        a = np.argmax(np.dot(self.W.transpose(), xi))
        if a == y[i]:
            c += 1
    return c * 1. / y.shape[0]
# in train:
tmp = fmin_l_bfgs_b(self.loss_hinge_l2, self.W.reshape((self.W.shape[0]*self.W.shape[1],)),
                    fprime=self.loss_derivative_hinge_l2)