In [158]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline


1 Data & Architecture¶

In [279]:
X = np.array(([3,5],[5,1],[10,2]), dtype=float)    # hours sleep, hours study - input matrix: 3 samples, 2 features
y = np.array(([75],[82],[95]), dtype=float)        # test score
#normalize data:
X = X / np.amax(X, axis=0)
y = y / 100
X.shape

Out[279]:
(3, 2)
In [69]:
%%latex
$\textsf{Neurons sum inputs from synapses}$
$z = x1 + x2 + x3 = \sum xi$
$\textsf{and apply a sigmoid activation function}$
$a = \frac{1}{1 + e^{-z}}$

$\textsf{Neurons sum inputs from synapses}$ $z = x1 + x2 + x3 = \sum xi$ $\textsf{and apply a sigmoid activation function}$ $a = \frac{1}{1 + e^{-z}}$

2 Forward Propogation¶

In [255]:
class Neural_Network(object):
def __init__(self):
# define hyperparams (constants that establish structure & behaviour of network;not updated while training):
self.inputLayerSize = 2 # corespond to number of features
self.hiddenLayerSize = 3
self.outputLayerSize = 1
# define params: weights on synapses
self.W1 = np.random.randn(self.inputLayerSize , self.hiddenLayerSize)
self.W2 = np.random.randn(self.hiddenLayerSize , self.outputLayerSize)

#2
def forward(self,X):
# propogate weights through network
# use matrices to pass multiple inputs at once
self.z2 = np.dot(X, self.W1)
self.a2 = self.sigmoid(self.z2)
self.z3 = np.dot(self.a2, self.W2)
yHat = self.sigmoid(self.z3)
return yHat

def sigmoid(self,z):
# apply sigmoind to scalar / vector / matrix
return 1 / (1 + np.exp(-z))

#3
def sigmoidPrime(self,z):
# backpropagation
return np.exp(-z) / ((1 + np.exp(-z))**2)

#4
def costFunction(self, X, y):
self.yHat = self.forward(X)
J = 0.5 * sum((y - self.yHat)**2)
return J

def costFunctionPrime(self, X, y):
# compute derivative with respect to W1 and W2
# ie which way is uphill in our 9D optimization space
self.yHat = self.forward(X)
#backpropagation error: delta
delta3 = np.multiply(-(y - yHat), self.sigmoidPrime(self.z3))
dJdW2 = np.dot(self.a2.T, delta3)
delta2 = np.dot(delta3, self.W2.T) * self.sigmoidPrime(self.z2)
dJdW1 = np.dot(X.T, delta2)
return dJdW1, dJdW2

#5
def getParams(self):
# get W1 and W2 rolled into 1 vector
params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
return params
def setParams(self, params):
# Set W1 and W2 using single param vector
W1_start = 0
W1_end = self.hiddenLayerSize * self.inputLayerSize
self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize, self.hiddenLayerSize))
W2_end = W1_end + self.hiddenLayerSize * self.outputLayerSize
self.W2 = np.reshape(params[W1_end:W2_end], (self.hiddenLayerSize, self.outputLayerSize))

dJdW1, dJdW2 = self.costFunctionPrime(X,y)
return np.concatenate((dJdW1.ravel(),dJdW2.ravel()))

NN = Neural_Network()

In [92]:
%%latex
$\begin{bmatrix} 3 & 5 \\ 5 & 1 \\ 10 & 2 \end{bmatrix} \begin{bmatrix} W_{11}^{(1)} & W_{12}^{(1)} & W_{13}^{(1)}\\ W_{21}^{(1)} & W_{22}^{(1)} & W_{23}^{(1)} \end{bmatrix} = \begin{bmatrix} 3W_{11}^{(1)} + 5W_{21}^{(1)} & 3W_{12}^{(1)} + 5W_{22}^{(1)} & 3W_{13}^{(1)} + 5W_{23}^{(1)} \\ 5W_{11}^{(1)} + 1W_{21}^{(1)} & 5W_{12}^{(1)} + 1W_{22}^{(1)} & 5W_{13}^{(1)} + 1W_{23}^{(1)} \\ 10W_{11}^{(1)} + 2W_{21}^{(1)} & 10W_{12}^{(1)} + 2W_{22}^{(1)} & 10W_{13}^{(1)} + 2W_{23}^{(1)} \end{bmatrix}$
$X W^{(1)} = Z^{(2)}$

$\begin{bmatrix} 3 & 5 \\ 5 & 1 \\ 10 & 2 \end{bmatrix} \begin{bmatrix} W_{11}^{(1)} & W_{12}^{(1)} & W_{13}^{(1)}\\ W_{21}^{(1)} & W_{22}^{(1)} & W_{23}^{(1)} \end{bmatrix} = \begin{bmatrix} 3W_{11}^{(1)} + 5W_{21}^{(1)} & 3W_{12}^{(1)} + 5W_{22}^{(1)} & 3W_{13}^{(1)} + 5W_{23}^{(1)} \\ 5W_{11}^{(1)} + 1W_{21}^{(1)} & 5W_{12}^{(1)} + 1W_{22}^{(1)} & 5W_{13}^{(1)} + 1W_{23}^{(1)} \\ 10W_{11}^{(1)} + 2W_{21}^{(1)} & 10W_{12}^{(1)} + 2W_{22}^{(1)} & 10W_{13}^{(1)} + 2W_{23}^{(1)} \end{bmatrix}$ $X W^{(1)} = Z^{(2)}$
In [205]:
testInput = np.arange(-6, 6, 0.01)
plt.plot(testInput, NN.sigmoid(testInput), linewidth=2)
plt.plot(testInput, NN.sigmoidPrime(testInput), linewidth=2) # largest where sigmoid is steepest
plt.grid()
plt.legend(['sigmoid', 'sigmoid prime'])

Out[205]:
<matplotlib.legend.Legend at 0x10fc32390>
In [119]:
%%latex
$z^{(2)} = X W^{(1)}$
$a^{(2)} = f(z^{(2)})$
$z^{(3)} = a^{(2)}W^{(2)}$

$W^{(2)} = \begin{bmatrix} W^{(2)}_{(11)} \\ W^{(2)}_{(21)} \\ W^{(2)}_{(31)} \end{bmatrix}$
$\hat{y} = f(z^{(3)})$

$z^{(2)} = X W^{(1)}$ $a^{(2)} = f(z^{(2)})$ $z^{(3)} = a^{(2)}W^{(2)}$  $W^{(2)} = \begin{bmatrix} W^{(2)}_{(11)} \\ W^{(2)}_{(21)} \\ W^{(2)}_{(31)} \end{bmatrix}$ $\hat{y} = f(z^{(3)})$
In [135]:
yHat = NN.forward(X)
yHat

Out[135]:
array([[ 0.53030683],
[ 0.54199931],
[ 0.55222292]])

In [157]:
%%latex
$\textsf{cost function, assuming convexity}$
$J = argmin(W) \sum_{i=0} \frac{1}{2} e_{i}^{2}$
$J = argmin(W) \sum_{i=0} \frac{1}{2} (y_{i}- \hat{y_{i}})^{2}$
$W^{(1)} \textsf{is a 3x3 matrix --> curse of dimensionality}$

$\textsf{cost function, assuming convexity}$ $J = argmin(W) \sum_{i=0} \frac{1}{2} e_{i}^{2}$ $J = argmin(W) \sum_{i=0} \frac{1}{2} (y_{i}- \hat{y_{i}})^{2}$ $W^{(1)} \textsf{is a 3x3 matrix --> curse of dimensionality}$
In [156]:
%%latex
$\textsf{combined equation}$
$J = argmin(W) \sum \frac{1}{2} (y - f(f(XW^{(1)})W^{(2)}))^2$
$\textsf{minimize cost J by choosing weights W - examine rate of change (gradient): stop when}$
$\frac{\partial J}{\partial W}=0$

$\textsf{combined equation}$ $J = argmin(W) \sum \frac{1}{2} (y - f(f(XW^{(1)})W^{(2)}))^2$ $\textsf{minimize cost J by choosing weights W - examine rate of change (gradient): stop when}$ $\frac{\partial J}{\partial W}=0$

4 Backpropogation¶

• (a long chain rule)
• weights that contribute more to overall error will be changed more by SGD
In [160]:
display(NN.W1, NN.W1.shape)
display(NN.W2, NN.W2.shape)

array([[ 0.22224692, -0.27898779,  1.71877036],
[ 0.23482195, -2.14545752, -0.10447487]])
(2, 3)
array([[-0.16169648],
[ 0.02736267],
[ 0.35196909]])
(3, 1)
In [177]:
%%latex
$\frac{\partial J}{\partial W^{(1)}}= \begin{bmatrix} \frac{ \partial J}{ \partial{W_{11}^{(1)}}} & \frac{ \partial J}{ \partial{W_{12}^{(1)}}} & \frac{ \partial J}{ \partial{W_{13}^{(1)}}}\\ \frac{ \partial J}{ \partial{W_{21}^{(1)}}} & \frac{ \partial J}{ \partial{W_{22}^{(1)}}} & \frac{ \partial J}{ \partial{W_{23}^{(1)}}} \end{bmatrix}$

$\frac{\partial J}{\partial W^{(2)}}= \begin{bmatrix} \frac{ \partial J}{ \partial{W_{11}^{(2)}}} \\ \frac{ \partial J}{ \partial{W_{21}^{(2)}}} \\ \frac{ \partial J}{ \partial{W_{31}^{(2)}}} \end{bmatrix}$

$\frac{\partial J}{\partial W^{(1)}}= \begin{bmatrix} \frac{ \partial J}{ \partial{W_{11}^{(1)}}} & \frac{ \partial J}{ \partial{W_{12}^{(1)}}} & \frac{ \partial J}{ \partial{W_{13}^{(1)}}}\\ \frac{ \partial J}{ \partial{W_{21}^{(1)}}} & \frac{ \partial J}{ \partial{W_{22}^{(1)}}} & \frac{ \partial J}{ \partial{W_{23}^{(1)}}} \end{bmatrix}$  $\frac{\partial J}{\partial W^{(2)}}= \begin{bmatrix} \frac{ \partial J}{ \partial{W_{11}^{(2)}}} \\ \frac{ \partial J}{ \partial{W_{21}^{(2)}}} \\ \frac{ \partial J}{ \partial{W_{31}^{(2)}}} \end{bmatrix}$
In [207]:
%%latex
$\textsf{sum rule in differentiation}$
$\frac{d(u + v)}{dx} = \frac{du}{dx} + \frac{dv}{dx}$

$\textsf{chain rule in differentiation}$
$(f \circ g)' = (f' \circ g) \cdot g'$

$\frac{dz}{dx} = \frac{dz}{dy} \cdot \frac{dy}{dx}$

$\textsf{sum rule in differentiation}$ $\frac{d(u + v)}{dx} = \frac{du}{dx} + \frac{dv}{dx}$  $\textsf{chain rule in differentiation}$ $(f \circ g)' = (f' \circ g) \cdot g'$  $\frac{dz}{dx} = \frac{dz}{dy} \cdot \frac{dy}{dx}$
In [229]:
%%latex

$\frac{\partial J}{\partial W^{(2)}} = \frac{\partial \sum{\frac{1}{2}(y - \hat{y})^2}}{\partial W^{(2)}} = \sum{\frac{\partial \frac{1}{2}(y - \hat{y})^2}{\partial W^{(2)}}}$

$\textsf{power rule and chain rule}$
$\begin{bmatrix} -y_{1} - \hat{y_{1}} \\ -y_{2} - \hat{y_{2}} \\ -y_{3} - \hat{y_{3}} \end{bmatrix} \begin{bmatrix} f'(z_{1}^{(3)}) \\ f'(z_{2}^{(3)}) \\ f'(z_{3}^{(3)}) \end{bmatrix} = \begin{bmatrix} \delta _{1}^{(3)} \\ \delta _{2}^{(3)} \\ \delta _{3}^{(3)} \end{bmatrix} = \delta ^{(3)}$
$(a^{(2)})^{T} \delta ^{(3)} = \begin{bmatrix} a_{11}^{(2)} & a_{21}^{(2)} & a_{31}^{(2)} \\ a_{12}^{(2)} & a_{22}^{(2)} & a_{32}^{(2)} \\ a_{13}^{(2)} & a_{23}^{(2)} & a_{33}^{(2)} \end{bmatrix} \begin{bmatrix} \delta _{1}^{(3)} \\ \delta _{2}^{(3)} \\ \delta _{3}^{(3)} \end{bmatrix} = \begin{bmatrix} a_{11}^{(2)} \delta _{1}^{(3)} + a_{21}^{(2)} \delta _{2}^{(3)} + a_{31}^{(2)} \delta _{3}^{(3)} \\ a_{12}^{(2)} \delta _{1}^{(3)} + a_{22}^{(2)} \delta _{2}^{(3)} + a_{32}^{(2)} \delta _{3}^{(3)} \\ a_{13}^{(2)} \delta _{1}^{(3)} + a_{23}^{(2)} \delta _{2}^{(3)} + a_{33}^{(2)} \delta _{3}^{(3)} \end{bmatrix}$

 $\frac{\partial J}{\partial W^{(2)}} = \frac{\partial \sum{\frac{1}{2}(y - \hat{y})^2}}{\partial W^{(2)}} = \sum{\frac{\partial \frac{1}{2}(y - \hat{y})^2}{\partial W^{(2)}}}$  $\textsf{power rule and chain rule}$ $\begin{bmatrix} -y_{1} - \hat{y_{1}} \\ -y_{2} - \hat{y_{2}} \\ -y_{3} - \hat{y_{3}} \end{bmatrix} \begin{bmatrix} f'(z_{1}^{(3)}) \\ f'(z_{2}^{(3)}) \\ f'(z_{3}^{(3)}) \end{bmatrix} = \begin{bmatrix} \delta _{1}^{(3)} \\ \delta _{2}^{(3)} \\ \delta _{3}^{(3)} \end{bmatrix} = \delta ^{(3)}$ $(a^{(2)})^{T} \delta ^{(3)} = \begin{bmatrix} a_{11}^{(2)} & a_{21}^{(2)} & a_{31}^{(2)} \\ a_{12}^{(2)} & a_{22}^{(2)} & a_{32}^{(2)} \\ a_{13}^{(2)} & a_{23}^{(2)} & a_{33}^{(2)} \end{bmatrix} \begin{bmatrix} \delta _{1}^{(3)} \\ \delta _{2}^{(3)} \\ \delta _{3}^{(3)} \end{bmatrix} = \begin{bmatrix} a_{11}^{(2)} \delta _{1}^{(3)} + a_{21}^{(2)} \delta _{2}^{(3)} + a_{31}^{(2)} \delta _{3}^{(3)} \\ a_{12}^{(2)} \delta _{1}^{(3)} + a_{22}^{(2)} \delta _{2}^{(3)} + a_{32}^{(2)} \delta _{3}^{(3)} \\ a_{13}^{(2)} \delta _{1}^{(3)} + a_{23}^{(2)} \delta _{2}^{(3)} + a_{33}^{(2)} \delta _{3}^{(3)} \end{bmatrix}$

simple linear relationship for:

• each hidden-layer to output-layer synapse: dW2/dZ3 = a2
• each hidden-layer to output-layer synapse: dZ3/da2 = W2
• each input-layer to hidden-layer synapse: dZ2/dW1 = X
In [234]:
%%latex
$\frac{\partial J}{\partial W^{(2)}} = (a^{(2)})^{T} \delta ^{(3)}$

$\delta ^{(3)} = -(y - \hat{y})f'(z^{(3)})$

$\frac{\partial J}{\partial W^{(1)}} = X^{T} \delta ^{(2)}$

$\delta ^{(2)} = \delta ^{(3)} (W^{(1)})^{T} f'(z^{(2)})$

$\frac{\partial J}{\partial W^{(2)}} = (a^{(2)})^{T} \delta ^{(3)}$  $\delta ^{(3)} = -(y - \hat{y})f'(z^{(3)})$  $\frac{\partial J}{\partial W^{(1)}} = X^{T} \delta ^{(2)}$  $\delta ^{(2)} = \delta ^{(3)} (W^{(1)})^{T} f'(z^{(2)})$
In [242]:
dJdW1, dJdW2 = NN.costFunctionPrime(X,y)
dJdW1, dJdW2

Out[242]:
(array([[ 0.02356812, -0.02571859,  0.05172803],
[ 0.02094194, -0.02520551,  0.03866296]]), array([[-0.03604346],
[-0.05906128],
[-0.09162464]]))
In [244]:
scalar = 3
NN.W1 = NN.W1 + scalar * dJdW1
NN.W2 = NN.W2 + scalar * dJdW2
NN.costFunction(X,y)

Out[244]:
array([ 0.48228807])

In [245]:
x = np.arange(-5, 5, 0.01)
plt.plot(x, x**2, 'b')
plt.grid(1)
plt.scatter(1.5, 1.5**2)

Out[245]:
<matplotlib.collections.PathCollection at 0x10fd7ab70>
In [251]:
def f(x):
return x**2
epsilon = 1e-4
x = 1.5

numericGradient = (f(x+epsilon) - f(x-epsilon))/(2 * epsilon)
# verification:

Out[251]:
(2.9999999999996696, 3.0)
In [253]:
NN.getParams()

Out[253]:
array([-0.10832181,  0.29157752, -2.32875267, -0.79858892,  0.77523413,
0.01438056,  0.45891691, -1.01902928,  0.55968482])
In [260]:
def computeNumericalGradients(N,X,y):
paramsInitial = N.getParams()
perturb = np.zeros(paramsInitial.shape)
e = 1e-4
for p in range(len(paramsInitial)):
# set perturbation vector
perturb[p] = e
N.setParams(paramsInitial + perturb)
loss1 = N.costFunction(X,y)
N.setParams(paramsInitial - perturb)
loss2 = N.costFunction(X,y)

numgrad[p] = (loss2 - loss1) / (2*e)

#reset
perturb[p] = 0
#reset
N.setParams(paramsInitial)


array([-0.0269274 , -0.00625294,  0.0535152 , -0.01712459, -0.00425763,
0.03704035,  0.0800578 ,  0.12394559,  0.0490642 ])
array([ 0.02535875,  0.00589676, -0.05122405,  0.01697504,  0.00424728,
-0.03729985, -0.07870872, -0.1205269 , -0.04933063])
Out[260]:
73.212293905780498

6 Training¶

BFGS Quasi-Newton optimization of SGD - leverages Hessian

In [266]:
from scipy import optimize

class Trainer(object):
def __init__(self,N):
self.N = N

def costFuncWrapper(self, params, X, y):
self.N.setParams(params)
cost = self.N.costFunction(X,y)

def callbackF(self, params):
self.N.setParams(params)
self.J.append(self.N.costFunction(self.X, self.y))

def train(self, X, y):
self.X = X
self.y = y
self.J = [] # costs
params0 = self.N.getParams()
options = {'maxiter': 200, 'disp': True}
# set jacobian
_res = optimize.minimize( \
self.costFuncWrapper, params0, jac=True, method='BFGS', args=(X,y), options=options, callback=self.callbackF)
self.N.setParams(_res.x)
self.optimizationResults = _res

In [280]:
NN = Neural_Network()
T = Trainer(NN)
T.train(X,y)
plt.plot(T.J)
plt.grid(1)
plt.ylabel('Cost')
plt.xlabel('Iterations')
#hmmmm ....

Warning: Desired error not necessarily achieved due to precision loss.
Current function value: 0.040577
Iterations: 3
Function evaluations: 96

Out[280]:
<matplotlib.text.Text at 0x111ef29b0>
In [281]:
NN.costFunctionPrime(X,y)

Out[281]:
(array([[-0.02306668, -0.05445037, -0.00996909],
[-0.01709984, -0.02921199, -0.00573783]]), array([[-0.00477078],
[-0.02088484],
[-0.00580673]]))
In [282]:
NN.forward(X), y

Out[282]:
(array([[ 0.96101082],
[ 0.96878685],
[ 0.8296233 ]]), array([[ 0.75],
[ 0.82],
[ 0.95]]))

7 Overfitting¶

In [284]:
# fit the signal, not the noise
# TODO: regularize the cost function