import numpy as np
from utils.lr_utils import load_data, plot_training
def initialize_parameters(layers_dims):
'''
Arguments:
layers_dims -- a list of dimensions of each layer of our network
Returns:
parameters -- a dictionary containing weights and biases of the network
'''
parameters = {}
for l in range(len(layers_dims)-1):
parameters['W' + str(l+1)] = np.random.randn(layers_dims[l+1], layers_dims[l]) * 0.01
parameters['b' + str(l+1)] = np.zeros([layers_dims[l+1], 1])
return parameters
def relu(X):
return np.maximum(X, 0)
def sigmoid(X):
return 1 / (1 + np.exp(-1 * X))
def forward_propogation(A, parameters):
'''
Implement the forward propogation in the network
Arguments:
A -- input to the network
parameters -- a dictionary containing weights and biases of the network
Returns:
A -- Post activation value of the last layer
caches -- cache of all activation values, required for backpropogation
'''
L = len(parameters) // 2 # no. of layers
caches = {}
for l in range(L):
W = parameters['W' + str(l+1)]
b = parameters['b' + str(l+1)]
Z = np.dot(W, A) + b
A = relu(Z) if l<L-1 else sigmoid(Z) # relu in hidden layers and sigmoid in output layer
caches['A' + str(l+1)] = A
caches['Z' + str(l+1)] = Z
return A, caches
def compute_cost(AL, Y):
"""
Implement the cost function for the network
Arguments:
AL -- probability vector corresponding to the label predictions, shape (1, number of examples)
Y -- true "label" vector, shape (1, number of examples)
Returns:
cost -- cross-entropy cost
"""
m = Y.shape[1]
cost = (- 1 / m) * (np.dot(Y, np.log(AL.T)) + np.dot((1 - Y), np.log(1 - AL.T)))
cost = np.squeeze(cost)
assert(cost.shape == ())
return cost
def sigmoid_backward(grad_A, A):
return grad_A * A * (1 - A)
def relu_backward(grad_A, Z):
grad_A[Z<=0] = 0
return grad_A
def backward_propogation(X, Y, AL, caches, parameters):
'''
Implement Backpropogation
Arguments:
Al -- Activations of last layer
Y -- True labels of data
caches -- dictionary containing values of A and Z of each layer
parameters -- dictionary containing parameters of the network
Returns
grads -- dictionary containing gradients of the network parameters
'''
grads = {}
m = Y.shape[1]
L = len(parameters) // 2
grad_A = 1/m * (np.divide(1 - Y, 1 - AL) - np.divide(Y, AL))
grad_Z = sigmoid_backward(grad_A, caches['A' + str(L)])
for l in reversed(range(1, L)):
grads['W' + str(l+1)] = np.dot(grad_Z, caches['A' + str(l)].T)
grads['b' + str(l+1)] = np.sum(grad_Z, axis=1, keepdims=True)
assert(grads['W' + str(l+1)].shape == parameters['W' + str(l+1)].shape)
assert(grads['b' + str(l+1)].shape == parameters['b' + str(l+1)].shape)
grad_A = np.dot(parameters['W' + str(l+1)].T, grad_Z)
grad_Z = relu_backward(grad_A, caches['Z' + str(l)])
#for first layer
grads['W1'] = np.dot(grad_Z, X.T)
grads['b1'] = np.sum(grad_Z, axis=1, keepdims=True)
assert(grads['W1'].shape == parameters['W1'].shape)
assert(grads['b1'].shape == parameters['b1'].shape)
return grads
def update_parameters(parameters, grads, learning_rate):
'''
Update parameters of the network using gradient descent
Arguments:
paramters -- dictionary containing weights and biases of the network
grads -- dictionary containing the gradients of the parameters
learning_rate -- rate of gradient descent
Returns
parameters -- dictionary containing updated parameters
'''
L = len(parameters)//2
for l in reversed(range(L-1)):
parameters['W'+str(l+1)] -= learning_rate * grads['W' + str(l+1)]
parameters['b'+str(l+1)] -= learning_rate * grads['b' + str(l+1)]
return parameters
def Model(X, Y, X_val, Y_val, layers_dims, epochs, learning_rate):
parameters = initialize_parameters(layers_dims)
costs_train = []
costs_val = []
for epoch in range(epochs+1):
AL, caches = forward_propogation(X, parameters)
cost = compute_cost(AL, Y)
grads = backward_propogation(X, Y, AL, caches, parameters)
parameters = update_parameters(parameters, grads, learning_rate)
# compute validation cost
AL_val, _ = forward_propogation(X_val, parameters)
cost_val = compute_cost(AL_val, Y_val)
costs_train.append(cost)
costs_val.append(cost_val)
if epoch%10 == 0:
print('Epoch:', epoch, 'Cost: %0.3f' % cost, '- Val Cost: %0.3f' % cost_val)
plot_training(costs_train, costs_val)
return parameters
'''
Load MNIST dataset for 0 and 1 digits only. Find the code in utils/lr.py
'''
train, val = load_data()
X_train, Y_train = train
X_val, Y_val = val
print(X_train.shape, Y_train.shape, X_val.shape, Y_val.shape)
(784, 10610) (1, 10610) (784, 2055) (1, 2055)
# hyper-parameters, let's take these values for our example!
epochs = 200
learning_rate = 0.5
layers_dims = [784, 512, 128, 1]
parameters = Model(X_train, Y_train, X_val, Y_val, layers_dims, epochs, learning_rate)
Epoch: 0 Cost: 0.693 - Val Cost: 0.693 Epoch: 10 Cost: 0.688 - Val Cost: 0.687 Epoch: 20 Cost: 0.674 - Val Cost: 0.672 Epoch: 30 Cost: 0.637 - Val Cost: 0.630 Epoch: 40 Cost: 0.556 - Val Cost: 0.542 Epoch: 50 Cost: 0.449 - Val Cost: 0.433 Epoch: 60 Cost: 0.351 - Val Cost: 0.336 Epoch: 70 Cost: 0.263 - Val Cost: 0.250 Epoch: 80 Cost: 0.188 - Val Cost: 0.179 Epoch: 90 Cost: 0.135 - Val Cost: 0.129 Epoch: 100 Cost: 0.101 - Val Cost: 0.096 Epoch: 110 Cost: 0.079 - Val Cost: 0.075 Epoch: 120 Cost: 0.064 - Val Cost: 0.061 Epoch: 130 Cost: 0.054 - Val Cost: 0.052 Epoch: 140 Cost: 0.046 - Val Cost: 0.044 Epoch: 150 Cost: 0.041 - Val Cost: 0.039 Epoch: 160 Cost: 0.036 - Val Cost: 0.035 Epoch: 170 Cost: 0.033 - Val Cost: 0.031 Epoch: 180 Cost: 0.030 - Val Cost: 0.029 Epoch: 190 Cost: 0.028 - Val Cost: 0.026 Epoch: 200 Cost: 0.026 - Val Cost: 0.024