#!/usr/bin/env python
# coding: utf-8

# Credits: Forked from [deep-learning-keras-tensorflow](https://github.com/leriomaggio/deep-learning-keras-tensorflow) by Valerio Maggio

# # Introduction to Deep Learning

# Deep learning allows computational models that are composed of multiple processing **layers** to learn representations of data with multiple levels of abstraction.
#
# These methods have dramatically improved the state-of-the-art in speech recognition, visual object recognition, object detection, and many other domains such as drug discovery and genomics.
#
# **Deep learning** is one of the leading tools in data analysis these days, and one of the most common frameworks for deep learning is **Keras**.
#
# This tutorial will provide an introduction to deep learning using `keras`, with practical code examples.

# # Artificial Neural Networks (ANN)

# In machine learning and cognitive science, an artificial neural network (ANN) is a network inspired by biological neural networks. It is used to estimate or approximate functions that can depend on a large number of inputs that are generally unknown.
#
# An ANN is built from nodes (neurons) stacked in layers between the feature vector and the target vector.
#
# A node in a neural network is built from weights and an activation function.
#
# An early version of an ANN, built from a single node, was called the **Perceptron**.
#
# The Perceptron is an algorithm for supervised learning of binary classifiers: functions that can decide whether an input (represented by a vector of numbers) belongs to one class or another.
#
# Much like in logistic regression, the weights of a neuron are multiplied by the input vector, summed up, and fed into the activation function.
#
# A Perceptron network can be designed to have *multiple layers*, leading to the **Multi-Layer Perceptron** (aka `MLP`).
#
# The weights of each neuron are learned by **gradient descent**, where each neuron's error is differentiated with respect to its weights.
#
# Optimization is done layer by layer, propagating the error from each layer back to the previous one, in a technique known as **backpropagation**.

# # Building Neural Nets from scratch

# ### Idea:
#
# We will build a neural network from first principles.
#
# We will create a very simple model and understand how it works. We will also implement the backpropagation algorithm.
#
# **Please note that this code is not optimized and not to be used in production**.
#
# It is for instructive purposes only, to help us understand how an ANN works.
#
# Libraries like `theano` provide highly optimized implementations.
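# Before diving into the full implementation, here is a minimal sketch of the single-neuron computation described above: the weights are multiplied by the input vector, summed up together with a bias, and fed into the sigmoid activation. The input, weight, and bias values below are made up purely for illustration.
#
# ```python
# import numpy as np
#
# x = np.array([0.5, -1.2])          # input (feature) vector
# w = np.array([0.8, 0.3])           # weights of the neuron
# b = 0.1                            # bias term
#
# z = np.dot(w, x) + b               # weighted sum of the inputs
# output = 1 / (1 + np.exp(-z))      # sigmoid activation
# print(output)                      # a value between 0 and 1
# ```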
# (*The following code is inspired from [these](https://github.com/dennybritz/nn-from-scratch) terrific notebooks*)

# In[1]:


# Import the required packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy


# In[2]:


# Display plots inline
get_ipython().run_line_magic('matplotlib', 'inline')
# Define the default figure size for plots
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)


# In[3]:


import random
random.seed(123)


# In[4]:


# read the dataset
train = pd.read_csv("data/intro_to_ann.csv")


# In[5]:


# the first two columns are the features, the third is the target
X, y = np.array(train.iloc[:, 0:2]), np.array(train.iloc[:, 2])


# In[6]:


X.shape


# In[7]:


y.shape


# In[8]:


# Let's plot the dataset and see what it looks like
plt.scatter(X[:, 0], X[:, 1], s=40, c=y, cmap=plt.cm.BuGn)


# ## Start building our ANN building blocks
#
# Note: This process will eventually result in our own Neural Network class

# ### A look at the details

# ### A function to generate a random number between two bounds
#
# **Where will it be used?**: When we initialize the neural network, the weights have to be randomly assigned.

# In[9]:


# calculate a random number where: a <= rand < b
def rand(a, b):
    return (b - a) * random.random() + a


# In[10]:


# Make an I x J matrix filled with the given value
def makeMatrix(I, J, fill=0.0):
    return np.full([I, J], fill)


# ### Define our activation function. Let's use the sigmoid function

# In[11]:


# our sigmoid function (tanh would be another common choice)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


# ### Derivative of our activation function
#
# Note: We need this when we run the backpropagation algorithm.

# In[12]:


# derivative of our sigmoid function, expressed in terms of the output y = sigmoid(x):
# sigmoid'(x) = y * (1 - y) = y - y**2
def dsigmoid(y):
    return y - y**2


# ### Our neural network class
#
# When we first create a neural network architecture, we need to know the number of input, hidden and output nodes.
#
# The weights have to be randomly initialized.
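# Before wrapping these helpers into a class, here is a quick illustrative check of the functions defined above: we fill a small weight matrix with random values in a chosen range, and we verify numerically that `dsigmoid` really is the derivative of `sigmoid` expressed in terms of its output. The matrix size and the test point are arbitrary choices for this sketch.
#
# ```python
# # a 3x2 weight matrix filled with random values in [-0.2, 0.2)
# wi = makeMatrix(3, 2)
# for i in range(3):
#     for j in range(2):
#         wi[i][j] = rand(-0.2, 0.2)
#
# # check the derivative at an arbitrary point x = 0.3
# x = 0.3
# y = sigmoid(x)
# numerical = (sigmoid(x + 1e-6) - sigmoid(x - 1e-6)) / 2e-6
# print(dsigmoid(y), numerical)   # the two values should be very close
# ```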
# ```python
# class ANN:
#     def __init__(self, ni, nh, no):
#         # number of input, hidden, and output nodes
#         self.ni = ni + 1 # +1 for bias node
#         self.nh = nh
#         self.no = no
#
#         # activations for nodes
#         self.ai = [1.0]*self.ni
#         self.ah = [1.0]*self.nh
#         self.ao = [1.0]*self.no
#
#         # create weights
#         self.wi = makeMatrix(self.ni, self.nh)
#         self.wo = makeMatrix(self.nh, self.no)
#
#         # set them to random values
#         for i in range(self.ni):
#             for j in range(self.nh):
#                 self.wi[i][j] = rand(-0.2, 0.2)
#         for j in range(self.nh):
#             for k in range(self.no):
#                 self.wo[j][k] = rand(-2.0, 2.0)
#
#         # last change in weights for momentum
#         self.ci = makeMatrix(self.ni, self.nh)
#         self.co = makeMatrix(self.nh, self.no)
# ```

# ### Activation Function

# ```python
#     def activate(self, inputs):
#
#         if len(inputs) != self.ni-1:
#             print(inputs)
#             raise ValueError('wrong number of inputs')
#
#         # input activations
#         for i in range(self.ni-1):
#             self.ai[i] = inputs[i]
#
#         # hidden activations
#         for j in range(self.nh):
#             sum_h = 0.0
#             for i in range(self.ni):
#                 sum_h += self.ai[i] * self.wi[i][j]
#             self.ah[j] = sigmoid(sum_h)
#
#         # output activations
#         for k in range(self.no):
#             sum_o = 0.0
#             for j in range(self.nh):
#                 sum_o += self.ah[j] * self.wo[j][k]
#             self.ao[k] = sigmoid(sum_o)
#
#         return self.ao[:]
# ```

# ### BackPropagation

# ```python
#     def backPropagate(self, targets, N, M):
#
#         if len(targets) != self.no:
#             print(targets)
#             raise ValueError('wrong number of target values')
#
#         # calculate error terms for output
#         output_deltas = np.zeros(self.no)
#         for k in range(self.no):
#             error = targets[k]-self.ao[k]
#             output_deltas[k] = dsigmoid(self.ao[k]) * error
#
#         # calculate error terms for hidden
#         hidden_deltas = np.zeros(self.nh)
#         for j in range(self.nh):
#             error = 0.0
#             for k in range(self.no):
#                 error += output_deltas[k]*self.wo[j][k]
#             hidden_deltas[j] = dsigmoid(self.ah[j]) * error
#
#         # update output weights
#         for j in range(self.nh):
#             for k in range(self.no):
#                 change = output_deltas[k] * self.ah[j]
#                 self.wo[j][k] += N*change + M*self.co[j][k]
#                 self.co[j][k] = change
#
#         # update input weights
#         for i in range(self.ni):
#             for j in range(self.nh):
#                 change = hidden_deltas[j]*self.ai[i]
#                 self.wi[i][j] += N*change + M*self.ci[i][j]
#                 self.ci[i][j] = change
#
#         # calculate error
#         error = 0.0
#         for k in range(len(targets)):
#             error += 0.5*(targets[k]-self.ao[k])**2
#         return error
# ```

# In[13]:


# Putting it all together
class ANN:

    def __init__(self, ni, nh, no):
        # number of input, hidden, and output nodes
        self.ni = ni + 1 # +1 for bias node
        self.nh = nh
        self.no = no

        # activations for nodes
        self.ai = [1.0]*self.ni
        self.ah = [1.0]*self.nh
        self.ao = [1.0]*self.no

        # create weights
        self.wi = makeMatrix(self.ni, self.nh)
        self.wo = makeMatrix(self.nh, self.no)

        # set them to random values
        for i in range(self.ni):
            for j in range(self.nh):
                self.wi[i][j] = rand(-0.2, 0.2)
        for j in range(self.nh):
            for k in range(self.no):
                self.wo[j][k] = rand(-2.0, 2.0)

        # last change in weights for momentum
        self.ci = makeMatrix(self.ni, self.nh)
        self.co = makeMatrix(self.nh, self.no)

    def backPropagate(self, targets, N, M):

        if len(targets) != self.no:
            print(targets)
            raise ValueError('wrong number of target values')

        # calculate error terms for output
        output_deltas = np.zeros(self.no)
        for k in range(self.no):
            error = targets[k]-self.ao[k]
            output_deltas[k] = dsigmoid(self.ao[k]) * error

        # calculate error terms for hidden
        hidden_deltas = np.zeros(self.nh)
        for j in range(self.nh):
            error = 0.0
            for k in range(self.no):
                error += output_deltas[k]*self.wo[j][k]
            hidden_deltas[j] = dsigmoid(self.ah[j]) * error
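        # The weight updates below implement gradient descent with momentum:
        # each weight changes by N * (delta * activation), the current gradient
        # step, plus M * (the previous change), which smooths the updates
        # across successive iterations.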
        # update output weights
        for j in range(self.nh):
            for k in range(self.no):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] += N*change + M*self.co[j][k]
                self.co[j][k] = change

        # update input weights
        for i in range(self.ni):
            for j in range(self.nh):
                change = hidden_deltas[j]*self.ai[i]
                self.wi[i][j] += N*change + M*self.ci[i][j]
                self.ci[i][j] = change

        # calculate error
        error = 0.0
        for k in range(len(targets)):
            error += 0.5*(targets[k]-self.ao[k])**2
        return error

    def test(self, patterns):
        self.predict = np.empty([len(patterns), self.no])
        for i, p in enumerate(patterns):
            self.predict[i] = self.activate(p)

    def activate(self, inputs):

        if len(inputs) != self.ni-1:
            print(inputs)
            raise ValueError('wrong number of inputs')

        # input activations
        for i in range(self.ni-1):
            self.ai[i] = inputs[i]

        # hidden activations
        for j in range(self.nh):
            sum_h = 0.0
            for i in range(self.ni):
                sum_h += self.ai[i] * self.wi[i][j]
            self.ah[j] = sigmoid(sum_h)

        # output activations
        for k in range(self.no):
            sum_o = 0.0
            for j in range(self.nh):
                sum_o += self.ah[j] * self.wo[j][k]
            self.ao[k] = sigmoid(sum_o)

        return self.ao[:]

    def train(self, patterns, iterations=1000, N=0.5, M=0.1):
        # N: learning rate
        # M: momentum factor
        patterns = list(patterns)
        for i in range(iterations):
            error = 0.0
            for p in patterns:
                inputs = p[0]
                targets = p[1]
                self.activate(inputs)
                error += self.backPropagate([targets], N, M)
            if i % 5 == 0:
                print('error in iteration %d : %-.5f' % (i, error))
        print('Final training error: %-.5f' % error)


# ### Running the model on our dataset

# In[14]:


# create a network with two input nodes, one hidden node, and one output node
ann = ANN(2, 1, 1)

get_ipython().run_line_magic('timeit', '-n 1 -r 1 ann.train(zip(X,y), iterations=2)')


# ### Predicting on the training dataset and measuring in-sample accuracy

# In[15]:


get_ipython().run_line_magic('timeit', '-n 1 -r 1 ann.test(X)')


# In[16]:


prediction = pd.DataFrame(data=np.array([y, np.ravel(ann.predict)]).T,
                          columns=["actual", "prediction"])
prediction.head()


# In[17]:


np.min(prediction.prediction)


# ### Let's visualize and observe the results

# In[18]:


# Helper function to plot a decision boundary.
# This generates the contour plot to show the decision boundary visually
def plot_decision_boundary(nn_model):
    # Set min and max values and give it some padding
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    nn_model.test(np.c_[xx.ravel(), yy.ravel()])
    Z = nn_model.predict
    Z[Z >= 0.5] = 1
    Z[Z < 0.5] = 0
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], s=40, c=y, cmap=plt.cm.BuGn)


# In[19]:


plot_decision_boundary(ann)
plt.title("Our initial model")


# **Exercise**:
#
# Create a neural network with 10 hidden nodes using the code above.
#
# What's the impact on accuracy?

# In[20]:


# Put your code here
# (or load the solution if you want to cheat :-)


# In[21]:


# %load solutions/sol_111.py
ann = ANN(2, 10, 1)

get_ipython().run_line_magic('timeit', '-n 1 -r 1 ann.train(zip(X,y), iterations=2)')

plot_decision_boundary(ann)
plt.title("Our next model with 10 hidden units")


# **Exercise:**
#
# Train the neural network for more epochs (iterations).
#
# What's the impact on accuracy?
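# The notebook only inspects accuracy visually through the decision boundary. As a possible sketch for quantifying it, using only the objects already defined above, you could threshold the network outputs at 0.5 (as `plot_decision_boundary` does) and compare them with the labels:
#
# ```python
# ann.test(X)
# predicted_labels = (np.ravel(ann.predict) >= 0.5).astype(int)
# accuracy = np.mean(predicted_labels == y)
# print("in-sample accuracy: %.3f" % accuracy)
# ```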
# In[22]:


# Put your code here


# In[23]:


# %load solutions/sol_112.py
ann = ANN(2, 10, 1)

get_ipython().run_line_magic('timeit', '-n 1 -r 1 ann.train(zip(X,y), iterations=100)')

plot_decision_boundary(ann)
plt.title("Our model with 10 hidden units and 100 iterations")


# # Addendum
#
# There is an additional notebook in the repo, i.e. [A simple implementation of ANN for MNIST](1.4 (Extra) A Simple Implementation of ANN for MNIST.ipynb), with a *naive* implementation of **SGD** and an **MLP** applied to the **MNIST** dataset.
#
# It accompanies the online text http://neuralnetworksanddeeplearning.com/, which is highly recommended.