Notebook

Notebook 5: PyTorch ¶

Now, we will build the same forward and backward pass, but all with PyTorch!¶

Engineers typically use a modern deep neural network library like PyTorch in production We can do the same thing with the PyTorch API.

In [26]:

import math
import numpy as np
import matplotlib.pyplot as plt

In [27]:

class Value:
  def __init__(self, data, _children=(), _op='', label=''):
    self.data = data
    self.grad = 0.0
    self._backward = lambda: None
    self._prev = set(_children)
    self._op = _op
    self.label = label

  def __repr__(self):
    return f"Value(data={self.data})"
  
  def __add__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')
      
    def _backward():
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad
    out._backward = _backward
    
    return out

  def __mul__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')
    
    def _backward():
      self.grad += other.data * out.grad
      other.grad += self.data * out.grad
    out._backward = _backward
      
    return out

  def __neg__(self): #-self
      return self * -1

  def __sub__(self, other): #self-other; implement thru addition by negation, mult by -1 for the negation (what we've built)
      return self + (-other)
      
  def __pow__(self, other): #self to the pow of other
      assert isinstance(other, (int, float)), "only supporting int/float powers for now"
      out = Value(self.data**other, (self,),f'**{other}')

      def _backward(): #what's the chain rule for backprop thru the power function, where power is power of some kind of constant
        self.grad += other * self.data ** (other -1) * out.grad
                    #other * self.data ** (other -1) is the local derivative only, but then have to chain it by mult by out.grad
                            #self.data is an int or a float, not a Value obj, just accessing .data prop
        #to do the above exercises, go to the derivative rules
      out._backward = _backward
      return out

  def __rmul__(self,other): #other * self; fallback for python not being able to do num * self, check if rmul in value, call it reverse
      return self * other

  def __truediv__(self, other): #self/other
      return self*other**-1
      
  def tanh(self):
    x = self.data
    t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
    out = Value(t, (self, ), 'tanh')
    
    def _backward():
      self.grad += (1 - t**2) * out.grad
    out._backward = _backward
    
    return out

  def exp(self): #mirrors tanh; inputs, transforms, and outputs a single scalar value
      x = self.data
      out = Value(math.exp(x), (self, ), 'exp')

      #how do you backpropogate through e^x We need to know the local deriv of e^x. D/dx of e^x is e^x
      #eturns E raised to the power of x (Ex).
        #'E' is the base of the natural system of logarithms (approximately 2.718282) and x is the number passed to it.
      def _backward():
          self.grad += out.data * out.grad
      out._backward = _backward
      return out
      
  def backward(self):
    topo = []
    visited = set()
    def build_topo(v):
      if v not in visited:
        visited.add(v)
        for child in v._prev:
          build_topo(child)
        topo.append(v)
    build_topo(self)
    
    self.grad = 1.0
    for node in reversed(topo):
      node._backward()

In [28]:

from graphviz import Digraph

def trace(root):
  # builds a set of all nodes and edges in a graph
  nodes, edges = set(), set()
  def build(v):
    if v not in nodes:
      nodes.add(v)
      for child in v._prev:
        edges.add((child, v))
        build(child)
  build(root)
  return nodes, edges

def draw_dot(root):
  dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
  
  nodes, edges = trace(root)
  for n in nodes:
    uid = str(id(n))
    # for any value in the graph, create a rectangular ('record') node for it
    dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
    if n._op:
      # if this value is a result of some operation, create an op node for it
      dot.node(name = uid + n._op, label = n._op)
      # and connect this node to it
      dot.edge(uid + n._op, uid)

  for n1, n2 in edges:
    # connect n1 to the op node of n2
    dot.edge(str(id(n1)), str(id(n2)) + n2._op)

  return dot

The micrograd we have already built is a scalar valued engine, which means it can only take in scalar values, like Value(2.0). In PyTorch, everything is based around tensors, which are n-dimensional arrays of scalars. Thus, we need a scalar valued tensor.¶

In [29]:

import torch

In [37]:

#This is a tensor:
torch.Tensor([[1,2,3],[4,5,6]])

Out[37]:

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [38]:

#Check its shape:
torch.Tensor([[1,2,3],[4,5,6]]).shape

Out[38]:

torch.Size([2, 3])

By default, python by default uses double precision for its floating points, which is byte size float(64). You can cast a tensor to double so it matches what python is expecting:

In [42]:

#single precision
torch.Tensor([2.0]).dtype

Out[42]:

torch.float32

In [43]:

#casted to double precision so float 32 (default) is cast to float 64
torch.Tensor([2.0]).double().dtype

Out[43]:

torch.float64

PyTorch automatically assumes that leaf nodes we declare don't require gradients. By default, requires_grad is set to false for efficiency because you wouldn't usually want gradients for leaf nodes as input to the network. We can explicitly say all nodes require gradients, though.

Now, we will construct scalar valued, one element tensors. Once we have defined all values, we can perform arithmetic on the tensors.

In [46]:

#just like in micrograd, these tensor objects have a .data and a .grad...
x1 = torch.Tensor([2.0]).double()               ; x1.requires_grad = True
x2 = torch.Tensor([0.0]).double()               ; x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double()              ; w1.requires_grad = True
w2 = torch.Tensor([1.0]).double()               ; w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]).double() ; b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

print(o.data.item()) #.item() takes a single tensor of one element and returns element, stripping out the tensor

o.backward() #prints forward pass.

#prints gradients/backwards pass
print('---')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())

0.7071066904050358
---
x2 0.5000001283844369
w2 0.0
x1 -1.5000003851533106
w1 1.0000002567688737

PyTorch can do what we did in micrograd, as a special case when your tensors are all single-element tensors.

With PyTorch, everything is much more efficient because we're working with tensor objects. Many operations can work in parallel on these tensors. Everything we've built agrees with API of PyTorch.

In [47]:

o #tensor object, has a backward function just like what we implemented

Out[47]:

tensor([0.7071], dtype=torch.float64, grad_fn=<TanhBackward0>)

In [49]:

o.item()   #same as o.data.item()

Out[49]:

0.7071066904050358

In [50]:

#all of the variables have a .grad
x1.grad.item() #-> grad is a tensor, pop out number with .item()

Out[50]:

-1.5000003851533106

Now that we have some machinery to build pretty complicated mathematical expressions in a neuron, we can begin building out are neural network.¶

Neural networks are a specific class of mathematic expressions. We will start building out a neural netwrok piece by piece, and eventually build out a two layer Multi-Layer Perceptron.

Let's start with a single individual neuron. We'll make our above neuron subscribe to PyTorch's API and its specific neural network modules.

Now, we're going to define a layer on neurons... So look up schematic for MLP. MultiLayer Perceptron. Notice how there are multiple layers with multiple neurons. The neurons are not connected to each other, but are rather connected to all of the inputs. A layer of neurons is a set of neurons evaluated independently.

Below, we initialize the Neuron class.¶

We define a constructor and __call__(self,x).

For the forward pass, we need to multiply all of the elements of w with all of the elements of x, pair-wise.

To visualize this, we write: list(zip(self.w,x)) after we have intialized our x's and n, where x's are the self.data's, and n is the Neuron we're putting them in.

The __call__ function will give us a zip that will make two iterators that iterate over tuples of corresponding entries (self.w[3], x[3]), for example.

In [105]:

import random

In [108]:

class Neuron:
    #constructor takes number of inputs to the neuron, 
    #creates a weight of some random number between -1 and 1,
    #and a bias that controls trigger happinesss of the neuron
    def __init__(self,nin):  
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1,1))

    #w * x + b -> w*x is a dot product.                     
    def __call__(self,x): #this will be called with n(x). 
        print(list(zip(self.w,x)))

x = [2.0, 3.0]
n = Neuron(2) #initialize a 2 dimensional neuron
n(x) #feed the nums into the neuron

[(Value(data=-0.891937987300732), 2.0), (Value(data=-0.7191340580027186), 3.0)]

Now that you can visualize it, we'll create the raw activation function, which is a sum of the dot products of all of the tuples of the weights and data. After we create that, we need to pass it through the non-linearity, so we call tanh() on it.

Test the code below. Notice how you get a different answer each time because we initialize different weights and biases each time.

In [115]:

class Neuron:
    def __init__(self,nin):  
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1,1))

    def __call__(self,x): 

        #It computes the weighted sum of the inputs plus the bias (act), 
        #using the dot product of weights self.w and inputs x, starting the sum with self.b.
    
        #create raw activation function:
        #sum the product for all elements of w and all elements of x (pairs)
        #by default, builds a sum on top of 0.0, so we start with self.b instead (optional param)
        act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
        
        #pass through non-linearity
        out = act.tanh()
        
        return out

x = [2.0, 3.0]
n = Neuron(2) #initialize a 2 dimensional neuron 
n(x) #feed the nums into the neuron 

Out[115]:

Value(data=0.9702820556902484)

Now, we'll define a layer of neurons, which will contain all of the neurons connected to the previous set of neuron(s) as input and connected to the next set of neuron(s) as output.¶

In [117]:

class Neuron:
    def __init__(self,nin): 
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1,1))

    def __call__(self,x): 
        act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)  
        out = act.tanh()
        return out

class Layer: #A layer is a list of neurons, one one layer in an MLP

    #nin = num inputs to each each neuron in the layer
    #nout = number of neurons in the layer

    #init a list of self.neurons containing nout Neuron objects, each with nin inputs
    def __init__(self, nin, nout): 
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons] #apply x to each neuron in the layer
        
        #returns output of each neuron as a list. If only one neuron, returns single output
        return outs[0] if len(outs) == 1 else outs 
        
x = [2.0, 3.0]
n = Layer(2, 3) #-> two dimensional neurons, 3 of them
n(x) #feed x to Layer(n)

Out[117]:

[Value(data=-0.15572956132854174),
 Value(data=0.9984237412005323),
 Value(data=-0.9748622220216048)]

Now, let's define our MLP. This will encapsulate all of the layers of the neurons in our neural network.¶

It will take a list of nouts (instead of single nout), which defines the sizes of all layers we want in our MLP

In [120]:

class Neuron:
    def __init__(self,nin): 
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1,1))

    def __call__(self,x): 
        act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)  
        out = act.tanh()
        return out

class Layer: 
    def __init__(self, nin, nout): 
        #nin = num inputs to each each neuron in the layer
        #nout = number of neurons in the layer
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs 

class MLP: #sample input: nin = 3; nouts = [4,4,1]; x = [2.0, 3.0, -1.0]
    
    #nin = num inputs to MLP
    #nouts = list where each element is number of neurons in each subsequent layer of MLP
    #sz combines nin and nouts into a single list, where:
        #sz[1] is the num of neurons in the first layer, 
        #sz[2] is num of neurons in second layer, etc.
    
    def __init__(self, nin, nouts): 
        
        sz = [nin] + nouts #sample input: sz = [3] + [4,4,1] = [3,4,4,1]

        #uses list comprehension to create list objects.
        #pairs consecutive elements in sz, creating Layer objects from input size to the 
        #number of neurons, then from one layer's neuron to the next
        
        #iterate over consecutive pairs of these sizes and create layer objects for them
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
        #sample input: for each i in range(3), create Layer object with sz[i] = nin and sz[i+1] = nout
            #creates self.Layers = [Layer(3,4), Layer(4,4), Layer(4,1)].
            #ex: Layer(3,4) -> list of 4 Neuron obj with 3 inputs each (nin=3)
    
    def __call__(self, x):
        for layer in self.layers: #input x is passed through each layer in sequence
            x = layer(x)          #each layer processes input and produces output, which is input for next layer
        return x                  #return result of last layer's processing
    

x = [2.0, 3.0, -1.0]   #three dimensional input
n = MLP(3, [4, 4, 1])  #three inputs into two layers of four and one output
n(x)

Out[120]:

Value(data=-0.6707963942601958)

In [123]:

draw_dot(n(x)) #print the entire mlp. this will be huge :)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.

Out[123]:

We just made a huge Multi-Layer Perceptron with PyTorch, using the same forward pass and backward pass principles from before. Now, we'll make a dataset and see how this would be run at a larger scale.¶