Engineers typically use a modern deep neural network library like PyTorch in production. We can do the same thing with the PyTorch API.
import math
import numpy as np
import matplotlib.pyplot as plt
class Value:
    """Scalar that records the operations applied to it for reverse-mode autodiff.

    Every arithmetic result remembers its operands (``_prev``), the operator
    that produced it (``_op``) and a closure (``_backward``) that pushes the
    result's gradient back onto the operands using the chain rule.
    Plain ints/floats are accepted on either side of ``+ - * /``.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0  # d(root)/d(self); filled in by backward()
        self._backward = lambda: None  # leaf nodes have nothing to propagate
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return self + other; ``other`` may be a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # += (not =) so a node that is used more than once accumulates
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other):
        """Return other + self; fallback when the left operand is a plain number.

        This is also what lets the built-in ``sum()`` start from its default 0.
        """
        return self + other

    def __mul__(self, other):
        """Return self * other; ``other`` may be a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Return other * self; fallback when the left operand is a plain number."""
        return self * other

    def __neg__(self):
        """Return -self, implemented as multiplication by -1."""
        return self * -1

    def __sub__(self, other):
        """Return self - other, implemented as addition of the negation."""
        return self + (-other)

    def __rsub__(self, other):
        """Return other - self; fallback when the left operand is a plain number."""
        return (-self) + other

    def __pow__(self, other):
        """Return self ** other for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            # power rule: d/dx x**n = n * x**(n-1), chained with out.grad.
            # self.data is a plain number here, not a Value.
            self.grad += other * self.data ** (other - 1) * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other):
        """Return self / other, implemented as self * other**-1."""
        return self * other**-1

    def __rtruediv__(self, other):
        """Return other / self; fallback when the left operand is a plain number."""
        return (self**-1) * other

    def tanh(self):
        """Return tanh(self) as a new Value."""
        # math.tanh saturates to +/-1.0 for large |x|, whereas the explicit
        # (e^{2x} - 1) / (e^{2x} + 1) form overflows in math.exp.
        t = math.tanh(self.data)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)**2
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        """Return e**self as a new Value (e is the base of the natural logarithm)."""
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')

        def _backward():
            # d/dx e**x = e**x, which is exactly out.data
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node, accumulating ``grad`` on every ancestor.

        Nodes are ordered topologically (operands before results) and then
        visited in reverse so each node's gradient is complete before it is
        pushed further back. Gradients accumulate across calls; they are not
        reset here.
        """
        topo = []
        visited = set()
        # Explicit stack instead of recursion so long expression chains do not
        # exceed Python's recursion limit. The flag marks a node whose
        # operands have all been scheduled and which can now be emitted.
        stack = [(self, False)]
        while stack:
            node, expanded = stack.pop()
            if expanded:
                topo.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            stack.append((node, True))
            for child in node._prev:
                if child not in visited:
                    stack.append((child, False))
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
from graphviz import Digraph
def trace(root):
    """Collect every node reachable from ``root`` through ``_prev`` and every operand->result edge.

    Returns a ``(nodes, edges)`` pair of sets; each edge is a
    ``(operand, result)`` tuple.
    """
    seen = set()
    links = set()

    def visit(node):
        # each node is expanded only once, even if it feeds several results
        if node in seen:
            return
        seen.add(node)
        for operand in node._prev:
            links.add((operand, node))
            visit(operand)

    visit(root)
    return seen, links
def draw_dot(root):
    """Build a left-to-right graphviz ``Digraph`` (SVG) of the expression graph ending at ``root``.

    Each value becomes a record-shaped node showing its label, data and grad.
    Each value produced by an operation additionally gets a small operator
    node that sits between its operands and the value itself.
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)

    for value in nodes:
        value_id = str(id(value))
        record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
        graph.node(name=value_id, label=record, shape='record')
        if not value._op:
            continue  # leaf value: no operator node needed
        # operator node is named after the value it produces, plus the op symbol
        op_id = value_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, value_id)

    for operand, result in edges:
        # operands point at the operator node of the value they produce
        graph.edge(str(id(operand)), str(id(result)) + result._op)

    return graph
import torch
#This is a tensor:
torch.Tensor([[1,2,3],[4,5,6]])
tensor([[1., 2., 3.], [4., 5., 6.]])
#Check its shape:
torch.Tensor([[1,2,3],[4,5,6]]).shape
torch.Size([2, 3])
By default, Python uses double precision for its floating-point numbers (64-bit floats, i.e. float64), whereas torch.Tensor creates single-precision (float32) tensors. You can cast a tensor to double so it matches what Python is expecting:
#single precision
torch.Tensor([2.0]).dtype
torch.float32
#cast to double precision: the default float32 tensor is converted to float64
torch.Tensor([2.0]).double().dtype
torch.float64
PyTorch automatically assumes that leaf nodes we declare don't require gradients. By default, requires_grad is set to false for efficiency because you wouldn't usually want gradients for leaf nodes as input to the network. We can explicitly say all nodes require gradients, though.
Now, we will construct scalar valued, one element tensors. Once we have defined all values, we can perform arithmetic on the tensors.
#just like in micrograd, these tensor objects have a .data and a .grad...
#NOTE(review): torch.Tensor(...) builds a float32 tensor (see the dtype check above), so .double()
#widens a value that was already rounded to single precision; presumably that is why the printed
#result is 0.70710669... rather than the full double-precision 0.70710678... -- confirm
x1 = torch.Tensor([2.0]).double() ; x1.requires_grad = True
x2 = torch.Tensor([0.0]).double() ; x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double() ; w1.requires_grad = True
w2 = torch.Tensor([1.0]).double() ; w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]).double() ; b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)
print(o.data.item()) #prints the forward pass; .item() takes a one-element tensor and returns the bare number
o.backward() #runs the backward pass; the gradients are then read from .grad below
#print the gradients computed by the backward pass
print('---')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())
0.7071066904050358 --- x2 0.5000001283844369 w2 0.0 x1 -1.5000003851533106 w1 1.0000002567688737
PyTorch can do what we did in micrograd, as a special case when your tensors are all single-element tensors.
With PyTorch, everything is much more efficient because we're working with tensor objects, and many operations can run in parallel on these tensors. Everything we've built agrees with the PyTorch API.
o #tensor object, has a backward function just like what we implemented
tensor([0.7071], dtype=torch.float64, grad_fn=<TanhBackward0>)
o.item() #same as o.data.item()
0.7071066904050358
#all of the variables have a .grad
x1.grad.item() #-> grad is a tensor, pop out number with .item()
-1.5000003851533106
Neural networks are a specific class of mathematical expressions. We will start building out a neural network piece by piece, and eventually build out a two-layer Multi-Layer Perceptron (MLP).
Let's start with a single individual neuron. We'll make our above neuron subscribe to PyTorch's API and its specific neural network modules.
Now we're going to define a layer of neurons. Look up a schematic of a Multi-Layer Perceptron (MLP) and notice how there are multiple layers, each with multiple neurons. The neurons within a layer are not connected to each other; instead, each one is connected to all of the inputs. A layer of neurons is a set of neurons evaluated independently.
We define a constructor and __call__(self,x).
For the forward pass, we need to multiply all of the elements of w with all of the elements of x, pair-wise. To visualize this, we write list(zip(self.w,x)) after we have initialized our x's and n, where the x's are the input data and n is the Neuron we're putting them in. Inside the __call__ function, zip gives us an iterator over tuples of corresponding entries, (self.w[3], x[3]) for example.
import random
class Neuron:
    """Exploratory neuron: holds random weights and a bias, and only prints its inputs when called."""

    def __init__(self, nin):
        # nin = number of inputs to the neuron.
        # One weight per input, each a random number between -1 and 1 ...
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        # ... and a bias that controls the overall trigger happiness of the neuron.
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # Invoked as n(x). The real computation will be w * x + b (a dot product);
        # for now just show which weight is paired with which input.
        pairs = list(zip(self.w, x))
        print(pairs)
x = [2.0, 3.0] #two input values, one per weight
n = Neuron(2) #initialize a neuron with 2 inputs
n(x) #feed the nums into the neuron
[(Value(data=-0.891937987300732), 2.0), (Value(data=-0.7191340580027186), 3.0)]
Now that you can visualize it, we'll compute the raw activation, which is the dot product of the weights and the inputs (the sum of the products of all of the weight/input tuples) plus the bias. After we create that, we need to pass it through the non-linearity, so we call tanh() on it.
Test the code below. Notice how you get a different answer each time because we initialize different weights and biases each time.
class Neuron:
    """A single tanh neuron with ``nin`` random weights and one random bias."""

    def __init__(self, nin):
        # weights are drawn first, then the bias, all uniformly from [-1, 1]
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # Raw activation: start from the bias and add each weight * input
        # product in turn, i.e. the dot product of self.w and x plus self.b.
        total = self.b
        for weight, value in zip(self.w, x):
            total = total + weight * value
        # squash through the non-linearity
        return total.tanh()
x = [2.0, 3.0] #two input values, one per weight
n = Neuron(2) #initialize a neuron with 2 inputs; its weights and bias are random
n(x) #feed the nums into the neuron; returns the tanh activation
Value(data=0.9702820556902484)
class Neuron:
    """One neuron: ``nin`` weights and a bias drawn from [-1, 1], with a tanh output."""

    def __init__(self, nin):
        self.w = []
        for _ in range(nin):
            self.w.append(Value(random.uniform(-1, 1)))
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # bias first, then accumulate the pairwise weight * input products
        activation = self.b
        for w_i, x_i in zip(self.w, x):
            activation = activation + w_i * x_i
        return activation.tanh()
class Layer:
    """One layer of an MLP: a list of neurons that all see the same input.

    nin  = number of inputs to each neuron in the layer
    nout = number of neurons in the layer
    """

    def __init__(self, nin, nout):
        # build nout independent Neuron objects, each taking nin inputs
        neurons = []
        for _ in range(nout):
            neurons.append(Neuron(nin))
        self.neurons = neurons

    def __call__(self, x):
        # apply x to every neuron in the layer
        results = []
        for neuron in self.neurons:
            results.append(neuron(x))
        # a single-neuron layer returns the bare output instead of a list
        if len(results) == 1:
            return results[0]
        return results
x = [2.0, 3.0] #two input values
n = Layer(2, 3) #-> a layer of 3 neurons, each taking 2 inputs
n(x) #feed x to the layer n; returns one output per neuron
[Value(data=-0.15572956132854174), Value(data=0.9984237412005323), Value(data=-0.9748622220216048)]
Next we define the MLP. It will take a list of nouts (instead of a single nout), which defines the sizes of all the layers we want in our MLP.
class Neuron:
    """Tanh neuron over ``nin`` inputs; weights and bias start as random values in [-1, 1]."""

    def __init__(self, nin):
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # pairwise products first, then sum them on top of the bias
        products = [w_i * x_i for w_i, x_i in zip(self.w, x)]
        pre_activation = sum(products, self.b)
        return pre_activation.tanh()
class Layer:
    """A set of ``nout`` neurons, each with ``nin`` inputs, evaluated on the same input."""

    def __init__(self, nin, nout):
        # nin = num inputs to each neuron in the layer
        # nout = number of neurons in the layer
        self.neurons = []
        for _ in range(nout):
            self.neurons.append(Neuron(nin))

    def __call__(self, x):
        outputs = list(map(lambda neuron: neuron(x), self.neurons))
        # only a one-neuron layer is unwrapped to a bare output
        if len(outputs) != 1:
            return outputs
        return outputs[0]
class MLP:
    """Multi-layer perceptron: a chain of Layer objects applied one after another.

    nin   = number of inputs to the MLP
    nouts = list with the number of neurons in each subsequent layer

    Sample input: nin = 3; nouts = [4, 4, 1]; x = [2.0, 3.0, -1.0]
    """

    def __init__(self, nin, nouts):
        # All sizes in one list: sizes[0] is the input width, sizes[1] the
        # number of neurons in the first layer, sizes[2] in the second, etc.
        # Sample: [3] + [4, 4, 1] = [3, 4, 4, 1]
        sizes = [nin] + nouts
        # Each consecutive pair (inputs, neurons) defines one layer.
        # Sample: [Layer(3, 4), Layer(4, 4), Layer(4, 1)], where Layer(3, 4)
        # is a list of 4 Neuron objects with 3 inputs each.
        self.layers = [Layer(fan_in, fan_out) for fan_in, fan_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        # the output of each layer becomes the input of the next one
        signal = x
        for layer in self.layers:
            signal = layer(signal)
        # whatever the last layer produced is the network's output
        return signal
x = [2.0, 3.0, -1.0] #three dimensional input
n = MLP(3, [4, 4, 1]) #three inputs into two layers of four neurons and a one-neuron output layer
n(x) #forward pass through all three layers
Value(data=-0.6707963942601958)
draw_dot(n(x)) #draw the expression graph of the entire mlp forward pass. this will be huge :)
The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.