import math
import numpy as np
import matplotlib.pyplot as plt
import torch
import random
from graphviz import Digraph
def trace(root):
    """Collect every node and edge reachable from ``root``.

    The graph is walked by following each node's ``_prev`` collection.

    Args:
        root: Node to start from. It, and everything reachable through
            ``_prev``, must be hashable.

    Returns:
        A ``(nodes, edges)`` pair of sets. ``nodes`` holds every visited
        node; ``edges`` holds one ``(child, parent)`` tuple for each
        ``child in parent._prev`` relation.
    """
    nodes, edges = set(), set()
    # Explicit stack instead of recursion: a graph built from a long chain of
    # operations can be deeper than the interpreter's recursion limit.
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)
    return nodes, edges
def draw_dot(root):
    """Build a left-to-right graphviz ``Digraph`` of the graph behind ``root``.

    Every node found by ``trace(root)`` becomes a record-shaped box showing
    its label, data and grad. A node with a non-empty ``_op`` also gets a
    separate operation node that feeds into it, and each traced edge is drawn
    from the child into the parent's operation node.

    Args:
        root: Node whose graph should be drawn.

    Returns:
        The populated ``Digraph`` (SVG format).
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)

    for value in nodes:
        node_id = str(id(value))
        # One rectangular ('record') box per value in the graph.
        record = "{ %s | data %.4f | grad %.4f }" % (value.label, value.data, value.grad)
        graph.node(name=node_id, label=record, shape='record')
        if not value._op:
            continue
        # The value came out of an operation: add an op node and wire it in.
        op_id = node_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, node_id)

    for src, dst in edges:
        # Inputs point at the op node of the value they produced.
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph
class Value:
    """A scalar that records how it was computed so gradients can flow back.

    Arithmetic on ``Value`` objects builds a graph: every result keeps its
    operands in ``_prev`` and a ``_backward`` closure that adds the result's
    gradient contribution to those operands. Plain ``int``/``float`` operands
    are wrapped automatically, on either side of ``+``, ``-``, ``*`` and ``/``.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        """Create a node.

        Args:
            data: The scalar payload.
            _children: Operand nodes this value was computed from.
            _op: Short name of the producing operation ('' for leaves).
            label: Optional display name (used when drawing the graph).
        """
        self.data = data
        self.grad = 0.0                   # d(output)/d(self); filled in by backward()
        self._backward = lambda: None     # leaves have nothing to propagate
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Addition passes the gradient through unchanged to both operands.
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward
        return out

    def __radd__(self, other):
        # Reflected add: makes ``0 + Value`` work, which is what the builtin
        # ``sum()`` does with its default start value.
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Product rule: each operand's gradient is scaled by the other's data.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):
        return self * other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        # ``other - self`` where ``other`` is a plain number.
        return other + (-self)

    def __pow__(self, other):
        """Raise to a constant ``int``/``float`` power (``Value`` exponents are rejected)."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self,), f'**{other}')

        def _backward():
            # Power rule: d/dx x**k = k * x**(k-1).
            self.grad += other * self.data ** (other - 1) * out.grad

        out._backward = _backward
        return out

    def __truediv__(self, other):
        return self * other ** -1

    def __rtruediv__(self, other):
        # ``other / self`` where ``other`` is a plain number.
        return other * self ** -1

    def tanh(self):
        """Return ``tanh(self)`` as a new node."""
        # math.tanh saturates cleanly at +/-1; the explicit
        # (e^{2x} - 1) / (e^{2x} + 1) form raises OverflowError for large x.
        t = math.tanh(self.data)
        out = Value(t, (self,), 'tanh')

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)**2
            self.grad += (1 - t ** 2) * out.grad

        out._backward = _backward
        return out

    def exp(self):
        """Return ``e ** self`` as a new node."""
        out = Value(math.exp(self.data), (self,), 'exp')

        def _backward():
            # d/dx e**x = e**x, which is exactly out.data.
            self.grad += out.data * out.grad

        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node through everything it depends on.

        Sets ``self.grad`` to 1.0 and then runs every node's ``_backward`` in
        reverse topological order. Gradients on the other nodes are
        accumulated with ``+=`` and are not reset here, so calling this again
        without zeroing them first adds to whatever is already stored.
        """
        topo = []
        visited = {self}
        # Iterative post-order DFS (same visiting order as the recursive
        # formulation) so that very deep graphs do not exhaust the call stack.
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands handled: the node itself can be emitted.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
class Neuron:
    """One tanh unit computing ``tanh(w . x + b)``."""

    def __init__(self, nin):
        """Draw ``nin`` weights and then one bias uniformly from [-1, 1]."""
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Return the activation for the input sequence ``x``."""
        # Start from the bias and add one weighted input at a time.
        total = self.b
        for weight, feature in zip(self.w, x):
            total = total + weight * feature
        return total.tanh()
class Layer:
    """A group of neurons that all read the same input."""

    def __init__(self, nin, nout):
        """Create ``nout`` neurons, each taking ``nin`` inputs."""
        units = []
        for _ in range(nout):
            units.append(Neuron(nin))
        self.neurons = units

    def __call__(self, x):
        """Evaluate every neuron on ``x``.

        Returns the bare output when the layer has exactly one neuron,
        otherwise the list of outputs.
        """
        activations = [unit(x) for unit in self.neurons]
        if len(activations) == 1:
            return activations[0]
        return activations
class MLP:
    """A stack of layers applied one after another."""

    def __init__(self, nin, nouts):
        """Build the layers.

        Args:
            nin: Number of inputs to the first layer.
            nouts: List with the number of neurons in each successive layer.
        """
        widths = [nin] + nouts
        # Consecutive width pairs are (inputs, outputs) of each layer.
        self.layers = [Layer(fan_in, fan_out) for fan_in, fan_out in zip(widths, widths[1:])]

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation
# A single example with three features, matching the network's three inputs.
x = [2.0, 3.0, -1.0]
# 3 inputs -> two layers of 4 neurons -> 1 output neuron.
n = MLP(3,[4, 4, 1])
# Forward pass; the last layer has one neuron, so a single Value comes back.
n(x)
Value(data=-0.5900520440314407)
Below, we create an example dataset with four possible inputs to the neural network and four desired targets that we want it to hit.
We would like the neural net to output ys[0] when it is fed the first array, ys[1] for the second array, etc.
This is a very simple binary classifier neural net.
# Four training examples, three features each.
xs = [
[2.0, 3.0, -1.0],
[3.0, -1.0, 0.5],
[0.5, 1.0, 1.0],
[1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] #desired target, one per row of xs
# Forward pass of the network on every example.
ypred = [n(x) for x in xs]
ypred #outputs of neural net on the four examples -> predictions
[Value(data=-0.5900520440314407), Value(data=-0.6045680133303286), Value(data=-0.5654156624843023), Value(data=-0.49584526005845786)]
Notice the output for each of the four examples above. We want to take those values and move them towards the desired targets in `ys`. The trick is to calculate a single number that measures the performance of the neural net: the loss. Driving that number down is how we train the neural net to output the desired targets.
The loss is something we want to minimize. The loss function we will use today is the mean squared error (below we sum the squared errors instead of averaging them, which only rescales the loss by a constant factor).
This means we iterate over each ground truth `ygt` and each output `yout` in the `zip` of `ys, ypred`.
For each `ygt` and `yout` pair, we subtract them and square the result to measure how far the prediction is from the target. Squaring is good for the loss because it discards the sign of the difference: every term is non-negative, so errors in opposite directions cannot cancel each other out. Viewed as a function of the difference, each term is a parabola (bowl-shaped) whose minimum of 0 is reached exactly where prediction and target agree.
We could also take the `abs` (absolute value) to take away the negative.
In short, a term is 0 only when `yout == ygt`, i.e. when the prediction is exactly right. When our prediction is not the target we get some positive number. The more off we are, the greater the loss will be. We want low loss.
### Mean Squared Error:
# Per-example squared error between prediction and target (one Value each).
[(yout - ygt) ** 2 for ygt, yout in zip (ys, ypred)]
[Value(data=2.5282655027285625), Value(data=0.15636645608152316), Value(data=0.18886354641395786), Value(data=2.2375530420393557)]
# Wrap the targets in Value objects. This is optional for the subtraction
# itself: Value.__sub__ already wraps plain numbers on its right-hand side.
ys_value = [Value(y) for y in ys]
# Total loss = sum of the per-example squared errors, as a single Value.
# The explicit Value(0.0) start matters: sum() starts from the int 0 by
# default, so its first step is `0 + Value`, which needs a reflected add
# (__radd__) on Value.
loss = sum([(yout-ygt)**2 for ygt, yout in zip(ys_value,ypred)], Value(0.0))
#loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred)) relies on sum()'s default int start,
#i.e. on `0 + Value`; the float targets in ys are not the problem
loss
Value(data=5.111048547263399)
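Now, we want to make every one of our predictions equal to its target so the loss is 0.
The first step towards minimizing the loss is `loss.backward()`, which computes the gradient of the loss with respect to every value in the graph: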
# Backpropagate from the loss: fills in .grad on every Value it depends on.
loss.backward() #here's where the magic happens
#inspect the gradient of the first weight of the first neuron in the first layer
n.layers[0].neurons[0].w[0].grad
-3.2376296037647085
We can manually view the gradients of the individual weights in their neurons. Their sign tells us what happens to the loss when we adjust a weight one way or the other. We have this info for every parameter.
If the above gradient is negative, that is telling us that if we decrease the given weight, the loss will go up. Thus, if we increase the given weight, the loss will go down.
Further, this means that if a gradient is positive, increasing the given weight makes the loss go up. Thus, if we decrease it, the loss goes down. We can visualize what we have:
# Render the whole computation graph behind the loss (data and grad per node).
draw_dot(loss)
Since we want to decrease the loss, we have to change something that feeds into it. We could change our x values, but those are fixed, because that's our data. The input data is a given to the problem, even though we have gradients for it.
Some of the gradients, however, are for the neural network parameters (the `w`'s and `b`'s). These are what we want to change. We want to nudge all of the parameters of the MLP slightly in the direction that lowers the loss.
To do this, we'll want some code that gathers up all of the parameters of the neural network so we can operate on all of them simultaneously.
We'll create a `parameters` function that returns a list of `self.w` and `self.b`:
class Neuron:
    """One tanh unit computing ``tanh(w . x + b)``, with parameter access."""

    def __init__(self, nin):
        """Draw ``nin`` weights and then one bias uniformly from [-1, 1]."""
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Return the activation for the input sequence ``x``."""
        # Start from the bias and add one weighted input at a time.
        total = self.b
        for weight, feature in zip(self.w, x):
            total = total + weight * feature
        return total.tanh()

    def parameters(self):
        """Return a new list holding the weights followed by the bias.

        Mirrors PyTorch, where every module exposes ``parameters()``; here the
        parameters are scalars rather than tensors.
        """
        return [*self.w, self.b]
class Layer:
    """A group of neurons that all read the same input."""

    def __init__(self, nin, nout):
        """Create ``nout`` neurons, each taking ``nin`` inputs."""
        units = []
        for _ in range(nout):
            units.append(Neuron(nin))
        self.neurons = units

    def __call__(self, x):
        """Evaluate every neuron on ``x``.

        Returns the bare output when the layer has exactly one neuron,
        otherwise the list of outputs.
        """
        activations = [unit(x) for unit in self.neurons]
        if len(activations) == 1:
            return activations[0]
        return activations

    def parameters(self):
        """Return the parameters of all neurons, in neuron order, as one flat list."""
        # Equivalent to looping over the neurons and extending a result list.
        return [param for unit in self.neurons for param in unit.parameters()]
class MLP:
    """A stack of layers applied one after another, with parameter access."""

    def __init__(self, nin, nouts):
        """Build the layers.

        Args:
            nin: Number of inputs to the first layer.
            nouts: List with the number of neurons in each successive layer.
        """
        widths = [nin] + nouts
        # Consecutive width pairs are (inputs, outputs) of each layer.
        self.layers = [Layer(fan_in, fan_out) for fan_in, fan_out in zip(widths, widths[1:])]

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def parameters(self):
        """Return the parameters of every layer, in layer order, as one flat list."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected
Now that we've added this API, we reinitialize the network and test it with `n.parameters()`. Reinitializing draws new random weights, so the numbers in the network will change.
# Neural network: 3 inputs -> two layers of 4 neurons -> 1 output neuron
n = MLP(3, [4, 4, 1])
# Expect 41 = (3*4 + 4) + (4*4 + 4) + (4*1 + 1) weights and biases.
print("parameters count = ",len(n.parameters()))
# Flat list of every weight and bias, layer by layer.
n.parameters()
parameters count = 41
[Value(data=0.8087675862549752), Value(data=0.025263971658811668), Value(data=-0.8095723663106846), Value(data=0.8038607738814252), Value(data=-0.29638876451525076), Value(data=-0.03247232926147414), Value(data=-0.44561734574393985), Value(data=0.06311564914043521), Value(data=0.7567492970120597), Value(data=-0.3446175462832468), Value(data=-0.45540808178622627), Value(data=0.5760849545442048), Value(data=-0.3492314617459753), Value(data=0.8924325636035717), Value(data=-0.46505647936296013), Value(data=0.6149317705883368), Value(data=-0.5223518310937196), Value(data=-0.11791949089726761), Value(data=-0.8135156356899493), Value(data=0.9913436635820159), Value(data=-0.12121895507301739), Value(data=-0.862460039190883), Value(data=-0.805484213082718), Value(data=0.19498379419998968), Value(data=0.8092170416979758), Value(data=0.7535190900116837), Value(data=0.7004254761419886), Value(data=0.1838953755012176), Value(data=-0.3540157445416765), Value(data=0.561899056075303), Value(data=-0.3441460595011996), Value(data=0.2843677810237102), Value(data=0.9726194405716639), Value(data=-0.7131735718769536), Value(data=0.44689712470395704), Value(data=0.6053467884662802), Value(data=-0.9144193640506879), Value(data=-0.7613067493180077), Value(data=0.8626489273439784), Value(data=-0.8825822569824531), Value(data=0.3359075971184846)]
Now we can get all of the weights and biases in the network (41 parameters), which means we can change them.
# Same dataset as before: four examples with three features each.
xs = [
[2.0, 3.0, -1.0],
[3.0, -1.0, 0.5],
[0.5, 1.0, 1.0],
[1.0, 1.0, -1.0],
]
# Desired target for each row of xs.
ys = [1.0, -1.0, -1.0, 1.0]
# Predictions of the freshly initialized network
ypred = [n(x) for x in xs]
print("ypred = ", ypred)
# Convert ys elements to Value objects
ys_value = [Value(y) for y in ys]
# Calculate the loss as a single Value object (sum of squared errors);
# Value(0.0) is passed as sum()'s start value instead of the default int 0.
loss = sum([(yout - ygt) ** 2 for ygt, yout in zip(ys_value, ypred)], Value(0.0))
loss
ypred = [Value(data=0.09626161200328809), Value(data=0.8897672475073349), Value(data=-0.6677985165984013), Value(data=0.1312729219383994)]
Value(data=5.253007885422014)
# Backward pass: fills in .grad on every parameter that feeds into the loss.
loss.backward()
Now, to improve the loss, we'll get ahold of one parameter's data and grad so we can track changes:
# Value and gradient of the first weight of the first neuron in the first layer.
print("data: ", n.layers[0].neurons[0].w[0].data)
print("grad: ", n.layers[0].neurons[0].w[0].grad)
data: 0.8066425443125735 grad: 0.10625209712008757
Gradient descent is an optimization algorithm used to minimize the loss function. The key idea is to update the parameters of the model in the direction opposite to the gradient of the loss function with respect to the parameters. The gradient points in the direction of the steepest ascent, so moving in the opposite direction reduces the loss.
`n.layers[0].neurons[0].w[0].grad` is positive, so we will nudge that parameter in the negative direction to make the loss go down. This will be a tiny update in the gradient descent scheme.
Think of the gradient as a vector pointing in the direction of increased loss. We have to step the opposite way by modifying `p.data` by a small step size in the direction opposite to the gradient.
(-) loss<----d1---->d2 (+)    (loss goes down)
(-) loss<----d2<----d1 (+)    (loss goes up)
(-) d1---->d2---->loss (+)    (loss goes up)
(-) d2<----d1---->loss (+)    (loss goes down)
The general formula you can use is the parameter update rule: `p.data = p.data - lr * p.grad`, where:
- `p.data` is the current parameter value
- `lr` is the learning rate/step size (a small positive number, not to be confused with the network `n`)
- `p.grad` is the gradient of the loss with respect to the parameter
Now, let's use this rule to update and check our grad:
# One gradient-descent step: move every parameter a small distance (0.01)
# against its gradient.
for p in n.parameters():
    p.data -= 0.01 * p.grad
# Check: this weight's gradient was positive, so its data decreases, which
# pushes the loss down. Low loss means the predictions match the targets,
# so the network is now slightly closer to the targets.
print("data: ", n.layers[0].neurons[0].w[0].data)
print("grad: ", n.layers[0].neurons[0].w[0].grad)
data: 0.8055800233413727 grad: 0.10625209712008757
#reevaluate the loss by recalculating the forward pass of the network
#with the updated parameters... has the loss gone down???
ypred = [n(x) for x in xs]
ys_value = [Value(y) for y in ys]
# Same sum-of-squared-errors loss as before, rebuilt from the new predictions.
loss = sum([(yout - ygt) ** 2 for ygt, yout in zip(ys_value, ypred)], Value(0.0))
loss
Value(data=4.297124484535017)
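Now we can iterate this process and keep improving the loss by doing a forward pass, a backward pass, and then updating the parameters. (Before each new backward pass, reset every parameter's `grad` to 0.0: the gradients are accumulated with `+=`, so stale values from the previous step would otherwise be added in.)
As we keep iterating, the `ypred` values should keep getting closer to the targets: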
# Predictions from the most recent forward pass, to compare against ys.
ypred
[Value(data=0.6823520520073336), Value(data=0.90100678271322), Value(data=-0.31648409938487826), Value(data=0.6605836018167414)]
Experiment by continuing to do the forward pass, backward pass, and gradient descent. Keep testing your loss and if your predictions are becoming closer to your targets. Experiment with different step sizes. Try to get your predictions to be exactly at the targets!