import math
import numpy as np
import matplotlib.pyplot as plt
from graphviz import Digraph
def trace(root):
    """Walk the computation graph backwards from `root`, collecting every
    node and every (child, parent) edge reachable through `_prev` links."""
    nodes, edges = set(), set()
    stack = [root]
    while stack:
        node = stack.pop()
        if node in nodes:
            continue  # already visited
        nodes.add(node)
        for child in node._prev:
            edges.add((child, node))
            stack.append(child)
    return nodes, edges
def draw_dot(root):
    """Render the computation graph rooted at `root` as a graphviz Digraph,
    laid out left-to-right, with one record node per Value and one small
    node per operation."""
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right
    nodes, edges = trace(root)
    for node in nodes:
        uid = str(id(node))
        # each Value becomes a rectangular ('record') node: label | data | grad
        dot.node(
            name=uid,
            label="{ %s | data %.4f | grad %.4f }" % (node.label, node.data, node.grad),
            shape='record',
        )
        if node._op:
            # this value came from an operation: add an op node feeding into it
            op_uid = uid + node._op
            dot.node(name=op_uid, label=node._op)
            dot.edge(op_uid, uid)
    for src, dst in edges:
        # connect src to the op node of dst (ops are what consume values)
        dot.edge(str(id(src)), str(id(dst)) + dst._op)
    return dot
class Value:
    """A scalar with reverse-mode autograd support.

    Wraps a single float `data`, remembers the operation and operands that
    produced it (`_op`, `_prev`), and accumulates d(output)/d(self) into
    `grad` when `backward()` is called on a downstream result.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0                      # d(loss)/d(self); accumulated, so callers must zero it between passes
        self._backward = lambda: None        # closure that pushes out.grad into the children's grads
        self._prev = set(_children)          # operand Values that produced this one
        self._op = _op                       # op symbol, for graph drawing
        self.label = label                   # optional human-readable name

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            # d(a+b)/da = d(a+b)/db = 1, so the gradient passes through unchanged
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other):
        # other + self (e.g. 2 + Value); mirrors __rmul__ — addition commutes
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            # d(a*b)/da = b, d(a*b)/db = a
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        # other * self (e.g. 2 * Value); multiplication commutes
        return self * other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        # other - self (e.g. 2 - Value); relies on __radd__ above
        return other + (-self)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self,), f'**{other}')
        def _backward():
            # power rule: d(x^n)/dx = n * x^(n-1)
            self.grad += other * self.data ** (other - 1) * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other):
        # a / b expressed as a * b**-1 so __pow__ supplies the gradient
        return self * other ** -1

    def __rtruediv__(self, other):
        # other / self (e.g. 2 / Value)
        return other * self ** -1

    def tanh(self):
        """Hyperbolic tangent, the squashing nonlinearity used by Neuron."""
        x = self.data
        t = (math.exp(2 * x) - 1) / (math.exp(2 * x) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            # d(tanh x)/dx = 1 - tanh(x)^2
            self.grad += (1 - t ** 2) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        """Natural exponential e**self."""
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')
        def _backward():
            # d(e^x)/dx = e^x, which is exactly out.data
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Run backpropagation from this node: topologically sort the graph,
        seed self.grad = 1, then apply each node's local _backward in
        reverse topological order."""
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)  # appended only after all children — ensures topological order
        build_topo(self)
        self.grad = 1.0  # d(self)/d(self) = 1
        for node in reversed(topo):
            node._backward()
import random
class Neuron:
    """A single tanh neuron with `nin` inputs: out = tanh(w . x + b)."""

    def __init__(self, nin):
        # weights and bias initialized uniformly in [-1, 1]
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        # weighted sum of inputs plus bias, squashed through tanh
        pre_activation = sum((weight * inp for weight, inp in zip(self.w, x)), self.b)
        return pre_activation.tanh()

    def parameters(self):
        # Named after PyTorch's nn.Module.parameters(), which returns the
        # param tensors of every module; for us, the param scalars.
        return self.w + [self.b]
class Layer:
    """A fully-connected layer: `nout` independent neurons over the same `nin` inputs."""

    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [neuron(x) for neuron in self.neurons]
        # single-output layers return the bare Value rather than a 1-element list
        return outs[0] if len(outs) == 1 else outs

    def parameters(self):
        # flatten all neuron parameters into one list
        return [p for neuron in self.neurons for p in neuron.parameters()]
class MLP:
    """A multi-layer perceptron: layers sized [nin] + nouts, applied in sequence."""

    def __init__(self, nin, nouts):
        sizes = [nin] + nouts
        # consecutive size pairs define each layer's (inputs, outputs)
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        # feed the input through each layer in turn
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
# Let's reinitialize the neural net from scratch: 3 inputs, two hidden
# layers of 4 neurons, 1 output.
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)  # sanity-check forward pass on one example
# Notebook cell output pasted into the source by mistake — it executed as a
# throwaway expression, so it is commented out here:
# Value(data=0.9144691472967574)

# Training set: four examples of three inputs each, with desired targets.
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0]
A common bug: forgetting to zero_grad() before .backward().
Every time we go backward, we now reset the gradients first, because backpropagation accumulates (+=) into the gradient. Each parameter carries both its .data and its .grad (which starts at 0).
Before, when we ran backpropagation and updated the gradients, we were not flushing the previous gradient, so successive passes kept adding (+=) onto each other. You need to flush .grad and reset it to 0, or else the gradient will accumulate quickly and give you huge step sizes.
In the first few passes in previous notebooks, ours actually worked despite the bugs in the code. It's important to pay attention to these and know that sometimes the code will work but not be correct!
In complex problems, we won't get away with this bug — the loss will not optimize well. Below, we fix it.
for step in range(15):
    # forward pass: predict every example, then sum the squared errors
    ypred = [n(x) for x in xs]
    targets = [Value(y) for y in ys]
    loss = sum([(out - tgt) ** 2 for tgt, out in zip(targets, ypred)], Value(0.0))

    # backward pass — flush .grad to 0 first; this makes descent slower,
    # which is why the step size below is larger
    for p in n.parameters():
        p.grad = 0.0
    loss.backward()

    # gradient descent update: move each parameter against its gradient
    for p in n.parameters():
        p.data += -.10 * p.grad

    print(step, loss.data)  # print step and loss
# notice how the loss gets extremely low and ypred gets extremely close to targets
0 0.00021680770832094827 1 0.0002166101690439338 2 0.00021641297626310104 3 0.00021621612908318135 4 0.00021601962661196804 5 0.00021582346796028453 6 0.00021562765224198208 7 0.00021543217857393666 8 0.00021523704607599964 9 0.00021504225387103675 10 0.00021484780108487405 11 0.00021465368684630507 12 0.00021445991028707823 13 0.00021426647054186964 14 0.0002140733667482915
ypred
[Value(data=0.9921185717777462), Value(data=-0.9923511277429776), Value(data=-0.9923635441111361), Value(data=0.9938300746501169)]
Neural networks are mathematical expressions, very simple in the case of Multi-Layer Perceptrons (MLPs). They take input data together with the network's parameters (the weights and biases) and perform computations.
This is followed by the loss function, which tries to measure the accuracy of the predictions. Usually, the loss is low when predictions match targets or when the network is behaving well.
The goal is to adjust the parameters so as to minimize the loss; when the loss is low, the network performs as desired on the given problem.
We then perform the backward pass to calculate the gradients using backpropagation. These gradients are used to tune all the parameters to decrease the loss locally. This process is iterated many times, which is known as gradient descent.
By following the gradient information, we minimize the loss. When the loss is minimized, the network behaves as intended.
Neural networks can perform arbitrary tasks. While the example network had 41 parameters, more complex networks can have billions of parameters, resembling a massive blob of neural tissue.
Neural networks can solve extremely complex problems and exhibit fascinating emergent properties. For instance, in the case of GPT, the network is trained on a massive amount of text from the internet to predict the next word in a sequence. This training leads to amazing properties, although it involves billions of parameters.
Despite the complexity, the fundamental principles remain the same. Evaluating the gradient and performing gradient descent are consistent. People may use slightly different stochastic gradient descent (SGD) updates, and the loss function may be cross-entropy loss instead of mean squared error for predicting the next token, but the training setup is fundamentally similar.