tanh() function.
!pip install graphviz
!export PATH="/usr/local/opt/graphviz/bin:$PATH"
Requirement already satisfied: graphviz in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (0.20.3)
import matplotlib.pyplot as plt
import numpy as np
import math
from graphviz import Digraph #graphviz is open-source visualization software; we build the expression graph through its Python API.
def trace(root): #helper function that enumerates the nodes and edges in the graph
# builds a set of all nodes and edges in a graph
nodes, edges = set(), set()
def build(v):
if v not in nodes:
nodes.add(v)
for child in v._prev:
edges.add((child, v))
build(child)
build(root)
return nodes, edges
def draw_dot(root): #draws the graph; operation nodes are display-only (not Value objects)
dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
nodes, edges = trace(root) #call trace
for n in nodes:
uid = str(id(n))
# for any value in the graph, create a rectangular ('record') node for it
dot.node(name = uid, label = "{ %s | data %.4f | grad % .4f}" % (n.label, n.data, n.grad), shape='record')
if n._op:
# if this value is a result of some operation, create an op node for it. Not a value object
dot.node(name = uid + n._op, label = n._op)
# and connect this node to it
dot.edge(uid + n._op, uid)
for n1, n2 in edges:
# connect n1 to the op node of n2
dot.edge(str(id(n1)), str(id(n2)) + n2._op)
return dot
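For reference, draw_dot() returns a graphviz Digraph object. A usage sketch (assuming you already have a root Value called root):
dot = draw_dot(root) #returns a Digraph
dot #a notebook renders the returned Digraph inline
dot.render('graph') #or write graph.svg to disk, since format='svg'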
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._prev = set(_children)
self._backward = lambda : None
self._op = _op
self.label=label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad = 1.0 * out.grad
other.grad = 1.0 * out.grad
out._backward = _backward
return out
def __mul__(self, other):
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad = other.data * out.grad
other.grad = self.data * out.grad
out._backward = _backward
return out
def tanh(self):
x = self.data
t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad = (1-t**2) * out.grad
out._backward = _backward
return out
#define backward() without the underscore.
def backward(self): #Build the topological graph starting at self
topo = [] #topological list where we populate the order
visited = set() #maintain a set of visited nodes
def build_topo(v): #start at the root node
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0 #initialize root.grad
for node in reversed(topo):
node._backward() #call _backward() on every node in reverse topological order (backpropagation through all of the children)
#finish it: run the full backward pass on the earlier neuron example
o.backward()
draw_dot(o)
Try running the following code:
a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
draw_dot(b)
a funnels two arrows on top of each other into b in the forward pass, but what do you notice about the gradient? The derivative of b with respect to a should be 2, or 1 + 1; that's the chain rule. Instead, a's gradient is 1.0.
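Writing it out: b = a + a, so db/da = 1 + 1 = 2. Each use of a should contribute 1 to the gradient, and the contributions should add up.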
We are going to run into this issue every time we use a variable more than once in an expression. This is because in the Value class, we accidentally overwrite the gradients of self and other when we set their values. Since self and other both point to the same object a, the write to other.grad overwrites the write to self.grad.
def __add__(self, other):
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad = 1.0 * out.grad #a.grad = 1.0 * 1.0
other.grad = 1.0 * out.grad #a.grad = 1.0 * 1.0 (resetting a.grad)
out._backward = _backward
return out
We can see this in a more complicated example. In the example below, the gradients deposited through d overwrite the gradients deposited through e, so d's contributions are correct but e's are lost. For you, it might be the other way around, depending on the order the nodes are processed, but either way it won't work.
a = Value(-2.0, label='a')
b = Value(3.0, label= 'b')
d = a * b; d.label = 'd'
e = a + b; e.label = 'e'
f = d * e; f.label = 'f'
f.backward()
draw_dot(f)
If we look at the multivariable case of the chain rule on Wikipedia, the solution is simple: we accumulate the gradients. Instead of setting a gradient with =, we use += so that each branch deposits its gradient, even when the branches share a variable. This way the contributions add on top of one another.
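For the example above, with f = d * e, d = a * b, and e = a + b, the multivariable chain rule reads: df/da = (df/dd)*(dd/da) + (df/de)*(de/da). Each branch contributes one term and the terms are summed, which is exactly what += implements.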
We reinitialize the Value class with an accumulating _backward(). The change is shown in __add__ below, and __mul__ and tanh() get the same treatment:
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._prev = set(_children)
self._backward = lambda : None
self._op = _op
self.label=label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad += 1.0 * out.grad #doesn't reset anymore
other.grad += 1.0 * out.grad #doesn't reset anymore
out._backward = _backward
return out
def __mul__(self, other):
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def tanh(self):
x = self.data
t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad += (1-t**2) * out.grad
out._backward = _backward
return out
#define backward() without the underscore.
def backward(self): #Build the topological graph starting at self
topo = [] #topological list where we populate the order
visited = set() #maintain a set of visited nodes
def build_topo(v): #start at the root node
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0 #initialize root.grad
for node in reversed(topo):
node._backward() #call _backward() on every node in reverse topological order (backpropagation)
#try again
a = Value(3.0, label='a')
b = a + a; b.label = 'b'
b.backward()
draw_dot(b) #a.grad should be 2
#try again
a = Value(-2.0, label='a')
b = Value(3.0, label= 'b')
d = a * b; d.label = 'd'
e = a + b; e.label = 'e'
f = d * e; f.label = 'f'
f.backward()
draw_dot(f) #a.grad should be 3 + -6 = -3; b.grad should be -2 + -6 = -8
#this way each separate branch can deposit their gradients on top of each other
Remember how we said earlier that we could have implemented tanh() as a series of explicit expressions using the exp() function? We built it as a single function because we knew its derivative and could backpropagate through it, but here we will implement it as a composition of exponentiation, subtraction, division, etc. Remember the original tanh() function:
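tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)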
Let's test to see where we can improve functionality. What happens when we add a number to a Value object?
a = Value(2.0)
#a + 1 and a * 1 raise an AttributeError, since we are adding an int (non-Value) to a Value object and the int has no .data
We solve this by reinitializing the Value class with the following line, which checks whether other is an instance of Value; if it isn't, we assume other is a plain number and wrap it in a Value object. We do this in both __add__ and __mul__:
other = other if isinstance(other, Value) else Value(other)
Now, after updating this, we still have a problem. a + 2 and a * 2 would work, but would 2 + a or 2 * a?
It would not. Python knows how to do a.__mul__(2), but not 2.__mul__(a); it knows how to do self * other, but not other * self.
Fortunately, we can define an __rmul__, which Python falls back to when it can't perform the given operation. You can do the same with __radd__. They swap the order of the operands so Python can compute the operation.
def __radd__(self, other):
return self + other
def __rmul__(self, other):
return self * other
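For example, 2 * a first tries (2).__mul__(a); the int type doesn't know what to do with a Value and returns NotImplemented, so Python falls back to a.__rmul__(2), which simply computes a * 2.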
Now, we update the Value class and test:
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._prev = set(_children)
self._backward = lambda : None
self._op = _op
self.label=label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
other = other if isinstance(other, Value) else Value(other) # new for a + 1
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad += 1.0 * out.grad
other.grad += 1.0 * out.grad
out._backward = _backward
return out
def __mul__(self, other):
other = other if isinstance(other, Value) else Value(other) #new for a * 1
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def tanh(self):
x = self.data
t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad += (1-t**2) * out.grad
out._backward = _backward
return out
def __radd__(self, other): #new for 1 + a
return self + other
def __rmul__(self, other): #new for 1 * a
return self * other
def backward(self):
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0
for node in reversed(topo):
node._backward()
#test should work:
a = Value(2.0)
a + 1
a * 3
3 + a
2 * a
Value(data=4.0)
Now that we can add and multiply both ways, let's add an exp() method to the Value class. It will mirror tanh() in that it takes in, transforms, and outputs a single scalar value.
In exp(), we are trying to compute e^x, i.e. e^(self.data). The issue, after we calculate e^x in the code, is how we're going to backpropagate through it: we need to know the local derivative of e^x. Since d/dx e^x = e^x, we can multiply it by out.grad with the chain rule to backpropagate. The method returns e^x, where e is the base of the natural logarithm (approximately 2.718282) and x is the number passed in, i.e. self.data.
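Concretely, if out = e^x, then the chain rule gives self.grad += (d e^x / dx) * out.grad = e^x * out.grad = out.data * out.grad, which is exactly what the _backward() below does.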
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._prev = set(_children)
self._backward = lambda : None
self._op = _op
self.label=label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad += 1.0 * out.grad
other.grad += 1.0 * out.grad
out._backward = _backward
return out
def __mul__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def tanh(self):
x = self.data
t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad += (1-t**2) * out.grad
out._backward = _backward
return out
def __radd__(self, other):
return self + other
def __rmul__(self, other):
return self * other
def exp(self): #trying to get e^self.data
x = self.data #hold the value
out = Value(math.exp(x), (self, ), 'exp') #set out to e^x
def _backward(): #since the derivative of e^x is e^x, we can just use out, since out.data is e^x
self.grad += out.data * out.grad #then we multiply out.data by out.grad (chain rule)
out._backward = _backward
return out
def backward(self):
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0
for node in reversed(topo):
node._backward()
#let's test it:
a = Value(4.0)
a.exp()
Value(data=54.598150033144236)
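As a quick extra sanity check on the backward pass (not part of the original walkthrough), the gradient of exp() at its input should equal its output, since d/dx e^x = e^x:
a = Value(4.0)
b = a.exp()
b.backward()
print(a.grad, b.data) #both should be about 54.598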
Now, we want to divide. To do this, we can express division as a special case of a power function, because:
a / b = a * (1/b) = a * b**-1
We make a power function:
def __pow__(self, other): #self to the power of other
#the exponent must be a plain int or float (a constant), not a Value object
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
out = Value(self.data**other, (self,),f'**{other}')
#what's the chain rule for backprop through the power function, where the exponent is a constant?
def _backward():
#local derivative * out.grad
self.grad += other * self.data ** (other - 1) * out.grad
out._backward = _backward
return out
Why is this the local derivative? If you look up derivative rules, you will come across the power rule: d/dx (x^n) = n * x^(n-1)
To use this, we'll add the following, self-explanatory function:
def __truediv__(self, other): #self/other; calls self.__mul__(other.__pow__(-1))
return self*other**-1
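To see why this gives us division's gradients too: with g = a / b = a * b**-1, __mul__ contributes the local derivative b**-1 for a, and __pow__ (chained through __mul__) contributes a * (-1) * b**-2 = -a / b**2 for b, which matches the quotient rule.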
We add it to the Value class and test:
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._prev = set(_children)
self._backward = lambda : None
self._op = _op
self.label=label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad += 1.0 * out.grad
other.grad += 1.0 * out.grad
out._backward = _backward
return out
def __mul__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def tanh(self):
x = self.data
t = (math.exp(2*x)-1)/(math.exp(2*x)+1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad += (1-t**2) * out.grad
out._backward = _backward
return out
def __radd__(self, other):
return self + other
def __rmul__(self, other):
return self * other
def exp(self): #trying to get e^self.data
x = self.data #hold the value
out = Value(math.exp(x), (self, ), 'exp') #set out to e^x
def _backward(): #since the derivative of e^x is e^x, we can just use out, since out.data is e^x
self.grad += out.data * out.grad #then we multiply out.data by out.grad (chain rule)
out._backward = _backward
return out
def __pow__(self, other): #self to the pow of other
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
out = Value(self.data**other, (self,),f'**{other}')
def _backward():
self.grad += other * self.data ** (other -1) * out.grad
out._backward = _backward
return out
def __truediv__(self, other): #self/other
return self*other**-1
def backward(self):
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0
for node in reversed(topo):
node._backward()
e = Value(-2.0)
f = Value(4.0)
e / f
Value(data=-0.5)
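We can also sanity-check the gradients of division (an extra check, not part of the original walkthrough):
e = Value(-2.0)
f = Value(4.0)
g = e / f
g.backward()
print(e.grad, f.grad) #d(e/f)/de = 1/f = 0.25, d(e/f)/df = -e/f**2 = 0.125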
Now, we'll give it the subtract functionality. I'm sure you get the idea at this point. Instead of self - other, we implement self + (-other) (addition of the negation):
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._backward = lambda: None
self._prev = set(_children)
self._op = _op
self.label = label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad += 1.0 * out.grad
other.grad += 1.0 * out.grad
out._backward = _backward
return out
def __mul__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def __neg__(self): #new : negate self
return self * -1
def __sub__(self, other): #self-other
return self + (-other)
def __pow__(self, other):
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
out = Value(self.data**other, (self,),f'**{other}')
def _backward():
self.grad += other * self.data ** (other -1) * out.grad
out._backward = _backward
return out
def __radd__(self, other): #carried over so plain int + Value still works
return self + other
def __rmul__(self,other):
return self * other
def __truediv__(self, other):
return self*other**-1
def tanh(self):
x = self.data
t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad += (1 - t**2) * out.grad
out._backward = _backward
return out
def exp(self):
x = self.data
out = Value(math.exp(x), (self, ), 'exp')
def _backward():
self.grad += out.data * out.grad
out._backward = _backward
return out
def backward(self):
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0
for node in reversed(topo):
node._backward()
#test it
g = Value(-3.4)
z = Value(4)
z - g
Value(data=7.4)
Now, let's revisit the variables from before, so we can break up tanh() into the operations we've built above and compute this:
tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)
#redefine variables and run, noting the original gradients after a backward pass:
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
b = Value(6.8813735870195432, label='b')
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
o.backward()
draw_dot(o)
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
b = Value(6.8813735870195432, label='b')
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
#trying to get the hyperbolic tangent: (e^(2x) - 1) / (e^(2x) + 1)
e = (2*n).exp() #e^(2x)
o = (e - 1) / (e + 1)
o.label = 'o'
o.backward()
#test backprop to see if the gradients are the same as before
#expect to see a much longer graph because we broke up tanh()
draw_dot(o)
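If you want a numeric check in addition to the graph, print the leaf gradients; they should match the single-operation tanh() run above (roughly x1.grad = -1.5, x2.grad = 0.5, w1.grad = 1.0, w2.grad = 0.0):
print(x1.grad, x2.grad, w1.grad, w2.grad)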
Since the data in the forward pass and the gradients in the backward pass end up the same as before, we can be confident we made the right conversion from our original tanh() to the broken-down version.
Andrej had us do these exercises to show that the level at which you implement your operations is entirely up to you. You can build composite operations, like tanh(), and achieve the same result; the only thing you lose is the granularity of seeing the backward pass. As long as you can do the forward and backward pass, it doesn't matter what the operation is or how composite it is. If you can find the local gradient and chain it with the chain rule, you can do backpropagation through any type of operation.