In this chapter, we will:
[William Shakespeare] "Lord, what fools these mortals be"
First, let's import the autograd
framework we built in the previous chapter:
import numpy as np
class Tensor(object):
def __init__(self, data, autograd=False, parents=None, creation_op=None, id=None):
self.data = np.array(data)
self.autograd = autograd
self.parents = parents
self.children = {}
self.creation_op = creation_op
self.grad = None
if (id is None): id = np.random.randint(100)
self.id = id
if (parents is not None):
for parent in parents:
if (self.id not in parent.children):
parent.children[self] = 1
else:
parent.children[self] += 1
def all_grads_propagated(self):
for _, grads_count in self.children.items():
if (grads_count != 0): return False
return True
def __add__(self, other):
if (self.autograd and other.autograd):
return Tensor(self.data + other.data,
autograd=True,
parents=[self, other],
creation_op="+")
return Tensor(self.data + other.data)
def __sub__(self, other):
if (self.autograd and other.autograd):
return Tensor(self.data-other.data,
autograd=True,
parents=[self, other],
creation_op="-")
return Tensor(self.data-other.data)
def __mul__(self, other):
if (self.autograd and other.autograd):
return Tensor(self.data * other.data,
autograd=True,
parents=[self, other],
creation_op="*")
return Tensor(self.data * other.data)
def sum(self, dim):
if (self.autograd):
return Tensor(self.data.sum(dim),
autograd=True,
parents=[self],
creation_op="sum_" + str(dim))
return Tensor(self.data.sum(dim))
def __neg__(self):
if (self.autograd):
return Tensor(self.data * -1,
autograd=True,
parents=[self],
creation_op="neg")
return Tensor(self.data * -1)
def __repr__(self):
return str('Tensor(' + self.id.__repr__() + ')')
def __str__(self):
return str(self.data.__str__())
def expand(self, dim, copies):
trans_cmd = list(range(0, len(self.data.shape)))
trans_cmd.insert(dim, len(self.data.shape))
new_shape = list(self.data.shape) + [copies]
new_data = self.data.repeat(copies).reshape(new_shape)
new_data = new_data.transpose(trans_cmd)
if (self.autograd):
return Tensor(new_data,
autograd=True,
parents=[self],
creation_op="expand_"+str(dim))
return Tensor(new_data)
def transpose(self):
if (self.autograd):
return Tensor(self.data.transpose(),
autograd=True,
parents=[self],
creation_op="T")
return Tensor(self.data.transpose())
def mm(self, x):
if (self.autograd):
return Tensor(self.data.dot(x.data),
autograd=True,
parents=[self, x],
creation_op="mm")
return Tensor(self.data.dot(x.data))
def sigmoid(self):
if (self.autograd):
return Tensor(1/(1+np.exp(-self.data)),
autograd=True,
parents=[self],
creation_op="sigmoid")
return Tensor(1/(1+np.exp(-self.data)))
def tanh(self):
if (self.autograd):
return Tensor(np.tanh(self.data),
autograd=True,
parents=[self],
creation_op="tanh")
return Tensor(np.tanh(self.data))
def index_select(self, indices):
if (self.autograd):
new = Tensor(self.data[indices.data],
autograd=True,
parents=[self],
creation_op="index_select")
new.index_select_indices = indices
return new
return Tensor(self.data[indices.data])
def softmax(self):
temp = np.exp(self.data)
softmax_output = temp / np.sum(temp,
axis=len(self.data.shape)-1,
keepdims=True)
return softmax_output
def cross_entropy(self, target_indices):
temp = np.exp(self.data)
softmax_output = temp / np.sum(temp, axis=len(self.data.shape)-1, keepdims=True)
t = target_indices.data.flatten()
p = softmax_output.reshape(len(t), -1)
target_dist = np.eye(p.shape[1])[t]
loss = - (np.log(p) * (target_dist)).sum(1).mean()
if (self.autograd):
out = Tensor(loss,
autograd=True,
parents=[self],
creation_op="cross_entropy")
out.softmax_output = softmax_output
out.target_dist = target_dist
return out
return Tensor(loss)
def backward(self, grad=None, grad_origin=None):
if (self.autograd):
if (grad == None):
grad = Tensor(np.ones_like(self.data))
if (grad_origin is not None):
if (self.children[grad_origin] == 0):
raise Exception("cannot backprop more than once")
else:
self.children[grad_origin] -= 1
if (self.grad is None):
self.grad = grad
else:
self.grad += grad
if ((self.parents is not None) and (self.all_grads_propagated() or grad_origin is None)):
if (self.creation_op == "+"):
self.parents[0].backward(self.grad, grad_origin=self)
self.parents[1].backward(self.grad, grad_origin=self)
if (self.creation_op == "neg"):
self.parents[0].backward(self.grad.__neg__())
if (self.creation_op == '-'):
self.parents[0].backward(self.grad, grad_origin=self)
self.parents[1].backward(self.grad.__neg__(), grad_origin=self)
if (self.creation_op == '*'):
self.parents[0].backward(self.grad*self.parents[1], grad_origin=self)
self.parents[1].backward(self.grad*self.parents[0], grad_origin=self)
if (self.creation_op == 'mm'):
activation = self.parents[0] # usually an activation function
weights = self.parents[1] # usually a weights matrix
activation.backward(self.grad.mm(weights.transpose()))
weights.backward(self.grad.transpose().mm(activation).transpose())
if (self.creation_op == 'T'):
self.parents[0].backward(self.grad.transpose())
if ("sum" in self.creation_op):
dim = int(self.creation_op.split("_")[1])
ds = self.parents[0].data.shape[dim]
self.parents[0].backward(self.grad.expand(dim, ds))
if ("expand" in self.creation_op):
dim = int(self.creation_op.split("_")[1])
self.parents[0].backward(self.grad.sum(dim))
if (self.creation_op == 'sigmoid'):
ones = Tensor(np.ones_like(self.grad.data))
self.parents[0].backward(self.grad * (self * (ones - self)))
if (self.creation_op == 'tanh'):
ones = Tensor(np.ones_like(self.grad.data))
self.parents[0].backward(self.grad * (ones - (self * self)))
if (self.creation_op == 'index_select'):
new_grad = np.zeros_like(self.parents[0].data)
indices_ = self.index_select_indices.data.flatten()
grad_ = grad.data.reshape(len(indices_), -1)
for i in range(len(indices_)):
new_grad[indices_[i]] += grad_[i]
self.parents[0].backward(Tensor(new_grad))
if (self.creation_op == 'cross_entropy'):
dx = self.softmax_output - self.target_dist
self.parents[0].backward(Tensor(dx))
class Layer(object):
def __init__(self):
self.parameters = list()
def get_parameters(self):
return self.parameters
class Tanh(Layer):
def __init__(self):
super().__init__()
def forward(self, input):
return input.tanh()
class Sigmoid(Layer):
def __init__(self):
super().__init__()
def forward(self, input):
return input.sigmoid()
class Linear(Layer):
def __init__(self, n_inputs, n_outputs, bias=True):
super().__init__()
self.use_bias = bias
W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/n_inputs)
self.weight = Tensor(W, autograd=True)
if self.use_bias:
self.bias = Tensor(np.zeros(n_outputs), autograd=True)
self.parameters.append(self.weight)
if self.use_bias:
self.parameters.append(self.bias)
def forward(self, input):
# expand for broadcasting
if self.use_bias:
return input.mm(self.weight)+self.bias.expand(0,len(input.data))
return input.mm(self.weight)
class Embedding(Layer):
def __init__(self, vocab_size, dim):
super().__init__()
self.vocab_size = vocab_size
self.dim = dim
# this initialization style is a convention from word2vec
self.weight = Tensor((np.random.rand(vocab_size, dim) - 0.5) / dim, autograd=True)
self.parameters.append(self.weight)
def forward(self, input):
return self.weight.index_select(input)
# Cross Entropy Layer
class CrossEntropyLoss(object):
def __init__(self):
super().__init__()
def forward(self, input, target):
return input.cross_entropy(target)
class SGD(object):
def __init__(self, parameters, alpha):
self.parameters = parameters
self.alpha = alpha
def zero(self):
for p in self.parameters:
p.grad.data *= 0
def step(self, zero=True):
for p in self.parameters:
p.data -= p.grad.data * self.alpha
if (zero):
p.grad.data *= 0
class RNNCell(Layer):
def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
super().__init__()
self.n_inputs = n_inputs
self.n_hidden = n_hidden
self.n_output = n_output
if (activation == 'sigmoid'):
self.activation = Sigmoid()
elif (activation == 'tanh'):
self.activation = Tanh()
else:
raise Exception("Non-Linearity not found")
self.w_ih = Linear(n_inputs, n_hidden)
self.w_hh = Linear(n_hidden, n_hidden)
self.w_ho = Linear(n_hidden, n_output)
self.parameters += self.w_ih.get_parameters()
self.parameters += self.w_hh.get_parameters()
self.parameters += self.w_ho.get_parameters()
def forward(self, input, hidden):
from_prev_hidden = self.w_hh.forward(hidden)
combined = self.w_ih.forward(input) + from_prev_hidden
new_hidden = self.activation.forward(combined)
output = self.w_ho.forward(new_hidden)
return output, new_hidden
def init_hidden(self, batch_size=1):
return Tensor(np.zeros((batch_size,self.n_hidden)),autograd=True)
In this chapter, we'll attempt language modeling over a much more challenging dataset: The Works of Shakespeare.
Instead of learning to predict next words based on the previous sequence of words, now you'll learn how to predict next characters. So we are building a Character-based Language Model using the Works of Shakespeare.
import sys, random, math
from collections import Counter
import numpy as np
np.random.seed(0)
f = open('static/data/Shakespeare/shakespear.txt', 'r')
raw = f.read()
f.close()
vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
word2index[word] = i
raw_indices = np.array(list(map(lambda x: word2index[x], raw)))
embed = Embedding(vocab_size=len(vocab), dim=8)
model = RNNCell(n_inputs=8, n_hidden=512, n_output=len(vocab))
criterion = CrossEntropyLoss()
optimizer = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)
We initialized the embeddings to be of dimensionality 8 and the hidden vector to be of size 512. The output weights are initialized to be of weight of 0
s.
Finally, we initialize the cross entropy loss function and the stochastic gradient optimizer.
One of the more challenging aspects of reading RNN code is the mini-batching logic when feeding in data.
The previous RNN Setup had an inner loop of 5 words fed to the network to predict the sixth, it turns out that the previous dataset didn't have any example longer than 6 words.
Even more important is the backpropagation step, in the case of MNIST, the gradients always backpropagated all the way through the network. The same logic applied to a vanilla RNN architecture, with a short loop, we can backpropagate all the way to the first input word.
We could do this because we aren't feeding that many data points at a time, but the shakespeare dataset has 100K characters, this is too many to backpropagate through, so what should we do?
We don't!, we backpropagate for a fixed number of steps into the past and then stop. This is called truncated backpropagation, and it's the industry standard. The length we backpropagate becomes another tunable hyperparameter of the network, like batch size
and alpha
(the learning rate).
The downside of using truncated backpropagation is that it limits the memory of the neural network, meaning it shortens the distance a neural network can take to remember things. Cutting off gradients after, let's say five timesteps, means that the neural network can't learn to remember events that are longer than five timesteps in the past.
For language modeling, the truncation variable is called bptt
, and it's usually set between 16
and 64
:
batch_size = 32
bptt = 16 # how far the model can look in the past — input sequence size?
n_batches = int(raw_indices.shape[0]/batch_size)
The other downside of truncated backpropagation is that it makes the internal mini-batching loop a bit more complex.
We pretend that instead of having one big dataset, we have a bunch of small datasets of bptt
size.
Next, we need to group the datasets accordingly:
trimmed_indices = raw_indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()
input_batched_indices = batched_indices[0:-1]
output_batched_indices = batched_indices[1:]
n_bptt = int((n_batches - 1) / bptt)
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
output_batches = output_batched_indices[:n_bptt*bptt]
output_batches = output_batches.reshape(n_bptt, bptt, batch_size)
min_loss = 1000
The top Line makes the dataset an even multiple between batch_size
and n_batches
. The decond & third lines reshape the dataset so that each column is a section of the initial indices
array.
If batch_size
was set to 8:
print(raw[0:5])
print(raw_indices[0:5])
That, [14 6 24 52 40]
Those are the Five Basic Characters in the Shakespeare Dataset.
Following are the first 5 rows of the output of the transformation combined within batched_indices
:
print(batched_indices[0:5])
[[14 19 43 39 58 6 14 35 35 24 24 33 58 35 40 31 35 54 6 43 33 24 35 35 35 7 35 7 7 35 35 47] [ 6 31 31 39 32 33 24 55 54 47 8 33 8 52 35 31 3 58 58 31 8 11 3 9 6 47 27 5 47 5 50 35] [24 31 10 33 33 24 8 36 58 35 8 43 9 6 25 38 33 55 8 31 9 33 28 33 33 29 58 40 35 58 58 3] [52 51 14 28 35 28 3 35 55 47 35 35 19 33 33 41 33 40 54 0 36 35 58 33 28 8 55 31 53 28 5 58] [40 41 7 10 25 35 58 28 35 58 25 48 31 35 35 23 47 35 35 58 19 54 25 32 40 24 8 41 55 33 33 54]]
See how the indices for the phrase "That," are in the first column on the left?
The reason there are N
columns is because the batch size is N
. This tensor is then used to construct a list of smaller data sets, each with length bptt
.
We should notice that the target indices are the input indices often by one row (so that the network predicts the next character):
print(input_batches[0][0:5])
[[14 19 43 39 58 6 14 35 35 24 24 33 58 35 40 31 35 54 6 43 33 24 35 35 35 7 35 7 7 35 35 47] [ 6 31 31 39 32 33 24 55 54 47 8 33 8 52 35 31 3 58 58 31 8 11 3 9 6 47 27 5 47 5 50 35] [24 31 10 33 33 24 8 36 58 35 8 43 9 6 25 38 33 55 8 31 9 33 28 33 33 29 58 40 35 58 58 3] [52 51 14 28 35 28 3 35 55 47 35 35 19 33 33 41 33 40 54 0 36 35 58 33 28 8 55 31 53 28 5 58] [40 41 7 10 25 35 58 28 35 58 25 48 31 35 35 23 47 35 35 58 19 54 25 32 40 24 8 41 55 33 33 54]]
print(output_batches[0][0:5])
[[ 6 31 31 39 32 33 24 55 54 47 8 33 8 52 35 31 3 58 58 31 8 11 3 9 6 47 27 5 47 5 50 35] [24 31 10 33 33 24 8 36 58 35 8 43 9 6 25 38 33 55 8 31 9 33 28 33 33 29 58 40 35 58 58 3] [52 51 14 28 35 28 3 35 55 47 35 35 19 33 33 41 33 40 54 0 36 35 58 33 28 8 55 31 53 28 5 58] [40 41 7 10 25 35 58 28 35 58 25 48 31 35 35 23 47 35 35 58 19 54 25 32 40 24 8 41 55 33 33 54] [35 44 36 9 7 33 52 33 25 35 33 6 31 9 25 42 35 5 3 36 31 58 35 35 35 25 9 47 39 35 40 36]]
This type of preprocessing doesn't have much to do with deep learning theory. It's just a particularly complex part of setting up RNNs that you'll run into from time to time.
The following example shows truncated backpropagation in practice. The only difference is that we generate a batch_loss
at each step. After every bptt
steps, we backpropagate and perform a weight update, then we keep reading through the dataset like nothing happened.
We use the hidden state from before and it only gets a reset with each epoch.
def generate_sample(n=30, init_char=' '):
s = ""
hidden = model.init_hidden(batch_size=1)
input = Tensor(np.array([word2index[init_char]]))
for i in range(n):
rnn_input = embed.forward(input)
output, hidden = model.forward(rnn_input, hidden)
# temperature for sampling, higher -> greedier
output.data *= 10
temp_dist = output.softmax()
temp_dist /= temp_dist.sum()
# samples from pred
m = (temp_dist > np.random.rand()).argmax()
c = vocab[m]
input = Tensor(np.array([m]))
s += c
return s
def train(iterations=7, min_loss=1000):
for iter in range(iterations):
total_loss = 0
n_loss = 0
hidden = model.init_hidden(batch_size=batch_size)
for batch_i in range(len(input_batches)):
hidden = Tensor(hidden.data, autograd=True)
loss = None
losses = list()
for t in range(bptt):
input = Tensor(input_batches[batch_i][t], autograd=True)
rnn_input = embed.forward(input)
output, hidden = model.forward(input=rnn_input, hidden=hidden)
target = Tensor(output_batches[batch_i][t], autograd=True)
batch_loss = criterion.forward(output, target)
losses.append(batch_loss)
if (t == 0):
loss = batch_loss
else:
loss = loss + batch_loss
loss = losses[-1]
loss.backward()
optimizer.step()
total_loss += loss.data/bptt
epoch_loss = np.exp(total_loss/(batch_i + 1))
if (epoch_loss < min_loss):
min_loss = epoch_loss
log = "\r Iter:" + str(iter)
log += " - Alpha:" + str(optimizer.alpha)[0:5]
log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
log += " - Min Loss:" + str(min_loss)[0:5]
log += " - Loss:" + str(epoch_loss)
if(batch_i == 0):
log += " - " + generate_sample(n=70, init_char='T').replace("\n"," ")
if(batch_i % 10 == 0):
sys.stdout.write(log)
optimizer.alpha *= 0.99
print()
train(2)
Iter:0 - Alpha:0.05 - Batch 191/195 - Min Loss:1.272 - Loss:1.2723613677367251
The following code uses a subset of the training logic to make predictions using the model. We store the predictions in a string and return the string version as output of the function. The sample that's generated looks quite shakespearnian and even includes characters talking:
print(generate_sample(n=1000, init_char='\n'))
The whole idea was to be able to combine the word embeddings in a way that preserves order. We did this by learning a matrix that transforms the vector representation of all of the previous embeddings into the next time step.
Forward Propagation then became a 3-step process:
We loop over this process, repeating until we've read the entire series of words.
An additional non-linearity was added to the hidden-state generation process. Thus, forward propagation becomes a 4-step process, applying the activation function was added. This non-linearity plays an important role in stabalizing the network. No matter how long the sequence of words is, the hidden states are forced to stay between the values of the non-linearity.
However, backpropagation occurs in a slightly different way than forward propagation, which doesn't have this nice property. Backpropagation tends to lead to either extremely large or extremely small values.
Let's take a closer look at RNNs backpropagation:
The following example shows a recurrent backpropagation loop for sigmoid
and relu
activations.
During Backprop:
ReLU
: Gradients become Large as a result of matrix Multiplication.Sigmoid
: Gradients become small as a result of the slope of the sigmoid curvature in much of its domain of definition (flat tails).import numpy as np
(sigmoid, relu) = (lambda x: 1/(1+np.exp(-x)), lambda x: (x>0).astype(float)*x)
weights = np.array([[1,4], [4,1]])
activation = sigmoid(np.array([1, 0.01]))
activations = list()
for iter in range(10):
activation = sigmoid(activation.dot(weights))
activations.append(activation)
print(activation)
[0.93940638 0.96852968] [0.9919462 0.99121735] [0.99301385 0.99302901] [0.9930713 0.99307098] [0.99307285 0.99307285] [0.99307291 0.99307291] [0.99307291 0.99307291] [0.99307291 0.99307291] [0.99307291 0.99307291] [0.99307291 0.99307291]
gradient = np.ones_like(activations)
for activation in reversed(activations):
gradient = activation * (1 - activation) * gradient
gradient = gradient.dot(weights.transpose())
print(gradient)
[[0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552] [0.03439552 0.03439552]] [[0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305] [0.00118305 0.00118305]] [[4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05] [4.06916726e-05 4.06916726e-05]] [[1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06] [1.39961115e-06 1.39961115e-06]] [[4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08] [4.81403643e-08 4.81403637e-08]] [[1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09] [1.65582672e-09 1.65582765e-09]] [[5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11] [5.69682675e-11 5.69667160e-11]] [[1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12] [1.97259346e-12 1.97517920e-12]] [[8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14] [8.45387597e-14 8.02306381e-14]] [[1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14] [1.45938177e-14 2.16938983e-14]]
activations = list()
for iter in range(10):
activation = relu(activation.dot(weights))
activations.append(activation)
print(activation)
[4.8135251 4.72615519] [23.71814585 23.98025559] [119.63916823 118.852839 ] [595.05052421 597.40951192] [2984.68857188 2977.61160877] [14895.13500696 14916.36589628] [74560.59859209 74496.90592414] [372548.22228863 372739.30029248] [1863505.42345854 1862932.18944699] [9315234.18124649 9316953.88328115]
gradient = np.ones_like(activation)
for activation in reversed(activations):
gradient = ((activation > 0) * gradient).dot(weights.transpose())
print(gradient)
[5. 5.] [25. 25.] [125. 125.] [625. 625.] [3125. 3125.] [15625. 15625.] [78125. 78125.] [390625. 390625.] [1953125. 1953125.] [9765625. 9765625.]
The problem with vanishing (sigmoid) / exploding (matrix multiplication) gradients is the combination of matrix multiplication and non-linearity being used to form the next hidden state. The Solution that LSTMs provide is quite simple:
The one we care about is cell
, Each new cell is the previous cell + u
weighted by i
and f
:
f
is the forget gate: if it takes a value of 0 the next cell will erase what it saw previously.i
is 1
, it will fully add in the value of u
to create the new cell.o
is an output gate that controls how much of the cells state the output prediction is allowed to see.def forward(self, input, hidden):
prev_hidden, prev_cell = (hidden[0], hidden[1])
f = (self.xf.forward(input) + self.hf.forward(prev_hidden)).sigmoid() # forget gate
i = (self.xi.forward(input) + self.hi.forward(prev_hidden)).sigmoid()
o = (self.xo.forward(input) + self.ho.forward(prev_hidden)).sigmoid() # output gate
u = (self.xc.forward(input) + self.hc.forward(prev_hidden)).tanh()
cell = (f * prev_cell) + (i * u)
h = o * cell.tanh()
output = self.w_ho.forward(h)
return output, (h, cell)
An LSTM cell has 3 gates, f
, i
, o
& a cell update vector u
. We think of these gates as forget, Input, Output, and Update. They work together to ensure that any information to be stored in c
can be saved without requiring each update of c
to have matrix multiplications or non-linearities applied to it.
This is what allows the LSTM to store Information across a time series without worrying about vanishing or exploding gradients. Each step is a copy plus an update. The hidden value h is then a masked version of the cell that's used for prediction.
Each gate has its own weight matrices.
One last possible critique is about h
. Clearly it's still prone to vanishing & exploding gradients. Exploding gradients aren't really a problem since we can always clip them, the only serious problem are vanishing gradients. But this ends up being Okay because h
is conditioned on c
, which can carry long range information.
All long range information is transported using c
. h
is only a localized interpretaion of c
In short, c
can learn to transport information over long distances, so it doesn't matter if h
can't.
class LSTMCell(Layer):
def __init__(self, n_inputs, n_hidden, n_output):
super().__init__()
self.n_inputs = n_inputs
self.n_hidden = n_hidden
self.n_outputs = n_output
self.xf = Linear(n_inputs, n_hidden)
self.xi = Linear(n_inputs, n_hidden)
self.xo = Linear(n_inputs, n_hidden)
self.xc = Linear(n_inputs, n_hidden)
self.hf = Linear(n_hidden, n_hidden, bias=False)
self.hi = Linear(n_hidden, n_hidden, bias=False)
self.ho = Linear(n_hidden, n_hidden, bias=False)
self.hc = Linear(n_hidden, n_hidden, bias=False)
self.w_ho = Linear(n_hidden, n_output, bias=False)
self.parameters += self.xf.get_parameters()
self.parameters += self.xi.get_parameters()
self.parameters += self.xo.get_parameters()
self.parameters += self.xc.get_parameters()
self.parameters += self.hf.get_parameters()
self.parameters += self.hi.get_parameters()
self.parameters += self.ho.get_parameters()
self.parameters += self.hc.get_parameters()
self.parameters += self.w_ho.get_parameters()
def forward(self, input, hidden):
prev_hidden = hidden[0]
prev_cell = hidden[1]
f=(self.xf.forward(input)+self.hf.forward(prev_hidden)).sigmoid()
i=(self.xi.forward(input)+self.hi.forward(prev_hidden)).sigmoid()
o=(self.xo.forward(input)+self.ho.forward(prev_hidden)).sigmoid()
g = (self.xc.forward(input) +self.hc.forward(prev_hidden)).tanh()
c = (f * prev_cell) + (i * g)
h = o * c.tanh()
output = self.w_ho.forward(h)
return output, (h, c)
def init_hidden(self, batch_size=1):
h = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
c = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
h.data[:, 0] += 1
c.data[:, 0] += 1
return (h, c)
import sys, random, math
from collections import Counter
import numpy as np
np.random.seed(0)
f = open('static/data/Shakespeare/shakespear.txt', 'r')
raw = f.read()
f.close()
vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
word2index[word] = i
raw_indices = np.array(list(map(lambda x: word2index[x], raw)))
embed = Embedding(vocab_size=len(vocab), dim=8)
model = LSTMCell(n_inputs=8, n_hidden=256, n_output=len(vocab))
criterion = CrossEntropyLoss()
optimizer = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)
batch_size = 16
bptt = 25
n_batches = int((raw_indices.shape[0] / (batch_size)))
trimmed_indices = raw_indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()
input_batched_indices = batched_indices[0:-1]
output_batched_indices = batched_indices[1:]
n_bptt = int((n_batches - 1) / bptt)
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
output_batches = output_batched_indices[:n_bptt*bptt]
output_batches = output_batches.reshape(n_bptt, bptt, batch_size)
min_loss = 1000
def train(iterations=7, min_loss=1000):
for iter in range(iterations):
total_loss = 0
n_loss = 0
hidden = model.init_hidden(batch_size=batch_size)
for batch_i in range(len(input_batches)):
hidden = (Tensor(hidden[0].data, autograd=True),
Tensor(hidden[1].data, autograd=True))
loss = None
losses = list()
for t in range(bptt):
input = Tensor(input_batches[batch_i][t], autograd=True)
rnn_input = embed.forward(input)
output, hidden = model.forward(input=rnn_input, hidden=hidden)
target = Tensor(output_batches[batch_i][t], autograd=True)
batch_loss = criterion.forward(output, target)
losses.append(batch_loss)
if (t == 0):
loss = batch_loss
else:
loss = loss + batch_loss
loss = losses[-1]
loss.backward()
optimizer.step()
total_loss += loss.data/bptt
epoch_loss = np.exp(total_loss/(batch_i + 1))
if (epoch_loss < min_loss):
min_loss = epoch_loss
log = "\r Iter:" + str(iter)
log += " - Alpha:" + str(optimizer.alpha)[0:5]
log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
log += " - Min Loss:" + str(min_loss)[0:5]
log += " - Loss:" + str(epoch_loss)
if(batch_i == 0):
log += " - " + generate_sample(n=70, init_char='T').replace("\n"," ")
if(batch_i % 10 == 0):
sys.stdout.write(log)
optimizer.alpha *= 0.99
print()
We should note that this model takes a long time to train (lots of parameters).