Notebook

Learning to Write like Shakespeare: Long short-term memory¶

In this chapter, we will:

Implement character language modeling
Learn about truncated Backpropagation
Explore the Vanishing & Exploding Gradients problem
Play around with a toy example of RNN backpropagation
Implement Long Short-term memory (LSTM) cells

[William Shakespeare] "Lord, what fools these mortals be"

Requirements¶

First, let's import the autograd framework we built in the previous chapter:

In [1]:

import numpy as np

class Tensor(object):
    def __init__(self, data, autograd=False, parents=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.autograd = autograd
        self.parents = parents
        self.children = {}
        self.creation_op = creation_op
        self.grad = None
        if (id is None): id = np.random.randint(100)
        self.id = id
        
        if (parents is not None):
            for parent in parents:
                if (self.id not in parent.children):
                    parent.children[self] = 1
                else:
                    parent.children[self] += 1
        
    def all_grads_propagated(self):
        for _, grads_count in self.children.items():
            if (grads_count != 0): return False
        return True
    
    def __add__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data + other.data, 
                          autograd=True,
                          parents=[self, other], 
                          creation_op="+")
        return Tensor(self.data + other.data)
    
    def __sub__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data-other.data,
                         autograd=True,
                         parents=[self, other],
                         creation_op="-")
        return Tensor(self.data-other.data)
    
    def __mul__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                         autograd=True,
                         parents=[self, other],
                         creation_op="*")
        return Tensor(self.data * other.data)
    
    def sum(self, dim):
        if (self.autograd):
            return Tensor(self.data.sum(dim),
                         autograd=True,
                         parents=[self],
                         creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def __neg__(self):
        if (self.autograd):
            return Tensor(self.data * -1,
                         autograd=True,
                         parents=[self],
                         creation_op="neg")
        return Tensor(self.data * -1)

    def __repr__(self):
        return str('Tensor(' + self.id.__repr__() + ')')

    def __str__(self):
        return str(self.data.__str__())
    
    def expand(self, dim, copies):
        trans_cmd = list(range(0, len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(trans_cmd)
        
        if (self.autograd):
            return Tensor(new_data, 
                         autograd=True,
                         parents=[self],
                         creation_op="expand_"+str(dim))
        return Tensor(new_data)
    
    def transpose(self):
        if (self.autograd):
            return Tensor(self.data.transpose(),
                         autograd=True,
                         parents=[self],
                         creation_op="T")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if (self.autograd):
            return Tensor(self.data.dot(x.data),
                         autograd=True,
                         parents=[self, x],
                         creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if (self.autograd):
            return Tensor(1/(1+np.exp(-self.data)),
                          autograd=True,
                          parents=[self],
                          creation_op="sigmoid")
        return Tensor(1/(1+np.exp(-self.data)))
    
    def tanh(self):
        if (self.autograd):
            return Tensor(np.tanh(self.data),
                         autograd=True,
                         parents=[self],
                         creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    def index_select(self, indices):
        if (self.autograd):
            new = Tensor(self.data[indices.data],
                        autograd=True,
                        parents=[self],
                        creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])
    
    def softmax(self):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape)-1,
                                       keepdims=True)
        return softmax_output
    
    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape)-1, keepdims=True)
        t =  target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        target_dist = np.eye(p.shape[1])[t]
        loss = - (np.log(p) * (target_dist)).sum(1).mean()

        if (self.autograd):
            out = Tensor(loss,
                        autograd=True,
                        parents=[self],
                        creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        return Tensor(loss)
    
    def backward(self, grad=None, grad_origin=None):
        if (self.autograd):
            if (grad == None):
                grad = Tensor(np.ones_like(self.data))
            if (grad_origin is not None):
                if (self.children[grad_origin] == 0):
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[grad_origin] -= 1
            if (self.grad is None):
                self.grad = grad
            else:
                self.grad += grad
            if ((self.parents is not None) and (self.all_grads_propagated() or grad_origin is None)):
                if (self.creation_op == "+"):
                    self.parents[0].backward(self.grad, grad_origin=self)
                    self.parents[1].backward(self.grad, grad_origin=self)
                if (self.creation_op == "neg"):
                    self.parents[0].backward(self.grad.__neg__())
                if (self.creation_op == '-'):
                    self.parents[0].backward(self.grad, grad_origin=self)
                    self.parents[1].backward(self.grad.__neg__(), grad_origin=self)
                if (self.creation_op == '*'):
                    self.parents[0].backward(self.grad*self.parents[1], grad_origin=self)
                    self.parents[1].backward(self.grad*self.parents[0], grad_origin=self)
                if (self.creation_op == 'mm'):
                    activation = self.parents[0]  # usually an activation function
                    weights = self.parents[1]  # usually a weights matrix
                    activation.backward(self.grad.mm(weights.transpose()))
                    weights.backward(self.grad.transpose().mm(activation).transpose())
                if (self.creation_op == 'T'):
                    self.parents[0].backward(self.grad.transpose())
                if ("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    ds = self.parents[0].data.shape[dim]
                    self.parents[0].backward(self.grad.expand(dim, ds))
                if ("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.parents[0].backward(self.grad.sum(dim))
                if (self.creation_op == 'sigmoid'):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.parents[0].backward(self.grad * (self * (ones - self)))
                if (self.creation_op == 'tanh'):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.parents[0].backward(self.grad * (ones - (self * self)))
                if (self.creation_op == 'index_select'):
                    new_grad = np.zeros_like(self.parents[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.parents[0].backward(Tensor(new_grad))
                if (self.creation_op == 'cross_entropy'):
                    dx = self.softmax_output - self.target_dist
                    self.parents[0].backward(Tensor(dx))

In [2]:

class Layer(object):
    def __init__(self):
        self.parameters = list()
    
    def get_parameters(self):
        return self.parameters

In [3]:

class Tanh(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.tanh()

In [4]:

class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.sigmoid()

In [5]:

class Linear(Layer):
    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        
        self.use_bias = bias
        W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/n_inputs)
        self.weight = Tensor(W, autograd=True)
        if self.use_bias:
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)
        self.parameters.append(self.weight)
        if self.use_bias:
            self.parameters.append(self.bias)
    
    def forward(self, input):
        # expand for broadcasting
        if self.use_bias:
            return input.mm(self.weight)+self.bias.expand(0,len(input.data))
        return input.mm(self.weight)

In [6]:

class Embedding(Layer):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim
        # this initialization style is a convention from word2vec
        self.weight = Tensor((np.random.rand(vocab_size, dim) - 0.5) / dim, autograd=True)
        self.parameters.append(self.weight)
    
    def forward(self, input):
        return self.weight.index_select(input)

In [7]:

# Cross Entropy Layer
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()
    
    def forward(self, input, target):
        return input.cross_entropy(target)

In [8]:

class SGD(object):
    def __init__(self, parameters, alpha):
        self.parameters = parameters
        self.alpha = alpha
        
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
    
    def step(self, zero=True):
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha
            if (zero):
                p.grad.data *= 0

In [9]:

class RNNCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        if (activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif (activation == 'tanh'):
            self.activation = Tanh()
        else:
            raise Exception("Non-Linearity not found")
        
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)
        
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()
        
    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size,self.n_hidden)),autograd=True)

Character Language Modeling¶

Let's tackle a more challenging task with RNN¶

In this chapter, we'll attempt language modeling over a much more challenging dataset: The Works of Shakespeare.

Instead of learning to predict next words based on the previous sequence of words, now you'll learn how to predict next characters. So we are building a Character-based Language Model using the Works of Shakespeare.

In [10]:

import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(0)

In [11]:

f = open('static/data/Shakespeare/shakespear.txt', 'r')
raw = f.read()
f.close()

In [12]:

vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

In [13]:

raw_indices = np.array(list(map(lambda x: word2index[x], raw)))

In [14]:

embed = Embedding(vocab_size=len(vocab), dim=8)
model = RNNCell(n_inputs=8, n_hidden=512, n_output=len(vocab))

In [15]:

criterion = CrossEntropyLoss()
optimizer = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

We initialized the embeddings to be of dimensionality 8 and the hidden vector to be of size 512. The output weights are initialized to be of weight of 0s.

Finally, we initialize the cross entropy loss function and the stochastic gradient optimizer.

The Need for truncated backpropagation¶

Backpropagating through 100,000 character is intractable¶

One of the more challenging aspects of reading RNN code is the mini-batching logic when feeding in data.

The previous RNN Setup had an inner loop of 5 words fed to the network to predict the sixth, it turns out that the previous dataset didn't have any example longer than 6 words.

Even more important is the backpropagation step, in the case of MNIST, the gradients always backpropagated all the way through the network. The same logic applied to a vanilla RNN architecture, with a short loop, we can backpropagate all the way to the first input word.

We could do this because we aren't feeding that many data points at a time, but the shakespeare dataset has 100K characters, this is too many to backpropagate through, so what should we do?

We don't!, we backpropagate for a fixed number of steps into the past and then stop. This is called truncated backpropagation, and it's the industry standard. The length we backpropagate becomes another tunable hyperparameter of the network, like batch size and alpha (the learning rate).

Truncated Backpropagation¶

Technically, It weakens the theoritical Maximum of the neural network¶

The downside of using truncated backpropagation is that it limits the memory of the neural network, meaning it shortens the distance a neural network can take to remember things. Cutting off gradients after, let's say five timesteps, means that the neural network can't learn to remember events that are longer than five timesteps in the past.

For language modeling, the truncation variable is called bptt, and it's usually set between 16 and 64:

In [16]:

batch_size = 32
bptt = 16  # how far the model can look in the past — input sequence size?
n_batches = int(raw_indices.shape[0]/batch_size)

The other downside of truncated backpropagation is that it makes the internal mini-batching loop a bit more complex.

We pretend that instead of having one big dataset, we have a bunch of small datasets of bptt size.

Next, we need to group the datasets accordingly:

In [17]:

trimmed_indices = raw_indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

In [18]:

input_batched_indices = batched_indices[0:-1]
output_batched_indices = batched_indices[1:]

In [19]:

n_bptt = int((n_batches - 1) / bptt)

In [20]:

input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
output_batches = output_batched_indices[:n_bptt*bptt]
output_batches = output_batches.reshape(n_bptt, bptt, batch_size)

In [21]:

min_loss = 1000

The top Line makes the dataset an even multiple between batch_size and n_batches. The decond & third lines reshape the dataset so that each column is a section of the initial indices array.

If batch_size was set to 8:

In [22]:

print(raw[0:5])
print(raw_indices[0:5])

That,
[14  6 24 52 40]

Those are the Five Basic Characters in the Shakespeare Dataset.

Following are the first 5 rows of the output of the transformation combined within batched_indices:

In [23]:

print(batched_indices[0:5])

[[14 19 43 39 58  6 14 35 35 24 24 33 58 35 40 31 35 54  6 43 33 24 35 35
  35  7 35  7  7 35 35 47]
 [ 6 31 31 39 32 33 24 55 54 47  8 33  8 52 35 31  3 58 58 31  8 11  3  9
   6 47 27  5 47  5 50 35]
 [24 31 10 33 33 24  8 36 58 35  8 43  9  6 25 38 33 55  8 31  9 33 28 33
  33 29 58 40 35 58 58  3]
 [52 51 14 28 35 28  3 35 55 47 35 35 19 33 33 41 33 40 54  0 36 35 58 33
  28  8 55 31 53 28  5 58]
 [40 41  7 10 25 35 58 28 35 58 25 48 31 35 35 23 47 35 35 58 19 54 25 32
  40 24  8 41 55 33 33 54]]

See how the indices for the phrase "That," are in the first column on the left?

The reason there are N columns is because the batch size is N. This tensor is then used to construct a list of smaller data sets, each with length bptt.

We should notice that the target indices are the input indices often by one row (so that the network predicts the next character):

In [24]:

print(input_batches[0][0:5])

[[14 19 43 39 58  6 14 35 35 24 24 33 58 35 40 31 35 54  6 43 33 24 35 35
  35  7 35  7  7 35 35 47]
 [ 6 31 31 39 32 33 24 55 54 47  8 33  8 52 35 31  3 58 58 31  8 11  3  9
   6 47 27  5 47  5 50 35]
 [24 31 10 33 33 24  8 36 58 35  8 43  9  6 25 38 33 55  8 31  9 33 28 33
  33 29 58 40 35 58 58  3]
 [52 51 14 28 35 28  3 35 55 47 35 35 19 33 33 41 33 40 54  0 36 35 58 33
  28  8 55 31 53 28  5 58]
 [40 41  7 10 25 35 58 28 35 58 25 48 31 35 35 23 47 35 35 58 19 54 25 32
  40 24  8 41 55 33 33 54]]

In [25]:

print(output_batches[0][0:5])

[[ 6 31 31 39 32 33 24 55 54 47  8 33  8 52 35 31  3 58 58 31  8 11  3  9
   6 47 27  5 47  5 50 35]
 [24 31 10 33 33 24  8 36 58 35  8 43  9  6 25 38 33 55  8 31  9 33 28 33
  33 29 58 40 35 58 58  3]
 [52 51 14 28 35 28  3 35 55 47 35 35 19 33 33 41 33 40 54  0 36 35 58 33
  28  8 55 31 53 28  5 58]
 [40 41  7 10 25 35 58 28 35 58 25 48 31 35 35 23 47 35 35 58 19 54 25 32
  40 24  8 41 55 33 33 54]
 [35 44 36  9  7 33 52 33 25 35 33  6 31  9 25 42 35  5  3 36 31 58 35 35
  35 25  9 47 39 35 40 36]]

This type of preprocessing doesn't have much to do with deep learning theory. It's just a particularly complex part of setting up RNNs that you'll run into from time to time.

Let's see how to iterate using truncated backpropagation¶

The following example shows truncated backpropagation in practice. The only difference is that we generate a batch_loss at each step. After every bptt steps, we backpropagate and perform a weight update, then we keep reading through the dataset like nothing happened.

We use the hidden state from before and it only gets a reset with each epoch.

In [26]:

def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(rnn_input, hidden)
        # temperature for sampling, higher -> greedier
        output.data *= 10
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()
        
        # samples from pred
        m = (temp_dist > np.random.rand()).argmax()
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c
    return s

In [27]:

def train(iterations=7, min_loss=1000):
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0
        
        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):
            
            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(output_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if (t == 0):
                    loss = batch_loss
                else:
                    loss = loss + batch_loss
            loss = losses[-1]
            loss.backward()
            optimizer.step()
            total_loss += loss.data/bptt
                
            epoch_loss = np.exp(total_loss/(batch_i + 1))
            if (epoch_loss < min_loss):
                min_loss = epoch_loss
            log = "\r Iter:" + str(iter)
            log += " - Alpha:" + str(optimizer.alpha)[0:5]
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Min Loss:" + str(min_loss)[0:5]
            log += " - Loss:" + str(epoch_loss)
            if(batch_i == 0):
                log += " - " + generate_sample(n=70, init_char='T').replace("\n"," ")
            if(batch_i % 10 == 0):
                sys.stdout.write(log)
        optimizer.alpha *= 0.99 
        print()

In [28]:

train(2)

 Iter:0 - Alpha:0.05 - Batch 191/195 - Min Loss:1.272 - Loss:1.2723613677367251

A Sample of the Output¶

By sampling from the Predictions of the Model, you can write Shakespeare!¶

The following code uses a subset of the training logic to make predictions using the model. We store the predictions in a string and return the string version as output of the function. The sample that's generated looks quite shakespearnian and even includes characters talking:

In [ ]:

print(generate_sample(n=1000, init_char='\n'))

Vanishing & Exploding Gradients¶

Vanilla RNNs Suffer from Vanishing & Exploding Gradients¶

The whole idea was to be able to combine the word embeddings in a way that preserves order. We did this by learning a matrix that transforms the vector representation of all of the previous embeddings into the next time step.

Forward Propagation then became a 3-step process:

Start with the first word embedding.
Multiply it by the shared weight matrix.
Add the Next Embedding.

We loop over this process, repeating until we've read the entire series of words.

An additional non-linearity was added to the hidden-state generation process. Thus, forward propagation becomes a 4-step process, applying the activation function was added. This non-linearity plays an important role in stabalizing the network. No matter how long the sequence of words is, the hidden states are forced to stay between the values of the non-linearity.

However, backpropagation occurs in a slightly different way than forward propagation, which doesn't have this nice property. Backpropagation tends to lead to either extremely large or extremely small values.

Large values can cause divergence (NaNs).
Extremely small values keep the netword from learning (small updates).

Let's take a closer look at RNNs backpropagation:

A Toy Example of RNN backpropagation¶

To see vanishing/exploding gradients firsthand, Let's synthesize an Example¶

The following example shows a recurrent backpropagation loop for sigmoid and relu activations. During Backprop:

ReLU: Gradients become Large as a result of matrix Multiplication.
Sigmoid: Gradients become small as a result of the slope of the sigmoid curvature in much of its domain of definition (flat tails).

In [2]:

import numpy as np

In [3]:

(sigmoid, relu) = (lambda x: 1/(1+np.exp(-x)), lambda x: (x>0).astype(float)*x) 

In [4]:

weights = np.array([[1,4], [4,1]])

In [5]:

activation = sigmoid(np.array([1, 0.01]))

Sigmoid Activations¶

In [6]:

activations = list()

In [7]:

for iter in range(10):
    activation = sigmoid(activation.dot(weights))
    activations.append(activation)
    print(activation)

[0.93940638 0.96852968]
[0.9919462  0.99121735]
[0.99301385 0.99302901]
[0.9930713  0.99307098]
[0.99307285 0.99307285]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]

Sigmoid Gradients¶

In [8]:

gradient = np.ones_like(activations)

In [9]:

for activation in reversed(activations):
    gradient = activation * (1 - activation) * gradient
    gradient = gradient.dot(weights.transpose())
    print(gradient)

[[0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]
 [0.03439552 0.03439552]]
[[0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]
 [0.00118305 0.00118305]]
[[4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]
 [4.06916726e-05 4.06916726e-05]]
[[1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]
 [1.39961115e-06 1.39961115e-06]]
[[4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]
 [4.81403643e-08 4.81403637e-08]]
[[1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]
 [1.65582672e-09 1.65582765e-09]]
[[5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]
 [5.69682675e-11 5.69667160e-11]]
[[1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]
 [1.97259346e-12 1.97517920e-12]]
[[8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]
 [8.45387597e-14 8.02306381e-14]]
[[1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]
 [1.45938177e-14 2.16938983e-14]]

ReLU Activations¶

In [10]:

activations = list()

In [11]:

for iter in range(10):
    activation = relu(activation.dot(weights))
    activations.append(activation)
    print(activation)

[4.8135251  4.72615519]
[23.71814585 23.98025559]
[119.63916823 118.852839  ]
[595.05052421 597.40951192]
[2984.68857188 2977.61160877]
[14895.13500696 14916.36589628]
[74560.59859209 74496.90592414]
[372548.22228863 372739.30029248]
[1863505.42345854 1862932.18944699]
[9315234.18124649 9316953.88328115]

ReLU Gradients¶

In [12]:

gradient = np.ones_like(activation)

In [13]:

for activation in reversed(activations):
    gradient = ((activation > 0) * gradient).dot(weights.transpose())
    print(gradient)

[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15625.]
[78125. 78125.]
[390625. 390625.]
[1953125. 1953125.]
[9765625. 9765625.]

Long short-term Memory (LSTM) Cells¶

LSTMs are the industry standard model to counter vanishing/exploding gradients¶

The problem with vanishing (sigmoid) / exploding (matrix multiplication) gradients is the combination of matrix multiplication and non-linearity being used to form the next hidden state. The Solution that LSTMs provide is quite simple:

The Gated Copy Trick
- LSTMs create the next hidden state by copying the previous hidden state and then adding or removing information as necessary.
- The mechanisms the LSTM uses for adding & removing information are called gates.
The LSTM has 2 hidden state vectors:
- h: for hidden.
- cell.

The one we care about is cell, Each new cell is the previous cell + u weighted by i and f:

f is the forget gate: if it takes a value of 0 the next cell will erase what it saw previously.
If i is 1, it will fully add in the value of u to create the new cell.
o is an output gate that controls how much of the cells state the output prediction is allowed to see.

In [14]:

def forward(self, input, hidden):
    prev_hidden, prev_cell = (hidden[0], hidden[1])
    f = (self.xf.forward(input) + self.hf.forward(prev_hidden)).sigmoid()  # forget gate
    i = (self.xi.forward(input) + self.hi.forward(prev_hidden)).sigmoid()
    o = (self.xo.forward(input) + self.ho.forward(prev_hidden)).sigmoid()  # output gate
    u = (self.xc.forward(input) + self.hc.forward(prev_hidden)).tanh()
    
    cell = (f * prev_cell) + (i * u)
    h = o * cell.tanh()
    output = self.w_ho.forward(h)
    return output, (h, cell)

Some Intuition about LSTM Gates¶

LSTM gates are semantically similar to reading/writing from Memory¶

An LSTM cell has $3$ gates, f, i, o & a cell update vector u. We think of these gates as forget, Input, Output, and Update. They work together to ensure that any information to be stored in c can be saved without requiring each update of c to have matrix multiplications or non-linearities applied to it.

This is what allows the LSTM to store Information across a time series without worrying about vanishing or exploding gradients. Each step is a copy plus an update. The hidden value $h$ is then a masked version of the cell that's used for prediction.

Each gate has its own weight matrices.

One last possible critique is about h. Clearly it's still prone to vanishing & exploding gradients. Exploding gradients aren't really a problem since we can always clip them, the only serious problem are vanishing gradients. But this ends up being Okay because h is conditioned on c, which can carry long range information.

All long range information is transported using c. h is only a localized interpretaion of c

In short, c can learn to transport information over long distances, so it doesn't matter if h can't.

The Long-Short term Memory Layer¶

You can use the Autograd system to implement an LSTM¶

In [ ]:

class LSTMCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_outputs = n_output
        
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)
        
        self.hf = Linear(n_hidden, n_hidden, bias=False)
        self.hi = Linear(n_hidden, n_hidden, bias=False)
        self.ho = Linear(n_hidden, n_hidden, bias=False)
        self.hc = Linear(n_hidden, n_hidden, bias=False)
        
        self.w_ho = Linear(n_hidden, n_output, bias=False)
        
        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()
        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()
        self.parameters += self.w_ho.get_parameters()
        
    
    def forward(self, input, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]
        
        f=(self.xf.forward(input)+self.hf.forward(prev_hidden)).sigmoid()
        i=(self.xi.forward(input)+self.hi.forward(prev_hidden)).sigmoid()
        o=(self.xo.forward(input)+self.ho.forward(prev_hidden)).sigmoid()
        g = (self.xc.forward(input) +self.hc.forward(prev_hidden)).tanh()
        c = (f * prev_cell) + (i * g) 
        h = o * c.tanh()
        
        output = self.w_ho.forward(h)
        return output, (h, c)
    
    def init_hidden(self, batch_size=1):
        h = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        c = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        h.data[:, 0] += 1
        c.data[:, 0] += 1
        return (h, c)

Upgrading the Character Language Model¶

Let's Swap out the Vanilla RNN with the new LSTMCell¶

Let's train an LSTM-based Model to predict Shakespeare.

In [ ]:

import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(0)

In [ ]:

f = open('static/data/Shakespeare/shakespear.txt', 'r')
raw = f.read()
f.close()

In [ ]:

vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

In [ ]:

raw_indices = np.array(list(map(lambda x: word2index[x], raw)))

In [ ]:

embed = Embedding(vocab_size=len(vocab), dim=8)
model = LSTMCell(n_inputs=8, n_hidden=256, n_output=len(vocab))

In [ ]:

criterion = CrossEntropyLoss()
optimizer = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

In [ ]:

batch_size = 16
bptt = 25

In [ ]:

n_batches = int((raw_indices.shape[0] / (batch_size)))

In [ ]:

trimmed_indices = raw_indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

In [ ]:

input_batched_indices = batched_indices[0:-1]
output_batched_indices = batched_indices[1:]

In [ ]:

n_bptt = int((n_batches - 1) / bptt)

In [ ]:

input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
output_batches = output_batched_indices[:n_bptt*bptt]
output_batches = output_batches.reshape(n_bptt, bptt, batch_size)

In [ ]:

min_loss = 1000

Training the LSTM character Language Model¶

The training Logic also hasn't changed much¶

The only real change we have to make from the vanilla RNN logic is the truncated backpropagation logic because there are two hidden vectors per timestep instead of one:

In [ ]:

def train(iterations=7, min_loss=1000):
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0
        
        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):
            
            hidden = (Tensor(hidden[0].data, autograd=True),
                      Tensor(hidden[1].data, autograd=True))
            
            loss = None
            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(output_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if (t == 0):
                    loss = batch_loss
                else:
                    loss = loss + batch_loss
            loss = losses[-1]
            loss.backward()
            optimizer.step()
            total_loss += loss.data/bptt
                
            epoch_loss = np.exp(total_loss/(batch_i + 1))
            if (epoch_loss < min_loss):
                min_loss = epoch_loss
            log = "\r Iter:" + str(iter)
            log += " - Alpha:" + str(optimizer.alpha)[0:5]
            log += " - Batch "+str(batch_i+1)+"/"+str(len(input_batches))
            log += " - Min Loss:" + str(min_loss)[0:5]
            log += " - Loss:" + str(epoch_loss)
            if(batch_i == 0):
                log += " - " + generate_sample(n=70, init_char='T').replace("\n"," ")
            if(batch_i % 10 == 0):
                sys.stdout.write(log)
        optimizer.alpha *= 0.99 
        print()

Tuning the LSTM character language model¶

We should note that this model takes a long time to train (lots of parameters).

I also had to train it many times to find a good tuning, (learning rate, batch size, & so on)
In General, the Longer you train, the better your results will be:

Summary¶

LSTMs are incredibly powerful Models.¶

Language is an incredibly complex statistical distribution to learn, and the fact that LSTMs can do so well, still baffles me. Small variants of this model either are or have recently been the state of the art in a wide variety of tasks.