Vanilla Recurrent Neural Network

Character level implementation of vanilla recurrent neural network

Import dependencies

In [1]:
import numpy as np
import matplotlib.pyplot as plt

Parameters Initialization

In [2]:
def initialize_parameters(hidden_size, vocab_size):
    """Initialize network weights, biases and Adagrad memory variables.

    Arguments:
    hidden_size -- number of units in the hidden state
    vocab_size -- number of distinct characters in the vocabulary

    Returns:
    parameters -- a tuple of network parameters (Wxh, Whh, Why, bh, by)
    adagrad_mem_vars -- a tuple of mem variables required for adagrad update
    """
    # Small random weights break symmetry; biases start at zero.
    Wxh = np.random.randn(hidden_size, vocab_size) * 0.01   # input -> hidden
    Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
    Why = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden -> output
    bh = np.zeros([hidden_size, 1])  # hidden bias
    by = np.zeros([vocab_size, 1])   # output bias

    # Memory variables accumulate squared gradients for the Adagrad update.
    mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)
    parameters = (Wxh, Whh, Why, bh, by)
    adagrad_mem_vars = (mWxh, mWhh, mWhy, mbh, mby)
    return (parameters, adagrad_mem_vars)

Forward Propagation

In [3]:
def softmax(X):
    """Column-wise softmax of X, numerically stabilized.

    Subtracting the per-column maximum before exponentiating prevents
    overflow for large logits without changing the result, because
    softmax is invariant to a constant shift of its input.
    """
    t = np.exp(X - np.max(X, axis=0, keepdims=True))
    return t / np.sum(t, axis=0)
In [4]:
def forward_propogation(X, parameters, seq_length, hprev):
    """Implement the forward propagation through the RNN for one sequence.

    Arguments:
    X -- input to the network; one-hot rows, shape (seq_length, vocab_size)
    parameters -- a tuple containing weights and biases of the network
    seq_length -- length of sequence of input
    hprev -- previous hidden state, shape (hidden_size, 1)

    Returns:
    caches -- dict of activations ('A1'..'AT') and hidden states ('h0'..'hT')
              for each step of forward prop
    """
    caches = {}
    caches['h0'] = np.copy(hprev)
    Wxh, Whh, Why, bh, by = parameters
    for i in range(seq_length):
        # reshape(-1, 1) infers vocab_size from the data itself instead of
        # relying on a module-level global.
        x = X[i].reshape(-1, 1)
        # Standard vanilla-RNN recurrence: h_t = tanh(Wxh.x + Whh.h_{t-1} + bh)
        ht = np.tanh(np.dot(Wxh, x) + np.dot(Whh, caches['h' + str(i)]) + bh)
        Z = np.dot(Why, ht) + by
        A = softmax(Z)  # probability distribution over the vocabulary
        caches['A' + str(i+1)] = A
        caches['h' + str(i+1)] = ht
    return caches

Cost Computation

In [5]:
def compute_cost(Y, caches):
    """Implement the cross-entropy cost, summed over the sequence.

    Arguments:
    Y -- true "label" one-hot vectors, shape (seq_length, vocab_size)
    caches -- dict of activations and hidden states from forward prop

    Returns:
    cost -- scalar cross-entropy cost
    """
    # caches holds h0..hT plus A1..AT, so len(caches) == 2 * seq_length + 1
    seq_length = len(caches) // 2
    cost = 0
    for i in range(seq_length):
        # reshape(-1, 1) infers vocab_size from the data, not a global
        y = Y[i].reshape(-1, 1)
        cost += - np.sum(y * np.log(caches['A' + str(i+1)]))
    return np.squeeze(cost)

Backward Propagation

In [6]:
def backward_propogation(X, Y, caches, parameters):
    """Implement backpropagation through time over the whole sequence.

    Arguments:
    X -- one-hot inputs, shape (seq_length, vocab_size)
    Y -- true one-hot labels of the data
    caches -- dict containing values of `A` and `h` for each char in forward prop
    parameters -- tuple containing parameters of the network

    Returns:
    grads -- tuple containing gradients of the network parameters
    """
    Wxh, Whh, Why, bh, by = parameters

    dWhh, dWxh, dWhy = np.zeros_like(Whh), np.zeros_like(Wxh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(caches['h0'])  # gradient flowing back from step i+1

    # caches holds h0..hT plus A1..AT, so len(caches) == 2 * seq_length + 1
    seq_length = len(caches) // 2

    for i in reversed(range(seq_length)):
        # reshape(-1, 1) infers vocab_size from the data, not a global
        y = Y[i].reshape(-1, 1)
        x = X[i].reshape(-1, 1)
        # Gradient of softmax + cross-entropy w.r.t. the logits is A - y.
        dZ = np.copy(caches['A' + str(i+1)]) - y
        dWhy += np.dot(dZ, caches['h' + str(i+1)].T)
        dby += dZ
        dht = np.dot(Why.T, dZ) + dhnext  # backprop into hidden state
        # Backprop through tanh: tanh'(z) = 1 - tanh(z)^2, with tanh(z) = h_t
        dhraw = dht * (1 - caches['h' + str(i+1)] * caches['h' + str(i+1)])
        dbh += dhraw
        dWhh += np.dot(dhraw, caches['h' + str(i)].T)
        dWxh += np.dot(dhraw, x.T)
        dhnext = np.dot(Whh.T, dhraw)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients

    grads = (dWxh, dWhh, dWhy, dbh, dby)
    return grads

Parameters Update

In [7]:
def update_parameters(parameters, grads, adagrad_mem_vars, learning_rate):
    """Update parameters of the network in place using the Adagrad update.

    Arguments:
    parameters -- tuple containing weights and biases of the network
    grads -- tuple containing the gradients of the parameters
    adagrad_mem_vars -- tuple of accumulated squared gradients, updated in place
    learning_rate -- rate of adagrad update

    Returns:
    (parameters, adagrad_mem_vars) -- the same tuples, mutated in place
    """
    # NOTE: the arrays are mutated in place via the augmented assignments below.
    for param, dparam, mem in zip(parameters, grads, adagrad_mem_vars):
        mem += dparam * dparam  # accumulate squared gradients
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    return (parameters, adagrad_mem_vars)

Sample text from model

In [8]:
def print_sample(ht, seed_ix, n, parameters):
    """Samples a sequence of integers from the model and prints it as text.

    ht -- memory (hidden) state to start sampling from
    seed_ix -- seed letter index for first time step
    n -- number of chars to extract
    parameters -- tuple containing network weights and biases

    NOTE(review): relies on module-level globals `vocab_size` and
    `ix_to_char` being defined before this is called.
    """
    Wxh, Whh, Why, bh, by = parameters
    x = np.eye(vocab_size)[seed_ix].reshape(vocab_size, 1)
    ixes = []
    for t in range(n):
        ht = np.tanh(np.dot(Wxh, x) + np.dot(Whh, ht) + bh)
        y = np.dot(Why, ht) + by
        p = np.exp(y) / np.sum(np.exp(y))
        # Sample from the distribution (rather than taking argmax) so the
        # generated text is varied; argmax would always yield the same
        # deterministic sequence for a given seed.
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        ixes.append(ix)  # BUG FIX: sampled index was previously discarded,
                         # so the printed sample was always empty
        x = np.eye(vocab_size)[ix].reshape(vocab_size, 1)
    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print('----\n %s \n----' % txt)
In [9]:
def get_one_hot(p, char_to_ix, data, vocab_size, seq_length=None):
    """Return one-hot inputs X and next-char targets Y for a window of data.

    p -- start position of the window in `data`
    char_to_ix -- map from character to vocabulary index
    data -- the full training text
    vocab_size -- vocabulary size
    seq_length -- window length; when omitted, falls back to the module-level
                  global `seq_length` (kept for backward compatibility with
                  existing callers)
    """
    if seq_length is None:
        # Legacy hidden dependency: the original code read the global directly.
        seq_length = globals()['seq_length']
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    # Targets are the inputs shifted one character ahead.
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
    X = np.eye(vocab_size)[inputs]
    Y = np.eye(vocab_size)[targets]
    return X, Y


In [27]:
def Model(data, seq_length, lr, char_to_ix, ix_to_char, num_of_iterations):
    """Train the character-level RNN and return the trained parameters.

    data -- training text
    seq_length -- characters per training window
    lr -- learning rate for the Adagrad update
    char_to_ix -- map from character to vocabulary index
    ix_to_char -- map from vocabulary index to character
    num_of_iterations -- number of training iterations

    NOTE(review): relies on module-level globals `hidden_size` and
    `vocab_size` being defined before this is called.
    """
    parameters, adagrad_mem_vars = initialize_parameters(hidden_size, vocab_size)
    n, p = 0, 0
    # Initial loss assuming a uniform prediction over the vocabulary.
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length
    while n < num_of_iterations:
        if p + seq_length + 1 >= len(data) or n == 0:
            hprev = np.zeros((hidden_size, 1)) # reset RNN memory
            p = 0 # go from start of data

        X, Y = get_one_hot(p, char_to_ix, data, vocab_size)
        caches = forward_propogation(X, parameters, seq_length, hprev)
        cost = compute_cost(Y, caches)
        grads = backward_propogation(X, Y, caches, parameters)
        parameters, adagrad_mem_vars = update_parameters(parameters, grads, adagrad_mem_vars, lr)
        smooth_loss = smooth_loss * 0.999 + cost * 0.001  # exponentially smoothed loss

        if n % 1000 == 0:
            print_sample(hprev, char_to_ix['a'], 200, parameters)
            print('Iteration: %d -- Cost: %0.3f' % (n, smooth_loss))

        hprev = caches['h' + str(seq_length)]  # carry hidden state to next window
        # BUG FIX: advance the data pointer and iteration counter. Without
        # these the loop never terminated and always trained on the same window.
        p += seq_length
        n += 1

    return parameters

Implementing the model on a text

In [12]:
# Read the training text; a `with` block ensures the file handle is closed
# (the original left the handle open for the life of the kernel).
with open('data/text-data.txt', 'r') as f:
    data = f.read()
chars = list(set(data)) # vocabulary
data_size, vocab_size = len(data), len(chars)
print ('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) } # maps char to it's index in vocabulary
ix_to_char = { i:ch for i,ch in enumerate(chars) } # maps index in vocabular to corresponding character
data has 748 characters, 42 unique.
In [29]:
# hyper-parameters
learning_rate = 0.1        # step size for the Adagrad update
hidden_size = 100          # number of units in the RNN hidden state
seq_length = 25            # characters per training window (truncated BPTT length)
num_of_iterations = 20000  # total training iterations
In [30]:
# Train the RNN on the loaded text using the hyper-parameters above.
parameters = Model(data, seq_length, learning_rate, char_to_ix, ix_to_char, num_of_iterations)
kwoj!wi nRA—— wj w .ycHrrhqagT:noh:ahvyqkSAnwpNtNTfpnk;hnnN
jnBqb iTIp,g,,v
 qvw fkwS;g;!qmcBbicAlY;nbbIkwHYO:IsfhT :wl
Iteration: 0 -- Cost: 93.442
 s wheng thert s dot bn,
B. wor TtoTar;
Had .asshThanb,
And as tha Telethabgas wberg penmerg te th;A,

Iy Sar me:
Thoudd herg d aberas and Then
Tnt ps waaverornasiactham!,
And bastevemer,r wing thitmth 
Iteration: 1000 -- Cost: 68.503
 veler anotrgr, ben,
I d agep had I clent there thallther iak lecI stok.

kere anis I shotheredint thavelest aseme way lealling per ais ay I dad
Ind baas tha siverged ia,
mar tredeelay,
And shadow tod  
Iteration: 2000 -- Cost: 39.597
 ksithet I bethathen betherusd we panted in a ore that fornind lowked bowd if bether keother waAna s wond themod the bess be he in trould woore as fir aiTh
And that Tas gre thathen boastougheramou the  
Iteration: 3000 -- Cost: 22.733
 gevelgarsing thes
Thotroa to way,
Iedt bthe ind to ssy,
I shadowe kno dnas marsi
Tsher win oon toould now trn took the passing there
Had worn them really about the sas morn loodstoowing the te 
Iteration: 4000 -- Cost: 11.318
 d woads ind hewinr eng
I d way leavelena t—ng in the one in thewother, as just as for that the passingrt wen toows ay,
Iedt ow morn wt pe the better claim,
Oha d took the one less traved both
Iteration: 5000 -- Cost: 6.830
 delling the traveled by len woore it lent that the passing there
Had worn them rhavesuho ben wit len wood,
And sorry I could notstr asshaithen the on the baps the bether corh bent in themu juh way lea 
Iteration: 6000 -- Cost: 3.523
 rre besep took beavellea by,
I sherdent the pit Inclaps sre pas graveler, the be owassed lot I soo
Thoughthom enghpa wavilge aive
Hac pin banr,
And he Tan
Tad gas owe in then ae
I s gratshacops theh a 
Iteration: 7000 -- Cost: 4.619
 nd ages hence:
Tso rowllong s morhaps the better claim,
Becaus  in theh the one thalllouvish the in the  aook the on the undergrowth;

shewans wans weer in ates hads wiverged in a yellow wood,
And tha 
Iteration: 8000 -- Cost: 4.595
 ss len len as trot troddyow yreasI wantes rnat hassy,
And en t ans ay In leakessharhads ohr cn bear;
And eors in bend t—e  herher stous that mornith as for thaviverged in a yellow wood,
And sorry I do 
Iteration: 9000 -- Cost: 2.303
 g toode orl that morning equallyhea—ss way leads oir could notr;
And that has marsing this with a sigh
Somewhere ages and ages hence:
Two roads diverged in a yellow wood,
And sorry I coubd
Tore agss i 
Iteration: 10000 -- Cost: 1.423
 st as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Thoubd by,
And looked down one as far as I could
To where it bent in the undergrowth;

Then took the other, as j 
Iteration: 11000 -- Cost: 0.915
 share I—
I took the one less traveled by,
And that has mareishen

I challlas lasen ino ste traveler, look the one lllenithen has mirsin ans lookes dn way
In leaves n  thet the pnd worn akep had tre th 
Iteration: 12000 -- Cost: 0.672
 get as I cops the passing thing in she beate
Talen that morrent ing hanted wear;
Though as for that the passing there
Had worn them really about the same,

And both that morning equally lay
In leaves  
Iteration: 13000 -- Cost: 0.545
 s firr ads that yh wanted waarn tookethea ben wood,
And having pe traveled by,
And that has marsing thirhaps the better claim,
Because it was grassy and worked dged wanteigrlenges I corhabem
The on to 
Iteration: 14000 -- Cost: 0.471
 nt be teelsherhept the passing that good the one less traveled by,
And that has maisith
And that has hais len bead wink on tood,eaubour air,
And wayd
Iges hadcld told,eduslero boad way lec way!
Yet Tw 
Iteration: 15000 -- Cost: 0.426
 s mareass woore in way

And both that morning equally lro the und th tond woollin with a sigh
Somewhere ages and ages hence:
Twough as fan traveled by,
And that has mar ing this with a sigh
Iteration: 16000 -- Cost: 6.138
 ing and ages hence:
Two roads diverged in a yellow wood,
And sorn th wbeagel and way,
I doubted if I should ever aly Ih th w ynl and both that marling there
Had worn them really about the same,

And b 
Iteration: 17000 -- Cost: 2.683
 s mas merhith t morning equr way I could
Tore and aing ing thass ond both thet the better claim,
Because ing thaveler bing pass anin b abe the und worn woorsing horn lookst heages
Tokept the first for 
Iteration: 18000 -- Cost: 1.299
 d looked bout the pes it pe that morning equally lay
In leaves no step had trodden black.
Oh, I kept traver dtep had traveled by,
And that has marsithit the one traveler, long I stood
And looked down  
Iteration: 19000 -- Cost: 1.071