from theano.sandbox import cuda
cuda.use('gpu1')
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
We're going to download the collected works of Nietzsche to use as our data for this class.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))
corpus length: 600901
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
total chars: 86
Sometimes it's useful to have a zero value in the dataset, e.g. for padding
chars.insert(0, "\0")
''.join(chars[1:-6])
'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'
Map from chars to indices and back again
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
idx will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)
idx = [char_indices[c] for c in text]
idx[:10]
[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]
''.join(indices_char[i] for i in idx[:70])
'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'
Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters
cs=3
c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)]
Our inputs
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])
Our output
y = np.stack(c4_dat[:-2])
The first 4 inputs and outputs
x1[:4], x2[:4], x3[:4]
(array([40, 30, 29, 1]), array([42, 25, 1, 43]), array([29, 27, 1, 45]))
y[:4]
array([30, 29, 1, 40])
x1.shape, y.shape
((200297,), (200297,))
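As an added check, we can decode these back into characters - each 3-character input is followed by its target character ('PRE' is followed by 'F', and so on):
[(''.join(indices_char[i] for i in (x1[n], x2[n], x3[n])), indices_char[y[n]]) for n in range(4)]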
The number of latent factors to create (i.e. the size of the embedding matrix)
n_fac = 42
Create inputs and embedding outputs for each of our 3 character inputs
def embedding_input(name, n_in, n_out):
inp = Input(shape=(1,), dtype='int64', name=name)
emb = Embedding(n_in, n_out, input_length=1)(inp)
return inp, Flatten()(emb)
c1_in, c1 = embedding_input('c1', vocab_size, n_fac)
c2_in, c2 = embedding_input('c2', vocab_size, n_fac)
c3_in, c3 = embedding_input('c3', vocab_size, n_fac)
Pick a size for our hidden state
n_hidden = 256
This is the 'green arrow' from our diagram - the layer operation from input to hidden.
dense_in = Dense(n_hidden, activation='relu')
Our first hidden activation is simply this function applied to the result of the embedding of the first character.
c1_hidden = dense_in(c1)
This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden.
dense_hidden = Dense(n_hidden, activation='tanh')
Our second and third hidden activations add the previous hidden state (after applying dense_hidden) to the new input state.
c2_dense = dense_in(c2)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = merge([c2_dense, hidden_2])
c3_dense = dense_in(c3)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])
This is the 'blue arrow' from our diagram - the layer operation from hidden to output.
dense_out = Dense(vocab_size, activation='softmax')
The third hidden state is the input to our output layer.
c4_out = dense_out(c3_hidden)
model = Model([c1_in, c2_in, c3_in], c4_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.optimizer.lr=0.000001
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
Epoch 1/4 200297/200297 [==============================] - 4s - loss: 4.2459 Epoch 2/4 200297/200297 [==============================] - 5s - loss: 3.6853 Epoch 3/4 200297/200297 [==============================] - 5s - loss: 3.3444 Epoch 4/4 200297/200297 [==============================] - 5s - loss: 3.1719
<keras.callbacks.History at 0x7ff3778ec510>
model.optimizer.lr=0.01
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
Epoch 1/4 200297/200297 [==============================] - 5s - loss: 3.0829 Epoch 2/4 200297/200297 [==============================] - 5s - loss: 3.0314 Epoch 3/4 200297/200297 [==============================] - 4s - loss: 2.9973 Epoch 4/4 200297/200297 [==============================] - 5s - loss: 2.9722
<keras.callbacks.History at 0x7ff3778ecdd0>
model.optimizer.lr.set_value(0.000001)
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
Epoch 1/4 200297/200297 [==============================] - 5s - loss: 4.4125 Epoch 2/4 200297/200297 [==============================] - 5s - loss: 4.2799 Epoch 3/4 200297/200297 [==============================] - 5s - loss: 4.0000 Epoch 4/4 200297/200297 [==============================] - 5s - loss: 3.5942
<keras.callbacks.History at 0x7ff3788d4d10>
model.optimizer.lr.set_value(0.01)
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)
Epoch 1/4 200297/200297 [==============================] - 5s - loss: 7.8651 Epoch 2/4 200297/200297 [==============================] - 5s - loss: 5.1607 Epoch 3/4 200297/200297 [==============================] - 5s - loss: 4.7043 Epoch 4/4 200297/200297 [==============================] - 5s - loss: 4.7026
<keras.callbacks.History at 0x7ff37ba08a50>
def get_next(inp):
idxs = [char_indices[c] for c in inp]
arrs = [np.array(i)[np.newaxis] for i in idxs]
p = model.predict(arrs)
i = np.argmax(p)
return chars[i]
get_next('phi')
'l'
get_next(' th')
'e'
get_next(' an')
'd'
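We can also feed the model's own predictions back in to generate a longer stretch of text. Here's a minimal added sketch that keeps just the last 3 characters as context at each step (don't expect anything readable from a 3-character model):
def generate_chars(seed, n=40):
    # Repeatedly append the predicted next character, using only the
    # last three characters as the model's input each time.
    res = seed
    for _ in range(n):
        res += get_next(res[-3:])
    return res
generate_chars(' th')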
This is the size of our unrolled RNN.
cs=8
For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.
c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
for n in range(cs)]
Then create a list of the next character in each of these series. This will be the labels for our model.
c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs, cs)]
xs = [np.stack(c[:-2]) for c in c_in_dat]
len(xs), xs[0].shape
(8, (75110,))
y = np.stack(c_out_dat[:-2])
So each column below is one series of 8 characters from the text.
[xs[n][:cs] for n in range(cs)]
[array([40, 1, 33, 2, 72, 67, 73, 2]), array([42, 1, 38, 44, 2, 9, 61, 73]), array([29, 43, 31, 71, 54, 9, 58, 61]), array([30, 45, 2, 74, 2, 76, 67, 58]), array([25, 40, 73, 73, 76, 61, 24, 71]), array([27, 40, 61, 61, 68, 54, 2, 58]), array([29, 39, 54, 2, 66, 73, 33, 2]), array([ 1, 43, 73, 62, 54, 2, 72, 67])]
...and this is the next character after each sequence.
y[:cs]
array([ 1, 33, 2, 72, 67, 73, 2, 68])
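As an added check, reading down the first column reconstructs the first 8 characters of the corpus, and y[0] is the character that follows them:
''.join(indices_char[xs[n][0]] for n in range(cs)), indices_char[y[0]]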
n_fac = 42
def embedding_input(name, n_in, n_out):
inp = Input(shape=(1,), dtype='int64', name=name+'_in')
emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)
return inp, Flatten()(emb)
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]
n_hidden = 256
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')
The first character of each sequence goes through dense_in(), to create our first hidden activations.
hidden = dense_in(c_ins[0][1])
Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state.
for i in range(1,cs):
c_dense = dense_in(c_ins[i][1])
hidden = dense_hidden(hidden)
hidden = merge([c_dense, hidden])
Putting the final hidden state through dense_out() gives us our output.
c_out = dense_out(hidden)
So now we can create our model.
model = Model([c[0] for c in c_ins], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.fit(xs, y, batch_size=64, nb_epoch=12)
Epoch 1/12 75110/75110 [==============================] - 3s - loss: 2.5385 Epoch 2/12 75110/75110 [==============================] - 3s - loss: 2.2645 Epoch 3/12 75110/75110 [==============================] - 3s - loss: 2.1596 Epoch 4/12 75110/75110 [==============================] - 3s - loss: 2.0888 Epoch 5/12 75110/75110 [==============================] - 3s - loss: 2.0355 Epoch 6/12 75110/75110 [==============================] - 3s - loss: 1.9897 Epoch 7/12 75110/75110 [==============================] - 3s - loss: 1.9506 Epoch 8/12 75110/75110 [==============================] - 3s - loss: 1.9149 Epoch 9/12 75110/75110 [==============================] - 3s - loss: 1.8840 Epoch 10/12 75110/75110 [==============================] - 3s - loss: 1.8546 Epoch 11/12 75110/75110 [==============================] - 3s - loss: 1.8293 Epoch 12/12 75110/75110 [==============================] - 3s - loss: 1.8050
<keras.callbacks.History at 0x7f25579a80d0>
def get_next(inp):
idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]
p = model.predict(idxs)
return chars[np.argmax(p)]
get_next('for thos')
'e'
get_next('part of ')
't'
get_next('queens a')
'n'
n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)
This is nearly exactly equivalent to the RNN we built ourselves in the previous section.
model=Sequential([
Embedding(vocab_size, n_fac, input_length=cs),
SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
Dense(vocab_size, activation='softmax')
])
model.summary()
____________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ==================================================================================================== embedding_5 (Embedding) (None, 8, 42) 3612 embedding_input_2[0][0] ____________________________________________________________________________________________________ simplernn_2 (SimpleRNN) (None, 256) 76544 embedding_5[0][0] ____________________________________________________________________________________________________ dense_2 (Dense) (None, 86) 22102 simplernn_2[0][0] ==================================================================================================== Total params: 102258 ____________________________________________________________________________________________________
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
model.fit(np.concatenate(xs,axis=1), y, batch_size=64, nb_epoch=8)
Epoch 1/8 75110/75110 [==============================] - 3s - loss: 2.7939 Epoch 2/8 75110/75110 [==============================] - 3s - loss: 2.2970 Epoch 3/8 75110/75110 [==============================] - 3s - loss: 2.0814 Epoch 4/8 75110/75110 [==============================] - 3s - loss: 1.9416 Epoch 5/8 75110/75110 [==============================] - 3s - loss: 1.8406 Epoch 6/8 75110/75110 [==============================] - 3s - loss: 1.7625 Epoch 7/8 75110/75110 [==============================] - 3s - loss: 1.6960 Epoch 8/8 75110/75110 [==============================] - 3s - loss: 1.6421
<keras.callbacks.History at 0x7fa18f2c0890>
def get_next_keras(inp):
idxs = [char_indices[c] for c in inp]
arrs = np.array(idxs)[np.newaxis,:]
p = model.predict(arrs)[0]
return chars[np.argmax(p)]
get_next_keras('this is ')
't'
get_next_keras('part of ')
't'
get_next_keras('queens a')
'n'
To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)
Here, c_out_dat is identical to c_in_dat, but moved across 1 character.
#c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
# for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)]
for n in range(cs)]
ys = [np.stack(c[:-2]) for c in c_out_dat]
Reading down each column shows one set of inputs and outputs.
[xs[n][:cs] for n in range(cs)]
[array([40, 1, 33, 2, 72, 67, 73, 2]), array([42, 1, 38, 44, 2, 9, 61, 73]), array([29, 43, 31, 71, 54, 9, 58, 61]), array([30, 45, 2, 74, 2, 76, 67, 58]), array([25, 40, 73, 73, 76, 61, 24, 71]), array([27, 40, 61, 61, 68, 54, 2, 58]), array([29, 39, 54, 2, 66, 73, 33, 2]), array([ 1, 43, 73, 62, 54, 2, 72, 67])]
[ys[n][:cs] for n in range(cs)]
[array([42, 1, 38, 44, 2, 9, 61, 73]), array([29, 43, 31, 71, 54, 9, 58, 61]), array([30, 45, 2, 74, 2, 76, 67, 58]), array([25, 40, 73, 73, 76, 61, 24, 71]), array([27, 40, 61, 61, 68, 54, 2, 58]), array([29, 39, 54, 2, 66, 73, 33, 2]), array([ 1, 43, 73, 62, 54, 2, 72, 67]), array([ 1, 33, 2, 72, 67, 73, 2, 68])]
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')
We're going to pass a vector of all zeros as our starting point - here's the input layer for that:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)
outs = []
for i in range(cs):
c_dense = dense_in(c_ins[i][1])
hidden = dense_hidden(hidden)
hidden = merge([c_dense, hidden], mode='sum')
# every layer now has an output
outs.append(dense_out(hidden))
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape
(75110, 42)
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)
Epoch 1/12 75110/75110 [==============================] - 7s - loss: 20.0841 - output_loss_1: 2.7123 - output_loss_2: 2.5681 - output_loss_3: 2.5143 - output_loss_4: 2.4739 - output_loss_5: 2.4675 - output_loss_6: 2.4442 - output_loss_7: 2.4627 - output_loss_8: 2.4410 Epoch 2/12 75110/75110 [==============================] - 7s - loss: 17.8335 - output_loss_1: 2.5124 - output_loss_2: 2.3529 - output_loss_3: 2.2368 - output_loss_4: 2.1686 - output_loss_5: 2.1540 - output_loss_6: 2.1337 - output_loss_7: 2.1520 - output_loss_8: 2.1232 Epoch 3/12 75110/75110 [==============================] - 7s - loss: 17.2340 - output_loss_1: 2.4967 - output_loss_2: 2.3306 - output_loss_3: 2.1766 - output_loss_4: 2.0814 - output_loss_5: 2.0529 - output_loss_6: 2.0291 - output_loss_7: 2.0475 - output_loss_8: 2.0192 Epoch 4/12 75110/75110 [==============================] - 7s - loss: 16.8647 - output_loss_1: 2.4896 - output_loss_2: 2.3218 - output_loss_3: 2.1437 - output_loss_4: 2.0278 - output_loss_5: 1.9901 - output_loss_6: 1.9600 - output_loss_7: 1.9768 - output_loss_8: 1.9549 Epoch 5/12 75110/75110 [==============================] - 7s - loss: 16.6200 - output_loss_1: 2.4858 - output_loss_2: 2.3158 - output_loss_3: 2.1287 - output_loss_4: 1.9941 - output_loss_5: 1.9481 - output_loss_6: 1.9151 - output_loss_7: 1.9276 - output_loss_8: 1.9047 Epoch 6/12 75110/75110 [==============================] - 7s - loss: 16.4396 - output_loss_1: 2.4835 - output_loss_2: 2.3121 - output_loss_3: 2.1148 - output_loss_4: 1.9705 - output_loss_5: 1.9188 - output_loss_6: 1.8774 - output_loss_7: 1.8937 - output_loss_8: 1.8689 Epoch 7/12 75110/75110 [==============================] - 7s - loss: 16.3016 - output_loss_1: 2.4825 - output_loss_2: 2.3090 - output_loss_3: 2.1054 - output_loss_4: 1.9523 - output_loss_5: 1.8957 - output_loss_6: 1.8514 - output_loss_7: 1.8639 - output_loss_8: 1.8414 Epoch 8/12 75110/75110 [==============================] - 7s - loss: 16.1862 - output_loss_1: 2.4807 - output_loss_2: 2.3076 - output_loss_3: 2.0974 - output_loss_4: 1.9391 - output_loss_5: 1.8757 - output_loss_6: 1.8284 - output_loss_7: 1.8413 - output_loss_8: 1.8161 Epoch 9/12 75110/75110 [==============================] - 7s - loss: 16.0887 - output_loss_1: 2.4802 - output_loss_2: 2.3055 - output_loss_3: 2.0913 - output_loss_4: 1.9275 - output_loss_5: 1.8603 - output_loss_6: 1.8101 - output_loss_7: 1.8200 - output_loss_8: 1.7938 Epoch 10/12 75110/75110 [==============================] - 7s - loss: 16.0118 - output_loss_1: 2.4790 - output_loss_2: 2.3038 - output_loss_3: 2.0882 - output_loss_4: 1.9172 - output_loss_5: 1.8458 - output_loss_6: 1.7946 - output_loss_7: 1.8049 - output_loss_8: 1.7782 Epoch 11/12 75110/75110 [==============================] - 7s - loss: 15.9393 - output_loss_1: 2.4784 - output_loss_2: 2.3027 - output_loss_3: 2.0827 - output_loss_4: 1.9095 - output_loss_5: 1.8341 - output_loss_6: 1.7803 - output_loss_7: 1.7885 - output_loss_8: 1.7631 Epoch 12/12 75110/75110 [==============================] - 7s - loss: 15.8785 - output_loss_1: 2.4773 - output_loss_2: 2.3021 - output_loss_3: 2.0788 - output_loss_4: 1.9015 - output_loss_5: 1.8239 - output_loss_6: 1.7680 - output_loss_7: 1.7770 - output_loss_8: 1.7498
<keras.callbacks.History at 0x7fa168d005d0>
def get_nexts(inp):
idxs = [char_indices[c] for c in inp]
arrs = [np.array(i)[np.newaxis] for i in idxs]
p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
print(list(inp))
return [chars[np.argmax(o)] for o in p]
get_nexts(' this is')
[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
['t', 'h', 'e', 't', ' ', 'c', 's', ' ']
get_nexts(' part of')
[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']
['t', 'o', 'r', 't', ' ', 'o', 'f', ' ']
n_hidden, n_fac, cs, vocab_size
(256, 42, 8, 86)
To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer.
model=Sequential([
Embedding(vocab_size, n_fac, input_length=cs),
SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.summary()
____________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ==================================================================================================== embedding_6 (Embedding) (None, 8, 42) 3612 embedding_input_3[0][0] ____________________________________________________________________________________________________ simplernn_3 (SimpleRNN) (None, 8, 256) 76544 embedding_6[0][0] ____________________________________________________________________________________________________ timedistributed_1 (TimeDistribut (None, 8, 86) 22102 simplernn_3[0][0] ==================================================================================================== Total params: 102258 ____________________________________________________________________________________________________
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
xs[0].shape
(75110,)
x_rnn=np.stack(np.squeeze(xs), axis=1)
y_rnn=np.atleast_3d(np.stack(ys, axis=1))
x_rnn.shape, y_rnn.shape
((75110, 8), (75110, 8, 1))
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)
Epoch 1/8 75110/75110 [==============================] - 4s - loss: 2.4284 Epoch 2/8 75110/75110 [==============================] - 4s - loss: 2.0006 Epoch 3/8 75110/75110 [==============================] - 4s - loss: 1.8863 Epoch 4/8 75110/75110 [==============================] - 4s - loss: 1.8264 Epoch 5/8 75110/75110 [==============================] - 4s - loss: 1.7882 Epoch 6/8 75110/75110 [==============================] - 4s - loss: 1.7613 Epoch 7/8 75110/75110 [==============================] - 4s - loss: 1.7417 Epoch 8/8 75110/75110 [==============================] - 4s - loss: 1.7258
<keras.callbacks.History at 0x7f82761cc990>
def get_nexts_keras(inp):
idxs = [char_indices[c] for c in inp]
arr = np.array(idxs)[np.newaxis,:]
p = model.predict(arr)[0]
print(list(inp))
return [chars[np.argmax(o)] for o in p]
get_nexts_keras(' this is')
[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
['t', 'h', 'e', 's', ' ', 'i', 's', ' ']
This is the keras version of the theano model that we're about to create.
model=Sequential([
SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
activation='relu', inner_init='identity'),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam())
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)
oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)
oh_x_rnn.shape, oh_y_rnn.shape
((75110, 8, 86), (75110, 8, 86))
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)
Epoch 1/8 75110/75110 [==============================] - 4s - loss: 2.4383 Epoch 2/8 75110/75110 [==============================] - 4s - loss: 2.0318 Epoch 3/8 75110/75110 [==============================] - 4s - loss: 1.9195 Epoch 4/8 75110/75110 [==============================] - 4s - loss: 1.8553 Epoch 5/8 75110/75110 [==============================] - 4s - loss: 1.8133 Epoch 6/8 75110/75110 [==============================] - 4s - loss: 1.7829 Epoch 7/8 75110/75110 [==============================] - 4s - loss: 1.7593 Epoch 8/8 75110/75110 [==============================] - 4s - loss: 1.7410
<keras.callbacks.History at 0x7f8210725c90>
def get_nexts_oh(inp):
idxs = np.array([char_indices[c] for c in inp])
arr = to_categorical(idxs, vocab_size)
p = model.predict(arr[np.newaxis,:])[0]
print(list(inp))
return [chars[np.argmax(o)] for o in p]
get_nexts_oh(' this is')
[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
['t', 'h', 'e', 's', ' ', 'i', 's', ' ']
bs=64
A stateful model is easy to create (just add "stateful=True") but harder to train. We had to add batchnorm and use LSTM to get reasonable results.
When using stateful in keras, you have to also add 'batch_input_shape' to the first layer, and fix the batch size there.
model=Sequential([
Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
BatchNormalization(),
LSTM(n_hidden, return_sequences=True, stateful=True),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
Since we're using a fixed batch shape, we have to ensure our inputs and outputs are an exact multiple of the batch size.
mx = len(x_rnn)//bs*bs
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
Epoch 1/4 75072/75072 [==============================] - 13s - loss: 2.2051 Epoch 2/4 75072/75072 [==============================] - 13s - loss: 1.9621 Epoch 3/4 75072/75072 [==============================] - 13s - loss: 1.8893 Epoch 4/4 75072/75072 [==============================] - 13s - loss: 1.8453
<keras.callbacks.History at 0x7fa16f1d2690>
model.optimizer.lr=1e-4
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
Epoch 1/4 75072/75072 [==============================] - 13s - loss: 1.8132 Epoch 2/4 75072/75072 [==============================] - 13s - loss: 1.7877 Epoch 3/4 75072/75072 [==============================] - 13s - loss: 1.7663 Epoch 4/4 75072/75072 [==============================] - 13s - loss: 1.7475
<keras.callbacks.History at 0x7fa1773b8c10>
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)
Epoch 1/4 75072/75072 [==============================] - 13s - loss: 1.7308 Epoch 2/4 75072/75072 [==============================] - 13s - loss: 1.7155 Epoch 3/4 75072/75072 [==============================] - 13s - loss: 1.7014 Epoch 4/4 75072/75072 [==============================] - 13s - loss: 1.6881
<keras.callbacks.History at 0x7fa1773b8d50>
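Since a stateful model carries its hidden state across batches, it can also help to reset that state between epochs. Keras provides reset_states() for this; here's a minimal sketch of that pattern (the runs above just call fit() directly):
for epoch in range(4):
    # Train one pass over the data, then clear the carried-over hidden state
    # so the next epoch starts from scratch.
    model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=1, shuffle=False)
    model.reset_states()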
n_input = vocab_size
n_output = vocab_size
Using raw theano, we have to create our weight matrices and bias vectors ourselves - here are the functions we'll use to do so (using glorot initialization).
The return values are wrapped in shared(), which is how we tell theano that it can manage this data (copying it to and from the GPU as necessary).
def init_wgts(rows, cols):
scale = math.sqrt(2/rows)
return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))
def init_bias(rows):
return shared(np.zeros(rows, dtype=np.float32))
We return the weights and biases together as a tuple. For the hidden weights, we'll use an identity initialization (as recommended by Hinton).
def wgts_and_bias(n_in, n_out):
return init_wgts(n_in, n_out), init_bias(n_out)
def id_and_bias(n):
return shared(np.eye(n, dtype=np.float32)), init_bias(n)
Theano doesn't actually do any computations until we explicitly compile and evaluate the function (at which point it'll be turned into CUDA code and sent off to the GPU). So our job is to describe the computations that we'll want theano to do - the first step is to tell theano what inputs we'll be providing to our computation:
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')
all_args = [t_h0, t_inp, t_outp, lr]
Now we're ready to create our initial weight matrices.
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))
Theano handles looping by using the GPU scan operation. We have to tell theano what to do at each step through the scan - this is the function we'll use, which does a single forward pass for one character:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
# Calculate the hidden activations
h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)
# Calculate the output activations
y = nnet.softmax(T.dot(h, W_y) + b_y)
# Return both (the 'Flatten()' is to work around a theano bug)
return h, T.flatten(y, 1)
Now we can provide everything necessary for the scan operation, so we can set that up - we have to pass in the function to call at each step, the sequence to step through, the initial values of the outputs, and any other arguments to pass to the step function.
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
We can now calculate our loss function, and all of our gradients, with just a couple of lines of code!
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)
We even have to show theano how to do SGD - so we set up this dictionary of updates to apply after every forward pass, which applies the standard SGD update rule to every weight.
def upd_dict(wgts, grads, lr):
return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})
upd = upd_dict(w_all, g_all, lr)
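In other words, every weight is updated as w ← w − g·lr, where g is the gradient of the total cross-entropy error with respect to that weight.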
We're finally ready to compile the function!
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape
((75110, 8, 86), (75110, 8, 86))
To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time.
err=0.0; l_rate=0.01
for i in range(len(X)):
err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
if i % 1000 == 999:
print ("Error:{:.3f}".format(err/1000))
err=0.0
Error:25.196 Error:21.489 Error:20.900 Error:19.913 Error:18.816 Error:19.202 Error:19.066 Error:18.473 Error:17.942 Error:18.251 Error:17.489 Error:17.570 Error:18.371 Error:17.331 Error:16.807 Error:17.681 Error:17.401 Error:17.136 Error:16.830 Error:16.651 Error:16.518 Error:16.430 Error:16.687 Error:16.161 Error:16.775 Error:16.566 Error:16.053 Error:16.296 Error:16.240 Error:16.454 Error:16.699 Error:16.396 Error:16.644 Error:16.328 Error:15.990 Error:16.644 Error:15.981 Error:16.359 Error:16.042 Error:16.326 Error:15.361 Error:15.690 Error:15.742 Error:16.048 Error:15.955 Error:15.866 Error:15.571 Error:16.069 Error:15.997 Error:16.030 Error:15.230 Error:15.612 Error:14.918 Error:14.821 Error:15.580 Error:15.380 Error:14.650 Error:15.499 Error:15.110 Error:14.972 Error:15.034 Error:15.427 Error:15.236 Error:15.037 Error:14.768 Error:14.781 Error:14.329 Error:14.726 Error:15.229 Error:14.809 Error:15.144 Error:14.755 Error:14.440 Error:14.431 Error:14.464
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)
act = np.argmax(X[6], axis=1)
[indices_char[o] for o in act]
['t', 'h', 'e', 'n', '?', ' ', 'I', 's']
[indices_char[o] for o in pred]
['h', 'e', ' ', ' ', ' ', 'T', 'n', ' ']
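For comparison, the actual next characters for this sequence are just the inputs shifted along by one; Y[6] holds their one-hot encoding (an added check):
[indices_char[o] for o in np.argmax(Y[6], axis=1)]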
Now we're going to try to repeat the above theano RNN, using just pure python (and numpy). That means we have to do everything ourselves, including defining the basic functions of a neural net! Below are all of the definitions, along with tests to check that they give the same answers as theano. The functions ending in _d are the derivatives of each function.
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_d(x):
output = sigmoid(x)
return output*(1-output)
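sigmoid isn't checked against theano below, so here's a small added finite-difference check of its derivative:
test_x = np.array([0.5, -1., 2.])
np.allclose(sigmoid_d(test_x), (sigmoid(test_x+1e-6)-sigmoid(test_x-1e-6))/2e-6)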
def relu(x): return np.maximum(0., x)
def relu_d(x): return (x > 0.)*1.
relu(np.array([3.,-3.])), relu_d(np.array([3.,-3.]))
(array([ 3., 0.]), array([ 1., 0.]))
def dist(a,b): return pow(a-b,2)
def dist_d(a,b): return 2*(a-b)
import pdb
eps = 1e-7
def x_entropy(pred, actual):
return -np.sum(actual * np.log(np.clip(pred, eps, 1-eps)))
def x_entropy_d(pred, actual): return -actual/pred
def softmax(x): return np.exp(x)/np.exp(x).sum()
def softmax_d(x):
sm = softmax(x)
res = np.expand_dims(-sm,-1)*sm
res[np.diag_indices_from(res)] = sm*(1-sm)
return res
test_preds = np.array([0.2,0.7,0.1])
test_actuals = np.array([0.,1.,0.])
nnet.categorical_crossentropy(test_preds, test_actuals).eval()
array(0.35667494393873245)
x_entropy(test_preds, test_actuals)
0.35667494393873245
test_inp = T.dvector()
test_out = nnet.categorical_crossentropy(test_inp, test_actuals)
test_grad = theano.function([test_inp], T.grad(test_out, test_inp))
test_grad(test_preds)
array([-0. , -1.4286, -0. ])
x_entropy_d(test_preds, test_actuals)
array([-0. , -1.4286, -0. ])
pre_pred = random(oh_x_rnn[0][0].shape)
preds = softmax(pre_pred)
actual = oh_x_rnn[0][0]
np.allclose(softmax_d(pre_pred).dot(x_entropy_d(preds,actual)), preds-actual)
True
softmax(test_preds)
array([ 0.2814, 0.464 , 0.2546])
nnet.softmax(test_preds).eval()
array([[ 0.2814, 0.464 , 0.2546]])
test_out = T.flatten(nnet.softmax(test_inp))
test_grad = theano.function([test_inp], theano.gradient.jacobian(test_out, test_inp))
test_grad(test_preds)
array([[ 0.2022, -0.1306, -0.0717], [-0.1306, 0.2487, -0.1181], [-0.0717, -0.1181, 0.1898]])
softmax_d(test_preds)
array([[ 0.2022, -0.1306, -0.0717], [-0.1306, 0.2487, -0.1181], [-0.0717, -0.1181, 0.1898]])
act=relu
act_d = relu_d
loss=x_entropy
loss_d=x_entropy_d
We also have to define our own scan function. Since we're not worrying about running things in parallel, it's very simple to implement:
def scan(fn, start, seq):
res = []
prev = start
for s in seq:
app = fn(prev, s)
res.append(app)
prev = app
return res
...for instance, scan on + is the cumulative sum.
scan(lambda prev,curr: prev+curr, 0, range(5))
[0, 1, 3, 6, 10]
Let's now build the functions to do the forward and backward passes of our RNN. First, define our data and shape.
inp = oh_x_rnn
outp = oh_y_rnn
n_input = vocab_size
n_output = vocab_size
inp.shape, outp.shape
((75110, 8, 86), (75110, 8, 86))
Here's the function to do a single forward pass of an RNN, for a single character.
def one_char(prev, item):
# Previous state
tot_loss, pre_hidden, pre_pred, hidden, ypred = prev
# Current inputs and output
x, y = item
pre_hidden = np.dot(x,w_x) + np.dot(hidden,w_h)
hidden = act(pre_hidden)
pre_pred = np.dot(hidden,w_y)
ypred = softmax(pre_pred)
return (
# Keep track of loss so we can report it
tot_loss+loss(ypred, y),
# Used in backprop
pre_hidden, pre_pred,
# Used in next iteration
hidden,
# To provide predictions
ypred)
We use scan to apply the above to a whole sequence of characters.
def get_chars(n): return zip(inp[n], outp[n])
def one_fwd(n): return scan(one_char, (0,0,0,np.zeros(n_hidden),0), get_chars(n))
Now we can define the backward step. We use a loop to go through every element of the sequence. The derivatives are calculated by applying the chain rule at each step, and accumulating the gradients across the sequence.
# "Columnify" a vector
def col(x): return x[:,newaxis]
def one_bkwd(args, n):
global w_x,w_y,w_h
i=inp[n] # 8x86
o=outp[n] # 8x86
d_pre_hidden = np.zeros(n_hidden) # 256
for p in reversed(range(len(i))):
totloss, pre_hidden, pre_pred, hidden, ypred = args[p]
x=i[p] # 86
y=o[p] # 86
d_pre_pred = softmax_d(pre_pred).dot(loss_d(ypred,y)) # 86
d_pre_hidden = (np.dot(d_pre_hidden, w_h.T)
+ np.dot(d_pre_pred,w_y.T)) * act_d(pre_hidden) # 256
# d(loss)/d(w_y) = d(loss)/d(pre_pred) * d(pre_pred)/d(w_y)
w_y -= col(hidden) * d_pre_pred * alpha
# d(loss)/d(w_h) = d(loss)/d(pre_hidden[p-1]) * d(pre_hidden[p-1])/d(w_h)
if (p>0): w_h -= args[p-1][3].dot(d_pre_hidden) * alpha
w_x -= col(x)*d_pre_hidden * alpha
return d_pre_hidden
Now we can set up our initial weight matrices. Note that we're not using bias at all in this example, in order to keep things simpler.
scale=math.sqrt(2./n_input)
w_x = normal(scale=scale, size=(n_input,n_hidden))
w_y = normal(scale=scale, size=(n_hidden, n_output))
w_h = np.eye(n_hidden, dtype=np.float32)
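Before training, we can run a single forward pass as an added sanity check - the final element of the result holds the total loss for that sequence (the exact value depends on the random initialization):
res = one_fwd(0)
len(res), res[-1][0]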
Our loop looks much like the theano loop in the previous section, except that we have to call the backwards step ourselves.
overallError=0
alpha=0.0001
for n in range(10000):
res = one_fwd(n)
overallError+=res[-1][0]
deriv = one_bkwd(res, n)
if(n % 1000 == 999):
print ("Error:{:.4f}; Gradient:{:.5f}".format(
overallError/1000, np.linalg.norm(deriv)))
overallError=0
Error:35.2380; Gradient:2.90002 Error:32.9176; Gradient:2.71170 Error:31.0649; Gradient:4.14135 Error:29.9798; Gradient:3.40467 Error:29.2453; Gradient:3.79049 Error:29.0070; Gradient:3.39826 Error:28.2358; Gradient:4.30422 Error:28.0086; Gradient:2.92011 Error:27.6885; Gradient:4.03503 Error:27.6905; Gradient:3.18526
Identical to the last keras rnn, but a GRU!
model=Sequential([
GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
activation='relu', inner_init='identity'),
TimeDistributed(Dense(vocab_size, activation='softmax')),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam())
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)
Epoch 1/8 75110/75110 [==============================] - 9s - loss: 2.3991 Epoch 2/8 75110/75110 [==============================] - 9s - loss: 1.9818 Epoch 3/8 75110/75110 [==============================] - 9s - loss: 1.8704 Epoch 4/8 75110/75110 [==============================] - 9s - loss: 1.8070 Epoch 5/8 75110/75110 [==============================] - 9s - loss: 1.7653 Epoch 6/8 75110/75110 [==============================] - 10s - loss: 1.7346 Epoch 7/8 75110/75110 [==============================] - 9s - loss: 1.7108 Epoch 8/8 75110/75110 [==============================] - 9s - loss: 1.6918
<keras.callbacks.History at 0x7f820e8bae50>
get_nexts_oh(' this is')
[' ', 't', 'h', 'i', 's', ' ', 'i', 's']
['t', 'h', 'e', 's', ' ', 'i', 's', ' ']
The theano GRU looks just like the simple theano RNN, except for the use of the reset and update gates. Each of these gates requires its own hidden and input weights, so we add those to our weight matrices.
W_h = id_and_bias(n_hidden)
W_x = init_wgts(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
rW_h = init_wgts(n_hidden, n_hidden)
rW_x = wgts_and_bias(n_input, n_hidden)
uW_h = init_wgts(n_hidden, n_hidden)
uW_x = wgts_and_bias(n_input, n_hidden)
w_all = list(chain.from_iterable([W_h, W_y, uW_x, rW_x]))
w_all.extend([W_x, uW_h, rW_h])
Here's the definition of a gate - it's just a sigmoid applied to the sum of the input and hidden dot products (plus a bias).
def gate(x, h, W_h, W_x, b_x):
return nnet.sigmoid(T.dot(x, W_x) + b_x + T.dot(h, W_h))
Our step is nearly identical to before, except that we multiply our hidden state by our reset gate, and we update our hidden state based on the update gate.
def step(x, h, W_h, b_h, W_y, b_y, uW_x, ub_x, rW_x, rb_x, W_x, uW_h, rW_h):
reset = gate(x, h, rW_h, rW_x, rb_x)
update = gate(x, h, uW_h, uW_x, ub_x)
h_new = gate(x, h * reset, W_h, W_x, b_h)
h = update*h + (1-update)*h_new
y = nnet.softmax(T.dot(h, W_y) + b_y)
return h, T.flatten(y, 1)
Everything from here on is identical to our simple RNN in theano.
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)
err=0.0; l_rate=0.1
for i in range(len(X)):
err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
if i % 1000 == 999:
l_rate *= 0.95
print ("Error:{:.2f}".format(err/1000))
err=0.0
Error:21.89 Error:20.52 Error:20.55 Error:19.87 Error:19.01 Error:19.58 Error:19.45 Error:18.93 Error:18.51 Error:18.75 Error:18.16 Error:18.18 Error:18.90 Error:18.03 Error:17.50 Error:18.39 Error:18.11 Error:17.92 Error:17.50 Error:17.38 Error:17.17 Error:17.11 Error:17.49 Error:17.04 Error:17.40 Error:17.23 Error:16.83 Error:16.97 Error:17.02 Error:17.25 Error:17.46 Error:17.18 Error:17.41 Error:17.07 Error:16.78 Error:17.39 Error:16.68 Error:17.23 Error:16.75 Error:16.96
We can make the previous section simpler and faster by concatenating the hidden and input weight matrices into one matrix, and the hidden state and input into one vector. We're not going to step through this cell by cell - you'll see it's identical to the previous section except for this concatenation.
W = (shared(np.concatenate([np.eye(n_hidden), normal(size=(n_input, n_hidden))])
.astype(np.float32)), init_bias(n_hidden))
rW = wgts_and_bias(n_input+n_hidden, n_hidden)
uW = wgts_and_bias(n_input+n_hidden, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W, W_y, uW, rW]))
def gate(m, W, b): return nnet.sigmoid(T.dot(m, W) + b)
def step(x, h, W, b, W_y, b_y, uW, ub, rW, rb):
m = T.concatenate([h, x])
reset = gate(m, rW, rb)
update = gate(m, uW, ub)
m = T.concatenate([h*reset, x])
h_new = gate(m, W, b)
h = update*h + (1-update)*h_new
y = nnet.softmax(T.dot(h, W_y) + b_y)
return h, T.flatten(y, 1)
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
outputs_info=[t_h0, None], non_sequences=w_all)
def upd_dict(wgts, grads, lr):
return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)
upd = upd_dict(w_all, g_all, lr)
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)
err=0.0; l_rate=0.01
for i in range(len(X)):
err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
if i % 1000 == 999:
print ("Error:{:.2f}".format(err/1000))
err=0.0
Error:24.71 Error:22.16 Error:21.99 Error:21.26 Error:20.44 Error:20.97 Error:20.69 Error:20.15 Error:19.91 Error:20.26 Error:19.54 Error:19.64 Error:20.26 Error:19.49 Error:18.95 Error:19.94 Error:19.71 Error:19.56 Error:18.95 Error:18.78 Error:18.46 Error:18.50 Error:19.02 Error:18.45 Error:18.72 Error:18.50 Error:18.27 Error:18.31 Error:18.29 Error:18.46 Error:18.75 Error:18.33 Error:18.58 Error:18.24 Error:17.95 Error:18.53 Error:17.82 Error:18.36 Error:17.87 Error:18.01 Error:17.32 Error:17.70 Error:17.54 Error:17.87 Error:17.79 Error:17.84 Error:17.59 Error:17.78 Error:17.65 Error:17.75 Error:17.09 Error:17.31 Error:16.71 Error:16.77 Error:17.38 Error:17.22 Error:16.70 Error:17.28 Error:17.00 Error:16.85 Error:16.62 Error:17.06 Error:16.88 Error:16.71 Error:16.46 Error:16.49 Error:16.23 Error:16.44 Error:16.98 Error:16.37 Error:16.79 Error:16.32 Error:16.12 Error:16.13 Error:16.11