import autograd
import autograd.misc.optimizers as optim
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
%matplotlib inline
You may find external resources, such as the Wild ML RNN tutorial and the LSTM tutorial referenced in the figures below, helpful for understanding how RNNs and LSTMs work.
# Load the Shakespeare text
with open('data/shakespeare.txt', 'r') as f:
    text = f.read()

print("------------------------------")
# Print a sample of the text
print(text[:100])
data_length = len(text)
vocab = list(set(text))
vocab_size = len(vocab)  # no end-of-string token is used
char_to_index = { char:index for (index, char) in enumerate(vocab) }
index_to_char = { index:char for (index, char) in enumerate(vocab) }

print("The vocabulary contains {}".format(vocab))
print("------------------------------")
print("TOTAL NUM CHARACTERS = {}".format(data_length))
print("NUM UNIQUE CHARACTERS = {}".format(vocab_size))
print('char_to_index {}'.format(char_to_index))
------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
The vocabulary contains ['\n', '!', ' ', '$', "'", '&', '-', ',', '.', '3', ';', ':', '?', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'J', 'M', 'L', 'O', 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z']
------------------------------
TOTAL NUM CHARACTERS = 1115394
NUM UNIQUE CHARACTERS = 65
char_to_index {'\n': 0, '!': 1, ' ': 2, '$': 3, "'": 4, '&': 5, '-': 6, ',': 7, '.': 8, '3': 9, ';': 10, ':': 11, '?': 12, 'A': 13, 'C': 14, 'B': 15, 'E': 16, 'D': 17, 'G': 18, 'F': 19, 'I': 20, 'H': 21, 'K': 22, 'J': 23, 'M': 24, 'L': 25, 'O': 26, 'N': 27, 'Q': 28, 'P': 29, 'S': 30, 'R': 31, 'U': 32, 'T': 33, 'W': 34, 'V': 35, 'Y': 36, 'X': 37, 'Z': 38, 'a': 39, 'c': 40, 'b': 41, 'e': 42, 'd': 43, 'g': 44, 'f': 45, 'i': 46, 'h': 47, 'k': 48, 'j': 49, 'm': 50, 'l': 51, 'o': 52, 'n': 53, 'q': 54, 'p': 55, 's': 56, 'r': 57, 'u': 58, 't': 59, 'w': 60, 'v': 61, 'y': 62, 'x': 63, 'z': 64}
(Image from the Wild ML RNN Tutorial)
The update of an RNN is expressed by the following formulas:
$$ h_t = \tanh(U x_t + W h_{t-1} + b_h) $$

$$ y_t = \text{softmax}(V h_t + b_y) $$

Here, each $x_t$ is a character---in this example, there are 65 unique characters. Since at each step the model takes a character as input and outputs a prediction for the next character in the sequence, both $x_t$ and $y_t$ are 65-dimensional vectors, i.e., $x_t, y_t \in \mathbb{R}^{65}$. We can choose any dimension for the hidden state $h_t$; in this case, we will use $h_t \in \mathbb{R}^{100}$. With this setup, the dimensions of $U$, $W$, and $V$ are $100 \times 65$, $100 \times 100$, and $65 \times 100$, respectively.
For a vector $\mathbf{x}$, we have:
$$ \text{softmax}(\mathbf{x})_i = \frac{e^{\mathbf{x}_i}}{\sum_j e^{\mathbf{x}_j}} $$

# Warning: not numerically stable
def softmax_unstable(x):
    return np.exp(x) / np.sum(np.exp(x))
softmax_unstable([1, 2, 1000])
RuntimeWarning: overflow encountered in exp
RuntimeWarning: invalid value encountered in divide

array([ 0.,  0., nan])
# Numerically stable version
def softmax(x):
    exponential = np.exp(x - np.max(x))
    return exponential / np.sum(exponential)
softmax([1,2,1000])
array([0., 0., 1.])
def log_softmax(x):
    return np.log(softmax(x) + 1e-6)
log_softmax([1,2,1000])
array([-1.38155106e+01, -1.38155106e+01, 9.99999500e-07])
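Adding 1e-6 inside the log avoids -inf, but it also caps the log-probabilities at about log(1e-6) ≈ -13.8 and slightly biases the result. An alternative (a sketch, not used in the code below) is the log-sum-exp trick, which computes the log-softmax directly without ever forming the possibly-underflowing softmax:

# Alternative sketch: log-softmax via the log-sum-exp trick.
def log_softmax_lse(x):
    shifted = x - np.max(x)  # subtract the max for numerical stability
    return shifted - np.log(np.sum(np.exp(shifted)))

log_softmax_lse([1, 2, 1000])  # array([-999., -998., 0.])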
def initialize_params(input_size, hidden_size, output_size):
    params = {
        'U': np.random.randn(hidden_size, input_size) * 0.01,
        'W': np.random.randn(hidden_size, hidden_size) * 0.01,
        'V': np.random.randn(output_size, hidden_size) * 0.01,
        'b_h': np.zeros(hidden_size),
        'b_o': np.zeros(output_size)
    }
    return params
def initialize_hidden(hidden_size):
    return np.zeros(hidden_size)
def model(params, x, h_prev):
    h = np.tanh(np.dot(params['U'], x) + np.dot(params['W'], h_prev) + params['b_h'])
    y = softmax(np.dot(params['V'], h) + params['b_o'])
    return y, h
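As a quick sanity check (illustrative only, not part of the original training code; the underscore-prefixed names are throwaways), we can run one forward step on a random one-hot input and confirm the dimensions stated above:

# Sanity check: one forward step with the dimensions stated above.
_params = initialize_params(input_size=65, hidden_size=100, output_size=65)
_x = np.zeros(65)
_x[0] = 1  # a one-hot character
_y, _h = model(_params, _x, initialize_hidden(100))
print(_y.shape, _h.shape, np.sum(_y))  # (65,) (100,) 1.0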
def criterion(output, target):
    """Negative log-likelihood loss. Useful for training a classification
    problem with n classes. `output` is a probability vector over classes;
    `target` is the index of the correct class.
    """
    output = np.log(output)
    return -output[target]
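For example, a confident correct prediction incurs a small loss while a confident wrong one is heavily penalized (a hypothetical two-class illustration):

# Illustration with made-up probabilities: lower loss when the target
# character receives high probability.
print(criterion(np.array([0.9, 0.1]), target=0))  # -log(0.9) ~ 0.105
print(criterion(np.array([0.9, 0.1]), target=1))  # -log(0.1) ~ 2.303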
def loss(params, input_seq, target_seq, opts):
    """
    Compute the loss of the RNN on data.
    :param params: dict of str: ndarray, including keys U, W, V, b_h, b_o.
    :param input_seq: list of ndarray. One-hot encodings of the input string.
    :param target_seq: list of int. Indices of the target characters.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return loss: float. Total negative log-likelihood over the sequence.
    """
    hidden = initialize_hidden(opts['hidden_size'])
    loss = 0
    for i in range(len(input_seq)):
        # Equivalent to: output, hidden = model(params, input_seq[i], hidden)
        x = input_seq[i]
        hidden = np.tanh(np.dot(params['U'], x) + np.dot(params['W'], hidden) + params['b_h'])
        output = softmax(np.dot(params['V'], hidden) + params['b_o'])
        loss += criterion(output, target_seq[i])
    return loss
loss_grad = grad(loss)
def sgd(grad, init_params, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.
    grad() must have signature grad(x, i), where i is the iteration number."""
    # Left unimplemented here; the training loop in main() applies plain
    # (clipped) gradient-descent updates directly.
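A minimal completion of this helper might look like the sketch below. It assumes `autograd.misc.flatten`, which flattens a dict of parameters into a single vector and returns an `unflatten` inverse; note its grad(x, i) signature differs from `loss_grad` above, so the training loop below does not use it.

from autograd.misc import flatten

def sgd_momentum(grad, init_params, callback=None, num_iters=200,
                 step_size=0.1, mass=0.9):
    """Sketch of SGD with momentum over a dict of parameters."""
    x, unflatten = flatten(init_params)
    velocity = np.zeros(len(x))
    for i in range(num_iters):
        g, _ = flatten(grad(unflatten(x), i))
        if callback:
            callback(unflatten(x), i)
        velocity = mass * velocity - (1.0 - mass) * g  # momentum update
        x = x + step_size * velocity
    return unflatten(x)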
def create_one_hot(j, length):
    vec = np.zeros(length)
    vec[j] = 1
    return vec
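Before training, it can be reassuring to verify that the autograd gradient agrees with a finite-difference estimate. The following optional check (a sketch on a tiny made-up two-character sequence; underscore-prefixed names are throwaways) compares one entry of the gradient of 'b_o':

# Optional sanity check: finite differences vs. autograd on one entry of b_o.
_opts = {'input_size': 65, 'hidden_size': 100, 'output_size': 65}
_params = initialize_params(65, 100, 65)
_inputs = [create_one_hot(3, 65), create_one_hot(7, 65)]
_targets = [7, 12]
_g = loss_grad(_params, _inputs, _targets, _opts)
_eps = 1e-5
_params['b_o'][0] += _eps
_plus = loss(_params, _inputs, _targets, _opts)
_params['b_o'][0] -= 2 * _eps
_minus = loss(_params, _inputs, _targets, _opts)
_params['b_o'][0] += _eps  # restore
print(_g['b_o'][0], (_plus - _minus) / (2 * _eps))  # should roughly agree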
def sample(params, initial, length, opts):
    """
    Sample a string from the recurrent neural network.
    :param params: dict of str: ndarray, including keys U, W, V, b_h, b_o.
    :param initial: str. Beginning character.
    :param length: int. Length of the generated string.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return final_string: str.
    """
    hidden = initialize_hidden(opts['hidden_size'])
    current_char = initial
    final_string = initial
    for i in range(length):
        x = create_one_hot(char_to_index[current_char], opts['input_size'])
        output, hidden = model(params, x, hidden)
        p = output
        current_index = np.random.choice(range(vocab_size), p=p.ravel())
        current_char = index_to_char[current_index]
        final_string += current_char
    return final_string
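Sampling draws each character from the model's full output distribution. A common extension (an assumption, not part of the original code) is a temperature parameter that rescales the log-probabilities before sampling; a sketch:

# Sketch: temperature-controlled sampling. temperature < 1 sharpens the
# distribution (more conservative text); temperature > 1 flattens it
# (more surprising text); temperature = 1 recovers sample() above.
def sample_with_temperature(params, initial, length, opts, temperature=1.0):
    hidden = initialize_hidden(opts['hidden_size'])
    current_char = initial
    final_string = initial
    for i in range(length):
        x = create_one_hot(char_to_index[current_char], opts['input_size'])
        output, hidden = model(params, x, hidden)
        logits = np.log(output + 1e-12) / temperature  # rescale log-probs
        p = softmax(logits)
        current_index = np.random.choice(range(vocab_size), p=p.ravel())
        current_char = index_to_char[current_index]
        final_string += current_char
    return final_string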
def main():
    # Use non-overlapping 50-character chunks for training
    sequence_length = 50
    num_epochs = 1
    print_every = 100
    evaluate_every = 100
    lr = 1e-2

    opts = {
        'input_size': vocab_size,
        'hidden_size': 100,
        'output_size': vocab_size,
    }
    params = initialize_params(opts['input_size'], opts['hidden_size'], opts['output_size'])

    for ep in range(num_epochs):
        for i in range(data_length // sequence_length):
            start = i * sequence_length
            end = start + sequence_length + 1
            chunk = text[start:end]
            input_chars = chunk[:-1]
            target_chars = chunk[1:]
            input_seq = [char_to_index[c] for c in input_chars]
            target_seq = [char_to_index[c] for c in target_chars]
            input_seq_one_hot = [create_one_hot(j, vocab_size) for j in input_seq]

            example_loss = loss(params, input_seq_one_hot, target_seq, opts)
            grad_params = loss_grad(params, input_seq_one_hot, target_seq, opts)
            for param in params:
                gradient = np.clip(grad_params[param], -5, 5)
                params[param] -= lr * gradient

            if i % print_every == 0:
                print("LOSS = {}".format(example_loss))
            if i % evaluate_every == 0:
                sampled_string = sample(params, initial='a', length=100, opts=opts)
                print(sampled_string)

main()
LOSS = 208.728346638
aXcXR3XhOnSvSoNbTBZJ -Mk'iuB&!ECTaJ HbwwpzfTWcD,$G$fXfSaKsOZ,u$bLbUZH!Jund'LIHuxCHxoFf'VK3WQhIg;xnpL?

LOSS = 149.851803894
aIe tOhlb iyUtetepttat,wcQ uhfn,h S yKmtr m?lluhhet'A leepaTepa Abp ierhbypileehToe tneeoe ba ue

LOSS = 146.745247507
ah tcalctonho hggt uHigea A heu gd tarnr rs lurrSicglTy trw aule Qhdn s nnd od nisrRe oenSWwNtc-l :

LOSS = 136.830663088
atylr: :s al '.e n?i dot the :hics her ov fodh weTlot hs mors vi ne ahlruc dhen ualcniheu' foppm, iol

LOSS = 133.15083819
an wiag t Ue dhf bar.et eoer I;X Xorf irud the no. se mo lot iunran R:th sifadou, U e lo itt yre ho

LOSS = 115.370877472
an totithe toufeif ianYtEcir, an sool bnorerFtur of fo'. nrkanssr Dner sader o IooS Iubet, Aor! mdou

LOSS = 124.455217327
as's tucge shitelsue caurs dufe dasde soreeTes dori: InSe: SMdowee the aloee thidR, beeen, Ladn meat,

LOSS = 102.228662832
aor toig, wit yo'd fiod mor, it oune. MD aft bon.k sheice the, y uor thees puroud gye simsthe talum

LOSS = 101.923506217
as mis has, so onathes Toor, he couY'e. Mone: youy go m tag douM: as : EyUS:: BUXUNINUIBNIUSIMSINIU

LOSS = 119.002012949
ar so le t.e dod gen wive MAchi pe tS TON; tos MAEUTio he s gor le gue hek he d s, cg bel, qn mean

LOSS = 119.080138451
ant whe that no her rave fr thes vua brieY afon sour efr 'ed on tharl ret pon thaty nouvt. Bo'k Teow

LOSS = 112.169575163
ar ho sh be dow: le al tos potke co esby pt w ce d mand we sonsd undule ouk yore unos nocing: IG Se's

LOSS = 135.484472539
and hictirc, hld serkes and hemapbngcdavestou ., In y aeting Cwat, gond thithes it,ibe ire pnosrey pi

LOSS = 122.59747727
anenow eouy ata'g an yous pof wou Wed po shounthenshou, ertis not mocher aghecous fasmof fir oo nglr

LOSS = 119.415283706
al' hoinsile wilatere hal of yordthy. Lo'cor thithenthhe, beis, CRERUAUS: Cowler, yor armasplir, yo

LOSS = 98.533296743
ar phithex, Wo bert o neaE yer wet: Sh CIOlINEASNTNNUS: LORIULUS: NCUllHes. BCRRNIUSUS: Sedud shs!

LOSS = 99.1249873814
az' d an; H, hor are, che wow pnou ICOMIUIN$US: Buth! Whard was higgpore tEall:r, corst't tibe. ThR C

LOSS = 79.266387347
a bath hesce tagh not ol of pore pOond them FORICIOINLAN IRINIOLNENENENIUSA MONIUS: You&dinn, Wero t

LOSS = 109.131043182
att at, tich paskey: BithOS: That ; thar.W Tfstegin wrd icat Mtithe nenscoud cedowe hoZe y Tons thou

LOSS = 117.038109986
adr mirellneon so pkathic,', Fe po poplind. SimpICA'g pememo bath,sVendperer ?othik fone eno so pofe

LOSS = 106.828237149
ad shtm nf Rel! buthps y gome, OEEIN: W trone? fonsst, bpRetus. MOIUS: CORIUS: VOMINIUD: Lit?US

LOSS = 130.500017096
are the pe brin wizsiupthes ast in aithev, thake I arkenderbestiss: th trelt CPtrise the hus ho havtu

LOSS = 120.07843241
alm And and nucend dire, wre to forach servimy into As hbeicant rt miky Cfll ands andindinound indave

LOSS = 106.9538011
and thit ef aCesce- as yoanid third hmol, thet, ht ly che'st,; Bescat, Whit therd wist, im k h, dosta

LOSS = 114.138736325
ak! Onatu no to toun nemr! At 'iand thie ngon. The ghat my fos, tom ne cel sanes weme, dould tesoke.

LOSS = 116.272821322
atHr hencen: his erobly atn tebe pcoll, hatly e; sour wope ver iuuthu hes hable of hand as bom yote o

LOSS = 89.4005608309
ator hait ot,'I te hyomegwt cond Ss INIAS: Yomaland maghat, Yor'or lazud is mear Ithe hedenith. Bs y

LOSS = 118.031251679
aresu, Bid thepr heur sese? af bly genelat, Pothearnes, ouke the fet me ben wedse, Semy the heme tene

LOSS = 115.261815517
ante nntane'dibulratr unes and candoTh of sald so housto bontozS: I hores'Te thach whand, To Ro coes!

LOSS = 100.387620659
a reower: dee, angeary bf eroucanin -rumy tif neantre. Wome wie cous ms a dbuthe iogarus murd andr? d

LOSS = 103.293955778
ale Pir weld moms: Shthy poiby thame Thond Tlray, Whine wit wot, fle themENIUS: Bun, Nis ICENENUSI AN

LOSS = 115.253690141
a brasurediWites and to sreabions cote; Cou hascidy, he dails; anl meses, Tf llay owe; Helk-y wou lou

LOSS = 136.841178644
argomy, merourey- A3r REThHs hew brachimy lord hargneepore torsuntald reatout oruerenith; To Fashond
KeyboardInterrupt: training was stopped manually during the loss_grad backward pass (traceback omitted).
(Image from the LSTM Tutorial)
The update of an LSTM is given by the following equations:
$$ i_t = \sigma(U_i x_t + W_i h_{t-1} + b_i) $$

$$ f_t = \sigma(U_f x_t + W_f h_{t-1} + b_f) $$

$$ o_t = \sigma(U_o x_t + W_o h_{t-1} + b_o) $$

$$ \tilde{C}_t = \tanh(U_C x_t + W_C h_{t-1} + b_C) $$

$$ C_t = i_t * \tilde{C}_t + f_t * C_{t-1} $$

$$ h_t = o_t * \tanh(C_t) $$

def initialize_params(input_size, hidden_size, output_size):
    params = {
        'U_i': np.random.randn(hidden_size, input_size) * 0.01,
        'W_i': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_i': np.zeros(hidden_size),
        'U_f': np.random.randn(hidden_size, input_size) * 0.01,
        'W_f': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_f': np.zeros(hidden_size),
        'U_o': np.random.randn(hidden_size, input_size) * 0.01,
        'W_o': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_o': np.zeros(hidden_size),
        'U_c': np.random.randn(hidden_size, input_size) * 0.01,
        'W_c': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_c': np.zeros(hidden_size),
        'V': np.random.randn(output_size, hidden_size) * 0.01,
        'b': np.zeros(output_size)
    }
    return params
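One common refinement (an assumption, not part of the original notebook) is to initialize the forget-gate bias to a positive value so the cell state is retained early in training, as recommended by Jozefowicz et al. (2015). A sketch:

# Optional tweak (not in the original notebook): start with the forget
# gate mostly open so early gradients flow through the cell state.
def initialize_params_forget_bias(input_size, hidden_size, output_size):
    params = initialize_params(input_size, hidden_size, output_size)
    params['b_f'] = np.ones(hidden_size)
    return params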
def sigmoid(x):
    return 1. / (1 + np.exp(-x))
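This form overflows in np.exp for large negative x; the overflow is harmless (it still yields the correct limit of 0) but produces RuntimeWarnings. An equivalent, overflow-free sketch uses the identity $\sigma(x) = \tfrac{1}{2}(1 + \tanh(x/2))$ and the already-stable np.tanh:

# Alternative sketch: sigmoid via tanh, avoiding overflow in np.exp.
def sigmoid_stable(x):
    return 0.5 * (1 + np.tanh(0.5 * x))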
def model(params, x, h_prev, C_prev):
    i_t = sigmoid(np.dot(params['U_i'], x) + np.dot(params['W_i'], h_prev) + params['b_i'])
    f_t = sigmoid(np.dot(params['U_f'], x) + np.dot(params['W_f'], h_prev) + params['b_f'])
    o_t = sigmoid(np.dot(params['U_o'], x) + np.dot(params['W_o'], h_prev) + params['b_o'])
    C_t_tilde = np.tanh(np.dot(params['U_c'], x) + np.dot(params['W_c'], h_prev) + params['b_c'])
    C_t = i_t * C_t_tilde + f_t * C_prev
    h_t = o_t * np.tanh(C_t)
    y = softmax(np.dot(params['V'], h_t) + params['b'])
    return y, h_t, C_t
def initialize_hidden(hidden_size):
    return np.zeros(hidden_size), np.zeros(hidden_size)
def loss(params, input_seq, target_seq, opts):
    """
    Compute the loss of the LSTM on data.
    :param params: dict of str: ndarray, with gate parameters U_*, W_*, b_* plus V and b.
    :param input_seq: list of ndarray. One-hot encodings of the input string.
    :param target_seq: list of int. Indices of the target characters.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return loss: float. Total negative log-likelihood over the sequence.
    """
    hidden, cell = initialize_hidden(opts['hidden_size'])
    loss = 0
    for i in range(len(input_seq)):
        x = input_seq[i]
        i_t = sigmoid(np.dot(params['U_i'], x) + np.dot(params['W_i'], hidden) + params['b_i'])
        f_t = sigmoid(np.dot(params['U_f'], x) + np.dot(params['W_f'], hidden) + params['b_f'])
        o_t = sigmoid(np.dot(params['U_o'], x) + np.dot(params['W_o'], hidden) + params['b_o'])
        C_t_tilde = np.tanh(np.dot(params['U_c'], x) + np.dot(params['W_c'], hidden) + params['b_c'])
        cell = i_t * C_t_tilde + f_t * cell
        hidden = o_t * np.tanh(cell)
        output = softmax(np.dot(params['V'], hidden) + params['b'])
        loss += criterion(output, target_seq[i])
    return loss
loss_grad = grad(loss)
def sample(params, initial, length, opts):
    """
    Sample a string from the LSTM.
    :param params: dict of str: ndarray, with gate parameters U_*, W_*, b_* plus V and b.
    :param initial: str. Beginning character.
    :param length: int. Length of the generated string.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return final_string: str.
    """
    hidden, cell = initialize_hidden(opts['hidden_size'])
    current_char = initial
    final_string = initial
    for i in range(length):
        x = create_one_hot(char_to_index[current_char], opts['input_size'])
        output, hidden, cell = model(params, x, hidden, cell)
        p = output
        current_index = np.random.choice(range(vocab_size), p=p.ravel())
        current_char = index_to_char[current_index]
        final_string += current_char
    return final_string
main()
LOSS = 208.721701188
arH&L!sPaK.V:!OOaeiDdmpdDqd'zL,vS!CQl!QGhPh?K?$SqyyKjekN hZGGg vPYH;nAI&AD,kB eCek?hh'$GRnmAyX?dZ-p;!

LOSS = 157.162470344
aVrle o eairllec?e a ia;tieo?hEeTsfsple,takt tss nu,r$B Re3wwhtiju ea$ ;pecl bPCcy,so o eK Kuu Ym'&

LOSS = 155.724613326
af'orwf ta tdrh hsadomd ohdeU i eu a htvn?eaesao,tloXtle omtehig r Uew3 t ti not pcw ,Sgebnr i

LOSS = 154.868857766
a oWesrnrnymT eerss s . pnU-ys' oahnedroU. r sn zalaS homr s gAosh de i,hetaswnUhm .eom thgl ea? sa

LOSS = 171.949752224
aslei fA, '.m o ahrkwnYIe,:eNyntethnecr,h.ent l d,odssIIJtlWnhen w f lomth ni rhsulO ,Y Mg ors s

LOSS = 156.60044435
aoia attI sFiSvtp.frcoh s-noi, Yi nrwaRT n adns tg sse nolt l sTrhnhooot ,tlSswp'nte tUfhno onnna

LOSS = 150.067029577
a mte hdeeee Arot H eer,pem hllbiRtroWhwn h asUesnb;L sh wthny hQthSprygrmeha: ehepthaco IwOoesenwro

LOSS = 144.050312255
aNiIZe la Rel go oc ooue lttt suAoe ht afnLeh fRt hoc't kr yteiietcai e daariiSh oiesu 'nsc atamdb
KeyboardInterrupt: training was stopped manually during the loss_grad backward pass (traceback omitted).