import autograd
import autograd.misc.optimizers as optim
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
%matplotlib inline
You may find external resources, such as the Wild ML RNN tutorial and the LSTM tutorial referenced in the figures below, helpful for understanding how RNNs and LSTMs work.
# Load the Shakespeare text
with open('data/shakespeare.txt', 'r') as f:
    text = f.read()

print("------------------------------")
# Print a sample of the text
print(text[:100])
data_length = len(text)
vocab = list(set(text))
vocab_size = len(vocab)  # no end-of-string token is used
char_to_index = { char:index for (index, char) in enumerate(vocab) }
index_to_char = { index:char for (index, char) in enumerate(vocab) }

print("The vocabulary contains {}".format(vocab))
print("------------------------------")
print("TOTAL NUM CHARACTERS = {}".format(data_length))
print("NUM UNIQUE CHARACTERS = {}".format(vocab_size))
print('char_to_index {}'.format(char_to_index))
------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
The vocabulary contains ['\n', '!', ' ', '$', "'", '&', '-', ',', '.', '3', ';', ':', '?', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'J', 'M', 'L', 'O', 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z']
------------------------------
TOTAL NUM CHARACTERS = 1115394
NUM UNIQUE CHARACTERS = 65
char_to_index {'\n': 0, '!': 1, ' ': 2, '$': 3, "'": 4, '&': 5, '-': 6, ',': 7, '.': 8, '3': 9, ';': 10, ':': 11, '?': 12, 'A': 13, 'C': 14, 'B': 15, 'E': 16, 'D': 17, 'G': 18, 'F': 19, 'I': 20, 'H': 21, 'K': 22, 'J': 23, 'M': 24, 'L': 25, 'O': 26, 'N': 27, 'Q': 28, 'P': 29, 'S': 30, 'R': 31, 'U': 32, 'T': 33, 'W': 34, 'V': 35, 'Y': 36, 'X': 37, 'Z': 38, 'a': 39, 'c': 40, 'b': 41, 'e': 42, 'd': 43, 'g': 44, 'f': 45, 'i': 46, 'h': 47, 'k': 48, 'j': 49, 'm': 50, 'l': 51, 'o': 52, 'n': 53, 'q': 54, 'p': 55, 's': 56, 'r': 57, 'u': 58, 't': 59, 'w': 60, 'v': 61, 'y': 62, 'x': 63, 'z': 64}
(Image from the Wild ML RNN Tutorial)
The update of an RNN is expressed by the following formulas:
$$ h_t = \tanh(U x_t + W h_{t-1} + b_h) $$

$$ y_t = \text{softmax}(V h_t + b_y) $$

Here, each $x_t$ is a character---in this example, there are 65 unique characters. Since at each step the model takes a character as input and outputs a prediction for the next character in the sequence, both $x_t$ and $y_t$ are 65-dimensional vectors, i.e., $x_t, y_t \in \mathbb{R}^{65}$. We can choose any dimension for the hidden state $h_t$; in this case, we will use $h_t \in \mathbb{R}^{100}$. With this setup, the dimensions of $U$, $W$, and $V$ are $100 \times 65$, $100 \times 100$, and $65 \times 100$, respectively.
For a vector $\mathbf{x}$, we have:
$$ \text{softmax}(\mathbf{x})_i = \frac{e^{\mathbf{x}_i}}{\sum_j e^{\mathbf{x}_j}} $$

# Warning: not numerically stable
def softmax_unstable(x):
    return np.exp(x) / np.sum(np.exp(x))
softmax_unstable([1, 2, 1000])
RuntimeWarning: overflow encountered in exp
RuntimeWarning: invalid value encountered in divide

array([ 0.,  0., nan])
# Numerically stable version
def softmax(x):
    exponential = np.exp(x - np.max(x))
    return exponential / np.sum(exponential)
softmax([1,2,1000])
array([0., 0., 1.])
def log_softmax(x):
    return np.log(softmax(x) + 1e-6)
log_softmax([1,2,1000])
array([-1.38155106e+01, -1.38155106e+01, 9.99999500e-07])
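Adding 1e-6 inside the log avoids -inf, but it also caps the log-probabilities at about log(1e-6) ≈ -13.8 and slightly biases the result. An alternative (a sketch, not used in the code below) is the log-sum-exp trick, which computes the log-softmax directly without ever forming the possibly-underflowing softmax:

# Alternative sketch: log-softmax via the log-sum-exp trick.
def log_softmax_lse(x):
    shifted = x - np.max(x)  # subtract the max for numerical stability
    return shifted - np.log(np.sum(np.exp(shifted)))

log_softmax_lse([1, 2, 1000])  # array([-999., -998., 0.])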
def initialize_params(input_size, hidden_size, output_size):
    params = {
        'U': np.random.randn(hidden_size, input_size) * 0.01,
        'W': np.random.randn(hidden_size, hidden_size) * 0.01,
        'V': np.random.randn(output_size, hidden_size) * 0.01,
        'b_h': np.zeros(hidden_size),
        'b_o': np.zeros(output_size)
    }
    return params
def initialize_hidden(hidden_size):
    return np.zeros(hidden_size)
def model(params, x, h_prev):
    h = np.tanh(np.dot(params['U'], x) + np.dot(params['W'], h_prev) + params['b_h'])
    y = softmax(np.dot(params['V'], h) + params['b_o'])
    return y, h
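As a quick sanity check (illustrative only, not part of the original training code; the underscore-prefixed names are throwaways), we can run one forward step on a random one-hot input and confirm the dimensions stated above:

# Sanity check: one forward step with the dimensions stated above.
_params = initialize_params(input_size=65, hidden_size=100, output_size=65)
_x = np.zeros(65)
_x[0] = 1  # a one-hot character
_y, _h = model(_params, _x, initialize_hidden(100))
print(_y.shape, _h.shape, np.sum(_y))  # (65,) (100,) 1.0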
def criterion(output, target):
    """Negative log-likelihood loss. Useful for training a classification
    problem with n classes. `output` is a probability vector over classes;
    `target` is the index of the correct class.
    """
    output = np.log(output)
    return -output[target]
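For example, a confident correct prediction incurs a small loss while a confident wrong one is heavily penalized (a hypothetical two-class illustration):

# Illustration with made-up probabilities: lower loss when the target
# character receives high probability.
print(criterion(np.array([0.9, 0.1]), target=0))  # -log(0.9) ~ 0.105
print(criterion(np.array([0.9, 0.1]), target=1))  # -log(0.1) ~ 2.303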
def loss(params, input_seq, target_seq, opts):
    """
    Compute the loss of the RNN on data.
    :param params: dict of str: ndarray, including keys U, W, V, b_h, b_o.
    :param input_seq: list of ndarray. One-hot encodings of the input string.
    :param target_seq: list of int. Indices of the target characters.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return loss: float. Total negative log-likelihood over the sequence.
    """
    hidden = initialize_hidden(opts['hidden_size'])
    loss = 0
    for i in range(len(input_seq)):
        # Equivalent to: output, hidden = model(params, input_seq[i], hidden)
        x = input_seq[i]
        hidden = np.tanh(np.dot(params['U'], x) + np.dot(params['W'], hidden) + params['b_h'])
        output = softmax(np.dot(params['V'], hidden) + params['b_o'])
        loss += criterion(output, target_seq[i])
    return loss
loss_grad = grad(loss)
def sgd(grad, init_params, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.
    grad() must have signature grad(x, i), where i is the iteration number."""
    # Left unimplemented here; the training loop in main() applies plain
    # (clipped) gradient-descent updates directly.
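A minimal completion of this helper might look like the sketch below. It assumes `autograd.misc.flatten`, which flattens a dict of parameters into a single vector and returns an `unflatten` inverse; note its grad(x, i) signature differs from `loss_grad` above, so the training loop below does not use it.

from autograd.misc import flatten

def sgd_momentum(grad, init_params, callback=None, num_iters=200,
                 step_size=0.1, mass=0.9):
    """Sketch of SGD with momentum over a dict of parameters."""
    x, unflatten = flatten(init_params)
    velocity = np.zeros(len(x))
    for i in range(num_iters):
        g, _ = flatten(grad(unflatten(x), i))
        if callback:
            callback(unflatten(x), i)
        velocity = mass * velocity - (1.0 - mass) * g  # momentum update
        x = x + step_size * velocity
    return unflatten(x)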
def create_one_hot(j, length):
    vec = np.zeros(length)
    vec[j] = 1
    return vec
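Before training, it can be reassuring to verify that the autograd gradient agrees with a finite-difference estimate. The following optional check (a sketch on a tiny made-up two-character sequence; underscore-prefixed names are throwaways) compares one entry of the gradient of 'b_o':

# Optional sanity check: finite differences vs. autograd on one entry of b_o.
_opts = {'input_size': 65, 'hidden_size': 100, 'output_size': 65}
_params = initialize_params(65, 100, 65)
_inputs = [create_one_hot(3, 65), create_one_hot(7, 65)]
_targets = [7, 12]
_g = loss_grad(_params, _inputs, _targets, _opts)
_eps = 1e-5
_params['b_o'][0] += _eps
_plus = loss(_params, _inputs, _targets, _opts)
_params['b_o'][0] -= 2 * _eps
_minus = loss(_params, _inputs, _targets, _opts)
_params['b_o'][0] += _eps  # restore
print(_g['b_o'][0], (_plus - _minus) / (2 * _eps))  # should roughly agree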
def sample(params, initial, length, opts):
    """
    Sample a string from the recurrent neural network.
    :param params: dict of str: ndarray, including keys U, W, V, b_h, b_o.
    :param initial: str. Beginning character.
    :param length: int. Length of the generated string.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return final_string: str.
    """
    hidden = initialize_hidden(opts['hidden_size'])
    current_char = initial
    final_string = initial
    for i in range(length):
        x = create_one_hot(char_to_index[current_char], opts['input_size'])
        output, hidden = model(params, x, hidden)
        p = output
        current_index = np.random.choice(range(vocab_size), p=p.ravel())
        current_char = index_to_char[current_index]
        final_string += current_char
    return final_string
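Sampling draws each character from the model's full output distribution. A common extension (an assumption, not part of the original code) is a temperature parameter that rescales the log-probabilities before sampling; a sketch:

# Sketch: temperature-controlled sampling. temperature < 1 sharpens the
# distribution (more conservative text); temperature > 1 flattens it
# (more surprising text); temperature = 1 recovers sample() above.
def sample_with_temperature(params, initial, length, opts, temperature=1.0):
    hidden = initialize_hidden(opts['hidden_size'])
    current_char = initial
    final_string = initial
    for i in range(length):
        x = create_one_hot(char_to_index[current_char], opts['input_size'])
        output, hidden = model(params, x, hidden)
        logits = np.log(output + 1e-12) / temperature  # rescale log-probs
        p = softmax(logits)
        current_index = np.random.choice(range(vocab_size), p=p.ravel())
        current_char = index_to_char[current_index]
        final_string += current_char
    return final_string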
def main():
    # Use non-overlapping 50-character chunks for training
    sequence_length = 50
    num_epochs = 1
    print_every = 100
    evaluate_every = 100
    lr = 1e-2

    opts = {
        'input_size': vocab_size,
        'hidden_size': 100,
        'output_size': vocab_size,
    }
    params = initialize_params(opts['input_size'], opts['hidden_size'], opts['output_size'])

    for ep in range(num_epochs):
        for i in range(data_length // sequence_length):
            start = i * sequence_length
            end = start + sequence_length + 1
            chunk = text[start:end]
            input_chars = chunk[:-1]
            target_chars = chunk[1:]
            input_seq = [char_to_index[c] for c in input_chars]
            target_seq = [char_to_index[c] for c in target_chars]
            input_seq_one_hot = [create_one_hot(j, vocab_size) for j in input_seq]

            example_loss = loss(params, input_seq_one_hot, target_seq, opts)
            grad_params = loss_grad(params, input_seq_one_hot, target_seq, opts)
            for param in params:
                gradient = np.clip(grad_params[param], -5, 5)
                params[param] -= lr * gradient

            if i % print_every == 0:
                print("LOSS = {}".format(example_loss))
            if i % evaluate_every == 0:
                sampled_string = sample(params, initial='a', length=100, opts=opts)
                print(sampled_string)

main()
LOSS = 208.728346638
aXcXR3XhOnSvSoNbTBZJ -Mk'iuB&!ECTaJ HbwwpzfTWcD,$G$fXfSaKsOZ,u$bLbUZH!Jund'LIHuxCHxoFf'VK3WQhIg;xnpL?

LOSS = 149.851803894
aIe tOhlb iyUtetepttat,wcQ uhfn,h S yKmtr m?lluhhet'A leepaTepa Abp ierhbypileehToe tneeoe ba ue

LOSS = 146.745247507
ah tcalctonho hggt uHigea A heu gd tarnr rs lurrSicglTy trw aule Qhdn s nnd od nisrRe oenSWwNtc-l :

LOSS = 136.830663088
atylr: :s al '.e n?i dot the :hics her ov fodh weTlot hs mors vi ne ahlruc dhen ualcniheu' foppm, iol

LOSS = 133.15083819
an wiag t Ue dhf bar.et eoer I;X Xorf irud the no. se mo lot iunran R:th sifadou, U e lo itt yre ho

LOSS = 115.370877472
an totithe toufeif ianYtEcir, an sool bnorerFtur of fo'. nrkanssr Dner sader o IooS Iubet, Aor! mdou

LOSS = 124.455217327
as's tucge shitelsue caurs dufe dasde soreeTes dori: InSe: SMdowee the aloee thidR, beeen, Ladn meat,

LOSS = 102.228662832
aor toig, wit yo'd fiod mor, it oune. MD aft bon.k sheice the, y uor thees puroud gye simsthe talum

LOSS = 101.923506217
as mis has, so onathes Toor, he couY'e. Mone: youy go m tag douM: as : EyUS:: BUXUNINUIBNIUSIMSINIU

LOSS = 119.002012949
ar so le t.e dod gen wive MAchi pe tS TON; tos MAEUTio he s gor le gue hek he d s, cg bel, qn mean

LOSS = 119.080138451
ant whe that no her rave fr thes vua brieY afon sour efr 'ed on tharl ret pon thaty nouvt. Bo'k Teow

LOSS = 112.169575163
ar ho sh be dow: le al tos potke co esby pt w ce d mand we sonsd undule ouk yore unos nocing: IG Se's

LOSS = 135.484472539
and hictirc, hld serkes and hemapbngcdavestou ., In y aeting Cwat, gond thithes it,ibe ire pnosrey pi

LOSS = 122.59747727
anenow eouy ata'g an yous pof wou Wed po shounthenshou, ertis not mocher aghecous fasmof fir oo nglr

LOSS = 119.415283706
al' hoinsile wilatere hal of yordthy. Lo'cor thithenthhe, beis, CRERUAUS: Cowler, yor armasplir, yo

LOSS = 98.533296743
ar phithex, Wo bert o neaE yer wet: Sh CIOlINEASNTNNUS: LORIULUS: NCUllHes. BCRRNIUSUS: Sedud shs!

LOSS = 99.1249873814
az' d an; H, hor are, che wow pnou ICOMIUIN$US: Buth! Whard was higgpore tEall:r, corst't tibe. ThR C

LOSS = 79.266387347
a bath hesce tagh not ol of pore pOond them FORICIOINLAN IRINIOLNENENENIUSA MONIUS: You&dinn, Wero t

LOSS = 109.131043182
att at, tich paskey: BithOS: That ; thar.W Tfstegin wrd icat Mtithe nenscoud cedowe hoZe y Tons thou

LOSS = 117.038109986
adr mirellneon so pkathic,', Fe po poplind. SimpICA'g pememo bath,sVendperer ?othik fone eno so pofe

LOSS = 106.828237149
ad shtm nf Rel! buthps y gome, OEEIN: W trone? fonsst, bpRetus. MOIUS: CORIUS: VOMINIUD: Lit?US

LOSS = 130.500017096
are the pe brin wizsiupthes ast in aithev, thake I arkenderbestiss: th trelt CPtrise the hus ho havtu

LOSS = 120.07843241
alm And and nucend dire, wre to forach servimy into As hbeicant rt miky Cfll ands andindinound indave

LOSS = 106.9538011
and thit ef aCesce- as yoanid third hmol, thet, ht ly che'st,; Bescat, Whit therd wist, im k h, dosta

LOSS = 114.138736325
ak! Onatu no to toun nemr! At 'iand thie ngon. The ghat my fos, tom ne cel sanes weme, dould tesoke.

LOSS = 116.272821322
atHr hencen: his erobly atn tebe pcoll, hatly e; sour wope ver iuuthu hes hable of hand as bom yote o

LOSS = 89.4005608309
ator hait ot,'I te hyomegwt cond Ss INIAS: Yomaland maghat, Yor'or lazud is mear Ithe hedenith. Bs y

LOSS = 118.031251679
aresu, Bid thepr heur sese? af bly genelat, Pothearnes, ouke the fet me ben wedse, Semy the heme tene

LOSS = 115.261815517
ante nntane'dibulratr unes and candoTh of sald so housto bontozS: I hores'Te thach whand, To Ro coes!

LOSS = 100.387620659
a reower: dee, angeary bf eroucanin -rumy tif neantre. Wome wie cous ms a dbuthe iogarus murd andr? d

LOSS = 103.293955778
ale Pir weld moms: Shthy poiby thame Thond Tlray, Whine wit wot, fle themENIUS: Bun, Nis ICENENUSI AN

LOSS = 115.253690141
a brasurediWites and to sreabions cote; Cou hascidy, he dails; anl meses, Tf llay owe; Helk-y wou lou

LOSS = 136.841178644
argomy, merourey- A3r REThHs hew brachimy lord hargneepore torsuntald reatout oruerenith; To Fashond
KeyboardInterrupt: training was stopped manually during the loss_grad backward pass (traceback omitted).
(Image from the LSTM Tutorial)
The update of an LSTM is given by the following equations:
$$ i_t = \sigma(U_i x_t + W_i h_{t-1} + b_i) $$

$$ f_t = \sigma(U_f x_t + W_f h_{t-1} + b_f) $$

$$ o_t = \sigma(U_o x_t + W_o h_{t-1} + b_o) $$

$$ \tilde{C}_t = \tanh(U_C x_t + W_C h_{t-1} + b_C) $$

$$ C_t = i_t * \tilde{C}_t + f_t * C_{t-1} $$

$$ h_t = o_t * \tanh(C_t) $$

def initialize_params(input_size, hidden_size, output_size):
    params = {
        'U_i': np.random.randn(hidden_size, input_size) * 0.01,
        'W_i': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_i': np.zeros(hidden_size),
        'U_f': np.random.randn(hidden_size, input_size) * 0.01,
        'W_f': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_f': np.zeros(hidden_size),
        'U_o': np.random.randn(hidden_size, input_size) * 0.01,
        'W_o': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_o': np.zeros(hidden_size),
        'U_c': np.random.randn(hidden_size, input_size) * 0.01,
        'W_c': np.random.randn(hidden_size, hidden_size) * 0.01,
        'b_c': np.zeros(hidden_size),
        'V': np.random.randn(output_size, hidden_size) * 0.01,
        'b': np.zeros(output_size)
    }
    return params
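One common refinement (an assumption, not part of the original notebook) is to initialize the forget-gate bias to a positive value so the cell state is retained early in training, as recommended by Jozefowicz et al. (2015). A sketch:

# Optional tweak (not in the original notebook): start with the forget
# gate mostly open so early gradients flow through the cell state.
def initialize_params_forget_bias(input_size, hidden_size, output_size):
    params = initialize_params(input_size, hidden_size, output_size)
    params['b_f'] = np.ones(hidden_size)
    return params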
def sigmoid(x):
    return 1. / (1 + np.exp(-x))
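This form overflows in np.exp for large negative x; the overflow is harmless (it still yields the correct limit of 0) but produces RuntimeWarnings. An equivalent, overflow-free sketch uses the identity $\sigma(x) = \tfrac{1}{2}(1 + \tanh(x/2))$ and the already-stable np.tanh:

# Alternative sketch: sigmoid via tanh, avoiding overflow in np.exp.
def sigmoid_stable(x):
    return 0.5 * (1 + np.tanh(0.5 * x))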
def model(params, x, h_prev, C_prev):
    i_t = sigmoid(np.dot(params['U_i'], x) + np.dot(params['W_i'], h_prev) + params['b_i'])
    f_t = sigmoid(np.dot(params['U_f'], x) + np.dot(params['W_f'], h_prev) + params['b_f'])
    o_t = sigmoid(np.dot(params['U_o'], x) + np.dot(params['W_o'], h_prev) + params['b_o'])
    C_t_tilde = np.tanh(np.dot(params['U_c'], x) + np.dot(params['W_c'], h_prev) + params['b_c'])
    C_t = i_t * C_t_tilde + f_t * C_prev
    h_t = o_t * np.tanh(C_t)
    y = softmax(np.dot(params['V'], h_t) + params['b'])
    return y, h_t, C_t
def initialize_hidden(hidden_size):
    return np.zeros(hidden_size), np.zeros(hidden_size)
def loss(params, input_seq, target_seq, opts):
    """
    Compute the loss of the LSTM on data.
    :param params: dict of str: ndarray, with gate parameters U_*, W_*, b_* plus V and b.
    :param input_seq: list of ndarray. One-hot encodings of the input string.
    :param target_seq: list of int. Indices of the target characters.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return loss: float. Total negative log-likelihood over the sequence.
    """
    hidden, cell = initialize_hidden(opts['hidden_size'])
    loss = 0
    for i in range(len(input_seq)):
        x = input_seq[i]
        i_t = sigmoid(np.dot(params['U_i'], x) + np.dot(params['W_i'], hidden) + params['b_i'])
        f_t = sigmoid(np.dot(params['U_f'], x) + np.dot(params['W_f'], hidden) + params['b_f'])
        o_t = sigmoid(np.dot(params['U_o'], x) + np.dot(params['W_o'], hidden) + params['b_o'])
        C_t_tilde = np.tanh(np.dot(params['U_c'], x) + np.dot(params['W_c'], hidden) + params['b_c'])
        cell = i_t * C_t_tilde + f_t * cell
        hidden = o_t * np.tanh(cell)
        output = softmax(np.dot(params['V'], hidden) + params['b'])
        loss += criterion(output, target_seq[i])
    return loss
loss_grad = grad(loss)
def sample(params, initial, length, opts):
    """
    Sample a string from the LSTM.
    :param params: dict of str: ndarray, with gate parameters U_*, W_*, b_* plus V and b.
    :param initial: str. Beginning character.
    :param length: int. Length of the generated string.
    :param opts: dict of str: int, including keys input_size, hidden_size, output_size.
    :return final_string: str.
    """
    hidden, cell = initialize_hidden(opts['hidden_size'])
    current_char = initial
    final_string = initial
    for i in range(length):
        x = create_one_hot(char_to_index[current_char], opts['input_size'])
        output, hidden, cell = model(params, x, hidden, cell)
        p = output
        current_index = np.random.choice(range(vocab_size), p=p.ravel())
        current_char = index_to_char[current_index]
        final_string += current_char
    return final_string
main()
LOSS = 208.721701188
arH&L!sPaK.V:!OOaeiDdmpdDqd'zL,vS!CQl!QGhPh?K?$SqyyKjekN hZGGg vPYH;nAI&AD,kB eCek?hh'$GRnmAyX?dZ-p;!

LOSS = 157.162470344
aVrle o eairllec?e a ia;tieo?hEeTsfsple,takt tss nu,r$B Re3wwhtiju ea$ ;pecl bPCcy,so o eK Kuu Ym'&

LOSS = 155.724613326
af'orwf ta tdrh hsadomd ohdeU i eu a htvn?eaesao,tloXtle omtehig r Uew3 t ti not pcw ,Sgebnr i

LOSS = 154.868857766
a oWesrnrnymT eerss s . pnU-ys' oahnedroU. r sn zalaS homr s gAosh de i,hetaswnUhm .eom thgl ea? sa

LOSS = 171.949752224
aslei fA, '.m o ahrkwnYIe,:eNyntethnecr,h.ent l d,odssIIJtlWnhen w f lomth ni rhsulO ,Y Mg ors s

LOSS = 156.60044435
aoia attI sFiSvtp.frcoh s-noi, Yi nrwaRT n adns tg sse nolt l sTrhnhooot ,tlSswp'nte tUfhno onnna

LOSS = 150.067029577
a mte hdeeee Arot H eer,pem hllbiRtroWhwn h asUesnb;L sh wthny hQthSprygrmeha: ehepthaco IwOoesenwro

LOSS = 144.050312255
aNiIZe la Rel go oc ooue lttt suAoe ht afnLeh fRt hoc't kr yteiietcai e daariiSh oiesu 'nsc atamdb
KeyboardInterrupt: training was stopped manually during the loss_grad backward pass (traceback omitted).