import numpy as np
import matplotlib.pyplot as plt
def initialize_parameters(hidden_size, vocab_size):
    '''
    Initialize the network weights and the Adagrad memory variables
    Arguments:
    hidden_size -- number of units in the hidden state
    vocab_size -- number of unique characters in the data
    Returns:
    parameters -- a tuple of network parameters
    adagrad_mem_vars -- a tuple of mem variables required for adagrad update
    '''
    Wxh = np.random.randn(hidden_size, vocab_size) * 0.01   # input-to-hidden weights
    Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden-to-hidden weights
    Why = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden-to-output weights
    bh = np.zeros([hidden_size, 1])  # hidden bias
    by = np.zeros([vocab_size, 1])   # output bias
    mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
    parameters = (Wxh, Whh, Why, bh, by)
    adagrad_mem_vars = (mWxh, mWhh, mWhy, mbh, mby)
    return (parameters, adagrad_mem_vars)
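# A quick, illustrative shape check (hidden_size=4 and vocab_size=3 here are
# hypothetical values, chosen only to keep the demonstration small):
_params, _mem = initialize_parameters(4, 3)
_Wxh, _Whh, _Why, _bh, _by = _params
assert _Wxh.shape == (4, 3) and _Whh.shape == (4, 4) and _Why.shape == (3, 4)
assert _bh.shape == (4, 1) and _by.shape == (3, 1)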
def softmax(X):
    t = np.exp(X - np.max(X, axis=0, keepdims=True))  # shift logits for numerical stability
    return t / np.sum(t, axis=0)
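# softmax normalises along axis=0, so each column of the output is a
# probability distribution; a small illustrative check:
_probs = softmax(np.array([[1.0, 2.0], [3.0, 0.5]]))
assert np.allclose(np.sum(_probs, axis=0), 1.0)  # every column sums to 1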
def forward_propagation(X, parameters, seq_length, hprev):
    '''
    Implement forward propagation through the network
    Arguments:
    X -- input to the network, one-hot rows of shape (seq_length, vocab_size)
    parameters -- a tuple containing weights and biases of the network
    seq_length -- length of the input sequence
    hprev -- previous hidden state, shape (hidden_size, 1)
    Returns:
    caches -- dictionary of activations and hidden states for each step of forward prop
    '''
    caches = {}
    caches['h0'] = np.copy(hprev)
    Wxh, Whh, Why, bh, by = parameters
    for i in range(seq_length):
        x = X[i].reshape(-1, 1)  # one-hot column vector for the i-th character
        ht = np.tanh(np.dot(Whh, caches['h' + str(i)]) + np.dot(Wxh, x) + bh)
        Z = np.dot(Why, ht) + by
        A = softmax(Z)
        caches['A' + str(i+1)] = A
        caches['h' + str(i+1)] = ht
    return caches
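# Illustrative call on a tiny random model (the sizes 4 and 3 are hypothetical;
# the real training below uses hidden_size=100 and the data's vocab_size):
_params, _ = initialize_parameters(4, 3)
_X = np.eye(3)[[0, 2, 1]]  # a 3-step sequence of one-hot rows
_caches = forward_propagation(_X, _params, 3, np.zeros((4, 1)))
assert _caches['A3'].shape == (3, 1)  # a distribution over the 3 "chars"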
def compute_cost(Y, caches):
    """
    Implement the cross-entropy cost function for the network
    Arguments:
    Y -- true "label" one-hot vectors, shape (seq_length, vocab_size)
    caches -- dictionary of activations and hidden states for each step of forward prop
    Returns:
    cost -- cross-entropy cost, summed over the sequence
    """
    seq_length = len(caches) // 2  # caches holds h0 plus one (A, h) pair per step
    cost = 0
    for i in range(seq_length):
        y = Y[i].reshape(-1, 1)
        cost += -np.sum(y * np.log(caches['A' + str(i+1)]))
    return np.squeeze(cost)
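# Sanity check on the scale of the cost: an untrained model is roughly uniform
# over the vocabulary, so the expected cost is -log(1/vocab_size) per char.
# For this data (vocab_size=42, seq_length=25, both set further down):
print(-np.log(1.0 / 42) * 25)  # ~93.4, matching the cost at iteration 0 below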
def backward_propagation(X, Y, caches, parameters):
    '''
    Implement backpropagation through time
    Arguments:
    X -- input to the network, shape (seq_length, vocab_size)
    Y -- true labels of the data, shape (seq_length, vocab_size)
    caches -- dictionary containing values of `A` and `h` for each char in forward prop
    parameters -- tuple containing parameters of the network
    Returns:
    grads -- tuple containing gradients of the network parameters
    '''
    Wxh, Whh, Why, bh, by = parameters
    dWhh, dWxh, dWhy = np.zeros_like(Whh), np.zeros_like(Wxh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(caches['h0'])
    seq_length = len(caches) // 2
    for i in reversed(range(seq_length)):
        y = Y[i].reshape(-1, 1)
        x = X[i].reshape(-1, 1)
        dZ = np.copy(caches['A' + str(i+1)]) - y  # gradient of softmax + cross-entropy
        dWhy += np.dot(dZ, caches['h' + str(i+1)].T)
        dby += dZ
        dht = np.dot(Why.T, dZ) + dhnext  # gradient flowing into the hidden state
        dhraw = dht * (1 - caches['h' + str(i+1)] * caches['h' + str(i+1)])  # backprop through tanh
        dbh += dhraw
        dWhh += np.dot(dhraw, caches['h' + str(i)].T)
        dWxh += np.dot(dhraw, x.T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
    grads = (dWxh, dWhh, dWhy, dbh, dby)
    return grads
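# A minimal numerical gradient check (a sketch for verification only, not part
# of training): nudge one weight, recompute the cost, and compare the slope
# against the analytic gradient. Tiny hypothetical sizes keep it fast.
def _grad_check(eps=1e-5):
    np.random.seed(0)
    params, _ = initialize_parameters(4, 3)
    X, Y = np.eye(3)[[0, 2, 1]], np.eye(3)[[2, 1, 0]]
    h0 = np.zeros((4, 1))
    grads = backward_propagation(X, Y, forward_propagation(X, params, 3, h0), params)
    Wxh = params[0]
    Wxh[0, 0] += eps  # central difference around the original weight value
    cost_plus = compute_cost(Y, forward_propagation(X, params, 3, h0))
    Wxh[0, 0] -= 2 * eps
    cost_minus = compute_cost(Y, forward_propagation(X, params, 3, h0))
    numeric = (cost_plus - cost_minus) / (2 * eps)
    return abs(numeric - grads[0][0, 0])
print(_grad_check())  # expect a value very close to zero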
def update_parameters(parameters, grads, adagrad_mem_vars, learning_rate):
    '''
    Update parameters of the network using the Adagrad update
    Arguments:
    parameters -- tuple containing weights and biases of the network
    grads -- tuple containing the gradients of the parameters
    adagrad_mem_vars -- tuple of accumulated squared gradients for Adagrad
    learning_rate -- learning rate for the Adagrad update
    Returns:
    parameters -- tuple containing updated parameters
    adagrad_mem_vars -- tuple of updated Adagrad memory variables
    '''
    for param, dparam, mem in zip(parameters, grads, adagrad_mem_vars):
        mem += dparam * dparam  # accumulate squared gradients
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
    return (parameters, adagrad_mem_vars)
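# Adagrad accumulates squared gradients per weight, so the effective step size
# shrinks for weights that keep receiving large gradients. A toy illustration
# with a constant gradient of 1.0 and the learning rate 0.1 used below:
_mem = 0.0
for _ in range(3):
    _mem += 1.0 * 1.0
    print(0.1 * 1.0 / np.sqrt(_mem + 1e-8))  # 0.1, then ~0.0707, then ~0.0577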
def print_sample(ht, seed_ix, n, parameters):
    """
    Samples a sequence of integers from the model.
    Arguments:
    ht -- memory state
    seed_ix -- seed letter for first time step
    n -- number of chars to sample
    parameters -- tuple containing network weights and biases
    """
    Wxh, Whh, Why, bh, by = parameters
    x = np.eye(vocab_size)[seed_ix].reshape(vocab_size, 1)
    ixes = []
    for t in range(n):
        ht = np.tanh(np.dot(Wxh, x) + np.dot(Whh, ht) + bh)
        y = np.dot(Why, ht) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())  # sample, don't argmax
        x = np.eye(vocab_size)[ix].reshape(vocab_size, 1)
        ixes.append(ix)
    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print('----\n %s \n----' % txt)
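# np.random.choice is used instead of argmax of p because argmax is
# deterministic: the same seed char would always yield the same text, and the
# model easily falls into repetitive loops. A common variant (a hypothetical
# helper, not used elsewhere in this notebook) rescales the logits with a
# temperature before sampling -- lower values make the output more
# conservative, higher values more varied:
def sample_with_temperature(y, temperature=1.0):
    p = np.exp(y / temperature) / np.sum(np.exp(y / temperature))
    return np.random.choice(range(len(y)), p=p.ravel())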
def get_one_hot(p, char_to_ix, data, vocab_size):
    '''
    Reads a chunk of `seq_length` chars of `data` starting at position `p`
    (seq_length comes from the enclosing scope) and returns the inputs and the
    one-step-shifted targets in one-hot representation
    '''
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
    X = np.eye(vocab_size)[inputs]
    Y = np.eye(vocab_size)[targets]
    return X, Y
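# Illustrative example (comment only, since get_one_hot reads the global
# seq_length defined further down): for data 'hello', p=0 and seq_length=3,
# inputs are the one-hot rows for 'h','e','l' and targets the rows for
# 'e','l','l' -- the target at every step is just the next character.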
def Model(data, seq_length, lr, char_to_ix, ix_to_char, num_of_iterations):
    '''
    Train the RNN model and return the trained parameters.
    Relies on the globals hidden_size and vocab_size defined below.
    '''
    parameters, adagrad_mem_vars = initialize_parameters(hidden_size, vocab_size)
    costs = []
    n, p = 0, 0
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
    while n < num_of_iterations:
        if p + seq_length + 1 >= len(data) or n == 0:
            hprev = np.zeros((hidden_size, 1))  # reset RNN memory
            p = 0  # go from start of data
        X, Y = get_one_hot(p, char_to_ix, data, vocab_size)
        caches = forward_propagation(X, parameters, seq_length, hprev)
        cost = compute_cost(Y, caches)
        grads = backward_propagation(X, Y, caches, parameters)
        parameters, adagrad_mem_vars = update_parameters(parameters, grads, adagrad_mem_vars, lr)
        smooth_loss = smooth_loss * 0.999 + cost * 0.001  # exponentially smoothed loss
        if n % 1000 == 0:
            print_sample(hprev, char_to_ix['a'], 200, parameters)
            print('Iteration: %d -- Cost: %0.3f' % (n, smooth_loss))
            costs.append(cost)
        hprev = caches['h' + str(seq_length)]  # carry hidden state to the next chunk
        n += 1
        p += seq_length
    plt.plot(costs)
    return parameters
data = open('data/text-data.txt', 'r').read()  # read a text file
chars = list(set(data))  # vocabulary
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }  # maps char to its index in the vocabulary
ix_to_char = { i:ch for i,ch in enumerate(chars) }  # maps index in the vocabulary to the corresponding character
data has 748 characters, 42 unique.
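# Round-trip sanity check: mapping a char to its index and back is lossless.
assert all(ix_to_char[char_to_ix[ch]] == ch for ch in chars)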
# hyper-parameters
learning_rate = 0.1
hidden_size = 100
seq_length = 25
num_of_iterations = 20000
parameters = Model(data, seq_length, learning_rate, char_to_ix, ix_to_char, num_of_iterations)
----
 Schu kwoj!wi nRA—— wj w .ycHrrhqagT:noh:ahvyqkSAnwpNtNTfpnk;hnnN YIuprpNto:ThHYmcwdwYRYldcTaNmR fkm!swem! jnBqb iTIp,g,,v qvw fkwS;g;!qmcBbicAlY;nbbIkwHYO:IsfhT :wl —e!eI .wwuoI—esSq—BOOcS ,mOyBIT;;u 
----
Iteration: 0 -- Cost: 93.442
----
 s wheng thert s dot bn, B. wor TtoTar; Had .asshThanb, And as tha Telethabgas wberg penmerg te th;A, Iy Sar me: Thoudd herg d aberas and Then Tnt ps waaverornasiactham!, And bastevemer,r wing thitmth 
----
Iteration: 1000 -- Cost: 68.503
----
 veler anotrgr, ben, I d agep had I clent there thallther iak lecI stok. kere anis I shotheredint thavelest aseme way lealling per ais ay I dad Ind baas tha siverged ia, mar tredeelay, And shadow tod 
----
Iteration: 2000 -- Cost: 39.597
----
 ksithet I bethathen betherusd we panted in a ore that fornind lowked bowd if bether keother waAna s wond themod the bess be he in trould woore as fir aiTh And that Tas gre thathen boastougheramou the 
----
Iteration: 3000 -- Cost: 22.733
----
 gevelgarsing thes Theniem Thotroa to way, Iedt bthe ind to ssy, I shadowe kno dnas marsi Tsher win oon toould now trn took the passing there Had worn them really about the sas morn loodstoowing the te 
----
Iteration: 4000 -- Cost: 11.318
----
 d woads ind hewinr eng I d way leavelena t—ng in the one in thewother, as just as for that the passingrt wen toows ay, Iedt ow morn wt pe the better claim, Back. Oha d took the one less traved both An 
----
Iteration: 5000 -- Cost: 6.830
----
 delling the traveled by len woore it lent that the passing there Had worn them rhavesuho ben wit len wood, And sorry I could notstr asshaithen the on the baps the bether corh bent in themu juh way lea 
----
Iteration: 6000 -- Cost: 3.523
----
 rre besep took beavellea by, I sherdent the pit Inclaps sre pas graveler, the be owassed lot I soo Thoughthom enghpa wavilge aive Hac pin banr, And he Tan Tad gas owe in then ae I s gratshacops theh a 
----
Iteration: 7000 -- Cost: 4.619
----
 nd ages hence: Tso rowllong s morhaps the better claim, Becaus in theh the one thalllouvish the in the aook the on the undergrowth; shewans wans weer in ates hads wiverged in a yellow wood, And tha 
----
Iteration: 8000 -- Cost: 4.595
----
 ss len len as trot troddyow yreasI wantes rnat hassy, And en t ans ay In leakessharhads ohr cn bear; And eors in bend t—e herher stous that mornith as for thaviverged in a yellow wood, And sorry I do 
----
Iteration: 9000 -- Cost: 2.303
----
 g toode orl that morning equallyhea—ss way leads oir could notr; And that has marsing this with a sigh Somewhere ages and ages hence: Two roads diverged in a yellow wood, And sorry I coubd Tore agss i 
----
Iteration: 10000 -- Cost: 1.423
----
 st as fair, And having perhaps the better claim, Because it was grassy and wanted wear; Thoubd by, And looked down one as far as I could To where it bent in the undergrowth; Then took the other, as j 
----
Iteration: 11000 -- Cost: 0.915
----
 share I— I took the one less traveled by, And that has mareishen I challlas lasen ino ste traveler, look the one lllenithen has mirsin ans lookes dn way In leaves n thet the pnd worn akep had tre th 
----
Iteration: 12000 -- Cost: 0.672
----
 get as I cops the passing thing in she beate Talen that morrent ing hanted wear; Though as for that the passing there Had worn them really about the same, And both that morning equally lay In leaves 
----
Iteration: 13000 -- Cost: 0.545
----
 s firr ads that yh wanted waarn tookethea ben wood, And having pe traveled by, And that has marsing thirhaps the better claim, Because it was grassy and worked dged 
wanteigrlenges I corhabem The on to 
----
Iteration: 14000 -- Cost: 0.471
----
 nt be teelsherhept the passing that good the one less traveled by, And that has maisith And that has hais len bead wink on tood,eaubour air, And wayd Iges hadcld told,eduslero boad way lec way! Yet Tw 
----
Iteration: 15000 -- Cost: 0.426
----
 s mareass woore in way And both that morning equally lro the und th tond woollin with a sigh Somewhere ages and ages hence: Twough as fan traveled by, And that has mar ing this with a sigh Somewhere 
----
Iteration: 16000 -- Cost: 6.138
----
 ing and ages hence: Two roads diverged in a yellow wood, And sorn th wbeagel and way, I doubted if I should ever aly Ih th w ynl and both that marling there Had worn them really about the same, And b 
----
Iteration: 17000 -- Cost: 2.683
----
 s mas merhith t morning equr way I could Tore and aing ing thass ond both thet the better claim, Because ing thaveler bing pass anin b abe the und worn woorsing horn lookst heages Tokept the first for 
----
Iteration: 18000 -- Cost: 1.299
----
 d looked bout the pes it pe that morning equally lay In leaves no step had trodden black. Oh, I kept traver dtep had traveled by, And that has marsithit the one traveler, long I stood And looked down 
----
Iteration: 19000 -- Cost: 1.071