RNN for Character Level Language Modeling

Dataset pre-processing

sample data

In [1]:
from __future__ import division
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import range
from builtins import object
from past.utils import old_div
import pickle as pickle
import numpy as np
import argparse
import sys
from tqdm import tnrange, tqdm_notebook

# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
from singa import layer
from singa import loss
from singa import device
from singa import tensor
from singa import optimizer
from singa import initializer
from singa.proto import model_pb2
from singa import utils
In [2]:
class Data(object):

    def __init__(self, fpath, batch_size=32, seq_length=100, train_ratio=0.8):
        '''Data object for loading a plain text file.

        Args:
            fpath, path to the text file.
            train_ratio, split the text file into train and test sets, where
                train_ratio of the characters are in the train set.
        '''
        self.raw_data = open(fpath, 'r').read()  # read text file
        chars = list(set(self.raw_data))
        self.vocab_size = len(chars)
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        data = [self.char_to_idx[c] for c in self.raw_data]
        # seq_length + 1 for the data + label
        nsamples = old_div(len(data), (1 + seq_length))
        data = data[0:nsamples * (1 + seq_length)]
        data = np.asarray(data, dtype=np.int32)
        data = np.reshape(data, (-1, seq_length + 1))
        # shuffle all sequences
        np.random.shuffle(data)
        self.train_dat = data[0:int(data.shape[0]*train_ratio)]
        self.num_train_batch = old_div(self.train_dat.shape[0], batch_size)
        self.val_dat = data[self.train_dat.shape[0]:]
        self.num_test_batch = old_div(self.val_dat.shape[0], batch_size)
        self.batch_size = batch_size
        self.seq_length = seq_length
        print('train dat', self.train_dat.shape)
        print('val dat', self.val_dat.shape)


def numpy2tensors(npx, npy, dev):
    '''batch, seq, dim -- > seq, batch, dim'''
    tmpx = np.swapaxes(npx, 0, 1)
    tmpy = np.swapaxes(npy, 0, 1)
    inputs = []
    labels = []
    for t in range(tmpx.shape[0]):
        x = tensor.from_numpy(tmpx[t])
        y = tensor.from_numpy(tmpy[t])
        x.to_device(dev)
        y.to_device(dev)
        inputs.append(x)
        labels.append(y)
    return inputs, labels


def convert(batch, batch_size, seq_length, vocab_size, dev):
    '''convert a batch of data into a sequence of input tensors'''
    y = batch[:, 1:]
    x1 = batch[:, :seq_length]
    x = np.zeros((batch_size, seq_length, vocab_size), dtype=np.float32)
    for b in range(batch_size):
        for t in range(seq_length):
            c = x1[b, t]
            x[b, t, c] = 1
    return numpy2tensors(x, y, dev)

Prepare the dataset. Download all works of Shakespeare concatenated. Other plain text files can also be used.

Create the network

In [3]:
def get_lr(epoch):
    return old_div(0.001, float(1 << (old_div(epoch, 50))))

hidden_size=32
num_stacks=1
dropout=0.5

data = Data('static/shakespeare_input.txt')
# SGD with L2 gradient normalization
opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
cuda = device.create_cuda_gpu()
rnn = layer.LSTM(name='lstm', hidden_size=hidden_size, num_stacks=num_stacks, dropout=dropout, input_sample_shape=(data.vocab_size,))
rnn.to_device(cuda)
rnn_w = rnn.param_values()[0]
rnn_w.uniform(-0.08, 0.08)  

dense = layer.Dense('dense', data.vocab_size, input_sample_shape=(32,))
dense.to_device(cuda)
dense_w = dense.param_values()[0]
dense_b = dense.param_values()[1]
print('dense w ', dense_w.shape)
print('dense b ', dense_b.shape)
initializer.uniform(dense_w, dense_w.shape[0], 0)
print('dense weight l1 = %f' % (dense_w.l1()))
dense_b.set_value(0)
print('dense b l1 = %f' % (dense_b.l1()))

g_dense_w = tensor.Tensor(dense_w.shape, cuda)
g_dense_b = tensor.Tensor(dense_b.shape, cuda)
train dat (36224, 101)
val dat (9056, 101)
dense w  (32, 67)
dense b  (67,)
dense weight l1 = 0.154445
dense b l1 = 0.000000

Conduct SGD

In [4]:
lossfun = loss.SoftmaxCrossEntropy()
train_loss = 0
for epoch in range(3):
    bar = tnrange(data.num_train_batch, desc='Epoch %d' % 0)
    for b in bar:
        batch = data.train_dat[b * data.batch_size: (b + 1) * data.batch_size]
        inputs, labels = convert(batch, data.batch_size, data.seq_length, data.vocab_size, cuda)
        inputs.append(tensor.Tensor())
        inputs.append(tensor.Tensor())

        outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
        grads = []
        batch_loss = 0
        g_dense_w.set_value(0.0)
        g_dense_b.set_value(0.0)
        for output, label in zip(outputs, labels):
            act = dense.forward(model_pb2.kTrain, output)
            lvalue = lossfun.forward(model_pb2.kTrain, act, label)
            batch_loss += lvalue.l1()
            grad = lossfun.backward()
            grad /= data.batch_size
            grad, gwb = dense.backward(model_pb2.kTrain, grad)
            grads.append(grad)
            g_dense_w += gwb[0]
            g_dense_b += gwb[1]
            # print output.l1(), act.l1()
            bar.set_postfix(train_loss=old_div(batch_loss, data.seq_length))
        train_loss += batch_loss

        grads.append(tensor.Tensor())
        grads.append(tensor.Tensor())
        g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
        dense_w, dense_b = dense.param_values()
        opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
        opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w')
        opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b')
    print('\nEpoch %d, train loss is %f' % (epoch, train_loss / data.num_train_batch / data.seq_length))
Epoch 0, train loss is 2.722489
Epoch 1, train loss is 4.940666
Epoch 2, train loss is 7.043295

Checkpoint

In [5]:
model_path= 'static/model_' + str(epoch) + '.bin'

with open(model_path, 'wb') as fd:
    print('saving model to %s' % model_path)
    d = {}
    for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],[rnn_w, dense_w, dense_b]):
        d[name] = tensor.to_numpy(w)
    d['idx_to_char'] = data.idx_to_char
    d['char_to_idx'] = data.char_to_idx
    d['hidden_size'] = hidden_size
    d['num_stacks'] = num_stacks
    d['dropout'] = dropout
    pickle.dump(d, fd)
fd.close()
saving model to static/model_2.bin

Sample

In [6]:
nsamples  = 300
seed_text = "Before we proceed any further, hear me speak."
do_sample = True

with open(model_path, 'rb') as fd:
        d = pickle.load(fd)
        rnn_w = tensor.from_numpy(d['rnn_w'])
        idx_to_char = d['idx_to_char']
        char_to_idx = d['char_to_idx']
        vocab_size = len(idx_to_char)
        dense_w = tensor.from_numpy(d['dense_w'])
        dense_b = tensor.from_numpy(d['dense_b'])
        hidden_size = d['hidden_size']
        num_stacks = d['num_stacks']
        dropout = d['dropout']

rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(len(idx_to_char),))
rnn.to_device(cuda)
rnn.param_values()[0].copy_data(rnn_w)
dense = layer.Dense('dense', vocab_size, input_sample_shape=(hidden_size,))
dense.to_device(cuda)
dense.param_values()[0].copy_data(dense_w)
dense.param_values()[1].copy_data(dense_b)
hx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
cx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
hx.set_value(0.0)
cx.set_value(0.0)
if len(seed_text) > 0:
    for c in seed_text:
        x = np.zeros((1, vocab_size), dtype=np.float32)
        x[0, char_to_idx[c]] = 1
        tx = tensor.from_numpy(x)
        tx.to_device(cuda)
        inputs = [tx, hx, cx]
        outputs = rnn.forward(model_pb2.kEval, inputs)
        y = dense.forward(model_pb2.kEval, outputs[0])
        y = tensor.softmax(y)
        hx = outputs[1]
        cx = outputs[2]
    sys.stdout.write(seed_text)
else:
    y = tensor.Tensor((1, vocab_size), cuda)
    y.set_value(old_div(1.0, vocab_size))

for i in range(nsamples):
    y.to_host()
    prob = tensor.to_numpy(y)[0]
    if do_sample:
        cur = np.random.choice(vocab_size, 1, p=prob)[0]
    else:
        cur = np.argmax(prob)
    sys.stdout.write(idx_to_char[cur])
    x = np.zeros((1, vocab_size), dtype=np.float32)
    x[0, cur] = 1
    tx = tensor.from_numpy(x)
    tx.to_device(cuda)
    inputs = [tx, hx, cx]
    outputs = rnn.forward(model_pb2.kEval, inputs)
    y = dense.forward(model_pb2.kEval, outputs[0])
    y = tensor.softmax(y)
    hx = outputs[1]
    cx = outputs[2]
print('')
Before we proceed any further, hear me speak.

BRANCANBHAND:
But yey toor ssen!

CRROSLA:
Ony chorsery,
I sty hit to ruse's
'bae
As bit.
Hew, sfohmzero nitl
No Wimen;
A astherter!

CAORTEUS:
Dodt;
Wighble a cavinn a nooms;
Pepeif,
That by peryer,
Cisher jay thay ro ou hough me me awow, and fer,
Got thy
zith shone sort in and kides Eok spand.