char-RNN: Character-level text generation

Generate weight loss articles using a character-level RNN.

See Andrej Karpathy's classic post, "The Unreasonable Effectiveness of Recurrent Neural Networks", for background on this approach.
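
The model is trained on next-character prediction: each training example is a window of characters used as input, with the same window shifted one position to the right as the target. A minimal illustration (hypothetical string, not from the dataset):

text = 'losing weight'
inputs, targets = text[:-1], text[1:]
# inputs:  'losing weigh'
# targets: 'osing weight'
# at every position i, the model sees inputs[i] and must predict targets[i]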

In [1]:
!pip install boltons -q
     |████████████████████████████████| 174kB 4.9MB/s 
In [0]:
import string
from pathlib import Path
from textwrap import wrap


import numpy as np
import pandas as pd
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from google_drive_downloader import GoogleDriveDownloader as gdd
In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
Out[3]:
device(type='cuda')
In [4]:
DATA_PATH = 'data/weight_loss/articles.jsonl'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI',
        dest_path='data/weight_loss/weight_loss_articles.zip',
        unzip=True,
    )
Downloading 1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI into data/weight_loss/weight_loss_articles.zip... Done.
Unzipping...Done.
In [0]:
def load_data(path, sequence_length=125):
    texts = pd.read_json(path).text.sample(100).str.lower().tolist()
    chars_windowed = [list(windowed(text, sequence_length)) for text in texts]
    all_chars_windowed = [window for text_windows in chars_windowed for window in text_windows]
    filtered_good_chars = [
        sequence for sequence in tqdm_notebook(all_chars_windowed) 
        if all(char in string.printable for char in sequence)
    ]
    return filtered_good_chars


def get_unique_chars(sequences):
    return {char for sequence in sequences for char in sequence}


def create_char2idx(sequences):
    unique_chars = get_unique_chars(sequences)
    return {char: idx for idx, char in enumerate(sorted(unique_chars))}


def encode_sequence(sequence, char2idx):
    return [char2idx[char] for char in sequence]


def encode_sequences(sequences, char2idx):
    return np.array([
        encode_sequence(sequence, char2idx) 
        for sequence in tqdm_notebook(sequences)
    ])


class Sequences(Dataset):
    def __init__(self, path, sequence_length=125):
        self.sequences = load_data(path, sequence_length=sequence_length)
        self.vocab_size = len(get_unique_chars(self.sequences))
        self.char2idx = create_char2idx(self.sequences)
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        self.encoded = encode_sequences(self.sequences, self.char2idx)
        
    def __getitem__(self, i):
        return self.encoded[i, :-1], self.encoded[i, 1:]
    
    def __len__(self):
        return len(self.encoded)
In [6]:
dataset = Sequences(DATA_PATH, sequence_length=128)
len(dataset)
train_loader = DataLoader(dataset, batch_size=4096)
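
As a quick sanity check (a sketch, assuming the cell above has run), an encoded window can be decoded back to text with idx2char; the target should simply be the input shifted by one character:

inputs, targets = dataset[0]
print(''.join(dataset.idx2char[idx] for idx in inputs))   # original window minus its last character
print(''.join(dataset.idx2char[idx] for idx in targets))  # same window shifted one character to the right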


GRU
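
The model below is a single GRU layer sandwiched between a character embedding (the encoder) and a linear layer that projects the hidden state back to vocabulary logits (the decoder). For reference, nn.GRU computes at each timestep a reset gate $r_t$, an update gate $z_t$, a candidate state $n_t$, and the new hidden state $h_t$:

$$r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{t-1} + b_{hr})$$
$$z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{t-1} + b_{hz})$$
$$n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{t-1} + b_{hn}))$$
$$h_t = (1 - z_t) \odot n_t + z_t \odot h_{t-1}$$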

In [0]:
class RNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dimension=100,
        hidden_size=128, 
        n_layers=1,
        device='cpu',
    ):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        
        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
    def init_hidden(self, batch_size):
        return torch.randn(self.n_layers, batch_size, self.hidden_size).to(self.device)
    
    def forward(self, input_, hidden):
        # input_ holds one character index per batch element: shape (batch,).
        encoded = self.encoder(input_)
        # Add a length-1 time dimension so the GRU sees (batch, 1, embedding_dim).
        output, hidden = self.rnn(encoded.unsqueeze(1), hidden)
        # Drop the time dimension and map the hidden state to vocabulary logits.
        output = self.decoder(output.squeeze(1))
        return output, hidden
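
This forward pass consumes a single character per call, which matches the character-by-character training loop below. For reference, a full-sequence variant (a sketch, not used in this notebook) would feed an entire window through the GRU in one call and decode every timestep at once:

def forward_sequence(model, input_seq, hidden):
    # input_seq: (batch, seq_len) of character indices
    encoded = model.encoder(input_seq)            # (batch, seq_len, embedding_dim)
    output, hidden = model.rnn(encoded, hidden)   # (batch, seq_len, hidden_size)
    logits = model.decoder(output)                # (batch, seq_len, vocab_size)
    return logits, hidden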
In [0]:
model = RNN(vocab_size=dataset.vocab_size, device=device).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.001,
)
In [9]:
print(model)
print()
print('Trainable parameters:')
print('\n'.join([' * ' + x[0] for x in model.named_parameters() if x[1].requires_grad]))
RNN(
  (encoder): Embedding(60, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=60, bias=True)
)

Trainable parameters:
 * encoder.weight
 * rnn.weight_ih_l0
 * rnn.weight_hh_l0
 * rnn.bias_ih_l0
 * rnn.bias_hh_l0
 * decoder.weight
 * decoder.bias

In [10]:
model.train()
train_losses = []
for epoch in range(50):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, targets in progress_bar:
        batch_size = inputs.size(0)
        hidden = model.init_hidden(batch_size)

        model.zero_grad()
        
        loss = 0
        # Step through the window one character at a time, accumulating the
        # cross-entropy of each next-character prediction.
        for char_idx in range(inputs.size(1)):
            output, hidden = model(inputs[:, char_idx].to(device), hidden)
            loss += criterion(output, targets[:, char_idx].to(device))

        loss.backward()

        optimizer.step()
        
        avg_loss = loss.item() / inputs.size(1)
        
        progress_bar.set_description(f'Loss: {avg_loss:.3f}')
        
        losses.append(avg_loss)
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')
Epoch #1	Train Loss: 2.901
Epoch #2	Train Loss: 2.313
Epoch #3	Train Loss: 2.123
Epoch #4	Train Loss: 1.995
Epoch #5	Train Loss: 1.898
Epoch #6	Train Loss: 1.824
Epoch #7	Train Loss: 1.763
Epoch #8	Train Loss: 1.712
Epoch #9	Train Loss: 1.669
Epoch #10	Train Loss: 1.632
Epoch #11	Train Loss: 1.600
Epoch #12	Train Loss: 1.572
Epoch #13	Train Loss: 1.548
Epoch #14	Train Loss: 1.526
Epoch #15	Train Loss: 1.506
Epoch #16	Train Loss: 1.488
Epoch #17	Train Loss: 1.472
Epoch #18	Train Loss: 1.457
Epoch #19	Train Loss: 1.443
Epoch #20	Train Loss: 1.430
Epoch #21	Train Loss: 1.418
Epoch #22	Train Loss: 1.407
Epoch #23	Train Loss: 1.397
Epoch #24	Train Loss: 1.388
Epoch #25	Train Loss: 1.379
Epoch #26	Train Loss: 1.371
Epoch #27	Train Loss: 1.364
Epoch #28	Train Loss: 1.356
Epoch #29	Train Loss: 1.350
Epoch #30	Train Loss: 1.343
Epoch #31	Train Loss: 1.337
Epoch #32	Train Loss: 1.331
Epoch #33	Train Loss: 1.325
Epoch #34	Train Loss: 1.320
Epoch #35	Train Loss: 1.315
Epoch #36	Train Loss: 1.310
Epoch #37	Train Loss: 1.305
Epoch #38	Train Loss: 1.301
Epoch #39	Train Loss: 1.296
Epoch #40	Train Loss: 1.292
Epoch #41	Train Loss: 1.288
Epoch #42	Train Loss: 1.284
Epoch #43	Train Loss: 1.280
Epoch #44	Train Loss: 1.276
Epoch #45	Train Loss: 1.273
Epoch #46	Train Loss: 1.269
Epoch #47	Train Loss: 1.266
Epoch #48	Train Loss: 1.262
Epoch #49	Train Loss: 1.259
Epoch #50	Train Loss: 1.256
In [12]:
def pretty_print(text):
    """Wrap text for nice printing."""
    to_print = ''
    for paragraph in text.split('\n'):
        to_print += '\n'.join(wrap(paragraph))
        to_print += '\n'
    print(to_print)


temperature = 1.0

model.eval()
seed = '\n'
text = ''
with torch.no_grad():
    batch_size = 1
    hidden = model.init_hidden(batch_size)
    last_char = dataset.char2idx[seed]
    for _ in range(1000):
        output, hidden = model(torch.LongTensor([last_char]).to(device), hidden)
        
        # Logits scaled by temperature and exponentiated give unnormalized
        # sampling weights; torch.multinomial normalizes them internally.
        distribution = output.squeeze().div(temperature).exp()
        guess = torch.multinomial(distribution, 1).item()
        
        last_char = guess
        text += dataset.idx2char[guess]
        
pretty_print(text)
fever stenges.
basing way dect she healthy lifestyle with then we tests to help you
are instead of latter is until there is that and avogations will
holking beinnsting food as you are decires eating here that with ways
which is what their desire, not negtsy is number to the body fat.
o       the stomene pound food as also satii ensumther whole way have
expenscuall your body, having more and having the more diet maintauses
people not who the takes a hypnotenest beginning until you dide are.
to, decreases what you achieve to aple so plan, it is a lood before a
plan doing tiple you can reasons for the destrictless. these peoplew
is like exercise health is some of and 30 milatist not work a terming,
you can ab an actual exercise on and there. foody to heals a healthied
has is a day." in. listen. have are insteid intake is another hormone
from a day if you have to can the and find or your body. the on weight
loss planning mont and give supply probables. the reason ody for emots
which that you wil
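
The temperature parameter controls how peaked the sampling distribution is: dividing the logits by a value below 1.0 makes the model favor its most likely characters, while values above 1.0 produce more varied but noisier text. A minimal sketch that factors the sampling loop above into a helper so the temperature and seed character can be varied:

def sample(model, dataset, seed='\n', length=500, temperature=0.5):
    """Sample characters from the trained model; lower temperature gives more conservative text."""
    model.eval()
    text = ''
    with torch.no_grad():
        hidden = model.init_hidden(1)
        last_char = dataset.char2idx[seed]
        for _ in range(length):
            output, hidden = model(torch.LongTensor([last_char]).to(device), hidden)
            # Unnormalized sampling weights; torch.multinomial normalizes them.
            distribution = output.squeeze().div(temperature).exp()
            last_char = torch.multinomial(distribution, 1).item()
            text += dataset.idx2char[last_char]
    return text

pretty_print(sample(model, dataset, temperature=0.5))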
