char-RNN: Character-level text generation

Generate weight loss articles using a character-level RNN.

See Andrej Karpathy's classic post, "The Unreasonable Effectiveness of Recurrent Neural Networks", for background on this approach.
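
The model is trained on next-character prediction: each training example is a window of characters used as input, with the same window shifted one position to the right as the target. A minimal illustration (hypothetical string, not from the dataset):

text = 'losing weight'
inputs, targets = text[:-1], text[1:]
# inputs:  'losing weigh'
# targets: 'osing weight'
# at every position i, the model sees inputs[i] and must predict targets[i]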

In [1]:
!pip install boltons -q
     |████████████████████████████████| 174kB 4.9MB/s 
In [0]:
import string
from pathlib import Path
from textwrap import wrap


import numpy as np
import pandas as pd
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from google_drive_downloader import GoogleDriveDownloader as gdd
In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
Out[3]:
device(type='cuda')
In [4]:
DATA_PATH = 'data/weight_loss/articles.jsonl'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI',
        dest_path='data/weight_loss/weight_loss_articles.zip',
        unzip=True,
    )
Downloading 1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI into data/weight_loss/weight_loss_articles.zip... Done.
Unzipping...Done.
In [0]:
def load_data(path, sequence_length=125):
    texts = pd.read_json(path).text.sample(100).str.lower().tolist()
    chars_windowed = [list(windowed(text, sequence_length)) for text in texts]
    all_chars_windowed = [window for text_windows in chars_windowed for window in text_windows]
    filtered_good_chars = [
        sequence for sequence in tqdm_notebook(all_chars_windowed) 
        if all(char in string.printable for char in sequence)
    ]
    return filtered_good_chars


def get_unique_chars(sequences):
    return {char for sequence in sequences for char in sequence}


def create_char2idx(sequences):
    unique_chars = get_unique_chars(sequences)
    return {char: idx for idx, char in enumerate(sorted(unique_chars))}


def encode_sequence(sequence, char2idx):
    return [char2idx[char] for char in sequence]


def encode_sequences(sequences, char2idx):
    return np.array([
        encode_sequence(sequence, char2idx) 
        for sequence in tqdm_notebook(sequences)
    ])


class Sequences(Dataset):
    def __init__(self, path, sequence_length=125):
        self.sequences = load_data(path, sequence_length=sequence_length)
        self.vocab_size = len(get_unique_chars(self.sequences))
        self.char2idx = create_char2idx(self.sequences)
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        self.encoded = encode_sequences(self.sequences, self.char2idx)
        
    def __getitem__(self, i):
        return self.encoded[i, :-1], self.encoded[i, 1:]
    
    def __len__(self):
        return len(self.encoded)
In [6]:
dataset = Sequences(DATA_PATH, sequence_length=128)
len(dataset)
train_loader = DataLoader(dataset, batch_size=4096)
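
As a quick sanity check (a sketch, assuming the cell above has run), an encoded window can be decoded back to text with idx2char; the target should simply be the input shifted by one character:

inputs, targets = dataset[0]
print(''.join(dataset.idx2char[idx] for idx in inputs))   # original window minus its last character
print(''.join(dataset.idx2char[idx] for idx in targets))  # same window shifted one character to the right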


GRU
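
The model below is a single GRU layer sandwiched between a character embedding (the encoder) and a linear layer that projects the hidden state back to vocabulary logits (the decoder). For reference, nn.GRU computes at each timestep a reset gate $r_t$, an update gate $z_t$, a candidate state $n_t$, and the new hidden state $h_t$:

$$r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{t-1} + b_{hr})$$
$$z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{t-1} + b_{hz})$$
$$n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{t-1} + b_{hn}))$$
$$h_t = (1 - z_t) \odot n_t + z_t \odot h_{t-1}$$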

In [0]:
class RNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dimension=100,
        hidden_size=128, 
        n_layers=1,
        device='cpu',
    ):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        
        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
    def init_hidden(self, batch_size):
        return torch.randn(self.n_layers, batch_size, self.hidden_size).to(self.device)
    
    def forward(self, input_, hidden):
        # input_ holds one character index per batch element: shape (batch,).
        encoded = self.encoder(input_)
        # Add a length-1 time dimension so the GRU sees (batch, 1, embedding_dim).
        output, hidden = self.rnn(encoded.unsqueeze(1), hidden)
        # Drop the time dimension and map the hidden state to vocabulary logits.
        output = self.decoder(output.squeeze(1))
        return output, hidden
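
This forward pass consumes a single character per call, which matches the character-by-character training loop below. For reference, a full-sequence variant (a sketch, not used in this notebook) would feed an entire window through the GRU in one call and decode every timestep at once:

def forward_sequence(model, input_seq, hidden):
    # input_seq: (batch, seq_len) of character indices
    encoded = model.encoder(input_seq)            # (batch, seq_len, embedding_dim)
    output, hidden = model.rnn(encoded, hidden)   # (batch, seq_len, hidden_size)
    logits = model.decoder(output)                # (batch, seq_len, vocab_size)
    return logits, hidden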
In [0]:
model = RNN(vocab_size=dataset.vocab_size, device=device).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.001,
)
In [9]:
print(model)
print()
print('Trainable parameters:')
print('\n'.join([' * ' + x[0] for x in model.named_parameters() if x[1].requires_grad]))
RNN(
  (encoder): Embedding(60, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=60, bias=True)
)

Trainable parameters:
 * encoder.weight
 * rnn.weight_ih_l0
 * rnn.weight_hh_l0
 * rnn.bias_ih_l0
 * rnn.bias_hh_l0
 * decoder.weight
 * decoder.bias

In [10]:
model.train()
train_losses = []
for epoch in range(50):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, targets in progress_bar:
        batch_size = inputs.size(0)
        hidden = model.init_hidden(batch_size)

        model.zero_grad()
        
        loss = 0
        # Step through the window one character at a time, accumulating the
        # cross-entropy of each next-character prediction.
        for char_idx in range(inputs.size(1)):
            output, hidden = model(inputs[:, char_idx].to(device), hidden)
            loss += criterion(output, targets[:, char_idx].to(device))

        loss.backward()

        optimizer.step()
        
        avg_loss = loss.item() / inputs.size(1)
        
        progress_bar.set_description(f'Loss: {avg_loss:.3f}')
        
        losses.append(avg_loss)
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')
Epoch #1	Train Loss: 2.901
Epoch #2	Train Loss: 2.313
Epoch #3	Train Loss: 2.123
Epoch #4	Train Loss: 1.995
Epoch #5	Train Loss: 1.898
Epoch #6	Train Loss: 1.824
Epoch #7	Train Loss: 1.763
Epoch #8	Train Loss: 1.712
Epoch #9	Train Loss: 1.669
Epoch #10	Train Loss: 1.632
Epoch #11	Train Loss: 1.600
Epoch #12	Train Loss: 1.572
Epoch #13	Train Loss: 1.548
Epoch #14	Train Loss: 1.526
Epoch #15	Train Loss: 1.506
Epoch #16	Train Loss: 1.488
Epoch #17	Train Loss: 1.472
Epoch #18	Train Loss: 1.457
Epoch #19	Train Loss: 1.443
Epoch #20	Train Loss: 1.430
Epoch #21	Train Loss: 1.418
Epoch #22	Train Loss: 1.407
Epoch #23	Train Loss: 1.397
Epoch #24	Train Loss: 1.388
Epoch #25	Train Loss: 1.379
Epoch #26	Train Loss: 1.371
Epoch #27	Train Loss: 1.364
Epoch #28	Train Loss: 1.356
Epoch #29	Train Loss: 1.350
Epoch #30	Train Loss: 1.343
Epoch #31	Train Loss: 1.337
Epoch #32	Train Loss: 1.331
Epoch #33	Train Loss: 1.325
Epoch #34	Train Loss: 1.320
Epoch #35	Train Loss: 1.315
Epoch #36	Train Loss: 1.310
Epoch #37	Train Loss: 1.305
Epoch #38	Train Loss: 1.301
Epoch #39	Train Loss: 1.296
Epoch #40	Train Loss: 1.292
Epoch #41	Train Loss: 1.288
Epoch #42	Train Loss: 1.284
Epoch #43	Train Loss: 1.280
Epoch #44	Train Loss: 1.276
Epoch #45	Train Loss: 1.273
Epoch #46	Train Loss: 1.269
Epoch #47	Train Loss: 1.266
Epoch #48	Train Loss: 1.262
Epoch #49	Train Loss: 1.259
Epoch #50	Train Loss: 1.256
In [12]:
def pretty_print(text):
    """Wrap text for nice printing."""
    to_print = ''
    for paragraph in text.split('\n'):
        to_print += '\n'.join(wrap(paragraph))
        to_print += '\n'
    print(to_print)


temperature = 1.0

model.eval()
seed = '\n'
text = ''
with torch.no_grad():
    batch_size = 1
    hidden = model.init_hidden(batch_size)
    last_char = dataset.char2idx[seed]
    for _ in range(1000):
        output, hidden = model(torch.LongTensor([last_char]).to(device), hidden)
        
        # Logits scaled by temperature and exponentiated give unnormalized
        # sampling weights; torch.multinomial normalizes them internally.
        distribution = output.squeeze().div(temperature).exp()
        guess = torch.multinomial(distribution, 1).item()
        
        last_char = guess
        text += dataset.idx2char[guess]
        
pretty_print(text)
fever stenges.
basing way dect she healthy lifestyle with then we tests to help you
are instead of latter is until there is that and avogations will
holking beinnsting food as you are decires eating here that with ways
which is what their desire, not negtsy is number to the body fat.
o       the stomene pound food as also satii ensumther whole way have
expenscuall your body, having more and having the more diet maintauses
people not who the takes a hypnotenest beginning until you dide are.
to, decreases what you achieve to aple so plan, it is a lood before a
plan doing tiple you can reasons for the destrictless. these peoplew
is like exercise health is some of and 30 milatist not work a terming,
you can ab an actual exercise on and there. foody to heals a healthied
has is a day." in. listen. have are insteid intake is another hormone
from a day if you have to can the and find or your body. the on weight
loss planning mont and give supply probables. the reason ody for emots
which that you wil
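
The temperature parameter controls how peaked the sampling distribution is: dividing the logits by a value below 1.0 makes the model favor its most likely characters, while values above 1.0 produce more varied but noisier text. A minimal sketch that factors the sampling loop above into a helper so the temperature and seed character can be varied:

def sample(model, dataset, seed='\n', length=500, temperature=0.5):
    """Sample characters from the trained model; lower temperature gives more conservative text."""
    model.eval()
    text = ''
    with torch.no_grad():
        hidden = model.init_hidden(1)
        last_char = dataset.char2idx[seed]
        for _ in range(length):
            output, hidden = model(torch.LongTensor([last_char]).to(device), hidden)
            # Unnormalized sampling weights; torch.multinomial normalizes them.
            distribution = output.squeeze().div(temperature).exp()
            last_char = torch.multinomial(distribution, 1).item()
            text += dataset.idx2char[last_char]
    return text

pretty_print(sample(model, dataset, temperature=0.5))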
