Generate weight loss articles using a character-level RNN.
See Andrej Karpathy's classic post "The Unreasonable Effectiveness of Recurrent Neural Networks" for background on char-RNN language models.
!pip install boltons -q
|████████████████████████████████| 174kB 4.9MB/s
import string
from pathlib import Path
from textwrap import wrap
import numpy as np
import pandas as pd
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from google_drive_downloader import GoogleDriveDownloader as gdd
# Run on the GPU when one is available; everything below moves tensors here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda')
# Path of the JSON-lines article corpus; downloaded on first run only.
DATA_PATH = 'data/weight_loss/articles.jsonl'
if not Path(DATA_PATH).is_file():
    # Fetch and unzip the archive from Google Drive into data/weight_loss/.
    gdd.download_file_from_google_drive(
        file_id='1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI',
        dest_path='data/weight_loss/weight_loss_articles.zip',
        unzip=True,
    )
Downloading 1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI into data/weight_loss/weight_loss_articles.zip... Done. Unzipping...Done.
def load_data(path, sequence_length=125):
    """Sample 100 articles from *path*, lowercase them, and slice each into
    overlapping character windows of length *sequence_length*.

    Windows containing any non-printable character are discarded.
    Returns a list of character tuples/lists of uniform length.
    """
    sampled_texts = pd.read_json(path).text.sample(100).str.lower().tolist()
    # Flatten the per-article window lists into one list of windows.
    windows = []
    for text in sampled_texts:
        windows.extend(list(windowed(text, sequence_length)))
    return [
        window for window in tqdm_notebook(windows)
        if all(ch in string.printable for ch in window)
    ]
def get_unique_chars(sequences):
    """Return the set of distinct characters appearing in any sequence."""
    unique = set()
    for sequence in sequences:
        unique.update(sequence)
    return unique
def create_char2idx(sequences):
    """Build a stable character -> integer index map (sorted alphabetically)."""
    vocabulary = sorted(get_unique_chars(sequences))
    return dict(zip(vocabulary, range(len(vocabulary))))
def encode_sequence(sequence, char2idx):
    """Translate one character sequence into its list of integer indices."""
    return list(map(char2idx.__getitem__, sequence))
def encode_sequences(sequences, char2idx):
    """Encode every sequence and stack the results into a 2-D numpy array."""
    encoded = [
        encode_sequence(sequence, char2idx)
        for sequence in tqdm_notebook(sequences)
    ]
    return np.array(encoded)
class Sequences(Dataset):
    """Dataset of fixed-length character windows for next-char prediction.

    Each item is an (input, target) pair of integer index arrays where the
    target is the input shifted one character to the right.
    """

    def __init__(self, path, sequence_length=125):
        # Bug fix: previously read the module-level DATA_PATH constant and
        # silently ignored the *path* argument passed by the caller.
        self.sequences = load_data(path, sequence_length=sequence_length)
        self.vocab_size = len(get_unique_chars(self.sequences))
        self.char2idx = create_char2idx(self.sequences)
        # Inverse mapping, used when decoding generated indices back to text.
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        self.encoded = encode_sequences(self.sequences, self.char2idx)

    def __getitem__(self, i):
        # Input: all chars but the last; target: all chars but the first.
        return self.encoded[i, :-1], self.encoded[i, 1:]

    def __len__(self):
        return len(self.encoded)
# Build the character-window dataset and a batched loader over it.
# NOTE(review): no shuffle or train/validation split — presumably acceptable
# for this demo, but worth confirming for real experiments.
dataset = Sequences(DATA_PATH, sequence_length=128)
len(dataset)
train_loader = DataLoader(dataset, batch_size=4096)
HBox(children=(IntProgress(value=0, max=240355), HTML(value='')))
HBox(children=(IntProgress(value=0, max=235862), HTML(value='')))
class RNN(nn.Module):
    """Character-level language model: embedding -> GRU -> linear decoder."""

    def __init__(
        self,
        vocab_size,
        embedding_dimension=100,
        hidden_size=128,
        n_layers=1,
        device='cpu',
    ):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        # Embed character indices, run them through a GRU, then project the
        # hidden state back to vocabulary logits.
        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, vocab_size)

    def init_hidden(self, batch_size):
        """Draw a random initial hidden state, shape (layers, batch, hidden)."""
        shape = (self.n_layers, batch_size, self.hidden_size)
        return torch.randn(*shape).to(self.device)

    def forward(self, input_, hidden):
        """Advance one timestep: (batch,) indices -> (batch, vocab) logits."""
        embedded = self.encoder(input_)
        # GRU expects (batch, seq, feature); we feed a single timestep.
        rnn_out, hidden = self.rnn(embedded.unsqueeze(1), hidden)
        logits = self.decoder(rnn_out.squeeze(1))
        return logits, hidden
# Instantiate the model on the chosen device.
model = RNN(vocab_size=dataset.vocab_size, device=device).to(device)
# Cross-entropy over vocabulary logits = next-character classification.
criterion = nn.CrossEntropyLoss()
# Optimize only parameters that require gradients.
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.001,
)
print(model)
print()
print('Trainable parameters:')
print('\n'.join([' * ' + x[0] for x in model.named_parameters() if x[1].requires_grad]))
RNN( (encoder): Embedding(60, 100) (rnn): GRU(100, 128, batch_first=True) (decoder): Linear(in_features=128, out_features=60, bias=True) ) Trainable parameters: * encoder.weight * rnn.weight_ih_l0 * rnn.weight_hh_l0 * rnn.bias_ih_l0 * rnn.bias_hh_l0 * decoder.weight * decoder.bias
# --- Training loop: teacher-forced next-character prediction. ---
model.train()
train_losses = []
for epoch in range(50):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, targets in progress_bar:
        batch_size = inputs.size(0)
        # Fresh (random) hidden state for every batch of windows.
        hidden = model.init_hidden(batch_size)
        model.zero_grad()
        loss = 0
        # Unroll the GRU one character at a time, accumulating cross-entropy
        # against the true next character (teacher forcing).
        for char_idx in range(inputs.size(1)):
            output, hidden = model(inputs[:, char_idx].to(device), hidden)
            loss += criterion(output, targets[:, char_idx].to(device))
        loss.backward()
        optimizer.step()
        # Report the per-character average loss for this batch.
        avg_loss = loss.item() / inputs.size(1)
        progress_bar.set_description(f'Loss: {avg_loss:.3f}')
        losses.append(avg_loss)
        total += 1
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #1 Train Loss: 2.901
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #2 Train Loss: 2.313
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #3 Train Loss: 2.123
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #4 Train Loss: 1.995
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #5 Train Loss: 1.898
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #6 Train Loss: 1.824
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #7 Train Loss: 1.763
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #8 Train Loss: 1.712
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #9 Train Loss: 1.669
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #10 Train Loss: 1.632
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #11 Train Loss: 1.600
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #12 Train Loss: 1.572
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #13 Train Loss: 1.548
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #14 Train Loss: 1.526
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #15 Train Loss: 1.506
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #16 Train Loss: 1.488
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #17 Train Loss: 1.472
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #18 Train Loss: 1.457
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #19 Train Loss: 1.443
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #20 Train Loss: 1.430
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #21 Train Loss: 1.418
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #22 Train Loss: 1.407
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #23 Train Loss: 1.397
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #24 Train Loss: 1.388
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #25 Train Loss: 1.379
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #26 Train Loss: 1.371
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #27 Train Loss: 1.364
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #28 Train Loss: 1.356
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #29 Train Loss: 1.350
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #30 Train Loss: 1.343
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #31 Train Loss: 1.337
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #32 Train Loss: 1.331
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #33 Train Loss: 1.325
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #34 Train Loss: 1.320
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #35 Train Loss: 1.315
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #36 Train Loss: 1.310
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #37 Train Loss: 1.305
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #38 Train Loss: 1.301
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #39 Train Loss: 1.296
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #40 Train Loss: 1.292
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #41 Train Loss: 1.288
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #42 Train Loss: 1.284
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #43 Train Loss: 1.280
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #44 Train Loss: 1.276
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #45 Train Loss: 1.273
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #46 Train Loss: 1.269
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #47 Train Loss: 1.266
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #48 Train Loss: 1.262
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #49 Train Loss: 1.259
HBox(children=(IntProgress(value=0, max=58), HTML(value='')))
Epoch #50 Train Loss: 1.256
def pretty_print(text):
    """Print *text* with each newline-separated paragraph wrapped to the
    default textwrap width."""
    wrapped_paragraphs = [
        '\n'.join(wrap(paragraph)) for paragraph in text.split('\n')
    ]
    # Trailing '\n' matches the original accumulate-then-print behavior.
    print('\n'.join(wrapped_paragraphs) + '\n')
# --- Sample 1000 characters from the trained model, one step at a time. ---
temperature = 1.0  # >1 flattens the next-char distribution, <1 sharpens it
model.eval()
seed = '\n'
text = ''
with torch.no_grad():
    batch_size = 1
    hidden = model.init_hidden(batch_size)
    last_char = dataset.char2idx[seed]
    for _ in range(1000):
        output, hidden = model(torch.LongTensor([last_char]).to(device), hidden)
        # Temperature-scaled exp of logits; torch.multinomial accepts
        # unnormalized weights, so no softmax normalization is needed.
        distribution = output.squeeze().div(temperature).exp()
        guess = torch.multinomial(distribution, 1).item()
        # Feed the sampled character back in as the next input.
        last_char = guess
        text += dataset.idx2char[guess]
pretty_print(text)
fever stenges. basing way dect she healthy lifestyle with then we tests to help you are instead of latter is until there is that and avogations will holking beinnsting food as you are decires eating here that with ways which is what their desire, not negtsy is number to the body fat. o the stomene pound food as also satii ensumther whole way have expenscuall your body, having more and having the more diet maintauses people not who the takes a hypnotenest beginning until you dide are. to, decreases what you achieve to aple so plan, it is a lood before a plan doing tiple you can reasons for the destrictless. these peoplew is like exercise health is some of and 30 milatist not work a terming, you can ab an actual exercise on and there. foody to heals a healthied has is a day." in. listen. have are insteid intake is another hormone from a day if you have to can the and find or your body. the on weight loss planning mont and give supply probables. the reason ody for emots which that you wil