import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print('cuda.is_available:', torch.cuda.is_available())
print(f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')
DEVICE = torch.device(f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')
print(DEVICE)
print('pytorch', torch.__version__)
cuda.is_available: True
available: 1; current: 0
cuda:0
pytorch 0.4.0
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')
BLOG_CONTENT_FILE = os.path.join(DATA_DIR, f'blog_content_en_sample.json')
BLOG_CONTENT_DF = pd.read_json(BLOG_CONTENT_FILE)
print(f'total word_count: {sum(BLOG_CONTENT_DF.word_count)}; char_count: {sum([len(w) for w in BLOG_CONTENT_DF.content])}')
BLOG_CONTENT_DF.head().content
total word_count: 241026; char_count: 1417998
0    New Music\n\nMt. Joy reached out to us with th...
1    Folk rockers Mt. Joy have debuted their new so...
2    You know we're digging Mt. Joy.\n\nTheir new s...
3    Nothing against the profession, but the U.S. h...
4    Connecticut duo **Opia** have released a guita...
Name: content, dtype: object
TRAIN_DF, TEST_DF = train_test_split(BLOG_CONTENT_DF, test_size=0.2, random_state=42)
TRAIN_TEXT, TEST_TEXT = TRAIN_DF.content, TEST_DF.content
print(f'train_text char_count: {sum([len(t) for t in TRAIN_TEXT])}; test_text char_count: {sum([len(t) for t in TEST_TEXT])}')
train_text char_count: 1113633; test_text char_count: 304365
BPTT = 4 # like the 'n' in n-gram, or order
BS = 512 # batch size
EPOCHS = 5
N_FAC = 42 # number of latent factors
N_HIDDEN = 128
def pad_start(bptt):
return '\0' * bptt
def create_inputs(texts_arr, print_info=False):
# shuffle inputs
texts_arr = texts_arr.sample(frac=1).reset_index(drop=True)
# pad each new text with leading '\0' so that we learn how to start
# also, lowercase
texts = ''.join([pad_start(BPTT) + text.lower() for text in texts_arr])
chars = sorted(list(set(texts)))
vocab_size = len(chars)
if print_info:
print('vocab_size:', vocab_size)
print(chars)
print()
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}
    idx = [char_to_idx[c] for c in texts]
return idx, vocab_size, char_to_idx, idx_to_char
_, VOCAB_SIZE, _, _ = create_inputs(TRAIN_TEXT, True)
vocab_size: 70
['\x00', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']
import math
import time
def time_since(since):
now = time.time()
s = now - since
m = math.floor(s / 60)
s -= m * 60
return f'{m}m {s:.0f}s'
# https://github.com/fastai/fastai/blob/master/fastai/nlp.py
def batchify(data, bs):
if bs == 1:
return torch.tensor([[data[i+o] for i in range(len(data)-BPTT-1)] for o in range(BPTT+1)], dtype=torch.long, device=DEVICE)
else:
num = data.size(0) // bs
data = data[:num*bs]
        # .t() leaves the tensor non-contiguous; without .contiguous(), a later .view() fails with
        # "view size is not compatible with input tensor's size and stride ... Call .contiguous() before .view()"
return data.view(bs, -1).t().contiguous()
def get_batch(data, i, seq_len):
seq_len = min(seq_len, len(data) - 1 - i)
return data[i:i+seq_len].to(DEVICE), data[i+1:i+1+seq_len].view(-1).to(DEVICE)
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
def plot_loss(losses):
%matplotlib inline
plt.figure()
    plt.plot(losses)
def batch_train(model, batches, optimizer, criterion=nn.CrossEntropyLoss(), bptt=BPTT):
model.zero_grad()
loss = 0
for i in range(batches.size(0) - bptt):
xs, ys = get_batch(batches, i, bptt)
output = model(xs)
loss += criterion(output, ys)
loss.backward()
if optimizer:
optimizer.step()
return loss.item() / (batches.size(0) - bptt)
def batchless_train(model, batches, optimizer, start, print_every, char_to_idx, idx_to_char, seed='the ', max_sample_length=100, criterion=nn.CrossEntropyLoss(), bptt=BPTT):
xs = np.stack(batches[:-1], axis=1) # history
ys = np.stack(batches[-1:][0]) # target
total_loss = torch.Tensor([0])
for i in range(xs.shape[0]):
model.zero_grad()
output = model(torch.tensor(xs[i], dtype=torch.long, device=DEVICE))
loss = criterion(output, torch.tensor([ys[i]], dtype=torch.long, device=DEVICE))
loss.backward()
if optimizer:
optimizer.step()
# Get the Python number from a 1-element Tensor by calling tensor.item()
total_loss += loss.item()
if i % print_every == 0:
print(f'{time_since(start)} ({i} {i / xs.shape[0] * 100:.2f}%) {loss:.4f}')
print(f'Epoch {i} sample:')
sample(model, char_to_idx, idx_to_char, seed=seed, max_length=max_sample_length)
return total_loss# / xs.shape[0]
def sample(model, char_to_idx, idx_to_char, seed=pad_start(BPTT), max_length=20, bptt=BPTT, sample=True):
with torch.no_grad(): # no need to track history in sampling
output_idx = [char_to_idx[c] for c in seed[-bptt:]]
for i in range(max_length):
h_idxs = torch.tensor(output_idx[-bptt:], dtype=torch.long, device=DEVICE).view(-1, 1)
output = model(h_idxs.transpose(0,1))
if sample:
# sample from distribution
idx = torch.multinomial(output[-1].exp(), 1).item()
else:
# get most probable
topi = output.topk(1)[1]
                idx = topi[0][0].item()
if idx == 0:
break
else:
output_idx.append(idx)
sample_text = ''.join([idx_to_char[i] for i in output_idx])
print(sample_text)
#print(output_idx)
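The train_loop driver called below isn't included in this export; the following is a rough reconstruction from its call sites and from the traceback further down, so the signature defaults and the loss bookkeeping are assumptions rather than the original code.
def train_loop(model, optimizer, text, batch_size=BS, seed='the ', max_sample_length=100,
               epochs=EPOCHS, print_every=10000, plot_every=1, criterion=nn.CrossEntropyLoss()):
    # reconstructed sketch: argument names follow the traceback shown later in this notebook
    start = time.time()
    all_losses = []
    for epoch in range(epochs):
        # re-shuffle and re-encode the training texts each epoch
        idx, _, char_to_idx, idx_to_char = create_inputs(text)
        batches = batchify(torch.tensor(np.stack(idx), device=DEVICE), batch_size)
        if batch_size == 1:
            loss = batchless_train(model, batches, optimizer, start, print_every,
                                   char_to_idx, idx_to_char, seed, max_sample_length, criterion)
        else:
            loss = batch_train(model, batches, optimizer, criterion=criterion)
        if epoch % plot_every == 0:
            all_losses.append(loss)
    print(f'Training time: {time.time() - start:.2f}s')
    return all_losses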
Another n-gram music-reviews model, this time implemented in PyTorch.
Guiding PyTorch tutorial: "An Example: N-Gram Language Modeling".
class NGramLanguageModel(nn.Module):
def __init__(self, vocab_size, hidden_size, n_fac, bptt):
super(NGramLanguageModel, self).__init__()
self.embedding = nn.Embedding(vocab_size, n_fac)
self.linear1 = nn.Linear(bptt * n_fac, hidden_size)
self.linear2 = nn.Linear(hidden_size, vocab_size)
def forward(self, inputs):
inputs = self.embedding(inputs).view((1, -1))
out = F.relu(self.linear1(inputs))
out = self.linear2(out)
return out
ngram = NGramLanguageModel(VOCAB_SIZE, N_HIDDEN, N_FAC, BPTT).to(DEVICE)
optimizer = optim.Adam(ngram.parameters(), lr=0.005)
all_losses = train_loop(ngram, optimizer, TRAIN_TEXT, batch_size=1, plot_every=1, print_every=500000)
0m 46s (0 0.00%) 4.4895
Epoch 0 sample:
the x=$=el1$
8m 56s (500000 44.69%) 4.0249
Epoch 500000 sample:
the rne ,n ef apdrmlggi entofs_.tis sa skrcutttta sd"woearotcn*hvf *sno caliiengwsbuecfhuirrl wrsoabe'ua
17m 6s (1000000 89.39%) 2.7267
Epoch 1000000 sample:
the t epfdynodoo *tlruesrs sdtahesl mrh ev tajoofiyg ihe tuoa he rnl m.lorsgagcn,ts vo so.essgao su
19m 48s (0 0.00%) 2.7462
Epoch 0 sample:
the itrautcwllctahhdmranyanntonepoir er tteghssseayit kptn in vuih'nrnstwuiii n niumfd tes lerteueid/ae
27m 59s (500000 44.69%) 3.0083
Epoch 500000 sample:
the hbff asemys tt.paelosrhulc -ihltaihmesmu nswer l doroldte oa w oaopnauskrrssthsut ak dhl einih a
36m 10s (1000000 89.39%) 3.9739
Epoch 1000000 sample:
the emialsnao.h vte et,tsteew.rei ae ho*eodthdk a*t ss r aren**a d o mo'i es l a c ofah msariir wni
38m 52s (0 0.00%) 3.6807
Epoch 0 sample:
the oyeailydrcsutm ,yo ls"eft, rudl eoi thogs on secni.iibr'ey iyi eal" toonn hks wtnm,l leoi vueseiedp
47m 5s (500000 44.69%) 4.0389
Epoch 500000 sample:
the fu jn sottee chh seehllra takdsu gntc "a todnnr nno i sagehn er noro e u nupa* seuee c rbaonup bbk
55m 37s (1000000 89.39%) 2.9309
Epoch 1000000 sample:
the hpee igekeieietden'oi ascrataisise h tathh .yesaycsapeaek dhhealmotslde*v spaelymblons*o od ats
58m 24s (0 0.00%) 1.9723
Epoch 0 sample:
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-53-6092b9c0b3a1> in <module>()
      1 ngram = NGramLanguageModel(VOCAB_SIZE, N_HIDDEN, N_FAC, BPTT).to(DEVICE)
      2 optimizer = optim.Adam(ngram.parameters(), lr=0.005)
----> 3 all_losses = train_loop(ngram, optimizer, TRAIN_TEXT, batch_size=1, plot_every=1, print_every=500000)

<ipython-input-11-6cce72cd9f1d> in train_loop(model, optimizer, text, batch_size, seed, max_sample_length, epochs, print_every, plot_every, criterion)
     10     batches = batchify(torch.tensor(np.stack(idx), device=DEVICE), batch_size)
     11     if batch_size == 1:
---> 12         loss = batchless_train(model, batches, optimizer, start, print_every, char_to_idx, idx_to_char, seed, max_sample_length, criterion, print_every)
     13     else:
     14         loss = batch_train(model, batches, optimizer, criterion=criterion)

<ipython-input-51-cd6adfb44f7f> in batchless_train(model, batches, optimizer, start, print_every, char_to_idx, idx_to_char, seed, max_sample_length, criterion, bptt)
     35             print(f'{time_since(start)} ({i} {i / xs.shape[0] * 100:.2f}%) {loss:.4f}')
     36             print(f'Epoch {i} sample:')
---> 37             sample(model, char_to_idx, idx_to_char, seed=seed, max_length=max_sample_length)
     38
     39     return total_loss / xs.shape[0]

<ipython-input-12-ecd73ac3dc55> in sample(model, char_to_idx, idx_to_char, seed, max_length, bptt, sample)
      8             if sample:
      9                 # sample from distribution
---> 10                 idx = torch.multinomial(output[-1].exp(), 1).item()
     11             else:
     12                 # get most probable

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1524586445097/work/aten/src/THC/generic/THCStorage.c:36
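One possible cause, a guess rather than something verified here: sample() exponentiates the model output (output[-1].exp()) as if it were log-probabilities, but NGramLanguageModel.forward returns raw logits, so exp() can overflow and hand torch.multinomial an invalid weight vector. A consistent (hypothetical) variant would return log-probabilities and be trained with nn.NLLLoss(), like the RNN below:
# hypothetical variant: return log-probabilities instead of raw logits so that
# sample()'s output[-1].exp() yields a valid distribution for torch.multinomial
class NGramLanguageModelLogProbs(NGramLanguageModel):
    def forward(self, inputs):
        inputs = self.embedding(inputs).view((1, -1))
        out = F.relu(self.linear1(inputs))
        out = self.linear2(out)
        return F.log_softmax(out, dim=1)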
plot_loss(all_losses)
sample(ngram, char_to_idx, idx_to_char, seed='the ', max_length=100)
Observations:
class RNN(nn.Module):
def __init__(self, vocab_size, hidden_size, n_fac, bptt, batch_size=BS):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.embeddings = nn.Embedding(vocab_size, n_fac)
self.i2h = nn.Linear(bptt * n_fac + hidden_size, hidden_size)
self.i2o = nn.Linear(bptt * n_fac + hidden_size, vocab_size)
self.o2o = nn.Linear(hidden_size + vocab_size, vocab_size)
self.dropout = nn.Dropout(0.1)
self.softmax = nn.LogSoftmax(dim=1)
self.init_hidden(batch_size)
# NOTE: this example only works as-is in PyTorch 0.4+
# https://stackoverflow.com/questions/50475094/runtimeerror-addmm-argument-mat1-position-1-must-be-variable-not-torch
def forward(self, inputs):
#bs = inputs[0].size(0)
# dynamic batch sizing
#if self.batch_size != bs: self.init_hidden(bs)
embeds = self.embeddings(inputs).view((1, -1))
combined_i = torch.cat((embeds, self.hidden), 1)
hidden = self.i2h(combined_i)
# detach from history of the last run
self.hidden = hidden.detach()
output = self.i2o(combined_i)
combined_o = torch.cat((self.hidden, output), 1)
output = self.o2o(combined_o)
output = self.dropout(output)
output = self.softmax(output)
return output
def init_hidden(self, bs):
# 1 RNN layer
self.batch_size = bs
self.hidden = torch.zeros(1, self.hidden_size).to(DEVICE)
rnn = RNN(VOCAB_SIZE, N_HIDDEN, N_FAC, BPTT).to(DEVICE)
optimizer = optim.Adam(rnn.parameters(), lr=0.005)
all_losses = train_loop(rnn, optimizer, TRAIN_TEXT, criterion=nn.NLLLoss(), batch_size=1, plot_every=1, print_every=500000)
0m 46s (0 0.00%) 4.2767
Epoch 0 sample:
the 1~af9#%c_~ild$"?rf/b\| j)za#!n&
11m 24s (500000 44.69%) 2.3363
Epoch 500000 sample:
the be stot eer/lrlck0sttmasd of corsdetav> in+ therk, fl yeint ulh, bta0krauk i#duk_.n8 dot9et8y imeos
22m 5s (1000000 89.39%) 3.0072
Epoch 1000000 sample:
the bes!y 2-counq** 4lewbor albzi\"
25m 20s (0 0.00%) 7.4589
Epoch 0 sample:
the %ur rt ofekulg(tha onr orin" 3orsof yr and !etes aoseve_ pop<f h10kirv alest1 trabsetk syoro graas
35m 44s (500000 44.69%) 0.9857
Epoch 500000 sample:
the y kir}ul_ ab i7
46m 12s (1000000 89.39%) 3.9425
Epoch 1000000 sample:
the vempoh nge805a totew min-basid and wich705 perh are{jix cu
49m 25s (0 0.00%) 8.5864
Epoch 0 sample:
the :.
59m 47s (500000 44.69%) 0.7750
Epoch 500000 sample:
the my afthrats onlothe belsnghecllst%gont anci\[ 111111f. syehlling', bacbllbgirloni;,i@ , twi[n*] rec
70m 9s (1000000 89.39%) 4.3751
Epoch 1000000 sample:
the pirtlal " e9ti5 fat3 ###"""hah9quivaled ont forev i" moyl peif whenbsas niro coyll at aokt 52kever
73m 22s (0 0.00%) 6.1067
Epoch 0 sample:
the nedeedenns. \u rted sin_s.n? ent ag inal. -_t lonam. last]3k beimazin! be^j!ilf8 genas. potdjs-ca
83m 46s (500000 44.69%) 1.3343
Epoch 500000 sample:
the **einld t f eli \y, swminen b##########cccmenrely ther inew lemej n& p/ gkmyerndy aed lhewa $inocy"
94m 27s (1000000 89.39%) 5.7247
Epoch 1000000 sample:
the wl7@ & aocancegcass, chest l5 8d .ryl 'f swit e^pd es> gope ntys^pc nouthericheof t7e, cse ma{w st
97m 47s (0 0.00%) 38.1602
Epoch 0 sample:
the k| t vea lingtfeyeas h-lasgtean ote.p
108m 28s (500000 44.69%) 1.8164
Epoch 500000 sample:
the 2* v & , d lleituts woleez:z.ve4h @ th\aw*ivis sipbiuilasw tod^v. **+ **d-tos
119m 18s (1000000 89.39%) 5.7211
Epoch 1000000 sample:
the
Training time: 7308.59s
plot_loss(all_losses)
^ losses are not being reported quite right by batchless_train: it returns the summed total_loss without dividing by the number of examples (the division is commented out above); a possible fix is sketched below.
...
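A minimal sketch of that fix, assuming the intent was the mean loss per example (mirroring what batch_train returns), would be to change the last line of batchless_train to:
    # hypothetical fix: average the accumulated loss over the number of examples
    return total_loss.item() / xs.shape[0]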
idx, VOCAB_SIZE, char_to_idx, idx_to_char = create_inputs(TRAIN_TEXT)
sample(rnn, char_to_idx, idx_to_char, seed='the ', max_length=100)
the 2cwap% jaig aciph} araygay iblptoare josa7, pha]ptpjry iot, il) aydin t?e iruphy bol war############
idx, VOCAB_SIZE, char_to_idx, idx_to_char = create_inputs(TRAIN_TEXT)
sample(rnn, char_to_idx, idx_to_char, seed='\0'*BPTT, max_length=100)