The GPT-2 model generates text like never before! Previously, state-of-the-art text generators could not stay coherent for more than a paragraph or so, but GPT-2 is much better at maintaining coherence from paragraph to paragraph. The generated text is so convincing that OpenAI decided not to release the full model, for fear of enabling a fake-news generator. Instead, they released only a smaller pretrained version of the model.
You might decide you want to train the full model yourself, but be warned: each training run costs thousands of euros, and you often have to experiment with how the model is trained (play with the hyperparameters) to get it to learn well. You have to be a big organization like OpenAI to afford training your own model at the scale that they did.
The model is a language model: it was trained simply to predict the next word across many millions of documents found on the web. This is called unsupervised learning because we don't have a set of labels we are trying to predict.
The GPT-2 blog post and paper do not go into much detail about how the model was designed. However, we know that it uses a transformer architecture. At a high level, the Transformer converts input sequences into output sequences. It is composed of an encoding component and a decoding component.
The Transformer is actually composed of stacks of encoders and decoders.
We can see a snapshot of how tensors flow through this encoder-decoder architecture:
For the GPT-2 model, the goal isn't to translate French to English but rather to generate text. The input sequences are the tokens (words) at timesteps [0, t - 1], and the target sequences are the tokens at timesteps [1, t].
If the sequence length is 4 and this is the text we want to train on:
the quick brown fox jumped over the lazy dog
We would then prepare the following sequences for the model:
input | target
---|---
the quick brown fox | quick brown fox jumped
quick brown fox jumped | brown fox jumped over
brown fox jumped over | fox jumped over the
fox jumped over the | jumped over the lazy
jumped over the lazy | over the lazy dog
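To make the windowing concrete, here is a minimal sketch of how those input/target pairs could be built. It uses `boltons.iterutils.windowed`, the same helper the dataset class below relies on; the names `words`, `seq_len`, and `pairs` are illustrative and not part of the original notebook:

```python
from boltons.iterutils import windowed

words = 'the quick brown fox jumped over the lazy dog'.split()
seq_len = 4  # length of each training window

# Slide a window of `seq_len` words over the text; each window's target
# is simply the next window, i.e. the same words shifted right by one.
windows = list(windowed(words, seq_len))
pairs = list(zip(windows[:-1], windows[1:]))

for inp, tgt in pairs:
    print(' '.join(inp), '|', ' '.join(tgt))
```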
!pip install git+https://github.com/huggingface/pytorch-pretrained-BERT.git boltons googledrivedownloader -q
from functools import partial
from pathlib import Path
from textwrap import wrap
import nltk
import pandas as pd
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel, OpenAIAdam
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler
from google_drive_downloader import GoogleDriveDownloader as gdd
tqdm.pandas()
nltk.download('punkt', quiet=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
def sample_text(model, seed='Weight loss can be achieved by', n_words=500):
    """Generate text from a trained model."""
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    text = tokenizer.encode(seed)
    inputs, past = torch.tensor([text]), None
    with torch.no_grad():
        for _ in tqdm_notebook(range(n_words), leave=False):
            # `past` caches attention keys/values, so each step only needs
            # to feed the most recently generated token
            logits, past = model(inputs.to(device), past=past)
            probs = F.softmax(logits[:, -1], dim=-1)
            # Sample the next token from the predicted distribution
            inputs = torch.multinomial(probs, 1)
            text.append(inputs.item())
    return tokenizer.decode(text)
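`sample_text` draws each next token from the full softmax distribution. As an aside (not part of the original notebook), a common variation is to sample only from the top-k most likely tokens, which often makes the output less rambly; `k=40` below is just an illustrative default:

```python
def sample_next_token_top_k(logits, k=40):
    """Sketch: sample the next token id from the top-k logits only."""
    top_logits, top_indices = torch.topk(logits, k, dim=-1)
    probs = F.softmax(top_logits, dim=-1)
    choice = torch.multinomial(probs, 1)
    return top_indices.gather(-1, choice)
```

You could swap this helper in for the `torch.multinomial` line inside `sample_text`.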
def pretty_print(text):
    """Wrap text for nice printing."""
    to_print = ''
    for paragraph in text.split('\n'):
        to_print += '\n'.join(wrap(paragraph))
        to_print += '\n'
    print(to_print)
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)
seed = 'Weight loss can be achieved by' #@param {type:"string"}
n_words = 500 #@param {type:"integer"}
text = sample_text(model, seed=seed, n_words=n_words)
pretty_print(text)
See what the fine-tuning data looks like.
DATA_PATH = 'data/weight_loss/articles.jsonl'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI',
        dest_path='data/weight_loss/weight_loss_articles.zip',
        unzip=True,
    )
# Preview the training data
pd.read_json(DATA_PATH)[['author', 'text', 'title']].head()
flatten = lambda lsts: [item for lst in lsts for item in lst]
class EzineWeightLossDataset(Dataset):
    """Weight loss articles from ezinearticles.com."""

    def __init__(self, data_filename, sequence_length, n_samples):
        df = pd.read_json(data_filename)[['text']].sample(n_samples)
        df = df[df.text.str.len() > 0]
        df.dropna(inplace=True)

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

        df['paragraphs'] = df.text.str.split(r'[\n]+')
        df['paragraphs_sentences'] = df.paragraphs.progress_apply(
            lambda paragraphs: [nltk.sent_tokenize(paragraph) for paragraph in paragraphs],
        )
        # Add newlines to the end of every paragraph
        df.loc[:, 'paragraphs_sentences'] = df.paragraphs_sentences.progress_apply(
            lambda paragraphs: [paragraph[:-1] + [paragraph[-1] + '\n\n']
                                for paragraph in paragraphs if paragraph]
        )
        df.dropna(inplace=True)

        def encode_paragraph(paragraph):
            tokens = flatten([self.tokenizer.encode(sentence) + self.tokenizer.encode(' ')
                              for sentence in paragraph])
            tokens = tokens[:-1]  # Remove extra space at the end
            return tokens

        # Tokenize and assign indices to each token
        df['paragraphs_sentences_tokens'] = df.paragraphs_sentences.progress_apply(
            lambda paragraphs: [encode_paragraph(paragraph) for paragraph in paragraphs],
        )
        # Flatten to one long sequence
        df['tokens'] = df.paragraphs_sentences_tokens.progress_apply(flatten)

        # 50256 is <|endoftext|> (https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json)
        # Apply a sliding window per article that will be the sequence
        # length fed into the model
        sequences = flatten([
            windowed(encoded_article + [50256], sequence_length)
            for encoded_article in df['tokens']
        ])

        # Combine all of the sequences into one 2-D matrix.
        # Then, split like [A, B, C, D, E] --> ([A, B, C, D], [B, C, D, E])
        data = torch.tensor(sequences)
        self.inputs_lst, self.targets = data[:-1], data[1:]

    def __getitem__(self, i):
        return self.inputs_lst[i], self.targets[i]

    def __len__(self):
        return len(self.inputs_lst)
# How long each sequence should be
sequence_length = 128 #@param {type:"slider", min:16, max:512, step:2}
# Train on only a subset of the data to reduce training time
n_samples = 50 #@param {type:"integer"}
dataset = EzineWeightLossDataset(DATA_PATH, sequence_length, n_samples)
BATCH_SIZE = 16
loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=BATCH_SIZE)
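As a quick sanity check (not in the original notebook), you can decode one training example back into text to confirm the tokenization and windowing behave as expected; `dataset.tokenizer` is the `GPT2Tokenizer` created inside the dataset class above:

```python
# Decode the first (input, target) pair back to text; the target should
# read as the input shifted one token to the right.
sample_input, sample_target = dataset[0]
print(dataset.tokenizer.decode(sample_input.tolist())[:200])
print(dataset.tokenizer.decode(sample_target.tolist())[:200])
```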
Training a model from scratch can be challenging: as discussed above, it's expensive and takes a lot of hyperparameter experimentation. At the same time, the generic pretrained model isn't great at your specific task. What can you do? You can fine-tune the pretrained model with your own domain-specific data!
#@title Model Hyperparameters
n_epochs = 1 #@param {type:"slider", min:1, max:10, step:1}
learning_rate = 1e-5 #@param {type:"number"}
warmup_proportion = 0.002 #@param {type:"number"}
max_grad_norm = 0.05 #@param {type:"number"}
weight_decay = 0.01 #@param {type:"number"}
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
n_train_optimization_steps = len(dataset) * n_epochs // BATCH_SIZE
optimizer = OpenAIAdam(
    optimizer_grouped_parameters,
    lr=learning_rate,
    warmup=warmup_proportion,
    max_grad_norm=max_grad_norm,
    weight_decay=weight_decay,
    t_total=n_train_optimization_steps,
)
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
model.train()
for _ in tqdm_notebook(range(n_epochs)):
    tr_loss = 0
    nb_tr_steps = 0
    tqdm_bar = tqdm_notebook(loader, desc='Training')
    for step, batch in enumerate(tqdm_bar):
        input_ids, lm_labels = tuple(t.to(device) for t in batch)
        # When lm_labels is provided, the model returns the language-modeling loss
        loss = model(input_ids, lm_labels=lm_labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # Clear gradients so they don't accumulate across steps
        tr_loss += loss.item()
        exp_average_loss = (
            loss.item() if exp_average_loss is None
            else 0.7 * exp_average_loss + 0.3 * loss.item()
        )
        nb_tr_steps += 1
        tqdm_bar.desc = f'Training loss: {exp_average_loss:.2e} lr: {optimizer.get_lr()[0]:.2e}'
#@title Sample fine-tuned model
seed = 'Weight loss can be achieved by' #@param {type:"string"}
n_words = 500 #@param {type:"integer"}
text = sample_text(model, seed=seed, n_words=n_words)
pretty_print(text)
original_model = GPT2LMHeadModel.from_pretrained('gpt2')
original_model = original_model.to(device)
#@title Sample original model
seed = 'Weight loss can be achieved by' #@param {type:"string"}
n_words = 500 #@param {type:"integer"}
text = sample_text(original_model, seed=seed, n_words=n_words)
pretty_print(text)
torch.save(model.state_dict(), 'finetuned_gpt2.pkl')
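To reuse the fine-tuned weights in a later session, here is a minimal sketch using standard PyTorch calls (the filename matches the `torch.save` call above):

```python
# Recreate the architecture, then load the fine-tuned weights into it
reloaded_model = GPT2LMHeadModel.from_pretrained('gpt2')
reloaded_model.load_state_dict(torch.load('finetuned_gpt2.pkl', map_location=device))
reloaded_model = reloaded_model.to(device)
```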