conda install pytorch torchvision -c pytorch
import torch
x = torch.Tensor(5, 3)  # uninitialized 5x3 tensor; values are whatever happened to be in memory
print(x)
1.00000e-44 *
  0.0000  0.0000  0.0000
  0.0000  1.6816  0.0000
  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000
[torch.FloatTensor of size 5x3]
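Freshly created tensors support the usual arithmetic straight away. A minimal sketch of a few basic operations (torch.rand and torch.add are standard torch API; the variable y is introduced here purely for illustration):

y = torch.rand(5, 3)    # uniform random values in [0, 1)
print(x + y)            # element-wise addition
print(torch.add(x, y))  # the same operation in functional form
print(x.size())         # torch.Size([5, 3])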
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random tensors to hold inputs and outputs. Since PyTorch 0.4,
# plain tensors carry autograd state, so the old Variable wrapper is
# no longer needed.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules and applies them in sequence to
# produce its output. Each Linear Module computes its output from its input
# using a linear function, and holds internal tensors for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')  # size_average=False is the deprecated spelling
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module
    # objects override the __call__ operator, so you can call them like
    # functions: you pass in a tensor of input data and get back a tensor
    # of output data.
    y_pred = model(x)

    # Compute and print loss. We pass tensors containing the predicted and
    # true values of y, and the loss function returns a tensor containing
    # the loss.
    loss = loss_fn(y_pred, y)
    if t % 50 == 0:
        print(t, loss.item())  # loss.data[0] no longer works on 0-dim tensors

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to all
    # learnable parameters of the model. Internally, the parameters of each
    # Module are tensors with requires_grad=True, so this call computes
    # gradients for all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. We wrap the update in
    # torch.no_grad() because the parameters require gradients, but we do
    # not want autograd to track the update itself.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
0 717.2719116210938
50 35.097198486328125
100 1.8821511268615723
150 0.1728428155183792
200 0.02194761298596859
250 0.0034840735606849194
300 0.0006572074489668012
350 0.00014404028479475528
400 3.580378324841149e-05
450 9.810625670070294e-06
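The manual weight update above is deliberately bare-metal. In practice the update rule is usually delegated to the torch.optim package; here is a minimal sketch of the same training loop with Adam (the choice of Adam and its default hyperparameters are mine, not part of the original example):

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    optimizer.zero_grad()  # zero the gradients held by the optimizer
    loss.backward()        # compute gradients of the loss w.r.t. parameters
    optimizer.step()       # apply the Adam update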
PyTorch-Transformers is a library of state-of-the-art pretrained models for Natural Language Processing (NLP).
!pip install pytorch-transformers
Collecting pytorch-transformers
  Downloading https://files.pythonhosted.org/packages/40/b5/2d78e74001af0152ee61d5ad4e290aec9a1e43925b21df2dc74ec100f1ab/pytorch_transformers-1.0.0-py3-none-any.whl (137kB)
Collecting sentencepiece (from pytorch-transformers)
Collecting boto3 (from pytorch-transformers)
Collecting regex (from pytorch-transformers)
Requirement already satisfied: torch>=0.4.1 in /Users/datalab/anaconda3/lib/python3.7/site-packages (from pytorch-transformers) (1.1.0)
...
Successfully built regex
Installing collected packages: sentencepiece, jmespath, botocore, s3transfer, boto3, regex, pytorch-transformers
Successfully installed boto3-1.9.194 botocore-1.12.194 jmespath-0.9.4 pytorch-transformers-1.0.0 regex-2019.6.8 s3transfer-0.2.1 sentencepiece-0.1.82
import torch
from pytorch_transformers import *
# PyTorch-Transformers has a unified API
# for 6 transformer architectures and 27 pretrained weights.
# Model | Tokenizer | Pretrained weights shortcut
MODELS = [(BertModel,      BertTokenizer,      'bert-base-uncased'),
          (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
          (GPT2Model,      GPT2Tokenizer,      'gpt2'),
          (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
          (XLNetModel,     XLNetTokenizer,     'xlnet-base-cased'),
          (XLMModel,       XLMTokenizer,       'xlm-mlm-enfr-1024')]
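Given these (model, tokenizer, weights) triples, the same few lines can load and run any of the six architectures. A sketch of the unified API, closely following the library's own README; the shortcut string downloads the vocabulary and weights on first use:

for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load the pretrained tokenizer and model
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    # Encode text and extract the last hidden states
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # models output tuples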
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A and B indices associated with the 1st and 2nd sentences (see the BERT paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/datalab/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
tokenized_text
['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
?BertModel.from_pretrained
model = BertModel.from_pretrained(u"/Users/datalab/bigdata/bert-base-uncased.bin")
INFO:pytorch_transformers.modeling_utils:loading configuration file /Users/datalab/bigdata/bert-base-uncased.bin
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-13-f9dd09faa64c> in <module>
----> 1 model = BertModel.from_pretrained(u"/Users/datalab/bigdata/bert-base-uncased.bin")

~/anaconda3/lib/python3.7/site-packages/pytorch_transformers/modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    360         # Load config
    361         if config is None:
--> 362             config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

~/anaconda3/lib/python3.7/site-packages/pytorch_transformers/modeling_utils.py in from_json_file(cls, json_file)
    163         """Constructs a `BertConfig` from a json file of parameters."""
    164         with open(json_file, "r", encoding='utf-8') as reader:
--> 165             text = reader.read()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
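The traceback is informative: from_pretrained expects a shortcut name or a path to a directory containing a config.json plus the weights file, not the weights file itself. Given only the .bin path, the library tries to parse the binary checkpoint as the JSON config and fails on the first byte; the second attempt below fails the same way. A sketch of loading from a local directory instead (the directory name is illustrative; it must hold both config.json and pytorch_model.bin):

model = BertModel.from_pretrained('/Users/datalab/bigdata/bert-base-uncased/')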
# Load pre-trained model (weights)
model = BertModel.from_pretrained('/Users/datalab/bigdata/bert-base-uncased-pytorch_model.bin')
# Set the model in evaluation mode to deactivate the dropout modules.
# This is IMPORTANT for reproducible results during evaluation!
model.eval()
# If you have a GPU, put everything on cuda
# tokens_tensor = tokens_tensor.to('cuda')
# segments_tensors = segments_tensors.to('cuda')
# model.to('cuda')
# Predict hidden states features for each layer
with torch.no_grad():
    # See the models' docstrings for the details of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # PyTorch-Transformers models always output tuples.
    # See the models' docstrings for the details of all the outputs.
    # In our case, the first element is the hidden state of the last
    # layer of the BERT model.
    encoded_layers = outputs[0]

# We have encoded our input sequence in a FloatTensor of shape
# (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
INFO:pytorch_transformers.modeling_utils:loading configuration file /Users/datalab/bigdata/bert-base-uncased-pytorch_model.bin
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-3-a62f7f60e32b> in <module>
      1 # Load pre-trained model (weights)
----> 2 model = BertModel.from_pretrained('/Users/datalab/bigdata/bert-base-uncased-pytorch_model.bin')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
# Put everything on the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device)
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'henson'
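Beyond checking the single argmax, it can be instructive to inspect the model's runner-up candidates for the masked position. A small sketch using torch.topk (the choice of five candidates is arbitrary):

_, top_ids = torch.topk(predictions[0, masked_index], 5)
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))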