Notebook

Multilingual Question Answering w/ Transformers¶

This lab will focus on how to train and evaluate a model for multilingual question answering using the HuggingFace transformers library

For this lab, we will use the multilingual XLM RoBERTa model.

The task is extractive question answering. In this, the data consists of a question, and answer, and the span in the context which contains the correct answer. To model this, we will train our model to simply predict the start and end tokens of the answer.

Much of the code for this lab is cribbed from this notebook

It must be noted that the raw output of the model we are going to train is not just 2 single numbers like the misleading diagram above, but rather 2 numbers for each token in the input. That is, we end up with a distribution of logits for a start of the answer and end of the answer tokens akin to the following:

logits_qa_example

In [ ]:

!pip install update transformers
!pip install datasets

The usual housekeeping to ensure reproducible results

In [2]:

from datasets import load_dataset
from datasets import load_metric
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import AutoConfig
from functools import partial
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR
from torch import nn
from collections import defaultdict, OrderedDict
MODEL_NAME = 'xlm-roberta-base'
#MODEL_NAME = 'bert-base-uncased'

In [3]:

def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [4]:

""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate_squad(dataset, predictions):
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

def compute_squad(predictions, references):
  pred_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions}
  dataset = [
      {
          "paragraphs": [
              {
                  "qas": [
                      {
                          "answers": [{"text": answer_text} for answer_text in ref["answers"]["text"]],
                          "id": ref["id"],
                      }
                      for ref in references
                  ]
              }
          ]
      }
  ]
  score = evaluate_squad(dataset=dataset, predictions=pred_dict)
  return score

In [3]:

# this is also equivalent to those 2 lines. I recommend going with that, unless you want more control over your code
from datasets import load_metric
compute_squad = load_metric("squad")

For your project, use load_metric("squad_v2"). SQuAD v2, like the TyDiQA dataset, contains unanswerable questions, and it's evaluation script supports that

Here we are using the huggingface datasets library to load the MLQA dataset. MLQA contains QA data in SQuAD format for 7 different languages. To start, we will load the English only data to train and test our model.

In [5]:

mlqa = load_dataset('mlqa', 'mlqa.en.en')

Downloading:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Downloading and preparing dataset mlqa/mlqa.en.en (download: 72.21 MiB, generated: 14.40 MiB, post-processed: Unknown size, total: 86.61 MiB) to /root/.cache/huggingface/datasets/mlqa/mlqa.en.en/1.0.0/1a1ae267d8d9e8e9ff25bd8811a27c5f8752ee58c5d75cf6c6451cbaba777c87...

Downloading:   0%|          | 0.00/75.7M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset mlqa downloaded and prepared to /root/.cache/huggingface/datasets/mlqa/mlqa.en.en/1.0.0/1a1ae267d8d9e8e9ff25bd8811a27c5f8752ee58c5d75cf6c6451cbaba777c87. Subsequent calls will reuse this data.

  0%|          | 0/2 [00:00<?, ?it/s]

Here we will use the "test" split for training and the validation split for test.

In [6]:

mlqa

Out[6]:

DatasetDict({
    test: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 11590
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 1148
    })
})

In [7]:

mlqa['test'][70]

Out[7]:

{'answers': {'answer_start': [1223], 'text': ['four to six hours']},
'context': 'Inhaled bronchodilators are the primary medications used, and result in a small overall benefit. The two major types are β2 agonists and anticholinergics; both exist in long-acting and short-acting forms. They reduce shortness of breath, wheeze, and exercise limitation, resulting in an improved quality of life. It is unclear if they change the progression of the underlying disease.In those with mild disease, short-acting agents are recommended on an as needed basis. In those with more severe disease, long-acting agents are recommended. Long-acting agents partly work by reducing hyperinflation. If long-acting bronchodilators are insufficient, then inhaled corticosteroids are typically added. Which type of long-acting agent, tiotropium (a long-acting anticholinergic) or a long-acting beta agonist (LABA) is better is unclear, and trying each and continuing with the one that works best may be advisable. Both types of agent appear to reduce the risk of acute exacerbations by 15–25%. While both may be used at the same time, any added benefit is of questionable significance.Several short-acting β2 agonists are available, including salbutamol (albuterol) and terbutaline. They provide some relief of symptoms for four to six hours. LABAs such as salmeterol, formoterol, and indacaterol are often used as maintenance therapy. Some feel the evidence of benefits is limited, while others view the evidence of benefit as established. Long-term use appears safe in COPD with adverse effects include shakiness and heart palpitations. When used with inhaled steroids they increase the risk of pneumonia. While steroids and LABAs may work better together, it is unclear if this slight benefit outweighs the increased risks. There is some evidence that combined treatment of LABAs with long-acting muscarinic antagonists (LAMA), an anticholinergic, may result in less exacerbations, less pneumonia, an improvement in forced expiratory volume (FEV1%), and potential improvements in quality of life when compared to treatment with LABA and an inhaled corticosteriod (ICS). All three together, LABA, LAMA, and ICS, have some evidence of benefits. Indacaterol requires an inhaled dose once a day, and is as effective as the other long-acting β2 agonist drugs that require twice-daily dosing for people with stable COPD.Two main anticholinergics are used in COPD, ipratropium and tiotropium. Ipratropium is a short-acting agent, while tiotropium is long-acting. Tiotropium is associated with a decrease in exacerbations and improved quality of life, and tiotropium provides those benefits better than ipratropium. It does not appear to affect mortality or the overall hospitalization rate. Anticholinergics can cause dry mouth and urinary tract symptoms. They are also associated with increased risk of heart disease and stroke. Aclidinium, another long-acting agent, reduces hospitalizations associated with COPD and improves quality of life. The LAMA umeclidinium bromide is another anticholinergic alternative. When compared to tiotropium, the LAMAs aclidinium, glycopyrronium, and umeclidinium appear to have a similar level of efficacy; with all four being more effective than placebo. Further research is needed comparing aclidinium to tiotropium.',
'id': '7bad1acabcddee223bee211e4c906330fbdb7b31',
'question': 'What is the duration of time that agonists are potent for?'}

Here we will load the tokenizer for XLM RoBERTa. We will make use of HuggingFace's AutoTokenizer, where we only need to specify what model it is we are loading based on its ID in the model hub

In [8]:

tk = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Getting features for our text involves tokenizing the question and answer and finding the correct token indices for the start and end tokens corresponding to the answer. This turns out to be non-trivial to obtain! Additionally, our model has only been trained to handle input up to 512 tokens, so what do we do with very long contexts?

Fortunately, the tokenizer has a lot of nice features to help with this. Basically what we will do is the following:

Tokenize a batch of samples, specifying the following:

Truncate the sequences to the maximum allowable input, but only truncate the contexts so that the question always appears in the input
If the input is too long, split it into multiple inputs and return all of them
For split inputs, use a stride of 128 tokens so that there is some overlap between them
Return the character offsets into the original text of each returned token so we can calculate the start and end token indices

Iterate through all of the returned features, finding the start and end tokens of the answer if they exist in that input. Otherwise default to (0,0), meaning the answer does not exist in that input.

In [9]:

samples = mlqa['test'][70]
batch = tk.encode_plus(
        samples['question'], 
        samples['context'], 
        padding='max_length', 
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

# Get a list which maps the input features index to their original index in the 
# samples list (for split inputs). E.g. if our batch size is 4 and the second sample
# is split into 3 inputs because it is very large, sample_mapping would look like
# [0, 1, 1, 1, 2, 3]
sample_mapping = batch.pop('overflow_to_sample_mapping')
# Get all of the character offsets for each token
offset_mapping = batch.pop('offset_mapping')

In [10]:

list(batch.keys())

Out[10]:

['input_ids', 'attention_mask']

In [11]:

sample_mapping

Out[11]:

[0, 0]

In [12]:

len(offset_mapping)

Out[12]:

In [18]:

offset_mapping[0][:50]

Out[18]:

[(0, 0),
 (0, 4),
 (5, 7),
 (8, 11),
 (12, 14),
 (14, 20),
 (21, 23),
 (24, 28),
 (29, 33),
 (34, 37),
 (37, 41),
 (41, 42),
 (43, 46),
 (47, 48),
 (47, 53),
 (54, 57),
 (57, 58),
 (0, 0),
 (0, 0),
 (0, 2),
 (2, 5),
 (5, 7),
 (8, 12),
 (12, 16),
 (16, 19),
 (19, 23),
 (24, 27),
 (28, 31),
 (32, 39),
 (40, 50),
 (50, 51),
 (52, 56),
 (56, 57),
 (58, 61),
 (62, 68),
 (69, 71),
 (72, 73),
 (74, 79),
 (80, 87),
 (88, 95),
 (95, 96),
 (97, 100),
 (101, 104),
 (105, 110),
 (111, 116),
 (117, 120),
 (121, 122),
 (122, 123),
 (124, 127),
 (127, 131)]

In [14]:

def get_train_features(tk, samples):
  '''
  Tokenizes all of the text in the given samples, splittling inputs that are too long for our model
  across multiple features. Finds the token offsets of the answers, which serve as the labels for
  our inputs.
  '''
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question'], samples['context'])], 
        padding='max_length', 
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # Get a list which maps the input features index to their original index in the 
  # samples list (for split inputs). E.g. if our batch size is 4 and the second sample
  # is split into 3 inputs because it is very large, sample_mapping would look like
  # [0, 1, 1, 1, 2, 3]
  sample_mapping = batch.pop('overflow_to_sample_mapping')
  # Get all of the character offsets for each token
  offset_mapping = batch.pop('offset_mapping')

  # Store the start and end tokens
  batch['start_tokens'] = []
  batch['end_tokens'] = []

  # Iterate through all of the offsets
  for i, offsets in enumerate(offset_mapping):
    # Get the right sample by mapping it to its original index
    sample_idx = sample_mapping[i]
    # Get the sequence IDs to know where context starts so we can ignore question tokens
    sequence_ids = batch.sequence_ids(i)

    # Get the start and end character positions of the answer
    ans = samples['answers'][sample_idx]
    start_char = ans['answer_start'][0]
    end_char = start_char + len(ans['text'][0])
    # while end_char > 0 and (end_char >= len(samples['context'][sample_idx]) or samples['context'][sample_idx][end_char] == ' '):
    #   end_char -= 1

    # Start from the first token in the context, which can be found by going to the 
    # first token where sequence_ids is 1
    start_token = 0
    while sequence_ids[start_token] != 1:
      start_token += 1

    end_token = len(offsets) - 1
    while sequence_ids[end_token] != 1:
      end_token -= 1

    # By default set it to the CLS token if the answer isn't in this input
    if start_char < offsets[start_token][0] or end_char > offsets[end_token][1]:
      start_token = 0
      end_token = 0
    # Otherwise find the correct token indices
    else:
      # Advance the start token index until we have passed the start character index 
      while start_token < len(offsets) and offsets[start_token][0] <= start_char:
        start_token += 1
      start_token -= 1
      
      # Decrease the end token index until we have passed the end character index
      while end_token >= 0 and offsets[end_token][1] >= end_char:
        end_token -= 1
      end_token += 1

    batch['start_tokens'].append(start_token)
    batch['end_tokens'].append(end_token)

  #batch['start_tokens'] = np.array(batch['start_tokens'])
  #batch['end_tokens'] = np.array(batch['end_tokens'])

  return batch

def collate_fn(inputs):
  '''
  Defines how to combine different samples in a batch
  '''
  input_ids = torch.tensor([i['input_ids'] for i in inputs])
  attention_mask = torch.tensor([i['attention_mask'] for i in inputs])
  start_tokens = torch.tensor([i['start_tokens'] for i in inputs])
  end_tokens = torch.tensor([i['end_tokens'] for i in inputs])

  # Truncate to max length
  max_len = max(attention_mask.sum(-1))
  input_ids = input_ids[:,:max_len]
  attention_mask = attention_mask[:,:max_len]
  
  return {'input_ids': input_ids, 'attention_mask': attention_mask, 'start_tokens': start_tokens, 'end_tokens': end_tokens}

We can easily tokenize the whole dataset by calling the "map" function on the dataset.

In [15]:

tokenized_dataset = mlqa['test'].map(partial(get_train_features, tk), batched=True, remove_columns=mlqa['test'].column_names)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [16]:

tokenized_dataset

Out[16]:

Dataset({
    features: ['attention_mask', 'end_tokens', 'input_ids', 'start_tokens'],
    num_rows: 13254
})

In [17]:

list(zip(range(len(tokenized_dataset['start_tokens'])),tokenized_dataset['start_tokens'],tokenized_dataset['end_tokens']))[:50]

Out[17]:

[(0, 113, 118),
 (1, 66, 73),
 (2, 205, 246),
 (3, 155, 166),
 (4, 13, 14),
 (5, 50, 51),
 (6, 68, 69),
 (7, 61, 62),
 (8, 91, 92),
 (9, 13, 15),
 (10, 377, 381),
 (11, 218, 221),
 (12, 109, 111),
 (13, 62, 67),
 (14, 14, 19),
 (15, 47, 52),
 (16, 21, 35),
 (17, 40, 44),
 (18, 223, 243),
 (19, 87, 96),
 (20, 112, 115),
 (21, 47, 50),
 (22, 29, 29),
 (23, 116, 118),
 (24, 19, 20),
 (25, 15, 15),
 (26, 32, 34),
 (27, 67, 67),
 (28, 155, 160),
 (29, 30, 31),
 (30, 44, 45),
 (31, 18, 20),
 (32, 30, 31),
 (33, 61, 76),
 (34, 43, 44),
 (35, 0, 0),
 (36, 0, 0),
 (37, 180, 187),
 (38, 212, 213),
 (39, 149, 152),
 (40, 46, 49),
 (41, 0, 0),
 (42, 182, 186),
 (43, 0, 0),
 (44, 133, 134),
 (45, 43, 58),
 (46, 92, 95),
 (47, 241, 242),
 (48, 157, 159),
 (49, 231, 244)]

In [19]:

mlqa['test'][2]

Out[19]:

{'answers': {'answer_start': [826],
  'text': ['the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation and Recovery Act (which governs handling of dangerous materials)']},
 'context': 'In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation and Recovery Act (which governs handling of dangerous materials). They also sought detailed information about the chemicals to which they were allegedly exposed, hoping this would facilitate the medical treatment of survivors. Congressman Lee H. Hamilton, former chairman of the House Intelligence Committee, told 60 Minutes reporter Lesley Stahl, "The Air Force is classifying all information about Area 51 in order to protect themselves from a lawsuit."',
 'id': '04ecd5555635bc05fd2f379d1b9027edd663cebf',
 'question': 'What was the law suit against Groom about'}

In [20]:

tk.decode(tokenized_dataset['input_ids'][2][205:247])

Out[20]:

'the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation and Recovery Act (which governs handling of dangerous materials).'

In [21]:

samples = random.sample(list(range(len(tokenized_dataset))), 4000)
tokenized_dataset = tokenized_dataset.select(samples)
train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

Here we have the main training loop

In [ ]:

def train(
    model: nn.Module, 
    train_dl: DataLoader, 
    optimizer: torch.optim.Optimizer, 
    schedule: LambdaLR,
    n_epochs: int, 
    device: torch.device
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on 
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask'],
          start_positions=batch['start_tokens'],
          end_positions=batch['end_tokens']
      )
      loss = outputs['loss']
      losses.append(loss.item())
      loss_epoch.append(loss.item())
      
      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model and advance the LR schedule
      optimizer.step()
      scheduler.step()
      #gc.collect()
  return losses

We load the model using HuggingFace's AutoModel interface, which again just needs us to specify the ID of the model we wish to use. Additionally, we specify using "AutoModelForQuestionAnswering", which includes a classifier on top of the base model and allows us to input labels for the start and end token of the answer, handling computation of the loss.

In [ ]:

model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Here we create the optimizer, which is the weighted Adam optimizer used in the BERT paper. We also add weight decay to all parameters in the model except for LayerNormalization and bias parameters. Finally, we create a learning rate schedule which linearly increases the learning rate to a max value for a certain number of steps, then linearly decreases it to 0 over the course of training (this has been shown to improve training for pre-trained transformers).

In [ ]:

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl)
)

Train!

In [ ]:

losses = train(
    model, 
    train_dl,
    optimizer, 
    scheduler,
    n_epochs, 
    device
)

100%|██████████| 1000/1000 [13:01<00:00,  1.28it/s]
100%|██████████| 1000/1000 [13:01<00:00,  1.28it/s]
100%|██████████| 1000/1000 [12:58<00:00,  1.28it/s]

How are answers Evaluated ?¶

The next step is running our validation data through the model, getting predictions for the start and end tokens, converting these back into text from the original context, and evaluating using an appropriate metric.

Getting the absolute best answer span is non-trivial. In the simplest case, we can sort the logits for the start token prediction, sort the logits for the end token prediction, and take the max of each. But the start and end tokens can be predicted anywhere in the input. What if the end is predicted before the start (i.e. the answer is impossible)? We then need a way to determine what is the "second" best answer.

To do this, we need some way to score the possible valid answers. A valid answer is one where the start token comes before the end token, and the tokens are within the context part of the input. As a scoring function, we will simply add the value of the start token logits with the value of the end token logits. To get candidates, we will then iterate through the highest $N$ scoring logits from the start and end logits, where $N$ is a hyperparameter, determine if the pair is valid, and if so add it to a list of candidate answers. We can then rank all of the candidate answers by their score and take the highest scoring answer.

Going back to answer text then involves using the same character offset map we used in training to get the start and end character offsets based on the token index. As such, we'll do a similar procedure to get features for the validation samples, this time storing the ID of each sample so we can map them back to the original text, as well as storing the offsets to map back the characters. We don't need to calculate the start and end token positions in this case because we will evaluate the model based on the retrieved answer text.

In [ ]:

def get_validation_features(tk, samples):
  # First, tokenize the text. We get the offsets and return overflowing sequences in
  # order to break up long sequences into multiple inputs. The offsets will help us 
  # determine the original answer text 
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question'], samples['context'])], 
        padding='max_length', 
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
  
  # We'll store the ID of the samples to calculate squad score
  batch['example_id'] = []
  # The overflow sample map tells us which input each sample corresponds to
  sample_map = batch.pop('overflow_to_sample_mapping')

  for i in range(len(batch['input_ids'])):
    # The sample index tells us which of the values in "samples" these features belong to
    sample_idx = sample_map[i]
    sequence_ids = batch.sequence_ids(i)

    # Add the ID to map these features back to the correct sample
    batch['example_id'].append(samples['id'][sample_idx])

    #Set offsets for non-context words to be None for ease of processing
    batch['offset_mapping'][i] = [o if sequence_ids[k] == 1 else None for k,o in enumerate(batch['offset_mapping'][i])]

  return batch  

def val_collate_fn(inputs):
  input_ids = torch.tensor([i['input_ids'] for i in inputs])
  attention_mask = torch.tensor([i['attention_mask'] for i in inputs])

  # Truncate to max length
  max_len = max(attention_mask.sum(-1))
  input_ids = input_ids[:,:max_len]
  attention_mask = attention_mask[:,:max_len]
  
  return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [ ]:

validation_dataset = mlqa['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=mlqa['validation'].column_names)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [ ]:

def predict(model: nn.Module, valid_dl: DataLoader):
  """
  Evaluates the model on the given dataset
  :param model: The model under evaluation
  :param valid_dl: A `DataLoader` reading validation data
  :return: The accuracy of the model on the dataset
  """
  # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like 
  # layer normalization and dropout
  model.eval()
  start_logits_all = []
  end_logits_all = []

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(valid_dl, desc='Evaluation'):
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask']
      )
      # Store the "start" class logits and "end" class logits for every token in the input
      start_logits_all.extend(list(outputs['start_logits'].detach().cpu().numpy()))
      end_logits_all.extend(list(outputs['end_logits'].detach().cpu().numpy()))


    return start_logits_all,end_logits_all

def post_process_predictions(examples, dataset, logits, num_possible_answers = 20, max_answer_length = 30):
  all_start_logits, all_end_logits = logits
  # Build a map from example to its corresponding features. This will allow us to index from
  # sample ID to all of the features for that sample (in case they were split up due to long input)
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  features_per_example = defaultdict(list)
  for i, feature in enumerate(dataset):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

  # Create somewhere to store our predictions
  predictions = OrderedDict()

  # Iterate through each sample in the dataset
  for j, sample in enumerate(tqdm(examples)):

    # Get the feature indices (all of the features split across the batch)
    feature_indices = features_per_example[j]
    # Get the original context which predumably has the answer text
    context = sample['context']

    preds = []
    # Iterate through all of the features
    for ft_idx in feature_indices:

      # Get the start and end answer logits for this input
      start_logits = all_start_logits[ft_idx]
      end_logits = all_end_logits[ft_idx]

      # Get the offsets to map token indices to character indices
      offset_mapping = dataset[ft_idx]['offset_mapping']

      # Sort the logits and take the top N
      start_indices = np.argsort(start_logits)[::-1][:num_possible_answers]
      end_indices = np.argsort(end_logits)[::-1][:num_possible_answers]

      # Iterate through start and end indices
      for start_index in start_indices:
        for end_index in end_indices:
          
          # Ignore this combination if either the indices are not in the context
          if start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None or offset_mapping[end_index] is None:
            continue

          # Also ignore if the start index is greater than the end index of the number of tokens
          # is greater than some specified threshold
          if start_index > end_index or end_index - start_index + 1 > max_answer_length:
            continue

          ans_text = context[offset_mapping[start_index][0]:offset_mapping[end_index][1]]
          preds.append({
              'score': start_logits[start_index] + end_logits[end_index],
              'text': ans_text
          })

    if len(preds) > 0:
      # Sort by score to get the top answer
      answer = sorted(preds, key=lambda x: x['score'], reverse=True)[0]
    else:
      answer = {'score': 0.0, 'text': ""}
          
    predictions[sample['id']] = answer['text']
  return predictions

Create the DataLoader and run prediction!

In [ ]:

val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
logits = predict(model, val_dl)

In [ ]:

predictions = post_process_predictions(mlqa['validation'], validation_dataset, logits)
formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
gold = [{'id': example['id'], 'answers': example['answers']} for example in mlqa['validation']]

We're using the official SQuAD evaluation metric which measure exact span match as well as token-level F1 score

In [ ]:

compute_squad(references=gold, predictions=formatted_predictions)

Out[ ]:

{'exact_match': 57.055749128919864, 'f1': 72.38330713792288}

Out[ ]:

{'exact_match': 57.055749128919864, 'f1': 72.38330713792288}

Multilingual transformers¶

While usually Transformer models are trained on a dataset made of a single language (e.g. the now classic BERT model), it is just as simple and easy to train them on a dataset that contains texts in more than one. For example, XLM-RoBERTa was trained on texts in more than 100 languages!

Are those models any good?

Question: What are the possible advantages of using a multilimgual model?

(generalization, shared representation, cross-lingual training and inference, stronger performance on low resource languages.) .

Are those models any good, though? Let's test one of those models - XLM-RoBERTa that was later fine-tuned on an English only QA dataset.

It can correctly answer questions in English, nothing remarkable here.

example1

It can also answer questions in Danish! Not bad.

example2

What about a question asked in English with a Danish context?

example3

Easy peasy. Let's take it to the extreme, what about this chimeric monstrosity?

example4

Quite remarkable!

example4

Training on other languages¶

Lets test those capabilities ourselves using the MLQA datast. Let's see how a model trained on German performs on English

In [ ]:

german_dataset = load_dataset('mlqa', 'mlqa-translate-train.de')

Downloading and preparing dataset mlqa/mlqa-translate-train.de (download: 60.43 MiB, generated: 84.23 MiB, post-processed: Unknown size, total: 144.66 MiB) to /root/.cache/huggingface/datasets/mlqa/mlqa-translate-train.de/1.0.0/1a1ae267d8d9e8e9ff25bd8811a27c5f8752ee58c5d75cf6c6451cbaba777c87...

Downloading:   0%|          | 0.00/63.4M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset mlqa downloaded and prepared to /root/.cache/huggingface/datasets/mlqa/mlqa-translate-train.de/1.0.0/1a1ae267d8d9e8e9ff25bd8811a27c5f8752ee58c5d75cf6c6451cbaba777c87. Subsequent calls will reuse this data.

  0%|          | 0/2 [00:00<?, ?it/s]

In [ ]:

tokenized_dataset = german_dataset['train'].map(partial(get_train_features, tk), batched=True, remove_columns=german_dataset['train'].column_names)
german_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
#train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=8)

  0%|          | 0/81 [00:00<?, ?ba/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [ ]:

samples = random.sample(list(range(len(tokenized_dataset))), 4000)
tokenized_dataset = tokenized_dataset.select(samples)
train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [ ]:

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in german_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in german_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl)
)

losses = train(
    german_model, 
    train_dl,
    optimizer, 
    scheduler,
    n_epochs, 
    device
)

100%|██████████| 1000/1000 [10:06<00:00,  1.65it/s]
100%|██████████| 1000/1000 [10:11<00:00,  1.64it/s]
100%|██████████| 1000/1000 [10:09<00:00,  1.64it/s]

In [ ]:

logits = predict(german_model, val_dl)

Evaluation: 100%|██████████| 41/41 [01:29<00:00,  2.18s/it]

In [ ]:

predictions = post_process_predictions(mlqa['validation'], validation_dataset, logits)
formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
gold = [{'id': example['id'], 'answers': example['answers']} for example in mlqa['validation']]
compute_squad(references=gold, predictions=formatted_predictions)

100%|██████████| 1148/1148 [00:03<00:00, 372.72it/s]

Out[ ]:

{'exact_match': 47.47386759581882, 'f1': 63.08390012502795}

We can also try Chinese to English

In [ ]:

zh_dataset = load_dataset('mlqa', 'mlqa-translate-train.zh')

Reusing dataset mlqa (/root/.cache/huggingface/datasets/mlqa/mlqa-translate-train.zh/1.0.0/1a1ae267d8d9e8e9ff25bd8811a27c5f8752ee58c5d75cf6c6451cbaba777c87)

  0%|          | 0/2 [00:00<?, ?it/s]

In [ ]:

tokenized_dataset = zh_dataset['train'].map(partial(get_train_features, tk), batched=True, remove_columns=zh_dataset['train'].column_names)
german_model.to('cpu')
zh_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
#train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=8)

  0%|          | 0/77 [00:00<?, ?ba/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [ ]:

samples = random.sample(list(range(len(tokenized_dataset))), 4000)
tokenized_dataset = tokenized_dataset.select(samples)
train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [ ]:

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in german_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in german_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl)
)

losses = train(
    zh_model, 
    train_dl,
    optimizer, 
    scheduler,
    n_epochs, 
    device
)

100%|██████████| 1000/1000 [57:17<00:00,  3.44s/it]
100%|██████████| 1000/1000 [55:57<00:00,  3.36s/it]
100%|██████████| 1000/1000 [56:41<00:00,  3.40s/it]

In [ ]:

logits = predict(zh_model, val_dl)

Evaluation: 100%|██████████| 41/41 [01:28<00:00,  2.15s/it]

In [ ]:

predictions = post_process_predictions(mlqa['validation'], validation_dataset, logits)
formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
gold = [{'id': example['id'], 'answers': example['answers']} for example in mlqa['validation']]
compute_squad(references=gold, predictions=formatted_predictions)

100%|██████████| 1148/1148 [00:02<00:00, 458.57it/s]

Out[ ]:

{'exact_match': 0.17421602787456447, 'f1': 3.9269257222658576}

Finally, let's try German to English again but using a model pre-trained only on English. What are some obvious issues you can think of coming up?

In [ ]:

MODEL_NAME = 'roberta-base'
zh_model.to('cpu')
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [ ]:

tokenized_dataset = german_dataset['train'].map(partial(get_train_features, tk), batched=True, remove_columns=german_dataset['train'].column_names)
german_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
#train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=8)

  0%|          | 0/77 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [ ]:

samples = random.sample(list(range(len(tokenized_dataset))), 4000)
tokenized_dataset = tokenized_dataset.select(samples)
train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [ ]:

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in german_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in german_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl)
)

losses = train(
    german_model, 
    train_dl,
    optimizer, 
    scheduler,
    n_epochs, 
    device
)

100%|██████████| 1000/1000 [13:52<00:00,  1.20it/s]
100%|██████████| 1000/1000 [13:53<00:00,  1.20it/s]
100%|██████████| 1000/1000 [13:49<00:00,  1.21it/s]

In [ ]:

validation_dataset = mlqa['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=mlqa['validation'].column_names)
val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
logits = predict(german_model, val_dl)

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 40/40 [01:24<00:00,  2.12s/it]

In [ ]:

predictions = post_process_predictions(mlqa['validation'], validation_dataset, logits)
formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
gold = [{'id': example['id'], 'answers': example['answers']} for example in mlqa['validation']]
compute_squad(references=gold, predictions=formatted_predictions)

100%|██████████| 1148/1148 [00:02<00:00, 431.26it/s]

Out[ ]:

{'exact_match': 10.365853658536585, 'f1': 21.88637627312505}

In [ ]:

!!