import os
import torch
from d2l import torch as d2l
Downloading and Preprocessing the Dataset
class MTFraEng(d2l.DataModule):
"""The English-French dataset."""
def _download(self):
d2l.extract(d2l.download(
d2l.DATA_URL+'fra-eng.zip', self.root,
'94646ad1522d915e7b0f9296181140edcf86a4f5'))
with open(self.root + '/fra-eng/fra.txt', encoding='utf-8') as f:
return f.read()
data = MTFraEng()
raw_text = data._download()
print(raw_text[:75])
Downloading ../data/fra-eng.zip from http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip... Go. Va ! Hi. Salut ! Run! Cours ! Run! Courez ! Who? Qui ? Wow! Ça alors !
Proceed with several preprocessing steps
@d2l.add_to_class(MTFraEng)
def _preprocess(self, text):
text = text.replace('\u202f', ' ').replace('\xa0', ' ')
no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
for i, char in enumerate(text.lower())]
return ''.join(out)
text = data._preprocess(raw_text)
print(text[:80])
go . va ! hi . salut ! run ! cours ! run ! courez ! who ? qui ? wow ! ça alors !
Tokenization
@d2l.add_to_class(MTFraEng)
def _tokenize(self, text, max_examples=None):
src, tgt = [], []
for i, line in enumerate(text.split('\n')):
if max_examples and i > max_examples: break
parts = line.split('\t')
if len(parts) == 2:
src.append([t for t in f'{parts[0]} <eos>'.split(' ') if t])
tgt.append([t for t in f'{parts[1]} <eos>'.split(' ') if t])
return src, tgt
src, tgt = data._tokenize(text)
src[:6], tgt[:6]
([['go', '.', '<eos>'], ['hi', '.', '<eos>'], ['run', '!', '<eos>'], ['run', '!', '<eos>'], ['who', '?', '<eos>'], ['wow', '!', '<eos>']], [['va', '!', '<eos>'], ['salut', '!', '<eos>'], ['cours', '!', '<eos>'], ['courez', '!', '<eos>'], ['qui', '?', '<eos>'], ['ça', 'alors', '!', '<eos>']])
Plot the histogram of the number of tokens per text sequence
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
"""Plot the histogram for list length pairs."""
d2l.set_figsize()
_, _, patches = d2l.plt.hist(
[[len(l) for l in xlist], [len(l) for l in ylist]])
d2l.plt.xlabel(xlabel)
d2l.plt.ylabel(ylabel)
for patch in patches[1].patches:
patch.set_hatch('/')
d2l.plt.legend(legend)
show_list_len_pair_hist(['source', 'target'], '
'count', src, tgt);
Each example sequence had a fixed length
@d2l.add_to_class(MTFraEng)
def __init__(self, batch_size, num_steps=9, num_train=512, num_val=128):
super(MTFraEng, self).__init__()
self.save_hyperparameters()
self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(
self._download())
@d2l.add_to_class(MTFraEng)
def _build_arrays(self, raw_text, src_vocab=None, tgt_vocab=None):
def _build_array(sentences, vocab, is_tgt=False):
pad_or_trim = lambda seq, t: (
seq[:t] if len(seq) > t else seq + ['<pad>'] * (t - len(seq)))
sentences = [pad_or_trim(s, self.num_steps) for s in sentences]
if is_tgt:
sentences = [['<bos>'] + s for s in sentences]
if vocab is None:
vocab = d2l.Vocab(sentences, min_freq=2)
array = torch.tensor([vocab[s] for s in sentences])
valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
return array, vocab, valid_len
src, tgt = self._tokenize(self._preprocess(raw_text),
self.num_train + self.num_val)
src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)
tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, True)
return ((src_array, tgt_array[:,:-1], src_valid_len, tgt_array[:,1:]),
src_vocab, tgt_vocab)
Reading the Dataset
@d2l.add_to_class(MTFraEng)
def get_dataloader(self, train):
idx = slice(0, self.num_train) if train else slice(self.num_train, None)
return self.get_tensorloader(self.arrays, train, idx)
Read the first minibatch from the English--French dataset
data = MTFraEng(batch_size=3)
src, tgt, src_valid_len, label = next(iter(data.train_dataloader()))
print('source:', src.type(torch.int32))
print('decoder input:', tgt.type(torch.int32))
print('source len excluding pad:', src_valid_len.type(torch.int32))
print('label:', label.type(torch.int32))
source: tensor([[117, 182, 0, 3, 4, 4, 4, 4, 4], [ 62, 72, 2, 3, 4, 4, 4, 4, 4], [ 57, 124, 0, 3, 4, 4, 4, 4, 4]], dtype=torch.int32) decoder input: tensor([[ 3, 37, 100, 58, 160, 0, 4, 5, 5], [ 3, 6, 2, 4, 5, 5, 5, 5, 5], [ 3, 180, 0, 4, 5, 5, 5, 5, 5]], dtype=torch.int32) source len excluding pad: tensor([4, 4, 4], dtype=torch.int32) label: tensor([[ 37, 100, 58, 160, 0, 4, 5, 5, 5], [ 6, 2, 4, 5, 5, 5, 5, 5, 5], [180, 0, 4, 5, 5, 5, 5, 5, 5]], dtype=torch.int32)
@d2l.add_to_class(MTFraEng)
def build(self, src_sentences, tgt_sentences):
raw_text = '\n'.join([src + '\t' + tgt for src, tgt in zip(
src_sentences, tgt_sentences)])
arrays, _, _ = self._build_arrays(
raw_text, self.src_vocab, self.tgt_vocab)
return arrays
src, tgt, _, _ = data.build(['hi .'], ['salut .'])
print('source:', data.src_vocab.to_tokens(src[0].type(torch.int32)))
print('target:', data.tgt_vocab.to_tokens(tgt[0].type(torch.int32)))
source: ['hi', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'] target: ['<bos>', 'salut', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']