In [ ]:

from fastai.text import *

In [ ]:

# Note: You need to run translation.ipynb nb before running this one to get the data set up

path = Config().data_path()/'giga-fren'
path.ls()

Out[ ]:

[PosixPath('/home/stas/.fastai/data/giga-fren/models'),
 PosixPath('/home/stas/.fastai/data/giga-fren/giga-fren.release2.fixed.en'),
 PosixPath('/home/stas/.fastai/data/giga-fren/giga-fren.release2.fixed.fr'),
 PosixPath('/home/stas/.fastai/data/giga-fren/data_save.pkl'),
 PosixPath('/home/stas/.fastai/data/giga-fren/cc.en.300.bin'),
 PosixPath('/home/stas/.fastai/data/giga-fren/questions_easy.csv'),
 PosixPath('/home/stas/.fastai/data/giga-fren/cc.fr.300.bin')]

Load data¶

We reuse the same functions as in the translation notebook to load our data.

In [ ]:

def seq2seq_collate(samples:BatchSamples, pad_idx:int=1, pad_first:bool=True, backwards:bool=False) -> Tuple[LongTensor, LongTensor]:
    "Function that collect samples and adds padding. Flips token order if needed"
    samples = to_data(samples)
    max_len_x,max_len_y = max([len(s[0]) for s in samples]),max([len(s[1]) for s in samples])
    res_x = torch.zeros(len(samples), max_len_x).long() + pad_idx
    res_y = torch.zeros(len(samples), max_len_y).long() + pad_idx
    if backwards: pad_first = not pad_first
    for i,s in enumerate(samples):
        if pad_first: 
            res_x[i,-len(s[0]):],res_y[i,-len(s[1]):] = LongTensor(s[0]),LongTensor(s[1])
        else:         
            res_x[i,:len(s[0]):],res_y[i,:len(s[1]):] = LongTensor(s[0]),LongTensor(s[1])
    if backwards: res_x,res_y = res_x.flip(1),res_y.flip(1)
    return res_x, res_y

In [ ]:

class Seq2SeqDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training an RNN classifier."
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1,
               pad_first=False, device:torch.device=None, no_check:bool=False, backwards:bool=False, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(seq2seq_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, collate_fn=collate_fn, no_check=no_check)

In [ ]:

class Seq2SeqTextList(TextList):
    _bunch = Seq2SeqDataBunch
    _label_cls = TextList

Refer to the translation notebook for creation of 'questions_easy.csv'.

In [ ]:

df = pd.read_csv(path/'questions_easy.csv')

In [ ]:

src = Seq2SeqTextList.from_df(df, path = path, cols='fr').split_by_rand_pct().label_from_df(cols='en', label_cls=TextList)

In [ ]:

np.percentile([len(o) for o in src.train.x.items] + [len(o) for o in src.valid.x.items], 90)

Out[ ]:

29.0

In [ ]:

np.percentile([len(o) for o in src.train.y.items] + [len(o) for o in src.valid.y.items], 90)

Out[ ]:

26.0

As before, we remove questions with more than 30 tokens.

In [ ]:

src = src.filter_by_func(lambda x,y: len(x) > 30 or len(y) > 30)

In [ ]:

len(src.train) + len(src.valid)

Out[ ]:

In [ ]:

data = src.databunch()

In [ ]:

data.save()

Can load from here when restarting.

In [ ]:

data = load_data(path)

In [ ]:

data.show_batch()

text	target
xxbos xxmaj quelles mesures le gouvernement a - t - il prises pour faire passer les ministères et organismes à risque élevé à une catégorie de risque plus faible ?	xxbos xxmaj what action has the xxmaj government taken to xxunk the " xxmaj medium to xxmaj high " risk departments and agencies to a lower risk category ?
xxbos xxmaj quels sont les rôles respectifs du personnel juridique et non juridique pour la définition du cadre de rédaction législative ainsi que durant le processus lui - même ?	xxbos xxmaj what are the appropriate roles of the legal and non - legal staff in setting the framework for the legislative drafting process and during the process itself ?
xxbos xxmaj comment nos chefs peuvent - ils espérer nous faire observer les règles et les politiques de notre organisation lorsqu’ils renoncent eux - mêmes à les faire respecter ?	xxbos xxmaj when our leaders choose not to enforce the regulations and policies of our organization , how can they expect us to follow them ?
xxbos xxmaj quel sera le sort réservé aux suppléments en vertu du projet de loi xxup c-51 , et le dosage fera - t - il l'objet d'une réglementation ?	xxbos xxmaj what is the future under xxmaj bill xxup c-51 for supplements and will there be regulated doses ?
xxbos xxmaj quelle serait aujourd'hui la prévalence du tabagisme si les manufacturiers de tabac canadiens n'avaient pas augmenté régulièrement le contenu en nicotine des cigarettes entre 1980 et 1995 ?	xxbos xxmaj what would the prevalence of smoking be today if the xxmaj canadian tobacco manufacturers had not regularly raised the nicotine content of cigarettes between 1980 and 1995 ?

Transformer model¶

Transformer model

Shifting¶

We add a transform to the dataloader that shifts the targets right and adds a padding at the beginning.

In [ ]:

def shift_tfm(b):
    x,y = b
    y = F.pad(y, (1, 0), value=1)
    return [x,y[:,:-1]], y[:,1:]

In [ ]:

data.add_tfm(shift_tfm)

Embeddings¶

The input and output embeddings are traditional PyTorch embeddings (and we can use pretrained vectors if we want to). The transformer model isn't a recurrent one, so it has no idea of the relative positions of the words. To help it with that, they had to the input embeddings a positional encoding which is cosine of a certain frequency:

In [ ]:

class PositionalEncoding(nn.Module):
    "Encode the position with a sinusoid."
    def __init__(self, d:int):
        super().__init__()
        self.register_buffer('freq', 1 / (10000 ** (torch.arange(0., d, 2.)/d)))
    
    def forward(self, pos:Tensor):
        inp = torch.ger(pos, self.freq)
        enc = torch.cat([inp.sin(), inp.cos()], dim=-1)
        return enc

In [ ]:

tst_encoding = PositionalEncoding(20)
res = tst_encoding(torch.arange(0,100).float())
_, ax = plt.subplots(1,1)
for i in range(1,5): ax.plot(res[:,i])

Out[ ]:

[<matplotlib.lines.Line2D at 0x7f9e60e41c18>]

Out[ ]:

[<matplotlib.lines.Line2D at 0x7f9e6202fc50>]

Out[ ]:

[<matplotlib.lines.Line2D at 0x7f9e62031080>]

Out[ ]:

[<matplotlib.lines.Line2D at 0x7f9e62031518>]

In [ ]:

class TransformerEmbedding(nn.Module):
    "Embedding + positional encoding + dropout"
    def __init__(self, vocab_sz:int, emb_sz:int, inp_p:float=0.):
        super().__init__()
        self.emb_sz = emb_sz
        self.embed = embedding(vocab_sz, emb_sz)
        self.pos_enc = PositionalEncoding(emb_sz)
        self.drop = nn.Dropout(inp_p)
    
    def forward(self, inp): 
        pos = torch.arange(0, inp.size(1), device=inp.device).float()
        return self.drop(self.embed(inp) * math.sqrt(self.emb_sz) + self.pos_enc(pos))

Feed forward¶

The feed forward cell is easy: it's just two linear layers with a skip connection and a LayerNorm.

In [ ]:

def feed_forward(d_model:int, d_ff:int, ff_p:float=0., double_drop:bool=True):
    layers = [nn.Linear(d_model, d_ff), nn.ReLU()]
    if double_drop: layers.append(nn.Dropout(ff_p))
    return SequentialEx(*layers, nn.Linear(d_ff, d_model), nn.Dropout(ff_p), MergeLayer(), nn.LayerNorm(d_model))

Multi-head attention¶

Multi head attention

In [ ]:

class MultiHeadAttention(nn.Module):
    "MutiHeadAttention."
    
    def __init__(self, n_heads:int, d_model:int, d_head:int=None, resid_p:float=0., attn_p:float=0., bias:bool=True,
                 scale:bool=True):
        super().__init__()
        d_head = ifnone(d_head, d_model//n_heads)
        self.n_heads,self.d_head,self.scale = n_heads,d_head,scale
        self.q_wgt = nn.Linear(d_model, n_heads * d_head, bias=bias)
        self.k_wgt = nn.Linear(d_model, n_heads * d_head, bias=bias)
        self.v_wgt = nn.Linear(d_model, n_heads * d_head, bias=bias)
        self.out = nn.Linear(n_heads * d_head, d_model, bias=bias)
        self.drop_att,self.drop_res = nn.Dropout(attn_p),nn.Dropout(resid_p)
        self.ln = nn.LayerNorm(d_model)
        
    def forward(self, q:Tensor, k:Tensor, v:Tensor, mask:Tensor=None):
        return self.ln(q + self.drop_res(self.out(self._apply_attention(q, k, v, mask=mask))))
    
    def _apply_attention(self, q:Tensor, k:Tensor, v:Tensor, mask:Tensor=None):
        bs,seq_len = q.size(0),q.size(1)
        wq,wk,wv = self.q_wgt(q),self.k_wgt(k),self.v_wgt(v)
        wq,wk,wv = map(lambda x:x.view(bs, x.size(1), self.n_heads, self.d_head), (wq,wk,wv))
        wq,wk,wv = wq.permute(0, 2, 1, 3),wk.permute(0, 2, 3, 1),wv.permute(0, 2, 1, 3)
        attn_score = torch.matmul(wq, wk)
        if self.scale: attn_score = attn_score.div_(self.d_head ** 0.5)
        if mask is not None: 
            attn_score = attn_score.float().masked_fill(mask, -float('inf')).type_as(attn_score)
        attn_prob = self.drop_att(F.softmax(attn_score, dim=-1))
        attn_vec = torch.matmul(attn_prob, wv)
        return attn_vec.permute(0, 2, 1, 3).contiguous().contiguous().view(bs, seq_len, -1)
        
    def _attention_einsum(self, q:Tensor, k:Tensor, v:Tensor, mask:Tensor=None):
        # Permute and matmul is a little bit faster but this implementation is more readable
        bs,seq_len = q.size(0),q.size(1)
        wq,wk,wv = self.q_wgt(q),self.k_wgt(k),self.v_wgt(v)
        wq,wk,wv = map(lambda x:x.view(bs, x.size(1), self.n_heads, self.d_head), (wq,wk,wv))
        attn_score = torch.einsum('bind,bjnd->bijn', (wq, wk))
        if self.scale: attn_score = attn_score.mul_(1/(self.d_head ** 0.5))
        if mask is not None: 
            attn_score = attn_score.float().masked_fill(mask, -float('inf')).type_as(attn_score)
        attn_prob = self.drop_att(F.softmax(attn_score, dim=2))
        attn_vec = torch.einsum('bijn,bjnd->bind', (attn_prob, wv))
        return attn_vec.contiguous().view(bs, seq_len, -1)

Masking¶

The attention layer uses a mask to avoid paying attention to certain timesteps. The first thing is that we don't really want the network to pay attention to the padding, so we're going to mask it. The second thing is that since this model isn't recurrent, we need to mask (in the output) all the tokens we're not supposed to see yet (otherwise it would be cheating).

In [ ]:

def get_padding_mask(inp, pad_idx:int=1):
    return None
    return (inp == pad_idx)[:,None,:,None]

In [ ]:

def get_output_mask(inp, pad_idx:int=1):
    return torch.triu(inp.new_ones(inp.size(1),inp.size(1)), diagonal=1)[None,None].byte()
    return ((inp == pad_idx)[:,None,:,None].long() + torch.triu(inp.new_ones(inp.size(1),inp.size(1)), diagonal=1)[None,None] != 0)

Example of mask for the future tokens:

In [ ]:

torch.triu(torch.ones(10,10), diagonal=1).byte()

Out[ ]:

tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.uint8)

Encoder and decoder blocks¶

We are now ready to regroup these layers in the blocks we add in the model picture:

Transformer model

In [ ]:

class EncoderBlock(nn.Module):
    "Encoder block of a Transformer model."
    #Can't use Sequential directly cause more than one input...
    def __init__(self, n_heads:int, d_model:int, d_head:int, d_inner:int, resid_p:float=0., attn_p:float=0., ff_p:float=0.,
                 bias:bool=True, scale:bool=True, double_drop:bool=True):
        super().__init__()
        self.mha = MultiHeadAttention(n_heads, d_model, d_head, resid_p=resid_p, attn_p=attn_p, bias=bias, scale=scale)
        self.ff  = feed_forward(d_model, d_inner, ff_p=ff_p, double_drop=double_drop)
    
    def forward(self, x:Tensor, mask:Tensor=None): return self.ff(self.mha(x, x, x, mask=mask))

In [ ]:

class DecoderBlock(nn.Module):
    "Decoder block of a Transformer model."
    #Can't use Sequential directly cause more than one input...
    def __init__(self, n_heads:int, d_model:int, d_head:int, d_inner:int, resid_p:float=0., attn_p:float=0., ff_p:float=0.,
                 bias:bool=True, scale:bool=True, double_drop:bool=True):
        super().__init__()
        self.mha1 = MultiHeadAttention(n_heads, d_model, d_head, resid_p=resid_p, attn_p=attn_p, bias=bias, scale=scale)
        self.mha2 = MultiHeadAttention(n_heads, d_model, d_head, resid_p=resid_p, attn_p=attn_p, bias=bias, scale=scale)
        self.ff   = feed_forward(d_model, d_inner, ff_p=ff_p, double_drop=double_drop)
    
    def forward(self, x:Tensor, enc:Tensor, mask_in:Tensor=None, mask_out:Tensor=None): 
        y = self.mha1(x, x, x, mask_out)
        return self.ff(self.mha2(y, enc, enc, mask=mask_in))

The whole model¶

In [ ]:

class Transformer(nn.Module):
    "Transformer model"
    
    def __init__(self, inp_vsz:int, out_vsz:int, n_layers:int=6, n_heads:int=8, d_model:int=256, d_head:int=32, 
                 d_inner:int=1024, inp_p:float=0.1, resid_p:float=0.1, attn_p:float=0.1, ff_p:float=0.1, bias:bool=True, 
                 scale:bool=True, double_drop:bool=True, pad_idx:int=1):
        super().__init__()
        self.enc_emb = TransformerEmbedding(inp_vsz, d_model, inp_p)
        self.dec_emb = TransformerEmbedding(out_vsz, d_model, 0.)
        self.encoder = nn.ModuleList([EncoderBlock(n_heads, d_model, d_head, d_inner, resid_p, attn_p, 
                                                   ff_p, bias, scale, double_drop) for _ in range(n_layers)])
        self.decoder = nn.ModuleList([DecoderBlock(n_heads, d_model, d_head, d_inner, resid_p, attn_p, 
                                                   ff_p, bias, scale, double_drop) for _ in range(n_layers)])
        self.out = nn.Linear(d_model, out_vsz)
        self.out.weight = self.dec_emb.embed.weight
        self.pad_idx = pad_idx
        
    def forward(self, inp, out):
        mask_in  = get_padding_mask(inp, self.pad_idx)
        mask_out = get_output_mask (out, self.pad_idx)
        enc,out = self.enc_emb(inp),self.dec_emb(out)
        for enc_block in self.encoder: enc = enc_block(enc, mask_in)
        for dec_block in self.decoder: out = dec_block(out, enc, mask_in, mask_out)
        return self.out(out)

Bleu metric (see dedicated notebook)¶

In [ ]:

class NGram():
    def __init__(self, ngram, max_n=5000): self.ngram,self.max_n = ngram,max_n
    def __eq__(self, other):
        if len(self.ngram) != len(other.ngram): return False
        return np.all(np.array(self.ngram) == np.array(other.ngram))
    def __hash__(self): return int(sum([o * self.max_n**i for i,o in enumerate(self.ngram)]))

In [ ]:

def get_grams(x, n, max_n=5000):
    return x if n==1 else [NGram(x[i:i+n], max_n=max_n) for i in range(len(x)-n+1)]

In [ ]:

def get_correct_ngrams(pred, targ, n, max_n=5000):
    pred_grams,targ_grams = get_grams(pred, n, max_n=max_n),get_grams(targ, n, max_n=max_n)
    pred_cnt,targ_cnt = Counter(pred_grams),Counter(targ_grams)
    return sum([min(c, targ_cnt[g]) for g,c in pred_cnt.items()]),len(pred_grams)

In [ ]:

class CorpusBLEU(Callback):
    def __init__(self, vocab_sz):
        self.vocab_sz = vocab_sz
        self.name = 'bleu'
    
    def on_epoch_begin(self, **kwargs):
        self.pred_len,self.targ_len,self.corrects,self.counts = 0,0,[0]*4,[0]*4
    
    def on_batch_end(self, last_output, last_target, **kwargs):
        last_output = last_output.argmax(dim=-1)
        for pred,targ in zip(last_output.cpu().numpy(),last_target.cpu().numpy()):
            self.pred_len += len(pred)
            self.targ_len += len(targ)
            for i in range(4):
                c,t = get_correct_ngrams(pred, targ, i+1, max_n=self.vocab_sz)
                self.corrects[i] += c
                self.counts[i]   += t
    
    def on_epoch_end(self, last_metrics, **kwargs):
        precs = [c/t for c,t in zip(self.corrects,self.counts)]
        len_penalty = exp(1 - self.targ_len/self.pred_len) if self.pred_len < self.targ_len else 1
        bleu = len_penalty * ((precs[0]*precs[1]*precs[2]*precs[3]) ** 0.25)
        return add_metrics(last_metrics, bleu)

Training¶

In [ ]:

model = Transformer(len(data.train_ds.x.vocab.itos), len(data.train_ds.y.vocab.itos), d_model=256)

In [ ]:

learn = Learner(data, model, metrics=[accuracy, CorpusBLEU(len(data.train_ds.y.vocab.itos))], 
                loss_func = CrossEntropyFlat())

In [ ]:

learn.lr_find()

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.

In [ ]:

learn.recorder.plot()

In [ ]:

learn.fit_one_cycle(8, 5e-4, div_factor=5)

epoch	train_loss	valid_loss	accuracy	bleu	time
0	2.432767	2.478084	0.618113	0.460726	01:03
1	1.959129	1.978793	0.686264	0.505202	01:06
2	1.536871	1.656002	0.723556	0.543357	01:07
3	1.371858	1.453879	0.749335	0.570762	01:07
4	1.113087	1.345275	0.764912	0.590188	01:07
5	0.970879	1.287549	0.774231	0.602932	01:06
6	0.852550	1.280241	0.778895	0.609807	01:07
7	0.771543	1.282661	0.779013	0.610055	01:08

In [ ]:

def get_predictions(learn, ds_type=DatasetType.Valid):
    learn.model.eval()
    inputs, targets, outputs = [],[],[]
    with torch.no_grad():
        for xb,yb in progress_bar(learn.dl(ds_type)):
            out = learn.model(*xb)
            for x,y,z in zip(xb[0],xb[1],out):
                inputs.append(learn.data.train_ds.x.reconstruct(x))
                targets.append(learn.data.train_ds.y.reconstruct(y))
                outputs.append(learn.data.train_ds.y.reconstruct(z.argmax(1)))
    return inputs, targets, outputs

In [ ]:

inputs, targets, outputs = get_predictions(learn)

100.00% [149/149 00:16<00:00]

In [ ]:

inputs[10],targets[10],outputs[10]

Out[ ]:

(Text xxbos xxmaj comment cela a - t - il pu se produire et pourquoi n'y a - t - il pas de mécanismes en place pour empêcher cette situation ?,
 Text xxbos xxmaj why was this allowed to happen and why are n't there any mechanisms in place to prevent this ?,
 Text xxbos xxmaj why did this happening to take and why is there there no measures not this to prevent this situation)

In [ ]:

inputs[700],targets[700],outputs[700]

Out[ ]:

(Text xxbos xxmaj qu’advient - il lorsque les attentes sont xxunk et même xxunk ou qu’elles incitent l’organisation à xxunk de ses activités de base ?,
 Text xxbos xxmaj what happens when expectations are diverse and even xxunk , or take the organization away from its core business ?,
 Text xxbos xxmaj what happens when expectations are xxunk and their even or or they - organization 's from its core activities activities)

In [ ]:

inputs[701],targets[701],outputs[701]

Out[ ]:

(Text xxbos xxmaj quelles mesures sont prises par les autorités sanitaires provinciales et locales pour assurer la sécurité du public lorsque des insecticides sont utilisés ?,
 Text xxbos xxmaj what steps do provincial / local health authorities take to ensure public safety when pesticides are used ?,
 Text xxbos xxmaj what measures are provincial and local health authorities have to ensure the safety and xxunk are used ?)

In [ ]:

inputs[2500],targets[2500],outputs[2500]

Out[ ]:

(Text xxbos xxmaj quand les restrictions à la propriété concernant les xxunk aux postes terrestres seront - elles xxunk ?,
 Text xxbos xxmaj when will land xxmaj border store restrictions on ownership be lifted ?,
 Text xxbos xxmaj when will the restrictions property xxmaj restrictions be the of xxunk ?)

In [ ]:

inputs[4002],targets[4002],outputs[4002]

Out[ ]:

(Text xxbos xxmaj qu’est - ce qu’une résolution du conseil de bande ( xxup xxunk ) ?,
 Text xxbos xxmaj what is a xxmaj band xxmaj council xxmaj resolution ( xxup xxunk ) ?,
 Text xxbos xxmaj what is a xxmaj band xxmaj board xxmaj agreement ? xxup xxunk ) ?)

Label smoothing¶

They point out in the paper that using label smoothing helped getting a better BLEU/accuracy, even if it made the loss worse.

In [ ]:

model = Transformer(len(data.train_ds.x.vocab.itos), len(data.train_ds.y.vocab.itos), d_model=256)

In [ ]:

learn = Learner(data, model, metrics=[accuracy, CorpusBLEU(len(data.train_ds.y.vocab.itos))], 
                loss_func=FlattenedLoss(LabelSmoothingCrossEntropy, axis=-1))

In [ ]:

learn.fit_one_cycle(8, 5e-4, div_factor=5)

epoch	train_loss	valid_loss	accuracy	bleu	time
0	3.288097	3.361678	0.620848	0.462028	01:07
1	2.866131	2.932534	0.690689	0.508931	01:08
2	2.617174	2.686622	0.720212	0.535796	01:07
3	2.396165	2.509400	0.750236	0.572231	01:07
4	2.234806	2.415202	0.764170	0.590421	01:07
5	2.094995	2.375675	0.773611	0.601696	01:07
6	1.979516	2.361270	0.778426	0.607941	01:07
7	1.968844	2.363652	0.778740	0.608949	01:07

In [ ]:

learn.fit_one_cycle(8, 5e-4, div_factor=5)

epoch	train_loss	valid_loss	accuracy	bleu	time
0	2.005607	2.370556	0.776371	0.606602	01:07
1	2.039403	2.376988	0.773496	0.602594	01:08
2	1.998095	2.364960	0.776239	0.606155	01:07
3	1.948557	2.333933	0.783300	0.617531	01:07
4	1.846455	2.334134	0.787573	0.623490	01:07
5	1.729133	2.339518	0.792536	0.632397	01:07
6	1.623762	2.345591	0.794781	0.636248	01:08
7	1.599179	2.351281	0.794569	0.636407	01:08

In [ ]:

print("Quels sont les atouts particuliers du Canada en recherche sur l'obésité sur la scène internationale ?")
print("What are Specific strengths canada strengths in obesity - ? are up canada ? from international international stage ?")
print("Quelles sont les répercussions politiques à long terme de cette révolution scientifique mondiale ?")
print("What are the long the long - term policies implications of this global scientific ? ?")

Quels sont les atouts particuliers du Canada en recherche sur l'obésité sur la scène internationale ?
What are Specific strengths canada strengths in obesity - ? are up canada ? from international international stage ?
Quelles sont les répercussions politiques à long terme de cette révolution scientifique mondiale ?
What are the long the long - term policies implications of this global scientific ? ?

In [ ]:

inputs[10],targets[10],outputs[10]

Out[ ]:

(Text xxbos xxmaj comment cela a - t - il pu se produire et pourquoi n'y a - t - il pas de mécanismes en place pour empêcher cette situation ?,
 Text xxbos xxmaj why was this allowed to happen and why are n't there any mechanisms in place to prevent this ?,
 Text xxbos xxmaj why did this happening to take and why is there there no measures not this to prevent this situation)

In [ ]:

inputs[700],targets[700],outputs[700]

Out[ ]:

(Text xxbos xxmaj qu’advient - il lorsque les attentes sont xxunk et même xxunk ou qu’elles incitent l’organisation à xxunk de ses activités de base ?,
 Text xxbos xxmaj what happens when expectations are diverse and even xxunk , or take the organization away from its core business ?,
 Text xxbos xxmaj what happens when expectations are xxunk and their even or or they - organization 's from its core activities activities)

In [ ]:

inputs[701],targets[701],outputs[701]

Out[ ]:

(Text xxbos xxmaj quelles mesures sont prises par les autorités sanitaires provinciales et locales pour assurer la sécurité du public lorsque des insecticides sont utilisés ?,
 Text xxbos xxmaj what steps do provincial / local health authorities take to ensure public safety when pesticides are used ?,
 Text xxbos xxmaj what measures are provincial and local health authorities have to ensure the safety and xxunk are used ?)

In [ ]:

inputs[4001],targets[4001],outputs[4001]

Out[ ]:

(Text xxbos xxmaj qui a dit qu'un xxmaj xxunk xxmaj xxunk ne xxunk que des armes ?,
 Text xxbos xxmaj who said a xxmaj xxunk xxmaj tech would have to work on weapons ?,
 Text xxbos xxmaj who said xxmaj xxmaj xxunk xxmaj xxunk xxunk not not xxunk in weapons ?)

Test leakage¶

If we change a token in the targets at position n, it shouldn't impact the predictions before that.

In [ ]:

learn.model.eval();

In [ ]:

xb,yb = data.one_batch(cpu=False)

In [ ]:

inp1,out1 = xb[0][:1],xb[1][:1]
inp2,out2 = inp1.clone(),out1.clone()
out2[0,15] = 10

In [ ]:

y1 = learn.model(inp1, out1)
y2 = learn.model(inp2, out2)

In [ ]:

(y1[0,:15] - y2[0,:15]).abs().mean()

Out[ ]:

tensor(0., device='cuda:0', grad_fn=<MeanBackward0>)