%reload_ext autoreload
%autoreload 2

#export
from nb_006b import *
from collections import Counter
import collections

EOS = '<eos>'   # end-of-sentence marker appended to every line
PATH=Path('data/wikitext')

def read_file(filename):
    tokens = []
    with open(PATH/filename, encoding='utf8') as f:
        for line in f: tokens.append(line.split() + [EOS])
    return np.array(tokens)

train_tok = read_file('wiki.train.tokens')
valid_tok = read_file('wiki.valid.tokens')
test_tok = read_file('wiki.test.tokens')

len(train_tok), len(valid_tok), len(test_tok)

' '.join(train_tok[4][:20])

cnt = Counter(word for sent in train_tok for word in sent)
cnt.most_common(10)

itos = [o for o,c in cnt.most_common()]
itos.insert(0,'<pad>')   # index 0 is used as the padding token when building the model below
vocab_size = len(itos); vocab_size

stoi = collections.defaultdict(lambda : 5, {w:i for i,w in enumerate(itos)})

train_ids = np.array([([stoi[w] for w in s]) for s in train_tok])
valid_ids = np.array([([stoi[w] for w in s]) for s in valid_tok])
test_ids = np.array([([stoi[w] for w in s]) for s in test_tok])

#export
class LanguageModelLoader():
    "Creates a dataloader with bptt slightly changing."

    def __init__(self, nums:np.ndarray, bs:int=64, bptt:int=70, backwards:bool=False):
        self.bs,self.bptt,self.backwards = bs,bptt,backwards
        self.data = self.batchify(nums)
        self.first,self.i,self.iter = True,0,0
        self.n = len(self.data)

    def __iter__(self):
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter < len(self):
            #The first batch is a bit longer; afterwards seq_len is jittered around bptt
            #(with a 5% chance of using bptt/2 as the center).
            if self.first and self.i == 0: self.first,seq_len = False,self.bptt + 25
            else:
                bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
                seq_len = max(5, int(np.random.normal(bptt, 5)))
            res = self.get_batch(self.i, seq_len)
            self.i += seq_len
            self.iter += 1
            yield res

    def __len__(self) -> int: return (self.n-1) // self.bptt

    def batchify(self, data:np.ndarray) -> LongTensor:
        "Splits the data in batches."
        nb = data.shape[0] // self.bs
        data = np.array(data[:nb*self.bs]).reshape(self.bs, -1).T
        if self.backwards: data=data[::-1]
        return LongTensor(data)

    def get_batch(self, i:int, seq_len:int) -> LongTensor:
        "Gets a batch of length `seq_len`"
        seq_len = min(seq_len, len(self.data) - 1 - i)
        return self.data[i:i+seq_len], self.data[i+1:i+1+seq_len].contiguous().view(-1)

bs,bptt = 20,10
train_dl = LanguageModelLoader(np.concatenate(train_ids), bs, bptt)
valid_dl = LanguageModelLoader(np.concatenate(valid_ids), bs, bptt)
data = DataBunch(train_dl, valid_dl)

#export
def dropout_mask(x:Tensor, sz:Collection[int], p:float):
    "Returns a dropout mask of the same type as x, size sz, with probability p to cancel an element."
    return x.new(*sz).bernoulli_(1-p).div_(1-p)

x = torch.randn(10,10)
dropout_mask(x, (10,10), 0.5)

#export
class RNNDropout(nn.Module):
    "Dropout that is consistent on the seq_len dimension"

    def __init__(self, p:float=0.5):
        super().__init__()
        self.p=p

    def forward(self, x:Tensor) -> Tensor:
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (1, x.size(1), x.size(2)), self.p)
        return x * m

dp_test = RNNDropout(0.5)
x = torch.randn(2,5,10)
x, dp_test(x)
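# A quick illustrative check of the "consistent on seq_len" property: the mask has size
# (1, x.size(1), x.size(2)), so the same coordinates are zeroed at every step of
# dimension 0. `y` is just a local name for this sketch.
y = dp_test(x)
((y[0] == 0.) == (y[1] == 0.)).all()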
#export
import warnings

class WeightDropout(nn.Module):
    "A module that wraps another layer in which some weights will be replaced by 0 during training."

    def __init__(self, module:Model, weight_p:float, layer_names:Collection[str]=['weight_hh_l0']):
        super().__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))

    def _setweights(self):
        "Applies dropout to the raw weights"
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args:ArgStar):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

    def reset(self):
        if hasattr(self.module, 'reset'): self.module.reset()

module = nn.LSTM(20, 20)
dp_module = WeightDropout(module, 0.5)
opt = optim.SGD(dp_module.parameters(), 10)
dp_module.train()

x = torch.randn(2,5,20)
x.requires_grad_(requires_grad=True)
h = (torch.zeros(1,5,20), torch.zeros(1,5,20))
for _ in range(5): x,h = dp_module(x,h)

getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

target = torch.randint(0,20,(10,)).long()
loss = F.nll_loss(x.view(-1,20), target)
loss.backward()
opt.step()

w, w_raw = getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')
w.grad, w_raw.grad

getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')
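# Illustrative check (assuming the cells above have run): while training, the dropped
# copy `weight_hh_l0` has roughly `weight_p`=0.5 of its entries zeroed, and the gradient
# computed above lives on `weight_hh_l0_raw`, which is the parameter the optimizer updates.
(getattr(dp_module.module, 'weight_hh_l0') == 0.).float().mean()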
#export
class EmbeddingDropout(nn.Module):
    "Applies dropout in the embedding layer by zeroing out some elements of the embedding vector."

    def __init__(self, emb:Model, embed_p:float):
        super().__init__()
        self.emb,self.embed_p = emb,embed_p
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words:LongTensor, scale:Optional[float]=None) -> Tensor:
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        if scale: masked_embed.mul_(scale)
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

enc = nn.Embedding(100,20, padding_idx=0)
enc_dp = EmbeddingDropout(enc, 0.5)
x = torch.randint(0,100,(25,)).long()
enc_dp(x)

#export
def repackage_var(h:Tensors) -> Tensors:
    "Detaches h from its history."
    return h.detach() if type(h) == torch.Tensor else tuple(repackage_var(v) for v in h)

#export
class RNNCore(nn.Module):
    "AWD-LSTM/QRNN inspired by https://arxiv.org/abs/1708.02182"

    initrange=0.1

    def __init__(self, vocab_sz:int, emb_sz:int, n_hid:int, n_layers:int, pad_token:int, bidir:bool=False,
                 hidden_p:float=0.2, input_p:float=0.6, embed_p:float=0.1, weight_p:float=0.5, qrnn:bool=False):
        super().__init__()
        self.bs,self.qrnn,self.ndir = 1, qrnn,(2 if bidir else 1)
        self.emb_sz,self.n_hid,self.n_layers = emb_sz,n_hid,n_layers
        self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)
        if self.qrnn:
            #Using QRNN requires cupy: https://github.com/cupy/cupy
            from qrnn import QRNNLayer
            self.rnns = [QRNNLayer(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                                   save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True,
                                   use_cuda=torch.cuda.is_available()) for l in range(n_layers)]
            if weight_p != 0.:
                for rnn in self.rnns:
                    rnn.linear = WeightDropout(rnn.linear, weight_p, layer_names=['weight'])
        else:
            self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                                 1, bidirectional=bidir) for l in range(n_layers)]
            if weight_p != 0.: self.rnns = [WeightDropout(rnn, weight_p) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input:LongTensor) -> Tuple[Tensor,Tensor]:
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.encoder_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = repackage_var(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l:int) -> Tensor:
        "Returns one hidden state"
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Resets the hidden states"
        [r.reset() for r in self.rnns if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self.one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.n_layers)]

#export
class LinearDecoder(nn.Module):
    "To go on top of an RNNCore module"

    initrange=0.1

    def __init__(self, n_out:int, n_hid:int, output_p:float, tie_encoder:Model=None, bias:bool=True):
        super().__init__()
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.output_dp = RNNDropout(output_p)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input:Tuple[Tensor,Tensor]) -> Tuple[Tensor,Tensor,Tensor]:
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1])
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, raw_outputs, outputs

#export
class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

#export
def get_language_model(vocab_sz:int, emb_sz:int, n_hid:int, n_layers:int, pad_token:int, tie_weights:bool=True,
                       qrnn:bool=False, bias:bool=True, output_p:float=0.4, hidden_p:float=0.2, input_p:float=0.6,
                       embed_p:float=0.1, weight_p:float=0.5) -> Model:
    "To create a full AWD-LSTM"
    rnn_enc = RNNCore(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token, qrnn=qrnn,
                      hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    enc = rnn_enc.encoder if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

tst_model = get_language_model(500, 20, 100, 2, 0, qrnn=True)
tst_model.cuda()

x = torch.randint(0, 500, (10,5)).long()
z = tst_model(x.cuda())
len(z)
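# The cell above needs a GPU (and cupy for the QRNN layers). As an illustrative, CPU-only
# sanity check of the LSTM path, the same shapes can be verified with qrnn=False: the
# decoder output is (seq_len*bs, vocab_sz) and there is one raw/dropped output per layer.
# (`tst_lstm` and `x_cpu` are just local names for this sketch.)
tst_lstm = get_language_model(500, 20, 100, 2, 0, qrnn=False)
x_cpu = torch.randint(0, 500, (10,5)).long()
decoded, raw_outputs, outputs = tst_lstm(x_cpu)
decoded.shape, len(raw_outputs), len(outputs)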
#export
@dataclass
class GradientClipping(Callback):
    "To do gradient clipping during training."
    learn:Learner
    clip:float

    def on_backward_end(self, **kwargs):
        if self.clip: nn.utils.clip_grad_norm_(self.learn.model.parameters(), self.clip)

#export
@dataclass
class RNNTrainer(Callback):
    "`Callback` that regroups the lr adjustment to seq_len, AR (activation regularization) and TAR (temporal activation regularization)"
    learn:Learner
    bptt:int
    alpha:float=0.
    beta:float=0.
    adjust:bool=True

    def on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs):
        #Save the extra outputs for later and only return the true output.
        self.raw_out,self.out = last_output[1],last_output[2]
        return last_output[0]

    def on_backward_begin(self, last_loss:Rank0Tensor, last_input:Tensor, last_output:Tensor, **kwargs):
        #Adjusts the lr to the seq_len of this batch relative to bptt
        if self.adjust: self.learn.opt.lr *= last_input.size(0) / self.bptt
        #AR and TAR
        if self.alpha != 0.: last_loss += (self.alpha * self.out[-1].pow(2).mean()).sum()
        if self.beta != 0.:
            h = self.raw_out[-1]
            if len(h)>1: last_loss += (self.beta * (h[1:] - h[:-1]).pow(2).mean()).sum()
        return last_loss

emb_sz, nh, nl = 400, 1150, 3
model = get_language_model(vocab_size, emb_sz, nh, nl, 0, input_p=0.6, output_p=0.4, weight_p=0.5,
                           embed_p=0.1, hidden_p=0.2)

learn = Learner(data, model)
learn.opt_fn = partial(optim.Adam, betas=(0.8,0.99))
learn.callbacks.append(RNNTrainer(learn, bptt, alpha=2, beta=1))
learn.callback_fns = [partial(GradientClipping, clip=0.12)]

fit_one_cycle(learn, 1, 5e-3, (0.8,0.7), wd=1.2e-6)
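# Language-model cross-entropy losses on wikitext are conventionally reported as
# perplexity, i.e. exp(loss). Illustrative helper (`to_ppl` is not part of the library);
# apply it to the validation loss printed by fit_one_cycle above.
import math
def to_ppl(loss:float) -> float: return math.exp(loss)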