%load_ext autoreload %autoreload 2 %matplotlib inline #export from exp.nb_11a import * path = datasets.untar_data(datasets.URLs.IMDB) path.ls() #export def read_file(fn): with open(fn, 'r', encoding = 'utf8') as f: return f.read() class TextList(ItemList): @classmethod def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs): return cls(get_files(path, extensions, recurse=recurse, include=include), path, **kwargs) def get(self, i): if isinstance(i, Path): return read_file(i) return i il = TextList.from_files(path, include=['train', 'test', 'unsup']) len(il.items) txt = il[0] txt sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1)) sd #export import spacy,html #export #special tokens UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split() def sub_br(t): "Replaces the
by \n" re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE) return re_br.sub("\n", t) def spec_add_spaces(t): "Add spaces around / and #" return re.sub(r'([/#])', r' \1 ', t) def rm_useless_spaces(t): "Remove multiple spaces" return re.sub(' {2,}', ' ', t) def replace_rep(t): "Replace repetitions at the character level: cccc -> TK_REP 4 c" def _replace_rep(m:Collection[str]) -> str: c,cc = m.groups() return f' {TK_REP} {len(cc)+1} {c} ' re_rep = re.compile(r'(\S)(\1{3,})') return re_rep.sub(_replace_rep, t) def replace_wrep(t): "Replace word repetitions: word word word -> TK_WREP 3 word" def _replace_wrep(m:Collection[str]) -> str: c,cc = m.groups() return f' {TK_WREP} {len(cc.split())+1} {c} ' re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})') return re_wrep.sub(_replace_wrep, t) def fixup_text(x): "Various messy things we've seen in documents" re1 = re.compile(r' +') x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace( 'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace( '
', "\n").replace('\\"', '"').replace('',UNK).replace(' @.@ ','.').replace( ' @-@ ','-').replace('\\', ' \\ ') return re1.sub(' ', html.unescape(x)) default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br] default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ] replace_rep('cccc') replace_wrep('word word word word word ') #export def replace_all_caps(x): "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before." res = [] for t in x: if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower()) else: res.append(t) return res def deal_caps(x): "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before." res = [] for t in x: if t == '': continue if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ) res.append(t.lower()) return res def add_eos_bos(x): return [BOS] + x + [EOS] default_post_rules = [deal_caps, replace_all_caps, add_eos_bos] replace_all_caps(['I', 'AM', 'SHOUTING']) deal_caps(['My', 'name', 'is', 'Jeremy']) #export from spacy.symbols import ORTH from concurrent.futures import ProcessPoolExecutor def parallel(func, arr, max_workers=4): if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr))) else: with ProcessPoolExecutor(max_workers=max_workers) as ex: return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr))) if any([o is not None for o in results]): return results #export class TokenizeProcessor(Processor): def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): self.chunksize,self.max_workers = chunksize,max_workers self.tokenizer = spacy.blank(lang).tokenizer for w in default_spec_tok: self.tokenizer.add_special_case(w, [{ORTH: w}]) self.pre_rules = default_pre_rules if pre_rules is None else pre_rules self.post_rules = default_post_rules if post_rules is None else post_rules def proc_chunk(self, args): i,chunk = args chunk = [compose(t, self.pre_rules) for t in chunk] docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)] docs = [compose(t, self.post_rules) for t in docs] return docs def __call__(self, items): toks = [] if isinstance(items[0], Path): items = [read_file(i) for i in items] chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))] toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers) return sum(toks, []) def proc1(self, item): return self.proc_chunk([item])[0] def deprocess(self, toks): return [self.deproc1(tok) for tok in toks] def deproc1(self, tok): return " ".join(tok) tp = TokenizeProcessor() txt[:250] ' • '.join(tp(il[:100])[0])[:400] #export import collections class NumericalizeProcessor(Processor): def __init__(self, vocab=None, max_vocab=60000, min_freq=2): self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq def __call__(self, items): #The vocab is defined on the first use. if self.vocab is None: freq = Counter(p for o in items for p in o) self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq] for o in reversed(default_spec_tok): if o in self.vocab: self.vocab.remove(o) self.vocab.insert(0, o) if getattr(self, 'otoi', None) is None: self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)}) return [self.proc1(o) for o in items] def proc1(self, item): return [self.otoi[o] for o in item] def deprocess(self, idxs): assert self.vocab is not None return [self.deproc1(idx) for idx in idxs] def deproc1(self, idx): return [self.vocab[i] for i in idx] proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor() %time ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num]) ll.train.x_obj(0) pickle.dump(ll, open(path/'ld.pkl', 'wb')) ll = pickle.load(open(path/'ld.pkl', 'rb')) # Just using those for illustration purposes, they're not used otherwise. from IPython.display import display,HTML import pandas as pd stream = """ In this notebook, we will go back over the example of classifying movie reviews we studied in part 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the Processor used in the data block API. Then we will study how we build a language model and train it.\n """ tokens = np.array(tp([stream])[0]) bs,seq_len = 6,15 d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)]) df = pd.DataFrame(d_tokens) display(HTML(df.to_html(index=False,header=None))) bs,bptt = 6,5 for k in range(3): d_tokens = np.array([tokens[i*seq_len + k*bptt:i*seq_len + (k+1)*bptt] for i in range(bs)]) df = pd.DataFrame(d_tokens) display(HTML(df.to_html(index=False,header=None))) #export class LM_PreLoader(): def __init__(self, data, bs=64, bptt=70, shuffle=False): self.data,self.bs,self.bptt,self.shuffle = data,bs,bptt,shuffle total_len = sum([len(t) for t in data.x]) self.n_batch = total_len // bs self.batchify() def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs def __getitem__(self, idx): source = self.batched_data[idx % self.bs] seq_idx = (idx // self.bs) * self.bptt return source[seq_idx:seq_idx+self.bptt],source[seq_idx+1:seq_idx+self.bptt+1] def batchify(self): texts = self.data.x if self.shuffle: texts = texts[torch.randperm(len(texts))] stream = torch.cat([tensor(t) for t in texts]) self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch) dl = DataLoader(LM_PreLoader(ll.valid, shuffle=True), batch_size=64) iter_dl = iter(dl) x1,y1 = next(iter_dl) x2,y2 = next(iter_dl) x1.size(),y1.size() vocab = proc_num.vocab " ".join(vocab[o] for o in x1[0]) " ".join(vocab[o] for o in y1[0]) " ".join(vocab[o] for o in x2[0]) #export def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs): return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs), DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs)) def lm_databunchify(sd, bs, bptt, **kwargs): return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs)) bs,bptt = 64,70 data = lm_databunchify(ll, bs, bptt) proc_cat = CategoryProcessor() il = TextList.from_files(path, include=['train', 'test']) sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test')) ll = label_by_func(sd, parent_labeler, proc_x = [proc_tok, proc_num], proc_y=proc_cat) pickle.dump(ll, open(path/'ll_clas.pkl', 'wb')) ll = pickle.load(open(path/'ll_clas.pkl', 'rb')) [(ll.train.x_obj(i), ll.train.y_obj(i)) for i in [1,12552]] #export from torch.utils.data import Sampler class SortSampler(Sampler): def __init__(self, data_source, key): self.data_source,self.key = data_source,key def __len__(self): return len(self.data_source) def __iter__(self): return iter(sorted(list(range(len(self.data_source))), key=self.key, reverse=True)) #export class SortishSampler(Sampler): def __init__(self, data_source, key, bs): self.data_source,self.key,self.bs = data_source,key,bs def __len__(self) -> int: return len(self.data_source) def __iter__(self): idxs = torch.randperm(len(self.data_source)) megabatches = [idxs[i:i+self.bs*50] for i in range(0, len(idxs), self.bs*50)] sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches]) batches = [sorted_idx[i:i+self.bs] for i in range(0, len(sorted_idx), self.bs)] max_idx = torch.argmax(tensor([self.key(ck[0]) for ck in batches])) # find the chunk with the largest key, batches[0],batches[max_idx] = batches[max_idx],batches[0] # then make sure it goes first. batch_idxs = torch.randperm(len(batches)-2) sorted_idx = torch.cat([batches[i+1] for i in batch_idxs]) if len(batches) > 1 else LongTensor([]) sorted_idx = torch.cat([batches[0], sorted_idx, batches[-1]]) return iter(sorted_idx) #export def pad_collate(samples, pad_idx=1, pad_first=False): max_len = max([len(s[0]) for s in samples]) res = torch.zeros(len(samples), max_len).long() + pad_idx for i,s in enumerate(samples): if pad_first: res[i, -len(s[0]):] = LongTensor(s[0]) else: res[i, :len(s[0]) ] = LongTensor(s[0]) return res, tensor([s[1] for s in samples]) bs = 64 train_sampler = SortishSampler(ll.train.x, key=lambda t: len(ll.train[int(t)][0]), bs=bs) train_dl = DataLoader(ll.train, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate) iter_dl = iter(train_dl) x,y = next(iter_dl) lengths = [] for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item()) lengths[:5], lengths[-1] x,y = next(iter_dl) lengths = [] for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item()) lengths[:5], lengths[-1] x #export def get_clas_dls(train_ds, valid_ds, bs, **kwargs): train_sampler = SortishSampler(train_ds.x, key=lambda t: len(train_ds.x[t]), bs=bs) valid_sampler = SortSampler(valid_ds.x, key=lambda t: len(valid_ds.x[t])) return (DataLoader(train_ds, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs), DataLoader(valid_ds, batch_size=bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs)) def clas_databunchify(sd, bs, **kwargs): return DataBunch(*get_clas_dls(sd.train, sd.valid, bs, **kwargs)) bs,bptt = 64,70 data = clas_databunchify(ll, bs) !python notebook2script.py 12_text.ipynb