#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.all import *
from local.notebook.showdoc import *
#default_exp text.core
#default_cls_lvl 3
Basic functions to preprocess text before assembling it in a `DataBunch`.
#export
import spacy,html
from spacy.symbols import ORTH
The following rules are applied to texts before or after tokenization.
#export
#special tokens
UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxfld xxrep xxwrep xxup xxmaj".split()
#export
_all_ = ["UNK", "PAD", "BOS", "EOS", "FLD", "TK_REP", "TK_WREP", "TK_UP", "TK_MAJ"]
#export
_re_spec = re.compile(r'([/#\\])')
def spec_add_spaces(t):
"Add spaces around / and #"
return _re_spec.sub(r' \1 ', t)
test_eq(spec_add_spaces('#fastai'), ' # fastai')
test_eq(spec_add_spaces('/fastai'), ' / fastai')
test_eq(spec_add_spaces('\\fastai'), ' \\ fastai')
#export
_re_space = re.compile(' {2,}')
def rm_useless_spaces(t):
"Remove multiple spaces"
return _re_space.sub(' ', t)
test_eq(rm_useless_spaces('a  b   c'), 'a b c')
#export
_re_rep = re.compile(r'(\S)(\1{2,})')
def replace_rep(t):
"Replace repetitions at the character level: cccc -- TK_REP 4 c"
def _replace_rep(m):
c,cc = m.groups()
return f' {TK_REP} {len(cc)+1} {c} '
return _re_rep.sub(_replace_rep, t)
Replacement starts at three or more repetitions of the same character.
test_eq(replace_rep('aa'), 'aa')
test_eq(replace_rep('aaaa'), f' {TK_REP} 4 a ')
#export
_re_wrep = re.compile(r'(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)')
#hide
"""
Matches any word repeated at least four times with spaces between them
(?:\s|^) Non-Capture either a whitespace character or the beginning of text
(\w+) Capture any alphanumeric character
\s+ One or more whitespace
((?:\1\s+)+) Capture a repetition of one or more times \1 followed by one or more whitespace
\1 Occurence of \1
(\s|\W|$) Capture last whitespace, non alphanumeric character or end of text
""";
#export
def replace_wrep(t):
"Replace word repetitions: word word word word -- TK_WREP 4 word"
def _replace_wrep(m):
c,cc,e = m.groups()
return f' {TK_WREP} {len(cc.split())+2} {c} {e}'
return _re_wrep.sub(_replace_wrep, t)
Replacement starts at three or more repetitions of the same word.
test_eq(replace_wrep('ah ah'), 'ah ah')
test_eq(replace_wrep('ah ah ah'), f' {TK_WREP} 3 ah ')
test_eq(replace_wrep('ah ah ah ah'), f' {TK_WREP} 4 ah ')
test_eq(replace_wrep('ah ah ah ah '), f' {TK_WREP} 4 ah ')
test_eq(replace_wrep('ah ah ah ah.'), f' {TK_WREP} 4 ah .')
test_eq(replace_wrep('ah ah ahi'), f'ah ah ahi')
#export
def fix_html(x):
"Various messy things we've seen in documents"
x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace('nbsp;', ' ').replace(
'#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace('<br />', "\n").replace(
'\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(' @-@ ','-').replace('...',' …')
return html.unescape(x)
test_eq(fix_html('#39;bli#146;'), "'bli'")
test_eq(fix_html('Sarah amp; Duck...'), 'Sarah & Duck …')
test_eq(fix_html('a nbsp; #36;'), 'a   $')
test_eq(fix_html('\\" <unk>'), f'" {UNK}')
test_eq(fix_html('quot;  @.@  @-@ '), "' .-")
test_eq(fix_html('<br />text\\n'), '\ntext\n')
#export
_re_all_caps = re.compile(r'(\s|^)([A-Z]+[^a-z\s]*)(?=(\s|$))')
#hide
"""
Catches any word in all caps, even with ' or - inside
(\s|^) Capture either a whitespace or the beginning of text
([A-Z]+ Capture one capitalized letter or more...
[^a-z\s]*) ...followed by anything that's non lowercase or whitespace
(?=(\s|$)) Look ahead for a space or end of text
""";
#export
def replace_all_caps(t):
"Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
def _replace_all_caps(m):
tok = f'{TK_UP} ' if len(m.groups()[1]) > 1 else ''
return f"{m.groups()[0]}{tok}{m.groups()[1].lower()}"
return _re_all_caps.sub(_replace_all_caps, t)
test_eq(replace_all_caps("I'M SHOUTING"), f"{TK_UP} i'm {TK_UP} shouting")
test_eq(replace_all_caps("I'm speaking normally"), "I'm speaking normally")
test_eq(replace_all_caps("I am speaking normally"), "i am speaking normally")
#export
_re_maj = re.compile(r'(\s|^)([A-Z][^A-Z\s]*)(?=(\s|$))')
#hide
"""
Catches any capitalized word
(\s|^) Capture either a whitespace or the beginning of text
([A-Z] Capture exactly one capitalized letter...
[^A-Z\s]*) ...followed by anything that's not uppercase or whitespace
(?=(\s|$)) Look ahead for a space of end of text
""";
#export
def replace_maj(t):
"Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
def _replace_maj(m):
tok = f'{TK_MAJ} ' if len(m.groups()[1]) > 1 else ''
return f"{m.groups()[0]}{tok}{m.groups()[1].lower()}"
return _re_maj.sub(_replace_maj, t)
test_eq(replace_maj("Jeremy Howard"), f'{TK_MAJ} jeremy {TK_MAJ} howard')
test_eq(replace_maj("I don't think there is any maj here"), ("i don't think there is any maj here"),)
#export
def lowercase(t, add_bos=True, add_eos=False):
"Converts `t` to lowercase"
return (f'{BOS} ' if add_bos else '') + t.lower().strip() + (f' {EOS}' if add_eos else '')
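A couple of quick checks of the default behavior (small examples added for illustration):
test_eq(lowercase("Hello"), f'{BOS} hello')
test_eq(lowercase("Hello", add_eos=True), f'{BOS} hello {EOS}')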
#export
def replace_space(t):
"Replace embedded spaces in a token with unicode line char to allow for split/join"
return t.replace(' ', '▁')
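For example (illustrative check):
test_eq(replace_space('a b'), 'a▁b')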
#export
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]
defaults.text_proc_rules = [fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces,
replace_all_caps, replace_maj, lowercase]
defaults.text_postproc_rules = [replace_space]
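To get a feel for the combined effect of the default processing rules, we can compose them and apply the result to a sample string (purely illustrative: we just print the result and expect to see the `xxbos`, `xxmaj`, `xxup`, `xxrep` and `xxwrep` special tokens appear):
print(compose(*defaults.text_proc_rules)("Anyone AWAKE???  yes yes yes yes!"))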
A tokenizer is a class that must implement `__call__`. This method receives a generator of texts and must return a generator with their tokenized versions. Here is the most basic example:
#export
class BaseTokenizer():
"Basic tokenizer that just splits on spaces"
def __init__(self, split_char=' ', **kwargs): self.split_char=split_char
def __call__(self, items): return (t.split(self.split_char) for t in items)
tok = BaseTokenizer()
for t in tok(["This is a text"]): test_eq(t, ["This", "is", "a", "text"])
tok = BaseTokenizer('x')
for t in tok(["This is a text"]): test_eq(t, ["This is a te", "t"])
#export
class SpacyTokenizer():
"Spacy tokenizer for `lang`"
def __init__(self, lang='en', special_toks=None, buf_sz=5000):
special_toks = ifnone(special_toks, defaults.text_spec_tok)
nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
for w in special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])
self.pipe,self.buf_sz = nlp.pipe,buf_sz
def __call__(self, items):
return (L(doc).attrgot('text') for doc in self.pipe(items, batch_size=self.buf_sz))
tok = SpacyTokenizer()
inp,exp = "This isn't the easiest text.",["This", "is", "n't", "the", "easiest", "text", "."]
test_eq(L(tok([inp]*5)), [exp]*5)
#export
class TokenizeBatch:
"A wrapper around `tok_func` to apply `rules` and tokenize in parallel"
def __init__(self, tok_func=SpacyTokenizer, rules=None, post_rules=None, **tok_kwargs ):
self.rules = L(ifnone(rules, defaults.text_proc_rules))
self.post_f = compose(*L(ifnone(post_rules, defaults.text_postproc_rules)))
self.tok = tok_func(**tok_kwargs)
def __call__(self, batch):
return (L(o).map(self.post_f) for o in self.tok(maps(*self.rules, batch)))
f = TokenizeBatch()
test_eq(f(["This isn't a problem"]), [[BOS, TK_MAJ, 'this', 'is', "n't", 'a', 'problem']])
f = TokenizeBatch(BaseTokenizer, rules=[], split_char="'")
test_eq(f(["This isn't a problem"]), [['This▁isn', 't▁a▁problem']])
This is what gets called during each of the processes handling tokenization. It creates an instance of a tokenizer with `tok_func` and `tok_kwargs` at init, then iterates through the `batch` of texts, applies the `rules` to them and tokenizes them.
texts = ["this is a text", "this is another text"]
tok = TokenizeBatch(BaseTokenizer, texts.__getitem__)
test_eq([t for t in tok([0,1])],[['this', 'is', 'a', 'text'], ['this', 'is', 'another', 'text']])
#export
def tokenize1(text, tok_func=SpacyTokenizer, rules=None, post_rules=None, **tok_kwargs):
"Tokenize one `text` with an instance of `tok_func` and some `rules`"
return first(TokenizeBatch(tok_func, rules, post_rules, **tok_kwargs)([text]))
test_eq(tokenize1("This isn't a problem"),
[BOS, TK_MAJ, 'this', 'is', "n't", 'a', 'problem'])
test_eq(tokenize1("This isn't a problem", BaseTokenizer, rules=[], split_char="'"),
['This▁isn', 't▁a▁problem'])
#export
def parallel_tokenize(items, tok_func, rules, as_gen=False, n_workers=defaults.cpus, **tok_kwargs):
"Calls a potential setup on `tok_func` before launching `TokenizeBatch` in parallel"
if hasattr(tok_func, 'setup'): tok_kwargs = tok_func(**tok_kwargs).setup(items, rules)
return parallel_gen(TokenizeBatch, items, as_gen=as_gen, tok_func=tok_func,
rules=rules, n_workers=n_workers, **tok_kwargs)
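A minimal direct use of `parallel_tokenize`, mirroring how `tokenize_df` restores the original order of the results (added for illustration; we pass `n_workers=2` to keep the example small):
texts = ["this is a text", "this is another text"]
toks = L(parallel_tokenize(texts, BaseTokenizer, rules=[], n_workers=2)).sorted().itemgot(1)
test_eq(toks, [['this', 'is', 'a', 'text'], ['this', 'is', 'another', 'text']])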
Preprocessing function for texts in filenames. Tokenized texts will be saved in a similar fashion in a directory suffixed with `_tok` in the parent folder of `path` (override with `output_dir`).
#export
fn_counter_pkl = 'counter.pkl'
#export
def tokenize_folder(path, extensions=None, folders=None, output_dir=None, n_workers=defaults.cpus,
rules=None, tok_func=SpacyTokenizer, encoding='utf8', **tok_kwargs):
"Tokenize text files in `path` in parallel using `n_workers`"
path,extensions = Path(path),ifnone(extensions, ['.txt'])
fnames = get_files(path, extensions=extensions, recurse=True, folders=folders)
output_dir = Path(ifnone(output_dir, path.parent/f'{path.name}_tok'))
rules = partial(Path.read, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))
counter = Counter()
for i,tok in parallel_tokenize(fnames, tok_func, rules, as_gen=True, n_workers=n_workers, **tok_kwargs):
out = output_dir/fnames[i].relative_to(path)
out.write(' '.join(tok))
counter.update(tok)
(output_dir/fn_counter_pkl).save(counter)
The result will be in `output_dir` (defaults to a folder in the same parent directory as `path`, with `_tok` added to `path.name`) with the same structure as in `path`. Tokenized texts for a given file will be in the file having the same name in `output_dir`, and the counts of all words are stored in `output_dir/counter.pkl`.
`extensions` defaults to `['.txt']`, and all text files in `path` are treated unless you specify a list of `folders`. `tok_func` is instantiated in each process with `tok_kwargs`, and `rules` (which default to `defaults.text_proc_rules`) are applied to each text before it goes into the tokenizer.
#export
def _join_texts(df, mark_fields=False):
"Join texts in row `idx` of `df`, marking each field with `FLD` if `mark_fields=True`"
text_col = (f'{FLD} {1} ' if mark_fields else '' ) + df.iloc[:,0].astype(str)
for i in range(1,len(df.columns)):
text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df.iloc[:,i].astype(str)
return text_col.values
#hide
texts = [f"This is an example of text {i}" for i in range(10)]
df = pd.DataFrame({'text': texts, 'text1': texts}, columns=['text', 'text1'])
col = _join_texts(df, mark_fields=True)
for i in range(len(df)):
test_eq(col[i], f'{FLD} 1 This is an example of text {i} {FLD} 2 This is an example of text {i}')
#export
def tokenize_df(df, text_cols, n_workers=defaults.cpus, rules=None, mark_fields=None,
tok_func=SpacyTokenizer, **tok_kwargs):
"Tokenize texts in `df[text_cols]` in parallel using `n_workers`"
text_cols = L(text_cols)
#mark_fields defaults to False if there is one column of texts, True if there are multiple
if mark_fields is None: mark_fields = len(text_cols)>1
rules = L(ifnone(rules, defaults.text_proc_rules.copy()))
texts = _join_texts(df[text_cols], mark_fields=mark_fields)
outputs = L(parallel_tokenize(texts, tok_func, rules, n_workers=n_workers, **tok_kwargs)
).sorted().itemgot(1)
other_cols = df.columns[~df.columns.isin(text_cols)]
res = df[other_cols].copy()
res['text'] = outputs
return res,Counter(outputs.concat())
This function returns a new dataframe with the same non-text columns and a column named `text` that contains the tokenized texts. It also returns a counter of all the words seen, to quickly build a vocabulary afterward.
`tok_func` is instantiated in each process with `tok_kwargs`, and `rules` (which default to `defaults.text_proc_rules`) are applied to each text before it goes into the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column, `True` when there are several. In that case, the texts in each of those columns are joined with `FLD` markers followed by the number of the field, as in the example below.
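For instance, with two text columns and `mark_fields` left to its default (hence `True`), each field is prefixed with `FLD` and its index (a quick illustration, not one of the original tests):
df = pd.DataFrame({'text': ["This is a text"], 'text1': ["This is another text"], 'label': [0]})
out,cnt = tokenize_df(df, text_cols=['text', 'text1'])
test_eq(out['text'][0], [BOS, FLD, '1', TK_MAJ, 'this', 'is', 'a', 'text', FLD, '2', TK_MAJ, 'this', 'is', 'another', 'text'])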
#export
def tokenize_csv(fname, text_cols, outname=None, n_workers=4, rules=None, mark_fields=None,
tok_func=SpacyTokenizer, header='infer', chunksize=50000, **tok_kwargs):
"Tokenize texts in the `text_cols` of the csv `fname` in parallel using `n_workers`"
df = pd.read_csv(fname, header=header, chunksize=chunksize)
outname = Path(ifnone(outname, fname.parent/f'{fname.stem}_tok.csv'))
cnt = Counter()
for i,dfp in enumerate(df):
out,c = tokenize_df(dfp, text_cols, n_workers=n_workers, rules=rules,
mark_fields=mark_fields, tok_func=tok_func, **tok_kwargs)
out.text = out.text.str.join(' ')
out.to_csv(outname, header=(None,header)[i==0], index=False, mode=('a','w')[i==0])
cnt.update(c)
outname.with_suffix('.pkl').save(cnt)
#export
def load_tokenized_csv(fname):
"Utility function to quickly load a tokenized csv ans the corresponding counter"
fname = Path(fname)
out = pd.read_csv(fname)
for txt_col in out.columns[1:-1]:
out[txt_col] = out[txt_col].str.split(' ')
return out,fname.with_suffix('.pkl').load()
The result will be written in a new csv file in `outname` (defaults to the same as `fname` with the suffix `_tok.csv`). It will have the same header as the original file, the same non-text columns, and a `text` column as described in `tokenize_df`.
`tok_func` is instantiated in each process with `tok_kwargs`, and `rules` (which default to `defaults.text_proc_rules`) are applied to each text before it goes into the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column, `True` when there are several. In that case, the texts in each of those columns are joined with `FLD` markers followed by the number of the field.
The csv file is opened with `header` and, optionally, read in chunks of `chunksize` rows at a time. If this argument is passed, each chunk is processed independently and saved to the output file, to limit memory usage.
def _prepare_texts(tmp_d):
"Prepare texts in a folder struct in tmp_d, a csv file and returns a dataframe"
path = Path(tmp_d)/'tmp'
path.mkdir()
for d in ['a', 'b', 'c']:
(path/d).mkdir()
for i in range(5):
with open(path/d/f'text{i}.txt', 'w') as f: f.write(f"This is an example of text {d} {i}")
texts = [f"This is an example of text {d} {i}" for i in range(5) for d in ['a', 'b', 'c']]
df = pd.DataFrame({'text': texts, 'label': list(range(15))}, columns=['text', 'label'])
csv_fname = tmp_d/'input.csv'
df.to_csv(csv_fname, index=False)
return path,df,csv_fname
with tempfile.TemporaryDirectory() as tmp_d:
path,df,csv_fname = _prepare_texts(Path(tmp_d))
#Tokenize as folders
tokenize_folder(path)
outp = Path(tmp_d)/'tmp_tok'
for d in ['a', 'b', 'c']:
p = outp/d
for i in range(5):
test_eq((p/f'text{i}.txt').read(), ' '.join([
BOS, TK_MAJ, 'this', 'is', 'an', 'example', 'of', 'text', d, str(i) ]))
cnt_a = (outp/fn_counter_pkl).load()
test_eq(cnt_a['this'], 15)
test_eq(cnt_a['a'], 5)
test_eq(cnt_a['0'], 3)
#Tokenize as a dataframe
out,cnt_b = tokenize_df(df, text_cols='text')
test_eq(list(out.columns), ['label', 'text'])
test_eq(out['label'].values, df['label'].values)
test_eq(out['text'], [(outp/d/f'text{i}.txt').read().split(' ') for i in range(5) for d in ['a', 'b', 'c']])
test_eq(cnt_a, cnt_b)
#Tokenize as a csv
out_fname = Path(tmp_d)/'output.csv'
tokenize_csv(csv_fname, text_cols='text', outname=out_fname)
test_eq((out,cnt_b), load_tokenized_csv(out_fname))
eu_langs = ["bg", "cs", "da", "de", "el", "en", "es", "et", "fi", "fr", "ga", "hr", "hu",
"it","lt","lv","mt","nl","pl","pt","ro","sk","sl","sv"] # all European langs
#export
class SentencePieceTokenizer():#TODO: pass the special tokens symbol to sp
"Spacy tokenizer for `lang`"
def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
model_type='unigram', char_coverage=None, cache_dir='tmp'):
try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
except ImportError:
raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')
self.sp_model,self.cache_dir = sp_model,Path(cache_dir)
self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type
self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
if sp_model is None: self.tok = None
else:
self.tok = SentencePieceProcessor()
self.tok.Load(str(sp_model))
os.makedirs(self.cache_dir, exist_ok=True)
def _get_vocab_sz(self, raw_text_path):
cnt = Counter()
with open(raw_text_path, 'r') as f:
for line in f.readlines():
cnt.update(line.split())
if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
res = len(cnt)//4
while res%8 != 0: res+=1
return res
def train(self, raw_text_path):
"Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
from sentencepiece import SentencePieceTrainer
vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
spec_tokens = ['\u2581'+s for s in self.special_toks]
SentencePieceTrainer.Train(" ".join([
f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
f"--user_defined_symbols={','.join(spec_tokens)}"]))
raw_text_path.unlink()
return self.cache_dir/'spm.model'
def setup(self, items, rules):
if self.tok is not None: return {'sp_model': self.sp_model}
raw_text_path = self.cache_dir/'texts.out'
with open(raw_text_path, 'w') as f:
for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
f.write(f'{t}\n')
return {'sp_model': self.train(raw_text_path)}
def __call__(self, items):
for t in items: yield self.tok.EncodeAsPieces(t)
texts = [f"This is an example of text {i}" for i in range(10)]
df = pd.DataFrame({'text': texts, 'label': list(range(10))}, columns=['text', 'label'])
out,cnt = tokenize_df(df, text_cols='text', tok_func=SentencePieceTokenizer, vocab_sz=34)
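The exact subword pieces depend on the model that was just trained, so we only do a light sanity check here (illustrative):
test_eq(len(out), len(df))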
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)