#!/usr/bin/env python
# coding: utf-8

# # Vietnamese ULMFiT from scratch

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

from fastai import *
from fastai.text import *


# In[2]:


# bs=48
# bs=24
bs=128


# In[7]:


torch.cuda.set_device(2)


# In[73]:


data_path = Config.data_path()


# This will create a `viwiki` folder, containing a `viwiki` text file with the wikipedia contents. (For other languages, replace `vi` with the appropriate code from the [list of wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).)

# In[5]:


lang = 'vi'
# lang = 'zh'


# In[6]:


name = f'{lang}wiki'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']


# ## Vietnamese wikipedia model

# ### Download data

# In[ ]:


from nlputils import split_wiki,get_wiki


# In[67]:


get_wiki(path,lang)


# In[68]:


path.ls()


# In[37]:


get_ipython().system('head -n4 {path}/{name}')


# This function splits the single wikipedia file into a separate file per article. This is often easier to work with.

# In[76]:


dest = split_wiki(path,lang)


# In[38]:


dest.ls()[:5]


# In[ ]:


# Use this to convert Chinese traditional to simplified characters
# ls *.txt | parallel -I% opencc -i % -o ../zhsdocs/% -c t2s.json


# ### Create pretrained model

# In[9]:


data = (TextList.from_folder(dest)
            .split_by_rand_pct(0.1, seed=42)
            .label_for_lm()
            .databunch(bs=bs, num_workers=1))

data.save(f'{lang}_databunch')
len(data.vocab.itos),len(data.train_ds)


# In[7]:


data = load_data(path, f'{lang}_databunch', bs=bs)


# In[12]:


learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()


# In[18]:


lr = 1e-2
lr *= bs/48  # Scale learning rate by batch size


# In[20]:


learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))


# Save the pretrained model and vocab:

# In[75]:


mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
learn.to_fp32().save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))


# ## Vietnamese sentiment analysis

# ### Language model

# - [Data](https://github.com/ngxbac/aivivn_phanloaisacthaibinhluan/tree/master/data)
# - [Competition details](https://www.aivivn.com/contests/1)
# - Top 3 f1 scores: 0.900, 0.897, 0.897

# In[35]:


train_df = pd.read_csv(path/'train.csv')
train_df.loc[pd.isna(train_df.comment),'comment']='NA'
train_df.head()


# In[36]:


test_df = pd.read_csv(path/'test.csv')
test_df.loc[pd.isna(test_df.comment),'comment']='NA'
test_df.head()


# In[37]:


df = pd.concat([train_df,test_df], sort=False)


# In[38]:


data_lm = (TextList.from_df(df, path, cols='comment')
            .split_by_rand_pct(0.1, seed=42)
            .label_for_lm()
            .databunch(bs=bs, num_workers=1))


# In[17]:


learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0)


# In[39]:


lr = 1e-3
lr *= bs/48


# In[19]:


learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))


# In[20]:


learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))


# In[21]:


learn_lm.save(f'{lang}fine_tuned')
learn_lm.save_encoder(f'{lang}fine_tuned_enc')


# ### Classifier

# In[40]:


data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
            .split_by_rand_pct(0.1, seed=42)
            .label_from_df(cols='label')
            .databunch(bs=bs, num_workers=1))

data_clas.save(f'{lang}_textlist_class')


# In[10]:


data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)


# In[41]:


from sklearn.metrics import f1_score

@np_func
def f1(inp,targ): return f1_score(targ, np.argmax(inp, axis=-1))
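# The wrapped metric takes and returns tensors (`np_func` is assumed, as in fastai v1, to convert the tensor arguments to numpy arrays before calling the function), so it can be sanity-checked directly. The cell below is a minimal check on made-up 2-class outputs, not competition data:

# In[ ]:


toy_preds = tensor([[0.2, 0.8], [0.9, 0.1]])  # hypothetical class probabilities for two comments
toy_targs = tensor([1, 0])                    # hypothetical true labels
f1(toy_preds, toy_targs)                      # argmax of each row matches its target, so this gives an f1 of 1.0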
# In[43]:


learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()


# In[44]:


lr = 2e-2
lr *= bs/48  # Scale learning rate by batch size


# In[45]:


learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))


# In[46]:


learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))


# In[47]:


learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))


# In[48]:


learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))


# In[49]:


learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))


# In[50]:


learn_c.save(f'{lang}clas')


# Competition top 3 f1 scores: 0.900, 0.897, 0.897. The winner used an ensemble of 4 models: TextCNN, VDCNN, HARNN, and SARNN.

# ## Ensemble

# In[65]:


data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load(f'{lang}clas', purge=False);


# In[69]:


preds,targs = learn_c.get_preds(ordered=True)
accuracy(preds,targs),f1(preds,targs)


# In[67]:


data_clas_bwd = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load(f'{lang}clas_bwd', purge=False);


# In[70]:


preds_b,targs_b = learn_c_bwd.get_preds(ordered=True)
accuracy(preds_b,targs_b),f1(preds_b,targs_b)


# In[71]:


preds_avg = (preds+preds_b)/2


# In[72]:


accuracy(preds_avg,targs_b),f1(preds_avg,targs_b)


# In[ ]:
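# The backward databunch and classifier loaded in the Ensemble section above (`vi_textlist_class_bwd`, `vi_clas_bwd`) come from a separate backwards run of the same pipeline. The cell below is a minimal sketch of the classifier part of that run; it assumes a backwards-pretrained language model has already been fine-tuned and its encoder saved, and the encoder name `{lang}fine_tuned_enc_bwd` is illustrative, not produced by this notebook.

# In[ ]:


data_clas_bwd = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
            .split_by_rand_pct(0.1, seed=42)
            .label_from_df(cols='label')
            .databunch(bs=bs, num_workers=1, backwards=True))  # backwards=True reverses token order at batch time

data_clas_bwd.save(f'{lang}_textlist_class_bwd')

learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load_encoder(f'{lang}fine_tuned_enc_bwd')  # hypothetical name for the backwards fine-tuned encoder
learn_c_bwd.fit_one_cycle(2, lr, moms=(0.8,0.7))       # first stage only; see note below
learn_c_bwd.save(f'{lang}clas_bwd')


# In practice the backwards classifier would follow the same gradual-unfreezing schedule as the forward classifier above before being saved for the ensemble.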