%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai import *
from fastai.text import *
bs=128  # batch size (learning rates below are scaled relative to a baseline of 48)
data_path = Config.data_path()
lang = 'vi'  # language code: Vietnamese
name = f'{lang}wiki'
path = data_path/name
dest = path/'docs'  # folder of extracted Wikipedia documents
# Filenames for the saved backwards-LM weights and vocab; reused later as
# pretrained_fnames when fine-tuning on the classification corpus.
lm_fns = [f'{lang}_wt_bwd', f'{lang}_wt_vocab_bwd']
# Build a language-model databunch over the wiki docs; backwards=True reverses
# token order so the model predicts right-to-left.
data = (TextList.from_folder(dest)
.split_by_rand_pct(0.1, seed=42)
.label_for_lm()
.databunch(bs=bs, num_workers=1, backwards=True))
# Cache the processed databunch to disk, then reload it (cheap on later runs).
data.save(f'{lang}_databunch_bwd')
data = load_data(dest, f'{lang}_databunch_bwd', bs=bs, backwards=True)
/home/jhoward/anaconda3/lib/python3.7/site-packages/torch/serialization.py:493: SourceChangeWarning: source code of class 'torch.nn.modules.loss.CrossEntropyLoss' has changed. you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes. warnings.warn(msg, SourceChangeWarning)
# Train the backwards AWD-LSTM language model from scratch (pretrained=False)
# in mixed precision for speed/memory.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()
lr = 3e-3
lr *= bs/48 # Scale learning rate by batch size
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 3.445849 | 3.424579 | 0.401327 | 32:56 |
1 | 3.420865 | 3.383994 | 0.402841 | 33:31 |
2 | 3.374694 | 3.330634 | 0.407800 | 33:26 |
3 | 3.273197 | 3.257108 | 0.416047 | 32:54 |
4 | 3.223044 | 3.200649 | 0.422695 | 32:56 |
5 | 3.134357 | 3.132859 | 0.430725 | 31:35 |
6 | 3.135637 | 3.057030 | 0.439737 | 31:41 |
7 | 3.080461 | 2.992323 | 0.447939 | 31:45 |
8 | 3.075036 | 2.943683 | 0.454494 | 31:39 |
9 | 2.947997 | 2.929258 | 0.456500 | 31:46 |
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
# Save the weights in fp32 without optimizer state, plus the vocab pickle, so
# the pair can be reloaded later via pretrained_fnames=lm_fns.
learn.to_fp32().save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))
# Load the classification corpus. pandas parses empty cells (and literal "NA")
# as NaN; restore them as the string 'NA' so the tokenizer always sees text.
# Using Series.fillna is the idiomatic equivalent of the boolean-mask .loc
# assignment and produces the same result.
train_df = pd.read_csv(path/'train.csv')
train_df['comment'] = train_df['comment'].fillna('NA')
test_df = pd.read_csv(path/'test.csv')
test_df['comment'] = test_df['comment'].fillna('NA')
test_df['label'] = 0  # placeholder label so both frames share the same columns
# Concatenate train + test: for LM fine-tuning only the text is used, so
# including unlabeled test comments is safe and gives the LM more data.
df = pd.concat([train_df,test_df])
# Fine-tuning databunch over the comment text (train + test), again backwards.
data_lm = (TextList.from_df(df, path, cols='comment')
.split_by_rand_pct(0.1, seed=42)
.label_for_lm()
.databunch(bs=bs, num_workers=1, backwards=True))
# n_hid=1152 must match the architecture the wiki LM was trained with so the
# saved weights in lm_fns load cleanly.
learn_lm = language_model_learner(data_lm, AWD_LSTM, config={**awd_lstm_lm_config, 'n_hid': 1152},
pretrained_fnames=lm_fns, drop_mult=1.0)
lr = 1e-3
lr *= bs/48  # scale learning rate by batch size
# Warm up the (frozen-body) model at 10x the base LR before full fine-tuning.
learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 4.797052 | 4.025901 | 0.323326 | 00:07 |
1 | 4.275975 | 3.914450 | 0.333719 | 00:06 |
# Unfreeze all layers and fine-tune the whole LM at the base learning rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 3.996770 | 3.809489 | 0.346052 | 00:09 |
1 | 3.856959 | 3.664919 | 0.363239 | 00:09 |
2 | 3.726143 | 3.584303 | 0.369685 | 00:09 |
3 | 3.608569 | 3.531390 | 0.375307 | 00:09 |
4 | 3.514265 | 3.500826 | 0.379701 | 00:09 |
5 | 3.446292 | 3.486931 | 0.380859 | 00:09 |
6 | 3.392542 | 3.479732 | 0.382520 | 00:09 |
7 | 3.357502 | 3.478930 | 0.382520 | 00:09 |
# Save the fine-tuned LM and, separately, just its encoder — the encoder is
# what the classifier below loads.
learn_lm.save(f'{lang}fine_tuned_bwd')
learn_lm.save_encoder(f'{lang}fine_tuned_enc_bwd')
# Classification databunch over the labeled training data only, reusing the
# LM's vocab so token ids line up with the saved encoder.
data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
.split_by_rand_pct(0.1, seed=42)
.label_from_df(cols='label')
.databunch(bs=bs, num_workers=1, backwards=True))
data_clas.save(f'{lang}_textlist_class_bwd')
data_clas = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
from sklearn.metrics import f1_score
@np_func
def f1(inp, targ):
    """F1 metric: take the argmax class of the raw outputs and score it
    against the targets with sklearn's f1_score (binary average)."""
    preds = np.argmax(inp, axis=-1)
    return f1_score(targ, preds)
# Build the text classifier, load the fine-tuned LM encoder, and train only
# the classification head first (body frozen), in mixed precision.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc_bwd')
learn_c.freeze()
lr=2e-2
lr *= bs/48  # scale learning rate by batch size
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | f1 | time |
---|---|---|---|---|---|
0 | 0.369300 | 0.363769 | 0.834577 | 0.826098 | 00:03 |
1 | 0.328192 | 0.278986 | 0.874378 | 0.851747 | 00:02 |
# Gradual unfreezing, step 1: also train the last RNN layer, with
# discriminative LRs (each earlier layer group gets LR divided by 2.6**4).
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | f1 | time |
---|---|---|---|---|---|
0 | 0.337875 | 0.306132 | 0.876866 | 0.860107 | 00:03 |
1 | 0.276982 | 0.237260 | 0.906095 | 0.886427 | 00:03 |
# Gradual unfreezing, step 2: unfreeze one more layer group and halve the LR.
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | f1 | time |
---|---|---|---|---|---|
0 | 0.292297 | 0.252393 | 0.896144 | 0.877916 | 00:04 |
1 | 0.255284 | 0.213655 | 0.912313 | 0.892551 | 00:04 |
# Final step: train the whole network at a tenth of the head LR.
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))
epoch | train_loss | valid_loss | accuracy | f1 | time |
---|---|---|---|---|---|
0 | 0.167376 | 0.266633 | 0.904851 | 0.885386 | 00:04 |
learn_c.save(f'{lang}clas_bwd')