Vietnamese ULMFiT from scratch¶

In [1]:

%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

In [2]:

# Batch size shared by every DataBunch in this notebook; the learning rates
# defined later are rescaled by bs/48. Smaller values tried previously:
# bs=48
# bs=24
bs=128

In [7]:

# Pin the notebook to one GPU of a multi-GPU machine. The index is specific to
# the original author's box, so it is applied only when that device exists;
# otherwise PyTorch's default device is left untouched instead of raising.
gpu_id = 2
if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
    torch.cuda.set_device(gpu_id)

In [73]:

# Base data directory reported by fastai's Config; the per-language working
# folder is created underneath it further down.
data_path = Config.data_path()

The download step below creates a `viwiki` folder containing a `viwiki` text file with the Wikipedia contents. (For other languages, replace `vi` with the appropriate code from the list of Wikipedias.)

In [5]:

# Wikipedia language code; it is used to name the working folder and every
# artifact saved by this notebook.
lang = 'vi'
# lang = 'zh'

In [6]:

# Working directory for this language's dump, e.g. <data_path>/viwiki.
name = lang + 'wiki'
path = data_path / name
path.mkdir(parents=True, exist_ok=True)
# File stems for the pretrained language-model weights and for its vocab.
lm_fns = [f'{lang}_{suffix}' for suffix in ('wt', 'wt_vocab')]

Vietnamese wikipedia model¶

Download data¶

In [ ]:

from nlputils import split_wiki,get_wiki

In [67]:

# Fetch the Wikipedia dump for `lang` into `path` (helper imported from
# nlputils); the listing in the next cell shows what ends up on disk.
get_wiki(path,lang)

In [68]:

# List the contents of the working directory after the download.
path.ls()

Out[68]:

[PosixPath('/home/jhoward/data/zhwiki/docs'),
 PosixPath('/home/jhoward/data/zhwiki/zhwiki-latest-pages-articles.xml.bz2'),
 PosixPath('/home/jhoward/data/zhwiki/zh.cnf'),
 PosixPath('/home/jhoward/data/zhwiki/log'),
 PosixPath('/home/jhoward/data/zhwiki/zhwiki'),
 PosixPath('/home/jhoward/data/zhwiki/zhwiki-latest-pages-articles.xml'),
 PosixPath('/home/jhoward/data/zhwiki/wikiextractor')]

In [37]:

# Peek at the first four lines of the extracted text file to check its format.
!head -n4 {path}/{name}

<doc id="13" url="https://vi.wikipedia.org/wiki?curid=13" title="Tiếng Việt">
Tiếng Việt

Tiếng Việt, còn gọi tiếng Việt Nam hay Việt ngữ, là ngôn ngữ của người Việt (người Kinh) và là ngôn ngữ chính thức tại Việt Nam. Đây là tiếng mẹ đẻ của khoảng 85% dân cư Việt Nam, cùng với hơn 4 triệu Việt kiều. Tiếng Việt còn là ngôn ngữ thứ hai của các dân tộc thiểu số tại Việt Nam. Mặc dù tiếng Việt có một số từ vựng vay mượn từ tiếng Hán và trước đây dùng chữ Nôm – một hệ chữ viết dựa trên chữ Hán – để viết nhưng tiếng Việt được coi là một trong số các ngôn ngữ thuộc ngữ hệ Nam Á có số người nói nhiều nhất (nhiều hơn một số lần so với các ngôn ngữ khác cùng hệ cộng lại). Ngày nay, tiếng Việt dùng bảng chữ cái Latinh, gọi là chữ Quốc ngữ, cùng các dấu thanh để viết.

`split_wiki` splits the single Wikipedia file into a separate file per article, which is often easier to work with.

In [76]:

# `dest` is the folder that holds one .txt file per article (listed in the
# next cell); it is the input folder for the language-model DataBunch.
dest = split_wiki(path,lang)

In [38]:

# Show a handful of the per-article files as a sanity check.
dest.ls()[:5]

Out[38]:

[PosixPath('/home/jhoward/data/viwiki/docs/Luis Suárez.txt'),
 PosixPath('/home/jhoward/data/viwiki/docs/Vitas.txt'),
 PosixPath('/home/jhoward/data/viwiki/docs/Chùa Hà.txt'),
 PosixPath('/home/jhoward/data/viwiki/docs/Đại Phái bộ Sứ thần.txt'),
 PosixPath('/home/jhoward/data/viwiki/docs/2 Broke Girls.txt')]

In [ ]:

# Use this to convert Chinese traditional to simplified characters
# ls *.txt | parallel -I% opencc -i % -o ../zhsdocs/% -c t2s.json

Create pretrained model¶

In [9]:

# Build the language-model DataBunch from the per-article files, one data-block
# step at a time: read the folder, hold out a seeded 10% for validation,
# label for language modelling, then batch.
wiki_items = TextList.from_folder(dest)
wiki_split = wiki_items.split_by_rand_pct(0.1, seed=42)
wiki_labelled = wiki_split.label_for_lm()
data = wiki_labelled.databunch(bs=bs, num_workers=1)

# Cache the processed DataBunch so the tokenization does not have to be repeated.
data.save(f'{lang}_databunch')
# Vocabulary size and number of training documents.
len(data.vocab.itos),len(data.train_ds)

In [7]:

# Reload the cached language-model DataBunch instead of re-tokenizing the wiki.
# It was built with `TextList.from_folder(dest)` and saved with a bare file
# name, which fastai resolves against the DataBunch's own path, so it has to
# be loaded from `dest` rather than from `path`.
data = load_data(dest, f'{lang}_databunch', bs=bs)

In [12]:

# AWD-LSTM language model with no pretrained weights (pretrained=False),
# switched to mixed precision for training.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()

In [18]:

# Base rate is quoted for a batch size of 48; scale it linearly to the actual bs.
base_lr = 1e-2
lr = base_lr * (bs/48)

In [20]:

# Train every layer for 10 epochs with the one-cycle schedule; each epoch takes
# roughly half an hour in the log below.
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	time
0	3.436113	3.491434	0.366925	28:52
1	3.441240	3.544118	0.361326	28:33
2	3.571766	3.556932	0.358438	28:31
3	3.510540	3.519243	0.362278	28:27
4	3.447639	3.449320	0.369404	28:29
5	3.412284	3.406376	0.375022	28:20
6	3.286754	3.255309	0.391874	28:19
7	3.172497	3.128522	0.406803	28:37
8	3.126867	3.025249	0.419882	28:36
9	3.128793	2.991077	0.424622	28:39

Save the pretrained model and vocab:

In [75]:

# Store weights and vocab side by side under <path>/models, using the two file
# stems from `lm_fns` so they can be passed later as `pretrained_fnames`.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
wt_stem, vocab_stem = lm_fns
# Convert back to full precision and drop the optimizer state before saving.
learn.to_fp32().save(mdl_path/wt_stem, with_opt=False)
learn.data.vocab.save(mdl_path/f'{vocab_stem}.pkl')

Vietnamese sentiment analysis¶

Language model¶

Data
Competition details
Top 3 f1 scores: 0.900, 0.897, 0.897

In [35]:

# Labelled competition data; rows with a missing comment get the literal text 'NA'.
train_df = pd.read_csv(path/'train.csv')
train_df['comment'] = train_df['comment'].fillna('NA')
train_df.head()

Out[35]:

	id	comment	label
0	train_000000	Dung dc sp tot cam on \nshop Đóng gói sản phẩm...	0
1	train_000001	Chất lượng sản phẩm tuyệt vời . Son mịn nhưng...	0
2	train_000002	Chất lượng sản phẩm tuyệt vời nhưng k có hộp ...	0
3	train_000003	:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...	1
4	train_000004	Lần trước mình mua áo gió màu hồng rất ok mà đ...	1

In [36]:

# Unlabelled competition data; missing comments are replaced with the literal
# text 'NA', as for the training set.
test_df = pd.read_csv(path/'test.csv')
missing_comment = test_df['comment'].isna()
test_df.loc[missing_comment, 'comment'] = 'NA'
test_df.head()

Out[36]:

	id	comment
0	test_000000	Chưa dùng thử nên chưa biết
1	test_000001	Không đáng tiềnVì ngay đợt sale nên mới mua n...
2	test_000002	Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3	test_000003	Vải đẹp.phom oki luôn.quá ưng
4	test_000004	Chuẩn hàng đóng gói đẹp

In [37]:

# Language-model fine-tuning only reads the `comment` text, so pool the
# labelled train rows with the unlabelled test rows to get more in-domain text.
df = pd.concat([train_df,test_df], sort=False)

In [38]:

# DataBunch for fine-tuning the language model on the review comments:
# read the text column, hold out a seeded 10%, label for LM, then batch.
lm_items = TextList.from_df(df, path, cols='comment')
lm_split = lm_items.split_by_rand_pct(0.1, seed=42)
lm_labelled = lm_split.label_for_lm()
data_lm = lm_labelled.databunch(bs=bs, num_workers=1)

In [17]:

# Language model initialised from the wiki-pretrained weights and vocab whose
# file stems are listed in `lm_fns`.
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0)

In [39]:

# Base rate is quoted for a batch size of 48; scale it linearly to the actual bs.
base_lr = 1e-3
lr = base_lr * (bs/48)

In [19]:

# Short two-epoch warm-up at 10x the base rate, before the whole model is
# unfrozen in the next cell.
learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	time
0	4.975080	4.138585	0.317773	00:07
1	4.408635	4.025489	0.326423	00:07

In [20]:

# Fine-tune all layers for 8 epochs at the base rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	time
0	4.142114	3.928278	0.336230	00:09
1	4.010835	3.793583	0.349972	00:09
2	3.873617	3.694702	0.357240	00:09
3	3.761377	3.632186	0.364648	00:09
4	3.679017	3.595601	0.366964	00:09
5	3.614548	3.576386	0.369224	00:09
6	3.575895	3.567496	0.370285	00:09
7	3.560278	3.566525	0.370173	00:10

In [21]:

# Keep the full fine-tuned language model, and separately its encoder, which
# the classifier loads by the same name further down.
learn_lm.save(f'{lang}fine_tuned')
learn_lm.save_encoder(f'{lang}fine_tuned_enc')

Classifier¶

In [40]:

# Classifier DataBunch over the labelled comments. It reuses the language
# model's vocab so the token ids line up with the encoder loaded later.
clas_items = TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
clas_split = clas_items.split_by_rand_pct(0.1, seed=42)
clas_labelled = clas_split.label_from_df(cols='label')
data_clas = clas_labelled.databunch(bs=bs, num_workers=1)

# Cache it so later sections can reload it without re-processing the text.
data_clas.save(f'{lang}_textlist_class')

In [10]:

# Reload the cached classifier DataBunch (alternative to rebuilding it above).
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)

In [41]:

from sklearn.metrics import f1_score

@np_func
def f1(inp,targ):
    """F1 score of the arg-max class predictions in `inp` against labels `targ`.

    `inp` holds per-class scores along the last axis; `np_func` converts the
    tensors to numpy on the way in and the result back to a tensor on the way
    out. During training the value is reported per epoch from batch-level
    results, so it can differ from F1 computed once over the whole validation
    set (compare the training log with the ensemble section).
    """
    return f1_score(targ, np.argmax(inp, axis=-1))

# The training logs print this metric as `_inner`, the name of the wrapper
# that `np_func` returns; give the wrapper the real name so the column reads `f1`.
f1.__name__ = 'f1'

In [43]:

# Mixed-precision AWD-LSTM classifier that starts from the fine-tuned
# language-model encoder saved above; freeze it before the first training pass.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()

In [44]:

# Base rate is quoted for a batch size of 48; scale it linearly to the actual bs.
base_lr = 2e-2
lr = base_lr * (bs/48)

In [45]:

# First two epochs with the learner still frozen.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	_inner	time
0	0.338150	0.275298	0.899876	0.878430	00:02
1	0.302302	0.245949	0.902985	0.877226	00:02

In [46]:

# A second, identical two-epoch cycle before any further layers are unfrozen.
learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	_inner	time
0	0.321768	0.255457	0.899254	0.871367	00:02
1	0.305934	0.250888	0.894901	0.872021	00:02

In [47]:

# Gradual unfreezing: train the last two layer groups, with learning rates
# spread between lr/2.6**4 and lr via the slice.
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4),lr), moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	_inner	time
0	0.300939	0.261080	0.893657	0.866201	00:03
1	0.263790	0.220207	0.906716	0.886115	00:03

In [48]:

# Unfreeze one more layer group and halve both ends of the learning-rate range.
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4),lr/2), moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	_inner	time
0	0.282888	0.238203	0.905473	0.886483	00:04
1	0.248599	0.216489	0.918532	0.901550	00:04

In [49]:

# Final pass over the whole model for one epoch at a tenth of the base range.
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

epoch	train_loss	valid_loss	accuracy	_inner	time
0	0.201508	0.217176	0.911070	0.890084	00:05

In [50]:

# Save the trained forward classifier; the ensemble section reloads it by this name.
learn_c.save(f'{lang}clas')

Competition top 3 f1 scores: 0.900, 0.897, 0.897. The winner used an ensemble of 4 models: TextCNN, VDCNN, HARNN, and SARNN.

Ensemble¶

In [65]:

# Rebuild the forward classifier from the cached DataBunch and the weights
# saved at the end of the classifier section.
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load(f'{lang}clas', purge=False);

In [69]:

# Forward-model predictions, requested in dataset order (ordered=True) so they
# can be combined row by row with the backwards model's predictions below.
preds,targs = learn_c.get_preds(ordered=True)
accuracy(preds,targs),f1(preds,targs)

Out[69]:

(tensor(0.9111), tensor(0.8952))

In [67]:

# Backwards counterpart of the classifier, loaded from saved artifacts.
# NOTE(review): neither `{lang}_textlist_class_bwd` nor `{lang}clas_bwd` is
# created anywhere in the visible notebook; they presumably come from a
# separate backwards training run, so confirm they exist before a full re-run.
data_clas_bwd = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load(f'{lang}clas_bwd', purge=False);

In [70]:

# Backwards-model predictions, again in dataset order.
preds_b,targs_b = learn_c_bwd.get_preds(ordered=True)
accuracy(preds_b,targs_b),f1(preds_b,targs_b)

Out[70]:

(tensor(0.9092), tensor(0.8957))

In [71]:

# Ensemble by averaging the two models' predictions element-wise. This assumes
# both were produced in the same row order; verify the two target tensors
# match if either DataBunch changes.
preds_avg = (preds+preds_b)/2

In [72]:

# Score the averaged predictions against the validation targets.
accuracy(preds_avg,targs_b),f1(preds_avg,targs_b)

Out[72]:

(tensor(0.9154), tensor(0.9016))

In [ ]: