Word2Vec

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import *
In [5]:
# Fetch the 40 chapter pages from the web and save each chapter's text as a
# UTF-8 file named sdyxz_NNN.txt. This is network-bound and takes a while.
for i in range(1, 41):
    print("get chapter", i)
    r = requests.get('http://www.millionbook.net/wx/j/jingyong/sdyxz/%03d.htm' % i,
                     timeout=30)  # do not hang forever on a stalled connection
    r.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    r.encoding = 'big5'  # decode the page body as Big5 when reading r.text

    # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
    # Pinning one may change which <td> ends up at index 6 — confirm before changing.
    bs = BeautifulSoup(r.text)
    cells = bs.findAll('td')
    # The chapter body is taken from the 7th <td>; treat a page with fewer cells
    # as a failed download rather than raising IndexError.
    text = cells[6].get_text() if len(cells) > 6 else ''
    # Validate BEFORE writing, so a truncated or bogus page never lands on disk
    # where the segmentation cells would pick it up.
    if len(text) < 100:
        print("error")
        break
    with open('sdyxz_%03d.txt' % i, 'wb') as f:
        f.write(text.encode('utf8'))
    print(i, len(text))
get chapter 1
1 27295
get chapter 2
2 26230
get chapter 3
3 22758
get chapter 4
4 23901
get chapter 5
5 21866
get chapter 6
6 26943
get chapter 7
7 26573
get chapter 8
8 16102
get chapter 9
9 24520
get chapter 10
10 17063
get chapter 11
11 28674
get chapter 12
12 30965
get chapter 13
13 31169
get chapter 14
14 23027
get chapter 15
15 18650
get chapter 16
16 24110
get chapter 17
17 18694
get chapter 18
18 24778
get chapter 19
19 21659
get chapter 20
20 13610
get chapter 21
21 29794
get chapter 22
22 27007
get chapter 23
23 22335
get chapter 24
24 21656
get chapter 25
25 29423
get chapter 26
26 19263
get chapter 27
27 19011
get chapter 28
28 21061
get chapter 29
29 21848
get chapter 30
30 21060
get chapter 31
31 25741
get chapter 32
32 19333
get chapter 33
33 10936
get chapter 34
34 24191
get chapter 35
35 21361
get chapter 36
36 21630
get chapter 37
37 22814
get chapter 38
38 18084
get chapter 39
39 18102
get chapter 40
40 22251
In [13]:
import jieba
# Make jieba use the dictionary file at this relative path instead of its default one.
# NOTE(review): presumably the larger dictionary with traditional-Chinese coverage — confirm.
jieba.set_dictionary('../jieba/extra_dict/dict.txt.big')
In [20]:
# Word segmentation: for every chapter, join the jieba tokens with single spaces
# and append the result, followed by a newline, to sdyxz_cut.txt.
# Both files are opened as UTF-8 explicitly: the chapter files were written as
# UTF-8 bytes, and open()'s platform default encoding is not guaranteed to match.
with open("sdyxz_cut.txt", "w", encoding="utf8") as outf:
    for i in range(1, 41):
        with open("sdyxz_%03d.txt" % i, encoding="utf8") as f:
            txt = f.read()
        outf.write(" ".join(jieba.cut(txt)) + "\n")
In [275]:
# Character segmentation: for every chapter, put a single space between all
# characters and append the result, followed by a newline, to sdyxz_char.txt.
# Both files are opened as UTF-8 explicitly: the chapter files were written as
# UTF-8 bytes, and open()'s platform default encoding is not guaranteed to match.
with open("sdyxz_char.txt", "w", encoding="utf8") as outf:
    for i in range(1, 41):
        with open("sdyxz_%03d.txt" % i, encoding="utf8") as f:
            txt = f.read()
        outf.write(" ".join(txt) + "\n")
In [3]:
import word2vec

訓練

In [1]:
%%bash
# Train word vectors on the word-segmented corpus (sdyxz_cut.txt) with the
# word2vec command-line tool and write them to sdyxz.bin.
# Flags: -cbow 1 selects the CBOW architecture, -size 100 the vector dimension,
# -window 8 the context window, -negative 3 the number of negative samples,
# -hs 0 disables hierarchical softmax, -sample 1e-4 is the frequent-word
# subsampling threshold, -binary 1 writes binary output, -iter 25 runs 25 passes.
../word2vec/word2vec -train sdyxz_cut.txt -output sdyxz.bin -cbow 1 -size 100 -window 8 -negative 3 -hs 0 -sample 1e-4  -binary 1 -iter 25
# Alternative configuration kept for reference (skip-gram, hierarchical softmax on, not run):
# ../word2vec/word2vec -train sdyxz_cut.txt -output sdyxz.bin -cbow 0 -size 100 -window 10 -negative 5 -hs 1 -sample 1e-4 -binary 1 -iter 25
Starting training using file sdyxz_cut.txt
Vocab size: 10051
Words in train file: 525736
Alpha: 0.000101  Progress: 99.90%  Words/thread/sec: 662.92k  
In [4]:
# Load the binary vectors that the training cell wrote to sdyxz.bin.
model = word2vec.load('sdyxz.bin')
In [5]:
# Work around the word2vec package's encoding problem on Python 3: each vocab
# entry holds the UTF-8 bytes of a word misread one byte per character, so
# re-encode it to bytes and decode those bytes as UTF-8.
for i in range(model.vocab.shape[0]):
    try:
        model.vocab[i] = bytes(model.vocab[i], "iso8859").decode('utf8')
    except UnicodeEncodeError:
        # The entry contains characters outside the single-byte range, i.e. it
        # was already repaired by an earlier run of this cell; leave it alone so
        # the cell can be re-run safely.
        pass
In [48]:
# Look up one word's vector and show its shape; the recorded output is (100,).
model['黃藥師'].shape
Out[48]:
(100,)
In [44]:
# A thin wrapper around the analogy API of the module-level word2vec `model`.
class w:
    """Signed collection of words that supports ``+``, binary ``-`` and unary ``-``.

    ``pos`` holds the words that count positively and ``neg`` the words that
    count negatively. Both lists are passed to ``model.analogy`` when the
    expression is evaluated (explicitly via ``analogy`` or implicitly via
    ``repr``). Operators always return a new ``w`` and never share list objects
    with their operands.
    """

    def __init__(self, word=None):
        """Start with ``word`` as the only positive term; a falsy ``word`` gives an empty expression."""
        self.pos = [word] if word else []
        self.neg = []

    def __add__(self, other):
        """Return a new expression with the positives and negatives of both operands."""
        rtn = w()
        rtn.pos = self.pos + other.pos
        rtn.neg = self.neg + other.neg
        return rtn

    def __sub__(self, other):
        """Return ``self`` plus the sign-flipped ``other`` (its positives become negatives and vice versa)."""
        rtn = w()
        rtn.pos = self.pos + other.neg
        rtn.neg = self.neg + other.pos
        return rtn

    def __neg__(self):
        """Return a new expression with ``pos`` and ``neg`` swapped."""
        rtn = w()
        # Copy the lists so mutating the result cannot change this operand.
        rtn.pos = list(self.neg)
        rtn.neg = list(self.pos)
        return rtn

    def analogy(self, n=6):
        """Query ``model.analogy`` with this expression and return the result as a list.

        Args:
            n: number of matches to request from the model (default 6).

        Returns:
            ``model.generate_response(indexes, metrics).tolist()`` for the
            indexes and metrics returned by ``model.analogy``.
        """
        # Forward the caller's n; it used to be hard-coded to 6 here.
        indexes, metrics = model.analogy(pos=self.pos, neg=self.neg, n=n)
        return model.generate_response(indexes, metrics).tolist()

    def __repr__(self):
        """Evaluate the expression with the default ``n`` and show one match per line."""
        return "\n".join(map(repr, self.analogy()))
In [47]:
# Single positive term and no negatives; the cell output is the repr, i.e. the default 6 matches.
w('黃藥師')
Out[47]:
('島主', 0.5984806170331604)
('黃老邪', 0.5916486678128985)
('尹志平', 0.5614272741067268)
('譚處端', 0.5553697654198075)
('女兒', 0.5502672435813846)
('全真諸', 0.5376463421605312)
In [46]:
# Builds pos=['西毒', '洪七公'], neg=['北丐']: "北丐 is to 洪七公 as 西毒 is to ?"
w('西毒')+(w('洪七公')-w('北丐'))
Out[46]:
('歐陽鋒', 0.3650298341807873)
('老毒物', 0.3435998047170618)
('歐陽克', 0.3078496656836964)
('周伯通', 0.2807544170565771)
('水裡', 0.27852062510225944)
('叔侄', 0.26781293274696916)
In [12]:
# Builds pos=['郭靖', '穆念慈'], neg=['楊康']: "楊康 is to 穆念慈 as 郭靖 is to ?"
w('郭靖')+(w('穆念慈')-w('楊康'))
Out[12]:
('著黃蓉', 0.21718189141105892)
('郭靖依', 0.21610486987467345)
('黃蓉甚', 0.20784901791674296)
('悄聲', 0.20521029797827256)
('靖哥哥', 0.20191146537137747)
('跟我來', 0.2014792390693111)
In [20]:
# Builds pos=['洪七公', '蛤蟆功'], neg=['歐陽鋒']: "歐陽鋒 is to 蛤蟆功 as 洪七公 is to ?"
w('洪七公')+(w('蛤蟆功')-w('歐陽鋒'))
Out[20]:
('降龍十八掌', 0.2552881036436998)
('拳法', 0.2521575454301508)
('靈蛇', 0.24790429155069504)
('空明拳', 0.24610664877535432)
('家傳', 0.24534729641898687)
('落英', 0.2444743833030178)
In [13]:
# Builds pos=['一燈', '華箏'], neg=['郭靖']: "郭靖 is to 華箏 as 一燈 is to ?"
w("一燈")+(w("華箏")-w("郭靖"))
Out[13]:
('劉貴妃', 0.2660872834033755)
('娘娘', 0.24960421810275132)
('皇爺', 0.24301325571048435)
('求', 0.233786088129341)
('一燈大師', 0.23363971901381228)
('周師兄', 0.2319470934399664)
In [15]:
# Builds pos=['黃藥師', '老頑童'], neg=['周伯通']: "周伯通 is to 老頑童 as 黃藥師 is to ?"
w("黃藥師")+(w("老頑童")-w("周伯通"))
Out[15]:
('島主', 0.28487068061937276)
('樣人', 0.2612298228573873)
('黃老邪', 0.23008248937428008)
('打傷', 0.22589371429001315)
('東邪西毒', 0.22551669084345655)
('門下', 0.21995662175751834)
In [26]:
# Sanity check: the same word ends up in both pos and neg; every recorded score is 0.0.
w('郭靖')-w('郭靖')
Out[26]:
('千顆', 0.0)
('出家', 0.0)
('從中', 0.0)
('竟爾', 0.0)
('提氣', 0.0)
('攔在', 0.0)
In [27]:
# Evaluated left to right: pos=['丘處機', '桃花島'], neg=['黃藥師'].
w('丘處機') + w('桃花島') - w('黃藥師')  
Out[27]:
('法華寺', 0.29907936554654113)
('嘉興', 0.2783290378430419)
('醉仙', 0.23945599674747373)
('途中', 0.23915954765455877)
('赴', 0.23903619713705887)
('枯木', 0.23610186709883954)
In [30]:
# Builds pos=['張阿生', '梅超風'], neg=['陳玄風']: "陳玄風 is to 梅超風 as 張阿生 is to ?"
w('張阿生')+(w('梅超風')-w("陳玄風"))
Out[30]:
('一邊', 0.19663897901568603)
('韓小瑩', 0.19208677825907192)
('分開', 0.19046146543396983)
('七妹', 0.1865636225769724)
('全金髮', 0.18289168541546988)
('瞎', 0.18138315415877226)
In [31]:
# Builds pos=['鐵杖'], neg=['柯鎮惡']: the difference term reused in a later cell.
w('鐵杖')-w("柯鎮惡")
Out[31]:
('架', 0.24815537354466016)
('鐵鞭', 0.23497734838718995)
('挑', 0.23289480857042413)
('雙拳', 0.23026719529608536)
('絕學', 0.22867996945426122)
('猛往', 0.2253477587965188)
In [35]:
# Builds pos=['丘處機', '鐵杖'], neg=['柯鎮惡']: "柯鎮惡 is to 鐵杖 as 丘處機 is to ?"
w('丘處機') + (w('鐵杖')-w("柯鎮惡"))
Out[35]:
('長劍', 0.2948353630081925)
('焦木', 0.2848048285219636)
('鞭', 0.27993191149191426)
('金龍', 0.27719835118620356)
('玉陽子', 0.2734484004655094)
('毒龍', 0.2655044195707978)
In [34]:
# Builds pos=['空明拳'], neg=['周伯通'].
w('空明拳')-w('周伯通')
Out[34]:
('拳術', 0.39595206738723543)
('所用', 0.38666557839973925)
('鋒銳', 0.3817911588665484)
('絕技', 0.3806570219055856)
('世代相傳', 0.3803317556327324)
('變化', 0.37597148091546695)
In [33]:
# Builds pos=['降龍十八掌'], neg=['郭靖']: the difference term reused in the next cell.
w('降龍十八掌')-w("郭靖")
Out[33]:
('神龍', 0.40089897653772694)
('擺尾', 0.38682934751332015)
('絕招', 0.381673550331461)
('一門', 0.3772147314940896)
('絕技', 0.37238382355287425)
('空明拳', 0.36971696866362064)
In [37]:
# Builds pos=['周伯通', '降龍十八掌'], neg=['郭靖']: "郭靖 is to 降龍十八掌 as 周伯通 is to ?"
w('周伯通') +(w('降龍十八掌')-w("郭靖"))
Out[37]:
('空明拳', 0.35230648071240706)
('神龍', 0.3145260867056793)
('一門', 0.31274957131331665)
('真經', 0.3107111801368211)
('學到', 0.3106500276122125)
('五行', 0.3104107807599111)
In [38]:
# Builds pos=['鐵掌', '周伯通'], neg=['空明拳']: "空明拳 is to 周伯通 as 鐵掌 is to ?"
w("鐵掌") + (w('周伯通')-w("空明拳"))
Out[38]:
('裘千仞', 0.3058085633387295)
('鬧', 0.3004068133335405)
('玄虛', 0.26528700623276036)
('瞧見', 0.2523197144010738)
('山腰', 0.24236818973633265)
('幫', 0.2399828760553166)
In [39]:
# Evaluated left to right: pos=['吃', '酒'], neg=['飯']: "飯 is to 吃 as 酒 is to ?"
w("吃")-w("飯")+w("酒")
Out[39]:
('喝', 0.27999872853024893)
('缸', 0.24211561451680702)
('銅缸', 0.2366720185209148)
('碗', 0.23213567936195664)
('一碗', 0.22903419497542712)
('大碗', 0.22728245512485276)
In [40]:
# Unary minus first, then left to right: pos=['飯', '喝'], neg=['吃']: "吃 is to 飯 as 喝 is to ?"
-w("吃")+w("飯")+w("喝")
Out[40]:
('一杯', 0.28023847500182447)
('杯酒', 0.27425285288771795)
('斟', 0.2738747428848062)
('半碗', 0.2704340019193841)
('酒', 0.2617582670673162)
('一缸', 0.25509365409056856)
In [ ]: