In [1]:

import pandas as pd

In [3]:

from pathlib import Path

In [4]:

# Base directory for the input data: the current working directory, kept as a
# relative path (Path() with no arguments is the same as Path(".")).
mypath = Path()

In [5]:

# Read the review data set (columns `comment` and `sentiment`) from the data
# directory into a DataFrame.
df = pd.read_csv(mypath.joinpath('dianping.csv'))

In [6]:

# Preview the first five rows of the raw data.
df.head(n=5)

Out[6]:

	comment	sentiment
0	口味：不知道是我口高了，还是这家真不怎么样。我感觉口味确实很一般很一般。上菜相当快，我敢说...	0
1	菜品丰富质量好，服务也不错！很喜欢！	1
2	说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！	0
3	菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...	1
4	先说我算是娜娜家风荷园开业就一直在这里吃每次出去回来总想吃一回有时觉得外面的西式简餐总是...	1

In [7]:

# Print the column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
comment      2000 non-null object
sentiment    2000 non-null int64
dtypes: int64(1), object(1)
memory usage: 31.4+ KB

In [8]:

# Requires jieba. If it is missing, install it once into the kernel's
# environment with:  %pip install jieba

In [9]:

import jieba

In [10]:

def _segment(review):
    """Return `review` segmented by jieba, with the tokens joined by single spaces."""
    return " ".join(jieba.cut(review))


# Store the space-separated segmentation of every comment in a new `text` column.
df['text'] = df['comment'].apply(_segment)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LIJIAX~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.926 seconds.
Prefix dict has been built succesfully.

In [11]:

# Preview the first five rows again, now including the segmented `text` column.
df.head(n=5)

Out[11]:

	comment	sentiment	text
0	口味：不知道是我口高了，还是这家真不怎么样。我感觉口味确实很一般很一般。上菜相当快，我敢说...	0	口味：不知道是我口高了，还是这家真不怎么样。我感觉口味 ...
1	菜品丰富质量好，服务也不错！很喜欢！	1	菜品丰富质量好，服务也不错！很喜欢！
2	说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！	0	说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！
3	菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃出了大阪烧的味道，想吃土豆片也是口...	1	菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃...
4	先说我算是娜娜家风荷园开业就一直在这里吃每次出去回来总想吃一回有时觉得外面的西式简餐总是...	1	先说我算是娜娜家风荷园开业就一直在这里吃每次出去回来总想 ...

In [42]:

# Write the segmented reviews as a plain-text corpus: one review per line,
# tokens separated by single spaces.
# NOTE: the lines are written by hand rather than with DataFrame.to_csv because
# CSV quoting wraps any review containing an ASCII comma, a double quote or a
# line break in '"' characters, and those quote marks would then be glued onto
# tokens when the file is read back as a whitespace-separated corpus.
corpus_path = Path("output/dianping_cut.csv")
# No visible cell creates the output directory, so make sure it exists.
corpus_path.parent.mkdir(parents=True, exist_ok=True)
with corpus_path.open("w", encoding="utf-8") as corpus_file:
    for segmented in df['text']:
        # split()/join collapses whitespace tokens and embedded line breaks so
        # that every review occupies exactly one line.
        corpus_file.write(" ".join(str(segmented).split()) + "\n")

In [13]:

# Preview only the columns used from here on. The recorded output of this cell
# shows just `text` and `sentiment`, but no visible cell drops `comment`, so the
# columns are selected explicitly to make the preview reproducible on a fresh
# kernel. `df` itself is left unchanged.
df[['text', 'sentiment']].head()

Out[13]:

	text	sentiment
0	口味：不知道是我口高了，还是这家真不怎么样。我感觉口味 ...	0
1	菜品丰富质量好，服务也不错！很喜欢！	1
2	说真的，不晓得有人排队的理由，香精香精香精香精，拜拜！	0
3	菜量实惠，上菜还算比较快，疙瘩汤喝出了秋日的暖意，烧茄子吃...	1
4	先说我算是娜娜家风荷园开业就一直在这里吃每次出去回来总想 ...	1

In [34]:

# Requires gensim. If it is missing, install it once into the kernel's
# environment with:  %pip install gensim

In [35]:

import gensim

In [36]:

# Load the previously saved Word2Vec model that will be trained further.
# NOTE(review): gensim's load() unpickles the file, so only point this at a
# model file from a trusted source.
pretrained_path = 'refs/zh/zh.bin'
model = gensim.models.Word2Vec.load(pretrained_path)

In [48]:

# Load the segmented reviews as an additional training corpus.
# LineSentence treats every line of the file as one sentence of
# whitespace-separated tokens, which matches how the corpus file is written
# (one review per row). Text8Corpus is meant for the single-line "text8" dump
# and ignores line breaks, so it would merge unrelated reviews into the same
# training sentence.
more_sentences = gensim.models.word2vec.LineSentence("output/dianping_cut.csv")

ValueError: You must specify either total_examples or total_words, for proper job parameters updationand progress calculations. The usual value is total_examples=model.corpus_count.

In [67]:

# Extend the loaded model's vocabulary with the words of the new corpus
# (update=True adds to the existing vocabulary instead of rebuilding it).
model.build_vocab(more_sentences, update=True)
# Read the example count only after build_vocab has run, exactly as the
# one-line form did; it is passed to train() as total_examples.
n_examples = model.corpus_count
# One pass over the new corpus. Kept as the last expression so the cell still
# displays the value returned by train().
model.train(more_sentences, total_examples=n_examples, epochs=1)

Out[67]:

(57687, 114396)

In [68]:

# Persist the further-trained model so it can be reloaded later with
# Word2Vec.load().
trained_model_path = "output/word2vec_online_training.model"
model.save(trained_model_path)