# 将评论从数据库读入 from sqlalchemy import create_engine engine = create_engine('oracle://user:password@bi_data') import pandas as pd df = pd.read_sql_query(str_sql, engine) df.shape df.head() import jieba def cutword(x): if isinstance(x, str): x = x.decode('utf8') # 解码为unicode x = unicode(x) #将数字变字符 seg = jieba.cut(x) return ' '.join(seg) # cutword(string) x = '中国' #外部输入 y = x.decode('utf8') #解码为unicode 在python内部处理 z = y.encode('utf8') #编码为str 再输出 df['seg_word'] = df.good_cntnt.map(cutword) df.head() txt = df['seg_word'].values txtlist = [] for x in txt: txtlist.append(x.split()) import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO) num_features = 300 min_word_count = 10 num_workers = 4 context = 10 downsampling = 1e-3 from gensim.models import word2vec print "traing model..." model = word2vec.Word2Vec(txtlist, workers = num_workers, size= num_features, min_count=min_word_count, window = context, sample = downsampling) model.init_sims(replace=True) model_name = 'allcomword2vec' model.save(model_name) for word, word_simi in model.most_similar(u'丰满', topn=20): print word, word_simi