#!/usr/bin/env python
# coding: utf-8
"""Explore a pretrained Chinese word2vec model with gensim.

Jupyter-notebook export: loads text-format word vectors, then demonstrates
vocabulary inspection, nearest-neighbour lookup, word-vector arithmetic
(analogies), and odd-one-out detection.
"""

from gensim.models import KeyedVectors
from pathlib import Path

# Directory holding the pretrained vector file `zh.vec`.
mypath = Path("../../../demo-chinese-text-classification-lstm-keras")

# Load word2vec/fastText text-format vectors (header line: count + dim).
zh_model = KeyedVectors.load_word2vec_format(mypath / 'zh.vec')

# NOTE(review): `.vocab` was removed in gensim 4.x; there the equivalent is
# `zh_model.key_to_index` — confirm which gensim version is installed.
# list(...) replaces the original manual append loop.
words = list(zh_model.vocab)

# "Total number of words in the pretrained model"
print("预训练模型包含单词总数: {}".format(len(words)))

# In the notebook this bare expression auto-displayed; as a script it must
# be printed explicitly or the value is discarded.
print(words[:10])

# "Word-vector dimensionality"
print("词向量维度: {}".format(len(zh_model[words[0]])))

find_similar_to = '研究生'  # "graduate student"

# Top-5 nearest neighbours by cosine similarity; unpack (word, score) pairs
# instead of indexing the tuple.
for word, similarity in zh_model.similar_by_word(find_similar_to, topn=5):
    print("Word: {0}, Similarity: {1:.2f}".format(word, similarity))

# Analogy: 男人 (man) + 王后 (queen) - 国王 (king) ≈ ?
word_add = ['男人', '王后']
word_sub = ['国王']

for word, similarity in zh_model.most_similar(
    positive=word_add, negative=word_sub, topn=5
):
    print("Word : {0} , Similarity: {1:.2f}".format(word, similarity))

# Analogy: 日本 (Japan) + 巴黎 (Paris) - 东京 (Tokyo) ≈ ? (capital-city analogy)
word_add = ['日本', '巴黎']
word_sub = ['东京']

for word, similarity in zh_model.most_similar(
    positive=word_add, negative=word_sub, topn=5
):
    print("Word : {0} , Similarity: {1:.2f}".format(word, similarity))

# Reference:
# https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec
# Find the word least related to the others — also a vector-arithmetic result.
# Printed explicitly (the notebook auto-displayed it).
print(zh_model.doesnt_match(['研究生', '本科', '博士']))

# Explore `zh_model.*` for more available methods.