from gensim.models import KeyedVectors
from pathlib import Path

# Directory holding the pre-trained Chinese fastText/word2vec vectors.
mypath = Path("../../../demo-chinese-text-classification-lstm-keras")
# Load word vectors in the textual word2vec format ('zh.vec').
zh_model = KeyedVectors.load_word2vec_format(mypath/'zh.vec')

# Collect every vocabulary entry. gensim 4.x removed `.vocab` in favour of
# `.key_to_index`; prefer the new attribute but fall back to the old one so
# the script runs on both major versions.
vocab = getattr(zh_model, 'key_to_index', None)
if vocab is None:
    vocab = zh_model.vocab
words = list(vocab)
print("预训练模型包含单词总数: {}".format(len(words)))
预训练模型包含单词总数: 50102
# Peek at the first ten vocabulary entries. Bare expression: it only
# displays a value inside a notebook/REPL cell; in a plain script it is a no-op.
words[:10]
['的', '</s>', '在', '是', '年', '和', '了', '於', '為', '有']
# Every vector in the model shares one fixed dimensionality; inspect it
# through the vector of the first vocabulary word.
embedding_dim = len(zh_model[words[0]])
print("词向量维度: {}".format(embedding_dim))
词向量维度: 300
# List the five nearest neighbours of '研究生' in the embedding space,
# together with their cosine similarities.
find_similar_to = '研究生'
neighbours = zh_model.similar_by_word(find_similar_to, topn=5)
for neighbour, score in neighbours:
    print("Word: {0}, Similarity: {1:.2f}".format(neighbour, score))
Word: 硕士, Similarity: 0.69 Word: 博士生, Similarity: 0.69 Word: 本科生, Similarity: 0.69 Word: 硕士生, Similarity: 0.67 Word: 研究生院, Similarity: 0.63
# Classic analogy via vector arithmetic: 男人 + 王后 - 国王 ≈ ?
word_add = ['男人', '王后']
word_sub = ['国王']
analogy_hits = zh_model.most_similar(positive=word_add, negative=word_sub, topn=5)
for hit, similarity in analogy_hits:
    print("Word : {0} , Similarity: {1:.2f}".format(hit, similarity))
Word : 女人 , Similarity: 0.70 Word : 女孩 , Similarity: 0.52 Word : 丈夫 , Similarity: 0.52 Word : 她 , Similarity: 0.50 Word : 女性 , Similarity: 0.49
# Capital-city analogy: 日本 + 巴黎 - 东京 ≈ ? (expecting a country for Paris).
word_add = ['日本', '巴黎']
word_sub = ['东京']
analogy_hits = zh_model.most_similar(positive=word_add, negative=word_sub, topn=5)
for hit, similarity in analogy_hits:
    print("Word : {0} , Similarity: {1:.2f}".format(hit, similarity))
Word : 法國 , Similarity: 0.61 Word : 大利 , Similarity: 0.48 Word : 歐洲 , Similarity: 0.46 Word : 德國 , Similarity: 0.46 Word : 意大利 , Similarity: 0.46
参考 https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec 找出其中最不相关的词,也是向量加减的结果。
# Pick the odd one out: returns the word whose vector is least similar
# to the mean of the others (here it reports '博士').
zh_model.doesnt_match(['研究生','本科','博士'])
D:\install\miniconda\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future. vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
'博士'
在 IPython/Jupyter 中输入 `zh_model.` 之后按 Tab 键(即 `zh_model.*` 补全),
可以查看更多用法。