from gensim.models import KeyedVectors
from pathlib import Path

# Directory holding the pre-trained Chinese fastText/word2vec vectors.
mypath = Path("../../../demo-chinese-text-classification-lstm-keras")
# Load word vectors in the textual word2vec format ('zh.vec').
zh_model = KeyedVectors.load_word2vec_format(mypath/'zh.vec')

# Collect every vocabulary entry. gensim 4.x removed `.vocab` in favour of
# `.key_to_index`; prefer the new attribute but fall back to the old one so
# the script runs on both major versions.
vocab = getattr(zh_model, 'key_to_index', None)
if vocab is None:
    vocab = zh_model.vocab
words = list(vocab)
print("预训练模型包含单词总数: {}".format(len(words)))
预训练模型包含单词总数: 50102
# Peek at the first ten vocabulary entries. Bare expression: it only
# displays a value inside a notebook/REPL cell; in a plain script it is a no-op.
words[:10]
['的', '</s>', '在', '是', '年', '和', '了', '於', '為', '有']
# Every vector in the model shares one fixed dimensionality; inspect it
# through the vector of the first vocabulary word.
embedding_dim = len(zh_model[words[0]])
print("词向量维度: {}".format(embedding_dim))
词向量维度: 300
# List the five nearest neighbours of '研究生' in the embedding space,
# together with their cosine similarities.
find_similar_to = '研究生'
neighbours = zh_model.similar_by_word(find_similar_to, topn=5)
for neighbour, score in neighbours:
    print("Word: {0}, Similarity: {1:.2f}".format(neighbour, score))
Word: 硕士, Similarity: 0.69 Word: 博士生, Similarity: 0.69 Word: 本科生, Similarity: 0.69 Word: 硕士生, Similarity: 0.67 Word: 研究生院, Similarity: 0.63
# Classic analogy via vector arithmetic: 男人 + 王后 - 国王 ≈ ?
word_add = ['男人', '王后']
word_sub = ['国王']
analogy_hits = zh_model.most_similar(positive=word_add, negative=word_sub, topn=5)
for hit, similarity in analogy_hits:
    print("Word : {0} , Similarity: {1:.2f}".format(hit, similarity))
Word : 女人 , Similarity: 0.70 Word : 女孩 , Similarity: 0.52 Word : 丈夫 , Similarity: 0.52 Word : 她 , Similarity: 0.50 Word : 女性 , Similarity: 0.49
# Capital-city analogy: 日本 + 巴黎 - 东京 ≈ ? (expecting a country for Paris).
word_add = ['日本', '巴黎']
word_sub = ['东京']
analogy_hits = zh_model.most_similar(positive=word_add, negative=word_sub, topn=5)
for hit, similarity in analogy_hits:
    print("Word : {0} , Similarity: {1:.2f}".format(hit, similarity))
Word : 法國 , Similarity: 0.61 Word : 大利 , Similarity: 0.48 Word : 歐洲 , Similarity: 0.46 Word : 德國 , Similarity: 0.46 Word : 意大利 , Similarity: 0.46
参考 https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec 找出其中最不相关的词,也是向量加减的结果。
# Pick the odd one out: returns the word whose vector is least similar
# to the mean of the others (here it reports '博士').
zh_model.doesnt_match(['研究生','本科','博士'])
D:\install\miniconda\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future. vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
'博士'
在 IPython/Jupyter 中输入 `zh_model.` 之后按 Tab 键(即 `zh_model.*` 补全),
可以查看更多用法。