You can install gensim as follows:
pip install --upgrade gensim
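To check that the install worked and to see which version you got (the snippets below assume a recent gensim 4.x, which is what --upgrade should pull in), a quick check from Python:
import gensim
print(gensim.__version__)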
Here is some simple starter code for word2vec.
from gensim.models import Word2Vec
import gensim
from sklearn.decomposition import PCA
from matplotlib import pyplot
import logging
# enable INFO-level logging so gensim reports progress during training
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
['this', 'is', 'the', 'second', 'sentence'],
['yet', 'another', 'sentence'],
['one', 'more', 'sentence'],
['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
model.wv['first']
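The last line returns the learned embedding for 'first' (a 100-dimensional numpy array with the default vector_size). A few more lookups we can try on this toy model as a quick sanity check; the similarity numbers themselves mean little on such a tiny corpus:
print(model.wv['first'].shape)                     # (100,) with the default vector_size
print(model.wv.similarity('this', 'is'))           # cosine similarity between two words
print(model.wv.most_similar('sentence', topn=3))   # nearest neighbours of 'sentence'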
We can reduce the dimensionality to two with PCA, and plot the words as below:
X = model.wv[model.wv.index_to_key]  # all word vectors (model.wv.vocab was removed in gensim 4.x)
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.index_to_key)  # vocabulary, in the same order as the vectors above
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
We can also train a model on a larger corpus read from a plain-text file, treating each line as one sentence:
with open('../data/vldb.txt', 'r') as txtfile:
    sentences = [line.lower().strip().split(' ') for line in txtfile]
model = gensim.models.Word2Vec(sentences, min_count=2, epochs=5)  # 'iter' was renamed to 'epochs' in gensim 4.x
We can then ask for the words most similar to a given term, and evaluate the vectors on the standard analogy test set:
test = 'query'
print("words similar to '" + test + "':\t" + str(model.wv.most_similar(test)))
model.wv.evaluate_word_analogies('../data/questions-words.txt')  # replaces model.accuracy(), removed in gensim 4.x
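evaluate_word_analogies returns an overall accuracy plus per-section detail; a minimal sketch for inspecting them (assuming the questions-words.txt file is at the path used above):
score, sections = model.wv.evaluate_word_analogies('../data/questions-words.txt')
print('overall analogy accuracy: %.3f' % score)
for section in sections:
    print(section['section'], len(section['correct']), len(section['incorrect']))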
We can also explore word senses and the hypernym hierarchy with WordNet through NLTK (run nltk.download('wordnet') once if the corpus is not yet downloaded):
from nltk.corpus import wordnet as wn
wn.synsets('car')[0].lemma_names()  # lemmas of the first sense of 'car'
wn.synsets('car')[1].lemma_names()  # ... of the second sense
wn.synsets('car')[3].lemma_names()  # ... of the fourth sense
panda = wn.synset('panda.n.01')
hyper = lambda s: s.hypernyms()     # map a synset to its direct hypernyms
list(panda.closure(hyper))          # transitive closure: all hypernyms up to the root
[Synset('procyonid.n.01'), Synset('carnivore.n.01'),
Synset('placental.n.01'), Synset('mammal.n.01'),
Synset('vertebrate.n.01'), Synset('chordate.n.01'),
Synset('animal.n.01'), Synset('organism.n.01'),
Synset('living_thing.n.01'), Synset('whole.n.02'),
Synset('object.n.01'), Synset('physical_entity.n.01'),
Synset('entity.n.01')]
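The hypernym hierarchy can also be used to quantify how related two synsets are; a small sketch using two standard WordNet methods, path_similarity and lowest_common_hypernyms (the synsets chosen here are just illustrative):
car = wn.synset('car.n.01')
truck = wn.synset('truck.n.01')
print(car.lowest_common_hypernyms(truck))  # the closest shared ancestor synset
print(car.path_similarity(truck))          # similarity in (0, 1] based on the shortest path between them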