#!/usr/bin/env python
# coding: utf-8
"""Word Similarity Notes (cleaned-up Jupyter notebook export).

Demonstrates three approaches to word similarity / normalization:

1. Suffix stripping with NLTK's Porter stemmer.
2. Lemmatization and semantic similarity with WordNet.
3. Distributional word vectors trained with gensim's word2vec.

Requires the ``nltk`` and ``gensim`` packages, the NLTK WordNet corpus
(``nltk.download('wordnet')``), and — for the word2vec demo — a directory
of plain-text files (``WOTclean`` in the original notebook).
"""

import logging

import nltk  # noqa: F401 -- kept from the original export; corpus downloads use it
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
# Explicit import instead of the original `from nltk.stem.porter import *`:
# only PorterStemmer is actually used.
from nltk.stem.porter import PorterStemmer

import gensim  # noqa: F401 -- kept from the original export
from gensim.models import word2vec


def demo_stemming():
    """Print Porter-stemmer output for a range of illustrative words.

    The word list reproduces the original notebook cells: regular
    derivations (creation/create/creating), irregular-looking pairs
    (fly/flies, laziness/lazy), and a nonsense word (xyzing) showing
    that the stemmer is purely rule-based, not dictionary-based.
    """
    stemmer = PorterStemmer()
    words = [
        "argumentation",
        "creation",
        "fly",
        "flies",
        "laziness",
        "lazy",
        "traditional",
        "create",
        "creation",
        "creating",
        "decisiveness",
        "public",
        "publicize",
        "xyzing",
    ]
    # As notebook cells these expressions displayed their values; in a
    # script the results must be printed explicitly or they are discarded.
    for word in words:
        print(word, "->", stemmer.stem(word))


def demo_wordnet():
    """Walk through WordNet lookups: synsets, lemmas, antonyms,
    path similarity, hypernym closures, and lemmatization."""
    print(wn.synsets("dog"))

    d = wn.synset("dog.n.01")
    print(d.definition())
    print(d.lemmas())

    # Antonyms are defined on lemmas, not on synsets.
    able = wn.synset('able.a.01')
    print(able.lemmas()[0].antonyms())

    dog = wn.synset("dog.n.01")
    cool = wn.synset("cool.n.01")
    # Similarity based on shortest path in the hypernym/hyponym graph.
    print(dog.path_similarity(cool))

    print(dog.hypernyms())
    # closure() transitively follows the given relation -> full hypernym chain.
    print(list(dog.closure(lambda s: s.hypernyms())))
    print(list(cool.closure(lambda s: s.hypernyms())))

    # Lemmatization needs the part of speech to pick the right base form.
    wnl = WordNetLemmatizer()
    print(wnl.lemmatize('running', wn.VERB))
    print(wnl.lemmatize('better', wn.ADJ))
    print(wnl.lemmatize('oxen', wn.NOUN))
    print(wnl.lemmatize('geese', wn.NOUN))


def demo_word2vec(corpus_dir="WOTclean"):
    """Train a word2vec model on the text files under *corpus_dir*
    and print a few nearest-neighbor / analogy queries.

    Returns the trained model so callers can run further queries.
    """
    logging.basicConfig(format="%(asctime)s: %(levelname)s : %(message)s",
                        level=logging.INFO)

    # PathLineSentences streams one sentence per line from every file
    # in the directory, so the corpus never has to fit in memory.
    sentences = word2vec.PathLineSentences(corpus_dir)

    # NOTE(review): `size` is the pre-4.0 gensim keyword; gensim >= 4.0
    # renamed it to `vector_size` and raises TypeError on `size`.
    # Confirm the installed gensim version before running.
    model = word2vec.Word2Vec(sentences, size=200)

    print(model.wv.most_similar("sleep"))
    print(model.wv.most_similar("sword"))
    # Classic analogy query: king - man + woman ~ queen (corpus permitting).
    print(model.wv.most_similar(positive=["king", "woman"], negative=["man"]))
    return model


if __name__ == "__main__":
    demo_stemming()
    demo_wordnet()
    demo_word2vec()