#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

from __future__ import print_function, division
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d

import numpy
import scipy
import scipy.spatial
import pandas

import gensim
import re


# In[2]:

def plot_vecs(corpus, query=None, labels=None):
    fig = plt.figure(figsize=(4, 4))
    c = numpy.array(corpus)

    if len(c[0]) == 2:
        ax = plt.gca()
        ax.quiver([0]*len(c), [0]*len(c), c[:, 0], c[:, 1],
                  angles='xy', scale_units='xy', scale=1)
        if query:
            q = numpy.asarray(query)
            ax.quiver([0], [0], q[0], q[1],
                      angles='xy', scale_units='xy', scale=1, color='r')
        ax.set_xlim([-0.1, c.max()])
        ax.set_ylim([-0.1, c.max()])
    else:
        ax = plt.gca(projection='3d')
        for q in c:
            ax.quiver(0, 0, 0, q[0], q[1], q[2],
                      length=numpy.linalg.norm(q), arrow_length_ratio=0.1,
                      pivot='tail', color='black')
        if query:
            q = numpy.asarray(query)
            ax.quiver(0, 0, 0, q[0], q[1], q[2],
                      length=numpy.linalg.norm(q), arrow_length_ratio=0.1,
                      pivot='tail', color='r')
        ax.set_xlim([0, c.max()])
        ax.set_ylim([0, c.max()])
        ax.set_zlim([0, c.max()])

    lang = labels
    if lang:
        if len(lang) >= 3:
            ax.set_zlabel(lang[2], fontsize=20)
        if len(lang) >= 2:
            ax.set_ylabel(lang[1], fontsize=20)
        if len(lang) >= 1:
            ax.set_xlabel(lang[0], fontsize=20)

    fig.tight_layout()
    plt.show()


# # Building Search Engines with Gensim
#
# ![tripy](./images/tripython-banner.png)

# - PhD candidate @ University of Alabama
#     - Roll tide 🌀🌊🌀🐘🏈💯
# - Search applications for software maintenance tasks
#     - Code search
#     - Change request triage
# - Currently visiting researcher @ ABB
# - Looking for new adventures in 2016!

# - Searching through large code bases using more than keyword search
# - Searching for developers most apt to handle a bug/issue report

# - I have a Twitter ✌️problem✌️
#     - I tweet really dumb things sometimes
# - I am looking for a job

# My advisor says I have a Twitter problem. I love Twitter and treat it seriously and facetiously at the same time. Also, jobs.
#
# These things don't mix well, apparently.
#
# Admitted to being intoxicated and breaking the law. Whoops.
#
# Do employers want to hire someone that drinks at noon?
#

# > I was apparently spotted on the local news being a drunk Alabama fan that flooded the streets after the win last night. Whoops.
# >
# > — christop corley (@excsc) January 10, 2012
#
# > instead of lunch I am just going to have a bunch of beers and eat pretzels
# >
# > — christop corley (@excsc) September 29, 2014
# **CLARIFICATION: I do not "have a drinking problem," as my mother likes to claim. I am probably actually quite sober right now!**

# # Let's make a search engine

# So I have a lot of tweets; let's make a search engine to find all of these bad tweets for deletion.

# Lots of different methods exist for making search engines, but there are mainly two essential behaviors we want:
#
# `search("drunk") => [tweets containing the word 'drunk']`
#
# `search("drunk") => [tweets *related* to the word 'drunk']`

# In[3]:

import gensim

# I'm a `gensim` contributor.
#
# `gensim`'s ✌️design✌️ is not always great; certainly better tools exist that can supplement or replace parts of this talk, e.g.:
#
# 1. `whoosh`
# 2. `nltk`
# 3. `scikit-learn`

# OMG WHAT ABOUT GREP?

# ### `grep` is great, but
#
# - must search through all documents
# - not aware of language-specific semantics
# - does not rank by relevance

# 1. The first relates to efficiency (speed): we want results fast!
# 2. The second relates to effectiveness (quality): we want to avoid false positives.
# 3. The third relates to effectiveness (quality): we want things ordered by relevance.

# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!

# These are the basic steps, all of which gensim can help with. However, gensim's most powerful features fall out of number 2!
#
# Gensim has some powerful models that can help with many tasks requiring measuring document similarity, not just search.
#
# The model we choose will influence how we prepare and index the corpus.

# In[4]:

cat_phrases = ["meow",
               "meow meow meow hiss",
               "meow hiss hiss",
               "hiss"
              ]

# Let's pretend we have 4 cats that all have different signature catchphrases.
#
# Now, we need a good way to represent these cats, but in a way a computer can understand easily.

# In[5]:

cat_lang = ["meow", "hiss"]
cat_phrase_vectors = [[1, 0],
                      [3, 1],
                      [1, 2],
                      [0, 1]]

# In[6]:

plot_vecs(cat_phrase_vectors, labels=cat_lang)

# The vector space model
#
# Assign each word an id: the position in the vector that word will occupy.

# In[7]:

cat_lang = ["meow", "hiss"]
cat_phrase_vectors = [[1, 0],
                      [3, 1],
                      [1, 2],
                      [0, 1]]
other_cat = [1, 1]

plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)

# In[8]:

for cat in cat_phrase_vectors:
    print("%s => %.3f" % (cat, 1 - scipy.spatial.distance.cosine(cat, other_cat)))

# Compare all the cats we know about to this other cat.
#
# Read the results.
#
# What if we expand our language to include more words?

# In[9]:

cat_lang3 = ["meow", "hiss", "purr"]
cat_phrase_vectors3 = [[1, 0, 3],
                       [3, 1, 1],
                       [1, 2, 0],
                       [0, 1, 3]]
other_cat3 = [1, 1, 2]

plot_vecs(cat_phrase_vectors3, other_cat3, labels=cat_lang3)

# matplotlib whyyy

# This is a very simple example. If we have more than two words in our vocabulary, the dimensionality gets bigger. So a lot of the prep steps work towards reducing this dimensionality as much as possible.
#
# Imagine if we wanted to build a search engine for Wikipedia: 5 million articles and a kajillion words.

# # Preparing the corpus
#
# 1. Tokenizing

# In[10]:

text = "Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!"
text.split()

# In[11]:

list(gensim.utils.tokenize(text))

# Gensim has a tokenizer built in, but it uses a regex that splits on all non-alphabetical characters.
#
# It splits on the slash, and it also drops the '.' and '!'
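# If splitting on every non-letter is too aggressive, it's easy to swap in your own
# regex. A minimal sketch (the pattern and the name `tokenize_keep_dots` are only
# illustrative, not part of gensim) that keeps dotted names like "reveal.js" together
# while still splitting "Jupyter/IPython" on the slash:

def tokenize_keep_dots(text):
    # keep runs of letters, digits, and interior dots together; split slashed pairs ourselves
    for token in re.findall(r"[\w.]+(?:/[\w.]+)*", text):
        for part in token.split("/"):
            yield part.strip(".")

list(tokenize_keep_dots(text))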
# ## When do we want to tokenize?

# In[12]:

list(gensim.utils.tokenize("Jupyter/IPython"))

# In[13]:

list(gensim.utils.tokenize("AC/DC"))

# Tokenization is more or less the bare minimum that we need to do to build a VSM.
#
# All the other prep steps are for reducing the dimensions as much as possible.

# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing

# In[14]:

text.lower()

# - `"ipython" => "ipython"`
# - `"IPython" => "ipython"`
# - `"iPython" => "ipython"`

# The idea is that no matter the casing, we can collapse all three into a common form.

# ## When do we want to normalize?

# In[15]:

"TriPython".lower()

# In[16]:

"Apple".lower()

# "an apple I am about to eat"
#
# "an apple computer I am about to eat"
#
# This is generally OK; we can hopefully rely on other keywords in the phrase, such as "computer" or "grapes", to help us out.

# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# 3. Stopword removal

# In[17]:

' '.join(sorted(gensim.parsing.STOPWORDS))

# These words are usually very common and end up adding noise to our vectors. Little value.

# In[18]:

list(gensim.utils.tokenize(text.lower()))

# In[19]:

[word for word in gensim.utils.tokenize(text.lower())
 if word not in gensim.parsing.STOPWORDS]

# We still get the gist of this tweet: it's about IPython slides.

# In[20]:

import nltk
nltk.download('stopwords')
' '.join(sorted(nltk.corpus.stopwords.words('english')))

# NLTK is good, with a much more conservative English list. But importantly, it has lists for more languages than just English.

# ## When do we want to remove stop words?

# In[21]:

[word for word in "the cat sat on the keyboard".split()
 if word not in gensim.parsing.STOPWORDS]

# In[22]:

[word for word in "the beatles were overrated".split()
 if word not in gensim.parsing.STOPWORDS]

# Was he talking about beetles, as in the bugs, being overrated? If so, then who cares?
#
# If he was talking about THE Beatles, maybe we don't want to hire this guy because he obviously has no taste.

# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# 3. Stopword removal
#     - Short/long word removal
#     - Words that appear in too many or too few documents

# In[23]:

[word for word in "the cat sat on the keyboard lllllbbbnnnnnbbbbbbbbbbbbbbbbb".split()
 if len(word) > 2 and len(word) < 30]

# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# 3. Stopword removal
# 4. Stemming

# In[24]:

for word in "the runner ran on the running trail for their runs".split():
    stemmed_word = gensim.parsing.stem(word)
    if stemmed_word != word:
        print(word, "=>", stemmed_word)

# Stemming is about reducing a word to its root form.
#
# Dropping -ing and -s gives us the same words, just not in very good English.

# ## When do we want to stem words?

# In[25]:

gensim.parsing.stem("running")

# In[26]:

gensim.parsing.stem("jupyter")

# ### Stemming alternative: lemmatization!

# In[27]:

gensim.utils.lemmatize("the runner ran on the running trail for their runs",
                       allowed_tags=re.compile(".*"))

# In[28]:

gensim.utils.lemmatize("the runner ran on the running trail for their runs")

# Lemmatize uses an external library to do something LIKE stemming, and it also adds part-of-speech tagging.
#
# This reduces a word like "ran" to "run" as a verb, but it also adds new dimensions: "runs" maps to "run" as a noun, which is a separate dimension from the verb.

# ## Putting it all together

# In[29]:

def preprocess(text):
    for word in gensim.utils.tokenize(text.lower()):
        if word not in gensim.parsing.STOPWORDS and len(word) > 1:
            yield word

list(preprocess("Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!"))

# I've made some arbitrary choices here.
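# If you'd rather not hand-roll those choices, gensim also ships a one-call helper.
# A minimal sketch using `gensim.utils.simple_preprocess`, which lowercases, tokenizes,
# and drops very short/long tokens (note: it does not remove stopwords or stem):

list(gensim.utils.simple_preprocess("Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!"))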
# In[30]:

gensim.parsing.preprocess_string("Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!")

# # The corpus
#
# My Twitter archive

# In[31]:

get_ipython().system('head -1 tweets.csv')

# I chose this not because I wanted to subject you to my bad tweets, but because it's small, easy to get, and nicely formatted.

# In[32]:

import csv

class TwitterArchiveCorpus():
    def __init__(self, archive_path):
        self.path = archive_path

    def iter_texts(self):
        with open(self.path) as f:
            for row in csv.DictReader(f):
                if (row["retweeted_status_id"] == "" and      # filter retweets
                        not row["text"].startswith("RT @") and
                        row["in_reply_to_status_id"] == "" and  # filter replies
                        not row["text"].startswith("@")):
                    yield preprocess(row["text"])

tweets = TwitterArchiveCorpus("tweets.csv")

# The general gensim pattern for corpus reading uses an "iterable container" like this.
#
# I've chosen to ignore retweets and replies and just look at tweets I wrote.

# In[33]:

texts = tweets.iter_texts()
print(list(next(texts)))
print(list(next(texts)))

# In[34]:

get_ipython().system('head -3 tweets.csv | awk -F \'","\' \'{print $6}\'')

# What happened to "y'all"? Here's a consequence of some of the prep steps I chose.

# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!

# In[35]:

plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)

# In[36]:

class TwitterArchiveCorpus():
    def __init__(self, archive_path):
        self.path = archive_path
        self.dictionary = gensim.corpora.Dictionary(self.iter_texts())

    def iter_texts(self):
        with open(self.path) as f:
            for row in csv.DictReader(f):
                if (row["retweeted_status_id"] == "" and      # filter retweets
                        not row["text"].startswith("RT @") and
                        row["in_reply_to_status_id"] == "" and  # filter replies
                        not row["text"].startswith("@")):
                    yield preprocess(row["text"])

    def __iter__(self):
        for document in self.iter_texts():
            yield self.dictionary.doc2bow(document)

    def __len__(self):
        return self.dictionary.num_docs

    def get_original(self, key):
        pass  # let's not look at this :-)

tweets = TwitterArchiveCorpus("tweets.csv")

# In[37]:

{"meow": 0, "hiss": 1, "purr": 2}

# The Dictionary makes translating documents into vector space very easy. It goes through the texts, building a mapping of words to ids and counting statistics.
#
# "bow" is "bag of words": a way to represent documents as sparse vectors.

# In[38]:

class TwitterArchiveCorpus():
    def __init__(self, archive_path):
        self.path = archive_path
        self.lookup = dict()
        self.dictionary = gensim.corpora.Dictionary(self.iter_texts())

    def iter_texts(self):
        current = 0
        with open(self.path) as f:
            for row in csv.DictReader(f):
                if (row["retweeted_status_id"] == "" and      # filter retweets
                        not row["text"].startswith("RT @") and
                        row["in_reply_to_status_id"] == "" and  # filter replies
                        not row["text"].startswith("@")):
                    self.lookup[current] = row  # BAD BAD BAD
                    current += 1
                    yield preprocess(row["text"])

    def __iter__(self):
        for document in self.iter_texts():
            yield self.dictionary.doc2bow(document)

    def __len__(self):
        return self.dictionary.num_docs

    def get_original(self, key):
        return self.lookup[key]["text"]

tweets = TwitterArchiveCorpus("tweets.csv")

# In[39]:

vecs = iter(tweets)
print(next(vecs))
print(next(vecs))

# In[40]:

texts = tweets.iter_texts()
print(list(next(texts)))
print(list(next(texts)))

# In[41]:

print(tweets.get_original(0))
print(tweets.get_original(1))

# In[42]:

len(tweets), len(tweets.dictionary)

# In[43]:

(len(tweets) * len(tweets.dictionary)) / (1024 ** 2)

# Turns out I don't use a lot of different words, just 9,000!
#
# This is why gensim uses sparse vectors and generators everywhere! Imagine loading a corpus bigger than this into memory.
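# To make "sparse" concrete: a bag-of-words vector only stores the (word id, count)
# pairs that are actually present, while a dense vector carries a slot for every word
# in the dictionary. A minimal sketch, assuming the `tweets` corpus built above
# (`gensim.matutils.sparse2full` fills in the missing ids with zeros):

bow = next(iter(tweets))  # a list of (word id, count) pairs, like the ones printed above
dense = gensim.matutils.sparse2full(bow, len(tweets.dictionary))
print(len(bow), "stored entries vs.", len(dense), "dense dimensions")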
# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!

# In[44]:

plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)

# In[45]:

index = gensim.similarities.Similarity('/tmp/tweets', tweets,
                                       num_features=len(tweets.dictionary),
                                       num_best=15)

# In[46]:

get_ipython().system('ls /tmp/tweets*')

# - prefix= does sharding!
# - num_features= is needed because we are using sparse vectors; it tells the index what the maximum dimension is
# - num_best= just tells it to return 15 results (for presentation purposes)
#
# What does this class do? What kind of indexing?

# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!

# In[47]:

plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)

# In[48]:

query_bow = tweets.dictionary.doc2bow(preprocess("drunk"))
query_bow

# In[49]:

index[query_bow]

# We've got to translate our query phrase into vector space.
#
# Doing getitem on the index will "query" it and give us each document's id and its relatedness to the query.

# In[50]:

def search(query):
    query_bow = tweets.dictionary.doc2bow(preprocess(query))
    for doc, percent in index[query_bow]:
        print("%.3f" % percent, "=>", tweets.get_original(doc), "\n")

# In[51]:

search("drunk")

# Point out that shorter tweets are marked as more related!
#
# When we add more words, the vector changes direction slightly, making it less related!

# In[52]:

search("bar")

# Counter to what I just told you, the first tweet here is actually longer than the others! That's because the word "bar" appears twice.

# In[53]:

search("ROLL TIDE")

# # A slightly more advanced model

# In[54]:

lsi = gensim.models.LsiModel(tweets, num_topics=100, power_iters=10, id2word=tweets.dictionary)

# power_iters says to look at these vectors 10 times.
#
# id2word is for other convenience methods of the object; it has no effect on the outcome.

# In[55]:

len(tweets), len(tweets.dictionary)

# In[56]:

len(tweets), lsi.num_topics

# In[57]:

lsi_index = gensim.similarities.Similarity('/tmp/lsitweets', lsi[tweets],
                                           num_features=lsi.num_topics,
                                           num_best=15)

# In[58]:

def lsi_search(query):
    query_bow = tweets.dictionary.doc2bow(preprocess(query))
    for doc, percent in lsi_index[lsi[query_bow]]:
        print("%.3f" % percent, "=>", tweets.get_original(doc), "\n")

# In[59]:

search("pigpen")  # non-LSI search

# In[60]:

lsi_search("pigpen")

# # WE NEED TO GO DEEPER *BRRRRRRRAAAAAWWWWRWRRRMRMRMMMMM*
#
# ![Deeper](images/gifs/deeper.gif)
#
# ## A deep-learning approach
#
# Warning: here be Gensim dragons
#
# gif credit: http://prostheticknowledge.tumblr.com/post/128044261341/implementation-of-a-neural-algorithm-of-artistic

# In[61]:

w2v = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

# ### How does "king - man + woman = queen"?
#
# ![Vectors](images/gifs/vectors.gif)
#
# gif credit: http://multithreaded.stitchfix.com/blog/2015/03/11/word-is-worth-a-thousand-vectors/
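# We can poke at that analogy directly with `most_similar`, which adds the `positive`
# vectors, subtracts the `negative` ones, and returns the nearest words. A minimal
# sketch, assuming the GoogleNews vectors loaded above:

w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)
# 'queen' is expected at or near the top of the list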
# In[62]:

def generate_vector_sums():
    for doc in tweets.iter_texts():  # remember me?
        yield gensim.matutils.unitvec(  # DRAGON: hack to make Similarity happy with fullvecs
            sum(
                (w2v[word] for word in doc if word in w2v),
                numpy.zeros(w2v.vector_size)
            )
        )

w2v_index = gensim.similarities.Similarity('/tmp/w2v_tweets', generate_vector_sums(),
                                           num_features=w2v.vector_size,
                                           num_best=15)

# In[63]:

def search_w2v(query):
    query_tokens = preprocess(query)
    query_vec = gensim.matutils.unitvec(sum((w2v[word] for word in query_tokens if word in w2v),
                                            numpy.zeros(w2v.vector_size)))
    for doc, percent in w2v_index[query_vec]:
        print("%.3f" % percent, "=>", tweets.get_original(doc), "\n")

# In[64]:

search_w2v("drunk")

# In[65]:

search_w2v("kittens")

# # Thanks for listening! 😻🍻😸
#
# (Is it beer-time yet?)

# In[66]:

plot_vecs(cat_phrase_vectors, other_cat, labels=["meow", "hiss"])

# *No tweets were harmed as a result of this talk.
# I am a person; people make mistakes and that's okay.*

# ### Short-circuiting a full search: inverted index!

# In[67]:

cat_lang3 = ["meow", "hiss", "purr"]
cat_phrase_vectors3 = [[1, 0, 3],
                       [3, 1, 1],
                       [1, 2, 0],
                       [0, 1, 3]]

inv_index = defaultdict(set)
for doc_id, doc in enumerate(cat_phrase_vectors3):
    for word_id, freq in enumerate(doc):
        if freq:
            inv_index[word_id].add(doc_id)

# In[68]:

inv_index[0]

# In[69]:

inv_index[0].intersection(inv_index[1])

# This is helpful for quickly skipping the similarity computation for documents that never contain the required keyword.
#
# whoosh has a good one of these!

# [Download this notebook](./searching.ipynb)
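# Bonus sketch: how an inverted index short-circuits a search, using the toy cat corpus
# above. Only documents sharing at least one word with the query get a cosine score;
# everything else is skipped without any vector math. (The helper `candidate_docs` is
# just for illustration, not a gensim or whoosh API.)

def candidate_docs(query_vec):
    docs = set()
    for word_id, freq in enumerate(query_vec):
        if freq:
            docs |= inv_index[word_id]  # union of the posting lists for the query words
    return docs

query = [0, 1, 0]  # a query that only says "hiss"
for doc_id in sorted(candidate_docs(query)):
    score = 1 - scipy.spatial.distance.cosine(cat_phrase_vectors3[doc_id], query)
    print(doc_id, "=> %.3f" % score)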