#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

from __future__ import print_function, division

from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d

import numpy
import scipy
import scipy.spatial
import pandas

import gensim
import re

# In[2]:

def plot_vecs(corpus, query=None, labels=None):
    """Plot a corpus of 2D or 3D vectors as arrows from the origin; the optional query vector is drawn in red."""
    fig = plt.figure(figsize=(4, 4))
    c = numpy.array(corpus)
    if len(c[0]) == 2:
        ax = plt.gca()
        ax.quiver([0] * len(c), [0] * len(c), c[:, 0], c[:, 1],
                  angles='xy', scale_units='xy', scale=1)
        if query:
            q = numpy.asarray(query)
            ax.quiver([0], [0], q[0], q[1],
                      angles='xy', scale_units='xy', scale=1, color='r')
        ax.set_xlim([-0.1, c.max()])
        ax.set_ylim([-0.1, c.max()])
    else:
        ax = plt.gca(projection='3d')
        for q in c:
            ax.quiver(0, 0, 0, q[0], q[1], q[2], length=numpy.linalg.norm(q),
                      arrow_length_ratio=0.1, pivot='tail', color='black')
        if query:
            q = numpy.asarray(query)
            ax.quiver(0, 0, 0, q[0], q[1], q[2], length=numpy.linalg.norm(q),
                      arrow_length_ratio=0.1, pivot='tail', color='r')
        ax.set_xlim([0, c.max()])
        ax.set_ylim([0, c.max()])
        ax.set_zlim([0, c.max()])
    lang = labels
    if lang:
        if len(lang) >= 3:
            ax.set_zlabel(lang[2], fontsize=20)
        if len(lang) >= 2:
            ax.set_ylabel(lang[1], fontsize=20)
        if len(lang) >= 1:
            ax.set_xlabel(lang[0], fontsize=20)
    fig.tight_layout()
    plt.show()

# # Building Search Engines with Gensim
#
# ![tripy](./images/tripython-banner.png)

# - PhD candidate @ University of Alabama
#     - Roll tide
# - Search applications for software maintenance tasks
#     - Code search
#     - Change request triage
# - Currently visiting researcher @ ABB
# - Looking for new adventures in 2016!

# - Searching through large code bases using more than keyword search
# - Searching for developers most apt to handle a bug/issue report

# - I have a Twitter "problem"
# - I tweet really dumb things sometimes
# - I am looking for a job

# My advisor says I have a Twitter problem. I love Twitter and treat it seriously and facetiously at the same time. Also, jobs.
#
# These things don't mix well, apparently.
#
# Admitted to being intoxicated and breaking the law. Whoops.
#
# Do employers want to hire someone that drinks at noon?
#
# > I was apparently spotted on the local news being a drunk Alabama fan that flooded the streets after the win last night. Whoops.
# >
# > — christop corley (@excsc) January 10, 2012
#
# > instead of lunch I am just going to have a bunch of beers and eat pretzels
# >
# > — christop corley (@excsc) September 29, 2014
#
# *CLARIFICATION: I do not "have a drinking problem", as my mother likes to claim. I am probably actually quite sober right now!*
# # Let's make a search engine
# So I have a lot of tweets; let's make a search engine to find all of these bad tweets for deletion.
#
# There are lots of different methods for making search engines, but there are two essential behaviors we want:
# `search("drunk") => [tweets containing the word 'drunk']`
# `search("drunk") => [tweets *related* to the word 'drunk']`
# In[3]:
import gensim
# I'm a `gensim` contributor
#
# `gensim`'s "design" is not always great; better tools certainly exist that can supplement or replace parts of this talk, e.g.:
#
# 1. `whoosh`
# 2. `nltk`
# 3. `scikit-learn`
# OMG WHAT ABOUT GREP?
# ### `grep` is great, but
#
# - must search through all documents
# - not aware of language-specific semantics
# - does not rank by relevance
# 1. The first relates to efficiency (speed): we want results fast!
# 2. The second relates to effectiveness (quality): we want to avoid false positives.
# 3. The third also relates to effectiveness (quality): we want things ordered by relevance.
# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!
# These are the basic steps, all of which gensim can help with. However, gensim's most powerful features fall out of number 2!
#
# Gensim has some powerful models that can help with many tasks requiring measuring document similarity, not just search.
#
# The model we choose will influence how we prepare and index the corpus.
# In[4]:
cat_phrases = ["meow",
"meow meow meow hiss",
"meow hiss hiss",
"hiss"
]
# Let's pretend we have 4 cats that all have different signature catch phrases.
#
# Now we need a good way to represent these cats, in a way a computer can understand easily.
# In[5]:
cat_lang = ["meow", "hiss"]
cat_phrase_vectors = [[1, 0],
[3, 1],
[1, 2],
[0, 1]]
# In[6]:
plot_vecs(cat_phrase_vectors, labels=cat_lang)
# The vector space model
#
# Assign each word an id; the id determines which position in the vector that word's count occupies.
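# A minimal sketch of that mapping (a hypothetical helper, not gensim API):

word_ids = {"meow": 0, "hiss": 1}  # word -> dimension (position) in the vector

def to_vector(phrase, word_ids):
    vec = [0] * len(word_ids)
    for word in phrase.split():
        if word in word_ids:
            vec[word_ids[word]] += 1
    return vec

to_vector("meow meow meow hiss", word_ids)  # => [3, 1], the second cat above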
# In[7]:
cat_lang = ["meow", "hiss"]
cat_phrase_vectors = [[1, 0],
[3, 1],
[1, 2],
[0, 1]]
other_cat = [1, 1]
plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)
# In[8]:
for cat in cat_phrase_vectors:
print("%s => %.3f" % (cat, 1 - scipy.spatial.distance.cosine(cat, other_cat)))
# Compare all cats we know about to this other cat
#
# Read the results.
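# For reference, the cosine similarity scipy computes above is just dot(a, b) / (|a| * |b|); here it is by hand for the first cat:

a, b = numpy.array([1, 0]), numpy.array(other_cat)
a.dot(b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))  # ~0.707, matching the first result above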
#
# What if we expand our language to include more words?
# In[9]:
cat_lang3 = ["meow", "hiss", "purr"]
cat_phrase_vectors3 = [[1, 0, 3],
[3, 1, 1],
[1, 2, 0],
[0, 1, 3]]
other_cat3 = [1, 1, 2]
plot_vecs(cat_phrase_vectors3, other_cat3, labels=cat_lang3) # matplotlib whyyy
# This is a very simple example. As the vocabulary grows beyond a few words, the dimensionality grows with it. So, a lot of the prep steps work toward reducing this dimensionality as much as possible.
#
# Imagine if we wanted to build a search engine for Wikipedia: 5 million articles and a kajillion words.
# # Preparing the corpus
#
# 1. Tokenizing
# In[10]:
text = "Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!"
text.split()
# In[11]:
list(gensim.utils.tokenize(text))
# Gensim has a built-in tokenizer, but it uses a regex that splits on all non-alphabetic characters.
#
# It splits on the slash, and also drops the '.' and '!' for us.
# ## When do we want to tokenize?
# In[12]:
list(gensim.utils.tokenize("Jupyter/IPython"))
# In[13]:
list(gensim.utils.tokenize("AC/DC"))
# Tokenization is more or less the bare minimum that we need to do to build a VSM.
#
# All other prep steps are for reducing the dimensions as much as possible.
# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# In[14]:
text.lower()
# - `"ipython" => "ipython"`
# - `"IPython" => "ipython"`
# - `"iPython" => "ipython"`
# The idea is that no matter the casing, we collapse what would be three separate dimensions into a common one.
# # When do we want to normalize?
# In[15]:
"TriPython".lower()
# In[16]:
"Apple".lower()
# "an apple I am about to eat"
# "an apple computer I am about to eat"
#
# This is generally OK; we can hopefully rely on other keywords in the phrase, such as "computer" or "grapes", to help us out.
# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# 3. Stopword removal
# In[17]:
' '.join(sorted(gensim.parsing.STOPWORDS))
# These words are usually very common and end up adding noise to our vectors; they carry little value.
# In[18]:
list(gensim.utils.tokenize(text.lower()))
# In[19]:
[word for word in gensim.utils.tokenize(text.lower()) if word not in gensim.parsing.STOPWORDS]
# We still get the gist of this tweet: it's about IPython slides.
# In[20]:
import nltk
nltk.download('stopwords')
' '.join(sorted(nltk.corpus.stopwords.words('english')))
# NLTK's is good: a much more conservative English list. But importantly, it has more lists than just English.
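# For instance, other languages (assuming the same 'stopwords' corpus downloaded above):

' '.join(sorted(nltk.corpus.stopwords.words('spanish'))[:12])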
# # When do we want to remove stop words?
# In[21]:
[word for word in "the cat sat on the keyboard".split() if word not in gensim.parsing.STOPWORDS]
# In[22]:
[word for word in "the beatles were overrated".split() if word not in gensim.parsing.STOPWORDS]
# Was he talking about beetles, as in the bugs, being overrated? If so, then who cares?
#
# If he was talking about THE Beatles, maybe we don't want to hire this guy because he obviously has no taste.
# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# 3. Stopword removal
# - Short/Long word removal
# - Words that appear in too many or too few documents (see the sketch after the next cell)
# In[23]:
[word for word in "the cat sat on the keyboard lllllbbbnnnnnbbbbbbbbbbbbbbbbb".split()
if len(word) > 2 and len(word) < 30]
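# For the document-frequency side, gensim's Dictionary (introduced properly later) can do the filtering; a toy sketch:

toy = gensim.corpora.Dictionary([["cat", "sat", "keyboard"],
                                 ["cat", "nap"]])
toy.filter_extremes(no_below=1, no_above=0.5)  # drop words in more than 50% of documents
toy.token2id  # 'cat' is gone: it appeared in every document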
# # Preparing the corpus
#
# 1. Tokenizing
# 2. Normalizing
# 3. Stopword removal
# 4. Stemming
# In[24]:
for word in "the runner ran on the running trail for their runs".split():
stemmed_word = gensim.parsing.stem(word)
if stemmed_word != word:
print(word, "=>", stemmed_word)
# Stemming is about reducing a word to its root form.
#
# Dropping -ing and -s gives us the same words, just not in very good English.
# ## When do we want to stem words?
# In[25]:
gensim.parsing.stem("running")
# In[26]:
gensim.parsing.stem("jupyter")
# ### Stemming alternative: lemmatization!
# In[27]:
gensim.utils.lemmatize("the runner ran on the running trail for their runs",
allowed_tags=re.compile(".*"))
# In[28]:
gensim.utils.lemmatize("the runner ran on the running trail for their runs")
# Lemmatization uses an external library (the `pattern` package) to do something LIKE stemming, and it also adds part-of-speech tagging.
#
# This will reduce words like "ran" to run-the-verb, but it also adds new dimensions, mapping "runs" to run-the-noun.
# ## Putting it all together
# In[29]:
def preprocess(text):
for word in gensim.utils.tokenize(text.lower()):
if word not in gensim.parsing.STOPWORDS and len(word) > 1:
yield word
list(preprocess("Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!"))
# I've made some arbitrary choices.
# In[30]:
gensim.parsing.preprocess_string("Jupyter/IPython notebook export to a reveal.js slideshow is pretty cool!")
# # The corpus
#
# My twitter archive
# In[31]:
get_ipython().system('head -1 tweets.csv')
# I chose this corpus not because I wanted to subject you to my bad tweets, but because it's small, easy to get, and nicely formatted.
# In[32]:
import csv
class TwitterArchiveCorpus():
def __init__(self, archive_path):
self.path = archive_path
def iter_texts(self):
with open(self.path) as f:
for row in csv.DictReader(f):
if (row["retweeted_status_id"] == "" and # filter retweets
not row["text"].startswith("RT @") and
row["in_reply_to_status_id"] == "" and # filter replies
not row["text"].startswith("@")):
yield preprocess(row["text"])
tweets = TwitterArchiveCorpus("tweets.csv")
# The general gensim pattern for corpus reading is the "iterable container" pattern: an object that can be iterated over many times, streaming one document per iteration.
#
# I've chosen to ignore retweets and replies, and just look at tweets I wrote.
# In[33]:
texts = tweets.iter_texts()
print(list(next(texts)))
print(list(next(texts)))
# In[34]:
get_ipython().system('head -3 tweets.csv | awk -F \'","\' \'{print $6}\'')
# What happened to "y'all"? Here's a consequence of some of the prep steps I chose.
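# To see it happen (using our preprocess() from above):

list(gensim.utils.tokenize("y'all"))  # => ['y', 'all']: the apostrophe is non-alphabetic
list(preprocess("y'all"))             # => []: 'y' fails len > 1, and 'all' is a stopword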
# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!
# In[35]:
plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)
# In[36]:
class TwitterArchiveCorpus():
def __init__(self, archive_path):
self.path = archive_path
self.dictionary = gensim.corpora.Dictionary(self.iter_texts())
def iter_texts(self):
with open(self.path) as f:
for row in csv.DictReader(f):
if (row["retweeted_status_id"] == "" and # filter retweets
not row["text"].startswith("RT @") and
row["in_reply_to_status_id"] == "" and # filter replies
not row["text"].startswith("@")):
yield preprocess(row["text"])
def __iter__(self):
for document in self.iter_texts():
yield self.dictionary.doc2bow(document)
def __len__(self):
return self.dictionary.num_docs
def get_original(self, key):
pass # let's not look at this :-)
tweets = TwitterArchiveCorpus("tweets.csv")
# In[37]:
{"meow": 0,
"hiss": 1,
"purr": 2}
# The Dictionary makes translating docs into vector space very easy. It goes through the text, creating a mapping of words to ids and collecting counting stats.
#
# "bow" is "bag of words", a way to represent docs as sparse vectors.
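# A toy demonstration of doc2bow, separate from the tweets:

d = gensim.corpora.Dictionary([["meow", "hiss", "purr"]])
d.doc2bow("meow meow meow hiss".split())  # sparse (word_id, count) pairs; zeros omitted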
# In[38]:
class TwitterArchiveCorpus():
def __init__(self, archive_path):
self.path = archive_path
self.lookup = dict()
self.dictionary = gensim.corpora.Dictionary(self.iter_texts())
def iter_texts(self):
current = 0
with open(self.path) as f:
for row in csv.DictReader(f):
if (row["retweeted_status_id"] == "" and # filter retweets
not row["text"].startswith("RT @") and
row["in_reply_to_status_id"] == "" and # filter replies
not row["text"].startswith("@")):
                    self.lookup[current] = row # BAD BAD BAD: holds every row in memory
current += 1
yield preprocess(row["text"])
def __iter__(self):
for document in self.iter_texts():
yield self.dictionary.doc2bow(document)
def __len__(self):
return self.dictionary.num_docs
def get_original(self, key):
return self.lookup[key]["text"]
tweets = TwitterArchiveCorpus("tweets.csv")
# In[39]:
vecs = iter(tweets)
print(next(vecs))
print(next(vecs))
# In[40]:
texts = tweets.iter_texts()
print(list(next(texts)))
print(list(next(texts)))
# In[41]:
print(tweets.get_original(0))
print(tweets.get_original(1))
# In[42]:
len(tweets), len(tweets.dictionary)
# In[43]:
(len(tweets) * len(tweets.dictionary)) / (1024 ** 2)  # cells in a dense document-term matrix, in millions
# Turns out I don't use a lot of different words, just 9000!
#
# This is why gensim uses sparse vectors and generators everywhere! Imagine loading a corpus bigger than this into memory.
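# Comparing the two costs directly (assuming 8-byte floats for the dense case):

dense_mb = len(tweets) * len(tweets.dictionary) * 8 / (1024 ** 2)
sparse_entries = sum(len(bow) for bow in tweets)  # only the nonzero (id, count) pairs get stored
dense_mb, sparse_entries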
# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!
# In[44]:
plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)
# In[45]:
index = gensim.similarities.Similarity('/tmp/tweets',
tweets,
num_features=len(tweets.dictionary),
num_best=15)
# In[46]:
get_ipython().system('ls /tmp/tweets*')
# - prefix= does sharding! The index is split across files on disk rather than held in memory.
# - num_features= because we are using sparse vectors, this gives it a hint of what the maximum dimension is
# - num_best= just tells it to return the top 15 results (just for presentation purposes)
#
# What does this class do? What kind of indexing? It builds a cosine-similarity index over the whole corpus, sharded into those files.
# # Basics
#
# 1. **Prepare** the corpus
# 2. **Model** the corpus
# 3. **Index** the corpus
# 4. **Query** fun-time!
# In[47]:
plot_vecs(cat_phrase_vectors, other_cat, labels=cat_lang)
# In[48]:
query_bow = tweets.dictionary.doc2bow(preprocess("drunk"))
query_bow
# In[49]:
index[query_bow]
# We've got to translate our query phrase into the vector space.
#
# Doing getitem on the index "queries" it, giving us document ids and each document's relatedness to the query.
# In[50]:
def search(query):
query_bow = tweets.dictionary.doc2bow(preprocess(query))
for doc, percent in index[query_bow]:
print("%.3f" % percent, "=>", tweets.get_original(doc), "\n")
# In[51]:
search("drunk")
# Point out that shorter tweets are marked as more related!
#
# When a tweet has more words, its vector points in a slightly different direction than the query's, making it less related. See the toy example below.
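# A toy illustration: extra words rotate the document vector away from a one-word query.

q = numpy.array([1, 0])                                 # query: just "drunk"
for doc in (numpy.array([1, 0]), numpy.array([1, 3])):  # a short tweet vs a wordy tweet
    print(1 - scipy.spatial.distance.cosine(doc, q))    # 1.000 vs ~0.316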
# In[52]:
search("bar")
# Counter to what I just told you, the first tweet here is actually longer than the others! That's because the word "bar" appears in it twice.
# In[53]:
search("ROLL TIDE")
# # A slightly more advanced model
# In[54]:
lsi = gensim.models.LsiModel(tweets,
num_topics=100,
power_iters=10,
id2word=tweets.dictionary)
# power_iters=10 tells the solver to make extra passes over the vectors for a more accurate decomposition
#
# id2word is for the object's convenience methods (like printing topics as words); it has no effect on the outcome
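# One such convenience method, to peek at what the model learned (topic contents will vary with the corpus):

lsi.print_topics(num_topics=2, num_words=5)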
# In[55]:
len(tweets), len(tweets.dictionary)
# In[56]:
len(tweets), lsi.num_topics
# In[57]:
lsi_index = gensim.similarities.Similarity('/tmp/lsitweets',
lsi[tweets],
num_features=lsi.num_topics,
num_best=15)
# In[58]:
def lsi_search(query):
query_bow = tweets.dictionary.doc2bow(preprocess(query))
for doc, percent in lsi_index[lsi[query_bow]]:
print("%.3f" % percent, "=>", tweets.get_original(doc), "\n")
# In[59]:
search("pigpen") # non-LSI search
# In[60]:
lsi_search("pigpen")
# # WE NEED TO GO DEEPER *BRRRRRRRAAAAAWWWWRWRRRMRMRMMMMM*
#
# ![Deeper](images/gifs/deeper.gif)
#
# ## A deep-learning approach
#
# Warning: here be Gensim dragons
#
# gif credit: http://prostheticknowledge.tumblr.com/post/128044261341/implementation-of-a-neural-algorithm-of-artistic
# In[61]:
w2v = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
binary=True)
# ### How does "king - man + woman = queen"?
#
# ![Vectors](images/gifs/vectors.gif)
#
# gif credit: http://multithreaded.stitchfix.com/blog/2015/03/11/word-is-worth-a-thousand-vectors/
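# The canonical demonstration, with the vectors we just loaded (exact scores may vary by model version):

w2v.most_similar(positive=["king", "woman"], negative=["man"], topn=1)
# => [('queen', ~0.71)]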
# In[62]:
def generate_vector_sums():
for doc in tweets.iter_texts(): # remember me?
yield gensim.matutils.unitvec( # DRAGON: hack to make Similarity happy with fullvecs
sum(
(w2v[word] for word in doc if word in w2v),
numpy.zeros(w2v.vector_size)
)
)
w2v_index = gensim.similarities.Similarity('/tmp/w2v_tweets',
generate_vector_sums(),
num_features=w2v.vector_size,
num_best=15)
# In[63]:
def search_w2v(query):
query_tokens = preprocess(query)
query_vec = gensim.matutils.unitvec(sum((w2v[word] for word in query_tokens if word in w2v), numpy.zeros(w2v.vector_size)))
for doc, percent in w2v_index[query_vec]:
print("%.3f" % percent, "=>", tweets.get_original(doc), "\n")
# In[64]:
search_w2v("drunk")
# In[65]:
search_w2v("kittens")
# # Thanks for listening!
#
# (Is it beer-time yet?)
# In[66]:
plot_vecs(cat_phrase_vectors, other_cat, labels=["meow", "hiss"])
# *No tweets were harmed as a result of this talk.