# Python tools for NLP

## Fantasia festival starts today!

### So many films, so little time. Which ones to choose?

Check it out! 3 weeks of genre films! Can we find similar movies by using their synopsis?

## 1 - Let's build a corpus of film synopses. We need to scrape the web.

In which we profusely use Pattern for its web-friendly features.

In [ ]:
from pattern.web import URL, download, plaintext, Element, abs, Text
import re

In [ ]:
# The main page of Fantasia fest listing all the films
url = URL('http://fantasiafest.com/2014/en/films-schedule/films')
html = url.download(unicode=True)

In [ ]:
# List of links to all the films
element = Element(html)
links = []
for link in element('h4 a'):
    formatted_link = abs(link.attributes.get('href', ''), base=url.redirect or url.string)
    links.append(formatted_link)

In [ ]:
# List of durations
element = Element(html)
duration_pat = re.compile(r'[0-9]+ min')
durations = []
for e in element('div.info ul'):
    specs = plaintext(e.content)
    duration = int(duration_pat.search(specs).group()[:-4])
    durations.append(duration)
    print duration

In [ ]:
#List only films with duration over 45 minutes
feature_films = [link for (link, duration) in zip(links,durations) if duration>45]
print feature_films


### For each feature film, get the synopsis

In [ ]:
# Demo for only one film
link = feature_films[165]
html = download(link)
element = Element(html)
title = plaintext(element('h1')[1].content)
synopsis = "\n".join([plaintext(e.content) for e in (element('div.synopsis p'))])
print title
print synopsis

In [ ]:
# Use a loop to get the synopsis of every feature film
fantasia2014 = {}
for link in feature_films:
    html = download(link)
    element = Element(html)
    title = plaintext(element('h1')[1].content)
    synopsis = "\n".join([plaintext(e.content) for e in element('div.synopsis p')])
    fantasia2014[title] = synopsis
    #print title


## 2 - Now that we have the raw text, let's clean it!

In which we profusely use NLTK for its classic tokenizer, stemmer and lemmatizer. Check out the awesome free book Natural Language Processing with Python.

### Splitting the text into words: tokenization

In [ ]:
# Fast and dirty: split on whitespace, strip leading/trailing punctuation
punctuation = u",.;:'()\u201c\u2026\u201d\u2013\u2019\u2014"
split_text = fantasia2014["The Zero Theorem"].split()
clean_text = [w.strip(punctuation) for w in split_text]
print clean_text

In [ ]:
# More sophisticated : using a tokenizer
import nltk
synopsis = fantasia2014["The Zero Theorem"]
tokens = [word for sent in nltk.sent_tokenize(synopsis) for word in nltk.word_tokenize(sent)]
print tokens

In [ ]:
# I prefer my fast and dirty way for this corpus,
# so let's use a loop to apply it to all the texts

punctuation = u",.;:'()\u201c\u2026\u201d\u2013\u2019\u2014"
fantasia2014_tokenized = dict()

for title in fantasia2014:
    split_text = fantasia2014[title].split()
    fantasia2014_tokenized[title] = [w.strip(punctuation) for w in split_text
                                     if w.strip(punctuation) != ""]

#print fantasia2014_tokenized["The Zero Theorem"]



### Getting the root of words: stemming and lemmatization

In [ ]:
# Stemming: uses rules to chop off the ends of words
stemmer  = nltk.stem.porter.PorterStemmer()
singular = stemmer.stem("zombie")
plural   = stemmer.stem("zombies")

print singular, plural
print (singular==plural)

In [ ]:
# Lemmatizing: uses a dictionary (WordNet)
from nltk import WordNetLemmatizer as wnl
singular = wnl().lemmatize("zombie")
plural   = wnl().lemmatize("zombies")

print singular, plural
print (singular==plural)

In [ ]:
# I like the lemmatization better.
# Let's lemmatize all the texts (instantiating the lemmatizer once)
lemmatizer = wnl()
fantasia2014_lemma = dict()

for title in fantasia2014_tokenized:
    synopsis = []
    for word in fantasia2014_tokenized[title]:
        lemma = lemmatizer.lemmatize(word.lower())  # lowercasing is another normalization
        synopsis.append(lemma)
    fantasia2014_lemma[title] = synopsis

print fantasia2014_lemma["The Zero Theorem"]



### Just for fun: stopwords and collocations

In [ ]:
# Collocations are bigrams (pairs of words) that occur together
# more often than chance would predict
# Get the collocations
all_texts = []
for title in fantasia2014_lemma:
    all_texts.extend(fantasia2014_lemma[title])

bigrams = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_texts)
scored = finder.score_ngrams(bigrams.likelihood_ratio)

print scored
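Under the hood, a collocation finder starts from raw bigram counts before scoring them against a statistical measure. A minimal standard-library sketch of that first step, on toy tokens (hypothetical data, not the real corpus):

```python
from collections import Counter

# Toy token list standing in for the lemmatized corpus (hypothetical data)
tokens = ["the", "living", "dead", "walk", "the", "living", "dead", "rise"]

# Count every adjacent pair of tokens
bigram_counts = Counter(zip(tokens, tokens[1:]))

print(bigram_counts.most_common(2))
```

NLTK's likelihood-ratio measure then asks how surprising each count is given the individual word frequencies, which is why frequent-but-uninteresting pairs built from stopwords dominate until the stopwords are removed.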

In [ ]:
# Let's remove the stopwords (a, the, in, into, on, ...) and try again

stop = nltk.corpus.stopwords.words('english')  # list of stopwords from NLTK
fantasia2014_stop = dict()

for title in fantasia2014_lemma:
    fantasia2014_stop[title] = [w for w in fantasia2014_lemma[title] if w not in stop]

In [ ]:
# This is the same as above, but with the stopwords removed
all_texts = []
for title in fantasia2014_stop:
    all_texts.extend(fantasia2014_stop[title])

bigrams = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_texts)
scored = finder.score_ngrams(bigrams.likelihood_ratio)

print scored


## 3 - Now that we have a clean corpus, let's train a linguistic model to find similarity between documents

In which we profusely use Gensim, which is great for topic modeling. Also check out the developer's awesome blog (he took Google's word2vec C code and made it even faster in Python). The following is an adaptation of the Gensim tutorials; please refer to them for more explanations.

In [ ]:
from gensim import corpora, models, similarities

# Put the text in the right format: lists
titles = []
texts = []
for title in fantasia2014_stop:
    titles.append(title)
    texts.append(fantasia2014_stop[title])

# Remove words that occur only once, to reduce the size of the corpus
from collections import Counter
token_counts = Counter(token for text in texts for token in text)
texts = [[word for word in text if token_counts[word] > 1]
         for text in texts]


### Build a model (TF-IDF)

Term frequency–inverse document frequency (TF-IDF) measures the importance of a word in a document: a word gets a high weight if it is frequent in that document but rare across all the documents taken together.
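The weighting can be sketched from scratch on a toy corpus (hypothetical data; Gensim's `TfidfModel` additionally normalizes each document vector, so the absolute numbers differ):

```python
import math

# Toy corpus standing in for the synopses (hypothetical data)
docs = [["zombie", "comedy", "zombie"],
        ["zombie", "drama"],
        ["comedy", "romance"]]

def tf_idf(term, doc, docs):
    """Raw term frequency times log2-scaled inverse document frequency."""
    tf = doc.count(term)
    df = sum(1 for d in docs if term in d)
    return tf * math.log(len(docs) / float(df), 2)

# "zombie" is frequent in doc 0 but appears in 2 of 3 documents,
# so it gets a moderate weight; "romance" appears in only 1 document
print(tf_idf("zombie", docs[0], docs))   # 2 * log2(3/2)
print(tf_idf("romance", docs[2], docs))  # 1 * log2(3/1)
```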

In [ ]:
# Build a model
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]      # step 2 -- apply the transformation to the corpus

In [ ]:
# What does it look like?
for doc in corpus_tfidf:
    print(doc)


### Topic modeling: latent semantic indexing

TF-IDF is fine, but what if two documents talk about the same thing with different words, e.g. "Funny zombie movie" and "comedy of the undead"? Well, if those words sometimes appear together in other documents, they can be assigned to the same topic, and we can use these topics to find the similarity between documents. Latent semantic indexing does this with a singular value decomposition (SVD) of the TF-IDF matrix.

In [ ]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
corpus_lsi = lsi[corpus_tfidf]  # apply the transformation to the corpus
lsi.print_topics(5)

In [ ]:
# What does this look like?
for doc in corpus_lsi:
    print doc


### Now let's find a film similar to The Zero Theorem

In [ ]:
#Which titles can we play with?
#print titles

In [ ]:
# Get the index of the film we wish to query
ind = titles.index("The Zero Theorem")

#Transform film synopsis to LSI space
doc = texts[ind]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[vec_bow] # convert the query to LSI space

print(vec_lsi)

In [ ]:
# Transform the corpus to LSI space and index it IN RAM!
index = similarities.MatrixSimilarity(lsi[corpus])

# Perform a similarity query against the corpus and sort the results
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Nicely print the 10 most similar films as (title, similarity score) pairs
for document_num, sim in sims[:10]:
    print titles[document_num], str(sim)
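The score returned by `MatrixSimilarity` is the cosine similarity between LSI vectors: 1 for vectors pointing the same way, 0 for unrelated ones. A minimal standard-library sketch on toy vectors (hypothetical data):

```python
import math

def cosine_similarity(u, v):
    """Cosine of the angle between two dense vectors."""
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v)

# Toy 3-dimensional "topic" vectors (hypothetical data)
query  = [1.0, 2.0, 0.0]
film_a = [2.0, 4.0, 0.0]   # same direction as the query
film_b = [0.0, 0.0, 3.0]   # orthogonal to the query

print(cosine_similarity(query, film_a))  # 1.0
print(cosine_similarity(query, film_b))  # 0.0
```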


### Fun with Word2Vec

In [ ]:
#Train the model with our corpus
model_w2v = models.Word2Vec(texts, min_count=3)

In [ ]:
#Query to find the most similar word
model_w2v.most_similar(positive=['horror'], topn=5)

In [ ]:
#Query the model
this = "light"
is_to = "dark"
what = "angel"
is_to2= model_w2v.most_similar(positive=[is_to, what], negative=[this], topn=3)

print this+' is to '+is_to+' as '+what+' is to : '
print is_to2
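The analogy query is plain vector arithmetic: find the word whose vector is closest to `is_to - this + what`. A standard-library sketch with toy 2-d word vectors (hypothetical data; real word2vec vectors have hundreds of dimensions):

```python
import math

# Toy 2-d word vectors (hypothetical data)
vectors = {
    "light": [1.0, 0.0],
    "dark":  [-1.0, 0.0],
    "angel": [0.9, 0.5],
    "demon": [-0.9, 0.5],
    "movie": [0.0, 1.0],
}

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    return dot / (math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v)))

# target = dark - light + angel
target = [d - l + a for l, d, a in
          zip(vectors["light"], vectors["dark"], vectors["angel"])]

# Rank the remaining words by cosine similarity to the target
candidates = {w: v for w, v in vectors.items() if w not in ("light", "dark", "angel")}
best = max(candidates, key=lambda w: cosine(vectors[w], target))
print(best)  # "demon"
```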


### Our corpus is too small to get an accurate model. Let's use Google News instead.

In [ ]:
# Load the model, downloaded from : https://code.google.com/p/word2vec/
model_GN = models.Word2Vec.load_word2vec_format('/Users/francoiseprovencher/Documents/Word2VecBinaries/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [ ]:
#Query to find the most similar word
model_GN.most_similar(positive=['zombie'], topn=5)

In [ ]:
#Query the model
this = "light"
is_to = "dark"
what = "angel"
is_to2= model_GN.most_similar(positive=[is_to, what], negative=[this], topn=3)

print this+' is to '+is_to+' as '+what+' is to : '
print is_to2