from pattern.web import URL, download, plaintext, Element, abs, Text
import re

# The main page of Fantasia fest listing all the films
url = URL('http://fantasiafest.com/2014/en/films-schedule/films')
html = url.download(unicode=True)

# List of links to all the films
element = Element(html)
links = []
for link in element('h4 a'):
    formatted_link = abs(link.attributes.get('href', ''), base=url.redirect or url.string)
    links.append(formatted_link)

# List of durations
element = Element(html)
duration_pat = re.compile(r'[0-9]* min')
durations = []
for e in element('div.info ul'):
    specs = plaintext(e.content)
    duration = int(duration_pat.search(specs).group()[:-4])
    durations.append(duration)
    print duration

# Keep only the films with a duration over 45 minutes
feature_films = [link for (link, duration) in zip(links, durations) if duration > 45]
print feature_films

# Demo for only one film
link = feature_films[165]
html = download(link)
element = Element(html)
title = plaintext(element('h1')[1].content)
synopsis = "\n".join([plaintext(e.content) for e in element('div.synopsis p')])
print title
print synopsis

# Use a loop to get all the links
fantasia2014 = {}
for link in feature_films:
    html = download(link)
    element = Element(html)
    title = plaintext(element('h1')[1].content)
    synopsis = "\n".join([plaintext(e.content) for e in element('div.synopsis p')])
    fantasia2014[title] = synopsis
    #print title

# Fast and dirty: split on whitespace, remove preceding/trailing punctuation
punctuation = u",.;:'()\u201c\u2026\u201d\u2013\u2019\u2014"
splitted_text = fantasia2014["The Zero Theorem"].split()
clean_text = [w.strip(punctuation) for w in splitted_text]
print clean_text

# More sophisticated: using a tokenizer
import nltk
synopsis = fantasia2014["The Zero Theorem"]
tokens = [word for sent in nltk.sent_tokenize(synopsis) for word in nltk.word_tokenize(sent)]
print tokens

# I prefer my fast and dirty way for this corpus, so let's use a loop to apply it
# to all the texts
punctuation = u",.;:'()\u201c\u2026\u201d\u2013\u2019\u2014"
fantasia2014_tokenized = dict()
for title in fantasia2014:
    splitted_text = fantasia2014[title].split()
    fantasia2014_tokenized[title] = [w.strip(punctuation) for w in splitted_text if w.strip(punctuation) != ""]
#print fantasia2014_tokenized["The Zero Theorem"]

# Stemming: uses rules to chop off the ends of words
stemmer = nltk.stem.porter.PorterStemmer()
singular = stemmer.stem("zombie")
plural = stemmer.stem("zombies")
print singular, plural
print (singular == plural)

# Lemmatizing: uses a dictionary
from nltk import WordNetLemmatizer as wnl
singular = wnl().lemmatize("zombie")
plural = wnl().lemmatize("zombies")
print singular, plural
print (singular == plural)

# I like the lemmatization better.
# Let's lemmatize all the texts
fantasia2014_lemma = dict()
for title in fantasia2014_tokenized:
    synopsis = []
    for word in fantasia2014_tokenized[title]:
        lemma = wnl().lemmatize(word.lower())  # lowercasing the text is another normalization
        synopsis.append(lemma)
    fantasia2014_lemma[title] = synopsis
print fantasia2014_lemma["The Zero Theorem"]

# Collocations are frequent bigrams (pairs of words) that often occur together
# Get the collocations
all_texts = []
for title in fantasia2014_lemma:
    all_texts.extend(fantasia2014_lemma[title])
bigrams = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_texts)
scored = finder.score_ngrams(bigrams.likelihood_ratio)
print scored
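# The NLTK-based steps above and below (sent_tokenize, word_tokenize, WordNetLemmatizer,
# the stopword list) rely on data packages that ship separately from NLTK itself.
# If they are missing, a one-time download fixes it; this is a minimal sketch, assuming
# the default nltk_data location.
nltk.download('punkt')      # tokenizer models for sent_tokenize / word_tokenize
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('stopwords')  # stopword lists used in the next step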
# Let's remove the stopwords (a, the, in, into, on ...) and try again
stop = nltk.corpus.stopwords.words('english')  # list of stopwords from NLTK
fantasia2014_stop = dict()
for title in fantasia2014_lemma:
    fantasia2014_stop[title] = [w for w in fantasia2014_lemma[title] if w not in stop]

# This is the same as above, but with the stopwords removed
all_texts = []
for title in fantasia2014_stop:
    all_texts.extend(fantasia2014_stop[title])
bigrams = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_texts)
scored = finder.score_ngrams(bigrams.likelihood_ratio)
print scored

from gensim import corpora, models, similarities

# Put the texts in the right format: lists
titles = []
texts = []
for title in fantasia2014_stop:
    titles.append(title)
    texts.append(fantasia2014_stop[title])

# Remove words that occur only once to reduce the size of the corpus
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

# Build a model
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]       # step 2 -- apply the transformation to the corpus

# What does it look like?
for doc in corpus_tfidf:
    print doc

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
lsi.print_topics(5)
corpus_lsi = lsi[corpus_tfidf]  # apply the LSI transformation to the whole corpus

# What does this look like?
for doc in corpus_lsi:
    print doc

# Which titles can we play with?
#print titles

# Get the index of the film we wish to query
ind = titles.index("The Zero Theorem")

# Transform the film synopsis to LSI space
doc = texts[ind]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print vec_lsi

# Transform the corpus to LSI space and index it IN RAM!
index = similarities.MatrixSimilarity(lsi[corpus])

# Perform a similarity query against the corpus and sort the results
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Print out nicely the 10 most similar films
for i, (document_num, sim) in enumerate(sims):
    # print sorted (document number, similarity score) 2-tuples
    print titles[document_num], str(sim)
    if i >= 9:
        break

# Train a word2vec model with our corpus
model_w2v = models.Word2Vec(texts, min_count=3)

# Query to find the most similar words
model_w2v.most_similar(positive=['horror'], topn=5)

# Query the model with an analogy
this = "light"
is_to = "dark"
what = "angel"
is_to2 = model_w2v.most_similar(positive=[is_to, what], negative=[this], topn=3)
print this + ' is to ' + is_to + ' as ' + what + ' is to : '
print is_to2

# Load the model, downloaded from: https://code.google.com/p/word2vec/
model_GN = models.Word2Vec.load_word2vec_format('/Users/francoiseprovencher/Documents/Word2VecBinaries/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Query to find the most similar words
model_GN.most_similar(positive=['zombie'], topn=5)

# Query the model with an analogy
this = "light"
is_to = "dark"
what = "angel"
is_to2 = model_GN.most_similar(positive=[is_to, what], negative=[this], topn=3)
print this + ' is to ' + is_to + ' as ' + what + ' is to : '
print is_to2
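# The same analogy query is issued twice above, once per model, so it can be factored
# into a small helper. This is only a sketch of my own (the "analogy" function is not
# part of gensim); it reuses nothing but the most_similar() call already shown above.
def analogy(model, this, is_to, what, topn=3):
    # "this" is to "is_to" as "what" is to ... ?
    result = model.most_similar(positive=[is_to, what], negative=[this], topn=topn)
    print this + ' is to ' + is_to + ' as ' + what + ' is to : '
    print result
    return result

analogy(model_w2v, "light", "dark", "angel")
analogy(model_GN, "light", "dark", "angel")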