from pattern.web import URL, download, plaintext, Element, abs, Text
import re

# The main page of Fantasia fest listing all the films
url = URL('http://fantasiafest.com/2014/en/films-schedule/films')
html = url.download(unicode=True)

# List of links to all the films
element = Element(html)
links = []
for link in element('h4 a'):
    formatted_link = abs(link.attributes.get('href', ''), base=url.redirect or url.string)
    links.append(formatted_link)

# List of durations
element = Element(html)
duration_pat = re.compile(r'[0-9]* min')
durations = []
for e in element('div.info ul'):
    specs = plaintext(e.content)
    duration = int(duration_pat.search(specs).group()[:-4])
    durations.append(duration)
    print duration

# Keep only the films with a duration over 45 minutes
feature_films = [link for (link, duration) in zip(links, durations) if duration > 45]
print feature_films

# Demo for only one film
link = feature_films[165]
html = download(link)
element = Element(html)
title = plaintext(element('h1')[1].content)
synopsis = "\n".join([plaintext(e.content) for e in element('div.synopsis p')])
print title
print synopsis

# Use a loop to get all the links
fantasia2014 = {}
for link in feature_films:
    html = download(link)
    element = Element(html)
    title = plaintext(element('h1')[1].content)
    synopsis = "\n".join([plaintext(e.content) for e in element('div.synopsis p')])
    fantasia2014[title] = synopsis
    #print title

# Fast and dirty: split on whitespace, remove preceding/trailing punctuation
punctuation = u",.;:'()\u201c\u2026\u201d\u2013\u2019\u2014"
splitted_text = fantasia2014["The Zero Theorem"].split()
clean_text = [w.strip(punctuation) for w in splitted_text]
print clean_text

# More sophisticated: using a tokenizer
import nltk
synopsis = fantasia2014["The Zero Theorem"]
tokens = [word for sent in nltk.sent_tokenize(synopsis) for word in nltk.word_tokenize(sent)]
print tokens

# I prefer my fast and dirty way for this corpus, so let's use a loop to apply it
# to all the texts
punctuation = u",.;:'()\u201c\u2026\u201d\u2013\u2019\u2014"
fantasia2014_tokenized = dict()
for title in fantasia2014:
    splitted_text = fantasia2014[title].split()
    fantasia2014_tokenized[title] = [w.strip(punctuation) for w in splitted_text if w.strip(punctuation) != ""]
#print fantasia2014_tokenized["The Zero Theorem"]

# Stemming: uses rules to chop off the ends of words
stemmer = nltk.stem.porter.PorterStemmer()
singular = stemmer.stem("zombie")
plural = stemmer.stem("zombies")
print singular, plural
print (singular == plural)

# Lemmatizing: uses a dictionary
from nltk import WordNetLemmatizer as wnl
singular = wnl().lemmatize("zombie")
plural = wnl().lemmatize("zombies")
print singular, plural
print (singular == plural)

# I like the lemmatization better.
# Let's lemmatize all the texts
fantasia2014_lemma = dict()
for title in fantasia2014_tokenized:
    synopsis = []
    for word in fantasia2014_tokenized[title]:
        lemma = wnl().lemmatize(word.lower())  # lowercasing the text is another normalization
        synopsis.append(lemma)
    fantasia2014_lemma[title] = synopsis
print fantasia2014_lemma["The Zero Theorem"]

# Collocations are frequent bigrams (pairs of words) that often occur together
# Get the collocations
all_texts = []
for title in fantasia2014_lemma:
    all_texts.extend(fantasia2014_lemma[title])
bigrams = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_texts)
scored = finder.score_ngrams(bigrams.likelihood_ratio)
print scored
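# The NLTK-based steps above and below (sent_tokenize, word_tokenize, WordNetLemmatizer,
# the stopword list) rely on data packages that ship separately from NLTK itself.
# If they are missing, a one-time download fixes it; this is a minimal sketch, assuming
# the default nltk_data location.
nltk.download('punkt')      # tokenizer models for sent_tokenize / word_tokenize
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('stopwords')  # stopword lists used in the next step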
# Let's remove the stopwords (a, the, in, into, on ...) and try again
stop = nltk.corpus.stopwords.words('english')  # list of stopwords from NLTK
fantasia2014_stop = dict()
for title in fantasia2014_lemma:
    fantasia2014_stop[title] = [w for w in fantasia2014_lemma[title] if w not in stop]

# This is the same as above, but with the stopwords removed
all_texts = []
for title in fantasia2014_stop:
    all_texts.extend(fantasia2014_stop[title])
bigrams = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_texts)
scored = finder.score_ngrams(bigrams.likelihood_ratio)
print scored

from gensim import corpora, models, similarities

# Put the texts in the right format: lists
titles = []
texts = []
for title in fantasia2014_stop:
    titles.append(title)
    texts.append(fantasia2014_stop[title])

# Remove words that occur only once to reduce the size of the corpus
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

# Build a model
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]       # step 2 -- apply the transformation to the corpus

# What does it look like?
for doc in corpus_tfidf:
    print doc

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
lsi.print_topics(5)
corpus_lsi = lsi[corpus_tfidf]  # apply the LSI transformation to the whole corpus

# What does this look like?
for doc in corpus_lsi:
    print doc

# Which titles can we play with?
#print titles

# Get the index of the film we wish to query
ind = titles.index("The Zero Theorem")

# Transform the film synopsis to LSI space
doc = texts[ind]
vec_bow = dictionary.doc2bow(doc)
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print vec_lsi

# Transform the corpus to LSI space and index it IN RAM!
index = similarities.MatrixSimilarity(lsi[corpus])

# Perform a similarity query against the corpus and sort the results
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Print out nicely the 10 most similar films
for i, (document_num, sim) in enumerate(sims):
    # print sorted (document number, similarity score) 2-tuples
    print titles[document_num], str(sim)
    if i >= 9:
        break

# Train a word2vec model with our corpus
model_w2v = models.Word2Vec(texts, min_count=3)

# Query to find the most similar words
model_w2v.most_similar(positive=['horror'], topn=5)

# Query the model with an analogy
this = "light"
is_to = "dark"
what = "angel"
is_to2 = model_w2v.most_similar(positive=[is_to, what], negative=[this], topn=3)
print this + ' is to ' + is_to + ' as ' + what + ' is to : '
print is_to2

# Load the model, downloaded from: https://code.google.com/p/word2vec/
model_GN = models.Word2Vec.load_word2vec_format('/Users/francoiseprovencher/Documents/Word2VecBinaries/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Query to find the most similar words
model_GN.most_similar(positive=['zombie'], topn=5)

# Query the model with an analogy
this = "light"
is_to = "dark"
what = "angel"
is_to2 = model_GN.most_similar(positive=[is_to, what], negative=[this], topn=3)
print this + ' is to ' + is_to + ' as ' + what + ' is to : '
print is_to2
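# The same analogy query is issued twice above, once per model, so it can be factored
# into a small helper. This is only a sketch of my own (the "analogy" function is not
# part of gensim); it reuses nothing but the most_similar() call already shown above.
def analogy(model, this, is_to, what, topn=3):
    # "this" is to "is_to" as "what" is to ... ?
    result = model.most_similar(positive=[is_to, what], negative=[this], topn=topn)
    print this + ' is to ' + is_to + ' as ' + what + ' is to : '
    print result
    return result

analogy(model_w2v, "light", "dark", "angel")
analogy(model_GN, "light", "dark", "angel")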