#!/usr/bin/env python
# coding: utf-8

# # Illustrating common terms usage using Wikinews in English

# ## Getting data
#
# We use the CirrusSearch dump of Wikinews (a dump meant for Elasticsearch indexing).

# In[1]:

LANG = "english"

# In[2]:

get_ipython().run_cell_magic('bash', '', '\nfdate=20170327\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\nif [ ! -e $fname ]\nthen\n wget "https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname"\nfi\n')

# In[3]:

# iterator over the titles and texts in the dump
import gzip
import json

FDATE = 20170327
FNAME = "enwikinews-%s-cirrussearch-content.json.gz" % FDATE

def iter_texts(fpath=FNAME):
    with gzip.open(fpath, "rt") as f:
        for l in f:
            data = json.loads(l)
            if "title" in data:
                yield data["title"]
                yield data["text"]

# In[4]:

# also prepare nltk
import nltk
nltk.download("punkt")
nltk.download("stopwords")

# ## Preparing data
#
# We arrange the corpus as required by gensim.

# In[5]:

# make a custom tokenizer
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w[\w-]*|\d[\d,]*')

# In[6]:

# prepare a text: lower-case it, split it into sentences, then tokenize each sentence
def prepare(txt):
    txt = txt.lower()
    return [tokenizer.tokenize(sent) for sent in sent_tokenize(txt, language=LANG)]

# In[7]:

# we keep the whole corpus in RAM; it is small enough
corpus = []
for txt in iter_texts():
    corpus.extend(prepare(txt))

# In[8]:

# how many sentences and words?
words_count = sum(len(s) for s in corpus)
print("Corpus has %d words in %d sentences" % (words_count, len(corpus)))

# ## Testing bigram with and without common terms
#
# The `Phrases` model gives us the possibility of handling common terms, that is, words that appear many times in a text and mostly serve to link other words together.
# While you could simply remove them, you may lose information, since *"the president is in america"* is not the same as *"the president of america"*.
#
# The `common_terms` parameter of `Phrases` can help you deal with them in a smarter way, keeping them around while preventing them from crushing the frequency statistics.

# In[9]:

from gensim.models.phrases import Phrases

# In[10]:

# these are the stop words we will use
from nltk.corpus import stopwords
" ".join(stopwords.words(LANG))

# In[14]:

# a version of the corpus without stop words
stop_words = frozenset(stopwords.words(LANG))

def stopwords_filter(txt):
    return [w for w in txt if w not in stop_words]

st_corpus = [stopwords_filter(txt) for txt in corpus]

# In[15]:

# standard bigram model
get_ipython().run_line_magic('time', 'bigram = Phrases(st_corpus)')
# bigram model with common terms
get_ipython().run_line_magic('time', 'bigram_ct = Phrases(corpus, common_terms=stopwords.words(LANG))')

# ### bigram with common terms inside
#
# What are (some of) the bigrams found thanks to common terms?

# In[16]:

# grams that have more than 2 terms are those that contain common terms
ct_ngrams = set((g[1], g[0].decode("utf-8")) for g in bigram_ct.export_phrases(corpus) if len(g[0].split()) > 2)
ct_ngrams = sorted(list(ct_ngrams))
print(len(ct_ngrams), "grams with common terms found")
# highest scores
ct_ngrams[-20:]

# In[17]:

# did we find bigrams with the same first and last words but different common terms in between?
import collections
by_terms = collections.defaultdict(set)
for ngram, score in bigram_ct.export_phrases(corpus):
    grams = ngram.split()
    by_terms[(grams[0], grams[-1])].add(ngram)
for k, v in by_terms.items():
    if len(v) > 1:
        print(b"-".join(k).decode("utf-8"), " : ", [w.decode("utf-8") for w in v])

# In[ ]:
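
# A minimal sketch, assuming the gensim 3.x `Phrases` API used above (where indexing a
# trained model with a token list returns the tokens with detected phrases joined by "_"):
# apply both trained models to a made-up sample sentence and compare which bigrams,
# if any, they detect on this corpus.
sample = prepare("The president of the United States spoke in New York")[0]
# the standard model was trained on the stop-word-filtered corpus, so filter the sample too
print("standard bigrams:    ", bigram[stopwords_filter(sample)])
# the common-terms model was trained on the raw corpus, so it sees the stop words
print("common-terms bigrams:", bigram_ct[sample])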