#!/usr/bin/env python
# coding: utf-8

# # Illustrating common terms usage using Wikinews in English

# ## Getting data
#
# We use the CirrusSearch dump of Wikinews (a dump meant for Elasticsearch indexing).

# In[1]:

LANG = "english"

# In[2]:

get_ipython().run_cell_magic('bash', '', '\nfdate=20170327\nfname=enwikinews-$fdate-cirrussearch-content.json.gz\nif [ ! -e $fname ]\nthen\n wget "https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname"\nfi\n')

# In[3]:

# iterator over the titles and texts in the dump
import gzip
import json

FDATE = 20170327
FNAME = "enwikinews-%s-cirrussearch-content.json.gz" % FDATE

def iter_texts(fpath=FNAME):
    with gzip.open(fpath, "rt") as f:
        for l in f:
            data = json.loads(l)
            if "title" in data:
                yield data["title"]
                yield data["text"]

# In[4]:

# also prepare nltk
import nltk
nltk.download("punkt")
nltk.download("stopwords")

# ## Preparing data
#
# We arrange the corpus as required by gensim.

# In[5]:

# make a custom tokenizer
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w[\w-]*|\d[\d,]*')

# In[6]:

# prepare a text: lower-case it, split it into sentences, then tokenize each sentence
def prepare(txt):
    txt = txt.lower()
    return [tokenizer.tokenize(sent) for sent in sent_tokenize(txt, language=LANG)]

# In[7]:

# we keep the whole corpus in RAM; it is small enough
corpus = []
for txt in iter_texts():
    corpus.extend(prepare(txt))

# In[8]:

# how many sentences and words?
words_count = sum(len(s) for s in corpus)
print("Corpus has %d words in %d sentences" % (words_count, len(corpus)))

# ## Testing bigram with and without common terms
#
# The `Phrases` model gives us the possibility of handling common terms, that is, words that appear many times in a text and mostly serve to link other words together.
# While you could simply remove them, you may lose information, since *"the president is in america"* is not the same as *"the president of america"*.
#
# The `common_terms` parameter of `Phrases` can help you deal with them in a smarter way, keeping them around while preventing them from crushing the frequency statistics.

# In[9]:

from gensim.models.phrases import Phrases

# In[10]:

# these are the stop words we will use
from nltk.corpus import stopwords
" ".join(stopwords.words(LANG))

# In[14]:

# a version of the corpus without stop words
stop_words = frozenset(stopwords.words(LANG))

def stopwords_filter(txt):
    return [w for w in txt if w not in stop_words]

st_corpus = [stopwords_filter(txt) for txt in corpus]

# In[15]:

# standard bigram model
get_ipython().run_line_magic('time', 'bigram = Phrases(st_corpus)')
# bigram model with common terms
get_ipython().run_line_magic('time', 'bigram_ct = Phrases(corpus, common_terms=stopwords.words(LANG))')

# ### bigram with common terms inside
#
# What are (some of) the bigrams found thanks to common terms?

# In[16]:

# grams that have more than 2 terms are those that contain common terms
ct_ngrams = set((g[1], g[0].decode("utf-8")) for g in bigram_ct.export_phrases(corpus) if len(g[0].split()) > 2)
ct_ngrams = sorted(list(ct_ngrams))
print(len(ct_ngrams), "grams with common terms found")
# highest scores
ct_ngrams[-20:]

# In[17]:

# did we find bigrams with the same first and last words but different common terms in between?
import collections
by_terms = collections.defaultdict(set)
for ngram, score in bigram_ct.export_phrases(corpus):
    grams = ngram.split()
    by_terms[(grams[0], grams[-1])].add(ngram)
for k, v in by_terms.items():
    if len(v) > 1:
        print(b"-".join(k).decode("utf-8"), " : ", [w.decode("utf-8") for w in v])

# In[ ]:
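
# A minimal sketch, assuming the gensim 3.x `Phrases` API used above (where indexing a
# trained model with a token list returns the tokens with detected phrases joined by "_"):
# apply both trained models to a made-up sample sentence and compare which bigrams,
# if any, they detect on this corpus.
sample = prepare("The president of the United States spoke in New York")[0]
# the standard model was trained on the stop-word-filtered corpus, so filter the sample too
print("standard bigrams:    ", bigram[stopwords_filter(sample)])
# the common-terms model was trained on the raw corpus, so it sees the stop words
print("common-terms bigrams:", bigram_ct[sample])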