from IPython.display import HTML
HTML('')

import json

# Load Data (if you don't want to crawl the data): a dict mapping year strings
# ("2011" .. "2014") to lists of abstract strings.
with open('strata_abstracts.json') as f:
    abstracts = json.load(f)

import nltk

bigram_measures = nltk.collocations.BigramAssocMeasures()
stop = nltk.corpus.stopwords.words('english')

# Tokenize the abstracts per year, lowercase them and strip stopwords,
# punctuation and leftover token fragments.
text = {}
words = {}
for year in abstracts:
    raw = " ".join(abstracts[year])
    tokens = nltk.WordPunctTokenizer().tokenize(raw)
    text[year] = nltk.Text(tokens)
    words[year] = [w.lower() for w in text[year]]
    words[year] = [w for w in words[year] if w not in stop]
    words[year] = filter(lambda word: word not in u'%,-:()$\/;?.’–“”', words[year])
    words[year] = [w for w in words[year] if w not in ["ll", "II", "ll", "http", "://", "e", "g", "2", "0"]]

# Peek at one year's Text object
text["2012"]

# NLTK's built-in collocations per year
for year in text:
    print year
    text[year].collocations()
    print

# Corpus size: total and unique tokens per year
numwords = {}
uniwords = {}
for year in text:
    numwords[year] = len(text[year])
    uniwords[year] = len(set(text[year]))
print numwords
print uniwords

import pandas as pd

# Word frequencies per year, merged into one table
freq_table = pd.DataFrame()
for year in words:
    fd = nltk.FreqDist(words[year])
    if len(freq_table) == 0:
        freq_table = pd.DataFrame(fd.items(), columns=["Word", "Freq_" + str(year)])
    else:
        freq_table = freq_table.merge(pd.DataFrame(fd.items(), columns=["Word", "Freq_" + str(year)]))
print freq_table[:10]

# Relative frequencies (percent of all tokens in that year)
for year in numwords:
    freq_table["Perc_" + year] = 100.0 * freq_table["Freq_" + year] / numwords[year]

# Year-over-year growth index for words above a minimum share
for year in ["2012", "2013", "2014"]:
    print year
    freq_table["Growth_" + year] = 100.0 * freq_table["Perc_" + year] / freq_table["Perc_" + str(int(year)-1)]
    tb = freq_table[freq_table['Perc_' + str(year)] >= 0.08].sort(columns="Growth_" + str(year), ascending=False)[["Word", "Freq_" + str(year), "Perc_" + str(year), "Growth_" + str(year)]]
    tb.columns = ["Word", "Freq", "Percent", "Index"]
    tb.Index = tb['Index'].round(1)
    tb.Percent = tb['Percent'].round(4)
    print tb[:10]

from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Most frequent bigrams per year (on the cleaned tokens)
for year in ["2011", "2012", "2013", "2014"]:
    print "Bigrams " + str(year)
    finder = BigramCollocationFinder.from_words(words[year])
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    print pd.DataFrame(scored[:10])

# Most frequent trigrams per year (on the raw tokens)
for year in abstracts:
    print "Trigrams " + str(year)
    finder = TrigramCollocationFinder.from_words(text[year])
    scored = finder.score_ngrams(trigram_measures.raw_freq)
    print pd.DataFrame(scored[:10])

from collections import Counter
import pandas as pd

# Normalized word frequencies per year, for trend comparisons
trending_words = pd.DataFrame()
for year in words:
    fdist = nltk.FreqDist(words[year])
    if len(trending_words) == 0:
        trending_words = pd.DataFrame(fdist.items(), columns=["word", str(year)])
        trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
    else:
        trending_words = trending_words.merge(pd.DataFrame(fdist.items(), columns=["word", str(year)]), how="outer")
        trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
print trending_words[:10]

# Year-over-year ratios of the normalized frequencies
trending_words["plus12"] = trending_words["2012"] / trending_words["2011"]
trending_words["plus13"] = trending_words["2013"] / trending_words["2012"]
trending_words["plus14"] = trending_words["2014"] / trending_words["2013"]
trending_words = trending_words.fillna(0)

# Fastest-growing words per year (with a minimum share in the target year)
print trending_words[(trending_words["2012"] > 0.001) & (trending_words["2011"] > 0)].sort("plus12", ascending=False)[:10]
print
print trending_words[(trending_words["2013"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus13", ascending=False)[:10]
print
print trending_words[(trending_words["2014"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus14", ascending=False)[:10]
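# The next cell ranks bigrams by raw frequency. As a hedged aside (nothing
# here is reused by later cells), NLTK's PMI measure with a minimum-frequency
# filter is another way to surface distinctive word pairs; this sketch only
# assumes the words dict and bigram_measures defined above.
for year in ["2011", "2012", "2013", "2014"]:
    finder = BigramCollocationFinder.from_words(words[year])
    finder.apply_freq_filter(3)   # ignore pairs seen fewer than 3 times
    print year, finder.nbest(bigram_measures.pmi, 10)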
import pandas as pd

# Bigram frequencies per year, filtering stopwords and very short tokens
result = pd.DataFrame()
for year in words:
    finder = BigramCollocationFinder.from_words(words[year], window_size=2)
    #finder.apply_freq_filter(2)
    ignored_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    scores = finder.score_ngrams(bigram_measures.raw_freq)
    if len(result) == 0:
        result = pd.DataFrame(scores, columns=["ngram", str(year)])
    else:
        result = result.merge(pd.DataFrame(scores, columns=["ngram", str(year)]))
print result[:10]

# Year-over-year ratios of the bigram frequencies
result["plus12"] = result["2012"] / result["2011"]
result["plus13"] = result["2013"] / result["2012"]
result["plus14"] = result["2014"] / result["2013"]

# Fastest-growing bigrams per year
print result[result["2014"] > 0.0005].sort("plus14", ascending=False)[:10]
print
print result[result["2013"] > 0.0005].sort("plus13", ascending=False)[:10]
print
print result[result["2012"] > 0.0005].sort("plus12", ascending=False)[:10]

%matplotlib inline
import matplotlib.pyplot as plt

# Plot selected bigram trends over the four conferences
query = [("big", "data"), ("data", "science"), ("real", "time"), ("machine", "learning"), ("social", "media"), ("open", "source")]
query_results = result[result['ngram'].isin(query)][["2011", "2012", "2013", "2014"]].transpose()
query_results.columns = [" ".join(q) for q in query]
print query_results.plot(figsize=(10,5), title="Strata topics")

# Topic model of the abstracts (external script; a rough gensim-based sketch
# of this step appears at the end of this file)
%run lda.py -f strata_abstracts.txt -s --stopwords -k 7

%matplotlib inline
import matplotlib.pyplot as plt

# Plot selected unigram trends: programming languages ...
query = ["hadoop", "yarn", "storm"]   # alternative query, overridden by the next line
query = ["python", "julia", "r", "sas", "stata", "excel"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Programming Languages @ Strata Conferences 2011-2014")

# ... industries ...
query = ["business", "energy", "advertising", "banking", "health", "politics", "government", "finance", "automotive"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")

# ... companies ...
query = ["google", "facebook", "yahoo", "linkedin", "microsoft"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")

# ... and "modern" / "machine" / "learning"
query = ["modern", "machine", "learning"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Topics at Strata Conferences 2011-14")
plt.savefig("Strata_ModernMachineLearning.png")
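# A rough, optional sketch of the LDA step above. The external lda.py script is
# not included here, so this uses gensim's LdaModel instead and treats each
# abstract as one document, which may differ from how lda.py reads
# strata_abstracts.txt. k=7 matches the -k 7 flag above; the preprocessing and
# passes=10 are assumptions, not the original settings.
from gensim import corpora, models

docs = []
for year in abstracts:
    for abstract in abstracts[year]:
        tokens = nltk.WordPunctTokenizer().tokenize(abstract)
        docs.append([w.lower() for w in tokens if w.isalpha() and w.lower() not in stop])

dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = models.LdaModel(corpus, num_topics=7, id2word=dictionary, passes=10)
for topic in lda.show_topics(7, 8):   # 7 topics, 8 top words each
    print topic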