CAVEAT: Make sure the page owner allows crawling the content for scientific purposes (check the Terms of Service and robots.txt).
The Strata Conference is one of the most important conferences for all things Big Data, Hadoop, and Data Science.
# Embed the Strata 2014 schedule grid inside the notebook for reference.
from IPython.display import HTML
HTML('<iframe src="http://strataconf.com/strata2014/public/schedule/grid/2014-02-12?schedule=public" width=100% height=350></iframe>')
from bs4 import BeautifulSoup
import urllib2
# List of conference schedule pages, keyed by conference year.
urls = {2011: "http://strataconf.com/strata2011/public/schedule/full",
        2012: "http://strataconf.com/strata2012/public/schedule/full/public",
        2013: "http://strataconf.com/strata2013/public/schedule/full/public",
        2014: "http://strataconf.com/strata2014/public/schedule/full/public"}
# Collect the links to the individual talk-abstract pages, per year.
links = {}
for year in urls:
    # Close the HTTP response explicitly -- urllib2 does not do it for us
    # (the original leaked one open connection per schedule page).
    response = urllib2.urlopen(urls[year])
    try:
        raw = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(raw)
    hrefs = set(a.get("href") for a in soup.find_all("a"))
    # Keep only talk-detail links; store them host-relative.
    links[year] = [h.replace("http://strataconf.com", "")
                   for h in hrefs
                   if h is not None and '/detail' in h]
# Download every talk page and collect the abstract text, grouped by year.
abstracts = {}
for year in links:
    for link in links[year]:
        response = urllib2.urlopen("http://www.strataconf.com" + link)
        try:
            raw = response.read()
        finally:
            response.close()
        soup = BeautifulSoup(raw)
        desc = soup.find("div", class_="en_session_description description")
        # Some talk pages carry no description div; skip them instead of
        # crashing on desc.get_text() (the original raised AttributeError).
        if desc is not None:
            abstracts.setdefault(year, []).append(desc.get_text())
import json
# Load previously crawled abstracts from disk (skips the crawling step).
with open('strata_abstracts.json') as infile:
    abstracts = json.load(infile)
import nltk
bigram_measures = nltk.collocations.BigramAssocMeasures()
stop = nltk.corpus.stopwords.words('english')
# Tokenize each year's abstracts and build lowercased, stopword- and
# punctuation-free word lists for the frequency analyses below.
tokenizer = nltk.WordPunctTokenizer()  # hoisted: loop-invariant
# Crawl/markup leftovers to drop. The original list also contained a
# duplicate "ll" and an uppercase "II" that could never match the
# already-lowercased tokens -- both removed; behavior is unchanged.
noise = set(["ll", "http", "://", "e", "g", "2", "0"])
text = {}
words = {}
for year in abstracts:
    raw = " ".join(abstracts[year])
    tokens = tokenizer.tokenize(raw)
    text[year] = nltk.Text(tokens)
    cleaned = [w.lower() for w in text[year]]
    cleaned = [w for w in cleaned if w not in stop]
    # Substring test against the punctuation string: removes the
    # single-character punctuation tokens produced by WordPunctTokenizer.
    cleaned = filter(lambda word: word not in u'%,-:()$\/;?.’–“”', cleaned)
    words[year] = [w for w in cleaned if w not in noise]
# Inspect the nltk.Text object for 2012 (notebook repr output below).
text["2012"]
<Text: Organizations today are generating data at an ever...>
for year in text:
print year
text[year].collocations()
print
2014 Building collocations list big data; machine learning; Big Data; open source; data science; case studies; data scientists; best practices; http ://; Machine Learning; every day; Energy Project; time series; use cases; data center; Apache Hadoop; Industrial Internet; Clean Energy; take advantage; software engineering 2011 Building collocations list Big Data; Executive Summit; open source; big data; machine learning; data science; Riak Core; Bob Page; time series; witch doctors; data sets; reserved table; best practices; Apache Mahout; Science Fair; dark underbelly; darkly humorous; http ://; lays bare; litmus tests 2013 Building collocations list big data; Big Data; open source; machine learning; use cases; http ://; Data Science; Rest Devices; social media; data sets; Strata Conference; relational database; Expo Hall; Stitch Fix; https ://; lessons learned; data science; command line; data collection; :// www 2012 Building collocations list Big Data; big data; open source; machine learning; data sets; http ://; Alistair Croll; social media; Climate Corporation; use cases; variable importance; Tableau Public; social contagion; Apache Hadoop; supply chain; Avinash Kaushik; Opening remarks; case study; real world; natural language
numwords = {}
uniwords = {}
for year in text:
numwords[year] = len(text[year])
uniwords[year] = len(set(text[year]))
print numwords
print uniwords
{u'2014': 24326, u'2011': 19869, u'2013': 24963, u'2012': 30902} {u'2014': 4149, u'2011': 3860, u'2013': 4436, u'2012': 4895}
import pandas as pd
freq_table = pd.DataFrame()
for year in words:
fd = nltk.FreqDist(words[year])
if (len(freq_table) == 0):
freq_table = pd.DataFrame(fd.items(), columns=["Word", "Freq_" + str(year)])
else:
freq_table = freq_table.merge(pd.DataFrame(fd.items(), columns=["Word", "Freq_" + str(year)]))
print freq_table[:10]
Word Freq_2014 Freq_2011 Freq_2013 Freq_2012 0 data 555 448 568 723 1 hadoop 101 21 95 102 2 big 92 58 122 144 3 time 68 51 63 72 4 real 64 42 59 59 5 analytics 62 56 60 52 6 new 61 61 56 75 7 talk 58 32 50 49 8 using 57 24 53 61 9 use 56 44 68 79
for year in numwords:
freq_table["Perc_" + year] = 100.0 * freq_table["Freq_" + year] / numwords[year]
for year in ["2012", "2013", "2014"]:
print year
freq_table["Growth_" + year] = 100.0 * freq_table["Perc_" + year] / freq_table["Perc_" + str(int(year)-1)]
tb = freq_table[freq_table['Perc_' + str(year)] >= 0.08].sort(columns="Growth_" + str(year), ascending=False)[["Word", "Freq_" + str(year), "Perc_" + str(year), "Growth_" + str(year)]]
tb.columns = ["Word", "Freq", "Percent", "Index"]
tb.Index = tb['Index'].round(1)
tb.Percent = tb['Percent'].round(4)
print tb[:10]
2012 Word Freq Percent Index 84 models 39 0.1262 417.9 1 hadoop 102 0.3301 312.3 100 value 29 0.0938 310.8 135 experience 29 0.0938 310.8 130 social 52 0.1683 278.6 169 simple 28 0.0906 257.2 388 r 30 0.0971 241.1 342 support 25 0.0809 229.6 73 problems 25 0.0809 229.6 24 platform 33 0.1068 212.2 2013 Word Freq Percent Index 584 engine 20 0.0801 2475.8 317 google 28 0.1122 693.2 85 queries 20 0.0801 275.1 54 human 21 0.0841 260.0 216 strata 20 0.0801 247.6 41 science 31 0.1242 225.7 11 scale 41 0.1642 181.3 138 hive 25 0.1001 162.9 134 database 25 0.1001 162.9 112 two 22 0.0881 160.2 2014 Word Freq Percent Index 60 components 23 0.0945 393.4 38 cluster 30 0.1233 342.1 33 building 31 0.1274 289.2 23 processing 37 0.1521 253.1 61 graph 23 0.0945 214.6 76 organizations 20 0.0822 205.2 39 high 30 0.1233 205.2 56 storage 25 0.1028 197.3 77 project 20 0.0822 186.6 53 build 25 0.1028 183.2
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
for year in ["2011", "2012", "2013", "2014"]:
print "Bigrams " + str(year)
finder = BigramCollocationFinder.from_words(words[year])
scored = finder.score_ngrams(bigram_measures.raw_freq)
print pd.DataFrame(scored[:10])
Bigrams 2011 0 1 0 (big, data) 0.004392 1 (data, science) 0.002100 2 (real, time) 0.001909 3 (open, source) 0.001528 4 (data, driven) 0.001241 5 (executive, summit) 0.001146 6 (machine, learning) 0.001146 7 (data, sets) 0.001050 8 (open, data) 0.001050 9 (real, world) 0.001050 Bigrams 2012 0 1 0 (big, data) 0.007987 1 (data, driven) 0.001474 2 (real, time) 0.001413 3 (data, sets) 0.001352 4 (real, world) 0.001229 5 (session, sponsored) 0.001167 6 (machine, learning) 0.001044 7 (open, source) 0.001044 8 (data, analysis) 0.000983 9 (data, analytics) 0.000922 Bigrams 2013 0 1 0 (big, data) 0.008412 1 (real, time) 0.002577 2 (data, science) 0.001970 3 (open, source) 0.001591 4 (machine, learning) 0.001516 5 (session, sponsored) 0.001516 6 (use, cases) 0.001212 7 (data, scientists) 0.000985 8 (data, sets) 0.000985 9 (real, world) 0.000985 Bigrams 2014 0 1 0 (big, data) 0.006557 1 (machine, learning) 0.002931 2 (real, time) 0.002854 3 (data, science) 0.001928 4 (open, source) 0.001851 5 (session, sponsored) 0.001697 6 (real, world) 0.001388 7 (data, scientists) 0.001311 8 (data, analysis) 0.001080 9 (data, analytics) 0.001003
for year in abstracts:
print "Trigrams " + str(year)
finder = TrigramCollocationFinder.from_words(text[year])
scored = finder.score_ngrams(trigram_measures.raw_freq)
print pd.DataFrame(scored[:10])
Trigrams 2014 0 1 0 (., In, this) 0.002097 1 (., This, session) 0.001151 2 (In, this, talk) 0.001151 3 (is, sponsored, by) 0.001069 4 (real, -, time) 0.001069 5 (., We, will) 0.001028 6 (This, session, is) 0.000904 7 (session, is, sponsored) 0.000904 8 (We, ’, ll) 0.000863 9 (this, talk, ,) 0.000781 Trigrams 2011 0 1 0 (., In, this) 0.001107 1 (real, -, time) 0.000755 2 (the, Executive, Summit) 0.000604 3 (we, ’, ll) 0.000604 4 (don, ’, t) 0.000554 5 (it, ’, s) 0.000554 6 (., This, session) 0.000503 7 (part, of, the) 0.000503 8 (,, we, ’) 0.000453 9 (., It, ’) 0.000453 Trigrams 2013 0 1 0 (., This, session) 0.001202 1 (We, ’, ll) 0.001162 2 (., In, this) 0.001082 3 (is, sponsored, by) 0.001001 4 (., We, ’) 0.000961 5 (real, -, time) 0.000961 6 (., We, will) 0.000881 7 (This, session, is) 0.000841 8 (session, is, sponsored) 0.000801 9 (., This, talk) 0.000601 Trigrams 2012 0 1 0 (., This, session) 0.001003 1 (., In, this) 0.000938 2 (We, ’, ll) 0.000938 3 (., We, ’) 0.000809 4 (., We, will) 0.000777 5 (is, sponsored, by) 0.000680 6 (some, of, the) 0.000680 7 (This, session, is) 0.000647 8 (it, ’, s) 0.000647 9 (real, -, time) 0.000615
from collections import Counter
import pandas as pd
trending_words = pd.DataFrame()
for year in words:
fdist = nltk.FreqDist(words[year])
if len(trending_words) == 0:
trending_words = pd.DataFrame(fdist.items(), columns=["word", str(year)])
trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
else:
trending_words = trending_words.merge(pd.DataFrame(fdist.items(), columns=["word", str(year)]), how="outer")
trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
print trending_words[:10]
word 2014 2011 2013 2012 0 data 0.042811 0.042773 0.043043 0.044419 1 hadoop 0.007791 0.002005 0.007199 0.006267 2 big 0.007097 0.005538 0.009245 0.008847 3 time 0.005245 0.004869 0.004774 0.004423 4 real 0.004937 0.004010 0.004471 0.003625 5 analytics 0.004782 0.005347 0.004547 0.003195 6 new 0.004705 0.005824 0.004244 0.004608 7 talk 0.004474 0.003055 0.003789 0.003010 8 using 0.004397 0.002291 0.004016 0.003748 9 use 0.004320 0.004201 0.005153 0.004853
trending_words["plus12"] = trending_words["2012"] / trending_words["2011"]
trending_words["plus13"] = trending_words["2013"] / trending_words["2012"]
trending_words["plus14"] = trending_words["2014"] / trending_words["2013"]
trending_words = trending_words.fillna(0)
print trending_words[(trending_words["2012"] > 0.001) & (trending_words["2011"] > 0)].sort("plus12", ascending=False)[:10]
print
print trending_words[(trending_words["2013"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus13", ascending=False)[:10]
print
print trending_words[(trending_words["2014"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus14", ascending=False)[:10]
word 2014 2011 2013 2012 plus12 plus13 \ 1729 variable 0.000154 0.000095 0.000076 0.001044 10.939239 0.072558 52 sponsored 0.002006 0.000191 0.002122 0.001413 7.400074 1.501628 142 hive 0.001080 0.000191 0.001895 0.001167 6.113104 1.623000 354 solution 0.000617 0.000191 0.001364 0.001167 6.113104 1.168560 86 models 0.001466 0.000573 0.000758 0.002396 4.182650 0.316277 604 nosql 0.000386 0.000382 0.000985 0.001474 3.860908 0.668135 299 set 0.000694 0.000382 0.000758 0.001413 3.700037 0.536296 39 cluster 0.002314 0.000382 0.000682 0.001352 3.539166 0.504605 1 hadoop 0.007791 0.002005 0.007199 0.006267 3.125497 1.148829 139 experience 0.001080 0.000573 0.001516 0.001782 3.110176 0.850676 plus14 1729 2.035791 52 0.945189 142 0.570022 354 0.452398 86 1.934002 604 0.391498 299 0.916106 39 3.392986 1 1.082184 139 0.712527 word 2014 2011 2013 2012 plus12 \ 701 engine 0.000309 0.000668 0.001516 0.000061 0.091926 161 energy 0.001003 0.000095 0.001364 0.000061 0.643485 334 languages 0.000617 0.000191 0.001061 0.000061 0.321742 1391 fraud 0.000154 0.000095 0.000834 0.000061 0.643485 679 computations 0.000309 0.000095 0.000834 0.000061 0.643485 930 efficiency 0.000231 0.000382 0.000758 0.000061 0.160871 3648 openstack 0.000000 0.001241 0.000682 0.000061 0.049499 1227 centric 0.000154 0.000095 0.000606 0.000061 0.643485 3700 forecasting 0.000000 0.000382 0.001137 0.000123 0.321742 4691 location 0.000000 0.000095 0.001061 0.000123 1.286969 plus13 plus14 701 24.669597 0.203579 161 22.202637 0.735147 334 17.268718 0.581655 1391 13.568278 0.185072 679 13.568278 0.370144 930 12.334798 0.305369 3648 11.101319 0.000000 1227 9.867839 0.254474 3700 9.251099 0.000000 4691 8.634359 0.000000 word 2014 2011 2013 2012 plus12 \ 181 deployment 0.000926 0.000191 0.000076 0.000307 1.608712 209 crowd 0.000849 0.000095 0.000076 0.000184 1.930454 238 highly 0.000771 0.000764 0.000076 0.000553 0.723920 239 humans 0.000771 0.000095 0.000076 0.000184 1.930454 267 clean 0.000694 0.000095 0.000076 
0.000000 0.000000 101 traffic 0.001311 0.000191 0.000152 0.000123 0.643485 317 computational 0.000617 0.000573 0.000076 0.000184 0.321742 437 workflows 0.000540 0.000095 0.000076 0.000123 1.286969 418 reports 0.000540 0.000095 0.000076 0.000000 0.000000 407 options 0.000540 0.000095 0.000076 0.000553 5.791362 plus13 plus14 181 0.246696 12.214749 209 0.411160 11.196853 238 0.137053 10.178957 239 0.411160 10.178957 267 0.000000 9.161061 101 1.233480 8.652114 317 0.411160 8.143166 437 0.616740 7.125270 418 0.000000 7.125270 407 0.137053 7.125270
import pandas as pd
result = pd.DataFrame()
for year in words:
finder = BigramCollocationFinder.from_words(words[year], window_size = 2)
#finder.apply_freq_filter(2)
ignored_words = nltk.corpus.stopwords.words('english')
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
scores = finder.score_ngrams(bigram_measures.raw_freq)
if len(result) == 0:
result = pd.DataFrame(scores, columns=["ngram", str(year)])
else:
result = result.merge(pd.DataFrame(scores, columns=["ngram", str(year)]))
print result[:10]
ngram 2014 2011 2013 2012 0 (big, data) 0.006557 0.004392 0.008412 0.007987 1 (machine, learning) 0.002931 0.001146 0.001516 0.001044 2 (real, time) 0.002854 0.001909 0.002577 0.001413 3 (data, science) 0.001928 0.002100 0.001970 0.000737 4 (open, source) 0.001851 0.001528 0.001591 0.001044 5 (real, world) 0.001388 0.001050 0.000985 0.001229 6 (data, scientists) 0.001311 0.000764 0.000985 0.000737 7 (data, analysis) 0.001080 0.000191 0.000834 0.000983 8 (data, analytics) 0.001003 0.000955 0.000758 0.000922 9 (large, scale) 0.000926 0.000191 0.000758 0.000307
result["plus12"] = result["2012"] / result["2011"]
result["plus13"] = result["2013"] / result["2012"]
result["plus14"] = result["2014"] / result["2013"]
print result[result["2014"] > 0.0005].sort("plus14", ascending=False)[:10]
print
print result[result["2013"] > 0.0005].sort("plus13", ascending=False)[:10]
print
print result[result["2012"] > 0.0005].sort("plus12", ascending=False)[:10]
ngram 2014 2011 2013 2012 plus12 \ 16 (data, solutions) 0.000540 0.000095 0.000076 0.000061 0.643485 11 (time, series) 0.000771 0.000859 0.000152 0.000369 0.428990 19 (enterprise, data) 0.000540 0.000191 0.000152 0.000061 0.321742 21 (world, examples) 0.000540 0.000286 0.000152 0.000246 0.857980 14 (case, studies) 0.000617 0.000191 0.000227 0.000307 1.608712 17 (data, sources) 0.000540 0.000191 0.000227 0.000737 3.860908 1 (machine, learning) 0.002931 0.001146 0.001516 0.001044 0.911603 18 (decision, making) 0.000540 0.000764 0.000303 0.000246 0.321742 13 (data, driven) 0.000694 0.001241 0.000455 0.001474 1.187972 20 (new, data) 0.000540 0.000095 0.000379 0.000369 3.860908 plus13 plus14 16 1.233480 7.125270 11 0.411160 5.089479 19 2.466960 3.562635 21 0.616740 3.562635 14 0.740088 2.714389 17 0.308370 2.375090 1 1.451153 1.934002 18 1.233480 1.781317 13 0.308370 1.526844 20 1.027900 1.425054 ngram 2014 2011 2013 2012 plus12 \ 157 (relational, database) 0.000077 0.000191 0.000530 0.000123 0.643485 12 (best, practices) 0.000694 0.000573 0.000530 0.000184 0.321742 3 (data, science) 0.001928 0.002100 0.001970 0.000737 0.350992 9 (large, scale) 0.000926 0.000191 0.000758 0.000307 1.608712 2 (real, time) 0.002854 0.001909 0.002577 0.001413 0.740007 52 (open, data) 0.000231 0.001050 0.000758 0.000430 0.409490 36 (data, collection) 0.000309 0.000382 0.000909 0.000553 1.447841 4 (open, source) 0.001851 0.001528 0.001591 0.001044 0.683702 15 (use, cases) 0.000617 0.000191 0.001212 0.000799 4.182650 1 (machine, learning) 0.002931 0.001146 0.001516 0.001044 0.911603 plus13 plus14 157 4.317179 0.145414 12 2.878120 1.308723 3 2.672540 0.978746 9 2.466960 1.221475 2 1.823405 1.107710 52 1.762114 0.305369 36 1.644640 0.339299 4 1.523710 1.163309 15 1.518129 0.508948 1 1.451153 1.934002 ngram 2014 2011 2013 2012 plus12 \ 10 (apache, hadoop) 0.000849 0.000095 0.000606 0.000614 6.434847 130 (data, set) 0.000077 0.000095 0.000227 0.000614 6.434847 7 (data, analysis) 0.001080 0.000191 
0.000834 0.000983 5.147877 15 (use, cases) 0.000617 0.000191 0.001212 0.000799 4.182650 17 (data, sources) 0.000540 0.000191 0.000227 0.000737 3.860908 91 (social, media) 0.000154 0.000286 0.000606 0.000799 2.788434 0 (big, data) 0.006557 0.004392 0.008412 0.007987 1.818544 29 (data, visualization) 0.000386 0.000477 0.000455 0.000737 1.544363 36 (data, collection) 0.000309 0.000382 0.000909 0.000553 1.447841 28 (data, sets) 0.000386 0.001050 0.000985 0.001352 1.286969 plus13 plus14 10 0.986784 1.399607 130 0.370044 0.339299 7 0.848017 1.295504 15 1.518129 0.508948 17 0.308370 2.375090 91 0.759065 0.254474 0 1.053202 0.779470 29 0.616740 0.848246 36 1.644640 0.339299 28 0.728874 0.391498
%matplotlib inline
import matplotlib.pyplot as plt
query = [("big", "data"), ("data", "science"), ("real", "time"), ("machine", "learning"), ("social", "media"), ("open", "source")]
query_results = result[result['ngram'].isin(query)][["2011", "2012", "2013", "2014"]].transpose()
query_results.columns = [" ".join(q) for q in query]
print query_results.plot(figsize=(10,5), title="Strata topics")
Axes(0.125,0.125;0.775x0.775)
Latent Dirichlet Allocation (LDA), using the implementation from https://github.com/shuyo/iir/blob/master/lda/lda.py
# Run Shuyo's LDA script over the abstracts; expects a local
# strata_abstracts.txt (the traceback below shows it was missing).
%run lda.py -f strata_abstracts.txt -s --stopwords -k 7
--------------------------------------------------------------------------- IOError Traceback (most recent call last) C:\Users\Koehler\Documents\IPython Notebooks\PyData_Berlin2014\lda.py in <module>() 145 146 if __name__ == "__main__": --> 147 main() C:\Users\Koehler\Documents\IPython Notebooks\PyData_Berlin2014\lda.py in main() 126 127 if options.filename: --> 128 corpus = vocabulary.load_file(options.filename) 129 else: 130 corpus = vocabulary.load_corpus(options.corpus) C:\Users\Koehler\Documents\IPython Notebooks\PyData_Berlin2014\vocabulary.py in load_file(filename) 17 def load_file(filename): 18 corpus = [] ---> 19 f = open(filename, 'r') 20 for line in f: 21 doc = re.findall(r'\w+(?:\'\w+)?',line) IOError: [Errno 2] No such file or directory: 'strata_abstracts.txt'
%matplotlib inline
import matplotlib.pyplot as plt
query = ["hadoop", "yarn", "storm"]
query = ["python", "julia", "r", "sas", "stata", "excel"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Programming Langugages @ Strata Conferences 2011-2014")
query = ["business", "energy", "advertising", "banking", "health", "politics", "government", "finance", "automotive"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")
Axes(0.125,0.125;0.775x0.775) Axes(0.125,0.125;0.775x0.775)
query = ["google", "facebook", "yahoo", "linkedin", "microsoft"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")
Axes(0.125,0.125;0.775x0.775)
query = ["modern", "machine", "learning"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Topics at Strata Conferences 2011-14")
plt.savefig("Strata_ModernMachineLearning.png")
Axes(0.125,0.125;0.775x0.775)