Example 3) Mining Conference Websites

Idea:

  • Find out which topics are really hot
  • Identify sessions you just have to attend
  • Put the conference in context
  • Identify trends

CAVEAT: Make sure the page owner allows crawling the content for scientific purposes (Terms of Service, robots.txt); a quick check is sketched below.
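A minimal pre-flight sketch, assuming the Python 2 standard-library robotparser module and an illustrative schedule URL:

import robotparser

# Ask robots.txt before fetching a schedule page
rp = robotparser.RobotFileParser()
rp.set_url("http://strataconf.com/robots.txt")
rp.read()

url = "http://strataconf.com/strata2014/public/schedule/full/public"
if rp.can_fetch("*", url):
    print "Crawling allowed for", url
else:
    print "Disallowed by robots.txt, skipping", url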

The Strata Conference is one of the most important conferences for all things Big Data, Hadoop, and Data Science.

In [5]:
from IPython.display import HTML
HTML('<iframe src="http://strataconf.com/strata2014/public/schedule/grid/2014-02-12?schedule=public" width=100% height=350></iframe>')
Out[5]:

Crawl the conference pages to collect the talk abstracts

from bs4 import BeautifulSoup
import urllib2

# List of conference schedule pages
urls = {2011 : "http://strataconf.com/strata2011/public/schedule/full",
        2012 : "http://strataconf.com/strata2012/public/schedule/full/public", 
        2013 : "http://strataconf.com/strata2013/public/schedule/full/public", 
        2014 : "http://strataconf.com/strata2014/public/schedule/full/public"}

links = {}

# Collecting the links to the talk abstracts
for u in urls:
    raw = urllib2.urlopen(urls[u]).read()
    soup = BeautifulSoup(raw)
    # Deduplicate hrefs, drop empty ones, keep only talk detail pages, and strip the host
    yearlinks = set([l.get("href") for l in soup.find_all("a")])
    yearlinks = [l for l in yearlinks if l is not None]
    yearlinks = [l for l in yearlinks if '/detail' in l]
    yearlinks = [l.replace("http://strataconf.com", "") for l in yearlinks]
    links[u] = yearlinks
abstracts = {}

# Fetch each talk detail page and extract the session description
for year in links:
    for l in links[year]:
        raw = urllib2.urlopen("http://www.strataconf.com" + l).read()
        soup = BeautifulSoup(raw)
        desc = soup.find("div", class_="en_session_description description")
        if year in abstracts:
            abstracts[year].append(desc.get_text())
        else:
            abstracts[year] = [desc.get_text()]
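
To avoid re-crawling, the result can be cached in the JSON file that the next cell loads; a small sketch, assuming the same strata_abstracts.json filename:

import json

# Cache the crawled abstracts for offline analysis
with open('strata_abstracts.json', 'w') as f:
    json.dump(abstracts, f)

Note that json.dump turns the integer year keys into strings, which is why the cells below index the dictionaries with "2012" rather than 2012.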
In [3]:
import json
# Load Data (if you don't want to crawl the data)
with open('strata_abstracts.json') as f:
    abstracts = json.load(f)
In [14]:
import nltk
bigram_measures = nltk.collocations.BigramAssocMeasures()
stop = nltk.corpus.stopwords.words('english')

text = {}
words = {}

for year in abstracts:
    raw = " ".join(abstracts[year])
    tokens = nltk.WordPunctTokenizer().tokenize(raw)
    text[year] = nltk.Text(tokens)
    words[year] = [w.lower() for w in text[year]]
    words[year] = [w for w in words[year] if w not in stop]
    # Drop stray punctuation tokens and other crawl artifacts
    words[year] = [w for w in words[year] if w not in u'%,-:()$\/;?.’–“”']
    words[year] = [w for w in words[year] if w not in ["ll", "http", "://", "e", "g", "2", "0"]]
In [15]:
text["2012"]
Out[15]:
<Text: Organizations today are generating data at an ever...>
In [16]:
for year in text:
    print year
    text[year].collocations()
    print
2014
Building collocations list
big data; machine learning; Big Data; open source; data science; case
studies; data scientists; best practices; http ://; Machine Learning;
every day; Energy Project; time series; use cases; data center; Apache
Hadoop; Industrial Internet; Clean Energy; take advantage; software
engineering

2011
Building collocations list
Big Data; Executive Summit; open source; big data; machine learning;
data science; Riak Core; Bob Page; time series; witch doctors; data
sets; reserved table; best practices; Apache Mahout; Science Fair;
dark underbelly; darkly humorous; http ://; lays bare; litmus tests

2013
Building collocations list
big data; Big Data; open source; machine learning; use cases; http
://; Data Science; Rest Devices; social media; data sets; Strata
Conference; relational database; Expo Hall; Stitch Fix; https ://;
lessons learned; data science; command line; data collection; :// www

2012
Building collocations list
Big Data; big data; open source; machine learning; data sets; http
://; Alistair Croll; social media; Climate Corporation; use cases;
variable importance; Tableau Public; social contagion; Apache Hadoop;
supply chain; Avinash Kaushik; Opening remarks; case study; real
world; natural language

In [17]:
numwords = {}
uniwords = {}

for year in text:
    numwords[year] = len(text[year])
    uniwords[year] = len(set(text[year]))

print numwords
print uniwords
{u'2014': 24326, u'2011': 19869, u'2013': 24963, u'2012': 30902}
{u'2014': 4149, u'2011': 3860, u'2013': 4436, u'2012': 4895}
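As a rough sanity check (not part of the original cells), the ratio of unique to total words can be derived directly from these two dictionaries:

# Type-token ratio per year; higher means a more varied vocabulary
for year in sorted(numwords):
    print year, round(float(uniwords[year]) / numwords[year], 3)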
In [19]:
import pandas as pd

freq_table = pd.DataFrame()

for year in words:
    fd = nltk.FreqDist(words[year])
    if (len(freq_table) == 0):
        freq_table = pd.DataFrame(fd.items(), columns=["Word", "Freq_" + str(year)])
    else:
        # Default inner join: keeps only words that occur in every year
        freq_table = freq_table.merge(pd.DataFrame(fd.items(), columns=["Word", "Freq_" + str(year)]))

print freq_table[:10]
        Word  Freq_2014  Freq_2011  Freq_2013  Freq_2012
0       data        555        448        568        723
1     hadoop        101         21         95        102
2        big         92         58        122        144
3       time         68         51         63         72
4       real         64         42         59         59
5  analytics         62         56         60         52
6        new         61         61         56         75
7       talk         58         32         50         49
8      using         57         24         53         61
9        use         56         44         68         79
In [26]:
# Relative word frequency per year, as a percentage of all tokens
for year in numwords:
    freq_table["Perc_" + year] = 100.0 * freq_table["Freq_" + year] / numwords[year]

# Growth index: 100 = same share as the previous year, 200 = doubled
for year in ["2012", "2013", "2014"]:
    print year
    freq_table["Growth_" + year] = 100.0 * freq_table["Perc_" + year] / freq_table["Perc_" + str(int(year)-1)]
    tb = freq_table[freq_table['Perc_' + str(year)] >= 0.08].sort(columns="Growth_" + str(year), ascending=False)[["Word", "Freq_" + str(year), "Perc_" + str(year), "Growth_" + str(year)]]
    tb.columns = ["Word", "Freq", "Percent", "Index"]
    tb.Index = tb['Index'].round(1)
    tb.Percent = tb['Percent'].round(4)
    print tb[:10]
2012
           Word  Freq  Percent  Index
84       models    39   0.1262  417.9
1        hadoop   102   0.3301  312.3
100       value    29   0.0938  310.8
135  experience    29   0.0938  310.8
130      social    52   0.1683  278.6
169      simple    28   0.0906  257.2
388           r    30   0.0971  241.1
342     support    25   0.0809  229.6
73     problems    25   0.0809  229.6
24     platform    33   0.1068  212.2
2013
         Word  Freq  Percent   Index
584    engine    20   0.0801  2475.8
317    google    28   0.1122   693.2
85    queries    20   0.0801   275.1
54      human    21   0.0841   260.0
216    strata    20   0.0801   247.6
41    science    31   0.1242   225.7
11      scale    41   0.1642   181.3
138      hive    25   0.1001   162.9
134  database    25   0.1001   162.9
112       two    22   0.0881   160.2
2014
             Word  Freq  Percent  Index
60     components    23   0.0945  393.4
38        cluster    30   0.1233  342.1
33       building    31   0.1274  289.2
23     processing    37   0.1521  253.1
61          graph    23   0.0945  214.6
76  organizations    20   0.0822  205.2
39           high    30   0.1233  205.2
56        storage    25   0.1028  197.3
77        project    20   0.0822  186.6
53          build    25   0.1028  183.2
In [30]:
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

for year in ["2011", "2012", "2013", "2014"]:    
    print "Bigrams " + str(year)
    finder = BigramCollocationFinder.from_words(words[year])
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    print pd.DataFrame(scored[:10])
Bigrams 2011
                     0         1
0          (big, data)  0.004392
1      (data, science)  0.002100
2         (real, time)  0.001909
3       (open, source)  0.001528
4       (data, driven)  0.001241
5  (executive, summit)  0.001146
6  (machine, learning)  0.001146
7         (data, sets)  0.001050
8         (open, data)  0.001050
9        (real, world)  0.001050
Bigrams 2012
                      0         1
0           (big, data)  0.007987
1        (data, driven)  0.001474
2          (real, time)  0.001413
3          (data, sets)  0.001352
4         (real, world)  0.001229
5  (session, sponsored)  0.001167
6   (machine, learning)  0.001044
7        (open, source)  0.001044
8      (data, analysis)  0.000983
9     (data, analytics)  0.000922
Bigrams 2013
                      0         1
0           (big, data)  0.008412
1          (real, time)  0.002577
2       (data, science)  0.001970
3        (open, source)  0.001591
4   (machine, learning)  0.001516
5  (session, sponsored)  0.001516
6          (use, cases)  0.001212
7    (data, scientists)  0.000985
8          (data, sets)  0.000985
9         (real, world)  0.000985
Bigrams 2014
                      0         1
0           (big, data)  0.006557
1   (machine, learning)  0.002931
2          (real, time)  0.002854
3       (data, science)  0.001928
4        (open, source)  0.001851
5  (session, sponsored)  0.001697
6         (real, world)  0.001388
7    (data, scientists)  0.001311
8      (data, analysis)  0.001080
9     (data, analytics)  0.001003
In [31]:
for year in abstracts:
    print "Trigrams " + str(year)
    finder = TrigramCollocationFinder.from_words(text[year])
    scored = finder.score_ngrams(trigram_measures.raw_freq)

    print pd.DataFrame(scored[:10])
Trigrams 2014
                          0         1
0             (., In, this)  0.002097
1        (., This, session)  0.001151
2          (In, this, talk)  0.001151
3       (is, sponsored, by)  0.001069
4           (real, -, time)  0.001069
5             (., We, will)  0.001028
6       (This, session, is)  0.000904
7  (session, is, sponsored)  0.000904
8               (We, ’, ll)  0.000863
9           (this, talk, ,)  0.000781
Trigrams 2011
                          0         1
0             (., In, this)  0.001107
1           (real, -, time)  0.000755
2  (the, Executive, Summit)  0.000604
3               (we, ’, ll)  0.000604
4               (don, ’, t)  0.000554
5                (it, ’, s)  0.000554
6        (., This, session)  0.000503
7           (part, of, the)  0.000503
8                (,, we, ’)  0.000453
9                (., It, ’)  0.000453
Trigrams 2013
                          0         1
0        (., This, session)  0.001202
1               (We, ’, ll)  0.001162
2             (., In, this)  0.001082
3       (is, sponsored, by)  0.001001
4                (., We, ’)  0.000961
5           (real, -, time)  0.000961
6             (., We, will)  0.000881
7       (This, session, is)  0.000841
8  (session, is, sponsored)  0.000801
9           (., This, talk)  0.000601
Trigrams 2012
                     0         1
0   (., This, session)  0.001003
1        (., In, this)  0.000938
2          (We, ’, ll)  0.000938
3           (., We, ’)  0.000809
4        (., We, will)  0.000777
5  (is, sponsored, by)  0.000680
6      (some, of, the)  0.000680
7  (This, session, is)  0.000647
8           (it, ’, s)  0.000647
9      (real, -, time)  0.000615
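The trigrams above are computed on the raw token stream text[year], which is why punctuation and apostrophe fragments dominate the list. A variant sketch (not run here) on the cleaned words[year] lists would suppress that noise:

# Trigrams on the stopword- and punctuation-filtered tokens
for year in ["2011", "2012", "2013", "2014"]:
    print "Trigrams (cleaned) " + str(year)
    finder = TrigramCollocationFinder.from_words(words[year])
    scored = finder.score_ngrams(trigram_measures.raw_freq)
    print pd.DataFrame(scored[:10])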
In [33]:
from collections import Counter
import pandas as pd

trending_words = pd.DataFrame()

for year in words:
    fdist = nltk.FreqDist(words[year])
    if len(trending_words) == 0:
        trending_words = pd.DataFrame(fdist.items(), columns=["word", str(year)])
        trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
    else:
        trending_words = trending_words.merge(pd.DataFrame(fdist.items(), columns=["word", str(year)]), how="outer")
        trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
        
print trending_words[:10]
        word      2014      2011      2013      2012
0       data  0.042811  0.042773  0.043043  0.044419
1     hadoop  0.007791  0.002005  0.007199  0.006267
2        big  0.007097  0.005538  0.009245  0.008847
3       time  0.005245  0.004869  0.004774  0.004423
4       real  0.004937  0.004010  0.004471  0.003625
5  analytics  0.004782  0.005347  0.004547  0.003195
6        new  0.004705  0.005824  0.004244  0.004608
7       talk  0.004474  0.003055  0.003789  0.003010
8      using  0.004397  0.002291  0.004016  0.003748
9        use  0.004320  0.004201  0.005153  0.004853
In [35]:
trending_words["plus12"] = trending_words["2012"] / trending_words["2011"]
trending_words["plus13"] = trending_words["2013"] / trending_words["2012"]
trending_words["plus14"] = trending_words["2014"] / trending_words["2013"]
trending_words = trending_words.fillna(0)

print trending_words[(trending_words["2012"] > 0.001) & (trending_words["2011"] > 0)].sort("plus12", ascending=False)[:10]
print
print trending_words[(trending_words["2013"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus13", ascending=False)[:10]
print
print trending_words[(trending_words["2014"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus14", ascending=False)[:10]
            word      2014      2011      2013      2012     plus12    plus13  \
1729    variable  0.000154  0.000095  0.000076  0.001044  10.939239  0.072558   
52     sponsored  0.002006  0.000191  0.002122  0.001413   7.400074  1.501628   
142         hive  0.001080  0.000191  0.001895  0.001167   6.113104  1.623000   
354     solution  0.000617  0.000191  0.001364  0.001167   6.113104  1.168560   
86        models  0.001466  0.000573  0.000758  0.002396   4.182650  0.316277   
604        nosql  0.000386  0.000382  0.000985  0.001474   3.860908  0.668135   
299          set  0.000694  0.000382  0.000758  0.001413   3.700037  0.536296   
39       cluster  0.002314  0.000382  0.000682  0.001352   3.539166  0.504605   
1         hadoop  0.007791  0.002005  0.007199  0.006267   3.125497  1.148829   
139   experience  0.001080  0.000573  0.001516  0.001782   3.110176  0.850676   

        plus14  
1729  2.035791  
52    0.945189  
142   0.570022  
354   0.452398  
86    1.934002  
604   0.391498  
299   0.916106  
39    3.392986  
1     1.082184  
139   0.712527  

              word      2014      2011      2013      2012    plus12  \
701         engine  0.000309  0.000668  0.001516  0.000061  0.091926   
161         energy  0.001003  0.000095  0.001364  0.000061  0.643485   
334      languages  0.000617  0.000191  0.001061  0.000061  0.321742   
1391         fraud  0.000154  0.000095  0.000834  0.000061  0.643485   
679   computations  0.000309  0.000095  0.000834  0.000061  0.643485   
930     efficiency  0.000231  0.000382  0.000758  0.000061  0.160871   
3648     openstack  0.000000  0.001241  0.000682  0.000061  0.049499   
1227       centric  0.000154  0.000095  0.000606  0.000061  0.643485   
3700   forecasting  0.000000  0.000382  0.001137  0.000123  0.321742   
4691      location  0.000000  0.000095  0.001061  0.000123  1.286969   

         plus13    plus14  
701   24.669597  0.203579  
161   22.202637  0.735147  
334   17.268718  0.581655  
1391  13.568278  0.185072  
679   13.568278  0.370144  
930   12.334798  0.305369  
3648  11.101319  0.000000  
1227   9.867839  0.254474  
3700   9.251099  0.000000  
4691   8.634359  0.000000  

              word      2014      2011      2013      2012    plus12  \
181     deployment  0.000926  0.000191  0.000076  0.000307  1.608712   
209          crowd  0.000849  0.000095  0.000076  0.000184  1.930454   
238         highly  0.000771  0.000764  0.000076  0.000553  0.723920   
239         humans  0.000771  0.000095  0.000076  0.000184  1.930454   
267          clean  0.000694  0.000095  0.000076  0.000000  0.000000   
101        traffic  0.001311  0.000191  0.000152  0.000123  0.643485   
317  computational  0.000617  0.000573  0.000076  0.000184  0.321742   
437      workflows  0.000540  0.000095  0.000076  0.000123  1.286969   
418        reports  0.000540  0.000095  0.000076  0.000000  0.000000   
407        options  0.000540  0.000095  0.000076  0.000553  5.791362   

       plus13     plus14  
181  0.246696  12.214749  
209  0.411160  11.196853  
238  0.137053  10.178957  
239  0.411160  10.178957  
267  0.000000   9.161061  
101  1.233480   8.652114  
317  0.411160   8.143166  
437  0.616740   7.125270  
418  0.000000   7.125270  
407  0.137053   7.125270  
In [37]:
import pandas as pd

result = pd.DataFrame()

for year in words:
    finder = BigramCollocationFinder.from_words(words[year], window_size = 2)
    #finder.apply_freq_filter(2)
    ignored_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    scores = finder.score_ngrams(bigram_measures.raw_freq)
    
    if len(result) == 0:
        result = pd.DataFrame(scores, columns=["ngram", str(year)])
    else:
        result = result.merge(pd.DataFrame(scores, columns=["ngram", str(year)]))
        
print result[:10]
                 ngram      2014      2011      2013      2012
0          (big, data)  0.006557  0.004392  0.008412  0.007987
1  (machine, learning)  0.002931  0.001146  0.001516  0.001044
2         (real, time)  0.002854  0.001909  0.002577  0.001413
3      (data, science)  0.001928  0.002100  0.001970  0.000737
4       (open, source)  0.001851  0.001528  0.001591  0.001044
5        (real, world)  0.001388  0.001050  0.000985  0.001229
6   (data, scientists)  0.001311  0.000764  0.000985  0.000737
7     (data, analysis)  0.001080  0.000191  0.000834  0.000983
8    (data, analytics)  0.001003  0.000955  0.000758  0.000922
9       (large, scale)  0.000926  0.000191  0.000758  0.000307
In [38]:
result["plus12"] = result["2012"] / result["2011"]
result["plus13"] = result["2013"] / result["2012"]
result["plus14"] = result["2014"] / result["2013"]

print result[result["2014"] > 0.0005].sort("plus14", ascending=False)[:10]
print
print result[result["2013"] > 0.0005].sort("plus13", ascending=False)[:10]
print
print result[result["2012"] > 0.0005].sort("plus12", ascending=False)[:10]
                  ngram      2014      2011      2013      2012    plus12  \
16    (data, solutions)  0.000540  0.000095  0.000076  0.000061  0.643485   
11       (time, series)  0.000771  0.000859  0.000152  0.000369  0.428990   
19   (enterprise, data)  0.000540  0.000191  0.000152  0.000061  0.321742   
21    (world, examples)  0.000540  0.000286  0.000152  0.000246  0.857980   
14      (case, studies)  0.000617  0.000191  0.000227  0.000307  1.608712   
17      (data, sources)  0.000540  0.000191  0.000227  0.000737  3.860908   
1   (machine, learning)  0.002931  0.001146  0.001516  0.001044  0.911603   
18   (decision, making)  0.000540  0.000764  0.000303  0.000246  0.321742   
13       (data, driven)  0.000694  0.001241  0.000455  0.001474  1.187972   
20          (new, data)  0.000540  0.000095  0.000379  0.000369  3.860908   

      plus13    plus14  
16  1.233480  7.125270  
11  0.411160  5.089479  
19  2.466960  3.562635  
21  0.616740  3.562635  
14  0.740088  2.714389  
17  0.308370  2.375090  
1   1.451153  1.934002  
18  1.233480  1.781317  
13  0.308370  1.526844  
20  1.027900  1.425054  

                      ngram      2014      2011      2013      2012    plus12  \
157  (relational, database)  0.000077  0.000191  0.000530  0.000123  0.643485   
12        (best, practices)  0.000694  0.000573  0.000530  0.000184  0.321742   
3           (data, science)  0.001928  0.002100  0.001970  0.000737  0.350992   
9            (large, scale)  0.000926  0.000191  0.000758  0.000307  1.608712   
2              (real, time)  0.002854  0.001909  0.002577  0.001413  0.740007   
52             (open, data)  0.000231  0.001050  0.000758  0.000430  0.409490   
36       (data, collection)  0.000309  0.000382  0.000909  0.000553  1.447841   
4            (open, source)  0.001851  0.001528  0.001591  0.001044  0.683702   
15             (use, cases)  0.000617  0.000191  0.001212  0.000799  4.182650   
1       (machine, learning)  0.002931  0.001146  0.001516  0.001044  0.911603   

       plus13    plus14  
157  4.317179  0.145414  
12   2.878120  1.308723  
3    2.672540  0.978746  
9    2.466960  1.221475  
2    1.823405  1.107710  
52   1.762114  0.305369  
36   1.644640  0.339299  
4    1.523710  1.163309  
15   1.518129  0.508948  
1    1.451153  1.934002  

                     ngram      2014      2011      2013      2012    plus12  \
10        (apache, hadoop)  0.000849  0.000095  0.000606  0.000614  6.434847   
130            (data, set)  0.000077  0.000095  0.000227  0.000614  6.434847   
7         (data, analysis)  0.001080  0.000191  0.000834  0.000983  5.147877   
15            (use, cases)  0.000617  0.000191  0.001212  0.000799  4.182650   
17         (data, sources)  0.000540  0.000191  0.000227  0.000737  3.860908   
91         (social, media)  0.000154  0.000286  0.000606  0.000799  2.788434   
0              (big, data)  0.006557  0.004392  0.008412  0.007987  1.818544   
29   (data, visualization)  0.000386  0.000477  0.000455  0.000737  1.544363   
36      (data, collection)  0.000309  0.000382  0.000909  0.000553  1.447841   
28            (data, sets)  0.000386  0.001050  0.000985  0.001352  1.286969   

       plus13    plus14  
10   0.986784  1.399607  
130  0.370044  0.339299  
7    0.848017  1.295504  
15   1.518129  0.508948  
17   0.308370  2.375090  
91   0.759065  0.254474  
0    1.053202  0.779470  
29   0.616740  0.848246  
36   1.644640  0.339299  
28   0.728874  0.391498  
In [39]:
%matplotlib inline  
import matplotlib.pyplot as plt

query = [("big", "data"), ("data", "science"), ("real", "time"), ("machine", "learning"), ("social", "media"), ("open", "source")]

query_results = result[result['ngram'].isin(query)][["2011", "2012", "2013", "2014"]].transpose()
query_results.columns = [" ".join(q) for q in query]

print query_results.plot(figsize=(10,5), title="Strata topics")
Axes(0.125,0.125;0.775x0.775)
In [47]:
%run lda.py -f strata_abstracts.txt -s --stopwords -k 7
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
C:\Users\Koehler\Documents\IPython Notebooks\PyData_Berlin2014\lda.py in <module>()
    145 
    146 if __name__ == "__main__":
--> 147     main()

C:\Users\Koehler\Documents\IPython Notebooks\PyData_Berlin2014\lda.py in main()
    126 
    127     if options.filename:
--> 128         corpus = vocabulary.load_file(options.filename)
    129     else:
    130         corpus = vocabulary.load_corpus(options.corpus)

C:\Users\Koehler\Documents\IPython Notebooks\PyData_Berlin2014\vocabulary.py in load_file(filename)
     17 def load_file(filename):
     18     corpus = []
---> 19     f = open(filename, 'r')
     20     for line in f:
     21         doc = re.findall(r'\w+(?:\'\w+)?',line)

IOError: [Errno 2] No such file or directory: 'strata_abstracts.txt'
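The traceback shows that vocabulary.load_file expects a plain-text corpus with one document per line. A sketch for writing strata_abstracts.txt from the crawled abstracts, so the cell above can be re-run (filename as used in the %run call):

# One abstract per line, as vocabulary.load_file reads the corpus
with open('strata_abstracts.txt', 'w') as f:
    for year in abstracts:
        for a in abstracts[year]:
            f.write(a.replace("\n", " ").encode("utf-8") + "\n")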
In [103]:
%matplotlib inline  
import matplotlib.pyplot as plt

query = ["hadoop", "yarn", "storm"]
query = ["python", "julia", "r", "sas", "stata", "excel"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Programming Langugages @ Strata Conferences 2011-2014")

query = ["business", "energy", "advertising", "banking", "health", "politics", "government", "finance", "automotive"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")
Axes(0.125,0.125;0.775x0.775)
Axes(0.125,0.125;0.775x0.775)
In [51]:
query = ["google", "facebook", "yahoo", "linkedin", "microsoft"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")
Axes(0.125,0.125;0.775x0.775)
In [55]:
query = ["modern", "machine", "learning"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Topics at Strata Conferences 2011-14")
plt.savefig("Strata_ModernMachineLearning.png")
Axes(0.125,0.125;0.775x0.775)