#!/usr/bin/env python
# coding: utf-8

# # Data-driven Approach to CFP - Mining PyData Conferences

# A data-driven approach is useful in most cases. Most of us have seen it applied to business decisions, or to anything we care about more than a minor daily choice. But what if your toolkit is so fast and powerful that you can easily use it even for everyday tasks?
#
# In this article, I'm going to walk through one such use case and introduce one of the tools I use for these tasks.
#
# So, starting with the problem: I wanted to attend a conference, in my case one of the local PyData conferences. When you attend a conference you mostly care about the content, so I wanted to analyse the content of past editions. I was also interested in how the focus of the conference has changed over time and, finally, whether it would be the kind of conference where I could share my own experience of using Python for data-related tasks.

# In[2]:

from bs4 import BeautifulSoup
import urllib2

# past events
# TODO: fetch this data from the past events page
conferences = ['nyc2014', 'berlin2014', 'sv2014', 'ldn2014', 'nyc2013']
abstract_url = "http://pydata.org/%s/abstracts/"
conf_data = {}

# Collect the titles and abstracts for every conference
for conference in conferences:
    print "loading data for %s conference" % conference
    raw = urllib2.urlopen(abstract_url % conference).read()
    soup = BeautifulSoup(raw)
    abstracts = [abstract.get_text().strip()
                 for abstract in soup.find_all(class_="accordion-inner")]
    titles = [title.get_text().strip()
              for title in soup.find_all(class_="accordion-toggle")]
    # speakers = [speaker.get_text().strip()
    #             for speaker in soup.select(".accordion-heading h5 a")]
    conf_data[conference] = {}
    conf_data[conference]['abstracts'] = abstracts
    conf_data[conference]['titles'] = titles
    # conf_data[conference]['speakers'] = speakers

# In[3]:

conf_data['nyc2014']['titles'][:20]

# ### I use pandas to structure all the parsed data into DataFrames

# In[4]:

import pandas as pd

pydata = pd.DataFrame()
for conf in conf_data:
    conf_dataframe = pd.DataFrame.from_dict(conf_data[conf])
    conf_dataframe['conference'] = conf
    conf_dataframe['city'] = conf[:-4]       # e.g. 'nyc2014' -> 'nyc'
    conf_dataframe['year'] = int(conf[-4:])  # e.g. 'nyc2014' -> 2014
    print conf_dataframe.head()
    pydata = pydata.append(conf_dataframe)

# ### It is interesting to see how many talks there were from year to year; it is also another check that the data looks close to what we expect

# In[5]:

print 'records in dataframe %i' % len(pydata)
pydata.groupby(['conference']).count().sort('year', ascending=False)

# **It seems the number of talks is slowly growing, from 40 at nyc2013 up to 50 at the last PyData of 2014**
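# As an extra sanity check on that claim, here is a small sketch of my own (not part of the original scrape) that counts talks per city and year directly, assuming the `pydata` dataframe and the `city`/`year` columns built above.

# Hedged sanity check: one row per conference edition, one column per city
talk_counts = pydata.groupby(['year', 'city']).size().unstack('city')
print talk_counts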
# ## Now we have all the data. Let's try to analyse it

# ### So how long is a typical proposal?

# In[6]:

# Filter out the very short entries (empty or placeholder abstracts)
abstract_lens = [len(abst) for abst in pydata['abstracts'] if len(abst) > 44]
print abstract_lens
print

# In[7]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.hist(abstract_lens)

# In[8]:

pd.DataFrame(abstract_lens).describe()

# ### What about the word corpus from different years?

# In[9]:

import nltk

stop = nltk.corpus.stopwords.words('english')
text = {}
words = {}

# In[10]:

# Extra tokens to drop in addition to the standard English stopwords
stop_list = ["ll", "II", "ll", "http", "://", "e", "g", "2", "0"]

for conference in conf_data:
    raw = " ".join(conf_data[conference]['abstracts'])
    tokens = nltk.WordPunctTokenizer().tokenize(raw)
    text[conference] = nltk.Text(tokens)
    words[conference] = [w.lower() for w in text[conference] if w.lower() not in stop_list]
    words[conference] = [w for w in words[conference] if w not in stop]
    words[conference] = filter(lambda word: word not in u'%,-:()$\/;?.’–“”*\'[]', words[conference])
    words[conference] = [w for w in words[conference] if w not in stop_list]
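# ### A note on the cleanup above
# The punctuation filtering can also be pushed into the tokenizer itself. The snippet below is only a sketch of that alternative (it assumes the same `conf_data`, `stop` and `stop_list` objects as above); the analysis that follows keeps using the `words` dict built in the previous cell.

from nltk.tokenize import RegexpTokenizer

alt_words = {}
word_tokenizer = RegexpTokenizer(r'\w+')  # keep alphanumeric tokens only, punctuation is dropped
for conference in conf_data:
    raw = " ".join(conf_data[conference]['abstracts'])
    alt_words[conference] = [w for w in word_tokenizer.tokenize(raw.lower())
                             if w not in stop and w not in stop_list]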
# ### Let's check collocations in the abstracts
# Collocations are expressions of multiple words that commonly co-occur.

# In[11]:

for conference in text:
    print conference
    # .collocations() prints its result itself and returns None
    text[conference].collocations()
    print

# ### Words used in abstracts

# In[12]:

numwords = {}
uniwords = {}
for conference in text:
    numwords[conference] = len(text[conference])
    uniwords[conference] = len(set(text[conference]))

for conference in reversed(conferences):
    print "%s: \tnumwords - %i, unique - %i" % \
        (conference, numwords[conference], uniwords[conference])

# In[13]:

plt.bar(range(len(uniwords)),
        [uniwords[conference] for conference in reversed(conferences)],
        align='center')
plt.xticks(range(len(uniwords)),
           [conference for conference in reversed(conferences)])
plt.show()

# ### It seems the number of unique words peaked at sv2014 and is now fairly stable, even slowly decreasing

# # Bigrams

# In[14]:

from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()

for conference in reversed(conferences):
    print "Bigrams " + str(conference)
    finder = BigramCollocationFinder.from_words(words[conference])
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    print pd.DataFrame(scored[:25])
    print "\n\n"

# # Year over Year

# In[15]:

result = pd.DataFrame()
for conference in reversed(conferences):
    finder = BigramCollocationFinder.from_words(words[conference], window_size=2)
    ignored_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    scores = finder.score_ngrams(bigram_measures.raw_freq)
    if len(result) == 0:
        result = pd.DataFrame(scores, columns=["ngram", str(conference)])
    else:
        result = result.merge(pd.DataFrame(scores, columns=["ngram", str(conference)]))

print result[:15]

# In[16]:

transposed = result[:10].transpose()
headers = transposed[0:1:].values
print headers

# In[17]:

get_ipython().run_line_magic('matplotlib', 'inline')
new_transposed = transposed[1::]
new_transposed.columns = headers[0]
new_transposed.plot(figsize=(16, 12))

# In[18]:

print result[:15].sort(['nyc2014'], ascending=[0])

# ## We have very few data points, but based on the data we can see that *(machine, learning)* and *(scikit, learn)* are still on top. *(open, source)* and *(big, data)* are also picking up, which means we already have some experience and are starting to play at a bigger scale.
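# ### Going one step further (a sketch)
# If the bigrams are interesting, trigrams come almost for free with NLTK. This is not part of the original analysis; it simply reuses the `words` dict built above, here for the most recent conference.

from nltk.collocations import TrigramCollocationFinder

trigram_measures = nltk.collocations.TrigramAssocMeasures()
tri_finder = TrigramCollocationFinder.from_words(words['nyc2014'])
tri_finder.apply_freq_filter(2)  # drop trigrams that occur only once
print pd.DataFrame(tri_finder.score_ngrams(trigram_measures.raw_freq)[:10])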