# Scrape the monthly Hacker News hiring threads listed in `urls`, extract the
# top-level comments (one comment == one post), and chart how often selected
# technology terms are mentioned per month.
#
# NOTE(review): this file reached review collapsed onto a handful of lines and
# with several tokens stripped (URL strings, dotted attribute chains, HTML tag
# literals). Every reconstructed token below carries a NOTE(review) marker and
# must be confirmed against the original notebook before this is relied on.

import collections
import datetime
import io
import itertools
import os.path
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.core import display

posts_savefile = 'posts.csv'
tdm_savefile = 'posts_tdm.csv'

# (year, month, url) for every monthly thread to fetch.
# NOTE(review): the URL strings are empty in the reviewed source; they have to
# be filled in before the scraping step can fetch anything.
urls = (
    (2011, 1, ''), (2011, 2, ''), (2011, 3, ''),
    (2011, 4, ''), (2011, 5, ''), (2011, 6, ''),
    (2011, 7, ''), (2011, 8, ''), (2011, 9, ''),
    (2011, 10, ''), (2011, 11, ''), (2011, 12, ''),
    (2012, 1, ''), (2012, 2, ''), (2012, 3, ''),
    (2012, 4, ''), (2012, 5, ''), (2012, 6, ''),
    (2012, 7, ''), (2012, 8, ''), (2012, 9, ''),
    (2012, 10, ''), (2012, 11, ''), (2012, 12, ''),
    (2013, 1, ''), (2013, 2, ''), (2013, 3, ''),
    (2013, 4, ''), (2013, 5, ''), (2013, 6, ''),
    (2013, 7, ''), (2013, 8, ''), (2013, 9, ''),
    (2013, 10, ''), (2013, 11, ''), (2013, 12, ''),
    (2014, 1, ''), (2014, 2, ''), (2014, 3, ''),
    (2014, 4, ''), (2014, 5, ''),
)


def filename(year, month):
    """Return the local path of the saved HTML for the given year and month."""
    return 'html/hn_%d_%d.html' % (year, month)


# maybe drop urls into a DataFrame to save to CSV?
urlsdf = pd.DataFrame(list(urls), columns=['year', 'month', 'url'])
urlsdf.head(3)


# ---------------------------------------------------------------------------
# Step 1: download every thread (following 'More' links) into one local HTML
# file per month.
# ---------------------------------------------------------------------------
stack = collections.deque(urls)
tries = len(stack) * 3  # maximum attempts: 3 times the number of URLs
while stack and tries > 0:
    tries -= 1
    current = stack.pop()
    year, month, url = current

    # local html output file; start from scratch so a retry never appends to
    # the partial output of a failed attempt
    fname = filename(year, month)
    if os.path.isfile(fname):
        os.remove(fname)

    try:
        # get the HN pages for month / year
        ym_pages = [url]
        while ym_pages:
            url = ym_pages.pop()
            print("Fetching URL: %s" % (url))
            r = requests.get(url)

            # fail if bad error code
            # NOTE(review): the right-hand side of this comparison was lost in
            # the reviewed source; `requests.codes.ok` is a reconstruction.
            if r.status_code != requests.codes.ok:
                raise Exception('Error from server: ' + str(r.status_code))

            # broken HTML escapes were breaking BeautifulSoup, so neutralise
            # every '&'
            text = r.text.replace('&', '_')

            # append the page to the month's file; the text is encoded once,
            # here at the I/O boundary
            with io.open(fname, 'a', encoding='utf-8') as htmlfile:
                htmlfile.write(text)

            # check for 'More' link
            # (no parser is named, so bs4 picks whichever one is installed)
            soup = BeautifulSoup(text)
            links = soup.find_all('a', text='More')
            if links:
                # sometimes the forward slash is HTML-escaped and then mangled
                # by the '&' replacement above, so it is restored here
                # NOTE(review): the '' prefix is where a base URL was stripped
                # from the reviewed source.
                link_url = '' + links[0]['href'].replace('_#x2F;', '/')
                ym_pages.append(link_url)

            # take a break for 30 seconds
            # NOTE(review): indentation was lost; sleeping after every fetched
            # page is assumed.
            time.sleep(30)
    except Exception as e:
        # deliberate catch-all: any failure just re-queues this month
        print('error: %s currently on: %s' % (e, current))
        # stick current URL at the beginning of the queue
        stack.appendleft(current)


# ---------------------------------------------------------------------------
# Step 2: parse the saved HTML into posts.
# ---------------------------------------------------------------------------
def html_to_posts(html):
    """Parse one HTML document into a list of top-level posts.

    Rows whose spacer image width is not '0' (i.e. replies) and comments whose
    text is exactly '[deleted]' or '[dead]' are skipped.

    Returns a list of dicts with keys 'user' and 'post' (both text).
    """
    posts = []

    # unclosed <p>'s are messing up BeautifulSoup
    # NOTE(review): both string arguments were lost in the reviewed source;
    # "<p>" -> "</p><p>" is a reconstruction based on the surviving comment.
    html = html.replace("<p>", "</p><p>")
    soup = BeautifulSoup(html)

    # NOTE(review): the start of this expression was lost; the
    # `soup.html.body.center.table` prefix is a reconstruction.
    tables = soup.html.body.center.table('tr', recursive=False)[2].td(
        'table', recursive=False)
    comments_table = tables[1] if len(tables) > 1 else tables[0]

    for row in comments_table('tr', recursive=False):
        # check if td and table exist
        # NOTE(review): both operands were lost; reconstructed.
        if not row.td or not row.td.table:
            continue

        # check if this is a top level comment
        # NOTE(review): the right-hand side was lost; reconstructed.
        spacer_img = row.td.table.tr.td.img
        if spacer_img['width'] != '0':
            continue

        comment = row.find_all('span', class_='comment')[0].get_text(
            separator=' ')
        if comment in ('[deleted]', '[dead]'):
            continue

        user = row('span', class_='comhead')[0].a.text
        # keep text as text; it is encoded when the CSV is written
        posts.append({'user': user, 'post': comment})
    return posts


# Each month's file holds several complete HTML documents back to back (one
# per fetched page), so split on the closing tag and parse them one by one.
# NOTE(review): the search string was lost in the reviewed source; '</html>'
# is reconstructed from the surviving `+ 7` offset (its length).
HTML_END = '</html>'

posts = []
for year, month, url in urls:
    with io.open(filename(year, month), encoding='utf-8') as htmlfile:
        all_html = htmlfile.read()

    start_html = 0
    while start_html < len(all_html):
        end_html = all_html.find(HTML_END, start_html)
        if end_html == -1:
            # Only trailing bytes (e.g. a final newline) remain. Without this
            # check the -1 would be used as a slice bound, parsing a bogus
            # fragment and sending start_html backwards.
            break
        end_html += len(HTML_END)
        html = all_html[start_html:end_html]
        start_html = end_html

        for post in html_to_posts(html):
            post['date'] = datetime.datetime(year, month, 1)
            posts.append(post)

postsdf = pd.DataFrame(posts, columns=['date', 'user', 'post'])
postsdf.to_csv(posts_savefile, index=False, encoding='utf-8')

postsdf = pd.read_csv(posts_savefile, parse_dates=[0], encoding='utf-8')
postsdf.head(3)
postsdf.tail(3)


# ---------------------------------------------------------------------------
# Step 3: number of posts per month.
# ---------------------------------------------------------------------------
# add year and month columns to dataframe
# NOTE(review): the iterables of these four comprehensions were lost in the
# reviewed source; the frames' own 'date' columns are a reconstruction.
postsdf['year'] = [v.year for v in postsdf['date']]
postsdf['month'] = [v.month for v in postsdf['date']]

# add count
ymdf = pd.DataFrame({'count': postsdf.groupby(['date']).size()})
ymdf = ymdf.reset_index()
ymdf['year'] = [v.year for v in ymdf['date']]
ymdf['month'] = [v.month for v in ymdf['date']]

# display a table of counts per month per year
ymdf[['year', 'month', 'count']].pivot(
    index='year', columns='month', values='count')

# get unique years in the DataFrame
years = postsdf['year'].unique()

# start a wide matplotlib figure
fig = plt.figure(figsize=(15, 3))

# plot all the data
ax = fig.add_subplot(121)
ymdf[['date', 'count']].set_index('date').plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Number of Posts Each Month Since January 2011")

# plot data split out by year
ax = fig.add_subplot(122)
# keyword arguments: newer pandas no longer accepts these positionally
df = ymdf[['count', 'year', 'month']].pivot(index='month', columns='year')
df.plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Split Out Per Year")


# ---------------------------------------------------------------------------
# Step 4: average number of posts by the weekday of the thread date.
# ---------------------------------------------------------------------------
postsdf['weekday'] = [d.weekday() for d in postsdf['date']]

# NOTE(review): the constructor call was lost in the reviewed source (only
# ", 3, 1)" survived); a datetime for 2011-03-01 is reconstructed from the
# variable name.
mar_2011 = datetime.datetime(2011, 3, 1)
after_mar_2011 = postsdf[postsdf['date'] > mar_2011]
posts_date_day = after_mar_2011[['date', 'weekday']]
grouped = posts_date_day.groupby(['date', 'weekday'])

a = pd.DataFrame(grouped.size())
a = a.reset_index()
b = a.groupby('weekday')
c = b.mean()


# ---------------------------------------------------------------------------
# Step 5: term counts per month.
# ---------------------------------------------------------------------------
# a set, because membership is tested once per word of every post
with io.open('stopwords', encoding='utf-8') as stopwords_file:
    stopwords = set(line.strip() for line in stopwords_file)

postsdf = pd.read_csv(posts_savefile, parse_dates=[0], encoding='utf-8')
postsdf.tail(3)

postsdf2 = postsdf.drop('user', axis=1)

# Characters that separate words. The original pattern listed its
# alternatives with '|' inside a character class, where '|' is a literal, so
# '|' itself has always been treated as a separator and is kept here.
# Whitespace in the class is harmless because the text is split on whitespace
# afterwards.
# NOTE(review): a token between '\. ' and '|' was lost in the reviewed source;
# confirm nothing else belongs in this class.
_SEPARATORS_RE = re.compile(r'[.,\-/()\[\];:!"?=_|0-9\s]')


def merge(v):
    """Join an iterable of post texts into one space-separated string.

    Missing values (an empty comment comes back from the CSV as NaN) are
    skipped instead of breaking the join.
    """
    return ' '.join(text for text in v if pd.notnull(text))


def words_in_post(post):
    """Return a dict mapping each non-stop-word in `post` to its count.

    The text is lower-cased and split on whitespace after punctuation and
    digits have been replaced with spaces.
    """
    words = _SEPARATORS_RE.sub(' ', post).lower().split()
    return dict(collections.Counter(
        word for word in words if word not in stopwords))


grouped = postsdf2.groupby(['date'])
alltextdf = pd.DataFrame({'post_count': grouped.size(),
                          'alltext': grouped['post'].apply(merge)})
alltextdf = alltextdf.reset_index()

# loop over month/years and extract words for each combo
tdm_columns = ['date', 'year', 'month', 'term', 'count', 'post_count',
               'word_count']
frames = []
for date, alltext, post_count in zip(alltextdf['date'],
                                     alltextdf['alltext'],
                                     alltextdf['post_count']):
    words = words_in_post(alltext)
    word_count = len(words)  # number of distinct terms in that month
    frames.append(pd.DataFrame(
        [(date, date.year, date.month, term, count, post_count, word_count)
         for term, count in words.items()],
        columns=tdm_columns))

# one concat at the end instead of re-copying the growing frame per month;
# an empty input yields an empty frame rather than None
tdm_df = pd.concat(frames) if frames else pd.DataFrame(columns=tdm_columns)

# occurrences of the term per post in that month
tdm_df['prop'] = 1.0 * tdm_df['count'] / tdm_df['post_count']
tdm_df.to_csv(tdm_savefile, index=False, encoding='utf-8')

tdm_df = pd.read_csv(tdm_savefile, parse_dates=[0], encoding='utf-8')
tdm_df.head(2)


# ---------------------------------------------------------------------------
# Step 6: plot the per-post frequency of selected terms over time.
# ---------------------------------------------------------------------------
linecycler = itertools.cycle(['-', '--', ':'])

fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(111)

terms = sorted(['java', 'php', 'python', 'rails', 'django', 'hadoop',
                'ember', 'angularjs', 'meteor', 'javascript'])

# pull out only the terms we care about
df = tdm_df[tdm_df.term.isin(terms)][['date', 'term', 'prop']]

for p in terms:
    subdf = df[df['term'] == p][['date', 'prop']]
    subdf = subdf.set_index(['date'])
    ax.plot(subdf.index, subdf.values, linestyle=next(linecycler), label=p,
            linewidth=9)
plt.legend(loc=2)