In [1]:
import matplotlib.pyplot as plt
from IPython.core import display

Can we scrape HN? https://news.ycombinator.com/item?id=1721105

A little setup before we get going

In [2]:
posts_savefile = 'posts.csv'
tdm_savefile = 'posts_tdm.csv'

urls = (
(2011, 1, 'https://news.ycombinator.com/item?id=2057704'),
(2011, 2, 'https://news.ycombinator.com/item?id=2161360'),
(2011, 3, 'https://news.ycombinator.com/item?id=2270790'),
(2011, 4, 'https://news.ycombinator.com/item?id=2396027'),
(2011, 5, 'https://news.ycombinator.com/item?id=2503204'),
(2011, 6, 'https://news.ycombinator.com/item?id=2607052'),
(2011, 7, 'https://news.ycombinator.com/item?id=2719028'),
(2011, 8, 'https://news.ycombinator.com/item?id=2831646'),
(2011, 9, 'https://news.ycombinator.com/item?id=2949787'),
(2011, 10, 'https://news.ycombinator.com/item?id=3060221'),
(2011, 11, 'https://news.ycombinator.com/item?id=3181796'),
(2011, 12, 'https://news.ycombinator.com/item?id=3300290'),
(2012, 1, 'https://news.ycombinator.com/item?id=3412900'),
(2012, 2, 'https://news.ycombinator.com/item?id=3537881'),
(2012, 3, 'https://news.ycombinator.com/item?id=3652041'),
(2012, 4, 'https://news.ycombinator.com/item?id=3783657'),
(2012, 5, 'https://news.ycombinator.com/item?id=3913997'),
(2012, 6, 'https://news.ycombinator.com/item?id=4053076'),
(2012, 7, 'https://news.ycombinator.com/item?id=4184755'),
(2012, 8, 'https://news.ycombinator.com/item?id=4323597'),
(2012, 9, 'https://news.ycombinator.com/item?id=4463689'),
(2012, 10, 'https://news.ycombinator.com/item?id=4596375'),
(2012, 11, 'https://news.ycombinator.com/item?id=4727241'),
(2012, 12, 'https://news.ycombinator.com/item?id=4857714'),
(2013, 1, 'https://news.ycombinator.com/item?id=4992617'),
(2013, 2, 'https://news.ycombinator.com/item?id=5150834'),
(2013, 3, 'https://news.ycombinator.com/item?id=5304169'),       
(2013, 4, 'https://news.ycombinator.com/item?id=5472746'),
(2013, 5, 'https://news.ycombinator.com/item?id=5637663'),
(2013, 6, 'https://news.ycombinator.com/item?id=5803764'),
(2013, 7, 'https://news.ycombinator.com/item?id=5970187'),
(2013, 8, 'https://news.ycombinator.com/item?id=6139927'),
(2013, 9, 'https://news.ycombinator.com/item?id=6310234'),
(2013, 10, 'https://news.ycombinator.com/item?id=6475879'),
(2013, 11, 'https://news.ycombinator.com/item?id=6653437'),
(2013, 12, 'https://news.ycombinator.com/item?id=6827554'),
(2014, 1, 'https://news.ycombinator.com/item?id=6995020'),
(2014, 2, 'https://news.ycombinator.com/item?id=7162197'),
(2014, 3, 'https://news.ycombinator.com/item?id=7324236'),       
(2014, 4, 'https://news.ycombinator.com/item?id=7507765'),
(2014, 5, 'https://news.ycombinator.com/item?id=7679431')
)

def filename(year, month):
    return 'html/hn_%d_%d.html' % (year, month)
In [3]:
# maybe drop urls into a DataFrame to save to CSV?
import pandas as pd
urlsdf = pd.DataFrame(list(urls), columns=['year', 'month', 'url'])
urlsdf.head(3)
Out[3]:
year month url
0 2011 1 https://news.ycombinator.com/item?id=2057704
1 2011 2 https://news.ycombinator.com/item?id=2161360
2 2011 3 https://news.ycombinator.com/item?id=2270790
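
The comment in the cell above muses about saving the URL table to CSV. A minimal sketch of that round trip, assuming a 'urls.csv' filename that the notebook doesn't otherwise define:

In [ ]:
# hypothetical: persist the URL table and read it back ('urls.csv' is an assumed name)
urlsdf.to_csv('urls.csv', index=False)
urlsdf = pd.read_csv('urls.csv')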

Fetching HTML from Hacker News

  • Try each item at most 3 times if getting bad responses
  • Wait half a minute between fetching each year/month thread
  • Check for a 'More' link at the bottom of the page and fetch it if present
  • Append the 'More' pages to the end of the current month's HTML file
In [ ]:
from bs4 import BeautifulSoup
import collections
import os.path
import requests
import time

stack = collections.deque(urls)
tries = len(stack) * 3 # maximum attempts: 3 times the number of URLs

while tries > 0:
    tries -= 1
    current = stack.pop()
    year, month, url = current

    # local html output file
    fname = filename(year, month)
    if os.path.isfile(fname):
        os.remove(fname)

    try:
        # get the HN pages for month / year
        ym_pages = [url]
        while ym_pages:
            url = ym_pages.pop()
            print "Fetching URL: %s" % (url)
            r = requests.get(url)

            # fail if bad error code
            if r.status_code != requests.codes.ok:
                raise Exception('Error from server: ' + str(r.status_code))

            text = r.text.replace('&', '_') # broken HTML entities break BeautifulSoup, so neutralise '&'
            # write out to file in cwd
            with open(fname, 'a') as htmlfile:
                htmlfile.write(text.encode('utf-8'))
                
            # check for 'More' link
            soup = BeautifulSoup(text)
            links = soup.find_all('a', text='More')
            if links:
                # the forward slash in the href is sometimes HTML-escaped and then
                # mangled by the '&' replacement above, so restore it here
                link_url = 'https://news.ycombinator.com' + links[0]['href'].replace('_#x2F;', '/')
                ym_pages.append(link_url)
            
            # take a break for 30 seconds
            time.sleep(30)
            
    except Exception as e:
        print 'error:', e, 'currently on:', current
        # stick the current URL back at the beginning of the queue
        stack.appendleft(current)        
        
    # get out when stack is empty
    if not stack: break

HTML parsing function

Parse the fetched Hacker News HTML into a list of {user, post} dicts

In [11]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

def html_to_posts(html):
    """Parse an html document into posts"""
    posts = []
    html = html.replace("<br>", "<br/>") # unclosed <br>'s are messing up BeautifulSoup
    soup = BeautifulSoup(html)
    tables = soup.body.center.table('tr', recursive=False)[2].td('table', recursive=False)
    comments_table = tables[1] if len(tables) > 1 else tables[0]
    rows = comments_table('tr', recursive=False)
        
    for row in rows:        
        # check if td and table exist
        if not row.td or not row.td.table: continue

        # check if this is a top level comment
        spacer_img = row.td.table.tr.td.img
        if not spacer_img['width'] == '0': continue
                
        comment_tag = row.find_all('span', class_='comment')[0]
        comment = comment_tag.get_text(separator=' ')
        #print comment[:30]
        if comment == '[deleted]' or comment == '[dead]': continue
        
        head_tag = row('span', class_='comhead')[0]
        user = head_tag.a.text

        posts.append({'user': user, 'post': comment.encode('utf-8')})
        
    return posts
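
As a quick sanity check, the parser can be pointed at one saved page, e.g. the first </html>-delimited chunk of a month's file. Illustrative only, assuming the July 2012 thread was fetched above:

In [ ]:
# parse the first fetched page for July 2012 and peek at one post
raw = open(filename(2012, 7)).read()
first_page = raw[:raw.find('</html>') + 7]
sample_posts = html_to_posts(first_page)
print len(sample_posts), sample_posts[0]['user']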

Looping over fetched HTML

  • Loop over fetched items
  • Split at </html> tags because the 'More' pages were appended to the same file
  • Save out to CSV for later use
In [34]:
# urls = [(2012, 7, 'https://news.ycombinator.com/item?id=4184755')]
posts = []

for current in urls:
    # print current
    year, month, url = current
    all_html = open(filename(year, month)).read()
    
    start_html = 0
    while start_html < len(all_html):
        end_html = all_html.find('</html>', start_html)
        html = all_html[start_html:end_html + 7]
        start_html = end_html + 7
        
        ym_posts = html_to_posts(html)
        #print "Found %d posts" % len(ym_posts)

        for post in ym_posts:
            post.update({'date': pd.datetime(year, month, 1)})
            posts.append(post)


postsdf = pd.DataFrame(posts, columns=['date', 'user', 'post'])

Save

In [35]:
postsdf.to_csv(posts_savefile, index=False)

Read Posts DataFrame saved to CSV

  • Read in our CSV of posts
  • Check out its head and tail
In [4]:
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])
postsdf.head(3)
Out[4]:
date user post
0 2011-01-01 00:00:00 lkrubner In New York City there are a lot of jobs. I we...
1 2011-01-01 00:00:00 jasonfried 37signals is hiring two Rails programmers:\n h...
2 2011-01-01 00:00:00 tptacek Chicago (or remote) Matasano Security LEAD SOF...
In [5]:
postsdf.tail(3)
Out[5]:
date user post
9938 2014-05-01 00:00:00 jasonlotito MeetMe - New Hope, PA (near Philadelphia, Penn...
9939 2014-05-01 00:00:00 ssharpe67 Datalex - Atlanta, GA\nReady to use your tech ...
9940 2014-05-01 00:00:00 findwork Disclaimer: Forgive me for posting here. I jus...

Number of posts per month

In [6]:
# add year and month columns to the DataFrame
postsdf['year'] = [v.year for v in postsdf.date]
postsdf['month'] = [v.month for v in postsdf.date]

# count the number of posts per month
ymdf = pd.DataFrame({'count': postsdf.groupby(['date']).size()})
ymdf = ymdf.reset_index()
ymdf['year'] = [v.year for v in ymdf.date]
ymdf['month'] = [v.month for v in ymdf.date]

# display a table of counts per month per year
ymdf[['year', 'month', 'count']].pivot(index='year', columns='month', values='count')
Out[6]:
month 1 2 3 4 5 6 7 8 9 10 11 12
year
2011 88 150 27 218 217 257 224 230 191 198 230 203
2012 149 201 251 201 231 227 194 245 214 248 221 230
2013 192 219 291 343 323 263 292 309 239 426 298 263
2014 223 330 340 356 389 NaN NaN NaN NaN NaN NaN NaN
In [7]:
# get unique years in the DataFrame
years = postsdf['year'].unique()

# start a wide matplotlib figure
fig = plt.figure(figsize=(15, 3))

# plot all the data
ax = fig.add_subplot(121)
ymdf[['date', 'count']].set_index('date').plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Number of Posts Each Month Since January 2011")

# plot data split out by year
ax = fig.add_subplot(122)
df = ymdf[['count', 'year', 'month']].pivot('month', 'year')
# display(df)
df.plot(ax=ax)

ax.legend(loc=4)
ax.set_title("Split Out Per Year")

plt.show()

Days of the week

In [ ]:
# postsdf['weekday'] =  [d.strftime('%a') for d in postsdf['date']]
postsdf['weekday'] = [d.weekday() for d in postsdf['date']]

# keep only posts dated after 1 March 2011
mar_2011 = pd.datetime(2011, 3, 1)
after_mar_2011 = postsdf[postsdf['date'] > mar_2011]

posts_date_day = after_mar_2011[['date', 'weekday']]

grouped = posts_date_day.groupby(['date', 'weekday'])
# alltextdf = pd.DataFrame({'post_count': grouped.size(), 'alltext': grouped['post'].apply(merge)})
# byweekday = pd.DataFrame({'weekday': grouped['weekday']})
# len(grouped.groups.keys())
# average number of posts per thread, grouped by the weekday the thread was posted
a = pd.DataFrame(grouped.size())
a = a.reset_index()
b = a.groupby('weekday')
c = b.mean()
#c
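
The commented-out strftime('%a') line above hints at readable day names. One way to present c with named weekdays (a sketch of presentation only, not output the notebook produced):

In [ ]:
# map integer weekdays (0 = Monday) onto names for the averages table
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
c.index = [day_names[d] for d in c.index]
c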

Term Document Matrix

  • load up some stopwords
  • read in our posts from CSV
In [8]:
stopwords = open('stopwords').readlines()
stopwords = [w.strip() for w in stopwords]

postsdf = pd.read_csv(posts_savefile, parse_dates=[0])#[:2500]

postsdf.tail(3)
Out[8]:
date user post
9938 2014-05-01 00:00:00 jasonlotito MeetMe - New Hope, PA (near Philadelphia, Penn...
9939 2014-05-01 00:00:00 ssharpe67 Datalex - Atlanta, GA\nReady to use your tech ...
9940 2014-05-01 00:00:00 findwork Disclaimer: Forgive me for posting here. I jus...
  • merge joins all of a month's posts into one document
  • words_in_post cleans up the document, splits it into words, and counts them
In [9]:
import re
postsdf2 = postsdf.drop('user', axis=1)

def merge(v):
    return ' '.join(v)

def words_in_post(post):
    # replace punctuation, digits and other separators with spaces
    post = re.sub(r'[\. |, |\-|/|\(|\)|;|\[|\]|:|!|"|?|=|_|0-9]', ' ', post)
    words = post.lower().split()
    words = [word for word in words if word and word not in stopwords]
    
    word_counts = {}
    
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1
        
    return word_counts
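
A quick illustration of what words_in_post returns for a tiny made-up post. The exact output depends on the stopwords file; here 'we' and 'are' are assumed to be stopwords:

In [ ]:
# illustrative only; roughly {'hiring': 1, 'python': 2, 'django': 1, 'developers': 1,
#                             'remote': 1, 'experience': 1, 'required': 1}
words_in_post("We are hiring Python/Django developers (remote). Python experience required.")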
  • loop over all documents and create a TDM DataFrame
  • Save it to CSV
In [10]:
grouped = postsdf2.groupby(['date'])
alltextdf = pd.DataFrame({'post_count': grouped.size(), 'alltext': grouped['post'].apply(merge)})
#postsdf['year'] = [v.year for v in postsdf.date]
#postsdf['month'] = [v.month for v in postsdf.date]

alltextdf = alltextdf.reset_index()

# loop over month/years and extract words for each combo
tdm_df = None
for i in range(len(alltextdf)):
    words = words_in_post(alltextdf['alltext'][i])
    date = alltextdf['date'][i]
    year = date.year
    month = date.month
    post_count = alltextdf['post_count'][i]
    word_count = len(words.keys())

    df = pd.DataFrame([(date, year, month, k, words[k], post_count, word_count) for k in words], 
                        columns=['date', 'year', 'month', 'term', 'count', 'post_count', 'word_count'])
    if tdm_df is None:
        tdm_df = df
    else:
        tdm_df = pd.concat([tdm_df, df])

tdm_df['prop'] = 1.0 * tdm_df['count'] / tdm_df['post_count']
tdm_df.to_csv(tdm_savefile, index=False)
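
A side note on the loop above: calling pd.concat inside the loop copies the accumulated frame on every iteration. Collecting the per-month frames in a list and concatenating once is equivalent and scales better; a sketch of just that change:

In [ ]:
# equivalent construction of tdm_df: build per-month frames, concatenate once at the end
frames = []
for i in range(len(alltextdf)):
    words = words_in_post(alltextdf['alltext'][i])
    date = alltextdf['date'][i]
    frames.append(pd.DataFrame(
        [(date, date.year, date.month, k, words[k],
          alltextdf['post_count'][i], len(words)) for k in words],
        columns=['date', 'year', 'month', 'term', 'count', 'post_count', 'word_count']))
tdm_df = pd.concat(frames, ignore_index=True)
tdm_df['prop'] = 1.0 * tdm_df['count'] / tdm_df['post_count']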

Which terms are popular each month?

  • Read in our TDM from CSV
In [11]:
tdm_df = pd.read_csv(tdm_savefile, parse_dates=[0])
tdm_df.head(2)
#display(tdm_df.tail(2))
Out[11]:
date year month term count post_count word_count prop
0 2011-01-01 00:00:00 2011 1 secondly 1 88 1745 0.011364
1 2011-01-01 00:00:00 2011 1 sbnation 2 88 1745 0.022727
In [14]:
import itertools
linecycler = itertools.cycle(['-', '--', ':'])

fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(111)

terms = sorted(['java', 'php', 'python', 'rails', 'django', 'hadoop', 'ember', 'angularjs', 'meteor', 'javascript'])

# pull out only the terms we care about
df = tdm_df[tdm_df.term.isin(terms)][['date', 'term', 'prop']]

for p in terms:
    subdf = df[df['term'] == p][['date', 'prop']]
    subdf = subdf.set_index(['date'])
    ax.plot(subdf.index, subdf.values, linestyle=next(linecycler), label=p, linewidth=9)

plt.legend(loc=2)
plt.show()
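
The same figure can be drawn without the explicit loop by pivoting the term proportions into a date × term table, mirroring the year/month pivot used earlier (a sketch; line styles and widths are left at their defaults):

In [ ]:
# wide layout: one column of proportions per term, indexed by date
wide = df.pivot(index='date', columns='term', values='prop')
wide.plot(figsize=(15, 8))
plt.legend(loc=2)
plt.show()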