import matplotlib.pyplot as plt
from IPython.core import display

# CSV output paths: parsed posts and the term/document table.
posts_savefile = 'posts.csv'
tdm_savefile = 'posts_tdm.csv'

# Hacker News item ids of the threads to analyse, grouped by year.
# Within a year the ids are listed in month order, starting at month 1.
_HN_ITEM_URL = 'https://news.ycombinator.com/item?id=%d'
_THREAD_IDS = (
    (2011, (2057704, 2161360, 2270790, 2396027, 2503204, 2607052,
            2719028, 2831646, 2949787, 3060221, 3181796, 3300290)),
    (2012, (3412900, 3537881, 3652041, 3783657, 3913997, 4053076,
            4184755, 4323597, 4463689, 4596375, 4727241, 4857714)),
    (2013, (4992617, 5150834, 5304169, 5472746, 5637663, 5803764,
            5970187, 6139927, 6310234, 6475879, 6653437, 6827554)),
    (2014, (6995020, 7162197, 7324236, 7507765, 7679431)),
)

# Flat tuple of (year, month, url) triples, oldest first.
urls = tuple(
    (year, month, _HN_ITEM_URL % item_id)
    for year, item_ids in _THREAD_IDS
    for month, item_id in enumerate(item_ids, 1)
)

def filename(year, month):
    """Return the relative path of the saved HTML file for a year/month."""
    path_template = 'html/hn_%d_%d.html'
    return path_template % (year, month)

# Tabular copy of the url list (could be written to CSV later).
import pandas as pd
url_columns = ['year', 'month', 'url']
urlsdf = pd.DataFrame(list(urls), columns=url_columns)
urlsdf.head(3)

from bs4 import BeautifulSoup
import collections
import os.path
import requests
import time

# Download every thread in `urls` to a local html file, following 'More'
# pagination links. Entries that fail are re-queued and retried until the
# overall attempt budget (`tries`) is used up.
stack = collections.deque(urls)
tries = len(stack) * 3 # maximum attempts 3 times of number of URLs

while tries > 0:
    # every loop iteration, successful or not, uses up one attempt
    tries -= 1
    # take the next entry from the right-hand end of the deque
    current = stack.pop()
    year, month, url = current

    # local html output file
    fname = filename(year, month)
    # pages are appended to this file below, so remove any existing copy
    # first to start from a clean file
    if os.path.isfile(fname):
        os.remove(fname)

    try:
        # get the HN pages for month / year
        ym_pages = [url]
        while ym_pages:
            url = ym_pages.pop()
            print "Fetching URL: %s" % (url)
            r = requests.get(url)

            # fail if bad error code
            if r.status_code != requests.codes.ok:
                raise Exception('Error from server: ' + str(r.status_code))

            text = r.text.replace('&', '_') # broken HTML escapes breaking BeautifulSoup, removing
            # append this page to the local file (path is relative to the cwd)
            with open(fname, 'a') as htmlfile:
                htmlfile.write(text.encode('utf-8'))
                
            # check for 'More' link
            soup = BeautifulSoup(text)
            links = soup.find_all('a', text='More')
            if links:
                # sometimes forward slash is being html escaped and messed
                # up by the above & replacement, need to replace again
                link_url = 'https://news.ycombinator.com' + links[0]['href'].replace('_#x2F;', '/')
                ym_pages.append(link_url)
            
            # take a break for 30 seconds
            time.sleep(30)
            
    except Exception as e:
        print 'error:', e, 'currently on:', current
        # put the failed entry at the left end of the deque; entries are
        # popped from the right, so it is retried after the remaining ones
        stack.appendleft(current)        
        
    # get out when stack is empty
    if not stack: break


from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

def html_to_posts(html):
    """Extract the top-level comments from one saved HTML document.

    Returns a list of dicts, one per kept comment, with the keys
    'user' (text of the link in the comment header) and 'post' (the
    comment text encoded as UTF-8).

    Rows are skipped when they have no nested table, when the spacer
    image width is not '0' (i.e. not a top-level comment), or when the
    comment text is exactly '[deleted]' or '[dead]'.
    """
    # Unclosed <br> tags confuse BeautifulSoup, so self-close them first.
    soup = BeautifulSoup(html.replace("<br>", "<br/>"))

    # Walk down to the table that holds the comment rows.
    outer_rows = soup.body.center.table('tr', recursive=False)
    inner_tables = outer_rows[2].td('table', recursive=False)
    comments_table = inner_tables[1] if len(inner_tables) > 1 else inner_tables[0]

    posts = []
    for row in comments_table('tr', recursive=False):
        # a comment row wraps its content in td > table
        if not (row.td and row.td.table):
            continue

        # the leading spacer image has width '0' for top-level comments
        spacer_img = row.td.table.tr.td.img
        if spacer_img['width'] != '0':
            continue

        comment_tag = row.find_all('span', class_='comment')[0]
        comment = comment_tag.get_text(separator=' ')
        if comment in ('[deleted]', '[dead]'):
            continue

        head_tag = row('span', class_='comhead')[0]
        posts.append({'user': head_tag.a.text, 'post': comment.encode('utf-8')})

    return posts


# Parse every saved HTML file into post dicts tagged with the month's date.
# To re-parse a single month while debugging, narrow `urls`, e.g.:
# urls = [(2012, 7, 'https://news.ycombinator.com/item?id=4184755')]
posts = []
html_end = '</html>'

for current in urls:
    year, month, url = current

    # read the whole file and close the handle straight away
    with open(filename(year, month)) as htmlfile:
        all_html = htmlfile.read()

    # every post of this file gets the first day of its month as date
    post_date = pd.datetime(year, month, 1)

    # A saved file holds the downloaded pages appended back to back, so
    # cut it at each closing </html> tag and parse the pieces one by one.
    start_html = 0
    while start_html < len(all_html):
        tag_pos = all_html.find(html_end, start_html)
        if tag_pos == -1:
            # No closing tag left. Trailing whitespace is not a document;
            # anything else is parsed as one final, unterminated document.
            # (Without this check the slice end would be -1 + 7 = 6 and
            # the scan position would jump back to offset 6.)
            if not all_html[start_html:].strip():
                break
            next_start = len(all_html)
        else:
            next_start = tag_pos + len(html_end)

        html = all_html[start_html:next_start]
        start_html = next_start

        for post in html_to_posts(html):
            post['date'] = post_date
            posts.append(post)


# Persist the parsed posts, then reload them so that the 'date' column
# (column 0) is parsed back into datetimes.
post_columns = ['date', 'user', 'post']
pd.DataFrame(posts, columns=post_columns).to_csv(posts_savefile, index=False)
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])

# quick look at both ends of the table
postsdf.head(3)
postsdf.tail(3)

# add year and month columns to the posts table
for part in ('year', 'month'):
    postsdf[part] = [getattr(stamp, part) for stamp in postsdf['date']]

# number of posts per date, with the same year / month helper columns
ymdf = pd.DataFrame({'count': postsdf.groupby(['date']).size()}).reset_index()
for part in ('year', 'month'):
    ymdf[part] = [getattr(stamp, part) for stamp in ymdf['date']]

# display a table of counts: one row per year, one column per month
ymdf[['year', 'month', 'count']].pivot(index='year', columns='month', values='count')

# distinct years present in the posts table
years = postsdf['year'].unique()

# one wide figure holding two side-by-side panels
fig = plt.figure(figsize=(15, 3))

# left panel: the complete monthly series
ax = fig.add_subplot(1, 2, 1)
monthly_counts = ymdf[['date', 'count']].set_index('date')
monthly_counts.plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Number of Posts Each Month Since January 2011")

# right panel: months as rows, one column (line) per year
ax = fig.add_subplot(1, 2, 2)
df = ymdf[['count', 'year', 'month']].pivot(index='month', columns='year')
df.plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Split Out Per Year")

plt.show()

# Weekday number of each post's date (weekday(): Monday is 0, Sunday is 6).
postsdf['weekday'] = [d.weekday() for d in postsdf['date']]

# Keep only posts dated after 1 March 2011. The cutoff is a pandas
# Timestamp: the `datetime` module is not imported in this file, and a
# Timestamp compares directly against the parsed 'date' column.
mar_2011 = pd.Timestamp('2011-03-01')
after_mar_2011 = postsdf[postsdf['date'] > mar_2011]

posts_date_day = after_mar_2011[['date', 'weekday']]

# a: number of posts per (date, weekday) pair, as a flat table
grouped = posts_date_day.groupby(['date', 'weekday'])
a = pd.DataFrame(grouped.size())
a = a.reset_index()

# c: mean of those per-date counts for each weekday
b = a.groupby('weekday')
c = b.mean()


# Load the stop-word list from the file 'stopwords' in the working
# directory: one entry per line, surrounding whitespace stripped. The
# `with` block closes the file as soon as it has been read.
with open('stopwords') as stopwords_file:
    stopwords = [w.strip() for w in stopwords_file]

# Reload the saved posts so this step can start from the CSV;
# column 0 ('date') is parsed as datetimes.
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])#[:2500]

postsdf.tail(3)

import re
# Copy of the posts table without the 'user' column; the per-date text
# grouping further down only reads 'date' and 'post'.
postsdf2 = postsdf.drop('user', axis=1)

def merge(v):
    """Join the strings in `v` into one space-separated string.

    Missing entries (None / NaN) are skipped. Empty CSV fields come back
    from `pd.read_csv` as NaN, and passing a float to `str.join` raises
    a TypeError.

    v -- iterable of strings, e.g. the 'post' values of one group.
    Returns '' for an empty iterable.
    """
    return ' '.join(text for text in v if pd.notnull(text))

def words_in_post(post, stop_words=None):
    """Count how often each word occurs in `post`.

    The text is split on whitespace after replacing every character of
    the class below with a space: '.', ',', '-', '/', '(', ')', ';',
    '[', ']', ':', '!', '"', '?', '=', '_', '|', the space itself and
    the digits 0-9. (Inside [...] the '|' is a literal character, not
    an alternation.) Words are lower-cased before counting.

    post       -- the text to analyse.
    stop_words -- optional iterable of words to leave out; defaults to
                  the module-level `stopwords` list.

    Returns a plain dict mapping word -> number of occurrences.
    """
    if stop_words is None:
        stop_words = stopwords
    # a set makes the per-word membership test O(1) instead of a list scan
    excluded = set(stop_words)

    cleaned = re.sub(r'[\. |, |\-|/|\(|\)|;|\[|\]|:|!|"|?|=|_|0-9]', ' ', post)
    # str.split() never yields empty strings, so no extra emptiness check
    tokens = [word for word in cleaned.lower().split() if word not in excluded]

    return dict(collections.Counter(tokens))

# One row per date: how many posts there were, and all of their text
# merged into a single string.
grouped = postsdf2.groupby(['date'])
alltextdf = pd.DataFrame({
    'post_count': grouped.size(),
    'alltext': grouped['post'].apply(merge),
}).reset_index()

# Build the term table: one row per (date, term) with the term's count,
# the number of posts for that date and the number of distinct terms.
tdm_columns = ['date', 'year', 'month', 'term', 'count', 'post_count', 'word_count']
tdm_rows = []
for date, alltext, post_count in zip(alltextdf['date'],
                                     alltextdf['alltext'],
                                     alltextdf['post_count']):
    words = words_in_post(alltext)
    word_count = len(words)  # number of distinct terms for this date
    tdm_rows.extend(
        (date, date.year, date.month, term, count, post_count, word_count)
        for term, count in words.items()
    )

# Create the frame once from all rows instead of concatenating inside the
# loop; with no rows this still yields an (empty) frame with the columns.
tdm_df = pd.DataFrame(tdm_rows, columns=tdm_columns)

# occurrences of the term per post of that date
tdm_df['prop'] = 1.0 * tdm_df['count'] / tdm_df['post_count']
tdm_df.to_csv(tdm_savefile, index=False)

# reload from disk; column 0 ('date') is parsed as datetimes
tdm_df = pd.read_csv(tdm_savefile, parse_dates=[0])
tdm_df.head(2)

import itertools
# line styles are handed out in rotation, one per plotted term
linecycler = itertools.cycle(['-', '--', ':'])

fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(1, 1, 1)

terms = sorted(['java', 'php', 'python', 'rails', 'django', 'hadoop', 'ember', 'angularjs', 'meteor', 'javascript'])

# keep only the rows of the terms we care about
is_tracked = tdm_df.term.isin(terms)
df = tdm_df[is_tracked][['date', 'term', 'prop']]

# one line per term: 'prop' over 'date'
for p in terms:
    term_rows = df['term'] == p
    subdf = df[term_rows][['date', 'prop']].set_index(['date'])
    ax.plot(subdf.index, subdf.values, linestyle=next(linecycler), label=p, linewidth=9)

plt.legend(loc=2)
plt.show()