#!/usr/bin/env python
# coding: utf-8

# In[108]:


import matplotlib as plt
import numpy as np
import pandas as pd
from pyquery import PyQuery as pq
import requests
import re
# Raise pandas' display limit so frames of up to 500 rows are shown in full.
pd.set_option('display.max_rows', 500)
# IPython line magic `%matplotlib inline`. `get_ipython` is not imported in
# this file -- presumably injected by the IPython/Jupyter runtime; verify
# before running this as a plain script.
get_ipython().run_line_magic('matplotlib', 'inline')
# NOTE(review): `plt` is bound to the top-level `matplotlib` package by this
# file's import (`import matplotlib as plt`), not to `matplotlib.pyplot`.
plt.style.use('ggplot')


# In[46]:


def get_content(url, timeout=30):
    """Download *url* and return the raw response body as bytes.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            error pages are never mistaken for real content.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        ),
    }
    result = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning the error page body.
    result.raise_for_status()
    return result.content


# In[117]:


# Parallel lists: entry i of each list describes the same post.
links = []
words = []
names = []
dates = []
# Runs of whitespace/punctuation; the number of such runs is used as an
# approximate word count. Raw string so `\s` is not an invalid str escape.
spaces = re.compile(r"[\s.,:]+")
# Captures the "20YY/NN" segment of a post URL (used later as the grouping
# key). Compiled once here rather than on every loop iteration.
date_pattern = re.compile(r"/(20[0-9][0-9]/[0-9][0-9])/")
for page in ["http://waitbutwhy.com/archive","http://waitbutwhy.com/archive/page/2"]:
    p = pq(get_content(page))
    for post in p(".post-list li h5 a"):
        link = post.get("href")
        # Resolve the date before fetching or appending anything: a link
        # without a /20YY/NN/ segment (or an anchor without href) is skipped
        # so the four parallel lists can never get out of step.
        date_match = date_pattern.search(link) if link else None
        if date_match is None:
            continue
        article = pq(get_content(link))
        text = article('.entry-content').text()
        footer = article('#social-ads').text()
        # Subtract the '#social-ads' text from the '.entry-content' count.
        wordcount = len(spaces.findall(text))
        footercount = len(spaces.findall(footer))
        links.append(link)
        names.append(article('.entry-header h1').text())
        words.append(wordcount-footercount)
        dates.append(date_match.group(1))


# In[118]:


# Every 'YYYY/MM' key for 2014 through 2016, with the month zero-padded.
months = [
    '{}/{:02d}'.format(y, m)
    for y in range(2014, 2017)
    for m in range(1, 13)
]
# Add one zero-word placeholder entry per month to the parallel lists; the
# placeholder has no link and no name.
for month in months:
    dates.append(month)
    words.append(0)
    names.append(None)
    links.append(None)


# In[119]:


# Assemble the parallel lists into one table, one row per entry.
posts = pd.DataFrame({
    "name": names,
    "words": words,
    "link": links,
    "date": dates,
})
# Notebook display: rows with a non-zero word count, longest first.
posts.loc[posts['words'].ne(0)].sort_values(by='words', ascending=False)


# In[120]:


# Total word count per 'date' key. Only the numeric 'words' column is
# aggregated: 'name' and 'link' are object columns containing None, and
# summing them raises on pandas >= 2.0 (where numeric_only defaults to
# False) instead of being silently dropped. The double brackets keep the
# result a DataFrame with a single 'words' column.
monthly = posts.groupby('date')[['words']].sum()
monthly


# In[121]:


# Bar chart of the monthly frame, one bar group per index entry.
ax = monthly.plot.bar(
    figsize=(13, 4),
    title='Total words of WBW long posts per month',
    lw=2,
)
# Fixed y-range starting slightly below zero.
ax.set_ylim(bottom=-1000, top=45000)
ax.set_xlabel("Month")
ax.set_ylabel("Words")


# In[76]:


# In[ ]: