#!/usr/bin/env python
# coding: utf-8
"""Scrape the waitbutwhy.com archive and chart approximate words published per month.

Notebook export: the ``# In[n]:`` markers below are the original cell
boundaries.  Bare expressions at the end of a cell (``posts[...]``,
``monthly``) exist only so the notebook displays them.
"""

# In[108]:

# `plt` is bound to pyplot (the conventional alias); pyplot exposes `style`,
# so `plt.style.use` does not depend on the submodule having been loaded as a
# side effect of something else.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyquery import PyQuery as pq
import requests
import re

pd.set_option('display.max_rows', 500)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('ggplot')


# In[46]:

def get_content(url, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of hanging the whole scrape.

    Returns:
        The ``content`` attribute of the ``requests`` response.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    result = requests.get(url, headers=headers, timeout=timeout)
    return result.content


# In[117]:

# Parallel lists: index i of each list describes the same post.
links = []
words = []
names = []
dates = []

# Runs of whitespace / basic punctuation; the number of such runs in a text is
# used as an approximation of its word count.
spaces = re.compile(r"[\s.,:]+")
# Captures the "YYYY/MM" path segment of a post URL (years 2000-2099).
date_in_url = re.compile(r"/(20[0-9][0-9]/[0-9][0-9])/")

for page in ["http://waitbutwhy.com/archive", "http://waitbutwhy.com/archive/page/2"]:
    p = pq(get_content(page))
    for post in p(".post-list li h5 a"):
        link = post.get("href")

        # Resolve the month bucket before fetching or appending anything, so a
        # link without a /YYYY/MM/ segment can neither crash the run nor leave
        # the parallel lists with different lengths.
        date_match = date_in_url.search(link or "")
        if date_match is None:
            print("Skipping link without /YYYY/MM/ in its URL: {}".format(link))
            continue

        l = pq(get_content(link))
        text = l('.entry-content').text()
        footer = l('#social-ads').text()

        # The '#social-ads' count is subtracted from the '.entry-content'
        # count -- presumably that element is nested inside the post body;
        # verify against the site's markup.
        wordcount = len(spaces.findall(text))
        footercount = len(spaces.findall(footer))

        links.append(link)
        names.append(l('.entry-header h1').text())
        words.append(wordcount - footercount)
        dates.append(date_match.group(1))


# In[118]:

# One zero-word placeholder row per month of 2014-2016, so that months without
# any post still get a (zero-height) bar in the chart.
months = ['{}/{:02d}'.format(y, m) for y in range(2014, 2017) for m in range(1, 13)]
links.extend([None] * len(months))
names.extend([None] * len(months))
dates.extend(months)
words.extend([0] * len(months))


# In[119]:

posts = pd.DataFrame({"name": names, "words": words, "link": links, "date": dates})
# Real posts only (placeholders have 0 words), longest first.
posts[posts['words'] != 0].sort_values(by='words', ascending=False)


# In[120]:

# Sum only the numeric column: 'name' and 'link' hold strings mixed with None,
# which newer pandas versions no longer silently drop from a groupby sum.
# Selecting with a list keeps `monthly` a one-column DataFrame.
monthly = posts.groupby('date')[['words']].sum()
monthly


# In[121]:

ax = monthly.plot(kind='bar', figsize=(13, 4), title='Total words of WBW long posts per month', lw=2)
ax.set_ylim(-1000, 45000)
ax.set_xlabel("Month")
ax.set_ylabel("Words")


# In[76]:


# In[ ]: