#!/usr/bin/env python
# coding: utf-8

# # Stanford tuition fun
# 
# 
# ["The Second Tuition Bomb" - The Stanford Illustrated Review (books.google.com)](https://books.google.com/books?id=dMDmAAAAMAAJ&lpg=PT5&ots=JM5OTxRHRl&dq=stanford%20first%20tuition%20increase&pg=PT5#v=onepage&q&f=false) 
# 
# 
# Stanford lists the historical tuition prices here: [Finances facts](http://facts.stanford.edu/administration/finances)
# 
# 
# Let's see how charting can better illustrate the data.
# 
# 
# ## Download the data
# 

# In[6]:


# scraping stanford's tuition page

import pandas as pd
import csv
from lxml import html
import requests
import re
# live site is at: "http://facts.stanford.edu/administration/finances"
url = 'http://stash.compjour.org/mirrors/facts.stanford.edu/administration/finances.html'
resp = requests.get(url)
# Fail loudly on an HTTP error instead of trying to parse an error page.
resp.raise_for_status()
doc = html.fromstring(resp.text)
# The data of interest lives in the fourth <table> element of the page.
table = doc.cssselect('table')[3]

# Build [start_year, cost] pairs from every table row after the first one
# (the first row is skipped; presumably it is the header row).
rows = []
for trs in table.cssselect('tr')[1:]:
    # Each row is expected to hold exactly two cells: year (or range) and cost.
    yr, cost = [t.text for t in trs]
    # cut off the "1959" part of "1950-1959"
    # Raw string for the regex: '\D' in a plain literal is an invalid escape.
    rows.append([int(yr.split('-')[0]), int(re.sub(r'\D', '', cost))])

# alternatively
# rows = [( int(tds[0].text.split('-')[0]), int(re.sub('\D', '', tds[1].text))) for tds in
#              [trs for trs in table.cssselect('tr')[1:]]]


# Now we need to fill in the gaps between the decades; for years in which no tuition is specified, we assume it's the same tuition as the previous year.
# 
# Warning: convoluted code to follow

# In[5]:


# make a row for every year
# Walk the scraped rows in order; whenever two consecutive entries are more
# than one year apart, insert a row for each missing year that carries the
# earlier entry's tuition forward.
tuition_rows = []
for entry in rows:
    if tuition_rows:
        prev_year, prev_cost = tuition_rows[-1]
        for gap_year in range(prev_year + 1, entry[0]):
            tuition_rows.append([gap_year, prev_cost])
    tuition_rows.append(entry)

# Now make a dataframe
tuition_df = pd.DataFrame(tuition_rows, columns=['year', 'tuition'])
tuition_df.head()


# ### Make an inflation-calculation function
# 
# Download some CPI/inflation data from OKFN
# 
# [US Consumer Price Index and Inflation (CPI)](http://data.okfn.org/data/core/cpi-us)
# 

# In[9]:


########################
# Set up inflation calculator

url = 'https://raw.githubusercontent.com/datasets/cpi-us/master/data/cpiai.csv'
cpi_resp = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
cpi_resp.raise_for_status()
cpidata = list(csv.reader(cpi_resp.text.splitlines()))
# First CSV line is the header; everything after it is data (all strings).
cpidf = pd.DataFrame(cpidata[1:], columns = cpidata[0])
# DataFrame.convert_objects was removed from pandas; convert explicitly.
# Unparseable values become NaT / NaN, matching the old 'coerce' behaviour.
cpidf['Date'] = pd.to_datetime(cpidf['Date'], errors='coerce')
for col in cpidf.columns.drop('Date'):
    cpidf[col] = pd.to_numeric(cpidf[col], errors='coerce')
# Average the numeric columns per calendar year; the result is indexed by year.
cpimean_df = cpidf.groupby(cpidf['Date'].dt.year).mean(numeric_only=True)

def adjust_for_inflation(amt, from_year, to_year):
    """Convert ``amt`` from ``from_year`` dollars into ``to_year`` dollars.

    The amount is scaled by the ratio of the two years' values in the
    ``Index`` column of the module-level ``cpimean_df`` table (yearly CPI
    means), and the result is rounded to two decimal places.
    """
    yearly_cpi = cpimean_df['Index']
    scale = yearly_cpi[to_year] / yearly_cpi[from_year]
    adjusted = scale * amt
    return round(adjusted, 2)

# Express each year's tuition in 2014 dollars, computed one row at a time.
tuition_df['adjusted_tuition'] = tuition_df.apply(
    lambda record: adjust_for_inflation(
        amt=record['tuition'],
        from_year=record['year'],
        to_year=2014,
    ),
    axis=1,
)
tuition_df.head(15)


# Now we can chart.
# 

# In[10]:


import matplotlib.pyplot as pyplot
# this part is needed if you are doing this in an iPython notebook
# get_ipython only exists inside an IPython/Jupyter session; when this file
# runs as a plain script the name is undefined, so skip the magic there.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# Sans inflation:

# In[11]:


# Line chart of the tuition column as scraped (no inflation adjustment).
pyplot.plot(
    tuition_df['year'],
    tuition_df['tuition'],
)


# Now with adjustments for inflation:

# In[13]:


# Line chart of the inflation-adjusted tuition column against year.
pyplot.plot(
    tuition_df['year'],
    tuition_df['adjusted_tuition'],
)


# On the same chart:

# In[14]:


# Draw the unadjusted and the inflation-adjusted series on the same axes.
pyplot.plot(tuition_df['year'], tuition_df['tuition'], label = 'Unadjusted', color = 'orange')
pyplot.plot(tuition_df['year'], tuition_df['adjusted_tuition'], label = 'Adjusted', color = 'red')
# The label= arguments above are only displayed once a legend is drawn.
pyplot.legend()


# Truncated:

# In[21]:


# Keep only the rows for years after 2000.
xdf = tuition_df[tuition_df['year'] > 2000]

pyplot.plot(xdf['year'], xdf['tuition'], label = 'Unadjusted', color = 'orange')
pyplot.plot(xdf['year'], xdf['adjusted_tuition'], label = 'Adjusted', color = 'red')
# Anchor the y-axis at zero so the two series are compared on a full scale.
pyplot.ylim(ymin = 0)
# The label= arguments above are only displayed once a legend is drawn.
pyplot.legend()


# In[ ]: