#!/usr/bin/env python
# coding: utf-8

# # Stanford tuition fun
#
# ["The Second Tuition Bomb" - The Stanford Illustrated Review (books.google.com)](https://books.google.com/books?id=dMDmAAAAMAAJ&lpg=PT5&ots=JM5OTxRHRl&dq=stanford%20first%20tuition%20increase&pg=PT5#v=onepage&q&f=false)
#
# Stanford lists the historical tuition prices here:
# [Finances facts](http://facts.stanford.edu/administration/finances)
#
# Let's see how charting can better illustrate the data.

# ## Download the data

# In[6]:

# scraping stanford's tuition page
import csv
import re

import matplotlib.pyplot as pyplot
import pandas as pd
import requests
from lxml import html

# Seconds to wait on each HTTP request before giving up, so a dead mirror
# cannot hang the script forever.
REQUEST_TIMEOUT = 30

# Year whose dollars the historical tuition is converted into.
TARGET_YEAR = 2014

# live site is at: "http://facts.stanford.edu/administration/finances"
url = 'http://stash.compjour.org/mirrors/facts.stanford.edu/administration/finances.html'
resp = requests.get(url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()  # fail loudly instead of parsing an error page
doc = html.fromstring(resp.text)

# NOTE(review): the fourth <table> on the page is presumably the tuition
# history table -- confirm if the mirrored page ever changes.
table = doc.cssselect('table')[3]

rows = []
for trs in table.cssselect('tr')[1:]:  # [1:] skips the first (header) row
    yr, cost = [t.text for t in trs]
    # cut off the "1959" part of "1950-1959", and strip every non-digit
    # character from the cost before converting it to an int
    rows.append([int(yr.split('-')[0]), int(re.sub(r'\D', '', cost))])

# alternatively
# rows = [[int(tds[0].text.split('-')[0]), int(re.sub(r'\D', '', tds[1].text))]
#         for tds in table.cssselect('tr')[1:]]


# Now we need to fill in the gaps between the decades; for years in which no
# tuition is specified, we assume it's the same tuition as the previous year.

# In[5]:

def fill_year_gaps(sparse_rows):
    """Return a list with one ``[year, cost]`` row per year.

    Each input row is kept as-is. Before a row is added, every year strictly
    between the previously added year and this row's year is inserted with
    the previous row's cost. Gaps are only filled when the input years are
    in ascending order; otherwise the in-between range is empty.
    """
    filled = []
    for year, cost in sparse_rows:
        if filled:
            last_year, last_cost = filled[-1]
            filled.extend([y, last_cost] for y in range(last_year + 1, year))
        filled.append([year, cost])
    return filled


# make a row for every year
tuition_rows = fill_year_gaps(rows)

# Now make a dataframe
tuition_df = pd.DataFrame(tuition_rows, columns=['year', 'tuition'])
tuition_df.head()


# ### Make an inflation-calculation function
#
# Download some CPI/inflation data from OKFN
#
# [US Consumer Price Index and Inflation (CPI)](http://data.okfn.org/data/core/cpi-us)

# In[9]:

########################
# Set up inflation calculator
url = 'https://raw.githubusercontent.com/datasets/cpi-us/master/data/cpiai.csv'
cpi_resp = requests.get(url, timeout=REQUEST_TIMEOUT)
cpi_resp.raise_for_status()
cpidata = list(csv.reader(cpi_resp.text.splitlines()))
cpidf = pd.DataFrame(cpidata[1:], columns=cpidata[0])

# csv.reader yields strings only. DataFrame.convert_objects was removed from
# pandas, so convert explicitly: 'Date' to datetimes, every other column to
# numbers; values that cannot be parsed become NaT/NaN.
cpidf['Date'] = pd.to_datetime(cpidf['Date'], errors='coerce')
for column in cpidf.columns:
    if column != 'Date':
        cpidf[column] = pd.to_numeric(cpidf[column], errors='coerce')

# Mean CPI index per calendar year, indexed by year. Only 'Index' is
# averaged because it is the only column the calculator reads.
cpimean_df = cpidf.groupby(cpidf['Date'].dt.year)[['Index']].mean()


def adjust_for_inflation(amt, from_year, to_year):
    """Convert ``amt`` from ``from_year`` dollars into ``to_year`` dollars.

    The amount is scaled by the ratio of the two years' mean CPI index taken
    from ``cpimean_df`` and rounded to two decimal places.

    Raises:
        KeyError: if either year is missing from ``cpimean_df``.
    """
    ratio = cpimean_df['Index'][to_year] / cpimean_df['Index'][from_year]
    return round(ratio * amt, 2)


tuition_df['adjusted_tuition'] = tuition_df.apply(
    lambda x: adjust_for_inflation(x['tuition'], x['year'], TARGET_YEAR),
    axis=1)
tuition_df.head(15)


# Now we can chart.

# In[10]:

# this part is needed if you are doing this in an iPython notebook;
# get_ipython only exists inside IPython, so skip it for a plain script run
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# Sans inflation:

# In[11]:

# Each chart gets its own figure so that, outside a notebook, the charts do
# not pile up on one set of axes.
pyplot.figure()
pyplot.plot(tuition_df['year'], tuition_df['tuition'])


# Now with adjustments for inflation:

# In[13]:

pyplot.figure()
pyplot.plot(tuition_df['year'], tuition_df['adjusted_tuition'])


# On the same chart:

# In[14]:

pyplot.figure()
pyplot.plot(tuition_df['year'], tuition_df['tuition'],
            label='Unadjusted', color='orange')
pyplot.plot(tuition_df['year'], tuition_df['adjusted_tuition'],
            label='Adjusted', color='red')
pyplot.legend()  # the labels above are only drawn once a legend is requested


# Truncated:

# In[21]:

xdf = tuition_df[tuition_df['year'] > 2000]
pyplot.figure()
pyplot.plot(xdf['year'], xdf['tuition'], label='Unadjusted', color='orange')
pyplot.plot(xdf['year'], xdf['adjusted_tuition'], label='Adjusted', color='red')
pyplot.ylim(ymin=0)
pyplot.legend()

# Display the figures when run as a plain script.
pyplot.show()


# In[ ]: