# Analysis of dRGDP by debt/GDP category (debtgdp) using 'RR-processed.csv'.
#
# The script loads the data, bins debtgdp into categories, and then reports
# category means/medians/counts for the full sample and for progressively
# restricted samples, followed by OLS fits and violin plots.
#
# NOTE: bare expressions such as ``RR.groupby('Country').size()`` only display
# a value in an interactive session (notebook/REPL); when run as a plain
# script their results are discarded.

import statsmodels.api as sm
import patsy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

RR = pd.read_csv('RR-processed.csv')

# Number of rows per country.
RR.groupby('Country').size()

# ---------------------------------------------------------------------------
# Debt/GDP categories
# ---------------------------------------------------------------------------
# Intervals are closed on the left: [0, 30), [30, 60), [60, 90), [90, inf).
# pd.cut(..., right=False) yields the same labels as the former
# ``np.digitize(...) - 1`` lookup for in-range values, but leaves NaN for
# missing or negative debtgdp instead of raising IndexError (NaN) or silently
# wrapping around to the last label (negative values).
# ``astype(object)`` keeps the column as plain strings rather than a
# categorical, as before.
bins = ["0-30%", "30-60%", "60-90%", "Above 90%"]
RR['dgcat'] = pd.cut(
    RR.debtgdp, [0, 30, 60, 90, np.inf], right=False, labels=bins
).astype(object)

# Expanded categories: the top bucket is split at 120%.
bins = ["0-30%", "30-60%", "60-90%", "90-120%", "Above 120%"]
RR['dgcat2'] = pd.cut(
    RR.debtgdp, [0, 30, 60, 90, 120, np.inf], right=False, labels=bins
).astype(object)

# ---------------------------------------------------------------------------
# OLS of dRGDP on the category dummies
# ---------------------------------------------------------------------------
y, X = patsy.dmatrices('dRGDP ~ dgcat', data=RR[['dRGDP', 'dgcat']].dropna())
print(sm.OLS(y, X).fit().summary())

y2, X2 = patsy.dmatrices('dRGDP ~ dgcat2', data=RR[['dRGDP', 'dgcat2']].dropna())
print(sm.OLS(y2, X2).fit().summary())

# ---------------------------------------------------------------------------
# Category means, full sample
# ---------------------------------------------------------------------------
## Country-Year average by debtgdp ("correct weights")
RR.dRGDP.groupby(RR.dgcat).mean()

## Averaged Country averages by debtgdp ("equal weights")
RR.dRGDP.groupby([RR.Country, RR.dgcat]).mean().unstack()

## Country-Year average by debtgdp ("correct weights") expanded categories
RR.dRGDP.groupby(RR.dgcat2).mean()

## Averaged Country averages by debtgdp ("equal weights")
RR.dRGDP.groupby([RR.Country, RR.dgcat2]).mean().unstack()

# ---------------------------------------------------------------------------
# Restricted sample: drop early years for three countries
# ---------------------------------------------------------------------------
# Each country/year condition is parenthesised explicitly; ``&`` binds tighter
# than ``|``, so this is the same mask the unparenthesised form produced.
idx = (
    ((RR.Country == 'New Zealand') & (RR.Year < 1950))
    | ((RR.Country == 'Australia') & (RR.Year < 1951))
    | ((RR.Country == 'Canada') & (RR.Year < 1951))
)
RR_selective = RR[~idx]

RR_selective.dRGDP.groupby(RR_selective.dgcat).mean()

# numeric_only=True: the frame contains string columns (Country, dgcat, ...),
# which current pandas refuses to average.
RR_selective.mean(numeric_only=True)

RR_selective.dRGDP.groupby([RR_selective.Country, RR_selective.dgcat]).mean().unstack()

# ---------------------------------------------------------------------------
# Restricted sample, additionally dropping five countries entirely
# ---------------------------------------------------------------------------
drop = ["Australia", "Austria", "Belgium", "Canada", "Denmark"]
idx = ~RR_selective.Country.isin(drop)
RR_selective_spreadsheet = RR_selective[idx]

# Group by the subset's own column instead of relying on index alignment
# with the unfiltered RR frame.
RR_selective_spreadsheet.dRGDP.groupby(RR_selective_spreadsheet.dgcat).mean()

# ---------------------------------------------------------------------------
# Same sample plus the hard-coded New Zealand value (-7.9)
# ---------------------------------------------------------------------------
RR_selective_spreadsheet_transcription = RR_selective_spreadsheet.copy()

# Single .loc assignment instead of chained ``df.RGDP[mask] = ...`` so the
# write reliably lands on this frame.
# NOTE(review): this overwrites RGDP, not dRGDP, so the dRGDP means computed
# below are unaffected by it; the -7.9 value is applied to the table of means
# further down. Confirm whether dRGDP was intended here.
RR_selective_spreadsheet_transcription.loc[
    RR_selective_spreadsheet_transcription.Country == 'New Zealand', 'RGDP'
] = -7.9

RR_selective_spreadsheet_transcription.dRGDP.groupby(
    RR_selective_spreadsheet_transcription.dgcat
).mean()

a = RR_selective_spreadsheet_transcription.Country
b = RR_selective_spreadsheet_transcription.dgcat
RR_selective_spreadsheet_transcription.dRGDP.groupby(b).mean()

# Country x category table of mean dRGDP, with the New Zealand / "Above 90%"
# cell replaced by -7.9, then averaged over countries.
published_means = (
    RR_selective_spreadsheet_transcription.dRGDP.groupby([a, b]).mean().unstack()
)
# ``.ix`` was removed from pandas; ``.loc`` is the label-based equivalent.
published_means.loc['New Zealand', 'Above 90%'] = -7.9
published_means.mean()

# ---------------------------------------------------------------------------
# Medians
# ---------------------------------------------------------------------------
RR.dRGDP.groupby(RR.dgcat).median()  # Correct, equal weight
RR.dRGDP.groupby(RR.dgcat2).median()  # Correct, expanded categories, equal weight

# ---------------------------------------------------------------------------
# Observation counts per category for each sample
# ---------------------------------------------------------------------------
RR.Country.groupby([RR.Country, RR.dgcat]).size().unstack().sum()
RR_selective.Country.groupby(
    [RR_selective.Country, RR_selective.dgcat]
).size().unstack().sum()
RR_selective_spreadsheet.Country.groupby(
    [RR_selective_spreadsheet.Country, RR_selective_spreadsheet.dgcat]
).size().unstack().sum()

# ---------------------------------------------------------------------------
# Violin plots of dRGDP per category
# ---------------------------------------------------------------------------
labels = ["0-30%", "30-60%", "60-90%", "Above 90%"]
dat = [np.array(RR.dRGDP[RR.dgcat == x]) for x in labels]
print(sm.graphics.violinplot(dat, labels=labels))

labels = ["0-30%", "30-60%", "60-90%", "90-120%", "Above 120%"]
dat = [np.array(RR.dRGDP[RR.dgcat2 == x]) for x in labels]
print(sm.graphics.violinplot(dat, labels=labels))

# ---------------------------------------------------------------------------
# Category means restricted to observations from a given start year onward
# ---------------------------------------------------------------------------
years = range(1950, 2001, 10)


def f(x):
    """Return ``(x, mean dRGDP per dgcat)`` using only rows with Year >= x."""
    recent = RR[RR.Year >= x]  # filter once rather than once per operand
    return (x, recent.dRGDP.groupby(recent.dgcat).mean())


[f(x) for x in years]

# ---------------------------------------------------------------------------
# OLS with lagged dRGDP (and country dummies)
# ---------------------------------------------------------------------------
# Previous row's dRGDP within each country. ``groupby(...).shift()`` returns a
# Series on RR's own index, so it can be assigned straight back; the earlier
# ``apply(lambda x: x.shift())`` form yields a group-keyed index on current
# pandas and cannot be.
# NOTE(review): "previous row" equals "previous year" only if rows are sorted
# by Year within each country - confirm against the CSV.
RR['dRGDP_lag'] = RR.dRGDP.groupby(RR.Country).shift()

y, X = patsy.dmatrices(
    'dRGDP ~ dgcat + dRGDP_lag',
    data=RR[['dRGDP', 'dgcat', 'dRGDP_lag']].dropna(),
)
print(sm.OLS(y, X).fit().summary())

y, X = patsy.dmatrices(
    'dRGDP ~ dgcat + dRGDP_lag + Country',
    data=RR[['dRGDP', 'dgcat', 'dRGDP_lag', 'Country']].dropna(),
)
print(sm.OLS(y, X).fit().summary())