import statsmodels.api as sm
import patsy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

RR = pd.read_csv('RR-processed.csv')

RR.groupby('Country').size()

bins = ["0-30%","30-60%","60-90%","Above 90%"]
RR['dgcat'] = np.digitize(RR.debtgdp, [0,30,60,90,np.inf]) - 1
RR.dgcat = [bins[x] for x in RR.dgcat]

bins = ["0-30%","30-60%","60-90%","90-120%","Above 120%"]
RR['dgcat2'] = np.digitize(RR.debtgdp, [0,30,60,90,120,np.inf]) - 1
RR.dgcat2 = [bins[x] for x in RR.dgcat2]

y,X = patsy.dmatrices('dRGDP ~ dgcat', data=RR[['dRGDP', 'dgcat']].dropna())
print sm.OLS(y,X).fit().summary()

y2,X2 = patsy.dmatrices('dRGDP ~ dgcat2', data=RR[['dRGDP', 'dgcat2']].dropna())
print sm.OLS(y2,X2).fit().summary()

## Country-Year average by debtgdp ("correct weights")
RR.dRGDP.groupby(RR.dgcat).mean()

## Averaged Country averages by debtgdp ("equal weights")
RR.dRGDP.groupby([RR.Country, RR.dgcat]).mean().unstack()

## Country-Year average by debtgdp ("correct weights") expanded categories
RR.dRGDP.groupby(RR.dgcat2).mean()

## Averaged Country averages by debtgdp ("equal weights")
RR.dRGDP.groupby([RR.Country, RR.dgcat2]).mean().unstack()

idx = (RR.Country == 'New Zealand') & (RR.Year < 1950) | (RR.Country == 'Australia') & (RR.Year < 1951) | (RR.Country == 'Canada') & (RR.Year < 1951) 
RR_selective = RR[idx == False]
RR_selective.dRGDP.groupby(RR_selective.dgcat).mean()

RR_selective.mean()

RR_selective.dRGDP.groupby([RR_selective.Country, RR_selective.dgcat]).mean().unstack()

drop = ["Australia","Austria","Belgium","Canada","Denmark"]
idx = [False if x in drop else True for x in RR_selective.Country]
RR_selective_spreadsheet = RR_selective[idx]
RR_selective_spreadsheet.dRGDP.groupby(RR.dgcat).mean()

RR_selective_spreadsheet_transcription = RR_selective_spreadsheet.copy()
RR_selective_spreadsheet_transcription.RGDP[RR_selective_spreadsheet_transcription.Country=='New Zealand'] = -7.9
RR_selective_spreadsheet_transcription.dRGDP.groupby(RR.dgcat).mean()

a = RR_selective_spreadsheet_transcription.Country
b = RR_selective_spreadsheet_transcription.dgcat
RR_selective_spreadsheet_transcription.dRGDP.groupby(b).mean()

published_means = RR_selective_spreadsheet_transcription.dRGDP.groupby([a,b]).mean().unstack()
published_means.ix['New Zealand', 'Above 90%'] = -7.9
published_means.mean()

RR.dRGDP.groupby(RR.dgcat).median() # Correct, equal weight

RR.dRGDP.groupby(RR.dgcat2).median() # Correct, expanded categories, equal weight

RR.Country.groupby([RR.Country, RR.dgcat]).size().unstack().sum()

RR_selective.Country.groupby([RR.Country, RR.dgcat]).size().unstack().sum()

RR_selective_spreadsheet.Country.groupby([RR.Country, RR.dgcat]).size().unstack().sum()

labels = ["0-30%","30-60%","60-90%","Above 90%"]
dat = [np.array(RR.dRGDP[RR.dgcat==x]) for x in labels]
print sm.graphics.violinplot(dat, labels=labels)

labels = ["0-30%","30-60%","60-90%","90-120%","Above 120%"]
dat = [np.array(RR.dRGDP[RR.dgcat2==x]) for x in labels]
print sm.graphics.violinplot(dat, labels=labels)

years = range(1950, 2001, 10)
f = lambda x: (x, RR[RR.Year >= x].dRGDP.groupby(RR[RR.Year >= x].dgcat).mean())
[f(x) for x in years]

RR['dRGDP_lag'] = RR.dRGDP.groupby(RR.Country).apply(lambda x: x.shift())
y,X = patsy.dmatrices('dRGDP ~ dgcat + dRGDP_lag', data=RR[['dRGDP', 'dgcat', 'dRGDP_lag']].dropna())
print sm.OLS(y,X).fit().summary()

y,X = patsy.dmatrices('dRGDP ~ dgcat + dRGDP_lag + Country', data=RR[['dRGDP', 'dgcat', 'dRGDP_lag', 'Country']].dropna())
print sm.OLS(y,X).fit().summary()