#!/usr/bin/env python
# coding: utf-8

# In[1]:


#%%
import os
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from patsy import dmatrices, dmatrix, demo_data

import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS

get_ipython().run_line_magic('matplotlib', 'inline')
# run for jupyter notebook
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'
#%%


# Manual Matrix Calculation

# In[2]:


nsample = 100
x = np.linspace(0, 10, 100)
cons = np.ones([nsample,1])
X = np.column_stack((x, x**2, cons))


# In[3]:


beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)


# In[4]:


y = np.dot(X, beta) + e


# $$
# \hat{\beta} = (X'X)^{-1}X'Y
# $$

# In[5]:


beta_hat = np.linalg.inv(X.T @ X)@X.T@y
beta_hat.T


# In[7]:


beta


# In[8]:


beta1 = sm.OLS(y, X).fit()
beta1.params


# In[9]:


y_hat = beta1.predict()
print(beta1.summary())


# In[10]:


plt.plot(y_hat, alpha=0.5, label='predicted')
# Plot observed values
plt.plot(y, alpha=0.5, label='observed')
plt.legend()
plt.show()


# ## Tabulating Multiple Regressions

# In[11]:


df2 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable2.dta')

# Add constant term to dataset
df2['const'] = 1


# In[12]:


# Create lists of variables to be used in each regression
X1 = ['const', 'avexpr']
X2 = ['const', 'avexpr', 'lat_abst']
X3 = ['const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other']

# Estimate an OLS regression for each set of variables
reg1 = sm.OLS(df2['logpgp95'], df2[X1], missing='drop').fit(cov_type='HC2')
reg2 = sm.OLS(df2['logpgp95'], df2[X2], missing='drop').fit(cov_type='HC2')
reg3 = sm.OLS(df2['logpgp95'], df2[X3], missing='drop').fit(cov_type='HC2')


# In[13]:


from statsmodels.iolib.summary2 import summary_col

info_dict={'R-squared' : lambda x: "{:.2f}".format(x.rsquared),
           'No. observations' : lambda x: "{0:d}".format(int(x.nobs))}

results_table = summary_col(results=[reg1,reg2,reg3],
                            float_format='%0.2f',
                            stars = True,
                            model_names=['Model 1',
                                         'Model 3',
                                         'Model 4'],
                            info_dict=info_dict,
                            regressor_order=['const',
                                             'avexpr',
                                             'lat_abst',
                                             'asia',
                                             'africa'])

results_table.add_title('Table 2 - OLS Regressions')

print(results_table)


# # AJR (2001)

# In[14]:


df1 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable1.dta')
df1.head()
df1.info()


# In[15]:


plt.style.use('seaborn')


# In[16]:


sns.jointplot('avexpr','logpgp95',data=df1,kind='scatter')


# In[17]:


sns.regplot('avexpr','logpgp95',data=df1)


# In[18]:


# Dropping NA's is required to use numpy's polyfit
df1_subset = df1.dropna(subset=['logpgp95', 'avexpr'])

# Use only 'base sample' for plotting purposes
df1_subset = df1_subset[df1_subset['baseco'] == 1]

X = df1_subset['avexpr']
y = df1_subset['logpgp95']
labels = df1_subset['shortnam']

# Replace markers with country labels
plt.scatter(X, y, marker='')

for i, label in enumerate(labels):
    plt.annotate(label, (X.iloc[i], y.iloc[i]))

# Fit a linear trend line
plt.plot(np.unique(X),
         np.poly1d(np.polyfit(X, y, 1))(np.unique(X)),
         color='black')

plt.xlim([3.3,10.5])
plt.ylim([4,10.5])
plt.xlabel('Average Expropriation Risk 1985-95')
plt.ylabel('Log GDP per capita, PPP, 1995')
plt.title('Figure 2: OLS relationship between expropriation risk and income')
plt.show()


# In[19]:


reg1 = smf.ols('logpgp95 ~ avexpr', data=df1, missing='drop').fit(cov_type='HC2')
print(reg1.summary())


# In[20]:


# Drop missing observations from whole sample
df1_plot = df1.dropna(subset=['logpgp95', 'avexpr'])
# Plot predicted values
plt.scatter(df1_plot['avexpr'], reg1.predict(), alpha=0.5, label='predicted')
# Plot observed values
plt.scatter(df1_plot['avexpr'], df1_plot['logpgp95'], alpha=0.5, label='observed')
plt.legend()
plt.title('OLS predicted values')
plt.xlabel('avexpr')
plt.ylabel('logpgp95')
plt.show()


# ## Instrumental Variables

# In[21]:


# Dropping NA's is required to use numpy's polyfit
df1_subset2 = df1.dropna(subset=['logem4', 'avexpr'])

X = df1_subset2['logem4']
y = df1_subset2['avexpr']
labels = df1_subset2['shortnam']


# In[22]:


# Replace markers with country labels
plt.scatter(X, y, marker='')

for i, label in enumerate(labels):
    plt.annotate(label, (X.iloc[i], y.iloc[i]))

# Fit a linear trend line
plt.plot(np.unique(X),
         np.poly1d(np.polyfit(X, y, 1))(np.unique(X)),
         color='black')

# plt.xlim([1.8,8.4])
# plt.ylim([3.3,10.4])
plt.xlabel('Log of Settler Mortality')
plt.ylabel('Average Expropriation Risk 1985-95')
plt.title('Figure 3: First-stage relationship between settler mortality and expropriation risk')


# Manual Calculation

# $$ \beta_{IV} = \frac{ \beta_{rf} }{ \beta_{fs} } $$

# where 
# $$ \beta_{fs} = \frac{Cov(X,Z)}{Var(Z)} $$
# 
# and
# 
# $$ \beta_{rf} = \frac{Cov(Y,Z)}{Var(Z)} $$

# Which yields
# $$
# \beta_{IV} = \frac{Cov(Y,Z)}{Cov(X,Z)}
# $$

# In[23]:


# Import and select the data
df4 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable4.dta')
df4 = df4[df4['baseco'] == 1]

# Add a constant variable
df4['const'] = 1


# *First Stage*

# In[24]:


# Fit the first stage regression and print summary
results_fs = sm.OLS(df4['avexpr'],
                    df4[['const', 'logem4']],
                    missing='drop').fit()
print(results_fs.summary())


# In[25]:


df4['predicted_avexpr'] = results_fs.predict()
results_ss = sm.OLS(df4['logpgp95'],
                    df4[['const', 'predicted_avexpr']]).fit()
print(results_ss.summary())


# ### IV subroutine from linearmodels 

# In[26]:


from linearmodels.iv import IV2SLS
iv = IV2SLS(dependent=df4['logpgp95'],
            exog=df4['const'],
            endog=df4['avexpr'],
            instruments=df4['logem4']).fit(cov_type='unadjusted')

print(iv.summary)


# ### IV Formula syntax

# In[27]:


from statsmodels.api import add_constant
from linearmodels.datasets import mroz
print(mroz.DESCR)
data = mroz.load()
data = data.dropna()
data = add_constant(data, has_constant='add')
data['lnwage']=np.log(data.wage)


# In[28]:


y = np.log(data.wage).as_matrix()
X = data[['const','educ']].as_matrix()

beta = np.linalg.inv(X.T @ X)@X.T@y
beta


# In[29]:


data.head()


# In[30]:


res_ols = sm.formula.ols('lnwage ~ educ', data=data).fit()
print(res_ols.summary())


# In[31]:


first_stage = sm.OLS(data.educ, 
                 data[['fatheduc','const']]).fit()
print(first_stage.summary())
b_fs = first_stage.params[0]

data['educ_hat'] = first_stage.predict()


# In[32]:


reduced_form = sm.OLS(np.log(data.wage), 
                 data[['fatheduc','const']]).fit()
b_rf = reduced_form.params[0]
print(reduced_form.summary())


# In[33]:


second_stage = sm.OLS(np.log(data.wage), 
                 data[['educ_hat','const']]).fit()
b_ss = second_stage.params[0]
print(second_stage.summary())


# $$
# \beta_{iv} = \frac{cov(y_i,z_i)}{cov(x_i,z_i)}= \frac{\beta_{rf}}{\beta_{fs}} = \frac{cov(y_i,\hat{x_i})}{var(\hat{x_i})}$$

# ### calculate $\beta_{iv}$ in all 3 ways

# In[34]:


iv = IV2SLS.from_formula('lnwage ~ 1 + [educ ~ fatheduc]',
                    data=data).fit(cov_type='unadjusted')
print(iv.summary)


# In[35]:


iv.params[1]


# In[36]:


b_iv = b_rf/b_fs
b_iv


# In[37]:


b_ss


# Interaction term

# In[53]:


sm.ols('lnwage ~ educ + hours * kidsge6 + huseduc', 
       data = data).fit(cov_type='HC1').summary()


# ## Fixed effects ;  Standard Errors (Robust, Clustered, Multiway clustered)

# Verbatim fork of [this repo](https://github.com/vgreg/python-se) for reference. Will likely attempt to implement Conley(1999) in python

# In[51]:


import statsmodels.formula.api as sm
import statsmodels.stats.sandwich_covariance as sw


# In[52]:


from urllib.request import urlopen

with urlopen('http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt') as conn:
    df = pd.read_table(conn, names=['firmid','year','x','y'],
                       delim_whitespace=True)
df.head()


# In[45]:


ols = sm.ols(formula='y ~ x', data=df).fit(use_t=True)
ols.summary()


# White standard errors (` reg y x, robust`)

# In[46]:


robust_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='HC1', use_t=True)
robust_ols.summary()


# Clustered

# In[47]:


cluster_firm_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                        cov_kwds={'groups': df['firmid']},
                                                        use_t=True)
cluster_firm_ols.summary()


# In[48]:


cluster_year_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                        cov_kwds={'groups': df['year']},
                                                        use_t=True)
cluster_year_ols.summary()


# In[49]:


cluster_2ways_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                         cov_kwds={'groups': np.array(df[['firmid', 'year']])},
                                                         use_t=True)
cluster_2ways_ols.summary()


# Fixed effects (Python really needs an equivalent to `areg` / `reghdfe` / `lfe` to absorb estimates)

# In[50]:


year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(use_t=True)
year_fe_ols.summary()


# In[45]:


firm_cluster_year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(cov_type='cluster',
                                                                          cov_kwds={'groups': df['firmid']},
                                                                          use_t=True)
firm_cluster_year_fe_ols.summary()


# In[ ]: