#%%
import os
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from patsy import dmatrices, dmatrix, demo_data
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
%matplotlib inline
# run for jupyter notebook
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'
#%%
Manual Matrix Calculation
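The estimator computed by hand below is the usual least-squares closed form, $\hat{\beta} = (X^{\top}X)^{-1}X^{\top}y$, which is then checked against statsmodels' `OLS`.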
nsample = 100
x = np.linspace(0, 10, nsample)
cons = np.ones([nsample, 1])
X = np.column_stack((x, x**2, cons))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
y = X @ beta + e
# OLS coefficients via the closed-form normal equations
beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y
beta_hat.T
array([1.00810567, 0.09898731, 9.88754309])
beta
array([ 1. , 0.1, 10. ])
beta1 = sm.OLS(y, X).fit()
beta1.params
array([1.00810567, 0.09898731, 9.88754309])
y_hat = beta1.predict()
print(beta1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     2020.
Date:                Thu, 17 May 2018   Prob (F-statistic):           8.98e-80
Time:                        09:57:53   Log-Likelihood:                -132.49
No. Observations:                 100   AIC:                             271.0
Df Residuals:                      97   BIC:                             278.8
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             1.0081      0.126      8.024      0.000       0.759       1.257
x2             0.0990      0.012      8.143      0.000       0.075       0.123
const          9.8875      0.272     36.375      0.000       9.348      10.427
==============================================================================
Omnibus:                        6.947   Durbin-Watson:                   2.022
Prob(Omnibus):                  0.031   Jarque-Bera (JB):                2.970
Skew:                          -0.055   Prob(JB):                        0.226
Kurtosis:                       2.163   Cond. No.                         144.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
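The reported standard errors can be recovered by hand as well. A minimal sketch, assuming homoskedasticity as in the nonrobust summary above:

# Sketch: reproduce the nonrobust 'std err' column by hand,
# using sigma^2_hat * (X'X)^{-1} with the residuals from beta_hat.
resid = y - X @ beta_hat
n, k = X.shape
sigma2_hat = resid @ resid / (n - k)   # unbiased error-variance estimate
se = np.sqrt(np.diag(sigma2_hat * np.linalg.inv(X.T @ X)))
se  # ≈ [0.126, 0.012, 0.272], matching the summary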
plt.plot(y_hat, alpha=0.5, label='predicted')
# Plot observed values
plt.plot(y, alpha=0.5, label='observed')
plt.legend()
plt.show()
df2 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable2.dta')
# Add constant term to dataset
df2['const'] = 1
# Create lists of variables to be used in each regression
X1 = ['const', 'avexpr']
X2 = ['const', 'avexpr', 'lat_abst']
X3 = ['const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other']
# Estimate an OLS regression for each set of variables
reg1 = sm.OLS(df2['logpgp95'], df2[X1], missing='drop').fit(cov_type='HC2')
reg2 = sm.OLS(df2['logpgp95'], df2[X2], missing='drop').fit(cov_type='HC2')
reg3 = sm.OLS(df2['logpgp95'], df2[X3], missing='drop').fit(cov_type='HC2')
from statsmodels.iolib.summary2 import summary_col
info_dict = {'R-squared': lambda x: "{:.2f}".format(x.rsquared),
             'No. observations': lambda x: "{0:d}".format(int(x.nobs))}
results_table = summary_col(results=[reg1, reg2, reg3],
                            float_format='%0.2f',
                            stars=True,
                            model_names=['Model 1',
                                         'Model 3',
                                         'Model 4'],
                            info_dict=info_dict,
                            regressor_order=['const',
                                             'avexpr',
                                             'lat_abst',
                                             'asia',
                                             'africa'])
results_table.add_title('Table 2 - OLS Regressions')
print(results_table)
Table 2 - OLS Regressions
=========================================
                  Model 1  Model 3  Model 4
-----------------------------------------
const             4.63***  4.87***  5.85***
                  (0.24)   (0.28)   (0.30)
avexpr            0.53***  0.46***  0.39***
                  (0.03)   (0.05)   (0.05)
lat_abst                   0.87*    0.33
                           (0.50)   (0.45)
asia                                -0.15
                                    (0.18)
africa                              -0.92***
                                    (0.15)
other                               0.30
                                    (0.20)
R-squared         0.61     0.62     0.72
No. observations  111      111      111
=========================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
df1 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable1.dta')
df1.head()
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163 entries, 0 to 162
Data columns (total 13 columns):
shortnam    163 non-null object
euro1900    154 non-null float32
excolony    162 non-null float32
avexpr      121 non-null float32
logpgp95    148 non-null float32
cons1        88 non-null float32
cons90       88 non-null float32
democ00a     87 non-null float32
cons00a      91 non-null float32
extmort4     87 non-null float32
logem4       87 non-null float32
loghjypl    123 non-null float32
baseco       64 non-null float32
dtypes: float32(12), object(1)
memory usage: 10.2+ KB
plt.style.use('seaborn')
sns.jointplot(x='avexpr', y='logpgp95', data=df1, kind='scatter')
sns.regplot(x='avexpr', y='logpgp95', data=df1)
# Dropping NA's is required to use numpy's polyfit
df1_subset = df1.dropna(subset=['logpgp95', 'avexpr'])
# Use only 'base sample' for plotting purposes
df1_subset = df1_subset[df1_subset['baseco'] == 1]
X = df1_subset['avexpr']
y = df1_subset['logpgp95']
labels = df1_subset['shortnam']
# Replace markers with country labels
plt.scatter(X, y, marker='')
for i, label in enumerate(labels):
    plt.annotate(label, (X.iloc[i], y.iloc[i]))
# Fit a linear trend line
plt.plot(np.unique(X),
         np.poly1d(np.polyfit(X, y, 1))(np.unique(X)),
         color='black')
plt.xlim([3.3,10.5])
plt.ylim([4,10.5])
plt.xlabel('Average Expropriation Risk 1985-95')
plt.ylabel('Log GDP per capita, PPP, 1995')
plt.title('Figure 2: OLS relationship between expropriation risk and income')
plt.show()
reg1 = smf.ols('logpgp95 ~ avexpr', data=df1, missing='drop').fit(cov_type='HC2')
print(reg1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               logpgp95   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.608
Method:                 Least Squares   F-statistic:                     333.0
Date:                Thu, 17 May 2018   Prob (F-statistic):           6.40e-35
Time:                        09:58:50   Log-Likelihood:                -119.71
No. Observations:                 111   AIC:                             243.4
Df Residuals:                     109   BIC:                             248.8
Df Model:                           1
Covariance Type:                  HC2
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.6261      0.241     19.180      0.000       4.153       5.099
avexpr         0.5319      0.029     18.249      0.000       0.475       0.589
==============================================================================
Omnibus:                        9.251   Durbin-Watson:                   1.689
Prob(Omnibus):                  0.010   Jarque-Bera (JB):                9.170
Skew:                          -0.680   Prob(JB):                       0.0102
Kurtosis:                       3.362   Cond. No.                         33.2
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC2)
# Drop missing observations from whole sample
df1_plot = df1.dropna(subset=['logpgp95', 'avexpr'])
# Plot predicted values
plt.scatter(df1_plot['avexpr'], reg1.predict(), alpha=0.5, label='predicted')
# Plot observed values
plt.scatter(df1_plot['avexpr'], df1_plot['logpgp95'], alpha=0.5, label='observed')
plt.legend()
plt.title('OLS predicted values')
plt.xlabel('avexpr')
plt.ylabel('logpgp95')
plt.show()
# Dropping NA's is required to use numpy's polyfit
df1_subset2 = df1.dropna(subset=['logem4', 'avexpr'])
X = df1_subset2['logem4']
y = df1_subset2['avexpr']
labels = df1_subset2['shortnam']
# Replace markers with country labels
plt.scatter(X, y, marker='')
for i, label in enumerate(labels):
    plt.annotate(label, (X.iloc[i], y.iloc[i]))
# Fit a linear trend line
plt.plot(np.unique(X),
         np.poly1d(np.polyfit(X, y, 1))(np.unique(X)),
         color='black')
# plt.xlim([1.8,8.4])
# plt.ylim([3.3,10.4])
plt.xlabel('Log of Settler Mortality')
plt.ylabel('Average Expropriation Risk 1985-95')
plt.title('Figure 3: First-stage relationship between settler mortality and expropriation risk')
Manual IV Calculation

The IV estimator can be written as the ratio of the reduced form to the first stage, $\beta_{IV} = \beta_{rf} / \beta_{fs}$, where

$$\beta_{fs} = \frac{\mathrm{Cov}(X,Z)}{\mathrm{Var}(Z)} \quad \text{and} \quad \beta_{rf} = \frac{\mathrm{Cov}(Y,Z)}{\mathrm{Var}(Z)}$$

which yields

$$\beta_{IV} = \frac{\mathrm{Cov}(Y,Z)}{\mathrm{Cov}(X,Z)}$$
# Import and select the data
df4 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable4.dta')
df4 = df4[df4['baseco'] == 1].copy()  # copy so the column assignment below avoids a SettingWithCopyWarning
# Add a constant variable
df4['const'] = 1
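As a check on the formulas above, $\beta_{IV}$ can be computed directly from sample covariances. A minimal sketch using the base sample just created (the `dropna` guards against any missing values):

# Sketch: the just-identified IV (Wald) estimator as a ratio of covariances
d = df4[['logpgp95', 'avexpr', 'logem4']].dropna()
b_fs_cov = np.cov(d['avexpr'], d['logem4'])[0, 1] / np.var(d['logem4'], ddof=1)
b_rf_cov = np.cov(d['logpgp95'], d['logem4'])[0, 1] / np.var(d['logem4'], ddof=1)
b_rf_cov / b_fs_cov  # = Cov(Y,Z)/Cov(X,Z); matches the IV2SLS estimate below (≈ 0.9443)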
First Stage
# Fit the first stage regression and print summary
results_fs = sm.OLS(df4['avexpr'],
                    df4[['const', 'logem4']],
                    missing='drop').fit()
print(results_fs.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 avexpr   R-squared:                       0.270
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     22.95
Date:                Thu, 17 May 2018   Prob (F-statistic):           1.08e-05
Time:                        09:59:13   Log-Likelihood:                -104.83
No. Observations:                  64   AIC:                             213.7
Df Residuals:                      62   BIC:                             218.0
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.3414      0.611     15.296      0.000       8.121      10.562
logem4        -0.6068      0.127     -4.790      0.000      -0.860      -0.354
==============================================================================
Omnibus:                        0.035   Durbin-Watson:                   2.003
Prob(Omnibus):                  0.983   Jarque-Bera (JB):                0.172
Skew:                           0.045   Prob(JB):                        0.918
Kurtosis:                       2.763   Cond. No.                         19.4
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
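With a first-stage F-statistic of 22.95, the single instrument comfortably clears the conventional rule-of-thumb threshold of 10 for weak-instrument concerns.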
df4['predicted_avexpr'] = results_fs.predict()
results_ss = sm.OLS(df4['logpgp95'],
                    df4[['const', 'predicted_avexpr']]).fit()
print(results_ss.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               logpgp95   R-squared:                       0.477
Model:                            OLS   Adj. R-squared:                  0.469
Method:                 Least Squares   F-statistic:                     56.60
Date:                Thu, 17 May 2018   Prob (F-statistic):           2.66e-10
Time:                        09:59:13   Log-Likelihood:                -72.268
No. Observations:                  64   AIC:                             148.5
Df Residuals:                      62   BIC:                             152.9
Df Model:                           1
Covariance Type:            nonrobust
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                1.9097      0.823      2.320      0.024       0.264       3.555
predicted_avexpr     0.9443      0.126      7.523      0.000       0.693       1.195
====================================================================================
Omnibus:                       10.547   Durbin-Watson:                   2.137
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.010
Skew:                          -0.790   Prob(JB):                      0.00407
Kurtosis:                       4.277   Cond. No.                         58.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
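The two-step point estimate is correct, but the second-stage standard errors above are not: they treat predicted_avexpr as data and ignore the first-stage sampling error. IV2SLS below computes the correct ones (the avexpr standard error rises from 0.126 to 0.154).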
from linearmodels.iv import IV2SLS
iv = IV2SLS(dependent=df4['logpgp95'],
            exog=df4['const'],
            endog=df4['avexpr'],
            instruments=df4['logem4']).fit(cov_type='unadjusted')
print(iv.summary)
                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:               logpgp95   R-squared:                      0.1870
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1739
No. Observations:                  64   F-statistic:                    37.568
Date:                Thu, May 17 2018   P-value (F-stat)                0.0000
Time:                        09:59:30   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          1.9097     1.0106     1.8897     0.0588     -0.0710      3.8903
avexpr         0.9443     0.1541     6.1293     0.0000      0.6423      1.2462
==============================================================================

Endogenous: avexpr
Instruments: logem4
Unadjusted Covariance (Homoskedastic)
Debiased: False
from statsmodels.api import add_constant
from linearmodels.datasets import mroz
print(mroz.DESCR)
data = mroz.load()
data = data.dropna()
data = add_constant(data, has_constant='add')
data['lnwage'] = np.log(data.wage)
T.A. Mroz (1987), "The Sensitivity of an Empirical Model of Married Women's
Hours of Work to Economic and Statistical Assumptions," Econometrica 55,
765-799.

inlf      1 if in labor force, 1975
hours     hours worked, 1975
kidslt6   # kids < 6 years
kidsge6   # kids 6-18
age       woman's age in yrs
educ      years of schooling
wage      estimated wage from earns., hours
repwage   reported wage at interview in 1976
hushrs    hours worked by husband, 1975
husage    husband's age
huseduc   husband's years of schooling
huswage   husband's hourly wage, 1975
faminc    family income, 1975
mtr       fed. marginal tax rate facing woman
motheduc  mother's years of schooling
fatheduc  father's years of schooling
unem      unem. rate in county of resid.
city      =1 if live in SMSA
exper     actual labor mkt exper
nwifeinc  (faminc - wage*hours)/1000
lwage     log(wage)
expersq   exper^2
# .values instead of the deprecated .as_matrix()
y = np.log(data.wage).values
X = data[['const', 'educ']].values
beta = np.linalg.inv(X.T @ X) @ X.T @ y
beta
array([-0.18519681, 0.10864865])
data.head()
const | inlf | hours | kidslt6 | kidsge6 | age | educ | wage | repwage | hushrs | ... | mtr | motheduc | fatheduc | unem | city | exper | nwifeinc | lwage | expersq | lnwage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1 | 1610 | 1 | 0 | 32 | 12 | 3.3540 | 2.65 | 2708 | ... | 0.7215 | 12 | 7 | 5.0 | 0 | 14 | 10.910060 | 1.210154 | 196 | 1.210154 |
1 | 1.0 | 1 | 1656 | 0 | 2 | 30 | 12 | 1.3889 | 2.65 | 2310 | ... | 0.6615 | 7 | 7 | 11.0 | 1 | 5 | 19.499980 | 0.328512 | 25 | 0.328512 |
2 | 1.0 | 1 | 1980 | 1 | 3 | 35 | 12 | 4.5455 | 4.04 | 3072 | ... | 0.6915 | 12 | 7 | 5.0 | 0 | 15 | 12.039910 | 1.514138 | 225 | 1.514138 |
3 | 1.0 | 1 | 456 | 0 | 3 | 34 | 12 | 1.0965 | 3.25 | 1920 | ... | 0.7815 | 7 | 7 | 5.0 | 0 | 6 | 6.799996 | 0.092123 | 36 | 0.092123 |
4 | 1.0 | 1 | 1568 | 1 | 2 | 31 | 14 | 4.5918 | 3.60 | 2000 | ... | 0.6215 | 12 | 14 | 9.5 | 1 | 7 | 20.100060 | 1.524272 | 49 | 1.524272 |
5 rows × 24 columns
res_ols = smf.ols('lnwage ~ educ', data=data).fit()
print(res_ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 lnwage   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     56.93
Date:                Thu, 17 May 2018   Prob (F-statistic):           2.76e-13
Time:                        09:59:42   Log-Likelihood:                -441.26
No. Observations:                 428   AIC:                             886.5
Df Residuals:                     426   BIC:                             894.6
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1852      0.185     -1.000      0.318      -0.549       0.179
educ           0.1086      0.014      7.545      0.000       0.080       0.137
==============================================================================
Omnibus:                       91.833   Durbin-Watson:                   1.985
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              303.790
Skew:                          -0.956   Prob(JB):                     1.08e-66
Kurtosis:                       6.658   Cond. No.                         72.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
first_stage = sm.OLS(data.educ,
                     data[['fatheduc', 'const']]).fit()
print(first_stage.summary())
b_fs = first_stage.params['fatheduc']  # first-stage coefficient on the instrument
data['educ_hat'] = first_stage.predict()
                            OLS Regression Results
==============================================================================
Dep. Variable:                   educ   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.171
Method:                 Least Squares   F-statistic:                     88.84
Date:                Thu, 17 May 2018   Prob (F-statistic):           2.76e-19
Time:                        09:59:42   Log-Likelihood:                -920.02
No. Observations:                 428   AIC:                             1844.
Df Residuals:                     426   BIC:                             1852.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
fatheduc       0.2694      0.029      9.426      0.000       0.213       0.326
const         10.2371      0.276     37.099      0.000       9.695      10.779
==============================================================================
Omnibus:                        9.919   Durbin-Watson:                   1.919
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               16.651
Skew:                          -0.093   Prob(JB):                     0.000242
Kurtosis:                       3.948   Cond. No.                         26.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
reduced_form = sm.OLS(np.log(data.wage),
                      data[['fatheduc', 'const']]).fit()
b_rf = reduced_form.params['fatheduc']  # reduced-form coefficient on the instrument
print(reduced_form.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.586
Date:                Thu, 17 May 2018   Prob (F-statistic):              0.109
Time:                        09:59:43   Log-Likelihood:                -466.81
No. Observations:                 428   AIC:                             937.6
Df Residuals:                     426   BIC:                             945.7
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
fatheduc       0.0159      0.010      1.608      0.109      -0.004       0.035
const          1.0469      0.096     10.939      0.000       0.859       1.235
==============================================================================
Omnibus:                       63.908   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              167.384
Skew:                          -0.730   Prob(JB):                     4.50e-37
Kurtosis:                       5.693   Cond. No.                         26.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
second_stage = sm.OLS(np.log(data.wage),
                      data[['educ_hat', 'const']]).fit()
b_ss = second_stage.params['educ_hat']  # 2SLS coefficient on predicted schooling
print(second_stage.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.586
Date:                Thu, 17 May 2018   Prob (F-statistic):              0.109
Time:                        09:59:44   Log-Likelihood:                -466.81
No. Observations:                 428   AIC:                             937.6
Df Residuals:                     426   BIC:                             945.7
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
educ_hat       0.0592      0.037      1.608      0.109      -0.013       0.131
const          0.4411      0.467      0.944      0.346      -0.477       1.359
==============================================================================
Omnibus:                       63.908   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              167.384
Skew:                          -0.730   Prob(JB):                     4.50e-37
Kurtosis:                       5.693   Cond. No.                         171.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
iv = IV2SLS.from_formula('lnwage ~ 1 + [educ ~ fatheduc]',
                         data=data).fit(cov_type='unadjusted')
print(iv.summary)
                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                 lnwage   R-squared:                      0.0934
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0913
No. Observations:                 428   F-statistic:                    2.8487
Date:                Thu, May 17 2018   P-value (F-stat)                0.0914
Time:                        09:59:51   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.4411     0.4451     0.9911     0.3216     -0.4312      1.3134
educ           0.0592     0.0351     1.6878     0.0914     -0.0095      0.1279
==============================================================================

Endogenous: educ
Instruments: fatheduc
Unadjusted Covariance (Homoskedastic)
Debiased: False
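Note that the IV estimate of the return to schooling (0.059 per year) is roughly half the OLS estimate (0.109) from above.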
iv.params['educ']
0.05917348053415239
b_iv = b_rf/b_fs
b_iv
0.05917348053415318
b_ss
0.0591734805341532
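All three routes to the IV estimate agree up to floating-point error: the ratio of the reduced form to the first stage (b_iv), the manual second stage (b_ss), and IV2SLS.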
Interaction term
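In patsy formulas, `hours * kidsge6` expands to `hours + kidsge6 + hours:kidsge6`, so both main effects enter alongside the interaction.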
smf.ols('lnwage ~ educ + hours * kidsge6 + huseduc',
        data=data).fit(cov_type='HC1').summary()
Dep. Variable: | lnwage | R-squared: | 0.129 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.119 |
Method: | Least Squares | F-statistic: | 14.57 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 3.52e-13 |
Time: | 10:04:23 | Log-Likelihood: | -438.57 |
No. Observations: | 428 | AIC: | 889.1 |
Df Residuals: | 422 | BIC: | 913.5 |
Df Model: | 5 | ||
Covariance Type: | HC1 |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -0.0295 | 0.210 | -0.141 | 0.888 | -0.440 | 0.381 |
educ | 0.1184 | 0.016 | 7.575 | 0.000 | 0.088 | 0.149 |
hours | -5.442e-06 | 7.25e-05 | -0.075 | 0.940 | -0.000 | 0.000 |
kidsge6 | -0.0425 | 0.057 | -0.748 | 0.455 | -0.154 | 0.069 |
hours:kidsge6 | -6.169e-06 | 4.16e-05 | -0.148 | 0.882 | -8.77e-05 | 7.54e-05 |
huseduc | -0.0163 | 0.012 | -1.323 | 0.186 | -0.040 | 0.008 |
Omnibus: | 88.303 | Durbin-Watson: | 1.990 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 289.969 |
Skew: | -0.921 | Prob(JB): | 1.08e-63 |
Kurtosis: | 6.587 | Cond. No. | 1.79e+04 |
Verbatim fork of this repo for reference. I will likely attempt to implement Conley (1999) standard errors in Python.
import statsmodels.formula.api as sm  # NB: rebinds `sm` from statsmodels.api to the formula API
import statsmodels.stats.sandwich_covariance as sw
from urllib.request import urlopen

with urlopen('http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt') as conn:
    df = pd.read_table(conn, names=['firmid', 'year', 'x', 'y'],
                       delim_whitespace=True)
df.head()
firmid | year | x | y | |
---|---|---|---|---|
0 | 1 | 1 | -1.113973 | 2.251535 |
1 | 1 | 2 | -0.080854 | 1.242346 |
2 | 1 | 3 | -0.237607 | -1.426376 |
3 | 1 | 4 | -0.152486 | -1.109394 |
4 | 1 | 5 | -0.001426 | 0.914686 |
ols = sm.ols(formula='y ~ x', data=df).fit(use_t=True)
ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 1311. |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 4.25e-255 |
Time: | 10:00:57 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.028 | 1.047 | 0.295 | -0.026 | 0.085 |
x | 1.0348 | 0.029 | 36.204 | 0.000 | 0.979 | 1.091 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
White standard errors (`reg y x, robust` in Stata)
robust_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='HC1', use_t=True)
robust_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 1328. |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 4.29e-258 |
Time: | 10:01:00 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | HC1 |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.028 | 1.047 | 0.295 | -0.026 | 0.085 |
x | 1.0348 | 0.028 | 36.444 | 0.000 | 0.979 | 1.091 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
Clustered standard errors
cluster_firm_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                        cov_kwds={'groups': df['firmid']},
                                                        use_t=True)
cluster_firm_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 418.3 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 5.61e-68 |
Time: | 10:01:01 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.067 | 0.443 | 0.658 | -0.102 | 0.161 |
x | 1.0348 | 0.051 | 20.453 | 0.000 | 0.935 | 1.134 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
cluster_year_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                        cov_kwds={'groups': df['year']},
                                                        use_t=True)
cluster_year_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 960.6 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 1.86e-10 |
Time: | 10:01:06 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.023 | 1.269 | 0.236 | -0.023 | 0.083 |
x | 1.0348 | 0.033 | 30.993 | 0.000 | 0.959 | 1.110 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
cluster_2ways_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                         cov_kwds={'groups': np.array(df[['firmid', 'year']])},
                                                         use_t=True)
cluster_2ways_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 373.3 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 1.23e-08 |
Time: | 10:01:10 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.065 | 0.456 | 0.659 | -0.118 | 0.177 |
x | 1.0348 | 0.054 | 19.322 | 0.000 | 0.914 | 1.156 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
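The `sw` module imported with this section is never used above. As a sketch, the same two-way clustered covariance can be computed directly from the plain `ols` fit with `cov_cluster_2groups`; treat the small-sample correction as an assumption, so the numbers may differ slightly from the `cov_type='cluster'` results above.

# Sketch: two-way (firm and year) clustered covariance via the
# sandwich_covariance module imported above, applied to the plain OLS fit.
cov_both, cov_firm, cov_year = sw.cov_cluster_2groups(ols, df['firmid'],
                                                      df['year'])
np.sqrt(np.diag(cov_both))  # should be close to the two-way clustered std errs above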
Fixed effects (Python really needs an equivalent to `areg` / `reghdfe` / `lfe` to absorb fixed effects)
year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(use_t=True)
year_fe_ols.summary()
Dep. Variable: | y | R-squared: | 0.209 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.207 |
Method: | Least Squares | F-statistic: | 131.6 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 7.13e-245 |
Time: | 10:01:15 | Log-Likelihood: | -10570. |
No. Observations: | 5000 | AIC: | 2.116e+04 |
Df Residuals: | 4989 | BIC: | 2.123e+04 |
Df Model: | 10 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.1411 | 0.090 | 1.573 | 0.116 | -0.035 | 0.317 |
C(year)[T.2] | -0.0119 | 0.127 | -0.094 | 0.925 | -0.261 | 0.237 |
C(year)[T.3] | -0.1453 | 0.127 | -1.145 | 0.252 | -0.394 | 0.103 |
C(year)[T.4] | -0.2038 | 0.127 | -1.607 | 0.108 | -0.452 | 0.045 |
C(year)[T.5] | -0.0604 | 0.127 | -0.476 | 0.634 | -0.309 | 0.188 |
C(year)[T.6] | -0.1312 | 0.127 | -1.034 | 0.301 | -0.380 | 0.117 |
C(year)[T.7] | -0.1975 | 0.127 | -1.557 | 0.120 | -0.446 | 0.051 |
C(year)[T.8] | -0.1555 | 0.127 | -1.225 | 0.220 | -0.404 | 0.093 |
C(year)[T.9] | -0.1535 | 0.127 | -1.210 | 0.226 | -0.402 | 0.095 |
C(year)[T.10] | -0.0556 | 0.127 | -0.438 | 0.661 | -0.304 | 0.193 |
x | 1.0351 | 0.029 | 36.160 | 0.000 | 0.979 | 1.091 |
Omnibus: | 4.804 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.091 | Jarque-Bera (JB): | 4.752 |
Skew: | 0.069 | Prob(JB): | 0.0929 |
Kurtosis: | 3.061 | Cond. No. | 10.9 |
firm_cluster_year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(cov_type='cluster',
                                                                          cov_kwds={'groups': df['firmid']},
                                                                          use_t=True)
firm_cluster_year_fe_ols.summary()
Dep. Variable: | y | R-squared: | 0.209 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.207 |
Method: | Least Squares | F-statistic: | 43.23 |
Date: | Tue, 08 May 2018 | Prob (F-statistic): | 1.93e-61 |
Time: | 16:03:02 | Log-Likelihood: | -10570. |
No. Observations: | 5000 | AIC: | 2.116e+04 |
Df Residuals: | 4989 | BIC: | 2.123e+04 |
Df Model: | 10 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.1411 | 0.089 | 1.587 | 0.113 | -0.034 | 0.316 |
C(year)[T.2] | -0.0119 | 0.086 | -0.139 | 0.890 | -0.181 | 0.157 |
C(year)[T.3] | -0.1453 | 0.086 | -1.690 | 0.092 | -0.314 | 0.024 |
C(year)[T.4] | -0.2038 | 0.089 | -2.288 | 0.023 | -0.379 | -0.029 |
C(year)[T.5] | -0.0604 | 0.087 | -0.697 | 0.486 | -0.231 | 0.110 |
C(year)[T.6] | -0.1312 | 0.084 | -1.562 | 0.119 | -0.296 | 0.034 |
C(year)[T.7] | -0.1975 | 0.087 | -2.275 | 0.023 | -0.368 | -0.027 |
C(year)[T.8] | -0.1555 | 0.094 | -1.662 | 0.097 | -0.339 | 0.028 |
C(year)[T.9] | -0.1535 | 0.088 | -1.752 | 0.080 | -0.326 | 0.019 |
C(year)[T.10] | -0.0556 | 0.088 | -0.634 | 0.526 | -0.228 | 0.117 |
x | 1.0351 | 0.051 | 20.361 | 0.000 | 0.935 | 1.135 |
Omnibus: | 4.804 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.091 | Jarque-Bera (JB): | 4.752 |
Skew: | 0.069 | Prob(JB): | 0.0929 |
Kurtosis: | 3.061 | Cond. No. | 10.9 |
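In the absence of an absorb option, one stopgap is the within transformation: demean the variables by group and run OLS without the dummies. A minimal sketch for a single fixed effect (the slope matches the `C(year)` regression above; the naive standard errors here ignore the degrees of freedom absorbed by the demeaning):

# Sketch: absorb the year fixed effect by demeaning within year, instead of
# expanding C(year) dummies. By Frisch-Waugh the slope matches year_fe_ols.
dm = df.copy()
for v in ['y', 'x']:
    dm[v] = dm[v] - dm.groupby('year')[v].transform('mean')
within = sm.ols(formula='y ~ x - 1', data=dm).fit()  # no constant after demeaning
within.params  # x ≈ 1.0351, as in the dummy-variable regression above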