#%%
import os
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from patsy import dmatrices, dmatrix, demo_data
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
%matplotlib inline
# run for jupyter notebook
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'
#%%
Manual Matrix Calculation
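The estimator computed by hand below is the usual least-squares closed form, $\hat{\beta} = (X^{\top}X)^{-1}X^{\top}y$, which is then checked against statsmodels' `OLS`.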
nsample = 100
x = np.linspace(0, 10, nsample)
cons = np.ones([nsample, 1])
X = np.column_stack((x, x**2, cons))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
y = X @ beta + e
# OLS coefficients via the closed-form normal equations
beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y
beta_hat.T
array([1.00810567, 0.09898731, 9.88754309])
beta
array([ 1. , 0.1, 10. ])
beta1 = sm.OLS(y, X).fit()
beta1.params
array([1.00810567, 0.09898731, 9.88754309])
y_hat = beta1.predict()
print(beta1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     2020.
Date:                Thu, 17 May 2018   Prob (F-statistic):           8.98e-80
Time:                        09:57:53   Log-Likelihood:                -132.49
No. Observations:                 100   AIC:                             271.0
Df Residuals:                      97   BIC:                             278.8
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             1.0081      0.126      8.024      0.000       0.759       1.257
x2             0.0990      0.012      8.143      0.000       0.075       0.123
const          9.8875      0.272     36.375      0.000       9.348      10.427
==============================================================================
Omnibus:                        6.947   Durbin-Watson:                   2.022
Prob(Omnibus):                  0.031   Jarque-Bera (JB):                2.970
Skew:                          -0.055   Prob(JB):                        0.226
Kurtosis:                       2.163   Cond. No.                         144.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
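The reported standard errors can be recovered by hand as well. A minimal sketch, assuming homoskedasticity as in the nonrobust summary above:

# Sketch: reproduce the nonrobust 'std err' column by hand,
# using sigma^2_hat * (X'X)^{-1} with the residuals from beta_hat.
resid = y - X @ beta_hat
n, k = X.shape
sigma2_hat = resid @ resid / (n - k)   # unbiased error-variance estimate
se = np.sqrt(np.diag(sigma2_hat * np.linalg.inv(X.T @ X)))
se  # ≈ [0.126, 0.012, 0.272], matching the summary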
plt.plot(y_hat, alpha=0.5, label='predicted')
# Plot observed values
plt.plot(y, alpha=0.5, label='observed')
plt.legend()
plt.show()
df2 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable2.dta')
# Add constant term to dataset
df2['const'] = 1
# Create lists of variables to be used in each regression
X1 = ['const', 'avexpr']
X2 = ['const', 'avexpr', 'lat_abst']
X3 = ['const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other']
# Estimate an OLS regression for each set of variables
reg1 = sm.OLS(df2['logpgp95'], df2[X1], missing='drop').fit(cov_type='HC2')
reg2 = sm.OLS(df2['logpgp95'], df2[X2], missing='drop').fit(cov_type='HC2')
reg3 = sm.OLS(df2['logpgp95'], df2[X3], missing='drop').fit(cov_type='HC2')
from statsmodels.iolib.summary2 import summary_col
info_dict = {'R-squared': lambda x: "{:.2f}".format(x.rsquared),
             'No. observations': lambda x: "{0:d}".format(int(x.nobs))}
results_table = summary_col(results=[reg1, reg2, reg3],
                            float_format='%0.2f',
                            stars=True,
                            model_names=['Model 1',
                                         'Model 3',
                                         'Model 4'],
                            info_dict=info_dict,
                            regressor_order=['const',
                                             'avexpr',
                                             'lat_abst',
                                             'asia',
                                             'africa'])
results_table.add_title('Table 2 - OLS Regressions')
print(results_table)
Table 2 - OLS Regressions
=========================================
                  Model 1  Model 3  Model 4
-----------------------------------------
const             4.63***  4.87***  5.85***
                  (0.24)   (0.28)   (0.30)
avexpr            0.53***  0.46***  0.39***
                  (0.03)   (0.05)   (0.05)
lat_abst                   0.87*    0.33
                           (0.50)   (0.45)
asia                                -0.15
                                    (0.18)
africa                              -0.92***
                                    (0.15)
other                               0.30
                                    (0.20)
R-squared         0.61     0.62     0.72
No. observations  111      111      111
=========================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
df1 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable1.dta')
df1.head()
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163 entries, 0 to 162
Data columns (total 13 columns):
shortnam    163 non-null object
euro1900    154 non-null float32
excolony    162 non-null float32
avexpr      121 non-null float32
logpgp95    148 non-null float32
cons1        88 non-null float32
cons90       88 non-null float32
democ00a     87 non-null float32
cons00a      91 non-null float32
extmort4     87 non-null float32
logem4       87 non-null float32
loghjypl    123 non-null float32
baseco       64 non-null float32
dtypes: float32(12), object(1)
memory usage: 10.2+ KB
plt.style.use('seaborn')
sns.jointplot(x='avexpr', y='logpgp95', data=df1, kind='scatter')
sns.regplot(x='avexpr', y='logpgp95', data=df1)
# Dropping NA's is required to use numpy's polyfit
df1_subset = df1.dropna(subset=['logpgp95', 'avexpr'])
# Use only 'base sample' for plotting purposes
df1_subset = df1_subset[df1_subset['baseco'] == 1]
X = df1_subset['avexpr']
y = df1_subset['logpgp95']
labels = df1_subset['shortnam']
# Replace markers with country labels
plt.scatter(X, y, marker='')
for i, label in enumerate(labels):
    plt.annotate(label, (X.iloc[i], y.iloc[i]))
# Fit a linear trend line
plt.plot(np.unique(X),
         np.poly1d(np.polyfit(X, y, 1))(np.unique(X)),
         color='black')
plt.xlim([3.3,10.5])
plt.ylim([4,10.5])
plt.xlabel('Average Expropriation Risk 1985-95')
plt.ylabel('Log GDP per capita, PPP, 1995')
plt.title('Figure 2: OLS relationship between expropriation risk and income')
plt.show()
reg1 = smf.ols('logpgp95 ~ avexpr', data=df1, missing='drop').fit(cov_type='HC2')
print(reg1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               logpgp95   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.608
Method:                 Least Squares   F-statistic:                     333.0
Date:                Thu, 17 May 2018   Prob (F-statistic):           6.40e-35
Time:                        09:58:50   Log-Likelihood:                -119.71
No. Observations:                 111   AIC:                             243.4
Df Residuals:                     109   BIC:                             248.8
Df Model:                           1
Covariance Type:                  HC2
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.6261      0.241     19.180      0.000       4.153       5.099
avexpr         0.5319      0.029     18.249      0.000       0.475       0.589
==============================================================================
Omnibus:                        9.251   Durbin-Watson:                   1.689
Prob(Omnibus):                  0.010   Jarque-Bera (JB):                9.170
Skew:                          -0.680   Prob(JB):                       0.0102
Kurtosis:                       3.362   Cond. No.                         33.2
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC2)
# Drop missing observations from whole sample
df1_plot = df1.dropna(subset=['logpgp95', 'avexpr'])
# Plot predicted values
plt.scatter(df1_plot['avexpr'], reg1.predict(), alpha=0.5, label='predicted')
# Plot observed values
plt.scatter(df1_plot['avexpr'], df1_plot['logpgp95'], alpha=0.5, label='observed')
plt.legend()
plt.title('OLS predicted values')
plt.xlabel('avexpr')
plt.ylabel('logpgp95')
plt.show()
# Dropping NA's is required to use numpy's polyfit
df1_subset2 = df1.dropna(subset=['logem4', 'avexpr'])
X = df1_subset2['logem4']
y = df1_subset2['avexpr']
labels = df1_subset2['shortnam']
# Replace markers with country labels
plt.scatter(X, y, marker='')
for i, label in enumerate(labels):
    plt.annotate(label, (X.iloc[i], y.iloc[i]))
# Fit a linear trend line
plt.plot(np.unique(X),
         np.poly1d(np.polyfit(X, y, 1))(np.unique(X)),
         color='black')
# plt.xlim([1.8,8.4])
# plt.ylim([3.3,10.4])
plt.xlabel('Log of Settler Mortality')
plt.ylabel('Average Expropriation Risk 1985-95')
plt.title('Figure 3: First-stage relationship between settler mortality and expropriation risk')
Manual IV Calculation

The IV estimator can be written as the ratio of the reduced form to the first stage, $\beta_{IV} = \beta_{rf} / \beta_{fs}$, where

$$\beta_{fs} = \frac{\mathrm{Cov}(X,Z)}{\mathrm{Var}(Z)} \quad \text{and} \quad \beta_{rf} = \frac{\mathrm{Cov}(Y,Z)}{\mathrm{Var}(Z)}$$

which yields

$$\beta_{IV} = \frac{\mathrm{Cov}(Y,Z)}{\mathrm{Cov}(X,Z)}$$
# Import and select the data
df4 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable4.dta')
df4 = df4[df4['baseco'] == 1].copy()  # copy so the column assignment below avoids a SettingWithCopyWarning
# Add a constant variable
df4['const'] = 1
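As a check on the formulas above, $\beta_{IV}$ can be computed directly from sample covariances. A minimal sketch using the base sample just created (the `dropna` guards against any missing values):

# Sketch: the just-identified IV (Wald) estimator as a ratio of covariances
d = df4[['logpgp95', 'avexpr', 'logem4']].dropna()
b_fs_cov = np.cov(d['avexpr'], d['logem4'])[0, 1] / np.var(d['logem4'], ddof=1)
b_rf_cov = np.cov(d['logpgp95'], d['logem4'])[0, 1] / np.var(d['logem4'], ddof=1)
b_rf_cov / b_fs_cov  # = Cov(Y,Z)/Cov(X,Z); matches the IV2SLS estimate below (≈ 0.9443)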
First Stage
# Fit the first stage regression and print summary
results_fs = sm.OLS(df4['avexpr'],
                    df4[['const', 'logem4']],
                    missing='drop').fit()
print(results_fs.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 avexpr   R-squared:                       0.270
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     22.95
Date:                Thu, 17 May 2018   Prob (F-statistic):           1.08e-05
Time:                        09:59:13   Log-Likelihood:                -104.83
No. Observations:                  64   AIC:                             213.7
Df Residuals:                      62   BIC:                             218.0
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.3414      0.611     15.296      0.000       8.121      10.562
logem4        -0.6068      0.127     -4.790      0.000      -0.860      -0.354
==============================================================================
Omnibus:                        0.035   Durbin-Watson:                   2.003
Prob(Omnibus):                  0.983   Jarque-Bera (JB):                0.172
Skew:                           0.045   Prob(JB):                        0.918
Kurtosis:                       2.763   Cond. No.                         19.4
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
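With a first-stage F-statistic of 22.95, the single instrument comfortably clears the conventional rule-of-thumb threshold of 10 for weak-instrument concerns.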
df4['predicted_avexpr'] = results_fs.predict()
results_ss = sm.OLS(df4['logpgp95'],
                    df4[['const', 'predicted_avexpr']]).fit()
print(results_ss.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               logpgp95   R-squared:                       0.477
Model:                            OLS   Adj. R-squared:                  0.469
Method:                 Least Squares   F-statistic:                     56.60
Date:                Thu, 17 May 2018   Prob (F-statistic):           2.66e-10
Time:                        09:59:13   Log-Likelihood:                -72.268
No. Observations:                  64   AIC:                             148.5
Df Residuals:                      62   BIC:                             152.9
Df Model:                           1
Covariance Type:            nonrobust
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                1.9097      0.823      2.320      0.024       0.264       3.555
predicted_avexpr     0.9443      0.126      7.523      0.000       0.693       1.195
====================================================================================
Omnibus:                       10.547   Durbin-Watson:                   2.137
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.010
Skew:                          -0.790   Prob(JB):                      0.00407
Kurtosis:                       4.277   Cond. No.                         58.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
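The two-step point estimate is correct, but the second-stage standard errors above are not: they treat predicted_avexpr as data and ignore the first-stage sampling error. IV2SLS below computes the correct ones (the avexpr standard error rises from 0.126 to 0.154).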
from linearmodels.iv import IV2SLS
iv = IV2SLS(dependent=df4['logpgp95'],
            exog=df4['const'],
            endog=df4['avexpr'],
            instruments=df4['logem4']).fit(cov_type='unadjusted')
print(iv.summary)
                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:               logpgp95   R-squared:                      0.1870
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1739
No. Observations:                  64   F-statistic:                    37.568
Date:                Thu, May 17 2018   P-value (F-stat)                0.0000
Time:                        09:59:30   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          1.9097     1.0106     1.8897     0.0588     -0.0710      3.8903
avexpr         0.9443     0.1541     6.1293     0.0000      0.6423      1.2462
==============================================================================

Endogenous: avexpr
Instruments: logem4
Unadjusted Covariance (Homoskedastic)
Debiased: False
from statsmodels.api import add_constant
from linearmodels.datasets import mroz
print(mroz.DESCR)
data = mroz.load()
data = data.dropna()
data = add_constant(data, has_constant='add')
data['lnwage'] = np.log(data.wage)
T.A. Mroz (1987), "The Sensitivity of an Empirical Model of Married Women's
Hours of Work to Economic and Statistical Assumptions," Econometrica 55,
765-799.

inlf      1 if in labor force, 1975
hours     hours worked, 1975
kidslt6   # kids < 6 years
kidsge6   # kids 6-18
age       woman's age in yrs
educ      years of schooling
wage      estimated wage from earns., hours
repwage   reported wage at interview in 1976
hushrs    hours worked by husband, 1975
husage    husband's age
huseduc   husband's years of schooling
huswage   husband's hourly wage, 1975
faminc    family income, 1975
mtr       fed. marginal tax rate facing woman
motheduc  mother's years of schooling
fatheduc  father's years of schooling
unem      unem. rate in county of resid.
city      =1 if live in SMSA
exper     actual labor mkt exper
nwifeinc  (faminc - wage*hours)/1000
lwage     log(wage)
expersq   exper^2
# .values instead of the deprecated .as_matrix()
y = np.log(data.wage).values
X = data[['const', 'educ']].values
beta = np.linalg.inv(X.T @ X) @ X.T @ y
beta
array([-0.18519681, 0.10864865])
data.head()
const | inlf | hours | kidslt6 | kidsge6 | age | educ | wage | repwage | hushrs | ... | mtr | motheduc | fatheduc | unem | city | exper | nwifeinc | lwage | expersq | lnwage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1 | 1610 | 1 | 0 | 32 | 12 | 3.3540 | 2.65 | 2708 | ... | 0.7215 | 12 | 7 | 5.0 | 0 | 14 | 10.910060 | 1.210154 | 196 | 1.210154 |
1 | 1.0 | 1 | 1656 | 0 | 2 | 30 | 12 | 1.3889 | 2.65 | 2310 | ... | 0.6615 | 7 | 7 | 11.0 | 1 | 5 | 19.499980 | 0.328512 | 25 | 0.328512 |
2 | 1.0 | 1 | 1980 | 1 | 3 | 35 | 12 | 4.5455 | 4.04 | 3072 | ... | 0.6915 | 12 | 7 | 5.0 | 0 | 15 | 12.039910 | 1.514138 | 225 | 1.514138 |
3 | 1.0 | 1 | 456 | 0 | 3 | 34 | 12 | 1.0965 | 3.25 | 1920 | ... | 0.7815 | 7 | 7 | 5.0 | 0 | 6 | 6.799996 | 0.092123 | 36 | 0.092123 |
4 | 1.0 | 1 | 1568 | 1 | 2 | 31 | 14 | 4.5918 | 3.60 | 2000 | ... | 0.6215 | 12 | 14 | 9.5 | 1 | 7 | 20.100060 | 1.524272 | 49 | 1.524272 |
5 rows × 24 columns
res_ols = smf.ols('lnwage ~ educ', data=data).fit()
print(res_ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 lnwage   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     56.93
Date:                Thu, 17 May 2018   Prob (F-statistic):           2.76e-13
Time:                        09:59:42   Log-Likelihood:                -441.26
No. Observations:                 428   AIC:                             886.5
Df Residuals:                     426   BIC:                             894.6
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1852      0.185     -1.000      0.318      -0.549       0.179
educ           0.1086      0.014      7.545      0.000       0.080       0.137
==============================================================================
Omnibus:                       91.833   Durbin-Watson:                   1.985
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              303.790
Skew:                          -0.956   Prob(JB):                     1.08e-66
Kurtosis:                       6.658   Cond. No.                         72.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
first_stage = sm.OLS(data.educ,
                     data[['fatheduc', 'const']]).fit()
print(first_stage.summary())
b_fs = first_stage.params['fatheduc']  # first-stage coefficient on the instrument
data['educ_hat'] = first_stage.predict()
                            OLS Regression Results
==============================================================================
Dep. Variable:                   educ   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.171
Method:                 Least Squares   F-statistic:                     88.84
Date:                Thu, 17 May 2018   Prob (F-statistic):           2.76e-19
Time:                        09:59:42   Log-Likelihood:                -920.02
No. Observations:                 428   AIC:                             1844.
Df Residuals:                     426   BIC:                             1852.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
fatheduc       0.2694      0.029      9.426      0.000       0.213       0.326
const         10.2371      0.276     37.099      0.000       9.695      10.779
==============================================================================
Omnibus:                        9.919   Durbin-Watson:                   1.919
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               16.651
Skew:                          -0.093   Prob(JB):                     0.000242
Kurtosis:                       3.948   Cond. No.                         26.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
reduced_form = sm.OLS(np.log(data.wage),
                      data[['fatheduc', 'const']]).fit()
b_rf = reduced_form.params['fatheduc']  # reduced-form coefficient on the instrument
print(reduced_form.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.586
Date:                Thu, 17 May 2018   Prob (F-statistic):              0.109
Time:                        09:59:43   Log-Likelihood:                -466.81
No. Observations:                 428   AIC:                             937.6
Df Residuals:                     426   BIC:                             945.7
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
fatheduc       0.0159      0.010      1.608      0.109      -0.004       0.035
const          1.0469      0.096     10.939      0.000       0.859       1.235
==============================================================================
Omnibus:                       63.908   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              167.384
Skew:                          -0.730   Prob(JB):                     4.50e-37
Kurtosis:                       5.693   Cond. No.                         26.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
second_stage = sm.OLS(np.log(data.wage),
                      data[['educ_hat', 'const']]).fit()
b_ss = second_stage.params['educ_hat']  # 2SLS coefficient on predicted schooling
print(second_stage.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.586
Date:                Thu, 17 May 2018   Prob (F-statistic):              0.109
Time:                        09:59:44   Log-Likelihood:                -466.81
No. Observations:                 428   AIC:                             937.6
Df Residuals:                     426   BIC:                             945.7
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
educ_hat       0.0592      0.037      1.608      0.109      -0.013       0.131
const          0.4411      0.467      0.944      0.346      -0.477       1.359
==============================================================================
Omnibus:                       63.908   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              167.384
Skew:                          -0.730   Prob(JB):                     4.50e-37
Kurtosis:                       5.693   Cond. No.                         171.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
iv = IV2SLS.from_formula('lnwage ~ 1 + [educ ~ fatheduc]',
                         data=data).fit(cov_type='unadjusted')
print(iv.summary)
                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                 lnwage   R-squared:                      0.0934
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0913
No. Observations:                 428   F-statistic:                    2.8487
Date:                Thu, May 17 2018   P-value (F-stat)                0.0914
Time:                        09:59:51   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.4411     0.4451     0.9911     0.3216     -0.4312      1.3134
educ           0.0592     0.0351     1.6878     0.0914     -0.0095      0.1279
==============================================================================

Endogenous: educ
Instruments: fatheduc
Unadjusted Covariance (Homoskedastic)
Debiased: False
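Note that the IV estimate of the return to schooling (0.059 per year) is roughly half the OLS estimate (0.109) from above.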
iv.params['educ']
0.05917348053415239
b_iv = b_rf/b_fs
b_iv
0.05917348053415318
b_ss
0.0591734805341532
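All three routes to the IV estimate agree up to floating-point error: the ratio of the reduced form to the first stage (b_iv), the manual second stage (b_ss), and IV2SLS.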
Interaction term
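In patsy formulas, `hours * kidsge6` expands to `hours + kidsge6 + hours:kidsge6`, so both main effects enter alongside the interaction.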
smf.ols('lnwage ~ educ + hours * kidsge6 + huseduc',
        data=data).fit(cov_type='HC1').summary()
Dep. Variable: | lnwage | R-squared: | 0.129 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.119 |
Method: | Least Squares | F-statistic: | 14.57 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 3.52e-13 |
Time: | 10:04:23 | Log-Likelihood: | -438.57 |
No. Observations: | 428 | AIC: | 889.1 |
Df Residuals: | 422 | BIC: | 913.5 |
Df Model: | 5 | ||
Covariance Type: | HC1 |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -0.0295 | 0.210 | -0.141 | 0.888 | -0.440 | 0.381 |
educ | 0.1184 | 0.016 | 7.575 | 0.000 | 0.088 | 0.149 |
hours | -5.442e-06 | 7.25e-05 | -0.075 | 0.940 | -0.000 | 0.000 |
kidsge6 | -0.0425 | 0.057 | -0.748 | 0.455 | -0.154 | 0.069 |
hours:kidsge6 | -6.169e-06 | 4.16e-05 | -0.148 | 0.882 | -8.77e-05 | 7.54e-05 |
huseduc | -0.0163 | 0.012 | -1.323 | 0.186 | -0.040 | 0.008 |
Omnibus: | 88.303 | Durbin-Watson: | 1.990 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 289.969 |
Skew: | -0.921 | Prob(JB): | 1.08e-63 |
Kurtosis: | 6.587 | Cond. No. | 1.79e+04 |
Verbatim fork of this repo for reference. I will likely attempt to implement Conley (1999) standard errors in Python.
import statsmodels.formula.api as sm  # NB: rebinds `sm` from statsmodels.api to the formula API
import statsmodels.stats.sandwich_covariance as sw
from urllib.request import urlopen

with urlopen('http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt') as conn:
    df = pd.read_table(conn, names=['firmid', 'year', 'x', 'y'],
                       delim_whitespace=True)
df.head()
firmid | year | x | y | |
---|---|---|---|---|
0 | 1 | 1 | -1.113973 | 2.251535 |
1 | 1 | 2 | -0.080854 | 1.242346 |
2 | 1 | 3 | -0.237607 | -1.426376 |
3 | 1 | 4 | -0.152486 | -1.109394 |
4 | 1 | 5 | -0.001426 | 0.914686 |
ols = sm.ols(formula='y ~ x', data=df).fit(use_t=True)
ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 1311. |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 4.25e-255 |
Time: | 10:00:57 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.028 | 1.047 | 0.295 | -0.026 | 0.085 |
x | 1.0348 | 0.029 | 36.204 | 0.000 | 0.979 | 1.091 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
White standard errors (`reg y x, robust` in Stata)
robust_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='HC1', use_t=True)
robust_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 1328. |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 4.29e-258 |
Time: | 10:01:00 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | HC1 |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.028 | 1.047 | 0.295 | -0.026 | 0.085 |
x | 1.0348 | 0.028 | 36.444 | 0.000 | 0.979 | 1.091 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
Clustered standard errors
cluster_firm_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                        cov_kwds={'groups': df['firmid']},
                                                        use_t=True)
cluster_firm_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 418.3 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 5.61e-68 |
Time: | 10:01:01 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.067 | 0.443 | 0.658 | -0.102 | 0.161 |
x | 1.0348 | 0.051 | 20.453 | 0.000 | 0.935 | 1.134 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
cluster_year_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                        cov_kwds={'groups': df['year']},
                                                        use_t=True)
cluster_year_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 960.6 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 1.86e-10 |
Time: | 10:01:06 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.023 | 1.269 | 0.236 | -0.023 | 0.083 |
x | 1.0348 | 0.033 | 30.993 | 0.000 | 0.959 | 1.110 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
cluster_2ways_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster',
                                                         cov_kwds={'groups': np.array(df[['firmid', 'year']])},
                                                         use_t=True)
cluster_2ways_ols.summary()
Dep. Variable: | y | R-squared: | 0.208 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.208 |
Method: | Least Squares | F-statistic: | 373.3 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 1.23e-08 |
Time: | 10:01:10 | Log-Likelihood: | -10573. |
No. Observations: | 5000 | AIC: | 2.115e+04 |
Df Residuals: | 4998 | BIC: | 2.116e+04 |
Df Model: | 1 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.0297 | 0.065 | 0.456 | 0.659 | -0.118 | 0.177 |
x | 1.0348 | 0.054 | 19.322 | 0.000 | 0.914 | 1.156 |
Omnibus: | 4.912 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.086 | Jarque-Bera (JB): | 4.862 |
Skew: | 0.070 | Prob(JB): | 0.0880 |
Kurtosis: | 3.063 | Cond. No. | 1.01 |
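The `sw` module imported with this section is never used above. As a sketch, the same two-way clustered covariance can be computed directly from the plain `ols` fit with `cov_cluster_2groups`; treat the small-sample correction as an assumption, so the numbers may differ slightly from the `cov_type='cluster'` results above.

# Sketch: two-way (firm and year) clustered covariance via the
# sandwich_covariance module imported above, applied to the plain OLS fit.
cov_both, cov_firm, cov_year = sw.cov_cluster_2groups(ols, df['firmid'],
                                                      df['year'])
np.sqrt(np.diag(cov_both))  # should be close to the two-way clustered std errs above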
Fixed effects (Python really needs an equivalent to `areg` / `reghdfe` / `lfe` to absorb fixed effects)
year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(use_t=True)
year_fe_ols.summary()
Dep. Variable: | y | R-squared: | 0.209 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.207 |
Method: | Least Squares | F-statistic: | 131.6 |
Date: | Thu, 17 May 2018 | Prob (F-statistic): | 7.13e-245 |
Time: | 10:01:15 | Log-Likelihood: | -10570. |
No. Observations: | 5000 | AIC: | 2.116e+04 |
Df Residuals: | 4989 | BIC: | 2.123e+04 |
Df Model: | 10 | ||
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.1411 | 0.090 | 1.573 | 0.116 | -0.035 | 0.317 |
C(year)[T.2] | -0.0119 | 0.127 | -0.094 | 0.925 | -0.261 | 0.237 |
C(year)[T.3] | -0.1453 | 0.127 | -1.145 | 0.252 | -0.394 | 0.103 |
C(year)[T.4] | -0.2038 | 0.127 | -1.607 | 0.108 | -0.452 | 0.045 |
C(year)[T.5] | -0.0604 | 0.127 | -0.476 | 0.634 | -0.309 | 0.188 |
C(year)[T.6] | -0.1312 | 0.127 | -1.034 | 0.301 | -0.380 | 0.117 |
C(year)[T.7] | -0.1975 | 0.127 | -1.557 | 0.120 | -0.446 | 0.051 |
C(year)[T.8] | -0.1555 | 0.127 | -1.225 | 0.220 | -0.404 | 0.093 |
C(year)[T.9] | -0.1535 | 0.127 | -1.210 | 0.226 | -0.402 | 0.095 |
C(year)[T.10] | -0.0556 | 0.127 | -0.438 | 0.661 | -0.304 | 0.193 |
x | 1.0351 | 0.029 | 36.160 | 0.000 | 0.979 | 1.091 |
Omnibus: | 4.804 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.091 | Jarque-Bera (JB): | 4.752 |
Skew: | 0.069 | Prob(JB): | 0.0929 |
Kurtosis: | 3.061 | Cond. No. | 10.9 |
firm_cluster_year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(cov_type='cluster',
                                                                          cov_kwds={'groups': df['firmid']},
                                                                          use_t=True)
firm_cluster_year_fe_ols.summary()
Dep. Variable: | y | R-squared: | 0.209 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.207 |
Method: | Least Squares | F-statistic: | 43.23 |
Date: | Tue, 08 May 2018 | Prob (F-statistic): | 1.93e-61 |
Time: | 16:03:02 | Log-Likelihood: | -10570. |
No. Observations: | 5000 | AIC: | 2.116e+04 |
Df Residuals: | 4989 | BIC: | 2.123e+04 |
Df Model: | 10 | ||
Covariance Type: | cluster |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 0.1411 | 0.089 | 1.587 | 0.113 | -0.034 | 0.316 |
C(year)[T.2] | -0.0119 | 0.086 | -0.139 | 0.890 | -0.181 | 0.157 |
C(year)[T.3] | -0.1453 | 0.086 | -1.690 | 0.092 | -0.314 | 0.024 |
C(year)[T.4] | -0.2038 | 0.089 | -2.288 | 0.023 | -0.379 | -0.029 |
C(year)[T.5] | -0.0604 | 0.087 | -0.697 | 0.486 | -0.231 | 0.110 |
C(year)[T.6] | -0.1312 | 0.084 | -1.562 | 0.119 | -0.296 | 0.034 |
C(year)[T.7] | -0.1975 | 0.087 | -2.275 | 0.023 | -0.368 | -0.027 |
C(year)[T.8] | -0.1555 | 0.094 | -1.662 | 0.097 | -0.339 | 0.028 |
C(year)[T.9] | -0.1535 | 0.088 | -1.752 | 0.080 | -0.326 | 0.019 |
C(year)[T.10] | -0.0556 | 0.088 | -0.634 | 0.526 | -0.228 | 0.117 |
x | 1.0351 | 0.051 | 20.361 | 0.000 | 0.935 | 1.135 |
Omnibus: | 4.804 | Durbin-Watson: | 1.096 |
---|---|---|---|
Prob(Omnibus): | 0.091 | Jarque-Bera (JB): | 4.752 |
Skew: | 0.069 | Prob(JB): | 0.0929 |
Kurtosis: | 3.061 | Cond. No. | 10.9 |
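In the absence of an absorb option, one stopgap is the within transformation: demean the variables by group and run OLS without the dummies. A minimal sketch for a single fixed effect (the slope matches the `C(year)` regression above; the naive standard errors here ignore the degrees of freedom absorbed by the demeaning):

# Sketch: absorb the year fixed effect by demeaning within year, instead of
# expanding C(year) dummies. By Frisch-Waugh the slope matches year_fe_ols.
dm = df.copy()
for v in ['y', 'x']:
    dm[v] = dm[v] - dm.groupby('year')[v].transform('mean')
within = sm.ols(formula='y ~ x - 1', data=dm).fit()  # no constant after demeaning
within.params  # x ≈ 1.0351, as in the dummy-variable regression above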