#%%
# pyscience imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
# graphics
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use("seaborn-darkgrid")
sns.set(style="ticks", context="talk")
plt.style.use("dark_background") # dark bg plots
# %matplotlib inline
# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%pwd
'/home/alal/Desktop/code/mostly-harmless-replication'
import zipfile
import urllib.request
dldat = False
if dldat:
    # Download data and unzip
    urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip')
    with zipfile.ZipFile('asciiqob.zip', "r") as z:
        z.extractall()
# Read the data into a pandas dataframe
pums = pd.read_csv('asciiqob.txt',
                   header=None,
                   sep=r'\s+')  # whitespace-delimited; delim_whitespace= is deprecated in newer pandas
pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob']
pums.head()
|    | lwklywge | educ | yob | qob | pob |
|---:|---------:|-----:|----:|----:|----:|
|  0 | 5.790019 |   12 |  30 |   1 |  45 |
|  1 | 5.952494 |   11 |  30 |   1 |  45 |
|  2 | 5.315949 |   12 |  30 |   1 |  45 |
|  3 | 5.595926 |   12 |  30 |   1 |  45 |
|  4 | 6.068915 |   12 |  30 |   1 |  37 |
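#%%
# Quick sanity check (a sketch, not part of the original script): the
# Angrist-Krueger extract should contain 329,509 observations, matching the
# sample size reported in the regression output below.
assert pums.shape[0] == 329509, pums.shape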
model = smf.ols('lwklywge ~ educ', data=pums)
print(model.fit().summary())
#%%
results = model.fit()
educ_coef = results.params['educ']
intercept = results.params['Intercept']
#%%
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                 4.378e+04
Date:                Fri, 04 Jan 2019   Prob (F-statistic):               0.00
Time:                        22:53:12   Log-Likelihood:            -3.1935e+05
No. Observations:              329509   AIC:                         6.387e+05
Df Residuals:                  329507   BIC:                         6.387e+05
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9952      0.004   1118.882      0.000       4.986       5.004
educ           0.0709      0.000    209.243      0.000       0.070       0.072
==============================================================================
Omnibus:                   191064.440   Durbin-Watson:                   1.870
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4082110.366
Skew:                          -2.377   Prob(JB):                         0.00
Kurtosis:                      19.575   Cond. No.                         53.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Calculate means by educ attainment and predicted values
groupbyeduc = pums.groupby('educ')
educ_means = groupbyeduc['lwklywge'].mean()
yhat = pd.Series(intercept + educ_coef * educ_means.index.values,
                 index=educ_means.index.values)
#%%
sumstats = educ_means.reset_index()
sumstats['intercept'] = 1
sumstats['yhat'] = results.predict(sumstats)
sumstats
|    | educ | lwklywge | intercept |     yhat |
|---:|-----:|---------:|----------:|---------:|
|  0 |    0 | 5.022035 |         1 | 4.995182 |
|  1 |    1 | 5.064200 |         1 | 5.066033 |
|  2 |    2 | 5.166692 |         1 | 5.136884 |
|  3 |    3 | 5.173006 |         1 | 5.207735 |
|  4 |    4 | 5.264427 |         1 | 5.278586 |
|  5 |    5 | 5.330808 |         1 | 5.349438 |
|  6 |    6 | 5.403939 |         1 | 5.420289 |
|  7 |    7 | 5.470224 |         1 | 5.491140 |
|  8 |    8 | 5.572456 |         1 | 5.561991 |
|  9 |    9 | 5.646150 |         1 | 5.632842 |
| 10 |   10 | 5.685303 |         1 | 5.703693 |
| 11 |   11 | 5.731728 |         1 | 5.774544 |
| 12 |   12 | 5.844358 |         1 | 5.845395 |
| 13 |   13 | 5.917124 |         1 | 5.916246 |
| 14 |   14 | 5.962863 |         1 | 5.987097 |
| 15 |   15 | 6.007515 |         1 | 6.057948 |
| 16 |   16 | 6.231889 |         1 | 6.128799 |
| 17 |   17 | 6.229527 |         1 | 6.199650 |
| 18 |   18 | 6.268082 |         1 | 6.270501 |
| 19 |   19 | 6.211449 |         1 | 6.341352 |
| 20 |   20 | 6.310895 |         1 | 6.412203 |
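#%%
# A minimal sketch of the grouped-data property of regression, assuming `pums`
# and `results` from the cells above: WLS of the educ-cell means on educ,
# weighted by cell size, reproduces the microdata OLS coefficients exactly.
cells = (pums.groupby('educ')['lwklywge']
             .agg(['mean', 'size'])
             .rename(columns={'mean': 'lwklywge', 'size': 'n'})
             .reset_index())
grouped_fit = smf.wls('lwklywge ~ educ', data=cells, weights=cells['n']).fit()
print(grouped_fit.params)  # matches results.params up to rounding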
# Create plot
plt.figure()
educ_means.plot(style='-o')
yhat.plot()
plt.xlabel("Years of completed education")
plt.ylabel(r"Log weekly earnings, \$2003")
plt.suptitle("Education ~ Wage CEF and linear fit")
plt.legend().set_visible(False)
(ggplot(sumstats, aes(x='educ'))
 + geom_point(aes(y='lwklywge'))
 + geom_line(aes(y='yhat'), color='red')
 + labs(x='Years of Completed Education', y='Log Weekly earnings, $2003',
        title='Education ~ Wage CEF and linear fit'))
import patsy
from tabulate import tabulate
#%%
# Download data
urllib.request.urlretrieve('http://economics.mit.edu/files/3828', 'nswre74.dta')
urllib.request.urlretrieve('http://economics.mit.edu/files/3824', 'cps1re74.dta')
urllib.request.urlretrieve('http://economics.mit.edu/files/3825', 'cps3re74.dta')
#%%
# Read the Stata files into Python
nswre74 = pd.read_stata("nswre74.dta")
cps1re74 = pd.read_stata("cps1re74.dta")
cps3re74 = pd.read_stata("cps3re74.dta")
# nswre74.head()
ggplot(nswre74, aes(x = 're78')) + geom_histogram(bins=30)
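#%%
# Sketch (assuming `nswre74` from above): the experimental benchmark for the
# matching comparisons that follow -- the raw treatment-control difference in
# mean 1978 earnings, with no covariate adjustment.
exp_means = nswre74.groupby('treat')['re78'].mean()
print(exp_means)
print(exp_means.diff().iloc[-1])  # treated minus control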
#%%
# Store list of variables for summary
summary_vars = ['age', 'ed', 'black', 'hisp', 'nodeg', 'married', 're74', 're75']
# Calculate propensity scores
# Create formula for probit
f = 'treat ~ ' + ' + '.join(['age', 'age2', 'ed', 'black', 'hisp',
                             'nodeg', 'married', 're74', 're75'])
#%%
# Run probit with CPS-1
y, X = patsy.dmatrices(f, cps1re74, return_type = 'dataframe')
model = sm.Probit(y, X).fit()
cps1re74['pscore'] = model.predict(X)
Optimization terminated successfully.
         Current function value: 0.028323
         Iterations 11
# nswre74.head()
ggplot(cps1re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density()
#%%
# Run probit with CPS-3
y, X = patsy.dmatrices(f, cps3re74, return_type = 'dataframe')
model = sm.Probit(y, X).fit()
cps3re74['pscore'] = model.predict(X)
Optimization terminated successfully.
         Current function value: 0.355062
         Iterations 7
# nswre74.head()
ggplot(cps3re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density()
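#%%
# Sketch: what fraction of each CPS control sample survives the
# 0.1 < pscore < 0.9 screen used for the trimmed columns below.
for name, df in [('CPS-1', cps1re74), ('CPS-3', cps3re74)]:
    ctrl = df.loc[df.treat == 0, 'pscore']
    print(name, round(ctrl.between(0.1, 0.9).mean(), 3))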
#%%
# Create function to summarize data
def summarize(dataset, conditions):
    """Means of the summary variables (plus a count) for rows where `conditions` holds."""
    stats = dataset.loc[conditions, summary_vars].mean()
    stats['count'] = conditions.sum()
    return stats
#%%
# Summarize data
nswre74_treat_stats = summarize(nswre74, nswre74.treat == 1)
nswre74_control_stats = summarize(nswre74, nswre74.treat == 0)
cps1re74_control_stats = summarize(cps1re74, cps1re74.treat == 0)
cps3re74_control_stats = summarize(cps3re74, cps3re74.treat == 0)
cps1re74_ptrim_stats = summarize(cps1re74, (cps1re74.treat == 0) &
                                           (cps1re74.pscore > 0.1) &
                                           (cps1re74.pscore < 0.9))
cps3re74_ptrim_stats = summarize(cps3re74, (cps3re74.treat == 0) &
                                           (cps3re74.pscore > 0.1) &
                                           (cps3re74.pscore < 0.9))
#%%
# Combine summary stats, add header and print to markdown
frames = [nswre74_treat_stats,
          nswre74_control_stats,
          cps1re74_control_stats,
          cps3re74_control_stats,
          cps1re74_ptrim_stats,
          cps3re74_ptrim_stats]
#%%
summary_stats = pd.concat(frames, axis = 1)
header = ["NSW Treat", "NSW Control", \
"Full CPS-1", "Full CPS-3", \
"P-score CPS-1", "P-score CPS-3"]
print(tabulate(summary_stats, header, tablefmt = "pipe"))
|         | NSW Treat | NSW Control | Full CPS-1 | Full CPS-3 | P-score CPS-1 | P-score CPS-3 |
|:--------|----------:|------------:|-----------:|-----------:|--------------:|--------------:|
| age     | 25.8162   | 25.0538     | 33.2252    | 28.0303    | 25.6278       | 25.9682       |
| ed      | 10.3459   | 10.0885     | 12.0275    | 10.2354    | 10.4858       | 10.4204       |
| black   | 0.843243  | 0.826923    | 0.0735368  | 0.202797   | 0.957386      | 0.515924      |
| hisp    | 0.0594595 | 0.107692    | 0.072036   | 0.142191   | 0.0284091     | 0.203822      |
| nodeg   | 0.708108  | 0.834615    | 0.295835   | 0.596737   | 0.599432      | 0.630573      |
| married | 0.189189  | 0.153846    | 0.711731   | 0.512821   | 0.261364      | 0.286624      |
| re74    | 2095.57   | 2107.03     | 14016.4    | 5619.23    | 2821.37       | 2968.51       |
| re75    | 1532.06   | 1266.91     | 13650.9    | 2466.49    | 1949.81       | 1858.79       |
| count   | 185       | 260         | 15992      | 429        | 352           | 157           |
from matplotlib.ticker import FormatStrFormatter
# Calculate means by educ and lwklywge
groupbybirth = pums.groupby(['yob', 'qob'])
birth_means = groupbybirth[['lwklywge', 'educ']].mean()
# Create function to plot figures
def plot_qob(yvar, ax, title, ylabel):
    """Plot a (yob, qob)-indexed series, labeling each point with its quarter of birth."""
    values = yvar.values
    ax.plot(values, color='k')
    for i, y in enumerate(yvar):
        qob = yvar.index.get_level_values('qob')[i]
        ax.annotate(qob,
                    (i, y),
                    xytext=(-5, 5),
                    textcoords='offset points')
        if qob == 1:
            ax.scatter(i, y, marker='s', facecolors='none', edgecolors='y')
        else:
            ax.scatter(i, y, marker='s', color='r')
    ax.set_xticks(range(0, len(yvar), 4))
    ax.set_xticklabels(yvar.index.get_level_values('yob')[1::4])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_xlabel("Year of birth")
    ax.margins(0.1)
fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 9), sharex = True)
plot_qob(yvar=birth_means['educ'],
         ax=ax1,
         title='A. Average education by quarter of birth (first stage)',
         ylabel='Years of education')
plot_qob(yvar=birth_means['lwklywge'],
         ax=ax2,
         title='B. Average weekly wage by quarter of birth (reduced form)',
         ylabel='Log weekly earnings')
fig.tight_layout()
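#%%
# Sketch (assuming `pums` from above): the simplest Wald estimate implied by
# the two panels -- the Q2-Q4 minus Q1 gap in mean log wages (reduced form)
# over the corresponding gap in mean schooling (first stage).
q1 = pums.qob == 1
wald = ((pums.loc[~q1, 'lwklywge'].mean() - pums.loc[q1, 'lwklywge'].mean()) /
        (pums.loc[~q1, 'educ'].mean() - pums.loc[q1, 'educ'].mean()))
print(wald)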
from linearmodels.iv import IV2SLS
qobs = pd.get_dummies(pums.qob, prefix='qob')
pums = pd.concat([pums, qobs], axis=1)
pums.head()
|    | lwklywge | educ | yob | qob | pob | qob_1 | qob_2 | qob_3 | qob_4 |
|---:|---------:|-----:|----:|----:|----:|------:|------:|------:|------:|
|  0 | 5.790019 |   12 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  1 | 5.952494 |   11 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  2 | 5.315949 |   12 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  3 | 5.595926 |   12 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  4 | 6.068915 |   12 |  30 |   1 |  37 |     1 |     0 |     0 |     0 |
formula = 'lwklywge ~ 1 [educ ~ qob_2 + qob_3 + qob_4]'
mod = IV2SLS.from_formula(formula, pums).fit(cov_type='robust')
print(mod.summary)
                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:               lwklywge   R-squared:                      0.0937
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0937
No. Observations:              329509   F-statistic:                    27.603
Date:                Fri, Jan 04 2019   P-value (F-stat)                0.0000
Time:                        23:09:03   Distribution:                  chi2(1)
Cov. Estimator:                robust

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      4.5898     0.2494     18.404     0.0000      4.1010      5.0786
educ           0.1026     0.0195     5.2539     0.0000      0.0643      0.1409
==============================================================================

Endogenous: educ
Instruments: qob_2, qob_3, qob_4
Robust Covariance (Heteroskedastic)
Debiased: False
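#%%
# Sketch: 2SLS "by hand" as a cross-check -- OLS first stage, then OLS of the
# wage on fitted schooling. The point estimate matches IV2SLS above, but the
# naive second-stage standard errors do not, since they ignore the sampling
# error in the first stage. (`educ_hat` is a constructed column, not in the
# original data.)
first_stage = smf.ols('educ ~ qob_2 + qob_3 + qob_4', data=pums).fit()
pums['educ_hat'] = first_stage.fittedvalues
second_stage = smf.ols('lwklywge ~ educ_hat', data=pums).fit()
print(second_stage.params)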
import statsmodels.stats.sandwich_covariance as sw
base_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='HC1')
print(base_mincer.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                 3.458e+04
Date:                Fri, 04 Jan 2019   Prob (F-statistic):               0.00
Time:                        23:30:54   Log-Likelihood:            -3.1935e+05
No. Observations:              329509   AIC:                         6.387e+05
Df Residuals:                  329507   BIC:                         6.387e+05
Df Model:                           1
Covariance Type:                  HC1
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9952      0.005    984.491      0.000       4.985       5.005
educ           0.0709      0.000    185.949      0.000       0.070       0.072
==============================================================================
Omnibus:                   191064.440   Durbin-Watson:                   1.870
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4082110.366
Skew:                          -2.377   Prob(JB):                         0.00
Kurtosis:                      19.575   Cond. No.                         53.3
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC1)
cluster_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='cluster',
                                                                   cov_kwds={'groups': pums['pob']},
                                                                   use_t=True)
print(cluster_mincer.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     1610.
Date:                Fri, 04 Jan 2019   Prob (F-statistic):           1.07e-39
Time:                        23:30:55   Log-Likelihood:            -3.1935e+05
No. Observations:              329509   AIC:                         6.387e+05
Df Residuals:                  329507   BIC:                         6.387e+05
Df Model:                           1
Covariance Type:              cluster
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9952      0.031    162.935      0.000       4.934       5.057
educ           0.0709      0.002     40.126      0.000       0.067       0.074
==============================================================================
Omnibus:                   191064.440   Durbin-Watson:                   1.870
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4082110.366
Skew:                          -2.377   Prob(JB):                         0.00
Kurtosis:                      19.575   Cond. No.                         53.3
==============================================================================

Warnings:
[1] Standard Errors are robust to cluster correlation (cluster)
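#%%
# Sketch: a rough Moulton-style comparison -- the ratio of the clustered to
# the heteroskedasticity-robust standard error for the schooling coefficient,
# using `cluster_mincer` and `base_mincer` from above.
print(cluster_mincer.bse['educ'] / base_mincer.bse['educ'])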
fe_mincer = smf.ols(formula='lwklywge ~ educ + C(pob)', data=pums).fit(cov_type='HC1',
                                                                       use_t=True)
print(fe_mincer.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.128
Model:                            OLS   Adj. R-squared:                  0.128
Method:                 Least Squares   F-statistic:                     759.0
Date:                Fri, 04 Jan 2019   Prob (F-statistic):               0.00
Time:                        23:32:30   Log-Likelihood:            -3.1726e+05
No. Observations:              329509   AIC:                         6.346e+05
Df Residuals:                  329457   BIC:                         6.352e+05
Df Model:                          51
Covariance Type:                  HC1
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        4.9322      0.009    547.748      0.000       4.915       4.950
C(pob)[T.2]      0.2584      0.095      2.720      0.007       0.072       0.445
C(pob)[T.4]      0.1459      0.019      7.526      0.000       0.108       0.184
C(pob)[T.5]      0.0457      0.012      3.927      0.000       0.023       0.069
C(pob)[T.6]      0.1729      0.010     17.644      0.000       0.154       0.192
C(pob)[T.8]      0.1214      0.014      8.583      0.000       0.094       0.149
C(pob)[T.9]      0.1352      0.012     10.965      0.000       0.111       0.159
C(pob)[T.10]     0.0888      0.024      3.693      0.000       0.042       0.136
C(pob)[T.11]     0.1534      0.019      8.128      0.000       0.116       0.190
C(pob)[T.12]     0.0013      0.013      0.097      0.923      -0.024       0.026
C(pob)[T.13]    -0.0196      0.011     -1.845      0.065      -0.041       0.001
C(pob)[T.15]     0.1531      0.042      3.637      0.000       0.071       0.236
C(pob)[T.16]     0.1043      0.017      5.989      0.000       0.070       0.138
C(pob)[T.17]     0.2064      0.009     22.922      0.000       0.189       0.224
C(pob)[T.18]     0.1533      0.010     15.276      0.000       0.134       0.173
C(pob)[T.19]     0.0994      0.011      8.820      0.000       0.077       0.122
C(pob)[T.20]     0.0748      0.012      6.186      0.000       0.051       0.098
C(pob)[T.21]     0.1115      0.010     10.653      0.000       0.091       0.132
C(pob)[T.22]     0.0947      0.012      8.109      0.000       0.072       0.118
C(pob)[T.23]    -0.0111      0.014     -0.778      0.437      -0.039       0.017
C(pob)[T.24]     0.1178      0.013      9.415      0.000       0.093       0.142
C(pob)[T.25]     0.1112      0.010     11.348      0.000       0.092       0.130
C(pob)[T.26]     0.2268      0.009     24.538      0.000       0.209       0.245
C(pob)[T.27]     0.1501      0.011     13.860      0.000       0.129       0.171
C(pob)[T.28]    -0.0215      0.012     -1.801      0.072      -0.045       0.002
C(pob)[T.29]     0.1138      0.010     11.017      0.000       0.094       0.134
C(pob)[T.30]     0.0775      0.020      3.900      0.000       0.039       0.116
C(pob)[T.31]     0.0922      0.014      6.726      0.000       0.065       0.119
C(pob)[T.32]     0.1524      0.041      3.710      0.000       0.072       0.233
C(pob)[T.33]     0.0203      0.019      1.092      0.275      -0.016       0.057
C(pob)[T.34]     0.1712      0.010     16.898      0.000       0.151       0.191
C(pob)[T.35]     0.0703      0.018      3.975      0.000       0.036       0.105
C(pob)[T.36]     0.1623      0.009     18.981      0.000       0.146       0.179
C(pob)[T.37]    -0.0566      0.010     -5.664      0.000      -0.076      -0.037
C(pob)[T.38]     0.1305      0.017      7.800      0.000       0.098       0.163
C(pob)[T.39]     0.1654      0.009     18.398      0.000       0.148       0.183
C(pob)[T.40]     0.0924      0.011      8.504      0.000       0.071       0.114
C(pob)[T.41]     0.1408      0.015      9.255      0.000       0.111       0.171
C(pob)[T.42]     0.1316      0.009     15.441      0.000       0.115       0.148
C(pob)[T.44]     0.0463      0.017      2.727      0.006       0.013       0.080
C(pob)[T.45]    -0.0805      0.012     -6.496      0.000      -0.105      -0.056
C(pob)[T.46]     0.0785      0.017      4.604      0.000       0.045       0.112
C(pob)[T.47]     0.0480      0.011      4.533      0.000       0.027       0.069
C(pob)[T.48]     0.0882      0.009      9.445      0.000       0.070       0.107
C(pob)[T.49]     0.1411      0.015      9.251      0.000       0.111       0.171
C(pob)[T.50]    -0.0375      0.021     -1.779      0.075      -0.079       0.004
C(pob)[T.51]     0.0319      0.011      2.910      0.004       0.010       0.053
C(pob)[T.53]     0.1877      0.013     14.544      0.000       0.162       0.213
C(pob)[T.54]     0.1274      0.011     11.757      0.000       0.106       0.149
C(pob)[T.55]     0.1281      0.010     12.378      0.000       0.108       0.148
C(pob)[T.56]     0.1438      0.026      5.522      0.000       0.093       0.195
educ             0.0671      0.000    173.051      0.000       0.066       0.068
================================================================================
Omnibus:                   193121.979   Durbin-Watson:                   1.903
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4260317.292
Skew:                          -2.403   Prob(JB):                         0.00
Kurtosis:                      19.947   Cond. No.                         867.
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC1)
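#%%
# Sketch (a Frisch-Waugh-Lovell check, assuming `pums` from above): the
# fixed-effects slope can also be recovered by the within transformation --
# demean both variables by state of birth and run OLS. The slope matches the
# educ coefficient in `fe_mincer`; the default standard errors differ slightly
# because the demeaned regression ignores the absorbed degrees of freedom.
within = pums[['lwklywge', 'educ']] - \
         pums.groupby('pob')[['lwklywge', 'educ']].transform('mean')
print(smf.ols('lwklywge ~ educ', data=within).fit().params['educ'])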