#%%
# pyscience imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
# graphics
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use("seaborn-darkgrid")
sns.set(style="ticks", context="talk")
plt.style.use("dark_background") # dark bg plots
# %matplotlib inline
# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%pwd
'/home/alal/Desktop/code/mostly-harmless-replication'
import zipfile
import urllib.request
dldat = False
if dldat:
    # Download data and unzip
    urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip')
    with zipfile.ZipFile('asciiqob.zip', "r") as z:
        z.extractall()
# Read the data into a pandas dataframe
pums = pd.read_csv('asciiqob.txt',
                   header=None,
                   sep=r'\s+')  # whitespace-delimited; delim_whitespace= is deprecated in newer pandas
pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob']
pums.head()
|    | lwklywge | educ | yob | qob | pob |
|---:|---------:|-----:|----:|----:|----:|
|  0 | 5.790019 |   12 |  30 |   1 |  45 |
|  1 | 5.952494 |   11 |  30 |   1 |  45 |
|  2 | 5.315949 |   12 |  30 |   1 |  45 |
|  3 | 5.595926 |   12 |  30 |   1 |  45 |
|  4 | 6.068915 |   12 |  30 |   1 |  37 |
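#%%
# Quick sanity check (a sketch, not part of the original script): the
# Angrist-Krueger extract should contain 329,509 observations, matching the
# sample size reported in the regression output below.
assert pums.shape[0] == 329509, pums.shape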
model = smf.ols('lwklywge ~ educ', data=pums)
print(model.fit().summary())
#%%
results = model.fit()
educ_coef = results.params['educ']
intercept = results.params['Intercept']
#%%
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                 4.378e+04
Date:                Fri, 04 Jan 2019   Prob (F-statistic):               0.00
Time:                        22:53:12   Log-Likelihood:            -3.1935e+05
No. Observations:              329509   AIC:                         6.387e+05
Df Residuals:                  329507   BIC:                         6.387e+05
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9952      0.004   1118.882      0.000       4.986       5.004
educ           0.0709      0.000    209.243      0.000       0.070       0.072
==============================================================================
Omnibus:                   191064.440   Durbin-Watson:                   1.870
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4082110.366
Skew:                          -2.377   Prob(JB):                         0.00
Kurtosis:                      19.575   Cond. No.                         53.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Calculate means by educ attainment and predicted values
groupbyeduc = pums.groupby('educ')
educ_means = groupbyeduc['lwklywge'].mean()
yhat = pd.Series(intercept + educ_coef * educ_means.index.values,
                 index=educ_means.index.values)
#%%
sumstats = educ_means.reset_index()
sumstats['intercept'] = 1
sumstats['yhat'] = results.predict(sumstats)
sumstats
|    | educ | lwklywge | intercept |     yhat |
|---:|-----:|---------:|----------:|---------:|
|  0 |    0 | 5.022035 |         1 | 4.995182 |
|  1 |    1 | 5.064200 |         1 | 5.066033 |
|  2 |    2 | 5.166692 |         1 | 5.136884 |
|  3 |    3 | 5.173006 |         1 | 5.207735 |
|  4 |    4 | 5.264427 |         1 | 5.278586 |
|  5 |    5 | 5.330808 |         1 | 5.349438 |
|  6 |    6 | 5.403939 |         1 | 5.420289 |
|  7 |    7 | 5.470224 |         1 | 5.491140 |
|  8 |    8 | 5.572456 |         1 | 5.561991 |
|  9 |    9 | 5.646150 |         1 | 5.632842 |
| 10 |   10 | 5.685303 |         1 | 5.703693 |
| 11 |   11 | 5.731728 |         1 | 5.774544 |
| 12 |   12 | 5.844358 |         1 | 5.845395 |
| 13 |   13 | 5.917124 |         1 | 5.916246 |
| 14 |   14 | 5.962863 |         1 | 5.987097 |
| 15 |   15 | 6.007515 |         1 | 6.057948 |
| 16 |   16 | 6.231889 |         1 | 6.128799 |
| 17 |   17 | 6.229527 |         1 | 6.199650 |
| 18 |   18 | 6.268082 |         1 | 6.270501 |
| 19 |   19 | 6.211449 |         1 | 6.341352 |
| 20 |   20 | 6.310895 |         1 | 6.412203 |
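#%%
# A minimal sketch of the grouped-data property of regression, assuming `pums`
# and `results` from the cells above: WLS of the educ-cell means on educ,
# weighted by cell size, reproduces the microdata OLS coefficients exactly.
cells = (pums.groupby('educ')['lwklywge']
             .agg(['mean', 'size'])
             .rename(columns={'mean': 'lwklywge', 'size': 'n'})
             .reset_index())
grouped_fit = smf.wls('lwklywge ~ educ', data=cells, weights=cells['n']).fit()
print(grouped_fit.params)  # matches results.params up to rounding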
# Create plot
plt.figure()
educ_means.plot(style='-o')
yhat.plot()
plt.xlabel("Years of completed education")
plt.ylabel(r"Log weekly earnings, \$2003")
plt.suptitle("Education ~ Wage CEF and linear fit")
plt.legend().set_visible(False)
(ggplot(sumstats, aes(x='educ'))
 + geom_point(aes(y='lwklywge'))
 + geom_line(aes(y='yhat'), color='red')
 + labs(x='Years of Completed Education', y='Log Weekly earnings, $2003',
        title='Education ~ Wage CEF and linear fit'))
import patsy
from tabulate import tabulate
#%%
# Download data
urllib.request.urlretrieve('http://economics.mit.edu/files/3828', 'nswre74.dta')
urllib.request.urlretrieve('http://economics.mit.edu/files/3824', 'cps1re74.dta')
urllib.request.urlretrieve('http://economics.mit.edu/files/3825', 'cps3re74.dta')
#%%
# Read the Stata files into Python
nswre74 = pd.read_stata("nswre74.dta")
cps1re74 = pd.read_stata("cps1re74.dta")
cps3re74 = pd.read_stata("cps3re74.dta")
# nswre74.head()
ggplot(nswre74, aes(x = 're78')) + geom_histogram(bins=30)
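#%%
# Sketch (assuming `nswre74` from above): the experimental benchmark for the
# matching comparisons that follow -- the raw treatment-control difference in
# mean 1978 earnings, with no covariate adjustment.
exp_means = nswre74.groupby('treat')['re78'].mean()
print(exp_means)
print(exp_means.diff().iloc[-1])  # treated minus control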
#%%
# Store list of variables for summary
summary_vars = ['age', 'ed', 'black', 'hisp', 'nodeg', 'married', 're74', 're75']
# Calculate propensity scores
# Create formula for probit
f = 'treat ~ ' + ' + '.join(['age', 'age2', 'ed', 'black', 'hisp',
                             'nodeg', 'married', 're74', 're75'])
#%%
# Run probit with CPS-1
y, X = patsy.dmatrices(f, cps1re74, return_type = 'dataframe')
model = sm.Probit(y, X).fit()
cps1re74['pscore'] = model.predict(X)
Optimization terminated successfully.
         Current function value: 0.028323
         Iterations 11
# nswre74.head()
ggplot(cps1re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density()
#%%
# Run probit with CPS-3
y, X = patsy.dmatrices(f, cps3re74, return_type = 'dataframe')
model = sm.Probit(y, X).fit()
cps3re74['pscore'] = model.predict(X)
Optimization terminated successfully.
         Current function value: 0.355062
         Iterations 7
# nswre74.head()
ggplot(cps3re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density()
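#%%
# Sketch: what fraction of each CPS control sample survives the
# 0.1 < pscore < 0.9 screen used for the trimmed columns below.
for name, df in [('CPS-1', cps1re74), ('CPS-3', cps3re74)]:
    ctrl = df.loc[df.treat == 0, 'pscore']
    print(name, round(ctrl.between(0.1, 0.9).mean(), 3))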
#%%
# Create function to summarize data
def summarize(dataset, conditions):
    """Means of the summary variables (plus a count) for rows where `conditions` holds."""
    stats = dataset.loc[conditions, summary_vars].mean()
    stats['count'] = conditions.sum()
    return stats
#%%
# Summarize data
nswre74_treat_stats = summarize(nswre74, nswre74.treat == 1)
nswre74_control_stats = summarize(nswre74, nswre74.treat == 0)
cps1re74_control_stats = summarize(cps1re74, cps1re74.treat == 0)
cps3re74_control_stats = summarize(cps3re74, cps3re74.treat == 0)
cps1re74_ptrim_stats = summarize(cps1re74, (cps1re74.treat == 0) &
                                           (cps1re74.pscore > 0.1) &
                                           (cps1re74.pscore < 0.9))
cps3re74_ptrim_stats = summarize(cps3re74, (cps3re74.treat == 0) &
                                           (cps3re74.pscore > 0.1) &
                                           (cps3re74.pscore < 0.9))
#%%
# Combine summary stats, add header and print to markdown
frames = [nswre74_treat_stats,
          nswre74_control_stats,
          cps1re74_control_stats,
          cps3re74_control_stats,
          cps1re74_ptrim_stats,
          cps3re74_ptrim_stats]
#%%
summary_stats = pd.concat(frames, axis = 1)
header = ["NSW Treat", "NSW Control", \
"Full CPS-1", "Full CPS-3", \
"P-score CPS-1", "P-score CPS-3"]
print(tabulate(summary_stats, header, tablefmt = "pipe"))
|         | NSW Treat | NSW Control | Full CPS-1 | Full CPS-3 | P-score CPS-1 | P-score CPS-3 |
|:--------|----------:|------------:|-----------:|-----------:|--------------:|--------------:|
| age     | 25.8162   | 25.0538     | 33.2252    | 28.0303    | 25.6278       | 25.9682       |
| ed      | 10.3459   | 10.0885     | 12.0275    | 10.2354    | 10.4858       | 10.4204       |
| black   | 0.843243  | 0.826923    | 0.0735368  | 0.202797   | 0.957386      | 0.515924      |
| hisp    | 0.0594595 | 0.107692    | 0.072036   | 0.142191   | 0.0284091     | 0.203822      |
| nodeg   | 0.708108  | 0.834615    | 0.295835   | 0.596737   | 0.599432      | 0.630573      |
| married | 0.189189  | 0.153846    | 0.711731   | 0.512821   | 0.261364      | 0.286624      |
| re74    | 2095.57   | 2107.03     | 14016.4    | 5619.23    | 2821.37       | 2968.51       |
| re75    | 1532.06   | 1266.91     | 13650.9    | 2466.49    | 1949.81       | 1858.79       |
| count   | 185       | 260         | 15992      | 429        | 352           | 157           |
from matplotlib.ticker import FormatStrFormatter
# Calculate means by educ and lwklywge
groupbybirth = pums.groupby(['yob', 'qob'])
birth_means = groupbybirth[['lwklywge', 'educ']].mean()
# Create function to plot figures
def plot_qob(yvar, ax, title, ylabel):
    """Plot a (yob, qob)-indexed series, labeling each point with its quarter of birth."""
    values = yvar.values
    ax.plot(values, color='k')
    for i, y in enumerate(yvar):
        qob = yvar.index.get_level_values('qob')[i]
        ax.annotate(qob,
                    (i, y),
                    xytext=(-5, 5),
                    textcoords='offset points')
        if qob == 1:
            ax.scatter(i, y, marker='s', facecolors='none', edgecolors='y')
        else:
            ax.scatter(i, y, marker='s', color='r')
    ax.set_xticks(range(0, len(yvar), 4))
    ax.set_xticklabels(yvar.index.get_level_values('yob')[1::4])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_xlabel("Year of birth")
    ax.margins(0.1)
fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 9), sharex = True)
plot_qob(yvar=birth_means['educ'],
         ax=ax1,
         title='A. Average education by quarter of birth (first stage)',
         ylabel='Years of education')
plot_qob(yvar=birth_means['lwklywge'],
         ax=ax2,
         title='B. Average weekly wage by quarter of birth (reduced form)',
         ylabel='Log weekly earnings')
fig.tight_layout()
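#%%
# Sketch (assuming `pums` from above): the simplest Wald estimate implied by
# the two panels -- the Q2-Q4 minus Q1 gap in mean log wages (reduced form)
# over the corresponding gap in mean schooling (first stage).
q1 = pums.qob == 1
wald = ((pums.loc[~q1, 'lwklywge'].mean() - pums.loc[q1, 'lwklywge'].mean()) /
        (pums.loc[~q1, 'educ'].mean() - pums.loc[q1, 'educ'].mean()))
print(wald)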
from linearmodels.iv import IV2SLS
qobs = pd.get_dummies(pums.qob, prefix='qob')
pums = pd.concat([pums, qobs], axis=1)
pums.head()
|    | lwklywge | educ | yob | qob | pob | qob_1 | qob_2 | qob_3 | qob_4 |
|---:|---------:|-----:|----:|----:|----:|------:|------:|------:|------:|
|  0 | 5.790019 |   12 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  1 | 5.952494 |   11 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  2 | 5.315949 |   12 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  3 | 5.595926 |   12 |  30 |   1 |  45 |     1 |     0 |     0 |     0 |
|  4 | 6.068915 |   12 |  30 |   1 |  37 |     1 |     0 |     0 |     0 |
formula = 'lwklywge ~ 1 [educ ~ qob_2 + qob_3 + qob_4]'
mod = IV2SLS.from_formula(formula, pums).fit(cov_type='robust')
print(mod.summary)
                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:               lwklywge   R-squared:                      0.0937
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0937
No. Observations:              329509   F-statistic:                    27.603
Date:                Fri, Jan 04 2019   P-value (F-stat)                0.0000
Time:                        23:09:03   Distribution:                  chi2(1)
Cov. Estimator:                robust

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      4.5898     0.2494     18.404     0.0000      4.1010      5.0786
educ           0.1026     0.0195     5.2539     0.0000      0.0643      0.1409
==============================================================================

Endogenous: educ
Instruments: qob_2, qob_3, qob_4
Robust Covariance (Heteroskedastic)
Debiased: False
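#%%
# Sketch: 2SLS "by hand" as a cross-check -- OLS first stage, then OLS of the
# wage on fitted schooling. The point estimate matches IV2SLS above, but the
# naive second-stage standard errors do not, since they ignore the sampling
# error in the first stage. (`educ_hat` is a constructed column, not in the
# original data.)
first_stage = smf.ols('educ ~ qob_2 + qob_3 + qob_4', data=pums).fit()
pums['educ_hat'] = first_stage.fittedvalues
second_stage = smf.ols('lwklywge ~ educ_hat', data=pums).fit()
print(second_stage.params)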
import statsmodels.stats.sandwich_covariance as sw
base_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='HC1')
print(base_mincer.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                 3.458e+04
Date:                Fri, 04 Jan 2019   Prob (F-statistic):               0.00
Time:                        23:30:54   Log-Likelihood:            -3.1935e+05
No. Observations:              329509   AIC:                         6.387e+05
Df Residuals:                  329507   BIC:                         6.387e+05
Df Model:                           1
Covariance Type:                  HC1
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9952      0.005    984.491      0.000       4.985       5.005
educ           0.0709      0.000    185.949      0.000       0.070       0.072
==============================================================================
Omnibus:                   191064.440   Durbin-Watson:                   1.870
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4082110.366
Skew:                          -2.377   Prob(JB):                         0.00
Kurtosis:                      19.575   Cond. No.                         53.3
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC1)
cluster_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='cluster',
                                                                   cov_kwds={'groups': pums['pob']},
                                                                   use_t=True)
print(cluster_mincer.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     1610.
Date:                Fri, 04 Jan 2019   Prob (F-statistic):           1.07e-39
Time:                        23:30:55   Log-Likelihood:            -3.1935e+05
No. Observations:              329509   AIC:                         6.387e+05
Df Residuals:                  329507   BIC:                         6.387e+05
Df Model:                           1
Covariance Type:              cluster
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.9952      0.031    162.935      0.000       4.934       5.057
educ           0.0709      0.002     40.126      0.000       0.067       0.074
==============================================================================
Omnibus:                   191064.440   Durbin-Watson:                   1.870
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4082110.366
Skew:                          -2.377   Prob(JB):                         0.00
Kurtosis:                      19.575   Cond. No.                         53.3
==============================================================================

Warnings:
[1] Standard Errors are robust to cluster correlation (cluster)
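#%%
# Sketch: a rough Moulton-style comparison -- the ratio of the clustered to
# the heteroskedasticity-robust standard error for the schooling coefficient,
# using `cluster_mincer` and `base_mincer` from above.
print(cluster_mincer.bse['educ'] / base_mincer.bse['educ'])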
fe_mincer = smf.ols(formula='lwklywge ~ educ + C(pob)', data=pums).fit(cov_type='HC1',
                                                                       use_t=True)
print(fe_mincer.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:               lwklywge   R-squared:                       0.128
Model:                            OLS   Adj. R-squared:                  0.128
Method:                 Least Squares   F-statistic:                     759.0
Date:                Fri, 04 Jan 2019   Prob (F-statistic):               0.00
Time:                        23:32:30   Log-Likelihood:            -3.1726e+05
No. Observations:              329509   AIC:                         6.346e+05
Df Residuals:                  329457   BIC:                         6.352e+05
Df Model:                          51
Covariance Type:                  HC1
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        4.9322      0.009    547.748      0.000       4.915       4.950
C(pob)[T.2]      0.2584      0.095      2.720      0.007       0.072       0.445
C(pob)[T.4]      0.1459      0.019      7.526      0.000       0.108       0.184
C(pob)[T.5]      0.0457      0.012      3.927      0.000       0.023       0.069
C(pob)[T.6]      0.1729      0.010     17.644      0.000       0.154       0.192
C(pob)[T.8]      0.1214      0.014      8.583      0.000       0.094       0.149
C(pob)[T.9]      0.1352      0.012     10.965      0.000       0.111       0.159
C(pob)[T.10]     0.0888      0.024      3.693      0.000       0.042       0.136
C(pob)[T.11]     0.1534      0.019      8.128      0.000       0.116       0.190
C(pob)[T.12]     0.0013      0.013      0.097      0.923      -0.024       0.026
C(pob)[T.13]    -0.0196      0.011     -1.845      0.065      -0.041       0.001
C(pob)[T.15]     0.1531      0.042      3.637      0.000       0.071       0.236
C(pob)[T.16]     0.1043      0.017      5.989      0.000       0.070       0.138
C(pob)[T.17]     0.2064      0.009     22.922      0.000       0.189       0.224
C(pob)[T.18]     0.1533      0.010     15.276      0.000       0.134       0.173
C(pob)[T.19]     0.0994      0.011      8.820      0.000       0.077       0.122
C(pob)[T.20]     0.0748      0.012      6.186      0.000       0.051       0.098
C(pob)[T.21]     0.1115      0.010     10.653      0.000       0.091       0.132
C(pob)[T.22]     0.0947      0.012      8.109      0.000       0.072       0.118
C(pob)[T.23]    -0.0111      0.014     -0.778      0.437      -0.039       0.017
C(pob)[T.24]     0.1178      0.013      9.415      0.000       0.093       0.142
C(pob)[T.25]     0.1112      0.010     11.348      0.000       0.092       0.130
C(pob)[T.26]     0.2268      0.009     24.538      0.000       0.209       0.245
C(pob)[T.27]     0.1501      0.011     13.860      0.000       0.129       0.171
C(pob)[T.28]    -0.0215      0.012     -1.801      0.072      -0.045       0.002
C(pob)[T.29]     0.1138      0.010     11.017      0.000       0.094       0.134
C(pob)[T.30]     0.0775      0.020      3.900      0.000       0.039       0.116
C(pob)[T.31]     0.0922      0.014      6.726      0.000       0.065       0.119
C(pob)[T.32]     0.1524      0.041      3.710      0.000       0.072       0.233
C(pob)[T.33]     0.0203      0.019      1.092      0.275      -0.016       0.057
C(pob)[T.34]     0.1712      0.010     16.898      0.000       0.151       0.191
C(pob)[T.35]     0.0703      0.018      3.975      0.000       0.036       0.105
C(pob)[T.36]     0.1623      0.009     18.981      0.000       0.146       0.179
C(pob)[T.37]    -0.0566      0.010     -5.664      0.000      -0.076      -0.037
C(pob)[T.38]     0.1305      0.017      7.800      0.000       0.098       0.163
C(pob)[T.39]     0.1654      0.009     18.398      0.000       0.148       0.183
C(pob)[T.40]     0.0924      0.011      8.504      0.000       0.071       0.114
C(pob)[T.41]     0.1408      0.015      9.255      0.000       0.111       0.171
C(pob)[T.42]     0.1316      0.009     15.441      0.000       0.115       0.148
C(pob)[T.44]     0.0463      0.017      2.727      0.006       0.013       0.080
C(pob)[T.45]    -0.0805      0.012     -6.496      0.000      -0.105      -0.056
C(pob)[T.46]     0.0785      0.017      4.604      0.000       0.045       0.112
C(pob)[T.47]     0.0480      0.011      4.533      0.000       0.027       0.069
C(pob)[T.48]     0.0882      0.009      9.445      0.000       0.070       0.107
C(pob)[T.49]     0.1411      0.015      9.251      0.000       0.111       0.171
C(pob)[T.50]    -0.0375      0.021     -1.779      0.075      -0.079       0.004
C(pob)[T.51]     0.0319      0.011      2.910      0.004       0.010       0.053
C(pob)[T.53]     0.1877      0.013     14.544      0.000       0.162       0.213
C(pob)[T.54]     0.1274      0.011     11.757      0.000       0.106       0.149
C(pob)[T.55]     0.1281      0.010     12.378      0.000       0.108       0.148
C(pob)[T.56]     0.1438      0.026      5.522      0.000       0.093       0.195
educ             0.0671      0.000    173.051      0.000       0.066       0.068
================================================================================
Omnibus:                   193121.979   Durbin-Watson:                   1.903
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4260317.292
Skew:                          -2.403   Prob(JB):                         0.00
Kurtosis:                      19.947   Cond. No.                         867.
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC1)
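#%%
# Sketch (a Frisch-Waugh-Lovell check, assuming `pums` from above): the
# fixed-effects slope can also be recovered by the within transformation --
# demean both variables by state of birth and run OLS. The slope matches the
# educ coefficient in `fe_mincer`; the default standard errors differ slightly
# because the demeaned regression ignores the absorbed degrees of freedom.
within = pums[['lwklywge', 'educ']] - \
         pums.groupby('pob')[['lwklywge', 'educ']].transform('mean')
print(smf.ols('lwklywge ~ educ', data=within).fit().params['educ'])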