#!/usr/bin/env python # coding: utf-8 # In[1]: #%% import os import sys import glob import numpy as np import pandas as pd import matplotlib.pyplot as plt from patsy import dmatrices, dmatrix, demo_data import seaborn as sns import statsmodels.api as sm import statsmodels.formula.api as smf from linearmodels.iv import IV2SLS get_ipython().run_line_magic('matplotlib', 'inline') # run for jupyter notebook # from IPython.core.interactiveshell import InteractiveShell # InteractiveShell.ast_node_interactivity = 'all' #%% # Manual Matrix Calculation # In[2]: nsample = 100 x = np.linspace(0, 10, 100) cons = np.ones([nsample,1]) X = np.column_stack((x, x**2, cons)) # In[3]: beta = np.array([1, 0.1, 10]) e = np.random.normal(size=nsample) # In[4]: y = np.dot(X, beta) + e # $$ # \hat{\beta} = (X'X)^{-1}X'Y # $$ # In[5]: beta_hat = np.linalg.inv(X.T @ X)@X.T@y beta_hat.T # In[7]: beta # In[8]: beta1 = sm.OLS(y, X).fit() beta1.params # In[9]: y_hat = beta1.predict() print(beta1.summary()) # In[10]: plt.plot(y_hat, alpha=0.5, label='predicted') # Plot observed values plt.plot(y, alpha=0.5, label='observed') plt.legend() plt.show() # ## Tabulating Multiple Regressions # In[11]: df2 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable2.dta') # Add constant term to dataset df2['const'] = 1 # In[12]: # Create lists of variables to be used in each regression X1 = ['const', 'avexpr'] X2 = ['const', 'avexpr', 'lat_abst'] X3 = ['const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other'] # Estimate an OLS regression for each set of variables reg1 = sm.OLS(df2['logpgp95'], df2[X1], missing='drop').fit(cov_type='HC2') reg2 = sm.OLS(df2['logpgp95'], df2[X2], missing='drop').fit(cov_type='HC2') reg3 = sm.OLS(df2['logpgp95'], df2[X3], missing='drop').fit(cov_type='HC2') # In[13]: from statsmodels.iolib.summary2 import summary_col info_dict={'R-squared' : lambda x: "{:.2f}".format(x.rsquared), 'No. observations' : lambda x: "{0:d}".format(int(x.nobs))} results_table = summary_col(results=[reg1,reg2,reg3], float_format='%0.2f', stars = True, model_names=['Model 1', 'Model 3', 'Model 4'], info_dict=info_dict, regressor_order=['const', 'avexpr', 'lat_abst', 'asia', 'africa']) results_table.add_title('Table 2 - OLS Regressions') print(results_table) # # AJR (2001) # In[14]: df1 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable1.dta') df1.head() df1.info() # In[15]: plt.style.use('seaborn') # In[16]: sns.jointplot('avexpr','logpgp95',data=df1,kind='scatter') # In[17]: sns.regplot('avexpr','logpgp95',data=df1) # In[18]: # Dropping NA's is required to use numpy's polyfit df1_subset = df1.dropna(subset=['logpgp95', 'avexpr']) # Use only 'base sample' for plotting purposes df1_subset = df1_subset[df1_subset['baseco'] == 1] X = df1_subset['avexpr'] y = df1_subset['logpgp95'] labels = df1_subset['shortnam'] # Replace markers with country labels plt.scatter(X, y, marker='') for i, label in enumerate(labels): plt.annotate(label, (X.iloc[i], y.iloc[i])) # Fit a linear trend line plt.plot(np.unique(X), np.poly1d(np.polyfit(X, y, 1))(np.unique(X)), color='black') plt.xlim([3.3,10.5]) plt.ylim([4,10.5]) plt.xlabel('Average Expropriation Risk 1985-95') plt.ylabel('Log GDP per capita, PPP, 1995') plt.title('Figure 2: OLS relationship between expropriation risk and income') plt.show() # In[19]: reg1 = smf.ols('logpgp95 ~ avexpr', data=df1, missing='drop').fit(cov_type='HC2') print(reg1.summary()) # In[20]: # Drop missing observations from whole sample df1_plot = df1.dropna(subset=['logpgp95', 'avexpr']) # Plot predicted values plt.scatter(df1_plot['avexpr'], reg1.predict(), alpha=0.5, label='predicted') # Plot observed values plt.scatter(df1_plot['avexpr'], df1_plot['logpgp95'], alpha=0.5, label='observed') plt.legend() plt.title('OLS predicted values') plt.xlabel('avexpr') plt.ylabel('logpgp95') plt.show() # ## Instrumental Variables # In[21]: # Dropping NA's is required to use numpy's polyfit df1_subset2 = df1.dropna(subset=['logem4', 'avexpr']) X = df1_subset2['logem4'] y = df1_subset2['avexpr'] labels = df1_subset2['shortnam'] # In[22]: # Replace markers with country labels plt.scatter(X, y, marker='') for i, label in enumerate(labels): plt.annotate(label, (X.iloc[i], y.iloc[i])) # Fit a linear trend line plt.plot(np.unique(X), np.poly1d(np.polyfit(X, y, 1))(np.unique(X)), color='black') # plt.xlim([1.8,8.4]) # plt.ylim([3.3,10.4]) plt.xlabel('Log of Settler Mortality') plt.ylabel('Average Expropriation Risk 1985-95') plt.title('Figure 3: First-stage relationship between settler mortality and expropriation risk') # Manual Calculation # $$ \beta_{IV} = \frac{ \beta_{rf} }{ \beta_{fs} } $$ # where # $$ \beta_{fs} = \frac{Cov(X,Z)}{Var(Z)} $$ # # and # # $$ \beta_{rf} = \frac{Cov(Y,Z)}{Var(Z)} $$ # Which yields # $$ # \beta_{IV} = \frac{Cov(Y,Z)}{Cov(X,Z)} # $$ # In[23]: # Import and select the data df4 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable4.dta') df4 = df4[df4['baseco'] == 1] # Add a constant variable df4['const'] = 1 # *First Stage* # In[24]: # Fit the first stage regression and print summary results_fs = sm.OLS(df4['avexpr'], df4[['const', 'logem4']], missing='drop').fit() print(results_fs.summary()) # In[25]: df4['predicted_avexpr'] = results_fs.predict() results_ss = sm.OLS(df4['logpgp95'], df4[['const', 'predicted_avexpr']]).fit() print(results_ss.summary()) # ### IV subroutine from linearmodels # In[26]: from linearmodels.iv import IV2SLS iv = IV2SLS(dependent=df4['logpgp95'], exog=df4['const'], endog=df4['avexpr'], instruments=df4['logem4']).fit(cov_type='unadjusted') print(iv.summary) # ### IV Formula syntax # In[27]: from statsmodels.api import add_constant from linearmodels.datasets import mroz print(mroz.DESCR) data = mroz.load() data = data.dropna() data = add_constant(data, has_constant='add') data['lnwage']=np.log(data.wage) # In[28]: y = np.log(data.wage).as_matrix() X = data[['const','educ']].as_matrix() beta = np.linalg.inv(X.T @ X)@X.T@y beta # In[29]: data.head() # In[30]: res_ols = sm.formula.ols('lnwage ~ educ', data=data).fit() print(res_ols.summary()) # In[31]: first_stage = sm.OLS(data.educ, data[['fatheduc','const']]).fit() print(first_stage.summary()) b_fs = first_stage.params[0] data['educ_hat'] = first_stage.predict() # In[32]: reduced_form = sm.OLS(np.log(data.wage), data[['fatheduc','const']]).fit() b_rf = reduced_form.params[0] print(reduced_form.summary()) # In[33]: second_stage = sm.OLS(np.log(data.wage), data[['educ_hat','const']]).fit() b_ss = second_stage.params[0] print(second_stage.summary()) # $$ # \beta_{iv} = \frac{cov(y_i,z_i)}{cov(x_i,z_i)}= \frac{\beta_{rf}}{\beta_{fs}} = \frac{cov(y_i,\hat{x_i})}{var(\hat{x_i})}$$ # ### calculate $\beta_{iv}$ in all 3 ways # In[34]: iv = IV2SLS.from_formula('lnwage ~ 1 + [educ ~ fatheduc]', data=data).fit(cov_type='unadjusted') print(iv.summary) # In[35]: iv.params[1] # In[36]: b_iv = b_rf/b_fs b_iv # In[37]: b_ss # Interaction term # In[53]: sm.ols('lnwage ~ educ + hours * kidsge6 + huseduc', data = data).fit(cov_type='HC1').summary() # ## Fixed effects ; Standard Errors (Robust, Clustered, Multiway clustered) # Verbatim fork of [this repo](https://github.com/vgreg/python-se) for reference. Will likely attempt to implement Conley(1999) in python # In[51]: import statsmodels.formula.api as sm import statsmodels.stats.sandwich_covariance as sw # In[52]: from urllib.request import urlopen with urlopen('http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt') as conn: df = pd.read_table(conn, names=['firmid','year','x','y'], delim_whitespace=True) df.head() # In[45]: ols = sm.ols(formula='y ~ x', data=df).fit(use_t=True) ols.summary() # White standard errors (` reg y x, robust`) # In[46]: robust_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='HC1', use_t=True) robust_ols.summary() # Clustered # In[47]: cluster_firm_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster', cov_kwds={'groups': df['firmid']}, use_t=True) cluster_firm_ols.summary() # In[48]: cluster_year_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster', cov_kwds={'groups': df['year']}, use_t=True) cluster_year_ols.summary() # In[49]: cluster_2ways_ols = sm.ols(formula='y ~ x', data=df).fit(cov_type='cluster', cov_kwds={'groups': np.array(df[['firmid', 'year']])}, use_t=True) cluster_2ways_ols.summary() # Fixed effects (Python really needs an equivalent to `areg` / `reghdfe` / `lfe` to absorb estimates) # In[50]: year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(use_t=True) year_fe_ols.summary() # In[45]: firm_cluster_year_fe_ols = sm.ols(formula='y ~ x + C(year)', data=df).fit(cov_type='cluster', cov_kwds={'groups': df['firmid']}, use_t=True) firm_cluster_year_fe_ols.summary() # In[ ]: