!pip install linearmodels import matplotlib.pyplot as plt plt.rcParams["figure.figsize"] = (11, 5) #set default figure size import numpy as np import pandas as pd import statsmodels.api as sm from statsmodels.iolib.summary2 import summary_col from linearmodels.iv import IV2SLS import seaborn as sns sns.set_theme() df1 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable1.dta?raw=true') df1.head() df1.plot(x='avexpr', y='logpgp95', kind='scatter') plt.show() # Dropping NA's is required to use numpy's polyfit df1_subset = df1.dropna(subset=['logpgp95', 'avexpr']) # Use only 'base sample' for plotting purposes df1_subset = df1_subset[df1_subset['baseco'] == 1] X = df1_subset['avexpr'] y = df1_subset['logpgp95'] labels = df1_subset['shortnam'] # Replace markers with country labels fig, ax = plt.subplots() ax.scatter(X, y, marker='') for i, label in enumerate(labels): ax.annotate(label, (X.iloc[i], y.iloc[i])) # Fit a linear trend line ax.plot(np.unique(X), np.poly1d(np.polyfit(X, y, 1))(np.unique(X)), color='black') ax.set_xlim([3.3,10.5]) ax.set_ylim([4,10.5]) ax.set_xlabel('Average Expropriation Risk 1985-95') ax.set_ylabel('Log GDP per capita, PPP, 1995') ax.set_title('Figure 2: OLS relationship between expropriation \ risk and income') plt.show() df1['const'] = 1 reg1 = sm.OLS(endog=df1['logpgp95'], exog=df1[['const', 'avexpr']], \ missing='drop') type(reg1) results = reg1.fit() type(results) print(results.summary()) mean_expr = np.mean(df1_subset['avexpr']) mean_expr predicted_logpdp95 = 4.63 + 0.53 * 7.07 predicted_logpdp95 results.predict(exog=[1, mean_expr]) # Drop missing observations from whole sample df1_plot = df1.dropna(subset=['logpgp95', 'avexpr']) # Plot predicted values fix, ax = plt.subplots() ax.scatter(df1_plot['avexpr'], results.predict(), alpha=0.5, label='predicted') # Plot observed values ax.scatter(df1_plot['avexpr'], df1_plot['logpgp95'], alpha=0.5, label='observed') ax.legend() ax.set_title('OLS predicted values') ax.set_xlabel('avexpr') ax.set_ylabel('logpgp95') plt.show() df2 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable2.dta?raw=true') # Add constant term to dataset df2['const'] = 1 # Create lists of variables to be used in each regression X1 = ['const', 'avexpr'] X2 = ['const', 'avexpr', 'lat_abst'] X3 = ['const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other'] # Estimate an OLS regression for each set of variables reg1 = sm.OLS(df2['logpgp95'], df2[X1], missing='drop').fit() reg2 = sm.OLS(df2['logpgp95'], df2[X2], missing='drop').fit() reg3 = sm.OLS(df2['logpgp95'], df2[X3], missing='drop').fit() info_dict={'R-squared' : lambda x: f"{x.rsquared:.2f}", 'No. observations' : lambda x: f"{int(x.nobs):d}"} results_table = summary_col(results=[reg1,reg2,reg3], float_format='%0.2f', stars = True, model_names=['Model 1', 'Model 3', 'Model 4'], info_dict=info_dict, regressor_order=['const', 'avexpr', 'lat_abst', 'asia', 'africa']) results_table.add_title('Table 2 - OLS Regressions') print(results_table) # Dropping NA's is required to use numpy's polyfit df1_subset2 = df1.dropna(subset=['logem4', 'avexpr']) X = df1_subset2['logem4'] y = df1_subset2['avexpr'] labels = df1_subset2['shortnam'] # Replace markers with country labels fig, ax = plt.subplots() ax.scatter(X, y, marker='') for i, label in enumerate(labels): ax.annotate(label, (X.iloc[i], y.iloc[i])) # Fit a linear trend line ax.plot(np.unique(X), np.poly1d(np.polyfit(X, y, 1))(np.unique(X)), color='black') ax.set_xlim([1.8,8.4]) ax.set_ylim([3.3,10.4]) ax.set_xlabel('Log of Settler Mortality') ax.set_ylabel('Average Expropriation Risk 1985-95') ax.set_title('Figure 3: First-stage relationship between settler mortality \ and expropriation risk') plt.show() # Import and select the data df4 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable4.dta?raw=true') df4 = df4[df4['baseco'] == 1] # Add a constant variable df4['const'] = 1 # Fit the first stage regression and print summary results_fs = sm.OLS(df4['avexpr'], df4[['const', 'logem4']], missing='drop').fit() print(results_fs.summary()) df4['predicted_avexpr'] = results_fs.predict() results_ss = sm.OLS(df4['logpgp95'], df4[['const', 'predicted_avexpr']]).fit() print(results_ss.summary()) iv = IV2SLS(dependent=df4['logpgp95'], exog=df4['const'], endog=df4['avexpr'], instruments=df4['logem4']).fit(cov_type='unadjusted') print(iv.summary) # Load in data df4 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable4.dta?raw=true') # Add a constant term df4['const'] = 1 # Estimate the first stage regression reg1 = sm.OLS(endog=df4['avexpr'], exog=df4[['const', 'logem4']], missing='drop').fit() # Retrieve the residuals df4['resid'] = reg1.resid # Estimate the second stage residuals reg2 = sm.OLS(endog=df4['logpgp95'], exog=df4[['const', 'avexpr', 'resid']], missing='drop').fit() print(reg2.summary()) # Load in data df1 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable1.dta?raw=true') df1 = df1.dropna(subset=['logpgp95', 'avexpr']) # Add a constant term df1['const'] = 1 # Define the X and y variables y = np.asarray(df1['logpgp95']) X = np.asarray(df1[['const', 'avexpr']]) # Compute β_hat β_hat = np.linalg.solve(X.T @ X, X.T @ y) # Print out the results from the 2 x 1 vector β_hat print(f'β_0 = {β_hat[0]:.2}') print(f'β_1 = {β_hat[1]:.2}')