#%% standard imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
#%%
/home/alal/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
%cd '/home/alal/Desktop/code/tutorials/python/Sheppard_Python_econometrics'
/home/alal/Desktop/code/tutorials/python/Sheppard_Python_econometrics
state_gdp = pd.read_excel('US_state_GDP.xls', 'Sheet1')
state_gdp.shape
state_gdp.head()
(51, 11)
state_code | state | gdp_2009 | gdp_2010 | gdp_2011 | gdp_2012 | gdp_growth_2009 | gdp_growth_2010 | gdp_growth_2011 | gdp_growth_2012 | region | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | AK | Alaska | 44215 | 43472 | 44232 | 44732 | 7.7 | -1.7 | 1.7 | 1.1 | FW |
1 | AL | Alabama | 149843 | 153839 | 155390 | 157272 | -3.9 | 2.7 | 1.0 | 1.2 | SE |
2 | AR | Arkansas | 89776 | 92075 | 92684 | 93892 | -2.0 | 2.6 | 0.7 | 1.3 | SE |
3 | AZ | Arizona | 221405 | 221016 | 224787 | 230641 | -8.2 | -0.2 | 1.7 | 2.6 | SW |
4 | CA | California | 1667152 | 1672473 | 1692301 | 1751002 | -5.1 | 0.3 | 1.2 | 3.5 | FW |
state_long_recession = state_gdp['gdp_growth_2010'] < 0
state_long_recession.shape
(51,)
Basic indexing operations
state_gdp_2012 = state_gdp[['state', 'gdp_2012','gdp_growth_2012']].copy()
state_gdp_2012.head()
state | gdp_2012 | gdp_growth_2012 | |
---|---|---|---|
0 | Alaska | 44732 | 1.1 |
1 | Alabama | 157272 | 1.2 |
2 | Arkansas | 93892 | 1.3 |
3 | Arizona | 230641 | 2.6 |
4 | California | 1751002 | 3.5 |
state_gdp.state_code.head()
type(state_gdp.state_code)
0 AK 1 AL 2 AR 3 AZ 4 CA Name: state_code, dtype: object
pandas.core.series.Series
state_gdp.loc[state_long_recession, ['state', 'gdp_growth_2009', 'gdp_growth_2010']]
state | gdp_growth_2009 | gdp_growth_2010 | |
---|---|---|---|
0 | Alaska | 7.7 | -1.7 |
3 | Arizona | -8.2 | -0.2 |
33 | Nevada | -8.2 | -0.4 |
50 | Wyoming | 3.4 | -1.3 |
state_gdp.iloc[10:15, :2]
state_code | state | |
---|---|---|
10 | GA | Georgia |
11 | HI | Hawaii |
12 | IA | Iowa |
13 | ID | Idaho |
14 | IL | Illinois |
Grouping operations
subset = state_gdp[['state', 'gdp_growth_2009', 'gdp_growth_2010', 'region']]
subset.head()
subset.pivot_table(index='region').head()
state | gdp_growth_2009 | gdp_growth_2010 | region | |
---|---|---|---|---|
0 | Alaska | 7.7 | -1.7 | FW |
1 | Alabama | -3.9 | 2.7 | SE |
2 | Arkansas | -2.0 | 2.6 | SE |
3 | Arizona | -8.2 | -0.2 | SW |
4 | California | -5.1 | 0.3 | FW |
gdp_growth_2009 | gdp_growth_2010 | |
---|---|---|
region | ||
FW | -2.483333 | 1.550000 |
GL | -5.400000 | 3.660000 |
MW | -1.250000 | 2.433333 |
NE | -2.350000 | 2.783333 |
PL | -1.357143 | 2.900000 |
grouped_data = subset.groupby('region')
region_avg_growth = grouped_data.gdp_growth_2009.mean()
region_avg_growth.head()
region FW -2.483333 GL -5.400000 MW -1.250000 NE -2.350000 PL -1.357143 Name: gdp_growth_2009, dtype: float64
grouped_data.gdp_growth_2009.first()
#%% bys generate equivalent
subset['region_avg_2009'] = subset.groupby(
'region')['gdp_growth_2009'].transform('mean')
region FW 7.7 GL -3.4 MW -0.7 NE -3.6 PL -1.6 RM -2.2 SE -3.9 SW -8.2 Name: gdp_growth_2009, dtype: float64
/home/alal/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy after removing the cwd from sys.path.
subset['diff_regional_avg_2009'] = subset.gdp_growth_2009 - \
subset.region_avg_2009
subset.head()
/home/alal/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel.
state | gdp_growth_2009 | gdp_growth_2010 | region | region_avg_2009 | diff_regional_avg_2009 | |
---|---|---|---|---|---|---|
0 | Alaska | 7.7 | -1.7 | FW | -2.483333 | 10.183333 |
1 | Alabama | -3.9 | 2.7 | SE | -2.633333 | -1.266667 |
2 | Arkansas | -2.0 | 2.6 | SE | -2.633333 | 0.633333 |
3 | Arizona | -8.2 | -0.2 | SW | -2.175000 | -6.025000 |
4 | California | -5.1 | 0.3 | FW | -2.483333 | -2.616667 |
Regressions
df = pd.read_csv(
'http://vincentarelbundock.github.io/Rdatasets/csv/datasets/longley.csv', index_col=0)
df.head()
df.info()
GNP.deflator | GNP | Unemployed | Armed.Forces | Population | Year | Employed | |
---|---|---|---|---|---|---|---|
1947 | 83.0 | 234.289 | 235.6 | 159.0 | 107.608 | 1947 | 60.323 |
1948 | 88.5 | 259.426 | 232.5 | 145.6 | 108.632 | 1948 | 61.122 |
1949 | 88.2 | 258.054 | 368.2 | 161.6 | 109.773 | 1949 | 60.171 |
1950 | 89.5 | 284.599 | 335.1 | 165.0 | 110.929 | 1950 | 61.187 |
1951 | 96.2 | 328.975 | 209.9 | 309.9 | 112.075 | 1951 | 63.221 |
<class 'pandas.core.frame.DataFrame'> Int64Index: 16 entries, 1947 to 1962 Data columns (total 7 columns): GNP.deflator 16 non-null float64 GNP 16 non-null float64 Unemployed 16 non-null float64 Armed.Forces 16 non-null float64 Population 16 non-null float64 Year 16 non-null int64 Employed 16 non-null float64 dtypes: float64(6), int64(1) memory usage: 1.0 KB
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7f5e4a0abf28>
est = smf.ols(formula='Employed ~ GNP', data=df).fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: Employed R-squared: 0.967 Model: OLS Adj. R-squared: 0.965 Method: Least Squares F-statistic: 415.1 Date: Fri, 30 Mar 2018 Prob (F-statistic): 8.36e-12 Time: 11:18:30 Log-Likelihood: -14.904 No. Observations: 16 AIC: 33.81 Df Residuals: 14 BIC: 35.35 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 51.8436 0.681 76.087 0.000 50.382 53.305 GNP 0.0348 0.002 20.374 0.000 0.031 0.038 ============================================================================== Omnibus: 1.925 Durbin-Watson: 1.619 Prob(Omnibus): 0.382 Jarque-Bera (JB): 1.215 Skew: 0.664 Prob(JB): 0.545 Kurtosis: 2.759 Cond. No. 1.66e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.66e+03. This might indicate that there are strong multicollinearity or other numerical problems.
/home/alal/anaconda3/lib/python3.6/site-packages/scipy/stats/stats.py:1390: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=16 "anyway, n=%i" % int(n))
df['y_hat'] = est.fittedvalues
df['e_hat'] = est.resid
plt.scatter(df.GNP, df.Employed, alpha=0.3) # Plot the raw data
plt.xlabel("Gross National Product")
plt.ylabel("Total Employment")
plt.plot(df.GNP, df.y_hat, 'r', alpha=0.9)
<matplotlib.collections.PathCollection at 0x7f5e4a0976d8>
Text(0.5,0,'Gross National Product')
Text(0,0.5,'Total Employment')
[<matplotlib.lines.Line2D at 0x7f5e4a097b70>]
Standard erros
data = sm.datasets.statecrime.load_pandas()
data.endog_name
data.exog_name
/home/alal/anaconda3/lib/python3.6/site-packages/numpy/lib/npyio.py:2222: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default. output = genfromtxt(fname, **kwargs)
'murder'
['urban', 'poverty', 'hs_grad', 'single']
mod = sm.OLS(data.endog, sm.add_constant(data.exog))
res = mod.fit()
print(res.summary())
OLS Regression Results ============================================================================== Dep. Variable: murder R-squared: 0.813 Model: OLS Adj. R-squared: 0.797 Method: Least Squares F-statistic: 50.08 Date: Thu, 15 Mar 2018 Prob (F-statistic): 3.42e-16 Time: 16:19:38 Log-Likelihood: -95.050 No. Observations: 51 AIC: 200.1 Df Residuals: 46 BIC: 209.8 Df Model: 4 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -44.1024 12.086 -3.649 0.001 -68.430 -19.774 urban 0.0109 0.015 0.707 0.483 -0.020 0.042 poverty 0.4121 0.140 2.939 0.005 0.130 0.694 hs_grad 0.3059 0.117 2.611 0.012 0.070 0.542 single 0.6374 0.070 9.065 0.000 0.496 0.779 ============================================================================== Omnibus: 1.618 Durbin-Watson: 2.507 Prob(Omnibus): 0.445 Jarque-Bera (JB): 0.831 Skew: -0.220 Prob(JB): 0.660 Kurtosis: 3.445 Cond. No. 5.80e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.8e+03. This might indicate that there are strong multicollinearity or other numerical problems.
df = pd.concat([data.endog, data.exog], 1)
mod = smf.ols('murder ~ 1+ urban + single', data=df)
res = mod.fit(cov_type='HC1')
print(res.summary())
OLS Regression Results ============================================================================== Dep. Variable: murder R-squared: 0.776 Model: OLS Adj. R-squared: 0.766 Method: Least Squares F-statistic: 25.52 Date: Fri, 30 Mar 2018 Prob (F-statistic): 2.83e-08 Time: 11:19:39 Log-Likelihood: -99.713 No. Observations: 51 AIC: 205.4 Df Residuals: 48 BIC: 211.2 Df Model: 2 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -11.7079 2.217 -5.282 0.000 -16.052 -7.363 urban -0.0152 0.015 -1.005 0.315 -0.045 0.014 single 0.6961 0.102 6.803 0.000 0.496 0.897 ============================================================================== Omnibus: 0.909 Durbin-Watson: 2.535 Prob(Omnibus): 0.635 Jarque-Bera (JB): 0.489 Skew: -0.234 Prob(JB): 0.783 Kurtosis: 3.104 Cond. No. 375. ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)