In [1]:

#%% standard imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
#%%

/home/alal/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [2]:

%cd '/home/alal/Desktop/code/tutorials/python/Sheppard_Python_econometrics'

/home/alal/Desktop/code/tutorials/python/Sheppard_Python_econometrics

In [3]:

state_gdp = pd.read_excel('US_state_GDP.xls', 'Sheet1')
state_gdp.shape
state_gdp.head()

Out[3]:

(51, 11)

Out[3]:

	state_code	state	gdp_2009	gdp_2010	gdp_2011	gdp_2012	gdp_growth_2009	gdp_growth_2010	gdp_growth_2011	gdp_growth_2012	region
0	AK	Alaska	44215	43472	44232	44732	7.7	-1.7	1.7	1.1	FW
1	AL	Alabama	149843	153839	155390	157272	-3.9	2.7	1.0	1.2	SE
2	AR	Arkansas	89776	92075	92684	93892	-2.0	2.6	0.7	1.3	SE
3	AZ	Arizona	221405	221016	224787	230641	-8.2	-0.2	1.7	2.6	SW
4	CA	California	1667152	1672473	1692301	1751002	-5.1	0.3	1.2	3.5	FW

In [4]:

state_long_recession = state_gdp['gdp_growth_2010'] < 0
state_long_recession.shape

Out[4]:

(51,)

Basic indexing operations

In [5]:

state_gdp_2012 = state_gdp[['state', 'gdp_2012','gdp_growth_2012']].copy()
state_gdp_2012.head()

Out[5]:

	state	gdp_2012	gdp_growth_2012
0	Alaska	44732	1.1
1	Alabama	157272	1.2
2	Arkansas	93892	1.3
3	Arizona	230641	2.6
4	California	1751002	3.5

In [6]:

state_gdp.state_code.head()
type(state_gdp.state_code)

Out[6]:

0    AK
1    AL
2    AR
3    AZ
4    CA
Name: state_code, dtype: object

Out[6]:

pandas.core.series.Series

In [7]:

state_gdp.loc[state_long_recession, ['state', 'gdp_growth_2009', 'gdp_growth_2010']]

Out[7]:

	state	gdp_growth_2009	gdp_growth_2010
0	Alaska	7.7	-1.7
3	Arizona	-8.2	-0.2
33	Nevada	-8.2	-0.4
50	Wyoming	3.4	-1.3

In [8]:

state_gdp.iloc[10:15, :2]

Out[8]:

	state_code	state
10	GA	Georgia
11	HI	Hawaii
12	IA	Iowa
13	ID	Idaho
14	IL	Illinois

Grouping operations

In [9]:

subset = state_gdp[['state', 'gdp_growth_2009', 'gdp_growth_2010', 'region']]
subset.head()
subset.pivot_table(index='region').head()

Out[9]:

	state	gdp_growth_2009	gdp_growth_2010	region
0	Alaska	7.7	-1.7	FW
1	Alabama	-3.9	2.7	SE
2	Arkansas	-2.0	2.6	SE
3	Arizona	-8.2	-0.2	SW
4	California	-5.1	0.3	FW

Out[9]:

	gdp_growth_2009	gdp_growth_2010
region
FW	-2.483333	1.550000
GL	-5.400000	3.660000
MW	-1.250000	2.433333
NE	-2.350000	2.783333
PL	-1.357143	2.900000

In [10]:

grouped_data = subset.groupby('region')
region_avg_growth = grouped_data.gdp_growth_2009.mean()
region_avg_growth.head()

Out[10]:

region
FW   -2.483333
GL   -5.400000
MW   -1.250000
NE   -2.350000
PL   -1.357143
Name: gdp_growth_2009, dtype: float64

In [11]:

grouped_data.gdp_growth_2009.first()
#%% bys generate equivalent
subset['region_avg_2009'] = subset.groupby(
    'region')['gdp_growth_2009'].transform('mean')

Out[11]:

region
FW    7.7
GL   -3.4
MW   -0.7
NE   -3.6
PL   -1.6
RM   -2.2
SE   -3.9
SW   -8.2
Name: gdp_growth_2009, dtype: float64

/home/alal/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.

In [12]:

subset['diff_regional_avg_2009'] = subset.gdp_growth_2009 - \
    subset.region_avg_2009
subset.head()

/home/alal/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

Out[12]:

	state	gdp_growth_2009	gdp_growth_2010	region	region_avg_2009	diff_regional_avg_2009
0	Alaska	7.7	-1.7	FW	-2.483333	10.183333
1	Alabama	-3.9	2.7	SE	-2.633333	-1.266667
2	Arkansas	-2.0	2.6	SE	-2.633333	0.633333
3	Arizona	-8.2	-0.2	SW	-2.175000	-6.025000
4	California	-5.1	0.3	FW	-2.483333	-2.616667

Regressions

In [22]:

df = pd.read_csv(
    'http://vincentarelbundock.github.io/Rdatasets/csv/datasets/longley.csv', index_col=0)
df.head()
df.info()

Out[22]:

	GNP.deflator	GNP	Unemployed	Armed.Forces	Population	Year	Employed
1947	83.0	234.289	235.6	159.0	107.608	1947	60.323
1948	88.5	259.426	232.5	145.6	108.632	1948	61.122
1949	88.2	258.054	368.2	161.6	109.773	1949	60.171
1950	89.5	284.599	335.1	165.0	110.929	1950	61.187
1951	96.2	328.975	209.9	309.9	112.075	1951	63.221

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 1947 to 1962
Data columns (total 7 columns):
GNP.deflator    16 non-null float64
GNP             16 non-null float64
Unemployed      16 non-null float64
Armed.Forces    16 non-null float64
Population      16 non-null float64
Year            16 non-null int64
Employed        16 non-null float64
dtypes: float64(6), int64(1)
memory usage: 1.0 KB

In [21]:

sns.pairplot(df)

Out[21]:

<seaborn.axisgrid.PairGrid at 0x7f5e4a0abf28>

In [14]:

est = smf.ols(formula='Employed ~ GNP', data=df).fit()
print(est.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:               Employed   R-squared:                       0.967
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                     415.1
Date:                Fri, 30 Mar 2018   Prob (F-statistic):           8.36e-12
Time:                        11:18:30   Log-Likelihood:                -14.904
No. Observations:                  16   AIC:                             33.81
Df Residuals:                      14   BIC:                             35.35
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     51.8436      0.681     76.087      0.000      50.382      53.305
GNP            0.0348      0.002     20.374      0.000       0.031       0.038
==============================================================================
Omnibus:                        1.925   Durbin-Watson:                   1.619
Prob(Omnibus):                  0.382   Jarque-Bera (JB):                1.215
Skew:                           0.664   Prob(JB):                        0.545
Kurtosis:                       2.759   Cond. No.                     1.66e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.66e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

/home/alal/anaconda3/lib/python3.6/site-packages/scipy/stats/stats.py:1390: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=16
  "anyway, n=%i" % int(n))

In [15]:

df['y_hat'] = est.fittedvalues
df['e_hat'] = est.resid
plt.scatter(df.GNP, df.Employed, alpha=0.3)  # Plot the raw data
plt.xlabel("Gross National Product")
plt.ylabel("Total Employment")

plt.plot(df.GNP, df.y_hat, 'r', alpha=0.9)

Out[15]:

<matplotlib.collections.PathCollection at 0x7f5e4a0976d8>

Out[15]:

Text(0.5,0,'Gross National Product')

Out[15]:

Text(0,0.5,'Total Employment')

Out[15]:

[<matplotlib.lines.Line2D at 0x7f5e4a097b70>]

Standard erros

In [16]:

data = sm.datasets.statecrime.load_pandas()
data.endog_name
data.exog_name

/home/alal/anaconda3/lib/python3.6/site-packages/numpy/lib/npyio.py:2222: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default.
  output = genfromtxt(fname, **kwargs)

Out[16]:

'murder'

Out[16]:

['urban', 'poverty', 'hs_grad', 'single']

In [31]:

mod = sm.OLS(data.endog, sm.add_constant(data.exog))
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 murder   R-squared:                       0.813
Model:                            OLS   Adj. R-squared:                  0.797
Method:                 Least Squares   F-statistic:                     50.08
Date:                Thu, 15 Mar 2018   Prob (F-statistic):           3.42e-16
Time:                        16:19:38   Log-Likelihood:                -95.050
No. Observations:                  51   AIC:                             200.1
Df Residuals:                      46   BIC:                             209.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -44.1024     12.086     -3.649      0.001     -68.430     -19.774
urban          0.0109      0.015      0.707      0.483      -0.020       0.042
poverty        0.4121      0.140      2.939      0.005       0.130       0.694
hs_grad        0.3059      0.117      2.611      0.012       0.070       0.542
single         0.6374      0.070      9.065      0.000       0.496       0.779
==============================================================================
Omnibus:                        1.618   Durbin-Watson:                   2.507
Prob(Omnibus):                  0.445   Jarque-Bera (JB):                0.831
Skew:                          -0.220   Prob(JB):                        0.660
Kurtosis:                       3.445   Cond. No.                     5.80e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.8e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

In [19]:

df = pd.concat([data.endog, data.exog], 1)
mod = smf.ols('murder ~ 1+ urban + single', data=df)
res = mod.fit(cov_type='HC1')
print(res.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 murder   R-squared:                       0.776
Model:                            OLS   Adj. R-squared:                  0.766
Method:                 Least Squares   F-statistic:                     25.52
Date:                Fri, 30 Mar 2018   Prob (F-statistic):           2.83e-08
Time:                        11:19:39   Log-Likelihood:                -99.713
No. Observations:                  51   AIC:                             205.4
Df Residuals:                      48   BIC:                             211.2
Df Model:                           2                                         
Covariance Type:                  HC1                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -11.7079      2.217     -5.282      0.000     -16.052      -7.363
urban         -0.0152      0.015     -1.005      0.315      -0.045       0.014
single         0.6961      0.102      6.803      0.000       0.496       0.897
==============================================================================
Omnibus:                        0.909   Durbin-Watson:                   2.535
Prob(Omnibus):                  0.635   Jarque-Bera (JB):                0.489
Skew:                          -0.234   Prob(JB):                        0.783
Kurtosis:                       3.104   Cond. No.                         375.
==============================================================================

Warnings:
[1] Standard Errors are heteroscedasticity robust (HC1)

In [ ]: