Chapter 3 - Linear Regression

Imports and Configurations

In [1]:
# Standard imports
import warnings

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from statsmodels.stats.anova import anova_lm

# scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Visualization
from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import statsmodels.graphics.api as smg

Utility Function Definitions

In [2]:
def ortho_poly_fit(x, degree = 1):
    '''
    Convert data into orthogonal basis for polynomial regression by QR decomposition.
    Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
    '''
    n = degree + 1
    x = np.asarray(x).flatten()
    if degree >= len(np.unique(x)):
        raise ValueError("'degree' must be less than the number of unique points")
    xbar = np.mean(x)
    x = x - xbar
    X = np.fliplr(np.vander(x, n))
    q,r = np.linalg.qr(X)

    z = np.diag(np.diag(r))
    raw = np.dot(q, z)

    norm2 = np.sum(raw**2, axis=0)
    alpha = (np.sum((raw**2)*np.reshape(x,(-1,1)), axis=0)/norm2 + xbar)[:degree]
    Z = raw / np.sqrt(norm2)
    return Z, norm2, alpha

def ortho_poly_predict(x, alpha, norm2, degree = 1):
    '''
    Convert new data for prediction into orthogonal basis, based on input parameters.
    Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
    '''
    x = np.asarray(x).flatten()
    n = degree + 1
    Z = np.empty((len(x), n))
    Z[:,0] = 1
    if degree > 0:
        Z[:, 1] = x - alpha[0]
    if degree > 1:
        for i in np.arange(1,degree):
            Z[:, i+1] = (x - alpha[i]) * Z[:, i] - (norm2[i] / norm2[i-1]) * Z[:, i-1]
    Z /= np.sqrt(norm2)
    return Z
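
As a quick sanity check on these helpers (a minimal sketch; x_demo is an arbitrary illustrative input), the basis returned by ortho_poly_fit should have orthonormal columns, and ortho_poly_predict applied to the training points should reproduce the same basis:

# Sanity check: orthonormal columns, and predict() on the training x
# reproduces the fitted basis via the three-term recurrence.
x_demo = np.arange(10.0)
Z, norm2, alpha = ortho_poly_fit(x_demo, degree=3)
print(np.allclose(Z.T @ Z, np.eye(4)))                  # True: columns are orthonormal
Z_new = ortho_poly_predict(x_demo, alpha, norm2, degree=3)
print(np.allclose(Z_new, Z))                            # True: recurrence matches the QR basis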

Lab 3.6.2 Simple Linear Regression

In [3]:
# Import Boston house price data from R package MASS
# This dataset is also available at https://archive.ics.uci.edu/ml/datasets/Housing
mass = importr('MASS')
boston_rdf = rdata(mass).fetch('Boston')['Boston']
boston = pandas2ri.ri2py(boston_rdf)
In [4]:
# boston.describe()  # Summary statistics in text.
# display(boston)    # Head and tail in a data table.
HTML(boston.to_html(max_rows=60))  # Render the data as an HTML table, truncated to 60 rows.
Out[4]:
        crim    zn  indus  chas    nox     rm   age     dis  rad    tax  ptratio   black  lstat  medv
1    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0     15.3  396.90   4.98  24.0
2    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0     17.8  396.90   9.14  21.6
3    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0     17.8  392.83   4.03  34.7
4    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0     18.7  394.63   2.94  33.4
5    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0     18.7  396.90   5.33  36.2
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...      ...     ...    ...   ...
505  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273.0     21.0  393.45   6.48  22.0
506  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273.0     21.0  396.90   7.88  11.9

506 rows × 14 columns
In [5]:
boston.info()  # Summary of the data
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 1 to 506
Data columns (total 14 columns):
crim       506 non-null float64
zn         506 non-null float64
indus      506 non-null float64
chas       506 non-null int32
nox        506 non-null float64
rm         506 non-null float64
age        506 non-null float64
dis        506 non-null float64
rad        506 non-null int32
tax        506 non-null float64
ptratio    506 non-null float64
black      506 non-null float64
lstat      506 non-null float64
medv       506 non-null float64
dtypes: float64(12), int32(2)
memory usage: 55.3 KB
In [6]:
# Simple linear regression and plot using seaborn
sns.lmplot(x='lstat', y='medv', data=boston, scatter_kws={'color':'r', 's':15});
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()
plt.figure()
sns.residplot(x='lstat', y='medv', data=boston)
plt.show()
In [7]:
# Simple linear regression with scikit-learn
boston_sk = LinearRegression()
nrow = boston.shape[0]
boston_sk.fit(boston['lstat'].values.reshape(nrow, 1), boston['medv'].values.reshape(nrow, 1))
print("Slope: {}, Intercept: {}".format(boston_sk.coef_, boston_sk.intercept_))
Slope: [[-0.95004935]], Intercept: [ 34.55384088]
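
The mean_squared_error and r2_score imported earlier can score this fit. A brief sketch, evaluated on the training data itself (so this is in-sample, not test, performance):

# In-sample MSE and R^2 for the scikit-learn fit
X_lstat = boston['lstat'].values.reshape(-1, 1)
y_medv = boston['medv'].values.reshape(-1, 1)
y_pred = boston_sk.predict(X_lstat)
print("MSE: {:.3f}, R^2: {:.3f}".format(mean_squared_error(y_medv, y_pred),
                                        r2_score(y_medv, y_pred)))
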
In [8]:
# Simple linear regression with StatsModels
boston_smslr = smf.ols('medv ~ lstat', data=boston).fit()
print(boston_smslr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smslr.mse_resid))
print("\nResiduals:")
display(boston_smslr.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.544
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                     601.6
Date:                Tue, 28 Feb 2017   Prob (F-statistic):           5.08e-88
Time:                        16:53:32   Log-Likelihood:                -1641.5
No. Observations:                 506   AIC:                             3287.
Df Residuals:                     504   BIC:                             3295.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     34.5538      0.563     61.415      0.000        33.448    35.659
lstat         -0.9500      0.039    -24.528      0.000        -1.026    -0.874
==============================================================================
Omnibus:                      137.043   Durbin-Watson:                   0.892
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              291.373
Skew:                           1.453   Prob(JB):                     5.36e-64
Kurtosis:                       5.319   Cond. No.                         29.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Residual Standard Error:
6.2157604054

Residuals:
min   -15.167452
25%    -3.989612
50%    -1.318186
75%     2.033701
max    24.500129
dtype: float64
In [9]:
# Plot StatsModels OLS fitting results
fig, ax = plt.subplots()
plt.scatter(boston.lstat, boston.medv, color='r', s=15, alpha=0.5)  # Data points
smg.abline_plot(model_results=boston_smslr, ax=ax)  # Regression line
plt.xlim(0, 40)
plt.ylim(0, 55)
plt.xlabel('lstat')
plt.ylabel('medv')
plt.show()
In [10]:
# Confidence intervals for the coefficient estimates
# conf_int(alpha) returns the 100*(1 - alpha)% interval, so alpha=0.025 yields
# a 97.5% interval whose endpoints are the 1.25% and 98.75% quantiles.
ci = boston_smslr.conf_int(0.025)
ci.columns = ['1.25%', '98.75%']
print(ci)
               1.25%     98.75%
Intercept  33.288987  35.818694
lstat      -1.037127  -0.862972

Note that this interval is wider than the one R's confint() reports by default: conf_int(0.025) requests a 97.5% interval, whereas R defaults to 95%.
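
To reproduce R's default 95% interval, request alpha=0.05 instead; a one-line sketch:

# 95% interval, matching R's confint() default
ci95 = boston_smslr.conf_int(alpha=0.05)
ci95.columns = ['2.5%', '97.5%']
print(ci95)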

In [11]:
# Prediction at new values of lstat
fit = boston_smslr.predict(exog=dict(lstat=[5, 10, 15]))
print(fit)
[ 29.80359411  25.05334734  20.30310057]
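
To add the intervals R's predict() reports, get_prediction returns both confidence intervals for the mean response and prediction intervals for new observations (a sketch, assuming statsmodels 0.8 or later):

# Confidence and prediction intervals at lstat = 5, 10, 15
pred = boston_smslr.get_prediction(dict(lstat=[5, 10, 15]))
print(pred.summary_frame(alpha=0.05))  # mean_ci_* = confidence, obs_ci_* = prediction
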
In [12]:
# Diagnostic plots for simple linear regression
infl = boston_smslr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smslr.fittedvalues, boston_smslr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smslr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smslr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()
In [13]:
# Find the observation with the largest leverage statistic
# (+1 converts the 0-based position to the R-style 1-based row label)
infl.hat_matrix_diag.argmax() + 1
Out[13]:
375
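
The leverage values are the diagonal of the hat matrix H = X (X'X)^{-1} X' built from the design matrix; a short sketch verifying statsmodels against the closed form:

# Hat-matrix diagonal from the closed form, compared with statsmodels
X = boston_smslr.model.exog
h_diag = np.einsum('ij,jk,ik->i', X, np.linalg.inv(X.T @ X), X)
print(np.allclose(h_diag, infl.hat_matrix_diag))  # True
print(h_diag.argmax() + 1)                        # 375, as above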

Lab 3.6.3 Multiple Linear Regression

In [14]:
# Regression with two predictors on Boston dataset
boston_smmlr = smf.ols('medv ~ lstat + age', data=boston).fit()
print(boston_smmlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr.mse_resid))
print("\nResiduals:")
print(boston_smmlr.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.551
Model:                            OLS   Adj. R-squared:                  0.549
Method:                 Least Squares   F-statistic:                     309.0
Date:                Tue, 28 Feb 2017   Prob (F-statistic):           2.98e-88
Time:                        16:53:33   Log-Likelihood:                -1637.5
No. Observations:                 506   AIC:                             3281.
Df Residuals:                     503   BIC:                             3294.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     33.2228      0.731     45.458      0.000        31.787    34.659
lstat         -1.0321      0.048    -21.416      0.000        -1.127    -0.937
age            0.0345      0.012      2.826      0.005         0.011     0.059
==============================================================================
Omnibus:                      124.288   Durbin-Watson:                   0.945
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              244.026
Skew:                           1.362   Prob(JB):                     1.02e-53
Kurtosis:                       5.038   Cond. No.                         201.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Residual Standard Error:
6.17313628136

Residuals:
min   -15.981243
25%    -3.977470
50%    -1.283443
75%     1.968309
max    23.158419
dtype: float64
In [15]:
# Regression with all predictors on Boston dataset
all_predictors = '+'.join(boston.columns.drop('medv'))
boston_smmlr_all = smf.ols('medv ~ ' + all_predictors, data=boston).fit()
print(boston_smmlr_all.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_all.mse_resid))
print("\nResiduals:")
print(boston_smmlr_all.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Tue, 28 Feb 2017   Prob (F-statistic):          6.72e-135
Time:                        16:53:34   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     36.4595      5.103      7.144      0.000        26.432    46.487
crim          -0.1080      0.033     -3.287      0.001        -0.173    -0.043
zn             0.0464      0.014      3.382      0.001         0.019     0.073
indus          0.0206      0.061      0.334      0.738        -0.100     0.141
chas           2.6867      0.862      3.118      0.002         0.994     4.380
nox          -17.7666      3.820     -4.651      0.000       -25.272   -10.262
rm             3.8099      0.418      9.116      0.000         2.989     4.631
age            0.0007      0.013      0.052      0.958        -0.025     0.027
dis           -1.4756      0.199     -7.398      0.000        -1.867    -1.084
rad            0.3060      0.066      4.613      0.000         0.176     0.436
tax           -0.0123      0.004     -3.280      0.001        -0.020    -0.005
ptratio       -0.9527      0.131     -7.283      0.000        -1.210    -0.696
black          0.0093      0.003      3.467      0.001         0.004     0.015
lstat         -0.5248      0.051    -10.347      0.000        -0.624    -0.425
==============================================================================
Omnibus:                      178.041   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              783.126
Skew:                           1.521   Prob(JB):                    8.84e-171
Kurtosis:                       8.281   Cond. No.                     1.51e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

Residual Standard Error:
4.7452981817

Residuals:
min   -15.594474
25%    -2.729716
50%    -0.518049
75%     1.777051
max    26.199271
dtype: float64
In [16]:
# Compute variance inflation factors
exog = boston_smmlr_all.model.exog
exog_names = boston_smmlr_all.model.exog_names
vif_df = pd.DataFrame([vif(exog, ix) for ix in range(1, exog.shape[1])]).transpose()
vif_df.columns = exog_names[1:]
display(vif_df)
       crim        zn     indus      chas      nox        rm       age       dis       rad       tax   ptratio     black     lstat
0  1.792192  2.298758  3.991596  1.073995  4.39372  1.933744  3.100826  3.955945  7.484496  9.008554  1.799084  1.348521  2.941491
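
Each VIF equals 1 / (1 - R^2_j), where R^2_j is the R-squared from regressing predictor j on all the other predictors. A sketch checking the first entry by hand:

# VIF for crim by its definition
others = '+'.join(boston.columns.drop(['medv', 'crim']))
r2_crim = smf.ols('crim ~ ' + others, data=boston).fit().rsquared
print(1.0 / (1.0 - r2_crim))  # ~1.792, matching the table above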

Lab 3.6.4 Interaction Terms

In [17]:
# Regression with two interaction terms on Boston dataset
boston_smmlr_inter = smf.ols('medv ~ lstat * age', data=boston).fit()
print(boston_smmlr_inter.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_inter.mse_resid))
print("\nResiduals:")
print(boston_smmlr_inter.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.556
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                     209.3
Date:                Tue, 28 Feb 2017   Prob (F-statistic):           4.86e-88
Time:                        16:53:34   Log-Likelihood:                -1635.0
No. Observations:                 506   AIC:                             3278.
Df Residuals:                     502   BIC:                             3295.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     36.0885      1.470     24.553      0.000        33.201    38.976
lstat         -1.3921      0.167     -8.313      0.000        -1.721    -1.063
age           -0.0007      0.020     -0.036      0.971        -0.040     0.038
lstat:age      0.0042      0.002      2.244      0.025         0.001     0.008
==============================================================================
Omnibus:                      135.601   Durbin-Watson:                   0.965
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              296.955
Skew:                           1.417   Prob(JB):                     3.29e-65
Kurtosis:                       5.461   Cond. No.                     6.88e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.88e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Residual Standard Error:
6.14851326968

Residuals:
min   -15.806521
25%    -4.044681
50%    -1.333163
75%     2.084707
max    27.552059
dtype: float64
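
In patsy, as in R, lstat * age expands to the main effects plus their interaction, so spelling the formula out gives an identical fit; a quick check:

# lstat * age is shorthand for lstat + age + lstat:age
boston_expanded = smf.ols('medv ~ lstat + age + lstat:age', data=boston).fit()
print(np.allclose(boston_expanded.params, boston_smmlr_inter.params))  # True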

Lab 3.6.5 Non-linear Transformations of the Predictors

In [18]:
# Regression with quadratic term on Boston dataset
boston_smmlr_quad = smf.ols('medv ~ lstat + I(lstat**2)', data=boston).fit()
print(boston_smmlr_quad.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_quad.mse_resid))
print("\nResiduals:")
print(boston_smmlr_quad.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.639
Method:                 Least Squares   F-statistic:                     448.5
Date:                Tue, 28 Feb 2017   Prob (F-statistic):          1.56e-112
Time:                        16:53:34   Log-Likelihood:                -1581.3
No. Observations:                 506   AIC:                             3169.
Df Residuals:                     503   BIC:                             3181.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------
Intercept        42.8620      0.872     49.149      0.000        41.149    44.575
lstat            -2.3328      0.124    -18.843      0.000        -2.576    -2.090
I(lstat ** 2)     0.0435      0.004     11.628      0.000         0.036     0.051
==============================================================================
Omnibus:                      107.006   Durbin-Watson:                   0.921
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              228.388
Skew:                           1.128   Prob(JB):                     2.55e-50
Kurtosis:                       5.397   Cond. No.                     1.13e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Residual Standard Error:
5.52371413181

Residuals:
min   -15.283395
25%    -3.831307
50%    -0.529500
75%     2.309535
max    25.414810
dtype: float64
In [19]:
# ANOVA comparing the simple linear and quadratic fits
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")  # Suppress warnings
    display(anova_lm(boston_smslr, boston_smmlr_quad))
   df_resid           ssr  df_diff     ss_diff           F        Pr(>F)
0     504.0  19472.381418      0.0         NaN         NaN           NaN
1     503.0  15347.243158      1.0  4125.13826  135.199822  7.630116e-28
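
The F statistic in the table is just the drop in residual sum of squares per constrained parameter, scaled by the full model's residual variance; computing it by hand:

# F = (SSR_reduced - SSR_full) / df_diff, divided by SSR_full / df_resid_full
f_stat = (boston_smslr.ssr - boston_smmlr_quad.ssr) / 1.0 \
         / (boston_smmlr_quad.ssr / boston_smmlr_quad.df_resid)
print(f_stat)  # ~135.2, matching anova_lm
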
In [20]:
# Diagnostic plots for quadratic fit
infl = boston_smmlr_quad.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smmlr_quad.fittedvalues, boston_smmlr_quad.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smmlr_quad.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smmlr_quad, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()
In [21]:
# Regression with polynomial terms on Boston dataset.
# R's poly() produces orthogonal polynomials via QR decomposition by default.
# Without orthogonalization, powers of x are highly correlated, and regression
# on correlated predictors yields unstable coefficient estimates. In addition,
# large powers of x force very small regression coefficients, which risks
# underflow and makes the coefficients hard to interpret or compare.
# Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
poly_degree = 5
boston_smmlr_poly = smf.ols('medv ~ ortho_poly_fit(lstat, poly_degree)[0][:, 1:]', data=boston).fit()
print(boston_smmlr_poly.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_poly.mse_resid))
print("\nResiduals:")
print(boston_smmlr_poly.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.682
Model:                            OLS   Adj. R-squared:                  0.679
Method:                 Least Squares   F-statistic:                     214.2
Date:                Tue, 28 Feb 2017   Prob (F-statistic):          8.73e-122
Time:                        16:53:35   Log-Likelihood:                -1550.6
No. Observations:                 506   AIC:                             3113.
Df Residuals:                     500   BIC:                             3139.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
===================================================================================================================
                                                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------------------------------
Intercept                                          22.5328      0.232     97.197      0.000        22.077    22.988
ortho_poly_fit(lstat, poly_degree)[0][:, 1:][0]  -152.4595      5.215    -29.236      0.000      -162.705  -142.214
ortho_poly_fit(lstat, poly_degree)[0][:, 1:][1]    64.2272      5.215     12.316      0.000        53.982    74.473
ortho_poly_fit(lstat, poly_degree)[0][:, 1:][2]   -27.0511      5.215     -5.187      0.000       -37.297   -16.805
ortho_poly_fit(lstat, poly_degree)[0][:, 1:][3]    25.4517      5.215      4.881      0.000        15.206    35.697
ortho_poly_fit(lstat, poly_degree)[0][:, 1:][4]   -19.2524      5.215     -3.692      0.000       -29.498    -9.007
==============================================================================
Omnibus:                      144.085   Durbin-Watson:                   0.987
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              494.545
Skew:                           1.292   Prob(JB):                    4.08e-108
Kurtosis:                       7.096   Cond. No.                         22.5
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Residual Standard Error:
5.21479338564

Residuals:
min   -13.543286
25%    -3.103925
50%    -0.705232
75%     2.084364
max    27.115265
dtype: float64
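
Because the formula above rebuilds the basis from whatever data it receives, predictions at new lstat values should instead be mapped into the training basis with ortho_poly_predict, reusing alpha and norm2 from the fit. A sketch applying the fitted coefficients manually:

# Map new lstat values into the training orthogonal basis, then apply the coefficients
Z, norm2, alpha = ortho_poly_fit(boston['lstat'], poly_degree)
Z_new = ortho_poly_predict([5, 10, 15], alpha, norm2, poly_degree)[:, 1:]
params = boston_smmlr_poly.params.values  # [Intercept, basis columns 1..5]
print(params[0] + Z_new @ params[1:])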

Lab 3.6.6 Qualitative Predictors

In [22]:
# Import Carseats data from R package ISLR
islr = importr('ISLR')
carseats_rdf = rdata(islr).fetch('Carseats')['Carseats']
carseats = pandas2ri.ri2py(carseats_rdf)
display(carseats)
     Sales  CompPrice  Income  Advertising  Population  Price  ShelveLoc   Age  Education  Urban   US
1     9.50      138.0    73.0         11.0       276.0  120.0        Bad  42.0       17.0    Yes  Yes
2    11.22      111.0    48.0         16.0       260.0   83.0       Good  65.0       10.0    Yes  Yes
3    10.06      113.0    35.0         10.0       269.0   80.0     Medium  59.0       12.0    Yes  Yes
4     7.40      117.0   100.0          4.0       466.0   97.0     Medium  55.0       14.0    Yes  Yes
5     4.15      141.0    64.0          3.0       340.0  128.0        Bad  38.0       13.0    Yes   No
..     ...        ...     ...          ...         ...    ...        ...   ...        ...    ...  ...
399   5.94      100.0    79.0          7.0       284.0   95.0        Bad  50.0       12.0    Yes  Yes
400   9.71      134.0    37.0          0.0        27.0  120.0       Good  49.0       16.0    Yes  Yes

400 rows × 11 columns

In [23]:
# Regression with all terms plus two interaction terms on Carseats dataset
all_predictors = '+'.join(carseats.columns.drop('Sales'))
carseats_smmlr = smf.ols('Sales ~ ' + all_predictors + ' + Income:Advertising + Price:Age', data=carseats).fit()
print(carseats_smmlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(carseats_smmlr.mse_resid))
print("\nResiduals:")
print(carseats_smmlr.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Sales   R-squared:                       0.876
Model:                            OLS   Adj. R-squared:                  0.872
Method:                 Least Squares   F-statistic:                     210.0
Date:                Tue, 28 Feb 2017   Prob (F-statistic):          6.14e-166
Time:                        16:53:35   Log-Likelihood:                -564.67
No. Observations:                 400   AIC:                             1157.
Df Residuals:                     386   BIC:                             1213.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------
Intercept               6.5756      1.009      6.519      0.000         4.592     8.559
ShelveLoc[T.Good]       4.8487      0.153     31.724      0.000         4.548     5.149
ShelveLoc[T.Medium]     1.9533      0.126     15.531      0.000         1.706     2.201
Urban[T.Yes]            0.1402      0.112      1.247      0.213        -0.081     0.361
US[T.Yes]              -0.1576      0.149     -1.058      0.291        -0.450     0.135
CompPrice               0.0929      0.004     22.567      0.000         0.085     0.101
Income                  0.0109      0.003      4.183      0.000         0.006     0.016
Advertising             0.0702      0.023      3.107      0.002         0.026     0.115
Population              0.0002      0.000      0.433      0.665        -0.001     0.001
Price                  -0.1008      0.007    -13.549      0.000        -0.115    -0.086
Age                    -0.0579      0.016     -3.633      0.000        -0.089    -0.027
Education              -0.0209      0.020     -1.063      0.288        -0.059     0.018
Income:Advertising      0.0008      0.000      2.698      0.007         0.000     0.001
Price:Age               0.0001      0.000      0.801      0.424        -0.000     0.000
==============================================================================
Omnibus:                        1.281   Durbin-Watson:                   2.047
Prob(Omnibus):                  0.527   Jarque-Bera (JB):                1.147
Skew:                           0.129   Prob(JB):                        0.564
Kurtosis:                       3.050   Cond. No.                     1.31e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

Residual Standard Error:
1.01060309838

Residuals:
min   -2.920817
25%   -0.750294
50%    0.017678
75%    0.675410
max    3.341301
dtype: float64
In [24]:
# Show contrast matrix, the dummy variable coding
from patsy.contrasts import Treatment
levels = list(carseats.ShelveLoc.unique())
contrast = Treatment(reference=0).code_without_intercept(levels)
cm_df = pd.DataFrame(contrast.matrix, columns=contrast.column_suffixes, index=levels, dtype=int)
display(cm_df)
        [T.Good]  [T.Medium]
Bad            0           0
Good           1           0
Medium         0           1
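
patsy's treatment coding defaults to the first level in sorted order ('Bad' here) as the baseline; a different reference level can be requested inline in the formula. A sketch re-leveling ShelveLoc so 'Good' is the baseline:

# Treatment coding with 'Good' as the reference level
carseats_releveled = smf.ols("Sales ~ C(ShelveLoc, Treatment(reference='Good'))",
                             data=carseats).fit()
print(carseats_releveled.params)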

Exercise 8

In [25]:
# The Auto dataset is in the R package ISLR
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)
      mpg  cylinders  displacement  horsepower  weight  acceleration  year  origin                       name
1    18.0        8.0         307.0       130.0  3504.0          12.0  70.0     1.0  chevrolet chevelle malibu
2    15.0        8.0         350.0       165.0  3693.0          11.5  70.0     1.0          buick skylark 320
3    18.0        8.0         318.0       150.0  3436.0          11.0  70.0     1.0         plymouth satellite
4    16.0        8.0         304.0       150.0  3433.0          12.0  70.0     1.0              amc rebel sst
5    17.0        8.0         302.0       140.0  3449.0          10.5  70.0     1.0                ford torino
..    ...        ...           ...         ...     ...           ...   ...     ...                        ...
396  28.0        4.0         120.0        79.0  2625.0          18.6  82.0     1.0                ford ranger
397  31.0        4.0         119.0        82.0  2720.0          19.4  82.0     1.0                 chevy s-10

392 rows × 9 columns

In [26]:
# Simple linear regression on the Auto dataset
auto_slr = smf.ols('mpg ~ horsepower', data=auto).fit()
print(auto_slr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_slr.mse_resid))
print("\nResiduals:")
print(auto_slr.resid.describe()[3:])
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     599.7
Date:                Tue, 28 Feb 2017   Prob (F-statistic):           7.03e-81
Time:                        16:53:35   Log-Likelihood:                -1178.7
No. Observations:                 392   AIC:                             2361.
Df Residuals:                     390   BIC:                             2369.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     39.9359      0.717     55.660      0.000        38.525    41.347
horsepower    -0.1578      0.006    -24.489      0.000        -0.171    -0.145
==============================================================================
Omnibus:                       16.432   Durbin-Watson:                   0.920
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.305
Skew:                           0.492   Prob(JB):                     0.000175
Kurtosis:                       3.299   Cond. No.                         322.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Residual Standard Error:
4.90575691955

Residuals:
min   -13.571040
25%    -3.259151
50%    -0.343543
75%     2.763033
max    16.924047
dtype: float64
In [27]:
# 8 (a) - iv. Prediction
auto_pred = auto_slr.predict(exog=dict(horsepower=98))
print(auto_pred)
[ 24.46707715]
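
Exercise 8(a)iv also asks for the 95% confidence and prediction intervals at this point; get_prediction supplies both (same caveat on the statsmodels version as above):

# 95% confidence and prediction intervals at horsepower = 98
pred98 = auto_slr.get_prediction(dict(horsepower=[98]))
print(pred98.summary_frame(alpha=0.05))
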
In [28]:
# Plot SLR fitting results
fig, ax = plt.subplots()
plt.scatter(auto.horsepower, auto.mpg, color='r', s=15, alpha=0.5)  # Data points
smg.abline_plot(model_results=auto_slr, ax=ax)  # Regression line
plt.xlim(25, 250)
plt.ylim(5, 50)
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Simple Linear Regression on Auto dataset')
plt.show()
In [29]:
# Diagnostic plots
infl = auto_slr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_slr.fittedvalues, auto_slr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_slr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_slr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()

Exercise 9

In [30]:
# 9 - (a) Plot scatter matrix.
sns.pairplot(auto, height=2)  # 'size' was renamed 'height' in newer seaborn
plt.show()