# Standard imports
import warnings
# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from statsmodels.stats.anova import anova_lm
# scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Visulization
from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import statsmodels.graphics.api as smg
def ortho_poly_fit(x, degree = 1):
'''
Convert data into orthogonal basis for polynomial regression by QR decomposition.
Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
'''
n = degree + 1
x = np.asarray(x).flatten()
if(degree >= len(np.unique(x))):
stop("'degree' must be less than number of unique points")
xbar = np.mean(x)
x = x - xbar
X = np.fliplr(np.vander(x, n))
q,r = np.linalg.qr(X)
z = np.diag(np.diag(r))
raw = np.dot(q, z)
norm2 = np.sum(raw**2, axis=0)
alpha = (np.sum((raw**2)*np.reshape(x,(-1,1)), axis=0)/norm2 + xbar)[:degree]
Z = raw / np.sqrt(norm2)
return Z, norm2, alpha
def ortho_poly_predict(x, alpha, norm2, degree = 1):
'''
Convert new data for prediction into orthogonal basis, based on input parameters.
Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
'''
x = np.asarray(x).flatten()
n = degree + 1
Z = np.empty((len(x), n))
Z[:,0] = 1
if degree > 0:
Z[:, 1] = x - alpha[0]
if degree > 1:
for i in np.arange(1,degree):
Z[:, i+1] = (x - alpha[i]) * Z[:, i] - (norm2[i] / norm2[i-1]) * Z[:, i-1]
Z /= np.sqrt(norm2)
return Z
# Import Boston house price data from R package MASS
# This dataset is also available at https://archive.ics.uci.edu/ml/datasets/Housing
mass = importr('MASS')
boston_rdf = rdata(mass).fetch('Boston')['Boston']
boston = pandas2ri.ri2py(boston_rdf)
# boston.describe # Head and tails in text.
# display(boston) # Head and tail in a data table.
HTML(boston.to_html(max_rows=60)) # All rows, or a subset in a data table.
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
2 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
3 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
4 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
5 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
6 | 0.02985 | 0.0 | 2.18 | 0 | 0.458 | 6.430 | 58.7 | 6.0622 | 3 | 222.0 | 18.7 | 394.12 | 5.21 | 28.7 |
7 | 0.08829 | 12.5 | 7.87 | 0 | 0.524 | 6.012 | 66.6 | 5.5605 | 5 | 311.0 | 15.2 | 395.60 | 12.43 | 22.9 |
8 | 0.14455 | 12.5 | 7.87 | 0 | 0.524 | 6.172 | 96.1 | 5.9505 | 5 | 311.0 | 15.2 | 396.90 | 19.15 | 27.1 |
9 | 0.21124 | 12.5 | 7.87 | 0 | 0.524 | 5.631 | 100.0 | 6.0821 | 5 | 311.0 | 15.2 | 386.63 | 29.93 | 16.5 |
10 | 0.17004 | 12.5 | 7.87 | 0 | 0.524 | 6.004 | 85.9 | 6.5921 | 5 | 311.0 | 15.2 | 386.71 | 17.10 | 18.9 |
11 | 0.22489 | 12.5 | 7.87 | 0 | 0.524 | 6.377 | 94.3 | 6.3467 | 5 | 311.0 | 15.2 | 392.52 | 20.45 | 15.0 |
12 | 0.11747 | 12.5 | 7.87 | 0 | 0.524 | 6.009 | 82.9 | 6.2267 | 5 | 311.0 | 15.2 | 396.90 | 13.27 | 18.9 |
13 | 0.09378 | 12.5 | 7.87 | 0 | 0.524 | 5.889 | 39.0 | 5.4509 | 5 | 311.0 | 15.2 | 390.50 | 15.71 | 21.7 |
14 | 0.62976 | 0.0 | 8.14 | 0 | 0.538 | 5.949 | 61.8 | 4.7075 | 4 | 307.0 | 21.0 | 396.90 | 8.26 | 20.4 |
15 | 0.63796 | 0.0 | 8.14 | 0 | 0.538 | 6.096 | 84.5 | 4.4619 | 4 | 307.0 | 21.0 | 380.02 | 10.26 | 18.2 |
16 | 0.62739 | 0.0 | 8.14 | 0 | 0.538 | 5.834 | 56.5 | 4.4986 | 4 | 307.0 | 21.0 | 395.62 | 8.47 | 19.9 |
17 | 1.05393 | 0.0 | 8.14 | 0 | 0.538 | 5.935 | 29.3 | 4.4986 | 4 | 307.0 | 21.0 | 386.85 | 6.58 | 23.1 |
18 | 0.78420 | 0.0 | 8.14 | 0 | 0.538 | 5.990 | 81.7 | 4.2579 | 4 | 307.0 | 21.0 | 386.75 | 14.67 | 17.5 |
19 | 0.80271 | 0.0 | 8.14 | 0 | 0.538 | 5.456 | 36.6 | 3.7965 | 4 | 307.0 | 21.0 | 288.99 | 11.69 | 20.2 |
20 | 0.72580 | 0.0 | 8.14 | 0 | 0.538 | 5.727 | 69.5 | 3.7965 | 4 | 307.0 | 21.0 | 390.95 | 11.28 | 18.2 |
21 | 1.25179 | 0.0 | 8.14 | 0 | 0.538 | 5.570 | 98.1 | 3.7979 | 4 | 307.0 | 21.0 | 376.57 | 21.02 | 13.6 |
22 | 0.85204 | 0.0 | 8.14 | 0 | 0.538 | 5.965 | 89.2 | 4.0123 | 4 | 307.0 | 21.0 | 392.53 | 13.83 | 19.6 |
23 | 1.23247 | 0.0 | 8.14 | 0 | 0.538 | 6.142 | 91.7 | 3.9769 | 4 | 307.0 | 21.0 | 396.90 | 18.72 | 15.2 |
24 | 0.98843 | 0.0 | 8.14 | 0 | 0.538 | 5.813 | 100.0 | 4.0952 | 4 | 307.0 | 21.0 | 394.54 | 19.88 | 14.5 |
25 | 0.75026 | 0.0 | 8.14 | 0 | 0.538 | 5.924 | 94.1 | 4.3996 | 4 | 307.0 | 21.0 | 394.33 | 16.30 | 15.6 |
26 | 0.84054 | 0.0 | 8.14 | 0 | 0.538 | 5.599 | 85.7 | 4.4546 | 4 | 307.0 | 21.0 | 303.42 | 16.51 | 13.9 |
27 | 0.67191 | 0.0 | 8.14 | 0 | 0.538 | 5.813 | 90.3 | 4.6820 | 4 | 307.0 | 21.0 | 376.88 | 14.81 | 16.6 |
28 | 0.95577 | 0.0 | 8.14 | 0 | 0.538 | 6.047 | 88.8 | 4.4534 | 4 | 307.0 | 21.0 | 306.38 | 17.28 | 14.8 |
29 | 0.77299 | 0.0 | 8.14 | 0 | 0.538 | 6.495 | 94.4 | 4.4547 | 4 | 307.0 | 21.0 | 387.94 | 12.80 | 18.4 |
30 | 1.00245 | 0.0 | 8.14 | 0 | 0.538 | 6.674 | 87.3 | 4.2390 | 4 | 307.0 | 21.0 | 380.23 | 11.98 | 21.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
477 | 4.87141 | 0.0 | 18.10 | 0 | 0.614 | 6.484 | 93.6 | 2.3053 | 24 | 666.0 | 20.2 | 396.21 | 18.68 | 16.7 |
478 | 15.02340 | 0.0 | 18.10 | 0 | 0.614 | 5.304 | 97.3 | 2.1007 | 24 | 666.0 | 20.2 | 349.48 | 24.91 | 12.0 |
479 | 10.23300 | 0.0 | 18.10 | 0 | 0.614 | 6.185 | 96.7 | 2.1705 | 24 | 666.0 | 20.2 | 379.70 | 18.03 | 14.6 |
480 | 14.33370 | 0.0 | 18.10 | 0 | 0.614 | 6.229 | 88.0 | 1.9512 | 24 | 666.0 | 20.2 | 383.32 | 13.11 | 21.4 |
481 | 5.82401 | 0.0 | 18.10 | 0 | 0.532 | 6.242 | 64.7 | 3.4242 | 24 | 666.0 | 20.2 | 396.90 | 10.74 | 23.0 |
482 | 5.70818 | 0.0 | 18.10 | 0 | 0.532 | 6.750 | 74.9 | 3.3317 | 24 | 666.0 | 20.2 | 393.07 | 7.74 | 23.7 |
483 | 5.73116 | 0.0 | 18.10 | 0 | 0.532 | 7.061 | 77.0 | 3.4106 | 24 | 666.0 | 20.2 | 395.28 | 7.01 | 25.0 |
484 | 2.81838 | 0.0 | 18.10 | 0 | 0.532 | 5.762 | 40.3 | 4.0983 | 24 | 666.0 | 20.2 | 392.92 | 10.42 | 21.8 |
485 | 2.37857 | 0.0 | 18.10 | 0 | 0.583 | 5.871 | 41.9 | 3.7240 | 24 | 666.0 | 20.2 | 370.73 | 13.34 | 20.6 |
486 | 3.67367 | 0.0 | 18.10 | 0 | 0.583 | 6.312 | 51.9 | 3.9917 | 24 | 666.0 | 20.2 | 388.62 | 10.58 | 21.2 |
487 | 5.69175 | 0.0 | 18.10 | 0 | 0.583 | 6.114 | 79.8 | 3.5459 | 24 | 666.0 | 20.2 | 392.68 | 14.98 | 19.1 |
488 | 4.83567 | 0.0 | 18.10 | 0 | 0.583 | 5.905 | 53.2 | 3.1523 | 24 | 666.0 | 20.2 | 388.22 | 11.45 | 20.6 |
489 | 0.15086 | 0.0 | 27.74 | 0 | 0.609 | 5.454 | 92.7 | 1.8209 | 4 | 711.0 | 20.1 | 395.09 | 18.06 | 15.2 |
490 | 0.18337 | 0.0 | 27.74 | 0 | 0.609 | 5.414 | 98.3 | 1.7554 | 4 | 711.0 | 20.1 | 344.05 | 23.97 | 7.0 |
491 | 0.20746 | 0.0 | 27.74 | 0 | 0.609 | 5.093 | 98.0 | 1.8226 | 4 | 711.0 | 20.1 | 318.43 | 29.68 | 8.1 |
492 | 0.10574 | 0.0 | 27.74 | 0 | 0.609 | 5.983 | 98.8 | 1.8681 | 4 | 711.0 | 20.1 | 390.11 | 18.07 | 13.6 |
493 | 0.11132 | 0.0 | 27.74 | 0 | 0.609 | 5.983 | 83.5 | 2.1099 | 4 | 711.0 | 20.1 | 396.90 | 13.35 | 20.1 |
494 | 0.17331 | 0.0 | 9.69 | 0 | 0.585 | 5.707 | 54.0 | 2.3817 | 6 | 391.0 | 19.2 | 396.90 | 12.01 | 21.8 |
495 | 0.27957 | 0.0 | 9.69 | 0 | 0.585 | 5.926 | 42.6 | 2.3817 | 6 | 391.0 | 19.2 | 396.90 | 13.59 | 24.5 |
496 | 0.17899 | 0.0 | 9.69 | 0 | 0.585 | 5.670 | 28.8 | 2.7986 | 6 | 391.0 | 19.2 | 393.29 | 17.60 | 23.1 |
497 | 0.28960 | 0.0 | 9.69 | 0 | 0.585 | 5.390 | 72.9 | 2.7986 | 6 | 391.0 | 19.2 | 396.90 | 21.14 | 19.7 |
498 | 0.26838 | 0.0 | 9.69 | 0 | 0.585 | 5.794 | 70.6 | 2.8927 | 6 | 391.0 | 19.2 | 396.90 | 14.10 | 18.3 |
499 | 0.23912 | 0.0 | 9.69 | 0 | 0.585 | 6.019 | 65.3 | 2.4091 | 6 | 391.0 | 19.2 | 396.90 | 12.92 | 21.2 |
500 | 0.17783 | 0.0 | 9.69 | 0 | 0.585 | 5.569 | 73.5 | 2.3999 | 6 | 391.0 | 19.2 | 395.77 | 15.10 | 17.5 |
501 | 0.22438 | 0.0 | 9.69 | 0 | 0.585 | 6.027 | 79.7 | 2.4982 | 6 | 391.0 | 19.2 | 396.90 | 14.33 | 16.8 |
502 | 0.06263 | 0.0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
503 | 0.04527 | 0.0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
504 | 0.06076 | 0.0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
505 | 0.10959 | 0.0 | 11.93 | 0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
506 | 0.04741 | 0.0 | 11.93 | 0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
boston.info() # Summary of the data
<class 'pandas.core.frame.DataFrame'> Int64Index: 506 entries, 1 to 506 Data columns (total 14 columns): crim 506 non-null float64 zn 506 non-null float64 indus 506 non-null float64 chas 506 non-null int32 nox 506 non-null float64 rm 506 non-null float64 age 506 non-null float64 dis 506 non-null float64 rad 506 non-null int32 tax 506 non-null float64 ptratio 506 non-null float64 black 506 non-null float64 lstat 506 non-null float64 medv 506 non-null float64 dtypes: float64(12), int32(2) memory usage: 55.3 KB
# Simple linear regression and plot using seaborn
sns.lmplot(x='lstat', y='medv', data=boston, scatter_kws={'color':'r', 's':15});
plt.xlim(xmin=0)
plt.ylim(ymin=0)
sns.plt.show()
plt.figure()
sns.residplot(x='lstat', y='medv', data=boston)
sns.plt.show()
# Simple linear regression with scikit-learn
boston_sk = LinearRegression()
nrow = boston.shape[0]
boston_sk.fit(boston['lstat'].values.reshape(nrow, 1), boston['medv'].values.reshape(nrow, 1))
print("Slope: {}, Intercept: {}".format(boston_sk.coef_, boston_sk.intercept_))
Slope: [[-0.95004935]], Intercept: [ 34.55384088]
# Simple linear regression with StatsModels
boston_smslr = smf.ols('medv ~ lstat', data=boston).fit()
print(boston_smslr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smslr.mse_resid))
print("\nResiduals:")
display(boston_smslr.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: medv R-squared: 0.544 Model: OLS Adj. R-squared: 0.543 Method: Least Squares F-statistic: 601.6 Date: Tue, 28 Feb 2017 Prob (F-statistic): 5.08e-88 Time: 16:53:32 Log-Likelihood: -1641.5 No. Observations: 506 AIC: 3287. Df Residuals: 504 BIC: 3295. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 34.5538 0.563 61.415 0.000 33.448 35.659 lstat -0.9500 0.039 -24.528 0.000 -1.026 -0.874 ============================================================================== Omnibus: 137.043 Durbin-Watson: 0.892 Prob(Omnibus): 0.000 Jarque-Bera (JB): 291.373 Skew: 1.453 Prob(JB): 5.36e-64 Kurtosis: 5.319 Cond. No. 29.7 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. Residual Standard Error: 6.2157604054 Residuals:
min -15.167452 25% -3.989612 50% -1.318186 75% 2.033701 max 24.500129 dtype: float64
# Plot StatsModels OLS fitting results
fig, ax = plt.subplots()
plt.scatter(boston.lstat, boston.medv, color='r', s=15, alpha=0.5) # Data points
smg.abline_plot(model_results=boston_smslr, ax=ax) # Regression line
plt.xlim(0, 40)
plt.ylim(0, 55)
plt.xlabel('lstat')
plt.ylabel('medv')
plt.show()
# 97.5% confidence interval for the coefficient estimates
ci = boston_smslr.conf_int(0.025)
ci.columns = ['2.5%', '97.5%']
print(ci)
2.5% 97.5% Intercept 33.288987 35.818694 lstat -1.037127 -0.862972
We can see the confidence interval computed by StatsModels is slightly difference than R.
# Prediction (To be finished)
fit = boston_smslr.predict(exog=dict(lstat=[5,10,15]))
print(fit)
# Prediction Intervals
#from statsmodels.sandbox.regression.predstd import wls_prediction_std
#prstd, iv_l, iv_u = wls_prediction_std(boston_smslr)
#print(prstd,iv_l,iv_u)
# Confidence Intervals
[ 29.80359411 25.05334734 20.30310057]
References: http://stackoverflow.com/questions/17559408/confidence-and-prediction-intervals-with-statsmodels http://stackoverflow.com/questions/20572706/mathematical-background-of-statsmodels-wls-prediction-std http://markthegraph.blogspot.com/2015/05/using-python-statsmodels-for-ols-linear.html http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/wls.html?highlight=wls_prediction_std
# Diagnostic plots for simple linear regression
infl = boston_smslr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smslr.fittedvalues, boston_smslr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smslr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smslr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# Find the data index with the maximum leverage statistics
infl.hat_matrix_diag.argmax() + 1
375
# Regression with two predictors on Boston dataset
boston_smmlr = smf.ols('medv ~ lstat + age', data=boston).fit()
print(boston_smmlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr.mse_resid))
print("\nResiduals:")
print(boston_smmlr.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: medv R-squared: 0.551 Model: OLS Adj. R-squared: 0.549 Method: Least Squares F-statistic: 309.0 Date: Tue, 28 Feb 2017 Prob (F-statistic): 2.98e-88 Time: 16:53:33 Log-Likelihood: -1637.5 No. Observations: 506 AIC: 3281. Df Residuals: 503 BIC: 3294. Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 33.2228 0.731 45.458 0.000 31.787 34.659 lstat -1.0321 0.048 -21.416 0.000 -1.127 -0.937 age 0.0345 0.012 2.826 0.005 0.011 0.059 ============================================================================== Omnibus: 124.288 Durbin-Watson: 0.945 Prob(Omnibus): 0.000 Jarque-Bera (JB): 244.026 Skew: 1.362 Prob(JB): 1.02e-53 Kurtosis: 5.038 Cond. No. 201. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. Residual Standard Error: 6.17313628136 Residuals: min -15.981243 25% -3.977470 50% -1.283443 75% 1.968309 max 23.158419 dtype: float64
# Regression with all predictors on Boston dataset
all_predictors = '+'.join(boston.columns.drop('medv'))
boston_smmlr_all = smf.ols('medv ~ ' + all_predictors, data=boston).fit()
print(boston_smmlr_all.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_all.mse_resid))
print("\nResiduals:")
print(boston_smmlr_all.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: medv R-squared: 0.741 Model: OLS Adj. R-squared: 0.734 Method: Least Squares F-statistic: 108.1 Date: Tue, 28 Feb 2017 Prob (F-statistic): 6.72e-135 Time: 16:53:34 Log-Likelihood: -1498.8 No. Observations: 506 AIC: 3026. Df Residuals: 492 BIC: 3085. Df Model: 13 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 36.4595 5.103 7.144 0.000 26.432 46.487 crim -0.1080 0.033 -3.287 0.001 -0.173 -0.043 zn 0.0464 0.014 3.382 0.001 0.019 0.073 indus 0.0206 0.061 0.334 0.738 -0.100 0.141 chas 2.6867 0.862 3.118 0.002 0.994 4.380 nox -17.7666 3.820 -4.651 0.000 -25.272 -10.262 rm 3.8099 0.418 9.116 0.000 2.989 4.631 age 0.0007 0.013 0.052 0.958 -0.025 0.027 dis -1.4756 0.199 -7.398 0.000 -1.867 -1.084 rad 0.3060 0.066 4.613 0.000 0.176 0.436 tax -0.0123 0.004 -3.280 0.001 -0.020 -0.005 ptratio -0.9527 0.131 -7.283 0.000 -1.210 -0.696 black 0.0093 0.003 3.467 0.001 0.004 0.015 lstat -0.5248 0.051 -10.347 0.000 -0.624 -0.425 ============================================================================== Omnibus: 178.041 Durbin-Watson: 1.078 Prob(Omnibus): 0.000 Jarque-Bera (JB): 783.126 Skew: 1.521 Prob(JB): 8.84e-171 Kurtosis: 8.281 Cond. No. 1.51e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.51e+04. This might indicate that there are strong multicollinearity or other numerical problems. Residual Standard Error: 4.7452981817 Residuals: min -15.594474 25% -2.729716 50% -0.518049 75% 1.777051 max 26.199271 dtype: float64
# Compute variance inflation factors
exog = boston_smmlr_all.model.exog
exog_names = boston_smmlr_all.model.exog_names
vif_df = pd.DataFrame([vif(exog, ix) for ix in range(1, exog.shape[1])]).transpose()
vif_df.columns = exog_names[1:]
display(vif_df)
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.792192 | 2.298758 | 3.991596 | 1.073995 | 4.39372 | 1.933744 | 3.100826 | 3.955945 | 7.484496 | 9.008554 | 1.799084 | 1.348521 | 2.941491 |
# Regression with two interaction terms on Boston dataset
boston_smmlr_inter = smf.ols('medv ~ lstat * age', data=boston).fit()
print(boston_smmlr_inter.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_inter.mse_resid))
print("\nResiduals:")
print(boston_smmlr_inter.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: medv R-squared: 0.556 Model: OLS Adj. R-squared: 0.553 Method: Least Squares F-statistic: 209.3 Date: Tue, 28 Feb 2017 Prob (F-statistic): 4.86e-88 Time: 16:53:34 Log-Likelihood: -1635.0 No. Observations: 506 AIC: 3278. Df Residuals: 502 BIC: 3295. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 36.0885 1.470 24.553 0.000 33.201 38.976 lstat -1.3921 0.167 -8.313 0.000 -1.721 -1.063 age -0.0007 0.020 -0.036 0.971 -0.040 0.038 lstat:age 0.0042 0.002 2.244 0.025 0.001 0.008 ============================================================================== Omnibus: 135.601 Durbin-Watson: 0.965 Prob(Omnibus): 0.000 Jarque-Bera (JB): 296.955 Skew: 1.417 Prob(JB): 3.29e-65 Kurtosis: 5.461 Cond. No. 6.88e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 6.88e+03. This might indicate that there are strong multicollinearity or other numerical problems. Residual Standard Error: 6.14851326968 Residuals: min -15.806521 25% -4.044681 50% -1.333163 75% 2.084707 max 27.552059 dtype: float64
# Regression with quandratic term on Boston dataset
boston_smmlr_quad = smf.ols('medv ~ lstat + I(lstat**2)', data=boston).fit()
print(boston_smmlr_quad.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_quad.mse_resid))
print("\nResiduals:")
print(boston_smmlr_quad.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: medv R-squared: 0.641 Model: OLS Adj. R-squared: 0.639 Method: Least Squares F-statistic: 448.5 Date: Tue, 28 Feb 2017 Prob (F-statistic): 1.56e-112 Time: 16:53:34 Log-Likelihood: -1581.3 No. Observations: 506 AIC: 3169. Df Residuals: 503 BIC: 3181. Df Model: 2 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [95.0% Conf. Int.] --------------------------------------------------------------------------------- Intercept 42.8620 0.872 49.149 0.000 41.149 44.575 lstat -2.3328 0.124 -18.843 0.000 -2.576 -2.090 I(lstat ** 2) 0.0435 0.004 11.628 0.000 0.036 0.051 ============================================================================== Omnibus: 107.006 Durbin-Watson: 0.921 Prob(Omnibus): 0.000 Jarque-Bera (JB): 228.388 Skew: 1.128 Prob(JB): 2.55e-50 Kurtosis: 5.397 Cond. No. 1.13e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.13e+03. This might indicate that there are strong multicollinearity or other numerical problems. Residual Standard Error: 5.52371413181 Residuals: min -15.283395 25% -3.831307 50% -0.529500 75% 2.309535 max 25.414810 dtype: float64
# ANOVA for both simple linear and quandratic fits
with warnings.catch_warnings():
warnings.filterwarnings("ignore") ## Supress warnings
display(anova_lm(boston_smslr, boston_smmlr_quad))
df_resid | ssr | df_diff | ss_diff | F | Pr(>F) | |
---|---|---|---|---|---|---|
0 | 504.0 | 19472.381418 | 0.0 | NaN | NaN | NaN |
1 | 503.0 | 15347.243158 | 1.0 | 4125.13826 | 135.199822 | 7.630116e-28 |
# Diagnostic plots for quandratic fit
infl = boston_smmlr_quad.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smmlr_quad.fittedvalues, boston_smmlr_quad.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smmlr_quad.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smslr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# Regression with polynomial terms on Boston dataset
# poly() function in R produces orthogonal polynomials with QR decomposition by default
# Without orthogonization, powers of X are correlated, and regression on correlated predictors
# leads tounstable coefficients.
# In addition, large powers of X will require very small regression coefficients,
# potentially leading to underflow and coefficients that are hard to interpret or compare intuitively.
# Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
poly_degree = 5
boston_smmlr_poly = smf.ols('medv ~ ortho_poly_fit(lstat, poly_degree)[0][:, 1:]', data=boston).fit()
print(boston_smmlr_poly.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_poly.mse_resid))
print("\nResiduals:")
print(boston_smmlr_poly.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: medv R-squared: 0.682 Model: OLS Adj. R-squared: 0.679 Method: Least Squares F-statistic: 214.2 Date: Tue, 28 Feb 2017 Prob (F-statistic): 8.73e-122 Time: 16:53:35 Log-Likelihood: -1550.6 No. Observations: 506 AIC: 3113. Df Residuals: 500 BIC: 3139. Df Model: 5 Covariance Type: nonrobust =================================================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------------------------------------------- Intercept 22.5328 0.232 97.197 0.000 22.077 22.988 ortho_poly_fit(lstat, poly_degree)[0][:, 1:][0] -152.4595 5.215 -29.236 0.000 -162.705 -142.214 ortho_poly_fit(lstat, poly_degree)[0][:, 1:][1] 64.2272 5.215 12.316 0.000 53.982 74.473 ortho_poly_fit(lstat, poly_degree)[0][:, 1:][2] -27.0511 5.215 -5.187 0.000 -37.297 -16.805 ortho_poly_fit(lstat, poly_degree)[0][:, 1:][3] 25.4517 5.215 4.881 0.000 15.206 35.697 ortho_poly_fit(lstat, poly_degree)[0][:, 1:][4] -19.2524 5.215 -3.692 0.000 -29.498 -9.007 ============================================================================== Omnibus: 144.085 Durbin-Watson: 0.987 Prob(Omnibus): 0.000 Jarque-Bera (JB): 494.545 Skew: 1.292 Prob(JB): 4.08e-108 Kurtosis: 7.096 Cond. No. 22.5 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. Residual Standard Error: 5.21479338564 Residuals: min -13.543286 25% -3.103925 50% -0.705232 75% 2.084364 max 27.115265 dtype: float64
# Import Carseats data from R package ISLR
islr = importr('ISLR')
carseats_rdf = rdata(islr).fetch('Carseats')['Carseats']
carseats = pandas2ri.ri2py(carseats_rdf)
display(carseats)
Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 9.50 | 138.0 | 73.0 | 11.0 | 276.0 | 120.0 | Bad | 42.0 | 17.0 | Yes | Yes |
2 | 11.22 | 111.0 | 48.0 | 16.0 | 260.0 | 83.0 | Good | 65.0 | 10.0 | Yes | Yes |
3 | 10.06 | 113.0 | 35.0 | 10.0 | 269.0 | 80.0 | Medium | 59.0 | 12.0 | Yes | Yes |
4 | 7.40 | 117.0 | 100.0 | 4.0 | 466.0 | 97.0 | Medium | 55.0 | 14.0 | Yes | Yes |
5 | 4.15 | 141.0 | 64.0 | 3.0 | 340.0 | 128.0 | Bad | 38.0 | 13.0 | Yes | No |
6 | 10.81 | 124.0 | 113.0 | 13.0 | 501.0 | 72.0 | Bad | 78.0 | 16.0 | No | Yes |
7 | 6.63 | 115.0 | 105.0 | 0.0 | 45.0 | 108.0 | Medium | 71.0 | 15.0 | Yes | No |
8 | 11.85 | 136.0 | 81.0 | 15.0 | 425.0 | 120.0 | Good | 67.0 | 10.0 | Yes | Yes |
9 | 6.54 | 132.0 | 110.0 | 0.0 | 108.0 | 124.0 | Medium | 76.0 | 10.0 | No | No |
10 | 4.69 | 132.0 | 113.0 | 0.0 | 131.0 | 124.0 | Medium | 76.0 | 17.0 | No | Yes |
11 | 9.01 | 121.0 | 78.0 | 9.0 | 150.0 | 100.0 | Bad | 26.0 | 10.0 | No | Yes |
12 | 11.96 | 117.0 | 94.0 | 4.0 | 503.0 | 94.0 | Good | 50.0 | 13.0 | Yes | Yes |
13 | 3.98 | 122.0 | 35.0 | 2.0 | 393.0 | 136.0 | Medium | 62.0 | 18.0 | Yes | No |
14 | 10.96 | 115.0 | 28.0 | 11.0 | 29.0 | 86.0 | Good | 53.0 | 18.0 | Yes | Yes |
15 | 11.17 | 107.0 | 117.0 | 11.0 | 148.0 | 118.0 | Good | 52.0 | 18.0 | Yes | Yes |
16 | 8.71 | 149.0 | 95.0 | 5.0 | 400.0 | 144.0 | Medium | 76.0 | 18.0 | No | No |
17 | 7.58 | 118.0 | 32.0 | 0.0 | 284.0 | 110.0 | Good | 63.0 | 13.0 | Yes | No |
18 | 12.29 | 147.0 | 74.0 | 13.0 | 251.0 | 131.0 | Good | 52.0 | 10.0 | Yes | Yes |
19 | 13.91 | 110.0 | 110.0 | 0.0 | 408.0 | 68.0 | Good | 46.0 | 17.0 | No | Yes |
20 | 8.73 | 129.0 | 76.0 | 16.0 | 58.0 | 121.0 | Medium | 69.0 | 12.0 | Yes | Yes |
21 | 6.41 | 125.0 | 90.0 | 2.0 | 367.0 | 131.0 | Medium | 35.0 | 18.0 | Yes | Yes |
22 | 12.13 | 134.0 | 29.0 | 12.0 | 239.0 | 109.0 | Good | 62.0 | 18.0 | No | Yes |
23 | 5.08 | 128.0 | 46.0 | 6.0 | 497.0 | 138.0 | Medium | 42.0 | 13.0 | Yes | No |
24 | 5.87 | 121.0 | 31.0 | 0.0 | 292.0 | 109.0 | Medium | 79.0 | 10.0 | Yes | No |
25 | 10.14 | 145.0 | 119.0 | 16.0 | 294.0 | 113.0 | Bad | 42.0 | 12.0 | Yes | Yes |
26 | 14.90 | 139.0 | 32.0 | 0.0 | 176.0 | 82.0 | Good | 54.0 | 11.0 | No | No |
27 | 8.33 | 107.0 | 115.0 | 11.0 | 496.0 | 131.0 | Good | 50.0 | 11.0 | No | Yes |
28 | 5.27 | 98.0 | 118.0 | 0.0 | 19.0 | 107.0 | Medium | 64.0 | 17.0 | Yes | No |
29 | 2.99 | 103.0 | 74.0 | 0.0 | 359.0 | 97.0 | Bad | 55.0 | 11.0 | Yes | Yes |
30 | 7.81 | 104.0 | 99.0 | 15.0 | 226.0 | 102.0 | Bad | 58.0 | 17.0 | Yes | Yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
371 | 7.68 | 126.0 | 41.0 | 22.0 | 403.0 | 119.0 | Bad | 42.0 | 12.0 | Yes | Yes |
372 | 9.08 | 152.0 | 81.0 | 0.0 | 191.0 | 126.0 | Medium | 54.0 | 16.0 | Yes | No |
373 | 7.80 | 121.0 | 50.0 | 0.0 | 508.0 | 98.0 | Medium | 65.0 | 11.0 | No | No |
374 | 5.58 | 137.0 | 71.0 | 0.0 | 402.0 | 116.0 | Medium | 78.0 | 17.0 | Yes | No |
375 | 9.44 | 131.0 | 47.0 | 7.0 | 90.0 | 118.0 | Medium | 47.0 | 12.0 | Yes | Yes |
376 | 7.90 | 132.0 | 46.0 | 4.0 | 206.0 | 124.0 | Medium | 73.0 | 11.0 | Yes | No |
377 | 16.27 | 141.0 | 60.0 | 19.0 | 319.0 | 92.0 | Good | 44.0 | 11.0 | Yes | Yes |
378 | 6.81 | 132.0 | 61.0 | 0.0 | 263.0 | 125.0 | Medium | 41.0 | 12.0 | No | No |
379 | 6.11 | 133.0 | 88.0 | 3.0 | 105.0 | 119.0 | Medium | 79.0 | 12.0 | Yes | Yes |
380 | 5.81 | 125.0 | 111.0 | 0.0 | 404.0 | 107.0 | Bad | 54.0 | 15.0 | Yes | No |
381 | 9.64 | 106.0 | 64.0 | 10.0 | 17.0 | 89.0 | Medium | 68.0 | 17.0 | Yes | Yes |
382 | 3.90 | 124.0 | 65.0 | 21.0 | 496.0 | 151.0 | Bad | 77.0 | 13.0 | Yes | Yes |
383 | 4.95 | 121.0 | 28.0 | 19.0 | 315.0 | 121.0 | Medium | 66.0 | 14.0 | Yes | Yes |
384 | 9.35 | 98.0 | 117.0 | 0.0 | 76.0 | 68.0 | Medium | 63.0 | 10.0 | Yes | No |
385 | 12.85 | 123.0 | 37.0 | 15.0 | 348.0 | 112.0 | Good | 28.0 | 12.0 | Yes | Yes |
386 | 5.87 | 131.0 | 73.0 | 13.0 | 455.0 | 132.0 | Medium | 62.0 | 17.0 | Yes | Yes |
387 | 5.32 | 152.0 | 116.0 | 0.0 | 170.0 | 160.0 | Medium | 39.0 | 16.0 | Yes | No |
388 | 8.67 | 142.0 | 73.0 | 14.0 | 238.0 | 115.0 | Medium | 73.0 | 14.0 | No | Yes |
389 | 8.14 | 135.0 | 89.0 | 11.0 | 245.0 | 78.0 | Bad | 79.0 | 16.0 | Yes | Yes |
390 | 8.44 | 128.0 | 42.0 | 8.0 | 328.0 | 107.0 | Medium | 35.0 | 12.0 | Yes | Yes |
391 | 5.47 | 108.0 | 75.0 | 9.0 | 61.0 | 111.0 | Medium | 67.0 | 12.0 | Yes | Yes |
392 | 6.10 | 153.0 | 63.0 | 0.0 | 49.0 | 124.0 | Bad | 56.0 | 16.0 | Yes | No |
393 | 4.53 | 129.0 | 42.0 | 13.0 | 315.0 | 130.0 | Bad | 34.0 | 13.0 | Yes | Yes |
394 | 5.57 | 109.0 | 51.0 | 10.0 | 26.0 | 120.0 | Medium | 30.0 | 17.0 | No | Yes |
395 | 5.35 | 130.0 | 58.0 | 19.0 | 366.0 | 139.0 | Bad | 33.0 | 16.0 | Yes | Yes |
396 | 12.57 | 138.0 | 108.0 | 17.0 | 203.0 | 128.0 | Good | 33.0 | 14.0 | Yes | Yes |
397 | 6.14 | 139.0 | 23.0 | 3.0 | 37.0 | 120.0 | Medium | 55.0 | 11.0 | No | Yes |
398 | 7.41 | 162.0 | 26.0 | 12.0 | 368.0 | 159.0 | Medium | 40.0 | 18.0 | Yes | Yes |
399 | 5.94 | 100.0 | 79.0 | 7.0 | 284.0 | 95.0 | Bad | 50.0 | 12.0 | Yes | Yes |
400 | 9.71 | 134.0 | 37.0 | 0.0 | 27.0 | 120.0 | Good | 49.0 | 16.0 | Yes | Yes |
400 rows × 11 columns
# Regression with all terms plus two interaction terms on Carseats dataset
all_predictors = '+'.join(carseats.columns.drop('Sales'))
carseats_smmlr = smf.ols('Sales ~ ' + all_predictors + ' + Income:Advertising + Price:Age', data=carseats).fit()
print(carseats_smmlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(carseats_smmlr.mse_resid))
print("\nResiduals:")
print(carseats_smmlr.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: Sales R-squared: 0.876 Model: OLS Adj. R-squared: 0.872 Method: Least Squares F-statistic: 210.0 Date: Tue, 28 Feb 2017 Prob (F-statistic): 6.14e-166 Time: 16:53:35 Log-Likelihood: -564.67 No. Observations: 400 AIC: 1157. Df Residuals: 386 BIC: 1213. Df Model: 13 Covariance Type: nonrobust ======================================================================================= coef std err t P>|t| [95.0% Conf. Int.] --------------------------------------------------------------------------------------- Intercept 6.5756 1.009 6.519 0.000 4.592 8.559 ShelveLoc[T.Good] 4.8487 0.153 31.724 0.000 4.548 5.149 ShelveLoc[T.Medium] 1.9533 0.126 15.531 0.000 1.706 2.201 Urban[T.Yes] 0.1402 0.112 1.247 0.213 -0.081 0.361 US[T.Yes] -0.1576 0.149 -1.058 0.291 -0.450 0.135 CompPrice 0.0929 0.004 22.567 0.000 0.085 0.101 Income 0.0109 0.003 4.183 0.000 0.006 0.016 Advertising 0.0702 0.023 3.107 0.002 0.026 0.115 Population 0.0002 0.000 0.433 0.665 -0.001 0.001 Price -0.1008 0.007 -13.549 0.000 -0.115 -0.086 Age -0.0579 0.016 -3.633 0.000 -0.089 -0.027 Education -0.0209 0.020 -1.063 0.288 -0.059 0.018 Income:Advertising 0.0008 0.000 2.698 0.007 0.000 0.001 Price:Age 0.0001 0.000 0.801 0.424 -0.000 0.000 ============================================================================== Omnibus: 1.281 Durbin-Watson: 2.047 Prob(Omnibus): 0.527 Jarque-Bera (JB): 1.147 Skew: 0.129 Prob(JB): 0.564 Kurtosis: 3.050 Cond. No. 1.31e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.31e+05. This might indicate that there are strong multicollinearity or other numerical problems. Residual Standard Error: 1.01060309838 Residuals: min -2.920817 25% -0.750294 50% 0.017678 75% 0.675410 max 3.341301 dtype: float64
# Show contrast matrix, the dummy variable coding
from patsy.contrasts import Treatment
levels = list(carseats.ShelveLoc.unique())
contrast = Treatment(reference=0).code_without_intercept(levels)
cm_df = pd.DataFrame(contrast.matrix, columns=contrast.column_suffixes, index=levels, dtype=int)
display(cm_df)
[T.Good] | [T.Medium] | |
---|---|---|
Bad | 0 | 0 |
Good | 1 | 0 |
Medium | 0 | 1 |
# Auto dataset is in R ISLR package
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
1 | 18.0 | 8.0 | 307.0 | 130.0 | 3504.0 | 12.0 | 70.0 | 1.0 | chevrolet chevelle malibu |
2 | 15.0 | 8.0 | 350.0 | 165.0 | 3693.0 | 11.5 | 70.0 | 1.0 | buick skylark 320 |
3 | 18.0 | 8.0 | 318.0 | 150.0 | 3436.0 | 11.0 | 70.0 | 1.0 | plymouth satellite |
4 | 16.0 | 8.0 | 304.0 | 150.0 | 3433.0 | 12.0 | 70.0 | 1.0 | amc rebel sst |
5 | 17.0 | 8.0 | 302.0 | 140.0 | 3449.0 | 10.5 | 70.0 | 1.0 | ford torino |
6 | 15.0 | 8.0 | 429.0 | 198.0 | 4341.0 | 10.0 | 70.0 | 1.0 | ford galaxie 500 |
7 | 14.0 | 8.0 | 454.0 | 220.0 | 4354.0 | 9.0 | 70.0 | 1.0 | chevrolet impala |
8 | 14.0 | 8.0 | 440.0 | 215.0 | 4312.0 | 8.5 | 70.0 | 1.0 | plymouth fury iii |
9 | 14.0 | 8.0 | 455.0 | 225.0 | 4425.0 | 10.0 | 70.0 | 1.0 | pontiac catalina |
10 | 15.0 | 8.0 | 390.0 | 190.0 | 3850.0 | 8.5 | 70.0 | 1.0 | amc ambassador dpl |
11 | 15.0 | 8.0 | 383.0 | 170.0 | 3563.0 | 10.0 | 70.0 | 1.0 | dodge challenger se |
12 | 14.0 | 8.0 | 340.0 | 160.0 | 3609.0 | 8.0 | 70.0 | 1.0 | plymouth 'cuda 340 |
13 | 15.0 | 8.0 | 400.0 | 150.0 | 3761.0 | 9.5 | 70.0 | 1.0 | chevrolet monte carlo |
14 | 14.0 | 8.0 | 455.0 | 225.0 | 3086.0 | 10.0 | 70.0 | 1.0 | buick estate wagon (sw) |
15 | 24.0 | 4.0 | 113.0 | 95.0 | 2372.0 | 15.0 | 70.0 | 3.0 | toyota corona mark ii |
16 | 22.0 | 6.0 | 198.0 | 95.0 | 2833.0 | 15.5 | 70.0 | 1.0 | plymouth duster |
17 | 18.0 | 6.0 | 199.0 | 97.0 | 2774.0 | 15.5 | 70.0 | 1.0 | amc hornet |
18 | 21.0 | 6.0 | 200.0 | 85.0 | 2587.0 | 16.0 | 70.0 | 1.0 | ford maverick |
19 | 27.0 | 4.0 | 97.0 | 88.0 | 2130.0 | 14.5 | 70.0 | 3.0 | datsun pl510 |
20 | 26.0 | 4.0 | 97.0 | 46.0 | 1835.0 | 20.5 | 70.0 | 2.0 | volkswagen 1131 deluxe sedan |
21 | 25.0 | 4.0 | 110.0 | 87.0 | 2672.0 | 17.5 | 70.0 | 2.0 | peugeot 504 |
22 | 24.0 | 4.0 | 107.0 | 90.0 | 2430.0 | 14.5 | 70.0 | 2.0 | audi 100 ls |
23 | 25.0 | 4.0 | 104.0 | 95.0 | 2375.0 | 17.5 | 70.0 | 2.0 | saab 99e |
24 | 26.0 | 4.0 | 121.0 | 113.0 | 2234.0 | 12.5 | 70.0 | 2.0 | bmw 2002 |
25 | 21.0 | 6.0 | 199.0 | 90.0 | 2648.0 | 15.0 | 70.0 | 1.0 | amc gremlin |
26 | 10.0 | 8.0 | 360.0 | 215.0 | 4615.0 | 14.0 | 70.0 | 1.0 | ford f250 |
27 | 10.0 | 8.0 | 307.0 | 200.0 | 4376.0 | 15.0 | 70.0 | 1.0 | chevy c20 |
28 | 11.0 | 8.0 | 318.0 | 210.0 | 4382.0 | 13.5 | 70.0 | 1.0 | dodge d200 |
29 | 9.0 | 8.0 | 304.0 | 193.0 | 4732.0 | 18.5 | 70.0 | 1.0 | hi 1200d |
30 | 27.0 | 4.0 | 97.0 | 88.0 | 2130.0 | 14.5 | 71.0 | 3.0 | datsun pl510 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
368 | 28.0 | 4.0 | 112.0 | 88.0 | 2605.0 | 19.6 | 82.0 | 1.0 | chevrolet cavalier |
369 | 27.0 | 4.0 | 112.0 | 88.0 | 2640.0 | 18.6 | 82.0 | 1.0 | chevrolet cavalier wagon |
370 | 34.0 | 4.0 | 112.0 | 88.0 | 2395.0 | 18.0 | 82.0 | 1.0 | chevrolet cavalier 2-door |
371 | 31.0 | 4.0 | 112.0 | 85.0 | 2575.0 | 16.2 | 82.0 | 1.0 | pontiac j2000 se hatchback |
372 | 29.0 | 4.0 | 135.0 | 84.0 | 2525.0 | 16.0 | 82.0 | 1.0 | dodge aries se |
373 | 27.0 | 4.0 | 151.0 | 90.0 | 2735.0 | 18.0 | 82.0 | 1.0 | pontiac phoenix |
374 | 24.0 | 4.0 | 140.0 | 92.0 | 2865.0 | 16.4 | 82.0 | 1.0 | ford fairmont futura |
375 | 36.0 | 4.0 | 105.0 | 74.0 | 1980.0 | 15.3 | 82.0 | 2.0 | volkswagen rabbit l |
376 | 37.0 | 4.0 | 91.0 | 68.0 | 2025.0 | 18.2 | 82.0 | 3.0 | mazda glc custom l |
377 | 31.0 | 4.0 | 91.0 | 68.0 | 1970.0 | 17.6 | 82.0 | 3.0 | mazda glc custom |
378 | 38.0 | 4.0 | 105.0 | 63.0 | 2125.0 | 14.7 | 82.0 | 1.0 | plymouth horizon miser |
379 | 36.0 | 4.0 | 98.0 | 70.0 | 2125.0 | 17.3 | 82.0 | 1.0 | mercury lynx l |
380 | 36.0 | 4.0 | 120.0 | 88.0 | 2160.0 | 14.5 | 82.0 | 3.0 | nissan stanza xe |
381 | 36.0 | 4.0 | 107.0 | 75.0 | 2205.0 | 14.5 | 82.0 | 3.0 | honda accord |
382 | 34.0 | 4.0 | 108.0 | 70.0 | 2245.0 | 16.9 | 82.0 | 3.0 | toyota corolla |
383 | 38.0 | 4.0 | 91.0 | 67.0 | 1965.0 | 15.0 | 82.0 | 3.0 | honda civic |
384 | 32.0 | 4.0 | 91.0 | 67.0 | 1965.0 | 15.7 | 82.0 | 3.0 | honda civic (auto) |
385 | 38.0 | 4.0 | 91.0 | 67.0 | 1995.0 | 16.2 | 82.0 | 3.0 | datsun 310 gx |
386 | 25.0 | 6.0 | 181.0 | 110.0 | 2945.0 | 16.4 | 82.0 | 1.0 | buick century limited |
387 | 38.0 | 6.0 | 262.0 | 85.0 | 3015.0 | 17.0 | 82.0 | 1.0 | oldsmobile cutlass ciera (diesel) |
388 | 26.0 | 4.0 | 156.0 | 92.0 | 2585.0 | 14.5 | 82.0 | 1.0 | chrysler lebaron medallion |
389 | 22.0 | 6.0 | 232.0 | 112.0 | 2835.0 | 14.7 | 82.0 | 1.0 | ford granada l |
390 | 32.0 | 4.0 | 144.0 | 96.0 | 2665.0 | 13.9 | 82.0 | 3.0 | toyota celica gt |
391 | 36.0 | 4.0 | 135.0 | 84.0 | 2370.0 | 13.0 | 82.0 | 1.0 | dodge charger 2.2 |
392 | 27.0 | 4.0 | 151.0 | 90.0 | 2950.0 | 17.3 | 82.0 | 1.0 | chevrolet camaro |
393 | 27.0 | 4.0 | 140.0 | 86.0 | 2790.0 | 15.6 | 82.0 | 1.0 | ford mustang gl |
394 | 44.0 | 4.0 | 97.0 | 52.0 | 2130.0 | 24.6 | 82.0 | 2.0 | vw pickup |
395 | 32.0 | 4.0 | 135.0 | 84.0 | 2295.0 | 11.6 | 82.0 | 1.0 | dodge rampage |
396 | 28.0 | 4.0 | 120.0 | 79.0 | 2625.0 | 18.6 | 82.0 | 1.0 | ford ranger |
397 | 31.0 | 4.0 | 119.0 | 82.0 | 2720.0 | 19.4 | 82.0 | 1.0 | chevy s-10 |
392 rows × 9 columns
# simple linear regression on Auto dataset
auto_slr = smf.ols('mpg ~ horsepower ', data=auto).fit()
print(auto_slr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_slr.mse_resid))
print("\nResiduals:")
print(auto_slr.resid.describe()[3:])
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.606 Model: OLS Adj. R-squared: 0.605 Method: Least Squares F-statistic: 599.7 Date: Tue, 28 Feb 2017 Prob (F-statistic): 7.03e-81 Time: 16:53:35 Log-Likelihood: -1178.7 No. Observations: 392 AIC: 2361. Df Residuals: 390 BIC: 2369. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 39.9359 0.717 55.660 0.000 38.525 41.347 horsepower -0.1578 0.006 -24.489 0.000 -0.171 -0.145 ============================================================================== Omnibus: 16.432 Durbin-Watson: 0.920 Prob(Omnibus): 0.000 Jarque-Bera (JB): 17.305 Skew: 0.492 Prob(JB): 0.000175 Kurtosis: 3.299 Cond. No. 322. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. Residual Standard Error: 4.90575691955 Residuals: min -13.571040 25% -3.259151 50% -0.343543 75% 2.763033 max 16.924047 dtype: float64
# 8 (a) - iv. Prediction
auto_pred = auto_slr.predict(exog=dict(horsepower=98))
print(auto_pred)
[ 24.46707715]
# Plot SLR fitting results
fig, ax = plt.subplots()
plt.scatter(auto.horsepower, auto.mpg, color='r', s=15, alpha=0.5) # Data points
smg.abline_plot(model_results=auto_slr, ax=ax) # Regression line
plt.xlim(25, 250)
plt.ylim(5, 50)
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Simple Linear Regression on Auto dataset')
plt.show()
# Diagnostic plots
infl = auto_slr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_slr.fittedvalues, auto_slr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_slr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_slr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# 9 - (a) Plot scatter matrix.
sns.pairplot(auto, size=2)
plt.show()
# 9 - (b) Compute correlation matrix.
auto.corr()
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
---|---|---|---|---|---|---|---|---|
mpg | 1.000000 | -0.777618 | -0.805127 | -0.778427 | -0.832244 | 0.423329 | 0.580541 | 0.565209 |
cylinders | -0.777618 | 1.000000 | 0.950823 | 0.842983 | 0.897527 | -0.504683 | -0.345647 | -0.568932 |
displacement | -0.805127 | 0.950823 | 1.000000 | 0.897257 | 0.932994 | -0.543800 | -0.369855 | -0.614535 |
horsepower | -0.778427 | 0.842983 | 0.897257 | 1.000000 | 0.864538 | -0.689196 | -0.416361 | -0.455171 |
weight | -0.832244 | 0.897527 | 0.932994 | 0.864538 | 1.000000 | -0.416839 | -0.309120 | -0.585005 |
acceleration | 0.423329 | -0.504683 | -0.543800 | -0.689196 | -0.416839 | 1.000000 | 0.290316 | 0.212746 |
year | 0.580541 | -0.345647 | -0.369855 | -0.416361 | -0.309120 | 0.290316 | 1.000000 | 0.181528 |
origin | 0.565209 | -0.568932 | -0.614535 | -0.455171 | -0.585005 | 0.212746 | 0.181528 | 1.000000 |
# 9 - (c) Multiple linear regression on Auto dataset.
formula = 'mpg ~ ' + ' + '.join(auto.columns.drop(['name', 'mpg']))
print('\nRegression formula: {}\n'.format(formula))
auto_mlr = smf.ols(formula, data=auto).fit()
print(auto_mlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_mlr.mse_resid))
print("\nResiduals:")
print(auto_mlr.resid.describe()[3:])
Regression formula: mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.821 Model: OLS Adj. R-squared: 0.818 Method: Least Squares F-statistic: 252.4 Date: Tue, 28 Feb 2017 Prob (F-statistic): 2.04e-139 Time: 16:53:43 Log-Likelihood: -1023.5 No. Observations: 392 AIC: 2063. Df Residuals: 384 BIC: 2095. Df Model: 7 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [95.0% Conf. Int.] -------------------------------------------------------------------------------- Intercept -17.2184 4.644 -3.707 0.000 -26.350 -8.087 cylinders -0.4934 0.323 -1.526 0.128 -1.129 0.142 displacement 0.0199 0.008 2.647 0.008 0.005 0.035 horsepower -0.0170 0.014 -1.230 0.220 -0.044 0.010 weight -0.0065 0.001 -9.929 0.000 -0.008 -0.005 acceleration 0.0806 0.099 0.815 0.415 -0.114 0.275 year 0.7508 0.051 14.729 0.000 0.651 0.851 origin 1.4261 0.278 5.127 0.000 0.879 1.973 ============================================================================== Omnibus: 31.906 Durbin-Watson: 1.309 Prob(Omnibus): 0.000 Jarque-Bera (JB): 53.100 Skew: 0.529 Prob(JB): 2.95e-12 Kurtosis: 4.460 Cond. No. 8.59e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.59e+04. This might indicate that there are strong multicollinearity or other numerical problems. Residual Standard Error: 3.32768239641 Residuals: min -9.590261 25% -2.156516 50% -0.116941 75% 1.868966 max 13.060427 dtype: float64
# Diagnostic plots
infl = auto_mlr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_mlr.fittedvalues, auto_mlr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_mlr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_mlr, size=20, ax=ax4)
plt.xlim(0, 0.2)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# 9 - (e) Multiple linear regression with interaction terms on Auto dataset.
formula = 'mpg ~ ' + ' + '.join(auto.columns.drop(['name', 'mpg'])) + ' + horsepower:weight'
print('\nRegression formula: {}\n'.format(formula))
auto_mlr_inter = smf.ols(formula, data=auto).fit()
print(auto_mlr_inter.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_mlr_inter.mse_resid))
print("\nResiduals:")
print(auto_mlr_inter.resid.describe()[3:])
Regression formula: mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin + horsepower:weight OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.862 Model: OLS Adj. R-squared: 0.859 Method: Least Squares F-statistic: 298.6 Date: Tue, 28 Feb 2017 Prob (F-statistic): 1.88e-159 Time: 16:53:45 Log-Likelihood: -973.24 No. Observations: 392 AIC: 1964. Df Residuals: 383 BIC: 2000. Df Model: 8 Covariance Type: nonrobust ===================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------------- Intercept 2.8757 4.511 0.638 0.524 -5.993 11.744 cylinders -0.0296 0.288 -0.103 0.918 -0.596 0.537 displacement 0.0059 0.007 0.881 0.379 -0.007 0.019 horsepower -0.2313 0.024 -9.791 0.000 -0.278 -0.185 weight -0.0112 0.001 -15.393 0.000 -0.013 -0.010 acceleration -0.0902 0.089 -1.019 0.309 -0.264 0.084 year 0.7695 0.045 17.124 0.000 0.681 0.858 origin 0.8344 0.251 3.320 0.001 0.340 1.329 horsepower:weight 5.529e-05 5.23e-06 10.577 0.000 4.5e-05 6.56e-05 ============================================================================== Omnibus: 40.936 Durbin-Watson: 1.474 Prob(Omnibus): 0.000 Jarque-Bera (JB): 73.199 Skew: 0.629 Prob(JB): 1.27e-16 Kurtosis: 4.703 Cond. No. 1.23e+07 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.23e+07. This might indicate that there are strong multicollinearity or other numerical problems. Residual Standard Error: 2.93127737288 Residuals: min -8.589487 25% -1.616990 50% -0.184041 75% 1.540995 max 12.001446 dtype: float64