import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import datasets
from scipy.stats import t
from yellowbrick.regressor import ResidualsPlot
import statsmodels.formula.api as smf
from patsy import dmatrices, dmatrix
# Load the Auto data set.
dat = pd.read_csv('auto.csv')
# horsepower is read as strings: coerce it to a float column (entries that
# cannot be parsed become NaN), then mean-fill the missing values
# (5 of them, per the original note).
hp = pd.to_numeric(dat['horsepower'], errors='coerce', downcast='float')
hp = hp.fillna(hp.mean())
dat['horsepower'] = hp
# Simple linear regression of mpg on horsepower, with an intercept column.
X = sm.add_constant(dat['horsepower'])
est = sm.OLS(endog=dat['mpg'], exog=X).fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.595 Model: OLS Adj. R-squared: 0.594 Method: Least Squares F-statistic: 580.6 Date: Wed, 09 Jan 2019 Prob (F-statistic): 1.45e-79 Time: 09:41:39 Log-Likelihood: -1200.1 No. Observations: 397 AIC: 2404. Df Residuals: 395 BIC: 2412. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 40.0058 0.729 54.903 0.000 38.573 41.438 horsepower -0.1578 0.007 -24.096 0.000 -0.171 -0.145 ============================================================================== Omnibus: 21.884 Durbin-Watson: 0.902 Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.108 Skew: 0.557 Prob(JB): 5.82e-06 Kurtosis: 3.464 Cond. No. 324. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Yes - a statistically significant relationship exists.
# Predicted mpg at horsepower = 98, with 95% confidence and prediction intervals.
new_obs = np.array([1, 98])  # [const, horsepower], same column order as X
est.get_prediction(new_obs).summary_frame(alpha=0.05)
mean | mean_se | mean_ci_lower | mean_ci_upper | obs_ci_lower | obs_ci_upper | |
---|---|---|---|---|---|---|
0 | 24.537028 | 0.253797 | 24.038067 | 25.035989 | 14.722193 | 34.351862 |
# Scatter plot of mpg against horsepower with the least-squares line.
sns.regplot(x='horsepower', y='mpg', data=dat)
# Refit the simple regression so `est` matches what is being plotted.
X = sm.add_constant(dat['horsepower'])
est = sm.OLS(dat['mpg'], X).fit()
# Residuals of mpg regressed on horsepower, with a lowess smoother to expose
# non-linearity.  x and y are passed by keyword because seaborn >= 0.12 no
# longer accepts them positionally (older releases accept keywords too).
sns.residplot(x='horsepower', y='mpg', data=dat, lowess=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1c230d6ef0>
# Pairwise scatter-plot matrix of the Auto data.
sns.pairplot(dat)
<seaborn.axisgrid.PairGrid at 0x11dcd6c50>
# Annotated correlation heatmap.  Only numeric columns are correlated: `dat`
# still carries the non-numeric `name` column, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently dropping it.
sns.heatmap(dat.select_dtypes(include='number').corr(), annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x120f70908>
# Multiple regression of mpg on every column except the car name.
predictors = dat.drop(columns=['name', 'mpg'])
X = sm.add_constant(predictors)
est = sm.OLS(endog=dat['mpg'], exog=X).fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.822 Model: OLS Adj. R-squared: 0.818 Method: Least Squares F-statistic: 256.0 Date: Wed, 05 Sep 2018 Prob (F-statistic): 2.41e-141 Time: 10:53:54 Log-Likelihood: -1037.4 No. Observations: 397 AIC: 2091. Df Residuals: 389 BIC: 2123. Df Model: 7 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- const -18.7116 4.609 -4.060 0.000 -27.773 -9.650 cylinders -0.4452 0.323 -1.380 0.168 -1.079 0.189 displacement 0.0189 0.007 2.524 0.012 0.004 0.034 horsepower -0.0094 0.013 -0.709 0.479 -0.035 0.017 weight -0.0067 0.001 -10.508 0.000 -0.008 -0.005 acceleration 0.1179 0.097 1.217 0.224 -0.073 0.308 year 0.7625 0.051 15.071 0.000 0.663 0.862 origin 1.3968 0.275 5.073 0.000 0.855 1.938 ============================================================================== Omnibus: 29.782 Durbin-Watson: 1.291 Prob(Omnibus): 0.000 Jarque-Bera (JB): 47.819 Skew: 0.506 Prob(JB): 4.13e-11 Kurtosis: 4.366 Cond. No. 8.53e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.53e+04. This might indicate that there are strong multicollinearity or other numerical problems.
# Residual plot with sklearn and seaborn
# Fit mpg on all predictors (everything except mpg itself and the car name).
X = dat.drop(['mpg', 'name'], axis=1)
y = dat['mpg']
regr = LinearRegression()
regr.fit(X, y)
y_hat = regr.predict(X)
resid = y - y_hat
# Residuals against fitted values with a lowess trend line.  x and y must be
# keywords: seaborn >= 0.12 rejects positional data arguments.
sns.residplot(x=y_hat, y=resid, lowess=True)
/usr/local/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver. linalg.lstsq(X, y)
<matplotlib.axes._subplots.AxesSubplot at 0x121087ac8>
# Studentised residuals plot with statsmodels and seaborn
X = sm.add_constant(dat).drop(['name', 'mpg'], axis=1)
est = sm.OLS(dat['mpg'], X).fit()
# Internally studentised residuals against fitted values.  x and y are passed
# by keyword because seaborn >= 0.12 rejects positional data arguments.
studentised = est.get_influence().resid_studentized_internal
sns.residplot(x=est.fittedvalues, y=studentised, lowess=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1210e64a8>
# Residual plot with yellowbrick
# .. they're plotting y_hat - y?  (sign convention not verified here)
# Uses whatever X and y are currently in scope from the cells above.
# NOTE(review): newer yellowbrick releases rename poof() to show() - confirm
# against the installed version before changing the call.
regr = LinearRegression()
vis = ResidualsPlot(regr)
vis.fit(X, y)
vis.poof()
# Influence plot with statsmodels (point size driven by Cook's distance).
fig, ax = plt.subplots(figsize=(15, 15))
fig = sm.graphics.influence_plot(est, criterion="cooks", ax=ax)
# Partial regression plot matrix with statsmodels: one panel per regressor.
fig = sm.graphics.plot_partregress_grid(est, fig=plt.figure(figsize=(12, 8)))
# Partial regression plot / added variable plot with statsmodels
# https://www.statsmodels.org/dev/generated/statsmodels.graphics.regressionplots.plot_partregress.html
y_name = 'mpg'        # response
indi_name = 'weight'  # regressor whose partial effect is plotted
# Every remaining column (car name excluded) is partialled out.
rest = list(dat.columns.drop([y_name, indi_name, 'name']))
fig, ax = plt.subplots(figsize=(12, 8))
# Pass the variables defined above rather than repeating the literals, so
# changing y_name / indi_name cannot leave `rest` out of step with the plot.
fig = sm.graphics.plot_partregress(y_name, indi_name, rest, data=dat, ax=ax)
# QQ plot with statsmodels of the residual vector currently held in `resid`.
qq = sm.qqplot(data=resid)
# With 7 predictors there are 7 choose 2 = 21 possible pairwise interactions.
# Trying out a couple..making mental note about how to search a space that
# could be larger than this.
interaction_models = [
    ('NO INTERACTIONS',
     'mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin'),
    ('CYLINDERS X DISPLACEMENT',
     'mpg ~ cylinders * displacement + horsepower + weight + acceleration + year + origin'),
    ('DISPLACEMENT X HORSEPOWER',
     'mpg ~ cylinders + displacement * horsepower + weight + acceleration + year + origin'),
]
# Fit and report each model in turn; `form` and `est` end up holding the last one.
for label, form in interaction_models:
    print(label)
    est = smf.ols(formula=form, data=dat).fit()
    print(est.summary())
    print('-----------------------')
NO INTERACTIONS OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.822 Model: OLS Adj. R-squared: 0.818 Method: Least Squares F-statistic: 256.0 Date: Tue, 04 Sep 2018 Prob (F-statistic): 2.41e-141 Time: 16:51:50 Log-Likelihood: -1037.4 No. Observations: 397 AIC: 2091. Df Residuals: 389 BIC: 2123. Df Model: 7 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept -18.7116 4.609 -4.060 0.000 -27.773 -9.650 cylinders -0.4452 0.323 -1.380 0.168 -1.079 0.189 displacement 0.0189 0.007 2.524 0.012 0.004 0.034 horsepower -0.0094 0.013 -0.709 0.479 -0.035 0.017 weight -0.0067 0.001 -10.508 0.000 -0.008 -0.005 acceleration 0.1179 0.097 1.217 0.224 -0.073 0.308 year 0.7625 0.051 15.071 0.000 0.663 0.862 origin 1.3968 0.275 5.073 0.000 0.855 1.938 ============================================================================== Omnibus: 29.782 Durbin-Watson: 1.291 Prob(Omnibus): 0.000 Jarque-Bera (JB): 47.819 Skew: 0.506 Prob(JB): 4.13e-11 Kurtosis: 4.366 Cond. No. 8.53e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.53e+04. This might indicate that there are strong multicollinearity or other numerical problems. ----------------------- CYLINDERS X DISPLACEMENT OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.846 Model: OLS Adj. R-squared: 0.843 Method: Least Squares F-statistic: 267.0 Date: Tue, 04 Sep 2018 Prob (F-statistic): 1.26e-152 Time: 16:51:50 Log-Likelihood: -1007.9 No. Observations: 397 AIC: 2034. Df Residuals: 388 BIC: 2070. 
Df Model: 8 Covariance Type: nonrobust ========================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------ Intercept -4.2248 4.661 -0.906 0.365 -13.388 4.939 cylinders -2.6365 0.409 -6.452 0.000 -3.440 -1.833 displacement -0.0783 0.014 -5.533 0.000 -0.106 -0.051 cylinders:displacement 0.0136 0.002 7.890 0.000 0.010 0.017 horsepower -0.0399 0.013 -3.092 0.002 -0.065 -0.015 weight -0.0055 0.001 -8.939 0.000 -0.007 -0.004 acceleration 0.0982 0.090 1.091 0.276 -0.079 0.275 year 0.7709 0.047 16.390 0.000 0.678 0.863 origin 0.6749 0.272 2.483 0.013 0.141 1.209 ============================================================================== Omnibus: 33.121 Durbin-Watson: 1.430 Prob(Omnibus): 0.000 Jarque-Bera (JB): 80.138 Skew: 0.411 Prob(JB): 3.97e-18 Kurtosis: 5.042 Cond. No. 1.03e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.03e+05. This might indicate that there are strong multicollinearity or other numerical problems. ----------------------- DISPLACEMENT X HORSEPOWER OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.859 Model: OLS Adj. R-squared: 0.856 Method: Least Squares F-statistic: 295.5 Date: Tue, 04 Sep 2018 Prob (F-statistic): 7.17e-160 Time: 16:51:50 Log-Likelihood: -990.77 No. Observations: 397 AIC: 2000. Df Residuals: 388 BIC: 2035. 
Df Model: 8 Covariance Type: nonrobust =========================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------- Intercept -5.1268 4.316 -1.188 0.236 -13.613 3.359 cylinders 0.6520 0.307 2.125 0.034 0.049 1.255 displacement -0.0708 0.011 -6.391 0.000 -0.093 -0.049 horsepower -0.1721 0.020 -8.641 0.000 -0.211 -0.133 displacement:horsepower 0.0005 4.83e-05 10.140 0.000 0.000 0.001 weight -0.0038 0.001 -5.946 0.000 -0.005 -0.003 acceleration -0.1269 0.090 -1.418 0.157 -0.303 0.049 year 0.7550 0.045 16.761 0.000 0.666 0.844 origin 0.6489 0.256 2.535 0.012 0.146 1.152 ============================================================================== Omnibus: 45.972 Durbin-Watson: 1.506 Prob(Omnibus): 0.000 Jarque-Bera (JB): 89.743 Skew: 0.658 Prob(JB): 3.25e-20 Kurtosis: 4.922 Cond. No. 9.33e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 9.33e+05. This might indicate that there are strong multicollinearity or other numerical problems. -----------------------
# Compare the untransformed fit against log / reciprocal transformations of
# the response, and a log transform of the continuous predictors.
transformation_models = [
    ('NO TRANSFORMATIONS',
     'mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin'),
    ('LOG MPG',
     'np.log(mpg) ~ cylinders + displacement + horsepower + weight + acceleration + year + origin'),
    ('RECIPROCAL MPG',
     'np.reciprocal(mpg) ~ cylinders + displacement + horsepower + weight + acceleration + year + origin'),
    ('LOG EVERYTHING SENSIBLE',
     'np.log(mpg) ~ cylinders + np.log(displacement) + np.log(horsepower) + np.log(weight) + np.log(acceleration) + year + origin'),
]
# Fit and report each model in turn; `form` and `est` end up holding the last one.
for label, form in transformation_models:
    print(label)
    est = smf.ols(formula=form, data=dat).fit()
    print(est.summary())
    print('-----------------------')
NO TRANSFORMATIONS OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.822 Model: OLS Adj. R-squared: 0.818 Method: Least Squares F-statistic: 256.0 Date: Wed, 05 Sep 2018 Prob (F-statistic): 2.41e-141 Time: 11:05:05 Log-Likelihood: -1037.4 No. Observations: 397 AIC: 2091. Df Residuals: 389 BIC: 2123. Df Model: 7 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept -18.7116 4.609 -4.060 0.000 -27.773 -9.650 cylinders -0.4452 0.323 -1.380 0.168 -1.079 0.189 displacement 0.0189 0.007 2.524 0.012 0.004 0.034 horsepower -0.0094 0.013 -0.709 0.479 -0.035 0.017 weight -0.0067 0.001 -10.508 0.000 -0.008 -0.005 acceleration 0.1179 0.097 1.217 0.224 -0.073 0.308 year 0.7625 0.051 15.071 0.000 0.663 0.862 origin 1.3968 0.275 5.073 0.000 0.855 1.938 ============================================================================== Omnibus: 29.782 Durbin-Watson: 1.291 Prob(Omnibus): 0.000 Jarque-Bera (JB): 47.819 Skew: 0.506 Prob(JB): 4.13e-11 Kurtosis: 4.366 Cond. No. 8.53e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.53e+04. This might indicate that there are strong multicollinearity or other numerical problems. ----------------------- LOG MPG OLS Regression Results ============================================================================== Dep. Variable: np.log(mpg) R-squared: 0.880 Model: OLS Adj. R-squared: 0.877 Method: Least Squares F-statistic: 405.9 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.83e-174 Time: 11:05:05 Log-Likelihood: 285.57 No. 
Observations: 397 AIC: -555.1 Df Residuals: 389 BIC: -523.3 Df Model: 7 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept 1.7070 0.165 10.373 0.000 1.383 2.031 cylinders -0.0262 0.012 -2.279 0.023 -0.049 -0.004 displacement 0.0006 0.000 2.232 0.026 7.12e-05 0.001 horsepower -0.0012 0.000 -2.558 0.011 -0.002 -0.000 weight -0.0003 2.29e-05 -11.539 0.000 -0.000 -0.000 acceleration -4.102e-06 0.003 -0.001 0.999 -0.007 0.007 year 0.0299 0.002 16.543 0.000 0.026 0.033 origin 0.0392 0.010 3.990 0.000 0.020 0.059 ============================================================================== Omnibus: 7.894 Durbin-Watson: 1.391 Prob(Omnibus): 0.019 Jarque-Bera (JB): 10.607 Skew: -0.163 Prob(JB): 0.00497 Kurtosis: 3.732 Cond. No. 8.53e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.53e+04. This might indicate that there are strong multicollinearity or other numerical problems. ----------------------- RECIPROCAL MPG OLS Regression Results ============================================================================== Dep. Variable: np.reciprocal(mpg) R-squared: 0.885 Model: OLS Adj. R-squared: 0.883 Method: Least Squares F-statistic: 428.9 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.48e-178 Time: 11:05:05 Log-Likelihood: 1493.8 No. Observations: 397 AIC: -2972. Df Residuals: 389 BIC: -2940. 
Df Model: 7 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept 0.0929 0.008 11.839 0.000 0.077 0.108 cylinders 0.0014 0.001 2.613 0.009 0.000 0.003 displacement -2.369e-05 1.28e-05 -1.858 0.064 -4.88e-05 1.38e-06 horsepower 0.0001 2.26e-05 5.047 0.000 6.95e-05 0.000 weight 1.124e-05 1.09e-06 10.307 0.000 9.1e-06 1.34e-05 acceleration 0.0003 0.000 1.705 0.089 -4.31e-05 0.001 year -0.0013 8.61e-05 -14.769 0.000 -0.001 -0.001 origin -0.0009 0.000 -1.951 0.052 -0.002 7.16e-06 ============================================================================== Omnibus: 52.723 Durbin-Watson: 1.477 Prob(Omnibus): 0.000 Jarque-Bera (JB): 114.694 Skew: 0.706 Prob(JB): 1.24e-25 Kurtosis: 5.223 Cond. No. 8.53e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.53e+04. This might indicate that there are strong multicollinearity or other numerical problems. ----------------------- LOG EVERYTHING SENSIBLE OLS Regression Results ============================================================================== Dep. Variable: np.log(mpg) R-squared: 0.891 Model: OLS Adj. R-squared: 0.889 Method: Least Squares F-statistic: 452.3 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.57e-182 Time: 11:05:05 Log-Likelihood: 304.55 No. 
Observations: 397 AIC: -593.1 Df Residuals: 389 BIC: -561.2 Df Model: 7 Covariance Type: nonrobust ======================================================================================== coef std err t P>|t| [0.025 0.975] ---------------------------------------------------------------------------------------- Intercept 7.2502 0.370 19.596 0.000 6.523 7.978 cylinders -0.0123 0.011 -1.174 0.241 -0.033 0.008 np.log(displacement) -0.0130 0.053 -0.248 0.805 -0.116 0.090 np.log(horsepower) -0.2377 0.053 -4.448 0.000 -0.343 -0.133 np.log(weight) -0.6101 0.078 -7.774 0.000 -0.764 -0.456 np.log(acceleration) -0.1406 0.057 -2.486 0.013 -0.252 -0.029 year 0.0300 0.002 17.543 0.000 0.027 0.033 origin 0.0200 0.010 1.955 0.051 -0.000 0.040 ============================================================================== Omnibus: 8.161 Durbin-Watson: 1.488 Prob(Omnibus): 0.017 Jarque-Bera (JB): 13.327 Skew: -0.037 Prob(JB): 0.00128 Kurtosis: 3.894 Cond. No. 5.05e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.05e+03. This might indicate that there are strong multicollinearity or other numerical problems. -----------------------
# Visualising variable transformations: pair-plot a copy of the data with the
# continuous columns transformed.
# NOTE(review): horsepower uses sqrt here but log in the regression formulas
# - confirm this is intended.
trans = dat.assign(
    mpg=np.log(dat['mpg']),
    displacement=np.log(dat['displacement']),
    horsepower=np.sqrt(dat['horsepower']),
    weight=np.log(dat['weight']),
    acceleration=np.log(dat['acceleration']),
)
sns.pairplot(trans)
<seaborn.axisgrid.PairGrid at 0x166593128>
# Carseats data: regress Sales on Price plus the Urban and US factors.
car_dat = pd.read_csv('carseats.csv')
car_dat.head()
form = 'Sales ~ Price + Urban + US'
est = smf.ols(form, data=car_dat).fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: Sales R-squared: 0.239 Model: OLS Adj. R-squared: 0.234 Method: Least Squares F-statistic: 41.52 Date: Wed, 05 Sep 2018 Prob (F-statistic): 2.39e-23 Time: 11:06:16 Log-Likelihood: -927.66 No. Observations: 400 AIC: 1863. Df Residuals: 396 BIC: 1879. Df Model: 3 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept 13.0435 0.651 20.036 0.000 11.764 14.323 Urban[T.Yes] -0.0219 0.272 -0.081 0.936 -0.556 0.512 US[T.Yes] 1.2006 0.259 4.635 0.000 0.691 1.710 Price -0.0545 0.005 -10.389 0.000 -0.065 -0.044 ============================================================================== Omnibus: 0.676 Durbin-Watson: 1.912 Prob(Omnibus): 0.713 Jarque-Bera (JB): 0.758 Skew: 0.093 Prob(JB): 0.684 Kurtosis: 2.897 Cond. No. 628. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
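Intercept: expected Sales when Price = 0 for a store that is neither Urban nor in the US (b0 = 13.0435).
Urban[T.Yes]: shift in expected Sales for an Urban store, holding Price and US fixed (b1 = -0.0219; p = 0.936, so not statistically significant).
US[T.Yes]: shift in expected Sales for a US store, holding Price and Urban fixed (b2 = 1.2006).
Price: change in expected Sales per one-unit increase in Price, holding Urban and US fixed (b3 = -0.0545).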
i.e:
If Urban and US: y = b0 + b1 + b2 + b3 * price
If Urban and not US: y = b0 + b1 + b3 * price
If not Urban and US: y = b0 + b2 + b3 * price
If not Urban nor US: y = b0 + b3 * price
# Smaller model: drop Urban and keep only Price and US.
form = 'Sales ~ Price + US'
reduced_model = smf.ols(form, data=car_dat)
est = reduced_model.fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: Sales R-squared: 0.239 Model: OLS Adj. R-squared: 0.235 Method: Least Squares F-statistic: 62.43 Date: Wed, 05 Sep 2018 Prob (F-statistic): 2.66e-24 Time: 11:10:11 Log-Likelihood: -927.66 No. Observations: 400 AIC: 1861. Df Residuals: 397 BIC: 1873. Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 13.0308 0.631 20.652 0.000 11.790 14.271 US[T.Yes] 1.1996 0.258 4.641 0.000 0.692 1.708 Price -0.0545 0.005 -10.416 0.000 -0.065 -0.044 ============================================================================== Omnibus: 0.666 Durbin-Watson: 1.912 Prob(Omnibus): 0.717 Jarque-Bera (JB): 0.749 Skew: 0.092 Prob(JB): 0.688 Kurtosis: 2.895 Cond. No. 607. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Terminology:
#   "Residual Sum of Squares" (RSS) = "Sum of Squared Residuals" (SSR)
#   "Residual Standard Error" (RSE) = "Standard Error of the Regression" = sqrt(scale)
n_params = 3  # number of fitted coefficients subtracted from n below
resid_dof = car_dat.index.size - n_params
rse = np.sqrt(est.ssr / resid_dof)  # = np.sqrt(est.scale)
# RSE as a percentage of mean Sales, computed before rounding rse.
per_error = rse / car_dat['Sales'].mean() * 100
rse = np.round(rse, 3)
per_error = np.round(per_error, 3)
print('RSE = {}, which represents {}% error'.format(rse, per_error))
RSE = 2.469, which represents 32.941% error
# 95% confidence intervals for the coefficients of the reduced model.
form = 'Sales ~ Price + US'
est = smf.ols(form, data=car_dat).fit()
est.conf_int(alpha=0.05)
0 | 1 | |
---|---|---|
Intercept | 11.79032 | 14.271265 |
US[T.Yes] | 0.69152 | 1.707766 |
Price | -0.06476 | -0.044195 |
# Influence plot with statsmodels
fig, ax = plt.subplots(figsize=(15, 15))
fig = sm.graphics.influence_plot(est, ax=ax, criterion="cooks")
# DETECTING HIGH LEVERAGE POINTS:
# --------------------------------
# In ISL:
# They say expected leverage = (p + 1) / n, values "greatly exceeding" this have high leverage.
# Hmmm... how much greater exactly is "greatly exceeding?"
# In StatsModels docs NOTES for the influence_plot:
# Row labels for the observations in which the leverage,
# measured by the diagonal of the hat matrix,
# is high or the residuals are large, as the combination of large residuals
# and a high influence value indicates an influence point.
# The value of large residuals can be controlled using the alpha parameter.
# ---> Large leverage points are identified as hat_i > 2 * (df_model + 1)/nobs.
# Looks like the statsmodels default is to identify high leverage as 2 * (p + 1) / n
# Looks like we can't control this - the alpha parameter is for setting the
# threshold on large residuals - not high leverage.
# Derive (p + 1) / n from the fitted model instead of hard-coding 3 / 400, so
# the printed value and the plotted line cannot drift apart.
expected_leverage = (est.df_model + 1) / est.nobs
print('expected leverage: {}'.format(expected_leverage))
# plotting a line at 2 * (p + 1) / n to check I have understood.
plt.axvline(x=2 * expected_leverage, c='y')
# DETECTING OUTLIERS:
# -------------------
# In ISL:
# They say studentized residuals "greater than 3 in absolute value are possible outliers"
# In StatsModels docs `alpha` parameter for the influence_plot:
# The alpha value to identify large studentized residuals.
# Large means abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid)
# The default value for `alpha` is 0.05 (0.75 is the default of `plot_alpha`,
# the transparency of the plotted points - easy to mix the two up).
# t.ppf(1. - 0.05 / 2, est.df_resid) is roughly 1.97, which agrees with the
# plot: *studentized* residuals greater than about 2 in absolute value are
# labelled.
expected leverage: 0.0075
<matplotlib.lines.Line2D at 0x1223b9d68>
set.seed(1)
x=rnorm(100)
y=2*x+rnorm(100)
# Python equivalent of the R snippet: y = 2x + noise, with x and the noise
# drawn from a standard normal under a fixed seed.
np.random.seed(1)
n_obs = 100
x = np.random.normal(size=n_obs)
y = 2 * x + np.random.normal(size=n_obs)
norm_data = pd.DataFrame({'x': x, 'y': y})
# Regress y onto x through the origin (the "-1" removes the intercept).
y_on_x = smf.ols(formula='y ~ x -1', data=norm_data)
est = y_on_x.fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.798 Model: OLS Adj. R-squared: 0.796 Method: Least Squares F-statistic: 391.7 Date: Wed, 05 Sep 2018 Prob (F-statistic): 3.46e-36 Time: 11:18:00 Log-Likelihood: -135.67 No. Observations: 100 AIC: 273.3 Df Residuals: 99 BIC: 275.9 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ x 2.1067 0.106 19.792 0.000 1.896 2.318 ============================================================================== Omnibus: 0.880 Durbin-Watson: 2.106 Prob(Omnibus): 0.644 Jarque-Bera (JB): 0.554 Skew: -0.172 Prob(JB): 0.758 Kurtosis: 3.119 Cond. No. 1.00 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Regress x onto y through the origin (the "-1" removes the intercept).
x_on_y = smf.ols(formula='x ~ y -1', data=norm_data)
est = x_on_y.fit()
print(est.summary())
OLS Regression Results ============================================================================== Dep. Variable: x R-squared: 0.798 Model: OLS Adj. R-squared: 0.796 Method: Least Squares F-statistic: 391.7 Date: Wed, 05 Sep 2018 Prob (F-statistic): 3.46e-36 Time: 11:18:54 Log-Likelihood: -49.891 No. Observations: 100 AIC: 101.8 Df Residuals: 99 BIC: 104.4 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ y 0.3789 0.019 19.792 0.000 0.341 0.417 ============================================================================== Omnibus: 0.476 Durbin-Watson: 2.166 Prob(Omnibus): 0.788 Jarque-Bera (JB): 0.631 Skew: 0.115 Prob(JB): 0.729 Kurtosis: 2.685 Cond. No. 1.00 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
for example: @ x = 2
from (a):
y = 2.1067 * 2 = 4.2134
4.2134 = 2.0 * 2.0 + epsilon
epsilon = 0.2134
from (b):
2.0 = 0.5 * (4.2134 - 0.2134)
2.0 = 2.0
# Hand-compute the t-statistic of the no-intercept regression from raw sums:
#   t = sqrt(n - 1) * sum(x*y) / sqrt(sum(x^2) * sum(y^2) - sum(x*y)^2)
# The expression is symmetric in x and y.
x = norm_data['x']
y = norm_data['y']
n_obs = norm_data.index.size
sum_xy = sum(x * y)
sum_xx = sum(x * x)
sum_yy = sum(y * y)
num = np.sqrt(n_obs - 1) * sum_xy
denom = np.sqrt(sum_xx * sum_yy - sum_xy ** 2)
print(num / denom)
19.791801987091272
# Repeat both regressions, this time keeping the intercept, and print each summary.
for formula in ('y ~ x', 'x ~ y'):
    print(smf.ols(formula=formula, data=norm_data).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.800 Model: OLS Adj. R-squared: 0.798 Method: Least Squares F-statistic: 391.4 Date: Wed, 05 Sep 2018 Prob (F-statistic): 5.39e-36 Time: 11:23:12 Log-Likelihood: -134.44 No. Observations: 100 AIC: 272.9 Df Residuals: 98 BIC: 278.1 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 0.1470 0.094 1.564 0.121 -0.039 0.334 x 2.0954 0.106 19.783 0.000 1.885 2.306 ============================================================================== Omnibus: 0.898 Durbin-Watson: 2.157 Prob(Omnibus): 0.638 Jarque-Bera (JB): 0.561 Skew: -0.172 Prob(JB): 0.755 Kurtosis: 3.127 Cond. No. 1.15 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: x R-squared: 0.800 Model: OLS Adj. R-squared: 0.798 Method: Least Squares F-statistic: 391.4 Date: Wed, 05 Sep 2018 Prob (F-statistic): 5.39e-36 Time: 11:23:12 Log-Likelihood: -49.289 No. Observations: 100 AIC: 102.6 Df Residuals: 98 BIC: 107.8 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -0.0440 0.040 -1.090 0.279 -0.124 0.036 y 0.3817 0.019 19.783 0.000 0.343 0.420 ============================================================================== Omnibus: 0.456 Durbin-Watson: 2.192 Prob(Omnibus): 0.796 Jarque-Bera (JB): 0.611 Skew: 0.118 Prob(JB): 0.737 Kurtosis: 2.698 Cond. No. 
2.12 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
sum(x ** 2 ) == sum(y ** 2)
# Two independent standard-normal samples; fit x onto y and y onto x, both
# without an intercept, so the two slope estimates can be compared.
x = np.random.randn(100)
y = np.random.randn(100)
sample = {'x': x, 'y': y}
for formula in ('x ~ y -1', 'y ~ x -1'):
    print(smf.ols(formula=formula, data=sample).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: x R-squared: 0.003 Model: OLS Adj. R-squared: -0.007 Method: Least Squares F-statistic: 0.2769 Date: Wed, 05 Sep 2018 Prob (F-statistic): 0.600 Time: 11:25:05 Log-Likelihood: -142.25 No. Observations: 100 AIC: 286.5 Df Residuals: 99 BIC: 289.1 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ y -0.0499 0.095 -0.526 0.600 -0.238 0.138 ============================================================================== Omnibus: 0.381 Durbin-Watson: 2.209 Prob(Omnibus): 0.826 Jarque-Bera (JB): 0.472 Skew: 0.139 Prob(JB): 0.790 Kurtosis: 2.812 Cond. No. 1.00 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.003 Model: OLS Adj. R-squared: -0.007 Method: Least Squares F-statistic: 0.2769 Date: Wed, 05 Sep 2018 Prob (F-statistic): 0.600 Time: 11:25:05 Log-Likelihood: -148.01 No. Observations: 100 AIC: 298.0 Df Residuals: 99 BIC: 300.6 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ x -0.0559 0.106 -0.526 0.600 -0.267 0.155 ============================================================================== Omnibus: 0.426 Durbin-Watson: 1.977 Prob(Omnibus): 0.808 Jarque-Bera (JB): 0.121 Skew: -0.045 Prob(JB): 0.941 Kurtosis: 3.145 Cond. No. 
1.00 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# y holds the same values as x in reverse order, so sum(x**2) == sum(y**2).
# Fit both no-intercept regressions and print their summaries.
x = np.arange(100)
y = np.arange(100)[::-1]
sample = {'x': x, 'y': y}
for formula in ('x ~ y -1', 'y ~ x -1'):
    print(smf.ols(formula=formula, data=sample).fit().summary())
OLS Regression Results ============================================================================== Dep. Variable: x R-squared: 0.243 Model: OLS Adj. R-squared: 0.235 Method: Least Squares F-statistic: 31.70 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.69e-07 Time: 11:26:36 Log-Likelihood: -532.84 No. Observations: 100 AIC: 1068. Df Residuals: 99 BIC: 1070. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ y 0.4925 0.087 5.630 0.000 0.319 0.666 ============================================================================== Omnibus: 33.630 Durbin-Watson: 0.001 Prob(Omnibus): 0.000 Jarque-Bera (JB): 6.002 Skew: -0.000 Prob(JB): 0.0497 Kurtosis: 1.800 Cond. No. 1.00 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.243 Model: OLS Adj. R-squared: 0.235 Method: Least Squares F-statistic: 31.70 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.69e-07 Time: 11:26:36 Log-Likelihood: -532.84 No. Observations: 100 AIC: 1068. Df Residuals: 99 BIC: 1070. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ x 0.4925 0.087 5.630 0.000 0.319 0.666 ============================================================================== Omnibus: 33.630 Durbin-Watson: 0.001 Prob(Omnibus): 0.000 Jarque-Bera (JB): 6.002 Skew: -0.000 Prob(JB): 0.0497 Kurtosis: 1.800 Cond. No. 
1.00 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Draw 100 standard-normal predictor values, then 100 noise terms with mean 0.
# np.random.normal takes the standard deviation as `scale`, so eps has sd 0.25.
x = np.random.randn(100)
eps = np.random.normal(loc=0, scale=0.25, size=100)
$Y = -1 + 0.5X + \varepsilon$
# Build the response from the linear model y = -1 + 0.5 * x + eps.
# (The stray ".¶" markup residue that was fused onto this line has been removed;
# it made the statement a syntax error.)
y = -1 + 0.5 * x + eps
# np.linalg.norm of a 1-D array is its Euclidean (L2) norm.
print('length of vector y: {}'.format(np.linalg.norm(y)))
length of vector y: 11.56529239894809
# Scatter plot of the response y against the predictor x.
sns.scatterplot(x=x, y=y)
<matplotlib.axes._subplots.AxesSubplot at 0x122bf2cf8>
# Least-squares fit of y on x (with intercept); print the full summary table.
sim_df = pd.DataFrame({'x': x, 'y': y})
linear_fit = smf.ols(formula='y ~ x', data=sim_df).fit()
print(linear_fit.summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.816 Model: OLS Adj. R-squared: 0.814 Method: Least Squares F-statistic: 433.9 Date: Wed, 05 Sep 2018 Prob (F-statistic): 8.97e-38 Time: 11:46:25 Log-Likelihood: -1.8928 No. Observations: 100 AIC: 7.786 Df Residuals: 98 BIC: 13.00 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -0.9741 0.025 -39.043 0.000 -1.024 -0.925 x 0.5416 0.026 20.830 0.000 0.490 0.593 ============================================================================== Omnibus: 2.160 Durbin-Watson: 2.504 Prob(Omnibus): 0.340 Jarque-Bera (JB): 1.624 Skew: -0.162 Prob(JB): 0.444 Kurtosis: 3.533 Cond. No. 1.07 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Fitted least-squares line (seaborn) with the population regression line
# y = -1 + 0.5 * x overlaid in yellow.
ax = sns.regplot(x=x, y=y)
# Endpoints of the population line over x in [-3, 3].
x1, x2 = -3, 3
y1, y2 = -1 + 0.5 * x1, -1 + 0.5 * x2
plt.plot([x1, x2], [y1, y2], c='y', label='population regression line')
plt.legend()
<matplotlib.legend.Legend at 0x123ba5e10>
# Add a quadratic term to the model and print the summary.
quad_df = pd.DataFrame({'x': x, 'y': y})
print(smf.ols(formula='y ~ x + I(x ** 2)', data=quad_df).fit().summary())
# Second-order fit drawn by seaborn.
sns.regplot(x=x, y=y, order=2)
# Overlay the population regression line y = -1 + 0.5 * x in yellow.
x1, x2 = -3, 3
y1, y2 = -1 + 0.5 * x1, -1 + 0.5 * x2
plt.plot([x1, x2], [y1, y2], c='y', label='population regression line')
plt.legend()
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.816 Model: OLS Adj. R-squared: 0.813 Method: Least Squares F-statistic: 215.6 Date: Wed, 05 Sep 2018 Prob (F-statistic): 2.00e-36 Time: 12:14:17 Log-Likelihood: -1.7262 No. Observations: 100 AIC: 9.452 Df Residuals: 97 BIC: 17.27 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -0.9636 0.031 -31.002 0.000 -1.025 -0.902 x 0.5388 0.027 20.316 0.000 0.486 0.591 I(x ** 2) -0.0115 0.020 -0.569 0.571 -0.052 0.029 ============================================================================== Omnibus: 1.975 Durbin-Watson: 2.529 Prob(Omnibus): 0.373 Jarque-Bera (JB): 1.410 Skew: -0.172 Prob(JB): 0.494 Kurtosis: 3.469 Cond. No. 2.33 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
<matplotlib.legend.Legend at 0x123e5d8d0>
# Less noisy variant: the error term now has standard deviation 0.1.
x = np.random.randn(100)
eps = np.random.normal(loc=0, scale=0.1, size=100)
y = -1 + 0.5 * x + eps
low_noise_df = pd.DataFrame({'x': x, 'y': y})
print(smf.ols(formula='y ~ x', data=low_noise_df).fit().summary())
sns.regplot(x=x, y=y)
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.963 Model: OLS Adj. R-squared: 0.963 Method: Least Squares F-statistic: 2584. Date: Wed, 05 Sep 2018 Prob (F-statistic): 3.06e-72 Time: 12:32:37 Log-Likelihood: 97.892 No. Observations: 100 AIC: -191.8 Df Residuals: 98 BIC: -186.6 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -1.0040 0.009 -109.031 0.000 -1.022 -0.986 x 0.5032 0.010 50.838 0.000 0.484 0.523 ============================================================================== Omnibus: 0.876 Durbin-Watson: 2.033 Prob(Omnibus): 0.645 Jarque-Bera (JB): 0.875 Skew: -0.217 Prob(JB): 0.646 Kurtosis: 2.855 Cond. No. 1.11 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
<matplotlib.axes._subplots.AxesSubplot at 0x12449ae80>
# Noisier variant: the error term now has standard deviation 2.
# NOTE(review): this cell generates y = 1 - 0.5 * x + eps, whereas the earlier
# cells use y = -1 + 0.5 * x + eps. Confirm whether the sign flip is intended.
x = np.random.randn(100)
eps = np.random.normal(loc=0, scale=2, size=100)
y = 1 - 0.5 * x + eps
high_noise_df = pd.DataFrame({'x': x, 'y': y})
print(smf.ols(formula='y ~ x', data=high_noise_df).fit().summary())
sns.regplot(x=x, y=y)
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.037 Model: OLS Adj. R-squared: 0.027 Method: Least Squares F-statistic: 3.764 Date: Wed, 05 Sep 2018 Prob (F-statistic): 0.0553 Time: 12:38:22 Log-Likelihood: -202.87 No. Observations: 100 AIC: 409.7 Df Residuals: 98 BIC: 414.9 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.2510 0.188 6.669 0.000 0.879 1.623 x -0.3653 0.188 -1.940 0.055 -0.739 0.008 ============================================================================== Omnibus: 3.529 Durbin-Watson: 2.033 Prob(Omnibus): 0.171 Jarque-Bera (JB): 2.984 Skew: -0.411 Prob(JB): 0.225 Kurtosis: 3.203 Cond. No. 1.15 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
<matplotlib.axes._subplots.AxesSubplot at 0x1245d6240>
# Compare the 95% confidence interval (alpha = 0.05) of the slope across three
# noise levels that share one set of predictor values.
x = np.random.randn(100)
eps_orig = np.random.normal(0, 0.25, 100)
eps_quiet = np.random.normal(0, 0.1, 100)
eps_noisey = np.random.normal(0, 2, 100)
y_orig = 1 - 0.5 * x + eps_orig
y_quiet = 1 - 0.5 * x + eps_quiet
y_noisey = 1 - 0.5 * x + eps_noisey
# The original cell annotated these three fits with 0.100231, 0.047068 and
# 0.80483 -- presumably interval widths from an earlier run; they do not match
# the intervals in the saved output, since the data are re-drawn on every run.
for response in (y_orig, y_quiet, y_noisey):
    slope_fit = smf.ols(formula='y ~ x', data=pd.DataFrame({'x': x, 'y': response})).fit()
    print(slope_fit.conf_int(0.05).loc['x'])
0 -0.563428 1 -0.472417 Name: x, dtype: float64 0 -0.523437 1 -0.484602 Name: x, dtype: float64 0 -0.893729 1 -0.151275 Name: x, dtype: float64
> set.seed(1)
> x1=runif(100)
> x2=0.5*x1+rnorm(100)/10
> y=2+2*x1+0.3*x2+rnorm(100)
y = b0 + b1 * x1 + b2 * x2 + eps
b0 = 2, b1 = 2, b2 = 0.3
# Python equivalent of the R snippet (unseeded, so every run draws new data).
n_draws = 100
x1 = np.random.uniform(size=n_draws)
# x2 is half of x1 plus a small amount of Gaussian noise.
x2_noise = np.random.randn(n_draws) / 10
x2 = 0.5 * x1 + x2_noise
# Response: y = 2 + 2 * x1 + 0.3 * x2 + standard-normal noise.
y_noise = np.random.randn(n_draws)
y = 2 + 2 * x1 + 0.3 * x2 + y_noise
# Plot x2 against x1 with a fitted line, then compute their correlation matrix
# (the off-diagonal entry is the correlation between x1 and x2).
sns.regplot(x=x1, y=x2)
np.corrcoef(x1, x2)
array([[1. , 0.84348228], [0.84348228, 1. ]])
# Regress y on both collinear predictors; the summary is the cell's displayed value.
both_df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
smf.ols(formula='y ~ x1 + x2', data=both_df).fit().summary()
Dep. Variable: | y | R-squared: | 0.243 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.227 |
Method: | Least Squares | F-statistic: | 15.72 |
Date: | Wed, 05 Sep 2018 | Prob (F-statistic): | 1.20e-06 |
Time: | 14:33:28 | Log-Likelihood: | -144.38 |
No. Observations: | 101 | AIC: | 294.8 |
Df Residuals: | 98 | BIC: | 302.6 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.2686 | 0.196 | 11.558 | 0.000 | 1.879 | 2.658 |
x1 | 0.3340 | 0.536 | 0.624 | 0.534 | -0.729 | 1.397 |
x2 | 2.6116 | 0.854 | 3.060 | 0.003 | 0.918 | 4.305 |
Omnibus: | 2.304 | Durbin-Watson: | 1.917 |
---|---|---|---|
Prob(Omnibus): | 0.316 | Jarque-Bera (JB): | 2.318 |
Skew: | -0.348 | Prob(JB): | 0.314 |
Kurtosis: | 2.742 | Cond. No. | 10.9 |
# Regress y on x1 alone; the summary is the cell's displayed value.
x1_df = pd.DataFrame({'x1': x1, 'y': y})
smf.ols(formula='y ~ x1', data=x1_df).fit().summary()
Dep. Variable: | y | R-squared: | 0.214 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.206 |
Method: | Least Squares | F-statistic: | 26.75 |
Date: | Wed, 05 Sep 2018 | Prob (F-statistic): | 1.23e-06 |
Time: | 14:29:29 | Log-Likelihood: | -142.13 |
No. Observations: | 100 | AIC: | 288.3 |
Df Residuals: | 98 | BIC: | 293.5 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.2382 | 0.195 | 11.504 | 0.000 | 1.852 | 2.624 |
x1 | 1.7518 | 0.339 | 5.172 | 0.000 | 1.080 | 2.424 |
Omnibus: | 2.659 | Durbin-Watson: | 2.061 |
---|---|---|---|
Prob(Omnibus): | 0.265 | Jarque-Bera (JB): | 2.323 |
Skew: | -0.268 | Prob(JB): | 0.313 |
Kurtosis: | 2.479 | Cond. No. | 4.21 |
# Regress y on x2 alone; the summary is the cell's displayed value.
x2_df = pd.DataFrame({'x2': x2, 'y': y})
smf.ols(formula='y ~ x2', data=x2_df).fit().summary()
Dep. Variable: | y | R-squared: | 0.204 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.196 |
Method: | Least Squares | F-statistic: | 25.11 |
Date: | Wed, 05 Sep 2018 | Prob (F-statistic): | 2.40e-06 |
Time: | 14:29:51 | Log-Likelihood: | -142.79 |
No. Observations: | 100 | AIC: | 289.6 |
Df Residuals: | 98 | BIC: | 294.8 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.3630 | 0.179 | 13.236 | 0.000 | 2.009 | 2.717 |
x2 | 2.8161 | 0.562 | 5.011 | 0.000 | 1.701 | 3.931 |
Omnibus: | 1.861 | Durbin-Watson: | 1.992 |
---|---|---|---|
Prob(Omnibus): | 0.394 | Jarque-Bera (JB): | 1.842 |
Skew: | -0.319 | Prob(JB): | 0.398 |
Kurtosis: | 2.811 | Cond. No. | 5.90 |
> x1=c(x1, 0.1)
> x2=c(x2, 0.8)
> y=c(y,6)
y ~ x1 + x2
y ~ x1
y ~ x2
# Python equivalent of the R snippet: append one extra observation
# (x1 = 0.1, x2 = 0.8, y = 6) to the end of each vector.
# NOTE: this cell is not idempotent -- every re-run appends the point again
# (the saved summaries report 102 observations rather than 101).
x1, x2, y = (
    np.append(vector, new_value)
    for vector, new_value in ((x1, 0.1), (x2, 0.8), (y, 6))
)
# model 1: y on both predictors, refit on the augmented data.
augmented_df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
est = smf.ols(formula='y ~ x1 + x2', data=augmented_df).fit()
print(est.summary())
# Influence plot with point sizes driven by Cook's distance.
fig, ax = plt.subplots(figsize=(15, 15))
fig = sm.graphics.influence_plot(est, ax=ax, criterion="cooks")
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.273 Model: OLS Adj. R-squared: 0.258 Method: Least Squares F-statistic: 18.58 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.41e-07 Time: 14:34:28 Log-Likelihood: -146.22 No. Observations: 102 AIC: 298.4 Df Residuals: 99 BIC: 306.3 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 2.2865 0.197 11.632 0.000 1.896 2.677 x1 0.0040 0.477 0.008 0.993 -0.942 0.950 x2 3.1908 0.737 4.328 0.000 1.728 4.654 ============================================================================== Omnibus: 2.740 Durbin-Watson: 1.910 Prob(Omnibus): 0.254 Jarque-Bera (JB): 2.705 Skew: -0.387 Prob(JB): 0.259 Kurtosis: 2.805 Cond. No. 9.34 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# model 2: y on x1 only, refit on the augmented data.
augmented_df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
est = smf.ols(formula='y ~ x1', data=augmented_df).fit()
print(est.summary())
# Influence plot with point sizes driven by Cook's distance.
fig, ax = plt.subplots(figsize=(15, 15))
fig = sm.graphics.influence_plot(est, ax=ax, criterion="cooks")
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.135 Model: OLS Adj. R-squared: 0.127 Method: Least Squares F-statistic: 15.65 Date: Wed, 05 Sep 2018 Prob (F-statistic): 0.000143 Time: 14:38:43 Log-Likelihood: -155.06 No. Observations: 102 AIC: 314.1 Df Residuals: 100 BIC: 319.4 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 2.4521 0.209 11.721 0.000 2.037 2.867 x1 1.4545 0.368 3.955 0.000 0.725 2.184 ============================================================================== Omnibus: 2.939 Durbin-Watson: 1.723 Prob(Omnibus): 0.230 Jarque-Bera (JB): 2.333 Skew: 0.269 Prob(JB): 0.311 Kurtosis: 3.510 Cond. No. 4.16 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# model 3: y on x2 only, refit on the augmented data.
# The frame now actually contains x2. Previously it held x1 instead, and the
# formula only resolved x2 by falling back to the enclosing namespace.
# Passing all three columns matches models 1 and 2.
augmented_df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
est = smf.ols(formula='y ~ x2', data=augmented_df).fit()
print(est.summary())
# Influence plot with point sizes driven by Cook's distance.
fig, ax = plt.subplots(figsize=(15, 15))
fig = sm.graphics.influence_plot(est, ax=ax, criterion="cooks")
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.273 Model: OLS Adj. R-squared: 0.266 Method: Least Squares F-statistic: 37.53 Date: Wed, 05 Sep 2018 Prob (F-statistic): 1.79e-08 Time: 14:51:39 Log-Likelihood: -146.22 No. Observations: 102 AIC: 296.4 Df Residuals: 100 BIC: 301.7 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 2.2872 0.174 13.132 0.000 1.942 2.633 x2 3.1952 0.522 6.126 0.000 2.160 4.230 ============================================================================== Omnibus: 2.739 Durbin-Watson: 1.911 Prob(Omnibus): 0.254 Jarque-Bera (JB): 2.703 Skew: -0.387 Prob(JB): 0.259 Kurtosis: 2.806 Cond. No. 5.53 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# NOTE(review): datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2. This cell needs an older scikit-learn (or another source for
# the Boston housing data) to run.
boston = datasets.load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
# Response is CRIM; every other column is treated as a candidate predictor.
dep = 'CRIM'
indi = boston_df.columns.drop(dep)
# Fit one simple linear regression of the response on each predictor in turn.
for v in indi:
    print(smf.ols(formula='{} ~ {}'.format(dep, v), data=boston_df).fit().summary())
# Slope row copied from each univariate summary
# (coef, std err, t, P>|t|, [0.025, 0.975]):
# ZN -0.0735 0.016 -4.570 0.000 -0.105 -0.042
# INDUS 0.5068 0.051 9.929 0.000 0.407 0.607
# CHAS -1.8715 1.505 -1.243 0.214 -4.829 1.086
# NOX 30.9753 3.003 10.315 0.000 25.076 36.875
# RM -2.6910 0.532 -5.062 0.000 -3.736 -1.646
# AGE 0.1071 0.013 8.409 0.000 0.082 0.132
# DIS -1.5428 0.168 -9.163 0.000 -1.874 -1.212
# RAD 0.6141 0.034 17.835 0.000 0.546 0.682
# TAX 0.0296 0.002 15.966 0.000 0.026 0.033
# PTRATIO 1.1446 0.169 6.758 0.000 0.812 1.477
# B -0.0355 0.004 -9.148 0.000 -0.043 -0.028
# LSTAT 0.5444 0.048 11.383 0.000 0.450 0.638
OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.040 Model: OLS Adj. R-squared: 0.038 Method: Least Squares F-statistic: 20.88 Date: Sat, 08 Sep 2018 Prob (F-statistic): 6.15e-06 Time: 10:35:06 Log-Likelihood: -1795.8 No. Observations: 506 AIC: 3596. Df Residuals: 504 BIC: 3604. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 4.4292 0.417 10.620 0.000 3.610 5.249 ZN -0.0735 0.016 -4.570 0.000 -0.105 -0.042 ============================================================================== Omnibus: 568.366 Durbin-Watson: 0.862 Prob(Omnibus): 0.000 Jarque-Bera (JB): 32952.356 Skew: 5.270 Prob(JB): 0.00 Kurtosis: 41.103 Cond. No. 28.8 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.164 Model: OLS Adj. R-squared: 0.162 Method: Least Squares F-statistic: 98.58 Date: Sat, 08 Sep 2018 Prob (F-statistic): 2.44e-21 Time: 10:35:06 Log-Likelihood: -1760.9 No. Observations: 506 AIC: 3526. Df Residuals: 504 BIC: 3534. 
Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -2.0509 0.668 -3.072 0.002 -3.362 -0.739 INDUS 0.5068 0.051 9.929 0.000 0.407 0.607 ============================================================================== Omnibus: 585.528 Durbin-Watson: 0.990 Prob(Omnibus): 0.000 Jarque-Bera (JB): 41469.710 Skew: 5.456 Prob(JB): 0.00 Kurtosis: 45.987 Cond. No. 25.1 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.003 Model: OLS Adj. R-squared: 0.001 Method: Least Squares F-statistic: 1.546 Date: Sat, 08 Sep 2018 Prob (F-statistic): 0.214 Time: 10:35:06 Log-Likelihood: -1805.3 No. Observations: 506 AIC: 3615. Df Residuals: 504 BIC: 3623. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 3.7232 0.396 9.404 0.000 2.945 4.501 CHAS -1.8715 1.505 -1.243 0.214 -4.829 1.086 ============================================================================== Omnibus: 562.698 Durbin-Watson: 0.822 Prob(Omnibus): 0.000 Jarque-Bera (JB): 30864.755 Skew: 5.205 Prob(JB): 0.00 Kurtosis: 39.818 Cond. No. 3.96 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.174 Model: OLS Adj. 
R-squared: 0.173 Method: Least Squares F-statistic: 106.4 Date: Sat, 08 Sep 2018 Prob (F-statistic): 9.16e-23 Time: 10:35:06 Log-Likelihood: -1757.6 No. Observations: 506 AIC: 3519. Df Residuals: 504 BIC: 3528. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -13.5881 1.702 -7.986 0.000 -16.931 -10.245 NOX 30.9753 3.003 10.315 0.000 25.076 36.875 ============================================================================== Omnibus: 591.496 Durbin-Watson: 0.994 Prob(Omnibus): 0.000 Jarque-Bera (JB): 42994.381 Skew: 5.544 Prob(JB): 0.00 Kurtosis: 46.776 Cond. No. 11.3 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.048 Model: OLS Adj. R-squared: 0.046 Method: Least Squares F-statistic: 25.62 Date: Sat, 08 Sep 2018 Prob (F-statistic): 5.84e-07 Time: 10:35:06 Log-Likelihood: -1793.5 No. Observations: 506 AIC: 3591. Df Residuals: 504 BIC: 3600. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 20.5060 3.362 6.099 0.000 13.901 27.111 RM -2.6910 0.532 -5.062 0.000 -3.736 -1.646 ============================================================================== Omnibus: 576.890 Durbin-Watson: 0.883 Prob(Omnibus): 0.000 Jarque-Bera (JB): 36966.825 Skew: 5.361 Prob(JB): 0.00 Kurtosis: 43.477 Cond. No. 
58.4 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.123 Model: OLS Adj. R-squared: 0.121 Method: Least Squares F-statistic: 70.72 Date: Sat, 08 Sep 2018 Prob (F-statistic): 4.26e-16 Time: 10:35:06 Log-Likelihood: -1772.9 No. Observations: 506 AIC: 3550. Df Residuals: 504 BIC: 3558. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -3.7527 0.944 -3.974 0.000 -5.608 -1.898 AGE 0.1071 0.013 8.409 0.000 0.082 0.132 ============================================================================== Omnibus: 575.090 Durbin-Watson: 0.960 Prob(Omnibus): 0.000 Jarque-Bera (JB): 36851.412 Skew: 5.331 Prob(JB): 0.00 Kurtosis: 43.426 Cond. No. 195. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.143 Model: OLS Adj. R-squared: 0.141 Method: Least Squares F-statistic: 83.97 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.27e-18 Time: 10:35:06 Log-Likelihood: -1767.1 No. Observations: 506 AIC: 3538. Df Residuals: 504 BIC: 3547. 
Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 9.4489 0.731 12.934 0.000 8.014 10.884 DIS -1.5428 0.168 -9.163 0.000 -1.874 -1.212 ============================================================================== Omnibus: 577.090 Durbin-Watson: 0.957 Prob(Omnibus): 0.000 Jarque-Bera (JB): 37542.100 Skew: 5.357 Prob(JB): 0.00 Kurtosis: 43.815 Cond. No. 9.32 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.387 Model: OLS Adj. R-squared: 0.386 Method: Least Squares F-statistic: 318.1 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.62e-55 Time: 10:35:06 Log-Likelihood: -1682.3 No. Observations: 506 AIC: 3369. Df Residuals: 504 BIC: 3377. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -2.2709 0.445 -5.105 0.000 -3.145 -1.397 RAD 0.6141 0.034 17.835 0.000 0.546 0.682 ============================================================================== Omnibus: 654.232 Durbin-Watson: 1.336 Prob(Omnibus): 0.000 Jarque-Bera (JB): 74327.568 Skew: 6.441 Prob(JB): 0.00 Kurtosis: 60.961 Cond. No. 19.2 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.336 Model: OLS Adj. 
R-squared: 0.335 Method: Least Squares F-statistic: 254.9 Date: Sat, 08 Sep 2018 Prob (F-statistic): 9.76e-47 Time: 10:35:06 Log-Likelihood: -1702.5 No. Observations: 506 AIC: 3409. Df Residuals: 504 BIC: 3418. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -8.4748 0.818 -10.365 0.000 -10.081 -6.868 TAX 0.0296 0.002 15.966 0.000 0.026 0.033 ============================================================================== Omnibus: 634.003 Durbin-Watson: 1.252 Prob(Omnibus): 0.000 Jarque-Bera (JB): 63141.063 Skew: 6.134 Prob(JB): 0.00 Kurtosis: 56.332 Cond. No. 1.16e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.16e+03. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.083 Model: OLS Adj. R-squared: 0.081 Method: Least Squares F-statistic: 45.67 Date: Sat, 08 Sep 2018 Prob (F-statistic): 3.88e-11 Time: 10:35:06 Log-Likelihood: -1784.1 No. Observations: 506 AIC: 3572. Df Residuals: 504 BIC: 3581. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -17.5307 3.147 -5.570 0.000 -23.714 -11.347 PTRATIO 1.1446 0.169 6.758 0.000 0.812 1.477 ============================================================================== Omnibus: 568.808 Durbin-Watson: 0.909 Prob(Omnibus): 0.000 Jarque-Bera (JB): 34373.378 Skew: 5.256 Prob(JB): 0.00 Kurtosis: 41.985 Cond. No. 
160. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.142 Model: OLS Adj. R-squared: 0.141 Method: Least Squares F-statistic: 83.69 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.43e-18 Time: 10:35:06 Log-Likelihood: -1767.2 No. Observations: 506 AIC: 3538. Df Residuals: 504 BIC: 3547. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 16.2680 1.430 11.376 0.000 13.458 19.078 B -0.0355 0.004 -9.148 0.000 -0.043 -0.028 ============================================================================== Omnibus: 591.626 Durbin-Watson: 1.001 Prob(Omnibus): 0.000 Jarque-Bera (JB): 43282.465 Skew: 5.543 Prob(JB): 0.00 Kurtosis: 46.932 Cond. No. 1.49e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.49e+03. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.205 Model: OLS Adj. R-squared: 0.203 Method: Least Squares F-statistic: 129.6 Date: Sat, 08 Sep 2018 Prob (F-statistic): 7.12e-27 Time: 10:35:06 Log-Likelihood: -1748.2 No. Observations: 506 AIC: 3500. Df Residuals: 504 BIC: 3509. 
Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -3.2946 0.695 -4.742 0.000 -4.660 -1.930 LSTAT 0.5444 0.048 11.383 0.000 0.450 0.638 ============================================================================== Omnibus: 600.766 Durbin-Watson: 1.184 Prob(Omnibus): 0.000 Jarque-Bera (JB): 49637.173 Skew: 5.638 Prob(JB): 0.00 Kurtosis: 50.193 Cond. No. 29.7 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# One panel per predictor: CRIM on the y-axis against each column in `indi`,
# with a regression fit drawn in every panel.
g = sns.PairGrid(boston_df, y_vars=[dep], x_vars=indi)
g.map(sns.regplot)
<seaborn.axisgrid.PairGrid at 0x13378f278>
# Multiple regression of CRIM on all remaining columns at once. sm.OLS does not
# add an intercept on its own, hence the explicit constant column.
design_matrix = sm.add_constant(boston_df[indi])
sm.OLS(boston_df[dep], design_matrix).fit().summary()
Dep. Variable: | CRIM | R-squared: | 0.436 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.422 |
Method: | Least Squares | F-statistic: | 31.77 |
Date: | Sat, 08 Sep 2018 | Prob (F-statistic): | 6.16e-54 |
Time: | 11:07:28 | Log-Likelihood: | -1661.2 |
No. Observations: | 506 | AIC: | 3348. |
Df Residuals: | 493 | BIC: | 3403. |
Df Model: | 12 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 10.3701 | 7.012 | 1.479 | 0.140 | -3.408 | 24.148 |
ZN | 0.0365 | 0.019 | 1.936 | 0.053 | -0.001 | 0.073 |
INDUS | -0.0672 | 0.085 | -0.794 | 0.428 | -0.233 | 0.099 |
CHAS | -1.3049 | 1.185 | -1.101 | 0.271 | -3.633 | 1.023 |
NOX | -7.2552 | 5.250 | -1.382 | 0.168 | -17.570 | 3.060 |
RM | -0.3851 | 0.575 | -0.670 | 0.503 | -1.515 | 0.745 |
AGE | 0.0019 | 0.018 | 0.105 | 0.917 | -0.034 | 0.038 |
DIS | -0.7163 | 0.273 | -2.626 | 0.009 | -1.252 | -0.180 |
RAD | 0.5395 | 0.088 | 6.128 | 0.000 | 0.366 | 0.712 |
TAX | -0.0013 | 0.005 | -0.254 | 0.799 | -0.011 | 0.009 |
PTRATIO | -0.0907 | 0.180 | -0.504 | 0.615 | -0.445 | 0.263 |
B | -0.0089 | 0.004 | -2.428 | 0.016 | -0.016 | -0.002 |
LSTAT | 0.2309 | 0.069 | 3.346 | 0.001 | 0.095 | 0.366 |
Omnibus: | 680.813 | Durbin-Watson: | 1.507 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 94712.935 |
Skew: | 6.846 | Prob(JB): | 0.00 |
Kurtosis: | 68.611 | Cond. No. | 1.51e+04 |
In (a), 11 of the predictors (all except CHAS) had statistically significant coefficients. In (b), only 4 of the predictors had statistically significant coefficients: DIS, RAD, B and LSTAT.
The NOX coefficient is dramatically different; it is large and positive in the univariate regression, large and negative in the multivariate regression.
# Univariate slope for each predictor, one simple regression per entry of
# `indi`. These are refit here instead of being transcribed by hand from the
# printed summaries, so they cannot go stale or fall out of step with the
# order of `indi`.
# NOTE(review): assumes `dep` and every entry of `indi` are plain column names
# that are valid inside a patsy formula -- confirm.
univar_coefs = [
    smf.ols(formula='{0} ~ {1}'.format(dep, v), data=boston_df).fit().params[v]
    for v in indi
]
# Slopes from the single regression on all predictors; the intercept is
# dropped so both collections hold exactly one entry per predictor.
multivar_coefs = sm.OLS(boston_df[dep], sm.add_constant(boston_df[indi])).fit().params.drop('const')
# `multivar_coefs` is indexed by predictor name in the column order of
# boston_df[indi], so the list above lines up with it row for row.
plot_dat = pd.DataFrame({'univar_coefs': univar_coefs, 'multivar_coefs': multivar_coefs})
# Each point is one predictor: x = univariate slope, y = multivariate slope.
sns.scatterplot(x='univar_coefs', y='multivar_coefs', data=plot_dat, hue=multivar_coefs.index)
<matplotlib.axes._subplots.AxesSubplot at 0x127935748>
$Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \varepsilon$.
# Fit a separate cubic model CRIM ~ v + v^2 + v^3 for every predictor in
# `indi` and print each fitted model's full summary.
for v in indi:
    print(smf.ols(formula='CRIM ~ {0} + I({0} ** 2) + I({0} ** 3) '.format(v), data=boston_df).fit().summary())
# Coefficient rows recorded from the printed summaries, for quick reference.
# Columns: term, coef, std err, t, P>|t|, [0.025, 0.975]
# NOTE: the CHAS rows are not meaningful. statsmodels reports a singular
# design matrix for that fit (Df Model is 2, not 3), i.e. the polynomial terms
# are collinear -- presumably because CHAS is a 0/1 indicator; confirm.
# ZN -0.3303 0.110 -3.008 0.003 -0.546 -0.115
# I(ZN ** 2) 0.0064 0.004 1.670 0.096 -0.001 0.014
# I(ZN ** 3) -3.753e-05 3.14e-05 -1.196 0.232 -9.92e-05 2.41e-05
# INDUS -1.9533 0.483 -4.047 0.000 -2.901 -1.005
# I(INDUS ** 2) 0.2504 0.039 6.361 0.000 0.173 0.328
# I(INDUS ** 3) -0.0069 0.001 -7.239 0.000 -0.009 -0.005
# CHAS 1.116e+14 2.7e+14 0.413 0.680 -4.2e+14 6.43e+14
# I(CHAS ** 2) -5.582e+13 1.35e+14 -0.413 0.680 -3.21e+14 2.1e+14
# I(CHAS ** 3) -5.582e+13 1.35e+14 -0.413 0.680 -3.21e+14 2.1e+14
# NOX -1264.1021 170.860 -7.398 0.000 -1599.791 -928.414
# I(NOX ** 2) 2223.2265 280.659 7.921 0.000 1671.816 2774.637
# I(NOX ** 3) -1232.3894 149.687 -8.233 0.000 -1526.479 -938.300
# RM -38.7040 31.284 -1.237 0.217 -100.167 22.759
# I(RM ** 2) 4.4655 5.005 0.892 0.373 -5.369 14.300
# I(RM ** 3) -0.1694 0.264 -0.643 0.521 -0.687 0.348
# AGE 0.2743 0.186 1.471 0.142 -0.092 0.641
# I(AGE ** 2) -0.0072 0.004 -1.987 0.047 -0.014 -8.25e-05
# I(AGE ** 3) 5.737e-05 2.11e-05 2.719 0.007 1.59e-05 9.88e-05
# DIS -15.5172 1.737 -8.931 0.000 -18.931 -12.104
# I(DIS ** 2) 2.4479 0.347 7.061 0.000 1.767 3.129
# I(DIS ** 3) -0.1185 0.020 -5.802 0.000 -0.159 -0.078
# RAD 0.5122 1.047 0.489 0.625 -1.545 2.569
# I(RAD ** 2) -0.0750 0.149 -0.504 0.615 -0.368 0.218
# I(RAD ** 3) 0.0032 0.005 0.699 0.485 -0.006 0.012
# TAX -0.1524 0.096 -1.589 0.113 -0.341 0.036
# I(TAX ** 2) 0.0004 0.000 1.476 0.141 -0.000 0.001
# I(TAX ** 3) -2.193e-07 1.89e-07 -1.158 0.247 -5.91e-07 1.53e-07
# PTRATIO -81.8089 27.649 -2.959 0.003 -136.131 -27.487
# I(PTRATIO ** 2) 4.6039 1.609 2.862 0.004 1.444 7.764
# I(PTRATIO ** 3) -0.0842 0.031 -2.724 0.007 -0.145 -0.023
# B -0.0845 0.056 -1.497 0.135 -0.196 0.026
# I(B ** 2) 0.0002 0.000 0.760 0.447 -0.000 0.001
# I(B ** 3) -2.895e-07 4.38e-07 -0.661 0.509 -1.15e-06 5.7e-07
# LSTAT -0.4133 0.466 -0.887 0.375 -1.328 0.502
# I(LSTAT ** 2) 0.0530 0.030 1.758 0.079 -0.006 0.112
# I(LSTAT ** 3) -0.0008 0.001 -1.423 0.155 -0.002 0.000
OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.058 Model: OLS Adj. R-squared: 0.052 Method: Least Squares F-statistic: 10.24 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.49e-06 Time: 11:42:03 Log-Likelihood: -1791.1 No. Observations: 506 AIC: 3590. Df Residuals: 502 BIC: 3607. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 4.8193 0.433 11.133 0.000 3.969 5.670 ZN -0.3303 0.110 -3.008 0.003 -0.546 -0.115 I(ZN ** 2) 0.0064 0.004 1.670 0.096 -0.001 0.014 I(ZN ** 3) -3.753e-05 3.14e-05 -1.196 0.232 -9.92e-05 2.41e-05 ============================================================================== Omnibus: 570.003 Durbin-Watson: 0.879 Prob(Omnibus): 0.000 Jarque-Bera (JB): 33886.468 Skew: 5.285 Prob(JB): 0.00 Kurtosis: 41.672 Cond. No. 1.89e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.89e+05. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.257 Model: OLS Adj. R-squared: 0.252 Method: Least Squares F-statistic: 57.86 Date: Sat, 08 Sep 2018 Prob (F-statistic): 3.88e-32 Time: 11:42:03 Log-Likelihood: -1731.0 No. Observations: 506 AIC: 3470. Df Residuals: 502 BIC: 3487. 
Df Model: 3 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------- Intercept 3.6410 1.576 2.310 0.021 0.545 6.737 INDUS -1.9533 0.483 -4.047 0.000 -2.901 -1.005 I(INDUS ** 2) 0.2504 0.039 6.361 0.000 0.173 0.328 I(INDUS ** 3) -0.0069 0.001 -7.239 0.000 -0.009 -0.005 ============================================================================== Omnibus: 611.416 Durbin-Watson: 1.118 Prob(Omnibus): 0.000 Jarque-Bera (JB): 51547.097 Skew: 5.815 Prob(JB): 0.00 Kurtosis: 51.059 Cond. No. 2.47e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.47e+04. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.003 Model: OLS Adj. R-squared: -0.001 Method: Least Squares F-statistic: 0.7713 Date: Sat, 08 Sep 2018 Prob (F-statistic): 0.463 Time: 11:42:03 Log-Likelihood: -1805.3 No. Observations: 506 AIC: 3617. Df Residuals: 503 BIC: 3629. 
Df Model: 2 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept 3.7232 0.396 9.395 0.000 2.945 4.502 CHAS 1.116e+14 2.7e+14 0.413 0.680 -4.2e+14 6.43e+14 I(CHAS ** 2) -5.582e+13 1.35e+14 -0.413 0.680 -3.21e+14 2.1e+14 I(CHAS ** 3) -5.582e+13 1.35e+14 -0.413 0.680 -3.21e+14 2.1e+14 ============================================================================== Omnibus: 562.698 Durbin-Watson: 0.822 Prob(Omnibus): 0.000 Jarque-Bera (JB): 30864.725 Skew: 5.205 Prob(JB): 0.00 Kurtosis: 39.818 Cond. No. 6.48e+29 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 1.23e-57. This might indicate that there are strong multicollinearity problems or that the design matrix is singular. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.292 Model: OLS Adj. R-squared: 0.288 Method: Least Squares F-statistic: 69.14 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.94e-37 Time: 11:42:03 Log-Likelihood: -1718.6 No. Observations: 506 AIC: 3445. Df Residuals: 502 BIC: 3462. 
Df Model: 3 Covariance Type: nonrobust =============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- Intercept 230.1421 33.734 6.822 0.000 163.864 296.420 NOX -1264.1021 170.860 -7.398 0.000 -1599.791 -928.414 I(NOX ** 2) 2223.2265 280.659 7.921 0.000 1671.816 2774.637 I(NOX ** 3) -1232.3894 149.687 -8.233 0.000 -1526.479 -938.300 ============================================================================== Omnibus: 612.604 Durbin-Watson: 1.159 Prob(Omnibus): 0.000 Jarque-Bera (JB): 52872.508 Skew: 5.824 Prob(JB): 0.00 Kurtosis: 51.705 Cond. No. 1.36e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.36e+03. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.068 Model: OLS Adj. R-squared: 0.063 Method: Least Squares F-statistic: 12.29 Date: Sat, 08 Sep 2018 Prob (F-statistic): 9.06e-08 Time: 11:42:03 Log-Likelihood: -1788.2 No. Observations: 506 AIC: 3584. Df Residuals: 502 BIC: 3601. 
Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 111.9002 64.460 1.736 0.083 -14.744 238.545 RM -38.7040 31.284 -1.237 0.217 -100.167 22.759 I(RM ** 2) 4.4655 5.005 0.892 0.373 -5.369 14.300 I(RM ** 3) -0.1694 0.264 -0.643 0.521 -0.687 0.348 ============================================================================== Omnibus: 586.445 Durbin-Watson: 0.919 Prob(Omnibus): 0.000 Jarque-Bera (JB): 40548.719 Skew: 5.484 Prob(JB): 0.00 Kurtosis: 45.461 Cond. No. 5.36e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.36e+04. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.172 Model: OLS Adj. R-squared: 0.167 Method: Least Squares F-statistic: 34.86 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.76e-20 Time: 11:42:03 Log-Likelihood: -1758.2 No. Observations: 506 AIC: 3524. Df Residuals: 502 BIC: 3541. 
Df Model: 3 Covariance Type: nonrobust =============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- Intercept -2.5592 2.771 -0.924 0.356 -8.003 2.884 AGE 0.2743 0.186 1.471 0.142 -0.092 0.641 I(AGE ** 2) -0.0072 0.004 -1.987 0.047 -0.014 -8.25e-05 I(AGE ** 3) 5.737e-05 2.11e-05 2.719 0.007 1.59e-05 9.88e-05 ============================================================================== Omnibus: 577.859 Durbin-Watson: 1.027 Prob(Omnibus): 0.000 Jarque-Bera (JB): 39629.126 Skew: 5.342 Prob(JB): 0.00 Kurtosis: 45.018 Cond. No. 4.74e+06 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 4.74e+06. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.276 Model: OLS Adj. R-squared: 0.272 Method: Least Squares F-statistic: 63.74 Date: Sat, 08 Sep 2018 Prob (F-statistic): 6.20e-35 Time: 11:42:03 Log-Likelihood: -1724.4 No. Observations: 506 AIC: 3457. Df Residuals: 502 BIC: 3474. 
Df Model: 3 Covariance Type: nonrobust =============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- Intercept 29.9496 2.448 12.235 0.000 25.140 34.759 DIS -15.5172 1.737 -8.931 0.000 -18.931 -12.104 I(DIS ** 2) 2.4479 0.347 7.061 0.000 1.767 3.129 I(DIS ** 3) -0.1185 0.020 -5.802 0.000 -0.159 -0.078 ============================================================================== Omnibus: 577.986 Durbin-Watson: 1.133 Prob(Omnibus): 0.000 Jarque-Bera (JB): 42441.952 Skew: 5.310 Prob(JB): 0.00 Kurtosis: 46.592 Cond. No. 2.10e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.1e+03. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.396 Model: OLS Adj. R-squared: 0.392 Method: Least Squares F-statistic: 109.5 Date: Sat, 08 Sep 2018 Prob (F-statistic): 1.47e-54 Time: 11:42:03 Log-Likelihood: -1678.7 No. Observations: 506 AIC: 3365. Df Residuals: 502 BIC: 3382. Df Model: 3 Covariance Type: nonrobust =============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- Intercept -0.6050 2.057 -0.294 0.769 -4.645 3.435 RAD 0.5122 1.047 0.489 0.625 -1.545 2.569 I(RAD ** 2) -0.0750 0.149 -0.504 0.615 -0.368 0.218 I(RAD ** 3) 0.0032 0.005 0.699 0.485 -0.006 0.012 ============================================================================== Omnibus: 657.375 Durbin-Watson: 1.349 Prob(Omnibus): 0.000 Jarque-Bera (JB): 76643.757 Skew: 6.487 Prob(JB): 0.00 Kurtosis: 61.881 Cond. No. 
5.43e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.43e+04. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.365 Model: OLS Adj. R-squared: 0.361 Method: Least Squares F-statistic: 96.10 Date: Sat, 08 Sep 2018 Prob (F-statistic): 3.69e-49 Time: 11:42:03 Log-Likelihood: -1691.3 No. Observations: 506 AIC: 3391. Df Residuals: 502 BIC: 3407. Df Model: 3 Covariance Type: nonrobust =============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------- Intercept 19.0705 11.827 1.612 0.107 -4.166 42.307 TAX -0.1524 0.096 -1.589 0.113 -0.341 0.036 I(TAX ** 2) 0.0004 0.000 1.476 0.141 -0.000 0.001 I(TAX ** 3) -2.193e-07 1.89e-07 -1.158 0.247 -5.91e-07 1.53e-07 ============================================================================== Omnibus: 642.369 Durbin-Watson: 1.292 Prob(Omnibus): 0.000 Jarque-Bera (JB): 68905.900 Skew: 6.249 Prob(JB): 0.00 Kurtosis: 58.786 Cond. No. 6.16e+09 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 6.16e+09. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.112 Model: OLS Adj. R-squared: 0.107 Method: Least Squares F-statistic: 21.21 Date: Sat, 08 Sep 2018 Prob (F-statistic): 5.99e-13 Time: 11:42:03 Log-Likelihood: -1775.9 No. 
Observations: 506 AIC: 3560. Df Residuals: 502 BIC: 3577. Df Model: 3 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- Intercept 474.0255 156.823 3.023 0.003 165.915 782.135 PTRATIO -81.8089 27.649 -2.959 0.003 -136.131 -27.487 I(PTRATIO ** 2) 4.6039 1.609 2.862 0.004 1.444 7.764 I(PTRATIO ** 3) -0.0842 0.031 -2.724 0.007 -0.145 -0.023 ============================================================================== Omnibus: 572.978 Durbin-Watson: 0.949 Prob(Omnibus): 0.000 Jarque-Bera (JB): 36189.609 Skew: 5.303 Prob(JB): 0.00 Kurtosis: 43.050 Cond. No. 3.02e+06 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 3.02e+06. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.144 Model: OLS Adj. R-squared: 0.139 Method: Least Squares F-statistic: 28.14 Date: Sat, 08 Sep 2018 Prob (F-statistic): 7.83e-17 Time: 11:42:03 Log-Likelihood: -1766.8 No. Observations: 506 AIC: 3542. Df Residuals: 502 BIC: 3558. 
Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 17.9898 2.312 7.782 0.000 13.448 22.531 B -0.0845 0.056 -1.497 0.135 -0.196 0.026 I(B ** 2) 0.0002 0.000 0.760 0.447 -0.000 0.001 I(B ** 3) -2.895e-07 4.38e-07 -0.661 0.509 -1.15e-06 5.7e-07 ============================================================================== Omnibus: 589.534 Durbin-Watson: 0.990 Prob(Omnibus): 0.000 Jarque-Bera (JB): 42752.655 Skew: 5.512 Prob(JB): 0.00 Kurtosis: 46.661 Cond. No. 3.59e+08 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 3.59e+08. This might indicate that there are strong multicollinearity or other numerical problems. OLS Regression Results ============================================================================== Dep. Variable: CRIM R-squared: 0.214 Model: OLS Adj. R-squared: 0.210 Method: Least Squares F-statistic: 45.67 Date: Sat, 08 Sep 2018 Prob (F-statistic): 4.13e-26 Time: 11:42:03 Log-Likelihood: -1745.0 No. Observations: 506 AIC: 3498. Df Residuals: 502 BIC: 3515. Df Model: 3 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------- Intercept 1.0836 2.032 0.533 0.594 -2.909 5.076 LSTAT -0.4133 0.466 -0.887 0.375 -1.328 0.502 I(LSTAT ** 2) 0.0530 0.030 1.758 0.079 -0.006 0.112 I(LSTAT ** 3) -0.0008 0.001 -1.423 0.155 -0.002 0.000 ============================================================================== Omnibus: 607.032 Durbin-Watson: 1.239 Prob(Omnibus): 0.000 Jarque-Bera (JB): 53255.699 Skew: 5.717 Prob(JB): 0.00 Kurtosis: 51.941 Cond. 
No. 5.20e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.2e+04. This might indicate that there are strong multicollinearity or other numerical problems.
# Heatmap of the pairwise correlation matrix of every column in boston_df.
sns.heatmap(data=boston_df.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x127b30390>