import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from collections import OrderedDict
plt.style.use('seaborn') # pretty matplotlib plots
plt.rc('font', size = 14)
plt.rc('figure', titlesize = 18)
plt.rc('axes', labelsize = 15)
plt.rc('axes', titlesize = 18)
%matplotlib inline
df = pd.read_csv('../../../data/Boston.csv', header = 0)
df.head()
| crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
# list(df)
df.columns
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv'], dtype='object')
df.shape
(506, 14)
Using statsmodels
# predictor & dependent var
x_train = df['lstat']
y_true = df['medv']
# ols model with intercept added to predictor
lm = sm.OLS(y_true, sm.add_constant(x_train))
# fitted model and summary
lm_fit = lm.fit()
lm_fit.summary()
# heteroskedasticity-robust standard errors, if desired
# (OLS supports the HC0-HC3 sandwich estimators via cov_type):
# lm_fit = lm.fit(cov_type='HC2')
# lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.544 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.543 |
Method: | Least Squares | F-statistic: | 601.6 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 5.08e-88 |
Time: | 15:48:01 | Log-Likelihood: | -1641.5 |
No. Observations: | 506 | AIC: | 3287. |
Df Residuals: | 504 | BIC: | 3295. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
const | 34.5538 | 0.563 | 61.415 | 0.000 | 33.448 | 35.659 |
lstat | -0.9500 | 0.039 | -24.528 | 0.000 | -1.026 | -0.874 |
Omnibus: | 137.043 | Durbin-Watson: | 0.892 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 291.373 |
Skew: | 1.453 | Prob(JB): | 5.36e-64 |
Kurtosis: | 5.319 | Cond. No. | 29.7 |
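Since `medv` is the median home value in thousands of dollars and `lstat` is the percentage of lower-status households, the fitted slope says each one-point rise in `lstat` is associated with a drop of about $950 in median value; `lstat` alone explains about 54% of the variance in `medv`.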
Or, equivalently, using the formula interface:
lm = smf.ols('medv~lstat', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.544 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.543 |
Method: | Least Squares | F-statistic: | 601.6 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 5.08e-88 |
Time: | 15:48:01 | Log-Likelihood: | -1641.5 |
No. Observations: | 506 | AIC: | 3287. |
Df Residuals: | 504 | BIC: | 3295. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 34.5538 | 0.563 | 61.415 | 0.000 | 33.448 | 35.659 |
lstat | -0.9500 | 0.039 | -24.528 | 0.000 | -1.026 | -0.874 |
Omnibus: | 137.043 | Durbin-Watson: | 0.892 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 291.373 |
Skew: | 1.453 | Prob(JB): | 5.36e-64 |
Kurtosis: | 5.319 | Cond. No. | 29.7 |
dir(lm_fit)
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_get_robustcov_results', '_is_nested', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'diagn', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']
lm_fit.params
Intercept    34.553841
lstat        -0.950049
dtype: float64
lm_fit.conf_int()
| 0 | 1
---|---|---|
Intercept | 33.448457 | 35.659225 |
lstat | -1.026148 | -0.873951 |
x_test = pd.DataFrame({'lstat': [5, 10, 15]})
y_pred = lm_fit.get_prediction(x_test)
y_pred.summary_frame()
| mean | mean_se | mean_ci_lower | mean_ci_upper | obs_ci_lower | obs_ci_upper
---|---|---|---|---|---|---|
0 | 29.803594 | 0.405247 | 29.007412 | 30.599776 | 17.565675 | 42.041513 |
1 | 25.053347 | 0.294814 | 24.474132 | 25.632563 | 12.827626 | 37.279068 |
2 | 20.303101 | 0.290893 | 19.731588 | 20.874613 | 8.077742 | 32.528459 |
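Here `mean_ci_lower`/`mean_ci_upper` bound the mean response (the confidence interval), while `obs_ci_lower`/`obs_ci_upper` bound a single new observation (the much wider prediction interval). The `wls_prediction_std` helper imported at the top returns the prediction interval directly; a minimal cross-check at the training points, assuming the default alpha = 0.05:
# prediction-interval bounds for every training observation
pred_std, pred_ci_lower, pred_ci_upper = wls_prediction_std(lm_fit)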
x_test = pd.DataFrame({'lstat': [df.lstat.min(), df.lstat.max()]})
y_pred = lm_fit.predict(x_test)
df.plot(x = 'lstat', y = 'medv', kind = 'scatter')
plt.plot(x_test, y_pred, c = 'red', linewidth = 2)
plt.xlabel("lstat")
plt.ylabel("medv")
plt.show()
x_test = df['lstat']
y_pred = lm_fit.predict(x_test)
f, axes = plt.subplots(1, 2, sharex = False, sharey = False)
f.set_figheight(10)
f.set_figwidth(15)
sns.regplot('lstat', 'medv', data = df, ax = axes[0], scatter_kws = {'alpha': 0.5})
sns.residplot(y_pred, 'medv', data = df, ax = axes[1], scatter_kws = {'alpha': 0.5}, lowess = True) # residual plot
axes[1].set_xlabel('Fitted values')
axes[1].set_ylabel('Residuals')
plt.show()
For creating diagnostic plots, see *Emulating R regression plots in Python*.
# fitted values (the model was fit with an intercept)
lm_fitted_y = lm_fit.fittedvalues
# model residuals
lm_residuals = lm_fit.resid
# normalized residuals
lm_norm_residuals = lm_fit.get_influence().resid_studentized_internal
# square root of absolute normalized residuals
lm_norm_residuals_abs_sqrt = np.sqrt(np.abs(lm_norm_residuals))
# absolute residuals
lm_abs_resid = np.abs(lm_residuals)
# leverage, from statsmodels internals
lm_leverage = lm_fit.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
lm_cooks = lm_fit.get_influence().cooks_distance[0]
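For reference, the internally studentized residuals used here are $r_i = e_i / (\hat{\sigma} \sqrt{1 - h_{ii}})$, where $e_i$ is the raw residual and $h_{ii}$ is the leverage, i.e. the $i$-th diagonal element of the hat matrix.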
Residual plot
plot_lm_1 = plt.figure(1)
plot_lm_1.set_figheight(8)
plot_lm_1.set_figwidth(12)
plot_lm_1.axes[0] = sns.residplot(lm_fitted_y, 'medv', data = df,
lowess = True,
scatter_kws = {'alpha': 0.5},
line_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_title('Residuals vs Fitted')
plot_lm_1.axes[0].set_xlabel('Fitted values')
plot_lm_1.axes[0].set_ylabel('Residuals')
# annotations
abs_resid = lm_abs_resid.sort_values(ascending = False)
abs_resid_top_3 = abs_resid[:3]
for i in abs_resid_top_3.index:
plot_lm_1.axes[0].annotate(i, xy = (lm_fitted_y[i], lm_residuals[i]));
QQ plot
QQ = ProbPlot(lm_norm_residuals)
plot_lm_2 = QQ.qqplot(line = '45', alpha = 0.5, color = '#4C72B0', lw = 1)
plot_lm_2.set_figheight(8)
plot_lm_2.set_figwidth(12)
plot_lm_2.axes[0].set_title('Normal Q-Q')
plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
plot_lm_2.axes[0].set_ylabel('Standardized Residuals');
# annotations
abs_norm_resid = np.flip(np.argsort(np.abs(lm_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]
for r, i in enumerate(abs_norm_resid_top_3):
plot_lm_2.axes[0].annotate(i, xy=(np.flip(QQ.theoretical_quantiles, 0)[r], lm_norm_residuals[i]));
Scale-Location Plot
plot_lm_3 = plt.figure(3)
plot_lm_3.set_figheight(8)
plot_lm_3.set_figwidth(12)
plt.scatter(lm_fitted_y, lm_norm_residuals_abs_sqrt, alpha=0.5)
sns.regplot(lm_fitted_y, lm_norm_residuals_abs_sqrt,
scatter=False,
ci=False,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_3.axes[0].set_title('Scale-Location')
plot_lm_3.axes[0].set_xlabel('Fitted values')
plot_lm_3.axes[0].set_ylabel(r'$\sqrt{|Standardized Residuals|}$');
# annotations
abs_sq_norm_resid = np.flip(np.argsort(lm_norm_residuals_abs_sqrt), 0)
abs_sq_norm_resid_top_3 = abs_sq_norm_resid[:3]
for i in abs_sq_norm_resid_top_3:
    plot_lm_3.axes[0].annotate(i, xy=(lm_fitted_y[i], lm_norm_residuals_abs_sqrt[i]));
Leverage plot
plot_lm_4 = plt.figure(4)
plot_lm_4.set_figheight(8)
plot_lm_4.set_figwidth(12)
plt.scatter(lm_leverage, lm_norm_residuals, alpha=0.5)
sns.regplot(lm_leverage, lm_norm_residuals,
scatter=False,
ci=False,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_4.axes[0].set_xlim(0, 0.20)
plot_lm_4.axes[0].set_ylim(-3, 5)
plot_lm_4.axes[0].set_title('Residuals vs Leverage')
plot_lm_4.axes[0].set_xlabel('Leverage')
plot_lm_4.axes[0].set_ylabel('Standardized Residuals')
# annotations
leverage_top_3 = np.flip(np.argsort(lm_cooks), 0)[:3]
for i in leverage_top_3:
plot_lm_4.axes[0].annotate(i, xy=(lm_leverage[i], lm_norm_residuals[i]))
# shenanigans for cook's distance contours
def graph(formula, x_range, label=None):
x = x_range
y = formula(x)
plt.plot(x, y, label=label, lw=1, ls='--', color='red')
p = len(lm_fit.params) # number of model parameters
graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
np.linspace(0.001, 0.200, 50),
'Cook\'s distance') # 0.5 line
graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
np.linspace(0.001, 0.200, 50)) # 1 line
plt.legend(loc='upper right');
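The dashed contours come from Cook's distance for a model with $p$ parameters, $D = \frac{r^2}{p} \cdot \frac{h}{1-h}$: fixing $D$ and solving for the standardized residual gives $r = \sqrt{D\,p\,(1-h)/h}$, which is exactly the curve plotted above for $D = 0.5$ and $D = 1$.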
lm_fit = smf.ols('medv~lstat+age', data = df).fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.551 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.549 |
Method: | Least Squares | F-statistic: | 309.0 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 2.98e-88 |
Time: | 15:48:03 | Log-Likelihood: | -1637.5 |
No. Observations: | 506 | AIC: | 3281. |
Df Residuals: | 503 | BIC: | 3294. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 33.2228 | 0.731 | 45.458 | 0.000 | 31.787 | 34.659 |
lstat | -1.0321 | 0.048 | -21.416 | 0.000 | -1.127 | -0.937 |
age | 0.0345 | 0.012 | 2.826 | 0.005 | 0.011 | 0.059 |
Omnibus: | 124.288 | Durbin-Watson: | 0.945 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 244.026 |
Skew: | 1.362 | Prob(JB): | 1.02e-53 |
Kurtosis: | 5.038 | Cond. No. | 201. |
If we want to use all of the variables, we can use the following trick to construct the formula string. patsy offers no shorthand like R's `medv ~ .`, so in Python you usually have to build the variable list yourself.
def ols_formula(df, dependent_var, *excluded_cols):
'''
Generates the R style formula for statsmodels (patsy) given
the dataframe, dependent variable and optional excluded columns
as strings
'''
df_columns = list(df.columns.values)
df_columns.remove(dependent_var)
for col in excluded_cols:
df_columns.remove(col)
return dependent_var + ' ~ ' + ' + '.join(df_columns)
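For example, `ols_formula(df, 'medv')` returns `'medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat'`.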
lm_fit = smf.ols(formula = ols_formula(df, 'medv'), data = df).fit() # formula = 'medv ~ .'
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.741 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.734 |
Method: | Least Squares | F-statistic: | 108.1 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 6.72e-135 |
Time: | 15:48:03 | Log-Likelihood: | -1498.8 |
No. Observations: | 506 | AIC: | 3026. |
Df Residuals: | 492 | BIC: | 3085. |
Df Model: | 13 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 36.4595 | 5.103 | 7.144 | 0.000 | 26.432 | 46.487 |
crim | -0.1080 | 0.033 | -3.287 | 0.001 | -0.173 | -0.043 |
zn | 0.0464 | 0.014 | 3.382 | 0.001 | 0.019 | 0.073 |
indus | 0.0206 | 0.061 | 0.334 | 0.738 | -0.100 | 0.141 |
chas | 2.6867 | 0.862 | 3.118 | 0.002 | 0.994 | 4.380 |
nox | -17.7666 | 3.820 | -4.651 | 0.000 | -25.272 | -10.262 |
rm | 3.8099 | 0.418 | 9.116 | 0.000 | 2.989 | 4.631 |
age | 0.0007 | 0.013 | 0.052 | 0.958 | -0.025 | 0.027 |
dis | -1.4756 | 0.199 | -7.398 | 0.000 | -1.867 | -1.084 |
rad | 0.3060 | 0.066 | 4.613 | 0.000 | 0.176 | 0.436 |
tax | -0.0123 | 0.004 | -3.280 | 0.001 | -0.020 | -0.005 |
ptratio | -0.9527 | 0.131 | -7.283 | 0.000 | -1.210 | -0.696 |
black | 0.0093 | 0.003 | 3.467 | 0.001 | 0.004 | 0.015 |
lstat | -0.5248 | 0.051 | -10.347 | 0.000 | -0.624 | -0.425 |
Omnibus: | 178.041 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 783.126 |
Skew: | 1.521 | Prob(JB): | 8.84e-171 |
Kurtosis: | 8.281 | Cond. No. | 1.51e+04 |
Variance inflation factors
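The VIF for predictor $j$ is $\mathrm{VIF}_j = 1 / (1 - R_j^2)$, where $R_j^2$ is the $R^2$ from regressing predictor $j$ on all of the other predictors; `variance_inflation_factor` computes exactly this, one column of the design matrix at a time.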
# don't forget to add a constant if the OLS model includes an intercept
df_exog = sm.add_constant(df.drop('medv', axis = 1))
# iterate over the exog columns so each VIF lines up with its own predictor
for i, col in enumerate(df_exog.columns):
    if col == 'const':
        continue
    print('{:<8}: {:.2f}'.format(col, vif(df_exog.values, i)))
crim    : 1.79
zn      : 2.30
indus   : 3.99
chas    : 1.07
nox     : 4.39
rm      : 1.93
age     : 3.10
dis     : 3.96
rad     : 7.48
tax     : 9.01
ptratio : 1.80
black   : 1.35
lstat   : 2.94
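By the usual rule of thumb, a VIF above 5-10 signals problematic collinearity; here only `tax` (9.01) and `rad` (7.48) come close.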
Run a regression excluding the `age` predictor (the R equivalent would be `medv ~ . - age`):
lm = smf.ols(formula = ols_formula(df, 'medv', 'age'), data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.741 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.734 |
Method: | Least Squares | F-statistic: | 117.3 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 6.08e-136 |
Time: | 15:48:04 | Log-Likelihood: | -1498.8 |
No. Observations: | 506 | AIC: | 3024. |
Df Residuals: | 493 | BIC: | 3079. |
Df Model: | 12 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 36.4369 | 5.080 | 7.172 | 0.000 | 26.456 | 46.418 |
crim | -0.1080 | 0.033 | -3.290 | 0.001 | -0.173 | -0.043 |
zn | 0.0463 | 0.014 | 3.404 | 0.001 | 0.020 | 0.073 |
indus | 0.0206 | 0.061 | 0.335 | 0.738 | -0.100 | 0.141 |
chas | 2.6890 | 0.860 | 3.128 | 0.002 | 1.000 | 4.378 |
nox | -17.7135 | 3.679 | -4.814 | 0.000 | -24.943 | -10.484 |
rm | 3.8144 | 0.408 | 9.338 | 0.000 | 3.012 | 4.617 |
dis | -1.4786 | 0.191 | -7.757 | 0.000 | -1.853 | -1.104 |
rad | 0.3058 | 0.066 | 4.627 | 0.000 | 0.176 | 0.436 |
tax | -0.0123 | 0.004 | -3.283 | 0.001 | -0.020 | -0.005 |
ptratio | -0.9522 | 0.130 | -7.308 | 0.000 | -1.208 | -0.696 |
black | 0.0093 | 0.003 | 3.481 | 0.001 | 0.004 | 0.015 |
lstat | -0.5239 | 0.048 | -10.999 | 0.000 | -0.617 | -0.430 |
Omnibus: | 178.343 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 786.386 |
Skew: | 1.523 | Prob(JB): | 1.73e-171 |
Kurtosis: | 8.294 | Cond. No. | 1.48e+04 |
lm = smf.ols(formula = 'medv ~ lstat * age', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.556 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.553 |
Method: | Least Squares | F-statistic: | 209.3 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 4.86e-88 |
Time: | 15:48:04 | Log-Likelihood: | -1635.0 |
No. Observations: | 506 | AIC: | 3278. |
Df Residuals: | 502 | BIC: | 3295. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 36.0885 | 1.470 | 24.553 | 0.000 | 33.201 | 38.976 |
lstat | -1.3921 | 0.167 | -8.313 | 0.000 | -1.721 | -1.063 |
age | -0.0007 | 0.020 | -0.036 | 0.971 | -0.040 | 0.038 |
lstat:age | 0.0042 | 0.002 | 2.244 | 0.025 | 0.001 | 0.008 |
Omnibus: | 135.601 | Durbin-Watson: | 0.965 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 296.955 |
Skew: | 1.417 | Prob(JB): | 3.29e-65 |
Kurtosis: | 5.461 | Cond. No. | 6.88e+03 |
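As in R, the patsy term `lstat * age` is shorthand for `lstat + age + lstat:age`, so both main effects and the interaction enter the model; the interaction is significant (p = 0.025) even though the `age` main effect is not (p = 0.971).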
lm = smf.ols(formula = 'medv ~ lstat + I(lstat ** 2.0)', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.641 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.639 |
Method: | Least Squares | F-statistic: | 448.5 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 1.56e-112 |
Time: | 15:48:04 | Log-Likelihood: | -1581.3 |
No. Observations: | 506 | AIC: | 3169. |
Df Residuals: | 503 | BIC: | 3181. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 42.8620 | 0.872 | 49.149 | 0.000 | 41.149 | 44.575 |
lstat | -2.3328 | 0.124 | -18.843 | 0.000 | -2.576 | -2.090 |
I(lstat ** 2.0) | 0.0435 | 0.004 | 11.628 | 0.000 | 0.036 | 0.051 |
Omnibus: | 107.006 | Durbin-Watson: | 0.921 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 228.388 |
Skew: | 1.128 | Prob(JB): | 2.55e-50 |
Kurtosis: | 5.397 | Cond. No. | 1.13e+03 |
# anova of the two models
lm_fit1 = smf.ols(formula='medv ~ lstat', data = df).fit()
lm_fit2 = smf.ols(formula='medv ~ lstat + I(lstat**2.0)', data = df).fit()
sm.stats.anova_lm(lm_fit1, lm_fit2)
| df_resid | ssr | df_diff | ss_diff | F | Pr(>F)
---|---|---|---|---|---|---|
0 | 504.0 | 19472.381418 | 0.0 | NaN | NaN | NaN |
1 | 503.0 | 15347.243158 | 1.0 | 4125.13826 | 135.199822 | 7.630116e-28 |
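The near-zero p-value on an F-statistic of about 135 shows that the model containing the quadratic term is a significant improvement over the purely linear fit.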
f, axes = plt.subplots(4, 2, sharex = False, sharey = False)
f.set_figheight(20)
sns.regplot('lstat', 'medv', data=df, ax=axes[0, 0], order=1, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.regplot('lstat', 'medv', data=df, ax=axes[0, 1], order=2, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.regplot('lstat', 'medv', data=df, ax=axes[1, 0], order=3, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.regplot('lstat', 'medv', data=df, ax=axes[1, 1], order=5, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.residplot('lstat', 'medv', data=df, ax=axes[2, 0], order=1, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True)
sns.residplot('lstat', 'medv', data=df, ax=axes[2, 1], order=2, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True)
sns.residplot('lstat', 'medv', data=df, ax=axes[3, 0], order=3, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True)
sns.residplot('lstat', 'medv', data=df, ax=axes[3, 1], order=5, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True);
lm = smf.ols(formula = 'medv ~ lstat + I(lstat**2) + I(lstat**3) + I(lstat**4) + I(lstat**5)', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.682 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.679 |
Method: | Least Squares | F-statistic: | 214.2 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 8.73e-122 |
Time: | 15:48:06 | Log-Likelihood: | -1550.6 |
No. Observations: | 506 | AIC: | 3113. |
Df Residuals: | 500 | BIC: | 3139. |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 67.6997 | 3.604 | 18.783 | 0.000 | 60.618 | 74.781 |
lstat | -11.9911 | 1.526 | -7.859 | 0.000 | -14.989 | -8.994 |
I(lstat ** 2) | 1.2728 | 0.223 | 5.703 | 0.000 | 0.834 | 1.711 |
I(lstat ** 3) | -0.0683 | 0.014 | -4.747 | 0.000 | -0.097 | -0.040 |
I(lstat ** 4) | 0.0017 | 0.000 | 4.143 | 0.000 | 0.001 | 0.003 |
I(lstat ** 5) | -1.632e-05 | 4.42e-06 | -3.692 | 0.000 | -2.5e-05 | -7.63e-06 |
Omnibus: | 144.085 | Durbin-Watson: | 0.987 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 494.545 |
Skew: | 1.292 | Prob(JB): | 4.08e-108 |
Kurtosis: | 7.096 | Cond. No. | 1.37e+08 |
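The huge condition number (1.37e+08) reflects the severe collinearity among the raw powers of `lstat`; centering `lstat` before exponentiating, or using an orthogonal polynomial basis, would give a much better-conditioned design matrix while leaving the fitted values unchanged.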
lm = smf.ols(formula = 'medv ~ np.log(rm)', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.436 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.435 |
Method: | Least Squares | F-statistic: | 389.3 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 1.22e-64 |
Time: | 15:48:06 | Log-Likelihood: | -1695.4 |
No. Observations: | 506 | AIC: | 3395. |
Df Residuals: | 504 | BIC: | 3403. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | -76.4878 | 5.028 | -15.213 | 0.000 | -86.366 | -66.610 |
np.log(rm) | 54.0546 | 2.739 | 19.732 | 0.000 | 48.672 | 59.437 |
Omnibus: | 117.102 | Durbin-Watson: | 0.681 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 584.336 |
Skew: | 0.916 | Prob(JB): | 1.30e-127 |
Kurtosis: | 7.936 | Cond. No. | 38.9 |
sns.regplot('rm', 'medv', data = df, logx = True, line_kws = {'color': 'gray'}, scatter_kws = {'alpha': 0.5});
df = pd.read_csv('../../../data/Carseats.csv')
df.head()
| Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
1 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
2 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
3 | 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
4 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
formula = ols_formula(df, 'Sales', 'ShelveLoc') + ' + Income:Advertising + Price:Age + C(ShelveLoc)'
lm = smf.ols(formula, data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | Sales | R-squared: | 0.876 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.872 |
Method: | Least Squares | F-statistic: | 210.0 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 6.14e-166 |
Time: | 15:48:06 | Log-Likelihood: | -564.67 |
No. Observations: | 400 | AIC: | 1157. |
Df Residuals: | 386 | BIC: | 1213. |
Df Model: | 13 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 6.5756 | 1.009 | 6.519 | 0.000 | 4.592 | 8.559 |
Urban[T.Yes] | 0.1402 | 0.112 | 1.247 | 0.213 | -0.081 | 0.361 |
US[T.Yes] | -0.1576 | 0.149 | -1.058 | 0.291 | -0.450 | 0.135 |
C(ShelveLoc)[T.Good] | 4.8487 | 0.153 | 31.724 | 0.000 | 4.548 | 5.149 |
C(ShelveLoc)[T.Medium] | 1.9533 | 0.126 | 15.531 | 0.000 | 1.706 | 2.201 |
CompPrice | 0.0929 | 0.004 | 22.567 | 0.000 | 0.085 | 0.101 |
Income | 0.0109 | 0.003 | 4.183 | 0.000 | 0.006 | 0.016 |
Advertising | 0.0702 | 0.023 | 3.107 | 0.002 | 0.026 | 0.115 |
Population | 0.0002 | 0.000 | 0.433 | 0.665 | -0.001 | 0.001 |
Price | -0.1008 | 0.007 | -13.549 | 0.000 | -0.115 | -0.086 |
Age | -0.0579 | 0.016 | -3.633 | 0.000 | -0.089 | -0.027 |
Education | -0.0209 | 0.020 | -1.063 | 0.288 | -0.059 | 0.018 |
Income:Advertising | 0.0008 | 0.000 | 2.698 | 0.007 | 0.000 | 0.001 |
Price:Age | 0.0001 | 0.000 | 0.801 | 0.424 | -0.000 | 0.000 |
Omnibus: | 1.281 | Durbin-Watson: | 2.047 |
---|---|---|---|
Prob(Omnibus): | 0.527 | Jarque-Bera (JB): | 1.147 |
Skew: | 0.129 | Prob(JB): | 0.564 |
Kurtosis: | 3.050 | Cond. No. | 1.31e+05 |
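patsy dummy-codes the qualitative predictors automatically with treatment (baseline) coding: the alphabetically first level (`Bad` for `ShelveLoc`, `No` for `Urban` and `US`) is the baseline, so `C(ShelveLoc)[T.Good]` and `C(ShelveLoc)[T.Medium]` measure the effect of a good or medium shelving location relative to a bad one.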
As for writing our own functions, we already did: `ols_formula` above is exactly that.