import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from collections import OrderedDict
plt.style.use('seaborn') # pretty matplotlib plots
plt.rc('font', size = 14)
plt.rc('figure', titlesize = 18)
plt.rc('axes', labelsize = 15)
plt.rc('axes', titlesize = 18)
%matplotlib inline
df = pd.read_csv('../../../data/Boston.csv', header = 0)
df.head()
| crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
# list(df)
df.columns
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv'], dtype='object')
df.shape
(506, 14)
Using statsmodels
# predictor & dependent var
x_train = df['lstat']
y_true = df['medv']
# ols model with intercept added to predictor
lm = sm.OLS(y_true, sm.add_constant(x_train))
# fitted model and summary
lm_fit = lm.fit()
lm_fit.summary()
# heteroskedasticity-robust standard errors, if desired
# (OLS supports the HC0-HC3 sandwich estimators via cov_type):
# lm_fit = lm.fit(cov_type='HC2')
# lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.544 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.543 |
Method: | Least Squares | F-statistic: | 601.6 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 5.08e-88 |
Time: | 15:48:01 | Log-Likelihood: | -1641.5 |
No. Observations: | 506 | AIC: | 3287. |
Df Residuals: | 504 | BIC: | 3295. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
const | 34.5538 | 0.563 | 61.415 | 0.000 | 33.448 | 35.659 |
lstat | -0.9500 | 0.039 | -24.528 | 0.000 | -1.026 | -0.874 |
Omnibus: | 137.043 | Durbin-Watson: | 0.892 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 291.373 |
Skew: | 1.453 | Prob(JB): | 5.36e-64 |
Kurtosis: | 5.319 | Cond. No. | 29.7 |
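Since `medv` is the median home value in thousands of dollars and `lstat` is the percentage of lower-status households, the fitted slope says each one-point rise in `lstat` is associated with a drop of about $950 in median value; `lstat` alone explains about 54% of the variance in `medv`.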
Or, equivalently, using the formula interface:
lm = smf.ols('medv~lstat', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.544 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.543 |
Method: | Least Squares | F-statistic: | 601.6 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 5.08e-88 |
Time: | 15:48:01 | Log-Likelihood: | -1641.5 |
No. Observations: | 506 | AIC: | 3287. |
Df Residuals: | 504 | BIC: | 3295. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 34.5538 | 0.563 | 61.415 | 0.000 | 33.448 | 35.659 |
lstat | -0.9500 | 0.039 | -24.528 | 0.000 | -1.026 | -0.874 |
Omnibus: | 137.043 | Durbin-Watson: | 0.892 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 291.373 |
Skew: | 1.453 | Prob(JB): | 5.36e-64 |
Kurtosis: | 5.319 | Cond. No. | 29.7 |
dir(lm_fit)
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_get_robustcov_results', '_is_nested', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'diagn', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wald_test_terms', 'wresid']
lm_fit.params
Intercept    34.553841
lstat        -0.950049
dtype: float64
lm_fit.conf_int()
| 0 | 1
---|---|---|
Intercept | 33.448457 | 35.659225 |
lstat | -1.026148 | -0.873951 |
x_test = pd.DataFrame({'lstat': [5, 10, 15]})
y_pred = lm_fit.get_prediction(x_test)
y_pred.summary_frame()
| mean | mean_se | mean_ci_lower | mean_ci_upper | obs_ci_lower | obs_ci_upper
---|---|---|---|---|---|---|
0 | 29.803594 | 0.405247 | 29.007412 | 30.599776 | 17.565675 | 42.041513 |
1 | 25.053347 | 0.294814 | 24.474132 | 25.632563 | 12.827626 | 37.279068 |
2 | 20.303101 | 0.290893 | 19.731588 | 20.874613 | 8.077742 | 32.528459 |
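Here `mean_ci_lower`/`mean_ci_upper` bound the mean response (the confidence interval), while `obs_ci_lower`/`obs_ci_upper` bound a single new observation (the much wider prediction interval). The `wls_prediction_std` helper imported at the top returns the prediction interval directly; a minimal cross-check at the training points, assuming the default alpha = 0.05:
# prediction-interval bounds for every training observation
pred_std, pred_ci_lower, pred_ci_upper = wls_prediction_std(lm_fit)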
x_test = pd.DataFrame({'lstat': [df.lstat.min(), df.lstat.max()]})
y_pred = lm_fit.predict(x_test)
df.plot(x = 'lstat', y = 'medv', kind = 'scatter')
plt.plot(x_test, y_pred, c = 'red', linewidth = 2)
plt.xlabel("lstat")
plt.ylabel("medv")
plt.show()
x_test = df['lstat']
y_pred = lm_fit.predict(x_test)
f, axes = plt.subplots(1, 2, sharex = False, sharey = False)
f.set_figheight(10)
f.set_figwidth(15)
sns.regplot('lstat', 'medv', data = df, ax = axes[0], scatter_kws = {'alpha': 0.5})
sns.residplot(y_pred, 'medv', data = df, ax = axes[1], scatter_kws = {'alpha': 0.5}, lowess = True) # residual plot
axes[1].set_xlabel('Fitted values')
axes[1].set_ylabel('Residuals')
plt.show()
For creating diagnostic plots, see *Emulating R regression plots in Python*.
# fitted values (the model was fit with an intercept)
lm_fitted_y = lm_fit.fittedvalues
# model residuals
lm_residuals = lm_fit.resid
# normalized residuals
lm_norm_residuals = lm_fit.get_influence().resid_studentized_internal
# square root of absolute normalized residuals
lm_norm_residuals_abs_sqrt = np.sqrt(np.abs(lm_norm_residuals))
# absolute residuals
lm_abs_resid = np.abs(lm_residuals)
# leverage, from statsmodels internals
lm_leverage = lm_fit.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
lm_cooks = lm_fit.get_influence().cooks_distance[0]
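For reference, the internally studentized residuals used here are $r_i = e_i / (\hat{\sigma} \sqrt{1 - h_{ii}})$, where $e_i$ is the raw residual and $h_{ii}$ is the leverage, i.e. the $i$-th diagonal element of the hat matrix.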
Residual plot
plot_lm_1 = plt.figure(1)
plot_lm_1.set_figheight(8)
plot_lm_1.set_figwidth(12)
plot_lm_1.axes[0] = sns.residplot(lm_fitted_y, 'medv', data = df,
lowess = True,
scatter_kws = {'alpha': 0.5},
line_kws = {'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_title('Residuals vs Fitted')
plot_lm_1.axes[0].set_xlabel('Fitted values')
plot_lm_1.axes[0].set_ylabel('Residuals')
# annotations
abs_resid = lm_abs_resid.sort_values(ascending = False)
abs_resid_top_3 = abs_resid[:3]
for i in abs_resid_top_3.index:
plot_lm_1.axes[0].annotate(i, xy = (lm_fitted_y[i], lm_residuals[i]));
QQ plot
QQ = ProbPlot(lm_norm_residuals)
plot_lm_2 = QQ.qqplot(line = '45', alpha = 0.5, color = '#4C72B0', lw = 1)
plot_lm_2.set_figheight(8)
plot_lm_2.set_figwidth(12)
plot_lm_2.axes[0].set_title('Normal Q-Q')
plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
plot_lm_2.axes[0].set_ylabel('Standardized Residuals');
# annotations
abs_norm_resid = np.flip(np.argsort(np.abs(lm_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]
for r, i in enumerate(abs_norm_resid_top_3):
plot_lm_2.axes[0].annotate(i, xy=(np.flip(QQ.theoretical_quantiles, 0)[r], lm_norm_residuals[i]));
Scale-Location Plot
plot_lm_3 = plt.figure(3)
plot_lm_3.set_figheight(8)
plot_lm_3.set_figwidth(12)
plt.scatter(lm_fitted_y, lm_norm_residuals_abs_sqrt, alpha=0.5)
sns.regplot(lm_fitted_y, lm_norm_residuals_abs_sqrt,
scatter=False,
ci=False,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_3.axes[0].set_title('Scale-Location')
plot_lm_3.axes[0].set_xlabel('Fitted values')
plot_lm_3.axes[0].set_ylabel(r'$\sqrt{|Standardized Residuals|}$');
# annotations
abs_sq_norm_resid = np.flip(np.argsort(lm_norm_residuals_abs_sqrt), 0)
abs_sq_norm_resid_top_3 = abs_sq_norm_resid[:3]
for i in abs_sq_norm_resid_top_3:
    plot_lm_3.axes[0].annotate(i, xy=(lm_fitted_y[i], lm_norm_residuals_abs_sqrt[i]));
Leverage plot
plot_lm_4 = plt.figure(4)
plot_lm_4.set_figheight(8)
plot_lm_4.set_figwidth(12)
plt.scatter(lm_leverage, lm_norm_residuals, alpha=0.5)
sns.regplot(lm_leverage, lm_norm_residuals,
scatter=False,
ci=False,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_4.axes[0].set_xlim(0, 0.20)
plot_lm_4.axes[0].set_ylim(-3, 5)
plot_lm_4.axes[0].set_title('Residuals vs Leverage')
plot_lm_4.axes[0].set_xlabel('Leverage')
plot_lm_4.axes[0].set_ylabel('Standardized Residuals')
# annotations
leverage_top_3 = np.flip(np.argsort(lm_cooks), 0)[:3]
for i in leverage_top_3:
plot_lm_4.axes[0].annotate(i, xy=(lm_leverage[i], lm_norm_residuals[i]))
# shenanigans for cook's distance contours
def graph(formula, x_range, label=None):
x = x_range
y = formula(x)
plt.plot(x, y, label=label, lw=1, ls='--', color='red')
p = len(lm_fit.params) # number of model parameters
graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
np.linspace(0.001, 0.200, 50),
'Cook\'s distance') # 0.5 line
graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
np.linspace(0.001, 0.200, 50)) # 1 line
plt.legend(loc='upper right');
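The dashed contours come from Cook's distance for a model with $p$ parameters, $D = \frac{r^2}{p} \cdot \frac{h}{1-h}$: fixing $D$ and solving for the standardized residual gives $r = \sqrt{D\,p\,(1-h)/h}$, which is exactly the curve plotted above for $D = 0.5$ and $D = 1$.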
lm_fit = smf.ols('medv~lstat+age', data = df).fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.551 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.549 |
Method: | Least Squares | F-statistic: | 309.0 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 2.98e-88 |
Time: | 15:48:03 | Log-Likelihood: | -1637.5 |
No. Observations: | 506 | AIC: | 3281. |
Df Residuals: | 503 | BIC: | 3294. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 33.2228 | 0.731 | 45.458 | 0.000 | 31.787 | 34.659 |
lstat | -1.0321 | 0.048 | -21.416 | 0.000 | -1.127 | -0.937 |
age | 0.0345 | 0.012 | 2.826 | 0.005 | 0.011 | 0.059 |
Omnibus: | 124.288 | Durbin-Watson: | 0.945 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 244.026 |
Skew: | 1.362 | Prob(JB): | 1.02e-53 |
Kurtosis: | 5.038 | Cond. No. | 201. |
If we want to use all of the variables, we can use the following trick to construct the formula string. patsy offers no shorthand like R's `medv ~ .`, so in Python you usually have to build the variable list yourself.
def ols_formula(df, dependent_var, *excluded_cols):
'''
Generates the R style formula for statsmodels (patsy) given
the dataframe, dependent variable and optional excluded columns
as strings
'''
df_columns = list(df.columns.values)
df_columns.remove(dependent_var)
for col in excluded_cols:
df_columns.remove(col)
return dependent_var + ' ~ ' + ' + '.join(df_columns)
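For example, `ols_formula(df, 'medv')` returns `'medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat'`.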
lm_fit = smf.ols(formula = ols_formula(df, 'medv'), data = df).fit() # formula = 'medv ~ .'
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.741 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.734 |
Method: | Least Squares | F-statistic: | 108.1 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 6.72e-135 |
Time: | 15:48:03 | Log-Likelihood: | -1498.8 |
No. Observations: | 506 | AIC: | 3026. |
Df Residuals: | 492 | BIC: | 3085. |
Df Model: | 13 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 36.4595 | 5.103 | 7.144 | 0.000 | 26.432 | 46.487 |
crim | -0.1080 | 0.033 | -3.287 | 0.001 | -0.173 | -0.043 |
zn | 0.0464 | 0.014 | 3.382 | 0.001 | 0.019 | 0.073 |
indus | 0.0206 | 0.061 | 0.334 | 0.738 | -0.100 | 0.141 |
chas | 2.6867 | 0.862 | 3.118 | 0.002 | 0.994 | 4.380 |
nox | -17.7666 | 3.820 | -4.651 | 0.000 | -25.272 | -10.262 |
rm | 3.8099 | 0.418 | 9.116 | 0.000 | 2.989 | 4.631 |
age | 0.0007 | 0.013 | 0.052 | 0.958 | -0.025 | 0.027 |
dis | -1.4756 | 0.199 | -7.398 | 0.000 | -1.867 | -1.084 |
rad | 0.3060 | 0.066 | 4.613 | 0.000 | 0.176 | 0.436 |
tax | -0.0123 | 0.004 | -3.280 | 0.001 | -0.020 | -0.005 |
ptratio | -0.9527 | 0.131 | -7.283 | 0.000 | -1.210 | -0.696 |
black | 0.0093 | 0.003 | 3.467 | 0.001 | 0.004 | 0.015 |
lstat | -0.5248 | 0.051 | -10.347 | 0.000 | -0.624 | -0.425 |
Omnibus: | 178.041 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 783.126 |
Skew: | 1.521 | Prob(JB): | 8.84e-171 |
Kurtosis: | 8.281 | Cond. No. | 1.51e+04 |
Variance inflation factors
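The VIF for predictor $j$ is $\mathrm{VIF}_j = 1 / (1 - R_j^2)$, where $R_j^2$ is the $R^2$ from regressing predictor $j$ on all of the other predictors; `variance_inflation_factor` computes exactly this, one column of the design matrix at a time.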
# don't forget to add a constant if the OLS model includes an intercept
df_exog = sm.add_constant(df.drop('medv', axis = 1))
# iterate over the exog columns so each VIF lines up with its own predictor
for i, col in enumerate(df_exog.columns):
    if col == 'const':
        continue
    print('{:<8}: {:.2f}'.format(col, vif(df_exog.values, i)))
crim    : 1.79
zn      : 2.30
indus   : 3.99
chas    : 1.07
nox     : 4.39
rm      : 1.93
age     : 3.10
dis     : 3.96
rad     : 7.48
tax     : 9.01
ptratio : 1.80
black   : 1.35
lstat   : 2.94
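By the usual rule of thumb, a VIF above 5-10 signals problematic collinearity; here only `tax` (9.01) and `rad` (7.48) come close.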
Run a regression excluding the `age` predictor (the R equivalent would be `medv ~ . - age`):
lm = smf.ols(formula = ols_formula(df, 'medv', 'age'), data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.741 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.734 |
Method: | Least Squares | F-statistic: | 117.3 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 6.08e-136 |
Time: | 15:48:04 | Log-Likelihood: | -1498.8 |
No. Observations: | 506 | AIC: | 3024. |
Df Residuals: | 493 | BIC: | 3079. |
Df Model: | 12 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 36.4369 | 5.080 | 7.172 | 0.000 | 26.456 | 46.418 |
crim | -0.1080 | 0.033 | -3.290 | 0.001 | -0.173 | -0.043 |
zn | 0.0463 | 0.014 | 3.404 | 0.001 | 0.020 | 0.073 |
indus | 0.0206 | 0.061 | 0.335 | 0.738 | -0.100 | 0.141 |
chas | 2.6890 | 0.860 | 3.128 | 0.002 | 1.000 | 4.378 |
nox | -17.7135 | 3.679 | -4.814 | 0.000 | -24.943 | -10.484 |
rm | 3.8144 | 0.408 | 9.338 | 0.000 | 3.012 | 4.617 |
dis | -1.4786 | 0.191 | -7.757 | 0.000 | -1.853 | -1.104 |
rad | 0.3058 | 0.066 | 4.627 | 0.000 | 0.176 | 0.436 |
tax | -0.0123 | 0.004 | -3.283 | 0.001 | -0.020 | -0.005 |
ptratio | -0.9522 | 0.130 | -7.308 | 0.000 | -1.208 | -0.696 |
black | 0.0093 | 0.003 | 3.481 | 0.001 | 0.004 | 0.015 |
lstat | -0.5239 | 0.048 | -10.999 | 0.000 | -0.617 | -0.430 |
Omnibus: | 178.343 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 786.386 |
Skew: | 1.523 | Prob(JB): | 1.73e-171 |
Kurtosis: | 8.294 | Cond. No. | 1.48e+04 |
lm = smf.ols(formula = 'medv ~ lstat * age', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.556 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.553 |
Method: | Least Squares | F-statistic: | 209.3 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 4.86e-88 |
Time: | 15:48:04 | Log-Likelihood: | -1635.0 |
No. Observations: | 506 | AIC: | 3278. |
Df Residuals: | 502 | BIC: | 3295. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 36.0885 | 1.470 | 24.553 | 0.000 | 33.201 | 38.976 |
lstat | -1.3921 | 0.167 | -8.313 | 0.000 | -1.721 | -1.063 |
age | -0.0007 | 0.020 | -0.036 | 0.971 | -0.040 | 0.038 |
lstat:age | 0.0042 | 0.002 | 2.244 | 0.025 | 0.001 | 0.008 |
Omnibus: | 135.601 | Durbin-Watson: | 0.965 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 296.955 |
Skew: | 1.417 | Prob(JB): | 3.29e-65 |
Kurtosis: | 5.461 | Cond. No. | 6.88e+03 |
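As in R, the patsy term `lstat * age` is shorthand for `lstat + age + lstat:age`, so both main effects and the interaction enter the model; the interaction is significant (p = 0.025) even though the `age` main effect is not (p = 0.971).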
lm = smf.ols(formula = 'medv ~ lstat + I(lstat ** 2.0)', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.641 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.639 |
Method: | Least Squares | F-statistic: | 448.5 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 1.56e-112 |
Time: | 15:48:04 | Log-Likelihood: | -1581.3 |
No. Observations: | 506 | AIC: | 3169. |
Df Residuals: | 503 | BIC: | 3181. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 42.8620 | 0.872 | 49.149 | 0.000 | 41.149 | 44.575 |
lstat | -2.3328 | 0.124 | -18.843 | 0.000 | -2.576 | -2.090 |
I(lstat ** 2.0) | 0.0435 | 0.004 | 11.628 | 0.000 | 0.036 | 0.051 |
Omnibus: | 107.006 | Durbin-Watson: | 0.921 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 228.388 |
Skew: | 1.128 | Prob(JB): | 2.55e-50 |
Kurtosis: | 5.397 | Cond. No. | 1.13e+03 |
# anova of the two models
lm_fit1 = smf.ols(formula='medv ~ lstat', data = df).fit()
lm_fit2 = smf.ols(formula='medv ~ lstat + I(lstat**2.0)', data = df).fit()
sm.stats.anova_lm(lm_fit1, lm_fit2)
| df_resid | ssr | df_diff | ss_diff | F | Pr(>F)
---|---|---|---|---|---|---|
0 | 504.0 | 19472.381418 | 0.0 | NaN | NaN | NaN |
1 | 503.0 | 15347.243158 | 1.0 | 4125.13826 | 135.199822 | 7.630116e-28 |
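The near-zero p-value on an F-statistic of about 135 shows that the model containing the quadratic term is a significant improvement over the purely linear fit.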
f, axes = plt.subplots(4, 2, sharex = False, sharey = False)
f.set_figheight(20)
sns.regplot('lstat', 'medv', data=df, ax=axes[0, 0], order=1, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.regplot('lstat', 'medv', data=df, ax=axes[0, 1], order=2, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.regplot('lstat', 'medv', data=df, ax=axes[1, 0], order=3, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.regplot('lstat', 'medv', data=df, ax=axes[1, 1], order=5, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5})
sns.residplot('lstat', 'medv', data=df, ax=axes[2, 0], order=1, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True)
sns.residplot('lstat', 'medv', data=df, ax=axes[2, 1], order=2, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True)
sns.residplot('lstat', 'medv', data=df, ax=axes[3, 0], order=3, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True)
sns.residplot('lstat', 'medv', data=df, ax=axes[3, 1], order=5, line_kws={'color': 'gray'}, scatter_kws={'alpha': 0.5}, lowess = True);
lm = smf.ols(formula = 'medv ~ lstat + I(lstat**2) + I(lstat**3) + I(lstat**4) + I(lstat**5)', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.682 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.679 |
Method: | Least Squares | F-statistic: | 214.2 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 8.73e-122 |
Time: | 15:48:06 | Log-Likelihood: | -1550.6 |
No. Observations: | 506 | AIC: | 3113. |
Df Residuals: | 500 | BIC: | 3139. |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 67.6997 | 3.604 | 18.783 | 0.000 | 60.618 | 74.781 |
lstat | -11.9911 | 1.526 | -7.859 | 0.000 | -14.989 | -8.994 |
I(lstat ** 2) | 1.2728 | 0.223 | 5.703 | 0.000 | 0.834 | 1.711 |
I(lstat ** 3) | -0.0683 | 0.014 | -4.747 | 0.000 | -0.097 | -0.040 |
I(lstat ** 4) | 0.0017 | 0.000 | 4.143 | 0.000 | 0.001 | 0.003 |
I(lstat ** 5) | -1.632e-05 | 4.42e-06 | -3.692 | 0.000 | -2.5e-05 | -7.63e-06 |
Omnibus: | 144.085 | Durbin-Watson: | 0.987 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 494.545 |
Skew: | 1.292 | Prob(JB): | 4.08e-108 |
Kurtosis: | 7.096 | Cond. No. | 1.37e+08 |
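The huge condition number (1.37e+08) reflects the severe collinearity among the raw powers of `lstat`; centering `lstat` before exponentiating, or using an orthogonal polynomial basis, would give a much better-conditioned design matrix while leaving the fitted values unchanged.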
lm = smf.ols(formula = 'medv ~ np.log(rm)', data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | medv | R-squared: | 0.436 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.435 |
Method: | Least Squares | F-statistic: | 389.3 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 1.22e-64 |
Time: | 15:48:06 | Log-Likelihood: | -1695.4 |
No. Observations: | 506 | AIC: | 3395. |
Df Residuals: | 504 | BIC: | 3403. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | -76.4878 | 5.028 | -15.213 | 0.000 | -86.366 | -66.610 |
np.log(rm) | 54.0546 | 2.739 | 19.732 | 0.000 | 48.672 | 59.437 |
Omnibus: | 117.102 | Durbin-Watson: | 0.681 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 584.336 |
Skew: | 0.916 | Prob(JB): | 1.30e-127 |
Kurtosis: | 7.936 | Cond. No. | 38.9 |
sns.regplot('rm', 'medv', data = df, logx = True, line_kws = {'color': 'gray'}, scatter_kws = {'alpha': 0.5});
df = pd.read_csv('../../../data/Carseats.csv')
df.head()
| Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
1 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
2 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
3 | 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
4 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
formula = ols_formula(df, 'Sales', 'ShelveLoc') + ' + Income:Advertising + Price:Age + C(ShelveLoc)'
lm = smf.ols(formula, data = df)
lm_fit = lm.fit()
lm_fit.summary()
Dep. Variable: | Sales | R-squared: | 0.876 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.872 |
Method: | Least Squares | F-statistic: | 210.0 |
Date: | Sun, 03 Jun 2018 | Prob (F-statistic): | 6.14e-166 |
Time: | 15:48:06 | Log-Likelihood: | -564.67 |
No. Observations: | 400 | AIC: | 1157. |
Df Residuals: | 386 | BIC: | 1213. |
Df Model: | 13 | ||
Covariance Type: | nonrobust |
| coef | std err | t | P>\|t\| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 6.5756 | 1.009 | 6.519 | 0.000 | 4.592 | 8.559 |
Urban[T.Yes] | 0.1402 | 0.112 | 1.247 | 0.213 | -0.081 | 0.361 |
US[T.Yes] | -0.1576 | 0.149 | -1.058 | 0.291 | -0.450 | 0.135 |
C(ShelveLoc)[T.Good] | 4.8487 | 0.153 | 31.724 | 0.000 | 4.548 | 5.149 |
C(ShelveLoc)[T.Medium] | 1.9533 | 0.126 | 15.531 | 0.000 | 1.706 | 2.201 |
CompPrice | 0.0929 | 0.004 | 22.567 | 0.000 | 0.085 | 0.101 |
Income | 0.0109 | 0.003 | 4.183 | 0.000 | 0.006 | 0.016 |
Advertising | 0.0702 | 0.023 | 3.107 | 0.002 | 0.026 | 0.115 |
Population | 0.0002 | 0.000 | 0.433 | 0.665 | -0.001 | 0.001 |
Price | -0.1008 | 0.007 | -13.549 | 0.000 | -0.115 | -0.086 |
Age | -0.0579 | 0.016 | -3.633 | 0.000 | -0.089 | -0.027 |
Education | -0.0209 | 0.020 | -1.063 | 0.288 | -0.059 | 0.018 |
Income:Advertising | 0.0008 | 0.000 | 2.698 | 0.007 | 0.000 | 0.001 |
Price:Age | 0.0001 | 0.000 | 0.801 | 0.424 | -0.000 | 0.000 |
Omnibus: | 1.281 | Durbin-Watson: | 2.047 |
---|---|---|---|
Prob(Omnibus): | 0.527 | Jarque-Bera (JB): | 1.147 |
Skew: | 0.129 | Prob(JB): | 0.564 |
Kurtosis: | 3.050 | Cond. No. | 1.31e+05 |
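patsy dummy-codes the qualitative predictors automatically with treatment (baseline) coding: the alphabetically first level (`Bad` for `ShelveLoc`, `No` for `Urban` and `US`) is the baseline, so `C(ShelveLoc)[T.Good]` and `C(ShelveLoc)[T.Medium]` measure the effect of a good or medium shelving location relative to a bad one.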
As for writing our own functions, we already did: `ols_formula` above is exactly that.