from plot_lm import plot_lm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns
from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
# Matplotlib renamed the seaborn styles to 'seaborn-v0_8-*' in 3.6 and removed
# the old names in 3.8; fall back so the notebook runs on both old and new versions.
try:
    plt.style.use('seaborn-white')
except OSError:
    plt.style.use('seaborn-v0_8-white')
# Load the Advertising data, keeping only the TV/Radio/Newspaper/Sales columns
# (column 0 is an unnamed row-index column in the CSV).
advertising = pd.read_csv('Data/Advertising.csv', usecols=[1, 2, 3, 4])
advertising.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 4 columns): TV 200 non-null float64 Radio 200 non-null float64 Newspaper 200 non-null float64 Sales 200 non-null float64 dtypes: float64(4) memory usage: 6.3 KB
# Load the Credit data (column 0 is an unnamed row-index column in the CSV).
credit = pd.read_csv('Data/Credit.csv', usecols=list(range(1, 12)))
# Numeric copy of the Student factor (No -> 0, Yes -> 1) for later numeric use.
credit['Student2'] = credit.Student.map({'Yes': 1, 'No': 0})
credit.head(3)
Income | Limit | Rating | Cards | Age | Education | Gender | Student | Married | Ethnicity | Balance | Student2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.891 | 3606 | 283 | 2 | 34 | 11 | Male | No | Yes | Caucasian | 333 | 0 |
1 | 106.025 | 6645 | 483 | 3 | 82 | 15 | Female | Yes | Yes | Asian | 903 | 1 |
2 | 104.593 | 7075 | 514 | 4 | 71 | 11 | Male | No | No | Asian | 580 | 0 |
# '?' marks missing values in Auto.csv; drop the affected rows up front.
auto = pd.read_csv('Data/Auto.csv', na_values='?')
auto = auto.dropna()
auto.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 392 entries, 0 to 396 Data columns (total 9 columns): mpg 392 non-null float64 cylinders 392 non-null int64 displacement 392 non-null float64 horsepower 392 non-null float64 weight 392 non-null int64 acceleration 392 non-null float64 year 392 non-null int64 origin 392 non-null int64 name 392 non-null object dtypes: float64(4), int64(4), object(1) memory usage: 30.6+ KB
# Simple linear regression of Sales on TV spend; show the coefficient table.
est = smf.ols(formula='Sales ~ TV', data=advertising).fit()
est.summary().tables[1]
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 7.0326 | 0.458 | 15.360 | 0.000 | 6.130 | 7.935 |
TV | 0.0475 | 0.003 | 17.668 | 0.000 | 0.042 | 0.053 |
# Simple linear regression of Sales on Radio spend; show the coefficient table.
radio_model = smf.ols(formula='Sales ~ Radio', data=advertising)
est = radio_model.fit()
est.summary().tables[1]
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 9.3116 | 0.563 | 16.542 | 0.000 | 8.202 | 10.422 |
Radio | 0.2025 | 0.020 | 9.921 | 0.000 | 0.162 | 0.243 |
# Simple linear regression of Sales on Newspaper spend; show the coefficient table.
newspaper_model = smf.ols(formula='Sales ~ Newspaper', data=advertising)
est = newspaper_model.fit()
est.summary().tables[1]
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 12.3514 | 0.621 | 19.876 | 0.000 | 11.126 | 13.577 |
Newspaper | 0.0547 | 0.017 | 3.300 | 0.001 | 0.022 | 0.087 |
# Multiple regression on all three media budgets at once. Note that Newspaper,
# significant on its own, is not significant once TV and Radio are included.
multi_formula = 'Sales ~ TV + Radio + Newspaper'
est = smf.ols(multi_formula, advertising).fit()
est.summary()
Dep. Variable: | Sales | R-squared: | 0.897 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.896 |
Method: | Least Squares | F-statistic: | 570.3 |
Date: | Sat, 26 May 2018 | Prob (F-statistic): | 1.58e-96 |
Time: | 14:29:40 | Log-Likelihood: | -386.18 |
No. Observations: | 200 | AIC: | 780.4 |
Df Residuals: | 196 | BIC: | 793.6 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.9389 | 0.312 | 9.422 | 0.000 | 2.324 | 3.554 |
TV | 0.0458 | 0.001 | 32.809 | 0.000 | 0.043 | 0.049 |
Radio | 0.1885 | 0.009 | 21.893 | 0.000 | 0.172 | 0.206 |
Newspaper | -0.0010 | 0.006 | -0.177 | 0.860 | -0.013 | 0.011 |
Omnibus: | 60.414 | Durbin-Watson: | 2.084 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 151.241 |
Skew: | -1.327 | Prob(JB): | 1.44e-33 |
Kurtosis: | 6.332 | Cond. No. | 454. |
# Pairwise correlation matrix of the media budgets and Sales.
advertising.corr()
TV | Radio | Newspaper | Sales | |
---|---|---|---|---|
TV | 1.000000 | 0.054809 | 0.056648 | 0.782224 |
Radio | 0.054809 | 1.000000 | 0.354104 | 0.576223 |
Newspaper | 0.056648 | 0.354104 | 1.000000 | 0.228299 |
Sales | 0.782224 | 0.576223 | 0.228299 | 1.000000 |
# Regression with a TV x Radio interaction. 'TV + Radio + TV:Radio' spells out
# the same design matrix as the original 'TV + Radio + TV*Radio' (the formula
# language expands a*b to a + b + a:b and de-duplicates the main effects).
est = smf.ols('Sales ~ TV + Radio + TV:Radio', advertising).fit()
est.summary()
Dep. Variable: | Sales | R-squared: | 0.968 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.967 |
Method: | Least Squares | F-statistic: | 1963. |
Date: | Sat, 26 May 2018 | Prob (F-statistic): | 6.68e-146 |
Time: | 14:29:40 | Log-Likelihood: | -270.14 |
No. Observations: | 200 | AIC: | 548.3 |
Df Residuals: | 196 | BIC: | 561.5 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 6.7502 | 0.248 | 27.233 | 0.000 | 6.261 | 7.239 |
TV | 0.0191 | 0.002 | 12.699 | 0.000 | 0.016 | 0.022 |
Radio | 0.0289 | 0.009 | 3.241 | 0.001 | 0.011 | 0.046 |
TV:Radio | 0.0011 | 5.24e-05 | 20.727 | 0.000 | 0.001 | 0.001 |
Omnibus: | 128.132 | Durbin-Watson: | 2.224 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1183.719 |
Skew: | -2.323 | Prob(JB): | 9.09e-258 |
Kurtosis: | 13.975 | Cond. No. | 1.80e+04 |
# Re-fit the three-predictor regression with scikit-learn; the intercept and
# coefficients should match the statsmodels fit above.
regr = skl_lm.LinearRegression()
X = advertising[['TV', 'Radio', 'Newspaper']].values
y = advertising.Sales
regr.fit(X, y)
coef_list = [regr.intercept_] + list(regr.coef_)
print(coef_list)
[2.9388893694594085, 0.045764645455397615, 0.18853001691820456, -0.0010374930424763272]
# Scatterplot matrix of Balance against the quantitative Credit predictors.
pairplot_cols = ['Balance', 'Age', 'Cards', 'Education', 'Income', 'Limit', 'Rating']
sns.pairplot(credit[pairplot_cols]);
# Model 1: parallel slopes — being a Student shifts only the intercept.
est1 = smf.ols(formula='Balance ~ Income + C(Student)', data=credit).fit()
regr1 = est1.params
# Model 2: adds an Income x Student interaction, giving students their own slope.
est2 = smf.ols(formula='Balance ~ Income + Income*C(Student)', data=credit).fit()
regr2 = est2.params
print('Regression 1 - without interaction term')
print(regr1)
print('\nRegression 2 - with interaction term')
print(regr2)
Regression 1 - without interaction term Intercept 211.142964 C(Student)[T.Yes] 382.670539 Income 5.984336 dtype: float64 Regression 2 - with interaction term Intercept 200.623153 C(Student)[T.Yes] 476.675843 Income 6.218169 Income:C(Student)[T.Yes] -1.999151 dtype: float64
# A positive AIC change means the interaction model is not preferred here.
aic_delta = est2.aic - est1.aic
print('Change in AIC: ', aic_delta)
Change in AIC: 0.655361215896
# Scatter of the raw Auto data with linear, quadratic and degree-5 polynomial fits.
plt.scatter(auto.horsepower, auto.mpg, facecolors='None', edgecolors='k', alpha=.5)
# Pass the data via x=/y= keywords: seaborn deprecated positional data arguments
# to regplot in 0.12 and removed them in later releases, so the keyword form is
# required on current versions (identical output on older ones).
sns.regplot(x=auto.horsepower, y=auto.mpg, ci=100, label='Linear',
            scatter=False, color='orange')
sns.regplot(x=auto.horsepower, y=auto.mpg, ci=100, label='Degree 2', order=2,
            scatter=False, color='lightblue')
sns.regplot(x=auto.horsepower, y=auto.mpg, ci=100, label='Degree 5', order=5,
            scatter=False, color='g')
plt.legend()
plt.ylim(5, 55)
plt.xlim(40, 240);
# Quadratic fit: mpg regressed on horsepower and horsepower squared.
quad_model = smf.ols(formula='mpg ~ horsepower + np.power(horsepower, 2)', data=auto)
est = quad_model.fit()
est.summary()
Dep. Variable: | mpg | R-squared: | 0.688 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.686 |
Method: | Least Squares | F-statistic: | 428.0 |
Date: | Sat, 26 May 2018 | Prob (F-statistic): | 5.40e-99 |
Time: | 14:29:45 | Log-Likelihood: | -1133.2 |
No. Observations: | 392 | AIC: | 2272. |
Df Residuals: | 389 | BIC: | 2284. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 56.9001 | 1.800 | 31.604 | 0.000 | 53.360 | 60.440 |
horsepower | -0.4662 | 0.031 | -14.978 | 0.000 | -0.527 | -0.405 |
np.power(horsepower, 2) | 0.0012 | 0.000 | 10.080 | 0.000 | 0.001 | 0.001 |
Omnibus: | 16.158 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 30.662 |
Skew: | 0.218 | Prob(JB): | 2.20e-07 |
Kurtosis: | 4.299 | Cond. No. | 1.29e+05 |
# Regression diagnostic plots for the quadratic fit. plot_lm is a project-local
# helper (imported at the top of the file); NOTE(review): presumably it takes
# (fitted_model, dataframe, response_column) — confirm against plot_lm.py.
plot_lm(est, auto, 'mpg')