from plot_lm import plot_lm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns
from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
# Matplotlib renamed the seaborn styles to 'seaborn-v0_8-*' in 3.6 and removed
# the old names in 3.8; fall back so the notebook runs on both old and new versions.
try:
    plt.style.use('seaborn-white')
except OSError:
    plt.style.use('seaborn-v0_8-white')
# Load the Advertising data, keeping only the TV/Radio/Newspaper/Sales columns
# (column 0 is an unnamed row-index column in the CSV).
advertising = pd.read_csv('Data/Advertising.csv', usecols=[1, 2, 3, 4])
advertising.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 4 columns): TV 200 non-null float64 Radio 200 non-null float64 Newspaper 200 non-null float64 Sales 200 non-null float64 dtypes: float64(4) memory usage: 6.3 KB
# Load the Credit data (column 0 is an unnamed row-index column in the CSV).
credit = pd.read_csv('Data/Credit.csv', usecols=list(range(1, 12)))
# Numeric copy of the Student factor (No -> 0, Yes -> 1) for later numeric use.
credit['Student2'] = credit.Student.map({'Yes': 1, 'No': 0})
credit.head(3)
Income | Limit | Rating | Cards | Age | Education | Gender | Student | Married | Ethnicity | Balance | Student2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.891 | 3606 | 283 | 2 | 34 | 11 | Male | No | Yes | Caucasian | 333 | 0 |
1 | 106.025 | 6645 | 483 | 3 | 82 | 15 | Female | Yes | Yes | Asian | 903 | 1 |
2 | 104.593 | 7075 | 514 | 4 | 71 | 11 | Male | No | No | Asian | 580 | 0 |
# '?' marks missing values in Auto.csv; drop the affected rows up front.
auto = pd.read_csv('Data/Auto.csv', na_values='?')
auto = auto.dropna()
auto.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 392 entries, 0 to 396 Data columns (total 9 columns): mpg 392 non-null float64 cylinders 392 non-null int64 displacement 392 non-null float64 horsepower 392 non-null float64 weight 392 non-null int64 acceleration 392 non-null float64 year 392 non-null int64 origin 392 non-null int64 name 392 non-null object dtypes: float64(4), int64(4), object(1) memory usage: 30.6+ KB
# Simple linear regression of Sales on TV spend; show the coefficient table.
est = smf.ols(formula='Sales ~ TV', data=advertising).fit()
est.summary().tables[1]
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 7.0326 | 0.458 | 15.360 | 0.000 | 6.130 | 7.935 |
TV | 0.0475 | 0.003 | 17.668 | 0.000 | 0.042 | 0.053 |
# Simple linear regression of Sales on Radio spend; show the coefficient table.
radio_model = smf.ols(formula='Sales ~ Radio', data=advertising)
est = radio_model.fit()
est.summary().tables[1]
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 9.3116 | 0.563 | 16.542 | 0.000 | 8.202 | 10.422 |
Radio | 0.2025 | 0.020 | 9.921 | 0.000 | 0.162 | 0.243 |
# Simple linear regression of Sales on Newspaper spend; show the coefficient table.
newspaper_model = smf.ols(formula='Sales ~ Newspaper', data=advertising)
est = newspaper_model.fit()
est.summary().tables[1]
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 12.3514 | 0.621 | 19.876 | 0.000 | 11.126 | 13.577 |
Newspaper | 0.0547 | 0.017 | 3.300 | 0.001 | 0.022 | 0.087 |
# Multiple regression on all three media budgets at once. Note that Newspaper,
# significant on its own, is not significant once TV and Radio are included.
multi_formula = 'Sales ~ TV + Radio + Newspaper'
est = smf.ols(multi_formula, advertising).fit()
est.summary()
Dep. Variable: | Sales | R-squared: | 0.897 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.896 |
Method: | Least Squares | F-statistic: | 570.3 |
Date: | Sat, 26 May 2018 | Prob (F-statistic): | 1.58e-96 |
Time: | 14:29:40 | Log-Likelihood: | -386.18 |
No. Observations: | 200 | AIC: | 780.4 |
Df Residuals: | 196 | BIC: | 793.6 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.9389 | 0.312 | 9.422 | 0.000 | 2.324 | 3.554 |
TV | 0.0458 | 0.001 | 32.809 | 0.000 | 0.043 | 0.049 |
Radio | 0.1885 | 0.009 | 21.893 | 0.000 | 0.172 | 0.206 |
Newspaper | -0.0010 | 0.006 | -0.177 | 0.860 | -0.013 | 0.011 |
Omnibus: | 60.414 | Durbin-Watson: | 2.084 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 151.241 |
Skew: | -1.327 | Prob(JB): | 1.44e-33 |
Kurtosis: | 6.332 | Cond. No. | 454. |
# Pairwise correlation matrix of the media budgets and Sales.
advertising.corr()
TV | Radio | Newspaper | Sales | |
---|---|---|---|---|
TV | 1.000000 | 0.054809 | 0.056648 | 0.782224 |
Radio | 0.054809 | 1.000000 | 0.354104 | 0.576223 |
Newspaper | 0.056648 | 0.354104 | 1.000000 | 0.228299 |
Sales | 0.782224 | 0.576223 | 0.228299 | 1.000000 |
# Regression with a TV x Radio interaction. 'TV + Radio + TV:Radio' spells out
# the same design matrix as the original 'TV + Radio + TV*Radio' (the formula
# language expands a*b to a + b + a:b and de-duplicates the main effects).
est = smf.ols('Sales ~ TV + Radio + TV:Radio', advertising).fit()
est.summary()
Dep. Variable: | Sales | R-squared: | 0.968 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.967 |
Method: | Least Squares | F-statistic: | 1963. |
Date: | Sat, 26 May 2018 | Prob (F-statistic): | 6.68e-146 |
Time: | 14:29:40 | Log-Likelihood: | -270.14 |
No. Observations: | 200 | AIC: | 548.3 |
Df Residuals: | 196 | BIC: | 561.5 |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 6.7502 | 0.248 | 27.233 | 0.000 | 6.261 | 7.239 |
TV | 0.0191 | 0.002 | 12.699 | 0.000 | 0.016 | 0.022 |
Radio | 0.0289 | 0.009 | 3.241 | 0.001 | 0.011 | 0.046 |
TV:Radio | 0.0011 | 5.24e-05 | 20.727 | 0.000 | 0.001 | 0.001 |
Omnibus: | 128.132 | Durbin-Watson: | 2.224 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1183.719 |
Skew: | -2.323 | Prob(JB): | 9.09e-258 |
Kurtosis: | 13.975 | Cond. No. | 1.80e+04 |
# Re-fit the three-predictor regression with scikit-learn; the intercept and
# coefficients should match the statsmodels fit above.
regr = skl_lm.LinearRegression()
X = advertising[['TV', 'Radio', 'Newspaper']].values
y = advertising.Sales
regr.fit(X, y)
coef_list = [regr.intercept_] + list(regr.coef_)
print(coef_list)
[2.9388893694594085, 0.045764645455397615, 0.18853001691820456, -0.0010374930424763272]
# Scatterplot matrix of Balance against the quantitative Credit predictors.
pairplot_cols = ['Balance', 'Age', 'Cards', 'Education', 'Income', 'Limit', 'Rating']
sns.pairplot(credit[pairplot_cols]);
# Model 1: parallel slopes — being a Student shifts only the intercept.
est1 = smf.ols(formula='Balance ~ Income + C(Student)', data=credit).fit()
regr1 = est1.params
# Model 2: adds an Income x Student interaction, giving students their own slope.
est2 = smf.ols(formula='Balance ~ Income + Income*C(Student)', data=credit).fit()
regr2 = est2.params
print('Regression 1 - without interaction term')
print(regr1)
print('\nRegression 2 - with interaction term')
print(regr2)
Regression 1 - without interaction term Intercept 211.142964 C(Student)[T.Yes] 382.670539 Income 5.984336 dtype: float64 Regression 2 - with interaction term Intercept 200.623153 C(Student)[T.Yes] 476.675843 Income 6.218169 Income:C(Student)[T.Yes] -1.999151 dtype: float64
# A positive AIC change means the interaction model is not preferred here.
aic_delta = est2.aic - est1.aic
print('Change in AIC: ', aic_delta)
Change in AIC: 0.655361215896
# Scatter of the raw Auto data with linear, quadratic and degree-5 polynomial fits.
plt.scatter(auto.horsepower, auto.mpg, facecolors='None', edgecolors='k', alpha=.5)
# Pass the data via x=/y= keywords: seaborn deprecated positional data arguments
# to regplot in 0.12 and removed them in later releases, so the keyword form is
# required on current versions (identical output on older ones).
sns.regplot(x=auto.horsepower, y=auto.mpg, ci=100, label='Linear',
            scatter=False, color='orange')
sns.regplot(x=auto.horsepower, y=auto.mpg, ci=100, label='Degree 2', order=2,
            scatter=False, color='lightblue')
sns.regplot(x=auto.horsepower, y=auto.mpg, ci=100, label='Degree 5', order=5,
            scatter=False, color='g')
plt.legend()
plt.ylim(5, 55)
plt.xlim(40, 240);
# Quadratic fit: mpg regressed on horsepower and horsepower squared.
quad_model = smf.ols(formula='mpg ~ horsepower + np.power(horsepower, 2)', data=auto)
est = quad_model.fit()
est.summary()
Dep. Variable: | mpg | R-squared: | 0.688 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.686 |
Method: | Least Squares | F-statistic: | 428.0 |
Date: | Sat, 26 May 2018 | Prob (F-statistic): | 5.40e-99 |
Time: | 14:29:45 | Log-Likelihood: | -1133.2 |
No. Observations: | 392 | AIC: | 2272. |
Df Residuals: | 389 | BIC: | 2284. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 56.9001 | 1.800 | 31.604 | 0.000 | 53.360 | 60.440 |
horsepower | -0.4662 | 0.031 | -14.978 | 0.000 | -0.527 | -0.405 |
np.power(horsepower, 2) | 0.0012 | 0.000 | 10.080 | 0.000 | 0.001 | 0.001 |
Omnibus: | 16.158 | Durbin-Watson: | 1.078 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 30.662 |
Skew: | 0.218 | Prob(JB): | 2.20e-07 |
Kurtosis: | 4.299 | Cond. No. | 1.29e+05 |
# Regression diagnostic plots for the quadratic fit. plot_lm is a project-local
# helper (imported at the top of the file); NOTE(review): presumably it takes
# (fitted_model, dataframe, response_column) — confirm against plot_lm.py.
plot_lm(est, auto, 'mpg')