Author: Thomas Haslwanter, Date: Feb-2017
%matplotlib inline
import numpy as np
import pandas as pd
from urllib.request import urlopen
from statsmodels.formula.api import ols
import statsmodels.regression.linear_model as sm
from statsmodels.stats.anova import anova_lm
# Get the data
inFile = 'swim100m.csv'
url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_kaplan/'
url = url_base + inFile
data = pd.read_csv(urlopen(url))
# Different models
model1 = ols("time ~ sex", data).fit() # one factor
print(model1.summary())
OLS Regression Results ============================================================================== Dep. Variable: time R-squared: 0.287 Model: OLS Adj. R-squared: 0.275 Method: Least Squares F-statistic: 24.13 Date: Sat, 04 Feb 2017 Prob (F-statistic): 7.28e-06 Time: 14:21:20 Log-Likelihood: -219.23 No. Observations: 62 AIC: 442.5 Df Residuals: 60 BIC: 446.7 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 65.1923 1.517 42.986 0.000 62.159 68.226 sex[T.M] -10.5361 2.145 -4.912 0.000 -14.826 -6.246 ============================================================================== Omnibus: 16.370 Durbin-Watson: 0.353 Prob(Omnibus): 0.000 Jarque-Bera (JB): 19.838 Skew: 1.113 Prob(JB): 4.92e-05 Kurtosis: 4.649 Cond. No. 2.62 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(anova_lm(model1))
df sum_sq mean_sq F PR(>F) sex 1.0 1720.655232 1720.655232 24.132575 0.000007 Residual 60.0 4278.006477 71.300108 NaN NaN
C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal cond2 = cond0 & (x <= self.a)
model2 = ols("time ~ sex + year", data).fit() # two factors
print(model2.summary())
OLS Regression Results ============================================================================== Dep. Variable: time R-squared: 0.844 Model: OLS Adj. R-squared: 0.839 Method: Least Squares F-statistic: 159.6 Date: Sat, 04 Feb 2017 Prob (F-statistic): 1.58e-24 Time: 14:21:20 Log-Likelihood: -172.12 No. Observations: 62 AIC: 350.2 Df Residuals: 59 BIC: 356.6 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 555.7168 33.800 16.441 0.000 488.083 623.350 sex[T.M] -9.7980 1.013 -9.673 0.000 -11.825 -7.771 year -0.2515 0.017 -14.516 0.000 -0.286 -0.217 ============================================================================== Omnibus: 52.546 Durbin-Watson: 0.375 Prob(Omnibus): 0.000 Jarque-Bera (JB): 241.626 Skew: 2.430 Prob(JB): 3.40e-53 Kurtosis: 11.362 Cond. No. 1.30e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.3e+05. This might indicate that there are strong multicollinearity or other numerical problems.
print(anova_lm(model2))
df sum_sq mean_sq F PR(>F) sex 1.0 1720.655232 1720.655232 108.479881 5.475511e-15 year 1.0 3342.177104 3342.177104 210.709831 3.935386e-21 Residual 59.0 935.829374 15.861515 NaN NaN
C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal cond2 = cond0 & (x <= self.a)
model3 = ols("time ~ sex * year", data).fit() # two factors with interaction
print(model3.summary())
OLS Regression Results ============================================================================== Dep. Variable: time R-squared: 0.893 Model: OLS Adj. R-squared: 0.888 Method: Least Squares F-statistic: 162.1 Date: Sat, 04 Feb 2017 Prob (F-statistic): 3.67e-28 Time: 14:21:21 Log-Likelihood: -160.30 No. Observations: 62 AIC: 328.6 Df Residuals: 58 BIC: 337.1 Df Model: 3 Covariance Type: nonrobust ================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------- Intercept 697.3012 39.221 17.779 0.000 618.791 775.811 sex[T.M] -302.4638 56.412 -5.362 0.000 -415.384 -189.544 year -0.3240 0.020 -16.118 0.000 -0.364 -0.284 sex[T.M]:year 0.1499 0.029 5.189 0.000 0.092 0.208 ============================================================================== Omnibus: 49.919 Durbin-Watson: 0.575 Prob(Omnibus): 0.000 Jarque-Bera (JB): 243.008 Skew: 2.224 Prob(JB): 1.70e-53 Kurtosis: 11.619 Cond. No. 3.40e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 3.4e+05. This might indicate that there are strong multicollinearity or other numerical problems.
print(anova_lm(model3))
df sum_sq mean_sq F PR(>F) sex 1.0 1720.655232 1720.655232 156.140793 4.299569e-18 year 1.0 3342.177104 3342.177104 303.285733 1.039245e-24 sex:year 1.0 296.675432 296.675432 26.921801 2.826421e-06 Residual 58.0 639.153942 11.019896 NaN NaN
C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:875: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) C:\Programs\WinPython-64bit-3.6.0.1Qt5\python-3.6.0.amd64\lib\site-packages\scipy\stats\_distn_infrastructure.py:1814: RuntimeWarning: invalid value encountered in less_equal cond2 = cond0 & (x <= self.a)
Here we define the model directly through the design matrix. Similar to MATLAB's "regress" command.
# Generate the data
t = np.arange(0,10,0.1)
y = 4 + 3*t + 2*t**2 + 5*np.random.randn(len(t))
# Make the fit. Note that this is another "OLS" than the one in "model_formulas"!
M = np.column_stack((np.ones(len(t)), t, t**2))
res = sm.OLS(y, M).fit()
# Display the results
print('Summary:')
print(res.summary())
Summary: OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.993 Model: OLS Adj. R-squared: 0.993 Method: Least Squares F-statistic: 7046. Date: Sat, 04 Feb 2017 Prob (F-statistic): 9.76e-106 Time: 14:21:21 Log-Likelihood: -313.30 No. Observations: 100 AIC: 632.6 Df Residuals: 97 BIC: 640.4 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 3.7463 1.658 2.260 0.026 0.456 7.036 x1 3.1027 0.774 4.009 0.000 1.567 4.639 x2 1.9708 0.076 26.055 0.000 1.821 2.121 ============================================================================== Omnibus: 1.436 Durbin-Watson: 1.845 Prob(Omnibus): 0.488 Jarque-Bera (JB): 1.189 Skew: -0.043 Prob(JB): 0.552 Kurtosis: 2.473 Cond. No. 142. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print('The fit parameters are: {0}'.format(str(res.params)))
print('The confidence intervals are:')
print(res.conf_int())
The fit parameters are: [ 3.74631449 3.10268496 1.97082745] The confidence intervals are: [[ 0.45628815 7.03634082] [ 1.56675671 4.6386132 ] [ 1.82070297 2.12095193]]