# IPython magic: pulls numpy and matplotlib into the interactive namespace
# with the inline plotting backend. Notebook-transcript line — not valid
# plain-Python syntax outside IPython.
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
import os
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pandas import *
# formulas.api lets us use patsy formulas.
from statsmodels.formula.api import ols
# Load the 10,000-row heights/weights sample (columns used below:
# Height, Weight). `read_csv` comes from the `from pandas import *` above.
heights_weights = read_csv('data/01_heights_weights_genders.csv')
Using patsy formulas.
# Simple linear regression Weight ~ Height via the Patsy formula API.
# NOTE(review): the statsmodels formula API names this keyword `data=`
# (the `df=` spelling is not the documented signature) — adjusted; confirm
# against the statsmodels version in use.
fit_reg = ols(formula='Weight ~ Height', data=heights_weights).fit()
# print(x) with a single argument works identically on Python 2 and 3.
print(fit_reg.summary())
OLS Regression Results ============================================================================== Dep. Variable: Weight R-squared: 0.855 Model: OLS Adj. R-squared: 0.855 Method: Least Squares F-statistic: 5.904e+04 Date: Sat, 24 Nov 2012 Prob (F-statistic): 0.00 Time: 09:50:14 Log-Likelihood: -39219. No. Observations: 10000 AIC: 7.844e+04 Df Residuals: 9998 BIC: 7.846e+04 Df Model: 1 ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept -350.7372 2.111 -166.109 0.000 -354.876 -346.598 Height 7.7173 0.032 242.975 0.000 7.655 7.780 ============================================================================== Omnibus: 2.141 Durbin-Watson: 1.677 Prob(Omnibus): 0.343 Jarque-Bera (JB): 2.150 Skew: 0.036 Prob(JB): 0.341 Kurtosis: 2.991 Cond. No. 1.15e+03 ============================================================================== The condition number is large, 1.15e+03. This might indicate that there are strong multicollinearity or other numerical problems.
Note that we get the condition number warning. This is really just a scale issue (with only one explanatory variable, a multicollinearity warning doesn't make sense). If we run the log-log version, the warning goes away. Note that we can specify the log transforms directly within the Patsy/statsmodels formula.
# Log-log specification. Transforms can be written directly inside the
# Patsy formula (np.log(...)), so no pre-transformed columns are needed.
# NOTE(review): keyword renamed df= -> data= to match the statsmodels
# formula-API signature.
fit_reg_log = ols(formula='np.log(Weight) ~ np.log(Height)',
                  data=heights_weights).fit()
print(fit_reg_log.summary())
OLS Regression Results ============================================================================== Dep. Variable: np.log(Weight) R-squared: 0.848 Model: OLS Adj. R-squared: 0.848 Method: Least Squares F-statistic: 5.576e+04 Date: Sat, 24 Nov 2012 Prob (F-statistic): 0.00 Time: 09:50:17 Log-Likelihood: 11038. No. Observations: 10000 AIC: -2.207e+04 Df Residuals: 9998 BIC: -2.206e+04 Df Model: 1 ================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ---------------------------------------------------------------------------------- Intercept -8.6155 0.058 -148.708 0.000 -8.729 -8.502 np.log(Height) 3.2619 0.014 236.129 0.000 3.235 3.289 ============================================================================== Omnibus: 38.178 Durbin-Watson: 1.713 Prob(Omnibus): 0.000 Jarque-Bera (JB): 42.749 Skew: -0.105 Prob(JB): 5.21e-10 Kurtosis: 3.241 Cond. No. 320. ==============================================================================
# Scatter the raw observations (faint white dots) and overlay the fitted
# regression line from fit_reg in black.
pred_weights = fit_reg.predict()
heights, weights = heights_weights['Height'], heights_weights['Weight']
plt.figure(figsize=(8, 8))
plt.xlabel('Height (in)')
plt.ylabel('Weight (lbs)')
plt.plot(heights, weights, '.', mfc='white', alpha=0.2)
plt.plot(heights, pred_weights, '-k')
[<matplotlib.lines.Line2D at 0x108fb8450>]
The authors refer to this exercise as "predicting" page views, but given that the main explanatory variable is the number of visitors, this isn't really a prediction model.
# Top-1,000 sites table, tab-separated. Columns used below: PageViews,
# UniqueVisitors, HasAdvertising, InEnglish.
top_1k_sites = read_csv('data/top_1000_sites.tsv', sep='\t')
# print(x) form is valid on both Python 2 and 3 (the bare `print x`
# statement is Python 2 only).
print(top_1k_sites.head())
Rank Site Category UniqueVisitors Reach PageViews HasAdvertising InEnglish TLD 0 1 facebook.com Social Networks 880000000 47.2 910000000000 True Yes com 1 2 youtube.com Online Video 800000000 42.7 100000000000 True Yes com 2 3 yahoo.com Web Portals 660000000 35.3 77000000000 True Yes com 3 4 live.com Search Engines 550000000 29.3 36000000000 True Yes com 4 5 wikipedia.org Dictionaries & Encyclopedias 490000000 26.2 7000000000 False Yes org
# Kernel density estimate of log page views, to motivate the log transform.
# NOTE(review): `sm.nonparametric.KDE` was renamed `KDEUnivariate` in later
# statsmodels releases; use the old name if running a pre-rename version.
kde_model = sm.nonparametric.KDEUnivariate(np.log(top_1k_sites['PageViews'].values))
kde_model.fit()
plt.plot(kde_model.support, kde_model.density)
plt.xlabel('Page Views (log)')
<matplotlib.text.Text at 0x108f89890>
# Log-log regression of page views on unique visitors.
# NOTE(review): keyword renamed df= -> data= to match the statsmodels
# formula-API signature.
pageview_fit = ols('np.log(PageViews) ~ np.log(UniqueVisitors)',
                   data=top_1k_sites).fit()
print(pageview_fit.summary())
OLS Regression Results ============================================================================== Dep. Variable: np.log(PageViews) R-squared: 0.462 Model: OLS Adj. R-squared: 0.461 Method: Least Squares F-statistic: 855.6 Date: Sat, 24 Nov 2012 Prob (F-statistic): 2.46e-136 Time: 09:50:22 Log-Likelihood: -1498.3 No. Observations: 1000 AIC: 3001. Df Residuals: 998 BIC: 3010. Df Model: 1 ========================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------------------ Intercept -2.8344 0.752 -3.769 0.000 -4.310 -1.359 np.log(UniqueVisitors) 1.3363 0.046 29.251 0.000 1.247 1.426 ============================================================================== Omnibus: 68.454 Durbin-Watson: 2.068 Prob(Omnibus): 0.000 Jarque-Bera (JB): 85.893 Skew: 0.615 Prob(JB): 2.23e-19 Kurtosis: 3.742 Cond. No. 363. ==============================================================================
# Log-log scatter of unique visitors vs. page views, with the fitted
# regression line from pageview_fit overlaid in black.
log_visitors = np.log(top_1k_sites['UniqueVisitors'])
log_views = np.log(top_1k_sites['PageViews'])
plt.figure(figsize=(8, 8))
plt.plot(log_visitors, log_views, '.', mfc='white')
plt.plot(log_visitors, pageview_fit.predict(), '-k')
plt.xlabel('Unique Visitors (log)')
plt.ylabel('Page Views (log)')
<matplotlib.text.Text at 0x108e2e290>
# Multiple regression: Patsy automatically dummy-codes the categorical
# HasAdvertising and InEnglish columns (the [T.x] terms in the summary),
# so no manual 0/1 columns are needed.
model = 'np.log(PageViews) ~ np.log(UniqueVisitors) + HasAdvertising + InEnglish'
pageview_fit_multi = ols(model, data=top_1k_sites).fit()
print(pageview_fit_multi.summary())
OLS Regression Results ============================================================================== Dep. Variable: np.log(PageViews) R-squared: 0.480 Model: OLS Adj. R-squared: 0.478 Method: Least Squares F-statistic: 229.4 Date: Sat, 24 Nov 2012 Prob (F-statistic): 1.52e-139 Time: 09:50:25 Log-Likelihood: -1481.1 No. Observations: 1000 AIC: 2972. Df Residuals: 995 BIC: 2997. Df Model: 4 ========================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------------------ Intercept -1.9450 1.148 -1.695 0.090 -4.197 0.307 HasAdvertising[T.True] 0.3060 0.092 3.336 0.001 0.126 0.486 InEnglish[T.No] 0.8347 0.209 4.001 0.000 0.425 1.244 InEnglish[T.Yes] -0.1691 0.204 -0.828 0.408 -0.570 0.232 np.log(UniqueVisitors) 1.2651 0.071 17.936 0.000 1.127 1.403 ============================================================================== Omnibus: 73.424 Durbin-Watson: 2.068 Prob(Omnibus): 0.000 Jarque-Bera (JB): 92.632 Skew: 0.646 Prob(JB): 7.68e-21 Kurtosis: 3.744 Cond. No. 570. ==============================================================================
As a comparison, here's the non-Patsy code for the above regression. The Patsy version is a clear improvement.
# Kept for reference (deliberately commented out): the same regression
# without Patsy formulas — log-transforming and dummy-coding the columns
# by hand, then building the design matrix for sm.OLS directly.
#top_1k_sites['LogUniqueVisitors'] = np.log(top_1k_sites['UniqueVisitors'])
#top_1k_sites['HasAdvertisingYes'] = np.where(top_1k_sites['HasAdvertising'] == 'Yes', 1, 0)
#top_1k_sites['InEnglishYes'] = np.where(top_1k_sites['InEnglish'] == 'Yes', 1, 0)
#top_1k_sites['InEnglishNo'] = np.where(top_1k_sites['InEnglish'] == 'No', 1, 0)
#linreg_fit = sm.OLS(np.log(top_1k_sites['PageViews']),
#                    sm.add_constant(top_1k_sites[['HasAdvertisingYes', 'LogUniqueVisitors',
#                                                  'InEnglishNo', 'InEnglishYes']],
#                                    prepend = True)).fit()
#linreg_fit.summary()