import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats
np.set_printoptions(precision=4, suppress=True)
# pandas.set_printoptions() was deprecated and removed long ago;
# the supported API is pandas.set_option with 'display.*' keys.
pandas.set_option("display.notebook_repr_html", False)
pandas.set_option("display.precision", 4)
pandas.set_option("display.max_columns", 12)
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot, qqplot
from statsmodels.stats.api import anova_lm
# Load the Stanford stats191 salary data, preferring a local cached copy.
try:
    salary_table = pandas.read_csv('salary.table')
except OSError:  # no local copy yet -- fetch it and cache for next time
    url = 'http://stats191.stanford.edu/data/salary.table'
    salary_table = pandas.read_table(url)  # whitespace-delimited source file
    salary_table.to_csv('salary.table', index=False)

print(salary_table.head(10))
S X E M 0 13876 1 1 1 1 11608 1 3 0 2 18701 1 3 1 3 11283 1 2 0 4 11767 1 3 0 5 20872 2 2 1 6 11772 2 2 0 7 10535 2 1 0 8 12195 2 3 0 9 12313 3 2 0
# Pull out the four columns as named Series for convenient reuse below.
E = salary_table['E']  # education code (1-3, per educ_label below)
M = salary_table['M']  # management indicator (0/1)
X = salary_table['X']  # years of experience
S = salary_table['S']  # annual salary
# Scatter salary vs. experience, one marker/colour per (education, management)
# cell, so the additive structure of the data is visible.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary',
                     xlim=(0, 27), ylim=(9600, 28800))
symbols = ['D', '^']                               # marker per management level
man_label = ["Non-Mgmt", "Mgmt"]
educ_label = ["Bachelors", "Masters", "PhD"]
colors = ['r', 'g', 'blue']                        # colour per education level
factor_groups = salary_table.groupby(['E', 'M'])
for (edu, mgmt), grp in factor_groups:
    ax.scatter(grp['X'], grp['S'],
               marker=symbols[mgmt],
               color=colors[edu - 1],
               s=350,
               label="%s - %s" % (man_label[mgmt], educ_label[edu - 1]))
ax.legend(scatterpoints=1, markerscale=.7, labelspacing=1);
Fit a linear model
$$S_i = \beta_0 + \beta_1X_i + \beta_2E_{i2} + \beta_3E_{i3} + \beta_4M_i + \epsilon_i$$where
$$E_{i2}=\begin{cases}1,&\text{if } E_i=2;\\ 0,&\text{otherwise.}\end{cases}$$$$E_{i3}=\begin{cases}1,&\text{if } E_i=3;\\ 0,&\text{otherwise.}\end{cases}$$
# Fit the additive model: education and management as categorical factors,
# experience as a continuous regressor.
formula = 'S ~ C(E) + C(M) + X'
lm = ols(formula, salary_table).fit()
print(lm.summary())
OLS Regression Results ============================================================================== Dep. Variable: S R-squared: 0.957 Model: OLS Adj. R-squared: 0.953 Method: Least Squares F-statistic: 226.8 Date: Sat, 01 Dec 2012 Prob (F-statistic): 2.23e-27 Time: 12:28:58 Log-Likelihood: -381.63 No. Observations: 46 AIC: 773.3 Df Residuals: 41 BIC: 782.4 Df Model: 4 ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 8035.5976 386.689 20.781 0.000 7254.663 8816.532 C(E)[T.2] 3144.0352 361.968 8.686 0.000 2413.025 3875.045 C(E)[T.3] 2996.2103 411.753 7.277 0.000 2164.659 3827.762 C(M)[T.1] 6883.5310 313.919 21.928 0.000 6249.559 7517.503 X 546.1840 30.519 17.896 0.000 484.549 607.819 ============================================================================== Omnibus: 2.293 Durbin-Watson: 2.237 Prob(Omnibus): 0.318 Jarque-Bera (JB): 1.362 Skew: -0.077 Prob(JB): 0.506 Kurtosis: 2.171 Cond. No. 33.5 ==============================================================================
# Design matrix patsy built from the formula. A bare expression only echoes
# inside a notebook, so print() it explicitly for script use.
print(lm.model.exog)
array([[ 1., 0., 0., 1., 1.], [ 1., 0., 1., 0., 1.], [ 1., 0., 1., 1., 1.], [ 1., 1., 0., 0., 1.], [ 1., 0., 1., 0., 1.], [ 1., 1., 0., 1., 2.], [ 1., 1., 0., 0., 2.], [ 1., 0., 0., 0., 2.], [ 1., 0., 1., 0., 2.], [ 1., 1., 0., 0., 3.], [ 1., 0., 0., 1., 3.], [ 1., 1., 0., 1., 3.], [ 1., 0., 1., 1., 3.], [ 1., 0., 0., 0., 4.], [ 1., 0., 1., 1., 4.], [ 1., 0., 1., 0., 4.], [ 1., 1., 0., 0., 4.], [ 1., 1., 0., 0., 5.], [ 1., 0., 1., 0., 5.], [ 1., 0., 0., 1., 5.], [ 1., 0., 0., 0., 6.], [ 1., 0., 1., 1., 6.], [ 1., 1., 0., 0., 6.], [ 1., 1., 0., 1., 6.], [ 1., 0., 0., 1., 7.], [ 1., 1., 0., 0., 8.], [ 1., 0., 0., 1., 8.], [ 1., 0., 1., 1., 8.], [ 1., 0., 0., 0., 8.], [ 1., 0., 0., 0., 10.], [ 1., 1., 0., 0., 10.], [ 1., 0., 1., 1., 10.], [ 1., 1., 0., 1., 10.], [ 1., 1., 0., 1., 11.], [ 1., 0., 0., 0., 11.], [ 1., 1., 0., 0., 12.], [ 1., 0., 1., 1., 12.], [ 1., 0., 0., 0., 13.], [ 1., 1., 0., 1., 13.], [ 1., 1., 0., 0., 14.], [ 1., 0., 1., 1., 15.], [ 1., 1., 0., 1., 16.], [ 1., 1., 0., 0., 16.], [ 1., 0., 0., 0., 16.], [ 1., 1., 0., 0., 17.], [ 1., 0., 0., 0., 20.]])
# The design DataFrame moved from the private model._data._orig_exog to the
# public model.data.orig_exog in newer statsmodels; try old, fall back to new.
try:
    print(lm.model._data._orig_exog.head(10))
except Exception:
    print(lm.model.data.orig_exog.head(10))
Intercept C(E)[T.2] C(E)[T.3] C(M)[T.1] X 0 1 0 0 1 1 1 1 0 1 0 1 2 1 0 1 1 1 3 1 1 0 0 1 4 1 0 1 0 1 5 1 1 0 1 2 6 1 1 0 0 2 7 1 0 0 0 2 8 1 0 1 0 2 9 1 1 0 0 3
# Same old-vs-new statsmodels attribute dance for the original data frame.
try:
    print(lm.model._data.frame.head(10))
except Exception:
    print(lm.model.data.frame.head(10))
S X E M 0 13876 1 1 1 1 11608 1 3 0 2 18701 1 3 1 3 11283 1 2 0 4 11767 1 3 0 5 20872 2 2 1 6 11772 2 2 0 7 10535 2 1 0 8 12195 2 3 0 9 12313 3 2 0
# Predicted salary for a manager (M=1) with a Masters (E=2) and 12 years of
# experience; print() so the script shows the value outside a notebook.
print(lm.predict({'X' : [12], 'M' : [1], 'E' : [2]}))
array([ 24617.3721])
# Plot the additive-model residuals, one x-axis slot per (E, M) cell, to check
# for structure the model missed.
resid = lm.resid
fig = plt.figure(figsize=(12, 8))
xticks = []
ax = fig.add_subplot(111, xlabel='Group (E, M)', ylabel='Residuals')
for (edu, mgmt), grp in factor_groups:
    xticks.append(str((edu, mgmt)))
    slot = edu * 2 + mgmt - 1  # maps the six (E, M) cells onto positions 1..6
    ax.scatter([slot] * len(grp), resid[grp.index],
               marker=symbols[mgmt], color=colors[edu - 1],
               s=144, edgecolors='black')
ax.set_xticks([1, 2, 3, 4, 5, 6])
ax.set_xticklabels(xticks)
ax.axis('tight');
Add an interaction between education and experience, allowing a different slope in experience for each level of education.
$$S_i = \beta_0+\beta_1X_i+\beta_2E_{i2}+\beta_3E_{i3}+\beta_4M_i+\beta_5E_{i2}X_i+\beta_6E_{i3}X_i+\epsilon_i$$
interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
print(interX_lm.summary())
OLS Regression Results ============================================================================== Dep. Variable: S R-squared: 0.961 Model: OLS Adj. R-squared: 0.955 Method: Least Squares F-statistic: 158.6 Date: Sat, 01 Dec 2012 Prob (F-statistic): 8.23e-26 Time: 12:28:59 Log-Likelihood: -379.47 No. Observations: 46 AIC: 772.9 Df Residuals: 39 BIC: 785.7 Df Model: 6 =============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------- Intercept 7256.2800 549.494 13.205 0.000 6144.824 8367.736 C(E)[T.2] 4172.5045 674.966 6.182 0.000 2807.256 5537.753 C(E)[T.3] 3946.3649 686.693 5.747 0.000 2557.396 5335.333 C(M)[T.1] 7102.4539 333.442 21.300 0.000 6428.005 7776.903 X 632.2878 53.185 11.888 0.000 524.710 739.865 C(E)[T.2]:X -125.5147 69.863 -1.797 0.080 -266.826 15.796 C(E)[T.3]:X -141.2741 89.281 -1.582 0.122 -321.861 39.313 ============================================================================== Omnibus: 0.432 Durbin-Watson: 2.179 Prob(Omnibus): 0.806 Jarque-Bera (JB): 0.590 Skew: 0.144 Prob(JB): 0.744 Kurtosis: 2.526 Cond. No. 69.7 ==============================================================================
Test that $\beta_5 = \beta_6 = 0$. We can use anova_lm or we can use an F-test.
# Nested-model ANOVA: does the E:X interaction improve on the additive fit?
print(anova_lm(lm, interX_lm))
df_resid ssr df_diff ss_diff F Pr(>F) 0 41 43280719.493 0 NaN NaN NaN 1 39 39410679.808 2 3870039.685 1.915 0.161
# Equivalent F-test stated directly on the interaction coefficients.
print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))
<F test: F=array([[ 1.9149]]), p=[[ 0.161]], df_denom=39, df_num=2>
# Fit the model with an education x management interaction instead.
interM_lm = ols('S ~ X + C(E)*C(M)', salary_table).fit()
print(interM_lm.summary())
OLS Regression Results ============================================================================== Dep. Variable: S R-squared: 0.999 Model: OLS Adj. R-squared: 0.999 Method: Least Squares F-statistic: 5517. Date: Sat, 01 Dec 2012 Prob (F-statistic): 1.67e-55 Time: 12:29:00 Log-Likelihood: -298.74 No. Observations: 46 AIC: 611.5 Df Residuals: 39 BIC: 624.3 Df Model: 6 ======================================================================================= coef std err t P>|t| [95.0% Conf. Int.] --------------------------------------------------------------------------------------- Intercept 9472.6854 80.344 117.902 0.000 9310.175 9635.196 C(E)[T.2] 1381.6706 77.319 17.870 0.000 1225.279 1538.063 C(E)[T.3] 1730.7483 105.334 16.431 0.000 1517.690 1943.806 C(M)[T.1] 3981.3769 101.175 39.351 0.000 3776.732 4186.022 C(E)[T.2]:C(M)[T.1] 4902.5231 131.359 37.322 0.000 4636.825 5168.222 C(E)[T.3]:C(M)[T.1] 3066.0351 149.330 20.532 0.000 2763.986 3368.084 X 496.9870 5.566 89.283 0.000 485.728 508.246 ============================================================================== Omnibus: 74.761 Durbin-Watson: 2.244 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1037.873 Skew: -4.103 Prob(JB): 4.25e-226 Kurtosis: 24.776 Cond. No. 79.0 ==============================================================================
# Compare the E:M interaction model against the additive model.
print(anova_lm(lm, interM_lm))
df_resid ssr df_diff ss_diff F Pr(>F) 0 41 43280719.493 0 NaN NaN NaN 1 39 1178167.865 2 42102551.628 696.844 3.026e-31
# Internally studentized residuals vs. experience, coloured by group, to spot
# influential observations.
infl = interM_lm.get_influence()
resid = infl.resid_studentized_internal
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X', ylabel='standardized resids')
for (edu, mgmt), grp in factor_groups:
    members = grp.index
    ax.scatter(X[members], resid[members],
               marker=symbols[mgmt], color=colors[edu - 1],
               s=144, edgecolors='black')
ax.axis('tight');
# Outlier test with Benjamini/Hochberg FDR correction.
# DataFrame.sort() was removed from pandas (0.20); sort_values is the
# supported replacement.
outl = interM_lm.outlier_test('fdr_bh')
outl.sort_values('unadj_p', inplace=True)
print(outl)
student_resid unadj_p fdr_bh(p) 32 -14.951 1.677e-17 7.714e-16 33 1.288 2.055e-01 9.599e-01 23 1.023 3.127e-01 9.599e-01 41 0.942 3.520e-01 9.599e-01 11 0.897 3.756e-01 9.599e-01 5 0.891 3.787e-01 9.599e-01 39 0.835 4.089e-01 9.599e-01 38 0.819 4.178e-01 9.599e-01 21 0.730 4.700e-01 9.599e-01 20 -0.726 4.722e-01 9.599e-01 30 0.704 4.854e-01 9.599e-01 28 0.603 5.500e-01 9.599e-01 44 -0.600 5.523e-01 9.599e-01 1 -0.590 5.584e-01 9.599e-01 17 -0.565 5.755e-01 9.599e-01 0 -0.482 6.322e-01 9.599e-01 34 -0.475 6.377e-01 9.599e-01 6 -0.464 6.453e-01 9.599e-01 7 0.429 6.705e-01 9.599e-01 45 -0.426 6.722e-01 9.599e-01 4 0.424 6.736e-01 9.599e-01 3 -0.419 6.779e-01 9.599e-01 35 0.382 7.043e-01 9.599e-01 43 0.361 7.203e-01 9.599e-01 12 0.358 7.224e-01 9.599e-01 37 0.343 7.338e-01 9.599e-01 2 -0.292 7.721e-01 9.599e-01 24 0.287 7.756e-01 9.599e-01 31 -0.285 7.771e-01 9.599e-01 36 -0.275 7.848e-01 9.599e-01 13 -0.269 7.896e-01 9.599e-01 27 -0.260 7.964e-01 9.599e-01 15 0.252 8.023e-01 9.599e-01 16 0.250 8.040e-01 9.599e-01 42 0.196 8.458e-01 9.599e-01 9 -0.195 8.466e-01 9.599e-01 10 0.191 8.497e-01 9.599e-01 26 -0.166 8.691e-01 9.599e-01 19 0.165 8.697e-01 9.599e-01 25 -0.162 8.724e-01 9.599e-01 14 0.148 8.831e-01 9.599e-01 29 0.147 8.837e-01 9.599e-01 40 -0.130 8.973e-01 9.599e-01 18 -0.072 9.426e-01 9.855e-01 22 0.016 9.872e-01 9.879e-01 8 -0.015 9.879e-01 9.879e-01
# Observation 32 stands out as the lone significant outlier above; drop it.
salary_table_clean = salary_table.drop(32)
print(salary_table_clean)
S X E M 0 13876 1 1 1 1 11608 1 3 0 2 18701 1 3 1 3 11283 1 2 0 4 11767 1 3 0 5 20872 2 2 1 6 11772 2 2 0 7 10535 2 1 0 8 12195 2 3 0 9 12313 3 2 0 10 14975 3 1 1 11 21371 3 2 1 12 19800 3 3 1 13 11417 4 1 0 14 20263 4 3 1 15 13231 4 3 0 16 12884 4 2 0 17 13245 5 2 0 18 13677 5 3 0 19 15965 5 1 1 20 12336 6 1 0 21 21352 6 3 1 22 13839 6 2 0 23 22884 6 2 1 24 16978 7 1 1 25 14803 8 2 0 26 17404 8 1 1 27 22184 8 3 1 28 13548 8 1 0 29 14467 10 1 0 30 15942 10 2 0 31 23174 10 3 1 33 25410 11 2 1 34 14861 11 1 0 35 16882 12 2 0 36 24170 12 3 1 37 15990 13 1 0 38 26330 13 2 1 39 17949 14 2 0 40 25685 15 3 1 41 27837 16 2 1 42 18838 16 2 0 43 17483 16 1 0 44 19207 17 2 0 45 19346 20 1 0
# Refit the interaction model with observation 32 excluded via `subset`,
# then re-examine the standardized residuals.
idx = salary_table.index.drop(32)
interM_lm32 = ols('S ~ X + C(E) * C(M)', salary_table, subset=idx).fit()
resid = interM_lm32.get_influence().summary_frame()['standard_resid']
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X[~[32]]', ylabel='standardized resids')
for (edu, mgmt), grp in factor_groups:
    # NOTE: re-binds `idx` inside the loop, shadowing the subset index above,
    # exactly as the original code did.
    idx = grp.index
    ax.scatter(X[idx], resid[idx],
               marker=symbols[mgmt], color=colors[edu - 1],
               s=144, edgecolors='black')
ax.axis('tight');
# Final model: interaction fit on the data with the outlier removed, then
# overlay the per-group fitted lines on the raw scatter.
lm_final = ols('S ~ X + C(E)*C(M)', salary_table.drop([32])).fit()
try:
    # older statsmodels kept the design frame on a private attribute
    mf = lm_final.model._data._orig_exog
except Exception:
    mf = lm_final.model.data.orig_exog
lstyle = ['-','--']  # line style keyed on the management indicator
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary')
for values, group in factor_groups:
    i,j = values  # i: education code (1-3), j: management indicator (0/1)
    idx = group.index
    ax.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')
    # drop NA because there is no idx 32 in the final model
    ax.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(),
            ls=lstyle[j], color=colors[i-1])
ax.axis('tight');
# Remove the estimated experience effect so the interaction plot shows
# salary adjusted for X across education and management levels.
slope_x = interM_lm32.params['X']
U = S - X * slope_x
U.name = 'Salary|X'
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = interaction_plot(E, M, U,
                      colors=['red','blue'], markers=['^','D'],
                      markersize=10, ax=ax)