%pylab inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext rmagic

def scale_design_mtx(X):
    """Utility to scale the design matrix for display"""
    mi, ma = X.min(axis=0), X.max(axis=0)
    # Columns that are not constant; avoids dividing by a zero range
    col_neq = (ma - mi) > 1.e-8
    Xs = np.ones_like(X)
    mi = mi[col_neq]
    ma = ma[col_neq]
    Xs[:, col_neq] = (X[:, col_neq] - mi) / (ma - mi)
    return Xs

def show_x(X, title='Design matrix'):
    """Utility to show a design matrix"""
    N, P = X.shape
    plt.imshow(scale_design_mtx(X), interpolation='nearest', cmap=plt.cm.gray)
    plt.title(title)
    # Only integer ticks for the design matrix columns
    ax = plt.gca()
    ticks = ax.get_xticks()
    ax.set_xticks(ticks[(ticks == np.round(ticks)) & (ticks >= 0)])
    ax.set_xlim(-0.5, P - 0.5)

show_x(np.random.normal(size=(10, 3)), 'My design')

N = 15
x1 = [49., 50, 45, 53, 69, 73, 35, 40, 36, 30, 49, 36, 73, 61, 74]
Y = [132.03, 98.64, 146.24, 97.9, 145.12, 191.78, 155.34, 123.08,
     140.87, 122.51, 146.08, 89.3, 134.52, 89.16, 136.08]

%%R -i x1,Y -o X
XYfit = lm(Y ~ x1)
X = model.matrix(XYfit)
X

show_x(X, 'Simple linear regression')

%R model.matrix(~ x1)

%R model.matrix(~x1 + 1)

C = np.ones((N,))
%Rpush C
%R model.matrix(~x1 + C)

%R model.matrix(~x1 - 1)

x2 = np.arange(N)
%Rpush x2
X = %R model.matrix(~ x1 + x2)
show_x(X, 'Two variables')

X = %R model.matrix(~ x2 + x1)
show_x(X, 'Same two variables')

X = %R model.matrix(~x1 + x2 + x1:x2)
show_x(X, "Two variables and their interaction")

X[:, 3] == x1 * x2

X2 = %R model.matrix(~ x1:x2 + x1 + x2)
if np.all(X == X2):
    print "Designs are the same"

%R print(coef(lm(Y ~ x1:x2 + x1 + x2)))

x3 = np.random.normal(size=(N,))
x4 = np.random.normal(size=(N,))
%Rpush x3 x4
%R print(coef(lm(Y ~ x1 + x2 + x1:x2 + x3 + x4)))

nationality = ['USA'] * 5 + ['UK'] * 5 + ['France'] * 5
%Rpush nationality
%R print(model.matrix(~ nationality))

%%R
nationality = factor(nationality)
print(levels(nationality))
print(model.matrix(~ nationality))

%R print(model.matrix(~ C(nationality, helmert)))

%R print(model.matrix(~ C(nationality, c(2, -1, -1))))

nationality = np.array(nationality)
indicate_usa = nationality == 'USA'
indicate_uk = nationality == 'UK'
indicate_france = nationality == 'France'
X = np.column_stack((indicate_usa, indicate_uk, indicate_france)).astype(int)
X

W = np.column_stack((np.ones(N), X))
W

Z = %R model.matrix(~ nationality)
Z

C = np.array([[0, 0, 1], [0, 1, 0], [1, -1, -1]]).T
np.dot(Z, C)

np.all(np.dot(X, np.linalg.inv(C)) == Z)

X_indicator = %R model.matrix(~ nationality-1)
X_contrast = %R model.matrix(~ nationality)
C_map = %R contr.treatment(3)  # Three levels for nationality
np.all(np.dot(X_indicator, C_map) == X_contrast[:, 1:])

C_map_with_inter = np.column_stack(([1, 1, 1], C_map))
np.all(np.dot(X_indicator, C_map_with_inter) == X_contrast)

Z = %R model.matrix(~ C(nationality, helmert))
C = np.array([[1./3, -0.5, -1./6],
              [1./3, 0.5, -1./6],
              [1./3, 0, 1./3]]).T
np.set_printoptions(suppress=True)  # Suppress very small floating point values
np.dot(Z, C)  # Same as X_indicator

%R model.matrix(~ nationality-1)

awesome = np.array(["Yes", "No"] * 7 + ["Yes"])
awesome

%%R -i awesome
awesome = factor(awesome)
print(model.matrix(~ nationality + awesome))

indicator_columns = []
for country in 'USA', 'UK', 'France':
    for quality in 'Yes', 'No':
        indicator_columns.append((nationality == country) & (awesome == quality))
X = np.column_stack(indicator_columns).astype(int)
X

show_x(X, 'Interaction of nationality and awesome')
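# A minimal vectorized sketch of the same interaction matrix (our own
# construction, not R's): broadcasting each label vector against its levels
# gives the one-hot indicator matrices, and an outer product of columns
# gives every pairwise interaction. The *_levels and *_onehot names are
# made up here, and we assume the same level ordering as the loop above.
nat_levels = np.array(['USA', 'UK', 'France'])
awe_levels = np.array(['Yes', 'No'])
nat_onehot = (nationality[:, None] == nat_levels).astype(int)  # N x 3
awe_onehot = (awesome[:, None] == awe_levels).astype(int)  # N x 2
X_vec = (nat_onehot[:, :, None] * awe_onehot[:, None, :]).reshape(N, -1)
print np.all(X_vec == X)  # Same matrix as the nested-loop version above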
nat_cols = []
for country in 'USA', 'UK', 'France':
    nat_cols.append(nationality == country)
awe_cols = []
for quality in 'Yes', 'No':
    awe_cols.append(awesome == quality)
all_cols = []
for nat_col in nat_cols:
    for awe_col in awe_cols:
        all_cols.append(nat_col * awe_col)
X_again = np.column_stack(all_cols).astype(int)
show_x(X_again, 'Interaction by product of indicator columns')

X_r = %R model.matrix(~ awesome:nationality-1)
show_x(X_r, 'R interaction using indicator coding')

X_nations = %R model.matrix(~ nationality-1)
X_nations[:, 0] == X_r[:, 0] + X_r[:, 1]

X_awesome = %R model.matrix(~ awesome-1)
X_awesome[:, 0] == np.sum(X_r[:, (0, 2, 4)], axis=1)

%%R
fit_with_intercept = lm(Y ~ nationality)
print(model.matrix(fit_with_intercept))

%R print(summary(fit_with_intercept))

%%R
fit_no_intercept = lm(Y ~ nationality - 1)
print(summary(fit_no_intercept))

%R sum(abs(fit_no_intercept$residuals - fit_with_intercept$residuals))

%R fit_main_inters = lm(Y ~ nationality + awesome + nationality:awesome)
X_main_inters = %R model.matrix(fit_main_inters)
print X_main_inters

names = %R colnames(model.matrix(fit_main_inters))
for i, name in enumerate(names):
    print i, name

print np.all(X_main_inters[:, 4] == X_main_inters[:, 1] * X_main_inters[:, 3])
print np.all(X_main_inters[:, 5] == X_main_inters[:, 2] * X_main_inters[:, 3])

%R print(summary(fit_main_inters))

%R fit_inters = lm(Y ~ nationality:awesome-1)
X_inters = %R model.matrix(fit_inters)
print X_inters

%R print(summary(fit_inters))

C = np.dot(np.linalg.pinv(X_inters), X_main_inters)
np.allclose(np.dot(X_inters, C), X_main_inters)

C_ind2treat = %R contr.treatment(3)  # 3 factor levels for nationality
C_N2W = np.column_stack(([1, 1, 1], C_ind2treat))  # Add intercept-generating column
X_N = %R model.matrix(fit_no_intercept)
X_W = %R model.matrix(fit_with_intercept)
print np.all(np.dot(X_N, C_N2W) == X_W)  # Confirm we have the C' that maps X_N to X_W
C_W2N = np.linalg.inv(C_N2W)  # The C we want
print np.all(np.dot(X_W, C_W2N) == X_N)  # Confirm it's the C we want

B_N = %R coef(fit_no_intercept)  # No-intercept coefficients
B_N = B_N.reshape(-1, 1)  # Make into a column vector for the dot product
B_I_est = np.dot(C_W2N, B_N)  # Estimate coefficients for the with-intercept model
B_I = %R coef(fit_with_intercept)
print "Coefficients from intercept model", B_I
print "Coefficients recreated from non-intercept model", B_I_est.T

%%R
fit_sequential = lm(Y ~ nationality + awesome + nationality:awesome)
print(anova(fit_sequential))

%R print(anova(lm(Y ~ nationality)))

(2179.8 / 2) / (3481.3 / 9)

%%R
print(anova(lm(Y ~ nationality + awesome)))
print(anova(lm(Y ~ awesome + nationality)))

%R print(anova(lm(Y ~ nationality:awesome)))

%%R
print(attributes(terms(~A * B))$term)
print(attributes(terms(~A * B * C))$term)
print(attributes(terms(~A %in% B))$term)
print(attributes(terms(~A / B))$term)
print(attributes(terms(~A + B - A))$term)
print(attributes(terms(~A + B + A:B - B))$term)
print(attributes(terms(~(A + B + C)^2))$term)
print(attributes(terms(~(A + B + C)^3))$term)

%%R
print(attributes(terms(~A:B + 1 + B))$term)  # In fact the intercept does not appear in the terms
print(attributes(terms(~D + A:B:C + A + B + E))$term)

%R print(model.matrix(~ x1 + x2 + x1:x2 - 1))

x1 * x2

X_nat = %R model.matrix(~ nationality - 1)  # indicator coding
X_awe = %R model.matrix(~ awesome - 1)  # indicator coding
X_both = %R model.matrix(~ nationality:awesome - 1)
print X_nat
print X_awe
print X_both

%R model.matrix(~ x1:nationality - 1)

%R model.matrix(~ 1 + nationality)

%R model.matrix(~ nationality - 1)

%R print(model.matrix(~ nationality + awesome + nationality:awesome - 1))

%R print(model.matrix(~ nationality + nationality:awesome - 1))
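# The last two designs look different from the pure indicator coding of the
# interaction, but they can still span the same column space. A sketch of a
# check (ours, not part of the R output), reusing the pinv trick from above;
# X_a, X_b and C_ab are names we have made up for this check.
X_a = %R model.matrix(~ nationality + nationality:awesome - 1)
X_b = %R model.matrix(~ nationality:awesome - 1)
C_ab = np.dot(np.linalg.pinv(X_b), X_a)  # Least-squares mapping from X_b to X_a
# Should print True if every column of X_a lies in the span of X_b
print np.allclose(np.dot(X_b, C_ab), X_a)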
%R model.matrix(~ x1:nationality + x1:awesome)

X_nat = %R model.matrix(~ nationality - 1)  # indicator coding
X_awe = %R model.matrix(~ awesome - 1)  # indicator coding
X = np.column_stack((X_nat, X_awe))  # Our design with indicator coding for both factors
# Do an estimation. This is ordinary least squares,
# but it doesn't matter how we get the estimate.
B = np.dot(np.linalg.pinv(X), Y)
Y_hat = np.dot(X, B)  # Fitted values
print "OLS coefficient estimate", B
print "Fitted values", Y_hat

S = 100  # Any old constant value
# Make a new parameter vector with the constant applied. Note the copy:
# B[:] would give a view, so the += below would silently modify B as well.
B2 = B.copy()
B2[:3] += S
B2[3:] -= S
print "Parameters with constant added, subtracted", B2
print "Same Yhat?", np.allclose(Y_hat, np.dot(X, B2))  # Are the fitted values the same?

%R print(lm(Y ~ nationality:awesome))

popular = ['Yes'] * 8 + ['No'] * 7
overrated = ['Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No',
             'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']

%%R -i popular,overrated
popular = factor(popular)
overrated = factor(overrated)
print(lm(Y ~ awesome + popular + overrated + awesome:popular:overrated))

X_3 = %R model.matrix(~ awesome + popular + overrated + awesome:popular:overrated)
show_x(X_3)
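# A closing check of our own (not part of the original R output): the formula
# above omits the two-way interactions, so it is worth asking numpy whether
# the coding R chose for X_3 still gives linearly independent columns.
print "Columns:", X_3.shape[1]
print "Rank:", np.linalg.matrix_rank(X_3)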