# Model selection and post-selection inference examples (election data,
# HIV drug-resistance data, and simulated designs).
# Reconstructed from a whitespace-mangled notebook export; statements are
# unchanged in order and intent, with the bug fixes noted inline.

options(repr.plot.width = 5, repr.plot.height = 5)

## Election data: best-subset selection ----------------------------------

url <- "http://stats191.stanford.edu/data/election.table"
election.table <- read.table(url, header = TRUE)
pairs(election.table[, 2:ncol(election.table)],
      cex.labels = 3, pch = 23, bg = "orange", cex = 2)

election.lm <- lm(V ~ I + D + W + G:I + P + N, election.table)
election.lm

# Design matrix without the intercept column, as leaps() expects.
X <- model.matrix(election.lm)[, -1]

library(leaps)

# Best subsets ranked by R^2 (maximize).
election.leaps <- leaps(X, election.table$V, nbest = 3, method = "r2")
best.model.r2 <-
  election.leaps$which[which(election.leaps$r2 == max(election.leaps$r2)), ]
best.model.r2
plot(election.leaps$size, election.leaps$r2, pch = 23, bg = "orange", cex = 2)

# Best subsets ranked by adjusted R^2 (maximize).
election.leaps <- leaps(X, election.table$V, nbest = 3, method = "adjr2")
best.model.adjr2 <-
  election.leaps$which[which(election.leaps$adjr2 == max(election.leaps$adjr2)), ]
best.model.adjr2
plot(election.leaps$size, election.leaps$adjr2, pch = 23, bg = "orange", cex = 2)

# Best subsets ranked by Mallows' Cp (minimize).
election.leaps <- leaps(X, election.table$V, nbest = 3, method = "Cp")
best.model.Cp <-
  election.leaps$which[which(election.leaps$Cp == min(election.leaps$Cp)), ]
best.model.Cp
plot(election.leaps$size, election.leaps$Cp, pch = 23, bg = "orange", cex = 2)

# Sanity-check the AIC formula by hand against AIC():
# p = 7 regression coefficients (incl. intercept) + 1 for sigma^2.
n <- nrow(X)
p <- 7 + 1
c(n * log(2 * pi * sum(resid(election.lm)^2) / n) + n + 2 * p,
  AIC(election.lm))

## Stepwise selection -----------------------------------------------------

# Forward stepwise with AIC penalty (k = 2).
election.step.forward <- step(lm(V ~ 1, election.table),
                              list(upper = ~ I + D + W + G + G:I + P + N),
                              direction = "forward", k = 2, trace = FALSE)
election.step.forward
summary(election.step.forward)

# Forward stepwise with BIC penalty (k = log n).
election.step.forward.BIC <- step(lm(V ~ 1, election.table),
                                  list(upper = ~ I + D + W + G:I + P + N),
                                  direction = "forward", k = log(nrow(X)))
summary(election.step.forward.BIC)

# Backward stepwise from the full model (default AIC penalty).
election.step.backward <- step(election.lm, direction = "backward")
summary(election.step.backward)

## Cross-validation of candidate subsets ----------------------------------

library(boot)
election.glm <- glm(V ~ ., data = election.table)
cv.glm(model.frame(election.glm), election.glm, K = 5)$delta

election.leaps <- leaps(X, election.table$V, nbest = 3, method = "Cp")
V <- election.table$V
# Preallocate one CV score per candidate subset.
# FIX: the original assigned lowercase `$cv` here but filled and plotted
# `$CV` below, so the preallocation never took effect.
election.leaps$CV <- 0 * election.leaps$Cp
for (i in seq_len(nrow(election.leaps$which))) {
  subset <- seq_len(ncol(X))[election.leaps$which[i, ]]
  # NOTE: the original branched on length(subset) > 1, but both branches
  # were identical (X[, subset] drops to a vector for a single column),
  # so the branch is collapsed here.
  Xw <- X[, subset]
  wlm <- glm(V ~ Xw)
  election.leaps$CV[i] <- cv.glm(model.frame(wlm), wlm, K = 5)$delta[1]
}
plot(election.leaps$Cp, election.leaps$CV, pch = 23, bg = "orange", cex = 2)
plot(election.leaps$size, election.leaps$CV, pch = 23, bg = "orange", cex = 2)
best.model.Cv <-
  election.leaps$which[which(election.leaps$CV == min(election.leaps$CV)), ]
best.model.Cv

## HIV drug-resistance data: stepwise on a wide design --------------------

X_HIV <- read.table("http://stats191.stanford.edu/data/NRTI_X.csv",
                    header = FALSE, sep = ",")
Y_HIV <- read.table("http://stats191.stanford.edu/data/NRTI_Y.txt",
                    header = FALSE, sep = ",")
set.seed(0)
Y_HIV <- as.matrix(Y_HIV)[, 1]
X_HIV <- as.matrix(X_HIV)
nrow(X_HIV)

D <- data.frame(X_HIV, Y_HIV)
M <- lm(Y_HIV ~ ., data = D)

# Forward / backward / both-direction stepwise with the AIC penalty.
M_forward <- step(lm(Y_HIV ~ 1, data = D), list(upper = M),
                  trace = FALSE, direction = "forward")
M_forward
M_backward <- step(M, list(lower = ~ 1), trace = FALSE, direction = "backward")
M_backward
M_both1 <- step(M, list(lower = ~ 1, upper = M),
                trace = FALSE, direction = "both")
M_both1
M_both2 <- step(lm(Y_HIV ~ 1, data = D), list(lower = ~ 1, upper = M),
                trace = FALSE, direction = "both")
M_both2
sort(names(coef(M_forward)))
sort(names(coef(M_backward)))
sort(names(coef(M_both1)))
sort(names(coef(M_both2)))

# Same four searches with the BIC penalty.
# FIX: the original hard-coded k = log(633); derive it from the data
# (the NRTI design has 633 rows) so the code survives a data update.
k_BIC <- log(nrow(X_HIV))
M_backward_BIC <- step(M, list(lower = ~ 1),
                       trace = FALSE, direction = "backward", k = k_BIC)
M_forward_BIC <- step(lm(Y_HIV ~ 1, data = D), list(upper = M),
                      trace = FALSE, direction = "forward", k = k_BIC)
M_both1_BIC <- step(M, list(upper = M, lower = ~ 1),
                    trace = FALSE, direction = "both", k = k_BIC)
M_both2_BIC <- step(lm(Y_HIV ~ 1, data = D), list(upper = M, lower = ~ 1),
                    trace = FALSE, direction = "both", k = k_BIC)
sort(names(coef(M_backward_BIC)))
sort(names(coef(M_forward_BIC)))
sort(names(coef(M_both1_BIC)))
sort(names(coef(M_both2_BIC)))

## Post-selection inference ------------------------------------------------

# Naive intervals after selection are not honest:
summary(election.step.forward)
confint(election.step.forward)

library(selectiveInference)
plot(fs(X, V))
fsInf(fs(X, V))

## Simulation: naive vs. split vs. exact p-values under the global null ----

X_fake <- matrix(rnorm(4000), 100, 40)
nsim <- 1000

# Naive p-values: select with forward stepwise, then test the selected
# variable on the SAME data as if it had been fixed in advance.
naiveP <- numeric(nsim)  # preallocate rather than growing with c()
for (i in seq_len(nsim)) {
  Z <- rnorm(nrow(X_fake))
  fsfit <- fs(X_fake, Z, maxsteps = 1)
  fsinf <- fsInf(fsfit, sigma = 1)
  naive.lm <- lm(Z ~ X_fake[, fsinf$vars[1]])
  naiveP[i] <- summary(naive.lm)$coef[2, 4]
}
# Fraction of null p-values below 0.05 (should be ~0.05 if honest).
ecdf(naiveP)(0.05)

# Data splitting: select on one half, test on the other half.
splitP <- numeric(nsim)
for (i in seq_len(nsim)) {
  Z <- rnorm(nrow(X_fake))
  subset_s <- sample(seq_len(nrow(X_fake)), nrow(X_fake) / 2,
                     replace = FALSE)          # _s for selection
  subset_i <- rep(TRUE, nrow(X_fake))          # _i for inference
  subset_i[subset_s] <- FALSE
  # Step 1: choose the model on the selection half.
  fsfit <- fs(X_fake[subset_s, ], Z[subset_s], maxsteps = 1)
  # FIX: use the variable chosen by THIS half-sample fit (fsfit$action);
  # the original reused the stale `fsinf` left over from the naive loop.
  chosen <- fsfit$action[1]
  # Step 2: compute the p-value on the held-out half.
  split.lm <- lm(Z ~ X_fake[, chosen], subset = subset_i)
  splitP[i] <- summary(split.lm)$coef[2, 4]
}
plot(ecdf(splitP), lwd = 3, col = "red")
abline(0, 1, lwd = 2, lty = 2)

# Exact selective p-values from fsInf (condition on the selection event).
exactP <- numeric(nsim)
for (i in seq_len(nsim)) {
  Z <- rnorm(nrow(X_fake))
  fsfit <- fs(X_fake, Z, maxsteps = 1)
  fsinf <- fsInf(fsfit, sigma = 1)
  exactP[i] <- fsinf$pv[1]
}
plot(ecdf(exactP), lwd = 3, col = "blue",
     main = "Uncorrected vs. corrected null p-values")
plot(ecdf(naiveP), lwd = 3, col = "red", add = TRUE)
abline(0, 1, lwd = 2, lty = 2)

## Simulation: power comparison under a one-variable alternative ----------

exactP_A <- numeric(nsim)
splitP_A <- numeric(nsim)
correct_model <- logical(nsim)
beta1 <- 0.3  # signal size on the first column
for (i in seq_len(nsim)) {
  Z <- rnorm(nrow(X_fake)) + X_fake[, 1] * beta1
  # Exact selective p-value from the full-data fit.
  fsfit <- fs(X_fake, Z, maxsteps = 1)
  fsinf <- fsInf(fsfit, sigma = 1)
  exactP_A[i] <- fsinf$pv[1]
  correct_model[i] <- fsinf$vars[1] == 1
  # Data-splitting p-value.
  subset <- sample(seq_len(nrow(X_fake)), nrow(X_fake) / 2, replace = FALSE)
  subset_c <- rep(TRUE, nrow(X_fake))  # _c for confirmatory half
  subset_c[subset] <- FALSE
  fsfit <- fs(X_fake[subset, ], Z[subset], maxsteps = 1)
  # FIX: test the variable selected on the split half (fsfit$action),
  # not the stale full-data selection `fsinf$vars[1]`.
  split.lm <- lm(Z ~ X_fake[, fsfit$action[1]], subset = subset_c)
  splitP_A[i] <- summary(split.lm)$coef[2, 4]
}
plot(ecdf(exactP_A), lwd = 3, col = "blue", xlim = c(0, 1),
     main = "Power comparison (all pvalues)")
plot(ecdf(splitP_A), lwd = 3, col = "red", add = TRUE)
abline(0, 1, lwd = 2, lty = 2)
plot(ecdf(exactP_A[correct_model]), lwd = 3, col = "blue", xlim = c(0, 1),
     main = "Power comparison (correct model)")
plot(ecdf(splitP_A[correct_model]), lwd = 3, col = "red", add = TRUE)
abline(0, 1, lwd = 2, lty = 2)

## Simulation: confidence-interval length comparison ----------------------

exactL <- numeric(nsim)
splitL <- numeric(nsim)
correct_model <- logical(nsim)
beta1 <- 0.3
for (i in seq_len(nsim)) {
  Z <- rnorm(nrow(X_fake)) + X_fake[, 1] * beta1
  # Length of the exact selective interval (fsInf default is 90% coverage).
  fsfit <- fs(X_fake, Z, maxsteps = 1)
  fsinf <- fsInf(fsfit, sigma = 1)
  exactL[i] <- fsinf$ci[1, 2] - fsinf$ci[1, 1]
  correct_model[i] <- fsinf$vars[1] == 1
  # Length of the data-splitting interval at matching 90% level.
  subset <- sample(seq_len(nrow(X_fake)), nrow(X_fake) / 2, replace = FALSE)
  # FIX: mask length was nrow(X) (the 22-row election design) in the
  # original; it must match the simulated design X_fake.
  subset_c <- rep(TRUE, nrow(X_fake))  # _c for confirmatory half
  subset_c[subset] <- FALSE
  fsfit <- fs(X_fake[subset, ], Z[subset], maxsteps = 1)
  # FIX: use the split-half selection (fsfit$action), not stale `fsinf`.
  split.lm <- lm(Z ~ X_fake[, fsfit$action[1]], subset = subset_c)
  conf <- confint(split.lm, level = 0.9)
  splitL[i] <- conf[2, 2] - conf[2, 1]
}
data.frame(median(exactL), median(splitL))
data.frame(median(exactL[correct_model]), median(splitL[correct_model]))