options(repr.plot.width = 4, repr.plot.height = 3) ## resizing plots options(scipen = 999) ## has scientific notation ever annoyed you? library(tidyverse) library(data.table) library(origami) library(ggsci) set.seed(76548208) # structural equation for W f_W <- function(U_W) { return(U_W) } # structural equation for A f_A <- function(W, U_A) { return(as.numeric(plogis(0.02 * W + U_A) > 0.25)) } # structural equation for Y f_Y <- function(W, A, U_Y) { return(-W + 10 * A - U_Y) } # function to draw n observations from an scm # n = the number of observations to draw # returns a tibble with named columns simObsSCM <- function(n) { ## first we draw the errors # draw Uniform(-0.5, 50.5) and round U_W <- round(runif(n, -0.5, 50.5)) # draw U_A U_A <- rnorm(n, 0, 1) # draw U_Y U_Y <- rnorm(n, 0, 3) #evaluate the observations sequentially # evaluate W W <- f_W(U_W) # evaluate A A <- f_A(W = W, U_A = U_A) # evaluate Y Y <- f_Y(W = W, A = A, U_Y = U_Y) ## return a data.frame object out <- as.data.table(cbind(W = W, A = A, Y = Y)) return(out) } # function that draws n observations from an SCM that is # intervened on to set A = setA # n = number of observations # setA = the value to set A equal to (0 or 1) # returns a data.frame of coutnerfactual observations simIntSCM <- function(n, set_A = 1) { ## first we draw the errors # draw Uniform(-0.5,50.5) and round U_W <- round(runif(n, -0.5, 50.5)) # draw U_A U_A <- rnorm(n, 0, 1) # draw U_Y U_Y <- rnorm(n, 0, 1) # evaluate the observations sequentially # evaluate W W <- f_W(U_W) # evaluate A A <- rep(set_A, n) # evaluate Y Y <- f_Y(W = W, A = A, U_Y = U_Y) ## return a data.frame object out <- as.data.table(cbind(W = W, A = A, Y = Y)) return(out) } int_big_samp <- simIntSCM(n = 1e6, set_A = 1) E0Y1 <- mean(int_big_samp$Y) round(E0Y1, 3) # let's go ahead and write a function that takes as input # a data frame from the observed SCM and computes the # mean in each strata and returns it as a vector getStratEst <- function(data, strata = (seq_len(51) - 1)) { est <- rep(NA, length(strata)) # over each unique value of W compute the mean of Y in the A=1 grp for (w in strata) { # let's check to make sure someone is in each strata if (sum(data$A == 1 & data$W == w) > 0) { est[w + 1] <- mean(data$Y[data$A == 1 & data$W == w]) } else { est[w + 1] <- NA } } # return the vector est } # simulate very large data set from the observed SCM big_obs <- simObsSCM(n = 5e6) # get the stratified estimate E0Y_A1Ww <- getStratEst(data = big_obs) # we know the probability that W=w, it's 1/51 pw <- rep(1/51, 51) # sum over all the strata to get E_0(E_0(Y | A = 1, W)) E0_E0Y_A1Ww <- sum(pw * E0Y_A1Ww) # the statistical target parameter E0_E0Y_A1Ww <- sum(pw * E0Y_A1Ww) E0_E0Y_A1Ww # vector of all w values, makes it easier to plot allw <- (seq_len(51) - 1) # plot "estimated" values plot(E0Y_A1Ww ~ allw, bty="n", xlab="w", ylab = expression(E[0]*"(Y | A=1, W)")) # add true values E0Y_A1Ww_true <- -allw + 10 points(E0Y_A1Ww_true ~ allw, pch=3) # add legend legend(x=30, y=10, bty="n", pch = c(1,3), legend = c("'Estimated'", "True")) # simulate n=5,000 observations from the observed SCM smallerObs <- simObsSCM(n = 5e3) # get stratified estimates EhatY_A1Ww <- getStratEst(data = smallerObs) # plot stratified-estimated values plot(EhatY_A1Ww ~ allw, bty="n", xlab="w", ylab=expression(hat(E)*"(Y | A=1, W)"), mgp = c(2.1,0.5,0)) # add true values points(E0Y_A1Ww_true ~ allw, pch=3) # add legend legend(x=30, y=10, bty="n", pch = c(1,3), legend = c("Estimated", "True")) # simulate n=5,000 observations from the observed SCM smallerObs <- simObsSCM(n = 5e2) # get stratified estimates EhatY_A1Ww <- getStratEst(data = smallerObs) # plot stratified-estimated values plot(EhatY_A1Ww ~ allw, bty="n", xlab="w", ylab=expression(hat(E)*"(Y | A=1, W)"), mgp = c(2.1,0.5,0)) # add true values points(E0Y_A1Ww_true ~ allw, pch=3) # add legend legend(x=30, y=10, bty="n", pch = c(1,3), legend = c("Estimated", "True")) # simulate n=5,000 observations from the observed SCM smallerObs <- simObsSCM(n = 1e2) # get stratified estimates EhatY_A1Ww <- getStratEst(data = smallerObs) # plot stratified-estimated values plot(EhatY_A1Ww ~ allw, bty="n", xlab="w", ylab=expression(hat(E)*"(Y | A=1, W)"), mgp = c(2.1,0.5,0)) # add true values points(E0Y_A1Ww_true ~ allw, pch=3) # add legend legend(x=30, y=10, bty="n", pch = c(1,3), legend = c("Estimated", "True")) # function to draw n observations from an scm # n = the number of observations to draw # returns a data.frame with named columns simObsSCM <- function(n){ ## first we draw the errors # draw Uniform(-0.5,50.5) and round U_W <- runif(n,0,50) # draw U_A U_A <- rnorm(n,0,1) # draw U_Y U_Y <- rnorm(n,0,3) #evaluate the observations sequentially # evaluate W W <- f_W(U_W) # evaluate A A <- f_A(W = W, U_A = U_A) # evaluate Y Y <- f_Y(W = W, A = A, U_Y = U_Y) ## return a data.frame object out <- data.frame(W = W, A = A, Y = Y) return(out) } # this function takes as input a dataframe of observations # and computes the moving window average of the mean of Y # given W at each value of the vector wValues # the function returns a data.frame with columns w (wValues) simpleKern <- function( dat, # data.frame of observations h, # the size of the window wValues # values at which to return kernel predictions ){ # for each value in wValues, compute the mean of the observed # data with A = 1 in that window # empty vector of results EhatY <- rep(NA, length(wValues)) ct <- 0 for(w in wValues){ ct <- ct + 1 EhatY[ct] <- mean(dat$Y[dat$A == 1 & dat$W < w + h/2 & dat$W > w - h/2]) } return(data.frame( w = wValues, EhatY = EhatY )) } # try it out dat <- simObsSCM(n=200) # for window size 2 fit2 <- simpleKern(dat = dat, h = 2, # window of size 2 wValues = seq(0,50, length = 200)) # evenly spaced from 0,50 fit20 <- simpleKern(dat = dat, h = 20, # window of size 20 wValues = seq(0,50, length = 200)) # evenly spaced from 0,50 plot(fit2$EhatY ~ fit2$w, bty="n",xlab="w", type="l", ylab=expression(hat(E)*"(Y | A = 1, W =w)"), mgp = c(2.1, 0.5, 0),lwd=2) lines(fit20$EhatY ~ fit20$w,lwd=2, lty=2) points(dat$Y ~ dat$W, col="gray75") legend(x="topright", lty=c(1,2), title="h", legend = c(2,20)) getBiasVariance <- function( estimator, # a function that takes as input a data set and # vector of w values called 'wValues' and outputs # a value for the estimator at w truth, # a function that takes as input a vector wValues and outputs the # true value at those values n, # sample size wValues, # vector of strata to compute bias and variance nSim = 100, # number of repeated draws to compute bias and variance getMSE = FALSE, # should bias and variance be returned or MSE? ... # other args passed to estimator (e.g., h) ){ estMat <- NULL for(i in 1:nSim){ dat <- simObsSCM(n = n) est <- do.call(estimator, args = c(list(dat = dat, wValues = wValues), list(...))) estMat <- rbind(estMat, est$EhatY) } trueValues <- do.call(truth, args = list(wValues = wValues)) if(!getMSE){ # compute the bias and variance if(is.matrix(estMat)){ bias <- colMeans(estMat, na.rm=TRUE) - trueValues variance <- apply(estMat, 2, var) }else{ bias <- mean(estMat, na.rm = TRUE) - trueValues variance <- var(estMat, na.rm = TRUE) } names(bias) <- wValues names(variance) <- wValues # return a list out <- list(bias = bias, variance = variance) }else{ mse <- apply(matrix(1:ncol(estMat)), 1, function(i){ mean((estMat[,i] - trueValues[i])^2) }) names(mse) <- wValues out <- list(mse = mse) } out } # call the function to get the estimated bias and variance of # \hat{E} getBiasVariance( estimator = "simpleKern", # estimation function to use truth = function(wValues){ -wValues + 10 }, # the true value n = 500, # sample size wValues = c(5, 25, 45), nSim = 1000, h = 5 ) # call the function to get the estimated bias and variance of # \hat{E} getBiasVariance( estimator = "simpleKern", # estimation function to use truth = function(wValues){ -wValues + 10 }, # the true value n = 500, # sample size wValues = c(5, 25, 45), nSim = 1000, h = 5, getMSE = TRUE ) # I'm going to be repeating the same code under several different # scenarios, so let's go ahead and write a function that takes an input # of h's and outputs a graph of the the Bias^2 and Variance at a particular # point for each of those choices of bandwidth. plotBiasVariance <- function(hGrid, wValue, n, estimator = "simpleKern", plotMSE = FALSE){ rslt <- lapply(split(hGrid, hGrid), getBiasVariance, estimator = estimator, truth = function(wValues){ f_Y(W=wValues, A=1, U_Y = 0) }, n = n, getMSE = plotMSE, wValues = wValue, nSim = 1000) if(!plotMSE){ # transform result list into data.frame biasRslt <- data.frame(h=hGrid, bias=unlist(lapply(rslt, function(x){x$bias}), use.names=FALSE)) varRslt <- data.frame(h=hGrid, variance=unlist( lapply(rslt, function(x){x$variance}), use.names = FALSE)) # standardize results to put on same graph biasRslt$bias2_s <- (biasRslt$bias^2 - min(biasRslt$bias^2))/diff(range(biasRslt$bias^2)) varRslt$variance_s <- (varRslt$variance -min(varRslt$variance))/diff(range(varRslt$variance)) # plot results par(mar = c(4.1, 3.1, 0.5, 3.1), mgp = c(1.5, 0.5, 0)) plot(bias2_s ~ h, data = biasRslt, type="b",yaxt="n", xlab = "h", ylab = expression(Bias^2)) points(variance_s ~ h, data = varRslt, type="b", pch=2, lty=2) axis(side = 2, at = seq(0,1,length = 5), labels = rep("", 5)) axis(side = 4, at = seq(0,1,length = 5), labels = rep("", 5)) mtext(side = 4, line = par()$mgp[1], "Variance") legend(x="topleft", pch = c(1,2), legend = c("Bias", "Variance")) }else{ # mse results list into data.frame mseRslt <- data.frame(h=hGrid, mse=unlist(lapply(rslt, function(x){x$mse}), use.names=FALSE)) # plot results par(mar = c(4.1, 3.1, 0.5, 3.1), mgp = c(1.5, 0.5, 0)) plot(mse ~ h, data = mseRslt, type="b", xlab = "h", ylab = expression(MSE)) legend(x="topleft", pch = c(1,2), legend = c("MSE")) } } # run the function to plot bias and variance set.seed(1234) plotBiasVariance(hGrid = seq(1, 30, length = 10), wValue = 25, n = 500) # run the function plot MSE set.seed(1234) plotBiasVariance(hGrid = seq(1, 30, length = 10), wValue = 25, n = 500, plotMSE = TRUE) # plot the bias and variance set.seed(1234) plotBiasVariance(hGrid = seq(1, 10, length = 10), wValue = 25, n = 10000) # plot the mse set.seed(1234) plotBiasVariance(hGrid = seq(1, 10, length = 10), wValue = 25, n = 10000, plotMSE = TRUE) # redefine structural equation for Y f_Y <- function(W, A, U_Y){ -0.01*W + 10*A - U_Y } # plot the bias and variance set.seed(1234) plotBiasVariance(hGrid = seq(1, 50, length = 10), wValue = 25, n = 500) # plot the mse set.seed(1234) plotBiasVariance(hGrid = seq(1, 50, length = 10), wValue = 25, n = 500, plotMSE = TRUE) # redefine structural equation for Y f_Y <- function(W, A, U_Y){ 10*sin(W/2) + 10*A - U_Y } # plot the bias and variance set.seed(1234) plotBiasVariance(hGrid = seq(1, 20, length = 10), wValue = 25, n = 2000) # plot the mse set.seed(1234) plotBiasVariance(hGrid = seq(1, 20, length = 10), wValue = 25, n = 2000, plotMSE = TRUE) # normal kernel. Takes as input a vector of observed points wObs, # a bandwidth h and a single point w and returns the normal kernel # evaluated at each wObs. Knorm <- function(wObs, h, w){ dnorm((w - wObs)/h) } # try it out to see what it does Knorm(wObs = 0:10, h = 5, w = 5) # this function takes as input a dataframe of observations # and computes the moving window average of the mean of Y # given W at each value of the vector wValues # the function returns a data.frame with columns w (wValues) kernReg <- function( dat, # data.frame of observations h, # bandwidth K, # kernel function that takes inputs called wObs, h, and w wValues # values at which to return kernel predictions ){ # for each value in wValues, compute the mean of the observed # data with A = 1 in that window # empty vector of results EhatY <- rep(NA, length(wValues)) ct <- 0 for(w in wValues){ ct <- ct + 1 thisK <- do.call(K, args = list(wObs = dat$W, h = h, w = w)) EhatY[ct] <- sum(thisK * dat$Y) / sum(thisK) } return(data.frame( w = wValues, EhatY = EhatY )) } # try it out dat <- simObsSCM(n=200) # for window size 2 fit2 <- kernReg(dat = dat, K = "Knorm", h = 2, # window of size 2 wValues = seq(0,50, length = 200)) # evenly spaced from 0,50 fit20 <- kernReg(dat = dat, K = "Knorm", h = 20, # window of size 20 wValues = seq(0,50, length = 200)) # evenly spaced from 0,50 plot(fit2$EhatY ~ fit2$w, bty="n",xlab="w", type="l", ylab=expression(hat(E)*"(Y | A = 1, W =w)"), mgp = c(2.1, 0.5, 0),lwd=2, main= "Kernel regression, normal kernel") lines(fit20$EhatY ~ fit20$w,lwd=2, lty=2) points(dat$Y ~ dat$W, col="gray75") legend(x="topright", lty=c(1,2), title="h", legend = c(2,20)) # normal kernel. Takes as input a vector of observed points wObs, # a bandwidth h and a single point w and returns the normal kernel # evaluated at each wObs. Kepan <- function(wObs, h, w){ u <- (w - wObs)/h k <- 3/4 * (1-u^2)*(abs(u)<=1) } # try it out to see what it does Kepan(wObs = 0:10, h = 5, w = 5) # compare to the normal kernal k1 <- Kepan(wObs = 0:10, h = 5, w = 5) k2 <- Knorm(wObs = 0:10, h = 5, w = 5) comp <- cbind( k1/sum(k1), k2/sum(k2) ) colnames(comp) <- c("Epanechnikov", "Normal") comp # try it out dat <- simObsSCM(n=200) # for window size 2 fit2 <- kernReg(dat = dat, K = "Kepan", h = 2, # window of size 2 wValues = seq(0,50, length = 200)) # evenly spaced from 0,50 plot(fit2$EhatY ~ fit2$w, bty="n",xlab="w", type="l", ylab=expression(hat(E)*"(Y | A = 1, W =w)"), mgp = c(2.1, 0.5, 0),lwd=2, main= "Kernel regression, Epanechnikov kernel") points(dat$Y ~ dat$W, col="gray75") data(mtcars) head(mtcars) lm_mod <- lm(mpg ~ ., data = mtcars) summary(lm_mod) err <- mean(resid(lm_mod)^2) cv_lm <- function(fold, data, reg_form) { # get name and index of outcome variable from regression formula out_var <- as.character(unlist(stringr::str_split(reg_form, " "))[1]) out_var_ind <- as.numeric(which(colnames(data) == out_var)) # split up data into training and validation sets train_data <- origami::training(data) valid_data <- origami::validation(data) # fit linear model on training set and predict on validation set mod <- lm(as.formula(reg_form), data = train_data) preds <- predict(mod, newdata = valid_data) # capture results to be returned as output out <- list(coef = data.frame(t(coef(mod))), SE = ((preds - valid_data[, out_var_ind])^2)) return(out) } # resubstitution estimate resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]] resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .") mean(resub_results$SE) # cross-validated estimate folds <- origami::make_folds(mtcars) cvlm_results <- origami::cross_validate(cv_fun = cv_lm, folds = folds, data = mtcars, reg_form = "mpg ~ .") mean(cvlm_results$SE)