options(repr.plot.width = 4, repr.plot.height = 3)  ## resizing plots
options(scipen = 999)  ## has scientific notation ever annoyed you?
library(tidyverse)
library(data.table)
library(origami)
library(ggsci)
set.seed(76548208)

# structural equation for W
f_W <- function(U_W) {
    return(U_W)
}

# structural equation for A
f_A <- function(W, U_A) {
    return(as.numeric(plogis(0.02 * W + U_A) > 0.25))
}

# structural equation for Y
f_Y <- function(W, A, U_Y) {
    return(-W + 10 * A - U_Y)
}

# function to draw n observations from an scm
# n = the number of observations to draw
# returns a tibble with named columns
simObsSCM <- function(n) {
    ## first we draw the errors
    # draw Uniform(-0.5, 50.5) and round
    U_W <- round(runif(n, -0.5, 50.5))
    # draw U_A
    U_A <- rnorm(n, 0, 1)
    # draw U_Y
    U_Y <- rnorm(n, 0, 3)

	#evaluate the observations sequentially
    # evaluate W
    W <- f_W(U_W)
    # evaluate A
    A <- f_A(W = W, U_A = U_A)
    # evaluate Y
    Y <- f_Y(W = W, A = A, U_Y = U_Y)

    ## return a data.frame object
    out <- as.data.table(cbind(W = W, A = A, Y = Y))
    return(out)
}

# function that draws n observations from an SCM that is 
# intervened on to set A = setA
# n = number of observations
# setA = the value to set A equal to (0 or 1)
# returns a data.frame of coutnerfactual observations
simIntSCM <- function(n, set_A = 1) {
	## first we draw the errors
    # draw Uniform(-0.5,50.5) and round
    U_W <- round(runif(n, -0.5, 50.5))
    # draw U_A
    U_A <- rnorm(n, 0, 1)
    # draw U_Y
    U_Y <- rnorm(n, 0, 1)

	# evaluate the observations sequentially
    # evaluate W
    W <- f_W(U_W)
    # evaluate A
    A <- rep(set_A, n)
    # evaluate Y
    Y <- f_Y(W = W, A = A, U_Y = U_Y)

    ## return a data.frame object
    out <- as.data.table(cbind(W = W, A = A, Y = Y))
    return(out)
}

int_big_samp <- simIntSCM(n = 1e6, set_A = 1)
E0Y1 <- mean(int_big_samp$Y)
round(E0Y1, 3)

# let's go ahead and write a function that takes as input
# a data frame from the observed SCM and computes the 
# mean in each strata and returns it as a vector
getStratEst <- function(data, strata = (seq_len(51) - 1)) {
    est <- rep(NA, length(strata))
    # over each unique value of W compute the mean of Y in the A=1 grp
    for (w in strata) {
         # let's check to make sure someone is in each strata
        if (sum(data$A == 1 & data$W == w) > 0) {
            est[w + 1] <- mean(data$Y[data$A == 1 & data$W == w])
        } else {
            est[w + 1] <- NA
        }
    }
    # return the vector
    est
}

# simulate very large data set from the observed SCM
big_obs <- simObsSCM(n = 5e6)

# get the stratified estimate 
E0Y_A1Ww <- getStratEst(data = big_obs)

# we know the probability that W=w, it's 1/51
pw <- rep(1/51, 51)

# sum over all the strata to get E_0(E_0(Y | A = 1, W))
E0_E0Y_A1Ww <- sum(pw * E0Y_A1Ww)

# the statistical target parameter
E0_E0Y_A1Ww <- sum(pw * E0Y_A1Ww)
E0_E0Y_A1Ww

# vector of all w values, makes it easier to plot
allw <- (seq_len(51) - 1)

# plot "estimated" values
plot(E0Y_A1Ww ~ allw, bty="n",
     xlab="w", 
     ylab = expression(E[0]*"(Y | A=1, W)")) 

# add true values
E0Y_A1Ww_true <- -allw + 10

points(E0Y_A1Ww_true ~ allw, pch=3)
# add legend
legend(x=30, y=10, bty="n", pch = c(1,3),
       legend = c("'Estimated'", "True"))

# simulate n=5,000 observations from the observed SCM
smallerObs <- simObsSCM(n = 5e3)

# get stratified estimates 
EhatY_A1Ww <- getStratEst(data = smallerObs)

# plot stratified-estimated values
plot(EhatY_A1Ww ~ allw, bty="n", xlab="w", 
     ylab=expression(hat(E)*"(Y | A=1, W)"),
     mgp = c(2.1,0.5,0))
# add true values
points(E0Y_A1Ww_true ~ allw, pch=3) 
# add legend
legend(x=30, y=10, bty="n", pch = c(1,3),
       legend = c("Estimated", "True"))

# simulate n=5,000 observations from the observed SCM
smallerObs <- simObsSCM(n = 5e2)

# get stratified estimates 
EhatY_A1Ww <- getStratEst(data = smallerObs)

# plot stratified-estimated values
plot(EhatY_A1Ww ~ allw, bty="n", xlab="w", 
     ylab=expression(hat(E)*"(Y | A=1, W)"),
     mgp = c(2.1,0.5,0))
# add true values
points(E0Y_A1Ww_true ~ allw, pch=3) 
# add legend
legend(x=30, y=10, bty="n", pch = c(1,3),
       legend = c("Estimated", "True"))

# simulate n=5,000 observations from the observed SCM
smallerObs <- simObsSCM(n = 1e2)

# get stratified estimates 
EhatY_A1Ww <- getStratEst(data = smallerObs)

# plot stratified-estimated values
plot(EhatY_A1Ww ~ allw, bty="n", xlab="w", 
     ylab=expression(hat(E)*"(Y | A=1, W)"),
     mgp = c(2.1,0.5,0))
# add true values
points(E0Y_A1Ww_true ~ allw, pch=3) 
# add legend
legend(x=30, y=10, bty="n", pch = c(1,3),
       legend = c("Estimated", "True"))

# function to draw n observations from an scm
# n = the number of observations to draw
# returns a data.frame with named columns
simObsSCM <- function(n){
    ## first we draw the errors
    # draw Uniform(-0.5,50.5) and round
    U_W <- runif(n,0,50)
    # draw U_A
    U_A <- rnorm(n,0,1)
    # draw U_Y
    U_Y <- rnorm(n,0,3)

	#evaluate the observations sequentially
    # evaluate W
    W <- f_W(U_W)
    # evaluate A
    A <- f_A(W = W, U_A = U_A)
    # evaluate Y
    Y <- f_Y(W = W, A = A, U_Y = U_Y)

    ## return a data.frame object
    out <- data.frame(W = W, A = A, Y = Y)
    return(out)
}

# this function takes as input a dataframe of observations
# and computes the moving window average of the mean of Y
# given W at each value of the vector wValues 
# the function returns a data.frame with columns w (wValues)
simpleKern <- function(
    dat, # data.frame of observations
    h, # the size of the window
    wValues # values at which to return kernel predictions
    ){
    # for each value in wValues, compute the mean of the observed
    # data with A = 1 in that window 
    # empty vector of results
    EhatY <- rep(NA, length(wValues))
    ct <- 0
    for(w in wValues){
        ct <- ct + 1
        EhatY[ct] <- 
            mean(dat$Y[dat$A == 1 & dat$W < w + h/2 & dat$W > w - h/2])
    }
    
    return(data.frame(
        w = wValues,
        EhatY = EhatY
    ))
}

# try it out
dat <- simObsSCM(n=200)

# for window size 2
fit2 <- simpleKern(dat = dat, 
                  h = 2, # window of size 2
                  wValues = seq(0,50, length = 200)) # evenly spaced from 0,50
fit20 <- simpleKern(dat = dat, 
                  h = 20, # window of size 20
                  wValues = seq(0,50, length = 200)) # evenly spaced from 0,50

plot(fit2$EhatY ~ fit2$w, bty="n",xlab="w", type="l",
     ylab=expression(hat(E)*"(Y | A = 1, W =w)"),
     mgp = c(2.1, 0.5, 0),lwd=2)
lines(fit20$EhatY ~ fit20$w,lwd=2, lty=2)
points(dat$Y ~ dat$W, col="gray75")
legend(x="topright", lty=c(1,2), title="h", legend = c(2,20))

getBiasVariance <- function(
    estimator, # a function that takes as input a data set and 
               # vector of w values called 'wValues' and outputs
               # a value for the estimator at w
    truth, # a function that takes as input a vector wValues and outputs the
           # true value at those values
    n, # sample size
    wValues, # vector of strata to compute bias and variance 
    nSim = 100, # number of repeated draws to compute bias and variance
    getMSE = FALSE, # should bias and variance be returned or MSE?
    ... # other args passed to estimator (e.g., h)
){
    estMat <- NULL
    for(i in 1:nSim){
        dat <- simObsSCM(n = n)
        est <- do.call(estimator, 
                       args = c(list(dat = dat, wValues = wValues),
                                 list(...)))
        estMat <- rbind(estMat, est$EhatY)
    }
    trueValues <- do.call(truth, args = list(wValues = wValues))
    if(!getMSE){
       # compute the bias and variance
        if(is.matrix(estMat)){
            bias <- colMeans(estMat, na.rm=TRUE) - trueValues
            variance <- apply(estMat, 2, var)
        }else{
            bias <- mean(estMat, na.rm = TRUE) - trueValues
            variance <- var(estMat, na.rm = TRUE)
        }
        names(bias) <- wValues
        names(variance) <- wValues
       # return a list
        out <- list(bias = bias, variance = variance)
    }else{
        mse <- apply(matrix(1:ncol(estMat)), 1, function(i){
            mean((estMat[,i] - trueValues[i])^2)
        })
        names(mse) <- wValues
        out <- list(mse = mse)
    }
    out
}

# call the function to get the estimated bias and variance of 
# \hat{E}
getBiasVariance(
    estimator = "simpleKern", # estimation function to use
    truth = function(wValues){ -wValues + 10 }, # the true value
    n = 500, # sample size
    wValues = c(5, 25, 45),
    nSim = 1000,
    h = 5
)

# call the function to get the estimated bias and variance of 
# \hat{E}
getBiasVariance(
    estimator = "simpleKern", # estimation function to use
    truth = function(wValues){ -wValues + 10 }, # the true value
    n = 500, # sample size
    wValues = c(5, 25, 45),
    nSim = 1000,
    h = 5, 
    getMSE = TRUE
)

# I'm going to be repeating the same code under several different  
# scenarios, so let's go ahead and write a function that takes an input
# of h's and outputs a graph of the the Bias^2 and Variance at a particular 
# point for each of those choices of bandwidth. 
plotBiasVariance <- function(hGrid, wValue, n, 
                             estimator = "simpleKern",
                             plotMSE = FALSE){
    rslt <- lapply(split(hGrid, hGrid), getBiasVariance, 
                   estimator = estimator, 
                   truth = function(wValues){ f_Y(W=wValues, A=1, U_Y = 0) },
                   n = n, getMSE = plotMSE,
                   wValues = wValue, 
                   nSim = 1000)
    
    if(!plotMSE){
       # transform result list into data.frame
        biasRslt <- data.frame(h=hGrid, 
                               bias=unlist(lapply(rslt, function(x){x$bias}),
                                           use.names=FALSE))
        varRslt <- data.frame(h=hGrid, 
                              variance=unlist(
                                  lapply(rslt, function(x){x$variance}),
                                          use.names = FALSE))
        # standardize results to put on same graph
        biasRslt$bias2_s <- 
            (biasRslt$bias^2 - min(biasRslt$bias^2))/diff(range(biasRslt$bias^2))
      varRslt$variance_s <- 
            (varRslt$variance -min(varRslt$variance))/diff(range(varRslt$variance))

        # plot results
        par(mar = c(4.1, 3.1, 0.5, 3.1), mgp = c(1.5, 0.5, 0))
        plot(bias2_s ~ h, data = biasRslt, type="b",yaxt="n", 
             xlab = "h", ylab = expression(Bias^2))
        points(variance_s ~ h, data = varRslt, type="b", pch=2, lty=2)
        axis(side = 2, at = seq(0,1,length = 5), labels = rep("", 5))
        axis(side = 4, at = seq(0,1,length = 5), labels = rep("", 5))
        mtext(side = 4, line = par()$mgp[1], "Variance")
        legend(x="topleft", pch = c(1,2), legend = c("Bias", "Variance"))
    }else{
        # mse results list into data.frame
        mseRslt <- data.frame(h=hGrid, 
                              mse=unlist(lapply(rslt, function(x){x$mse}),
                                           use.names=FALSE))
        # plot results
        par(mar = c(4.1, 3.1, 0.5, 3.1), mgp = c(1.5, 0.5, 0))
        plot(mse ~ h, data = mseRslt, type="b",
             xlab = "h", ylab = expression(MSE))
        legend(x="topleft", pch = c(1,2), legend = c("MSE"))
    }
}

# run the function to plot bias and variance
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 30, length = 10), 
                 wValue = 25, 
                 n = 500)

# run the function plot MSE
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 30, length = 10), 
                 wValue = 25, 
                 n = 500, plotMSE = TRUE)

# plot the bias and variance
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 10, length = 10), 
                 wValue = 25, 
                 n = 10000)
# plot the mse
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 10, length = 10), 
                 wValue = 25, 
                 n = 10000, plotMSE = TRUE)

# redefine structural equation for Y
f_Y <- function(W, A, U_Y){
    -0.01*W + 10*A - U_Y
}
# plot the bias and variance
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 50, length = 10), 
                 wValue = 25, 
                 n = 500)

# plot the mse
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 50, length = 10), 
                 wValue = 25, 
                 n = 500, plotMSE = TRUE)

# redefine structural equation for Y
f_Y <- function(W, A, U_Y){
    10*sin(W/2) + 10*A - U_Y
}

# plot the bias and variance
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 20, length = 10), 
                 wValue = 25, 
                 n = 2000)

# plot the mse
set.seed(1234)
plotBiasVariance(hGrid = seq(1, 20, length = 10), 
                 wValue = 25, 
                 n = 2000, plotMSE = TRUE)

# normal kernel. Takes as input a vector of observed points wObs, 
# a bandwidth h and a single point w and returns the normal kernel 
# evaluated at each wObs.
Knorm <- function(wObs, h, w){
    dnorm((w - wObs)/h)
}

# try it out to see what it does
Knorm(wObs = 0:10, h = 5, w = 5)

# this function takes as input a dataframe of observations
# and computes the moving window average of the mean of Y
# given W at each value of the vector wValues 
# the function returns a data.frame with columns w (wValues)
kernReg <- function(
    dat, # data.frame of observations
    h, # bandwidth
    K, # kernel function that takes inputs called wObs, h, and w
    wValues # values at which to return kernel predictions
    ){
    # for each value in wValues, compute the mean of the observed
    # data with A = 1 in that window 
    # empty vector of results
    EhatY <- rep(NA, length(wValues))
    ct <- 0
    for(w in wValues){
        ct <- ct + 1
        thisK <- do.call(K, args = list(wObs = dat$W, h = h, w = w))
        EhatY[ct] <- sum(thisK * dat$Y) / sum(thisK)
    }
    
    return(data.frame(
        w = wValues,
        EhatY = EhatY
    ))
}

# try it out
dat <- simObsSCM(n=200)

# for window size 2
fit2 <- kernReg(dat = dat, K = "Knorm",
                  h = 2, # window of size 2
                  wValues = seq(0,50, length = 200)) # evenly spaced from 0,50
fit20 <- kernReg(dat = dat, K = "Knorm",
                  h = 20, # window of size 20
                  wValues = seq(0,50, length = 200)) # evenly spaced from 0,50

plot(fit2$EhatY ~ fit2$w, bty="n",xlab="w", type="l",
     ylab=expression(hat(E)*"(Y | A = 1, W =w)"),
     mgp = c(2.1, 0.5, 0),lwd=2, main= "Kernel regression, normal kernel")
lines(fit20$EhatY ~ fit20$w,lwd=2, lty=2)
points(dat$Y ~ dat$W, col="gray75")
legend(x="topright", lty=c(1,2), title="h", legend = c(2,20))

# normal kernel. Takes as input a vector of observed points wObs, 
# a bandwidth h and a single point w and returns the normal kernel 
# evaluated at each wObs.
Kepan <- function(wObs, h, w){
    u <- (w - wObs)/h
    k <- 3/4 * (1-u^2)*(abs(u)<=1)
}

# try it out to see what it does
Kepan(wObs = 0:10, h = 5, w = 5)

# compare to the normal kernal
k1 <- Kepan(wObs = 0:10, h = 5, w = 5)
k2 <- Knorm(wObs = 0:10, h = 5, w = 5)
comp <- cbind(
    k1/sum(k1),
    k2/sum(k2)
)
colnames(comp) <- c("Epanechnikov", "Normal")
comp

# try it out
dat <- simObsSCM(n=200)

# for window size 2
fit2 <- kernReg(dat = dat, K = "Kepan",
                  h = 2, # window of size 2
                  wValues = seq(0,50, length = 200)) # evenly spaced from 0,50

plot(fit2$EhatY ~ fit2$w, bty="n",xlab="w", type="l",
     ylab=expression(hat(E)*"(Y | A = 1, W =w)"),
     mgp = c(2.1, 0.5, 0),lwd=2, 
     main= "Kernel regression, Epanechnikov kernel")
points(dat$Y ~ dat$W, col="gray75")

data(mtcars)
head(mtcars)

lm_mod <- lm(mpg ~ ., data = mtcars)
summary(lm_mod)

err <- mean(resid(lm_mod)^2)

cv_lm <- function(fold, data, reg_form) {
  # get name and index of outcome variable from regression formula
  out_var <- as.character(unlist(stringr::str_split(reg_form, " "))[1])
  out_var_ind <- as.numeric(which(colnames(data) == out_var))

  # split up data into training and validation sets
  train_data <- origami::training(data)
  valid_data <- origami::validation(data)

  # fit linear model on training set and predict on validation set
  mod <- lm(as.formula(reg_form), data = train_data)
  preds <- predict(mod, newdata = valid_data)

  # capture results to be returned as output
  out <- list(coef = data.frame(t(coef(mod))),
              SE = ((preds - valid_data[, out_var_ind])^2))
  return(out)
}

# resubstitution estimate
resub <- make_folds(mtcars, fold_fun = folds_resubstitution)[[1]]
resub_results <- cv_lm(fold = resub, data = mtcars, reg_form = "mpg ~ .")
mean(resub_results$SE)

# cross-validated estimate
folds <- origami::make_folds(mtcars)
cvlm_results <- origami::cross_validate(cv_fun = cv_lm,
                                        folds = folds, data = mtcars,
                                        reg_form = "mpg ~ .")
mean(cvlm_results$SE)