options(repr.plot.width = 4, repr.plot.height = 3) ## resizing plots
options(scipen = 999) ## has scientific notation ever annoyed you?

# packages for project management
library(usethis) # commonly used package components
library(here)    # set paths relative to a .Rproj file

# some more data-oriented packages we'll use
library(tidyverse) ## core tools for "tidy" data science
library(skimr)     ## take a deeper look at your data

# we're simulating -- it's important to set the seed for the PRNG
set.seed(3724591) ## note something like '5' is a BAD seed; it might be worth reading about PRNGs

# simulate 100 draws from a normal distribution
Y <- rnorm(n = 100, mean = 0, sd = 1)

# we'll use skimr::skim to take a look at Y
skim(Y)

# let's take a closer look at the histogram of Y
p1 <- tibble(value = Y) %>%
  ggplot(., aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)))
p1

p2 <- p1 +
  ## let's add a normal density to compare our histogram to
  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), linetype = "dashed") +
  ## we'll set hard limits on the x-axis to make the visualization clearer
  xlim(c(-3, 3)) +
  ## labeling our axes never hurts
  xlab("observed values of Y") +
  ## titles are good too...
  ggtitle("Density Histogram of R.V. Y") +
  ## let's get rid of that annoying gray background
  theme_bw()
p2

# simulate 100 draws from a uniform(0, 1) distribution
Y_unif <- runif(n = 100, min = 0, max = 1)

# simulate 100 draws from a bernoulli(0.5) distribution
# recall that a bernoulli(p) distribution is just a binomial(n = 1, p)
Y_bern <- rbinom(n = 100, size = 1, prob = 0.5)

# simulate 100 draws from a gamma(1, 2) distribution
Y_gamma <- rgamma(n = 100, shape = 1, scale = 2)

# let's skim these
y_rv <- as_tibble(list(unif = Y_unif, bern = Y_bern, gamma = Y_gamma))
skim(y_rv)

# define a function that takes as input n (a numeric)
make_data_O <- function(n) {
  W <- runif(n, min = 0, max = 1)
  # plogis is the cdf of the logistic distribution and
  # plogis(x) = expit(x) = exp(x)/(1 + exp(x))
  A <- rbinom(n, size = 1, prob = plogis(-1 + W))
  Y <- rnorm(n, mean = W + A, sd = sqrt(1/2))
  # return a tibble (a data.frame) with named columns W, A, Y
  return(as_tibble(list(W = W, A = A, Y = Y)))
}

# make a data set with 100 observations
data_O_obs <- make_data_O(n = 100)

# confirm that make_data_O() returned a tibble object
class(data_O_obs)

# let's skim over the observed data
skim(data_O_obs)

# what happens if we pass in something that isn't a number?
make_data_O(n = "something crazy")

# define a function that takes as input n (a numeric)
make_sane_data_O <- function(n) {
  # check whether n is numeric before trying to simulate
  if (!is.numeric(n)) {
    stop("n is not numeric. Can't simulate data.")
  }
  W <- runif(n, min = 0, max = 1)
  # plogis is the cdf of the logistic distribution and
  # plogis(x) = expit(x) = exp(x)/(1 + exp(x))
  A <- rbinom(n, size = 1, prob = plogis(-1 + W))
  Y <- rnorm(n, mean = W + A, sd = sqrt(1/2))
  # return a tibble (a data.frame) with named columns W, A, Y
  return(as_tibble(list(W = W, A = A, Y = Y)))
}

make_sane_data_O(n = "something crazy")

# for logistic regression
binomial()
# for linear regression
gaussian()
# for poisson regression
poisson()
# for Gamma regression (make sure G is capital!)
Gamma()
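## an optional sanity check (a quick sketch, assuming the packages above are
## loaded): a family object is just a list of functions that glm() uses under
## the hood; for example, binomial()'s inverse link should match plogis(),
## i.e., the expit function we used above ('fam' is a name we introduce here)
fam <- binomial()
fam$link ## "logit"
all.equal(fam$linkinv(0.5), plogis(0.5)) ## expect TRUE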
# we'll use our tibble data_O_obs from before to fit the
# linear regression Y = \beta_0 + \beta_1 A + \beta_2 W
# first we specify a formula and a data.frame object
mod_1 <- data_O_obs %>%
  glm(formula = "Y ~ A + W", data = ., family = gaussian())

# check the class
class(mod_1)

# summarize the results
summary(mod_1)

# define a function that takes as input n (a numeric)
# and returns n copies of O = (W, A, Y) where Y \in \{0, 1\}
make_binary_O <- function(n) {
  W <- runif(n, min = 0, max = 1)
  # plogis is the cdf of the logistic distribution and
  # plogis(x) = expit(x) = exp(x)/(1 + exp(x))
  A <- rbinom(n, size = 1, prob = plogis(-1 + W))
  # now Y is binary
  Y <- rbinom(n, size = 1, prob = plogis(-1 + W + A))
  # return a tibble (a data.frame) with named columns W, A, Y
  return(as_tibble(list(W = W, A = A, Y = Y)))
}

# generate the data and fit a logistic regression
binary_O <- make_binary_O(n = 100)

# remember, we need family = binomial for logistic regression
mod_2 <- binary_O %>%
  glm("Y ~ A + W", data = ., family = binomial())

# summarize the model output
summary(mod_2)

objects(mod_2) ## looks like we might want "coefficients"
mod_2$coefficients

# a function that takes as input 'fm', a glm object, and returns whether
# the null hypothesis that the coefficient associated with A equals zero
# is rejected at the 0.05 level
is_null_rejected <- function(fm) {
  sum_fm <- summary(fm)
  p_val <- sum_fm$coefficients["A", "Pr(>|z|)"]
  return(p_val < 0.05)
}

# a function that takes as input 'n', a scalar, simulates a data set
# using make_binary_O, fits a main-terms glm, and determines whether
# the null hypothesis for the coefficient associated with A is rejected
# at the 0.05 level
run_one_sim <- function(n) {
  # make data set
  dat <- make_binary_O(n = n)
  # fit glm
  fm <- glm("Y ~ A + W", data = dat, family = binomial())
  # get hypothesis test result
  out <- is_null_rejected(fm)
  # return
  return(out)
}

# try it out
run_one_sim(n = 100)

# number of simulations; in general more is better, to decrease the
# Monte Carlo error in your answer (i.e., error due to the random seed used)
n_sim <- 100

# let's set a seed so the results are reproducible
set.seed(1234)

## get results using a for() loop
# create an empty vector of length n_sim that results will go into
results_vector <- rep(NA, n_sim)
for (i in seq_len(n_sim)) {
  results_vector[i] <- run_one_sim(n = 100)
}

# check out the first and last results
head(results_vector)
tail(results_vector)

# power is the proportion of simulations in which you reject the null
cat("The power of the test is", mean(results_vector))

# again set a seed so the results are reproducible
set.seed(1234)

# replicate
results_rep <- replicate(n = n_sim, expr = run_one_sim(n = 100))
cat("The power of the test is", mean(results_rep))
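## an optional sketch of quantifying the Monte Carlo error mentioned above:
## the estimated power is a sample proportion, so its standard error is
## roughly sqrt(p * (1 - p) / n_sim) -- a quick way to judge whether n_sim
## is large enough ('p_hat' and 'mc_se' are names we introduce here)
p_hat <- mean(results_rep)
mc_se <- sqrt(p_hat * (1 - p_hat) / n_sim)
cat("Estimated power:", p_hat, "with Monte Carlo SE of about", round(mc_se, 3))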