options(repr.plot.width = 4, repr.plot.height = 3) ## resizing plots
options(scipen = 999) ## has scientific notation ever annoyed you?

# packages for project management
library(usethis) # commonly used package components
library(here)    # set paths relative to a .Rproj file

# some more data-oriented packages we'll use
library(tidyverse) ## core tools for "tidy" data science
library(skimr)     ## take a deeper look at your data

# we're simulating -- it's important to set the seed for the PRNG
set.seed(3724591) ## note something like '5' is a BAD seed; it might be worth reading about PRNGs

# simulate 100 draws from a normal distribution
Y <- rnorm(n = 100, mean = 0, sd = 1)

# we'll use skimr::skim to take a look at Y
skim(Y)

# let's take a closer look at the histogram of Y
p1 <- tibble(value = Y) %>%
  ggplot(., aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)))
p1

p2 <- p1 +
  ## let's add a normal density to compare our histogram to
  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), linetype = "dashed") +
  ## we'll set hard limits on the x-axis to make the visualization clearer
  xlim(c(-3, 3)) +
  ## labeling our axes never hurts
  xlab("observed values of Y") +
  ## titles are good too...
  ggtitle("Density Histogram of R.V. Y") +
  ## let's get rid of that annoying gray background
  theme_bw()
p2

# simulate 100 draws from a uniform(0, 1) distribution
Y_unif <- runif(n = 100, min = 0, max = 1)

# simulate 100 draws from a bernoulli(0.5) distribution
# recall that a bernoulli(p) distribution is just a binomial(n = 1, p)
Y_bern <- rbinom(n = 100, size = 1, prob = 0.5)

# simulate 100 draws from a gamma(1, 2) distribution
Y_gamma <- rgamma(n = 100, shape = 1, scale = 2)

# let's skim these
y_rv <- as_tibble(list(unif = Y_unif, bern = Y_bern, gamma = Y_gamma))
skim(y_rv)

# define a function that takes as input n (a numeric)
make_data_O <- function(n) {
  W <- runif(n, min = 0, max = 1)
  # plogis is the cdf of the logistic distribution and
  # plogis(x) = expit(x) = exp(x)/(1 + exp(x))
  A <- rbinom(n, size = 1, prob = plogis(-1 + W))
  Y <- rnorm(n, mean = W + A, sd = sqrt(1/2))
  # return a tibble (a data.frame) with named columns W, A, Y
  return(as_tibble(list(W = W, A = A, Y = Y)))
}

# make a data set with 100 observations
data_O_obs <- make_data_O(n = 100)

# confirm that make_data_O() returned a tibble object
class(data_O_obs)

# let's skim over the observed data
skim(data_O_obs)

# what happens if we pass in something that isn't a number?
make_data_O(n = "something crazy")

# define a function that takes as input n (a numeric)
make_sane_data_O <- function(n) {
  # check whether n is numeric before trying to simulate
  if (!is.numeric(n)) {
    stop("n is not numeric. Can't simulate data.")
  }
  W <- runif(n, min = 0, max = 1)
  # plogis is the cdf of the logistic distribution and
  # plogis(x) = expit(x) = exp(x)/(1 + exp(x))
  A <- rbinom(n, size = 1, prob = plogis(-1 + W))
  Y <- rnorm(n, mean = W + A, sd = sqrt(1/2))
  # return a tibble (a data.frame) with named columns W, A, Y
  return(as_tibble(list(W = W, A = A, Y = Y)))
}

make_sane_data_O(n = "something crazy")

# for logistic regression
binomial()
# for linear regression
gaussian()
# for poisson regression
poisson()
# for Gamma regression (make sure G is capital!)
Gamma()
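## an optional sanity check (a quick sketch, assuming the packages above are
## loaded): a family object is just a list of functions that glm() uses under
## the hood; for example, binomial()'s inverse link should match plogis(),
## i.e., the expit function we used above ('fam' is a name we introduce here)
fam <- binomial()
fam$link ## "logit"
all.equal(fam$linkinv(0.5), plogis(0.5)) ## expect TRUE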
# we'll use our tibble data_O_obs from before to fit the
# linear regression Y = \beta_0 + \beta_1 A + \beta_2 W
# first we specify a formula and a data.frame object
mod_1 <- data_O_obs %>%
  glm(formula = "Y ~ A + W", data = ., family = gaussian())

# check the class
class(mod_1)

# summarize the results
summary(mod_1)

# define a function that takes as input n (a numeric)
# and returns n copies of O = (W, A, Y) where Y \in \{0, 1\}
make_binary_O <- function(n) {
  W <- runif(n, min = 0, max = 1)
  # plogis is the cdf of the logistic distribution and
  # plogis(x) = expit(x) = exp(x)/(1 + exp(x))
  A <- rbinom(n, size = 1, prob = plogis(-1 + W))
  # now Y is binary
  Y <- rbinom(n, size = 1, prob = plogis(-1 + W + A))
  # return a tibble (a data.frame) with named columns W, A, Y
  return(as_tibble(list(W = W, A = A, Y = Y)))
}

# generate the data and fit a logistic regression
binary_O <- make_binary_O(n = 100)

# remember, we need family = binomial for logistic regression
mod_2 <- binary_O %>%
  glm("Y ~ A + W", data = ., family = binomial())

# summarize the model output
summary(mod_2)

objects(mod_2) ## looks like we might want "coefficients"
mod_2$coefficients

# a function that takes as input 'fm', a glm object, and returns whether
# the null hypothesis that the coefficient associated with A equals zero
# is rejected at the 0.05 level
is_null_rejected <- function(fm) {
  sum_fm <- summary(fm)
  p_val <- sum_fm$coefficients["A", "Pr(>|z|)"]
  return(p_val < 0.05)
}

# a function that takes as input 'n', a scalar, simulates a data set
# using make_binary_O, fits a main-terms glm, and determines whether
# the null hypothesis for the coefficient associated with A is rejected
# at the 0.05 level
run_one_sim <- function(n) {
  # make data set
  dat <- make_binary_O(n = n)
  # fit glm
  fm <- glm("Y ~ A + W", data = dat, family = binomial())
  # get hypothesis test result
  out <- is_null_rejected(fm)
  # return
  return(out)
}

# try it out
run_one_sim(n = 100)

# number of simulations; in general more is better, to decrease the
# Monte Carlo error in your answer (i.e., error due to the random seed used)
n_sim <- 100

# let's set a seed so the results are reproducible
set.seed(1234)

## get results using a for() loop
# create an empty vector of length n_sim that results will go into
results_vector <- rep(NA, n_sim)
for (i in seq_len(n_sim)) {
  results_vector[i] <- run_one_sim(n = 100)
}

# check out the first and last results
head(results_vector)
tail(results_vector)

# power is the proportion of simulations in which you reject the null
cat("The power of the test is", mean(results_vector))

# again set a seed so the results are reproducible
set.seed(1234)

# replicate
results_rep <- replicate(n = n_sim, expr = run_one_sim(n = 100))
cat("The power of the test is", mean(results_rep))
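## an optional sketch of quantifying the Monte Carlo error mentioned above:
## the estimated power is a sample proportion, so its standard error is
## roughly sqrt(p * (1 - p) / n_sim) -- a quick way to judge whether n_sim
## is large enough ('p_hat' and 'mc_se' are names we introduce here)
p_hat <- mean(results_rep)
mc_se <- sqrt(p_hat * (1 - p_hat) / n_sim)
cat("Estimated power:", p_hat, "with Monte Carlo SE of about", round(mc_se, 3))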