options(jupyter.plot_mimetypes = "image/png")
# In R:
# Load necessary libraries and set up multi-core processing for Stan
options(warn=-1, message =-1)
library(dplyr); library(ggplot2); library(rstan); library(reshape2)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
# In R, or you could save the contents of the string in a file with .stan file type
dgp_string <- "
functions {
/**
* Return draws from a linear regression with data matrix X,
* coefficients beta, and student-t noise with degrees of freedom nu
* and scale sigma.
*
* @param X Data matrix (N x P)
* @param beta Coefficient vector (P x 1)
* @param nu Residual distribution degrees of freedom.
* @param sigma Residual distribution scale.
* @return Return an N-vector of draws from the model.
*/
vector dgp_rng(matrix X, vector beta, real nu, real sigma) {
vector[rows(X)] y; // define the output vector to be as long as the number of rows in X
// Now fill it in
for (n in 1:rows(X))
y[n] <- student_t_rng(nu, X[n] * beta, sigma);
return y;
}
}
data {
// If we were estimating a model, we'd define the data inputs here
}
parameters {
// ... and the parameters we want to estimate would go in here
}
model {
// This is where the probability model we want to estimate would go
}
"
# Generate a matrix of random numbers, and values for beta, nu and sigma
set.seed(42) # Set the random number generator seed so that we get the same parameters
N <- 1000 # Number of observations
P <- 10 # Number of covariates
X <- matrix(rnorm(N*P), N, P) # generate an N*P covariate matrix of random data
nu <- 5 # Set degrees of freedom
sigma <- 5 # And scale parameter
beta <- rnorm(10) # Generate some random coefficients that we'll try to recover
# Make sure the first element of beta is positive as in our chosen DGP
beta[1] <- abs(beta[1])
# Compile the script
compiled_function <- stan_model(model_code = dgp_string) # you could use file = "path/to/yourfile.stan" if you have saved it as so
# And make the function available to the user in R
expose_stan_functions(compiled_function)
# Draw a vector of random numbers for known Xs and parameters
y_sim <- dgp_rng(nu = nu, X = X, sigma = sigma, beta = beta)
# Plot the data
data_frame(y_sim = y_sim) %>% # Declare a data frame and pipe it into a ggplot
ggplot(aes( x = y_sim)) + # Where we state the x-axis aesthetic (our simulated values)
geom_histogram(binwidth = 3) # And tell ggplot what sort of chart to build
# In R, or in your .stan file (contents from within the quotes only)
incorrect_model <- "
data {
// In this section, we define the data that must be passed to Stan (from whichever environment you are using)
int N; // number of observations
int P; // number of covariates
matrix[N, P] X; //covariate matrix
vector[N] y; //outcome vector
}
parameters {
// Define the parameters that we will estimate, as well as any restrictions on the parameter values (standard deviations can't be negative...)
vector[P] beta; // the regression coefficients
real sigma; // the residual standard deviation (note that it's restricted to be non-negative)
}
model {
// This is where we write out the probability model, in very similar form to how we would using paper and pen
// Define the priors
beta ~ normal(0, 5); // same prior for all betas; we could define a different one for each, or use a multivariate prior
sigma ~ cauchy(0, 2.5);
// The likelihood
y ~ normal(X*beta, sigma);
}
generated quantities {
// For model comparison, we'll want to keep the likelihood contribution of each point
// We will also generate posterior predictive draws (yhat) for each data point. These will be elaborated on below.
vector[N] log_lik;
vector[N] y_sim;
for(i in 1:N){
log_lik[i] <- normal_log(y[i], X[i,]*beta, sigma);
y_sim[i] <- normal_rng(X[i,]*beta, sigma);
}
}
"
# In R, or in your .stan file (contents from within the quotes only)
correct_model <- "
data {
int N; // number of observations
int P; // number of covariates
matrix[N, P] X; //covariate matrix
vector[N] y; //outcome vector
}
parameters {
// We need to define two betas--the first is the restricted value, the next are the others. We'll join these in the next block
real beta_1;
vector[P-1] beta_2; // the regression coefficients
real sigma; // the residual scale (note that it's restricted to be non-negative)
real nu;
}
transformed parameters {
vector[P] beta;
beta <- append_row(rep_vector(beta_1, 1), beta_2);
}
model {
// Define the priors
beta ~ normal(0, 5); // same prior for all betas; we could define a different one for each, or use a multivariate prior. The first beta will have a prior of the N+(0, 5)
sigma ~ cauchy(0, 2.5);
nu ~ cauchy(7, 5);
// The likelihood
y ~ student_t(nu, X*beta, sigma);
}
generated quantities {
// For model comparison, we'll want to keep the likelihood contribution of each point
vector[N] log_lik;
vector[N] y_sim;
for(i in 1:N){
log_lik[i] <- student_t_log(y[i], nu, X[i,]*beta, sigma);
y_sim[i] <- student_t_rng(nu, X[i,]*beta, sigma);
}
}
"
# In R
# Specify the data list that we will pass to Stan. This gives Stan everything declared in the data{} block.
data_list_2 <- list(X = X, N = N, y = y_sim, P = P)
# Call Stan. You'll need to give it either model_code (like the ones we defined above), a file (.stan file),
# or a fitted Stan object (fit)
# You should also pass Stan a data list, number of cores to estimate on (jupyter only has access to one),
# the number of Markov chains to run (4 by default)
# and number of iterations (2000 by default).
# We use multiple chains to make sure that the posterior distribution that we converge on
# is stable, and not affected by starting values.
# The first time you run the models, they will take some time to compile before sampling.
# On subsequent runs, it will only re-compile if you change the model code.
incorrect_fit <- stan(model_code = incorrect_model, data = data_list_2, cores = 1, chains = 2, iter = 2000)
correct_fit <- stan(model_code = correct_model, data = data_list_2, cores = 1, chains = 2, iter = 2000)
# In R:
print(incorrect_fit, pars = c("beta", "sigma"))
# Notice that we specify which parameters we want; else we'd get values for `log_lik` and `yhat` also
# Some things to note:
# - mean is the mean of the draws for each observation
# - se_mean is the Monte Carlo error (standard error of the Monte Carlo estimate from the true mean)
# - sd is the standard deviation of the parameter's draws
# - the quantiles are self-explanatory
# - n_eff is the effective number of independent draws. If there is serial correlation between sequential draws,
# the draws cannot be considered independent. In Stan, high serial correlation is typically a problem in
# poorly specified models
# - Rhat: this is the Gelman Rubin convergence diagnostic. Values close to 1 indicate that the multiple chains
# that you estimated have converged to the same distribution and are "mixing" well.
# In R
print(correct_fit, pars = c("beta", "sigma", "nu"))
# In R
# Declare a data frame that contains the known parameter names in one column `variable` and their known values
known_parameters <- data_frame(variable = c(paste0("beta[",1:P,"]"),"sigma", "nu"), real_value = c(beta, sigma, nu))
# Extract params as a (draws * number of chains * number of params) array
extract(correct_fit, permuted = F, pars = c("beta", "sigma", "nu")) %>%
# Stack the chains on top of one another and drop the chains label
plyr::adply(2) %>%
dplyr::select(-chains) %>%
# Convert from wide form to long form (stack the columns on one another)
melt() %>%
# Perform a left join with the known parameters
left_join(known_parameters, by = "variable") %>%
# Generate the plot
ggplot(aes(x = value)) +
geom_density(fill = "orange", alpha = 0.5) + # Make it pretty
facet_wrap(~ variable, scales = "free") +
geom_vline(aes(xintercept = real_value), colour = "red") +
ggtitle("Actual parameters and estimates\ncorrectly specified model\n")
extract(incorrect_fit, permuted = F, pars = c("beta", "sigma")) %>%
# Extract params as a (draws * number of chains * number of params) array
plyr::adply(2) %>%
dplyr::select(-chains) %>%
# Stack the chains on top of one another and drop the chains label
melt() %>%
left_join(known_parameters, by = "variable") %>% # Join the known parameter table
# Convert from wide form to long form (stack the columns on one another)
# Write out the plot
ggplot(aes(x = value)) +
geom_density(fill = "orange", alpha = 0.5) + # Make it pretty
facet_wrap(~ variable, scales = "free") + # small sub-plots of each variable
geom_vline(aes(xintercept = real_value), colour = "red") + # red vertical lines for the known parameters
ggtitle("Actual parameters and estimates\nincorrectly specified model\n") # A title
# in R
library(loo) # Load the library
# Extract the log likelihoods of both models. Note that we need to declare log_lik in the generated quantities {} block
llik_incorrect <- extract_log_lik(incorrect_fit, parameter_name = "log_lik")
llik_correct <- extract_log_lik(correct_fit, parameter_name = "log_lik")
# estimate the leave-one-out cross validation error
loo_incorrect <- loo(llik_incorrect)
loo_correct <- loo(llik_correct)
# Print the LOO statistics
print("Incorrect model")
print(loo_incorrect)
sprintf("\n\nCorrect model")
print(loo_correct)
# Print the comparison between the two models
print(compare(loo_incorrect, loo_correct), digits = 2)
# In R
known_y <- data_frame(variable = paste0("y_sim[",1:N,"]"), real_y = y_sim)
# Extract params as a (draws * number of chains * number of params) array
plot_data <- extract(correct_fit, permuted = F, pars = c("y_sim")) %>%
plyr::adply(2) %>%
dplyr::select(-chains) %>%
# Stack the chains on top of one another and drop the chains label
melt() %>%
left_join(known_y, by = "variable") %>% # Join the known parameter table
# Convert from wide form to long form (stack the columns on one another)
# Write out the plot
group_by(variable) %>%
summarise(median = median(value),
lower = quantile(value, 0.025),
upper = quantile(value, 0.975),
actual = first(real_y))
plot_data %>%
ggplot(aes(x = median)) +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "orange", alpha = 0.5) +
geom_line(aes(y = median)) +
geom_point(aes(y = actual)) +
ggtitle("Actual outcomes and 95% posterior predictive interval\n") # A title
plot_data %>% summarize(proportion_within_95pc = mean(actual>=lower & actual<=upper))