This is a modularized version of the rat tumor analysis, factored into reusable functions.
Grab packages
library(ggplot2)
library(plyr)        # loaded before tidyverse so dplyr verbs are not masked
library(tidyverse)
library(rstan)
library(doParallel)
registerDoParallel()                        # parallel backend for plyr's .parallel
rstan_options(auto_write=TRUE)              # cache compiled Stan models
options(mc.cores=parallel::detectCores())   # run Stan chains in parallel
Set options
options(repr.matrix.max.rows = 20)
Load the data:
rat.data = read_csv("rats.csv")
summary(rat.data)
rat.data
In this data set, for each experiment $j$, we have two observations: $c_j$, the number of infected rats, and $n_j$, the total number of rats.
We model the data as having an infection rate $\theta_j$ for each experiment. We want to make inferences about the distribution of $\theta_j$.
In more detail, we use the following hierarchical model: each count is binomial given its experiment's rate, and the rates share a common Beta distribution:
$$c_j \sim \mathrm{Binomial}(n_j, \theta_j), \qquad \theta_j \sim \mathrm{Beta}(\alpha, \beta)$$
For notational simplicity, we collect the hyperparameters as $\phi = (\alpha, \beta)$.
Therefore, we have the goal of computing $P(\phi,\mathbf{\theta}|\mathbf{c})$ given $\mathbf{c}$ and our model / likelihood function $P(\mathbf{c},\mathbf{\theta}|\phi) = \prod_j P(c_j,\theta_j|\phi) = \prod_j P(c_j|\theta_j)P(\theta_j|\phi)$.
Integrating each $\theta_j$ out of the joint density gives the marginal posterior of the hyperparameters, as per Eq. 5.8 in Gelman et al.:
$$P(\alpha, \beta|\mathbf{c}) \propto P(\alpha, \beta) \prod_j \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} \frac{\Gamma(\alpha + c_j)\Gamma(\beta+n_j-c_j)}{\Gamma(\alpha+\beta+n_j)}$$
We will compute this on the log scale.
So we define the log likelihood function for this model. It will take as input $\alpha$ and $\beta$ vectors (which should be the same length), and return the log likelihood of each $(\alpha, \beta)$ value given the $\mathbf{c}$ and $\mathbf{n}$ vectors:
model.loglike = function(alpha, beta, cs, ns) {
    # log of Gamma(a+b) / (Gamma(a) Gamma(b)), shared by every experiment
    base = lgamma(alpha + beta) - lgamma(alpha) - lgamma(beta)
    # per-experiment terms, summed over experiments for each (alpha, beta) pair
    data = mapply(a=alpha, b=beta, FUN=function(a, b) {
        sum(lgamma(a + cs) +
            lgamma(b + ns - cs) -
            lgamma(a + b + ns))
    })
    # the shared factor appears once per experiment; use length(cs) rather than
    # nrow(rat.data) so the function does not depend on a global variable
    length(cs) * base + data
}
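As a quick sanity check (the hyperparameter values here are illustrative, not fitted), we can evaluate the log likelihood at a couple of points; note that it is vectorized over $\alpha$ and $\beta$:
model.loglike(c(1, 2), c(1, 14), rat.data$Infected, rat.data$Total)  # pairs (1, 1) and (2, 14)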
We're also going to use the diffuse prior from Equation 5.9:
$$P(\alpha, \beta) \propto (\alpha + \beta)^{-5/2}$$
We will not worry about the constant, since we care about relative densities.
model.prior = function(alpha, beta) {
    (alpha + beta) ^ (-5 / 2)
}
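A quick property check (with arbitrary illustrative values): the prior depends on $\alpha$ and $\beta$ only through their sum, so any two pairs with the same sum get the same density:
model.prior(1, 3) == model.prior(2, 2)  # TRUE: both are 4^(-5/2)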
We want to be able to compute and reason about the probability density at various $\alpha$ and $\beta$ values. However, raw $\alpha \times \beta$ space is not very convenient, so we will plot instead over $x = \mathrm{log}(\frac{\alpha}{\beta})$ and $y = \mathrm{log}(\alpha + \beta)$, yielding:
$$\begin{align*} x & = \mathrm{log}(\frac{\alpha}{\beta}) \\ y & = \mathrm{log}(\alpha + \beta) \\ \alpha & = e^x \beta \\ \beta & = \frac{e^y}{e^x + 1} \end{align*}$$
par.beta = function(x, y) {
exp(y) / (exp(x) + 1)
}
par.alpha = function(x, y) {
exp(x) * par.beta(x, y)
}
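As a round-trip check (the values 2 and 14 are arbitrary), mapping $(\alpha, \beta)$ to $(x, y)$ and back should recover the original values:
x0 = log(2 / 14)
y0 = log(2 + 14)
par.alpha(x0, y0)  # should be 2
par.beta(x0, y0)   # should be 14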
Given ranges (really, sequences) of $x$ and $y$ values, we want to compute the probability density. We call this xydensity because it is the density at $(x, y)$. It returns the density at each pair $(x, y)$, taking the Cartesian product of the x and y vectors. The counts and totals parameters ($\mathbf{c}$ and $\mathbf{n}$) are also vectors, but the value for each $(x, y)$ point depends on the entire $\mathbf{c}$ and $\mathbf{n}$ vectors.
xydensity = function(x, y, counts, totals, prior=model.prior, loglike=model.loglike) {
    # Cartesian product of the x and y sequences
    expand.grid(x=x, y=y) %>%
        # map back to the natural parameters
        mutate(alpha = par.alpha(x, y),
               beta = par.beta(x, y)) %>%
        mutate(logPrior = log(prior(alpha, beta)),
               logLike = loglike(alpha, beta, counts, totals),
               rawLogPost = logPrior + logLike) %>%
        # the Jacobian of the (alpha, beta) -> (x, y) change of variables is alpha * beta
        mutate(logJacobian = log(alpha) + log(beta),
               logPost = rawLogPost + logJacobian)
}
Let's now try it:
dens.points = xydensity(x=sample(seq(-2.3, -1.3, 0.005), 100),
y=sample(seq(1, 5, 0.005), 100),
counts=rat.data$Infected, totals=rat.data$Total)
dens.points
dim(dens.points)
summary(dens.points %>% select(rawLogPost, logLike, logPost))
ggplot(dens.points) +
aes(x=x, y=y, z=logPost) +
geom_contour(binwidth=0.5) +
xlab("log(a / b)") +
ylab("log(a + b)")
With the model in place, we want to use the computed point densities to estimate posterior expectations.
expvals = function(dens) {
    # shift log posteriors by their max before exponentiating (log-sum-exp trick)
    normPost = dens$logPost - max(dens$logPost)
    # posterior expectations as weighted averages over the grid points
    alpha = sum(dens$alpha * exp(normPost)) / sum(exp(normPost))
    beta = sum(dens$beta * exp(normPost)) / sum(exp(normPost))
    x = log(alpha / beta)
    y = log(alpha + beta)
    # infection rate implied by the expected hyperparameters
    mean = alpha / (alpha + beta)
    data.frame(alpha=alpha, beta=beta, x=x, y=y, mean=mean)
}
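Subtracting the maximum log posterior before exponentiating is the standard log-sum-exp stabilization; a tiny illustration (with made-up values) of why it matters:
lp = c(-1000, -1001)   # exp(lp) would underflow to 0, losing all information
w = exp(lp - max(lp))  # stable: c(1, exp(-1))
w / sum(w)             # normalized weights, approximately c(0.73, 0.27)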
expvals(dens.points)
We want to find the maximum value of the log posterior, so that we can use it later for normalization in the integral. The optim function seems perfect for this!
model.map = optim(c(1, 1), function(pars) {
    alpha = pars[1]
    beta = pars[2]
    # unnormalized log posterior of (alpha, beta)
    log(model.prior(alpha, beta)) + model.loglike(alpha, beta, rat.data$Infected, rat.data$Total)
}, control=list(fnscale=-1))  # fnscale = -1 tells optim to maximize
model.map
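For reference, we can pull out the MAP hyperparameters and the population infection rate they imply (a small illustrative snippet using optim's standard return fields):
model.map$par                          # MAP estimate of (alpha, beta)
model.map$par[1] / sum(model.map$par)  # implied mean rate alpha / (alpha + beta)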
First, we will write a function to compute the probability density of a particular $\theta$, as an integral $P(\theta|\mathbf{c}) = \iint P(\theta|\alpha,\beta)P(\alpha,\beta|\mathbf{c}) \, \mathrm{d}\alpha \, \mathrm{d}\beta$.
This function is vectorized over theta (it will return a vector of densities, one corresponding to each value of theta). The logScale parameter is a normalizing constant used to make the log likelihoods exponentiable within the limits of floating-point accuracy; a value close to the maximum log posterior (such as the MAP value found with optim above) is a good choice.
model.theta.post = function(theta, logScale, counts, totals, prior=model.prior, loglike=model.loglike,
                            parallel=!identical(Sys.info()[["sysname"]], "Windows")) {
    aaply(theta, 1, function(thv) {
        # outer integral over beta
        integrate(function(beta) {
            # integrate() passes a vector of abscissae; evaluate each element
            aaply(beta, 1, function(bv) {
                # inner integral over alpha
                integrate(function(alpha) {
                    ll = loglike(alpha, bv, counts, totals)
                    # rescale before exponentiating to stay within floating point range
                    like = exp(ll - logScale)
                    pth = dbeta(thv, alpha, bv)
                    pth * prior(alpha, bv) * like
                }, 0, Inf, rel.tol = 0.001)$value
            })
        }, 0, Inf, rel.tol = 0.001)$value
    }, .parallel=parallel)
}
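Note that integrate evaluates its integrand on a whole vector of abscissae at once and expects a vector back, which is why each inner integrand is wrapped in aaply. A minimal standalone illustration (the shape values 2 and 14 are arbitrary):
integrate(function(th) dbeta(th, 2, 14), 0, 1)$value  # a density integrates to 1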
What's our maximum log posterior? We'll use that as a starting point for scaling.
model.map$value
Let's compute; we do a second normalization to account for the fact that we are computing on a grid with spacing 0.005, to make the area under the curve sum to 1. This computation takes a little while.
theta.post = data.frame(Theta=seq(0.01, 0.99, 0.005)) %>%
mutate(RawPostDens = model.theta.post(Theta, model.map$value + 1,
rat.data$Infected, rat.data$Total)) %>%
mutate(PostDens = RawPostDens / sum(RawPostDens * 0.005))
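As a sanity check on the normalization, the discrete approximation of the area under the curve should now be 1:
sum(theta.post$PostDens * 0.005)  # should be 1 (exactly, by construction)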
ggplot(theta.post) +
aes(x=Theta, y=PostDens) +
geom_line()
Now we are going to use Stan to conduct an MCMC simulation of the posterior distribution.
writeLines(readLines("ratmodel.stan"))
rat_fit = stan(file="ratmodel.stan", data=list(J=nrow(rat.data), y=rat.data$Infected, n=rat.data$Total),
iter=2000, chains=4, control=list(adapt_delta=0.99))
print(rat_fit)
pairs(rat_fit, pars=c("alpha", "beta", "lp__"))
rat_sim = rstan::extract(rat_fit, permuted=TRUE)
n_sims = length(rat_sim$lp__)
n_sims
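With the simulations in hand, we can check the posterior means of the hyperparameters against the grid-based estimates from expvals above (an illustrative comparison):
mean(rat_sim$alpha)  # compare with expvals(dens.points)$alpha
mean(rat_sim$beta)   # compare with expvals(dens.points)$beta
mean(rat_sim$alpha / (rat_sim$alpha + rat_sim$beta))  # posterior mean infection rate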
theta_sims = tibble(alpha=rat_sim$alpha, beta=rat_sim$beta) %>%
    mutate(Theta=rbeta(n(), alpha, beta))
summary(theta_sims)
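Each simulated Theta is a draw of the infection rate for a hypothetical new experiment, so quantiles of these draws give a posterior interval for a new experiment's rate (an illustrative summary):
quantile(theta_sims$Theta, c(0.025, 0.5, 0.975))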
theta_dens = density(theta_sims$Theta)
ggplot(tibble(x=theta_dens$x, y=theta_dens$y)) +
aes(x, y) +
geom_line()
writeLines(readLines("ratxmodel.stan"))
ratx_fit = stan(file="ratxmodel.stan",
data=list(J=nrow(rat.data), y=rat.data$Infected, n=rat.data$Total),
iter=2000, chains=4, control=list(adapt_delta=0.99))
print(ratx_fit)
ratx_sim = rstan::extract(ratx_fit, permuted=TRUE)
n_sims = length(ratx_sim$lp__)
n_sims
thetax_sims = tibble(alpha=ratx_sim$alpha, beta=ratx_sim$beta) %>%
    mutate(Theta=rbeta(n(), alpha, beta))
summary(thetax_sims)
thetax_dens = density(thetax_sims$Theta)
ggplot(tibble(x=thetax_dens$x, y=thetax_dens$y)) +
aes(x, y) +
geom_line()
posteriors = bind_rows(Integrated=select(theta.post, Theta, Density=PostDens),
                       MCMC=tibble(Theta=theta_dens$x, Density=theta_dens$y),
                       MCMC_RP=tibble(Theta=thetax_dens$x, Density=thetax_dens$y),
                       .id="Method")
ggplot(posteriors) +
aes(x=Theta, y=Density, color=Method) +
geom_line()