#!/usr/bin/env python
# coding: utf-8

# Answer all questions and submit them either as an IPython notebook, LaTeX document, or Markdown document. Each question is worth 25 points.
#
# This homework is due Tuesday, October 25, 2016.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pymc3 as pm
import pylab as plt
import pandas as pd

# Set seed
np.random.seed(10011)

# # Question 1
#
# Epidemiologists are interested in studying the sexual behavior of individuals at risk for HIV infection. Suppose 1500 gay men were surveyed and each was asked how many risky sexual encounters he had in the previous 30 days. Let $n_i$ denote the number of respondents reporting $i$ encounters, for $i = 0, \ldots, 16$. The DataFrame below contains these responses:

# In[30]:

encounters = pd.DataFrame({'ecount': np.arange(17),
                           'freq': [379, 299, 222, 145, 109, 95, 73, 59, 45, 30,
                                    24, 12, 4, 2, 0, 1, 1]})

encounters.freq.plot.bar()

# These data are poorly fitted by a Poisson model. It is more realistic to assume that the respondents comprise three groups. First, there is a group of people who, for whatever reason, report zero risky encounters even if this is not true. Suppose a respondent has probability $\alpha$ of belonging to this group.
#
# With probability $\beta$, a respondent belongs to a second group representing typical behavior. Such people respond truthfully, and their numbers of risky encounters are assumed to follow a $\text{Poisson}(\mu)$ distribution.
#
# Finally, with probability $1 - \alpha - \beta$, a respondent belongs to a high-risk group. Such people respond truthfully, and their numbers of risky encounters are assumed to follow a $\text{Poisson}(\lambda)$ distribution.
#
# The parameters in the model are $\alpha, \beta, \mu$ and $\lambda$. At the $t$th iteration of EM, we use $\theta^{(t)} = (\alpha^{(t)}, \beta^{(t)}, \mu^{(t)}, \lambda^{(t)})$ to denote the current parameter values. The likelihood of the observed data is given by:
#
# $$L(\theta | n_0, \ldots, n_{16}) \propto \prod_{i=0}^{16} \left[ \frac{\pi_i(\theta)}{i!} \right]^{n_i}$$
#
# where $\pi_i(\theta) = \alpha 1_{(i=0)} + \beta \mu^i \exp(-\mu) + (1-\alpha - \beta) \lambda^i \exp(-\lambda)$.
#
# The observed data are in the `encounters` table above; the *complete* data may be construed to be $(n_{z,0}, n_{t,0}, n_{p,0}), \ldots, (n_{z,16}, n_{t,16}, n_{p,16})$, where the subscripts $k = z, t, p$ correspond to the zero, typical and promiscuous groups, respectively. That is, $n_0 = n_{z,0} + n_{t,0} + n_{p,0}$, and so on. Let $N = \sum_{i=0}^{16} n_i = 1500$.
#
# Also define:
#
# $$\begin{align}
# z_0(\theta) &= \frac{\alpha}{\pi_0(\theta)} \\
# t_i(\theta) &= \frac{\beta \mu^i \exp(-\mu)}{\pi_i(\theta)} \\
# p_i(\theta) &= \frac{(1-\alpha-\beta)\lambda^i \exp(-\lambda)}{\pi_i(\theta)}
# \end{align}$$
#
# which correspond to the probabilities that respondents with $i$ risky encounters belong to the various groups.
#
# a. Show that the EM algorithm provides the following updates:
#
# $$\begin{align}
# \alpha^{(t+1)} &= \frac{n_0 z_0(\theta^{(t)})}{N} \\
# \beta^{(t+1)} &= \sum_i \frac{n_i t_i(\theta^{(t)})}{N} \\
# \mu^{(t+1)} &= \frac{\sum_i i \, n_i t_i(\theta^{(t)})}{\sum_i n_i t_i(\theta^{(t)})} \\
# \lambda^{(t+1)} &= \frac{\sum_i i \, n_i p_i(\theta^{(t)})}{\sum_i n_i p_i(\theta^{(t)})}
# \end{align}$$
#
# b. Estimate the parameters of the model using the observed data.

# ### Part a
#
# Recall that the E step involves taking the expected values of the latent variables, given the current parameter values.
# In our case, what we need are the expected numbers of each behavior type at each encounter count. For example, the expected number of "typical" behavior individuals among those reporting 4 encounters is:
#
# $$n_{t4} = n_4 t_4(\theta)$$
#
# We are given the probabilities $z_i(\theta), t_i(\theta), p_i(\theta)$ above, so this is easy:
#
# $$\begin{align}
# n_{z0} &= n_0 z_0(\theta) \\
# n_{ti} &= n_i t_i(\theta) \\
# n_{pi} &= n_i p_i(\theta)
# \end{align}$$
#
# And so, $n_z = n_{z0}$, $n_t = \sum_i n_{ti}$, and $n_p = \sum_i n_{pi}$.
#
# Since the mixture proportions $\alpha$ and $\beta$ are multinomial, their maximum likelihood estimates are known ($\hat{p} = x/N$), so:
#
# $$\alpha^{(t+1)} = \frac{n_{z0}}{N} = \frac{n_0 z_0(\theta^{(t)})}{N}$$
#
# as shown. Similarly, for $\beta$:
#
# $$\beta^{(t+1)} = \frac{n_t}{N} = \frac{\sum_i n_{ti}}{N} = \frac{\sum_i n_i t_i(\theta^{(t)})}{N}$$
#
# The last two updates are Poisson means. The maximum likelihood estimate of a Poisson parameter follows from the log-likelihood:
#
# $$\begin{align}
# \ell(\lambda) &= \log \prod_{i=1}^{n} f(x_i | \lambda) \\
# &= \sum_i \log \left[ \frac{e^{-\lambda} \lambda^{x_i}}{x_i!} \right] \\
# &= -n\lambda + \log(\lambda)\sum_i x_i - \sum_i \log(x_i!)
# \end{align}$$
#
# Setting the derivative to zero,
#
# $$\frac{d\ell(\lambda)}{d\lambda} = -n + \frac{\sum_i x_i}{\lambda} = 0,$$
#
# gives $\hat{\lambda} = \bar{x}$, the sample mean. In the complete-data problem, each count contributes to a Poisson group only in proportion to its expected membership, so the M-step uses the weighted mean of the data for each of the two Poisson populations:
#
# $$\bar{x}_j = \frac{\sum_i y_{ij} \, x_i}{\sum_i y_{ij}}$$
#
# where $x_i = i$ is the encounter count, and for $\mu$, $y_{ij} = n_i t_i(\theta)$ while for $\lambda$, $y_{ij} = n_i p_i(\theta)$. Plugging these in gives the stated updates.

# Recall the definition of $\pi_i(\theta)$, which we implement first:
#
# $$\pi_i(\theta) = \alpha 1_{(i=0)} + \beta \mu^i \exp(-\mu) + (1-\alpha - \beta) \lambda^i \exp(-\lambda)$$

# ### Part b

# In[24]:

def calc_pi(θ, i):
    α, β, μ, λ = θ
    return α*(i==0) + β*(μ**i)*np.exp(-μ) + (1-α-β)*(λ**i)*np.exp(-λ)

# In[26]:

def e_step(x, θ):
    # Expected group membership probabilities for each encounter count
    α, β, μ, λ = θ
    z = α/calc_pi(θ, 0)
    t = [β*(μ**i)*np.exp(-μ)/calc_pi(θ, i) for i in x.ecount]
    p = [(1-α-β)*(λ**i)*np.exp(-λ)/calc_pi(θ, i) for i in x.ecount]
    return z, t, p

# In[31]:

def m_step(x, z, t, p):

    i = x.ecount.values
    n = x.freq.values
    N = n.sum()

    α = n[0]*z/N
    β = (n*t/N).sum()
    μ = (i*n*t).sum() / (n*t).sum()
    λ = (i*n*p).sum() / (n*p).sum()

    return α, β, μ, λ

# In[49]:

def run_em(theta0, x=encounters):

    # Initialize values
    θ = theta0
    # Stopping criterion
    crit = 1e-5
    # Convergence flag
    converged = False

    # Loop until converged
    while not converged:

        # E-step
        z, t, p = e_step(x, θ)
        # M-step
        θ_new = m_step(x, z, t, p)

        # Check convergence
        converged = np.all(np.abs(np.array(θ_new) - np.array(θ)) < crit)
        θ = θ_new

    return θ

# In[50]:

run_em(theta0=(0.1, 0.1, 2, 3))
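# As an informal check of the fit (not asked for in the question), the converged parameters can be compared with the observed frequencies. Under the fitted mixture, the expected frequency at count $i$ is approximately $N \, \pi_i(\hat{\theta})/i!$, ignoring the truncation at 16 encounters. A minimal sketch, reusing `calc_pi` and `run_em` from above:

# In[ ]:

from scipy.special import factorial

# Expected frequencies under the fitted mixture (informal check only)
θ_hat = run_em(theta0=(0.1, 0.1, 2, 3))
i_vals = encounters.ecount.values
expected = encounters.freq.sum() * calc_pi(θ_hat, i_vals) / factorial(i_vals)

plt.bar(i_vals, encounters.freq, label='observed')
plt.plot(i_vals, expected, 'ro-', label='fitted mixture')
plt.legend();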
# # Question 2
#
# Suppose $y$ has a binomial distribution with parameters $n$ and $p$, and we are interested in the log-odds value $\theta = \log(p/(1 - p))$. Our prior for $\theta$ is that $\theta \sim N(\mu, \sigma^2)$. It follows that the posterior density of $\theta$ is given, up to a proportionality constant, by:
#
# $$\pi(\theta | y) \propto \frac{\exp(y\theta)}{(1 + \exp(\theta))^n} \exp\left[\frac{-(\theta - \mu)^2}{2\sigma^2}\right]$$
#
# For example, suppose we are interested in learning about the probability that a possibly-biased coin lands heads when tossed. *A priori* we believe that the coin is fair, so we assign $\theta$ a $N(0, .25)$ prior. We toss the coin $n = 5$ times and obtain $y = 5$ heads.
#
# 1. Using a normal approximation to the posterior density, compute the probability that the coin is biased toward heads (i.e., that $\theta$ is positive).
# 2. Using the prior density as a proposal density, design a rejection algorithm for sampling from the posterior distribution. Using simulated draws from your algorithm, approximate the probability that the coin is biased toward heads.
# 3. Using the prior density as a proposal density, simulate values from the posterior distribution using the SIR algorithm. Approximate the probability that the coin is biased toward heads.

# In[124]:

n = 5
y = 5

# In[132]:

def calc_posterior(θ, y=y, n=n, μ=0, σ=0.25):
    # Log of the unnormalized posterior density
    return y*θ - n*np.log(1 + np.exp(θ)) - ((θ - μ)**2/(2*σ**2))

invlogit = lambda x: 1/(1 + np.exp(-x))

# ### Part 1

# In[134]:

from scipy.optimize import minimize
from scipy.stats import norm

calc_posterior_min = lambda *args: -calc_posterior(*args)

init_value = -1

opt = minimize(calc_posterior_min, init_value, method='L-BFGS-B')
mode = opt.x
var = opt.hess_inv.todense()

mode, var

# In[151]:

# Probability that theta > 0 under the normal approximation to the posterior
1 - norm.cdf(0, mode[0], np.sqrt(var[0][0]))

# ### Part 2

# In[133]:

prior = norm(0, 0.25)

# In[136]:

def calc_diff(theta, mu=0, sigma=0.25):
    # Log ratio of the unnormalized posterior to the proposal (prior) density
    return calc_posterior(theta) - prior.logpdf(theta)

calc_diff_min = lambda *args: -calc_diff(*args)

# In[137]:

# Maximize the log ratio to obtain the rejection envelope constant
opt = minimize(calc_diff_min, -1, method='bfgs')

# In[140]:

c = opt.fun

# In[146]:

draws = 10000

# In[147]:

# Draw samples from g(theta)
theta = prior.rvs(size=draws)

# Calculate probability under g(theta)
gvals = prior.logpdf(theta)

# Calculate probability under f(theta)
fvals = calc_posterior(theta)

# Calculate acceptance probability
p = np.exp(fvals - gvals + c)

sample = theta[np.random.random(draws) < p]

# In[148]:

# Probability the coin is biased toward heads (p > 0.5, i.e. theta > 0)
(invlogit(sample) > 0.5).mean()

# ### Part 3
#
# Calculate importance weights

# In[152]:

w = np.exp(fvals - gvals - max(fvals - gvals))

# In[153]:

p_sir = w/w.sum()

# In[154]:

# Resample with replacement, with probabilities proportional to the importance weights
theta_sir = theta[np.random.choice(range(len(theta)), size=10000, p=p_sir)]

# In[157]:

plt.hist(theta_sir);

# In[158]:

# Probability the coin is biased toward heads (theta > 0)
(theta_sir > 0).mean()
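# As a quick sanity check (not part of the original question), the three estimates of $P(\theta > 0)$ can be placed side by side; a sketch reusing the objects defined above:

# In[ ]:

# Compare the three estimates of P(theta > 0)
normal_approx = 1 - norm.cdf(0, mode[0], np.sqrt(var[0][0]))
rejection = (sample > 0).mean()
sir = (theta_sir > 0).mean()

print('Normal approximation: %.3f' % normal_approx)
print('Rejection sampling:   %.3f' % rejection)
print('SIR:                  %.3f' % sir)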
# # Question 3
#
# The goal of this problem is to investigate the role of the proposal distribution in a Metropolis-Hastings algorithm designed to simulate from the posterior distribution of the mixture parameter $\delta$.
#
# 1. Simulate 200 realizations from the mixture distribution:
# $$y_i \sim \delta N(7, 0.5^2) + (1-\delta) N(10, 0.5^2)$$
# with $\delta = 0.7$. Plot a histogram of these data.
# 2. Implement an ABC procedure to simulate from the posterior distribution of $\delta$, using your data from part (1).
# 3. Implement a random walk M-H algorithm with proposal $\delta^{\prime} = \delta^{(i)} + \epsilon$ with $\epsilon \sim \text{Unif}(-1, 1)$.
# 4. Reparameterize the problem letting $U = \log\left[\frac{\delta}{1 - \delta}\right]$ and $u^{\prime} = u^{(i)} + \epsilon$. Implement a random walk chain in U-space.
# 5. Compare the estimates and convergence behavior of the three algorithms.
#
# In part (1), you are asked to simulate data from a distribution with $\delta$ known. For parts (2)–(4), assume $\delta$ is unknown with prior $\delta \sim \text{Unif}(0, 1)$. For parts (2)–(4), provide an appropriate plot and a table summarizing the output of the algorithm.
#
# To facilitate comparisons, use the same number of iterations, random seed, starting values, and burn-in period for all implementations of the algorithm.

# ### Part 1

# In[3]:

# Sample data
n = 200
delta = 0.7
y = np.where(np.random.random(n) < delta,
             np.random.normal(7, 0.5, n),
             np.random.normal(10, 0.5, n))

# In[4]:

plt.hist(y)

# ### Part 2

# In[5]:

np.mean(y), np.std(y)

# In[6]:

n_samples = 100

# Tolerances for the two summary statistics (mean and standard deviation)
epsilons = [0.05, 0.01]

# List for holding accepted samples
samples = []

while len(samples) < n_samples:

    # Simulate from prior
    d = np.random.random()

    # Simulate data
    y_sim = np.where(np.random.random(n) < d,
                     np.random.normal(7, 0.5, n),
                     np.random.normal(10, 0.5, n))

    # Accept if the simulated summary statistics are close to the observed ones
    if (np.abs(np.mean(y_sim) - np.mean(y)) < epsilons[0]) & (np.abs(np.std(y_sim) - np.std(y)) < epsilons[1]):
        samples.append(d)

# In[7]:

plt.hist(samples)

# ### Part 3

# In[8]:

plt.hist(y)

# There are several ways of solving this; here is the easiest (though not the most general).

# In[9]:

from scipy.stats import norm
dnorm = norm.pdf

def calc_posterior(d):
    # Log posterior of delta (uniform prior) under the mixture likelihood
    return np.log(d*dnorm(y, 7, 0.5) + (1-d)*dnorm(y, 10, 0.5)).sum()

# In[10]:

n_iterations = 10000

# Initialize trace for parameters
trace = np.empty(n_iterations+1)

# Set initial values
trace[0] = 0.5

# Calculate joint posterior for initial values
current_log_prob = calc_posterior(trace[0])

# Initialize acceptance counts
accepted = 0

for i in range(n_iterations):

    if not i%1000:
        print('Iteration %i' % i)

    # Grab current parameter values
    d_current = trace[i]

    # Propose new value for d, resampling until it falls in (0, 1)
    d_prop = d_current + np.random.uniform(-1, 1)
    while (d_prop < 0) or (d_prop > 1):
        d_prop = d_current + np.random.uniform(-1, 1)

    # Calculate log posterior with proposed value
    proposed_log_prob = calc_posterior(d_prop)

    # Log-acceptance rate
    alpha = proposed_log_prob - current_log_prob

    # Test proposed value
    if np.log(np.random.random()) < alpha:
        # Accept
        trace[i+1] = d_prop
        current_log_prob = proposed_log_prob
        accepted += 1
    else:
        # Reject
        trace[i+1] = trace[i]

# In[11]:

plt.hist(trace)

# ### Part 4

# In[12]:

logit = lambda p: np.log(p/(1-p))
invlogit = lambda x: 1./(1 + np.exp(-x))

n_iterations = 10000

# Initialize trace for parameters
trace = np.empty(n_iterations+1)

# Set initial values
trace[0] = 0.5

# Calculate joint posterior for initial values, including the log-Jacobian of the
# logit transformation, log[d(1-d)], since the random walk is on the u = logit(delta) scale
current_log_prob = calc_posterior(trace[0]) + np.log(trace[0]*(1 - trace[0]))

# Initialize acceptance counts
accepted = 0

for i in range(n_iterations):

    if not i%1000:
        print('Iteration %i' % i)

    # Grab current parameter values
    d_current = trace[i]

    # Propose new value via a random walk on the logit scale, then transform back
    d_prop = invlogit(logit(d_current) + np.random.uniform(-1, 1))

    # Calculate log posterior with proposed value, plus the log-Jacobian term
    proposed_log_prob = calc_posterior(d_prop) + np.log(d_prop*(1 - d_prop))

    # Log-acceptance rate
    alpha = proposed_log_prob - current_log_prob

    # Test proposed value
    if np.log(np.random.random()) < alpha:
        # Accept
        trace[i+1] = d_prop
        current_log_prob = proposed_log_prob
        accepted += 1
    else:
        # Reject
        trace[i+1] = trace[i]

# In[13]:

plt.plot(trace)
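# The question also asks for a table summarizing the output of each sampler. A minimal sketch for the chain just generated, assuming a 1,000-iteration burn-in; the same summary could be produced for the $\delta$-space sampler in Part 3 if its trace were stored under a separate name (`trace` is reused above):

# In[ ]:

# Posterior summary after discarding burn-in, plus the empirical acceptance rate
burn = 1000  # assumed burn-in length
print(pd.Series(trace[burn:], name='delta').describe())
print('Acceptance rate: %.3f' % (accepted / n_iterations))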
# # Question 4
#
# Carlin (1992) considers a Bayesian approach to meta-analysis, and includes the following example of 22 trials of beta-blockers to prevent mortality after myocardial infarction. These data are given below.
#
# In one possible random effects model we assume the true baseline mean (on a log-odds scale) $\mu_i$ in trial $i$ is drawn from some population distribution. Let $r^C_i$ denote the number of events in the control group in trial $i$, and $r^T_i$ denote the number of events under active treatment in trial $i$. Our model is:
#
# $$\begin{aligned}
# r^C_i &\sim \text{Binomial}\left(p^C_i, n^C_i\right) \\
# r^T_i &\sim \text{Binomial}\left(p^T_i, n^T_i\right) \\
# \text{logit}\left(p^C_i\right) &= \mu_i \\
# \text{logit}\left(p^T_i\right) &= \mu_i + \delta \\
# \mu_i &\sim \text{Normal}(m, s).
# \end{aligned}$$
#
# In this case, we want to make inferences about the population effect $m$, and the predictive distribution for the effect $\delta_{\text{new}}$ in a new trial.
#
# This particular model uses a random effect for the population mean, and a fixed effect for the treatment effect. There are 3 other models you could fit to represent all possible combinations of fixed or random effects for these two parameters.
#
# Build all 4 models to estimate the treatment effect in PyMC3 and
#
# 1. use convergence diagnostics to check for convergence in each model
# 2. use posterior predictive checks to compare the fit of the models
# 3. use DIC to compare the models as approximations of the true generating model
#
# Which model would you select and why?

# In[57]:

r_t_obs = np.array([3, 7, 5, 102, 28, 4, 98, 60, 25, 138, 64, 45, 9, 57, 25, 33, 28, 8, 6, 32, 27, 22])
n_t_obs = np.array([38, 114, 69, 1533, 355, 59, 945, 632, 278, 1916, 873, 263, 291, 858, 154, 207, 251, 151, 174, 209, 391, 680])
r_c_obs = np.array([3, 14, 11, 127, 27, 6, 152, 48, 37, 188, 52, 47, 16, 45, 31, 38, 12, 6, 3, 40, 43, 39])
n_c_obs = np.array([39, 116, 93, 1520, 365, 52, 939, 471, 282, 1921, 583, 266, 293, 883, 147, 213, 122, 154, 134, 218, 364, 674])
N = len(n_c_obs)

# In[69]:

# Fixed effects for both the baseline mean and the treatment effect
with pm.Model() as fixed:

    delta = pm.Normal('delta', 0, sd=10000)
    mu = pm.Normal('mu', 0, sd=10000)

    theta_t = pm.invlogit(mu + delta)*np.ones(N)
    theta_c = pm.invlogit(mu)*np.ones(N)

    treat = pm.Binomial('treat', n_t_obs, theta_t, observed=r_t_obs)
    control = pm.Binomial('control', n_c_obs, theta_c, observed=r_c_obs)

# In[70]:

with fixed:
    fixed_trace = pm.sample(5000, njobs=2)

# In[71]:

pm.forestplot(fixed_trace[-1000:], varnames=['delta', 'mu'])

# In[72]:

# Random effect for the baseline mean, fixed treatment effect
with pm.Model() as random_mean:

    delta = pm.Normal('delta', 0, sd=10000)

    m_mu = pm.Normal('m_mu', 0, sd=10000)
    sigma_mu = pm.Uniform('sigma_mu', 0, 1000)
    mu = pm.Normal('mu', m_mu, sd=sigma_mu, shape=N)

    theta_t = pm.invlogit(mu + delta)*np.ones(N)
    theta_c = pm.invlogit(mu)*np.ones(N)

    treat = pm.Binomial('treat', n_t_obs, theta_t, observed=r_t_obs)
    control = pm.Binomial('control', n_c_obs, theta_c, observed=r_c_obs)

# In[73]:

with random_mean:
    random_mean_trace = pm.sample(5000, njobs=2)

# In[74]:

pm.forestplot(random_mean_trace[-1000:], varnames=['delta', 'm_mu', 'sigma_mu', 'mu'])

# In[75]:

# Fixed baseline mean, random treatment effect
with pm.Model() as random_treat:

    mu = pm.Normal('mu', 0, sd=10000)

    m_delta = pm.Normal('m_delta', 0, sd=10000)
    sigma_delta = pm.Uniform('sigma_delta', 0, 1000)
    delta = pm.Normal('delta', m_delta, sd=sigma_delta, shape=N)

    theta_t = pm.invlogit(mu + delta)*np.ones(N)
    theta_c = pm.invlogit(mu)*np.ones(N)

    treat = pm.Binomial('treat', n_t_obs, theta_t, observed=r_t_obs)
    control = pm.Binomial('control', n_c_obs, theta_c, observed=r_c_obs)

# In[76]:

with random_treat:
    random_treat_trace = pm.sample(5000, njobs=2)

# In[79]:

pm.forestplot(random_treat_trace[-1000:], varnames=['delta', 'm_delta', 'sigma_delta', 'mu'])
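# The forest plots above give a visual convergence check. Since each model is sampled with `njobs=2` (two chains), Gelman–Rubin statistics can supplement them; a minimal sketch for the models fitted so far, assuming the `pm.gelman_rubin` diagnostic available in this version of PyMC3 (the same call applies to the fourth model below):

# In[ ]:

# Gelman-Rubin potential scale reduction factors; values near 1 suggest convergence
for name, tr in [('fixed', fixed_trace),
                 ('random_mean', random_mean_trace),
                 ('random_treat', random_treat_trace)]:
    print(name, pm.gelman_rubin(tr))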
# In[80]:

# Random effects for both the baseline mean and the treatment effect
with pm.Model() as random:

    m_mu = pm.Normal('m_mu', 0, sd=10000)
    sigma_mu = pm.Uniform('sigma_mu', 0, 1000)
    mu = pm.Normal('mu', m_mu, sd=sigma_mu, shape=N)

    m_delta = pm.Normal('m_delta', 0, sd=10000)
    sigma_delta = pm.Uniform('sigma_delta', 0, 1000)
    delta = pm.Normal('delta', m_delta, sd=sigma_delta, shape=N)

    theta_t = pm.invlogit(mu + delta)*np.ones(N)
    theta_c = pm.invlogit(mu)*np.ones(N)

    treat = pm.Binomial('treat', n_t_obs, theta_t, observed=r_t_obs)
    control = pm.Binomial('control', n_c_obs, theta_c, observed=r_c_obs)

# In[81]:

with random:
    random_trace = pm.sample(5000, njobs=2)

# In[82]:

pm.forestplot(random_trace[-1000:], varnames=['delta', 'm_delta', 'sigma_delta', 'mu', 'm_mu', 'sigma_mu'])

# In[87]:

dic = {}

with random:
    dic['random'] = pm.dic(random_trace)

with fixed:
    dic['fixed'] = pm.dic(fixed_trace)

with random_mean:
    dic['random_mean'] = pm.dic(random_mean_trace)

with random_treat:
    dic['random_treat'] = pm.dic(random_treat_trace)

dic

# This implies that either the fully random model or the random mean model is preferred.
#
# ### Posterior predictive checks

# In[88]:

ppc_random = pm.sample_ppc(random_trace, model=random, samples=500)

# I'm calculating Bayesian p-values here, but this could be done graphically as well.

# In[89]:

from scipy.stats import percentileofscore

# Percentile of each observed count within its posterior predictive distribution
p = [percentileofscore(s, o).round(2) for s, o in zip(ppc_random['treat'].T, r_t_obs)]
p += [percentileofscore(s, o).round(2) for s, o in zip(ppc_random['control'].T, r_c_obs)]

plt.hist(p)
plt.xlabel('percentile')
plt.ylabel('frequency');

# In[90]:

ppc_random_mean = pm.sample_ppc(random_mean_trace, model=random_mean, samples=500)

# In[91]:

p = [percentileofscore(s, o).round(2) for s, o in zip(ppc_random_mean['treat'].T, r_t_obs)]
p += [percentileofscore(s, o).round(2) for s, o in zip(ppc_random_mean['control'].T, r_c_obs)]

plt.hist(p)
plt.xlabel('percentile')
plt.ylabel('frequency');

# There is no evidence of lack of fit in either of the preferred models.
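# As noted above, the posterior predictive check could also be done graphically. A minimal sketch, reusing the `ppc_random` samples generated above: boxplots of the posterior predictive draws for each trial, with the observed treatment-arm event counts overlaid.

# In[ ]:

# Graphical alternative to the Bayesian p-values (treatment arm only, as an illustration)
plt.figure(figsize=(12, 4))
plt.boxplot(ppc_random['treat'], positions=list(range(1, N+1)))
plt.plot(list(range(1, N+1)), r_t_obs, 'ro', label='observed')
plt.xlabel('trial')
plt.ylabel('events (treatment)')
plt.legend();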