#!/usr/bin/env python
# coding: utf-8

# Answer all questions and submit them either as an IPython notebook, LaTeX document, or Markdown document. Each question is worth 25 points.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pymc as pm
import pylab as plt
import pandas as pd

# Set seed
np.random.seed(10011)


# ## Question 1
#
# The goal of this problem is to investigate the role of the proposal distribution in a Metropolis-Hastings algorithm designed to simulate from the posterior distribution of the mixture parameter $\delta$.
#
# 1. Simulate 200 realizations from the mixture distribution:
# $$y_i \sim \delta N(7, 0.5^2) + (1-\delta) N(10, 0.5^2)$$
# with $\delta = 0.7$. Plot a histogram of these data.
# 2. Implement an ABC procedure to simulate from the posterior distribution of $\delta$, using your data from part (1).
# 3. Implement a random walk M-H algorithm with proposal $\delta^{\prime} = \delta^{(i)} + \epsilon$, where $\epsilon \sim \text{Unif}(-1, 1)$.
# 4. Reparameterize the problem letting $U = \log\left[\frac{\delta}{1 - \delta}\right]$ and $u^{\prime} = u^{(i)} + \epsilon$. Implement a random walk chain in U-space.
# 5. Compare the estimates and convergence behavior of the three algorithms.
#
# In part (1), you are asked to simulate data from a distribution with $\delta$ known. For parts (2)–(4), assume $\delta$ is unknown with prior $\delta \sim \text{Unif}(0, 1)$. For parts (2)–(4), provide an appropriate plot and a table summarizing the output of the algorithm. To facilitate comparisons, use the same number of iterations, random seed, starting values, and burn-in period for all implementations of the algorithm.

# **Part 1**

# In[2]:

# Sample data: for each observation, draw from N(7, 0.5) with probability delta,
# otherwise from N(10, 0.5)
n = 200
delta = 0.7

y = np.where(np.random.random(n) < delta,
             np.random.normal(7, 0.5, n),
             np.random.normal(10, 0.5, n))


# In[3]:

plt.hist(y)


# **Part 2**

# In[4]:

np.mean(y), np.std(y)


# In[5]:

n_samples = 100

# Tolerances for the two summary statistics (sample mean and standard deviation)
epsilons = [0.05, 0.01]

# List for holding accepted samples
samples = []

while len(samples) < n_samples:

    # Simulate from prior
    d = np.random.random()

    # Simulate data
    y_sim = np.where(np.random.random(n) < d,
                     np.random.normal(7, 0.5, n),
                     np.random.normal(10, 0.5, n))

    # Accept the draw if both summary statistics are within tolerance of the observed values
    if (np.abs(np.mean(y_sim) - np.mean(y)) < epsilons[0]) & (np.abs(np.std(y_sim) - np.std(y)) < epsilons[1]):
        samples.append(d)


# In[6]:

plt.hist(samples)


# **Part 3**

# In[7]:

plt.hist(y)


# There are several ways of solving this; here is the easiest (though not the most general).

# In[8]:

from scipy.stats import norm
dnorm = norm.pdf

def calc_posterior(d):
    # Log-posterior of delta (up to a constant): with a Unif(0, 1) prior,
    # this is just the mixture log-likelihood
    return np.log(d*dnorm(y, 7, 0.5) + (1-d)*dnorm(y, 10, 0.5)).sum()


# In[9]:

n_iterations = 10000

# Initialize trace for parameters
trace = np.empty(n_iterations+1)

# Set initial values
trace[0] = 0.5

# Calculate joint posterior for initial values
current_log_prob = calc_posterior(trace[0])

# Initialize acceptance counts
accepted = 0

for i in range(n_iterations):

    if not i%1000:
        print('Iteration %i' % i)

    # Grab current parameter values
    d_current = trace[i]

    # Propose new value for d, resampling until the proposal lies in (0, 1)
    d_prop = d_current + np.random.uniform(-1, 1)
    while (d_prop < 0) or (d_prop > 1):
        d_prop = d_current + np.random.uniform(-1, 1)

    # Calculate log posterior with proposed value
    proposed_log_prob = calc_posterior(d_prop)

    # Log-acceptance rate
    alpha = proposed_log_prob - current_log_prob

    # Test proposed value
    if np.log(np.random.random()) < alpha:
        # Accept
        trace[i+1] = d_prop
        current_log_prob = proposed_log_prob
        accepted += 1
    else:
        # Reject
        trace[i+1] = trace[i]


# In[10]:

plt.hist(trace)
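# The assignment asks for a table summarizing the output of the sampler. As a minimal sketch, the cell below reports the acceptance rate of the random-walk chain and a pandas description of the retained draws; the burn-in of 1000 iterations is an arbitrary choice made here for illustration.

# In[ ]:

# Summary of the delta-space random-walk chain (sketch; burn-in of 1000 assumed)
burn = 1000
print('Acceptance rate: %.3f' % (accepted / float(n_iterations)))
pd.Series(trace[burn:], name='delta').describe()
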
# **Part 4**

# In[11]:

# Logit and inverse-logit transformations
logit = lambda p: np.log(p/(1-p))
invlogit = lambda x: 1./(1 + np.exp(-x))

n_iterations = 10000

# Initialize trace for parameters
trace = np.empty(n_iterations+1)

# Set initial values
trace[0] = 0.5

# Calculate joint posterior for initial values
current_log_prob = calc_posterior(trace[0])

# Initialize acceptance counts
accepted = 0

for i in range(n_iterations):

    if not i%1000:
        print('Iteration %i' % i)

    # Grab current parameter values
    d_current = trace[i]

    # Propose new value for d via a random walk on u = logit(d)
    d_prop = invlogit(logit(d_current) + np.random.uniform(-1, 1))

    # Calculate log posterior with proposed value
    proposed_log_prob = calc_posterior(d_prop)

    # Log-acceptance rate; the log-Jacobian terms log[d(1-d)] account for the
    # change of variables from delta to u = logit(delta), so that the chain
    # still targets the posterior under the Unif(0, 1) prior on delta
    alpha = (proposed_log_prob + np.log(d_prop*(1-d_prop))) - (current_log_prob + np.log(d_current*(1-d_current)))

    # Test proposed value
    if np.log(np.random.random()) < alpha:
        # Accept
        trace[i+1] = d_prop
        current_log_prob = proposed_log_prob
        accepted += 1
    else:
        # Reject
        trace[i+1] = trace[i]


# In[12]:

plt.plot(trace)
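# **Part 5**

# For part (5), a minimal side-by-side comparison of the estimates is sketched below. Since both random-walk implementations reuse the name `trace`, only the U-space chain is still in memory at this point, so the table compares the ABC draws with the U-space chain; storing each chain under its own name would allow all three samplers to be tabulated together. A burn-in of 1000 iterations is again assumed.

# In[ ]:

# Side-by-side posterior summaries for delta (sketch)
burn = 1000
comparison = pd.DataFrame({'ABC': pd.Series(samples).describe(),
                           'RW M-H (u-space)': pd.Series(trace[burn:]).describe()})
comparison
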
# ## Question 2
#
# Carlin (1992) considers a Bayesian approach to meta-analysis, and includes the following example of 22 trials of beta-blockers to prevent mortality after myocardial infarction. These data are given below.
#
# In one possible random effects model we assume the true baseline mean (on a log-odds scale) $\mu_i$ in trial $i$ is drawn from some population distribution. Let $r^C_i$ denote the number of events in the control group in trial $i$, and $r^T_i$ the number of events under active treatment in trial $i$. Our model is:
#
# $$\begin{aligned}
# r^C_i &\sim \text{Binomial}\left(p^C_i, n^C_i\right) \\
# r^T_i &\sim \text{Binomial}\left(p^T_i, n^T_i\right) \\
# \text{logit}\left(p^C_i\right) &= \mu_i \\
# \text{logit}\left(p^T_i\right) &= \mu_i + \delta \\
# \mu_i &\sim \text{Normal}(m, s).
# \end{aligned}$$
#
# In this case, we want to make inferences about the population effect $m$, and the predictive distribution for the effect $\delta_{\text{new}}$ in a new trial.
#
# This particular model uses a random effect for the population mean, and a fixed effect for the treatment effect. There are 3 other models you could fit to represent all possible combinations of fixed or random effects for these two parameters.
#
# Build all 4 models to estimate the treatment effect in PyMC and
#
# 1. use convergence diagnostics to check for convergence in each model
# 2. use posterior predictive checks to compare the fit of the models
# 3. use DIC to compare the models as approximations of the true generating model
#
# Which model would you select and why?

# In[13]:

r_t_obs = [3, 7, 5, 102, 28, 4, 98, 60, 25, 138, 64, 45, 9, 57, 25, 33, 28, 8, 6, 32, 27, 22]
n_t_obs = [38, 114, 69, 1533, 355, 59, 945, 632, 278, 1916, 873, 263, 291, 858, 154, 207, 251, 151, 174, 209, 391, 680]
r_c_obs = [3, 14, 11, 127, 27, 6, 152, 48, 37, 188, 52, 47, 16, 45, 31, 38, 12, 6, 3, 40, 43, 39]
n_c_obs = [39, 116, 93, 1520, 365, 52, 939, 471, 282, 1921, 583, 266, 293, 883, 147, 213, 122, 154, 134, 218, 364, 674]

N = len(n_c_obs)


# In[14]:

def fixed():
    # Model 1: fixed baseline (mu) and fixed treatment effect (delta)
    delta = pm.Normal('delta', 0, 1e-5, value=0)
    mu = pm.Normal('mu', 0, 1e-5, value=1)

    theta_t = pm.Lambda('theta_t', lambda mu=mu, d=delta: pm.invlogit(mu + d)*np.ones(N))
    theta_c = pm.Lambda('theta_c', lambda mu=mu: pm.invlogit(mu)*np.ones(N))

    treat = pm.Binomial('treat', n_t_obs, theta_t, value=r_t_obs, observed=True)
    control = pm.Binomial('control', n_c_obs, theta_c, value=r_c_obs, observed=True)

    # Unobserved copies for posterior predictive checks
    treat_pred = pm.Binomial('treat_pred', n_t_obs, theta_t, value=r_t_obs)
    control_pred = pm.Binomial('control_pred', n_c_obs, theta_c, value=r_c_obs)

    return locals()


# In[15]:

M_fixed = pm.MCMC(fixed())
M_fixed.sample(10000, 5000)
M_fixed.sample(10000, 5000)


# In[16]:

pm.Matplot.summary_plot([M_fixed.delta, M_fixed.mu])


# In[17]:

def random_mean():
    # Model 2: random baseline (mu_i), fixed treatment effect (delta)
    delta = pm.Normal('delta', 0, 1e-5, value=0)

    m_mu = pm.Normal('m_mu', 0, 1e-5, value=0)
    tau_mu = pm.Gamma('tau_m', 1, 0.01, value=1)
    mu = pm.Normal('mu', m_mu, tau_mu, value=[1]*N)

    theta_t = pm.Lambda('theta_t', lambda mu=mu, d=delta: pm.invlogit(mu + d))
    theta_c = pm.Lambda('theta_c', lambda mu=mu: pm.invlogit(mu))

    treat = pm.Binomial('treat', n_t_obs, theta_t, value=r_t_obs, observed=True)
    control = pm.Binomial('control', n_c_obs, theta_c, value=r_c_obs, observed=True)

    # Unobserved copies for posterior predictive checks
    treat_pred = pm.Binomial('treat_pred', n_t_obs, theta_t, value=r_t_obs)
    control_pred = pm.Binomial('control_pred', n_c_obs, theta_c, value=r_c_obs)

    return locals()


# In[18]:

M_random_mean = pm.MCMC(random_mean())
M_random_mean.sample(10000, 5000)
M_random_mean.sample(10000, 5000)


# In[20]:

pm.Matplot.summary_plot([M_random_mean.delta, M_random_mean.m_mu, M_random_mean.tau_mu, M_random_mean.mu])


# In[21]:

def random_treat():
    # Model 3: fixed baseline (mu), random treatment effect (delta_i)
    mu = pm.Normal('mu', 0, 1e-5, value=1)

    m_delta = pm.Normal('m_delta', 0, 1e-5, value=0.5)
    tau_delta = pm.Gamma('tau_delta', 1, 0.01, value=1)
    delta = pm.Normal('delta', m_delta, tau_delta, size=N)

    theta_t = pm.Lambda('theta_t', lambda mu=mu, d=delta: pm.invlogit(mu + d))
    theta_c = pm.Lambda('theta_c', lambda mu=mu: pm.invlogit(mu)*np.ones(N))

    treat = pm.Binomial('treat', n_t_obs, theta_t, value=r_t_obs, observed=True)
    control = pm.Binomial('control', n_c_obs, theta_c, value=r_c_obs, observed=True)

    # Unobserved copies for posterior predictive checks
    treat_pred = pm.Binomial('treat_pred', n_t_obs, theta_t, value=r_t_obs)
    control_pred = pm.Binomial('control_pred', n_c_obs, theta_c, value=r_c_obs)

    return locals()


# In[22]:

M_random_treat = pm.MCMC(random_treat())
M_random_treat.sample(10000, 5000)
M_random_treat.sample(10000, 5000)


# In[23]:

pm.Matplot.summary_plot([M_random_treat.m_delta, M_random_treat.delta, M_random_treat.tau_delta, M_random_treat.mu])
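# As a numerical convergence check to complement the summary plots, the sketch below runs the Geweke diagnostic (PyMC 2.x's `pm.geweke`, assumed to accept a 1-D trace array) on a scalar treatment-effect node of each model fitted so far; the same call can be applied to the fourth model once it is fit below. As a rough rule of thumb, most z-scores should fall within (-2, 2) if the chain has converged.

# In[ ]:

# Geweke diagnostic (sketch): fraction of z-scores outside (-2, 2) per model
for label, model, node in [('fixed', M_fixed, 'delta'),
                           ('random mean', M_random_mean, 'delta'),
                           ('random treatment', M_random_treat, 'm_delta')]:
    scores = np.array(pm.geweke(model.trace(node)[:]))
    n_extreme = (np.abs(scores[:, 1]) > 2).sum()
    print('%s model, %s: %i of %i Geweke z-scores outside (-2, 2)'
          % (label, node, n_extreme, len(scores)))
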
# In[24]:

def random():
    # Model 4: random baseline (mu_i) and random treatment effect (delta_i)
    m_mu = pm.Normal('m_mu', 0, 1e-5, value=0)
    tau_mu = pm.Gamma('tau_m', 1, 0.01, value=1)
    mu = pm.Normal('mu', m_mu, tau_mu, value=[0]*N)

    m_delta = pm.Normal('m_delta', 0, 1e-5, value=0)
    tau_delta = pm.Gamma('tau_delta', 1, 0.01, value=1)
    delta = pm.Normal('delta', m_delta, tau_delta, value=[0]*N)

    theta_t = pm.Lambda('theta_t', lambda mu=mu, d=delta: pm.invlogit(mu + d))
    theta_c = pm.Lambda('theta_c', lambda mu=mu, d=delta: pm.invlogit(mu))

    treat = pm.Binomial('treat', n_t_obs, theta_t, value=r_t_obs, observed=True)
    control = pm.Binomial('control', n_c_obs, theta_c, value=r_c_obs, observed=True)

    # Unobserved copies for posterior predictive checks
    treat_pred = pm.Binomial('treat_pred', n_t_obs, theta_t, value=r_t_obs)
    control_pred = pm.Binomial('control_pred', n_c_obs, theta_c, value=r_c_obs)

    return locals()


# In[25]:

M_random = pm.MCMC(random())
M_random.sample(10000, 5000)
M_random.sample(10000, 5000)


# In[26]:

pm.Matplot.summary_plot([M_random.m_delta, M_random.delta, M_random.tau_delta, M_random.mu, M_random.m_mu, M_random.tau_mu])


# DIC

# In[28]:

dic = np.array([M_fixed.dic, M_random_mean.dic, M_random_treat.dic, M_random.dic])
dic


# In[31]:

# Convert DIC differences into approximate model weights (analogous to Akaike weights)
delta_dic = dic - dic.min()
np.round(np.exp(-0.5*delta_dic) / np.exp(-0.5*delta_dic).sum(), 3)


# Hence, 85% of the model weight is on the random mean model, while 15% is on the fully random model, so the random mean model would be selected on this basis.
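# For the posterior predictive checks (task 2), each model already carries unobserved `treat_pred` and `control_pred` nodes. One simple check, sketched below, computes per-trial Bayesian p-values $P(r^T_{\text{pred}} \geq r^T_{\text{obs}})$ for the treatment arms from the traced `treat_pred` values; p-values near 0 or 1 flag trials that a model reproduces poorly. This is only one of many possible discrepancy measures.

# In[ ]:

# Posterior predictive check (sketch): per-trial Bayesian p-values for the
# treatment arms, one column per model
ppc = {}
for label, model in [('fixed', M_fixed),
                     ('random mean', M_random_mean),
                     ('random treatment', M_random_treat),
                     ('random', M_random)]:
    pred = model.trace('treat_pred')[:]          # shape: (n samples, 22 trials)
    ppc[label] = (pred >= np.array(r_t_obs)).mean(axis=0)

pd.DataFrame(ppc)
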