#!/usr/bin/env python
# coding: utf-8

# # Simulated clinical trial
#
# Simulated randomized clinical trials (RCTs) are useful because we know the
# true treatment effects being estimated. Consider a two-treatment (A, B) RCT
# for hypertension where covariate adjustment is used for baseline systolic
# blood pressure (SBP) and where there are two outcomes: (1) incidence of
# death or stroke (DS) within one year and (2) systolic blood pressure at one
# year after randomization. Even though time to DS would be a preferred
# outcome (and would handle censoring), we ignore the timing of events for
# this example and use a binary logistic model for DS. SBP is assumed to
# follow a normal distribution with constant SD $\sigma=7$ given baseline SBP.
# The true B:A treatment effect is assumed to be a mean 7 mmHg drop in SBP
# with B. We assume a correlation between SBP reduction and incidence of DS by
# forming a population logistic model for DS in which baseline and 1y SBP each
# have a regression coefficient of 0.05 for predicting DS and treatment has a
# coefficient of log(0.8). To estimate the true effect of treatment on DS not
# adjusted for follow-up SBP, we first simulate a trial with $n=40000$.
#
# The regression coefficient for treatment in the proper outcome model that
# did not adjust for 1y SBP was `round(btrt, 4)` (an inline expression
# evaluated from that large preliminary simulation), which corresponds to a
# B:A odds ratio of `round(exp(btrt), 4)`; we take this as the true treatment
# effect on the binary outcome.
#
# The trial could easily be run sequentially, but we treat the sample size as
# fixed at $n=1500$ and simulate the trial data as such.
# In[1]:

# Notebook-export boilerplate: render matplotlib figures inline.
get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pystan
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import logistic

np.random.seed(1)  # fixed seed so the simulated trial is reproducible


# In[10]:

# Inverse logit (expit): map a log-odds value to a probability in (0, 1).
invlogit = lambda x: 1 / (1 + np.exp(-x))


# In[11]:

# Treatment arm codes: A = control, B = active treatment.
A, B = 0, 1

def sim(n):
    """Simulate one two-arm trial of size ``n`` (n/2 patients per arm).

    Returns a DataFrame with columns:
      trt  -- treatment indicator (1 = B for the first n/2 rows, 0 = A)
      sbp0 -- baseline systolic blood pressure, Normal(140, 7)
      sbp  -- 1-year SBP: baseline - 5, a further 7 mmHg drop under B,
              plus Normal(0, 5) noise.
              NOTE(review): the markdown header states the residual SD is 7,
              but scale=5 is used here -- confirm which is intended.
      ds   -- binary death/stroke outcome drawn from a population logistic
              model in baseline SBP, 1y SBP and treatment.
    """
    trt = np.r_[np.ones(int(n/2)), np.zeros(int(n/2))]
    sbp0 = np.random.normal(140, 7, size=n)
    sbp = sbp0 - 5 - 7*(trt==B) + np.random.normal(scale=5, size=n)
    # Population log-odds of death/stroke.
    # NOTE(review): the header says the treatment coefficient is log(0.8)
    # (protective, OR 0.8), but -np.log(0.8) here is +0.223 -- verify the
    # intended sign before relying on the "true" DS effect.
    logit = -2.6 - np.log(0.8)*(trt==B) + 0.05*(sbp0 - 140) + 0.05*(sbp - 130)
    # Bernoulli draw. (The original indexed np.arange(2) with this 0/1
    # array, which is an identity operation; the direct cast is equivalent.)
    ds = (np.random.random(n) <= invlogit(logit)).astype(int)
    df = pd.DataFrame({'trt': trt, 'sbp0': sbp0, 'sbp': sbp, 'ds': ds})
    return df


# In[12]:

n = 1500
data = sim(n)
# Baseline vs 1-year SBP coloured by arm; the B cloud sits ~7 mmHg lower.
data.plot.scatter('sbp0', 'sbp', c='trt', cmap='plasma', alpha=0.4);


# In[13]:

# Frequentist check: linear model for 1y SBP adjusted for baseline SBP.
fit1 = smf.ols(formula='sbp ~ sbp0 + trt', data=data).fit()
fit1.summary()


# In[14]:

# Frequentist check: logistic model for DS. Because this adjusts for 1y SBP,
# the trt coefficient is the direct (not total) treatment effect on DS.
fit2 = smf.glm(formula='ds ~ sbp0 + sbp + trt', data=data,
               family=sm.families.Binomial()).fit()
fit2.summary()


# Specify Stan model

# In[15]:

# Joint Bayesian model: a Gaussian regression for 1y SBP (y1) and a logistic
# regression for DS (y2). The two treatment effects beta[1] (SBP) and
# beta[2] (DS log-odds) share a bivariate-normal prior with fixed scales
# sigma_b and an LKJ(1) prior on their correlation, so evidence about one
# endpoint informs the other.
model = """
data {
    int n;
    vector[n] x;
    real y1[n];
    int y2[n];
    vector[n] treat;
    vector[2] sigma_b;
    vector[2] Zero;
}
parameters {
    vector[2] alpha;
    vector[2] beta;
    vector[2] mu;
    real sigma;
    corr_matrix[2] Omega_b;
}
transformed parameters {
    vector[n] theta1;
    vector[n] theta2;
    cov_matrix[2] Sigma;
    Sigma <- quad_form_diag(Omega_b, sigma_b);
    theta1 <- mu[1] + alpha[1]*x + beta[1]*treat;
    theta2 <- mu[2] + alpha[2]*x + beta[2]*treat;
}
model {
    beta ~ multi_normal(Zero, Sigma);
    Omega_b ~ lkj_corr(1);
    sigma ~ cauchy(0, 5);
    y1 ~ normal(theta1, sigma);
    y2 ~ bernoulli_logit(theta2);
}
generated quantities {
    real A;
    real B;
    real A_or_B;
    real A_and_B;
    A <- if_else(beta[1] < -7, 1, 0);
    B <- if_else(exp(beta[2]) < 0.8, 1, 0);
    A_or_B <- if_else(A || B, 1, 0);
    A_and_B <- if_else(A && B, 1, 0);
}
"""


# In[16]:

# Compile and sample. sigma_b are the fixed prior SDs of the two treatment
# effects; Zero is the prior mean vector.
fit = pystan.stan(model_code=model,
                  data=dict(x=data.sbp0, y1=data.sbp, y2=data.ds,
                            treat=data.trt, n=n, sigma_b=(2, 1.5),
                            Zero=(0, 0)))


# In[17]:

# Posterior means of the two treatment effects (SBP shift, DS log-odds).
fit['beta'].mean(0)


# In[18]:

# Posterior mean B:A odds ratio for the binary DS outcome.
np.exp(fit['beta'][:, 1]).mean()


# In[19]:

fit.plot(pars=['beta']);


# Probabilities of A OR B and A AND B:

# In[20]:

sample = fit.extract(pars=['A_or_B', 'A_and_B'])


# In[21]:

# Posterior probabilities that the SBP drop exceeds 7 mmHg (A) and/or the
# DS odds ratio is below 0.8 (B), singly and jointly.
[sample[s].mean(0) for s in sample]


# Correlation between betas

# In[22]:

fit['Omega_b'].mean(0)