#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

rseeds = [20090425, 19700903]


# In[2]:


data_file = 'data/Tonsillectomy_KQ5_Bleeding_Data_07202016.xlsx'

raw_data = (pd.read_excel(data_file, 
              sheetname='Bleeding_Dexa vs Placebo_KQ5', 
              na_values=['null', 'ND']))
raw_data.shape


# In[3]:


raw_data.head()


# In[4]:


raw_data.REFID.unique()


# In[5]:


high_ROB = 6586, 12791


# In[6]:


raw_data.New_Bleeding_specify.value_counts().shape


# In[7]:


raw_data.Intervention_category.value_counts()


# In[19]:


from pymc3 import Model, sample, traceplot, forestplot, summary, waic
from pymc3 import Normal, Binomial, invlogit, HalfCauchy, Deterministic, exp

def create_model(_data, bleeding_specific=False):
    
    data = _data.copy()
    
    unique_studies = data.REFID.unique()
    n_studies = len(unique_studies)
    
    bleeding = data.New_Bleeding_specify.unique()
    n_bleeding = len(bleeding)
    
    data['study_id'] = data.REFID.replace(dict(zip(unique_studies, range(n_studies))))
    data['bleeding_type'] = (data.New_Bleeding_specify
                             .replace(dict(zip(bleeding, range(n_bleeding)))))
    
    data['treatment'] = (data.Intervention_category=='perioperative steroid').astype(int)
    
    b, t, i, n, y = data[['bleeding_type', 'treatment', 
                          'study_id', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count']].T.values
    
    with Model() as model:

        # Study random effect
        σ = HalfCauchy('σ', 5)
        ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies)
        
        # Bleeding type means
        μ = Normal('μ', 0, sd=100, shape=n_bleeding)

        if bleeding_specific:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100, shape=n_bleeding)

            θ = invlogit(μ[b] + δ[b]*t + ϵ[i])
        else:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100)

            θ = invlogit(μ[b] + δ*t + ϵ[i])
        
        data_likelhood = Binomial('data_likelihood', n=n, p=θ, observed=y)
        
        odds = Deterministic('odds', exp(δ))
        
    return model


# ### Pooled effect size, all studies

# In[25]:


pooled_model = create_model(raw_data)


# In[26]:


with pooled_model:
    
    pooled_trace = sample(2000, njobs=2, random_seed=rseeds)


# In[28]:


traceplot(pooled_trace, varnames=['odds'])


# In[29]:


summary(pooled_trace, varnames=['odds'])


# In[30]:


waic(pooled_trace, model=pooled_model)


# ### Pooled effect size, exclude high ROB studies

# In[9]:


low_rob_model = create_model(raw_data[~raw_data.REFID.isin(high_ROB)])


# In[10]:


with low_rob_model:
    
    low_rob_trace = sample(2000, njobs=2, random_seed=rseeds)


# In[11]:


summary(low_rob_trace, varnames=['odds'])


# In[20]:


waic(low_rob_trace, model=low_rob_model)


# ## Bleeding-specific effects, all studies

# In[13]:


full_model = create_model(raw_data, bleeding_specific=True)


# In[14]:


with full_model:
    
    full_trace = sample(2000, njobs=2, random_seed=rseeds)


# In[18]:


forestplot(full_trace, varnames=['odds'], ylabels=raw_data.New_Bleeding_specify.unique())


# In[16]:


summary(full_trace, varnames=['odds'])


# In[21]:


waic(full_trace, model=full_model)


# ## Goodness of fit
# 
# For full model

# In[58]:


from pymc3 import sample_ppc

full_ppc = sample_ppc(full_trace, model=full_model, samples=500)


# In[59]:


bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)]

sim_days = full_ppc['data_likelihood']
sim_days.shape


# In[64]:


(sim_days < raw_data.Outcome_count.values).mean(0)


# The fit of this model is very poor, as evidenced by the zeros.

# ## Mixture model

# In[68]:


from pymc3 import Model, sample, traceplot, forestplot, summary, waic, DensityDist
from pymc3 import Normal, Binomial, invlogit, HalfCauchy, Deterministic, exp, Beta, log
import theano.tensor as tt

def create_zib_model(_data, bleeding_specific=False):
    
    data = _data.copy()
    
    unique_studies = data.REFID.unique()
    n_studies = len(unique_studies)
    
    bleeding = data.New_Bleeding_specify.unique()
    n_bleeding = len(bleeding)
    
    data['study_id'] = data.REFID.replace(dict(zip(unique_studies, range(n_studies))))
    data['bleeding_type'] = (data.New_Bleeding_specify
                             .replace(dict(zip(bleeding, range(n_bleeding)))))
    
    data['treatment'] = (data.Intervention_category=='perioperative steroid').astype(int)
    
    b, t, i, n, y = data[['bleeding_type', 'treatment', 
                          'study_id', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count']].T.values
    
    with Model() as model:
        
        # Probability of no bleeding
        ψ = Beta('ψ', 1, 1)

        # Study random effect
        σ = HalfCauchy('σ', 5)
        ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies)
        
        # Bleeding type means
        μ = Normal('μ', 0, sd=100, shape=n_bleeding)

        if bleeding_specific:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100, shape=n_bleeding)

            θ = invlogit(μ[b] + δ[b]*t + ϵ[i])
        else:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100)

            θ = invlogit(μ[b] + δ*t + ϵ[i])
            
        def zero_inflated_binomial(value):
            return tt.switch(value>0, log(ψ) + Binomial.dist(n=n, p=θ).logp(value),
                                    log((1-ψ)) * (1-θ)**n).sum()
        
        data_likelhood = DensityDist('data_likelihood', zero_inflated_binomial, observed=y)
        
        odds = Deterministic('odds', exp(δ))
        
    return model


# In[69]:


full_zib_model = create_zib_model(raw_data, bleeding_specific=True)


# In[71]:


with full_zib_model:
    
    full_zib_trace = sample(2000)


# In[72]:


traceplot(full_zib_trace, varnames=['ψ'])


# In[73]:


forestplot(full_zib_trace, varnames=['odds'], ylabels=raw_data.New_Bleeding_specify.unique())


# Note that this model fails when running `njobs`>1

# In[74]:


with full_zib_model:
    
    full_zib_trace = sample(2000, njobs=2)