#!/usr/bin/env python
# coding: utf-8

# In[20]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

rseed = 20090425


# ## Import and clean data

# In[2]:


data_file = 'data/Bleeding_Dataonly_For Chris_04152016.xlsx'

raw_data = (pd.read_excel(data_file, sheetname='Bleeding_Outcomes_AllKQs',
                         na_values=['null', 'ND'])
            .drop(['Comments', 'Other stats \nName','Other Stats','Results','Comments 2',
                   'Presentation \nlocation','Last Assesment tmpt for the study','Followup duration category'], 
                  axis=1)
            .rename(columns={'Outcome timepoint (when was this outcome measured, e.g., in PACU, 12 months post─op, immediately post─op──would need a row for each outcome at each timepoint of interest)': 'outcome_time',
                             'OUTC_Main_\nCATG':'outcome_cat',
                            "Outc_SUB_\nCATG":'outcome_subcat',
                            'OUTCOME SAMPLE SIZE': 'N',
                             'Outcome\ncount': 'outcome_obs',
                            'Outcome  \n%': 'outcome_pct',
                            "Outcome\n Mean": 'outcome_mean',
                            "Outcome \nSD": 'outcome_sd',
                             'Outcome \n SE': 'outcome_se',
                             'Outcome\n _Q1': 'outcome_q1', 
                             'Outcome \n_Q3': 'outcome_q3',
                            "Outcome \n Median": 'outcome_med',
                            "Outcome \n 95% L": 'outcome_lo_95',
                            "Outcome \n 95% H": 'outcome_hi_95',
                            "Outcome\n  Min": 'outcome_min', 
                             'Outcome \n Max': 'outcome_max'}))

raw_data.head()


# In[3]:


raw_data['Intervention_category'] = raw_data.Intervention_category.str.lower().str.strip()
clean_data = raw_data.replace({'Intervention_category':{'saline':'control',
                        'control (no rx)':'control',
                        'control (no dexamethasone)':'control',
                        'no antibiotics':'control',
                        'no steroid':'control',
                        'placebo':'control'}})


# Counts of bleeding outcomes by major category

# In[4]:


clean_data.New_Bleeding_specify.value_counts()


# Here is the full list of intervention categories. For the purposes of this analysis, I will just use the ones with "total" or "partial" in the name, and take them to be surgical interventions.

# In[5]:


clean_data.Intervention_category.value_counts()


# Here is the surgery filter:

# In[6]:


surgery_data = clean_data[clean_data.Intervention_category.str.contains('total') 
                   | clean_data.Intervention_category.str.contains('partial')].dropna(subset=['outcome_obs']).copy()


# List of interventions after filtering. Note that there were a lot of papers that did not report bleeding outcomes, so the numbers dropped significantly. 

# In[7]:


surgery_data.Intervention_category.value_counts()


# Indicator for partial

# In[8]:


surgery_data['partial'] = surgery_data.Intervention_category.str.contains('partial')


# Generate column for technique

# In[9]:


surgery_data['technique'] = (surgery_data.Intervention_category
        .apply(lambda x: ' '.join(x.split(' ')[1:]))
        .replace({'tonsillectomy - coblation':'coblation'}))

surgery_data = surgery_data[~(surgery_data.technique.str.contains('specified')|
                             surgery_data.technique.str.contains('\+'))]


# In[10]:


surgery_data.technique.value_counts()


# I will drop microdebrider, argon plasma and plasma knife due to lack of information.

# In[11]:


technique_includes = surgery_data.technique.value_counts().index[:-3].tolist()
surgery_data = surgery_data[surgery_data.technique.isin(technique_includes)]


# In[12]:


technique_includes


# Subsets of data according to bleeding outcome

# In[13]:


readmission_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Revisit/Readmission-bleeding']
reoperation_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Reoperation-bleeding']
primary_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Primary bleeding']
secondary_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Secondary bleeding']


# ## Model Specification

# In[14]:


from pymc3 import (NUTS, sample, Model, Deterministic, find_MAP,
                  Binomial, Normal, HalfCauchy, advi,
                  traceplot, summary, forestplot)
import theano.tensor as tt

def tinvlogit(x):
    return tt.exp(x) / (1 + tt.exp(x))


# In[17]:


def specify_model(model, data):
    
    N, REFID, events, partial = data[['N', 'REFID', 'outcome_obs', 'partial']].values.T
    
    refid_list = list(set(REFID))
    study_id = data.REFID.apply(lambda i: refid_list.index(i)).values
    n_studies = len(refid_list)
    
    technique = data.technique.apply(lambda i: technique_includes.index(i)).values
    n_tech = len(technique_includes)
    
    assert not np.isnan(N.astype(int)).any()
    assert not np.isnan(events.astype(int)).any()
    
    with model:
        
        # Mean for surgery interventions
        θ = Normal('θ', 0, sd=5, shape=n_tech, testval=np.ones(n_tech)*-1)
        # Effect of partial intervention
        β = Normal('β', 0, sd=5, testval=0)

        # Study random effect
        σ = HalfCauchy('σ', 5, testval=1)
        ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies, testval=np.zeros(n_studies))

        # Transform to probability scale
        π = tinvlogit(θ[technique] + β*partial.astype(int) + ϵ[study_id])

        # Mean probabilities
        π_total = Deterministic('π_total', tinvlogit(θ))
        π_partial = Deterministic('π_partial', tinvlogit(θ + β))

        # Data likelihood
        obs = Binomial('obs', N.astype(int), π, observed=events.astype(int))

    return model


# In[18]:


reoperation_model = specify_model(Model(), reoperation_bleeding_data)


# Model run using Hamiltonian Monte Carlo

# In[19]:


with reoperation_model:
    reoperation_trace = sample(2000, random_seed=rseed)


# In[32]:


readmission_model = specify_model(Model(), readmission_bleeding_data)


# In[33]:


with readmission_model:
    readmission_trace = sample(2000, random_seed=rseed)


# In[34]:


primary_model = specify_model(Model(), primary_bleeding_data)


# In[35]:


with primary_model:
    primary_trace = sample(2000, random_seed=rseed)


# In[36]:


secondary_model = specify_model(Model(), secondary_bleeding_data)


# In[37]:


with secondary_model:
    secondary_trace = sample(2000, random_seed=rseed)


# ## Reoperation Bleeding Results

# Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal.

# In[37]:


forestplot(reoperation_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, 
           main='Partial removal', xtitle='Probability of re-operation bleeding');


# In[38]:


forestplot(reoperation_trace[1000:], varnames=['π_total'], ylabels=technique_includes, 
           main='Total removal', xtitle='Probability of re-operation bleeding');


# In[39]:


summary(reoperation_trace[1000:], varnames=['π_total', 'π_partial'])


# ## Readmission Bleeding Results
# 
# Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal.

# In[40]:


forestplot(readmission_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, 
           main='Partial removal', xtitle='Probability of re-admission bleeding');


# In[41]:


forestplot(readmission_trace[1000:], varnames=['π_total'], ylabels=technique_includes, 
           main='Total removal', xtitle='Probability of re-admission bleeding');


# In[42]:


summary(readmission_trace[1000:], varnames=['π_total', 'π_partial'])


# ## Primary Bleeding Results
# 
# Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal.

# In[43]:


forestplot(primary_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, 
           main='Partial removal', xtitle='Probability of primary bleeding');


# In[44]:


forestplot(primary_trace[1000:], varnames=['π_total'], ylabels=technique_includes, 
           main='Total removal', xtitle='Probability of primary bleeding');


# In[45]:


summary(primary_trace[1000:], varnames=['π_total', 'π_partial'])


# ## Secondary Bleeding Results
# 
# Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal.

# In[46]:


forestplot(secondary_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, 
           main='Partial removal', xtitle='Probability of secondary bleeding');


# In[47]:


forestplot(secondary_trace[1000:], varnames=['π_total'], ylabels=technique_includes, 
           main='Total removal', xtitle='Probability of secondary bleeding');


# In[48]:


summary(secondary_trace[1000:], varnames=['π_total', 'π_partial'])


# ## Goodness of fit
# 
# Plots of Bayesian p-values. An excess of very large or very small values may suggest lack of fit.

# In[53]:


from pymc3 import sample_ppc

ppc = [sample_ppc(trace, model=model, samples=500) for trace,model in ([reoperation_trace, reoperation_model],
                                                                      [readmission_trace, readmission_model],
                                                                      [primary_trace, primary_model],
                                                                      [secondary_trace, secondary_model])]


# In[94]:


bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)]


# In[95]:


obs_sim = [p['obs'] for p in ppc]
p_values = [bayes_p(simdata, model.obs.tag.test_value) for simdata,model in zip(obs_sim, 
                                        [reoperation_model, readmission_model, primary_model, secondary_model])]  


# In[103]:


dfp = pd.DataFrame(p_values).T
dfp.columns = 'Reoperation', 'Readmission', 'Primary', 'Secondary'
dfp.hist(sharex=True, bins=15);