#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns sns.set() rseeds = [20090425, 19700903] # In[2]: data_file = 'data/Tonsillectomy_KQ5_Bleeding_Data_07202016.xlsx' raw_data = (pd.read_excel(data_file, sheetname='Bleeding_Dexa vs Placebo_KQ5', na_values=['null', 'ND'])) raw_data.shape # In[3]: raw_data.head() # In[4]: raw_data.REFID.unique() # In[5]: high_ROB = 6586, 12791 # In[6]: raw_data.New_Bleeding_specify.value_counts().shape # In[7]: raw_data.Intervention_category.value_counts() # In[19]: from pymc3 import Model, sample, traceplot, forestplot, summary, waic from pymc3 import Normal, Binomial, invlogit, HalfCauchy, Deterministic, exp def create_model(_data, bleeding_specific=False): data = _data.copy() unique_studies = data.REFID.unique() n_studies = len(unique_studies) bleeding = data.New_Bleeding_specify.unique() n_bleeding = len(bleeding) data['study_id'] = data.REFID.replace(dict(zip(unique_studies, range(n_studies)))) data['bleeding_type'] = (data.New_Bleeding_specify .replace(dict(zip(bleeding, range(n_bleeding))))) data['treatment'] = (data.Intervention_category=='perioperative steroid').astype(int) b, t, i, n, y = data[['bleeding_type', 'treatment', 'study_id', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count']].T.values with Model() as model: # Study random effect σ = HalfCauchy('σ', 5) ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies) # Bleeding type means μ = Normal('μ', 0, sd=100, shape=n_bleeding) if bleeding_specific: # Perioperative steroid effect δ = Normal('δ', 0, sd=100, shape=n_bleeding) θ = invlogit(μ[b] + δ[b]*t + ϵ[i]) else: # Perioperative steroid effect δ = Normal('δ', 0, sd=100) θ = invlogit(μ[b] + δ*t + ϵ[i]) data_likelhood = Binomial('data_likelihood', n=n, p=θ, observed=y) odds = Deterministic('odds', exp(δ)) return model # ### Pooled effect size, all studies # In[25]: pooled_model = create_model(raw_data) # In[26]: with pooled_model: pooled_trace = sample(2000, njobs=2, random_seed=rseeds) # In[28]: traceplot(pooled_trace, varnames=['odds']) # In[29]: summary(pooled_trace, varnames=['odds']) # In[30]: waic(pooled_trace, model=pooled_model) # ### Pooled effect size, exclude high ROB studies # In[9]: low_rob_model = create_model(raw_data[~raw_data.REFID.isin(high_ROB)]) # In[10]: with low_rob_model: low_rob_trace = sample(2000, njobs=2, random_seed=rseeds) # In[11]: summary(low_rob_trace, varnames=['odds']) # In[20]: waic(low_rob_trace, model=low_rob_model) # ## Bleeding-specific effects, all studies # In[13]: full_model = create_model(raw_data, bleeding_specific=True) # In[14]: with full_model: full_trace = sample(2000, njobs=2, random_seed=rseeds) # In[18]: forestplot(full_trace, varnames=['odds'], ylabels=raw_data.New_Bleeding_specify.unique()) # In[16]: summary(full_trace, varnames=['odds']) # In[21]: waic(full_trace, model=full_model) # ## Goodness of fit # # For full model # In[58]: from pymc3 import sample_ppc full_ppc = sample_ppc(full_trace, model=full_model, samples=500) # In[59]: bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)] sim_days = full_ppc['data_likelihood'] sim_days.shape # In[64]: (sim_days < raw_data.Outcome_count.values).mean(0) # The fit of this model is very poor, as evidenced by the zeros. # ## Mixture model # In[68]: from pymc3 import Model, sample, traceplot, forestplot, summary, waic, DensityDist from pymc3 import Normal, Binomial, invlogit, HalfCauchy, Deterministic, exp, Beta, log import theano.tensor as tt def create_zib_model(_data, bleeding_specific=False): data = _data.copy() unique_studies = data.REFID.unique() n_studies = len(unique_studies) bleeding = data.New_Bleeding_specify.unique() n_bleeding = len(bleeding) data['study_id'] = data.REFID.replace(dict(zip(unique_studies, range(n_studies)))) data['bleeding_type'] = (data.New_Bleeding_specify .replace(dict(zip(bleeding, range(n_bleeding))))) data['treatment'] = (data.Intervention_category=='perioperative steroid').astype(int) b, t, i, n, y = data[['bleeding_type', 'treatment', 'study_id', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count']].T.values with Model() as model: # Probability of no bleeding ψ = Beta('ψ', 1, 1) # Study random effect σ = HalfCauchy('σ', 5) ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies) # Bleeding type means μ = Normal('μ', 0, sd=100, shape=n_bleeding) if bleeding_specific: # Perioperative steroid effect δ = Normal('δ', 0, sd=100, shape=n_bleeding) θ = invlogit(μ[b] + δ[b]*t + ϵ[i]) else: # Perioperative steroid effect δ = Normal('δ', 0, sd=100) θ = invlogit(μ[b] + δ*t + ϵ[i]) def zero_inflated_binomial(value): return tt.switch(value>0, log(ψ) + Binomial.dist(n=n, p=θ).logp(value), log((1-ψ)) * (1-θ)**n).sum() data_likelhood = DensityDist('data_likelihood', zero_inflated_binomial, observed=y) odds = Deterministic('odds', exp(δ)) return model # In[69]: full_zib_model = create_zib_model(raw_data, bleeding_specific=True) # In[71]: with full_zib_model: full_zib_trace = sample(2000) # In[72]: traceplot(full_zib_trace, varnames=['ψ']) # In[73]: forestplot(full_zib_trace, varnames=['odds'], ylabels=raw_data.New_Bleeding_specify.unique()) # Note that this model fails when running `njobs`>1 # In[74]: with full_zib_model: full_zib_trace = sample(2000, njobs=2)