In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

rseeds = [20090425, 19700903]  # one random seed per chain (njobs=2)
In [2]:
data_file = 'data/Tonsillectomy_KQ5_Bleeding_Data_07202016.xlsx'

raw_data = (pd.read_excel(data_file, 
              sheetname='Bleeding_Dexa vs Placebo_KQ5', 
              na_values=['null', 'ND']))
raw_data.shape
Out[2]:
(48, 16)
In [3]:
raw_data.head()
Out[3]:
Citation REFID Number_of_Arms Rx_Grouping Group_Desc Intervention_category Dose Route Rx_Durn OUTC_Main_CATG Outc_SUB_CATG New_Bleeding_specify Outcome_timepoint OUTCOME_SAMPLE_SIZE Outcome_count Outcome_%
0 T. Q. Gallagher, C. Hill, S. Ojha, E. Ference,... 1039 2 G1: dexamethasone perioperative steroid 0.5 mg/kg Injection NaN Harms Harms: Post-op bleeding Primary bleeding POD5 154 2 1.0
1 T. Q. Gallagher, C. Hill, S. Ojha, E. Ference,... 1039 2 G2: placebo (saline) placebo 0.5 mg/kg Injection NaN Harms Harms: Post-op bleeding Primary bleeding POD5 151 2 1.0
2 T. Q. Gallagher, C. Hill, S. Ojha, E. Ference,... 1039 2 G1: dexamethasone perioperative steroid 0.5 mg/kg Injection NaN Harms Harms: Post-op bleeding Undefined bleeding POD5 154 15 9.7
3 T. Q. Gallagher, C. Hill, S. Ojha, E. Ference,... 1039 2 G2: placebo (saline) placebo 0.5 mg/kg Injection NaN Harms Harms: Post-op bleeding Undefined bleeding POD5 151 11 7.3
4 T. Q. Gallagher, C. Hill, S. Ojha, E. Ference,... 1039 2 G1: dexamethasone perioperative steroid 0.5 mg/kg Injection NaN Harms Harms: Post-op bleeding Revisit/Readmission-bleeding POD5 154 3 1.9
In [4]:
raw_data.REFID.unique()
Out[4]:
array([ 1039,  1085,  2213,  3031,  3213,  3498,  6452,  6586, 12791])
In [5]:
high_ROB = 6586, 12791  # REFIDs judged to be at high risk of bias
In [6]:
raw_data.New_Bleeding_specify.value_counts().shape
Out[6]:
(5,)
In [7]:
raw_data.Intervention_category.value_counts()
Out[7]:
perioperative steroid    27
placebo                  21
Name: Intervention_category, dtype: int64
In [19]:
from pymc3 import Model, sample, traceplot, forestplot, summary, waic
from pymc3 import Normal, Binomial, invlogit, HalfCauchy, Deterministic, exp

def create_model(_data, bleeding_specific=False):
    
    data = _data.copy()
    
    unique_studies = data.REFID.unique()
    n_studies = len(unique_studies)
    
    bleeding = data.New_Bleeding_specify.unique()
    n_bleeding = len(bleeding)
    
    # Map study REFIDs and bleeding types to consecutive integer indices
    data['study_id'] = data.REFID.replace(dict(zip(unique_studies, range(n_studies))))
    data['bleeding_type'] = (data.New_Bleeding_specify
                             .replace(dict(zip(bleeding, range(n_bleeding)))))
    
    # Indicator for the dexamethasone (perioperative steroid) arms
    data['treatment'] = (data.Intervention_category=='perioperative steroid').astype(int)
    
    # Per-arm bleeding type, treatment indicator, study index, sample size, event count
    b, t, i, n, y = data[['bleeding_type', 'treatment', 
                          'study_id', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count']].T.values
    
    with Model() as model:

        # Study random effect
        σ = HalfCauchy('σ', 5)
        ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies)
        
        # Bleeding type means
        μ = Normal('μ', 0, sd=100, shape=n_bleeding)

        if bleeding_specific:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100, shape=n_bleeding)

            θ = invlogit(μ[b] + δ[b]*t + ϵ[i])
        else:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100)

            θ = invlogit(μ[b] + δ*t + ϵ[i])
        
        data_likelihood = Binomial('data_likelihood', n=n, p=θ, observed=y)
        
        odds = Deterministic('odds', exp(δ))
        
    return model
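
In other words, each arm $j$ contributes a binomial likelihood $y_j \sim \textrm{Binomial}(n_j, \theta_j)$ with

$$\textrm{logit}(\theta_j) = \mu_{b[j]} + \delta\, t_j + \epsilon_{i[j]},$$

where $b[j]$ is the bleeding type, $t_j$ the treatment (perioperative steroid) indicator, $i[j]$ the study, and $\epsilon_i \sim N(0, \sigma^2)$ is a study-level random effect. In the bleeding-specific variant, $\delta$ is indexed by bleeding type. The reported `odds` node is $e^{\delta}$, the odds ratio for steroid relative to placebo.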

Pooled effect size, all studies

In [25]:
pooled_model = create_model(raw_data)
Applied log-transform to σ and added transformed σ_log_ to model.
In [26]:
with pooled_model:
    
    pooled_trace = sample(2000, njobs=2, random_seed=rseeds)
Assigned NUTS to σ_log_
Assigned NUTS to ϵ
Assigned NUTS to μ
Assigned NUTS to δ
 [-----------------100%-----------------] 2000 of 2000 complete in 71.0 sec
In [28]:
traceplot(pooled_trace, varnames=['odds'])
Out[28]:
[Figure: posterior density and trace of the pooled odds ratio]
In [29]:
summary(pooled_trace, varnames=['odds'])
odds:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  1.181            0.285            0.006            [0.680, 1.755]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  0.721          0.982          1.150          1.343          1.820

In [30]:
waic(pooled_trace, model=pooled_model)
Out[30]:
435.35957088771352

Pooled effect size, excluding high-ROB studies

In [9]:
low_rob_model = create_model(raw_data[~raw_data.REFID.isin(high_ROB)])
Applied log-transform to σ and added transformed σ_log_ to model.
In [10]:
with low_rob_model:
    
    low_rob_trace = sample(2000, njobs=2, random_seed=rseeds)
Assigned NUTS to σ_log_
Assigned NUTS to ϵ
Assigned NUTS to μ
Assigned NUTS to δ
 [-----------------100%-----------------] 2000 of 2000 complete in 80.5 sec
In [11]:
summary(low_rob_trace, varnames=['odds'])
odds:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  1.228            0.304            0.006            [0.685, 1.840]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  0.731          1.015          1.196          1.410          1.901

In [20]:
waic(low_rob_trace, model=low_rob_model)
Out[20]:
322.29973709316511

Bleeding-specific effects, all studies

In [13]:
full_model = create_model(raw_data, bleeding_specific=True)
Applied log-transform to σ and added transformed σ_log_ to model.
In [14]:
with full_model:
    
    full_trace = sample(2000, njobs=2, random_seed=rseeds)
Assigned NUTS to σ_log_
Assigned NUTS to ϵ
Assigned NUTS to μ
Assigned NUTS to δ
 [-----------------100%-----------------] 2000 of 2000 complete in 111.4 sec
In [18]:
forestplot(full_trace, varnames=['odds'], ylabels=raw_data.New_Bleeding_specify.unique())
Out[18]:
[Figure: forest plot of the bleeding-type-specific odds ratios]
In [16]:
summary(full_trace, varnames=['odds'])
odds:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.971            1.293            0.035            [0.008, 2.819]
  1.667            0.591            0.009            [0.711, 2.905]
  0.474            0.322            0.006            [0.054, 1.081]
  7.349            13.011           0.521            [0.134, 21.827]
  0.458            0.487            0.009            [0.002, 1.309]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  0.066          0.313          0.606          1.183          3.761
  0.830          1.254          1.552          1.960          3.178
  0.108          0.254          0.399          0.600          1.319
  1.015          2.450          4.179          7.527          32.291
  0.038          0.172          0.324          0.575          1.707

In [21]:
waic(full_trace, model=full_model)
Out[21]:
393.11841730449868
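
Since the pooled and bleeding-specific models were fit to the same 48 rows, their WAIC values can be compared directly; the low-ROB fit uses a subset of the data, so its WAIC is not on the same scale. A minimal sketch of the side-by-side comparison, assuming the traces above are still in memory:

In [ ]:
# Lower WAIC indicates better estimated out-of-sample fit on the same data
print('pooled:           ', waic(pooled_trace, model=pooled_model))
print('bleeding-specific:', waic(full_trace, model=full_model))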

Goodness of fit

For the full (bleeding-specific) model

In [58]:
from pymc3 import sample_ppc

full_ppc = sample_ppc(full_trace, model=full_model, samples=500)
In [59]:
# Bayesian p-value per arm: fraction of posterior predictive counts below the observed count
bayes_p = lambda sim, true: (sim < true).mean(0)

sim_days = full_ppc['data_likelihood']
sim_days.shape
Out[59]:
(500, 48)
In [64]:
bayes_p(sim_days, raw_data.Outcome_count.values)
Out[64]:
array([ 0.564,  0.422,  0.604,  0.67 ,  0.522,  0.312,  0.258,  0.42 ,
        0.   ,  0.   ,  0.588,  0.   ,  0.   ,  0.924,  0.426,  0.022,
        0.972,  0.112,  0.726,  0.208,  0.876,  0.   ,  0.   ,  0.   ,
        0.   ,  0.43 ,  0.   ,  0.506,  0.   ,  0.848,  0.   ,  0.612,
        0.   ,  0.   ,  0.   ,  0.652,  0.   ,  0.78 ,  0.   ,  0.   ,
        0.832,  0.568,  0.   ,  0.   ,  0.   ,  0.876,  0.   ,  0.   ])

The fit of this model is poor: many of the Bayesian p-values are exactly zero, reflecting the large number of arms with zero observed bleeding events, which the plain binomial likelihood does not accommodate well. This motivates a zero-inflated mixture likelihood.
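
A quick tally of the extreme tail areas makes this concrete; a minimal sketch using the `bayes_p` helper defined above:

In [ ]:
# Count how many arms fall in the extreme tails of the posterior predictive distribution
p_vals = bayes_p(sim_days, raw_data.Outcome_count.values)
print('p-value == 0   :', (p_vals == 0).sum(), 'of', p_vals.size)
print('p-value >= 0.95:', (p_vals >= 0.95).sum(), 'of', p_vals.size)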

Mixture model

In [68]:
from pymc3 import Model, sample, traceplot, forestplot, summary, waic, DensityDist
from pymc3 import Normal, Binomial, invlogit, HalfCauchy, Deterministic, exp, Beta, log
import theano.tensor as tt

def create_zib_model(_data, bleeding_specific=False):
    
    data = _data.copy()
    
    unique_studies = data.REFID.unique()
    n_studies = len(unique_studies)
    
    bleeding = data.New_Bleeding_specify.unique()
    n_bleeding = len(bleeding)
    
    data['study_id'] = data.REFID.replace(dict(zip(unique_studies, range(n_studies))))
    data['bleeding_type'] = (data.New_Bleeding_specify
                             .replace(dict(zip(bleeding, range(n_bleeding)))))
    
    data['treatment'] = (data.Intervention_category=='perioperative steroid').astype(int)
    
    b, t, i, n, y = data[['bleeding_type', 'treatment', 
                          'study_id', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count']].T.values
    
    with Model() as model:
        
        # Mixture weight: probability that an arm's count comes from the binomial (bleeding-prone) process
        ψ = Beta('ψ', 1, 1)

        # Study random effect
        σ = HalfCauchy('σ', 5)
        ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies)
        
        # Bleeding type means
        μ = Normal('μ', 0, sd=100, shape=n_bleeding)

        if bleeding_specific:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100, shape=n_bleeding)

            θ = invlogit(μ[b] + δ[b]*t + ϵ[i])
        else:
            # Perioperative steroid effect
            δ = Normal('δ', 0, sd=100)

            θ = invlogit(μ[b] + δ*t + ϵ[i])
            
        def zero_inflated_binomial(value):
            # With probability ψ the count is binomial; otherwise it is a structural zero
            return tt.switch(value > 0,
                             log(ψ) + Binomial.dist(n=n, p=θ).logp(value),
                             log((1 - ψ) + ψ * (1 - θ)**n)).sum()
        
        data_likelihood = DensityDist('data_likelihood', zero_inflated_binomial, observed=y)
        
        odds = Deterministic('odds', exp(δ))
        
    return model
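
The mixture likelihood encoded above is, for each arm $j$,

$$P(y_j = 0) = (1-\psi) + \psi\,(1-\theta_j)^{n_j}, \qquad P(y_j = k) = \psi \binom{n_j}{k} \theta_j^{k} (1-\theta_j)^{n_j - k} \quad (k > 0),$$

so $\psi$ is the probability that an arm's count is generated by the binomial process and $1-\psi$ is the probability of a structural zero.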
In [69]:
full_zib_model = create_zib_model(raw_data, bleeding_specific=True)
Applied logodds-transform to ψ and added transformed ψ_logodds_ to model.
Applied log-transform to σ and added transformed σ_log_ to model.
In [71]:
with full_zib_model:
    
    full_zib_trace = sample(2000)
Assigned NUTS to ψ_logodds_
Assigned NUTS to σ_log_
Assigned NUTS to ϵ
Assigned NUTS to μ
Assigned NUTS to δ
 [-----------------100%-----------------] 2000 of 2000 complete in 174.4 sec
In [72]:
traceplot(full_zib_trace, varnames=['ψ'])
Out[72]:
[Figure: posterior density and trace of ψ]
In [73]:
forestplot(full_zib_trace, varnames=['odds'], ylabels=raw_data.New_Bleeding_specify.unique())
Out[73]:
[Figure: forest plot of the bleeding-type-specific odds ratios under the zero-inflated model]

Note that this model fails when run with njobs>1: joblib must pickle the model for each worker process, and the locally defined zero_inflated_binomial function cannot be pickled.

In [74]:
with full_zib_model:
    
    full_zib_trace = sample(2000, njobs=2)
Assigned NUTS to ψ_logodds_
Assigned NUTS to σ_log_
Assigned NUTS to ϵ
Assigned NUTS to μ
Assigned NUTS to δ
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-74-709c705a780d> in <module>()
      1 with full_zib_model:
      2 
----> 3     full_zib_trace = sample(2000, njobs=2)

/Users/fonnescj/Repositories/pymc3/pymc3/sampling.py in sample(draws, step, start, trace, chain, njobs, tune, progressbar, model, random_seed)
    149         sample_func = _sample
    150 
--> 151     return sample_func(**sample_args)
    152 
    153 

/Users/fonnescj/Repositories/pymc3/pymc3/sampling.py in _mp_sample(**kwargs)
    287                                                     random_seed=rseed[i],
    288                                                     start=start_vals[i],
--> 289                                                     **kwargs) for i in range(njobs))
    290     return merge_traces(traces)
    291 

/Users/fonnescj/anaconda3/lib/python3.5/site-packages/joblib-0.10.0-py3.5.egg/joblib/parallel.py in __call__(self, iterable)
    762                 # consumption.
    763                 self._iterating = False
--> 764             self.retrieve()
    765             # Make sure that we get a last message telling us we are done
    766             elapsed_time = time.time() - self._start_time

/Users/fonnescj/anaconda3/lib/python3.5/site-packages/joblib-0.10.0-py3.5.egg/joblib/parallel.py in retrieve(self)
    713                     ensure_ready = self._managed_backend
    714                     backend.abort_everything(ensure_ready=ensure_ready)
--> 715                 raise exception
    716 
    717     def __call__(self, iterable):

/Users/fonnescj/anaconda3/lib/python3.5/site-packages/joblib-0.10.0-py3.5.egg/joblib/parallel.py in retrieve(self)
    676                 # check if timeout supported in backend future implementation
    677                 if 'timeout' in getfullargspec(job.get).args:
--> 678                     self._output.extend(job.get(timeout=self.timeout))
    679                 else:
    680                     self._output.extend(job.get())

/Users/fonnescj/anaconda3/lib/python3.5/multiprocessing/pool.py in get(self, timeout)
    606             return self._value
    607         else:
--> 608             raise self._value
    609 
    610     def _set(self, i, obj):

/Users/fonnescj/anaconda3/lib/python3.5/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
    383                         break
    384                     try:
--> 385                         put(task)
    386                     except Exception as e:
    387                         job, ind = task[:2]

/Users/fonnescj/anaconda3/lib/python3.5/site-packages/joblib-0.10.0-py3.5.egg/joblib/pool.py in send(obj)
    369             def send(obj):
    370                 buffer = BytesIO()
--> 371                 CustomizablePickler(buffer, self._reducers).dump(obj)
    372                 self._writer.send_bytes(buffer.getvalue())
    373             self._send = send

AttributeError: Can't pickle local object 'create_zib_model.<locals>.zero_inflated_binomial'
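
A minimal workaround under this PyMC3 version, sketched below, is to run the chains sequentially in the parent process and combine them with the merge_traces helper visible in the traceback, so that nothing needs to be pickled:

In [ ]:
from pymc3.backends.base import merge_traces

# Run two chains one after another in the same process, then merge them
# into a single MultiTrace for the summaries and plots above.
with full_zib_model:
    full_zib_trace = merge_traces([sample(2000, chain=i, random_seed=rseeds[i])
                                   for i in range(2)])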