In [1]:

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import re

In [6]:

data_file = 'data/Tonsillectomy_KQ2_data_For_MA_07202016.xlsx'

throat_data = (pd.read_excel(data_file, 
              sheetname='Throat infections_KQ2', 
              na_values=['null', 'ND'])
            .drop(['Comments', 'Comments 2'], 
                  axis=1))
throat_data.head(2)

Out[6]:

	Citation	REFID	Number_of_Arms	Rx_Grouping	Group_Desc	Intervention_category	OUTC_Main_ CATG	Outc_SUB_ CATG	Outcome_specify	Outc_Unit	...	Outcome_count	Outcome_%	Outcome_Mean	Outcome_SD	Outcome_ Median	Outcome_Min	Outcome_Max	Outcome_95%L	Outcome_95%H	Results
0	L. J. Orvidas, J. L. St Sauver and A. L. Weave...	2746	2	G1:	Tonsillectomy	Total tonsillectomy - unspecified	Throat infection	Throat infection-# strep infections	Group A Beta-strep throat infections/year	number of episodes/year	...	NaN	13.2	NaN	NaN	NaN	NaN	NaN	7.5	18.6	number still at risk=124
1	L. J. Orvidas, J. L. St Sauver and A. L. Weave...	2746	2	G2:	No surgery	No surgery	Throat infection	Throat infection-# strep infections	Group A Beta-strep throat infections/year	number of episodes/year	...	NaN	39.3	NaN	NaN	NaN	NaN	NaN	30.8	46.8	number still at risk=87

2 rows × 30 columns

In [7]:

school_data = (pd.read_excel(data_file, 
              sheetname='Missed_School_KQ2', 
              na_values=['null', 'ND']))
school_data.head(2)

Out[7]:

	Citation	REFID	Number_of_Arms	Rx_Grouping	Group_Desc	Intervention_category	OUTC_Main_ CATG	Outc_SUB_ CATG	Outcome_specify	Outc_Unit	...	BL Min	BL Max	BL 95% L	BL 95% H	Outcome_timepoint	OUTCOME_SAMPLE_SIZE	Outcome_count	Outcome_ %	Outcome_ Mean	Outcome_SD
0	C. Lock, J. Wilson, N. Steen, M. Eccles, H. Ma...	1861_1377_1940 nRCT	2	G1:	dissection or bipolar diathermy tonsillectomy	Total tonsillectomy - dissection/bipolar tonsi...	Missed school	Missed school	number of school days missed	days	...	NaN	NaN	NaN	NaN	2 yrs post-op	187	NaN	NaN	11.2	9.0
1	C. Lock, J. Wilson, N. Steen, M. Eccles, H. Ma...	1861_1377_1940 nRCT	2	G2:	medical treatment	conventional medical treatment	Missed school	Missed school	number of school days missed	days	...	NaN	NaN	NaN	NaN	2 yrs post-op	52	NaN	NaN	6.6	6.4

2 rows × 30 columns

Throat infection model¶

In [ ]:

excludes = '2746'

throat_data

In [15]:

throat_data.Group_Desc.str.lower().replace({'no surgery':'control', 
                                            'watchful waiting':'control'}).value_counts()

Out[15]:

control                                           33
adenotonsillectomy                                28
tonsillectomy                                     17
dissection or bipolar diathermy tonsillectomy      8
medical treatment                                  8
Name: Group_Desc, dtype: int64

In [11]:

throat_data.Group_Desc.value_counts()

Out[11]:

Adenotonsillectomy                                28
No Surgery                                        24
Tonsillectomy                                     17
medical treatment                                  8
dissection or bipolar diathermy tonsillectomy      8
No surgery                                         5
Watchful Waiting                                   4
Name: Group_Desc, dtype: int64

In [ ]:

School days model¶

In [10]:

school_data.Group_Desc.value_counts()

Out[10]:

Adenotonsillectomy                                6
No Surgery                                        6
Tonsillectomy                                     3
medical treatment                                 1
dissection or bipolar diathermy tonsillectomy     1
Name: Group_Desc, dtype: int64

Treatment group includes all surgical interventions, otherwise control

In [16]:

school_data['treatment'] = school_data.Group_Desc.str.contains('tonsil').astype(int)

Only three studies

In [18]:

school_data.REFID.unique()

Out[18]:

array(['1861_1377_1940 nRCT', '3641_a', '3641_b'], dtype=object)

In [20]:

school_data.columns

Out[20]:

Index(['Citation', 'REFID', 'Number_of_Arms', 'Rx_Grouping', 'Group_Desc',
       'Intervention_category', 'OUTC_Main_\nCATG', 'Outc_SUB_\nCATG',
       'Outcome_specify', 'Outc_Unit', 'Outc_Tool', 'BASELINE SAMPLE SIZE',
       'BL_Count', 'BL %', 'BL Mean', 'BL SD', 'BL SE', 'BL_Median', 'BL_Q1',
       'BL_Q3', 'BL Min', 'BL Max', 'BL 95% L', 'BL 95% H',
       'Outcome_timepoint', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count',
       'Outcome_ %', 'Outcome_ Mean', 'Outcome_SD', 'treatment'],
      dtype='object')

In [23]:

school_analysis_subset = school_data[['REFID', 'treatment', 
                            'OUTCOME_SAMPLE_SIZE', 'Outcome_ Mean', 'Outcome_SD']]
school_analysis_subset.isnull().sum()

Out[23]:

REFID                  0
treatment              0
OUTCOME_SAMPLE_SIZE    0
Outcome_ Mean          0
Outcome_SD             0
dtype: int64

In [26]:

from pymc3 import Model, Normal, sample, forestplot, traceplot, summary, sample_ppc

with Model() as school_days_model:
    
    x, y, s = school_analysis_subset[['treatment', 'Outcome_ Mean', 'Outcome_SD']].T.values
    
    μ = Normal('μ', 0, sd=100)
    δ = Normal('δ', 0, sd=100)
    
    θ = μ + δ*x
    
    data_likelihood = Normal('data_likelihood', θ, sd=s, observed=y)

In [27]:

with school_days_model:
    
    school_trace = sample(2000, njobs=2)

Assigned NUTS to μ
Assigned NUTS to δ
 [-----------------100%-----------------] 2000 of 2000 complete in 1.9 sec

In [28]:

traceplot(school_trace[1000:], varnames=['μ', 'δ'])

Out[28]:

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11a35d5f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1197ce240>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1196f9828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a365b70>]], dtype=object)

Treatment effect on school days missed

In [29]:

summary(school_trace[1000:], varnames=['δ'])

δ:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  -0.655           1.999            0.080            [-4.425, 3.274]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -4.575         -2.007         -0.621         0.677          3.167

Goodness of fit¶

In [33]:

school_ppc = sample_ppc(school_trace, model=school_days_model, samples=500)

In [37]:

bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)]

sim_days = school_ppc['data_likelihood']
sim_days.shape

Out[37]:

(500, 17)

Bayesian p-values for posterior predictive checks

In [44]:

(sim_days < y).mean(0)

Out[44]:

array([ 0.81 ,  0.616,  0.548,  0.446,  0.606,  0.532,  0.656,  0.378,
        0.42 ,  0.548,  0.518,  0.58 ,  0.442,  0.334,  0.462,  0.404,
        0.52 ])