In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import re
In [6]:
data_file = 'data/Tonsillectomy_KQ2_data_For_MA_07202016.xlsx'

throat_data = (pd.read_excel(data_file, 
              sheetname='Throat infections_KQ2', 
              na_values=['null', 'ND'])
            .drop(['Comments', 'Comments 2'], 
                  axis=1))
throat_data.head(2)
Out[6]:
Citation REFID Number_of_Arms Rx_Grouping Group_Desc Intervention_category OUTC_Main_ CATG Outc_SUB_ CATG Outcome_specify Outc_Unit ... Outcome_count Outcome_% Outcome_Mean Outcome_SD Outcome_ Median Outcome_Min Outcome_Max Outcome_95%L Outcome_95%H Results
0 L. J. Orvidas, J. L. St Sauver and A. L. Weave... 2746 2 G1: Tonsillectomy Total tonsillectomy - unspecified Throat infection Throat infection-# strep infections Group A Beta-strep throat infections/year number of episodes/year ... NaN 13.2 NaN NaN NaN NaN NaN 7.5 18.6 number still at risk=124
1 L. J. Orvidas, J. L. St Sauver and A. L. Weave... 2746 2 G2: No surgery No surgery Throat infection Throat infection-# strep infections Group A Beta-strep throat infections/year number of episodes/year ... NaN 39.3 NaN NaN NaN NaN NaN 30.8 46.8 number still at risk=87

2 rows × 30 columns

In [7]:
school_data = (pd.read_excel(data_file, 
              sheetname='Missed_School_KQ2', 
              na_values=['null', 'ND']))
school_data.head(2)
Out[7]:
Citation REFID Number_of_Arms Rx_Grouping Group_Desc Intervention_category OUTC_Main_ CATG Outc_SUB_ CATG Outcome_specify Outc_Unit ... BL Min BL Max BL 95% L BL 95% H Outcome_timepoint OUTCOME_SAMPLE_SIZE Outcome_count Outcome_ % Outcome_ Mean Outcome_SD
0 C. Lock, J. Wilson, N. Steen, M. Eccles, H. Ma... 1861_1377_1940 nRCT 2 G1: dissection or bipolar diathermy tonsillectomy Total tonsillectomy - dissection/bipolar tonsi... Missed school Missed school number of school days missed days ... NaN NaN NaN NaN 2 yrs post-op 187 NaN NaN 11.2 9.0
1 C. Lock, J. Wilson, N. Steen, M. Eccles, H. Ma... 1861_1377_1940 nRCT 2 G2: medical treatment conventional medical treatment Missed school Missed school number of school days missed days ... NaN NaN NaN NaN 2 yrs post-op 52 NaN NaN 6.6 6.4

2 rows × 30 columns

Throat infection model

In [ ]:
excludes = '2746'

throat_data
In [15]:
throat_data.Group_Desc.str.lower().replace({'no surgery':'control', 
                                            'watchful waiting':'control'}).value_counts()
Out[15]:
control                                           33
adenotonsillectomy                                28
tonsillectomy                                     17
dissection or bipolar diathermy tonsillectomy      8
medical treatment                                  8
Name: Group_Desc, dtype: int64
In [11]:
throat_data.Group_Desc.value_counts()
Out[11]:
Adenotonsillectomy                                28
No Surgery                                        24
Tonsillectomy                                     17
medical treatment                                  8
dissection or bipolar diathermy tonsillectomy      8
No surgery                                         5
Watchful Waiting                                   4
Name: Group_Desc, dtype: int64

School days model

In [10]:
school_data.Group_Desc.value_counts()
Out[10]:
Adenotonsillectomy                                6
No Surgery                                        6
Tonsillectomy                                     3
medical treatment                                 1
dissection or bipolar diathermy tonsillectomy     1
Name: Group_Desc, dtype: int64

Treatment group includes all surgical interventions, otherwise control

In [16]:
school_data['treatment'] = school_data.Group_Desc.str.contains('tonsil').astype(int)

Only three studies

In [18]:
school_data.REFID.unique()
Out[18]:
array(['1861_1377_1940 nRCT', '3641_a', '3641_b'], dtype=object)
In [20]:
school_data.columns
Out[20]:
Index(['Citation', 'REFID', 'Number_of_Arms', 'Rx_Grouping', 'Group_Desc',
       'Intervention_category', 'OUTC_Main_\nCATG', 'Outc_SUB_\nCATG',
       'Outcome_specify', 'Outc_Unit', 'Outc_Tool', 'BASELINE SAMPLE SIZE',
       'BL_Count', 'BL %', 'BL Mean', 'BL SD', 'BL SE', 'BL_Median', 'BL_Q1',
       'BL_Q3', 'BL Min', 'BL Max', 'BL 95% L', 'BL 95% H',
       'Outcome_timepoint', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count',
       'Outcome_ %', 'Outcome_ Mean', 'Outcome_SD', 'treatment'],
      dtype='object')
In [23]:
school_analysis_subset = school_data[['REFID', 'treatment', 
                            'OUTCOME_SAMPLE_SIZE', 'Outcome_ Mean', 'Outcome_SD']]
school_analysis_subset.isnull().sum()
Out[23]:
REFID                  0
treatment              0
OUTCOME_SAMPLE_SIZE    0
Outcome_ Mean          0
Outcome_SD             0
dtype: int64
In [26]:
from pymc3 import Model, Normal, sample, forestplot, traceplot, summary, sample_ppc

with Model() as school_days_model:
    
    x, y, s = school_analysis_subset[['treatment', 'Outcome_ Mean', 'Outcome_SD']].T.values
    
    μ = Normal('μ', 0, sd=100)
    δ = Normal('δ', 0, sd=100)
    
    θ = μ + δ*x
    
    data_likelihood = Normal('data_likelihood', θ, sd=s, observed=y)
In [27]:
with school_days_model:
    
    school_trace = sample(2000, njobs=2)
Assigned NUTS to μ
Assigned NUTS to δ
 [-----------------100%-----------------] 2000 of 2000 complete in 1.9 sec
In [28]:
traceplot(school_trace[1000:], varnames=['μ', 'δ'])
Out[28]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11a35d5f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1197ce240>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1196f9828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a365b70>]], dtype=object)

Treatment effect on school days missed

In [29]:
summary(school_trace[1000:], varnames=['δ'])
δ:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  -0.655           1.999            0.080            [-4.425, 3.274]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  -4.575         -2.007         -0.621         0.677          3.167

Goodness of fit

In [33]:
school_ppc = sample_ppc(school_trace, model=school_days_model, samples=500)
In [37]:
bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)]

sim_days = school_ppc['data_likelihood']
sim_days.shape
Out[37]:
(500, 17)

Bayesian p-values for posterior predictive checks

In [44]:
(sim_days < y).mean(0)
Out[44]:
array([ 0.81 ,  0.616,  0.548,  0.446,  0.606,  0.532,  0.656,  0.378,
        0.42 ,  0.548,  0.518,  0.58 ,  0.442,  0.334,  0.462,  0.404,
        0.52 ])