%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import re
data_file = 'data/Tonsillectomy_KQ2_data_For_MA_07202016.xlsx'
throat_data = (pd.read_excel(data_file,
sheetname='Throat infections_KQ2',
na_values=['null', 'ND'])
.drop(['Comments', 'Comments 2'],
axis=1))
throat_data.head(2)
Citation | REFID | Number_of_Arms | Rx_Grouping | Group_Desc | Intervention_category | OUTC_Main_ CATG | Outc_SUB_ CATG | Outcome_specify | Outc_Unit | ... | Outcome_count | Outcome_% | Outcome_Mean | Outcome_SD | Outcome_ Median | Outcome_Min | Outcome_Max | Outcome_95%L | Outcome_95%H | Results | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | L. J. Orvidas, J. L. St Sauver and A. L. Weave... | 2746 | 2 | G1: | Tonsillectomy | Total tonsillectomy - unspecified | Throat infection | Throat infection-# strep infections | Group A Beta-strep throat infections/year | number of episodes/year | ... | NaN | 13.2 | NaN | NaN | NaN | NaN | NaN | 7.5 | 18.6 | number still at risk=124 |
1 | L. J. Orvidas, J. L. St Sauver and A. L. Weave... | 2746 | 2 | G2: | No surgery | No surgery | Throat infection | Throat infection-# strep infections | Group A Beta-strep throat infections/year | number of episodes/year | ... | NaN | 39.3 | NaN | NaN | NaN | NaN | NaN | 30.8 | 46.8 | number still at risk=87 |
2 rows × 30 columns
school_data = (pd.read_excel(data_file,
sheetname='Missed_School_KQ2',
na_values=['null', 'ND']))
school_data.head(2)
Citation | REFID | Number_of_Arms | Rx_Grouping | Group_Desc | Intervention_category | OUTC_Main_ CATG | Outc_SUB_ CATG | Outcome_specify | Outc_Unit | ... | BL Min | BL Max | BL 95% L | BL 95% H | Outcome_timepoint | OUTCOME_SAMPLE_SIZE | Outcome_count | Outcome_ % | Outcome_ Mean | Outcome_SD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C. Lock, J. Wilson, N. Steen, M. Eccles, H. Ma... | 1861_1377_1940 nRCT | 2 | G1: | dissection or bipolar diathermy tonsillectomy | Total tonsillectomy - dissection/bipolar tonsi... | Missed school | Missed school | number of school days missed | days | ... | NaN | NaN | NaN | NaN | 2 yrs post-op | 187 | NaN | NaN | 11.2 | 9.0 |
1 | C. Lock, J. Wilson, N. Steen, M. Eccles, H. Ma... | 1861_1377_1940 nRCT | 2 | G2: | medical treatment | conventional medical treatment | Missed school | Missed school | number of school days missed | days | ... | NaN | NaN | NaN | NaN | 2 yrs post-op | 52 | NaN | NaN | 6.6 | 6.4 |
2 rows × 30 columns
excludes = '2746'
throat_data
throat_data.Group_Desc.str.lower().replace({'no surgery':'control',
'watchful waiting':'control'}).value_counts()
control 33 adenotonsillectomy 28 tonsillectomy 17 dissection or bipolar diathermy tonsillectomy 8 medical treatment 8 Name: Group_Desc, dtype: int64
throat_data.Group_Desc.value_counts()
Adenotonsillectomy 28 No Surgery 24 Tonsillectomy 17 medical treatment 8 dissection or bipolar diathermy tonsillectomy 8 No surgery 5 Watchful Waiting 4 Name: Group_Desc, dtype: int64
school_data.Group_Desc.value_counts()
Adenotonsillectomy 6 No Surgery 6 Tonsillectomy 3 medical treatment 1 dissection or bipolar diathermy tonsillectomy 1 Name: Group_Desc, dtype: int64
Treatment group includes all surgical interventions, otherwise control
school_data['treatment'] = school_data.Group_Desc.str.contains('tonsil').astype(int)
Only three studies
school_data.REFID.unique()
array(['1861_1377_1940 nRCT', '3641_a', '3641_b'], dtype=object)
school_data.columns
Index(['Citation', 'REFID', 'Number_of_Arms', 'Rx_Grouping', 'Group_Desc', 'Intervention_category', 'OUTC_Main_\nCATG', 'Outc_SUB_\nCATG', 'Outcome_specify', 'Outc_Unit', 'Outc_Tool', 'BASELINE SAMPLE SIZE', 'BL_Count', 'BL %', 'BL Mean', 'BL SD', 'BL SE', 'BL_Median', 'BL_Q1', 'BL_Q3', 'BL Min', 'BL Max', 'BL 95% L', 'BL 95% H', 'Outcome_timepoint', 'OUTCOME_SAMPLE_SIZE', 'Outcome_count', 'Outcome_ %', 'Outcome_ Mean', 'Outcome_SD', 'treatment'], dtype='object')
school_analysis_subset = school_data[['REFID', 'treatment',
'OUTCOME_SAMPLE_SIZE', 'Outcome_ Mean', 'Outcome_SD']]
school_analysis_subset.isnull().sum()
REFID 0 treatment 0 OUTCOME_SAMPLE_SIZE 0 Outcome_ Mean 0 Outcome_SD 0 dtype: int64
from pymc3 import Model, Normal, sample, forestplot, traceplot, summary, sample_ppc
with Model() as school_days_model:
x, y, s = school_analysis_subset[['treatment', 'Outcome_ Mean', 'Outcome_SD']].T.values
μ = Normal('μ', 0, sd=100)
δ = Normal('δ', 0, sd=100)
θ = μ + δ*x
data_likelihood = Normal('data_likelihood', θ, sd=s, observed=y)
with school_days_model:
school_trace = sample(2000, njobs=2)
Assigned NUTS to μ Assigned NUTS to δ [-----------------100%-----------------] 2000 of 2000 complete in 1.9 sec
traceplot(school_trace[1000:], varnames=['μ', 'δ'])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11a35d5f8>, <matplotlib.axes._subplots.AxesSubplot object at 0x1197ce240>], [<matplotlib.axes._subplots.AxesSubplot object at 0x1196f9828>, <matplotlib.axes._subplots.AxesSubplot object at 0x11a365b70>]], dtype=object)
Treatment effect on school days missed
summary(school_trace[1000:], varnames=['δ'])
δ: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- -0.655 1.999 0.080 [-4.425, 3.274] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| -4.575 -2.007 -0.621 0.677 3.167
school_ppc = sample_ppc(school_trace, model=school_days_model, samples=500)
bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)]
sim_days = school_ppc['data_likelihood']
sim_days.shape
(500, 17)
Bayesian p-values for posterior predictive checks
(sim_days < y).mean(0)
array([ 0.81 , 0.616, 0.548, 0.446, 0.606, 0.532, 0.656, 0.378, 0.42 , 0.548, 0.518, 0.58 , 0.442, 0.334, 0.462, 0.404, 0.52 ])