#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns sns.set() import re # In[6]: data_file = 'data/Tonsillectomy_KQ2_data_For_MA_07202016.xlsx' throat_data = (pd.read_excel(data_file, sheetname='Throat infections_KQ2', na_values=['null', 'ND']) .drop(['Comments', 'Comments 2'], axis=1)) throat_data.head(2) # In[7]: school_data = (pd.read_excel(data_file, sheetname='Missed_School_KQ2', na_values=['null', 'ND'])) school_data.head(2) # ## Throat infection model # In[ ]: excludes = '2746' throat_data # In[15]: throat_data.Group_Desc.str.lower().replace({'no surgery':'control', 'watchful waiting':'control'}).value_counts() # In[11]: throat_data.Group_Desc.value_counts() # In[ ]: # ## School days model # In[10]: school_data.Group_Desc.value_counts() # Treatment group includes all surgical interventions, otherwise control # In[16]: school_data['treatment'] = school_data.Group_Desc.str.contains('tonsil').astype(int) # Only three studies # In[18]: school_data.REFID.unique() # In[20]: school_data.columns # In[23]: school_analysis_subset = school_data[['REFID', 'treatment', 'OUTCOME_SAMPLE_SIZE', 'Outcome_ Mean', 'Outcome_SD']] school_analysis_subset.isnull().sum() # In[26]: from pymc3 import Model, Normal, sample, forestplot, traceplot, summary, sample_ppc with Model() as school_days_model: x, y, s = school_analysis_subset[['treatment', 'Outcome_ Mean', 'Outcome_SD']].T.values μ = Normal('μ', 0, sd=100) δ = Normal('δ', 0, sd=100) θ = μ + δ*x data_likelihood = Normal('data_likelihood', θ, sd=s, observed=y) # In[27]: with school_days_model: school_trace = sample(2000, njobs=2) # In[28]: traceplot(school_trace[1000:], varnames=['μ', 'δ']) # Treatment effect on school days missed # In[29]: summary(school_trace[1000:], varnames=['δ']) # ### Goodness of fit # In[33]: school_ppc = sample_ppc(school_trace, model=school_days_model, samples=500) # In[37]: bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)] sim_days = school_ppc['data_likelihood'] sim_days.shape # Bayesian p-values for posterior predictive checks # In[44]: (sim_days < y).mean(0)