#!/usr/bin/env python # coding: utf-8 # In[20]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns sns.set() rseed = 20090425 # ## Import and clean data # In[2]: data_file = 'data/Bleeding_Dataonly_For Chris_04152016.xlsx' raw_data = (pd.read_excel(data_file, sheetname='Bleeding_Outcomes_AllKQs', na_values=['null', 'ND']) .drop(['Comments', 'Other stats \nName','Other Stats','Results','Comments 2', 'Presentation \nlocation','Last Assesment tmpt for the study','Followup duration category'], axis=1) .rename(columns={'Outcome timepoint (when was this outcome measured, e.g., in PACU, 12 months post─op, immediately post─op──would need a row for each outcome at each timepoint of interest)': 'outcome_time', 'OUTC_Main_\nCATG':'outcome_cat', "Outc_SUB_\nCATG":'outcome_subcat', 'OUTCOME SAMPLE SIZE': 'N', 'Outcome\ncount': 'outcome_obs', 'Outcome \n%': 'outcome_pct', "Outcome\n Mean": 'outcome_mean', "Outcome \nSD": 'outcome_sd', 'Outcome \n SE': 'outcome_se', 'Outcome\n _Q1': 'outcome_q1', 'Outcome \n_Q3': 'outcome_q3', "Outcome \n Median": 'outcome_med', "Outcome \n 95% L": 'outcome_lo_95', "Outcome \n 95% H": 'outcome_hi_95', "Outcome\n Min": 'outcome_min', 'Outcome \n Max': 'outcome_max'})) raw_data.head() # In[3]: raw_data['Intervention_category'] = raw_data.Intervention_category.str.lower().str.strip() clean_data = raw_data.replace({'Intervention_category':{'saline':'control', 'control (no rx)':'control', 'control (no dexamethasone)':'control', 'no antibiotics':'control', 'no steroid':'control', 'placebo':'control'}}) # Counts of bleeding outcomes by major category # In[4]: clean_data.New_Bleeding_specify.value_counts() # Here is the full list of intervention categories. For the purposes of this analysis, I will just use the ones with "total" or "partial" in the name, and take them to be surgical interventions. # In[5]: clean_data.Intervention_category.value_counts() # Here is the surgery filter: # In[6]: surgery_data = clean_data[clean_data.Intervention_category.str.contains('total') | clean_data.Intervention_category.str.contains('partial')].dropna(subset=['outcome_obs']).copy() # List of interventions after filtering. Note that there were a lot of papers that did not report bleeding outcomes, so the numbers dropped significantly. # In[7]: surgery_data.Intervention_category.value_counts() # Indicator for partial # In[8]: surgery_data['partial'] = surgery_data.Intervention_category.str.contains('partial') # Generate column for technique # In[9]: surgery_data['technique'] = (surgery_data.Intervention_category .apply(lambda x: ' '.join(x.split(' ')[1:])) .replace({'tonsillectomy - coblation':'coblation'})) surgery_data = surgery_data[~(surgery_data.technique.str.contains('specified')| surgery_data.technique.str.contains('\+'))] # In[10]: surgery_data.technique.value_counts() # I will drop microdebrider, argon plasma and plasma knife due to lack of information. # In[11]: technique_includes = surgery_data.technique.value_counts().index[:-3].tolist() surgery_data = surgery_data[surgery_data.technique.isin(technique_includes)] # In[12]: technique_includes # Subsets of data according to bleeding outcome # In[13]: readmission_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Revisit/Readmission-bleeding'] reoperation_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Reoperation-bleeding'] primary_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Primary bleeding'] secondary_bleeding_data = surgery_data[surgery_data.New_Bleeding_specify=='Secondary bleeding'] # ## Model Specification # In[14]: from pymc3 import (NUTS, sample, Model, Deterministic, find_MAP, Binomial, Normal, HalfCauchy, advi, traceplot, summary, forestplot) import theano.tensor as tt def tinvlogit(x): return tt.exp(x) / (1 + tt.exp(x)) # In[17]: def specify_model(model, data): N, REFID, events, partial = data[['N', 'REFID', 'outcome_obs', 'partial']].values.T refid_list = list(set(REFID)) study_id = data.REFID.apply(lambda i: refid_list.index(i)).values n_studies = len(refid_list) technique = data.technique.apply(lambda i: technique_includes.index(i)).values n_tech = len(technique_includes) assert not np.isnan(N.astype(int)).any() assert not np.isnan(events.astype(int)).any() with model: # Mean for surgery interventions θ = Normal('θ', 0, sd=5, shape=n_tech, testval=np.ones(n_tech)*-1) # Effect of partial intervention β = Normal('β', 0, sd=5, testval=0) # Study random effect σ = HalfCauchy('σ', 5, testval=1) ϵ = Normal('ϵ', 0, sd=σ, shape=n_studies, testval=np.zeros(n_studies)) # Transform to probability scale π = tinvlogit(θ[technique] + β*partial.astype(int) + ϵ[study_id]) # Mean probabilities π_total = Deterministic('π_total', tinvlogit(θ)) π_partial = Deterministic('π_partial', tinvlogit(θ + β)) # Data likelihood obs = Binomial('obs', N.astype(int), π, observed=events.astype(int)) return model # In[18]: reoperation_model = specify_model(Model(), reoperation_bleeding_data) # Model run using Hamiltonian Monte Carlo # In[19]: with reoperation_model: reoperation_trace = sample(2000, random_seed=rseed) # In[32]: readmission_model = specify_model(Model(), readmission_bleeding_data) # In[33]: with readmission_model: readmission_trace = sample(2000, random_seed=rseed) # In[34]: primary_model = specify_model(Model(), primary_bleeding_data) # In[35]: with primary_model: primary_trace = sample(2000, random_seed=rseed) # In[36]: secondary_model = specify_model(Model(), secondary_bleeding_data) # In[37]: with secondary_model: secondary_trace = sample(2000, random_seed=rseed) # ## Reoperation Bleeding Results # Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal. # In[37]: forestplot(reoperation_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, main='Partial removal', xtitle='Probability of re-operation bleeding'); # In[38]: forestplot(reoperation_trace[1000:], varnames=['π_total'], ylabels=technique_includes, main='Total removal', xtitle='Probability of re-operation bleeding'); # In[39]: summary(reoperation_trace[1000:], varnames=['π_total', 'π_partial']) # ## Readmission Bleeding Results # # Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal. # In[40]: forestplot(readmission_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, main='Partial removal', xtitle='Probability of re-admission bleeding'); # In[41]: forestplot(readmission_trace[1000:], varnames=['π_total'], ylabels=technique_includes, main='Total removal', xtitle='Probability of re-admission bleeding'); # In[42]: summary(readmission_trace[1000:], varnames=['π_total', 'π_partial']) # ## Primary Bleeding Results # # Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal. # In[43]: forestplot(primary_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, main='Partial removal', xtitle='Probability of primary bleeding'); # In[44]: forestplot(primary_trace[1000:], varnames=['π_total'], ylabels=technique_includes, main='Total removal', xtitle='Probability of primary bleeding'); # In[45]: summary(primary_trace[1000:], varnames=['π_total', 'π_partial']) # ## Secondary Bleeding Results # # Posterior traces and histograms of surgical technique bleeding estimates for partial (top plot) and total (bottom plot) removal. # In[46]: forestplot(secondary_trace[1000:], varnames=['π_partial'], ylabels=technique_includes, main='Partial removal', xtitle='Probability of secondary bleeding'); # In[47]: forestplot(secondary_trace[1000:], varnames=['π_total'], ylabels=technique_includes, main='Total removal', xtitle='Probability of secondary bleeding'); # In[48]: summary(secondary_trace[1000:], varnames=['π_total', 'π_partial']) # ## Goodness of fit # # Plots of Bayesian p-values. An excess of very large or very small values may suggest lack of fit. # In[53]: from pymc3 import sample_ppc ppc = [sample_ppc(trace, model=model, samples=500) for trace,model in ([reoperation_trace, reoperation_model], [readmission_trace, readmission_model], [primary_trace, primary_model], [secondary_trace, secondary_model])] # In[94]: bayes_p = lambda sim, true: [(s > t).mean() for s,t in zip(sim, true)] # In[95]: obs_sim = [p['obs'] for p in ppc] p_values = [bayes_p(simdata, model.obs.tag.test_value) for simdata,model in zip(obs_sim, [reoperation_model, readmission_model, primary_model, secondary_model])] # In[103]: dfp = pd.DataFrame(p_values).T dfp.columns = 'Reoperation', 'Readmission', 'Primary', 'Secondary' dfp.hist(sharex=True, bins=15);