#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import matplotlib.pyplot as plt import pymc as pm import pdb pm.__version__ # In[2]: flu = pd.read_csv('data/flu_organism.csv', index_col=0) # In[3]: flu.shape # In[4]: flu.PatientID.unique().shape # ### Turn into table of unique patients, with appropriate organism columns # # Create indicator for flu-only. # In[5]: flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s: len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(bool) # In[6]: flu_only.head() # In[7]: flu_only.name = 'flu_only' # In[8]: flu = flu.merge(pd.DataFrame(flu_only), left_on='PatientID', right_index=True) # In[9]: # flu_staph_organism = flu.groupby('PatientID')['OrganismName'].apply(lambda s: # len([x for x in s if str(x).startswith('Influenza') or # str(x).startswith('Staphylococcus aureus')])>1).astype(bool) # In[10]: # flu_staph_organism.name = 'flu_staph_org' # In[11]: # flu = flu.merge(pd.DataFrame(flu_staph_organism), left_on='PatientID', right_index=True) # See if there are any flu-only or flu-other individuals with an ICD9 diagnosis of S. Aureus # In[12]: flu_other = flu.groupby('PatientID')['OrganismName'].apply(lambda s: len([x for x in s if str(x).startswith('Influenza') or str(x).startswith('Other')])>1).astype(bool) # In[13]: flu_other.name = 'flu_other' # In[14]: flu = flu.merge(pd.DataFrame(flu_other), left_on='PatientID', right_index=True) # There do not appear to be cases of flu-only or flu-other via organism that have a S.Aureus diagnosis via ICD9 # In[15]: (flu_other & flu.s_aureus_icd9).sum() # In[16]: (flu_only & flu.s_aureus_icd9).sum() # Same for pneumo # In[17]: (flu_other & flu.pneumo_icd9).sum() # In[18]: flu.groupby('PatientID')['OrganismName'].value_counts() # Create indictors for coinfection type # In[19]: for org in flu.Type.unique(): flu[org] = (flu.Type==org).astype(int) # Create data frame of unique patients # In[20]: flu_unique = flu.drop_duplicates(subset=['PatientID']).set_index('PatientID') # In[21]: flu_unique.s_aureus_icd9.sum() # In[22]: flu_unique['flu_only'] = flu_only # In[23]: flu_unique.flu_only.mean() # Several missing values for admission to time on ECMO # In[24]: flu_unique.AdmitToTimeOnHours.isnull().mean() # Create variables for use in analysis # In[25]: # ECMO Type flu_unique['VA'] = flu_unique.Mode.isin(['VA', 'VV-VA']) # Set "Other" type to NA (there are only a couple) flu_unique.loc[flu_unique.Mode=='Other', 'VA'] = None # Create oxygen index # In[26]: flu_unique['OI'] = flu_unique.FiO2 * flu_unique.MAP / flu_unique.PO2 # In[27]: flu_unique.OI.hist(bins=20) # In[28]: covariates = ['AgeYears', 'pH', 'PO2', 'VA'] # Get counts of missing values # In[29]: flu_unique[covariates].isnull().mean() # In[30]: flu_unique[covariates].hist(bins=25); # In[31]: flu_unique[flu_unique.AgeYears< 1].AgeDays.hist(bins=15) # In[32]: flu_unique.s_aureus_icd9.sum() # In[33]: (flu_unique.flu_staph_org | flu_unique.s_aureus_icd9).sum() # In[34]: foo = '01AC9644-7CDF-4E28-A5F4-FE5B7F9FA22A' flu_unique.ix[foo][['flu_staph_org', 's_aureus_icd9']] # In[35]: flu_unique['flu_staph'] = (flu_unique.flu_staph_org | flu_unique.s_aureus_icd9).astype(int) # In[36]: flu_unique.to_csv('Data/flu_unique.csv') # ## Logistic regression model # Fate of each patient # In[37]: died = (flu_unique.fate=='Died').astype(int).values # In[38]: N_obs = len(died) # Functions for standarizing and centering # In[39]: center = lambda x: (x - x[np.isnan(x) ^ True].mean()) standardize = lambda x: center(x) / x[np.isnan(x) ^ True].std() # In[40]: covariates # In[41]: AgeYears, pH, PO2, va = flu_unique[covariates].values.T AgeYears_center = center(AgeYears).round(1) # Center pH at 7.4 pH_center = pH - 7.4 # OI_std = standardize(OI) # PO2_std = standardize(PO2) # In[42]: no_coinfection = flu_unique.flu_only.values with_staph = flu_unique.flu_staph.values # In[43]: flu_unique.flu_staph.sum() # ## Model # In[44]: from pymc import MCMC, Normal, Poisson, Uniform, Lambda, Laplace, Exponential, Beta, HalfCauchy, Uninformative from pymc import Bernoulli, Lognormal, observed, Matplot, deterministic, Gamma, stochastic from pymc import AdaptiveMetropolis, normal_like, rnormal, potential, invlogit from pymc.gp import * # In[45]: # Values for missing elements, to be replaced in model # OI_std[np.isnan(OI_std)] = 0 pH_center[np.isnan(pH_center)] = 7.111 va[np.isnan(va)] = 0.5 PO2[np.isnan(PO2)] = 20.111 # In[46]: def spline(name, x, knots, smoothing, interpolation_method='linear', pred_points=10): """ Generate PyMC objects for a spline model of age-specific rate Parameters ---------- name : str knots : array x : array, points to interpolate to smoothing : pymc.Node, smoothness parameter for smoothing spline interpolation_method : str, optional, one of 'linear', 'nearest', 'zero', 'slinear', 'quadratic, 'cubic' Results ------- Returns dict of PyMC objects, including 'gamma' (log of rate at knots) and 'mu_age' (age-specific rate interpolated at all age points) """ assert np.all(np.diff(knots) > 0), 'Spline knots must be strictly increasing' # TODO: consider changing this prior distribution to be something more familiar in linear space gamma = [Normal('gamma_%s_%d'%(name,k), 0., 10.**-2, value=0.) for k in knots] #gamma = [mc.Uniform('gamma_%s_%d'%(name,k), -20., 20., value=-10.) for k in knots] import scipy.interpolate @deterministic(name='mu_x_%s'%name) def mu_x(gamma=gamma, knots=knots, x=x): mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method, bounds_error=False, fill_value=0.) return mu(x) @deterministic(name='pred_%s'%name) def pred(gamma=gamma, knots=knots): mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method, bounds_error=False, fill_value=0.) return mu(np.linspace(x.min(), x.max(), pred_points)) vars = dict(gamma=gamma, mu_x=mu_x, pred=pred, x=x, knots=knots) if (smoothing > 0) and (smoothing < 1e10): #print 'adding smoothing of', smoothing @potential(name='smooth_mu_%s'%name) def smooth_gamma(gamma=gamma, knots=knots, tau=smoothing**-2): # the following is to include a "noise floor" so that level value # zero prior does not exert undue influence on age pattern # smoothing # TODO: consider changing this to an offset log normal gamma = np.clip(gamma, np.log(np.exp(gamma).mean()/10.), np.inf) # only include smoothing on values within 10x of mean return normal_like(np.sqrt(np.sum(np.diff(gamma)**2 / np.diff(knots))), 0, tau) vars['smooth_gamma'] = smooth_gamma return vars # In[47]: np.mean(PO2) # In[48]: plt.hist(PO2-80, bins=20) # In[59]: def survival_model(age_mask, n_knots=5): # Imputation of missing values p_va = Beta('p_VA', 1, 1, value=0.5) va_masked = np.ma.masked_values(va[age_mask], value=0.5) x_va = Bernoulli('x_va', p_va, value=va_masked, observed=True) mu_pH = Normal('mu_pH', 0, 0.0001, value=7) sigma_pH = Uniform('sigma_pH', 0, 500, value=10) tau_pH = sigma_pH**-2 pH_masked = np.ma.masked_values(pH_center[age_mask], value=7.111) x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True) alpha_PO2 = Exponential('alpha_PO2', 100, value=0.5) beta_PO2 = Exponential('beta_PO2', 100, value=200) PO2_masked = np.ma.masked_values(PO2[age_mask], value=20.111) x_PO2 = Gamma('x_PO2', alpha_PO2, beta_PO2, value=PO2_masked, observed=True) x_PO2_std = Lambda('x_PO2_std', lambda x=x_PO2: (x - 80)/np.std(x)) X = [x_pH, x_PO2_std, x_va, flu_unique.flu_only.values[age_mask], flu_unique.flu_staph.values[age_mask]] # Intercept for survival rate beta0 = Normal('beta0', 0.0, 0.001, value=0) # Covariates beta = Normal('beta', 0, 0.001, value=np.zeros(len(X))) odds = Lambda('odds', lambda b=beta: np.exp(b)) if AgeYears[age_mask].max()>1: age_knots = np.linspace(AgeYears_center[age_mask].min(), AgeYears_center[age_mask].max(), n_knots) α_age = Exponential('α_age', 1, value=0.1) spline_age = spline('spline_age', AgeYears_center[age_mask], age_knots, α_age) # Event rates @deterministic def π(b0=beta0, b=beta, x=X, γ=spline_age['mu_x']): return invlogit(b0 + np.dot(np.transpose(x), b) + γ) else: # Event rates @deterministic def π(b0=beta0, b=beta, x=X): return invlogit(b0 + np.dot(np.transpose(x), b)) deaths = Bernoulli('deaths', π, value=died[age_mask], observed=True) return locals() # ### Adults model # In[60]: adult_mask = AgeYears>=18 # In[61]: M_adult = MCMC(survival_model(adult_mask)) # In[62]: M_adult.sample(100000, 90000) # In[63]: cov_labels = ['pH', 'PO2', 'VA', 'flu only', 'flu + staph'] # In[64]: Matplot.summary_plot(M_adult.odds, custom_labels=cov_labels, hpd=False, vline_pos=1) # In[65]: M_adult.odds.summary() # In[66]: Matplot.summary_plot(M_adult.__dict__['spline_age']['pred'], custom_labels=np.linspace(AgeYears[adult_mask].min(), AgeYears[adult_mask].max(), 10).round(1).astype(str), xlab='Effect size', main='Age effect') # ### Child model # In[67]: child_mask = (AgeYears>=1) & (AgeYears<18) M_child = MCMC(survival_model(child_mask)) M_child.sample(100000, 90000) # In[68]: Matplot.summary_plot(M_child.odds, custom_labels=cov_labels, hpd=False, vline_pos=1) # In[69]: Matplot.summary_plot(M_child.__dict__['spline_age']['pred'], custom_labels=np.linspace(AgeYears[child_mask].min(), AgeYears[child_mask].max(), 10).round(1).astype(str), xlab='Effect size', main='Age effect') # In[70]: M_child.odds.summary() # ### Infant model # In[71]: infant_mask = (AgeYears<1) M_infant = MCMC(survival_model(infant_mask)) M_infant.sample(100000, 90000) # In[72]: Matplot.summary_plot(M_infant.odds, custom_labels=cov_labels, hpd=False, vline_pos=1) # In[73]: M_infant.odds.summary()