%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc as pm
import pdb
pm.__version__
'2.3.5'
flu = pd.read_csv('data/flu_organism.csv', index_col=0)
flu.shape
(2937, 64)
flu.PatientID.unique().shape
(1654,)
Create indicator for flu-only.
flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s:
len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(bool)
flu_only.head()
PatientID 001B0EC6-66F5-48C9-8516-52D2ABFE28AA False 0034181F-59AB-4B2C-A06A-A83EE1DF1A17 True 004AE387-1096-410D-A7CB-F9A9086AA385 False 0084DCF0-8E3C-4F1C-8650-BE795A57EF2F False 0084E706-90D0-4314-8C99-496171AB639D True Name: OrganismName, dtype: bool
flu_only.name = 'flu_only'
flu = flu.merge(pd.DataFrame(flu_only), left_on='PatientID', right_index=True)
# flu_staph_organism = flu.groupby('PatientID')['OrganismName'].apply(lambda s:
# len([x for x in s if str(x).startswith('Influenza') or
# str(x).startswith('Staphylococcus aureus')])>1).astype(bool)
# flu_staph_organism.name = 'flu_staph_org'
# flu = flu.merge(pd.DataFrame(flu_staph_organism), left_on='PatientID', right_index=True)
See if there are any flu-only or flu-other individuals with an ICD9 diagnosis of S. Aureus
flu_other = flu.groupby('PatientID')['OrganismName'].apply(lambda s:
len([x for x in s if str(x).startswith('Influenza') or
str(x).startswith('Other')])>1).astype(bool)
flu_other.name = 'flu_other'
flu = flu.merge(pd.DataFrame(flu_other), left_on='PatientID', right_index=True)
There do not appear to be cases of flu-only or flu-other via organism that have a S.Aureus diagnosis via ICD9
(flu_other & flu.s_aureus_icd9).sum()
0
(flu_only & flu.s_aureus_icd9).sum()
0
Same for pneumo
(flu_other & flu.pneumo_icd9).sum()
0
flu.groupby('PatientID')['OrganismName'].value_counts()
PatientID 001B0EC6-66F5-48C9-8516-52D2ABFE28AA Enterococcus 1 Klebsiella pneumoniae 1 0034181F-59AB-4B2C-A06A-A83EE1DF1A17 Influenza A 1 004AE387-1096-410D-A7CB-F9A9086AA385 Influenza A 1 Pseudomonas aerugenosa 1 Candida albicans 1 0084DCF0-8E3C-4F1C-8650-BE795A57EF2F Influenza A 1 Staphylococcus aureus 1 0084E706-90D0-4314-8C99-496171AB639D Influenza B 1 00EFD11D-650A-4317-8C4F-4B7378AD7946 Other 2 0118B486-8ACE-4BFF-8ED6-753D96E6B441 Influenza A 1 011B5A5D-9967-44A7-92F9-02F82D31216D Candida albicans 2 Influenza A 1 Epstein-Barr virus (EBV) 1 01AC9644-7CDF-4E28-A5F4-FE5B7F9FA22A Streptococcus pneumoniae 1 Staphylococcus, coag neg 1 Influenza A 1 Moraxella catarrhalis (Branhamella) 1 01F84F52-92D3-403B-B8AC-21A72A4BC771 Influenza A 1 02373407-74E6-4A1E-9629-F4DCBF707B0C Influenza A 1 025D87C9-CF70-4B9A-94BE-B15EA2185472 Eschericha coli 2 Influenza A 1 Enterococcus 1 02A94D2E-768A-4ABB-86BF-C8A188D698CC Influenza A 1 02C9FF5A-A4D6-461A-A650-E3EB405D62AE Influenza A 2 02D33150-FDDB-4F12-AAAB-D6F4B8765EBF Influenza A 1 031C1231-B585-478F-A98F-35E017B788CD Other 2 035A053E-91B1-4B68-87D4-F92C675FE6E3 Other 2 03AA4AD0-1B0E-4CAD-BD76-03FE52AA580D Influenza A 1 Epstein-Barr virus (EBV) 1 .. FC9F84B4-B2CD-4C28-B824-D5CE1AF7BEF1 Gram positive, other 2 Yeast sp. 1 Aspergillus fumigatus 1 Pseudomonas fluorescens 1 Enterococcus 1 Candida albicans 1 Stenotrophomonas maltophilia (Xanthomonas) 1 Other 1 FCBAEC19-EFA1-4A95-A98D-1D8FE65B53CE Influenza A 1 FCD795A0-9FD1-4C20-961B-04BEF1B23E96 Influenza A 1 Staphylococcus aureus 1 FCDF1967-C307-4FF1-8CF1-7AA02FE0BE47 Influenza A 1 FD2FA339-7417-4F13-811B-2E1873E529C7 Influenza A 1 FDC32D1D-0D67-4752-A2E1-72E790FCB797 Influenza A 1 Pseudomonas aerugenosa 1 FDED68D1-1DFF-4899-9CF9-797E6AE70D18 Influenza A 1 FDFF23E3-C0CB-468D-98D9-998192CA7590 Influenza A 1 Yeast sp. 1 Eschericha coli 1 Candida albicans 1 FEA94F00-083A-479C-B862-05274A9842B5 Influenza A 1 Streptococcus, group A 1 Staphylococcus aureus 1 FF0D88BF-E131-4E56-9E78-CC6B60E461FD Influenza A 1 Yeast sp. 1 Staphylococcus, coag neg 1 FF0FB831-9804-4506-AE0B-B447AB54D4D5 Hemophilus influenza 1 FF528500-5274-4225-96C2-CF1153EE3AB0 Other 1 FF6CD7E8-8956-4C15-AF6C-3033A38C48D7 Influenza A 1 Staphylococcus epidermidis 1 dtype: int64
Create indictors for coinfection type
for org in flu.Type.unique():
flu[org] = (flu.Type==org).astype(int)
Create data frame of unique patients
flu_unique = flu.drop_duplicates(subset=['PatientID']).set_index('PatientID')
flu_unique.s_aureus_icd9.sum()
100
flu_unique['flu_only'] = flu_only
flu_unique.flu_only.mean()
0.34159613059250304
Several missing values for admission to time on ECMO
flu_unique.AdmitToTimeOnHours.isnull().mean()
0.16686819830713423
Create variables for use in analysis
# ECMO Type
flu_unique['VA'] = flu_unique.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_unique.loc[flu_unique.Mode=='Other', 'VA'] = None
Create oxygen index
flu_unique['OI'] = flu_unique.FiO2 * flu_unique.MAP / flu_unique.PO2
flu_unique.OI.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x10dbac6a0>
covariates = ['AgeYears', 'pH', 'PO2', 'VA']
Get counts of missing values
flu_unique[covariates].isnull().mean()
AgeYears 0.000000 pH 0.107618 PO2 0.091898 VA 0.010883 dtype: float64
flu_unique[covariates].hist(bins=25);
flu_unique[flu_unique.AgeYears< 1].AgeDays.hist(bins=15)
<matplotlib.axes._subplots.AxesSubplot at 0x10dbc1e80>
flu_unique.s_aureus_icd9.sum()
100
(flu_unique.flu_staph_org | flu_unique.s_aureus_icd9).sum()
244
foo = '01AC9644-7CDF-4E28-A5F4-FE5B7F9FA22A'
flu_unique.ix[foo][['flu_staph_org', 's_aureus_icd9']]
flu_staph_org False s_aureus_icd9 True Name: 01AC9644-7CDF-4E28-A5F4-FE5B7F9FA22A, dtype: object
flu_unique['flu_staph'] = (flu_unique.flu_staph_org | flu_unique.s_aureus_icd9).astype(int)
flu_unique.to_csv('Data/flu_unique.csv')
Fate of each patient
died = (flu_unique.fate=='Died').astype(int).values
N_obs = len(died)
Functions for standarizing and centering
center = lambda x: (x - x[np.isnan(x) ^ True].mean())
standardize = lambda x: center(x) / x[np.isnan(x) ^ True].std()
covariates
['AgeYears', 'pH', 'PO2', 'VA']
AgeYears, pH, PO2, va = flu_unique[covariates].values.T
AgeYears_center = center(AgeYears).round(1)
# Center pH at 7.4
pH_center = pH - 7.4
# OI_std = standardize(OI)
# PO2_std = standardize(PO2)
no_coinfection = flu_unique.flu_only.values
with_staph = flu_unique.flu_staph.values
flu_unique.flu_staph.sum()
244
from pymc import MCMC, Normal, Poisson, Uniform, Lambda, Laplace, Exponential, Beta, HalfCauchy, Uninformative
from pymc import Bernoulli, Lognormal, observed, Matplot, deterministic, Gamma, stochastic
from pymc import AdaptiveMetropolis, normal_like, rnormal, potential, invlogit
from pymc.gp import *
# Values for missing elements, to be replaced in model
# OI_std[np.isnan(OI_std)] = 0
pH_center[np.isnan(pH_center)] = 7.111
va[np.isnan(va)] = 0.5
PO2[np.isnan(PO2)] = 20.111
def spline(name, x, knots, smoothing, interpolation_method='linear', pred_points=10):
""" Generate PyMC objects for a spline model of age-specific rate
Parameters
----------
name : str
knots : array
x : array, points to interpolate to
smoothing : pymc.Node, smoothness parameter for smoothing spline
interpolation_method : str, optional, one of 'linear', 'nearest', 'zero', 'slinear', 'quadratic, 'cubic'
Results
-------
Returns dict of PyMC objects, including 'gamma' (log of rate at
knots) and 'mu_age' (age-specific rate interpolated at all age
points)
"""
assert np.all(np.diff(knots) > 0), 'Spline knots must be strictly increasing'
# TODO: consider changing this prior distribution to be something more familiar in linear space
gamma = [Normal('gamma_%s_%d'%(name,k), 0., 10.**-2, value=0.) for k in knots]
#gamma = [mc.Uniform('gamma_%s_%d'%(name,k), -20., 20., value=-10.) for k in knots]
import scipy.interpolate
@deterministic(name='mu_x_%s'%name)
def mu_x(gamma=gamma, knots=knots, x=x):
mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method,
bounds_error=False, fill_value=0.)
return mu(x)
@deterministic(name='pred_%s'%name)
def pred(gamma=gamma, knots=knots):
mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method,
bounds_error=False, fill_value=0.)
return mu(np.linspace(x.min(), x.max(), pred_points))
vars = dict(gamma=gamma, mu_x=mu_x, pred=pred, x=x, knots=knots)
if (smoothing > 0) and (smoothing < 1e10):
#print 'adding smoothing of', smoothing
@potential(name='smooth_mu_%s'%name)
def smooth_gamma(gamma=gamma, knots=knots, tau=smoothing**-2):
# the following is to include a "noise floor" so that level value
# zero prior does not exert undue influence on age pattern
# smoothing
# TODO: consider changing this to an offset log normal
gamma = np.clip(gamma, np.log(np.exp(gamma).mean()/10.), np.inf) # only include smoothing on values within 10x of mean
return normal_like(np.sqrt(np.sum(np.diff(gamma)**2 / np.diff(knots))), 0, tau)
vars['smooth_gamma'] = smooth_gamma
return vars
np.mean(PO2)
58.167999999999999
plt.hist(PO2-80, bins=20)
(array([ 500., 886., 182., 25., 16., 11., 9., 5., 5., 2., 3., 1., 1., 0., 3., 0., 2., 2., 0., 1.]), array([ -76. , -40.85, -5.7 , 29.45, 64.6 , 99.75, 134.9 , 170.05, 205.2 , 240.35, 275.5 , 310.65, 345.8 , 380.95, 416.1 , 451.25, 486.4 , 521.55, 556.7 , 591.85, 627. ]), <a list of 20 Patch objects>)
def survival_model(age_mask, n_knots=5):
# Imputation of missing values
p_va = Beta('p_VA', 1, 1, value=0.5)
va_masked = np.ma.masked_values(va[age_mask], value=0.5)
x_va = Bernoulli('x_va', p_va, value=va_masked, observed=True)
mu_pH = Normal('mu_pH', 0, 0.0001, value=7)
sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
tau_pH = sigma_pH**-2
pH_masked = np.ma.masked_values(pH_center[age_mask], value=7.111)
x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)
alpha_PO2 = Exponential('alpha_PO2', 100, value=0.5)
beta_PO2 = Exponential('beta_PO2', 100, value=200)
PO2_masked = np.ma.masked_values(PO2[age_mask], value=20.111)
x_PO2 = Gamma('x_PO2', alpha_PO2, beta_PO2, value=PO2_masked, observed=True)
x_PO2_std = Lambda('x_PO2_std', lambda x=x_PO2: (x - 80)/np.std(x))
X = [x_pH, x_PO2_std, x_va, flu_unique.flu_only.values[age_mask], flu_unique.flu_staph.values[age_mask]]
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001, value=0)
# Covariates
beta = Normal('beta', 0, 0.001, value=np.zeros(len(X)))
odds = Lambda('odds', lambda b=beta: np.exp(b))
if AgeYears[age_mask].max()>1:
age_knots = np.linspace(AgeYears_center[age_mask].min(), AgeYears_center[age_mask].max(), n_knots)
α_age = Exponential('α_age', 1, value=0.1)
spline_age = spline('spline_age', AgeYears_center[age_mask], age_knots, α_age)
# Event rates
@deterministic
def π(b0=beta0, b=beta, x=X, γ=spline_age['mu_x']):
return invlogit(b0 + np.dot(np.transpose(x), b) + γ)
else:
# Event rates
@deterministic
def π(b0=beta0, b=beta, x=X):
return invlogit(b0 + np.dot(np.transpose(x), b))
deaths = Bernoulli('deaths', π, value=died[age_mask], observed=True)
return locals()
adult_mask = AgeYears>=18
M_adult = MCMC(survival_model(adult_mask))
M_adult.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 332.3 sec
cov_labels = ['pH', 'PO2', 'VA', 'flu only', 'flu + staph']
Matplot.summary_plot(M_adult.odds, custom_labels=cov_labels, hpd=False, vline_pos=1)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
M_adult.odds.summary()
odds: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.086 0.038 0.003 [ 0.029 0.162] 0.927 0.069 0.003 [ 0.795 1.065] 2.825 0.588 0.036 [ 1.795 4.004] 1.002 0.151 0.008 [ 0.722 1.295] 1.069 0.22 0.012 [ 0.625 1.472] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.034 0.056 0.078 0.106 0.18 0.789 0.88 0.927 0.974 1.06 1.894 2.4 2.759 3.173 4.191 0.734 0.896 0.993 1.1 1.322 0.68 0.916 1.05 1.206 1.55
Matplot.summary_plot(M_adult.__dict__['spline_age']['pred'],
custom_labels=np.linspace(AgeYears[adult_mask].min(), AgeYears[adult_mask].max(), 10).round(1).astype(str),
xlab='Effect size', main='Age effect')
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
child_mask = (AgeYears>=1) & (AgeYears<18)
M_child = MCMC(survival_model(child_mask))
M_child.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 253.8 sec
Matplot.summary_plot(M_child.odds, custom_labels=cov_labels,
hpd=False, vline_pos=1)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Matplot.summary_plot(M_child.__dict__['spline_age']['pred'],
custom_labels=np.linspace(AgeYears[child_mask].min(), AgeYears[child_mask].max(), 10).round(1).astype(str),
xlab='Effect size', main='Age effect')
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
M_child.odds.summary()
odds: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.554 0.369 0.029 [ 0.068 1.244] 0.857 0.108 0.004 [ 0.653 1.075] 1.525 0.331 0.018 [ 0.888 2.138] 0.947 0.226 0.011 [ 0.549 1.435] 2.513 0.814 0.053 [ 1.173 4.082] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.152 0.319 0.458 0.687 1.548 0.653 0.784 0.855 0.927 1.078 0.997 1.3 1.485 1.715 2.33 0.57 0.788 0.922 1.077 1.476 1.311 1.956 2.378 2.904 4.526
infant_mask = (AgeYears<1)
M_infant = MCMC(survival_model(infant_mask))
M_infant.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 57.5 sec
Matplot.summary_plot(M_infant.odds, custom_labels=cov_labels, hpd=False, vline_pos=1)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
M_infant.odds.summary()
odds: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 1.344 2.017 0.134 [ 0.034 4.176] 1.128 0.246 0.013 [ 0.67 1.624] 3.313 1.821 0.133 [ 0.833 6.437] 1.599 0.63 0.034 [ 0.606 2.866] 5.411 4.469 0.333 [ 0.509 14.367] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.093 0.409 0.784 1.461 6.261 0.725 0.959 1.092 1.262 1.719 1.247 2.144 2.837 4.017 7.654 0.739 1.153 1.486 1.871 3.276 1.169 2.625 4.044 6.681 19.302