%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
:0: FutureWarning: IPython widgets are experimental and may change in the future.
flu = pd.read_csv('data/flu.csv', index_col=0)
flu.shape
(1406, 60)
Turn into table of unique patients, with appropriate organism columns
flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s:
len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(int)
coinfections = flu.groupby('PatientID')['OrganismName'].apply(lambda s:
[x for x in s.dropna() if not str(x).startswith('Influenza')])
MRSA coinfection
mrsa = coinfections.apply(lambda x: x.count('Staphylococcus aureus, meth resist') > 0).astype(int)
Any coinfection
any_coinf = coinfections.apply(lambda x: int(len(x)>0))
flu_unique = flu.drop_duplicates(cols=['PatientID']).set_index('PatientID')
/usr/local/lib/python3.4/site-packages/pandas/util/decorators.py:81: FutureWarning: the 'cols' keyword is deprecated, use 'subset' instead warnings.warn(msg, FutureWarning)
flu_unique['flu_only'] = flu_only
flu_unique['mrsa'] = mrsa
flu_unique['any_coinf'] = any_coinf
Add time from admission to time on ECMO
admit_through_emco = flu_unique.AdmitToTimeOnHours.add(flu_unique.HoursECMO, fill_value=0)
See how many have no time information from admission, and drop these
missing = admit_through_emco.isnull()
missing.sum()
1
flu_complete = flu_unique[missing-True]
admit_through_emco.dropna(inplace=True)
/usr/local/lib/python3.4/site-packages/pandas/computation/expressions.py:190: UserWarning: evaluating in Python space because the '-' operator is not supported by numexpr for the bool dtype, use '^' instead unsupported[op_str]))
# Confirm no null here
admit_through_emco.isnull().sum()
0
Time off ECMO through to event (death or discharge)
off_ecmo_to_event = flu_complete.TimeOffToDCDateHours.add(flu_complete.TimeOffToDeathDateHours,
fill_value=0)
off_ecmo_to_event.isnull().sum()
51
# Not sure why add() does not work here
flu_complete['time_to_event'] = (admit_through_emco.values + off_ecmo_to_event.fillna(0).values)
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app
flu_complete.time_to_event.isnull().sum()
0
Clean covariates
flu_complete.Race.value_counts()
W 554 A 102 B 83 H 82 O 56 dtype: int64
flu_complete['non_white'] = (flu_complete.Race!='W').astype(int)
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
flu_complete['male'] = (flu_complete.Sex=='M').astype(int)
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
flu_complete['OI'] = flu_complete.FiO2 * flu_complete.MAP / flu_complete.PO2
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
# ECMO Type
flu_complete['VA'] = flu_complete.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_complete.loc[flu_complete.Mode=='Other', 'VA'] = None
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app /usr/local/lib/python3.4/site-packages/pandas/core/indexing.py:415: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[item] = s
# covariates = ['male', 'AgeYears', 'non_white', 'HoursECMO', 'pH', 'PCO2', 'PO2', 'HCO3',
# 'SaO2', 'mrsa', 'any_coinf']
covariates = ['male', 'AgeYears', 'non_white', 'pH', 'PCO2', 'HCO3', 'PO2', 'OI',
'SaO2', 'mrsa', 'any_coinf', 'AdmitToTimeOnHours', 'VA', 'PEEP']
Get counts of missing values
flu_complete[covariates].isnull().mean().round(2)*100
male 0 AgeYears 0 non_white 0 HoursECMO 0 pH 11 PCO2 10 HCO3 19 PO2 9 OI 43 SaO2 16 mrsa 0 any_coinf 0 AdmitToTimeOnHours 13 VA 2 PEEP 38 dtype: float64
flu_complete.columns
Index(['RunNo', 'AgeDays', 'HoursECMO', 'SupportType', 'PrimaryDx', 'Mode', 'Discontinuation', 'DischargedAlive', 'DischargeLocation', 'YearECLS', 'VentType', 'Rate', 'FiO2', 'PIP', 'PEEP', 'MAP', 'HandBagging', 'pH', 'PCO2', 'PO2', 'HCO3', 'SaO2', 'Venttype24', 'Rate24', 'Fio224', 'PIP24', 'PEEP24', 'MAP24', 'Handbagging24', 'SBP', 'DBP', 'MapHemo', 'SVO2', 'PCWP', 'SPAP', 'DPAP', 'MPAP', 'CI', 'Race', 'Sex', 'AdmitToTimeOnHours', 'TimeOffToExtubationDateHours', 'TimeOffToDeathDateHours', 'TimeOffToDCDateHours', 'ExtubationToDCDateHours', 'ExtubationToDeathDateHours', 'year', 'HoursVent', 'AgeYears', 'age_class', 'RunID', 'OrganismNo', 'OrganismName', 'CultureSite', 'CultureTimeIsApproximate', 'OrganismTiming', 'Type', 'has_flu', 'fate', 'flu_only', 'mrsa', 'any_coinf', 'time_to_event', 'non_white', 'male', 'OI', 'VA'], dtype='object')
flu_complete.groupby('fate')[['HoursECMO', 'AdmitToTimeOnHours']].mean()
HoursECMO | AdmitToTimeOnHours | |
---|---|---|
fate | ||
Died | 324.935754 | 101.009709 |
Survived | 293.647280 | 60.439914 |
We are going to drop the missing values for some of the covariates, and not use some covariates with many missing values.
drop_subset = ['HoursECMO', 'pH', 'PCO2', 'PO2', 'VA']
flu_complete = flu_complete.dropna(subset=drop_subset)
flu_complete[['pH', 'PCO2', 'PO2', 'HCO3',
'SaO2']].hist(bins=25);
Surivival time, in days
obs_t = flu_complete.time_to_event.values/24.
plt.hist(obs_t, bins=25);
Fate of each patient
died = (flu_complete.fate=='Died').astype(int).values
from lifelines import KaplanMeierFitter
km = KaplanMeierFitter()
km.fit(obs_t, event_observed=died)
ax = km.plot()
mrsa_ind = flu_complete.mrsa.astype(bool).values
km = KaplanMeierFitter()
km.fit(obs_t[~mrsa_ind], event_observed=died[~mrsa_ind], label='non-MRSA')
ax = km.plot()
km.fit(obs_t[mrsa_ind], event_observed=died[mrsa_ind], label='MRSA')
km.plot(ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x10b8d1ba8>
from lifelines import NelsonAalenFitter
na = NelsonAalenFitter()
na.fit(obs_t, event_observed=died)
na.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10ba0d748>
Unique failure times
times = np.unique((flu_complete.time_to_event[flu_complete.fate=='Died'].values/24).astype(int))
times
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 53, 54, 55, 59, 61, 62, 63, 67, 68, 73, 77, 108, 121, 125, 126, 133, 142, 150, 201, 227, 235, 301])
N_obs = len(obs_t)
T = len(times) - 1
Calculate risk set
Y = np.array([[int(obs >= t) for t in times] for obs in obs_t])
Y
array([[1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], ..., [1, 0, 0, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0]])
Counting process. Jump = 1 if $\text{obs}_t \in [ t_j, t_{j+1} )$
dN = np.array([[Y[i,j]*(times[j+1] >= obs_t[i])*died[i] for i in range(N_obs)] for j in range(T)])
# Process for one patient
dN[:,1]
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Covariates
standardize = lambda x: (x - x[np.isnan(x)-True].mean()) / x[np.isnan(x)-True].std()
analysis_covariates = ['male', 'AgeYears', 'non_white', 'HoursECMO', 'pH', 'PCO2', 'PO2', 'mrsa', 'any_coinf', 'VA']
flu_complete[analysis_covariates].isnull().sum()
male 0 AgeYears 0 non_white 0 HoursECMO 0 pH 0 PCO2 0 PO2 0 mrsa 0 any_coinf 0 VA 0 dtype: int64
male, AgeYears, non_white, HoursECMO, pH, PCO2, PO2, mrsa, any_coinf, VA= \
flu_complete[analysis_covariates].values.T
HoursECMO_std = standardize(HoursECMO)
AgeYears_std = standardize(AgeYears)
pH_std = standardize(pH)
PCO2_std = standardize(PCO2)
PO2_std = standardize(PO2)
# HCO3_std = standardize(HCO3)
import theano.tensor as tt
from pymc3 import Model, Normal, Poisson, DensityDist, T as StudentT, Uniform, Deterministic, HalfCauchy, Laplace, Exponential
from pymc3 import Bernoulli, Gamma, MvNormal, BinaryMetropolis
from pymc3.distributions.timeseries import GaussianRandomWalk
from pymc3 import sample, NUTS, Slice, Metropolis, forestplot, find_MAP
n_kernels = 5
kernels = np.linspace(AgeYears_std.min(), AgeYears_std.max(), n_kernels)
def rbf(s, x):
return [tt.exp( ((x - k)/ s)**2) for k in kernels]
with Model() as model:
sigma = Uniform('sigma', 0, 100)
alpha_sd = Uniform('alpha_sd', 0, 100)
alpha = GaussianRandomWalk('alpha', sd=alpha_sd, shape=n_kernels)
a = tt.dot(alpha, rbf(sigma, AgeYears_std))
X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001)
# Treatment effect
beta = Normal('beta', 0, 0.001, shape=X.shape[1])
# Survival rates
lam = tt.exp(beta0 + tt.dot(X, beta) + a)
def exp_logp(failure, value):
# Exponential survival log-likelihood
return tt.sum(failure * tt.log(lam) - lam * value)
surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))
with model:
trace = sample(2000, step=NUTS(), njobs=2)
[-----------------100%-----------------] 2000 of 2000 complete in 821.2 sec
forestplot(trace, vars=['alpha'])
<matplotlib.gridspec.GridSpec at 0x10dcbffd0>
forestplot(trace, vars=['beta'], ylabels=['male', 'pH', 'PCO2', 'PO2', 'HoursECMO', 'non_white', 'mrsa', 'any_coinf'])
<matplotlib.gridspec.GridSpec at 0x10fae6b70>
flu_complete.columns
Index(['RunNo', 'AgeDays', 'HoursECMO', 'SupportType', 'PrimaryDx', 'Mode', 'Discontinuation', 'DischargedAlive', 'DischargeLocation', 'YearECLS', 'VentType', 'Rate', 'FiO2', 'PIP', 'PEEP', 'MAP', 'HandBagging', 'pH', 'PCO2', 'PO2', 'HCO3', 'SaO2', 'Venttype24', 'Rate24', 'Fio224', 'PIP24', 'PEEP24', 'MAP24', 'Handbagging24', 'SBP', 'DBP', 'MapHemo', 'SVO2', 'PCWP', 'SPAP', 'DPAP', 'MPAP', 'CI', 'Race', 'Sex', 'AdmitToTimeOnHours', 'TimeOffToExtubationDateHours', 'TimeOffToDeathDateHours', 'TimeOffToDCDateHours', 'ExtubationToDCDateHours', 'ExtubationToDeathDateHours', 'year', 'HoursVent', 'AgeYears', 'age_class', 'RunID', 'OrganismNo', 'OrganismName', 'CultureSite', 'CultureTimeIsApproximate', 'OrganismTiming', 'Type', 'has_flu', 'fate', 'flu_only', 'mrsa', 'any_coinf', 'time_to_event', 'non_white', 'male', 'OI', 'VA'], dtype='object')
flu_complete.year.value_counts()
2009 338 2011 186 2010 115 2012 73 2008 31 2007 25 2003 24 2005 19 2006 17 2000 14 1998 10 1999 9 2002 8 2001 8 2004 6 1997 3 1996 2 1995 1 1994 1 1993 1 1992 1 dtype: int64
forestplot(trace, vars=['beta'], ylabels=['male', 'pH', 'PCO2', 'PO2', 'HoursECMO', 'non_white', 'mrsa', 'any_coinf'])
<matplotlib.gridspec.GridSpec at 0x125108850>
Same model above, but with a Weibull hazard function
with Model() as model_weibull:
X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001)
# Treatment effect
beta = Normal('beta', 0, 0.001, shape=X.shape[1])
# Survival rates
lam = tt.exp(beta0 + tt.dot(X, beta))
alpha = Exponential('alpha', 1)
def weibull_logp(failure, times):
# Weibull survival log-likelihood
return tt.sum(failure*(tt.log(alpha) + (alpha-1)*tt.log(times*lam) + tt.log(lam)) - (lam*times)**alpha)
surv_like = DensityDist('surv_like', weibull_logp, observed=(died, obs_t))
with model_weibull:
trace_weibull = sample(2000, step=NUTS())
--------------------------------------------------------------------------- PositiveDefiniteError Traceback (most recent call last) <ipython-input-52-4e1716b5764d> in <module>() 1 with model_weibull: ----> 2 trace_weibull = sample(2000, step=NUTS()) /usr/local/lib/python3.4/site-packages/pymc3/step_methods/nuts.py in __init__(self, vars, scaling, step_scale, is_cov, state, Emax, target_accept, gamma, k, t0, model, **kwargs) 67 68 ---> 69 self.potential = quad_potential(scaling, is_cov, as_cov=False) 70 71 if state is None: /usr/local/lib/python3.4/site-packages/pymc3/step_methods/quadpotential.py in quad_potential(C, is_cov, as_cov) 33 return QuadPotential_SparseInv(C) 34 ---> 35 partial_check_positive_definite(C) 36 if C.ndim == 1: 37 if is_cov != as_cov: /usr/local/lib/python3.4/site-packages/pymc3/step_methods/quadpotential.py in partial_check_positive_definite(C) 56 if len(i): 57 raise PositiveDefiniteError( ---> 58 "Simple check failed. Diagonal contains negatives", i) 59 60 PositiveDefiniteError: Scaling is not positive definite. Simple check failed. Diagonal contains negatives. Check indexes [0 1 2 3 4 5 6 7 8]
The following incorporates a spline for the age effect.
def interpolate(x0, y0, x):
x = np.array(x)
idx = np.searchsorted(x0, x)
dl = np.array(x - x0[idx - 1])
dr = np.array(x0[idx] - x)
d=dl+dr
wl = dr/d
return wl*y0[idx-1] + (1-wl)*y0[idx]
nknots = 10
with Model() as model_spline:
#X = np.c_[[male, non_white, mrsa, any_coinf]].T
coeff_sd = Uniform('coeff_sd', 0, 100)
coeff_tau = coeff_sd**-2
spline = Normal('spline', 0, coeff_tau, shape=nknots)
knots = np.linspace(AgeYears.min(), AgeYears.max()+1, nknots)
beta1 = Deterministic('beta1', interpolate(knots, spline, AgeYears))
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001)
# Treatment effect
#beta = Normal('beta', 0, 0.001, shape=X.shape[1])
# Survival rates
lam = tt.exp(beta0 + beta1)# + tt.dot(X, beta))
def exp_logp(failure, value):
# Exponential survival log-likelihood
return tt.sum(failure * tt.log(lam) - lam * value)
surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-37-db5cbe3c3dee> in <module>() 9 spline = Normal('spline', 0, coeff_tau, shape=nknots) 10 knots = np.linspace(AgeYears.min(), AgeYears.max()+1, nknots) ---> 11 beta1 = Deterministic('beta1', interpolate(knots, spline, AgeYears)) 12 13 # Intercept for survival rate NameError: name 'interpolate' is not defined
with model_spline:
trace_spline = sample(1000, step=NUTS())
forestplot(trace_spline, vars=['beta'],
ylabels=['male', 'non_white', 'mrsa', 'any_coinf'])
<matplotlib.gridspec.GridSpec at 0x11bb06a90>
forestplot(trace_spline, vars=['spline'])
<matplotlib.gridspec.GridSpec at 0x11da06210>
with Model() as log_logistic_model:
sigma = Uniform('sigma', 0, 100)
alpha_sd = Uniform('alpha_sd', 0, 100)
alpha = GaussianRandomWalk('alpha', sd=alpha_sd, shape=n_kernels)
a = tt.dot(alpha, rbf(sigma, AgeYears_std))
X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001)
# Treatment effect
beta = Normal('beta', 0, 0.001, shape=X.shape[1])
# Survival rates
lam = tt.exp(beta0 + tt.dot(X, beta) + a)
def exp_logp(failure, value):
# Exponential survival log-likelihood
return tt.sum(failure * tt.log(lam) - lam * value)
surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))
plt.hist(AgeYears)
(array([ 248., 106., 75., 103., 79., 64., 66., 40., 7., 4.]), array([ 0. , 8.13123288, 16.26246575, 24.39369863, 32.52493151, 40.65616438, 48.78739726, 56.91863014, 65.04986301, 73.18109589, 81.31232877]), <a list of 10 Patch objects>)
np.eye(4)*[2,3,4,5]
array([[ 2., 0., 0., 0.], [ 0., 3., 0., 0.], [ 0., 0., 4., 0.], [ 0., 0., 0., 5.]])
gaussian_basis = lambda x, h, l: np.exp(-(np.abs(x - h) / l)**2)
n_rbf = 20
centers = np.linspace(AgeYears.min(), AgeYears.max()+1, n_rbf)
l = 3
# Weights
w = [[gaussian_basis(x, h, l) for h in centers] for x in AgeYears]
w_pred = [[gaussian_basis(x, h, l) for h in centers] for x in np.linspace(AgeYears.min(), AgeYears.max()+1, 100)]
with Model() as model_rbf:
# Prior on number of basis functions
z = Bernoulli('z', 0.5, shape=n_rbf)
# Prior for non-zero coefficients
sigma = HalfCauchy('sigma', 25)
#kappa = Gamma('kappa', 0.5, 0.5, shape=n_rbf)
#Tau = tt.diag(kappa) #* sigma**-2
#beta = MvNormal('beta', [0]*n_rbf, Tau, shape=n_rbf)
beta = GaussianRandomWalk('beta', sd=sigma, shape=n_rbf)
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001)
# Hazard rates
lam = Deterministic('lam', tt.exp(beta0 + tt.dot(w, z*beta)))
lam_pred = Deterministic('lam_pred', tt.exp(beta0 + tt.dot(w_pred, z*beta)))
def exp_logp(failure, value):
# Exponential survival log-likelihood
return tt.sum(failure * tt.log(lam) - lam * value)
surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))
with model_rbf:
start = {'z': [1]*n_rbf, 'sigma': 10, 'beta': [0]*n_rbf, 'beta0': 0}
step1 = BinaryMetropolis(vars=[z])
step2 = NUTS(vars=[sigma, beta, beta0], scaling=start)
trace_rbf = sample(2000, step=[step1, step2], start=start)
/Library/Python/2.7/site-packages/theano/scan_module/scan_perform_ext.py:85: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility from scan_perform.scan_perform import *
from pymc import traceplot
forestplot(trace_rbf[5000:], vars=['lam_pred'])
_ = traceplot(trace_rbf[5000:], vars=['sigma'])
from pymc import summary
summary(trace_rbf, vars=['z'])