%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc as pm
pm.__version__
'2.3.2'
flu = pd.read_csv('data/flu.csv', index_col=0)
flu.shape
(1406, 60)
Turn into table of unique patients, with appropriate organism columns
flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s:
len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(int)
Create indictors for coinfection type
for org in flu.Type.unique():
flu[org] = (flu.Type==org).astype(int)
flu_unique = flu.drop_duplicates(subset=['PatientID']).set_index('PatientID')
flu_unique['flu_only'] = flu_only
Add time from admission to time on ECMO
admit_through_emco = flu_unique.AdmitToTimeOnHours.add(flu_unique.HoursECMO, fill_value=0)
See how many have no time information from admission, and drop these
missing = admit_through_emco.isnull()
missing.sum()
1
flu_complete = flu_unique[missing ^ True]
admit_through_emco.dropna(inplace=True)
# Confirm no null here
admit_through_emco.isnull().sum()
0
flu_complete['admit_through_emco'] = admit_through_emco
Time off ECMO through to event (death or discharge)
off_ecmo_to_event = flu_complete.TimeOffToDCDateHours.add(flu_complete.TimeOffToDeathDateHours,
fill_value=0)
off_ecmo_to_event.isnull().sum()
51
# Not sure why add() does not work here
flu_complete['time_to_event'] = (admit_through_emco.values + off_ecmo_to_event.fillna(0).values)
flu_complete.time_to_event.isnull().sum()
0
Clean covariates
flu_complete.Race.value_counts()
W 554 A 102 B 83 H 82 O 56 dtype: int64
flu_complete['non_white'] = (flu_complete.Race!='W').astype(int)
flu_complete['male'] = (flu_complete.Sex=='M').astype(int)
flu_complete['PO2'][flu_complete.PO2 > 200] = None
/usr/local/lib/python2.7/site-packages/pandas/core/series.py:632: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame self.where(~key, value, inplace=True)
# ECMO Type
flu_complete['VA'] = flu_complete.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_complete.VA[flu_complete.Mode=='Other'] = np.nan
Create oxygen index
flu_complete['OI'] = flu_complete.FiO2 * flu_complete.MAP / flu_complete.PO2
flu_complete.OI.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x1153ef190>
flu_complete[flu_complete.OI>250][['pH', 'PCO2', 'PO2', 'HCO3',
'SaO2', 'FiO2']]
pH | PCO2 | PO2 | HCO3 | SaO2 | FiO2 | |
---|---|---|---|---|---|---|
PatientID | ||||||
C2CFFB42-3BED-4ED3-888D-213F16088627 | 7.12 | 70.0 | 8.0 | 23.0 | 4.0 | 100 |
B94B0EE6-9085-4963-99F1-92A626E2DAB9 | 7.27 | 9.0 | 4.0 | 34.0 | 53.0 | 100 |
EB457A6D-41CB-48C0-A305-CEB4B034E3B2 | 7.32 | 6.3 | 7.5 | 22.5 | 83.0 | 100 |
EB0E87F5-39EA-4330-A104-E90254929813 | 7.33 | 6.4 | 5.6 | 23.2 | 71.9 | 100 |
covariates = ['male', 'AgeYears', 'HoursECMO', 'pH', 'PCO2', 'HCO3', 'OI',
'SaO2', 'viral', 'fungal', 'bacterial', 'admit_through_emco', 'VA']
Get counts of missing values
flu_complete[covariates].isnull().sum()
male 0 AgeYears 0 HoursECMO 1 pH 95 PCO2 85 HCO3 167 OI 396 SaO2 147 viral 0 fungal 0 bacterial 0 admit_through_emco 0 VA 14 dtype: int64
We are going to drop the missing values for some of the covariates, and not use some covariates with many missing values.
drop_subset = ['HoursECMO', 'VA']
flu_complete = flu_complete.dropna(subset=drop_subset)
flu_complete[['pH', 'PCO2', 'PO2', 'HCO3',
'SaO2', 'FiO2']].hist(bins=25);
Surivival time, in days
obs_t = flu_complete.time_to_event.values/24.
plt.hist(obs_t, bins=25);
Fate of each patient
died = (flu_complete.fate=='Died').astype(int).values
Unique failure times
times = np.unique((flu_complete.time_to_event[flu_complete.fate=='Died'].values/24).astype(int))
times
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 53, 54, 55, 59, 61, 62, 63, 67, 68, 73, 77, 108, 121, 125, 126, 133, 142, 150, 201, 227, 235, 301])
N_obs = len(obs_t)
T = len(times) - 1
Calculate risk set
Y = np.array([[int(obs >= t) for t in times] for obs in obs_t])
Y
array([[1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], ..., [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0], [1, 1, 1, ..., 0, 0, 0]])
Counting process. Jump = 1 if $\text{obs}_t \in [ t_j, t_{j+1} )$
dN = np.array([[Y[i,j]*(times[j+1] >= obs_t[i])*died[i] for i in range(N_obs)] for j in range(T)])
# Process for one patient
dN[:,1]
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Covariates
standardize = lambda x: (x - x[np.isnan(x) ^ True].mean()) / x[np.isnan(x) ^ True].std()
center = lambda x: (x - x[np.isnan(x) ^ True].mean())
male, AgeYears, HoursECMO, pH, PCO2, HCO3, OI, SaO2, viral, fungal,bacterial, admit_through_emco, va = \
flu_complete[covariates].values.T
SaO2 /= 100.
SaO2 -= 1e-6
HoursECMO_std = standardize(HoursECMO)
AgeYears_std = standardize(AgeYears)
pH_std = standardize(pH)
PCO2_std = standardize(PCO2)
HCO3_std = standardize(HCO3)
SaO2_std = standardize(SaO2)
OI_std = standardize(OI)
admit_through_emco_std = standardize(admit_through_emco)
from pymc import MCMC, Normal, Poisson, Uniform, Lambda, Laplace, Exponential, Beta
from pymc import Bernoulli, Lognormal, observed, Matplot, deterministic
SaO2[np.isnan(SaO2)] = 0.911
OI_std[np.isnan(OI_std)] = 0
pH_std[np.isnan(pH_std)] = 0
PCO2_std[np.isnan(PCO2_std)] = 0
def survival_model():
mu_pH = Normal('mu_pH', 0, 0.0001, value=0)
sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
tau_pH = sigma_pH**-2
pH_masked = np.ma.masked_values(pH_std, value=0)
x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)
mu_PCO2 = Normal('mu_PCO2', 0, 0.0001, value=0)
sigma_PCO2 = Uniform('sigma_PCO2', 0, 500, value=10)
tau_PCO2 = sigma_PCO2**-2
PCO2_masked = np.ma.masked_values(PCO2_std, value=0)
x_PCO2 = Normal('x_PCO2', mu_PCO2, tau_PCO2, value=PCO2_masked, observed=True)
mu_OI = Normal('mu_OI', 0, 0.0001, value=0)
sigma_OI = Uniform('sigma_OI', 0, 500, value=10)
tau_OI = sigma_OI**-2
OI_masked = np.ma.masked_values(OI_std, value=0)
x_OI = Normal('x_OI', mu_OI, tau_OI, value=OI_masked, observed=True)
a_SaO2 = Exponential('a_SaO2', 1, value=1)
b_SaO2 = Exponential('b_SaO2', 1, value=1)
SaO2_masked = np.ma.masked_values(SaO2, value=0.911)
x_SaO2 = Beta('x_SaO2', a_SaO2, b_SaO2, value=SaO2_masked, observed=True)
SaO2_std = Lambda('SaO2_std', lambda s=x_SaO2: (s - s.mean())/s.std())
imputed_vars = [x_pH, x_PCO2, x_OI, SaO2_std]
X = np.c_[[male, AgeYears_std, admit_through_emco_std, bacterial, viral, fungal, va]].T
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001, value=0)
# Treatment effect
beta = Normal('beta', 0, 0.001, value=[0]*(X.shape[1]+len(imputed_vars)))
# Survival rates
lam = Lambda('lam', lambda b0=beta0, b=beta, x=X, z=imputed_vars: np.exp(b0 + np.dot(np.c_[x,np.transpose(z)], b)))
@observed
def survival(value=obs_t, lam=lam, f=died):
"""Exponential survival likelihood, accounting for censoring"""
return (f*np.log(lam) - lam*value).sum()
return locals()
M = MCMC(survival_model())
M.sample(50000, 40000)
[-----------------100%-----------------] 50000 of 50000 complete in 88.5 sec
M.sample(50000, 40000)
[-----------------100%-----------------] 50000 of 50000 complete in 88.9 sec
Matplot.summary_plot([M.beta],
custom_labels=['male', 'AgeYears', 'Admit to ECMO', 'bacterial', 'viral', 'fungal',
'pH', 'PCO2', 'OI', 'SaO2', 'VA'])
Same model above, but with a Weibull hazard function
def weibull_survival_model():
mu_pH = Normal('mu_pH', 0, 0.0001, value=0)
sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
tau_pH = sigma_pH**-2
pH_masked = np.ma.masked_values(pH_std, value=0)
x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)
mu_PCO2 = Normal('mu_PCO2', 0, 0.0001, value=0)
sigma_PCO2 = Uniform('sigma_PCO2', 0, 500, value=10)
tau_PCO2 = sigma_PCO2**-2
PCO2_masked = np.ma.masked_values(PCO2_std, value=0)
x_PCO2 = Normal('x_PCO2', mu_PCO2, tau_PCO2, value=PCO2_masked, observed=True)
mu_OI = Normal('mu_OI', 0, 0.0001, value=0)
sigma_OI = Uniform('sigma_OI', 0, 500, value=10)
tau_OI = sigma_OI**-2
OI_masked = np.ma.masked_values(OI_std, value=0)
x_OI = Normal('x_OI', mu_OI, tau_OI, value=OI_masked, observed=True)
a_SaO2 = Exponential('a_SaO2', 1, value=1)
b_SaO2 = Exponential('b_SaO2', 1, value=1)
SaO2_masked = np.ma.masked_values(SaO2, value=0.911)
x_SaO2 = Beta('x_SaO2', a_SaO2, b_SaO2, value=SaO2_masked, observed=True)
SaO2_std = Lambda('SaO2_std', lambda s=x_SaO2: (s - s.mean())/s.std())
imputed_vars = [x_pH, x_PCO2, x_OI, SaO2_std]
X = np.c_[[male, AgeYears_std, admit_through_emco_std, bacterial, viral, fungal]].T
# Intercept for survival rate
beta0 = Normal('beta0', 0.0, 0.001, value=0)
# Treatment effect
beta = Normal('beta', 0, 0.001, value=[0]*(X.shape[1]+len(imputed_vars)))
# Survival rates
lam = Lambda('lam', lambda b0=beta0, b=beta, x=X, z=imputed_vars: np.exp(b0 + np.dot(np.c_[x,np.transpose(z)], b)))
alpha = Exponential('alpha', 1, value=1)
@observed
def survival(value=obs_t, lam=lam, f=died, alpha=alpha):
# Weibull survival log-likelihood
return (f*(tt.log(alpha) + (alpha-1)*np.log(value*lam) + np.log(lam)) - (lam*value)**alpha).sum()
return locals()
W = MCMC(survival_model())
W.sample(50000, 40000)
[-----------------100%-----------------] 50000 of 50000 complete in 109.3 sec
W.sample(50000, 40000)
[-----------------100%-----------------] 50000 of 50000 complete in 108.4 sec
Matplot.summary_plot([W.beta],
custom_labels=['male', 'AgeYears', 'Admit to ECMO', 'bacterial', 'viral', 'fungal',
'pH', 'PCO2', 'OI', 'SaO2'])
from scipy.interpolate import Rbf
def rbf_survival_model():
node_loc = standardize(flu_complete.AgeYears).quantile([0.1, 0.25, 0.5, 0.75, .9]).values
rbf_nodes = Normal('rbf_nodes', 0, 0.0001, value=[0]*len(node_loc))
@deterministic
def age_rbf(nodes=rbf_nodes):
r = Rbf(node_loc, nodes)
return r(AgeYears_std)
# evaluate RBF at several nodes
age_mean = AgeYears[np.isnan(AgeYears) ^ True].mean()
age_std = AgeYears[np.isnan(AgeYears) ^ True].std()
@deterministic
def age_rbf_x(nodes=rbf_nodes):
r = Rbf(node_loc, nodes)
vals = [10, 20, 30, 40, 50, 60, 70]
return r((vals - age_mean)/age_std)
mu_pH = Normal('mu_pH', 0, 0.0001, value=0)
sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
tau_pH = sigma_pH**-2
pH_masked = np.ma.masked_values(pH_std, value=0)
x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)
mu_PCO2 = Normal('mu_PCO2', 0, 0.0001, value=0)
sigma_PCO2 = Uniform('sigma_PCO2', 0, 500, value=10)
tau_PCO2 = sigma_PCO2**-2
PCO2_masked = np.ma.masked_values(PCO2_std, value=0)
x_PCO2 = Normal('x_PCO2', mu_PCO2, tau_PCO2, value=PCO2_masked, observed=True)
mu_OI = Normal('mu_OI', 0, 0.0001, value=0)
sigma_OI = Uniform('sigma_OI', 0, 500, value=10)
tau_OI = sigma_OI**-2
OI_masked = np.ma.masked_values(OI_std, value=0)
x_OI = Normal('x_OI', mu_OI, tau_OI, value=OI_masked, observed=True)
a_SaO2 = Exponential('a_SaO2', 1, value=1)
b_SaO2 = Exponential('b_SaO2', 1, value=1)
SaO2_masked = np.ma.masked_values(SaO2, value=0.911)
x_SaO2 = Beta('x_SaO2', a_SaO2, b_SaO2, value=SaO2_masked, observed=True)
SaO2_std = Lambda('SaO2_std', lambda s=x_SaO2: (s - s.mean())/s.std())
imputed_vars = [x_pH, x_PCO2, x_OI, SaO2_std]
X = np.c_[[male, admit_through_emco_std, bacterial, viral, fungal, va]].T
# Intercept for survival rate
#beta0 = Normal('beta0', 0.0, 0.001, value=0)
# Treatment effect
beta = Normal('beta', 0, 0.001, value=[0]*(X.shape[1]+len(imputed_vars)))
# Survival rates
lam = Lambda('lam', lambda b0=age_rbf, b=beta, x=X, z=imputed_vars:
np.exp(b0 + np.dot(np.c_[x,np.transpose(z)], b)))
@observed
def survival(value=obs_t, lam=lam, f=died):
"""Exponential survival likelihood, accounting for censoring"""
return (f*np.log(lam) - lam*value).sum()
return locals()
R = MCMC(rbf_survival_model())
R.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 199.4 sec
R.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 216.6 sec
Matplot.summary_plot([R.beta],
custom_labels=['male', 'Admit to ECMO', 'bacterial', 'viral', 'fungal',
'pH', 'PCO2', 'OI', 'SaO2', 'VA'])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Below is an example of how the age effect varies over different values of age (decades). It is used as the intercept (mean) of the model.
Matplot.summary_plot([R.age_rbf_x], custom_labels=np.array([10, 20, 30, 40, 50, 60, 70], 'str'))
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.