#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import matplotlib.pyplot as plt import pymc3 as pm # In[2]: flu = pd.read_csv('data/flu.csv', index_col=0) # In[3]: flu.shape # Turn into table of unique patients, with appropriate organism columns # In[4]: flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s: len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(int) # In[5]: coinfections = flu.groupby('PatientID')['OrganismName'].apply(lambda s: [x for x in s.dropna() if not str(x).startswith('Influenza')]) # MRSA coinfection # In[6]: mrsa = coinfections.apply(lambda x: x.count('Staphylococcus aureus, meth resist') > 0).astype(int) # Any coinfection # In[7]: any_coinf = coinfections.apply(lambda x: int(len(x)>0)) # In[8]: flu_unique = flu.drop_duplicates(cols=['PatientID']).set_index('PatientID') # In[9]: flu_unique['flu_only'] = flu_only flu_unique['mrsa'] = mrsa flu_unique['any_coinf'] = any_coinf # Add time from admission to time on ECMO # In[10]: admit_through_emco = flu_unique.AdmitToTimeOnHours.add(flu_unique.HoursECMO, fill_value=0) # See how many have no time information from admission, and drop these # In[11]: missing = admit_through_emco.isnull() missing.sum() # In[12]: flu_complete = flu_unique[missing-True] admit_through_emco.dropna(inplace=True) # In[13]: # Confirm no null here admit_through_emco.isnull().sum() # Time off ECMO through to event (death or discharge) # In[14]: off_ecmo_to_event = flu_complete.TimeOffToDCDateHours.add(flu_complete.TimeOffToDeathDateHours, fill_value=0) # In[15]: off_ecmo_to_event.isnull().sum() # In[16]: # Not sure why add() does not work here flu_complete['time_to_event'] = (admit_through_emco.values + off_ecmo_to_event.fillna(0).values) # In[17]: flu_complete.time_to_event.isnull().sum() # Clean covariates # In[18]: flu_complete.Race.value_counts() # In[19]: flu_complete['non_white'] = (flu_complete.Race!='W').astype(int) # In[20]: flu_complete['male'] = (flu_complete.Sex=='M').astype(int) # In[21]: flu_complete['OI'] = flu_complete.FiO2 * flu_complete.MAP / flu_complete.PO2 # In[22]: # ECMO Type flu_complete['VA'] = flu_complete.Mode.isin(['VA', 'VV-VA']) # Set "Other" type to NA (there are only a couple) flu_complete.loc[flu_complete.Mode=='Other', 'VA'] = None # In[23]: # covariates = ['male', 'AgeYears', 'non_white', 'HoursECMO', 'pH', 'PCO2', 'PO2', 'HCO3', # 'SaO2', 'mrsa', 'any_coinf'] covariates = ['male', 'AgeYears', 'non_white', 'pH', 'PCO2', 'HCO3', 'PO2', 'OI', 'SaO2', 'mrsa', 'any_coinf', 'AdmitToTimeOnHours', 'VA', 'PEEP'] # Get counts of missing values # In[24]: flu_complete[covariates].isnull().mean().round(2)*100 # In[28]: flu_complete.columns # In[29]: flu_complete.groupby('fate')[['HoursECMO', 'AdmitToTimeOnHours']].mean() # We are going to drop the missing values for some of the covariates, and not use some covariates with many missing values. # In[25]: drop_subset = ['HoursECMO', 'pH', 'PCO2', 'PO2', 'VA'] # In[26]: flu_complete = flu_complete.dropna(subset=drop_subset) # In[27]: flu_complete[['pH', 'PCO2', 'PO2', 'HCO3', 'SaO2']].hist(bins=25); # ## Survival Analysis # Surivival time, in days # In[28]: obs_t = flu_complete.time_to_event.values/24. # In[29]: plt.hist(obs_t, bins=25); # Fate of each patient # In[30]: died = (flu_complete.fate=='Died').astype(int).values # ### Kaplan-Meier Curves # In[31]: from lifelines import KaplanMeierFitter # In[32]: km = KaplanMeierFitter() km.fit(obs_t, event_observed=died) ax = km.plot() # In[33]: mrsa_ind = flu_complete.mrsa.astype(bool).values # In[34]: km = KaplanMeierFitter() km.fit(obs_t[~mrsa_ind], event_observed=died[~mrsa_ind], label='non-MRSA') ax = km.plot() km.fit(obs_t[mrsa_ind], event_observed=died[mrsa_ind], label='MRSA') km.plot(ax=ax) # ### Cumulative Hazard # In[35]: from lifelines import NelsonAalenFitter na = NelsonAalenFitter() na.fit(obs_t, event_observed=died) na.plot() # Unique failure times # In[36]: times = np.unique((flu_complete.time_to_event[flu_complete.fate=='Died'].values/24).astype(int)) times # In[37]: N_obs = len(obs_t) T = len(times) - 1 # Calculate risk set # In[38]: Y = np.array([[int(obs >= t) for t in times] for obs in obs_t]) Y # Counting process. Jump = 1 if $\text{obs}_t \in [ t_j, t_{j+1} )$ # In[39]: dN = np.array([[Y[i,j]*(times[j+1] >= obs_t[i])*died[i] for i in range(N_obs)] for j in range(T)]) # In[40]: # Process for one patient dN[:,1] # Covariates # In[41]: standardize = lambda x: (x - x[np.isnan(x)-True].mean()) / x[np.isnan(x)-True].std() # In[42]: analysis_covariates = ['male', 'AgeYears', 'non_white', 'HoursECMO', 'pH', 'PCO2', 'PO2', 'mrsa', 'any_coinf', 'VA'] # In[43]: flu_complete[analysis_covariates].isnull().sum() # In[44]: male, AgeYears, non_white, HoursECMO, pH, PCO2, PO2, mrsa, any_coinf, VA= \ flu_complete[analysis_covariates].values.T HoursECMO_std = standardize(HoursECMO) AgeYears_std = standardize(AgeYears) pH_std = standardize(pH) PCO2_std = standardize(PCO2) PO2_std = standardize(PO2) # HCO3_std = standardize(HCO3) # ## Model # In[45]: import theano.tensor as tt from pymc3 import Model, Normal, Poisson, DensityDist, T as StudentT, Uniform, Deterministic, HalfCauchy, Laplace, Exponential from pymc3 import Bernoulli, Gamma, MvNormal, BinaryMetropolis from pymc3.distributions.timeseries import GaussianRandomWalk from pymc3 import sample, NUTS, Slice, Metropolis, forestplot, find_MAP # In[46]: n_kernels = 5 kernels = np.linspace(AgeYears_std.min(), AgeYears_std.max(), n_kernels) def rbf(s, x): return [tt.exp( ((x - k)/ s)**2) for k in kernels] # In[54]: with Model() as model: sigma = Uniform('sigma', 0, 100) alpha_sd = Uniform('alpha_sd', 0, 100) alpha = GaussianRandomWalk('alpha', sd=alpha_sd, shape=n_kernels) a = tt.dot(alpha, rbf(sigma, AgeYears_std)) X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T # Intercept for survival rate beta0 = Normal('beta0', 0.0, 0.001) # Treatment effect beta = Normal('beta', 0, 0.001, shape=X.shape[1]) # Survival rates lam = tt.exp(beta0 + tt.dot(X, beta) + a) def exp_logp(failure, value): # Exponential survival log-likelihood return tt.sum(failure * tt.log(lam) - lam * value) surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t)) # In[55]: with model: trace = sample(2000, step=NUTS(), njobs=2) # In[56]: forestplot(trace, vars=['alpha']) # In[57]: forestplot(trace, vars=['beta'], ylabels=['male', 'pH', 'PCO2', 'PO2', 'HoursECMO', 'non_white', 'mrsa', 'any_coinf']) # In[31]: flu_complete.columns # In[33]: flu_complete.year.value_counts() # In[82]: forestplot(trace, vars=['beta'], ylabels=['male', 'pH', 'PCO2', 'PO2', 'HoursECMO', 'non_white', 'mrsa', 'any_coinf']) # Same model above, but with a Weibull hazard function # In[51]: with Model() as model_weibull: X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T # Intercept for survival rate beta0 = Normal('beta0', 0.0, 0.001) # Treatment effect beta = Normal('beta', 0, 0.001, shape=X.shape[1]) # Survival rates lam = tt.exp(beta0 + tt.dot(X, beta)) alpha = Exponential('alpha', 1) def weibull_logp(failure, times): # Weibull survival log-likelihood return tt.sum(failure*(tt.log(alpha) + (alpha-1)*tt.log(times*lam) + tt.log(lam)) - (lam*times)**alpha) surv_like = DensityDist('surv_like', weibull_logp, observed=(died, obs_t)) # In[52]: with model_weibull: trace_weibull = sample(2000, step=NUTS()) # The following incorporates a spline for the age effect. # In[41]: def interpolate(x0, y0, x): x = np.array(x) idx = np.searchsorted(x0, x) dl = np.array(x - x0[idx - 1]) dr = np.array(x0[idx] - x) d=dl+dr wl = dr/d return wl*y0[idx-1] + (1-wl)*y0[idx] # In[37]: nknots = 10 with Model() as model_spline: #X = np.c_[[male, non_white, mrsa, any_coinf]].T coeff_sd = Uniform('coeff_sd', 0, 100) coeff_tau = coeff_sd**-2 spline = Normal('spline', 0, coeff_tau, shape=nknots) knots = np.linspace(AgeYears.min(), AgeYears.max()+1, nknots) beta1 = Deterministic('beta1', interpolate(knots, spline, AgeYears)) # Intercept for survival rate beta0 = Normal('beta0', 0.0, 0.001) # Treatment effect #beta = Normal('beta', 0, 0.001, shape=X.shape[1]) # Survival rates lam = tt.exp(beta0 + beta1)# + tt.dot(X, beta)) def exp_logp(failure, value): # Exponential survival log-likelihood return tt.sum(failure * tt.log(lam) - lam * value) surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t)) # In[ ]: with model_spline: trace_spline = sample(1000, step=NUTS()) # In[51]: forestplot(trace_spline, vars=['beta'], ylabels=['male', 'non_white', 'mrsa', 'any_coinf']) # In[ ]: # In[59]: forestplot(trace_spline, vars=['spline']) # ## Log-logistic model # In[ ]: with Model() as log_logistic_model: sigma = Uniform('sigma', 0, 100) alpha_sd = Uniform('alpha_sd', 0, 100) alpha = GaussianRandomWalk('alpha', sd=alpha_sd, shape=n_kernels) a = tt.dot(alpha, rbf(sigma, AgeYears_std)) X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T # Intercept for survival rate beta0 = Normal('beta0', 0.0, 0.001) # Treatment effect beta = Normal('beta', 0, 0.001, shape=X.shape[1]) # Survival rates lam = tt.exp(beta0 + tt.dot(X, beta) + a) def exp_logp(failure, value): # Exponential survival log-likelihood return tt.sum(failure * tt.log(lam) - lam * value) surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t)) # ### Radial basis function model # In[40]: plt.hist(AgeYears) # In[75]: np.eye(4)*[2,3,4,5] # In[ ]: gaussian_basis = lambda x, h, l: np.exp(-(np.abs(x - h) / l)**2) n_rbf = 20 centers = np.linspace(AgeYears.min(), AgeYears.max()+1, n_rbf) l = 3 # Weights w = [[gaussian_basis(x, h, l) for h in centers] for x in AgeYears] w_pred = [[gaussian_basis(x, h, l) for h in centers] for x in np.linspace(AgeYears.min(), AgeYears.max()+1, 100)] with Model() as model_rbf: # Prior on number of basis functions z = Bernoulli('z', 0.5, shape=n_rbf) # Prior for non-zero coefficients sigma = HalfCauchy('sigma', 25) #kappa = Gamma('kappa', 0.5, 0.5, shape=n_rbf) #Tau = tt.diag(kappa) #* sigma**-2 #beta = MvNormal('beta', [0]*n_rbf, Tau, shape=n_rbf) beta = GaussianRandomWalk('beta', sd=sigma, shape=n_rbf) # Intercept for survival rate beta0 = Normal('beta0', 0.0, 0.001) # Hazard rates lam = Deterministic('lam', tt.exp(beta0 + tt.dot(w, z*beta))) lam_pred = Deterministic('lam_pred', tt.exp(beta0 + tt.dot(w_pred, z*beta))) def exp_logp(failure, value): # Exponential survival log-likelihood return tt.sum(failure * tt.log(lam) - lam * value) surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t)) # In[ ]: with model_rbf: start = {'z': [1]*n_rbf, 'sigma': 10, 'beta': [0]*n_rbf, 'beta0': 0} step1 = BinaryMetropolis(vars=[z]) step2 = NUTS(vars=[sigma, beta, beta0], scaling=start) trace_rbf = sample(2000, step=[step1, step2], start=start) # In[ ]: from pymc import traceplot forestplot(trace_rbf[5000:], vars=['lam_pred']) # In[ ]: _ = traceplot(trace_rbf[5000:], vars=['sigma']) # In[ ]: from pymc import summary summary(trace_rbf, vars=['z']) # In[ ]: