#!/usr/bin/env python # coding: utf-8 # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import pymc3 as pm import seaborn as sns import matplotlib.pyplot as plt import pdb sns.set() # In[3]: kq4_data = pd.read_excel('data/KQ4 Workbook.xlsx', sheetname='Sheet1', na_values=['NR', 'NA']).drop(['Notes'], axis=1) kq4_data.head() # Some studies show event time truncated to the end of the followup period # In[4]: followup_event_time = ['Lin', 'Perri', 'Park'] kq4_data['followup_event_time'] = kq4_data.Author.isin(followup_event_time) kq4_data.followup_event_time.value_counts() # In[5]: kq4_data.isnull().mean().round(2) # In[6]: kq4_data = kq4_data.dropna(axis='rows', subset=['Months follow-up']) # In[7]: kq4_data.Outcome.value_counts() # In[8]: kq4_data['Pre-op'].value_counts() # In[9]: pd.crosstab(kq4_data.Morcellation, kq4_data.Power) # In[10]: kq4_data['Initial Surgery'].value_counts() # In[11]: axes = kq4_data.Age.hist(bins=15) axes.set_xlabel('Age') # In[12]: axes = kq4_data['Months follow-up'].hist() axes.set_xlabel('Followup (months)') # Create death outcome column # In[13]: kq4_data['died'] = kq4_data.Outcome.isin(['Dead', 'Died']) kq4_data.loc[kq4_data.Outcome.isnull(), 'died'] = np.nan kq4_data.died.value_counts() # In[14]: kq4_data = kq4_data.dropna(subset=['Morcellation', 'Power']).reset_index(drop=True) # In[15]: from lifelines import KaplanMeierFitter kmf = KaplanMeierFitter() # In[16]: kmf.fit(kq4_data['Months follow-up'], event_observed=kq4_data.died) # In[17]: time = kq4_data['Months follow-up'] event = event_observed=kq4_data.died ix0 = kq4_data.Morcellation=='No' ix1 = (kq4_data.Morcellation=='Yes') & (kq4_data.Power=='No') ix2 = (kq4_data.Morcellation=='Yes') & (kq4_data.Power=='Yes') kmf.fit(time[ix0], event[ix0], label='no morcellation') ax = kmf.plot() kmf.fit(time[ix1], event[ix1], label='non-power morcellation') kmf.plot(ax=ax) kmf.fit(time[ix2], event[ix2], label='power morcellation') kmf.plot(ax=ax) # In[18]: fig, axes = plt.subplots(4,2,figsize=(16,12)) for ax in axes.flatten(): time_sim = kq4_data['Months follow-up'].copy() event = event_observed=kq4_data.died time_sim.loc[kq4_data.followup_event_time] = np.random.uniform(0, time_sim[kq4_data.followup_event_time]) ix0 = kq4_data.Morcellation=='No' ix1 = (kq4_data.Morcellation=='Yes') & (kq4_data.Power=='No') ix2 = (kq4_data.Morcellation=='Yes') & (kq4_data.Power=='Yes') kmf = KaplanMeierFitter() kmf.fit(time_sim[ix0], event[ix0], label='no morcellation') kmf.plot(ax=ax) kmf.fit(time_sim[ix1], event[ix1], label='non-power morcellation') kmf.plot(ax=ax) kmf.fit(time_sim[ix2], event[ix2], label='power morcellation') kmf.plot(ax=ax) # In[19]: fig, ax = plt.subplots(figsize=(8, 12)) n_patients = kq4_data.shape[0] blue, green, red = sns.color_palette()[:3] ax.hlines(kq4_data.index[ix0], 0, time[ix0], color=blue, label='no morcellation') ax.hlines(kq4_data.index[ix1], 0, time[ix1], color=green, label='non-power morcellation') ax.hlines(kq4_data.index[ix2], 0, time[ix2], color=red, label='power morcellation') ax.scatter(time[kq4_data.died == 1], kq4_data.index[kq4_data.died == 1], color='k', zorder=10, label='Death'); ax.set_xlim(left=0) ax.set_ylabel('Subject') ax.set_xlabel('Followup') ax.set_yticks([]) ax.set_ylim(-0.25, n_patients + 0.25) ax.legend(loc='upper right'); # In[20]: model_data = (kq4_data.assign(non_power_morcellation=(kq4_data.Morcellation=='Yes') & (kq4_data.Power=='No')) .assign(power_morcellation=(kq4_data.Morcellation=='Yes') & (kq4_data.Power=='Yes'))) # In[21]: time = model_data['Months follow-up'] event = event_observed = model_data.died.astype(int) n_intervals = int(time.max()+1) n_patients = model_data.shape[0] # In[22]: time_missing = model_data['Months follow-up'].copy() time_missing[model_data.followup_event_time] = 1 time_masked = np.ma.masked_values(time_missing, value=1) non_power, power = model_data[['non_power_morcellation', 'power_morcellation']].values.astype(int).T # In[38]: from pymc3 import Normal, Deterministic, sample, NUTS, Model, DensityDist, Uniform, Metropolis, find_MAP, Gamma from pymc3 import traceplot, forestplot, summary, sample_ppc import theano.tensor as tt # In[24]: with Model() as model: # Impute missing event times time_imputed = Uniform('time_imputed', np.zeros(n_patients), time.values, observed=time_masked, testval=2) # Linear model parameters β = Normal('β', mu=0.0, tau=0.0001, shape=3) # Survival rates λ = Deterministic('λ', tt.exp(β[0] + β[1] * non_power + β[2] * power)) # Exponential survival likelihood, accounting for censoring def logp(event): return event * tt.log(λ) - λ * time_imputed survival = DensityDist('survival', logp, observed={'event':event}) # In[25]: n_iterations = 20000 n_burn = 10000 # In[26]: with model: trace = sample(n_iterations, step=Metropolis()) # In[28]: traceplot(trace[n_burn:], varnames=['β']); # In[29]: summary(trace[n_burn:], varnames=['β']) # In[30]: forestplot(trace, varnames=['time_imputed_missing']) # In[31]: baseline_hazard = np.exp(trace['β'][n_burn:, 0]) non_power_hazard = np.exp(trace['β'][n_burn:, 0] + trace['β'][n_burn:, 1]) power_hazard = np.exp(trace['β'][n_burn:, 0] + trace['β'][n_burn:, 2]) # In[32]: hazard_df = pd.DataFrame({'baseline':baseline_hazard, 'non-power':non_power_hazard, 'power':power_hazard}) # In[33]: hazard_df.boxplot() plt.ylabel('Hazard'); # In[34]: def cum_hazard(hazard, n=n_intervals): return (np.arange(n) * hazard.reshape(-1,1)).cumsum(axis=-1) def survival(hazard): return np.exp(-cum_hazard(hazard)) # In[35]: def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05): mean = f(hazard.mean(axis=0)) percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.]) hpd = np.percentile(f(hazard), percentiles, axis=0) ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25) ax.step(x, mean.squeeze(), color=color, label=label); # In[36]: interval_bounds = np.arange(0, time.max() + 2, 1) # In[37]: fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6)) plot_with_hpd(interval_bounds[:-1], baseline_hazard, cum_hazard, hazard_ax, color=blue, label='Baseline') plot_with_hpd(interval_bounds[:-1], non_power_hazard, cum_hazard, hazard_ax, color=green, label='Non-power morecellation') plot_with_hpd(interval_bounds[:-1], power_hazard, cum_hazard, hazard_ax, color=red, label='Power morecellation') hazard_ax.set_xlim(0, time.max()); hazard_ax.set_xlabel('Followup time'); hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$'); hazard_ax.legend(loc=2); plot_with_hpd(interval_bounds[:-1], baseline_hazard, survival, surv_ax, color=blue) plot_with_hpd(interval_bounds[:-1], non_power_hazard, survival, surv_ax, color=green) plot_with_hpd(interval_bounds[:-1], power_hazard, survival, surv_ax, color=red) surv_ax.set_xlim(0, time.max()); surv_ax.set_xlabel('Followup time'); surv_ax.set_ylabel('Survival function $S(t)$'); fig.suptitle('Bayesian survival model'); # ## Cox model # In[ ]: with Model() as cox_model: # Impute missing event times time_imputed = Uniform('time_imputed', np.zeros(n_patients), time.values, observed=time_masked, testval=2) interval_bounds = np.arange(0, time_masked.max() + 2, 1) n_intervals = len(interval_bounds) - 1 # risk set equals one if obs_t >= t Y = tt.ge(time_imputed.dimshuffle(0, 'x'), interval_bounds[:-1].reshape(1, -1)) import pdb;pdb.set_trace() last_period = tt.cast(tt.floor(time_imputed), 'int32') dN = tt.zeros((n_patients, n_intervals)) dN = Y * tt.set_subtensor(dN[tt.arange(n_patients), last_period], event) # dN = tt.ge(time_imputed.dimshuffle(0, 'x')[1:], ) # # counting process. jump = 1 if obs_t \in [t[j], t[j+1]) # dN = np.array([[Y[i,j]*int(t[j+1] >= obs_t[i])*fail[i] for j in range(T)] for i in # range(nsubj)]) c = Gamma('c', .0001, .00001) r = Gamma('r', .001, .0001) dL0_star = r*tt.extra_ops.diff(time_imputed) # prior mean hazard μ = Deterministic('μ', dL0_star * c) dL0 = Gamma('dL0', μ, c, shape=μ.tag.test_value.shape[0]) β = Normal('β', np.zeros(2), sd=np.ones(2)*1000, shape=2) X = model_data[['non_power_morcellation', 'power_morcellation']].values linear_model = tt.exp(tt.dot(X, β)) idt = Y[:, :-1] * tt.outer(linear_model, dL0) dn_like = Poisson('dn_like', idt, observed=dN) # In[52]: np.diff([2,3,4]) # In[ ]: