# Notebook front matter: inline plotting plus the analysis imports.
%matplotlib inline
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
import matplotlib.pyplot as plt
import pdb
# Apply seaborn's default figure styling globally.
sns.set()
# Fixed seed so the MCMC runs below are reproducible.
rseed = 20090425
Import relevant data from KQ4 spreadsheet.
# Columns to pull from the KQ4 spreadsheet, keyed by their original headers.
keep_columns = ['Author', 'Year', 'Refid', 'Pre-op', 'Initial Surgery', 'Morcellation',
                'Power', 'Upstaged', 'Months follow-up',
                'Median_\nFUP_time', 'Range_\nFUP_Time', 'Outcome', 'Age',
                'Menopausal Status', 'Lost_to_Fup', 'Include', 'Inclusion staging criteria restricted or not (y/n)']

# Short, code-friendly replacements for the unwieldy spreadsheet headers.
column_map = {'Months follow-up': 'fup_months',
              'Median_\nFUP_time': 'fup_median',
              'Range_\nFUP_Time': 'fup_range',
              'Pre-op': 'preop',
              'Initial Surgery': 'surgery_init',
              'Menopausal Status': 'menop_status',
              'Inclusion staging criteria restricted or not (y/n)': 'stage_include'}

# Read the sheet ('NR'/'NA' cells become NaN), subset, then rename.
# NOTE(review): `sheetname=` is the old pandas keyword (later renamed
# `sheet_name=`); kept for the pandas release this notebook targets.
raw_kq4 = pd.read_excel('data/UF_KQ4_data_for_Chris_12142016_SK.xlsx',
                        sheetname='All_KQ4_data_Original',
                        na_values=['NR', 'NA'])
kq4_data = raw_kq4[keep_columns].rename(columns=column_map)
# Normalize the remaining column names to lowercase.
kq4_data.columns = kq4_data.columns.str.lower()
Drop excludes (rows colored yellow in the spreadsheet, or those with "lost_to_fup" values instead of censoring time).
# Keep rows flagged for inclusion that were not lost to follow-up, then
# drop the two bookkeeping columns used for the filter.
include_mask = (kq4_data.include == 1) & (kq4_data.lost_to_fup != 'yes')
kq4_data = kq4_data[include_mask].drop(['include', 'lost_to_fup'], axis=1)
kq4_data.head()
author | year | refid | preop | surgery_init | morcellation | power | upstaged | fup_months | fup_median | fup_range | outcome | age | menop_status | stage_include | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Einstein | 2008.0 | 405 | Benign | LSC SCH | Yes | Yes | No | 30.0 | NaN | NaN | NED | NaN | NaN | No |
1 | Einstein | 2008.0 | 405 | Benign | LMYOM | Yes | Yes | Yes | 61.0 | NaN | NaN | NED | NaN | NaN | No |
2 | Einstein | 2008.0 | 405 | Benign | SCH | Yes | No | Yes | 31.0 | NaN | NaN | AWD | NaN | NaN | No |
3 | Einstein | 2008.0 | 405 | Benign | SCH BSO | No | No | No | 37.0 | NaN | NaN | NED | NaN | NaN | No |
4 | Einstein | 2008.0 | 405 | Benign | SCH BSO | No | No | Yes | 6.0 | NaN | NaN | AWD | NaN | NaN | No |
# Toggle: restrict the analysis to studies whose inclusion criteria were
# limited to stage I disease (per the 'stage_include' column).
stage1_only = True
if stage1_only:
    kq4_data = kq4_data[kq4_data.stage_include=='Yes']
Drop rows with missing morcellation or power covariates. Perhaps we can impute these later.
kq4_data = kq4_data.dropna(subset=['morcellation', 'power']).reset_index(drop=True)
Values are a mess in several columns:
kq4_data.outcome.value_counts()
Alive 124 alive 52 dead 46 Dead 27 AWD 16 Died 14 Name: outcome, dtype: int64
kq4_data['preop'].value_counts()
benign 193 Benign 46 sarcoma 3 Name: preop, dtype: int64
# Collapse the mixed-case outcome synonyms onto canonical lowercase labels
# ('Died' and 'Dead' both map to 'dead').
outcome_map = {'Alive': 'alive',
               'Dead': 'dead',
               'AWD': 'awd',
               'NED': 'ned',
               'Died': 'dead'}
kq4_data.outcome = kq4_data.outcome.replace(outcome_map)
kq4_data.outcome.value_counts()
alive 176 dead 87 awd 16 Name: outcome, dtype: int64
# Strip stray whitespace and fold the capitalized 'Benign' variant in.
kq4_data['preop'] = kq4_data['preop'].str.strip().replace({'Benign': 'benign'})
kq4_data['preop'].value_counts()
benign 239 sarcoma 3 Name: preop, dtype: int64
This is also a mess. Need to fix.
pd.crosstab(kq4_data.morcellation, kq4_data.power)
power | No | Yes | no | yes |
---|---|---|---|---|
morcellation | ||||
No | 199 | 0 | 0 | 0 |
Yes | 14 | 8 | 24 | 0 |
yes | 13 | 0 | 0 | 21 |
# Lowercase both categorical flags so 'Yes'/'yes' collapse to one level.
for _col in ('morcellation', 'power'):
    kq4_data[_col] = kq4_data[_col].str.lower()
pd.crosstab(kq4_data.morcellation, kq4_data.power)
power | no | yes |
---|---|---|
morcellation | ||
no | 199 | 0 |
yes | 51 | 29 |
Create death outcome column
# Binary death indicator derived from the cleaned outcome labels
# (1 = died, 0 = alive or alive-with-disease).
kq4_data['died'] = kq4_data.outcome.eq('dead').astype(int)
kq4_data.died.value_counts()
0 192 1 87 Name: died, dtype: int64
kq4_data.head(20)
author | year | refid | preop | surgery_init | morcellation | power | upstaged | fup_months | fup_median | fup_range | outcome | age | menop_status | stage_include | died | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Lin | 2015.0 | 826 | benign | TAH | no | no | No | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
1 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
2 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
3 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
4 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
5 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
6 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
7 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
8 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
9 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
10 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
11 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
12 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
13 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | alive | 52.7 | NaN | Yes | 0 |
14 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | dead | 52.7 | NaN | Yes | 1 |
15 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | dead | 52.7 | NaN | Yes | 1 |
16 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | dead | 52.7 | NaN | Yes | 1 |
17 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | dead | 52.7 | NaN | Yes | 1 |
18 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | dead | 52.7 | NaN | Yes | 1 |
19 | Lin | 2015.0 | 826 | benign | TAH | no | no | NaN | 34.0 | yes | 1-248 | dead | 52.7 | NaN | Yes | 1 |
# Follow-up timeline: one horizontal segment per subject, colored by
# morcellation stratum, with deaths overplotted as black dots.
fig, ax = plt.subplots(figsize=(8, 12))
n_patients = kq4_data.shape[0]
time = kq4_data.fup_months
# Three mutually exclusive strata used throughout the analysis.
ix0 = kq4_data.morcellation=='no'
ix1 = (kq4_data.morcellation=='yes') & (kq4_data.power=='no')
ix2 = (kq4_data.morcellation=='yes') & (kq4_data.power=='yes')
# Consistent stratum colors reused by the later K-M and posterior plots.
blue, green, red = sns.color_palette()[:3]
ax.hlines(kq4_data.index[ix0], 0, time[ix0],
          color=blue, label='no morcellation')
ax.hlines(kq4_data.index[ix1], 0, time[ix1],
          color=green, label='non-power morcellation')
ax.hlines(kq4_data.index[ix2], 0, time[ix2],
          color=red, label='power morcellation')
ax.scatter(time[kq4_data.died == 1], kq4_data.index[kq4_data.died == 1],
           color='k', zorder=10, label='Death');
ax.set_xlim(left=0)
ax.set_ylabel('Subject')
ax.set_xlabel('Followup')
ax.set_yticks([])
ax.set_ylim(-0.25, n_patients + 0.25)
ax.legend(loc='upper right');
# Overall Kaplan-Meier fit; rows with died == 0 are right-censored.
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(kq4_data.fup_months, event_observed=kq4_data.died)
<lifelines.KaplanMeierFitter: fitted with 279 observations, 192 censored>
# Stratified Kaplan-Meier curves, one per morcellation group.
time = kq4_data.fup_months
# Fix: the original `event = event_observed=kq4_data.died` was a chained
# assignment that leaked a pointless `event_observed` global (it copied
# the keyword-argument syntax by mistake).
event = kq4_data.died

# Strata: no morcellation / manual morcellation / power morcellation.
ix0 = kq4_data.morcellation == 'no'
ix1 = (kq4_data.morcellation == 'yes') & (kq4_data.power == 'no')
ix2 = (kq4_data.morcellation == 'yes') & (kq4_data.power == 'yes')

kmf.fit(time[ix0], event[ix0], label='no morcellation')
ax = kmf.plot()
kmf.fit(time[ix1], event[ix1], label='non-power morcellation')
kmf.plot(ax=ax)
kmf.fit(time[ix2], event[ix2], label='power morcellation')
kmf.plot(ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x113f05780>
Some studies show event time truncated to the end of the followup period. These values should be treated as missing and their values imputed from the range of possible followup times.
# Rows reporting only a median follow-up carry no per-subject event time;
# blank fup_months there so those times can be imputed in the model.
kq4_data.loc[kq4_data.fup_median.notnull(), 'fup_months'] = np.nan
# Default imputation bounds: anywhere from 0 to the longest observed follow-up.
kq4_data['fup_min'] = 0
kq4_data['fup_max'] = kq4_data.fup_months.max()
# Parse 'lo-hi' range strings into numeric bounds for those rows.
# Assumes fup_range is populated wherever fup_median is — TODO confirm
# against the spreadsheet.
fup_min, fup_max = np.transpose([[float(x[0]), float(x[1])] for x in kq4_data.fup_range[kq4_data.fup_median.notnull()].str.split('-')])
If the event was a death event, make the fup_min
equal to zero.
# Tighten the bounds for the range-only rows: survivors were followed at
# least fup_min months, while a death may occur any time from 0 — the
# (1 - died) factor zeroes the lower bound for deaths.
kq4_data.loc[kq4_data.fup_median.notnull(), 'fup_min'] = fup_min * ((1-kq4_data.died)[kq4_data.fup_median.notnull()])
kq4_data.loc[kq4_data.fup_median.notnull(), 'fup_max'] = fup_max
Summarize proportion of missing values
kq4_data.isnull().mean().round(2)
author 0.00 year 0.00 refid 0.00 preop 0.13 surgery_init 0.00 morcellation 0.00 power 0.00 upstaged 0.67 fup_months 0.67 fup_median 0.33 fup_range 0.33 outcome 0.00 age 0.69 menop_status 0.76 stage_include 0.00 died 0.00 fup_min 0.00 fup_max 0.00 dtype: float64
# Distribution of subject ages (where recorded).
axes = kq4_data.age.hist(bins=15)
axes.set_xlabel('Age')
<matplotlib.text.Text at 0x1142284a8>
# Distribution of the observed (non-imputed) follow-up times.
axes = kq4_data['fup_months'].hist()
axes.set_xlabel('Followup (months)')
<matplotlib.text.Text at 0x1143f2a58>
# Sensitivity check: redraw the missing follow-up times uniformly from each
# row's [fup_min, fup_max] and refit the stratified K-M curves, once per
# panel (8 panels), to see how much the curves move under the imputation.
fig, axes = plt.subplots(4,2,figsize=(16,12))

# Fix: the original `event = event_observed=kq4_data.died` chained
# assignment leaked a stray `event_observed` global.  The stratum masks
# and missing-time mask are loop-invariant, so they are hoisted here;
# the uniform draws inside the loop are unchanged.
event = kq4_data.died
missing_times = kq4_data.fup_months.isnull()
ix0 = kq4_data.morcellation == 'no'
ix1 = (kq4_data.morcellation == 'yes') & (kq4_data.power == 'no')
ix2 = (kq4_data.morcellation == 'yes') & (kq4_data.power == 'yes')

kmf = KaplanMeierFitter()
for ax in axes.flatten():
    # Fresh draw of the imputed times for this panel.
    time_sim = kq4_data.fup_months.copy()
    time_sim.loc[missing_times] = np.random.uniform(kq4_data.fup_min[missing_times],
                                                    kq4_data.fup_max[missing_times])
    kmf.fit(time_sim[ix0], event[ix0], label='no morcellation')
    kmf.plot(ax=ax)
    kmf.fit(time_sim[ix1], event[ix1], label='non-power morcellation')
    kmf.plot(ax=ax)
    kmf.fit(time_sim[ix2], event[ix2], label='power morcellation')
    kmf.plot(ax=ax)
# Assemble the inputs for the Bayesian survival models.
model_data = kq4_data.assign(
    non_power_morcellation=(kq4_data.morcellation == 'yes') & (kq4_data.power == 'no'),
    power_morcellation=(kq4_data.morcellation == 'yes') & (kq4_data.power == 'yes'))

time = model_data['fup_months']
# Fix: was `event = event_observed = ...`, which leaked a stray
# `event_observed` global copied from keyword-argument syntax.
event = model_data.died.astype(int)

# One unit-width interval per month of the longest follow-up.
n_intervals = int(time.max() + 1)
n_patients = model_data.shape[0]

# Missing times are flagged with the sentinel 1 and masked so PyMC3 treats
# them as latent variables to impute.
# NOTE(review): a genuine follow-up time of exactly 1 month would also be
# masked by this sentinel — confirm no such rows exist.
time_missing = model_data['fup_months'].copy()
time_missing[time_missing.isnull()] = 1
time_masked = np.ma.masked_values(time_missing, value=1)

# Covariate indicator vectors and per-row imputation bounds.
power = (model_data.power == 'yes').astype(int)
morcellation = (model_data.morcellation == 'yes').astype(int)
fup_min, fup_max = model_data[['fup_min', 'fup_max']].T.values
The study IDs used in the analysis:
model_data.refid.unique()
array([826, 364, 286, 932, 2114], dtype=object)
from pymc3 import Normal, Deterministic, sample, NUTS, Model, DensityDist, Metropolis, Potential, Exponential, Uniform
from pymc3 import traceplot, forestplot, summary, sample_ppc
import theano.tensor as tt
with Model() as exp_model:
    # Linear model parameters
    # Vague normal priors (tau = 1e-4, i.e. sd = 100) on the log-hazard scale:
    # β[0] baseline, β[1] any morcellation, β[2] power morcellation.
    β = Normal('β', mu=0.0, tau=0.0001, shape=3)
    # Survival rates
    # Constant (exponential) hazard per subject, log-linear in the covariates.
    λ = Deterministic('λ', tt.exp(β[0] + β[1] * morcellation + β[2] * power))
    # Impute missing event times
    # time_imputed = Exponential('time_imputed', lam=λ, observed=time_masked)
    # Masked entries of time_masked become latent variables drawn uniformly
    # between each row's follow-up bounds.
    time_imputed = Uniform('time_imputed', lower=fup_min, upper=fup_max, observed=time_masked)
    # Impose bound on missing times
    # NOTE(review): this Potential re-applies the same Uniform bounds the
    # observed Uniform above already encodes — presumably belt-and-braces
    # for the Metropolis sampler; confirm it is still needed.
    fup_bound = Potential('fup_bound', Uniform.dist(lower=fup_min, upper=fup_max).logp(time_imputed))
    # Exponential survival likelihood, accounting for censoring
    # Deaths contribute log λ - λ·t; censored rows contribute -λ·t.
    def logp(event):
        return event * tt.log(λ) - λ * time_imputed
    survival = DensityDist('survival', logp, observed={'event':event})
# Sampler settings: 50k Metropolis iterations; the first 40k are discarded
# as burn-in when summarizing.
n_iterations = 50000
n_burn = 40000
with exp_model:
    # Start each imputed time at a uniform draw within its row's bounds.
    start = {'time_imputed_missing': np.random.uniform((model_data.fup_min)[model_data.fup_months.isnull()],
                                                       (model_data.fup_max)[model_data.fup_months.isnull()])}
    exp_trace = sample(n_iterations, step=Metropolis(), start=start, random_seed=rseed)
100%|██████████| 50000/50000 [00:13<00:00, 3749.83it/s]
# Post-burn-in trace and summary for the regression coefficients.
traceplot(exp_trace[n_burn:], varnames=['β']);
summary(exp_trace[n_burn:], varnames=['β'])
β: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- -5.261 0.136 0.005 [-5.513, -4.986] -0.051 0.291 0.018 [-0.624, 0.479] 1.781 0.342 0.022 [1.101, 2.443] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| -5.529 -5.353 -5.258 -5.168 -4.998 -0.624 -0.253 -0.053 0.158 0.479 1.092 1.550 1.790 2.012 2.439
Imputed follow-up times
# Forest plot of the posterior distributions of the imputed follow-up times.
plt.figure(figsize=(6,12))
forestplot(exp_trace, varnames=['time_imputed_missing'])
<matplotlib.gridspec.GridSpec at 0x124f7f390>
# Per-stratum posterior hazard draws (after burn-in): exponentiate the
# summed log-hazard coefficients for each group.
β_post = exp_trace['β'][n_burn:]
baseline_hazard = np.exp(β_post[:, 0])
non_power_hazard = np.exp(β_post[:, 0] + β_post[:, 1])
power_hazard = np.exp(β_post[:, 0] + β_post[:, 1] + β_post[:, 2])
Since this is a constant-hazard model, we can easily summarize the hazard estimates.
# Posterior mean and central 95% interval of each stratum's hazard.
hazard_df = pd.DataFrame({'baseline': baseline_hazard,
                          'non-power': non_power_hazard,
                          'power': power_hazard})
hazard_df.describe(percentiles=[0.025, 0.975]).T[['mean', '2.5%', '97.5%']].round(4)
mean | 2.5% | 97.5% | |
---|---|---|---|
baseline | 0.0052 | 0.0040 | 0.0067 |
non-power | 0.0051 | 0.0029 | 0.0078 |
power | 0.0301 | 0.0177 | 0.0458 |
# Box plot comparing the three posterior hazard distributions.
hazard_df.boxplot()
plt.ylabel('Hazard');
def cum_hazard(hazard, n=None):
    """Cumulative hazard Λ(t) = λ·t of the constant-hazard (exponential) model.

    Parameters
    ----------
    hazard : ndarray, shape (n_samples,)
        Posterior draws of the constant hazard rate λ.
    n : int, optional
        Number of unit-width time points; defaults to the module-level
        ``n_intervals`` (now resolved at call time rather than captured
        at import time).

    Returns
    -------
    ndarray, shape (n_samples, n), Λ evaluated at t = 0, 1, ..., n-1.
    """
    if n is None:
        n = n_intervals
    # For a constant hazard, Λ(t) = λ·t is linear in t.  The previous
    # version additionally applied .cumsum over the intervals, which
    # double-counted time and produced a quadratic λ·t(t+1)/2 curve —
    # inconsistent with the exponential likelihood used in exp_model.
    return np.arange(n) * hazard.reshape(-1, 1)
def survival(hazard):
    """Survival function S(t) = exp(-Λ(t)) built from ``cum_hazard``."""
    return np.exp(-cum_hazard(hazard))
def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05, **kwargs):
    """Step-plot f(posterior-mean hazard) with a shaded pointwise central
    (1 - alpha) posterior band computed from f applied to all draws.

    NOTE(review): despite the name, the band is a percentile interval,
    not a true highest-posterior-density interval.
    """
    # Curve through the posterior mean of the hazard draws.
    mean = f(hazard.mean(axis=0), **kwargs)
    percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
    # Pointwise interval across posterior draws.
    hpd = np.percentile(f(hazard, **kwargs), percentiles, axis=0)
    ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
    ax.step(x, mean.squeeze(), color=color, label=label);
# Posterior cumulative hazard (left) and survival function (right) for the
# exponential model, one curve per stratum with 95% bands.
# Fix: legend labels previously misspelled 'morecellation'.
interval_bounds = np.arange(0, time.max() + 2, 1)
fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6))
plot_with_hpd(interval_bounds[:-1], baseline_hazard, cum_hazard,
              hazard_ax, color=blue, label='Baseline')
plot_with_hpd(interval_bounds[:-1], non_power_hazard, cum_hazard,
              hazard_ax, color=green, label='Non-power morcellation')
plot_with_hpd(interval_bounds[:-1], power_hazard, cum_hazard,
              hazard_ax, color=red, label='Power morcellation')
hazard_ax.set_xlim(0, time.max());
hazard_ax.set_xlabel('Followup time');
hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
hazard_ax.legend(loc=2);
plot_with_hpd(interval_bounds[:-1], baseline_hazard, survival,
              surv_ax, color=blue)
plot_with_hpd(interval_bounds[:-1], non_power_hazard, survival,
              surv_ax, color=green)
plot_with_hpd(interval_bounds[:-1], power_hazard, survival,
              surv_ax, color=red)
surv_ax.set_xlim(0, time.max());
surv_ax.set_xlabel('Followup time');
surv_ax.set_ylabel('Survival function $S(t)$');
fig.suptitle('Bayesian survival model');
Unfortunately, the exponential (i.e. constant hazard) model does not seem to fit the data, based on the comparison between the K-M curves and the survival function.
Below, we try a Weibull model for the survival function.
from pymc3 import Weibull, df_summary
with Model() as weibull_model:
    # Linear model parameters
    # Same vague normal priors on the log-scale coefficients as exp_model.
    β = Normal('β', mu=0.0, tau=0.0001, shape=3)
    # Survival function parameters
    # Weibull shape parameter with a flat prior on (0, 20).
    k = Uniform('k', 0, 20)
    # λ here acts as the Weibull scale, log-linear in the covariates.
    λ = Deterministic('λ', tt.exp(β[0] + β[1] * morcellation + β[2] * power))
    # Impute missing event times
    # time_imputed = Weibull('time_imputed', k, λ, observed=time_masked)
    # Masked entries become latent times drawn uniformly within each row's
    # follow-up bounds.
    time_imputed = Uniform('time_imputed', lower=fup_min, upper=fup_max, observed=time_masked)
    # Impose bound on missing times
    # NOTE(review): duplicates the Uniform bounds above — see exp_model.
    fup_bound = Potential('fup_bound', Uniform.dist(lower=fup_min, upper=fup_max).logp(time_imputed))
    # Exponential survival likelihood, accounting for censoring
    # Weibull log-likelihood: deaths contribute the log-density terms,
    # all rows contribute the cumulative hazard -(t/λ)^k.
    def logp(event):
        return event * (tt.log(k/λ) + (k-1)*tt.log(time_imputed/λ)) - (time_imputed/λ)**k
    survival = DensityDist('survival', logp, observed={'event':event})
with weibull_model:
    # Same initialization strategy as the exponential model: start imputed
    # times at uniform draws within their bounds.
    start = {'time_imputed_missing': np.random.uniform((model_data.fup_min)[model_data.fup_months.isnull()],
                                                       (model_data.fup_max)[model_data.fup_months.isnull()])}
    weibull_trace = sample(n_iterations, step=Metropolis(), start=start, random_seed=rseed)
100%|██████████| 50000/50000 [00:21<00:00, 2327.62it/s]
traceplot(weibull_trace[n_burn:], varnames=['β']);
Weibull shape parameter
traceplot(weibull_trace[n_burn:], varnames=['k']);
# Per-stratum posterior draws (post burn-in) of the Weibull scale, plus
# the shape-parameter trace.
β_w = weibull_trace['β'][n_burn:]
baseline_lam = np.exp(β_w[:, 0])
non_power_lam = np.exp(β_w[:, 0] + β_w[:, 1])
power_lam = np.exp(β_w[:, 0] + β_w[:, 1] + β_w[:, 2])
k_trace = weibull_trace['k'][n_burn:]
# Extend the plotting horizon to 250 unit intervals for the Weibull curves.
n_intervals = 250
interval_bounds = np.arange(0, n_intervals + 1, 1)
def cum_weibull_hazard(lam, k=None, t=None):
    """Weibull cumulative hazard Λ(t) = (t/λ)^k, matching the model likelihood.

    Parameters
    ----------
    lam : ndarray, shape (n_samples,)
        Posterior draws of the Weibull scale λ.
    k : ndarray, optional
        Posterior draws of the shape; defaults to the module-level
        ``k_trace`` (resolved at call time, not import time).
    t : int, optional
        Number of unit-width time points; defaults to ``n_intervals``.

    Returns
    -------
    ndarray, shape (n_samples, t), Λ evaluated at t = 0, 1, ..., t-1.
    """
    if k is None:
        k = k_trace
    if t is None:
        t = n_intervals
    # Precedence fix: the original `np.arange(t)/(lam...)**k` computed
    # t / λ**k (since ** binds tighter than /), not the Weibull form
    # (t/λ)**k used in weibull_model's log-likelihood.
    return (np.arange(t) / lam.reshape(-1, 1)) ** k.reshape(-1, 1)
def weibull_survival(lam, k=k_trace, t=n_intervals):
    """Weibull survival S(t) = exp(-Λ(t)) built from ``cum_weibull_hazard``."""
    return np.exp(-cum_weibull_hazard(lam, k, t))
def plot_with_hpd(x, lam, f, ax, color=None, label=None, alpha=0.05, debug=False, **kwargs):
    """Step-plot the pointwise posterior mean of f(lam) with a shaded
    central (1 - alpha) interval band.

    NOTE(review): unlike the earlier plot_with_hpd, this version averages
    f over the draws (mean of curves) rather than plotting f of the mean
    draw; the band remains a percentile interval, not a true HPD.
    """
    m = f(lam, **kwargs).mean(axis=0)
    percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
    hpd = np.percentile(f(lam, **kwargs), percentiles, axis=0)
    if debug:
        # Interactive inspection hook for the computed curves.
        import pdb;pdb.set_trace()
    ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
    ax.step(x, m, color=color, label=label);
# Posterior cumulative hazard (left) and survival function (right) for the
# Weibull model, one curve per stratum with 95% bands.
# Fix: legend labels previously misspelled 'morecellation'.
fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6))
plot_with_hpd(interval_bounds[:-1], baseline_lam, cum_weibull_hazard,
              hazard_ax, color=blue, label='Baseline')
plot_with_hpd(interval_bounds[:-1], non_power_lam, cum_weibull_hazard,
              hazard_ax, color=green, label='Non-power morcellation')
plot_with_hpd(interval_bounds[:-1], power_lam, cum_weibull_hazard,
              hazard_ax, color=red, label='Power morcellation')
hazard_ax.set_xlim(0, n_intervals);
hazard_ax.set_xlabel('Followup time');
hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
hazard_ax.legend(loc=2);
plot_with_hpd(interval_bounds[:-1], baseline_lam, weibull_survival,
              surv_ax, color=blue)
plot_with_hpd(interval_bounds[:-1], non_power_lam, weibull_survival,
              surv_ax, color=green)
plot_with_hpd(interval_bounds[:-1], power_lam, weibull_survival,
              surv_ax, color=red)
surv_ax.set_xlim(0, n_intervals);
surv_ax.set_xlabel('Followup time');
surv_ax.set_ylabel('Survival function $S(t)$');
fig.suptitle('Bayesian survival model');
The above is a much more reasonable fit.