#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()
rseed = 20090425


# Import the relevant data from the KQ4 spreadsheet.

# In[2]:


keep_columns = ['Author', 'Year', 'Refid', 'Pre-op', 'Initial Surgery',
                'Morcellation', 'Power', 'Upstaged', 'Months follow-up',
                'Median_\nFUP_time', 'Range_\nFUP_Time', 'Outcome', 'Age',
                'Menopausal Status', 'Lost_to_Fup', 'Include',
                'Inclusion staging criteria restricted or not (y/n)']

kq4_data = (pd.read_excel('data/UF_KQ4_data_for_Chris_12142016_SK.xlsx',
                          sheetname='All_KQ4_data_Original',
                          na_values=['NR', 'NA'])[keep_columns]
            .rename(columns={'Months follow-up': 'fup_months',
                             'Median_\nFUP_time': 'fup_median',
                             'Range_\nFUP_Time': 'fup_range',
                             'Pre-op': 'preop',
                             'Initial Surgery': 'surgery_init',
                             'Menopausal Status': 'menop_status',
                             'Inclusion staging criteria restricted or not (y/n)': 'stage_include'}))

kq4_data.columns = kq4_data.columns.str.lower()


# Drop the exclusions (rows colored yellow in the spreadsheet, and rows with "lost_to_fup" values instead of censoring times).

# In[3]:


kq4_data = (kq4_data[(kq4_data.include == 1) & (kq4_data.lost_to_fup != 'yes')]
            .drop(['include', 'lost_to_fup'], axis=1))


# In[4]:


kq4_data.head()


# In[5]:


stage1_only = True

if stage1_only:
    kq4_data = kq4_data[kq4_data.stage_include == 'Yes']


# Drop rows with missing `power` or `morcellation` covariates. Perhaps we can impute these later.

# In[6]:


kq4_data = kq4_data.dropna(subset=['morcellation', 'power']).reset_index(drop=True)


# Values are a mess in several columns:

# In[7]:


kq4_data.outcome.value_counts()


# In[8]:


kq4_data['preop'].value_counts()


# In[9]:


kq4_data.outcome = kq4_data.outcome.replace({'Alive': 'alive', 'Dead': 'dead', 'AWD': 'awd',
                                             'NED': 'ned', 'Died': 'dead'})
kq4_data.outcome.value_counts()


# In[10]:


kq4_data.preop = kq4_data.preop.str.strip().replace({'Benign': 'benign'})
kq4_data['preop'].value_counts()
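# A quick guard against stray labels surviving the cleanup above (a check added here,
# not part of the original notebook; the expected category set is an assumption based
# on the recodings just applied).

# In[ ]:


expected_outcomes = {'alive', 'dead', 'awd', 'ned'}  # assumed full set after recoding
unexpected = set(kq4_data.outcome.dropna().unique()) - expected_outcomes
print('Unexpected outcome values:', unexpected or 'none')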
# The `morcellation` and `power` columns are also a mess, and need to be fixed:

# In[11]:


pd.crosstab(kq4_data.morcellation, kq4_data.power)


# In[12]:


kq4_data.morcellation = kq4_data.morcellation.str.lower()
kq4_data.power = kq4_data.power.str.lower()

pd.crosstab(kq4_data.morcellation, kq4_data.power)


# Create a death outcome column.

# In[13]:


kq4_data['died'] = (kq4_data.outcome == 'dead').astype(int)
kq4_data.died.value_counts()


# In[14]:


kq4_data.head(20)


# Plot each subject's follow-up time by treatment group, marking deaths.

# In[15]:


fig, ax = plt.subplots(figsize=(8, 12))

n_patients = kq4_data.shape[0]
time = kq4_data.fup_months

ix0 = kq4_data.morcellation == 'no'
ix1 = (kq4_data.morcellation == 'yes') & (kq4_data.power == 'no')
ix2 = (kq4_data.morcellation == 'yes') & (kq4_data.power == 'yes')

blue, green, red = sns.color_palette()[:3]

ax.hlines(kq4_data.index[ix0], 0, time[ix0], color=blue, label='no morcellation')
ax.hlines(kq4_data.index[ix1], 0, time[ix1], color=green, label='non-power morcellation')
ax.hlines(kq4_data.index[ix2], 0, time[ix2], color=red, label='power morcellation')
ax.scatter(time[kq4_data.died == 1], kq4_data.index[kq4_data.died == 1],
           color='k', zorder=10, label='Death')

ax.set_xlim(left=0)
ax.set_ylabel('Subject')
ax.set_xlabel('Followup')
ax.set_yticks([])
ax.set_ylim(-0.25, n_patients + 0.25)
ax.legend(loc='upper right');


# Kaplan-Meier estimates of the survival function, overall and by treatment group.

# In[16]:


from lifelines import KaplanMeierFitter

kmf = KaplanMeierFitter()


# In[17]:


kmf.fit(kq4_data.fup_months, event_observed=kq4_data.died)


# In[18]:


time = kq4_data.fup_months
event = kq4_data.died

kmf.fit(time[ix0], event[ix0], label='no morcellation')
ax = kmf.plot()
kmf.fit(time[ix1], event[ix1], label='non-power morcellation')
kmf.plot(ax=ax)
kmf.fit(time[ix2], event[ix2], label='power morcellation')
kmf.plot(ax=ax)


# Some studies report event times truncated to the end of the follow-up period. These values should be treated as **missing**, with values imputed from the range of possible follow-up times.

# In[19]:


kq4_data.loc[kq4_data.fup_median.notnull(), 'fup_months'] = np.nan


# In[20]:


kq4_data['fup_min'] = 0
kq4_data['fup_max'] = kq4_data.fup_months.max()

fup_min, fup_max = np.transpose([[float(x[0]), float(x[1])]
                                 for x in kq4_data.fup_range[kq4_data.fup_median.notnull()].str.split('-')])
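# A sanity check on the parsed ranges (added here; not in the original notebook):
# each lower bound should be non-negative and no greater than its upper bound, so
# that the imputation intervals used below are well defined.

# In[ ]:


assert (fup_min >= 0).all() and (fup_min <= fup_max).all(), 'malformed follow-up ranges'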
# If the event was a death, set `fup_min` to zero, since the death could have occurred at any point during follow-up.

# In[21]:


kq4_data.loc[kq4_data.fup_median.notnull(), 'fup_min'] = fup_min * ((1 - kq4_data.died)[kq4_data.fup_median.notnull()])
kq4_data.loc[kq4_data.fup_median.notnull(), 'fup_max'] = fup_max


# Summarize the proportion of missing values in each column.

# In[22]:


kq4_data.isnull().mean().round(2)


# In[23]:


axes = kq4_data.age.hist(bins=15)
axes.set_xlabel('Age')


# In[24]:


axes = kq4_data['fup_months'].hist()
axes.set_xlabel('Followup (months)')


# Kaplan-Meier curves under eight random imputations of the missing follow-up times, to show how sensitive the estimates are to the imputed values.

# In[25]:


fig, axes = plt.subplots(4, 2, figsize=(16, 12))

for ax in axes.flatten():

    time_sim = kq4_data.fup_months.copy()
    event = kq4_data.died

    missing_times = kq4_data.fup_months.isnull()
    time_sim.loc[missing_times] = np.random.uniform(kq4_data.fup_min[missing_times],
                                                    kq4_data.fup_max[missing_times])

    kmf = KaplanMeierFitter()
    kmf.fit(time_sim[ix0], event[ix0], label='no morcellation')
    kmf.plot(ax=ax)
    kmf.fit(time_sim[ix1], event[ix1], label='non-power morcellation')
    kmf.plot(ax=ax)
    kmf.fit(time_sim[ix2], event[ix2], label='power morcellation')
    kmf.plot(ax=ax)


# Create indicator variables for the two morcellation groups.

# In[26]:


model_data = (kq4_data.assign(non_power_morcellation=(kq4_data.morcellation == 'yes') & (kq4_data.power == 'no'))
                      .assign(power_morcellation=(kq4_data.morcellation == 'yes') & (kq4_data.power == 'yes')))


# In[27]:


time = model_data['fup_months']
event = model_data.died.astype(int)

n_intervals = int(time.max() + 1)
n_patients = model_data.shape[0]


# In[28]:


# Mask the missing follow-up times so that PyMC3 will impute them.
# NB: this uses 1 as the sentinel value, which assumes no observed
# follow-up time is exactly 1 month.
time_missing = model_data['fup_months'].copy()
time_missing[time_missing.isnull()] = 1
time_masked = np.ma.masked_values(time_missing, value=1)

power = (model_data.power == 'yes').astype(int)
morcellation = (model_data.morcellation == 'yes').astype(int)


# In[29]:


fup_min, fup_max = model_data[['fup_min', 'fup_max']].T.values


# The study IDs used in the analysis:

# In[30]:


model_data.refid.unique()


# ## Exponential survival model

# In[31]:


from pymc3 import Normal, Deterministic, sample, NUTS, Model, DensityDist, Metropolis, Potential, Exponential, Uniform
from pymc3 import traceplot, forestplot, summary, sample_ppc
import theano.tensor as tt


# In[32]:


with Model() as exp_model:

    # Linear model parameters
    β = Normal('β', mu=0.0, tau=0.0001, shape=3)

    # Constant hazard rate for each subject
    λ = Deterministic('λ', tt.exp(β[0] + β[1] * morcellation + β[2] * power))

    # Impute missing event times
    # time_imputed = Exponential('time_imputed', lam=λ, observed=time_masked)
    time_imputed = Uniform('time_imputed', lower=fup_min, upper=fup_max, observed=time_masked)

    # Impose bounds on the missing times
    fup_bound = Potential('fup_bound', Uniform.dist(lower=fup_min, upper=fup_max).logp(time_imputed))

    # Exponential survival likelihood, accounting for censoring: deaths contribute
    # log f(t) = log(λ) - λt, while censored subjects contribute log S(t) = -λt.
    def logp(event):
        return event * tt.log(λ) - λ * time_imputed

    survival = DensityDist('survival', logp, observed={'event': event})


# In[33]:


n_iterations = 50000
n_burn = 40000


# In[34]:


with exp_model:

    start = {'time_imputed_missing': np.random.uniform(model_data.fup_min[model_data.fup_months.isnull()],
                                                       model_data.fup_max[model_data.fup_months.isnull()])}

    exp_trace = sample(n_iterations, step=Metropolis(), start=start, random_seed=rseed)


# In[35]:


traceplot(exp_trace[n_burn:], varnames=['β']);


# In[36]:


summary(exp_trace[n_burn:], varnames=['β'])


# Imputed follow-up times:

# In[37]:


plt.figure(figsize=(6, 12))
forestplot(exp_trace, varnames=['time_imputed_missing'])
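# Metropolis samples mix slowly, so it is worth inspecting the autocorrelation of the
# chains before trusting the summaries below (a diagnostic added here; not part of the
# original notebook).

# In[ ]:


pm.autocorrplot(exp_trace[n_burn:], varnames=['β']);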
# Posterior hazard rates for each treatment group.

# In[38]:


baseline_hazard = np.exp(exp_trace['β'][n_burn:, 0])
non_power_hazard = np.exp(exp_trace['β'][n_burn:, 0] + exp_trace['β'][n_burn:, 1])
power_hazard = np.exp(exp_trace['β'][n_burn:, 0] + exp_trace['β'][n_burn:, 1] + exp_trace['β'][n_burn:, 2])


# Since this is a constant-hazard model, the hazard estimates are easy to summarize.

# In[39]:


hazard_df = pd.DataFrame({'baseline': baseline_hazard,
                          'non-power': non_power_hazard,
                          'power': power_hazard})

hazard_df.describe(percentiles=[0.025, 0.975]).T[['mean', '2.5%', '97.5%']].round(4)


# In[40]:


hazard_df.boxplot()
plt.ylabel('Hazard');


# In[41]:


def cum_hazard(hazard, n=n_intervals):
    # For a constant hazard, the cumulative hazard is Λ(t) = λt.
    return np.arange(n) * hazard.reshape(-1, 1)

def survival(hazard):
    # Survival function: S(t) = exp(-Λ(t)).
    return np.exp(-cum_hazard(hazard))


# In[42]:


def plot_with_hpd(x, hazard, f, ax, color=None, label=None, alpha=0.05, **kwargs):

    mean = f(hazard.mean(axis=0), **kwargs)

    percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
    hpd = np.percentile(f(hazard, **kwargs), percentiles, axis=0)

    ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
    ax.step(x, mean.squeeze(), color=color, label=label);


# In[43]:


interval_bounds = np.arange(0, time.max() + 2, 1)


# In[44]:


fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6))

plot_with_hpd(interval_bounds[:-1], baseline_hazard, cum_hazard, hazard_ax, color=blue, label='Baseline')
plot_with_hpd(interval_bounds[:-1], non_power_hazard, cum_hazard, hazard_ax, color=green, label='Non-power morcellation')
plot_with_hpd(interval_bounds[:-1], power_hazard, cum_hazard, hazard_ax, color=red, label='Power morcellation')

hazard_ax.set_xlim(0, time.max());
hazard_ax.set_xlabel('Followup time');
hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
hazard_ax.legend(loc=2);

plot_with_hpd(interval_bounds[:-1], baseline_hazard, survival, surv_ax, color=blue)
plot_with_hpd(interval_bounds[:-1], non_power_hazard, survival, surv_ax, color=green)
plot_with_hpd(interval_bounds[:-1], power_hazard, survival, surv_ax, color=red)

surv_ax.set_xlim(0, time.max());
surv_ax.set_xlabel('Followup time');
surv_ax.set_ylabel('Survival function $S(t)$');

fig.suptitle('Bayesian survival model');


# Unfortunately, the exponential (*i.e.* constant-hazard) model does not appear to fit the data, judging by the comparison between the Kaplan-Meier curves and the estimated survival functions.
#
# Below, we try a Weibull model for the survival function.
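# The Weibull survival function, $S(t) = \exp(-(t/\lambda)^k)$, reduces to the exponential model when the shape parameter $k = 1$, so the posterior for $k$ also serves as a check on the constant-hazard assumption.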
# ## Weibull survival model

# In[45]:


from pymc3 import Weibull, df_summary


# In[46]:


with Model() as weibull_model:

    # Linear model parameters
    β = Normal('β', mu=0.0, tau=0.0001, shape=3)

    # Survival function parameters
    k = Uniform('k', 0, 20)
    λ = Deterministic('λ', tt.exp(β[0] + β[1] * morcellation + β[2] * power))

    # Impute missing event times
    # time_imputed = Weibull('time_imputed', k, λ, observed=time_masked)
    time_imputed = Uniform('time_imputed', lower=fup_min, upper=fup_max, observed=time_masked)

    # Impose bounds on the missing times
    fup_bound = Potential('fup_bound', Uniform.dist(lower=fup_min, upper=fup_max).logp(time_imputed))

    # Weibull survival likelihood, accounting for censoring: deaths contribute
    # log f(t) = log(k/λ) + (k-1)·log(t/λ) - (t/λ)^k, while censored subjects
    # contribute log S(t) = -(t/λ)^k.
    def logp(event):
        return event * (tt.log(k / λ) + (k - 1) * tt.log(time_imputed / λ)) - (time_imputed / λ)**k

    survival = DensityDist('survival', logp, observed={'event': event})


# In[47]:


with weibull_model:

    start = {'time_imputed_missing': np.random.uniform(model_data.fup_min[model_data.fup_months.isnull()],
                                                       model_data.fup_max[model_data.fup_months.isnull()])}

    weibull_trace = sample(n_iterations, step=Metropolis(), start=start, random_seed=rseed)


# In[48]:


traceplot(weibull_trace[n_burn:], varnames=['β']);


# The Weibull shape parameter:

# In[49]:


traceplot(weibull_trace[n_burn:], varnames=['k']);


# In[50]:


baseline_lam = np.exp(weibull_trace['β'][n_burn:, 0])
non_power_lam = np.exp(weibull_trace['β'][n_burn:, 0] + weibull_trace['β'][n_burn:, 1])
power_lam = np.exp(weibull_trace['β'][n_burn:, 0] + weibull_trace['β'][n_burn:, 1] + weibull_trace['β'][n_burn:, 2])

k_trace = weibull_trace['k'][n_burn:]


# In[51]:


n_intervals = 250
interval_bounds = np.arange(0, n_intervals + 1, 1)


# In[52]:


def cum_weibull_hazard(lam, k=k_trace, t=n_intervals):
    # Weibull cumulative hazard: Λ(t) = (t/λ)^k.
    return (np.arange(t) / lam.reshape(-1, 1))**k.reshape(-1, 1)

def weibull_survival(lam, k=k_trace, t=n_intervals):
    return np.exp(-cum_weibull_hazard(lam, k, t))


# In[53]:


def plot_with_hpd(x, lam, f, ax, color=None, label=None, alpha=0.05, **kwargs):

    m = f(lam, **kwargs).mean(axis=0)

    percentiles = 100 * np.array([alpha / 2., 1. - alpha / 2.])
    hpd = np.percentile(f(lam, **kwargs), percentiles, axis=0)

    ax.fill_between(x, hpd[0], hpd[1], color=color, alpha=0.25)
    ax.step(x, m, color=color, label=label);


# In[54]:


fig, (hazard_ax, surv_ax) = plt.subplots(ncols=2, sharex=True, sharey=False, figsize=(16, 6))

plot_with_hpd(interval_bounds[:-1], baseline_lam, cum_weibull_hazard, hazard_ax, color=blue, label='Baseline')
plot_with_hpd(interval_bounds[:-1], non_power_lam, cum_weibull_hazard, hazard_ax, color=green, label='Non-power morcellation')
plot_with_hpd(interval_bounds[:-1], power_lam, cum_weibull_hazard, hazard_ax, color=red, label='Power morcellation')

hazard_ax.set_xlim(0, n_intervals);
hazard_ax.set_xlabel('Followup time');
hazard_ax.set_ylabel(r'Cumulative hazard $\Lambda(t)$');
hazard_ax.legend(loc=2);

plot_with_hpd(interval_bounds[:-1], baseline_lam, weibull_survival, surv_ax, color=blue)
plot_with_hpd(interval_bounds[:-1], non_power_lam, weibull_survival, surv_ax, color=green)
plot_with_hpd(interval_bounds[:-1], power_lam, weibull_survival, surv_ax, color=red)

surv_ax.set_xlim(0, n_intervals);
surv_ax.set_xlabel('Followup time');
surv_ax.set_ylabel('Survival function $S(t)$');

fig.suptitle('Bayesian survival model');


# The above is a much more reasonable fit.
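# Since the fitted model is Weibull, posterior median survival times are available in
# closed form: S(t) = 1/2 at t = λ·(log 2)^(1/k). The summary below is a sketch added
# here (not part of the original notebook), using the same 95% interval style as the
# hazard table above.

# In[ ]:


median_survival = pd.DataFrame({'baseline': baseline_lam * np.log(2)**(1 / k_trace),
                                'non-power': non_power_lam * np.log(2)**(1 / k_trace),
                                'power': power_lam * np.log(2)**(1 / k_trace)})

median_survival.describe(percentiles=[0.025, 0.975]).T[['mean', '2.5%', '97.5%']].round(1)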