#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm


# In[2]:


flu = pd.read_csv('data/flu.csv', index_col=0)


# In[3]:


flu.shape


# Turn into table of unique patients, with appropriate organism columns

# In[4]:


flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s: 
                        len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(int)


# In[5]:


coinfections = flu.groupby('PatientID')['OrganismName'].apply(lambda s: 
                        [x for x in s.dropna() if not str(x).startswith('Influenza')])


# MRSA coinfection

# In[6]:


mrsa = coinfections.apply(lambda x: x.count('Staphylococcus aureus, meth resist') > 0).astype(int)


# Any coinfection

# In[7]:


any_coinf = coinfections.apply(lambda x: int(len(x)>0))


# In[8]:


flu_unique = flu.drop_duplicates(cols=['PatientID']).set_index('PatientID')


# In[9]:


flu_unique['flu_only'] = flu_only
flu_unique['mrsa'] = mrsa
flu_unique['any_coinf'] = any_coinf


# Add time from admission to time on ECMO

# In[10]:


admit_through_emco = flu_unique.AdmitToTimeOnHours.add(flu_unique.HoursECMO, fill_value=0)


# See how many have no time information from admission, and drop these

# In[11]:


missing = admit_through_emco.isnull()
missing.sum()


# In[12]:


flu_complete = flu_unique[missing-True]
admit_through_emco.dropna(inplace=True)


# In[13]:


# Confirm no null here
admit_through_emco.isnull().sum()


# Time off ECMO through to event (death or discharge)

# In[14]:


off_ecmo_to_event = flu_complete.TimeOffToDCDateHours.add(flu_complete.TimeOffToDeathDateHours, 
                                                          fill_value=0)


# In[15]:


off_ecmo_to_event.isnull().sum()


# In[16]:


# Not sure why add() does not work here
flu_complete['time_to_event'] = (admit_through_emco.values + off_ecmo_to_event.fillna(0).values)


# In[17]:


flu_complete.time_to_event.isnull().sum()


# Clean covariates

# In[18]:


flu_complete.Race.value_counts()


# In[19]:


flu_complete['non_white'] = (flu_complete.Race!='W').astype(int)


# In[20]:


flu_complete['male'] = (flu_complete.Sex=='M').astype(int)


# In[21]:


flu_complete['OI'] = flu_complete.FiO2 * flu_complete.MAP / flu_complete.PO2


# In[22]:


# ECMO Type
flu_complete['VA'] = flu_complete.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_complete.loc[flu_complete.Mode=='Other', 'VA'] = None


# In[23]:


# covariates = ['male', 'AgeYears', 'non_white', 'HoursECMO', 'pH', 'PCO2', 'PO2', 'HCO3', 
#               'SaO2', 'mrsa', 'any_coinf']
covariates = ['male', 'AgeYears', 'non_white', 'pH', 'PCO2', 'HCO3', 'PO2', 'OI', 
              'SaO2', 'mrsa', 'any_coinf', 'AdmitToTimeOnHours', 'VA', 'PEEP']


# Get counts of missing values

# In[24]:


flu_complete[covariates].isnull().mean().round(2)*100


# In[28]:


flu_complete.columns


# In[29]:


flu_complete.groupby('fate')[['HoursECMO', 'AdmitToTimeOnHours']].mean()


# We are going to drop the missing values for some of the covariates, and not use some covariates with many missing values.

# In[25]:


drop_subset = ['HoursECMO', 'pH', 'PCO2', 'PO2', 'VA']


# In[26]:


flu_complete = flu_complete.dropna(subset=drop_subset)


# In[27]:


flu_complete[['pH', 'PCO2', 'PO2', 'HCO3', 
              'SaO2']].hist(bins=25);


# ## Survival Analysis

# Surivival time, in days

# In[28]:


obs_t = flu_complete.time_to_event.values/24.


# In[29]:


plt.hist(obs_t, bins=25);


# Fate of each patient

# In[30]:


died = (flu_complete.fate=='Died').astype(int).values


# ### Kaplan-Meier Curves

# In[31]:


from lifelines import KaplanMeierFitter


# In[32]:


km = KaplanMeierFitter()
km.fit(obs_t, event_observed=died)
ax = km.plot()


# In[33]:


mrsa_ind = flu_complete.mrsa.astype(bool).values


# In[34]:


km = KaplanMeierFitter()
km.fit(obs_t[~mrsa_ind], event_observed=died[~mrsa_ind], label='non-MRSA')
ax = km.plot()

km.fit(obs_t[mrsa_ind], event_observed=died[mrsa_ind], label='MRSA')
km.plot(ax=ax)


# ### Cumulative Hazard

# In[35]:


from lifelines import NelsonAalenFitter
na = NelsonAalenFitter()

na.fit(obs_t, event_observed=died)
na.plot()


# Unique failure times

# In[36]:


times = np.unique((flu_complete.time_to_event[flu_complete.fate=='Died'].values/24).astype(int))
times


# In[37]:


N_obs = len(obs_t)
T = len(times) - 1


# Calculate risk set

# In[38]:


Y = np.array([[int(obs >= t) for t in times] for obs in obs_t])
Y


# Counting process. Jump = 1 if $\text{obs}_t \in [ t_j, t_{j+1} )$

# In[39]:


dN = np.array([[Y[i,j]*(times[j+1] >= obs_t[i])*died[i] for i in range(N_obs)] for j in range(T)])


# In[40]:


# Process for one patient
dN[:,1]


# Covariates

# In[41]:


standardize = lambda x: (x - x[np.isnan(x)-True].mean()) / x[np.isnan(x)-True].std()


# In[42]:


analysis_covariates = ['male', 'AgeYears', 'non_white', 'HoursECMO', 'pH', 'PCO2', 'PO2', 'mrsa', 'any_coinf', 'VA']


# In[43]:


flu_complete[analysis_covariates].isnull().sum()


# In[44]:


male, AgeYears, non_white, HoursECMO, pH, PCO2, PO2, mrsa, any_coinf, VA= \
    flu_complete[analysis_covariates].values.T


HoursECMO_std = standardize(HoursECMO)
AgeYears_std = standardize(AgeYears)
pH_std = standardize(pH)
PCO2_std = standardize(PCO2)
PO2_std = standardize(PO2)
# HCO3_std = standardize(HCO3)


# ## Model

# In[45]:


import theano.tensor as tt
from pymc3 import Model, Normal, Poisson, DensityDist, T as StudentT, Uniform, Deterministic, HalfCauchy, Laplace, Exponential
from pymc3 import Bernoulli, Gamma, MvNormal, BinaryMetropolis
from pymc3.distributions.timeseries import GaussianRandomWalk
from pymc3 import sample, NUTS, Slice, Metropolis, forestplot, find_MAP


# In[46]:


n_kernels = 5
kernels = np.linspace(AgeYears_std.min(), AgeYears_std.max(), n_kernels)

def rbf(s, x):
    return [tt.exp( ((x - k)/ s)**2) for k in kernels]


# In[54]:


with Model() as model:
    
    sigma = Uniform('sigma', 0, 100)
    
    alpha_sd = Uniform('alpha_sd', 0, 100)

    alpha = GaussianRandomWalk('alpha', sd=alpha_sd, shape=n_kernels)

    a = tt.dot(alpha, rbf(sigma, AgeYears_std))
    
    X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T
    
    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001)
    # Treatment effect
    beta = Normal('beta', 0, 0.001, shape=X.shape[1])

    # Survival rates
    lam = tt.exp(beta0 + tt.dot(X, beta) + a)

    def exp_logp(failure, value):
        # Exponential survival log-likelihood
        return tt.sum(failure * tt.log(lam) - lam * value)

    surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))


# In[55]:


with model:
    trace = sample(2000, step=NUTS(), njobs=2)


# In[56]:


forestplot(trace, vars=['alpha'])


# In[57]:


forestplot(trace, vars=['beta'], ylabels=['male', 'pH', 'PCO2', 'PO2', 'HoursECMO', 'non_white', 'mrsa', 'any_coinf'])


# In[31]:


flu_complete.columns


# In[33]:


flu_complete.year.value_counts()


# In[82]:


forestplot(trace, vars=['beta'], ylabels=['male', 'pH', 'PCO2', 'PO2', 'HoursECMO', 'non_white', 'mrsa', 'any_coinf'])


# Same model above, but with a Weibull hazard function

# In[51]:


with Model() as model_weibull:
    
    X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T
    
    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001)
    # Treatment effect
    beta = Normal('beta', 0, 0.001, shape=X.shape[1])

    # Survival rates
    lam = tt.exp(beta0 + tt.dot(X, beta))

    alpha = Exponential('alpha', 1)
    def weibull_logp(failure, times):
        # Weibull survival log-likelihood
        return tt.sum(failure*(tt.log(alpha) + (alpha-1)*tt.log(times*lam) + tt.log(lam)) - (lam*times)**alpha)

    surv_like = DensityDist('surv_like', weibull_logp, observed=(died, obs_t))


# In[52]:


with model_weibull:
    trace_weibull = sample(2000, step=NUTS())


# The following incorporates a spline for the age effect.

# In[41]:


def interpolate(x0, y0, x):
    
    x = np.array(x)

    idx = np.searchsorted(x0, x)
    dl = np.array(x - x0[idx - 1])
    dr = np.array(x0[idx] - x)
    d=dl+dr
    wl = dr/d

    return wl*y0[idx-1] + (1-wl)*y0[idx]


# In[37]:


nknots = 10

with Model() as model_spline:
    
    #X = np.c_[[male, non_white, mrsa, any_coinf]].T
    
    coeff_sd = Uniform('coeff_sd', 0, 100)
    coeff_tau = coeff_sd**-2
    spline = Normal('spline', 0, coeff_tau, shape=nknots)
    knots = np.linspace(AgeYears.min(), AgeYears.max()+1, nknots)
    beta1 = Deterministic('beta1', interpolate(knots, spline, AgeYears))
    
    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001)
    # Treatment effect
    #beta = Normal('beta', 0, 0.001, shape=X.shape[1])

    # Survival rates
    lam = tt.exp(beta0 + beta1)# + tt.dot(X, beta))
    
    def exp_logp(failure, value):
        # Exponential survival log-likelihood
        return tt.sum(failure * tt.log(lam) - lam * value)

    surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))


# In[ ]:


with model_spline:
    trace_spline = sample(1000, step=NUTS())


# In[51]:


forestplot(trace_spline, vars=['beta'], 
           ylabels=['male', 'non_white', 'mrsa', 'any_coinf'])


# In[ ]:


# In[59]:


forestplot(trace_spline, vars=['spline']) 


# ## Log-logistic model

# In[ ]:


with Model() as log_logistic_model:
    
    sigma = Uniform('sigma', 0, 100)
    
    alpha_sd = Uniform('alpha_sd', 0, 100)

    alpha = GaussianRandomWalk('alpha', sd=alpha_sd, shape=n_kernels)

    a = tt.dot(alpha, rbf(sigma, AgeYears_std))
    
    X = np.c_[[male, pH_std, PCO2_std, PO2_std, HoursECMO_std, non_white, mrsa, any_coinf]].T
    
    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001)
    # Treatment effect
    beta = Normal('beta', 0, 0.001, shape=X.shape[1])

    # Survival rates
    lam = tt.exp(beta0 + tt.dot(X, beta) + a)

    def exp_logp(failure, value):
        # Exponential survival log-likelihood
        return tt.sum(failure * tt.log(lam) - lam * value)

    surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))


# ### Radial basis function model

# In[40]:


plt.hist(AgeYears)


# In[75]:


np.eye(4)*[2,3,4,5]


# In[ ]:


gaussian_basis = lambda x, h, l: np.exp(-(np.abs(x - h) / l)**2)

n_rbf = 20
centers = np.linspace(AgeYears.min(), AgeYears.max()+1, n_rbf)
l = 3
# Weights
w = [[gaussian_basis(x, h, l) for h in centers] for x in AgeYears]
w_pred = [[gaussian_basis(x, h, l) for h in centers] for x in np.linspace(AgeYears.min(), AgeYears.max()+1, 100)]

with Model() as model_rbf:
    
    # Prior on number of basis functions
    z = Bernoulli('z', 0.5, shape=n_rbf)
    
    # Prior for non-zero coefficients
    sigma = HalfCauchy('sigma', 25)
    #kappa = Gamma('kappa', 0.5, 0.5, shape=n_rbf)
    #Tau = tt.diag(kappa) #* sigma**-2
    
    #beta = MvNormal('beta', [0]*n_rbf, Tau, shape=n_rbf)
    beta = GaussianRandomWalk('beta', sd=sigma, shape=n_rbf)


    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001)

    # Hazard rates
    lam = Deterministic('lam', tt.exp(beta0 + tt.dot(w, z*beta)))
    lam_pred = Deterministic('lam_pred', tt.exp(beta0 + tt.dot(w_pred, z*beta)))
    
    def exp_logp(failure, value):
        # Exponential survival log-likelihood
        return tt.sum(failure * tt.log(lam) - lam * value)

    surv_like = DensityDist('surv_like', exp_logp, observed=(died, obs_t))


# In[ ]:


with model_rbf:
    start = {'z': [1]*n_rbf, 'sigma': 10, 'beta': [0]*n_rbf, 'beta0': 0}
    step1 = BinaryMetropolis(vars=[z])
    step2 = NUTS(vars=[sigma, beta, beta0], scaling=start)
    trace_rbf = sample(2000, step=[step1, step2], start=start)


# In[ ]:


from pymc import traceplot

forestplot(trace_rbf[5000:], vars=['lam_pred'])


# In[ ]:


_ = traceplot(trace_rbf[5000:], vars=['sigma'])


# In[ ]:


from pymc import summary
summary(trace_rbf, vars=['z'])


# In[ ]: