#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc as pm
import pdb

pm.__version__


# In[2]:

flu = pd.read_csv('data/flu_organism.csv', index_col=0)


# In[3]:

flu.shape


# In[4]:

flu.PatientID.unique().shape


# ### Turn into table of unique patients, with appropriate organism columns
#
# Create an indicator for flu-only patients.

# In[5]:

flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s: len([x for x in s if str(x).startswith('Influenza')]) == len(s)).astype(bool)


# In[6]:

flu_only.head()


# In[7]:

flu_only.name = 'flu_only'


# In[8]:

flu = flu.merge(pd.DataFrame(flu_only), left_on='PatientID', right_index=True)


# In[9]:

flu_staph_organism = flu.groupby('PatientID')['OrganismName'].apply(lambda s: len([x for x in s if str(x).startswith('Influenza') or str(x).startswith('Staphylococcus aureus')]) > 1).astype(bool)


# In[10]:

flu_staph_organism.name = 'flu_staph'


# In[11]:

flu = flu.merge(pd.DataFrame(flu_staph_organism), left_on='PatientID', right_index=True)


# In[12]:

flu_staph_organism.mean()


# See if there are any flu-only or flu-other individuals with an ICD9 diagnosis of S. aureus.

# In[13]:

flu_other = flu.groupby('PatientID')['OrganismName'].apply(lambda s: len([x for x in s if str(x).startswith('Influenza') or str(x).startswith('Other')]) > 1).astype(bool)


# In[14]:

flu_other.name = 'flu_other'


# In[15]:

flu = flu.merge(pd.DataFrame(flu_other), left_on='PatientID', right_index=True)


# There do not appear to be any cases of flu-only or flu-other (via organism) that have a S. aureus diagnosis via ICD9.

# In[16]:

(flu_other & flu.s_aureus_icd9).sum()


# In[17]:

(flu_only & flu.s_aureus_icd9).sum()


# In[18]:

flu.groupby('PatientID')['OrganismName'].value_counts()


# Create indicators for coinfection type.

# In[19]:

for org in flu.Type.unique():
    flu[org] = (flu.Type == org).astype(int)


# Create a data frame of unique patients.

# In[20]:

flu_unique = flu.drop_duplicates(subset=['PatientID']).set_index('PatientID')


# In[21]:

flu_unique['flu_only'] = flu_only


# In[22]:

flu_unique.flu_only.mean()


# In[23]:

flu_unique.flu_staph.mean()


# Several missing values for admission-to-time-on-ECMO.

# In[24]:

flu_unique.AdmitToTimeOnHours.isnull().mean()


# Since we need this field to calculate event time, we will have to drop individuals with missing values from the survival analysis.

# In[25]:

flu_unique = flu_unique.dropna(subset=['AdmitToTimeOnHours', 'HoursECMO'])
assert not flu_unique.AdmitToTimeOnHours.isnull().sum()


# Add the time from admission to time on ECMO to get the total exposure time.

# In[26]:

admit_through_emco = flu_unique.AdmitToTimeOnHours.add(flu_unique.HoursECMO)
flu_unique['admit_through_emco'] = admit_through_emco


# See how many have no time information from admission, and drop these.

# In[27]:

missing = admit_through_emco.isnull()
flu_complete = flu_unique[missing ^ True]
admit_through_emco.dropna(inplace=True)
assert not admit_through_emco.isnull().sum()


# ### Time to event
#
# Time off ECMO through to the event (death or discharge). Missing values of `TimeOffToDeathDateHours` are filled with zeros, implying censoring.

# In[28]:

flu_unique.TimeOffToDCDateHours.add(flu_unique.TimeOffToDeathDateHours, fill_value=0).isnull().sum()


# In[29]:

off_ecmo_to_event = flu_complete.TimeOffToDCDateHours.add(flu_complete.TimeOffToDeathDateHours, fill_value=0)


# In[30]:

off_ecmo_to_event.isnull().sum()
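# As a side note, the `fill_value=0` in the `.add()` calls above is what encodes the censoring
# assumption: a missing time on one side is treated as zero, so the combined value is whichever
# of the two times was actually recorded. A minimal sketch with made-up values (not from the
# data) illustrates the behaviour:

toy_dc = pd.Series([10., np.nan, np.nan], index=['a', 'b', 'c'])     # hours to discharge
toy_death = pd.Series([np.nan, 24., np.nan], index=['a', 'b', 'c'])  # hours to death
toy_dc.add(toy_death, fill_value=0)  # a -> 10.0 (discharged/censored), b -> 24.0 (died), c -> NaN (no information)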
# Time to event is the admit-through-ECMO time plus the off-ECMO-to-event time.

# In[35]:

flu_complete['time_to_event'] = admit_through_emco + off_ecmo_to_event.fillna(0)
assert not flu_complete.time_to_event.isnull().sum()


# Create variables for use in the analysis.

# In[36]:

# ECMO type
flu_complete['VA'] = flu_complete.Mode.isin(['VA', 'VV-VA'])

# Set "Other" type to NA (there are only a couple)
flu_complete.loc[flu_complete.Mode == 'Other', 'VA'] = None


# Create the oxygenation index.

# In[37]:

flu.columns


# In[38]:

flu_complete['OI'] = flu_complete.FiO2 * flu_complete.MAP / flu_complete.PO2


# In[39]:

flu_complete.OI.hist(bins=20)


# In[40]:

covariates = ['AgeYears', 'pH', 'OI', 'VA']


# Get the proportion of missing values for each covariate.

# In[41]:

flu_complete[covariates].isnull().mean()


# In[42]:

flu_complete[covariates].hist(bins=25);


# In[43]:

flu_complete[flu_complete.AgeYears < 1].AgeDays.hist(bins=15)


# ## Survival Model

# Survival time, in days.

# In[44]:

obs_t = flu_complete.time_to_event.values / 24.


# In[45]:

plt.hist(obs_t, bins=25);


# Fate of each patient.

# In[46]:

died = (flu_complete.fate == 'Died').astype(int).values


# Unique failure times.

# In[47]:

times = np.unique((flu_complete.time_to_event[flu_complete.fate == 'Died'].values / 24).astype(int))
times


# In[48]:

N_obs = len(obs_t)
T = len(times) - 1


# Calculate the risk set (the number at risk at each event time).

# In[49]:

Y = np.array([[int(obs >= t) for t in times] for obs in obs_t])
Y


# Counting process. Jump = 1 if $\text{obs}_t \in [ t_j, t_{j+1} )$

# In[50]:

dN = np.array([[Y[i, j] * (times[j + 1] >= obs_t[i]) * died[i] for i in range(N_obs)] for j in range(T)])


# In[51]:

# Sample counting process for one randomly-chosen patient (patients are the columns of dN)
dN[:, np.random.randint(dN.shape[1])]


# Functions for standardizing and centering.

# In[52]:

center = lambda x: (x - x[np.isnan(x) ^ True].mean())
standardize = lambda x: center(x) / x[np.isnan(x) ^ True].std()


# In[53]:

covariates


# In[54]:

AgeYears, pH, OI, va = flu_complete[covariates].values.T

AgeYears_center = center(AgeYears).round(1)

# Center pH at 7.4
pH_center = pH - 7.4

OI_std = standardize(OI)


# ## Survival curves

# In[55]:

from lifelines import KaplanMeierFitter

km = KaplanMeierFitter()
km.fit(obs_t, event_observed=died)
ax = km.plot()


# In[56]:

flu_only_ind = flu_complete.flu_only.values
flu_staph_ind = flu_complete.flu_staph.values
flu_other_ind = flu_complete.flu_other.values


# In[57]:

km = KaplanMeierFitter()
km.fit(obs_t[flu_staph_ind], event_observed=died[flu_staph_ind], label='flu+staph')
ax = km.plot()
km.fit(obs_t[flu_only_ind], event_observed=died[flu_only_ind], label='flu only')
km.plot(ax=ax)
km.fit(obs_t[flu_other_ind], event_observed=died[flu_other_ind], label='flu+other')
km.plot(ax=ax)


# In[58]:

age_complete = flu_complete.AgeYears.values
infant_ind = age_complete < 1
kids_ind = (age_complete >= 1) & (age_complete < 19)
adult_ind = age_complete >= 19


# In[59]:

km = KaplanMeierFitter()
km.fit(obs_t[infant_ind], event_observed=died[infant_ind], label='infants')
ax = km.plot()
km.fit(obs_t[kids_ind], event_observed=died[kids_ind], label='children')
km.plot(ax=ax)
km.fit(obs_t[adult_ind], event_observed=died[adult_ind], label='adults')
km.plot(ax=ax)


# In[60]:

va_ind = flu_complete.VA.astype(bool).values


# In[61]:

km = KaplanMeierFitter()
km.fit(obs_t[va_ind], event_observed=died[va_ind], label='VA')
ax = km.plot()
km.fit(obs_t[~va_ind], event_observed=died[~va_ind], label='VV')
km.plot(ax=ax)
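# Before moving to the models, here is a minimal sketch of how the risk-set matrix `Y` and the
# jump matrix `dN` constructed above work, using made-up observation times (not from the data):
# two patients, one dying at t=3 and one censored at t=6, against event times [2, 5, 9].

toy_times = np.array([2, 5, 9])
toy_obs = np.array([3., 6.])
toy_died = np.array([1, 0])
toy_T = len(toy_times) - 1

# At risk: patient i is in the risk set at time t_j if still being followed (obs >= t_j)
toy_Y = np.array([[int(obs >= t) for t in toy_times] for obs in toy_obs])
# array([[1, 0, 0],
#        [1, 1, 0]])

# Jump: dN[j, i] = 1 only if patient i dies between t_j and t_{j+1}
toy_dN = np.array([[toy_Y[i, j] * (toy_times[j + 1] >= toy_obs[i]) * toy_died[i]
                    for i in range(len(toy_obs))] for j in range(toy_T)])
# array([[1, 0],
#        [0, 0]])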
# ## Model

# In[95]:

from pymc import MCMC, Normal, Poisson, Uniform, Lambda, Laplace, Exponential, Beta, HalfCauchy, Uninformative
from pymc import Bernoulli, Lognormal, observed, Matplot, deterministic, Gamma, stochastic
from pymc import AdaptiveMetropolis, normal_like, rnormal, potential
from pymc.gp import *


# In[96]:

# Sentinel values for missing elements; these are masked and imputed in the model
OI_std[np.isnan(OI_std)] = 0
pH_center[np.isnan(pH_center)] = 7.111
va[np.isnan(va)] = 0.5


# In[97]:

gaussian_rbf = lambda x, mu, l: np.exp(-np.abs(x - mu)**2 / l**2)


# In[98]:

def rbf(name, x, n=10):
    # n is the number of basis functions; h are their centers, evenly spaced over x
    h = np.linspace(x.min(), x.max(), n)

    # Common length scale for the basis functions
    l = HalfCauchy('l_%s' % name, 0, 5, value=1)

    # RBF weights for all the points, for all the basis functions
    w = Lambda('w_%s' % name, lambda l=l: np.array([gaussian_rbf(i, h, l) for i in x]))

    τ = Exponential('τ_%s' % name, 1, value=np.ones(n))

    # Coefficients
    β = Normal('β_%s' % name, 0, τ, value=np.zeros(n))

    τ_rbf = Exponential('τ_rbf_%s' % name, 1, value=1)

    @stochastic(name='rbf_%s' % name)
    def rbf(value=np.zeros(len(x)), β=β, w=w, τ=τ_rbf):
        m = w.dot(β)
        return normal_like(value, m, τ)

    @deterministic(name='rbf_eval_%s' % name)
    def rbf_eval(β=β, τ=τ_rbf, l=l):
        wts = np.array([gaussian_rbf(i, h, l) for i in h])
        m = wts.dot(β)
        return rnormal(m, τ)

    return locals()
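# To see what the `gaussian_rbf` basis above produces, here is a minimal sketch with made-up
# inputs and hypothetical coefficients (none of these values come from the data): every input
# point gets a weight against each basis center, and a fitted curve is just the weighted sum
# of the basis functions.

x_toy = np.linspace(0, 10, 50)
h_toy = np.linspace(0, 10, 5)      # 5 basis-function centers
l_toy = 2.0                        # common length scale
W_toy = np.array([gaussian_rbf(xi, h_toy, l_toy) for xi in x_toy])  # (50, 5) design matrix
beta_toy = np.array([0., 1., -0.5, 0.5, 0.])                        # hypothetical coefficients
plt.plot(x_toy, W_toy.dot(beta_toy));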
# In[134]:

def spline(name, x, knots, smoothing, interpolation_method='linear'):
    """ Generate PyMC objects for a spline model of an age-specific rate

    Parameters
    ----------
    name : str
    x : array, points to interpolate to
    knots : array
    smoothing : pymc.Node, smoothness parameter for smoothing spline
    interpolation_method : str, optional, one of 'linear', 'nearest', 'zero',
        'slinear', 'quadratic', 'cubic'

    Results
    -------
    Returns dict of PyMC objects, including 'gamma' (log of rate at knots) and
    'mu_x' (rate interpolated at all x points)
    """
    assert np.all(np.diff(knots) > 0), 'Spline knots must be strictly increasing'

    # TODO: consider changing this prior distribution to be something more familiar in linear space
    gamma = [Normal('gamma_%s_%d' % (name, k), 0., 10.**-2, value=0.) for k in knots]
    #gamma = [mc.Uniform('gamma_%s_%d'%(name,k), -20., 20., value=-10.) for k in knots]

    import scipy.interpolate

    @deterministic(name='mu_x_%s' % name)
    def mu_x(gamma=gamma, knots=knots, x=x):
        mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method,
                                        bounds_error=False, fill_value=0.)
        return mu(x)

    vars = dict(gamma=gamma, mu_x=mu_x, x=x, knots=knots)

    if (smoothing > 0) and (smoothing < 1e10):
        #print 'adding smoothing of', smoothing

        @potential(name='smooth_mu_%s' % name)
        def smooth_gamma(gamma=gamma, knots=knots, tau=smoothing**-2):
            # The following includes a "noise floor" so that a zero level value
            # prior does not exert undue influence on the age-pattern smoothing;
            # only values within 10x of the mean are smoothed.
            # TODO: consider changing this to an offset log normal
            gamma = np.clip(gamma, np.log(np.exp(gamma).mean() / 10.), np.inf)
            return normal_like(np.sqrt(np.sum(np.diff(gamma)**2 / np.diff(knots))), 0, tau)

        vars['smooth_gamma'] = smooth_gamma

    return vars


# In[135]:

def exponential_survival_model():
    # Imputation of missing values
    p_va = Beta('p_VA', 1, 1, value=0.5)
    va_masked = np.ma.masked_values(va, value=0.5)
    x_va = Bernoulli('x_va', p_va, value=va_masked, observed=True)

    mu_pH = Normal('mu_pH', 0, 0.0001, value=7)
    sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
    tau_pH = sigma_pH**-2
    pH_masked = np.ma.masked_values(pH_center, value=7.111)
    x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)

    # mu_OI = Normal('mu_OI', 0, 0.0001, value=0)
    # sigma_OI = Uniform('sigma_OI', 0, 500, value=10)
    # tau_OI = sigma_OI**-2
    # OI_masked = np.ma.masked_values(OI_std, value=0)
    # x_OI = Normal('x_OI', mu_OI, tau_OI, value=OI_masked, observed=True)

    X = [x_va, flu_unique.flu_only, flu_unique.flu_staph]

    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001, value=0)

    # Covariates
    beta = Normal('beta', 0, 0.001, value=np.zeros(len(X)))

    age_knots = np.linspace(AgeYears_center.min(), AgeYears_center.max(), 7)
    α_age = Exponential('α_age', 1, value=0.1)
    spline_age = spline('spline_age', AgeYears_center, age_knots, α_age)

    pH_knots = np.linspace(pH_center.min(), pH_center.max(), 7)
    α_pH = Exponential('α_pH', 1, value=0.1)
    spline_pH = spline('spline_pH', pH_center, pH_knots, α_pH)

    # Survival rates
    @deterministic
    def lam(b0=beta0, b=beta, x=X, γ=spline_age['mu_x'], ρ=spline_pH['mu_x']):
        return np.exp(b0 + np.dot(np.transpose(x), b) + γ + ρ)

    @observed
    def survival(value=obs_t, lam=lam, f=died):
        """Exponential survival likelihood, accounting for censoring"""
        return (f * np.log(lam) - lam * value).sum()

    return locals()


# In[136]:

M = MCMC(exponential_survival_model())


# In[137]:

iterations = 50000
burn = 40000


# In[138]:

M.sample(iterations, burn)


# In[139]:

# Sample a second time
M.sample(iterations, burn)


# In[140]:

Matplot.summary_plot(M.beta, custom_labels=['VA', 'flu only', 'flu + staph'], hpd=False)


# In[141]:

Matplot.summary_plot(M.__dict__['spline_pH']['gamma'],
                     custom_labels=np.linspace(flu_unique.pH.dropna().min(),
                                               flu_unique.pH.dropna().max(), 7).round(1).astype(str),
                     xlab='Effect size', main='pH effect')


# In[142]:

Matplot.summary_plot(M.__dict__['spline_age']['gamma'],
                     custom_labels=np.linspace(AgeYears.min(), AgeYears.max(), 7).round(1).astype(str),
                     xlab='Effect size', main='Age effect')
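# For reference, the `survival` log-likelihood in the exponential model above is the standard
# censored-exponential form: with hazard $\lambda_i$ and death indicator $f_i$, each patient
# contributes
#
# $$\log L_i = f_i \log \lambda_i - \lambda_i t_i,$$
#
# which is the log-density $\log(\lambda_i e^{-\lambda_i t_i})$ for observed deaths and the log
# of the survival function, $-\lambda_i t_i$, for censored observations.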
# Same model as above, but with a Weibull hazard function.

# In[143]:

def weibull_survival_model():
    # Imputation of missing values
    p_va = Beta('p_VA', 1, 1, value=0.5)
    va_masked = np.ma.masked_values(va, value=0.5)
    x_va = Bernoulli('x_va', p_va, value=va_masked, observed=True)

    mu_pH = Normal('mu_pH', 0, 0.0001, value=7)
    sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
    tau_pH = sigma_pH**-2
    pH_masked = np.ma.masked_values(pH_center, value=7.111)
    x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)

    X = [x_va, flu_unique.flu_only, flu_unique.flu_staph]

    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001, value=0)

    # Covariates
    beta = Normal('beta', 0, 0.001, value=np.zeros(len(X)))

    age_knots = np.linspace(AgeYears_center.min(), AgeYears_center.max(), 7)
    α_age = Exponential('α_age', 1, value=0.1)
    spline_age = spline('spline_age', AgeYears_center, age_knots, α_age)

    pH_knots = np.linspace(pH_center.min(), pH_center.max(), 7)
    α_pH = Exponential('α_pH', 1, value=0.1)
    spline_pH = spline('spline_pH', pH_center, pH_knots, α_pH)

    # Survival rates
    @deterministic
    def lam(b0=beta0, b=beta, x=X, γ=spline_age['mu_x'], ρ=spline_pH['mu_x']):
        return np.exp(b0 + np.dot(np.transpose(x), b) + γ + ρ)

    # Weibull shape parameter
    α = Exponential('α', 1, value=1)

    @observed
    def survival(value=obs_t, lam=lam, f=died, α=α):
        # Weibull survival log-likelihood, accounting for censoring
        return (f * (np.log(α) + (α - 1) * np.log(value * lam) + np.log(lam)) - (lam * value)**α).sum()

    return locals()


# In[144]:

W = MCMC(weibull_survival_model())


# In[145]:

W.sample(50000, 40000)


# ## Cox model

# In[162]:

def cox_model():
    # Imputation of missing values
    p_va = Beta('p_VA', 1, 1, value=0.5)
    va_masked = np.ma.masked_values(va, value=0.5)
    x_va = Bernoulli('x_va', p_va, value=va_masked, observed=True)

    mu_pH = Normal('mu_pH', 0, 0.0001, value=7)
    sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
    tau_pH = sigma_pH**-2
    pH_masked = np.ma.masked_values(pH_center, value=7.111)
    x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)

    X = [x_va, flu_unique.flu_only, flu_unique.flu_staph]

    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001, value=0)

    # Covariates
    beta = Normal('beta', 0, 0.001, value=np.zeros(len(X)))

    age_knots = np.linspace(AgeYears_center.min(), AgeYears_center.max(), 7)
    α_age = Exponential('α_age', 1, value=0.1)
    spline_age = spline('spline_age', AgeYears_center, age_knots, α_age)

    pH_knots = np.linspace(pH_center.min(), pH_center.max(), 7)
    α_pH = Exponential('α_pH', 1, value=0.1)
    spline_pH = spline('spline_pH', pH_center, pH_knots, α_pH)

    c = Gamma('c', .0001, .00001, value=0.1)
    r = Gamma('r', .001, .0001, value=0.1)

    # Prior mean of the baseline hazard increments
    dL0_star = Lambda('dL0_star', lambda r=r: r * np.diff(times))

    mu = Lambda('mu', lambda dL0_star=dL0_star, c=c: dL0_star * c)
    dL0 = Gamma('dL0', mu, c, value=np.ones(T))

    @deterministic
    def Idt(b0=beta0, b=beta, x=X, dL0=dL0, γ=spline_age['mu_x'], ρ=spline_pH['mu_x']):
        # Poisson (counting-process) trick: expected number of events for patient i in
        # interval j is (at risk) x (relative hazard) x (baseline hazard increment)
        Xb = np.exp(b0 + np.dot(np.transpose(x), b) + γ + ρ)
        return np.transpose(Y[:, :-1] * np.outer(Xb, dL0))
        # return [[Y[i,j]*Xb[i] for i in range(N_obs)] for j in range(T)]

    dN_like = Poisson('dN_like', Idt, value=dN, observed=True)

    return locals()


# In[163]:

M_cox = MCMC(cox_model())


# In[164]:

M_cox.sample(20000, 10000)


# In[165]:

Matplot.summary_plot(M_cox.beta, custom_labels=['VA', 'flu only', 'flu + staph'], hpd=False)


# In[166]:

Matplot.summary_plot(M_cox.__dict__['spline_pH']['gamma'],
                     custom_labels=np.linspace(flu_unique.pH.dropna().min(),
                                               flu_unique.pH.dropna().max(), 7).round(1).astype(str),
                     xlab='Effect size', main='pH effect')


# In[167]:

Matplot.summary_plot(M_cox.__dict__['spline_age']['gamma'],
                     custom_labels=np.linspace(AgeYears.min(), AgeYears.max(), 7).round(1).astype(str),
                     xlab='Effect size', main='Age effect')
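# A closing note on the Cox model above: it uses the counting-process formulation, in which the
# observed jump matrix `dN` is modelled with conditionally independent Poisson increments,
#
# $$dN_{ij} \sim \text{Poisson}(I_{ij}\,dt), \qquad
#   I_{ij}\,dt = Y_{ij}\,\exp(\beta_0 + x_i^\top\beta + \gamma_i + \rho_i)\,d\Lambda_{0j},$$
#
# where $Y_{ij}$ indicates that patient $i$ is still at risk in interval $j$, $\gamma_i$ and
# $\rho_i$ are the age and pH spline effects, and $d\Lambda_{0j}$ are the gamma-distributed
# baseline hazard increments (`dL0`).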