#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc as pm
import pdb

pm.__version__


# In[2]:


flu = pd.read_csv('data/flu_organism.csv', index_col=0)


# In[3]:


flu.shape


# In[4]:


flu.PatientID.unique().shape


# ### Turn into table of unique patients, with appropriate organism columns
# 
# Create indicator for flu-only.

# In[5]:


flu_only = flu.groupby('PatientID')['OrganismName'].apply(lambda s: 
                        len([x for x in s if str(x).startswith('Influenza')])==len(s)).astype(bool)


# In[6]:


flu_only.head()


# In[7]:


flu_only.name = 'flu_only'


# In[8]:


flu = flu.merge(pd.DataFrame(flu_only), left_on='PatientID', right_index=True)


# In[9]:


# flu_staph_organism = flu.groupby('PatientID')['OrganismName'].apply(lambda s: 
#             len([x for x in s if str(x).startswith('Influenza') or 
#                  str(x).startswith('Staphylococcus aureus')])>1).astype(bool)


# In[10]:


# flu_staph_organism.name = 'flu_staph_org'


# In[11]:


# flu = flu.merge(pd.DataFrame(flu_staph_organism), left_on='PatientID', right_index=True)


# See if there are any flu-only or flu-other individuals with an ICD9 diagnosis of S. Aureus

# In[12]:


flu_other = flu.groupby('PatientID')['OrganismName'].apply(lambda s: 
            len([x for x in s if str(x).startswith('Influenza') or 
                 str(x).startswith('Other')])>1).astype(bool)


# In[13]:


flu_other.name = 'flu_other'


# In[14]:


flu = flu.merge(pd.DataFrame(flu_other), left_on='PatientID', right_index=True)


# There do not appear to be cases of flu-only or flu-other via organism that have a S.Aureus diagnosis via ICD9

# In[15]:


(flu_other & flu.s_aureus_icd9).sum()


# In[16]:


(flu_only & flu.s_aureus_icd9).sum()


# Same for pneumo

# In[17]:


(flu_other & flu.pneumo_icd9).sum()


# In[18]:


flu.groupby('PatientID')['OrganismName'].value_counts()


# Create indictors for coinfection type

# In[19]:


for org in flu.Type.unique():
    flu[org] = (flu.Type==org).astype(int)


# Create data frame of unique patients

# In[20]:


flu_unique = flu.drop_duplicates(subset=['PatientID']).set_index('PatientID')


# In[21]:


flu_unique.s_aureus_icd9.sum()


# In[22]:


flu_unique['flu_only'] = flu_only


# In[23]:


flu_unique.flu_only.mean()


# Several missing values for admission to time on ECMO

# In[24]:


flu_unique.AdmitToTimeOnHours.isnull().mean()


# Create variables for use in analysis

# In[25]:


# ECMO Type
flu_unique['VA'] = flu_unique.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_unique.loc[flu_unique.Mode=='Other', 'VA'] = None


# Create oxygen index

# In[26]:


flu_unique['OI'] = flu_unique.FiO2 * flu_unique.MAP / flu_unique.PO2


# In[27]:


flu_unique.OI.hist(bins=20)


# In[28]:


covariates = ['AgeYears', 'pH', 'PO2', 'VA']


# Get counts of missing values

# In[29]:


flu_unique[covariates].isnull().mean()


# In[30]:


flu_unique[covariates].hist(bins=25);


# In[31]:


flu_unique[flu_unique.AgeYears< 1].AgeDays.hist(bins=15)


# In[32]:


flu_unique.s_aureus_icd9.sum()


# In[33]:


(flu_unique.flu_staph_org | flu_unique.s_aureus_icd9).sum()


# In[34]:


foo = '01AC9644-7CDF-4E28-A5F4-FE5B7F9FA22A'
flu_unique.ix[foo][['flu_staph_org', 's_aureus_icd9']]


# In[35]:


flu_unique['flu_staph'] = (flu_unique.flu_staph_org | flu_unique.s_aureus_icd9).astype(int)


# In[36]:


flu_unique.to_csv('Data/flu_unique.csv')


# ## Logistic regression model

# Fate of each patient

# In[37]:


died = (flu_unique.fate=='Died').astype(int).values


# In[38]:


N_obs = len(died)


# Functions for standarizing and centering

# In[39]:


center = lambda x: (x - x[np.isnan(x) ^ True].mean())
standardize = lambda x: center(x) / x[np.isnan(x) ^ True].std()


# In[40]:


covariates


# In[41]:


AgeYears, pH, PO2, va = flu_unique[covariates].values.T

AgeYears_center = center(AgeYears).round(1)

# Center pH at 7.4
pH_center = pH - 7.4

# OI_std = standardize(OI)
# PO2_std = standardize(PO2)


# In[42]:


no_coinfection = flu_unique.flu_only.values
with_staph = flu_unique.flu_staph.values


# In[43]:


flu_unique.flu_staph.sum()


# ## Model

# In[44]:


from pymc import MCMC, Normal, Poisson, Uniform, Lambda, Laplace, Exponential, Beta, HalfCauchy, Uninformative
from pymc import Bernoulli, Lognormal, observed, Matplot, deterministic, Gamma, stochastic
from pymc import AdaptiveMetropolis, normal_like, rnormal, potential, invlogit
from pymc.gp import *


# In[45]:


# Values for missing elements, to be replaced in model
# OI_std[np.isnan(OI_std)] = 0
pH_center[np.isnan(pH_center)] = 7.111
va[np.isnan(va)] = 0.5
PO2[np.isnan(PO2)] = 20.111


# In[46]:


def spline(name, x, knots, smoothing, interpolation_method='linear', pred_points=10):
    """ Generate PyMC objects for a spline model of age-specific rate

    Parameters
    ----------
    name : str
    knots : array
    x : array, points to interpolate to
    smoothing : pymc.Node, smoothness parameter for smoothing spline
    interpolation_method : str, optional, one of 'linear', 'nearest', 'zero', 'slinear', 'quadratic, 'cubic'

    Results
    -------
    Returns dict of PyMC objects, including 'gamma' (log of rate at
    knots) and 'mu_age' (age-specific rate interpolated at all age
    points)
    """
    assert np.all(np.diff(knots) > 0), 'Spline knots must be strictly increasing'

    # TODO: consider changing this prior distribution to be something more familiar in linear space
    gamma = [Normal('gamma_%s_%d'%(name,k), 0., 10.**-2, value=0.) for k in knots]
    #gamma = [mc.Uniform('gamma_%s_%d'%(name,k), -20., 20., value=-10.) for k in knots]

    import scipy.interpolate
    @deterministic(name='mu_x_%s'%name)
    def mu_x(gamma=gamma, knots=knots, x=x):
        mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method, 
                                        bounds_error=False, fill_value=0.)
        return mu(x)
    
    @deterministic(name='pred_%s'%name)
    def pred(gamma=gamma, knots=knots):
        mu = scipy.interpolate.interp1d(knots, np.exp(gamma), kind=interpolation_method, 
                                        bounds_error=False, fill_value=0.)
        return mu(np.linspace(x.min(), x.max(), pred_points))

    vars = dict(gamma=gamma, mu_x=mu_x, pred=pred, x=x, knots=knots)

    if (smoothing > 0) and (smoothing < 1e10):
        #print 'adding smoothing of', smoothing
        @potential(name='smooth_mu_%s'%name)
        def smooth_gamma(gamma=gamma, knots=knots, tau=smoothing**-2):
            # the following is to include a "noise floor" so that level value
            # zero prior does not exert undue influence on age pattern
            # smoothing
            # TODO: consider changing this to an offset log normal
            gamma = np.clip(gamma, np.log(np.exp(gamma).mean()/10.), np.inf)  # only include smoothing on values within 10x of mean

            return normal_like(np.sqrt(np.sum(np.diff(gamma)**2 / np.diff(knots))), 0, tau)
        vars['smooth_gamma'] = smooth_gamma

    return vars


# In[47]:


np.mean(PO2)


# In[48]:


plt.hist(PO2-80, bins=20)


# In[59]:


def survival_model(age_mask, n_knots=5):

    # Imputation of missing values
    p_va = Beta('p_VA', 1, 1, value=0.5)
    va_masked = np.ma.masked_values(va[age_mask], value=0.5)
    x_va = Bernoulli('x_va', p_va, value=va_masked, observed=True)
    
    mu_pH = Normal('mu_pH', 0, 0.0001, value=7)
    sigma_pH = Uniform('sigma_pH', 0, 500, value=10)
    tau_pH = sigma_pH**-2
    pH_masked = np.ma.masked_values(pH_center[age_mask], value=7.111)
    x_pH = Normal('x_pH', mu_pH, tau_pH, value=pH_masked, observed=True)
    
    alpha_PO2 = Exponential('alpha_PO2', 100, value=0.5)
    beta_PO2 = Exponential('beta_PO2', 100, value=200)
    PO2_masked = np.ma.masked_values(PO2[age_mask], value=20.111)
    x_PO2 = Gamma('x_PO2', alpha_PO2, beta_PO2, value=PO2_masked, observed=True)
    x_PO2_std = Lambda('x_PO2_std', lambda x=x_PO2: (x - 80)/np.std(x))

    X = [x_pH, x_PO2_std, x_va, flu_unique.flu_only.values[age_mask], flu_unique.flu_staph.values[age_mask]]
        
    # Intercept for survival rate
    beta0 = Normal('beta0', 0.0, 0.001, value=0)
    # Covariates
    beta = Normal('beta', 0, 0.001, value=np.zeros(len(X)))
    
    odds = Lambda('odds', lambda b=beta: np.exp(b))

    if AgeYears[age_mask].max()>1:
        age_knots = np.linspace(AgeYears_center[age_mask].min(), AgeYears_center[age_mask].max(), n_knots)
        α_age = Exponential('α_age', 1, value=0.1)
        spline_age = spline('spline_age', AgeYears_center[age_mask], age_knots, α_age)

        # Event rates
        @deterministic
        def π(b0=beta0, b=beta, x=X, γ=spline_age['mu_x']):
            return invlogit(b0 + np.dot(np.transpose(x), b) + γ)
    
    else:
        # Event rates
        @deterministic
        def π(b0=beta0, b=beta, x=X):
            return invlogit(b0 + np.dot(np.transpose(x), b))
    
    deaths = Bernoulli('deaths', π, value=died[age_mask], observed=True)
    
    return locals()


# ### Adults model

# In[60]:


adult_mask = AgeYears>=18


# In[61]:


M_adult = MCMC(survival_model(adult_mask))


# In[62]:


M_adult.sample(100000, 90000)


# In[63]:


cov_labels = ['pH', 'PO2', 'VA', 'flu only', 'flu + staph']


# In[64]:


Matplot.summary_plot(M_adult.odds, custom_labels=cov_labels, hpd=False, vline_pos=1)


# In[65]:


M_adult.odds.summary()


# In[66]:


Matplot.summary_plot(M_adult.__dict__['spline_age']['pred'], 
                     custom_labels=np.linspace(AgeYears[adult_mask].min(), AgeYears[adult_mask].max(), 10).round(1).astype(str),
                    xlab='Effect size', main='Age effect')


# ### Child model

# In[67]:


child_mask = (AgeYears>=1) & (AgeYears<18)

M_child = MCMC(survival_model(child_mask))
M_child.sample(100000, 90000)


# In[68]:


Matplot.summary_plot(M_child.odds, custom_labels=cov_labels, 
                     hpd=False, vline_pos=1)


# In[69]:


Matplot.summary_plot(M_child.__dict__['spline_age']['pred'], 
                     custom_labels=np.linspace(AgeYears[child_mask].min(), AgeYears[child_mask].max(), 10).round(1).astype(str),
                    xlab='Effect size', main='Age effect')


# In[70]:


M_child.odds.summary()


# ### Infant model

# In[71]:


infant_mask = (AgeYears<1)

M_infant = MCMC(survival_model(infant_mask))
M_infant.sample(100000, 90000)


# In[72]:


Matplot.summary_plot(M_infant.odds, custom_labels=cov_labels, hpd=False, vline_pos=1)


# In[73]:


M_infant.odds.summary()