#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)

# Import data from Excel, using `RunID` as an index.

# In[9]:

flu = pd.read_excel("Data/Flu.xlsx", "Flu", index_col='RunID', na_values=['V04.82','v03.81'])
diagnoses = pd.read_excel("Data/Flu.xlsx", "Diagnoses")
complications = pd.read_excel("Data/Flu.xlsx", "Complications")
organisms = pd.read_excel("Data/Flu.xlsx", "Organisms")
runs_year = pd.read_excel("Data/Flu.xlsx", "Runs by Year")
# runs_sex = pd.read_excel("Data/Flu.xlsx", "Runs by Year and Sex")
# runs_race = pd.read_excel("Data/Flu.xlsx", "Runs by Year and Race")
# Average age of patients on ECMO
year_days = pd.read_csv("Data/year_days.csv", index_col='YearEcls')
support = pd.read_excel("Data/Flu.xlsx", "Pre-ECLS Support")
survived_year = pd.read_csv("Data/survived.csv")

# In[10]:

flu.columns

# In[11]:

(year_days/365).plot(legend=False, grid=False)
plt.xlabel('Year'); plt.ylabel('Age (years)')
plt.xlim(1992, 2013);

# Number of patients in the flu dataset

# In[12]:

flu.PatientID.unique().size

# Convert year to integer value

# In[13]:

flu['year'] = flu.YearECLS.astype(int)

# Constrain data to study years

# In[14]:

flu = flu[(flu.YearECLS>=1992) & (flu.YearECLS<=2012)]
flu.PatientID.unique().size

# In[15]:

runs_year = runs_year[(runs_year.YearEcls>=1992) & (runs_year.YearEcls<=2012)]

# Create hours on ventilator variable

# In[16]:

flu['HoursVent'] = flu.HoursECMO + flu.TimeOffToExtubationDateHours.fillna(0)

# ## Complications
# 
# Divide into mechanical, hemorrhagic, renal, cardiovascular, pulmonary, neural (ignore others).

# In[17]:

complications.head()

# In[18]:

complications['complication_type'] = complications.Description.apply(lambda x: x[:x.index(':')])

# In[19]:

complications.complication_type.value_counts()
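# The slice `x[:x.index(':')]` raises a `ValueError` for any `Description` that lacks a colon. A more forgiving sketch (purely illustrative; `complication_prefix` is a throwaway name) splits on the first colon instead and should agree with the column above wherever a colon is present:

# In[ ]:

# Same prefix as the slice above, but rows without a colon keep their full text
# instead of raising an exception.
complication_prefix = complications.Description.str.split(':', n=1).str[0]
complication_prefix.value_counts().head()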
# ## Age Summary
# 
# Convert age to years

# In[20]:

flu['AgeYears'] = flu.AgeDays/365.

# Age distribution of flu data.

# In[21]:

(flu.drop_duplicates('PatientID').AgeYears).hist()

# Create age classes:
# 
# * neonatal: <29 days
# * pediatric: 29 days - 17 years
# * adult: >17 years

# In[22]:

flu['age_class'] = 'pediatric'
flu.age_class[flu.AgeDays<29] = 'neonatal'
flu.age_class[flu.AgeYears>17] = 'adult'

# Here is the proportion in each class:

# In[23]:

flu.age_class.value_counts(True)

# Distribution by race and sex:

# In[24]:

pd.crosstab(flu.Race, flu.Sex).sort_index(by='F', ascending=False)

# ## Diagnoses
# 
# Proportion with flu as `PrimaryDx` (ICD9 codes from 487 up to, but not including, 489) from the diagnoses dataset:

# In[25]:

primary_dx = diagnoses.ICD9Code[diagnoses.PrimaryDiagnosis==1]
primary_flu = primary_dx.apply(lambda x: x.startswith('487') or x.startswith('488'))

# Proportion and number with flu as primary diagnosis

# In[26]:

primary_flu.mean()

# In[27]:

primary_flu.sum()

# The same calculation using the `flu` dataset:

# In[32]:

flu.PrimaryDx = flu.PrimaryDx.astype(float)

# In[33]:

((flu.PrimaryDx >= 487) & (flu.PrimaryDx < 489)).mean()

# In[34]:

((flu.PrimaryDx >= 487) & (flu.PrimaryDx < 489)).sum()

# Here is a plot of the frequencies of secondary diagnoses (frequencies >5):

# In[35]:

secondary_diag_count = diagnoses.ICD9Code[diagnoses.PrimaryDiagnosis==0].value_counts()
secondary_diag_count[secondary_diag_count>5].plot(kind='bar', figsize=(16,6), grid=False)

# 518.83 chronic respiratory failure
# 491* obstructive chronic bronchitis
# 493 asthma

# In[36]:

pulmonary_codes = '518.83', '491', '493'

# In[37]:

diagnoses.ICD9Code.isin(pulmonary_codes).mean()

# 746* congenital heart disease
# 428* congestive heart failure
# 427.5 cardiac arrest

# In[38]:

cardiac_codes = '746', '428', '427.5'

# In[39]:

diagnoses.ICD9Code.isin(cardiac_codes).mean()
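# `isin` matches exact code strings only, so wildcard entries such as `491*` in the notes above are counted only when the bare stem appears in the data. If prefix matching is the intent, a sketch along these lines (illustrative variable names, same `diagnoses` table) would also capture the sub-codes:

# In[ ]:

# Prefix matching: count any code that begins with one of the stems
# (e.g. 491.20 under 491*); the isinstance check guards missing codes.
pulmonary_stems = ('518.83', '491', '493')
cardiac_stems = ('746', '428', '427.5')

print(diagnoses.ICD9Code.apply(lambda c: isinstance(c, str) and c.startswith(pulmonary_stems)).mean())
print(diagnoses.ICD9Code.apply(lambda c: isinstance(c, str) and c.startswith(cardiac_stems)).mean())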
# Primary
# 279.3 Unspecified Immunity Deficiency
# 279.2 Combined Immunity Deficiency
# 229.12 Wiskott Aldrich Syndrome
# 279.11 DiGeorge's Syndrome
# 288.0 Neutropenia
# Secondary
# 042 HIV
# 249* Diabetes Mellitus (DMII)
# 250* Diabetes Mellitus (DM1)
# Transplant Organ
# 996.8* Complications of transplanted organ
# V42 Organ replacement/HSCT
# Malignancy
# 200.22 Burkitt's tumor
# 201.5* Hodgkin's disease
# 208* Leukemia
# 202.8 lymphoma
# 204.00 acute lymphoid lymphoma
# 205.0 myeloid leukemia
# 155* neoplasm of liver
# 158* neoplasm of retroperitoneum
# 163.9 neoplasm of pleura
# 170.6 malignant neoplasm of pelvic bones and coccyx
# 171.4 neoplasm of connective and other soft tissue of thorax
# 189* neoplasm of kidney, ureter etc.
# 191.9 malignant neoplasm of brain
# 194* malignant neoplasm of glands
# 195* malignant neoplasm of different body parts

# In[40]:

immuno_codes = ('279.3', '279.2', '229.12', '279.11', '288.0', '042', '249', '250', '996.8', 'V42',
                '200.22', '201.5', '208', '202.8', '204.00', '205.0', '155', '158', '163.9',
                '170.6', '171.4', '189', '191.9', '194', '195')

# In[41]:

diagnoses.ICD9Code.isin(immuno_codes).mean()

# In[42]:

s_aureus_codes = ["038.11","038.12","041.11","041.12","482.41","482.42","V02.53","V02.54","V12.0"]

# In[43]:

diagnoses.ICD9Code.isin(s_aureus_codes).sum()

# In[44]:

s_aureus_diagnoses = diagnoses[diagnoses.ICD9Code.isin(s_aureus_codes)]

# ## Organisms
# 
# Import organism lookup table

# In[45]:

organism_type = pd.read_csv("Data/organisms.csv")

# Merge organism information in a single table

# In[46]:

organisms = organisms.merge(organism_type)
organisms.head()

# Number of unique runs in organisms table

# In[47]:

organisms.RunID.unique().size

# Frequencies of organism counts per run

# In[48]:

organisms.groupby('RunID').OrganismNo.count().value_counts()

# Count of organisms by run

# In[49]:

counts = organisms.groupby('RunID').OrganismNo.count()

# Frequencies of organism occurrence (for counts >10)

# In[50]:

organism_counts = organisms.groupby('OrganismName').RunID.count()
organism_counts.sort(ascending=False)
organism_counts[organism_counts>10].plot(kind='bar', figsize=(18,4), grid=False)

# Rates of organism occurrence (do not sum to one due to multiple coinfections):
type_counts = organisms.groupby('Type').RunID.count()
type_counts.sort(ascending=False)
(type_counts.astype(float)/organisms.RunID.unique().size).round(2)
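# Because `type_counts` tallies organism records, a run that grew two different bacteria contributes twice to the bacterial numerator. If the quantity of interest is instead the share of runs with at least one organism of each type, a sketch on the same `organisms` table (`runs_per_type` is an illustrative name):

# In[ ]:

# Count each run at most once per organism type, so the proportions reflect
# "fraction of runs with at least one organism of this type".
runs_per_type = organisms.drop_duplicates(['RunID', 'Type']).groupby('Type').RunID.count()
(runs_per_type.astype(float) / organisms.RunID.unique().size).round(2)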
# In[51]:

ax1 = flu.groupby('year')['AgeYears'].median().plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
#ax1.set_xticks(range(1992, 2012))
#ax2 = ax1.twinx()
ax1.plot(year_days.values/10.)
plt.ylim(0,100)

# ## ECMO
# 
# Distribution of hours on ECMO, by year

# In[52]:

flu.groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
plt.ylabel('count')
# flu[flu.Sex=='F'].groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(
#     kind='bar', stacked=True, figsize=(10,6), grid=False)
# flu[flu.Sex=='M'].groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(
#     kind='bar', stacked=True, figsize=(10,6), grid=False)

# Same plot as above, except on log scale.

# In[53]:

flu.groupby(['year', 'age_class'])['PatientID'].count().unstack().apply(np.log).plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
plt.ylabel('log(count)')

# ECMO incidence, by year and sex:

# In[54]:

flu.groupby(['year','Sex']).RunNo.count()

# ## Abstract Tables
# 
# ### Table 1 summaries
# 
# **Demographics of patients with influenza on ECMO**

# In[59]:

from scipy.stats import beta
from scipy.stats import norm

def binomial_hpdr(x, pct=0.95, a=1, b=1, n_pbins=1e3, roundto=2):
    """
    Function computes the posterior mode along with the upper and lower bounds of the
    **Highest Posterior Density Region**.

    Parameters
    ----------
    x: data (boolean)
    pct: the size of the confidence interval (between 0 and 1)
    a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
    b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
    n_pbins: the number of bins to segment the p_range into (Default=1e3)

    Returns
    -------
    A tuple that contains the mode as well as the lower and upper bounds of the interval
    (mode, lower, upper)
    """
    n, N = sum(x), len(x)
    # fixed random variable object for posterior Beta distribution
    rv = beta(n+a, N-n+b)
    # determine the mode and standard deviation of the posterior
    stdev = rv.stats('v')**0.5
    mode = (n+a-1.)/(N+a+b-2.)
    # compute the number of sigma that corresponds to this confidence
    # this is used to set the rough range of possible success probabilities
    n_sigma = np.ceil(norm.ppf( (1+pct)/2. ))+1
    # set the min and max values for success probability
    max_p = mode + n_sigma * stdev
    if max_p > 1:
        max_p = 1.
    min_p = mode - n_sigma * stdev
    if min_p < 0:
        min_p = 0.
    # make the range of success probabilities
    p_range = np.linspace(min_p, max_p, n_pbins+1)
    # construct the probability mass function over the given range
    if mode > 0.5:
        sf = rv.sf(p_range)
        pmf = sf[:-1] - sf[1:]
    else:
        cdf = rv.cdf(p_range)
        pmf = cdf[1:] - cdf[:-1]
    # find the upper and lower bounds of the interval
    sorted_idxs = np.argsort( pmf )[::-1]
    cumsum = np.cumsum( np.sort(pmf)[::-1] )
    j = np.argmin( np.abs(cumsum - pct) )
    upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
    lower = p_range[ (sorted_idxs[:j+1]).min() ]
    return [np.round(v, roundto) for v in (mode, lower, upper)]

# Code for calculating lognormal posterior intervals

# In[60]:

from scipy.stats import invgamma
from numpy.random import normal

def lognorm_interval(y, pct=0.95, draws=10000, roundto=2, logm=True):
    x = np.log(y + 0.001)

    # Calculate sufficient statistics
    mu_hat = x.mean()
    s2_hat = x.var()
    nu = len(x) - 1
    beta = nu * s2_hat / 2.

    s2_sim = invgamma.rvs(nu/2., scale=beta, size=draws)
    mu_sim = normal(mu_hat, np.sqrt(s2_sim)/2.)

    m = mu_sim + 0.5*s2_sim
    if not logm:
        m = np.exp(m)
    m.sort()

    c = int(draws*(1-pct)/2.)
    return([np.round(v, roundto) for v in (m[int(draws/2)], m[c], m[-c])])
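# A quick sanity check of both interval functions on simulated data (all numbers below are made up for illustration; nothing is taken from the registry):

# In[ ]:

np.random.seed(42)

# ~40% success rate: the reported mode should land near 0.40.
fake_successes = np.random.rand(100) < 0.4
print(binomial_hpdr(fake_successes))

# Skewed durations: the interval is reported on the log scale by default (logm=True).
fake_durations = np.random.lognormal(mean=1.0, sigma=0.8, size=200)
print(lognorm_interval(fake_durations, draws=5000))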
# Merge flu dataset with organism dataset for summarization by infection type

# In[61]:

flu_organism = flu.merge(organisms, left_index=True, right_on='RunID', how='left')

# In[62]:

flu_organism.groupby('PatientID')['OrganismName'].count().hist(bins=20)

# In[63]:

flu_organism.head()

# Number with flu by organism name.

# In[64]:

has_flu_organism = flu_organism[flu_organism.OrganismName.str.startswith('Influenza')==True].drop_duplicates('PatientID')
len(has_flu_organism)

# In[65]:

flu_organism['has_flu'] = (flu_organism.OrganismName.str.startswith('Influenza')==True).replace(
    {True: 'ECMO Flu', False: 'ECMO No Flu'})

# In[66]:

year_days.columns = ['All Runs']

# In[67]:

axes = flu_organism.groupby(['YearECLS', 'has_flu'])['AgeYears'].mean().unstack().plot()
(year_days/365).plot(ax=axes, legend=False, grid=False)
plt.xlabel('Year'); plt.ylabel('Age (years)')
plt.xlim(1992, 2013);

# In[68]:

axes = flu_organism.groupby(['YearECLS', 'has_flu'])['PatientID'].count().unstack().plot()
plt.xlabel('Year');
plt.xlim(1996, 2013);

# Merge with diagnoses dataset for ICD9 lookup.

# In[69]:

flu_ICD9 = flu.merge(diagnoses, left_index=True, right_on='RunID', how='left')

# Number with flu by ICD9

# In[70]:

has_flu_ICD9 = flu_ICD9[flu_ICD9.ICD9Code.apply(
    lambda x: x.startswith('487') or x.startswith('488'))].drop_duplicates('PatientID')
len(has_flu_ICD9)

# This is the *union* of the patients with flu via ICD9 and those with flu via organism. I think this is what we need.

# In[71]:

len(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID)))

# Extract runs from patients with flu by either ICD9 or organism

# In[72]:

flu_organism_runs = flu_organism[flu_organism.PatientID.isin(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID))))]

# In[73]:

flu_ICD9_runs = flu_ICD9[flu_ICD9.PatientID.isin(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID))))]

# This is a smaller number than is in the `Flu` table.

# In[74]:

len(flu)

# Total number of runs for people with influenza on ECMO

# In[75]:

flu_runs_organisms = organisms.RunID[organisms.OrganismName.str.startswith('Influenza')==True]
flu_runs_organisms.unique().shape

# In[76]:

flu_runs_ICD9 = diagnoses.RunID[diagnoses.ICD9Code.apply(
    lambda x: x.startswith('487') or x.startswith('488'))]
flu_runs_ICD9.unique().shape

# In[77]:

# This is the union of the RunIDs from both sets
len(set(flu_runs_organisms).union(set(flu_runs_ICD9)))

# Proportion of runs with influenza

# In[78]:

# From organism list
float(len(flu_runs_organisms.unique())) / runs_year.CNT.sum()

# In[79]:

# From ICD9 list
float(len(flu_runs_ICD9.unique())) / runs_year.CNT.sum()

# Proportion female before and during 2009

# In[80]:

flu['female'] = flu.Sex=='F'
x_2009 = flu.drop_duplicates('PatientID')[flu.YearECLS==2009]['female'].sum()
n_2009 = flu.drop_duplicates('PatientID')[flu.YearECLS==2009]['female'].count()
print('Proportion female in 2009: {0:.3f}'.format(float(x_2009)/n_2009))

# In[81]:

flu['pre_2009'] = flu['YearECLS'] < 2009
x_pre_2009 = flu.drop_duplicates('PatientID')[flu.pre_2009]['female'].sum()
n_pre_2009 = flu.drop_duplicates('PatientID')[flu.pre_2009]['female'].count()
print('Proportion female before 2009: {0:.3f}'.format(float(x_pre_2009)/n_pre_2009))

# In[82]:

from scipy import stats

def ztest(x, n):
    n = np.array(n, float)
    pbar = sum(x)/sum(n)
    z = np.diff(x/n) / np.sqrt(pbar * (1. - pbar) * sum(1./n))
    return {'z': z, 'p': 1. - stats.norm.cdf(np.abs(z))}

# In[83]:

ztest([x_2009, x_pre_2009], [n_2009, n_pre_2009])
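# Note that the `p` returned by `ztest` is one-sided (1 - Phi(|z|)); doubling it gives the usual two-sided value. A quick check on made-up counts (not registry data):

# In[ ]:

# 45/100 vs. 30/100 successes; z should come out near -2.2.
result = ztest([45, 30], [100, 100])
print(result)
print('two-sided p:', 2 * result['p'])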
# Proportion of females with flu.

# In[84]:

with_flu = flu[flu.PatientID.isin(pd.Series(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID)))))]

# In[85]:

(with_flu.Sex=='F').mean()

# In[86]:

binomial_hpdr((with_flu.Sex=='F'))

# Age statistics

# In[87]:

with_flu.AgeYears.describe()

# Mortality of patients with the diagnosis of influenza

# In[88]:

with_flu_survivors = flu.drop_duplicates('PatientID').DischargedAlive==True
with_flu_survivors.mean()

# In[89]:

binomial_hpdr(with_flu_survivors)

# Median hours on ECMO for influenza

# In[90]:

# Estimate of interval on log scale, due to skew in distribution
lognorm_interval(with_flu.HoursECMO/24., draws=100000)

# In[91]:

# Age distribution
flu.drop_duplicates('PatientID')['AgeYears'].describe()

# In[92]:

# Race distribution
race_counts = flu.drop_duplicates('PatientID')['Race'].value_counts()
race_counts/float(race_counts.sum())

# In[93]:

binomial_hpdr(flu.drop_duplicates('PatientID')['Race']=='W')

# In[94]:

# Survival
flu.drop_duplicates('PatientID')['DischargedAlive'].mean()

# In[95]:

# Sex
sex_counts = flu.drop_duplicates('PatientID')['Sex'].value_counts()
sex_counts/float(sex_counts.sum())

# Complications

# In[96]:

complications_with_flu = with_flu.merge(complications, left_index=True, right_on='RunID', how='left')

# In[97]:

complications_with_flu.complication_type.value_counts()

# In[98]:

def complication_intervals(dataset):
    complications_merged = dataset.merge(complications, left_index=True, right_on='RunID', how='left')
    for kind in ('Neurologic', 'Renal', 'Hemorrhagic', 'Cardiovascular', 'Pulmonary'):
        x = complications_merged[complications_merged.complication_type==kind].PatientID.drop_duplicates().count()
        n = dataset.PatientID.drop_duplicates().count()
        m, lo, up = binomial_hpdr([1]*x + [0]*(n-x))
        print('{0}:\n\t {1} ({2}, {3})\n'.format(kind, m, lo, up))

# In[99]:

complication_intervals(with_flu)

# Days on ventilator

# In[100]:

(with_flu.HoursVent.dropna()/24).hist(bins=20)

# In[101]:

# Interval on log scale
lognorm_interval(with_flu.HoursVent.dropna()/24)

# **Flu only**
# 
# Patients with influenza ONLY on ECMO = patients between 1992 and 2012 with EITHER an ICD9 code of flu AND/OR an organism of flu, but NO additional organism codes for other bacteria or viruses.
# 
# We have established above that all the entries in the `Flu` table have influenza. We only have to filter out co-infection.
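# One way to express the "no additional organism codes" condition directly is to flag each patient whose cultures include anything other than influenza (a sketch on the `flu_organism` merge from above; `has_other_organism` is an illustrative name):

# In[ ]:

# True for patients with at least one non-influenza organism recorded.
non_flu = (flu_organism.OrganismName.notnull()
           & (flu_organism.OrganismName.str.startswith('Influenza') == False))
has_other_organism = non_flu.groupby(flu_organism.PatientID).any()
has_other_organism.value_counts()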
# Number of infections by `PatientID`

# In[102]:

infection_count = flu_organism.groupby('PatientID')['PatientID'].count()

# In[103]:

infection_count.count()

# In[104]:

infection_count.hist()

# In[105]:

organisms[organisms.OrganismName=='Staphylococcus aureus, meth resist'].drop_duplicates('RunID').shape

# In[106]:

single_infection = flu_organism.merge(pd.DataFrame(infection_count[infection_count==1]), left_on='PatientID', right_index=True)

# In[107]:

single_infection_ICD9 = flu_ICD9.merge(pd.DataFrame(infection_count[infection_count==1]), left_on='PatientID', right_index=True)

# In[108]:

set(single_infection.PatientID.drop_duplicates()).difference(set(single_infection_ICD9.PatientID.drop_duplicates()))

# In[109]:

len(set(single_infection.PatientID.drop_duplicates()))

# In[110]:

len(set(single_infection_ICD9.PatientID.drop_duplicates()))

# In[111]:

single_infection.drop_duplicates('PatientID').PatientID.count()

# In[112]:

single_infection_ICD9.drop_duplicates('PatientID').PatientID.count()

# In[113]:

influenza_only = single_infection[single_infection.OrganismName.str.startswith('Influenza')==True]

# In[114]:

influenza_only['AgeYears'].describe()

# In[115]:

race_counts_flu = influenza_only['Race'].value_counts()
race_counts_flu/float(race_counts_flu.sum())

# In[116]:

binomial_hpdr(influenza_only.Race.dropna()=='W')

# Posterior interval for days on ECMO

# In[117]:

lognorm_interval(influenza_only.HoursECMO/24., draws=100000)

# In[118]:

flu_only_survivors = influenza_only['DischargedAlive']
flu_only_survivors.mean()

# In[119]:

binomial_hpdr(flu_only_survivors)

# In[120]:

sex_counts_flu = influenza_only['Sex'].value_counts()
sex_counts_flu/float(sex_counts_flu.sum())

# In[121]:

binomial_hpdr((influenza_only.Sex=='F'))

# In[122]:

# Index out patients with flu only
complication_intervals(with_flu[with_flu.PatientID.isin(influenza_only.PatientID)])

# Days on ventilator

# In[123]:

# Interval on log scale
lognorm_interval(influenza_only.HoursVent.dropna()/24)
# **Subset with co-infection**
# 
# Patients with influenza and co-infection on ECMO = patients between 1992 and 2012 with EITHER an ICD9 code of flu AND/OR an organism of flu, AND an organism code of something else.
# 
# First, identify individuals with multiple infections based on organism code.

# In[124]:

multiple_infection = flu_organism.merge(pd.DataFrame(infection_count[infection_count>1]), left_on='PatientID', right_index=True)

# Merge with the subset that has flu by ICD9

# In[125]:

multiple_infection_ICD9 = flu_ICD9.merge(pd.DataFrame(infection_count[infection_count>1]), left_on='PatientID', right_index=True)

# No patients in one set that aren't in the other

# In[126]:

set(multiple_infection.PatientID.drop_duplicates()).difference(set(multiple_infection_ICD9.PatientID.drop_duplicates()))

# In[127]:

flu_coinfection = multiple_infection.drop_duplicates('PatientID')

# Organism numbers for bacterial infections

# In[128]:

bacterial_numbers = [1,2,9,11,12,13,14,15,16,19,30,31,32,35,36,37,38,
                     39,40,48,52,54,55,58,59,60,61,67,68,69,71,77,80,84,85,86,91,95,104]

# In[129]:

bacterial_coinf = flu_organism[flu_organism.OrganismNo.isin(bacterial_numbers)].merge(
    pd.DataFrame(infection_count[infection_count>1]), left_on='PatientID', right_index=True)

# Number with bacterial coinfection via organism code

# In[130]:

with_bacterial_coinf = bacterial_coinf.drop_duplicates(cols=['PatientID'])
with_bacterial_coinf.shape

# In[131]:

bacterial_ICD9 = ['033.9 ','033 ','036.2 ','038.0 ','038.1','038.11','038.12','038.19',
                  '038.2','038.3','038.4','038.41','038.42','03.43','038.44','038.49',
                  '040.82','040.89','081','100.9','481']

# In[132]:

with_bacterial_ICD9 = flu_ICD9[flu_ICD9.ICD9Code.apply(
    lambda x: x.startswith('041') or x.startswith('320') or x.startswith('482') or bool(bacterial_ICD9.count(x)))]

# Number with bacterial coinfection via ICD9

# In[133]:

with_bacterial_ICD9.PatientID.drop_duplicates().shape

# Comparison with flu-only:

# In[134]:

influenza_only['AgeYears'].hist(alpha=0.3, grid=False)
with_bacterial_coinf['AgeYears'].hist(alpha=0.3, grid=False)

# In[135]:

print('u={0:.0f}, p={1:.3f}'.format(*stats.mannwhitneyu(influenza_only['AgeYears'].dropna().values,
                                                        with_bacterial_coinf['AgeYears'].dropna().values)))

# In[136]:

with_bacterial_coinf['AgeYears'].describe()

# Survival

# In[137]:

flu_bacterial_survivors = with_bacterial_coinf['DischargedAlive']
flu_bacterial_survivors.mean()

# In[138]:

binomial_hpdr(flu_bacterial_survivors)

# Days on ECMO

# In[139]:

lognorm_interval(with_bacterial_coinf.HoursECMO/24., draws=100000)

# In[140]:

race_counts_bacterial = with_bacterial_coinf['Race'].value_counts()
race_counts_bacterial/float(race_counts_bacterial.sum())

# Posterior interval for proportion white

# In[141]:

binomial_hpdr(with_bacterial_coinf.Race=='W')

# Proportion female

# In[142]:

binomial_hpdr((with_bacterial_coinf.Sex=='F'))

# Complications

# In[143]:

complication_intervals(with_flu[with_flu.PatientID.isin(with_bacterial_coinf.PatientID)])

# In[144]:

# Interval on log scale
lognorm_interval(with_bacterial_coinf.HoursVent.dropna()/24)
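# For reference, the survival proportions of the influenza-only and bacterial-coinfection groups can be compared with the `ztest` helper defined earlier (a sketch reusing `flu_only_survivors` and `flu_bacterial_survivors` from the cells above):

# In[ ]:

# Two-proportion comparison of survival to discharge between the two groups.
ztest([flu_only_survivors.sum(), flu_bacterial_survivors.sum()],
      [flu_only_survivors.count(), flu_bacterial_survivors.count()])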
# ### Table 2 summaries
# 
# > race, age, gender
# > pre-ECMO lab values (blood gas: pH, pCO2, HCO3, Oxygen Index)
# > type of ECMO, inotrope/pressor support, nitric, CVVH, oscillator
# > type of co-infection

# Race distribution

# In[145]:

flu_organism['fate'] = flu_organism.DischargedAlive.replace({1: 'Survived', 0: 'Died'})
by_fate = flu_organism.drop_duplicates('PatientID').groupby("fate")

# In[146]:

by_fate['Race'].apply(lambda x: (x.value_counts()/float(x.value_counts().sum())).round(3))

# In[147]:

by_fate['Race'].value_counts().plot(kind='bar')

# Age distribution

# In[148]:

by_fate['AgeYears'].describe()

# Age distribution is not normally distributed.

# In[149]:

age_dist = by_fate['AgeYears']
age_dist.hist(alpha=0.3, grid=False)

# In[150]:

_ = by_fate.boxplot(column='AgeYears')

# Mann-Whitney U-test (non-parametric)

# In[151]:

age_data = list(age_dist)
print('u={0:.0f}, p={1:.3f}'.format(*stats.mannwhitneyu(age_data[0][1].dropna().values,
                                                        age_data[1][1].dropna().values)))

# Sex proportions

# In[152]:

by_fate['Sex'].apply(lambda x: x.value_counts()/float(x.value_counts().sum()))

# In[153]:

sex_dist = by_fate['Sex'].apply(lambda x: x.value_counts())
ztest(sex_dist.unstack()['M'].values, sex_dist.unstack().sum(1))

# Gases

# In[154]:

gases = ['pH', 'PCO2', 'HCO3', 'PO2', 'SaO2']
by_fate.describe()[gases]

# In[155]:

for g in gases:
    by_fate.boxplot(column=g)

# In[156]:

for i in gases:
    data = list(flu[i].groupby(flu['DischargedAlive']))
    print('{0}: u={1:.0f}, p={2:.3f}'.format(i, *stats.mannwhitneyu(data[0][1].dropna().values,
                                                                    data[1][1].dropna().values)))

# Prevalence of each type of coinfection:
# 
# *Bacterial*

# In[157]:

bacterial = flu_organism[flu_organism.Type=='bacterial'].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
bacterial

# In[158]:

bacterial_prop = bacterial/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
bacterial_prop

# In[159]:

ztest(bacterial, bacterial / bacterial_prop)

# *Fungal*

# In[160]:

fungal = flu_organism[flu_organism.Type=='fungal'].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
fungal

# In[161]:

fungal_prop = fungal/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
fungal_prop

# In[162]:

ztest(fungal, fungal / fungal_prop)

# *Viral*

# In[163]:

viral = flu_organism[(flu_organism.Type=='viral') &
                     (flu_organism.OrganismName.str.startswith('Influenza')==False)].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
viral

# In[164]:

viral_prop = viral/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
viral_prop

# In[165]:

ztest(viral, viral / viral_prop)

# Type of ECMO

# In[166]:

ecmo_type = (flu.Mode=='VA').replace({True:'VA', False:'VV'})
ecmo_type.value_counts() / float(ecmo_type.notnull().sum())

# In[167]:

flu['ecmo_type'] = (flu.Mode=='VA').replace({True:'VA', False:'VV'})
flu.ecmo_type.groupby(flu['DischargedAlive']).apply(lambda x: x.value_counts()/float(x.notnull().sum()))

# Merge Flu with Pre-ECLS Support

# In[168]:

flu_support = flu.merge(support, left_index=True, right_on='RunID', how='left')

# In[169]:

support.Description.value_counts()

# In[170]:

n = float(flu_support.PatientID.drop_duplicates().count())
n

# In[171]:

n_alive = flu_support.drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count().astype(float)
n_alive

# Nitric oxide

# In[172]:

flu_support.PatientID[flu_support.Description=='Nitric oxide'].drop_duplicates().count()/n

# In[173]:

nitric_oxide = flu_support[flu_support.Description=='Nitric oxide'].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
nitric_oxide/n_alive

# CVVH

# In[174]:

flu_support.PatientID[flu_support.Description=='CVVH'].drop_duplicates().count()/n

# In[175]:

cvvh = flu_support[flu_support.Description=='CVVH'].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
cvvh/n_alive
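# The nitric oxide and CVVH cells above, and the ventilation and pressor cells that follow, repeat the same calculation. A small helper (a sketch reusing `flu_support`, `n`, and `n_alive` from above; `support_proportions` is an illustrative name) computes both proportions for any list of support descriptions:

# In[ ]:

def support_proportions(descriptions):
    # Patients with any of the given pre-ECLS supports: overall proportion,
    # then split by survival to discharge.
    subset = flu_support[flu_support.Description.isin(descriptions)].drop_duplicates('PatientID')
    overall = subset.PatientID.count() / n
    by_outcome = subset.groupby('DischargedAlive')['PatientID'].count() / n_alive
    return overall, by_outcome

support_proportions(['Nitric oxide'])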
# High frequency ventilation/oscillation

# In[176]:

flu_support.PatientID[flu_support.Description=='High frequency ventilation/oscillation'].drop_duplicates().count()/n

# In[177]:

hfvo = flu_support[flu_support.Description=='High frequency ventilation/oscillation'].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
hfvo/n_alive

# Inotrope/pressor support

# In[178]:

pressor_set = ['Dobutamine', 'Vasopressor/inotropic drugs', 'Norepinephrine', 'Epinephrine', 'Dopamine', 'Milrinone']
ionotrope_pressor = flu_support[flu_support.Description.isin(pressor_set)].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
ionotrope_pressor/n_alive

# ## Table 1 summaries
# 
# **All ECMO**

# In[179]:

# Age
fig, axes = plt.subplots(ncols=2, figsize=(14,4))
flu.AgeDays.hist(ax=axes[0])
axes[1].boxplot(flu.AgeDays.values);

# In[180]:

flu.AgeDays.describe()

# In[181]:

# Race
flu.Race.value_counts()

# In[182]:

flu.Race.value_counts()/flu.Race.value_counts().sum()

# In[183]:

# Gender
flu.Sex.value_counts()

# In[184]:

flu.Sex.value_counts()/flu.Sex.value_counts().sum()

# In[185]:

# Death
flu.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[186]:

(flu.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu.DischargedAlive.value_counts().sum())

# TODO: Break down by Mode

# In[187]:

# ECMO Type
flu.Mode.unique()

# In[188]:

# ECMO Type
flu['VA'] = flu.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu.VA[flu.Mode=='Other'] = np.nan

# This is the proportion that is VA

# In[189]:

flu.VA.mean()

# In[190]:

# Support type
flu.SupportType.value_counts()

# Proportion CPR

# In[191]:

(flu.SupportType==3).mean()

# In[192]:

# Complications
ecmo_complications = flu_organism.merge(complications, on='RunID')
ecmo_complications.complication_type.value_counts()

# **Influenza only**

# In[193]:

# Age
fig, axes = plt.subplots(ncols=2, figsize=(14,4))
influenza_only.AgeDays.hist(ax=axes[0])
axes[1].boxplot(influenza_only.AgeDays.values);

# In[194]:

influenza_only.AgeDays.describe()

# In[195]:

# Race
influenza_only.Race.value_counts()

# In[196]:

influenza_only.Race.value_counts()/influenza_only.Race.value_counts().sum()

# In[197]:

# Gender
influenza_only.Sex.value_counts()

# In[198]:

influenza_only.Sex.value_counts()/influenza_only.Sex.value_counts().sum()

# In[199]:

# Death
influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[200]:

(influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /influenza_only.DischargedAlive.value_counts().sum())

# In[201]:

# ECMO Type
influenza_only.Mode.unique()

# In[202]:

influenza_only['VA'] = influenza_only.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
influenza_only.VA[influenza_only.Mode=='Other'] = np.nan

# This is the proportion that is VA

# In[203]:

influenza_only.VA.mean()

# In[204]:

# Support type
influenza_only.SupportType.value_counts()

# In[205]:

(influenza_only.SupportType==3).mean()

# In[206]:

# Complications
influenza_complications = influenza_only.merge(complications, on='RunID')
influenza_complications.complication_type.value_counts()

# ## Table 3 summaries
# 
# Severity of illness in patients with influenza alone on ECMO versus patients with bacterial co-infection

# In[207]:

def normal_post(y, pct=0.95, mu_0=0., var_0=1e6, draws=1000, roundto=2):
    s2 = y.var()
    ybar = y.mean()
    n = len(y)
    mu = (n*ybar/s2 + mu_0/var_0)/(n*1./s2 + 1./var_0)
    sigma = np.sqrt(1./(n/s2 + 1./var_0))
    y_post = np.sort(np.random.normal(mu, sigma, size=draws))
    c = int(draws*(1-pct)/2.)
    return([np.round(v, roundto) for v in (mu, y_post[c], y_post[-c])])
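# A quick illustration of `normal_post` on simulated data (the population values below are invented for the example): with 200 draws from a Normal(5, 2) population, the posterior mean should land near 5 with a narrow credible interval for the mean.

# In[ ]:

np.random.seed(1)
fake_sample = np.random.normal(loc=5., scale=2., size=200)
normal_post(fake_sample)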
# Days on ECMO

# In[208]:

flu_hours_ecmo = (flu.drop_duplicates(cols=['PatientID']).HoursECMO/24.)
flu_hours_ecmo.describe()

# In[209]:

np.log(flu_hours_ecmo+0.01).hist(bins=25)

# In[210]:

flu_days_ECMO = with_flu.HoursECMO/24.
flu_days_ECMO.describe()

# In[211]:

flu_days_ECMO.hist(bins=25)

# In[212]:

(influenza_only.HoursECMO/24.).describe()

# In[213]:

(influenza_only.HoursECMO/24.).hist(bins=20)

# In[214]:

lognorm_interval(influenza_only.HoursECMO/24., draws=100000)

# In[215]:

multiple_infection['VA'] = multiple_infection.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
multiple_infection.VA[multiple_infection.Mode=='Other'] = np.nan

# In[216]:

has_flu = multiple_infection[multiple_infection.OrganismName.str.startswith('Influenza')==True].drop_duplicates('PatientID')
has_mrsa = multiple_infection[multiple_infection.OrganismName=='Staphylococcus aureus, meth resist'].drop_duplicates('PatientID')
has_mssa = multiple_infection[multiple_infection.OrganismName=='Staphylococcus aureus'].drop_duplicates('PatientID')

# In[217]:

flu_with_mrsa = has_flu[has_flu.PatientID.isin(has_mrsa.PatientID)]

# In[218]:

(flu_with_mrsa.HoursECMO/24.).describe()

# In[219]:

flu_with_mssa = has_flu[has_flu.PatientID.isin(has_mssa.PatientID)]

# In[220]:

(flu_with_mssa.HoursECMO/24.).describe()

# Proportion VA (vs. VV)

# In[221]:

influenza_only.VA.mean()

# In[222]:

flu_coinfection['VA'] = flu_coinfection.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_coinfection.loc[flu_coinfection.Mode=='Other', 'VA'] = np.nan

# In[223]:

flu_coinfection.VA.mean()

# In[224]:

flu_with_mrsa.VA.mean()

# In[225]:

flu_with_mssa.VA.mean()

# Death

# In[226]:

influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[227]:

(influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /influenza_only.DischargedAlive.value_counts().sum())

# In[228]:

flu_coinfection.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[229]:

(flu_coinfection.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu_coinfection.DischargedAlive.value_counts().sum())

# In[230]:

flu_with_mrsa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[231]:

(flu_with_mrsa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu_with_mrsa.DischargedAlive.value_counts().sum())

# In[232]:

flu_with_mssa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[233]:

(flu_with_mssa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu_with_mssa.DischargedAlive.value_counts().sum())

# ## Enumerate complications

# In[234]:

complications.complication_type.value_counts()

# In[235]:

complication_list = ['Mechanical', 'Hemorrhagic', 'Renal', 'Cardiovascular', 'Pulmonary']

# In[236]:

has_complication = pd.DataFrame(complications.groupby('RunID').apply(
    lambda x: x['complication_type'].isin(complication_list).any()))

# In[237]:

has_complication.columns = ['complication']

# In[238]:

flu_with_comp = pd.merge(flu_organism, has_complication, left_on='RunID', right_index=True, how='left')

# In[239]:

flu_with_comp.shape

# In[240]:

flu_with_comp.PatientID.unique().shape

# In[241]:

flu_with_comp.complication.mean()

# In[242]:

flu_organism.RunID.isin(has_complication[has_complication.complication].index).mean()

# In[243]:

flu_organism['has_complication'] = flu_organism.RunID.isin(has_complication[has_complication.complication].index)

# In[250]:

flu_organism = flu_organism.drop('has_complication', axis=1)
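# With the complication flag merged in, the rate of major complications can be split by outcome (a sketch; it assumes the `fate` label added to `flu_organism` earlier carries through the merge into `flu_with_comp`):

# In[ ]:

# Proportion of runs with at least one major complication, by survival status.
flu_with_comp.drop_duplicates('RunID').groupby('fate')['complication'].mean()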
# Merge with S. aureus diagnoses from ICD9

# In[251]:

flu_organism_icd9 = pd.merge(flu_organism, s_aureus_diagnoses, on='RunID', how='left')

# In[252]:

flu_organism_icd9['s_aureus_icd9'] = flu_organism_icd9.ICD9Code.notnull()

# In[253]:

flu_organism_icd9 = flu_organism_icd9.drop(['ICD9Code', 'PrimaryDiagnosis'], axis=1)

# In[254]:

flu_organism.PatientID.unique().shape

# In[255]:

flu_organism_icd9.head()

# ## Export data

# In[256]:

flu_organism_icd9.to_csv("data/flu_organism.csv")

# In[257]:

flu_with_comp.to_csv("data/flu.csv")

# In[258]:

flu_with_comp.head()
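# A simple round-trip check of the exported files (a sketch; it just re-reads the CSVs written above and compares shapes against the in-memory frames):

# In[ ]:

print(pd.read_csv("data/flu_organism.csv", index_col=0).shape, flu_organism_icd9.shape)
print(pd.read_csv("data/flu.csv", index_col=0).shape, flu_with_comp.shape)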