#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)

# Import data from Excel, using `RunID` as an index.

# In[9]:

flu = pd.read_excel("Data/Flu.xlsx", "Flu", index_col='RunID', na_values=['V04.82','v03.81'])
diagnoses = pd.read_excel("Data/Flu.xlsx", "Diagnoses")
complications = pd.read_excel("Data/Flu.xlsx", "Complications")
organisms = pd.read_excel("Data/Flu.xlsx", "Organisms")
runs_year = pd.read_excel("Data/Flu.xlsx", "Runs by Year")
# runs_sex = pd.read_excel("Data/Flu.xlsx", "Runs by Year and Sex")
# runs_race = pd.read_excel("Data/Flu.xlsx", "Runs by Year and Race")
# Average age of patients on ECMO
year_days = pd.read_csv("Data/year_days.csv", index_col='YearEcls')
support = pd.read_excel("Data/Flu.xlsx", "Pre-ECLS Support")
survived_year = pd.read_csv("Data/survived.csv")

# In[10]:

flu.columns

# In[11]:

(year_days/365).plot(legend=False, grid=False)
plt.xlabel('Year'); plt.ylabel('Age (years)')
plt.xlim(1992, 2013);

# Number of patients in the flu dataset

# In[12]:

flu.PatientID.unique().size

# Convert year to integer value

# In[13]:

flu['year'] = flu.YearECLS.astype(int)

# Constrain data to study years

# In[14]:

flu = flu[(flu.YearECLS>=1992) & (flu.YearECLS<=2012)]
flu.PatientID.unique().size

# In[15]:

runs_year = runs_year[(runs_year.YearEcls>=1992) & (runs_year.YearEcls<=2012)]

# Create hours on ventilator variable

# In[16]:

flu['HoursVent'] = flu.HoursECMO + flu.TimeOffToExtubationDateHours.fillna(0)

# ## Complications
# 
# Divide into mechanical, hemorrhagic, renal, cardiovascular, pulmonary, neural (ignore others).

# In[17]:

complications.head()

# In[18]:

complications['complication_type'] = complications.Description.apply(lambda x: x[:x.index(':')])

# In[19]:

complications.complication_type.value_counts()
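# The slice `x[:x.index(':')]` raises a `ValueError` for any `Description` that lacks a colon. A more forgiving sketch (purely illustrative; `complication_prefix` is a throwaway name) splits on the first colon instead and should agree with the column above wherever a colon is present:

# In[ ]:

# Same prefix as the slice above, but rows without a colon keep their full text
# instead of raising an exception.
complication_prefix = complications.Description.str.split(':', n=1).str[0]
complication_prefix.value_counts().head()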
# ## Age Summary
# 
# Convert age to years

# In[20]:

flu['AgeYears'] = flu.AgeDays/365.

# Age distribution of flu data.

# In[21]:

(flu.drop_duplicates('PatientID').AgeYears).hist()

# Create age classes:
# 
# * neonatal: <29 days
# * pediatric: 29 days - 17 years
# * adult: >17 years

# In[22]:

flu['age_class'] = 'pediatric'
flu.age_class[flu.AgeDays<29] = 'neonatal'
flu.age_class[flu.AgeYears>17] = 'adult'

# Here is the proportion in each class:

# In[23]:

flu.age_class.value_counts(True)

# Distribution by race and sex:

# In[24]:

pd.crosstab(flu.Race, flu.Sex).sort_index(by='F', ascending=False)

# ## Diagnoses
# 
# Proportion with flu as `PrimaryDx` (ICD9 codes from 487 up to, but not including, 489) from the diagnoses dataset:

# In[25]:

primary_dx = diagnoses.ICD9Code[diagnoses.PrimaryDiagnosis==1]
primary_flu = primary_dx.apply(lambda x: x.startswith('487') or x.startswith('488'))

# Proportion and number with flu as primary diagnosis

# In[26]:

primary_flu.mean()

# In[27]:

primary_flu.sum()

# The same calculation using the `flu` dataset:

# In[32]:

flu.PrimaryDx = flu.PrimaryDx.astype(float)

# In[33]:

((flu.PrimaryDx >= 487) & (flu.PrimaryDx < 489)).mean()

# In[34]:

((flu.PrimaryDx >= 487) & (flu.PrimaryDx < 489)).sum()

# Here is a plot of the frequencies of secondary diagnoses (frequencies >5):

# In[35]:

secondary_diag_count = diagnoses.ICD9Code[diagnoses.PrimaryDiagnosis==0].value_counts()
secondary_diag_count[secondary_diag_count>5].plot(kind='bar', figsize=(16,6), grid=False)

# 518.83 chronic respiratory failure
# 491* obstructive chronic bronchitis
# 493 asthma

# In[36]:

pulmonary_codes = '518.83', '491', '493'

# In[37]:

diagnoses.ICD9Code.isin(pulmonary_codes).mean()

# 746* congenital heart disease
# 428* congestive heart failure
# 427.5 cardiac arrest

# In[38]:

cardiac_codes = '746', '428', '427.5'

# In[39]:

diagnoses.ICD9Code.isin(cardiac_codes).mean()
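# `isin` matches exact code strings only, so wildcard entries such as `491*` in the notes above are counted only when the bare stem appears in the data. If prefix matching is the intent, a sketch along these lines (illustrative variable names, same `diagnoses` table) would also capture the sub-codes:

# In[ ]:

# Prefix matching: count any code that begins with one of the stems
# (e.g. 491.20 under 491*); the isinstance check guards missing codes.
pulmonary_stems = ('518.83', '491', '493')
cardiac_stems = ('746', '428', '427.5')

print(diagnoses.ICD9Code.apply(lambda c: isinstance(c, str) and c.startswith(pulmonary_stems)).mean())
print(diagnoses.ICD9Code.apply(lambda c: isinstance(c, str) and c.startswith(cardiac_stems)).mean())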
# Primary
# 279.3 Unspecified Immunity Deficiency
# 279.2 Combined Immunity Deficiency
# 229.12 Wiskott Aldrich Syndrome
# 279.11 DiGeorge's Syndrome
# 288.0 Neutropenia
# Secondary
# 042 HIV
# 249* Diabetes Mellitus (DMII)
# 250* Diabetes Mellitus (DM1)
# Transplant Organ
# 996.8* Complications of transplanted organ
# V42 Organ replacement/HSCT
# Malignancy
# 200.22 Burkitt's tumor
# 201.5* Hodgkin's disease
# 208* Leukemia
# 202.8 lymphoma
# 204.00 acute lymphoid lymphoma
# 205.0 myeloid leukemia
# 155* neoplasm of liver
# 158* neoplasm of retroperitoneum
# 163.9 neoplasm of pleura
# 170.6 malignant neoplasm of pelvic bones and coccyx
# 171.4 neoplasm of connective and other soft tissue of thorax
# 189* neoplasm of kidney, ureter etc.
# 191.9 malignant neoplasm of brain
# 194* malignant neoplasm of glands
# 195* malignant neoplasm of different body parts

# In[40]:

immuno_codes = ('279.3', '279.2', '229.12', '279.11', '288.0', '042', '249', '250', '996.8', 'V42',
                '200.22', '201.5', '208', '202.8', '204.00', '205.0', '155', '158', '163.9',
                '170.6', '171.4', '189', '191.9', '194', '195')

# In[41]:

diagnoses.ICD9Code.isin(immuno_codes).mean()

# In[42]:

s_aureus_codes = ["038.11","038.12","041.11","041.12","482.41","482.42","V02.53","V02.54","V12.0"]

# In[43]:

diagnoses.ICD9Code.isin(s_aureus_codes).sum()

# In[44]:

s_aureus_diagnoses = diagnoses[diagnoses.ICD9Code.isin(s_aureus_codes)]

# ## Organisms
# 
# Import organism lookup table

# In[45]:

organism_type = pd.read_csv("Data/organisms.csv")

# Merge organism information in a single table

# In[46]:

organisms = organisms.merge(organism_type)
organisms.head()

# Number of unique runs in organisms table

# In[47]:

organisms.RunID.unique().size

# Frequencies of organism counts per run

# In[48]:

organisms.groupby('RunID').OrganismNo.count().value_counts()

# Count of organisms by run

# In[49]:

counts = organisms.groupby('RunID').OrganismNo.count()

# Frequencies of organism occurrence (for counts >10)

# In[50]:

organism_counts = organisms.groupby('OrganismName').RunID.count()
organism_counts.sort(ascending=False)
organism_counts[organism_counts>10].plot(kind='bar', figsize=(18,4), grid=False)

# Rates of organism occurrence (do not sum to one due to multiple coinfections):
type_counts = organisms.groupby('Type').RunID.count()
type_counts.sort(ascending=False)
(type_counts.astype(float)/organisms.RunID.unique().size).round(2)
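# Because `type_counts` tallies organism records, a run that grew two different bacteria contributes twice to the bacterial numerator. If the quantity of interest is instead the share of runs with at least one organism of each type, a sketch on the same `organisms` table (`runs_per_type` is an illustrative name):

# In[ ]:

# Count each run at most once per organism type, so the proportions reflect
# "fraction of runs with at least one organism of this type".
runs_per_type = organisms.drop_duplicates(['RunID', 'Type']).groupby('Type').RunID.count()
(runs_per_type.astype(float) / organisms.RunID.unique().size).round(2)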
# In[51]:

ax1 = flu.groupby('year')['AgeYears'].median().plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
#ax1.set_xticks(range(1992, 2012))
#ax2 = ax1.twinx()
ax1.plot(year_days.values/10.)
plt.ylim(0,100)

# ## ECMO
# 
# Distribution of hours on ECMO, by year

# In[52]:

flu.groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
plt.ylabel('count')
# flu[flu.Sex=='F'].groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(
#     kind='bar', stacked=True, figsize=(10,6), grid=False)
# flu[flu.Sex=='M'].groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(
#     kind='bar', stacked=True, figsize=(10,6), grid=False)

# Same plot as above, except on log scale.

# In[53]:

flu.groupby(['year', 'age_class'])['PatientID'].count().unstack().apply(np.log).plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
plt.ylabel('log(count)')

# ECMO incidence, by year and sex:

# In[54]:

flu.groupby(['year','Sex']).RunNo.count()

# ## Abstract Tables
# 
# ### Table 1 summaries
# 
# **Demographics of patients with influenza on ECMO**

# In[59]:

from scipy.stats import beta
from scipy.stats import norm

def binomial_hpdr(x, pct=0.95, a=1, b=1, n_pbins=1e3, roundto=2):
    """
    Function computes the posterior mode along with the upper and lower bounds of the
    **Highest Posterior Density Region**.

    Parameters
    ----------
    x: data (boolean)
    pct: the size of the confidence interval (between 0 and 1)
    a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
    b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
    n_pbins: the number of bins to segment the p_range into (Default=1e3)

    Returns
    -------
    A tuple that contains the mode as well as the lower and upper bounds of the interval
    (mode, lower, upper)
    """
    n, N = sum(x), len(x)
    # fixed random variable object for posterior Beta distribution
    rv = beta(n+a, N-n+b)
    # determine the mode and standard deviation of the posterior
    stdev = rv.stats('v')**0.5
    mode = (n+a-1.)/(N+a+b-2.)
    # compute the number of sigma that corresponds to this confidence
    # this is used to set the rough range of possible success probabilities
    n_sigma = np.ceil(norm.ppf( (1+pct)/2. ))+1
    # set the min and max values for success probability
    max_p = mode + n_sigma * stdev
    if max_p > 1:
        max_p = 1.
    min_p = mode - n_sigma * stdev
    if min_p < 0:
        min_p = 0.
    # make the range of success probabilities
    p_range = np.linspace(min_p, max_p, n_pbins+1)
    # construct the probability mass function over the given range
    if mode > 0.5:
        sf = rv.sf(p_range)
        pmf = sf[:-1] - sf[1:]
    else:
        cdf = rv.cdf(p_range)
        pmf = cdf[1:] - cdf[:-1]
    # find the upper and lower bounds of the interval
    sorted_idxs = np.argsort( pmf )[::-1]
    cumsum = np.cumsum( np.sort(pmf)[::-1] )
    j = np.argmin( np.abs(cumsum - pct) )
    upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
    lower = p_range[ (sorted_idxs[:j+1]).min() ]
    return [np.round(v, roundto) for v in (mode, lower, upper)]

# Code for calculating lognormal posterior intervals

# In[60]:

from scipy.stats import invgamma
from numpy.random import normal

def lognorm_interval(y, pct=0.95, draws=10000, roundto=2, logm=True):
    x = np.log(y + 0.001)

    # Calculate sufficient statistics
    mu_hat = x.mean()
    s2_hat = x.var()
    nu = len(x) - 1
    beta = nu * s2_hat / 2.

    s2_sim = invgamma.rvs(nu/2., scale=beta, size=draws)
    mu_sim = normal(mu_hat, np.sqrt(s2_sim)/2.)

    m = mu_sim + 0.5*s2_sim
    if not logm:
        m = np.exp(m)
    m.sort()

    c = int(draws*(1-pct)/2.)
    return([np.round(v, roundto) for v in (m[int(draws/2)], m[c], m[-c])])
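# A quick sanity check of both interval functions on simulated data (all numbers below are made up for illustration; nothing is taken from the registry):

# In[ ]:

np.random.seed(42)

# ~40% success rate: the reported mode should land near 0.40.
fake_successes = np.random.rand(100) < 0.4
print(binomial_hpdr(fake_successes))

# Skewed durations: the interval is reported on the log scale by default (logm=True).
fake_durations = np.random.lognormal(mean=1.0, sigma=0.8, size=200)
print(lognorm_interval(fake_durations, draws=5000))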
# Merge flu dataset with organism dataset for summarization by infection type

# In[61]:

flu_organism = flu.merge(organisms, left_index=True, right_on='RunID', how='left')

# In[62]:

flu_organism.groupby('PatientID')['OrganismName'].count().hist(bins=20)

# In[63]:

flu_organism.head()

# Number with flu by organism name.

# In[64]:

has_flu_organism = flu_organism[flu_organism.OrganismName.str.startswith('Influenza')==True].drop_duplicates('PatientID')
len(has_flu_organism)

# In[65]:

flu_organism['has_flu'] = (flu_organism.OrganismName.str.startswith('Influenza')==True).replace(
    {True: 'ECMO Flu', False: 'ECMO No Flu'})

# In[66]:

year_days.columns = ['All Runs']

# In[67]:

axes = flu_organism.groupby(['YearECLS', 'has_flu'])['AgeYears'].mean().unstack().plot()
(year_days/365).plot(ax=axes, legend=False, grid=False)
plt.xlabel('Year'); plt.ylabel('Age (years)')
plt.xlim(1992, 2013);

# In[68]:

axes = flu_organism.groupby(['YearECLS', 'has_flu'])['PatientID'].count().unstack().plot()
plt.xlabel('Year');
plt.xlim(1996, 2013);

# Merge with diagnoses dataset for ICD9 lookup.

# In[69]:

flu_ICD9 = flu.merge(diagnoses, left_index=True, right_on='RunID', how='left')

# Number with flu by ICD9

# In[70]:

has_flu_ICD9 = flu_ICD9[flu_ICD9.ICD9Code.apply(
    lambda x: x.startswith('487') or x.startswith('488'))].drop_duplicates('PatientID')
len(has_flu_ICD9)

# This is the *union* of the patients with flu via ICD9 and those with flu via organism. I think this is what we need.

# In[71]:

len(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID)))

# Extract runs from patients with flu by either ICD9 or organism

# In[72]:

flu_organism_runs = flu_organism[flu_organism.PatientID.isin(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID))))]

# In[73]:

flu_ICD9_runs = flu_ICD9[flu_ICD9.PatientID.isin(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID))))]

# This is a smaller number than is in the `Flu` table.

# In[74]:

len(flu)

# Total number of runs for people with influenza on ECMO

# In[75]:

flu_runs_organisms = organisms.RunID[organisms.OrganismName.str.startswith('Influenza')==True]
flu_runs_organisms.unique().shape

# In[76]:

flu_runs_ICD9 = diagnoses.RunID[diagnoses.ICD9Code.apply(
    lambda x: x.startswith('487') or x.startswith('488'))]
flu_runs_ICD9.unique().shape

# In[77]:

# This is the union of the RunIDs from both sets
len(set(flu_runs_organisms).union(set(flu_runs_ICD9)))

# Proportion of runs with influenza

# In[78]:

# From organism list
float(len(flu_runs_organisms.unique())) / runs_year.CNT.sum()

# In[79]:

# From ICD9 list
float(len(flu_runs_ICD9.unique())) / runs_year.CNT.sum()

# Proportion female before and during 2009

# In[80]:

flu['female'] = flu.Sex=='F'
x_2009 = flu.drop_duplicates('PatientID')[flu.YearECLS==2009]['female'].sum()
n_2009 = flu.drop_duplicates('PatientID')[flu.YearECLS==2009]['female'].count()
print('Proportion female in 2009: {0:.3f}'.format(float(x_2009)/n_2009))

# In[81]:

flu['pre_2009'] = flu['YearECLS'] < 2009
x_pre_2009 = flu.drop_duplicates('PatientID')[flu.pre_2009]['female'].sum()
n_pre_2009 = flu.drop_duplicates('PatientID')[flu.pre_2009]['female'].count()
print('Proportion female before 2009: {0:.3f}'.format(float(x_pre_2009)/n_pre_2009))

# In[82]:

from scipy import stats

def ztest(x, n):
    n = np.array(n, float)
    pbar = sum(x)/sum(n)
    z = np.diff(x/n) / np.sqrt(pbar * (1. - pbar) * sum(1./n))
    return {'z': z, 'p': 1. - stats.norm.cdf(np.abs(z))}

# In[83]:

ztest([x_2009, x_pre_2009], [n_2009, n_pre_2009])
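# Note that the `p` returned by `ztest` is one-sided (1 - Phi(|z|)); doubling it gives the usual two-sided value. A quick check on made-up counts (not registry data):

# In[ ]:

# 45/100 vs. 30/100 successes; z should come out near -2.2.
result = ztest([45, 30], [100, 100])
print(result)
print('two-sided p:', 2 * result['p'])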
# Proportion of females with flu.

# In[84]:

with_flu = flu[flu.PatientID.isin(pd.Series(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID)))))]

# In[85]:

(with_flu.Sex=='F').mean()

# In[86]:

binomial_hpdr((with_flu.Sex=='F'))

# Age statistics

# In[87]:

with_flu.AgeYears.describe()

# Mortality of patients with the diagnosis of influenza

# In[88]:

with_flu_survivors = flu.drop_duplicates('PatientID').DischargedAlive==True
with_flu_survivors.mean()

# In[89]:

binomial_hpdr(with_flu_survivors)

# Median hours on ECMO for influenza

# In[90]:

# Estimate of interval on log scale, due to skew in distribution
lognorm_interval(with_flu.HoursECMO/24., draws=100000)

# In[91]:

# Age distribution
flu.drop_duplicates('PatientID')['AgeYears'].describe()

# In[92]:

# Race distribution
race_counts = flu.drop_duplicates('PatientID')['Race'].value_counts()
race_counts/float(race_counts.sum())

# In[93]:

binomial_hpdr(flu.drop_duplicates('PatientID')['Race']=='W')

# In[94]:

# Survival
flu.drop_duplicates('PatientID')['DischargedAlive'].mean()

# In[95]:

# Sex
sex_counts = flu.drop_duplicates('PatientID')['Sex'].value_counts()
sex_counts/float(sex_counts.sum())

# Complications

# In[96]:

complications_with_flu = with_flu.merge(complications, left_index=True, right_on='RunID', how='left')

# In[97]:

complications_with_flu.complication_type.value_counts()

# In[98]:

def complication_intervals(dataset):
    complications_merged = dataset.merge(complications, left_index=True, right_on='RunID', how='left')
    for kind in ('Neurologic', 'Renal', 'Hemorrhagic', 'Cardiovascular', 'Pulmonary'):
        x = complications_merged[complications_merged.complication_type==kind].PatientID.drop_duplicates().count()
        n = dataset.PatientID.drop_duplicates().count()
        m, lo, up = binomial_hpdr([1]*x + [0]*(n-x))
        print('{0}:\n\t {1} ({2}, {3})\n'.format(kind, m, lo, up))

# In[99]:

complication_intervals(with_flu)

# Days on ventilator

# In[100]:

(with_flu.HoursVent.dropna()/24).hist(bins=20)

# In[101]:

# Interval on log scale
lognorm_interval(with_flu.HoursVent.dropna()/24)

# **Flu only**
# 
# Patients with influenza ONLY on ECMO = patients between 1992 and 2012 with EITHER an ICD9 code of flu AND/OR an organism of flu, but NO additional organism codes for other bacteria or viruses.
# 
# We have established above that all the entries in the `Flu` table have influenza. We only have to filter out co-infection.
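# One way to express the "no additional organism codes" condition directly is to flag each patient whose cultures include anything other than influenza (a sketch on the `flu_organism` merge from above; `has_other_organism` is an illustrative name):

# In[ ]:

# True for patients with at least one non-influenza organism recorded.
non_flu = (flu_organism.OrganismName.notnull()
           & (flu_organism.OrganismName.str.startswith('Influenza') == False))
has_other_organism = non_flu.groupby(flu_organism.PatientID).any()
has_other_organism.value_counts()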
# Number of infections by `PatientID`

# In[102]:

infection_count = flu_organism.groupby('PatientID')['PatientID'].count()

# In[103]:

infection_count.count()

# In[104]:

infection_count.hist()

# In[105]:

organisms[organisms.OrganismName=='Staphylococcus aureus, meth resist'].drop_duplicates('RunID').shape

# In[106]:

single_infection = flu_organism.merge(pd.DataFrame(infection_count[infection_count==1]), left_on='PatientID', right_index=True)

# In[107]:

single_infection_ICD9 = flu_ICD9.merge(pd.DataFrame(infection_count[infection_count==1]), left_on='PatientID', right_index=True)

# In[108]:

set(single_infection.PatientID.drop_duplicates()).difference(set(single_infection_ICD9.PatientID.drop_duplicates()))

# In[109]:

len(set(single_infection.PatientID.drop_duplicates()))

# In[110]:

len(set(single_infection_ICD9.PatientID.drop_duplicates()))

# In[111]:

single_infection.drop_duplicates('PatientID').PatientID.count()

# In[112]:

single_infection_ICD9.drop_duplicates('PatientID').PatientID.count()

# In[113]:

influenza_only = single_infection[single_infection.OrganismName.str.startswith('Influenza')==True]

# In[114]:

influenza_only['AgeYears'].describe()

# In[115]:

race_counts_flu = influenza_only['Race'].value_counts()
race_counts_flu/float(race_counts_flu.sum())

# In[116]:

binomial_hpdr(influenza_only.Race.dropna()=='W')

# Posterior interval for days on ECMO

# In[117]:

lognorm_interval(influenza_only.HoursECMO/24., draws=100000)

# In[118]:

flu_only_survivors = influenza_only['DischargedAlive']
flu_only_survivors.mean()

# In[119]:

binomial_hpdr(flu_only_survivors)

# In[120]:

sex_counts_flu = influenza_only['Sex'].value_counts()
sex_counts_flu/float(sex_counts_flu.sum())

# In[121]:

binomial_hpdr((influenza_only.Sex=='F'))

# In[122]:

# Index out patients with flu only
complication_intervals(with_flu[with_flu.PatientID.isin(influenza_only.PatientID)])

# Days on ventilator

# In[123]:

# Interval on log scale
lognorm_interval(influenza_only.HoursVent.dropna()/24)
# **Subset with co-infection**
# 
# Patients with influenza and co-infection on ECMO = patients between 1992 and 2012 with EITHER an ICD9 code of flu AND/OR an organism of flu, AND an organism code of something else.
# 
# First, identify individuals with multiple infections based on organism code.

# In[124]:

multiple_infection = flu_organism.merge(pd.DataFrame(infection_count[infection_count>1]), left_on='PatientID', right_index=True)

# Merge with the subset that has flu by ICD9

# In[125]:

multiple_infection_ICD9 = flu_ICD9.merge(pd.DataFrame(infection_count[infection_count>1]), left_on='PatientID', right_index=True)

# No patients in one set that aren't in the other

# In[126]:

set(multiple_infection.PatientID.drop_duplicates()).difference(set(multiple_infection_ICD9.PatientID.drop_duplicates()))

# In[127]:

flu_coinfection = multiple_infection.drop_duplicates('PatientID')

# Organism numbers for bacterial infections

# In[128]:

bacterial_numbers = [1,2,9,11,12,13,14,15,16,19,30,31,32,35,36,37,38,
                     39,40,48,52,54,55,58,59,60,61,67,68,69,71,77,80,84,85,86,91,95,104]

# In[129]:

bacterial_coinf = flu_organism[flu_organism.OrganismNo.isin(bacterial_numbers)].merge(
    pd.DataFrame(infection_count[infection_count>1]), left_on='PatientID', right_index=True)

# Number with bacterial coinfection via organism code

# In[130]:

with_bacterial_coinf = bacterial_coinf.drop_duplicates(cols=['PatientID'])
with_bacterial_coinf.shape

# In[131]:

bacterial_ICD9 = ['033.9 ','033 ','036.2 ','038.0 ','038.1','038.11','038.12','038.19',
                  '038.2','038.3','038.4','038.41','038.42','03.43','038.44','038.49',
                  '040.82','040.89','081','100.9','481']

# In[132]:

with_bacterial_ICD9 = flu_ICD9[flu_ICD9.ICD9Code.apply(
    lambda x: x.startswith('041') or x.startswith('320') or x.startswith('482') or bool(bacterial_ICD9.count(x)))]

# Number with bacterial coinfection via ICD9

# In[133]:

with_bacterial_ICD9.PatientID.drop_duplicates().shape

# Comparison with flu-only:

# In[134]:

influenza_only['AgeYears'].hist(alpha=0.3, grid=False)
with_bacterial_coinf['AgeYears'].hist(alpha=0.3, grid=False)

# In[135]:

print('u={0:.0f}, p={1:.3f}'.format(*stats.mannwhitneyu(influenza_only['AgeYears'].dropna().values,
                                                        with_bacterial_coinf['AgeYears'].dropna().values)))

# In[136]:

with_bacterial_coinf['AgeYears'].describe()

# Survival

# In[137]:

flu_bacterial_survivors = with_bacterial_coinf['DischargedAlive']
flu_bacterial_survivors.mean()

# In[138]:

binomial_hpdr(flu_bacterial_survivors)

# Days on ECMO

# In[139]:

lognorm_interval(with_bacterial_coinf.HoursECMO/24., draws=100000)

# In[140]:

race_counts_bacterial = with_bacterial_coinf['Race'].value_counts()
race_counts_bacterial/float(race_counts_bacterial.sum())

# Posterior interval for proportion white

# In[141]:

binomial_hpdr(with_bacterial_coinf.Race=='W')

# Proportion female

# In[142]:

binomial_hpdr((with_bacterial_coinf.Sex=='F'))

# Complications

# In[143]:

complication_intervals(with_flu[with_flu.PatientID.isin(with_bacterial_coinf.PatientID)])

# In[144]:

# Interval on log scale
lognorm_interval(with_bacterial_coinf.HoursVent.dropna()/24)
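# For reference, the survival proportions of the influenza-only and bacterial-coinfection groups can be compared with the `ztest` helper defined earlier (a sketch reusing `flu_only_survivors` and `flu_bacterial_survivors` from the cells above):

# In[ ]:

# Two-proportion comparison of survival to discharge between the two groups.
ztest([flu_only_survivors.sum(), flu_bacterial_survivors.sum()],
      [flu_only_survivors.count(), flu_bacterial_survivors.count()])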
# ### Table 2 summaries
# 
# > race, age, gender
# > pre-ECMO lab values (blood gas: pH, pCO2, HCO3, Oxygen Index)
# > type of ECMO, inotrope/pressor support, nitric, CVVH, oscillator
# > type of co-infection

# Race distribution

# In[145]:

flu_organism['fate'] = flu_organism.DischargedAlive.replace({1: 'Survived', 0: 'Died'})
by_fate = flu_organism.drop_duplicates('PatientID').groupby("fate")

# In[146]:

by_fate['Race'].apply(lambda x: (x.value_counts()/float(x.value_counts().sum())).round(3))

# In[147]:

by_fate['Race'].value_counts().plot(kind='bar')

# Age distribution

# In[148]:

by_fate['AgeYears'].describe()

# Age distribution is not normally distributed.

# In[149]:

age_dist = by_fate['AgeYears']
age_dist.hist(alpha=0.3, grid=False)

# In[150]:

_ = by_fate.boxplot(column='AgeYears')

# Mann-Whitney U-test (non-parametric)

# In[151]:

age_data = list(age_dist)
print('u={0:.0f}, p={1:.3f}'.format(*stats.mannwhitneyu(age_data[0][1].dropna().values,
                                                        age_data[1][1].dropna().values)))

# Sex proportions

# In[152]:

by_fate['Sex'].apply(lambda x: x.value_counts()/float(x.value_counts().sum()))

# In[153]:

sex_dist = by_fate['Sex'].apply(lambda x: x.value_counts())
ztest(sex_dist.unstack()['M'].values, sex_dist.unstack().sum(1))

# Gases

# In[154]:

gases = ['pH', 'PCO2', 'HCO3', 'PO2', 'SaO2']
by_fate.describe()[gases]

# In[155]:

for g in gases:
    by_fate.boxplot(column=g)

# In[156]:

for i in gases:
    data = list(flu[i].groupby(flu['DischargedAlive']))
    print('{0}: u={1:.0f}, p={2:.3f}'.format(i, *stats.mannwhitneyu(data[0][1].dropna().values,
                                                                    data[1][1].dropna().values)))

# Prevalence of each type of coinfection:
# 
# *Bacterial*

# In[157]:

bacterial = flu_organism[flu_organism.Type=='bacterial'].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
bacterial

# In[158]:

bacterial_prop = bacterial/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
bacterial_prop

# In[159]:

ztest(bacterial, bacterial / bacterial_prop)

# *Fungal*

# In[160]:

fungal = flu_organism[flu_organism.Type=='fungal'].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
fungal

# In[161]:

fungal_prop = fungal/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
fungal_prop

# In[162]:

ztest(fungal, fungal / fungal_prop)

# *Viral*

# In[163]:

viral = flu_organism[(flu_organism.Type=='viral') &
                     (flu_organism.OrganismName.str.startswith('Influenza')==False)].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
viral

# In[164]:

viral_prop = viral/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
viral_prop

# In[165]:

ztest(viral, viral / viral_prop)

# Type of ECMO

# In[166]:

ecmo_type = (flu.Mode=='VA').replace({True:'VA', False:'VV'})
ecmo_type.value_counts() / float(ecmo_type.notnull().sum())

# In[167]:

flu['ecmo_type'] = (flu.Mode=='VA').replace({True:'VA', False:'VV'})
flu.ecmo_type.groupby(flu['DischargedAlive']).apply(lambda x: x.value_counts()/float(x.notnull().sum()))

# Merge Flu with Pre-ECLS Support

# In[168]:

flu_support = flu.merge(support, left_index=True, right_on='RunID', how='left')

# In[169]:

support.Description.value_counts()

# In[170]:

n = float(flu_support.PatientID.drop_duplicates().count())
n

# In[171]:

n_alive = flu_support.drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count().astype(float)
n_alive

# Nitric oxide

# In[172]:

flu_support.PatientID[flu_support.Description=='Nitric oxide'].drop_duplicates().count()/n

# In[173]:

nitric_oxide = flu_support[flu_support.Description=='Nitric oxide'].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
nitric_oxide/n_alive

# CVVH

# In[174]:

flu_support.PatientID[flu_support.Description=='CVVH'].drop_duplicates().count()/n

# In[175]:

cvvh = flu_support[flu_support.Description=='CVVH'].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
cvvh/n_alive
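# The nitric oxide and CVVH cells above, and the ventilation and pressor cells that follow, repeat the same calculation. A small helper (a sketch reusing `flu_support`, `n`, and `n_alive` from above; `support_proportions` is an illustrative name) computes both proportions for any list of support descriptions:

# In[ ]:

def support_proportions(descriptions):
    # Patients with any of the given pre-ECLS supports: overall proportion,
    # then split by survival to discharge.
    subset = flu_support[flu_support.Description.isin(descriptions)].drop_duplicates('PatientID')
    overall = subset.PatientID.count() / n
    by_outcome = subset.groupby('DischargedAlive')['PatientID'].count() / n_alive
    return overall, by_outcome

support_proportions(['Nitric oxide'])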
# High frequency ventilation/oscillation

# In[176]:

flu_support.PatientID[flu_support.Description=='High frequency ventilation/oscillation'].drop_duplicates().count()/n

# In[177]:

hfvo = flu_support[flu_support.Description=='High frequency ventilation/oscillation'].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
hfvo/n_alive

# Inotrope/pressor support

# In[178]:

pressor_set = ['Dobutamine', 'Vasopressor/inotropic drugs', 'Norepinephrine', 'Epinephrine', 'Dopamine', 'Milrinone']
ionotrope_pressor = flu_support[flu_support.Description.isin(pressor_set)].drop_duplicates('PatientID').groupby(
    'DischargedAlive')['PatientID'].count()
ionotrope_pressor/n_alive

# ## Table 1 summaries
# 
# **All ECMO**

# In[179]:

# Age
fig, axes = plt.subplots(ncols=2, figsize=(14,4))
flu.AgeDays.hist(ax=axes[0])
axes[1].boxplot(flu.AgeDays.values);

# In[180]:

flu.AgeDays.describe()

# In[181]:

# Race
flu.Race.value_counts()

# In[182]:

flu.Race.value_counts()/flu.Race.value_counts().sum()

# In[183]:

# Gender
flu.Sex.value_counts()

# In[184]:

flu.Sex.value_counts()/flu.Sex.value_counts().sum()

# In[185]:

# Death
flu.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[186]:

(flu.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu.DischargedAlive.value_counts().sum())

# TODO: Break down by Mode

# In[187]:

# ECMO Type
flu.Mode.unique()

# In[188]:

# ECMO Type
flu['VA'] = flu.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu.VA[flu.Mode=='Other'] = np.nan

# This is the proportion that is VA

# In[189]:

flu.VA.mean()

# In[190]:

# Support type
flu.SupportType.value_counts()

# Proportion CPR

# In[191]:

(flu.SupportType==3).mean()

# In[192]:

# Complications
ecmo_complications = flu_organism.merge(complications, on='RunID')
ecmo_complications.complication_type.value_counts()

# **Influenza only**

# In[193]:

# Age
fig, axes = plt.subplots(ncols=2, figsize=(14,4))
influenza_only.AgeDays.hist(ax=axes[0])
axes[1].boxplot(influenza_only.AgeDays.values);

# In[194]:

influenza_only.AgeDays.describe()

# In[195]:

# Race
influenza_only.Race.value_counts()

# In[196]:

influenza_only.Race.value_counts()/influenza_only.Race.value_counts().sum()

# In[197]:

# Gender
influenza_only.Sex.value_counts()

# In[198]:

influenza_only.Sex.value_counts()/influenza_only.Sex.value_counts().sum()

# In[199]:

# Death
influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[200]:

(influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /influenza_only.DischargedAlive.value_counts().sum())

# In[201]:

# ECMO Type
influenza_only.Mode.unique()

# In[202]:

influenza_only['VA'] = influenza_only.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
influenza_only.VA[influenza_only.Mode=='Other'] = np.nan

# This is the proportion that is VA

# In[203]:

influenza_only.VA.mean()

# In[204]:

# Support type
influenza_only.SupportType.value_counts()

# In[205]:

(influenza_only.SupportType==3).mean()

# In[206]:

# Complications
influenza_complications = influenza_only.merge(complications, on='RunID')
influenza_complications.complication_type.value_counts()

# ## Table 3 summaries
# 
# Severity of illness in patients with influenza alone on ECMO versus patients with bacterial co-infection

# In[207]:

def normal_post(y, pct=0.95, mu_0=0., var_0=1e6, draws=1000, roundto=2):
    s2 = y.var()
    ybar = y.mean()
    n = len(y)
    mu = (n*ybar/s2 + mu_0/var_0)/(n*1./s2 + 1./var_0)
    sigma = np.sqrt(1./(n/s2 + 1./var_0))
    y_post = np.sort(np.random.normal(mu, sigma, size=draws))
    c = int(draws*(1-pct)/2.)
    return([np.round(v, roundto) for v in (mu, y_post[c], y_post[-c])])
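# A quick illustration of `normal_post` on simulated data (the population values below are invented for the example): with 200 draws from a Normal(5, 2) population, the posterior mean should land near 5 with a narrow credible interval for the mean.

# In[ ]:

np.random.seed(1)
fake_sample = np.random.normal(loc=5., scale=2., size=200)
normal_post(fake_sample)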
# Days on ECMO

# In[208]:

flu_hours_ecmo = (flu.drop_duplicates(cols=['PatientID']).HoursECMO/24.)
flu_hours_ecmo.describe()

# In[209]:

np.log(flu_hours_ecmo+0.01).hist(bins=25)

# In[210]:

flu_days_ECMO = with_flu.HoursECMO/24.
flu_days_ECMO.describe()

# In[211]:

flu_days_ECMO.hist(bins=25)

# In[212]:

(influenza_only.HoursECMO/24.).describe()

# In[213]:

(influenza_only.HoursECMO/24.).hist(bins=20)

# In[214]:

lognorm_interval(influenza_only.HoursECMO/24., draws=100000)

# In[215]:

multiple_infection['VA'] = multiple_infection.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
multiple_infection.VA[multiple_infection.Mode=='Other'] = np.nan

# In[216]:

has_flu = multiple_infection[multiple_infection.OrganismName.str.startswith('Influenza')==True].drop_duplicates('PatientID')
has_mrsa = multiple_infection[multiple_infection.OrganismName=='Staphylococcus aureus, meth resist'].drop_duplicates('PatientID')
has_mssa = multiple_infection[multiple_infection.OrganismName=='Staphylococcus aureus'].drop_duplicates('PatientID')

# In[217]:

flu_with_mrsa = has_flu[has_flu.PatientID.isin(has_mrsa.PatientID)]

# In[218]:

(flu_with_mrsa.HoursECMO/24.).describe()

# In[219]:

flu_with_mssa = has_flu[has_flu.PatientID.isin(has_mssa.PatientID)]

# In[220]:

(flu_with_mssa.HoursECMO/24.).describe()

# Proportion VA (vs. VV)

# In[221]:

influenza_only.VA.mean()

# In[222]:

flu_coinfection['VA'] = flu_coinfection.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_coinfection.loc[flu_coinfection.Mode=='Other', 'VA'] = np.nan

# In[223]:

flu_coinfection.VA.mean()

# In[224]:

flu_with_mrsa.VA.mean()

# In[225]:

flu_with_mssa.VA.mean()

# Death

# In[226]:

influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[227]:

(influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /influenza_only.DischargedAlive.value_counts().sum())

# In[228]:

flu_coinfection.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[229]:

(flu_coinfection.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu_coinfection.DischargedAlive.value_counts().sum())

# In[230]:

flu_with_mrsa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[231]:

(flu_with_mrsa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu_with_mrsa.DischargedAlive.value_counts().sum())

# In[232]:

flu_with_mssa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})

# In[233]:

(flu_with_mssa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
 /flu_with_mssa.DischargedAlive.value_counts().sum())

# ## Enumerate complications

# In[234]:

complications.complication_type.value_counts()

# In[235]:

complication_list = ['Mechanical', 'Hemorrhagic', 'Renal', 'Cardiovascular', 'Pulmonary']

# In[236]:

has_complication = pd.DataFrame(complications.groupby('RunID').apply(
    lambda x: x['complication_type'].isin(complication_list).any()))

# In[237]:

has_complication.columns = ['complication']

# In[238]:

flu_with_comp = pd.merge(flu_organism, has_complication, left_on='RunID', right_index=True, how='left')

# In[239]:

flu_with_comp.shape

# In[240]:

flu_with_comp.PatientID.unique().shape

# In[241]:

flu_with_comp.complication.mean()

# In[242]:

flu_organism.RunID.isin(has_complication[has_complication.complication].index).mean()

# In[243]:

flu_organism['has_complication'] = flu_organism.RunID.isin(has_complication[has_complication.complication].index)

# In[250]:

flu_organism = flu_organism.drop('has_complication', axis=1)
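# With the complication flag merged in, the rate of major complications can be split by outcome (a sketch; it assumes the `fate` label added to `flu_organism` earlier carries through the merge into `flu_with_comp`):

# In[ ]:

# Proportion of runs with at least one major complication, by survival status.
flu_with_comp.drop_duplicates('RunID').groupby('fate')['complication'].mean()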
# Merge with S. aureus diagnoses from ICD9

# In[251]:

flu_organism_icd9 = pd.merge(flu_organism, s_aureus_diagnoses, on='RunID', how='left')

# In[252]:

flu_organism_icd9['s_aureus_icd9'] = flu_organism_icd9.ICD9Code.notnull()

# In[253]:

flu_organism_icd9 = flu_organism_icd9.drop(['ICD9Code', 'PrimaryDiagnosis'], axis=1)

# In[254]:

flu_organism.PatientID.unique().shape

# In[255]:

flu_organism_icd9.head()

# ## Export data

# In[256]:

flu_organism_icd9.to_csv("data/flu_organism.csv")

# In[257]:

flu_with_comp.to_csv("data/flu.csv")

# In[258]:

flu_with_comp.head()
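# A simple round-trip check of the exported files (a sketch; it just re-reads the CSVs written above and compares shapes against the in-memory frames):

# In[ ]:

print(pd.read_csv("data/flu_organism.csv", index_col=0).shape, flu_organism_icd9.shape)
print(pd.read_csv("data/flu.csv", index_col=0).shape, flu_with_comp.shape)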