#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from redcap import Project


# ## Query data from REDCap

# In[2]:

def get_redcap_data(token_url=None, api_url='https://redcap.vanderbilt.edu/api/',
                    metadata=False, return_project=False, **kwargs):
    # Read the API token from a file, or prompt for it interactively
    if token_url is None:
        print('Enter token:')
        api_key = input()
    else:
        api_key = open(token_url).read()
    project = Project(api_url, api_key)
    data = project.export_records(format='df', **kwargs)
    if metadata:
        return data, project.export_metadata()
    elif return_project:
        return data, project
    return data


# ### Main trial data

# In[3]:

trial_data, trial_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_trial.txt",
                                             metadata=True)


# Size of dataset (rows x columns)

# In[4]:

trial_data.shape


# ### Mini-BAL culture data

# In[5]:

culture_data, culture_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_bal.txt",
                                                 metadata=True)


# Size of dataset (rows x columns)

# In[6]:

culture_data.shape


# ### BAL PCR data

# In[7]:

pcr_data, pcr_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_pcr.txt",
                                         metadata=True)


# Size of dataset (rows x columns)

# In[8]:

pcr_data.shape


# # Patient population
#
# Number of individuals with complete (PCR and culture) and incomplete (one or the other) data

# In[9]:

pcr_date_cols = pcr_data.columns[pcr_data.columns.str.contains('date')]
culture_date_cols = culture_data.columns[culture_data.columns.str.contains('date')]


# In[10]:

culture_data['some_bal_data'] = culture_data[culture_date_cols].notnull().sum(1) > 0
pcr_data['some_pcr_data'] = pcr_data[pcr_date_cols].notnull().sum(1) > 0


# In[11]:

(culture_data.join(pcr_data, lsuffix='-bal', rsuffix='-pcr')[['some_bal_data', 'some_pcr_data']]
             .sum(1).replace({1: 'One', 2: 'Both'}).value_counts())


# ## Distribution of number of vent days

# In[12]:

ax = trial_data.vent_days.hist()
ax.set_ylabel('Frequency')
ax.set_xlabel('Days on ventilator');


# ## Summary statistics for number of vent days

# In[13]:

trial_data.vent_days.describe()


# Morbidity codes, including suspected VAP (pneumonia)

# In[14]:

VAP_lookup = {1: 'Pneumonia',
              2: 'Infection (not pneumonia)',
              3: 'Acute renal failure',
              4: 'ARDS',
              5: 'Cardiac arrest',
              6: 'MI',
              7: 'CVA',
              8: 'DVT',
              9: 'PE',
              10: 'None of the above'}


# In[15]:

trial_data['pneumonia'] = trial_data.morbidity___1


# ## Summary of pneumonia
#
# There are currently 18 individuals with pneumonia

# In[16]:

trial_data.pneumonia.sum()


# This accounts for 49% of the sample

# In[17]:

trial_data.pneumonia.mean()


# ### Patients taking antibiotics at time of admission:

# In[18]:

trial_data.preadmit_abx.replace({1: 'Yes', 2: 'No', 3: 'Unknown'}).value_counts()


# ### Patients administered antibiotics at some point during the study:

# In[19]:

trial_data.add_med_0.replace({1: 'Yes', 0: 'No'}).value_counts()


# In[20]:

abx_cols = trial_data.columns[trial_data.columns.str.startswith('abx')]


# In[21]:

abx_table = pd.wide_to_long(trial_data[abx_cols].assign(id=trial_data.index),
                            stubnames=['abx_generic_', 'abx_antifungals_',
                                       'abx_start_', 'abx_stop_', 'abx_indication_',
                                       'abx_suspect_infect_source_',
                                       'abx_known_infection_source_'],
                            i='id', j='abx_number').dropna(thresh=2).dropna(axis=1, thresh=100)
abx_table.columns = ['abx_name', 'antifungal', 'abx_start', 'abx_stop', 'abx_indication']
abx_table.abx_start = pd.to_datetime(abx_table.abx_start)
abx_table.abx_stop = pd.to_datetime(abx_table.abx_stop)
abx_table
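# As a quick illustration of how `pd.wide_to_long` gathers the numbered
# `abx_*` column families into rows, here is a minimal sketch on synthetic
# data (the values below are made up, not taken from the trial export):

demo = pd.DataFrame({'abx_generic_1': ['vancomycin'], 'abx_generic_2': ['cefepime'],
                     'abx_start_1': ['2016-01-01'], 'abx_start_2': ['2016-01-04']},
                    index=pd.Index([1001], name='record_id'))
pd.wide_to_long(demo.assign(id=demo.index), stubnames=['abx_generic_', 'abx_start_'],
                i='id', j='abx_number')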
# In[22]:

abx_table.abx_name.value_counts()


# In[23]:

trial_data


# # Positive bronchoscopic BALs

# In[24]:

def daily_data_to_long(dataset, pattern=r'_d\d+$', suffix=''):
    # Find the largest number of study days
    _tokens = dataset.columns[dataset.columns.str.contains(pattern, regex=True)].str.split('_').values
    last_day = np.array([''.join(filter(lambda x: x.isdigit(), t[-1]))
                         for t in _tokens]).astype(int).max()

    all_days_data = []
    for i in range(last_day):
        print('Processing day', i+1)
        study_day_cols = dataset.columns[dataset.columns.str.endswith('_d{0}{1}'.format(i+1, suffix))]
        study_day_data = dataset[study_day_cols].copy()
        # Strip the day suffix so that columns align across study days
        study_day_data.columns = ['_'.join(col.split('_')[:-1]) for col in study_day_cols]
        study_day_data['day'] = i+1
        all_days_data.append(study_day_data)

    return pd.concat(all_days_data).reset_index()


# In[25]:

bbal_long = daily_data_to_long(trial_data).query('bbal==1')


# In[26]:

bbal_cfu_cols = bbal_long.columns[bbal_long.columns.str.contains('cfu')
                                  & bbal_long.columns.str.startswith('bbal')]
bbal_pathogens_cols = bbal_long.columns[bbal_long.columns.str.contains('pathogen')
                                        & ~bbal_long.columns.str.contains('add')
                                        & bbal_long.columns.str.startswith('culture')]


# Encode CFU count ranges as an ordinal variable

# In[27]:

cfu_coding = {'<1,000': 0,
              '<10,000': 1,
              '10,000 - 25,000': 2,
              '25,000 - 50,000': 3,
              '50,000 - 100,000': 4,
              '>100,000': 5}


# Lookup table to translate species codes

# In[28]:

bbal_path_lookup = {1: 'Staphylococcus aureus',
                    2: 'Streptococcus pneumoniae',
                    3: 'Streptococcus Group B',
                    4: 'Acinetobacter baumannii',
                    5: 'Pseudomonas aeruginosa',
                    6: 'Haemophilus influenzae',
                    7: 'Klebsiella pneumoniae',
                    8: 'Escherichia coli',
                    9: 'Enterobacter cloacae',
                    10: 'Stenotrophomonas maltophilia',
                    11: 'Enterobacter aerogenes',
                    12: 'Serratia marcescens',
                    13: 'Klebsiella oxytoca',
                    14: 'Proteus mirabilis',
                    15: 'Other'}
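# The `cfu_coding` mapping defined above encodes the CFU ranges as ordinals.
# If the export ever delivered the range labels as text rather than as codes,
# the mapping could be applied with `Series.map` (input values here are
# hypothetical):

pd.Series(['<1,000', '25,000 - 50,000', '>100,000']).map(cfu_coding)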
# In[29]:

bbal_pathogens_long = (bbal_long[bbal_pathogens_cols.tolist() + ['record_id', 'day']]
                       .pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
bbal_pathogens = bbal_pathogens_long.assign(pathogen=bbal_pathogens_long.value.replace(bbal_path_lookup)
                                            ).drop(['variable', 'value'], axis=1)


# In[30]:

bbal_cfu_long = (bbal_long[bbal_cfu_cols.tolist() + ['record_id', 'day']]
                 .pipe(pd.melt, id_vars=['record_id', 'day']))

# Make sure they are the same size!
assert bbal_pathogens_long.shape == bbal_cfu_long.shape


# In[31]:

bbal_pathogens['cfu_count'] = (bbal_cfu_long.value.astype(str)
                               .str.replace('E', 'e', regex=False)
                               .str.replace('+', '', regex=False)
                               .str.replace('/', '.', regex=False)).astype(float)
bbal_pathogens_sorted = bbal_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)


# In[32]:

def fill_pathogens(x, labels, lookup):
    # Expand a single patient-day group to the full pathogen panel,
    # filling unobserved pathogens with zero counts
    return (x.dropna()
             .drop_duplicates(subset=['pathogen'])
             .set_index('pathogen')
             .reindex(list(lookup.values()))
             .reset_index()
             .assign(record_id=labels[0], day=labels[1])
             .fillna(0))


# In[33]:

bbal_groups = []
for labels, group in bbal_pathogens_sorted.groupby(['record_id', 'day']):
    group_full = fill_pathogens(group, labels, bbal_path_lookup)
    bbal_groups.append(group_full)


# In[34]:

bbal_complete = pd.concat(bbal_groups).reset_index()
bbal_complete.to_csv('../data/clean/bbal_pathogens.csv')


# In[35]:

bbal_complete.head()


# In[36]:

bbal_complete.cfu_count.hist()


# In[37]:

(bbal_complete.groupby('record_id').cfu_count.max() >= 3).sum()


# In[38]:

(bbal_complete.groupby('record_id').cfu_count.max() == 0).sum()


# In[39]:

ax = bbal_complete.groupby('record_id').cfu_count.max().hist(bins=range(8), align='left')
ax.set_xlabel('max. count')
ax.set_ylabel('frequency');


# ### Trajectories of CFU counts from bronchoscopic BAL data
#
# Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.

# In[40]:

bbal_grid = sns.FacetGrid(bbal_complete, col='record_id', hue="pathogen", col_wrap=4, size=2.5)
bbal_grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
bbal_grid.add_legend()


# ### The number of patients with at least one positive bBAL ($> 10^4$) in the dataset:

# In[41]:

positive_bbal = bbal_cfu_long.fillna(0).groupby('record_id').apply(lambda x: (x.value > 0).any())
positive_bbal.sum()


# ### Proportion of patients with a positive bBAL:

# In[42]:

positive_bbal.mean()


# ## Convert trial data to long format
#
# First we need to reshape the data to long format (one row per date of study per patient)

# In[43]:

daily_long = daily_data_to_long(trial_data)


# In[44]:

daily_long.head()


# Drop rows with no date, columns with no data

# In[45]:

data_long_complete = daily_long.dropna(subset=['date_study_day']).dropna(axis=1, thresh=1)
data_long_complete.head()


# ## PCR counts in mBAL and HME

# In[46]:

pcr_pattern = r'_d\d+pcr$'
pcr_data.columns.str.contains(pcr_pattern, regex=True).sum()


# In[47]:

pcr_data_long = daily_data_to_long(pcr_data, pattern=pcr_pattern, suffix='pcr')


# Drop rows with no date or no PCR result, and columns with no data

# In[48]:

pcr_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'mbal_pcr_result']).dropna(axis=1, thresh=1)
pcr_long_complete['date_study_day'] = pd.to_datetime(pcr_long_complete['date_study_day'])
pcr_long_complete.head()
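# `daily_data_to_long` was reused above with a `pcr` suffix; a small sketch of
# how the pattern and suffix select the daily columns (the column names here
# are hypothetical):

cols = pd.Index(['mbal_copies_d1pcr', 'mbal_copies_d2pcr', 'record_id'])
cols[cols.str.contains(r'_d\d+pcr$', regex=True)]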
# ## Bacterial counts from mBAL over time
#
# Plotting time series for mini-BAL PCR counts.

# In[49]:

mbal_pcr_copies_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('copies')
                                                 & pcr_long_complete.columns.str.startswith('mbal')]
mbal_pcr_pathogens_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('path')
                                                    & ~pcr_long_complete.columns.str.contains('add')
                                                    & pcr_long_complete.columns.str.startswith('mbal')]


# Lookup table to translate species codes

# In[50]:

pcr_path_lookup = {1: 'Staphylococcus aureus',
                   2: 'Streptococcus pneumoniae',
                   3: 'Streptococcus Group B',
                   4: 'Acinetobacter baumannii',
                   5: 'Pseudomonas aeruginosa',
                   6: 'Haemophilus influenzae',
                   7: 'Klebsiella pneumoniae',
                   8: 'Escherichia coli',
                   9: 'Enterobacter cloacae',
                   10: 'Stenotrophomonas maltophilia',
                   11: 'Enterobacter aerogenes',
                   12: 'Serratia marcescens',
                   13: 'Klebsiella oxytoca',
                   14: 'Proteus mirabilis',
                   15: 'Enterococcus faecalis',
                   16: 'Enterococcus faecium',
                   17: 'Candida albicans',
                   18: 'Other'}


# Convert pathogens table from wide to long format

# In[51]:

mbal_pcr_pathogens_long = (pcr_long_complete[mbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']]
                           .pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
mbal_pcr_pathogens = mbal_pcr_pathogens_long.assign(pathogen=mbal_pcr_pathogens_long.value.replace(pcr_path_lookup)
                                                    ).drop(['variable', 'value'], axis=1)


# Convert counts table from wide to long format

# In[52]:

mbal_pcr_counts_long = (pcr_long_complete[mbal_pcr_copies_cols.tolist() + ['record_id', 'day']]
                        .pipe(pd.melt, id_vars=['record_id', 'day']))

# Make sure they are the same size!
assert mbal_pcr_pathogens_long.shape == mbal_pcr_counts_long.shape


# Append count onto pathogens table

# In[53]:

mbal_pcr_pathogens['pcr_count'] = (mbal_pcr_counts_long.value.astype(str)
                                   .str.replace('E', 'e', regex=False)
                                   .str.replace('+', '', regex=False)
                                   .str.replace('/', '.', regex=False)).astype(float)
mbal_pcr_pathogens_sorted = mbal_pcr_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)


# In[54]:

mbal_pcr_groups = []
for labels, group in mbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    group_full = fill_pathogens(group, labels, pcr_path_lookup)
    mbal_pcr_groups.append(group_full)


# In[55]:

mbal_pcr_complete = pd.concat(mbal_pcr_groups).reset_index()
mbal_pcr_complete.to_csv('../data/clean/mbal_pcr_pathogens.csv')
mbal_pcr_complete.head()
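# The zero-filling above relies on `fill_pathogens`, which reindexes each
# patient-day group against the full pathogen panel. A toy sketch of the same
# reindex-and-fill idiom (the values are made up):

toy = pd.DataFrame({'pathogen': ['Escherichia coli'], 'pcr_count': [12.0]})
(toy.set_index('pathogen')
    .reindex(['Escherichia coli', 'Klebsiella pneumoniae'])
    .fillna(0))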
# ### Trajectories of PCR counts from mini-BAL data
#
# Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.

# In[56]:

abx_table.loc[1002]


# In[57]:

def vertical_abx_line(x, count_data=pcr_long_complete, **kwargs):
    record_id = x.min()
    start_dates = np.unique(abx_table.loc[record_id].abx_start.dt.date.values)
    # Anchor on the first recorded study date to convert calendar dates
    # to study-day offsets
    first_date = count_data.loc[count_data.record_id == record_id, ['date_study_day', 'day']].min()
    reference_date = first_date.date_study_day - pd.Timedelta('{} days'.format(first_date.day))
    if np.any(start_dates):
        for date in start_dates:
            plt.axvline(x=(pd.Timestamp(date) - reference_date).days, ls=':', c='grey', **kwargs)


# In[58]:

sns.set_style('ticks')

grid = sns.FacetGrid(mbal_pcr_complete.assign(log_count=(mbal_pcr_complete.pcr_count + 1).apply(np.log)),
                     col='record_id', hue="pathogen", col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
grid.map(vertical_abx_line, 'record_id', alpha=0.2)
grid.add_legend()
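# `vertical_abx_line` converts antibiotic start dates to study-day offsets by
# anchoring on the first recorded study date. A worked example of the
# arithmetic, with made-up dates:

first = pd.Timestamp('2016-03-05')  # first recorded study date (hypothetical)
first_day = 3                       # suppose that date was study day 3
reference = first - pd.Timedelta('{} days'.format(first_day))  # date of study day 0
(pd.Timestamp('2016-03-08') - reference).days  # an abx start on 2016-03-08 falls on day 6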
# ### bBAL PCR counts

# In[59]:

bbal_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'bbal_pcr_result']).dropna(axis=1, thresh=1)
bbal_long_complete.head()


# In[60]:

bbal_pcr_copies_cols = bbal_long_complete.columns[bbal_long_complete.columns.str.contains('copies')
                                                  & bbal_long_complete.columns.str.startswith('bbal')]
bbal_pcr_pathogens_cols = bbal_long_complete.columns[bbal_long_complete.columns.str.contains('path')
                                                     & ~bbal_long_complete.columns.str.contains('add')
                                                     & bbal_long_complete.columns.str.startswith('bbal')]


# Convert pathogens table from wide to long format

# In[61]:

bbal_pcr_pathogens_long = (bbal_long_complete[bbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']]
                           .pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
bbal_pcr_pathogens = bbal_pcr_pathogens_long.assign(pathogen=bbal_pcr_pathogens_long.value.replace(pcr_path_lookup)
                                                    ).drop(['variable', 'value'], axis=1)


# Convert counts table from wide to long format

# In[62]:

bbal_pcr_counts_long = (bbal_long_complete[bbal_pcr_copies_cols.tolist() + ['record_id', 'day']]
                        .pipe(pd.melt, id_vars=['record_id', 'day']))

# Make sure they are the same size!
assert bbal_pcr_pathogens_long.shape == bbal_pcr_counts_long.shape


# Append count onto pathogens table

# In[63]:

bbal_pcr_pathogens['pcr_count'] = (bbal_pcr_counts_long.value.astype(str)
                                   .str.replace('E', 'e', regex=False)
                                   .str.replace('+', '', regex=False)
                                   .str.replace('/', '.', regex=False)).astype(float)
bbal_pcr_pathogens_sorted = bbal_pcr_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)


# In[64]:

bbal_pcr_groups = []
for labels, group in bbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    group_full = fill_pathogens(group, labels, pcr_path_lookup)
    bbal_pcr_groups.append(group_full)


# In[65]:

bbal_pcr_complete = pd.concat(bbal_pcr_groups).reset_index()
bbal_pcr_complete.to_csv('../data/clean/bbal_pcr_pathogens.csv')
bbal_pcr_complete.head()


# In[66]:

sns.set_style('ticks')

# Jitter the log counts slightly so that overlapping trajectories remain visible
grid = sns.FacetGrid(bbal_pcr_complete.assign(log_count=(bbal_pcr_complete.pcr_count + 1
                                                         + 0.1*np.random.randn(bbal_pcr_complete.shape[0]))
                                              .apply(np.log)),
                     col='record_id', hue="pathogen", col_wrap=4, size=2.5,
                     palette=sns.color_palette("hls", 18))
grid.map(plt.plot, "day", "log_count", marker="o", ms=4, alpha=0.5)
# grid.map(vertical_abx_line, 'record_id', alpha=0.2)
grid.add_legend()


# ### HME PCR counts

# In[67]:

hme_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'hme_pcr_result']).dropna(axis=1, thresh=1)
hme_long_complete.head()


# In[68]:

hme_copies_cols = hme_long_complete.columns[hme_long_complete.columns.str.contains('copies')
                                            & hme_long_complete.columns.str.startswith('hme')]
hme_pathogens_cols = hme_long_complete.columns[hme_long_complete.columns.str.contains('path')
                                               & ~hme_long_complete.columns.str.contains('add')
                                               & hme_long_complete.columns.str.startswith('hme')]


# Convert pathogens and counts tables from wide to long format

# In[69]:

hme_pathogens_long = (hme_long_complete[hme_pathogens_cols.tolist() + ['record_id', 'day']]
                      .pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
hme_pathogens = hme_pathogens_long.assign(pathogen=hme_pathogens_long.value.replace(pcr_path_lookup)
                                          ).drop(['variable', 'value'], axis=1)


# In[70]:

hme_counts_long = (hme_long_complete[hme_copies_cols.tolist() + ['record_id', 'day']]
                   .pipe(pd.melt, id_vars=['record_id', 'day']))

# Make sure they are the same size!
assert hme_pathogens_long.shape == hme_counts_long.shape


# Replace bad values

# In[71]:

hme_counts_long.value[~hme_counts_long.value.apply(np.isreal)]


# In[72]:

# Hand-correct the single malformed entry identified above
hme_counts_long.loc[47, 'value'] = 5.47E+08


# Append count onto pathogens table

# In[73]:

hme_pathogens['pcr_count'] = (hme_counts_long.value.astype(str)
                              .str.replace('E', 'e', regex=False)
                              .str.replace('+', '', regex=False)
                              .str.replace('/', '.', regex=False)).astype(float)
hme_pathogens_sorted = hme_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)


# In[74]:

hme_groups = []
for labels, group in hme_pathogens_sorted.groupby(['record_id', 'day']):
    group_full = fill_pathogens(group, labels, pcr_path_lookup)
    hme_groups.append(group_full)


# In[75]:

hme_complete = pd.concat(hme_groups).reset_index()
hme_complete.to_csv('../data/clean/hme_pathogens.csv')
hme_complete.head()
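# The malformed HME count in cell In[72] was corrected by hand after visual
# inspection. An equivalent, more general screen (a sketch, not part of the
# original pipeline) coerces anything non-numeric to NaN and lists the
# offending rows:

coerced = pd.to_numeric(hme_counts_long.value, errors='coerce')
hme_counts_long[coerced.isnull() & hme_counts_long.value.notnull()]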
# ### Trajectories of PCR counts from HME data
#
# Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.

# In[76]:

hme_grid = sns.FacetGrid(hme_complete.assign(log_count=(hme_complete.pcr_count + 1).apply(np.log)),
                         col='record_id', hue="pathogen", col_wrap=4, size=2.5)
hme_grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
hme_grid.add_legend()


# ### Mini-BAL culture CFU counts

# In[77]:

# Same pathogen list as bBAL
mbal_path_lookup = bbal_path_lookup


# In[78]:

culture_long = daily_data_to_long(culture_data)


# Drop rows with no date or no culture test, and columns with no data

# In[79]:

culture_long_complete = culture_long.dropna(subset=['date_study_day', 'mbal_culture_test']).dropna(axis=1, thresh=1)


# In[80]:

culture_cfu_cols = culture_long_complete.columns[culture_long_complete.columns.str.contains('cfu')
                                                 & culture_long_complete.columns.str.startswith('mbal')]
culture_pathogens_cols = culture_long_complete.columns[culture_long_complete.columns.str.contains('pathogen')
                                                       & ~culture_long_complete.columns.str.contains('add')
                                                       & culture_long_complete.columns.str.startswith('mbal')]


# In[81]:

culture_pathogens_long = (culture_long_complete[culture_pathogens_cols.tolist() + ['record_id', 'day']]
                          .pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
culture_pathogens = culture_pathogens_long.assign(pathogen=culture_pathogens_long.value.replace(mbal_path_lookup)
                                                  ).drop(['variable', 'value'], axis=1)


# In[82]:

culture_counts_long = (culture_long_complete[culture_cfu_cols.tolist() + ['record_id', 'day']]
                       .pipe(pd.melt, id_vars=['record_id', 'day']))

# Make sure they are the same size!
assert culture_pathogens_long.shape == culture_counts_long.shape


# Append CFU count onto pathogens table

# In[83]:

culture_pathogens['cfu_count'] = (culture_counts_long.value.astype(str)
                                  .str.replace('E', 'e', regex=False)
                                  .str.replace('+', '', regex=False)
                                  .str.replace('/', '.', regex=False)).astype(float)
culture_pathogens_sorted = culture_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)


# In[84]:

groups = []
for labels, group in culture_pathogens_sorted.groupby(['record_id', 'day']):
    group_full = fill_pathogens(group, labels, mbal_path_lookup)
    groups.append(group_full)


# In[85]:

culture_complete = pd.concat(groups).reset_index()
culture_complete.to_csv('../data/clean/culture_pathogens.csv')
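# The melt -> clean -> zero-fill sequence has now been applied to bBAL
# cultures, mBAL PCR, bBAL PCR, HME PCR, and mBAL cultures. A sketch of how
# the repetition could be factored into a single helper (the function name
# and signature are hypothetical, not part of the original analysis):

def pathogens_to_long(data, pathogen_cols, count_cols, lookup, count_name):
    # Melt pathogen codes and counts in parallel, then attach cleaned counts
    paths = pd.melt(data[pathogen_cols.tolist() + ['record_id', 'day']],
                    id_vars=['record_id', 'day'])
    counts = pd.melt(data[count_cols.tolist() + ['record_id', 'day']],
                     id_vars=['record_id', 'day'])
    assert paths.shape == counts.shape
    out = paths.assign(pathogen=paths.value.replace(lookup)).drop(['variable', 'value'], axis=1)
    out[count_name] = (counts.value.astype(str)
                       .str.replace('E', 'e', regex=False)
                       .str.replace('+', '', regex=False)
                       .str.replace('/', '.', regex=False)).astype(float)
    return out.sort_values(by=['record_id', 'day']).reset_index(drop=True)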
# ### Trajectories of culture counts from mini-BAL data
#
# Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.

# In[86]:

grid = sns.FacetGrid(culture_complete, col='record_id', hue="pathogen", col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
grid.add_legend()


# In[87]:

pathogens = ['Staphylococcus aureus', 'Haemophilus influenzae',
             'Acinetobacter baumannii', 'Klebsiella pneumoniae']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 14))
cbar_ax = fig.add_axes([.91, .3, .03, .4])

for ax, pathogen in zip(axes.ravel(), pathogens):
    # Draw the shared colorbar only once
    draw_cbar = pathogen == 'Haemophilus influenzae'
    pathogen_grid = culture_complete[culture_complete.pathogen == pathogen].pivot('record_id', 'day', 'cfu_count')
    sns.heatmap(pathogen_grid, ax=ax, vmin=0, vmax=6,
                cbar=draw_cbar, cbar_ax=cbar_ax if draw_cbar else None)
    ax.set_yticklabels([''])
    ax.set_title(pathogen)
    ax.set_ylabel('patient')

fig.tight_layout(rect=[0, 0, .9, 1])


# ## Correlation between copies and CFU for mini-BALs
#
# Merge PCR and CFU DataFrames on subject, day and pathogen:

# In[88]:

mbal_merged = mbal_pcr_pathogens.merge(culture_pathogens,
                                       on=['record_id', 'day', 'pathogen'], how='left').fillna(0)
mbal_merged.shape


# In[89]:

mbal_merged.head()


# In[90]:

axes = (mbal_merged.assign(log_pcr_count=mbal_merged.pcr_count.apply(np.log))
                   .plot.scatter(x='log_pcr_count', y='cfu_count'))
axes.set_ylim(0, mbal_merged.cfu_count.max() + 1);


# ## Correlation between HME and bBAL

# In[91]:

bbal_agg = (bbal_pathogens.dropna()
            .groupby(['record_id', 'pathogen'])[['cfu_count']].max())
bbal_agg.head()


# In[92]:

hme_agg = (hme_pathogens.assign(log_count=np.log(hme_pathogens.pcr_count))
           .drop('pcr_count', axis=1)
           .dropna()
           .groupby(['record_id', 'pathogen'])[['log_count']].max())
hme_agg.head()


# In[93]:

bbal_hme = (bbal_agg.join(hme_agg).fillna(0)
            .reset_index()
            .rename(columns={'log_count': 'HME count', 'cfu_count': 'bBAL count'}))
bbal_hme.head()


# In[94]:

# Add a little jitter so that coincident points are distinguishable
n = bbal_hme.shape[0]
s = 0.1
grid = sns.FacetGrid(data=bbal_hme.assign(hme_count=bbal_hme['HME count'] + np.random.randn(n)*s,
                                          bbal_count=bbal_hme['bBAL count'] + np.random.randn(n)*s),
                     hue='pathogen', aspect=1.5, size=5)
grid.map(plt.scatter, 'hme_count', 'bbal_count', alpha=0.5).add_legend()


# ## Estimate sensitivity, PPV and NPV for HME

# In[95]:

bbal_hme['bbal_positive'] = bbal_hme['bBAL count'] >= 3
bbal_hme['hme_positive'] = bbal_hme['HME count'] > 0


# In[96]:

from sklearn import metrics


# Sensitivity (recall)

# In[97]:

metrics.recall_score(bbal_hme.bbal_positive, bbal_hme.hme_positive)


# Precision score (PPV)

# In[98]:

metrics.precision_score(bbal_hme.bbal_positive, bbal_hme.hme_positive)


# In[99]:

def calc_recall(x):
    # Recall is undefined for groups with no true positives
    if not x.bbal_positive.sum():
        return np.nan
    return metrics.recall_score(x.bbal_positive, x.hme_positive)

bbal_hme.groupby('pathogen').apply(calc_recall)


# In[100]:

bbal_positive = bbal_hme[bbal_hme.bbal_positive == 1].copy()

pathogen_list = bbal_positive.pathogen.unique()
n_pathogens = pathogen_list.shape[0]
pathogen_encode = dict(zip(pathogen_list, range(n_pathogens)))
pathogen_decode = dict(zip(range(n_pathogens), pathogen_list))

bbal_positive['pathogen_id'] = bbal_positive.pathogen.replace(pathogen_encode)


# In[101]:

import pymc3 as pm

x = bbal_positive.pathogen_id.values
y = bbal_positive.hme_positive.values

with pm.Model() as recall_model:
    # Hierarchical logistic model: pathogen-specific sensitivities are
    # partially pooled toward a common mean
    μ = pm.Normal('μ', 0, sd=10)
    σ = pm.HalfCauchy('σ', 2.5)
    θ = pm.Normal('θ', μ, sd=σ, shape=n_pathogens)
    π = pm.Deterministic('π', pm.math.invlogit(θ))

    pm.Bernoulli('likelihood', π[x], observed=y)


# In[102]:

with recall_model:
    tr = pm.sample(2000, n_init=20000, random_seed=20090425)


# In[103]:

pm.summary(tr[1000:], varnames=['π'])
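# The model partially pools the pathogen-specific sensitivities toward a
# common mean through μ and σ, with π = invlogit(θ). The posterior can also
# be summarized directly from the raw samples; a sketch using the pymc3
# trace, discarding the first 1000 draws as burn-in as above:

np.median(tr[1000:]['π'], axis=0)  # posterior median sensitivity per pathogen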
# In[104]:

gs = pm.forestplot(tr[1000:], varnames=['π'], ylabels=pathogen_list,
                   xtitle='Sensitivity (Recall)')


# In[105]:

pm.plot_posterior(tr[1000:], varnames=['μ'], point_estimate='median',
                  transform=lambda x: 1/(1 + np.exp(-x)))


# PPV estimation

# In[106]:

hme_positive = bbal_hme[bbal_hme.hme_positive == 1].copy()

pathogen_list = hme_positive.pathogen.unique()
n_pathogens = pathogen_list.shape[0]
pathogen_encode = dict(zip(pathogen_list, range(n_pathogens)))
pathogen_decode = dict(zip(range(n_pathogens), pathogen_list))

hme_positive['pathogen_id'] = hme_positive.pathogen.replace(pathogen_encode)


# In[107]:

hme_positive.shape


# In[108]:

x = hme_positive.pathogen_id.values
y = hme_positive.bbal_positive.values


# In[109]:

with pm.Model() as ppv_model:
    μ = pm.Normal('μ', 0, sd=10)
    σ = pm.HalfCauchy('σ', 2.5)
    θ = pm.Normal('θ', μ, sd=σ, shape=n_pathogens)
    π = pm.Deterministic('π', pm.math.invlogit(θ))

    pm.Bernoulli('likelihood', π[x], observed=y)


# In[110]:

with ppv_model:
    tr_ppv = pm.sample(2000, n_init=20000, random_seed=20090425)


# In[111]:

pm.forestplot(tr_ppv[1000:], varnames=['π'], ylabels=pathogen_list,
              xtitle='Positive predictive value')


# NPV estimation

# In[112]:

# Complete the patient-by-pathogen grid so that true negatives are represented
bbal_hme_complete = bbal_hme.set_index(['record_id', 'pathogen'])
complete_index = pd.MultiIndex.from_product(bbal_hme_complete.index.levels,
                                            names=bbal_hme_complete.index.names)
bbal_hme_complete = bbal_hme_complete.reindex(complete_index, fill_value=0).reset_index()


# In[113]:

bbal_negative = bbal_hme_complete[bbal_hme_complete.bbal_positive == 0].copy()

pathogen_list = bbal_negative.pathogen.unique()
n_pathogens = pathogen_list.shape[0]
pathogen_encode = dict(zip(pathogen_list, range(n_pathogens)))
pathogen_decode = dict(zip(range(n_pathogens), pathogen_list))

bbal_negative['pathogen_id'] = bbal_negative.pathogen.replace(pathogen_encode)


# In[114]:

x = bbal_negative.pathogen_id.values
y = bbal_negative.hme_positive.values.astype(int) == 0


# In[115]:

with pm.Model() as npv_model:
    μ = pm.Normal('μ', 0, sd=10)
    σ = pm.HalfCauchy('σ', 2.5)
    θ = pm.Normal('θ', μ, sd=σ, shape=n_pathogens)
    π = pm.Deterministic('π', pm.math.invlogit(θ))

    pm.Bernoulli('likelihood', π[x], observed=y)


# In[116]:

with npv_model:
    tr_npv = pm.sample(2000, n_init=20000, random_seed=20090425)


# In[117]:

gs = pm.forestplot(tr_npv[1000:], varnames=['π'], ylabels=pathogen_list,
                   xtitle='Negative predictive value')


# ## Correlation between PCR counts for HME vs mBAL

# In[118]:

pcr_merged = mbal_pcr_pathogens.merge(hme_pathogens, on=['record_id', 'day', 'pathogen'],
                                      how='left', suffixes=['_mbal', '_hme']).fillna(0)


# In[119]:

pcr_merged.head()


# In[120]:

axes = (pcr_merged.assign(log_mbal_count=(pcr_merged.pcr_count_mbal + 1).apply(np.log),
                          log_hme_count=(pcr_merged.pcr_count_hme + 1).apply(np.log))
                  .plot.scatter(x='log_mbal_count', y='log_hme_count'))
axes.set_xlim(0, 25)
axes.set_ylim(0, 25);


# Calculate the proportion of culture detections that show up in PCR

# In[121]:

mbal_merged.head()


# In[122]:

mbal_merged['pcr_positive'] = mbal_merged.pcr_count > 0
mbal_merged['cfu_positive'] = mbal_merged.cfu_count > 0


# In[123]:

mbal_merged.pcr_positive.sum()


# In[124]:

mbal_merged.cfu_positive.sum()


# In[125]:

mbal_merged[mbal_merged.cfu_positive == 1].pcr_positive.mean()


# In[126]:

bbal_long.head()


# ## Comparison of pathogen presence in bBAL and HME

# In[127]:

bbal_pathogens.dropna(subset=['pathogen'])
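# The bar charts below summarize, for each pathogen, the mean across patients
# of each patient's maximum count. A toy version of that groupby chain, with
# made-up values (patient 1's maximum is 5, patient 2's is 3, so the mean of
# the maxima is 4):

toy = pd.DataFrame({'record_id': [1, 1, 2],
                    'pathogen': ['a', 'a', 'a'],
                    'count': [2., 5., 3.]})
(toy.groupby(['record_id', 'pathogen'])['count'].max()
    .reset_index()
    .groupby('pathogen')['count'].mean())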
# In[128]:

fig, axes = plt.subplots(1, 2, figsize=(14, 8))

(hme_pathogens.assign(log_count=np.log(hme_pathogens.pcr_count))
 .dropna(subset=['pathogen'])
 .fillna(0)
 .groupby(['record_id', 'pathogen'])
 .log_count.max()
 .reset_index()
 .groupby('pathogen')
 .log_count.mean()
 .sort_values(ascending=False)
 .plot.bar(ax=axes[0]))
axes[0].set_title('Mean maximum pathogen count in HME')
axes[0].set_xlabel('')
axes[0].set_ylabel('Log PCR count')

(bbal_pathogens.dropna(subset=['pathogen'])
 .fillna(0)
 .groupby(['record_id', 'pathogen'])
 .cfu_count.max()
 .reset_index()
 .groupby('pathogen')
 .cfu_count.mean()
 .sort_values(ascending=False)
 .plot.bar(ax=axes[1]))
axes[1].set_title('Mean maximum pathogen count in bBAL')
axes[1].set_xlabel('')
axes[1].set_ylabel('CFU count')


# ## bBAL symptom plots
#
# Plot the bBAL patients and their WBC, temperature, and pulse rate for the 14 days they were on the study.

# In[129]:

bbal_ids = bbal_long.record_id.unique()


# In[130]:

def plot_variable(var_basename, label, title='', subset=None, data_filter=None):
    # Select the daily columns for the variable, optionally restricted to a
    # subset of patients, reshape to long format, and plot one panel per patient
    cols = trial_data.columns[trial_data.columns.str.contains(var_basename)]

    if subset is not None:
        data = trial_data.loc[subset, cols]
    else:
        data = trial_data[cols]

    if data_filter:
        data = data_filter(data)

    data_long = pd.melt(data.reset_index(), id_vars=['record_id']).dropna()
    data_long = (data_long.assign(day=data_long.variable.str
                                  .split('_d')
                                  .apply(lambda x: x[-1]))
                 .drop('variable', axis=1)
                 .rename(columns={'value': label}))

    grid = sns.FacetGrid(data_long, col='record_id', col_wrap=4, size=2.5)
    grid.map(plt.plot, "day", label, marker="o", ms=4)
    grid.fig.subplots_adjust(top=0.95)
    grid.fig.suptitle(title)

    return grid


# In[131]:

plot_variable('wbc', 'WBC', 'WBC Count', subset=np.arange(1001, 1038))


# In[132]:

plot_variable('map', 'map', 'Mean Arterial Pressure', subset=np.arange(1001, 1038))


# In[133]:

from scipy.constants import convert_temperature

plot_variable('temp', 'temperature (C)', 'Temperature', subset=np.arange(1001, 1038))


# In[134]:

pao2_cols = trial_data.columns[trial_data.columns.str.contains('pao2')]
fio2_cols = trial_data.columns[trial_data.columns.str.contains('pf_fio2')]


# In[135]:

pao2_data = trial_data.loc[bbal_long.record_id.unique(), pao2_cols]
fio2_data = trial_data.loc[bbal_long.record_id.unique(), fio2_cols]


# In[136]:

pao2_data_long = pd.melt(pao2_data.reset_index(), id_vars=['record_id']).dropna()
pao2_data_long = (pao2_data_long.assign(day=pao2_data_long.variable.str
                                        .split('_d')
                                        .apply(lambda x: x[-1]))
                  .drop('variable', axis=1)
                  .rename(columns={'value': 'PaO2'}))

fio2_data_long = pd.melt(fio2_data.reset_index(), id_vars=['record_id']).dropna()
fio2_data_long = (fio2_data_long.assign(day=fio2_data_long.variable.str
                                        .split('_d')
                                        .apply(lambda x: x[-1]))
                  .drop('variable', axis=1)
                  .rename(columns={'value': 'FiO2'}))


# In[137]:

ratio_data = (fio2_data_long.set_index(['record_id', 'day'])
              .join(pao2_data_long.set_index(['record_id', 'day']))
              .dropna())
ratio_data['P/F ratio'] = 100*ratio_data.PaO2/ratio_data.FiO2


# In[138]:

ratio_data = ratio_data.reset_index()


# In[139]:

grid = sns.FacetGrid(ratio_data, col='record_id', col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "P/F ratio", marker="o", ms=4)
grid.fig.subplots_adjust(top=0.95)
grid.fig.suptitle('P/F ratio')
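# Note on the P/F computation above: PaO2/FiO2 is conventionally computed
# with FiO2 as a fraction, so the factor of 100 suggests FiO2 is recorded
# here as a percentage (an assumption based on the formula). Worked example:

100 * 90 / 60  # PaO2 = 90 mmHg at FiO2 = 60% gives a P/F ratio of 150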