In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from redcap import Project

Query data from REDCap

In [2]:
def get_redcap_data(token_url=None, api_url='https://redcap.vanderbilt.edu/api/', metadata=False, 
                    project=False, **kwargs):
    """Export records from a REDCap project as a pandas DataFrame.

    Parameters
    ----------
    token_url : str, optional
        Path to a file containing the API token. If None, the token is
        read interactively from stdin.
    api_url : str
        REDCap API endpoint.
    metadata : bool
        If True, also return the project metadata export.
    project : bool
        If True (and metadata is False), also return the Project object.
    **kwargs
        Passed through to ``Project.export_records``.

    Returns
    -------
    DataFrame, or (DataFrame, metadata), or (DataFrame, Project).
    """
    if token_url is None:
        print('Enter token:')
        api_key = input()
    else:
        # Use a context manager so the token file is closed promptly;
        # strip whitespace/newline that would otherwise corrupt the key.
        with open(token_url) as token_file:
            api_key = token_file.read().strip()

    # BUGFIX: the original assigned this to the name `project`, shadowing the
    # boolean `project` parameter — `elif project:` was then always truthy
    # (a Project instance), so the bare `return data` branch was unreachable.
    redcap_project = Project(api_url, api_key)

    data = redcap_project.export_records(format='df', **kwargs)

    if metadata:
        return data, redcap_project.export_metadata()
    elif project:
        return data, redcap_project
    return data

Main trial data

In [3]:
trial_data, trial_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_trial.txt", metadata=True)

Size of dataset (rows x columns)

In [4]:
trial_data.shape
Out[4]:
(46, 1871)

MiniBAL culture data

In [5]:
culture_data, culture_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_bal.txt", metadata=True)

Size of dataset (rows x columns)

In [6]:
culture_data.shape
Out[6]:
(46, 575)

BAL PCR data

In [7]:
pcr_data, pcr_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_pcr.txt", metadata=True)

Size of dataset (rows x columns)

In [8]:
pcr_data.shape
Out[8]:
(44, 1642)

Patient population

Number of individuals with complete (PCR and culture) and incomplete (one or the other) data

In [9]:
pcr_date_cols = pcr_data.columns[pcr_data.columns.str.contains('date')]
culture_date_cols = culture_data.columns[culture_data.columns.str.contains('date')]
In [10]:
culture_data['some_bal_data'] = (culture_data[culture_date_cols].notnull().sum(1)>0)
pcr_data['some_pcr_data'] = (pcr_data[pcr_date_cols].notnull().sum(1)>0)
In [11]:
(culture_data.join(pcr_data, lsuffix='-bal', rsuffix='-pcr')[['some_bal_data', 'some_pcr_data']]
     .sum(1).replace({1: 'One', 2: 'Both'}).value_counts())
Both    44
One      2
dtype: int64

Distribution of number of vent days

In [12]:
ax = trial_data.vent_days.hist()
ax.set_ylabel('Frequency')
ax.set_xlabel('Days on ventilator');

Statistical information regarding number of vent days

In [13]:
trial_data.vent_days.describe()
count    45.000000
mean     10.933333
std       7.566073
min       2.000000
25%       6.000000
50%       9.000000
75%      13.000000
max      38.000000
Name: vent_days, dtype: float64

Suspected VAP

In [14]:
VAP_lookup = {1: 'Pneumonia',
2: 'Infection (not pneumonia)',
3: 'Acute renal failure',
4: 'ARDS',
5: 'Cardiac arrest',
6: 'MI',
7: 'CVA',
8: 'DVT',
9: 'PE',
10: 'None of the above'}
In [15]:
trial_data['pneumonia'] = trial_data.morbidity___1

Summary of pneumonia

There are currently 23 individuals with pneumonia

In [16]:
trial_data.pneumonia.sum()
Out[16]:
23

This accounts for 50% of the sample

In [17]:
trial_data.pneumonia.mean()
Out[17]:
0.5

Patients taking antibiotics at time of admission:

In [18]:
trial_data.preadmit_abx.replace({1:'Yes', 2:'No', 3:'Unknown'}).value_counts()
No         39
Unknown     7
Name: preadmit_abx, dtype: int64

Patients administered antibiotics at some point during study:

In [19]:
trial_data.add_med_0.replace({1:'Yes', 0:'No'}).value_counts()
Yes    42
No      3
Name: add_med_0, dtype: int64
In [20]:
abx_cols = trial_data.columns[trial_data.columns.str.startswith('abx')]
In [21]:
abx_table = pd.wide_to_long(trial_data[abx_cols].assign(id=trial_data.index), 
                stubnames=['abx_generic_', 'abx_antifungals_', 'abx_start_', 'abx_stop_', 'abx_indication_', 
                           'abx_suspect_infect_source_', 'abx_known_infection_source_'], 
                i='id', j='abx_number').dropna(thresh=2).dropna(axis=1, thresh=100)
abx_table.columns = ['abx_name', 'antifungal', 'abx_start', 'abx_stop', 'abx_indication']
abx_table.abx_start = pd.to_datetime(abx_table.abx_start)
abx_table.abx_stop = pd.to_datetime(abx_table.abx_stop)
abx_table
abx_name antifungal abx_start abx_stop abx_indication
id abx_number
1001 1 Ampicillin-sulbactam 1.0 2016-03-27 2016-03-27 1.0
1002 1 Cefazolin 1.0 2016-04-04 2016-04-04 1.0
1003 1 Cefazolin 1.0 2016-04-14 2016-04-14 1.0
1004 1 Cefazolin 1.0 2016-04-18 2016-04-18 1.0
1005 1 Cefazolin 1.0 2016-05-06 2016-05-06 1.0
1006 1 Amoxicillin-k clavulanate 1.0 2016-05-16 2016-05-22 3.0
1007 1 Cefazolin 1.0 2016-05-17 2016-05-17 1.0
1008 1 Cefazolin 1.0 2016-05-23 2016-05-23 1.0
1010 1 Amoxicillin-k clavulanate 1.0 2016-06-26 2016-07-28 3.0
1011 1 Cefazolin 1.0 2016-06-20 2016-06-22 1.0
1012 1 Cefazolin 1.0 2016-06-25 2016-06-25 3.0
1013 1 Meropenem 1.0 2016-06-28 2016-07-01 2.0
1014 1 Cefazolin 1.0 2016-07-02 2016-07-02 1.0
1015 1 Cefazolin 1.0 2016-07-06 2016-07-06 1.0
1016 1 Cefazolin 1.0 2016-07-15 2016-07-18 3.0
1017 1 Cefazolin 1.0 2016-07-12 2016-07-14 1.0
1018 1 Ceftazidime 1.0 2016-07-20 2016-07-22 2.0
1019 1 Cefazolin 1.0 2016-07-20 2016-07-21 1.0
1020 1 Cefazolin 1.0 2016-07-24 2016-07-24 1.0
1021 1 Cefazolin 1.0 2016-07-27 2016-07-27 1.0
1022 1 Cefazolin 1.0 2016-08-09 2016-08-11 1.0
1023 1 Piperacillin-tazobactam 1.0 2016-08-16 2016-08-16 1.0
1024 1 Cefazolin 1.0 2016-08-20 2016-08-20 1.0
1025 1 Cefazolin 1.0 2016-08-24 2016-08-24 1.0
1026 1 Cefazolin 1.0 2016-09-01 2016-09-01 1.0
1027 1 Cefazolin 1.0 2016-09-22 2016-09-22 1.0
1028 1 Ceftazidime 1.0 2016-09-21 2016-09-29 2.0
1029 1 Cefazolin 1.0 2016-09-17 2016-09-17 1.0
1030 1 Cefazolin 1.0 2016-09-16 2016-09-18 1.0
1031 1 Cefazolin 1.0 2016-09-23 2016-09-24 1.0
... ... ... ... ... ... ...
1022 5 trimethoprim-sulfamethoxazole 1.0 2016-08-17 2016-08-22 3.0
1024 5 Vancomycin 1.0 2016-08-23 2016-09-01 2.0
1025 5 Vancomycin 1.0 2016-08-27 2016-08-29 2.0
1029 5 Levofloxacin 1.0 2016-09-20 2016-09-20 2.0
1037 5 Vancomycin 1.0 2016-10-28 2016-11-04 1.0
1038 5 Vancomycin 1.0 2016-12-11 2016-12-18 2.0
1039 5 Piperacillin-tazobactam 1.0 2016-12-09 2016-12-11 2.0
1042 5 Vancomycin 1.0 2017-01-15 2017-01-17 1.0
1044 5 Tobramycin 1.0 2017-01-31 2017-02-01 2.0
1045 5 Vancomycin 1.0 2017-02-01 2017-02-04 2.0
1001 6 Levofloxacin 1.0 2016-03-23 2016-03-24 1.0
1002 6 Vancomycin 1.0 2016-04-17 2016-04-19 2.0
1014 6 Vancomycin 1.0 2016-07-05 2016-07-10 2.0
1015 6 Vancomycin 1.0 2016-07-06 2016-07-07 1.0
1022 6 Tobramycin 1.0 2016-08-14 2016-08-14 3.0
1029 6 Piperacillin-tazobactam 1.0 2016-09-30 2016-10-03 2.0
1039 6 Vancomycin 1.0 2016-12-04 2016-12-04 1.0
1042 6 Fluconazole 2.0 2017-01-15 2017-01-15 1.0
1044 6 Vancomycin 1.0 2017-01-31 2017-02-02 2.0
1001 7 Meropenem 1.0 2016-04-02 2016-04-05 2.0
1014 7 Vancomycin 1.0 2016-07-13 2016-07-18 2.0
1015 7 Vancomycin 1.0 2016-07-15 2016-07-18 2.0
1022 7 Vancomycin 1.0 2016-08-14 2016-08-16 3.0
1029 7 Vancomycin 1.0 2016-09-20 2016-09-23 2.0
1039 7 Vancomycin 1.0 2016-12-09 2016-12-11 2.0
1042 7 Metronidazole 1.0 2017-01-15 2017-01-15 1.0
1001 8 Vancomycin 1.0 2016-03-23 2016-03-24 1.0
1022 8 Vancomycin 1.0 2016-08-21 2016-08-22 3.0
1029 8 Vancomycin 1.0 2016-09-30 2016-10-03 2.0
1001 9 Vancomycin 1.0 2016-04-02 2016-04-06 2.0

174 rows × 5 columns

In [22]:
abx_table.abx_name.value_counts()
Cefazolin                        47
Vancomycin                       39
Ceftazidime                      14
Levofloxacin                     14
Meropenem                        11
Ceftriaxone                       8
Piperacillin-tazobactam           8
Tobramycin                        5
Clindamycin                       5
Cefoxitin                         4
Ampicillin-sulbactam              3
Ciprofloxacin                     2
Cefepime                          2
Amoxicillin-k clavulanate         2
Linezolid                         2
Gentamicin                        1
Trimethoprim/sulfamethoxazole     1
Cefuroxime axetil                 1
Fluconazole                       1
Metronidazole                     1
Cephalexin                        1
trimethoprim-sulfamethoxazole     1
Cefdinir                          1
Name: abx_name, dtype: int64
In [23]:
trial_data
lab_mrn ic name mrn dob age ethnicity race sex height ... nutrition_goal_d14 urine_output_d14 spo2_d14 low_peep_d14 fio2_low_d14 gcs_d14 cxr_yn_d14 cxr_d14 ards_d14 pneumonia
record_id
1001 99019881, 99019865 [document] MCMURRAY, PHEAIRIS 40181455 1993-09-08 22 1 3 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1002 99020041 [document] HUTCHINSON, LYNN MARIE 40238339 1970-04-29 45 1 4 0 155.0 ... 1.0 2405.0 95.0 5.0 40.0 3.0 1.0 2.0 1.0 0
1003 99020323 [document] FOURAKRE, JAMES 40279804 1943-06-24 72 1 4 1 183.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1004 099020377 [document] BURCHAM, BETTY 36498293 1936-10-18 79 1 4 0 152.4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1005 99020801 [document] MCCUTCHEON, CARI 40385486 1987-11-26 28 1 4 0 177.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1006 99020869 [document] BASTANCHURY, RICHARD 40398752 1968-03-26 48 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1007 99021068 [document] FESSLER, HANNAH 40435208 1992-03-10 24 1 4 0 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1008 99021194 [document] KING, JAMES 40472474 1961-11-08 54 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1009 99021641 [document] BACON, THELMA 40550907 1935-05-02 81 1 4 0 158.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1010 99021664 [document] MORRIS, JESSE 40565244 1978-06-14 38 1 4 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1011 99021787 [document] ALLEN, NATASHA LYNN 40585937 1992-11-18 23 1 4 0 163.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1012 99021788 [document] ROBINSON, COREY 40581316 1993-12-22 22 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1013 99021975 [document] STEVENS, JOHN 40587511 1962-10-11 53 1 4 1 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1014 99022055 [document] MULLINS, DOUGLAS 40643975 1963-03-10 53 1 4 1 193.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1015 99022167 [document] COMPTON, KENNETH 40653602 1937-01-29 79 1 4 1 172.7 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1016 99022180 [document] RANSOM, KERRY EYULNN 40669319 1972-12-14 43 1 3 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1017 99022195 [document] SHERROD, KAYLA 40671133 1989-11-22 26 1 3 0 158.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1018 99022322 [document] NEAL, MATTHEW 40699902 1992-08-04 23 1 4 1 193.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1019 99022382 [document] COLLINS, JONATHAN 40713232 1993-01-12 23 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1020 99022427 [document] GROVES, BIRDIE 40729972 1934-05-30 82 1 4 0 168.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1021 99022487 [document] GENTRY, CHRISTINA 40736860 1966-07-11 50 1 4 0 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1022 99022756 [document] VINCENT, EMMETT EUGENE 40790834 1955-06-19 61 1 4 1 182.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1023 99022843 [document] THOMSON, MICHAEL 40821654 1968-01-13 48 1 4 1 182.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1024 99023016 [document] MCREYNOLDS, ROBERT ERIC 40852360 1950-03-11 66 1 3 1 185.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1025 99023134 [document] NEAL, JAMES 36041507 1960-11-02 55 1 4 1 178.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1026 99023292 [document] LOWHORN, ALTON 40897175 1968-08-09 48 1 4 1 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1027 99023538 [document] JIMENEZ, RAYNA 40954513 1992-10-09 23 1 4 0 157.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1028 99023569 [document] POTTER, STEVEN 40955700 1962-08-09 54 1 4 1 172.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1029 99023616 [document] MCCOIN, DANNY 40970675 1969-08-07 47 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1030 99023618 [document] MORRIS, WILLIAM PAUL 40960858 1970-07-16 46 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1031 99023683 [document] CHEEK, JASON 40973968 1980-01-02 36 1 4 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1032 99023899 [document] Sailliez, Jeffrey 41003393 1976-08-17 40 1 4 1 182.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1033 99024357 [document] Lloyd, David 41092511 1960-11-03 55 1 4 1 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1034 99024409 [document] MOORE, WESLEY 41099110 1957-10-13 59 1 4 1 183.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1035 99024481 [document] RICE, STEVEN 41120908 1959-04-13 57 1 4 1 178.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1036 99024534 [document] BARTON, MILEAH BROOKE 41123670 1987-06-16 29 1 4 0 165.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1037 99024666 [document] PILLOW, VICKI JEAN 41141656 1952-01-26 64 1 4 0 162.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1038 99025230 [document] LYNN, JEWELL 41269663 1943-08-21 73 1 4 0 152.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1039 99025306 [document] NEWMAN, JOSHUA 41291360 1990-11-16 26 1 4 1 185.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1040 99025443 [document] PROCK, TONYA 41293788 1969-12-03 47 1 4 0 152.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1041 NaN [document] ELROD, DENNIS 41382466 1978-09-18 38 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1042 99026014 [document] JASTRE, LAURA SAVANNAH 41452038 1988-04-08 28 1 4 0 152.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1043 99026078 [document] DAVIS, ADAM M 32417198 1980-03-22 36 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1044 99026259 [document] PIGG, JAMIE 41490277 1977-09-25 39 1 4 1 172.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1045 99026322 [document] GRIFFIN, SHAWN 41506536 1965-06-22 51 1 4 1 167.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1046 99027068 [document] CRAFTON, DANIEL L 41666090 1997-04-29 19 1 4 1 178.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0

46 rows × 1872 columns

Positive bronchoscopic BALs

In [24]:
def daily_data_to_long(dataset, pattern=r'_d\d+$', suffix=''):
    """Reshape per-study-day wide columns (e.g. ``hr_d1`` ... ``hr_d14``)
    into long format, one row per (patient, day).

    Parameters
    ----------
    dataset : DataFrame
        Wide-format data whose daily columns end in ``_dN`` (optionally
        followed by ``suffix``, e.g. ``_d3pcr``).
    pattern : str
        Regex (note the raw string: ``\\d`` is a regex digit class, not a
        Python escape) used to detect daily columns and infer the last day.
    suffix : str
        Literal text following the day number in column names.

    Returns
    -------
    DataFrame with the daily suffix stripped from column names plus a
    ``day`` column, index reset (original index becomes a column).
    """
    # Find the largest study-day number among the matching columns
    _tokens = dataset.columns[dataset.columns.str.contains(pattern, regex=True)].str.split('_').values
    last_day = np.array([''.join(filter(lambda x: x.isdigit(), t[-1])) for t in _tokens]).astype(int).max()

    all_days_data = []

    for i in range(last_day):

        print('Processing day', i + 1)
        study_day_cols = dataset.columns[dataset.columns.str.endswith('_d{0}{1}'.format(i + 1, suffix))]

        study_day_data = dataset[study_day_cols].copy()
        # Strip the trailing "_dN[suffix]" token so the same measurement
        # aligns on a common column name across days
        study_day_data.columns = ['_'.join(col.split('_')[:-1]) for col in study_day_cols]

        study_day_data['day'] = i + 1
        all_days_data.append(study_day_data)

    return pd.concat(all_days_data).reset_index()
In [25]:
bbal_long = daily_data_to_long(trial_data).query('bbal==1')
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14
In [26]:
bbal_cfu_cols = bbal_long.columns[bbal_long.columns.str.contains('cfu')  
                         & bbal_long.columns.str.startswith('bbal')]
bbal_pathogens_cols = bbal_long.columns[bbal_long.columns.str.contains('pathogen') 
                         & ~bbal_long.columns.str.contains('add')
                         & bbal_long.columns.str.startswith('culture')]

Encode CFU count ranges to ordinal variable

In [27]:
cfu_coding = {'<1,000': 0, 
              '<10,000': 1,
              '10,000 - 25,000': 2, 
              '25,000 - 50,000': 3,
              '50,000 - 100,000': 4,  
              '>100,000': 5}

Lookup table to translate species codes

In [28]:
bbal_path_lookup = {1: 'Staphylococcus aureus',
2: 'Streptococcus pneumonia',
3: 'Streptococcus Group B',
4: 'Acetinobacter baumannii',
5: 'Pseudomonas aeruginosa',
6: 'Haemophilus influenza',
7: 'Klebsiella pneumoniae',
8: 'Escherichia coli',
9: 'Enterobacter cloacae',
10: 'Stenotrophomonas maltophilia',
11: 'Enterobacter aerogenes',
12: 'Serratia marcescens',
13: 'Klebsiella oxytoca',
14: 'Proteus mirabilis',
15: 'Other'}
In [29]:
bbal_pathogens_long = (bbal_long[bbal_pathogens_cols.tolist() + ['record_id', 'day']]
             .pipe(pd.melt, id_vars=['record_id', 'day']))

# Strip out day from variable and replace pathogen code with name
bbal_pathogens = bbal_pathogens_long.assign(pathogen=bbal_pathogens_long.value.replace(bbal_path_lookup)
                         ).drop(['variable', 'value'], axis=1)
In [30]:
bbal_cfu_long = (bbal_long[bbal_cfu_cols.tolist() + ['record_id', 'day']]
             .pipe(pd.melt, id_vars=['record_id','day']))

# Make sure they are the same size!
assert bbal_pathogens_long.shape==bbal_cfu_long.shape
In [31]:
# Normalize scientific-notation count strings (e.g. "1.0E+05" -> "1.0e05")
# before casting to float. regex=False keeps '+' and '/' as literal
# characters — under pandas' old regex=True default a bare '+' is an
# invalid pattern, so the behavior was version-dependent.
bbal_pathogens['cfu_count'] = (bbal_cfu_long.value.astype(str)
                               .str.replace('E', 'e', regex=False)
                               .str.replace('+', '', regex=False)
                               .str.replace('/', '.', regex=False)
                               .astype(float))
bbal_pathogens_sorted = bbal_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
In [32]:
def fill_pathogens(x, labels, lookup):
    """Expand one (record_id, day) group so every pathogen in `lookup` has a row.

    Pathogens not observed in the group get rows filled with 0; the
    ``record_id`` and ``day`` columns are set from `labels` (the groupby
    key tuple) on every row.
    """
    record_id, day = labels[0], labels[1]
    observed = x.dropna().drop_duplicates(subset=['pathogen'])
    # Reindex against the full pathogen list so missing species appear as NaN rows
    full = observed.set_index('pathogen').reindex(list(lookup.values()))
    full = full.reset_index()
    full = full.assign(record_id=record_id, day=day)
    return full.fillna(0)
In [33]:
bbal_groups = []
for labels, group in bbal_pathogens_sorted.groupby(['record_id', 'day']):
    recid, day = labels
    group_full = fill_pathogens(group, labels, bbal_path_lookup)
    bbal_groups.append(group_full)
In [34]:
bbal_complete = pd.concat(bbal_groups).reset_index()
bbal_complete.to_csv('../data/clean/bbal_pathogens.csv')
In [35]:
bbal_complete.head()
index pathogen record_id day cfu_count
0 0 Staphylococcus aureus 1001 10 0.0
1 1 Streptococcus pneumonia 1001 10 0.0
2 2 Streptococcus Group B 1001 10 0.0
3 3 Acetinobacter baumannii 1001 10 0.0
4 4 Pseudomonas aeruginosa 1001 10 0.0
In [36]:
bbal_complete.cfu_count.hist()
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ceca940>
In [37]:
(bbal_complete.groupby('record_id').cfu_count.max()>=3).sum()
Out[37]:
17
In [38]:
(bbal_complete.groupby('record_id').cfu_count.max()==0).sum()
Out[38]:
7
In [39]:
ax = bbal_complete.groupby('record_id').cfu_count.max().hist(bins=range(8), align='left')
ax.set_xlabel('max. count')
ax.set_ylabel('frequency');

Trajectories of CFU counts from bronchoscopic BAL data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.

In [40]:
bbal_grid = sns.FacetGrid(bbal_complete, 
                     col='record_id', hue="pathogen", col_wrap=4, size=2.5)
bbal_grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
bbal_grid.add_legend()
Out[40]:
<seaborn.axisgrid.FacetGrid at 0x10d14dda0>

The number of patients with at least one positive bbal ($> 10^4$) in the dataset:

In [41]:
positive_bbal = bbal_cfu_long.fillna(0).groupby('record_id').apply(lambda x: (x.value>0).any())
positive_bbal.sum()
Out[41]:
21

Proportion patients with a positive bbal:

In [42]:
positive_bbal.mean()
Out[42]:
0.75

Convert trial data to long format

First need to reshape data to long format (one row per date of study per patient)

In [43]:
daily_long = daily_data_to_long(trial_data)
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14
In [44]:
daily_long.head()
record_id abg_yn amt_irrigant ards bbal bbal_add_pathogen3 bbal_add_pathogen4 bbal_add_pathogen_2 bbal_cfu bbal_cfu2 ... plt reintubated spo2 temperature urine_output vasopressin_rate vasopressin_yn vasopressors_yn vent_day wbc_count
0 1001 0.0 1.0 0.0 0.0 NaN NaN NaN NaN NaN ... 131.0 0.0 100.0 38.2 1800.0 NaN NaN 0.0 1.0 11.0
1 1002 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 219.0 0.0 NaN 37.1 1875.0 NaN 0.0 1.0 1.0 13.6
2 1003 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 94.0 0.0 NaN 37.9 1045.0 NaN 0.0 1.0 1.0 8.4
3 1004 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 307.0 0.0 NaN 39.3 1474.0 NaN NaN 0.0 1.0 14.1
4 1005 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 55.0 0.0 NaN 38.2 1550.0 NaN 0.0 1.0 1.0 15.4

5 rows × 106 columns

Drop rows with no date, columns with no data

In [45]:
data_long_complete = daily_long.dropna(subset=['date_study_day']).dropna(axis=1, thresh=1)
data_long_complete.head()
record_id abg_yn amt_irrigant ards bbal bbal_add_pathogen3 bbal_add_pathogen4 bbal_add_pathogen_2 bbal_cfu bbal_cfu2 ... plt reintubated spo2 temperature urine_output vasopressin_rate vasopressin_yn vasopressors_yn vent_day wbc_count
0 1001 0.0 1.0 0.0 0.0 NaN NaN NaN NaN NaN ... 131.0 0.0 100.0 38.2 1800.0 NaN NaN 0.0 1.0 11.0
1 1002 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 219.0 0.0 NaN 37.1 1875.0 NaN 0.0 1.0 1.0 13.6
2 1003 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 94.0 0.0 NaN 37.9 1045.0 NaN 0.0 1.0 1.0 8.4
3 1004 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 307.0 0.0 NaN 39.3 1474.0 NaN NaN 0.0 1.0 14.1
4 1005 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 55.0 0.0 NaN 38.2 1550.0 NaN 0.0 1.0 1.0 15.4

5 rows × 90 columns

PCR counts in mBAL and HME

In [46]:
# Raw string: \d is a regex digit class, not a Python escape sequence
# (a plain string here triggers an invalid-escape warning on modern Python)
pcr_pattern = r'_d\d+pcr$'
pcr_data.columns.str.contains(pcr_pattern, regex=True).sum()
Out[46]:
1642
In [47]:
pcr_data_long = daily_data_to_long(pcr_data, pattern=pcr_pattern, suffix='pcr')
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14

Drop rows with no date or no PCR result, and columns with no data

In [48]:
pcr_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'mbal_pcr_result']).dropna(axis=1, thresh=1)
pcr_long_complete['date_study_day'] = pd.to_datetime(pcr_long_complete['date_study_day'])
pcr_long_complete.head()
record_id bbal_add_path2 bbal_add_path3 bbal_collected bbal_copies1 bbal_path1 bbal_pcr_result bhme_add_path2 bhme_add_path3 bhme_add_path4 ... mbal_copies6 mbal_copies7 mbal_path1 mbal_path2 mbal_path3 mbal_path4 mbal_path5 mbal_path6 mbal_path7 mbal_pcr_result
1 1003 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
3 1005 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
5 1007 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN 2.0 13.0 6.0 11.0 NaN NaN NaN 1.0
6 1008 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
7 1009 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN 1.0 NaN NaN NaN NaN NaN NaN 1.0

5 rows × 67 columns

Bacterial counts from mBAL over time

Plotting time series for mini-BAL PCR counts.

In [49]:
mbal_pcr_copies_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('copies')  
                         & pcr_long_complete.columns.str.startswith('mbal')]
mbal_pcr_pathogens_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('path') 
                         & ~pcr_long_complete.columns.str.contains('add')
                         & pcr_long_complete.columns.str.startswith('mbal')]

Lookup table to translate species codes

In [50]:
pcr_path_lookup = {1: 'Staphylococcus aureus',
2: 'Streptococcus pneumonia',
3: 'Streptococcus Group B',
4: 'Acetinobacter baumannii',
5: 'Pseudomonas aeruginosa',
6: 'Haemophilus influenza',
7: 'Klebsiella pneumoniae',
8: 'Escherichia coli',
9: 'Enterobacter cloacae',
10: 'Stenotrophomonas maltophilia',
11: 'Enterobacter aerogenes',
12: 'Serratia marcescens',
13: 'Klebsiella oxytoca',
14: 'Proteus mirabilis',
15: 'Enterococcus faecalis',
16: 'Enterococcus faecium',
17: 'Candida albicans',
18: 'Other'}

Convert pathogens table from wide to long format

In [51]:
mbal_pcr_pathogens_long = (pcr_long_complete[mbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']]
                        .pipe(pd.melt, id_vars=['record_id', 'day']))

# Strip out day from variable and replace pathogen code with name
mbal_pcr_pathogens = mbal_pcr_pathogens_long.assign(pathogen=mbal_pcr_pathogens_long.value.replace(pcr_path_lookup)
                         ).drop(['variable', 'value'], axis=1)

Convert counts table from wide to long format

In [52]:
mbal_pcr_counts_long = (pcr_long_complete[mbal_pcr_copies_cols.tolist() + ['record_id', 'day']]
             .pipe(pd.melt, id_vars=['record_id','day']))

# Make sure they are the same size!
assert mbal_pcr_pathogens_long.shape==mbal_pcr_counts_long.shape

Append count onto pathogens table

In [53]:
# Normalize scientific-notation count strings (e.g. "1.0E+05" -> "1.0e05")
# before casting to float. regex=False keeps '+' and '/' as literal
# characters — under pandas' old regex=True default a bare '+' is an
# invalid pattern, so the behavior was version-dependent.
mbal_pcr_pathogens['pcr_count'] = (mbal_pcr_counts_long.value.astype(str)
                                   .str.replace('E', 'e', regex=False)
                                   .str.replace('+', '', regex=False)
                                   .str.replace('/', '.', regex=False)
                                   .astype(float))
mbal_pcr_pathogens_sorted = mbal_pcr_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
In [54]:
mbal_pcr_groups = []
for labels, group in mbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    recid, day = labels
    group_full = fill_pathogens(group, labels, pcr_path_lookup)
    mbal_pcr_groups.append(group_full)
In [55]:
mbal_pcr_complete = pd.concat(mbal_pcr_groups).reset_index()
mbal_pcr_complete.to_csv('../data/clean/mbal_pcr_pathogens.csv')

mbal_pcr_complete.head()
index pathogen record_id day pcr_count
0 0 Staphylococcus aureus 1002 4 0.0
1 1 Streptococcus pneumonia 1002 4 0.0
2 2 Streptococcus Group B 1002 4 0.0
3 3 Acetinobacter baumannii 1002 4 0.0
4 4 Pseudomonas aeruginosa 1002 4 0.0

Trajectories of PCR counts from mini-BAL data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.

In [56]:
abx_table.ix[1002]
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel/__main__.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':
abx_name antifungal abx_start abx_stop abx_indication
abx_number
1 Cefazolin 1.0 2016-04-04 2016-04-04 1.0
2 Cefazolin 1.0 2016-04-11 2016-04-12 1.0
3 Meropenem 1.0 2016-04-12 2016-05-15 2.0
4 Meropenem 1.0 2016-04-17 2016-05-20 2.0
5 Vancomycin 1.0 2016-04-12 2016-04-15 2.0
6 Vancomycin 1.0 2016-04-17 2016-04-19 2.0
In [57]:
def vertical_abx_line(x, count_data=pcr_long_complete, **kwargs):
    """Overlay dotted vertical lines at antibiotic start days for one patient.

    Intended for use with FacetGrid.map, passed the 'record_id' column.

    Parameters
    ----------
    x : Series of record_id values for the current facet (all identical);
        .min() recovers the scalar patient id.
    count_data : table providing 'date_study_day' and 'day', used to convert
        calendar dates into study-day offsets (bound at definition time).
    **kwargs : forwarded to plt.axvline.
    """
    record_id = x.min()
    # .loc replaces the deprecated .ix used in the original cell
    start_dates = np.unique(abx_table.loc[record_id].abx_start.dt.date.values)
    first_date = count_data.loc[count_data.record_id == record_id,
                                ['date_study_day', 'day']].min()
    # Back out the calendar date of study day 0 for this patient
    reference_date = first_date.date_study_day - pd.Timedelta('{} days'.format(first_date.day))
    if np.any(start_dates):
        for date in start_dates:
            plt.axvline(x=(pd.Timestamp(date) - reference_date).days,
                        ls=':', c='grey', **kwargs)
In [58]:
# One panel per patient, one colored line per pathogen;
# log(count + 1) keeps zero counts on the axis.
sns.set_style('ticks')
mbal_plot_data = mbal_pcr_complete.assign(
    log_count=np.log(mbal_pcr_complete.pcr_count + 1))
grid = sns.FacetGrid(mbal_plot_data, col='record_id', hue="pathogen",
                     col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
# Dotted lines mark antibiotic start days
grid.map(vertical_abx_line, 'record_id', alpha=0.2)
grid.add_legend()
Out[58]:
<seaborn.axisgrid.FacetGrid at 0x10d54cdd8>

bBAL PCR counts

In [59]:
# Keep rows that have both a study-day date and a bBAL PCR result,
# then drop columns that are entirely empty.
bbal_long_complete = pcr_data_long.dropna(
    subset=['date_study_day', 'bbal_pcr_result']).dropna(axis=1, thresh=1)
bbal_long_complete.head()
record_id bbal_add_path2 bbal_add_path3 bbal_add_path4 bbal_add_path5 bbal_add_path6 bbal_add_path7 bbal_collected bbal_copies1 bbal_copies2 ... hme_path2 hme_path3 hme_pcr_result mbal_add_path2 mbal_add_path3 mbal_collected mbal_copies1 mbal_path1 mbal_path2 mbal_pcr_result
11 1013 1.0 0.0 NaN NaN NaN NaN 1.0 330000.0 1060000.0 ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN
37 1039 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN
39 1041 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN
41 1043 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN 2.0 NaN NaN 1.0 NaN NaN NaN 2.0
56 1014 1.0 0.0 NaN NaN NaN NaN 1.0 447000.0 1040000.0 ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN

5 rows × 51 columns

In [60]:
# Column groups for the bBAL sample: copy-number columns, and pathogen-code
# columns excluding the 'add'itional-pathogen fields.
bbal_cols = bbal_long_complete.columns
is_bbal = bbal_cols.str.startswith('bbal')
bbal_pcr_copies_cols = bbal_cols[bbal_cols.str.contains('copies') & is_bbal]
bbal_pcr_pathogens_cols = bbal_cols[bbal_cols.str.contains('path')
                                    & ~bbal_cols.str.contains('add')
                                    & is_bbal]

Convert pathogens table from wide to long format

In [61]:
# Reshape the bBAL pathogen-code columns from wide to long format,
# then map the numeric codes to pathogen names.
bbal_pcr_pathogens_long = pd.melt(
    bbal_long_complete[bbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

bbal_pcr_pathogens = (bbal_pcr_pathogens_long
                      .assign(pathogen=bbal_pcr_pathogens_long.value.replace(pcr_path_lookup))
                      .drop(['variable', 'value'], axis=1))

Convert counts table from wide to long format

In [62]:
# Reshape the bBAL copy-number columns from wide to long format.
bbal_pcr_counts_long = pd.melt(
    bbal_long_complete[bbal_pcr_copies_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

# Pathogen and count tables must align row-for-row before they are joined below
assert bbal_pcr_pathogens_long.shape == bbal_pcr_counts_long.shape

Append count onto pathogens table

In [63]:
# Normalise count strings (scientific notation like '5.47E+08' -> '5.47e08',
# and a data-entry typo '/' -> '.') so they parse cleanly as floats,
# then attach the numeric counts to the pathogen table.
bbal_count_strings = bbal_pcr_counts_long.value.astype(str)
bbal_pcr_pathogens['pcr_count'] = (bbal_count_strings
                                   .str.replace('E', 'e')
                                   .str.replace('+', '')
                                   .str.replace('/', '.')
                                   .astype(float))
bbal_pcr_pathogens_sorted = (bbal_pcr_pathogens
                             .sort_values(by=['record_id', 'day'])
                             .reset_index(drop=True))
In [64]:
# Fill in the pathogens that were not detected for each (patient, day) group,
# so every group carries the complete pathogen panel.
bbal_pcr_groups = []
for labels, group in bbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    # `labels` is the (record_id, day) tuple for this group; the original
    # `recid, day = labels` unpacking was unused dead code and is removed.
    bbal_pcr_groups.append(fill_pathogens(group, labels, pcr_path_lookup))
In [65]:
# Stitch the filled per-(patient, day) groups back into one table and
# persist it for downstream analysis.
# NOTE(review): reset_index() without drop=True keeps the old group-local
# index as an 'index' column (visible in .head() below) — reset_index(drop=True)
# was probably intended; left unchanged to preserve the saved CSV layout.
bbal_pcr_complete = pd.concat(bbal_pcr_groups).reset_index()
bbal_pcr_complete.to_csv('../data/clean/bbal_pcr_pathogens.csv')

bbal_pcr_complete.head()
index pathogen record_id day pcr_count
0 0 Staphylococcus aureus 1011 6 591000.0
1 1 Streptococcus pneumonia 1011 6 0.0
2 2 Streptococcus Group B 1011 6 0.0
3 3 Acetinobacter baumannii 1011 6 0.0
4 4 Pseudomonas aeruginosa 1011 6 0.0
In [66]:
sns.set_style('ticks')
# Small Gaussian jitter separates overlapping zero-count trajectories.
# The original cell drew from the global (unseeded) RNG, so the figure
# changed on every re-run; a seeded RandomState makes it reproducible.
jitter_rng = np.random.RandomState(42)
jitter = 0.1 * jitter_rng.randn(bbal_pcr_complete.shape[0])
grid = sns.FacetGrid(bbal_pcr_complete.assign(
                         log_count=np.log(bbal_pcr_complete.pcr_count + 1 + jitter)),
                     col='record_id', hue="pathogen",
                     col_wrap=4, size=2.5, palette=sns.color_palette("hls", 18))
grid.map(plt.plot, "day", "log_count", marker="o", ms=4, alpha=0.5)
grid.add_legend()
Out[66]:
<seaborn.axisgrid.FacetGrid at 0x119ab7358>

HME PCR counts

In [67]:
# Keep rows that have both a study-day date and an HME PCR result,
# then drop columns that are entirely empty.
hme_long_complete = pcr_data_long.dropna(
    subset=['date_study_day', 'hme_pcr_result']).dropna(axis=1, thresh=1)
hme_long_complete.head()
record_id bbal_add_path2 bbal_add_path3 bbal_collected bbal_copies1 bbal_path1 bbal_pcr_result bhme_add_path2 bhme_add_path3 bhme_add_path4 ... mbal_copies2 mbal_copies3 mbal_copies4 mbal_copies5 mbal_path1 mbal_path2 mbal_path3 mbal_path4 mbal_path5 mbal_pcr_result
1 1003 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
5 1007 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... 299000.0 2520000.0 15700000.0 NaN 2.0 13.0 6.0 11.0 NaN 1.0
6 1008 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
7 1009 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1.0 NaN NaN NaN NaN 1.0
10 1012 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... 557000.0 77800.0 NaN NaN 1.0 13.0 6.0 NaN NaN 1.0

5 rows × 62 columns

In [68]:
# Column groups for the HME sample: copy-number columns, and pathogen-code
# columns excluding the 'add'itional-pathogen fields.
hme_cols = hme_long_complete.columns
is_hme = hme_cols.str.startswith('hme')
hme_copies_cols = hme_cols[hme_cols.str.contains('copies') & is_hme]
hme_pathogens_cols = hme_cols[hme_cols.str.contains('path')
                              & ~hme_cols.str.contains('add')
                              & is_hme]

Convert pathogens and counts from wide to long format

In [69]:
# Reshape the HME pathogen-code columns from wide to long format,
# then map the numeric codes to pathogen names.
hme_pathogens_long = pd.melt(
    hme_long_complete[hme_pathogens_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

hme_pathogens = (hme_pathogens_long
                 .assign(pathogen=hme_pathogens_long.value.replace(pcr_path_lookup))
                 .drop(['variable', 'value'], axis=1))
In [70]:
# Reshape the HME copy-number columns from wide to long format.
hme_counts_long = pd.melt(
    hme_long_complete[hme_copies_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

# Pathogen and count tables must align row-for-row before they are joined below
assert hme_pathogens_long.shape == hme_counts_long.shape

Replace bad values

In [71]:
hme_counts_long.value[~hme_counts_long.value.apply(np.isreal)]
Series([], Name: value, dtype: float64)
In [72]:
hme_counts_long.loc[47, 'value'] = 5.47E+08

Append count onto pathogens table

In [73]:
# Normalise count strings (scientific notation like '5.47E+08' -> '5.47e08',
# and a data-entry typo '/' -> '.') so they parse cleanly as floats,
# then attach the numeric counts to the pathogen table.
hme_count_strings = hme_counts_long.value.astype(str)
hme_pathogens['pcr_count'] = (hme_count_strings
                              .str.replace('E', 'e')
                              .str.replace('+', '')
                              .str.replace('/', '.')
                              .astype(float))
hme_pathogens_sorted = (hme_pathogens
                        .sort_values(by=['record_id', 'day'])
                        .reset_index(drop=True))
In [74]:
# Fill in the pathogens that were not detected for each (patient, day) group,
# so every group carries the complete pathogen panel.
hme_groups = []
for labels, group in hme_pathogens_sorted.groupby(['record_id', 'day']):
    # `labels` is the (record_id, day) tuple for this group; the original
    # `recid, day = labels` unpacking was unused dead code and is removed.
    hme_groups.append(fill_pathogens(group, labels, pcr_path_lookup))
In [75]:
# Stitch the filled per-(patient, day) groups back into one table and
# persist it for downstream analysis.
# NOTE(review): reset_index() without drop=True keeps the old group-local
# index as an 'index' column (visible in .head() below) — reset_index(drop=True)
# was probably intended; left unchanged to preserve the saved CSV layout.
hme_complete = pd.concat(hme_groups).reset_index()
hme_complete.to_csv('../data/clean/hme_pathogens.csv')

hme_complete.head()
index pathogen record_id day pcr_count
0 0 Staphylococcus aureus 1002 5 0.0
1 1 Streptococcus pneumonia 1002 5 0.0
2 2 Streptococcus Group B 1002 5 0.0
3 3 Acetinobacter baumannii 1002 5 0.0
4 4 Pseudomonas aeruginosa 1002 5 0.0

Trajectories of PCR counts from HME data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.

In [76]:
# One panel per patient, one colored line per pathogen;
# log(count + 1) keeps zero counts on the axis.
hme_plot_data = hme_complete.assign(log_count=np.log(hme_complete.pcr_count + 1))
hme_grid = sns.FacetGrid(hme_plot_data, col='record_id', hue="pathogen",
                         col_wrap=4, size=2.5)
hme_grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
hme_grid.add_legend()
Out[76]:
<seaborn.axisgrid.FacetGrid at 0x10d54ca20>

Mini-BAL culture CFU counts

In [77]:
# Same pathogen list as bBAL — the mini-BAL culture form encodes pathogens
# with the same code-to-name map, so the lookup is simply aliased.
mbal_path_lookup = bbal_path_lookup
In [78]:
culture_long = daily_data_to_long(culture_data)
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14

Drop rows with no date or no culture test, and columns with no data

In [79]:
culture_long_complete =  culture_long.dropna(subset=['date_study_day', 'mbal_culture_test']).dropna(axis=1, thresh=1)
In [80]:
# Column groups for the mini-BAL culture: CFU-count columns, and pathogen
# columns excluding the 'add'itional-pathogen fields.
culture_cols = culture_long_complete.columns
is_mbal = culture_cols.str.startswith('mbal')
culture_cfu_cols = culture_cols[culture_cols.str.contains('cfu') & is_mbal]
culture_pathogens_cols = culture_cols[culture_cols.str.contains('pathogen')
                                      & ~culture_cols.str.contains('add')
                                      & is_mbal]
In [81]:
# Reshape the culture pathogen columns from wide to long format,
# then map the numeric codes to pathogen names.
culture_pathogens_long = pd.melt(
    culture_long_complete[culture_pathogens_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

culture_pathogens = (culture_pathogens_long
                     .assign(pathogen=culture_pathogens_long.value.replace(mbal_path_lookup))
                     .drop(['variable', 'value'], axis=1))
In [82]:
# Reshape the CFU-count columns from wide to long format.
culture_counts_long = pd.melt(
    culture_long_complete[culture_cfu_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

# Pathogen and count tables must align row-for-row before they are joined below
assert culture_pathogens_long.shape == culture_counts_long.shape

Append CFU count onto pathogens table

In [83]:
# Normalise count strings (scientific notation like '5.47E+08' -> '5.47e08',
# and a data-entry typo '/' -> '.') so they parse cleanly as floats,
# then attach the numeric CFU counts to the pathogen table.
cfu_strings = culture_counts_long.value.astype(str)
culture_pathogens['cfu_count'] = (cfu_strings
                                  .str.replace('E', 'e')
                                  .str.replace('+', '')
                                  .str.replace('/', '.')
                                  .astype(float))
culture_pathogens_sorted = (culture_pathogens
                            .sort_values(by=['record_id', 'day'])
                            .reset_index(drop=True))
In [84]:
# Fill in the pathogens that were not detected for each (patient, day) group,
# so every group carries the complete pathogen panel.
# (The generic name `groups` is kept because the next cell reads it.)
groups = []
for labels, group in culture_pathogens_sorted.groupby(['record_id', 'day']):
    # `labels` is the (record_id, day) tuple for this group; the original
    # `recid, day = labels` unpacking was unused dead code and is removed.
    groups.append(fill_pathogens(group, labels, mbal_path_lookup))
In [85]:
# Stitch the filled per-(patient, day) groups back into one table and
# persist it for downstream analysis.
# NOTE(review): reset_index() without drop=True keeps the old group-local
# index as an 'index' column — reset_index(drop=True) was probably intended;
# left unchanged to preserve the saved CSV layout.
culture_complete = pd.concat(groups).reset_index()
culture_complete.to_csv('../data/clean/culture_pathogens.csv')

Trajectories of culture counts from mini-BAL data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.

In [86]:
# One panel per patient, one colored line per pathogen;
# raw CFU counts on the y-axis (not log-scaled here).
grid = sns.FacetGrid(culture_complete, col='record_id', hue="pathogen",
                     col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
grid.add_legend()
Out[86]:
<seaborn.axisgrid.FacetGrid at 0x11defaa20>