In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from redcap import Project

Query data from REDCap

In [2]:
def get_redcap_data(token_url=None, api_url='https://redcap.vanderbilt.edu/api/', metadata=False, 
                    project=False, **kwargs):
    """Export records from a REDCap project as a pandas DataFrame.

    Parameters
    ----------
    token_url : str, optional
        Path to a file containing the API token. If None, the token is
        read interactively from stdin.
    api_url : str
        REDCap API endpoint.
    metadata : bool
        If True, also return the project metadata export.
    project : bool
        If True (and metadata is False), also return the Project object.
    **kwargs
        Passed through to ``Project.export_records``.

    Returns
    -------
    DataFrame, or (DataFrame, metadata), or (DataFrame, Project).
    """
    if token_url is None:
        print('Enter token:')
        api_key = input()
    else:
        # Use a context manager so the token file is closed promptly;
        # strip whitespace/newline that would otherwise corrupt the key.
        with open(token_url) as token_file:
            api_key = token_file.read().strip()

    # BUGFIX: the original assigned this to the name `project`, shadowing the
    # boolean `project` parameter — `elif project:` was then always truthy
    # (a Project instance), so the bare `return data` branch was unreachable.
    redcap_project = Project(api_url, api_key)

    data = redcap_project.export_records(format='df', **kwargs)

    if metadata:
        return data, redcap_project.export_metadata()
    elif project:
        return data, redcap_project
    return data

Main trial data

In [3]:
trial_data, trial_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_trial.txt", metadata=True)

Size of dataset (rows x columns)

In [4]:
trial_data.shape
Out[4]:
(46, 1871)

MiniBAL culture data

In [5]:
culture_data, culture_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_bal.txt", metadata=True)

Size of dataset (rows x columns)

In [6]:
culture_data.shape
Out[6]:
(46, 575)

BAL PCR data

In [7]:
pcr_data, pcr_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_pcr.txt", metadata=True)

Size of dataset (rows x columns)

In [8]:
pcr_data.shape
Out[8]:
(44, 1642)

Patient population

Number of individuals with complete (PCR and culture) and incomplete (one or the other) data

In [9]:
pcr_date_cols = pcr_data.columns[pcr_data.columns.str.contains('date')]
culture_date_cols = culture_data.columns[culture_data.columns.str.contains('date')]
In [10]:
culture_data['some_bal_data'] = (culture_data[culture_date_cols].notnull().sum(1)>0)
pcr_data['some_pcr_data'] = (pcr_data[pcr_date_cols].notnull().sum(1)>0)
In [11]:
(culture_data.join(pcr_data, lsuffix='-bal', rsuffix='-pcr')[['some_bal_data', 'some_pcr_data']]
     .sum(1).replace({1: 'One', 2: 'Both'}).value_counts())
Both    44
One      2
dtype: int64

Distribution of number of vent days

In [12]:
ax = trial_data.vent_days.hist()
ax.set_ylabel('Frequency')
ax.set_xlabel('Days on ventilator');

Statistical information regarding number of vent days

In [13]:
trial_data.vent_days.describe()
count    45.000000
mean     10.933333
std       7.566073
min       2.000000
25%       6.000000
50%       9.000000
75%      13.000000
max      38.000000
Name: vent_days, dtype: float64

Suspected VAP

In [14]:
VAP_lookup = {1: 'Pneumonia',
2: 'Infection (not pneumonia)',
3: 'Acute renal failure',
4: 'ARDS',
5: 'Cardiac arrest',
6: 'MI',
7: 'CVA',
8: 'DVT',
9: 'PE',
10: 'None of the above'}
In [15]:
trial_data['pneumonia'] = trial_data.morbidity___1

Summary of pneumonia

There are currently 23 individuals with pneumonia

In [16]:
trial_data.pneumonia.sum()
Out[16]:
23

This accounts for 50% of the sample

In [17]:
trial_data.pneumonia.mean()
Out[17]:
0.5

Patients taking antibiotics at time of admission:

In [18]:
trial_data.preadmit_abx.replace({1:'Yes', 2:'No', 3:'Unknown'}).value_counts()
No         39
Unknown     7
Name: preadmit_abx, dtype: int64

Patients administered antibiotics at some point during study:

In [19]:
trial_data.add_med_0.replace({1:'Yes', 0:'No'}).value_counts()
Yes    42
No      3
Name: add_med_0, dtype: int64
In [20]:
abx_cols = trial_data.columns[trial_data.columns.str.startswith('abx')]
In [21]:
abx_table = pd.wide_to_long(trial_data[abx_cols].assign(id=trial_data.index), 
                stubnames=['abx_generic_', 'abx_antifungals_', 'abx_start_', 'abx_stop_', 'abx_indication_', 
                           'abx_suspect_infect_source_', 'abx_known_infection_source_'], 
                i='id', j='abx_number').dropna(thresh=2).dropna(axis=1, thresh=100)
abx_table.columns = ['abx_name', 'antifungal', 'abx_start', 'abx_stop', 'abx_indication']
abx_table.abx_start = pd.to_datetime(abx_table.abx_start)
abx_table.abx_stop = pd.to_datetime(abx_table.abx_stop)
abx_table
abx_name antifungal abx_start abx_stop abx_indication
id abx_number
1001 1 Ampicillin-sulbactam 1.0 2016-03-27 2016-03-27 1.0
1002 1 Cefazolin 1.0 2016-04-04 2016-04-04 1.0
1003 1 Cefazolin 1.0 2016-04-14 2016-04-14 1.0
1004 1 Cefazolin 1.0 2016-04-18 2016-04-18 1.0
1005 1 Cefazolin 1.0 2016-05-06 2016-05-06 1.0
1006 1 Amoxicillin-k clavulanate 1.0 2016-05-16 2016-05-22 3.0
1007 1 Cefazolin 1.0 2016-05-17 2016-05-17 1.0
1008 1 Cefazolin 1.0 2016-05-23 2016-05-23 1.0
1010 1 Amoxicillin-k clavulanate 1.0 2016-06-26 2016-07-28 3.0
1011 1 Cefazolin 1.0 2016-06-20 2016-06-22 1.0
1012 1 Cefazolin 1.0 2016-06-25 2016-06-25 3.0
1013 1 Meropenem 1.0 2016-06-28 2016-07-01 2.0
1014 1 Cefazolin 1.0 2016-07-02 2016-07-02 1.0
1015 1 Cefazolin 1.0 2016-07-06 2016-07-06 1.0
1016 1 Cefazolin 1.0 2016-07-15 2016-07-18 3.0
1017 1 Cefazolin 1.0 2016-07-12 2016-07-14 1.0
1018 1 Ceftazidime 1.0 2016-07-20 2016-07-22 2.0
1019 1 Cefazolin 1.0 2016-07-20 2016-07-21 1.0
1020 1 Cefazolin 1.0 2016-07-24 2016-07-24 1.0
1021 1 Cefazolin 1.0 2016-07-27 2016-07-27 1.0
1022 1 Cefazolin 1.0 2016-08-09 2016-08-11 1.0
1023 1 Piperacillin-tazobactam 1.0 2016-08-16 2016-08-16 1.0
1024 1 Cefazolin 1.0 2016-08-20 2016-08-20 1.0
1025 1 Cefazolin 1.0 2016-08-24 2016-08-24 1.0
1026 1 Cefazolin 1.0 2016-09-01 2016-09-01 1.0
1027 1 Cefazolin 1.0 2016-09-22 2016-09-22 1.0
1028 1 Ceftazidime 1.0 2016-09-21 2016-09-29 2.0
1029 1 Cefazolin 1.0 2016-09-17 2016-09-17 1.0
1030 1 Cefazolin 1.0 2016-09-16 2016-09-18 1.0
1031 1 Cefazolin 1.0 2016-09-23 2016-09-24 1.0
... ... ... ... ... ... ...
1022 5 trimethoprim-sulfamethoxazole 1.0 2016-08-17 2016-08-22 3.0
1024 5 Vancomycin 1.0 2016-08-23 2016-09-01 2.0
1025 5 Vancomycin 1.0 2016-08-27 2016-08-29 2.0
1029 5 Levofloxacin 1.0 2016-09-20 2016-09-20 2.0
1037 5 Vancomycin 1.0 2016-10-28 2016-11-04 1.0
1038 5 Vancomycin 1.0 2016-12-11 2016-12-18 2.0
1039 5 Piperacillin-tazobactam 1.0 2016-12-09 2016-12-11 2.0
1042 5 Vancomycin 1.0 2017-01-15 2017-01-17 1.0
1044 5 Tobramycin 1.0 2017-01-31 2017-02-01 2.0
1045 5 Vancomycin 1.0 2017-02-01 2017-02-04 2.0
1001 6 Levofloxacin 1.0 2016-03-23 2016-03-24 1.0
1002 6 Vancomycin 1.0 2016-04-17 2016-04-19 2.0
1014 6 Vancomycin 1.0 2016-07-05 2016-07-10 2.0
1015 6 Vancomycin 1.0 2016-07-06 2016-07-07 1.0
1022 6 Tobramycin 1.0 2016-08-14 2016-08-14 3.0
1029 6 Piperacillin-tazobactam 1.0 2016-09-30 2016-10-03 2.0
1039 6 Vancomycin 1.0 2016-12-04 2016-12-04 1.0
1042 6 Fluconazole 2.0 2017-01-15 2017-01-15 1.0
1044 6 Vancomycin 1.0 2017-01-31 2017-02-02 2.0
1001 7 Meropenem 1.0 2016-04-02 2016-04-05 2.0
1014 7 Vancomycin 1.0 2016-07-13 2016-07-18 2.0
1015 7 Vancomycin 1.0 2016-07-15 2016-07-18 2.0
1022 7 Vancomycin 1.0 2016-08-14 2016-08-16 3.0
1029 7 Vancomycin 1.0 2016-09-20 2016-09-23 2.0
1039 7 Vancomycin 1.0 2016-12-09 2016-12-11 2.0
1042 7 Metronidazole 1.0 2017-01-15 2017-01-15 1.0
1001 8 Vancomycin 1.0 2016-03-23 2016-03-24 1.0
1022 8 Vancomycin 1.0 2016-08-21 2016-08-22 3.0
1029 8 Vancomycin 1.0 2016-09-30 2016-10-03 2.0
1001 9 Vancomycin 1.0 2016-04-02 2016-04-06 2.0

174 rows × 5 columns

In [22]:
abx_table.abx_name.value_counts()
Cefazolin                        47
Vancomycin                       39
Ceftazidime                      14
Levofloxacin                     14
Meropenem                        11
Ceftriaxone                       8
Piperacillin-tazobactam           8
Tobramycin                        5
Clindamycin                       5
Cefoxitin                         4
Ampicillin-sulbactam              3
Ciprofloxacin                     2
Cefepime                          2
Amoxicillin-k clavulanate         2
Linezolid                         2
Gentamicin                        1
Trimethoprim/sulfamethoxazole     1
Cefuroxime axetil                 1
Fluconazole                       1
Metronidazole                     1
Cephalexin                        1
trimethoprim-sulfamethoxazole     1
Cefdinir                          1
Name: abx_name, dtype: int64
In [23]:
trial_data
lab_mrn ic name mrn dob age ethnicity race sex height ... nutrition_goal_d14 urine_output_d14 spo2_d14 low_peep_d14 fio2_low_d14 gcs_d14 cxr_yn_d14 cxr_d14 ards_d14 pneumonia
record_id
1001 99019881, 99019865 [document] MCMURRAY, PHEAIRIS 40181455 1993-09-08 22 1 3 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1002 99020041 [document] HUTCHINSON, LYNN MARIE 40238339 1970-04-29 45 1 4 0 155.0 ... 1.0 2405.0 95.0 5.0 40.0 3.0 1.0 2.0 1.0 0
1003 99020323 [document] FOURAKRE, JAMES 40279804 1943-06-24 72 1 4 1 183.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1004 099020377 [document] BURCHAM, BETTY 36498293 1936-10-18 79 1 4 0 152.4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1005 99020801 [document] MCCUTCHEON, CARI 40385486 1987-11-26 28 1 4 0 177.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1006 99020869 [document] BASTANCHURY, RICHARD 40398752 1968-03-26 48 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1007 99021068 [document] FESSLER, HANNAH 40435208 1992-03-10 24 1 4 0 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1008 99021194 [document] KING, JAMES 40472474 1961-11-08 54 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1009 99021641 [document] BACON, THELMA 40550907 1935-05-02 81 1 4 0 158.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1010 99021664 [document] MORRIS, JESSE 40565244 1978-06-14 38 1 4 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1011 99021787 [document] ALLEN, NATASHA LYNN 40585937 1992-11-18 23 1 4 0 163.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1012 99021788 [document] ROBINSON, COREY 40581316 1993-12-22 22 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1013 99021975 [document] STEVENS, JOHN 40587511 1962-10-11 53 1 4 1 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1014 99022055 [document] MULLINS, DOUGLAS 40643975 1963-03-10 53 1 4 1 193.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1015 99022167 [document] COMPTON, KENNETH 40653602 1937-01-29 79 1 4 1 172.7 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1016 99022180 [document] RANSOM, KERRY EYULNN 40669319 1972-12-14 43 1 3 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1017 99022195 [document] SHERROD, KAYLA 40671133 1989-11-22 26 1 3 0 158.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1018 99022322 [document] NEAL, MATTHEW 40699902 1992-08-04 23 1 4 1 193.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1019 99022382 [document] COLLINS, JONATHAN 40713232 1993-01-12 23 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1020 99022427 [document] GROVES, BIRDIE 40729972 1934-05-30 82 1 4 0 168.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1021 99022487 [document] GENTRY, CHRISTINA 40736860 1966-07-11 50 1 4 0 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1022 99022756 [document] VINCENT, EMMETT EUGENE 40790834 1955-06-19 61 1 4 1 182.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1023 99022843 [document] THOMSON, MICHAEL 40821654 1968-01-13 48 1 4 1 182.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1024 99023016 [document] MCREYNOLDS, ROBERT ERIC 40852360 1950-03-11 66 1 3 1 185.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1025 99023134 [document] NEAL, JAMES 36041507 1960-11-02 55 1 4 1 178.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1026 99023292 [document] LOWHORN, ALTON 40897175 1968-08-09 48 1 4 1 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1027 99023538 [document] JIMENEZ, RAYNA 40954513 1992-10-09 23 1 4 0 157.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1028 99023569 [document] POTTER, STEVEN 40955700 1962-08-09 54 1 4 1 172.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1029 99023616 [document] MCCOIN, DANNY 40970675 1969-08-07 47 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1030 99023618 [document] MORRIS, WILLIAM PAUL 40960858 1970-07-16 46 1 4 1 175.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1031 99023683 [document] CHEEK, JASON 40973968 1980-01-02 36 1 4 1 180.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1032 99023899 [document] Sailliez, Jeffrey 41003393 1976-08-17 40 1 4 1 182.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1033 99024357 [document] Lloyd, David 41092511 1960-11-03 55 1 4 1 170.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1034 99024409 [document] MOORE, WESLEY 41099110 1957-10-13 59 1 4 1 183.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1035 99024481 [document] RICE, STEVEN 41120908 1959-04-13 57 1 4 1 178.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1036 99024534 [document] BARTON, MILEAH BROOKE 41123670 1987-06-16 29 1 4 0 165.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1037 99024666 [document] PILLOW, VICKI JEAN 41141656 1952-01-26 64 1 4 0 162.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1038 99025230 [document] LYNN, JEWELL 41269663 1943-08-21 73 1 4 0 152.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1039 99025306 [document] NEWMAN, JOSHUA 41291360 1990-11-16 26 1 4 1 185.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1040 99025443 [document] PROCK, TONYA 41293788 1969-12-03 47 1 4 0 152.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1041 NaN [document] ELROD, DENNIS 41382466 1978-09-18 38 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1042 99026014 [document] JASTRE, LAURA SAVANNAH 41452038 1988-04-08 28 1 4 0 152.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1043 99026078 [document] DAVIS, ADAM M 32417198 1980-03-22 36 1 4 1 188.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1044 99026259 [document] PIGG, JAMIE 41490277 1977-09-25 39 1 4 1 172.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1045 99026322 [document] GRIFFIN, SHAWN 41506536 1965-06-22 51 1 4 1 167.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 1
1046 99027068 [document] CRAFTON, DANIEL L 41666090 1997-04-29 19 1 4 1 178.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0

46 rows × 1872 columns

Positive bronchoscopic BALs

In [24]:
def daily_data_to_long(dataset, pattern=r'_d\d+$', suffix=''):
    """Reshape per-study-day wide columns (e.g. ``hr_d1`` ... ``hr_d14``)
    into long format, one row per (patient, day).

    Parameters
    ----------
    dataset : DataFrame
        Wide-format data whose daily columns end in ``_dN`` (optionally
        followed by ``suffix``, e.g. ``_d3pcr``).
    pattern : str
        Regex (note the raw string: ``\\d`` is a regex digit class, not a
        Python escape) used to detect daily columns and infer the last day.
    suffix : str
        Literal text following the day number in column names.

    Returns
    -------
    DataFrame with the daily suffix stripped from column names plus a
    ``day`` column, index reset (original index becomes a column).
    """
    # Find the largest study-day number among the matching columns
    _tokens = dataset.columns[dataset.columns.str.contains(pattern, regex=True)].str.split('_').values
    last_day = np.array([''.join(filter(lambda x: x.isdigit(), t[-1])) for t in _tokens]).astype(int).max()

    all_days_data = []

    for i in range(last_day):

        print('Processing day', i + 1)
        study_day_cols = dataset.columns[dataset.columns.str.endswith('_d{0}{1}'.format(i + 1, suffix))]

        study_day_data = dataset[study_day_cols].copy()
        # Strip the trailing "_dN[suffix]" token so the same measurement
        # aligns on a common column name across days
        study_day_data.columns = ['_'.join(col.split('_')[:-1]) for col in study_day_cols]

        study_day_data['day'] = i + 1
        all_days_data.append(study_day_data)

    return pd.concat(all_days_data).reset_index()
In [25]:
bbal_long = daily_data_to_long(trial_data).query('bbal==1')
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14
In [26]:
bbal_cfu_cols = bbal_long.columns[bbal_long.columns.str.contains('cfu')  
                         & bbal_long.columns.str.startswith('bbal')]
bbal_pathogens_cols = bbal_long.columns[bbal_long.columns.str.contains('pathogen') 
                         & ~bbal_long.columns.str.contains('add')
                         & bbal_long.columns.str.startswith('culture')]

Encode CFU count ranges to ordinal variable

In [27]:
cfu_coding = {'<1,000': 0, 
              '<10,000': 1,
              '10,000 - 25,000': 2, 
              '25,000 - 50,000': 3,
              '50,000 - 100,000': 4,  
              '>100,000': 5}

Lookup table to translate species codes

In [28]:
bbal_path_lookup = {1: 'Staphylococcus aureus',
2: 'Streptococcus pneumonia',
3: 'Streptococcus Group B',
4: 'Acetinobacter baumannii',
5: 'Pseudomonas aeruginosa',
6: 'Haemophilus influenza',
7: 'Klebsiella pneumoniae',
8: 'Escherichia coli',
9: 'Enterobacter cloacae',
10: 'Stenotrophomonas maltophilia',
11: 'Enterobacter aerogenes',
12: 'Serratia marcescens',
13: 'Klebsiella oxytoca',
14: 'Proteus mirabilis',
15: 'Other'}
In [29]:
bbal_pathogens_long = (bbal_long[bbal_pathogens_cols.tolist() + ['record_id', 'day']]
             .pipe(pd.melt, id_vars=['record_id', 'day']))

# Strip out day from variable and replace pathogen code with name
bbal_pathogens = bbal_pathogens_long.assign(pathogen=bbal_pathogens_long.value.replace(bbal_path_lookup)
                         ).drop(['variable', 'value'], axis=1)
In [30]:
bbal_cfu_long = (bbal_long[bbal_cfu_cols.tolist() + ['record_id', 'day']]
             .pipe(pd.melt, id_vars=['record_id','day']))

# Make sure they are the same size!
assert bbal_pathogens_long.shape==bbal_cfu_long.shape
In [31]:
# Normalize scientific-notation count strings (e.g. "1.0E+05" -> "1.0e05")
# before casting to float. regex=False keeps '+' and '/' as literal
# characters — under pandas' old regex=True default a bare '+' is an
# invalid pattern, so the behavior was version-dependent.
bbal_pathogens['cfu_count'] = (bbal_cfu_long.value.astype(str)
                               .str.replace('E', 'e', regex=False)
                               .str.replace('+', '', regex=False)
                               .str.replace('/', '.', regex=False)
                               .astype(float))
bbal_pathogens_sorted = bbal_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
In [32]:
def fill_pathogens(x, labels, lookup):
    """Expand one (record_id, day) group so every pathogen in `lookup` has a row.

    Pathogens not observed in the group get rows filled with 0; the
    ``record_id`` and ``day`` columns are set from `labels` (the groupby
    key tuple) on every row.
    """
    record_id, day = labels[0], labels[1]
    observed = x.dropna().drop_duplicates(subset=['pathogen'])
    # Reindex against the full pathogen list so missing species appear as NaN rows
    full = observed.set_index('pathogen').reindex(list(lookup.values()))
    full = full.reset_index()
    full = full.assign(record_id=record_id, day=day)
    return full.fillna(0)
In [33]:
bbal_groups = []
for labels, group in bbal_pathogens_sorted.groupby(['record_id', 'day']):
    recid, day = labels
    group_full = fill_pathogens(group, labels, bbal_path_lookup)
    bbal_groups.append(group_full)
In [34]:
bbal_complete = pd.concat(bbal_groups).reset_index()
bbal_complete.to_csv('../data/clean/bbal_pathogens.csv')
In [35]:
bbal_complete.head()
index pathogen record_id day cfu_count
0 0 Staphylococcus aureus 1001 10 0.0
1 1 Streptococcus pneumonia 1001 10 0.0
2 2 Streptococcus Group B 1001 10 0.0
3 3 Acetinobacter baumannii 1001 10 0.0
4 4 Pseudomonas aeruginosa 1001 10 0.0
In [36]:
bbal_complete.cfu_count.hist()
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ceca940>
In [37]:
(bbal_complete.groupby('record_id').cfu_count.max()>=3).sum()
Out[37]:
17
In [38]:
(bbal_complete.groupby('record_id').cfu_count.max()==0).sum()
Out[38]:
7
In [39]:
ax = bbal_complete.groupby('record_id').cfu_count.max().hist(bins=range(8), align='left')
ax.set_xlabel('max. count')
ax.set_ylabel('frequency');

Trajectories of CFU counts from bronchoscopic BAL data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.

In [40]:
bbal_grid = sns.FacetGrid(bbal_complete, 
                     col='record_id', hue="pathogen", col_wrap=4, size=2.5)
bbal_grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
bbal_grid.add_legend()
Out[40]:
<seaborn.axisgrid.FacetGrid at 0x10d14dda0>

The number of patients with at least one positive bbal ($> 10^4$) in the dataset:

In [41]:
positive_bbal = bbal_cfu_long.fillna(0).groupby('record_id').apply(lambda x: (x.value>0).any())
positive_bbal.sum()
Out[41]:
21

Proportion patients with a positive bbal:

In [42]:
positive_bbal.mean()
Out[42]:
0.75

Convert trial data to long format

First need to reshape data to long format (one row per date of study per patient)

In [43]:
daily_long = daily_data_to_long(trial_data)
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14
In [44]:
daily_long.head()
record_id abg_yn amt_irrigant ards bbal bbal_add_pathogen3 bbal_add_pathogen4 bbal_add_pathogen_2 bbal_cfu bbal_cfu2 ... plt reintubated spo2 temperature urine_output vasopressin_rate vasopressin_yn vasopressors_yn vent_day wbc_count
0 1001 0.0 1.0 0.0 0.0 NaN NaN NaN NaN NaN ... 131.0 0.0 100.0 38.2 1800.0 NaN NaN 0.0 1.0 11.0
1 1002 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 219.0 0.0 NaN 37.1 1875.0 NaN 0.0 1.0 1.0 13.6
2 1003 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 94.0 0.0 NaN 37.9 1045.0 NaN 0.0 1.0 1.0 8.4
3 1004 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 307.0 0.0 NaN 39.3 1474.0 NaN NaN 0.0 1.0 14.1
4 1005 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 55.0 0.0 NaN 38.2 1550.0 NaN 0.0 1.0 1.0 15.4

5 rows × 106 columns

Drop rows with no date, columns with no data

In [45]:
data_long_complete = daily_long.dropna(subset=['date_study_day']).dropna(axis=1, thresh=1)
data_long_complete.head()
record_id abg_yn amt_irrigant ards bbal bbal_add_pathogen3 bbal_add_pathogen4 bbal_add_pathogen_2 bbal_cfu bbal_cfu2 ... plt reintubated spo2 temperature urine_output vasopressin_rate vasopressin_yn vasopressors_yn vent_day wbc_count
0 1001 0.0 1.0 0.0 0.0 NaN NaN NaN NaN NaN ... 131.0 0.0 100.0 38.2 1800.0 NaN NaN 0.0 1.0 11.0
1 1002 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 219.0 0.0 NaN 37.1 1875.0 NaN 0.0 1.0 1.0 13.6
2 1003 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 94.0 0.0 NaN 37.9 1045.0 NaN 0.0 1.0 1.0 8.4
3 1004 1.0 40.0 0.0 0.0 NaN NaN NaN NaN NaN ... 307.0 0.0 NaN 39.3 1474.0 NaN NaN 0.0 1.0 14.1
4 1005 1.0 20.0 0.0 0.0 NaN NaN NaN NaN NaN ... 55.0 0.0 NaN 38.2 1550.0 NaN 0.0 1.0 1.0 15.4

5 rows × 90 columns

PCR counts in mBAL and HME

In [46]:
# Raw string: \d is a regex digit class, not a Python escape sequence
# (a plain string here triggers an invalid-escape warning on modern Python)
pcr_pattern = r'_d\d+pcr$'
pcr_data.columns.str.contains(pcr_pattern, regex=True).sum()
Out[46]:
1642
In [47]:
pcr_data_long = daily_data_to_long(pcr_data, pattern=pcr_pattern, suffix='pcr')
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14

Drop rows with no date or no PCR result, and columns with no data

In [48]:
pcr_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'mbal_pcr_result']).dropna(axis=1, thresh=1)
pcr_long_complete['date_study_day'] = pd.to_datetime(pcr_long_complete['date_study_day'])
pcr_long_complete.head()
record_id bbal_add_path2 bbal_add_path3 bbal_collected bbal_copies1 bbal_path1 bbal_pcr_result bhme_add_path2 bhme_add_path3 bhme_add_path4 ... mbal_copies6 mbal_copies7 mbal_path1 mbal_path2 mbal_path3 mbal_path4 mbal_path5 mbal_path6 mbal_path7 mbal_pcr_result
1 1003 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
3 1005 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
5 1007 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN 2.0 13.0 6.0 11.0 NaN NaN NaN 1.0
6 1008 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
7 1009 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN 1.0 NaN NaN NaN NaN NaN NaN 1.0

5 rows × 67 columns

Bacterial counts from mBAL over time

Plotting time series for mini-BAL PCR counts.

In [49]:
mbal_pcr_copies_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('copies')  
                         & pcr_long_complete.columns.str.startswith('mbal')]
mbal_pcr_pathogens_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('path') 
                         & ~pcr_long_complete.columns.str.contains('add')
                         & pcr_long_complete.columns.str.startswith('mbal')]

Lookup table to translate species codes

In [50]:
pcr_path_lookup = {1: 'Staphylococcus aureus',
2: 'Streptococcus pneumonia',
3: 'Streptococcus Group B',
4: 'Acetinobacter baumannii',
5: 'Pseudomonas aeruginosa',
6: 'Haemophilus influenza',
7: 'Klebsiella pneumoniae',
8: 'Escherichia coli',
9: 'Enterobacter cloacae',
10: 'Stenotrophomonas maltophilia',
11: 'Enterobacter aerogenes',
12: 'Serratia marcescens',
13: 'Klebsiella oxytoca',
14: 'Proteus mirabilis',
15: 'Enterococcus faecalis',
16: 'Enterococcus faecium',
17: 'Candida albicans',
18: 'Other'}

Convert pathogens table from wide to long format

In [51]:
mbal_pcr_pathogens_long = (pcr_long_complete[mbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']]
                        .pipe(pd.melt, id_vars=['record_id', 'day']))

# Strip out day from variable and replace pathogen code with name
mbal_pcr_pathogens = mbal_pcr_pathogens_long.assign(pathogen=mbal_pcr_pathogens_long.value.replace(pcr_path_lookup)
                         ).drop(['variable', 'value'], axis=1)

Convert counts table from wide to long format

In [52]:
mbal_pcr_counts_long = (pcr_long_complete[mbal_pcr_copies_cols.tolist() + ['record_id', 'day']]
             .pipe(pd.melt, id_vars=['record_id','day']))

# Make sure they are the same size!
assert mbal_pcr_pathogens_long.shape==mbal_pcr_counts_long.shape

Append count onto pathogens table

In [53]:
# Normalize scientific-notation count strings (e.g. "1.0E+05" -> "1.0e05")
# before casting to float. regex=False keeps '+' and '/' as literal
# characters — under pandas' old regex=True default a bare '+' is an
# invalid pattern, so the behavior was version-dependent.
mbal_pcr_pathogens['pcr_count'] = (mbal_pcr_counts_long.value.astype(str)
                                   .str.replace('E', 'e', regex=False)
                                   .str.replace('+', '', regex=False)
                                   .str.replace('/', '.', regex=False)
                                   .astype(float))
mbal_pcr_pathogens_sorted = mbal_pcr_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
In [54]:
mbal_pcr_groups = []
for labels, group in mbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    recid, day = labels
    group_full = fill_pathogens(group, labels, pcr_path_lookup)
    mbal_pcr_groups.append(group_full)
In [55]:
mbal_pcr_complete = pd.concat(mbal_pcr_groups).reset_index()
mbal_pcr_complete.to_csv('../data/clean/mbal_pcr_pathogens.csv')

mbal_pcr_complete.head()
index pathogen record_id day pcr_count
0 0 Staphylococcus aureus 1002 4 0.0
1 1 Streptococcus pneumonia 1002 4 0.0
2 2 Streptococcus Group B 1002 4 0.0
3 3 Acetinobacter baumannii 1002 4 0.0
4 4 Pseudomonas aeruginosa 1002 4 0.0

Trajectories of PCR counts from mini-BAL data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.

In [56]:
abx_table.ix[1002]
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel/__main__.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':
abx_name antifungal abx_start abx_stop abx_indication
abx_number
1 Cefazolin 1.0 2016-04-04 2016-04-04 1.0
2 Cefazolin 1.0 2016-04-11 2016-04-12 1.0
3 Meropenem 1.0 2016-04-12 2016-05-15 2.0
4 Meropenem 1.0 2016-04-17 2016-05-20 2.0
5 Vancomycin 1.0 2016-04-12 2016-04-15 2.0
6 Vancomycin 1.0 2016-04-17 2016-04-19 2.0
In [57]:
def vertical_abx_line(x, count_data=pcr_long_complete, **kwargs):
    """Overlay dotted vertical lines at antibiotic start days for one patient.

    Intended for use with FacetGrid.map, passed the 'record_id' column.

    Parameters
    ----------
    x : Series of record_id values for the current facet (all identical);
        .min() recovers the scalar patient id.
    count_data : table providing 'date_study_day' and 'day', used to convert
        calendar dates into study-day offsets (bound at definition time).
    **kwargs : forwarded to plt.axvline.
    """
    record_id = x.min()
    # .loc replaces the deprecated .ix used in the original cell
    start_dates = np.unique(abx_table.loc[record_id].abx_start.dt.date.values)
    first_date = count_data.loc[count_data.record_id == record_id,
                                ['date_study_day', 'day']].min()
    # Back out the calendar date of study day 0 for this patient
    reference_date = first_date.date_study_day - pd.Timedelta('{} days'.format(first_date.day))
    if np.any(start_dates):
        for date in start_dates:
            plt.axvline(x=(pd.Timestamp(date) - reference_date).days,
                        ls=':', c='grey', **kwargs)
In [58]:
# One panel per patient, one colored line per pathogen;
# log(count + 1) keeps zero counts on the axis.
sns.set_style('ticks')
mbal_plot_data = mbal_pcr_complete.assign(
    log_count=np.log(mbal_pcr_complete.pcr_count + 1))
grid = sns.FacetGrid(mbal_plot_data, col='record_id', hue="pathogen",
                     col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
# Dotted lines mark antibiotic start days
grid.map(vertical_abx_line, 'record_id', alpha=0.2)
grid.add_legend()
Out[58]:
<seaborn.axisgrid.FacetGrid at 0x10d54cdd8>

bBAL PCR counts

In [59]:
# Keep rows that have both a study-day date and a bBAL PCR result,
# then drop columns that are entirely empty.
bbal_long_complete = pcr_data_long.dropna(
    subset=['date_study_day', 'bbal_pcr_result']).dropna(axis=1, thresh=1)
bbal_long_complete.head()
record_id bbal_add_path2 bbal_add_path3 bbal_add_path4 bbal_add_path5 bbal_add_path6 bbal_add_path7 bbal_collected bbal_copies1 bbal_copies2 ... hme_path2 hme_path3 hme_pcr_result mbal_add_path2 mbal_add_path3 mbal_collected mbal_copies1 mbal_path1 mbal_path2 mbal_pcr_result
11 1013 1.0 0.0 NaN NaN NaN NaN 1.0 330000.0 1060000.0 ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN
37 1039 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN
39 1041 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN
41 1043 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN ... NaN NaN 2.0 NaN NaN 1.0 NaN NaN NaN 2.0
56 1014 1.0 0.0 NaN NaN NaN NaN 1.0 447000.0 1040000.0 ... NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN

5 rows × 51 columns

In [60]:
# Column groups for the bBAL sample: copy-number columns, and pathogen-code
# columns excluding the 'add'itional-pathogen fields.
bbal_cols = bbal_long_complete.columns
is_bbal = bbal_cols.str.startswith('bbal')
bbal_pcr_copies_cols = bbal_cols[bbal_cols.str.contains('copies') & is_bbal]
bbal_pcr_pathogens_cols = bbal_cols[bbal_cols.str.contains('path')
                                    & ~bbal_cols.str.contains('add')
                                    & is_bbal]

Convert pathogens table from wide to long format

In [61]:
# Reshape the bBAL pathogen-code columns from wide to long format,
# then map the numeric codes to pathogen names.
bbal_pcr_pathogens_long = pd.melt(
    bbal_long_complete[bbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

bbal_pcr_pathogens = (bbal_pcr_pathogens_long
                      .assign(pathogen=bbal_pcr_pathogens_long.value.replace(pcr_path_lookup))
                      .drop(['variable', 'value'], axis=1))

Convert counts table from wide to long format

In [62]:
# Reshape the bBAL copy-number columns from wide to long format.
bbal_pcr_counts_long = pd.melt(
    bbal_long_complete[bbal_pcr_copies_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

# Pathogen and count tables must align row-for-row before they are joined below
assert bbal_pcr_pathogens_long.shape == bbal_pcr_counts_long.shape

Append count onto pathogens table

In [63]:
# Normalise count strings (scientific notation like '5.47E+08' -> '5.47e08',
# and a data-entry typo '/' -> '.') so they parse cleanly as floats,
# then attach the numeric counts to the pathogen table.
bbal_count_strings = bbal_pcr_counts_long.value.astype(str)
bbal_pcr_pathogens['pcr_count'] = (bbal_count_strings
                                   .str.replace('E', 'e')
                                   .str.replace('+', '')
                                   .str.replace('/', '.')
                                   .astype(float))
bbal_pcr_pathogens_sorted = (bbal_pcr_pathogens
                             .sort_values(by=['record_id', 'day'])
                             .reset_index(drop=True))
In [64]:
# Fill in the pathogens that were not detected for each (patient, day) group,
# so every group carries the complete pathogen panel.
bbal_pcr_groups = []
for labels, group in bbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    # `labels` is the (record_id, day) tuple for this group; the original
    # `recid, day = labels` unpacking was unused dead code and is removed.
    bbal_pcr_groups.append(fill_pathogens(group, labels, pcr_path_lookup))
In [65]:
# Stitch the filled per-(patient, day) groups back into one table and
# persist it for downstream analysis.
# NOTE(review): reset_index() without drop=True keeps the old group-local
# index as an 'index' column (visible in .head() below) — reset_index(drop=True)
# was probably intended; left unchanged to preserve the saved CSV layout.
bbal_pcr_complete = pd.concat(bbal_pcr_groups).reset_index()
bbal_pcr_complete.to_csv('../data/clean/bbal_pcr_pathogens.csv')

bbal_pcr_complete.head()
index pathogen record_id day pcr_count
0 0 Staphylococcus aureus 1011 6 591000.0
1 1 Streptococcus pneumonia 1011 6 0.0
2 2 Streptococcus Group B 1011 6 0.0
3 3 Acetinobacter baumannii 1011 6 0.0
4 4 Pseudomonas aeruginosa 1011 6 0.0
In [66]:
sns.set_style('ticks')
# Small Gaussian jitter separates overlapping zero-count trajectories.
# The original cell drew from the global (unseeded) RNG, so the figure
# changed on every re-run; a seeded RandomState makes it reproducible.
jitter_rng = np.random.RandomState(42)
jitter = 0.1 * jitter_rng.randn(bbal_pcr_complete.shape[0])
grid = sns.FacetGrid(bbal_pcr_complete.assign(
                         log_count=np.log(bbal_pcr_complete.pcr_count + 1 + jitter)),
                     col='record_id', hue="pathogen",
                     col_wrap=4, size=2.5, palette=sns.color_palette("hls", 18))
grid.map(plt.plot, "day", "log_count", marker="o", ms=4, alpha=0.5)
grid.add_legend()
Out[66]:
<seaborn.axisgrid.FacetGrid at 0x119ab7358>

HME PCR counts

In [67]:
# Keep rows that have both a study-day date and an HME PCR result,
# then drop columns that are entirely empty.
hme_long_complete = pcr_data_long.dropna(
    subset=['date_study_day', 'hme_pcr_result']).dropna(axis=1, thresh=1)
hme_long_complete.head()
record_id bbal_add_path2 bbal_add_path3 bbal_collected bbal_copies1 bbal_path1 bbal_pcr_result bhme_add_path2 bhme_add_path3 bhme_add_path4 ... mbal_copies2 mbal_copies3 mbal_copies4 mbal_copies5 mbal_path1 mbal_path2 mbal_path3 mbal_path4 mbal_path5 mbal_pcr_result
1 1003 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
5 1007 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... 299000.0 2520000.0 15700000.0 NaN 2.0 13.0 6.0 11.0 NaN 1.0
6 1008 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0
7 1009 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 1.0 NaN NaN NaN NaN 1.0
10 1012 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... 557000.0 77800.0 NaN NaN 1.0 13.0 6.0 NaN NaN 1.0

5 rows × 62 columns

In [68]:
# Column groups for the HME sample: copy-number columns, and pathogen-code
# columns excluding the 'add'itional-pathogen fields.
hme_cols = hme_long_complete.columns
is_hme = hme_cols.str.startswith('hme')
hme_copies_cols = hme_cols[hme_cols.str.contains('copies') & is_hme]
hme_pathogens_cols = hme_cols[hme_cols.str.contains('path')
                              & ~hme_cols.str.contains('add')
                              & is_hme]

Convert pathogens and counts from wide to long format

In [69]:
# Reshape the HME pathogen-code columns from wide to long format,
# then map the numeric codes to pathogen names.
hme_pathogens_long = pd.melt(
    hme_long_complete[hme_pathogens_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

hme_pathogens = (hme_pathogens_long
                 .assign(pathogen=hme_pathogens_long.value.replace(pcr_path_lookup))
                 .drop(['variable', 'value'], axis=1))
In [70]:
# Reshape the HME copy-number columns from wide to long format.
hme_counts_long = pd.melt(
    hme_long_complete[hme_copies_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

# Pathogen and count tables must align row-for-row before they are joined below
assert hme_pathogens_long.shape == hme_counts_long.shape

Replace bad values

In [71]:
hme_counts_long.value[~hme_counts_long.value.apply(np.isreal)]
Series([], Name: value, dtype: float64)
In [72]:
hme_counts_long.loc[47, 'value'] = 5.47E+08

Append count onto pathogens table

In [73]:
# Normalise count strings (scientific notation like '5.47E+08' -> '5.47e08',
# and a data-entry typo '/' -> '.') so they parse cleanly as floats,
# then attach the numeric counts to the pathogen table.
hme_count_strings = hme_counts_long.value.astype(str)
hme_pathogens['pcr_count'] = (hme_count_strings
                              .str.replace('E', 'e')
                              .str.replace('+', '')
                              .str.replace('/', '.')
                              .astype(float))
hme_pathogens_sorted = (hme_pathogens
                        .sort_values(by=['record_id', 'day'])
                        .reset_index(drop=True))
In [74]:
# Fill in the pathogens that were not detected for each (patient, day) group,
# so every group carries the complete pathogen panel.
hme_groups = []
for labels, group in hme_pathogens_sorted.groupby(['record_id', 'day']):
    # `labels` is the (record_id, day) tuple for this group; the original
    # `recid, day = labels` unpacking was unused dead code and is removed.
    hme_groups.append(fill_pathogens(group, labels, pcr_path_lookup))
In [75]:
# Stitch the filled per-(patient, day) groups back into one table and
# persist it for downstream analysis.
# NOTE(review): reset_index() without drop=True keeps the old group-local
# index as an 'index' column (visible in .head() below) — reset_index(drop=True)
# was probably intended; left unchanged to preserve the saved CSV layout.
hme_complete = pd.concat(hme_groups).reset_index()
hme_complete.to_csv('../data/clean/hme_pathogens.csv')

hme_complete.head()
index pathogen record_id day pcr_count
0 0 Staphylococcus aureus 1002 5 0.0
1 1 Streptococcus pneumonia 1002 5 0.0
2 2 Streptococcus Group B 1002 5 0.0
3 3 Acetinobacter baumannii 1002 5 0.0
4 4 Pseudomonas aeruginosa 1002 5 0.0

Trajectories of PCR counts from HME data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.

In [76]:
# One panel per patient, one colored line per pathogen;
# log(count + 1) keeps zero counts on the axis.
hme_plot_data = hme_complete.assign(log_count=np.log(hme_complete.pcr_count + 1))
hme_grid = sns.FacetGrid(hme_plot_data, col='record_id', hue="pathogen",
                         col_wrap=4, size=2.5)
hme_grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
hme_grid.add_legend()
Out[76]:
<seaborn.axisgrid.FacetGrid at 0x10d54ca20>

Mini-BAL culture CFU counts

In [77]:
# Same pathogen list as bBAL — the mini-BAL culture form encodes pathogens
# with the same code-to-name map, so the lookup is simply aliased.
mbal_path_lookup = bbal_path_lookup
In [78]:
culture_long = daily_data_to_long(culture_data)
Processing day 1
Processing day 2
Processing day 3
Processing day 4
Processing day 5
Processing day 6
Processing day 7
Processing day 8
Processing day 9
Processing day 10
Processing day 11
Processing day 12
Processing day 13
Processing day 14

Drop rows with no date or no culture test, and columns with no data

In [79]:
culture_long_complete =  culture_long.dropna(subset=['date_study_day', 'mbal_culture_test']).dropna(axis=1, thresh=1)
In [80]:
# Column groups for the mini-BAL culture: CFU-count columns, and pathogen
# columns excluding the 'add'itional-pathogen fields.
culture_cols = culture_long_complete.columns
is_mbal = culture_cols.str.startswith('mbal')
culture_cfu_cols = culture_cols[culture_cols.str.contains('cfu') & is_mbal]
culture_pathogens_cols = culture_cols[culture_cols.str.contains('pathogen')
                                      & ~culture_cols.str.contains('add')
                                      & is_mbal]
In [81]:
# Reshape the culture pathogen columns from wide to long format,
# then map the numeric codes to pathogen names.
culture_pathogens_long = pd.melt(
    culture_long_complete[culture_pathogens_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

culture_pathogens = (culture_pathogens_long
                     .assign(pathogen=culture_pathogens_long.value.replace(mbal_path_lookup))
                     .drop(['variable', 'value'], axis=1))
In [82]:
# Reshape the CFU-count columns from wide to long format.
culture_counts_long = pd.melt(
    culture_long_complete[culture_cfu_cols.tolist() + ['record_id', 'day']],
    id_vars=['record_id', 'day'])

# Pathogen and count tables must align row-for-row before they are joined below
assert culture_pathogens_long.shape == culture_counts_long.shape

Append CFU count onto pathogens table

In [83]:
# Normalise count strings (scientific notation like '5.47E+08' -> '5.47e08',
# and a data-entry typo '/' -> '.') so they parse cleanly as floats,
# then attach the numeric CFU counts to the pathogen table.
cfu_strings = culture_counts_long.value.astype(str)
culture_pathogens['cfu_count'] = (cfu_strings
                                  .str.replace('E', 'e')
                                  .str.replace('+', '')
                                  .str.replace('/', '.')
                                  .astype(float))
culture_pathogens_sorted = (culture_pathogens
                            .sort_values(by=['record_id', 'day'])
                            .reset_index(drop=True))
In [84]:
# Fill in the pathogens that were not detected for each (patient, day) group,
# so every group carries the complete pathogen panel.
# (The generic name `groups` is kept because the next cell reads it.)
groups = []
for labels, group in culture_pathogens_sorted.groupby(['record_id', 'day']):
    # `labels` is the (record_id, day) tuple for this group; the original
    # `recid, day = labels` unpacking was unused dead code and is removed.
    groups.append(fill_pathogens(group, labels, mbal_path_lookup))
In [85]:
# Stitch the filled per-(patient, day) groups back into one table and
# persist it for downstream analysis.
# NOTE(review): reset_index() without drop=True keeps the old group-local
# index as an 'index' column — reset_index(drop=True) was probably intended;
# left unchanged to preserve the saved CSV layout.
culture_complete = pd.concat(groups).reset_index()
culture_complete.to_csv('../data/clean/culture_pathogens.csv')

Trajectories of culture counts from mini-BAL data

Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.

In [86]:
# One panel per patient, one colored line per pathogen;
# raw CFU counts on the y-axis (not log-scaled here).
grid = sns.FacetGrid(culture_complete, col='record_id', hue="pathogen",
                     col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
grid.add_legend()
Out[86]:
<seaborn.axisgrid.FacetGrid at 0x11defaa20>