# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from redcap import Project
def get_redcap_data(token_url=None, api_url='https://redcap.vanderbilt.edu/api/', metadata=False,
                    project=False, **kwargs):
    """Export records from a REDCap project as a pandas DataFrame.

    Parameters
    ----------
    token_url : str, optional
        Path to a file containing the API token. If None, the token is
        requested interactively.
    api_url : str
        REDCap API endpoint URL.
    metadata : bool
        If True, also return the project metadata (takes precedence over
        ``project``).
    project : bool
        If True (and ``metadata`` is False), also return the Project object.
    **kwargs
        Passed through to ``Project.export_records``.

    Returns
    -------
    DataFrame, or (DataFrame, metadata) if ``metadata`` is True, or
    (DataFrame, Project) if ``project`` is True.
    """
    if token_url is None:
        print('Enter token:')
        api_key = input()
    else:
        api_key = open(token_url).read()
    # BUG FIX: the original assigned the Project instance to the name
    # `project`, shadowing the `project` flag parameter. A Project instance
    # is always truthy, so `elif project:` was always taken and the plain
    # `return data` branch was unreachable. Use a distinct local name.
    redcap_project = Project(api_url, api_key)
    data = redcap_project.export_records(format='df', **kwargs)
    if metadata:
        return data, redcap_project.export_metadata()
    elif project:
        return data, redcap_project
    else:
        return data
trial_data, trial_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_trial.txt", metadata=True)
Size of dataset (rows x columns)
trial_data.shape
(46, 1871)
culture_data, culture_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_bal.txt", metadata=True)
Size of dataset (rows x columns)
culture_data.shape
(46, 575)
pcr_data, pcr_metadata = get_redcap_data("/Users/fonnescj/Dropbox/Tokens/hme_pcr.txt", metadata=True)
Size of dataset (rows x columns)
pcr_data.shape
(44, 1642)
Number of individuals with complete (PCR and culture) and incomplete (one or the other) data
# Columns holding collection dates; a non-null date marks a day with data.
pcr_date_cols = pcr_data.columns[pcr_data.columns.str.contains('date')]
culture_date_cols = culture_data.columns[culture_data.columns.str.contains('date')]
# Flag subjects with at least one dated BAL (culture) / PCR observation.
culture_data['some_bal_data'] = (culture_data[culture_date_cols].notnull().sum(1)>0)
pcr_data['some_pcr_data'] = (pcr_data[pcr_date_cols].notnull().sum(1)>0)
# Join the two datasets on record id and tabulate how many subjects have
# one data type vs. both (row sum of the two boolean flags: 1 or 2).
(culture_data.join(pcr_data, lsuffix='-bal', rsuffix='-pcr')[['some_bal_data', 'some_pcr_data']]
 .sum(1).replace({1: 'One', 2: 'Both'}).value_counts())
Both 44 One 2 dtype: int64
ax = trial_data.vent_days.hist()
ax.set_ylabel('Frequency')
ax.set_xlabel('Days on ventilator');
trial_data.vent_days.describe()
count 45.000000 mean 10.933333 std 7.566073 min 2.000000 25% 6.000000 50% 9.000000 75% 13.000000 max 38.000000 Name: vent_days, dtype: float64
Suspected VAP
# REDCap code -> label mapping for the morbidity checkbox field.
VAP_lookup = {1: 'Pneumonia',
              2: 'Infection (not pneumonia)',
              3: 'Acute renal failure',
              4: 'ARDS',
              5: 'Cardiac arrest',
              6: 'MI',
              7: 'CVA',
              8: 'DVT',
              9: 'PE',
              10: 'None of the above'}
# morbidity___1 is the checkbox column for code 1 ('Pneumonia') above.
trial_data['pneumonia'] = trial_data.morbidity___1
There are currently 23 individuals with pneumonia
trial_data.pneumonia.sum()
23
This accounts for 50% of the sample
trial_data.pneumonia.mean()
0.5
trial_data.preadmit_abx.replace({1:'Yes', 2:'No', 3:'Unknown'}).value_counts()
No 39 Unknown 7 Name: preadmit_abx, dtype: int64
trial_data.add_med_0.replace({1:'Yes', 0:'No'}).value_counts()
Yes 42 No 3 Name: add_med_0, dtype: int64
# All antibiotic-course columns (abx_generic_1, abx_start_1, ...).
abx_cols = trial_data.columns[trial_data.columns.str.startswith('abx')]
# Reshape from one-row-per-patient (numbered course columns) to
# one-row-per-antibiotic-course, keyed by (id, abx_number).
# dropna(thresh=2) drops empty course rows; the axis=1 dropna discards
# columns with fewer than 100 non-null values.
abx_table = pd.wide_to_long(trial_data[abx_cols].assign(id=trial_data.index),
                            stubnames=['abx_generic_', 'abx_antifungals_', 'abx_start_', 'abx_stop_', 'abx_indication_',
                                       'abx_suspect_infect_source_', 'abx_known_infection_source_'],
                            i='id', j='abx_number').dropna(thresh=2).dropna(axis=1, thresh=100)
abx_table.columns = ['abx_name', 'antifungal', 'abx_start', 'abx_stop', 'abx_indication']
# Parse start/stop strings into proper datetimes for date arithmetic later.
abx_table.abx_start = pd.to_datetime(abx_table.abx_start)
abx_table.abx_stop = pd.to_datetime(abx_table.abx_stop)
abx_table
abx_name | antifungal | abx_start | abx_stop | abx_indication | ||
---|---|---|---|---|---|---|
id | abx_number | |||||
1001 | 1 | Ampicillin-sulbactam | 1.0 | 2016-03-27 | 2016-03-27 | 1.0 |
1002 | 1 | Cefazolin | 1.0 | 2016-04-04 | 2016-04-04 | 1.0 |
1003 | 1 | Cefazolin | 1.0 | 2016-04-14 | 2016-04-14 | 1.0 |
1004 | 1 | Cefazolin | 1.0 | 2016-04-18 | 2016-04-18 | 1.0 |
1005 | 1 | Cefazolin | 1.0 | 2016-05-06 | 2016-05-06 | 1.0 |
1006 | 1 | Amoxicillin-k clavulanate | 1.0 | 2016-05-16 | 2016-05-22 | 3.0 |
1007 | 1 | Cefazolin | 1.0 | 2016-05-17 | 2016-05-17 | 1.0 |
1008 | 1 | Cefazolin | 1.0 | 2016-05-23 | 2016-05-23 | 1.0 |
1010 | 1 | Amoxicillin-k clavulanate | 1.0 | 2016-06-26 | 2016-07-28 | 3.0 |
1011 | 1 | Cefazolin | 1.0 | 2016-06-20 | 2016-06-22 | 1.0 |
1012 | 1 | Cefazolin | 1.0 | 2016-06-25 | 2016-06-25 | 3.0 |
1013 | 1 | Meropenem | 1.0 | 2016-06-28 | 2016-07-01 | 2.0 |
1014 | 1 | Cefazolin | 1.0 | 2016-07-02 | 2016-07-02 | 1.0 |
1015 | 1 | Cefazolin | 1.0 | 2016-07-06 | 2016-07-06 | 1.0 |
1016 | 1 | Cefazolin | 1.0 | 2016-07-15 | 2016-07-18 | 3.0 |
1017 | 1 | Cefazolin | 1.0 | 2016-07-12 | 2016-07-14 | 1.0 |
1018 | 1 | Ceftazidime | 1.0 | 2016-07-20 | 2016-07-22 | 2.0 |
1019 | 1 | Cefazolin | 1.0 | 2016-07-20 | 2016-07-21 | 1.0 |
1020 | 1 | Cefazolin | 1.0 | 2016-07-24 | 2016-07-24 | 1.0 |
1021 | 1 | Cefazolin | 1.0 | 2016-07-27 | 2016-07-27 | 1.0 |
1022 | 1 | Cefazolin | 1.0 | 2016-08-09 | 2016-08-11 | 1.0 |
1023 | 1 | Piperacillin-tazobactam | 1.0 | 2016-08-16 | 2016-08-16 | 1.0 |
1024 | 1 | Cefazolin | 1.0 | 2016-08-20 | 2016-08-20 | 1.0 |
1025 | 1 | Cefazolin | 1.0 | 2016-08-24 | 2016-08-24 | 1.0 |
1026 | 1 | Cefazolin | 1.0 | 2016-09-01 | 2016-09-01 | 1.0 |
1027 | 1 | Cefazolin | 1.0 | 2016-09-22 | 2016-09-22 | 1.0 |
1028 | 1 | Ceftazidime | 1.0 | 2016-09-21 | 2016-09-29 | 2.0 |
1029 | 1 | Cefazolin | 1.0 | 2016-09-17 | 2016-09-17 | 1.0 |
1030 | 1 | Cefazolin | 1.0 | 2016-09-16 | 2016-09-18 | 1.0 |
1031 | 1 | Cefazolin | 1.0 | 2016-09-23 | 2016-09-24 | 1.0 |
... | ... | ... | ... | ... | ... | ... |
1022 | 5 | trimethoprim-sulfamethoxazole | 1.0 | 2016-08-17 | 2016-08-22 | 3.0 |
1024 | 5 | Vancomycin | 1.0 | 2016-08-23 | 2016-09-01 | 2.0 |
1025 | 5 | Vancomycin | 1.0 | 2016-08-27 | 2016-08-29 | 2.0 |
1029 | 5 | Levofloxacin | 1.0 | 2016-09-20 | 2016-09-20 | 2.0 |
1037 | 5 | Vancomycin | 1.0 | 2016-10-28 | 2016-11-04 | 1.0 |
1038 | 5 | Vancomycin | 1.0 | 2016-12-11 | 2016-12-18 | 2.0 |
1039 | 5 | Piperacillin-tazobactam | 1.0 | 2016-12-09 | 2016-12-11 | 2.0 |
1042 | 5 | Vancomycin | 1.0 | 2017-01-15 | 2017-01-17 | 1.0 |
1044 | 5 | Tobramycin | 1.0 | 2017-01-31 | 2017-02-01 | 2.0 |
1045 | 5 | Vancomycin | 1.0 | 2017-02-01 | 2017-02-04 | 2.0 |
1001 | 6 | Levofloxacin | 1.0 | 2016-03-23 | 2016-03-24 | 1.0 |
1002 | 6 | Vancomycin | 1.0 | 2016-04-17 | 2016-04-19 | 2.0 |
1014 | 6 | Vancomycin | 1.0 | 2016-07-05 | 2016-07-10 | 2.0 |
1015 | 6 | Vancomycin | 1.0 | 2016-07-06 | 2016-07-07 | 1.0 |
1022 | 6 | Tobramycin | 1.0 | 2016-08-14 | 2016-08-14 | 3.0 |
1029 | 6 | Piperacillin-tazobactam | 1.0 | 2016-09-30 | 2016-10-03 | 2.0 |
1039 | 6 | Vancomycin | 1.0 | 2016-12-04 | 2016-12-04 | 1.0 |
1042 | 6 | Fluconazole | 2.0 | 2017-01-15 | 2017-01-15 | 1.0 |
1044 | 6 | Vancomycin | 1.0 | 2017-01-31 | 2017-02-02 | 2.0 |
1001 | 7 | Meropenem | 1.0 | 2016-04-02 | 2016-04-05 | 2.0 |
1014 | 7 | Vancomycin | 1.0 | 2016-07-13 | 2016-07-18 | 2.0 |
1015 | 7 | Vancomycin | 1.0 | 2016-07-15 | 2016-07-18 | 2.0 |
1022 | 7 | Vancomycin | 1.0 | 2016-08-14 | 2016-08-16 | 3.0 |
1029 | 7 | Vancomycin | 1.0 | 2016-09-20 | 2016-09-23 | 2.0 |
1039 | 7 | Vancomycin | 1.0 | 2016-12-09 | 2016-12-11 | 2.0 |
1042 | 7 | Metronidazole | 1.0 | 2017-01-15 | 2017-01-15 | 1.0 |
1001 | 8 | Vancomycin | 1.0 | 2016-03-23 | 2016-03-24 | 1.0 |
1022 | 8 | Vancomycin | 1.0 | 2016-08-21 | 2016-08-22 | 3.0 |
1029 | 8 | Vancomycin | 1.0 | 2016-09-30 | 2016-10-03 | 2.0 |
1001 | 9 | Vancomycin | 1.0 | 2016-04-02 | 2016-04-06 | 2.0 |
174 rows × 5 columns
abx_table.abx_name.value_counts()
Cefazolin 47 Vancomycin 39 Ceftazidime 14 Levofloxacin 14 Meropenem 11 Ceftriaxone 8 Piperacillin-tazobactam 8 Tobramycin 5 Clindamycin 5 Cefoxitin 4 Ampicillin-sulbactam 3 Ciprofloxacin 2 Cefepime 2 Amoxicillin-k clavulanate 2 Linezolid 2 Gentamicin 1 Trimethoprim/sulfamethoxazole 1 Cefuroxime axetil 1 Fluconazole 1 Metronidazole 1 Cephalexin 1 trimethoprim-sulfamethoxazole 1 Cefdinir 1 Name: abx_name, dtype: int64
trial_data
lab_mrn | ic | name | mrn | dob | age | ethnicity | race | sex | height | ... | nutrition_goal_d14 | urine_output_d14 | spo2_d14 | low_peep_d14 | fio2_low_d14 | gcs_d14 | cxr_yn_d14 | cxr_d14 | ards_d14 | pneumonia | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
record_id | |||||||||||||||||||||
1001 | 99019881, 99019865 | [document] | MCMURRAY, PHEAIRIS | 40181455 | 1993-09-08 | 22 | 1 | 3 | 1 | 180.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1002 | 99020041 | [document] | HUTCHINSON, LYNN MARIE | 40238339 | 1970-04-29 | 45 | 1 | 4 | 0 | 155.0 | ... | 1.0 | 2405.0 | 95.0 | 5.0 | 40.0 | 3.0 | 1.0 | 2.0 | 1.0 | 0 |
1003 | 99020323 | [document] | FOURAKRE, JAMES | 40279804 | 1943-06-24 | 72 | 1 | 4 | 1 | 183.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1004 | 099020377 | [document] | BURCHAM, BETTY | 36498293 | 1936-10-18 | 79 | 1 | 4 | 0 | 152.4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1005 | 99020801 | [document] | MCCUTCHEON, CARI | 40385486 | 1987-11-26 | 28 | 1 | 4 | 0 | 177.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1006 | 99020869 | [document] | BASTANCHURY, RICHARD | 40398752 | 1968-03-26 | 48 | 1 | 4 | 1 | 175.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1007 | 99021068 | [document] | FESSLER, HANNAH | 40435208 | 1992-03-10 | 24 | 1 | 4 | 0 | 170.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1008 | 99021194 | [document] | KING, JAMES | 40472474 | 1961-11-08 | 54 | 1 | 4 | 1 | 175.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1009 | 99021641 | [document] | BACON, THELMA | 40550907 | 1935-05-02 | 81 | 1 | 4 | 0 | 158.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1010 | 99021664 | [document] | MORRIS, JESSE | 40565244 | 1978-06-14 | 38 | 1 | 4 | 1 | 180.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1011 | 99021787 | [document] | ALLEN, NATASHA LYNN | 40585937 | 1992-11-18 | 23 | 1 | 4 | 0 | 163.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1012 | 99021788 | [document] | ROBINSON, COREY | 40581316 | 1993-12-22 | 22 | 1 | 4 | 1 | 175.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1013 | 99021975 | [document] | STEVENS, JOHN | 40587511 | 1962-10-11 | 53 | 1 | 4 | 1 | 170.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1014 | 99022055 | [document] | MULLINS, DOUGLAS | 40643975 | 1963-03-10 | 53 | 1 | 4 | 1 | 193.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1015 | 99022167 | [document] | COMPTON, KENNETH | 40653602 | 1937-01-29 | 79 | 1 | 4 | 1 | 172.7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1016 | 99022180 | [document] | RANSOM, KERRY EYULNN | 40669319 | 1972-12-14 | 43 | 1 | 3 | 1 | 180.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1017 | 99022195 | [document] | SHERROD, KAYLA | 40671133 | 1989-11-22 | 26 | 1 | 3 | 0 | 158.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1018 | 99022322 | [document] | NEAL, MATTHEW | 40699902 | 1992-08-04 | 23 | 1 | 4 | 1 | 193.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1019 | 99022382 | [document] | COLLINS, JONATHAN | 40713232 | 1993-01-12 | 23 | 1 | 4 | 1 | 188.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1020 | 99022427 | [document] | GROVES, BIRDIE | 40729972 | 1934-05-30 | 82 | 1 | 4 | 0 | 168.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1021 | 99022487 | [document] | GENTRY, CHRISTINA | 40736860 | 1966-07-11 | 50 | 1 | 4 | 0 | 180.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1022 | 99022756 | [document] | VINCENT, EMMETT EUGENE | 40790834 | 1955-06-19 | 61 | 1 | 4 | 1 | 182.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1023 | 99022843 | [document] | THOMSON, MICHAEL | 40821654 | 1968-01-13 | 48 | 1 | 4 | 1 | 182.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1024 | 99023016 | [document] | MCREYNOLDS, ROBERT ERIC | 40852360 | 1950-03-11 | 66 | 1 | 3 | 1 | 185.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1025 | 99023134 | [document] | NEAL, JAMES | 36041507 | 1960-11-02 | 55 | 1 | 4 | 1 | 178.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1026 | 99023292 | [document] | LOWHORN, ALTON | 40897175 | 1968-08-09 | 48 | 1 | 4 | 1 | 170.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1027 | 99023538 | [document] | JIMENEZ, RAYNA | 40954513 | 1992-10-09 | 23 | 1 | 4 | 0 | 157.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1028 | 99023569 | [document] | POTTER, STEVEN | 40955700 | 1962-08-09 | 54 | 1 | 4 | 1 | 172.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1029 | 99023616 | [document] | MCCOIN, DANNY | 40970675 | 1969-08-07 | 47 | 1 | 4 | 1 | 188.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1030 | 99023618 | [document] | MORRIS, WILLIAM PAUL | 40960858 | 1970-07-16 | 46 | 1 | 4 | 1 | 175.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1031 | 99023683 | [document] | CHEEK, JASON | 40973968 | 1980-01-02 | 36 | 1 | 4 | 1 | 180.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1032 | 99023899 | [document] | Sailliez, Jeffrey | 41003393 | 1976-08-17 | 40 | 1 | 4 | 1 | 182.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1033 | 99024357 | [document] | Lloyd, David | 41092511 | 1960-11-03 | 55 | 1 | 4 | 1 | 170.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1034 | 99024409 | [document] | MOORE, WESLEY | 41099110 | 1957-10-13 | 59 | 1 | 4 | 1 | 183.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1035 | 99024481 | [document] | RICE, STEVEN | 41120908 | 1959-04-13 | 57 | 1 | 4 | 1 | 178.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1036 | 99024534 | [document] | BARTON, MILEAH BROOKE | 41123670 | 1987-06-16 | 29 | 1 | 4 | 0 | 165.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1037 | 99024666 | [document] | PILLOW, VICKI JEAN | 41141656 | 1952-01-26 | 64 | 1 | 4 | 0 | 162.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1038 | 99025230 | [document] | LYNN, JEWELL | 41269663 | 1943-08-21 | 73 | 1 | 4 | 0 | 152.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1039 | 99025306 | [document] | NEWMAN, JOSHUA | 41291360 | 1990-11-16 | 26 | 1 | 4 | 1 | 185.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1040 | 99025443 | [document] | PROCK, TONYA | 41293788 | 1969-12-03 | 47 | 1 | 4 | 0 | 152.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1041 | NaN | [document] | ELROD, DENNIS | 41382466 | 1978-09-18 | 38 | 1 | 4 | 1 | 188.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1042 | 99026014 | [document] | JASTRE, LAURA SAVANNAH | 41452038 | 1988-04-08 | 28 | 1 | 4 | 0 | 152.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1043 | 99026078 | [document] | DAVIS, ADAM M | 32417198 | 1980-03-22 | 36 | 1 | 4 | 1 | 188.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1044 | 99026259 | [document] | PIGG, JAMIE | 41490277 | 1977-09-25 | 39 | 1 | 4 | 1 | 172.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1045 | 99026322 | [document] | GRIFFIN, SHAWN | 41506536 | 1965-06-22 | 51 | 1 | 4 | 1 | 167.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
1046 | 99027068 | [document] | CRAFTON, DANIEL L | 41666090 | 1997-04-29 | 19 | 1 | 4 | 1 | 178.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
46 rows × 1872 columns
def daily_data_to_long(dataset, pattern=r'_d\d+$', suffix=''):
    """Reshape per-study-day wide columns (``foo_d1`` ... ``foo_dN``) to long format.

    Parameters
    ----------
    dataset : DataFrame
        Wide-format data with day-numbered column names.
    pattern : str
        Regex identifying day-numbered columns (used only to find the
        largest day number).
    suffix : str
        Trailing token after the day number, e.g. ``'pcr'`` for columns
        like ``foo_d3pcr``.

    Returns
    -------
    DataFrame with one row per (record, study day), a ``day`` column,
    and the ``_dN`` token stripped from column names.
    """
    # Find the largest study-day number among the matching column names:
    # take each column's last '_'-token (e.g. 'd14' or 'd3pcr') and keep
    # only its digits.
    _tokens = dataset.columns[dataset.columns.str.contains(pattern, regex=True)].str.split('_').values
    last_day = np.array([''.join(filter(lambda x: x.isdigit(), t[-1])) for t in _tokens]).astype(int).max()
    all_days_data = []
    for i in range(last_day):
        print('Processing day', i + 1)
        # Columns belonging to study day i+1 (with the optional suffix).
        study_day_cols = dataset.columns[dataset.columns.str.endswith('_d{0}{1}'.format(i + 1, suffix))]
        study_day_data = dataset[study_day_cols].copy()
        # Strip the trailing '_dN' / '_dNsuffix' token from the names so
        # all days share a common column set.
        study_day_data.columns = ['_'.join(col.split('_')[:-1]) for col in study_day_cols]
        study_day_data['day'] = i + 1
        all_days_data.append(study_day_data)
    # (Removed dead `if False:` debug-print block from the original.)
    return pd.concat(all_days_data).reset_index()
bbal_long = daily_data_to_long(trial_data).query('bbal==1')
Processing day 1 Processing day 2 Processing day 3 Processing day 4 Processing day 5 Processing day 6 Processing day 7 Processing day 8 Processing day 9 Processing day 10 Processing day 11 Processing day 12 Processing day 13 Processing day 14
# CFU-count columns from the blind BAL instrument.
bbal_cfu_cols = bbal_long.columns[bbal_long.columns.str.contains('cfu')
                                  & bbal_long.columns.str.startswith('bbal')]
# Culture pathogen-code columns, excluding the free-text 'additional
# pathogen' fields.
bbal_pathogens_cols = bbal_long.columns[bbal_long.columns.str.contains('pathogen')
                                        & ~bbal_long.columns.str.contains('add')
                                        & bbal_long.columns.str.startswith('culture')]
Encode CFU count ranges to ordinal variable
# Ordinal encoding of CFU count ranges (string labels -> ranked integers).
cfu_coding = {'<1,000': 0,
              '<10,000': 1,
              '10,000 - 25,000': 2,
              '25,000 - 50,000': 3,
              '50,000 - 100,000': 4,
              '>100,000': 5}
Lookup table to translate species codes
# REDCap pathogen code -> species name for the culture instrument.
# NOTE(review): spellings ('Acetinobacter', 'Streptococcus pneumonia',
# 'Haemophilus influenza') mirror the REDCap coding and are kept verbatim
# so value replacement matches; confirm against the data dictionary
# before "fixing" them.
bbal_path_lookup = {1: 'Staphylococcus aureus',
                    2: 'Streptococcus pneumonia',
                    3: 'Streptococcus Group B',
                    4: 'Acetinobacter baumannii',
                    5: 'Pseudomonas aeruginosa',
                    6: 'Haemophilus influenza',
                    7: 'Klebsiella pneumoniae',
                    8: 'Escherichia coli',
                    9: 'Enterobacter cloacae',
                    10: 'Stenotrophomonas maltophilia',
                    11: 'Enterobacter aerogenes',
                    12: 'Serratia marcescens',
                    13: 'Klebsiella oxytoca',
                    14: 'Proteus mirabilis',
                    15: 'Other'}
# Melt pathogen-code columns to long format: one row per (record, day, column).
bbal_pathogens_long = (bbal_long[bbal_pathogens_cols.tolist() + ['record_id', 'day']]
                       .pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
bbal_pathogens = bbal_pathogens_long.assign(pathogen=bbal_pathogens_long.value.replace(bbal_path_lookup)
                                            ).drop(['variable', 'value'], axis=1)
# Melt the matching CFU-count columns the same way.
bbal_cfu_long = (bbal_long[bbal_cfu_cols.tolist() + ['record_id', 'day']]
                 .pipe(pd.melt, id_vars=['record_id','day']))
# Make sure they are the same size!
assert bbal_pathogens_long.shape==bbal_cfu_long.shape
# Normalize count strings ('E' notation, '+' signs, '/' typos) and attach
# the numeric count to the pathogens table by position.
bbal_pathogens['cfu_count'] = (bbal_cfu_long.value.astype(str).str.replace('E','e').str.replace('+','')
                               .str.replace('/','.')).astype(float)
bbal_pathogens_sorted = bbal_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
def fill_pathogens(x, labels, lookup):
    """Expand one (record, day) group to the full pathogen panel.

    Keeps one row per observed pathogen, reindexes to every species in
    `lookup`, tags the rows with the group's record_id and day, and
    zero-fills counts for pathogens that were not observed.
    """
    recid, day = labels
    observed = x.dropna().drop_duplicates(subset=['pathogen'])
    panel = observed.set_index('pathogen').reindex(list(lookup.values()))
    panel = panel.reset_index().assign(record_id=recid, day=day)
    return panel.fillna(0)
# Expand each (patient, day) group to the full pathogen list, zero-filling
# unobserved species, then write the cleaned table to CSV.
bbal_groups = []
for labels, group in bbal_pathogens_sorted.groupby(['record_id', 'day']):
    recid, day = labels
    group_full = fill_pathogens(group, labels, bbal_path_lookup)
    bbal_groups.append(group_full)
bbal_complete = pd.concat(bbal_groups).reset_index()
bbal_complete.to_csv('../data/clean/bbal_pathogens.csv')
bbal_complete.head()
index | pathogen | record_id | day | cfu_count | |
---|---|---|---|---|---|
0 | 0 | Staphylococcus aureus | 1001 | 10 | 0.0 |
1 | 1 | Streptococcus pneumonia | 1001 | 10 | 0.0 |
2 | 2 | Streptococcus Group B | 1001 | 10 | 0.0 |
3 | 3 | Acetinobacter baumannii | 1001 | 10 | 0.0 |
4 | 4 | Pseudomonas aeruginosa | 1001 | 10 | 0.0 |
bbal_complete.cfu_count.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10ceca940>
(bbal_complete.groupby('record_id').cfu_count.max()>=3).sum()
17
(bbal_complete.groupby('record_id').cfu_count.max()==0).sum()
7
ax = bbal_complete.groupby('record_id').cfu_count.max().hist(bins=range(8), align='left')
ax.set_xlabel('max. count')
ax.set_ylabel('frequency');
Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.
bbal_grid = sns.FacetGrid(bbal_complete,
col='record_id', hue="pathogen", col_wrap=4, size=2.5)
bbal_grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
bbal_grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x10d14dda0>
positive_bbal = bbal_cfu_long.fillna(0).groupby('record_id').apply(lambda x: (x.value>0).any())
positive_bbal.sum()
21
positive_bbal.mean()
0.75
First need to reshape data to long format (one row per date of study per patient)
daily_long = daily_data_to_long(trial_data)
Processing day 1 Processing day 2 Processing day 3 Processing day 4 Processing day 5 Processing day 6 Processing day 7 Processing day 8 Processing day 9 Processing day 10 Processing day 11 Processing day 12 Processing day 13 Processing day 14
daily_long.head()
record_id | abg_yn | amt_irrigant | ards | bbal | bbal_add_pathogen3 | bbal_add_pathogen4 | bbal_add_pathogen_2 | bbal_cfu | bbal_cfu2 | ... | plt | reintubated | spo2 | temperature | urine_output | vasopressin_rate | vasopressin_yn | vasopressors_yn | vent_day | wbc_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 0.0 | 1.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 131.0 | 0.0 | 100.0 | 38.2 | 1800.0 | NaN | NaN | 0.0 | 1.0 | 11.0 |
1 | 1002 | 1.0 | 40.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 219.0 | 0.0 | NaN | 37.1 | 1875.0 | NaN | 0.0 | 1.0 | 1.0 | 13.6 |
2 | 1003 | 1.0 | 20.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 94.0 | 0.0 | NaN | 37.9 | 1045.0 | NaN | 0.0 | 1.0 | 1.0 | 8.4 |
3 | 1004 | 1.0 | 40.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 307.0 | 0.0 | NaN | 39.3 | 1474.0 | NaN | NaN | 0.0 | 1.0 | 14.1 |
4 | 1005 | 1.0 | 20.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 55.0 | 0.0 | NaN | 38.2 | 1550.0 | NaN | 0.0 | 1.0 | 1.0 | 15.4 |
5 rows × 106 columns
Drop rows with no date, columns with no data
data_long_complete = daily_long.dropna(subset=['date_study_day']).dropna(axis=1, thresh=1)
data_long_complete.head()
record_id | abg_yn | amt_irrigant | ards | bbal | bbal_add_pathogen3 | bbal_add_pathogen4 | bbal_add_pathogen_2 | bbal_cfu | bbal_cfu2 | ... | plt | reintubated | spo2 | temperature | urine_output | vasopressin_rate | vasopressin_yn | vasopressors_yn | vent_day | wbc_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | 0.0 | 1.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 131.0 | 0.0 | 100.0 | 38.2 | 1800.0 | NaN | NaN | 0.0 | 1.0 | 11.0 |
1 | 1002 | 1.0 | 40.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 219.0 | 0.0 | NaN | 37.1 | 1875.0 | NaN | 0.0 | 1.0 | 1.0 | 13.6 |
2 | 1003 | 1.0 | 20.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 94.0 | 0.0 | NaN | 37.9 | 1045.0 | NaN | 0.0 | 1.0 | 1.0 | 8.4 |
3 | 1004 | 1.0 | 40.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 307.0 | 0.0 | NaN | 39.3 | 1474.0 | NaN | NaN | 0.0 | 1.0 | 14.1 |
4 | 1005 | 1.0 | 20.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | 55.0 | 0.0 | NaN | 38.2 | 1550.0 | NaN | 0.0 | 1.0 | 1.0 | 15.4 |
5 rows × 90 columns
pcr_pattern = '_d\d+pcr$'
pcr_data.columns.str.contains(pcr_pattern, regex=True).sum()
1642
pcr_data_long = daily_data_to_long(pcr_data, pattern=pcr_pattern, suffix='pcr')
Processing day 1 Processing day 2 Processing day 3 Processing day 4 Processing day 5 Processing day 6 Processing day 7 Processing day 8 Processing day 9 Processing day 10 Processing day 11 Processing day 12 Processing day 13 Processing day 14
Drop rows with no date or no PCR result, and columns with no data
pcr_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'mbal_pcr_result']).dropna(axis=1, thresh=1)
pcr_long_complete['date_study_day'] = pd.to_datetime(pcr_long_complete['date_study_day'])
pcr_long_complete.head()
record_id | bbal_add_path2 | bbal_add_path3 | bbal_collected | bbal_copies1 | bbal_path1 | bbal_pcr_result | bhme_add_path2 | bhme_add_path3 | bhme_add_path4 | ... | mbal_copies6 | mbal_copies7 | mbal_path1 | mbal_path2 | mbal_path3 | mbal_path4 | mbal_path5 | mbal_path6 | mbal_path7 | mbal_pcr_result | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1003 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
3 | 1005 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
5 | 1007 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 2.0 | 13.0 | 6.0 | 11.0 | NaN | NaN | NaN | 1.0 |
6 | 1008 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
7 | 1009 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
5 rows × 67 columns
Plotting time series for mini-BAL PCR counts.
mbal_pcr_copies_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('copies')
& pcr_long_complete.columns.str.startswith('mbal')]
mbal_pcr_pathogens_cols = pcr_long_complete.columns[pcr_long_complete.columns.str.contains('path')
& ~pcr_long_complete.columns.str.contains('add')
& pcr_long_complete.columns.str.startswith('mbal')]
Lookup table to translate species codes
pcr_path_lookup = {1: 'Staphylococcus aureus',
2: 'Streptococcus pneumonia',
3: 'Streptococcus Group B',
4: 'Acetinobacter baumannii',
5: 'Pseudomonas aeruginosa',
6: 'Haemophilus influenza',
7: 'Klebsiella pneumoniae',
8: 'Escherichia coli',
9: 'Enterobacter cloacae',
10: 'Stenotrophomonas maltophilia',
11: 'Enterobacter aerogenes',
12: 'Serratia marcescens',
13: 'Klebsiella oxytoca',
14: 'Proteus mirabilis',
15: 'Enterococcus faecalis',
16: 'Enterococcus faecium',
17: 'Candida albicans',
18: 'Other'}
Convert pathogens table from wide to long format
mbal_pcr_pathogens_long = (pcr_long_complete[mbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
mbal_pcr_pathogens = mbal_pcr_pathogens_long.assign(pathogen=mbal_pcr_pathogens_long.value.replace(pcr_path_lookup)
).drop(['variable', 'value'], axis=1)
Convert counts table from wide to long format
mbal_pcr_counts_long = (pcr_long_complete[mbal_pcr_copies_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id','day']))
# Make sure they are the same size!
assert mbal_pcr_pathogens_long.shape==mbal_pcr_counts_long.shape
Append count onto pathogens table
mbal_pcr_pathogens['pcr_count'] = (mbal_pcr_counts_long.value.astype(str).str.replace('E','e').str.replace('+','')
.str.replace('/','.')).astype(float)
mbal_pcr_pathogens_sorted = mbal_pcr_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
# Same expansion as for the culture data: fill each (patient, day) group
# out to the full PCR pathogen panel with zeros, then export.
mbal_pcr_groups = []
for labels, group in mbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
    recid, day = labels
    group_full = fill_pathogens(group, labels, pcr_path_lookup)
    mbal_pcr_groups.append(group_full)
mbal_pcr_complete = pd.concat(mbal_pcr_groups).reset_index()
mbal_pcr_complete.to_csv('../data/clean/mbal_pcr_pathogens.csv')
mbal_pcr_complete.head()
index | pathogen | record_id | day | pcr_count | |
---|---|---|---|---|---|
0 | 0 | Staphylococcus aureus | 1002 | 4 | 0.0 |
1 | 1 | Streptococcus pneumonia | 1002 | 4 | 0.0 |
2 | 2 | Streptococcus Group B | 1002 | 4 | 0.0 |
3 | 3 | Acetinobacter baumannii | 1002 | 4 | 0.0 |
4 | 4 | Pseudomonas aeruginosa | 1002 | 4 | 0.0 |
Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.
abx_table.ix[1002]
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel/__main__.py:1: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix if __name__ == '__main__':
abx_name | antifungal | abx_start | abx_stop | abx_indication | |
---|---|---|---|---|---|
abx_number | |||||
1 | Cefazolin | 1.0 | 2016-04-04 | 2016-04-04 | 1.0 |
2 | Cefazolin | 1.0 | 2016-04-11 | 2016-04-12 | 1.0 |
3 | Meropenem | 1.0 | 2016-04-12 | 2016-05-15 | 2.0 |
4 | Meropenem | 1.0 | 2016-04-17 | 2016-05-20 | 2.0 |
5 | Vancomycin | 1.0 | 2016-04-12 | 2016-04-15 | 2.0 |
6 | Vancomycin | 1.0 | 2016-04-17 | 2016-04-19 | 2.0 |
def vertical_abx_line(x, count_data=pcr_long_complete, **kwargs):
    """Draw dotted vertical lines at each antibiotic start day for one patient.

    Intended for use with ``FacetGrid.map``: *x* is the ``record_id`` series
    for the current facet (all values identical, so ``min()`` recovers the
    patient id).

    Parameters
    ----------
    x : Series
        ``record_id`` values for the facet being drawn.
    count_data : DataFrame
        Long-format PCR data used to anchor study day 0 to a calendar date.
        NOTE: the default is bound at definition time to the notebook-global
        ``pcr_long_complete``.
    **kwargs
        Forwarded to ``plt.axvline`` (e.g. ``alpha``).
    """
    record_id = x.min()
    # .loc replaces the deprecated .ix indexer (label-based lookup).
    start_dates = np.unique(abx_table.loc[record_id].abx_start.dt.date.values)
    first_date = count_data.loc[count_data.record_id == record_id,
                                ['date_study_day', 'day']].min()
    # Back out the calendar date corresponding to study day 0, so antibiotic
    # start dates can be placed on the same day axis as the counts.
    reference_date = first_date.date_study_day - pd.Timedelta('{} days'.format(first_date.day))
    if np.any(start_dates):
        for date in start_dates:
            plt.axvline(x=(pd.Timestamp(date) - reference_date).days,
                        ls=':', c='grey', **kwargs)
sns.set_style('ticks')
grid = sns.FacetGrid(mbal_pcr_complete.assign(log_count=(mbal_pcr_complete.pcr_count + 1).apply(np.log)),
col='record_id', hue="pathogen", col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
grid.map(vertical_abx_line, 'record_id', alpha=0.2)
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x10d54cdd8>
bbal_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'bbal_pcr_result']).dropna(axis=1, thresh=1)
bbal_long_complete.head()
record_id | bbal_add_path2 | bbal_add_path3 | bbal_add_path4 | bbal_add_path5 | bbal_add_path6 | bbal_add_path7 | bbal_collected | bbal_copies1 | bbal_copies2 | ... | hme_path2 | hme_path3 | hme_pcr_result | mbal_add_path2 | mbal_add_path3 | mbal_collected | mbal_copies1 | mbal_path1 | mbal_path2 | mbal_pcr_result | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11 | 1013 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | 1.0 | 330000.0 | 1060000.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN |
37 | 1039 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN |
39 | 1041 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN |
41 | 1043 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | ... | NaN | NaN | 2.0 | NaN | NaN | 1.0 | NaN | NaN | NaN | 2.0 |
56 | 1014 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | 1.0 | 447000.0 | 1040000.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN |
5 rows × 51 columns
bbal_pcr_copies_cols = bbal_long_complete.columns[bbal_long_complete.columns.str.contains('copies')
& bbal_long_complete.columns.str.startswith('bbal')]
bbal_pcr_pathogens_cols = bbal_long_complete.columns[bbal_long_complete.columns.str.contains('path')
& ~bbal_long_complete.columns.str.contains('add')
& bbal_long_complete.columns.str.startswith('bbal')]
Convert pathogens table from wide to long format
bbal_pcr_pathogens_long = (bbal_long_complete[bbal_pcr_pathogens_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
bbal_pcr_pathogens = bbal_pcr_pathogens_long.assign(pathogen=bbal_pcr_pathogens_long.value.replace(pcr_path_lookup)
).drop(['variable', 'value'], axis=1)
Convert counts table from wide to long format
bbal_pcr_counts_long = (bbal_long_complete[bbal_pcr_copies_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id','day']))
# Make sure they are the same size!
assert bbal_pcr_pathogens_long.shape==bbal_pcr_counts_long.shape
Append count onto pathogens table
bbal_pcr_pathogens['pcr_count'] = (bbal_pcr_counts_long.value.astype(str).str.replace('E','e').str.replace('+','')
.str.replace('/','.')).astype(float)
bbal_pcr_pathogens_sorted = bbal_pcr_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
bbal_pcr_groups = []
for labels, group in bbal_pcr_pathogens_sorted.groupby(['record_id', 'day']):
recid, day = labels
group_full = fill_pathogens(group, labels, pcr_path_lookup)
bbal_pcr_groups.append(group_full)
bbal_pcr_complete = pd.concat(bbal_pcr_groups).reset_index()
bbal_pcr_complete.to_csv('../data/clean/bbal_pcr_pathogens.csv')
bbal_pcr_complete.head()
index | pathogen | record_id | day | pcr_count | |
---|---|---|---|---|---|
0 | 0 | Staphylococcus aureus | 1011 | 6 | 591000.0 |
1 | 1 | Streptococcus pneumonia | 1011 | 6 | 0.0 |
2 | 2 | Streptococcus Group B | 1011 | 6 | 0.0 |
3 | 3 | Acetinobacter baumannii | 1011 | 6 | 0.0 |
4 | 4 | Pseudomonas aeruginosa | 1011 | 6 | 0.0 |
sns.set_style('ticks')
grid = sns.FacetGrid(bbal_pcr_complete.assign(log_count=(bbal_pcr_complete.pcr_count
+ 1 + 0.1*np.random.randn(bbal_pcr_complete.shape[0]))
.apply(np.log)),
col='record_id', hue="pathogen",
col_wrap=4, size=2.5, palette=sns.color_palette("hls", 18))
grid.map(plt.plot, "day", "log_count", marker="o", ms=4, alpha=0.5)
# grid.map(vertical_abx_line, 'record_id', alpha=0.2)
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x119ab7358>
hme_long_complete = pcr_data_long.dropna(subset=['date_study_day', 'hme_pcr_result']).dropna(axis=1, thresh=1)
hme_long_complete.head()
record_id | bbal_add_path2 | bbal_add_path3 | bbal_collected | bbal_copies1 | bbal_path1 | bbal_pcr_result | bhme_add_path2 | bhme_add_path3 | bhme_add_path4 | ... | mbal_copies2 | mbal_copies3 | mbal_copies4 | mbal_copies5 | mbal_path1 | mbal_path2 | mbal_path3 | mbal_path4 | mbal_path5 | mbal_pcr_result | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1003 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
5 | 1007 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 299000.0 | 2520000.0 | 15700000.0 | NaN | 2.0 | 13.0 | 6.0 | 11.0 | NaN | 1.0 |
6 | 1008 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
7 | 1009 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | 1.0 |
10 | 1012 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 557000.0 | 77800.0 | NaN | NaN | 1.0 | 13.0 | 6.0 | NaN | NaN | 1.0 |
5 rows × 62 columns
hme_copies_cols = hme_long_complete.columns[hme_long_complete.columns.str.contains('copies')
& hme_long_complete.columns.str.startswith('hme')]
hme_pathogens_cols = hme_long_complete.columns[hme_long_complete.columns.str.contains('path')
& ~hme_long_complete.columns.str.contains('add')
& hme_long_complete.columns.str.startswith('hme')]
Convert pathogens and counts tables from wide to long format
hme_pathogens_long = (hme_long_complete[hme_pathogens_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
hme_pathogens = hme_pathogens_long.assign(pathogen=hme_pathogens_long.value.replace(pcr_path_lookup)
).drop(['variable', 'value'], axis=1)
hme_counts_long = (hme_long_complete[hme_copies_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id','day']))
# Make sure they are the same size!
assert hme_pathogens_long.shape==hme_counts_long.shape
Replace bad values
hme_counts_long.value[~hme_counts_long.value.apply(np.isreal)]
Series([], Name: value, dtype: float64)
hme_counts_long.loc[47, 'value'] = 5.47E+08
Append count onto pathogens table
hme_pathogens['pcr_count'] = (hme_counts_long.value.astype(str).str.replace('E','e').str.replace('+','')
.str.replace('/','.')).astype(float)
hme_pathogens_sorted = hme_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
hme_groups = []
for labels, group in hme_pathogens_sorted.groupby(['record_id', 'day']):
recid, day = labels
group_full = fill_pathogens(group, labels, pcr_path_lookup)
hme_groups.append(group_full)
hme_complete = pd.concat(hme_groups).reset_index()
hme_complete.to_csv('../data/clean/hme_pathogens.csv')
hme_complete.head()
index | pathogen | record_id | day | pcr_count | |
---|---|---|---|---|---|
0 | 0 | Staphylococcus aureus | 1002 | 5 | 0.0 |
1 | 1 | Streptococcus pneumonia | 1002 | 5 | 0.0 |
2 | 2 | Streptococcus Group B | 1002 | 5 | 0.0 |
3 | 3 | Acetinobacter baumannii | 1002 | 5 | 0.0 |
4 | 4 | Pseudomonas aeruginosa | 1002 | 5 | 0.0 |
Each panel represents a patient; each colored line a pathogen. X and Y axes are days and log-count, respectively.
hme_grid = sns.FacetGrid(hme_complete.assign(log_count=(hme_complete.pcr_count + 1).apply(np.log)),
col='record_id', hue="pathogen", col_wrap=4, size=2.5)
hme_grid.map(plt.plot, "day", "log_count", marker="o", ms=4)
hme_grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x10d54ca20>
# Same pathogen list as bBAL
mbal_path_lookup = bbal_path_lookup
culture_long = daily_data_to_long(culture_data)
Processing day 1 Processing day 2 Processing day 3 Processing day 4 Processing day 5 Processing day 6 Processing day 7 Processing day 8 Processing day 9 Processing day 10 Processing day 11 Processing day 12 Processing day 13 Processing day 14
Drop rows with no date or no culture test, and columns with no data
culture_long_complete = culture_long.dropna(subset=['date_study_day', 'mbal_culture_test']).dropna(axis=1, thresh=1)
culture_cfu_cols = culture_long_complete.columns[culture_long_complete.columns.str.contains('cfu')
& culture_long_complete.columns.str.startswith('mbal')]
culture_pathogens_cols = culture_long_complete.columns[culture_long_complete.columns.str.contains('pathogen')
& ~culture_long_complete.columns.str.contains('add')
& culture_long_complete.columns.str.startswith('mbal')]
culture_pathogens_long = (culture_long_complete[culture_pathogens_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id', 'day']))
# Strip out day from variable and replace pathogen code with name
culture_pathogens = culture_pathogens_long.assign(pathogen=culture_pathogens_long.value.replace(mbal_path_lookup)
).drop(['variable', 'value'], axis=1)
culture_counts_long = (culture_long_complete[culture_cfu_cols.tolist() + ['record_id', 'day']]
.pipe(pd.melt, id_vars=['record_id','day']))
# Make sure they are the same size!
assert culture_pathogens_long.shape==culture_counts_long.shape
Append CFU count onto pathogens table
culture_pathogens['cfu_count'] = (culture_counts_long.value.astype(str).str.replace('E','e').str.replace('+','')
.str.replace('/','.')).astype(float)
culture_pathogens_sorted = culture_pathogens.sort_values(by=['record_id', 'day']).reset_index(drop=True)
groups = []
for labels, group in culture_pathogens_sorted.groupby(['record_id', 'day']):
recid, day = labels
group_full = fill_pathogens(group, labels, mbal_path_lookup)
groups.append(group_full)
culture_complete = pd.concat(groups).reset_index()
culture_complete.to_csv('../data/clean/culture_pathogens.csv')
Each panel represents a patient; each colored line a pathogen. X and Y axes are days and CFU count, respectively.
grid = sns.FacetGrid(culture_complete,
col='record_id', hue="pathogen", col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "cfu_count", marker="o", ms=4)
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x11defaa20>
pathogens = ['Staphylococcus aureus', 'Haemophilus influenza', 'Acetinobacter baumannii', 'Klebsiella pneumoniae']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16,14))
cbar_ax = fig.add_axes([.91, .3, .03, .4])
for ax,pathogen in zip(axes.ravel(), pathogens):
draw_cbar = pathogen=='Haemophilus influenza'
pathogen_grid = culture_complete[culture_complete.pathogen==pathogen].pivot('record_id', 'day', 'cfu_count')
sns.heatmap(pathogen_grid, ax=ax, vmin=0, vmax=6, cbar=draw_cbar, cbar_ax=cbar_ax if draw_cbar else None)
ax.set_yticklabels([''])
ax.set_title(pathogen)
ax.set_ylabel('patient')
fig.tight_layout(rect=[0, 0, .9, 1])
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/matplotlib/figure.py:1742: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect. warnings.warn("This figure includes Axes that are not "
Merge PCR and CFU DataFrames on subject, day and pathogen:
mbal_merged = mbal_pcr_pathogens.merge(culture_pathogens, on=['record_id', 'day', 'pathogen'], how='left').fillna(0)
mbal_merged.shape
(4333, 5)
mbal_merged.head()
record_id | day | pathogen | pcr_count | cfu_count | |
---|---|---|---|---|---|
0 | 1003 | 1 | 0 | 0.0 | 0.0 |
1 | 1003 | 1 | 0 | 0.0 | 0.0 |
2 | 1003 | 1 | 0 | 0.0 | 0.0 |
3 | 1003 | 1 | 0 | 0.0 | 0.0 |
4 | 1005 | 1 | 0 | 0.0 | 0.0 |
axes = (mbal_merged.assign(log_pcr_count=mbal_merged.pcr_count.apply(np.log))
.plot.scatter(x='log_pcr_count', y='cfu_count'))
axes.set_ylim(0, mbal_merged.cfu_count.max()+1);
bbal_agg = (bbal_pathogens.dropna()
.groupby(['record_id', 'pathogen'])[['cfu_count']].max())
bbal_agg.head()
cfu_count | ||
---|---|---|
record_id | pathogen | |
1001 | Other | 2.0 |
1003 | Klebsiella pneumoniae | 3.0 |
Other | 2.0 | |
1007 | Enterobacter cloacae | 2.0 |
Haemophilus influenza | 6.0 |
hme_agg = (hme_pathogens.assign(log_count=np.log(hme_pathogens.pcr_count))
.drop('pcr_count', axis=1)
.dropna()
.groupby(['record_id', 'pathogen'])[['log_count']].max())
hme_agg.head()
log_count | ||
---|---|---|
record_id | pathogen | |
1002 | Enterobacter aerogenes | 16.985196 |
Klebsiella oxytoca | 14.883664 | |
Klebsiella pneumoniae | 15.373655 | |
1003 | Acetinobacter baumannii | 14.538217 |
Haemophilus influenza | 15.646491 |
bbal_hme = (bbal_agg.join(hme_agg).fillna(0)
.reset_index()
.rename(columns={'log_count':'HME count',
'cfu_count':'bBAL count'}))
bbal_hme.head()
record_id | pathogen | bBAL count | HME count | |
---|---|---|---|---|
0 | 1001 | Other | 2.0 | 0.000000 |
1 | 1003 | Klebsiella pneumoniae | 3.0 | 20.681402 |
2 | 1003 | Other | 2.0 | 0.000000 |
3 | 1007 | Enterobacter cloacae | 2.0 | 0.000000 |
4 | 1007 | Haemophilus influenza | 6.0 | 14.200773 |
n = bbal_hme.shape[0]
s = 0.1
grid = sns.FacetGrid(data=bbal_hme.assign(hme_count=bbal_hme['HME count']+np.random.randn(n)*s,
bbal_count=bbal_hme['bBAL count']+np.random.randn(n)*s), hue='pathogen', aspect=1.5, size=5)
grid.map(plt.scatter, 'hme_count', 'bbal_count', alpha=0.5).add_legend()
<seaborn.axisgrid.FacetGrid at 0x1224c3780>
bbal_hme['bbal_positive'] = bbal_hme['bBAL count'] >=3
bbal_hme['hme_positive'] = bbal_hme['HME count'] > 0
from sklearn import metrics
Sensitivity (recall)
metrics.recall_score(bbal_hme.bbal_positive, bbal_hme.hme_positive)
0.70370370370370372
Precision score (PPV)
metrics.precision_score(bbal_hme.bbal_positive, bbal_hme.hme_positive)
0.61290322580645162
def calc_recall(x):
    """Recall (sensitivity) of HME positivity against bBAL positivity
    for one pathogen group.

    Returns NaN when the group contains no bBAL-positive rows, since
    recall is undefined without any actual positives.
    """
    has_positives = bool(x.bbal_positive.sum())
    if has_positives:
        return metrics.recall_score(x.bbal_positive, x.hme_positive)
    return np.nan
bbal_hme.groupby('pathogen').apply(calc_recall)
pathogen Acetinobacter baumannii 1.000000 Enterobacter aerogenes NaN Enterobacter cloacae 0.000000 Escherichia coli 0.500000 Haemophilus influenza 0.750000 Klebsiella oxytoca NaN Klebsiella pneumoniae 1.000000 Other 0.000000 Pseudomonas aeruginosa 1.000000 Serratia marcescens NaN Staphylococcus aureus 0.833333 Stenotrophomonas maltophilia 1.000000 Streptococcus pneumonia 1.000000 dtype: float64
bbal_positive = bbal_hme[bbal_hme.bbal_positive==1].copy()
pathogen_list = bbal_positive.pathogen.unique()
n_pathogens = pathogen_list.shape[0]
pathogen_encode = dict(zip(pathogen_list, range(n_pathogens)))
pathogen_decode = dict(zip(range(n_pathogens), pathogen_list))
bbal_positive['pathogen_id'] = bbal_positive.pathogen.replace(pathogen_encode)
import pymc3 as pm
x = bbal_positive.pathogen_id.values
y = bbal_positive.hme_positive.values
# Hierarchical (partial-pooling) model for per-pathogen sensitivity:
# each pathogen's log-odds θ is drawn from a shared Normal(μ, σ).
with pm.Model() as recall_model:
    μ = pm.Normal('μ', 0, sd=10)            # population mean log-odds
    σ = pm.HalfCauchy('σ', 2.5)             # between-pathogen spread
    θ = pm.Normal('θ', μ, sd=σ, shape=n_pathogens)
    π = pm.Deterministic('π', pm.math.invlogit(θ))  # per-pathogen recall
    # Fixed misspelled RV name ('likeihood' -> 'likelihood'); the name is
    # never referenced downstream, so the rename is safe.
    pm.Bernoulli('likelihood', π[x], observed=y)
with recall_model:
tr = pm.sample(2000, n_init=20000, random_seed=20090425)
Auto-assigning NUTS sampler... Initializing NUTS using advi... Average ELBO = -20.161: 100%|██████████| 20000/20000 [00:01<00:00, 14296.79it/s] Finished [100%]: Average ELBO = -20.103 100%|██████████| 2000/2000 [00:02<00:00, 803.39it/s]
pm.summary(tr[1000:], varnames=['π'])
π: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- 0.893 0.131 0.006 [0.617, 1.000] 0.746 0.177 0.005 [0.397, 0.996] 0.883 0.143 0.006 [0.561, 1.000] 0.819 0.132 0.005 [0.557, 0.999] 0.876 0.149 0.006 [0.573, 1.000] 0.869 0.160 0.007 [0.536, 1.000] 0.275 0.241 0.014 [0.000, 0.745] 0.822 0.221 0.008 [0.304, 1.000] 0.588 0.255 0.009 [0.119, 0.996] 0.217 0.211 0.012 [0.000, 0.658] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| 0.540 0.843 0.946 0.990 1.000 0.342 0.634 0.780 0.885 0.986 0.455 0.833 0.938 0.990 1.000 0.506 0.745 0.847 0.921 0.990 0.472 0.809 0.934 0.990 1.000 0.414 0.798 0.931 0.991 1.000 0.000 0.066 0.216 0.451 0.804 0.174 0.748 0.914 0.986 1.000 0.070 0.401 0.619 0.796 0.976 0.000 0.034 0.152 0.352 0.719
gs = pm.forestplot(tr[1000:], varnames=['π'], ylabels=pathogen_list,
xtitle='Sensitivity (Recall)')
pm.plot_posterior(tr[1000:], varnames=['μ'], point_estimate='median',
transform=lambda x: 1/(1+np.exp(-x)))
array([<matplotlib.axes._subplots.AxesSubplot object at 0x12d4130f0>], dtype=object)
PPV estimation
hme_positive = bbal_hme[bbal_hme.hme_positive==1].copy()
pathogen_list = hme_positive.pathogen.unique()
n_pathogens = pathogen_list.shape[0]
pathogen_encode = dict(zip(pathogen_list, range(n_pathogens)))
pathogen_decode = dict(zip(range(n_pathogens), pathogen_list))
hme_positive['pathogen_id'] = hme_positive.pathogen.replace(pathogen_encode)
hme_positive.shape
(31, 7)
x = hme_positive.pathogen_id.values
y = hme_positive.bbal_positive.values
# Hierarchical model for per-pathogen positive predictive value, mirroring
# the sensitivity model: pathogen log-odds θ share a common Normal(μ, σ).
with pm.Model() as ppv_model:
    μ = pm.Normal('μ', 0, sd=10)            # population mean log-odds
    σ = pm.HalfCauchy('σ', 2.5)             # between-pathogen spread
    θ = pm.Normal('θ', μ, sd=σ, shape=n_pathogens)
    π = pm.Deterministic('π', pm.math.invlogit(θ))  # per-pathogen PPV
    # Fixed misspelled RV name ('likeihood' -> 'likelihood'); the name is
    # never referenced downstream, so the rename is safe.
    pm.Bernoulli('likelihood', π[x], observed=y)
with ppv_model:
tr_ppv = pm.sample(2000, n_init=20000, random_seed=20090425)
Auto-assigning NUTS sampler... Initializing NUTS using advi... Average ELBO = -27.131: 100%|██████████| 20000/20000 [00:01<00:00, 13681.07it/s] Finished [100%]: Average ELBO = -27.114 100%|██████████| 2000/2000 [00:02<00:00, 798.43it/s]
pm.forestplot(tr_ppv[1000:], varnames=['π'], ylabels=pathogen_list,
xtitle='Positive predictive value')
<matplotlib.gridspec.GridSpec at 0x12fd12dd8>
NPV estimation
bbal_hme_complete = bbal_hme.set_index(['record_id', 'pathogen'])
complete_index = pd.MultiIndex.from_product(bbal_hme_complete.index.levels,
names=bbal_hme_complete.index.names)
bbal_hme_complete = bbal_hme_complete.reindex(complete_index, fill_value=0).reset_index()
bbal_negative = bbal_hme_complete[bbal_hme_complete.bbal_positive==0].copy()
pathogen_list = bbal_negative.pathogen.unique()
n_pathogens = pathogen_list.shape[0]
pathogen_encode = dict(zip(pathogen_list, range(n_pathogens)))
pathogen_decode = dict(zip(range(n_pathogens), pathogen_list))
bbal_negative['pathogen_id'] = bbal_negative.pathogen.replace(pathogen_encode)
x = bbal_negative.pathogen_id.values
y = bbal_negative.hme_positive.values.astype(int)==0
# Hierarchical model for per-pathogen negative predictive value, same
# partial-pooling structure as the sensitivity and PPV models.
with pm.Model() as npv_model:
    μ = pm.Normal('μ', 0, sd=10)            # population mean log-odds
    σ = pm.HalfCauchy('σ', 2.5)             # between-pathogen spread
    θ = pm.Normal('θ', μ, sd=σ, shape=n_pathogens)
    π = pm.Deterministic('π', pm.math.invlogit(θ))  # per-pathogen NPV
    # Fixed misspelled RV name ('likeihood' -> 'likelihood'); the name is
    # never referenced downstream, so the rename is safe.
    pm.Bernoulli('likelihood', π[x], observed=y)
with npv_model:
tr_npv = pm.sample(2000, n_init=20000, random_seed=20090425)
Auto-assigning NUTS sampler... Initializing NUTS using advi... Average ELBO = -55.094: 100%|██████████| 20000/20000 [00:01<00:00, 12545.31it/s] Finished [100%]: Average ELBO = -54.432 100%|██████████| 2000/2000 [00:02<00:00, 689.76it/s]
gs = pm.forestplot(tr_npv[1000:], varnames=['π'], ylabels=pathogen_list,
xtitle='Negative predictive value')
pcr_merged = mbal_pcr_pathogens.merge(hme_pathogens, on=['record_id', 'day', 'pathogen'],
how='left',
suffixes=['_mbal', '_hme']).fillna(0)
pcr_merged.head()
record_id | day | pathogen | pcr_count_mbal | pcr_count_hme | |
---|---|---|---|---|---|
0 | 1003 | 1 | 0 | 0.0 | 0.0 |
1 | 1003 | 1 | 0 | 0.0 | 0.0 |
2 | 1003 | 1 | 0 | 0.0 | 0.0 |
3 | 1003 | 1 | 0 | 0.0 | 0.0 |
4 | 1003 | 1 | 0 | 0.0 | 0.0 |
axes = (pcr_merged.assign(log_mbal_count=(pcr_merged.pcr_count_mbal+1).apply(np.log),
log_hme_count=(pcr_merged.pcr_count_hme+1).apply(np.log))
.plot.scatter(x='log_mbal_count', y='log_hme_count'))
axes.set_xlim(0, 25)
axes.set_ylim(0, 25);
Calculate proportion of culture detections that show up in PCR
mbal_merged.head()
record_id | day | pathogen | pcr_count | cfu_count | |
---|---|---|---|---|---|
0 | 1003 | 1 | 0 | 0.0 | 0.0 |
1 | 1003 | 1 | 0 | 0.0 | 0.0 |
2 | 1003 | 1 | 0 | 0.0 | 0.0 |
3 | 1003 | 1 | 0 | 0.0 | 0.0 |
4 | 1005 | 1 | 0 | 0.0 | 0.0 |
mbal_merged['pcr_positive'] = mbal_merged.pcr_count>0
mbal_merged['cfu_positive'] = mbal_merged.cfu_count>0
mbal_merged.pcr_positive.sum()
162
mbal_merged.cfu_positive.sum()
55
mbal_merged[mbal_merged.cfu_positive==1].pcr_positive.mean()
1.0
bbal_long.head()
record_id | abg_yn | amt_irrigant | ards | bbal | bbal_add_pathogen3 | bbal_add_pathogen4 | bbal_add_pathogen_2 | bbal_cfu | bbal_cfu2 | ... | plt | reintubated | spo2 | temperature | urine_output | vasopressin_rate | vasopressin_yn | vasopressors_yn | vent_day | wbc_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6 | 1007 | 1.0 | 20.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | ... | 151.0 | 0.0 | NaN | 38.3 | 504.0 | NaN | 0.0 | 1.0 | 1.0 | 16.5 |
11 | 1012 | 0.0 | 20.0 | 0.0 | 1.0 | NaN | NaN | 0.0 | 6.0 | NaN | ... | 193.0 | 0.0 | 95.0 | 38.8 | 1.0 | NaN | NaN | 0.0 | 1.0 | 11.9 |
12 | 1013 | 0.0 | NaN | 0.0 | 1.0 | 0.0 | NaN | 1.0 | NaN | NaN | ... | 135.0 | 0.0 | 94.0 | 38.9 | 1485.0 | NaN | NaN | 0.0 | 1.0 | 10.8 |
38 | 1039 | 0.0 | NaN | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | ... | 143.0 | 0.0 | 95.0 | 38.2 | 1740.0 | NaN | NaN | 0.0 | 1.0 | 8.5 |
40 | 1041 | 1.0 | NaN | 0.0 | 1.0 | 1.0 | NaN | 1.0 | 6.0 | 5.0 | ... | 81.0 | 0.0 | NaN | 38.5 | 3140.0 | 0.04 | 1.0 | 1.0 | 1.0 | 10.0 |
5 rows × 106 columns
bbal_pathogens.dropna(subset=['pathogen'])
record_id | day | pathogen | cfu_count | |
---|---|---|---|---|
1 | 1012 | 1 | Staphylococcus aureus | 6.0 |
2 | 1013 | 1 | Haemophilus influenza | NaN |
4 | 1041 | 1 | Escherichia coli | 6.0 |
5 | 1043 | 1 | Haemophilus influenza | 6.0 |
7 | 1016 | 2 | Staphylococcus aureus | 3.0 |
8 | 1022 | 2 | Enterobacter cloacae | 3.0 |
10 | 1029 | 2 | Enterobacter cloacae | 2.0 |
11 | 1034 | 2 | Staphylococcus aureus | 6.0 |
14 | 1007 | 3 | Streptococcus pneumonia | 6.0 |
15 | 1018 | 3 | Klebsiella pneumoniae | 6.0 |
17 | 1046 | 3 | Other | 4.0 |
20 | 1011 | 4 | Staphylococcus aureus | 6.0 |
22 | 1019 | 4 | Acetinobacter baumannii | 5.0 |
23 | 1045 | 4 | Streptococcus pneumonia | 6.0 |
24 | 1003 | 5 | Klebsiella pneumoniae | 3.0 |
25 | 1010 | 5 | Haemophilus influenza | NaN |
26 | 1024 | 5 | Staphylococcus aureus | 2.0 |
27 | 1039 | 5 | Staphylococcus aureus | 2.0 |
29 | 1022 | 6 | Pseudomonas aeruginosa | 6.0 |
32 | 1028 | 6 | Staphylococcus aureus | 2.0 |
35 | 1044 | 7 | Enterobacter cloacae | 5.0 |
38 | 1015 | 9 | Stenotrophomonas maltophilia | 2.0 |
41 | 1001 | 10 | Other | 2.0 |
43 | 1019 | 10 | Pseudomonas aeruginosa | 3.0 |
44 | 1038 | 11 | Staphylococcus aureus | 6.0 |
46 | 1022 | 13 | Stenotrophomonas maltophilia | 2.0 |
47 | 1028 | 13 | Staphylococcus aureus | 2.0 |
51 | 1013 | 1 | Other | NaN |
53 | 1041 | 1 | Other | 5.0 |
56 | 1016 | 2 | Klebsiella oxytoca | 1.0 |
57 | 1022 | 2 | Klebsiella pneumoniae | 2.0 |
59 | 1029 | 2 | Other | 3.0 |
60 | 1034 | 2 | Haemophilus influenza | 6.0 |
63 | 1007 | 3 | Haemophilus influenza | 6.0 |
64 | 1018 | 3 | Other | 2.0 |
66 | 1046 | 3 | Staphylococcus aureus | 3.0 |
71 | 1019 | 4 | Haemophilus influenza | 5.0 |
73 | 1003 | 5 | Other | 2.0 |
74 | 1010 | 5 | Staphylococcus aureus | NaN |
75 | 1024 | 5 | Enterobacter aerogenes | 2.0 |
78 | 1022 | 6 | Stenotrophomonas maltophilia | 5.0 |
81 | 1028 | 6 | Escherichia coli | 3.0 |
87 | 1015 | 9 | Other | 2.0 |
92 | 1019 | 10 | Acetinobacter baumannii | 3.0 |
95 | 1022 | 13 | Pseudomonas aeruginosa | 2.0 |
96 | 1028 | 13 | Acetinobacter baumannii | 1.0 |
102 | 1041 | 1 | Staphylococcus aureus | 2.0 |
106 | 1022 | 2 | Acetinobacter baumannii | 2.0 |
109 | 1034 | 2 | Klebsiella pneumoniae | 3.0 |
112 | 1007 | 3 | Other | 2.0 |
113 | 1018 | 3 | Acetinobacter baumannii | 6.0 |
120 | 1019 | 4 | Staphylococcus aureus | 2.0 |
130 | 1028 | 6 | Other | 2.0 |
145 | 1028 | 13 | Stenotrophomonas maltophilia | 1.0 |
155 | 1022 | 2 | Staphylococcus aureus | 2.0 |
161 | 1007 | 3 | Enterobacter cloacae | 2.0 |
179 | 1028 | 6 | Serratia marcescens | 2.0 |
194 | 1028 | 13 | Other | 1.0 |
fig, axes = plt.subplots(1, 2, figsize=(14,8))
(hme_pathogens.assign(log_count=np.log(hme_pathogens.pcr_count))
.dropna(subset=['pathogen'])
.fillna(0)
.groupby(['record_id', 'pathogen'])
.log_count.max()
.reset_index()
.groupby('pathogen')
.log_count.mean()
.sort_values(ascending=False)
.plot.bar(ax=axes[0]))
axes[0].set_title('Mean maximum pathogen count in HME')
axes[0].set_xlabel('')
axes[0].set_ylabel('Log PCR count')
(bbal_pathogens.dropna(subset=['pathogen'])
.fillna(0)
.groupby(['record_id', 'pathogen'])
.cfu_count.max()
.reset_index()
.groupby('pathogen')
.cfu_count.mean()
.sort_values(ascending=False)
.plot.bar(ax=axes[1]))
axes[1].set_title('Mean maximum pathogen count in bBAL')
axes[1].set_xlabel('')
axes[1].set_ylabel('CFU count')
<matplotlib.text.Text at 0x12f2d6048>
Plot the bBAL patients and their WBC, temperature, pulse rate for the 14 days they were on the study.
bbal_ids = bbal_long.record_id.unique()
def plot_variable(var_basename, label, title='', subset=None, data_filter=None):
    """Facet-plot a daily repeated measure from ``trial_data``, one panel
    per patient.

    Parameters
    ----------
    var_basename : str
        Substring identifying the daily columns (e.g. 'wbc' matches
        'wbc_d1', 'wbc_d2', ...).
    label : str
        Name used for the value column and the y-axis label.
    title : str
        Figure title.
    subset : array-like, optional
        Record ids to restrict the plot to; all rows when None.
    data_filter : callable, optional
        Transform applied to the wide-format data before melting.

    Returns
    -------
    seaborn.FacetGrid
    """
    cols = trial_data.columns[trial_data.columns.str.contains(var_basename)]
    data = trial_data.loc[subset, cols] if subset is not None else trial_data[cols]
    # Explicit None check so any provided callable is applied.
    if data_filter is not None:
        data = data_filter(data)
    data_long = pd.melt(data.reset_index(), id_vars=['record_id']).dropna()
    # Column names follow the '<base>_d<day>' pattern (assumed for every
    # matched column); make the day numeric so the x-axis orders
    # 1, 2, ..., 10 rather than lexically ('1', '10', '11', ..., '2').
    data_long = (data_long.assign(day=data_long.variable.str
                                  .split('_d')
                                  .apply(lambda parts: int(parts[-1])))
                 .drop('variable', axis=1)
                 .rename(columns={'value': label}))
    grid = sns.FacetGrid(data_long,
                         col='record_id', col_wrap=4, size=2.5)
    grid.map(plt.plot, "day", label, marker="o", ms=4)
    grid.fig.subplots_adjust(top=0.95)
    grid.fig.suptitle(title)
    return grid
plot_variable('wbc', 'WBC', 'WBC Count', subset=np.arange(1001, 1038))
<seaborn.axisgrid.FacetGrid at 0x12c11cc50>
plot_variable('map', 'map', 'Mean Arterial Pressure', subset=np.arange(1001, 1038))
<seaborn.axisgrid.FacetGrid at 0x12bb26fd0>
from scipy.constants import convert_temperature
plot_variable('temp', 'temperature (C)', 'Temperature', subset=np.arange(1001, 1038))
<seaborn.axisgrid.FacetGrid at 0x12d94b940>
pao2_cols = trial_data.columns[trial_data.columns.str.contains('pao2')]
fio2_cols = trial_data.columns[trial_data.columns.str.contains('pf_fio2')]
pao2_data = trial_data.loc[bbal_long.record_id.unique(), pao2_cols]
fio2_data = trial_data.loc[bbal_long.record_id.unique(), fio2_cols]
pao2_data_long = pd.melt(pao2_data.reset_index(), id_vars=['record_id']).dropna()
pao2_data_long = (pao2_data_long.assign(day=pao2_data_long.variable.str
.split('_d')
.apply(lambda x: x[-1]))
.drop('variable', axis=1)
.rename(columns={'value':'PaO2'}))
fio2_data_long = pd.melt(fio2_data.reset_index(), id_vars=['record_id']).dropna()
fio2_data_long = (fio2_data_long.assign(day=fio2_data_long.variable.str
.split('_d')
.apply(lambda x: x[-1]))
.drop('variable', axis=1)
.rename(columns={'value':'FiO2'}))
ratio_data = fio2_data_long.set_index(['record_id','day']).join(pao2_data_long.set_index(['record_id','day'])).dropna()
ratio_data['P/F ratio'] = 100*ratio_data.PaO2/ratio_data.FiO2
ratio_data = ratio_data.reset_index()
grid = sns.FacetGrid(ratio_data,
col='record_id', col_wrap=4, size=2.5)
grid.map(plt.plot, "day", "P/F ratio", marker="o", ms=4)
grid.fig.subplots_adjust(top=0.95)
grid.fig.suptitle('P/F ratio')
<matplotlib.text.Text at 0x12bf74e80>