%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)
Import data from Excel, using RunID
as an index.
flu = pd.read_excel("Data/Flu.xlsx", "Flu", index_col='RunID', na_values=['V04.82','v03.81'])
diagnoses = pd.read_excel("Data/Flu.xlsx", "Diagnoses")
complications = pd.read_excel("Data/Flu.xlsx", "Complications")
organisms = pd.read_excel("Data/Flu.xlsx", "Organisms")
runs_year = pd.read_excel("Data/Flu.xlsx", "Runs by Year")
# runs_sex = pd.read_excel("Data/Flu.xlsx", "Runs by Year and Sex")
# runs_race = pd.read_excel("Data/Flu.xlsx", "Runs by Year and Race")
# Average age of patients on ECMO
year_days = pd.read_csv("Data/year_days.csv", index_col='YearEcls')
support = pd.read_excel("Data/Flu.xlsx", "Pre-ECLS Support")
survived_year = pd.read_csv("Data/survived.csv")
flu.columns
Index(['PatientID', 'RunNo', 'AgeDays', 'HoursECMO', 'SupportType', 'PrimaryDx', 'Mode', 'Discontinuation', 'DischargedAlive', 'DischargeLocation', 'YearECLS', 'VentType', 'Rate', 'FiO2', 'PIP', 'PEEP', 'MAP', 'HandBagging', 'pH', 'PCO2', 'PO2', 'HCO3', 'SaO2', 'Venttype24', 'Rate24', 'Fio224', 'PIP24', 'PEEP24', 'MAP24', 'Handbagging24', 'SBP', 'DBP', 'MapHemo', 'SVO2', 'PCWP', 'SPAP', 'DPAP', 'MPAP', 'CI', 'Race', 'Sex', 'AdmitToTimeOnHours', 'TimeOffToExtubationDateHours', 'TimeOffToDeathDateHours', 'TimeOffToDCDateHours', 'ExtubationToDCDateHours', 'ExtubationToDeathDateHours'], dtype='object')
(year_days/365).plot(legend=False, grid=False)
plt.xlabel('Year'); plt.ylabel('Age (years)')
plt.xlim(1992, 2013);
Number of patients inf flu dataset
flu.PatientID.unique().size
1712
Convert year to integer value
flu['year'] = flu.YearECLS.astype(int)
Constrain data to study years
flu = flu[(flu.YearECLS>=1992) & (flu.YearECLS<=2012)]
flu.PatientID.unique().size
922
runs_year = runs_year[(runs_year.YearEcls>=1992) & (runs_year.YearEcls<=2012)]
Create hours on ventilator variable
flu['HoursVent'] = flu.HoursECMO + flu.TimeOffToExtubationDateHours.fillna(0)
Divide into mechanical, hemorragic, renal, cardiovascular, pulmonary, neural (ignore others).
complications.head()
RunID ComplicationCode \ 0 69CC56CD-208B-4C05-8E3F-002A6541F768 111 1 69CC56CD-208B-4C05-8E3F-002A6541F768 312 2 69CC56CD-208B-4C05-8E3F-002A6541F768 401 3 69CC56CD-208B-4C05-8E3F-002A6541F768 502 4 69CC56CD-208B-4C05-8E3F-002A6541F768 514 Description 0 Mechanical: Clots: oxygenator 1 Neurologic: Seizures: EEG determined 2 Renal: Creatinine 1.5 - 3.0 3 Cardiovascular: CPR required 4 Cardiovascular: Hypertension requiring vasodil...
complications['complication_type'] = complications.Description.apply(lambda x: x[:x.index(':')])
complications.complication_type.value_counts()
Cardiovascular 1471 Renal 1368 Mechanical 1142 Hemorrhagic 922 Metabolic 612 Infectious 478 Pulmonary 399 Neurologic 275 Limb 28 dtype: int64
Convert age to years
flu['AgeYears'] = flu.AgeDays/365.
Age distribution of flu data.
(flu.drop_duplicates('PatientID').AgeYears).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x110e7b978>
Create age classes:
flu['age_class'] = 'pediatric'
flu.age_class[flu.AgeDays<29] = 'neonatal'
flu.age_class[flu.AgeYears>17] = 'adult'
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app /usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance()
Here is the proportion in each class:
flu.age_class.value_counts(True)
adult 0.570222 pediatric 0.403379 neonatal 0.026399 dtype: float64
Distribution by race and sex:
pd.crosstab(flu.Race, flu.Sex).sort_index(by='F', ascending=False)
Sex F M Race W 271 315 A 47 68 B 43 41 H 38 49 O 25 32
Proportion with flu as PrimaryDx
(ICD9 codes 487 to (but not including) 489) from diagnosis dataset:
primary_dx = diagnoses.ICD9Code[diagnoses.PrimaryDiagnosis==1]
primary_flu = primary_dx.apply(lambda x: x.startswith('487') or x.startswith('488'))
Proportion and number with flu as primary diagnosis
primary_flu.mean()
0.41896551724137931
primary_flu.sum()
729
The same calculation using the flu
dataset:
flu.PrimaryDx = flu.PrimaryDx.astype(float)
((flu.PrimaryDx >= 487) & (flu.PrimaryDx < 489)).mean()
0.43505807814149949
((flu.PrimaryDx >= 487) & (flu.PrimaryDx < 489)).sum()
412
Here is a plot of the frequencies of secondary diagnoses (frequencies >5):
secondary_diag_count = diagnoses.ICD9Code[diagnoses.PrimaryDiagnosis==0].value_counts()
secondary_diag_count[secondary_diag_count>5].plot(kind='bar', figsize=(16,6), grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1106b5be0>
518.83 chronic respiratory failure
491* obstructive chronic bronchitis
493 asthma
pulmonary_codes = '518.83', '491', '493'
diagnoses.ICD9Code.isin(pulmonary_codes).mean()
0.0037817396002160996
746* congenital heart disease
428* congestive heart failure
427.5 cardiac arrest
cardiac_codes = '746', '428', '427.5'
diagnoses.ICD9Code.isin(cardiac_codes).mean()
0.014451647757968665
Primary
279.3 Unspecified Immunity Deficiency
279.2 Combined Immunity Deficiency
229.12 Wiskott Aldrich Syndrome
279.11 DiGeorge’s Syndrome
288.0 Neutropenia
Secondary
042 HIV
249* Diabetes Mellitus (DMII)
250* Diabetes Mellitus (DM1)
Transplant Organ
996.8* Complications of transplanted organ
V42 Organ replacement/HSCT
Malignancy
200.22 Burkitt’s tumor
201.5* Hodgkin’s disease
208* Leukemia
202.8 lymphoma
204.00 acute lymphoid lymphoma
205.0 myeloid leukemia
155* neoplasm of liver
158* neoplasm of retroperitoneum
163.9 neoplasm of pleura
170.6 malignant neoplasm of pelvic bones and coccyx
171.4 neoplasm of connective and other soft tissue of thorax
189* neoplasm of kidney, ureter etc.
191.9 malignant neoplasm of brain
194* malignant neoplasm of glands
195* malignant neoplasm of different body parts
immuno_codes = ('279.3', '279.2', '229.12', '279.11', '288.0', '042', '249', '250', '996.8', 'V42', '200.22',
'201.5', '208', '202.8', '204.00', '205.0', '155', '158', '163.9', '170.6', '171.4', '189', '191.9',
'194', '195')
diagnoses.ICD9Code.isin(immuno_codes).mean()
0.0068881685575364667
s_aureus_codes = ["038.11","038.12","O41.11","041.12","482.41","482.42","V02.53","V02.54","V12.0"]
diagnoses.ICD9Code.isin(s_aureus_codes).sum()
72
s_aureus_diagnoses = diagnoses[diagnoses.ICD9Code.isin(s_aureus_codes)]
Import organism lookup table
organism_type = pd.read_csv("Data/organisms.csv")
Merge organism information in a single table
organisms = organisms.merge(organism_type)
organisms.head()
RunID OrganismNo OrganismName \ 0 69CC56CD-208B-4C05-8E3F-002A6541F768 63 Influenza A 1 0E553625-2C5D-45D6-A147-008034605059 63 Influenza A 2 018ED307-A20E-4E20-A06A-00E2CBA5E64F 63 Influenza A 3 DB54A4A4-89DF-4177-885B-00F522D76259 63 Influenza A 4 D03B37F7-1394-4EC7-ADF1-012C767BE64D 63 Influenza A CultureSite CultureTimeIsApproximate OrganismTiming Type 0 Respiratory tract 0 Pre-ECLS viral 1 Unknown 1 Pre-ECLS viral 2 Respiratory tract 1 Pre-ECLS viral 3 Respiratory tract 1 Pre-ECLS viral 4 Respiratory tract 0 On-ECLS viral
Number of unique runs in organisms table
organisms.RunID.unique().size
1468
Frequencies of organism counts per run
organisms.groupby('RunID').OrganismNo.count().value_counts()
1 729 2 505 3 102 4 65 5 23 6 14 7 12 9 6 8 6 10 5 19 1 dtype: int64
Count of organisms by run
counts = organisms.groupby('RunID').OrganismNo.count()
Frequencies of organism occurrence (for counts >10)
organism_counts = organisms.groupby('OrganismName').RunID.count()
organism_counts.sort(ascending=False)
organism_counts[organism_counts>10].plot(kind='bar', figsize=(18,4), grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x110b3f908>
Rate of organism occurence (do not sum to one due to multple coinfections):
ax1 = flu.groupby('year')['AgeYears'].median().plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
#ax1.set_xticks(range(1992, 2012))
#ax2 = ax1.twinx()
ax1.plot(year_days.values/10.)
plt.ylim(0,100)
(0, 100)
Distribution of hours on ECMO, by year
flu.groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(kind='bar', stacked=True, figsize=(10,6), grid=False)
plt.ylabel('count')
# flu[flu.Sex=='F'].groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(
# kind='bar', stacked=True, figsize=(10,6), grid=False)
# flu[flu.Sex=='M'].groupby(['year', 'age_class'])['PatientID'].count().unstack().plot(
# kind='bar', stacked=True, figsize=(10,6), grid=False)
<matplotlib.text.Text at 0x110ea6c18>
Same plot as above, except on log scale.
flu.groupby(['year', 'age_class'])['PatientID'].count().unstack().apply(np.log).plot(kind='bar', stacked=True,
figsize=(10,6), grid=False)
plt.ylabel('log(count)')
<matplotlib.text.Text at 0x111a036a0>
ECMO incidence, by year and sex:
flu.groupby(['year','Sex']).RunNo.count()
year Sex 1992 F 1 1993 F 1 1994 F 1 1995 F 1 1996 F 1 M 1 1997 F 2 M 1 1998 F 6 M 5 1999 F 2 M 7 2000 F 8 M 7 2001 F 5 M 3 2002 F 2 M 5 2003 F 10 M 15 2004 F 3 M 3 2005 F 13 M 6 2006 F 11 M 6 2007 F 14 M 11 2008 F 15 M 16 2009 F 169 M 184 2010 F 50 M 70 2011 F 84 M 122 2012 F 33 M 46 Name: RunNo, dtype: int64
from scipy.stats import beta
from scipy.stats import norm
def binomial_hpdr(x, pct=0.95, a=1, b=1, n_pbins=1e3, roundto=2):
"""
Function computes the posterior mode along with the upper and lower bounds of the
**Highest Posterior Density Region**.
Parameters
----------
x: data (boolean)
pct: the size of the confidence interval (between 0 and 1)
a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
n_pbins: the number of bins to segment the p_range into (Default=1e3)
Returns
-------
A tuple that contains the mode as well as the lower and upper bounds of the interval
(mode, lower, upper)
"""
n, N = sum(x), len(x)
# fixed random variable object for posterior Beta distribution
rv = beta(n+a, N-n+b)
# determine the mode and standard deviation of the posterior
stdev = rv.stats('v')**0.5
mode = (n+a-1.)/(N+a+b-2.)
# compute the number of sigma that corresponds to this confidence
# this is used to set the rough range of possible success probabilities
n_sigma = np.ceil(norm.ppf( (1+pct)/2. ))+1
# set the min and max values for success probability
max_p = mode + n_sigma * stdev
if max_p > 1:
max_p = 1.
min_p = mode - n_sigma * stdev
if min_p > 1:
min_p = 1.
# make the range of success probabilities
p_range = np.linspace(min_p, max_p, n_pbins+1)
# construct the probability mass function over the given range
if mode > 0.5:
sf = rv.sf(p_range)
pmf = sf[:-1] - sf[1:]
else:
cdf = rv.cdf(p_range)
pmf = cdf[1:] - cdf[:-1]
# find the upper and lower bounds of the interval
sorted_idxs = np.argsort( pmf )[::-1]
cumsum = np.cumsum( np.sort(pmf)[::-1] )
j = np.argmin( np.abs(cumsum - pct) )
upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
lower = p_range[ (sorted_idxs[:j+1]).min() ]
return [np.round(v, roundto) for v in (mode, lower, upper)]
Code for calculating lognormal posterior intervals
from scipy.stats import invgamma
from numpy.random import normal
def lognorm_interval(y, pct=0.95, draws=10000, roundto=2, logm=True):
x = np.log(y + 0.001)
# Calculate sufficiecnt statistics
mu_hat = x.mean()
s2_hat = x.var()
nu = len(x) - 1
beta = nu * s2_hat / 2.
s2_sim = invgamma.rvs(nu/2., scale=beta, size=draws)
mu_sim = normal(mu_hat, np.sqrt(s2_sim)/2.)
m = mu_sim + 0.5*s2_sim
if not logm:
m = np.exp(m)
m.sort()
c = int(draws*(1-pct)/2.)
return([np.round(v, roundto) for v in (m[int(draws/2)], m[c], m[-c])])
Merge flu dataset with organism dataset for summarization by infection type
flu_organism= flu.merge(organisms, left_index=True, right_on='RunID', how='left')
flu_organism.groupby('PatientID')['OrganismName'].count().hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x1106ae748>
flu_organism.head()
PatientID RunNo AgeDays HoursECMO \ 41 89C16594-7B2B-42A4-81FF-002B2E92CA75 1 8992 192 496 F54AD8CA-5FEF-4724-A89D-0061A3C51519 1 79 96 497 F54AD8CA-5FEF-4724-A89D-0061A3C51519 1 79 96 128 78545F85-C3BC-47EF-967A-00D941B25CF8 1 3062 102 1212 78545F85-C3BC-47EF-967A-00D941B25CF8 1 3062 102 SupportType PrimaryDx Mode Discontinuation DischargedAlive \ 41 1 488.10 VV 1 1 496 2 746.11 VA 1 0 497 2 746.11 VA 1 0 128 2 422.90 VA 1 1 1212 2 422.90 VA 1 1 DischargeLocation YearECLS VentType Rate FiO2 PIP PEEP MAP \ 41 1 2009 2 5 100 66 NaN 26 496 NaN 2000 2 9 88 44 NaN NaN 497 NaN 2000 2 9 88 44 NaN NaN 128 3 1997 2 20 70 22 2 NaN 1212 3 1997 2 20 70 22 2 NaN HandBagging pH PCO2 ... CI Race Sex AdmitToTimeOnHours \ 41 0 7.39 32.0 ... NaN B F 39 496 0 7.42 35.0 ... NaN W F 132 497 0 7.42 35.0 ... NaN W F 132 128 0 7.39 31.6 ... NaN W F 30 1212 0 7.39 31.6 ... NaN W F 30 TimeOffToExtubationDateHours TimeOffToDeathDateHours \ 41 NaN NaN 496 NaN 296 497 NaN 296 128 132 NaN 1212 132 NaN TimeOffToDCDateHours ExtubationToDCDateHours \ 41 609 NaN 496 NaN NaN 497 NaN NaN 128 204 72 1212 204 72 ExtubationToDeathDateHours year HoursVent AgeYears age_class \ 41 NaN 2009 192 24.635616 adult 496 NaN 2000 96 0.216438 pediatric 497 NaN 2000 96 0.216438 pediatric 128 NaN 1997 234 8.389041 pediatric 1212 NaN 1997 234 8.389041 pediatric RunID OrganismNo OrganismName \ 41 4BA2348E-18D8-44AE-8E7A-0EC2748E0AFE 63 Influenza A 496 4247D183-3A46-4945-ADE6-81FE57E97688 63 Influenza A 497 4247D183-3A46-4945-ADE6-81FE57E97688 63 Influenza A 128 70DC2DC7-B983-40B0-81B4-2998E80F7163 63 Influenza A 1212 70DC2DC7-B983-40B0-81B4-2998E80F7163 19 Gram negative, other CultureSite CultureTimeIsApproximate OrganismTiming Type 41 Unknown 1 Pre-ECLS viral 496 Unknown 1 Pre-ECLS viral 497 Unknown 1 On-ECLS viral 128 Unknown 1 Pre-ECLS viral 1212 Unknown 1 On-ECLS bacterial [5 rows x 58 columns]
Number with flu by organism name.
has_flu_organism = flu_organism[flu_organism.OrganismName.str.startswith('Influenza')==True].drop_duplicates('PatientID')
len(has_flu_organism)
533
flu_organism['has_flu'] = (flu_organism.OrganismName.str.startswith('Influenza')==True).replace(
{True: 'ECMO Flu', False: 'ECMO No Flu'})
year_days.columns = ['All Runs']
axes = flu_organism.groupby(['YearECLS', 'has_flu'])['AgeYears'].mean().unstack().plot()
(year_days/365).plot(ax=axes, legend=False, grid=False)
plt.xlabel('Year'); plt.ylabel('Age (years)')
plt.xlim(1992, 2013);
axes = flu_organism.groupby(['YearECLS', 'has_flu'])['PatientID'].count().unstack().plot()
plt.xlabel('Year');
plt.xlim(1996, 2013);
Merge with diagnoses dataset for ICD9 lookup.
flu_ICD9 = flu.merge(diagnoses, left_index=True, right_on='RunID', how='left')
Number with flu by ICD9
has_flu_ICD9 = flu_ICD9[flu_ICD9.ICD9Code.apply(
lambda x: x.startswith('487') or x.startswith('488'))].drop_duplicates('PatientID')
len(has_flu_ICD9)
760
This is the union of the patients with flu via ICD9 and those with flu via organism. I think this is what we need.
len(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID)))
922
Extract runs from patients with Flu by either ICD9 or organism
flu_organism_runs = flu_organism[flu_organism.PatientID.isin(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID))))]
flu_ICD9_runs = flu_ICD9[flu_ICD9.PatientID.isin(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID))))]
This is a smaller number than is in the Flu
table.
len(flu)
947
Total number of runs for people with influenza on ECMO
flu_runs_organisms = organisms.RunID[organisms.OrganismName.str.startswith('Influenza')==True]
flu_runs_organisms.unique().shape
(1087,)
flu_runs_ICD9 = diagnoses.RunID[diagnoses.ICD9Code.apply(
lambda x: x.startswith('487') or x.startswith('488'))]
flu_runs_ICD9.unique().shape
(1401,)
# This is the union of the RunIDs from both sets
len(set(flu_runs_organisms).union(set(flu_runs_ICD9)))
1747
Proportion of runs with influenza
# From organism list
float(len(flu_runs_organisms.unique())) / runs_year.CNT.sum()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-78-a9a9f73ac3dd> in <module>() 1 # From organism list ----> 2 float(len(flu_runs_organisms.unique())) / runs_year.CNT.sum() /usr/local/lib/python3.4/site-packages/pandas/core/generic.py in __getattr__(self, name) 2081 return self[name] 2082 raise AttributeError("'%s' object has no attribute '%s'" % -> 2083 (type(self).__name__, name)) 2084 2085 def __setattr__(self, name, value): AttributeError: 'DataFrame' object has no attribute 'CNT'
# From ICD9 list
float(len(flu_runs_ICD9.unique())) / runs_year.CNT.sum()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-79-daffd8b25576> in <module>() 1 # From ICD9 list ----> 2 float(len(flu_runs_ICD9.unique())) / runs_year.CNT.sum() /usr/local/lib/python3.4/site-packages/pandas/core/generic.py in __getattr__(self, name) 2081 return self[name] 2082 raise AttributeError("'%s' object has no attribute '%s'" % -> 2083 (type(self).__name__, name)) 2084 2085 def __setattr__(self, name, value): AttributeError: 'DataFrame' object has no attribute 'CNT'
Proportion female before and during 2009
flu['female'] = flu.Sex=='F'
x_2009 = flu.drop_duplicates('PatientID')[flu.YearECLS==2009]['female'].sum()
n_2009 = flu.drop_duplicates('PatientID')[flu.YearECLS==2009]['female'].count()
print('Proportion female in 2009: {0:.3f}'.format(float(x_2009)/n_2009))
Proportion female in 2009: 0.481
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py:1819: UserWarning: Boolean Series key will be reindexed to match DataFrame index. "DataFrame index.", UserWarning)
flu['pre_2009'] = flu['YearECLS'] < 2009
x_pre_2009 = flu.drop_duplicates('PatientID')[flu.pre_2009]['female'].sum()
n_pre_2009 = flu.drop_duplicates('PatientID')[flu.pre_2009]['female'].count()
print('Proportion female before 2009: {0:.3f}'.format(float(x_pre_2009)/n_pre_2009))
Proportion female before 2009: 0.522
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py:1819: UserWarning: Boolean Series key will be reindexed to match DataFrame index. "DataFrame index.", UserWarning)
from scipy import stats
def ztest(x, n):
n = np.array(n, float)
pbar = sum(x)/sum(n)
z = np.diff(x/n) / np.sqrt(pbar * (1. - pbar) * sum(1./n))
return {'z':z, 'p':1.- stats.norm.cdf(np.abs(z))}
ztest([x_2009, x_pre_2009], [n_2009, n_pre_2009])
{'p': array([ 0.18546614]), 'z': array([ 0.89472842])}
Proportion of females with flu.
with_flu = flu[flu.PatientID.isin(pd.Series(list(set(has_flu_ICD9.PatientID).union(set(has_flu_organism.PatientID)))))]
(with_flu.Sex=='F').mean()
0.45617740232312565
binomial_hpdr((with_flu.Sex=='F'))
[0.46000000000000002, 0.41999999999999998, 0.48999999999999999]
Age statistics
with_flu.AgeYears.describe()
count 947.000000 mean 24.904292 std 19.629840 min 0.000000 25% 6.189041 50% 23.090411 75% 40.226027 max 81.312329 Name: AgeYears, dtype: float64
Mortality of patients with the diagnosis of influenza
with_flu_survivors = flu.drop_duplicates('PatientID').DischargedAlive==True
with_flu_survivors.mean()
0.60086767895878523
binomial_hpdr(with_flu_survivors)
[0.59999999999999998, 0.56999999999999995, 0.63]
Median hours on ECMO for influenza
# Estimate of interval on log scale, due to skew in distribution
lognorm_interval(with_flu.HoursECMO/24., draws=100000)
[2.7000000000000002, 1.6699999999999999, 3.7200000000000002]
# Age distribution
flu.drop_duplicates('PatientID')['AgeYears'].describe()
count 922.000000 mean 24.793166 std 19.640615 min 0.000000 25% 6.058219 50% 22.954795 75% 40.239041 max 81.312329 Name: AgeYears, dtype: float64
# Race distribution
race_counts = flu.drop_duplicates('PatientID')['Race'].value_counts()
race_counts/float(race_counts.sum())
W 0.632044 A 0.121547 H 0.091713 B 0.091713 O 0.062983 dtype: float64
binomial_hpdr(flu.drop_duplicates('PatientID')['Race']=='W')
[0.62, 0.58999999999999997, 0.65000000000000002]
# Survival
flu.drop_duplicates('PatientID')['DischargedAlive'].mean()
0.60086767895878523
# Sex
sex_counts = flu.drop_duplicates('PatientID')['Sex'].value_counts()
sex_counts/float(sex_counts.sum())
M 0.537705 F 0.462295 dtype: float64
Complications
complications_with_flu = with_flu.merge(complications, left_index=True, right_on='RunID', how='left')
complications_with_flu.complication_type.value_counts()
Cardiovascular 933 Renal 783 Mechanical 741 Hemorrhagic 580 Metabolic 344 Infectious 269 Pulmonary 261 Neurologic 168 dtype: int64
def complication_intervals(dataset):
complications_merged = dataset.merge(complications, left_index=True, right_on='RunID', how='left')
for kind in ('Neurologic', 'Renal', 'Hemorrhagic', 'Cardiovascular', 'Pulmonary'):
x = complications_merged[complications_merged.complication_type==kind].PatientID.drop_duplicates().count()
n = dataset.PatientID.drop_duplicates().count()
m, lo, up = binomial_hpdr([1]*x + [0]*(n-x))
print('{0}:\n\t {1} ({2}, {3})\n'.format(kind, m, lo, up))
complication_intervals(with_flu)
Neurologic: 0.14 (0.12, 0.16) Renal: 0.55 (0.52, 0.58) Hemorrhagic: 0.45 (0.42, 0.48) Cardiovascular: 0.66 (0.63, 0.69) Pulmonary: 0.25 (0.22, 0.28)
Days on ventilator
(with_flu.HoursVent.dropna()/24).hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x1141a9828>
# Interval on log scale
lognorm_interval(with_flu.HoursVent.dropna()/24)
[3.1200000000000001, 2.0, 4.21]
Flu only
Patients with influenza ONLY on ECMO= patients btwn 1992-2012 with EITHER an ICD9 code of flu AND/OR an organism of flu but NO additional organism codes for other bacteria or viruses.
We have established above that all the entries in the Flu
table have influenza. We only have to filter out co-infection.
Number of infections by PatientID
infection_count = flu_organism.groupby('PatientID')['PatientID'].count()
infection_count.count()
922
infection_count.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x114ad0550>
organisms[organisms.OrganismName=='Staphylococcus aureus, meth resist'].drop_duplicates('RunID').shape
(121, 7)
single_infection = flu_organism.merge(pd.DataFrame(infection_count[infection_count==1]), left_on='PatientID', right_index=True)
single_infection_ICD9 = flu_ICD9.merge(pd.DataFrame(infection_count[infection_count==1]), left_on='PatientID', right_index=True)
set(single_infection.PatientID.drop_duplicates()).difference(set(single_infection_ICD9.PatientID.drop_duplicates()))
set()
len(set(single_infection.PatientID.drop_duplicates()))
550
len(set(single_infection_ICD9.PatientID.drop_duplicates()))
550
single_infection.drop_duplicates('PatientID').PatientID.count()
550
single_infection_ICD9.drop_duplicates('PatientID').PatientID.count()
550
influenza_only = single_infection[single_infection.OrganismName.str.startswith('Influenza')==True]
influenza_only['AgeYears'].describe()
count 301.000000 mean 22.404952 std 18.951795 min 0.005479 25% 4.682192 50% 18.561644 75% 35.421918 max 68.550685 Name: AgeYears, dtype: float64
race_counts_flu = influenza_only['Race'].value_counts()
race_counts_flu/float(race_counts_flu.sum())
W 0.626712 B 0.116438 H 0.109589 A 0.082192 O 0.065068 dtype: float64
binomial_hpdr(influenza_only.Race.dropna()=='W')
[0.63, 0.56999999999999995, 0.68000000000000005]
Posterior interval for days on ECMO
lognorm_interval(influenza_only.HoursECMO/24., draws=100000)
[2.3999999999999999, 1.4099999999999999, 3.4100000000000001]
flu_only_survivors = influenza_only['DischargedAlive']
flu_only_survivors.mean()
0.59800664451827246
binomial_hpdr(flu_only_survivors)
[0.59999999999999998, 0.54000000000000004, 0.65000000000000002]
sex_counts_flu = influenza_only['Sex'].value_counts()
sex_counts_flu/float(sex_counts_flu.sum())
M 0.531773 F 0.468227 dtype: float64
binomial_hpdr((influenza_only.Sex=='F'))
[0.46999999999999997, 0.40999999999999998, 0.52000000000000002]
# Index out patients with flu only
complication_intervals(with_flu[with_flu.PatientID.isin(influenza_only.PatientID)])
Neurologic: 0.15 (0.11, 0.19) Renal: 0.5 (0.44, 0.55) Hemorrhagic: 0.43 (0.37, 0.48) Cardiovascular: 0.58 (0.53, 0.64) Pulmonary: 0.21 (0.17, 0.26)
Days on ventilator
# Interval on log scale
lognorm_interval(influenza_only.HoursVent.dropna()/24)
[2.8100000000000001, 1.73, 3.9199999999999999]
Subset with co-infection
Patients with influenza and co-infection on ECMO=patients btwn 1992-2012 with EITHER an ICD9 code of flu AND/OR an organism of flu AND an organism code of something else.
First, identify individuals with multiple infections based on organism code.
multiple_infection = flu_organism.merge(pd.DataFrame(infection_count[infection_count>1]),
left_on='PatientID', right_index=True)
Merge with the subset that has flu by ICD9
multiple_infection_ICD9 = flu_ICD9.merge(pd.DataFrame(infection_count[infection_count>1]),
left_on='PatientID', right_index=True)
No patients in one set that arent in the other
set(multiple_infection.PatientID.drop_duplicates()).difference(set(multiple_infection_ICD9.PatientID.drop_duplicates()))
set()
flu_coinfection = multiple_infection.drop_duplicates('PatientID')
Organism numbers for bacterial infections
bacterial_numbers = [1,2,9,11,12,13,14,15,16,19,30,31,32,35,36,37,38,
39,40,48,52,54,55,58,59,60,61,67,68,69,71,77,80,84,85,86,91,95,104]
bacterial_coinf = flu_organism[flu_organism.OrganismNo.isin(bacterial_numbers)].merge(
pd.DataFrame(infection_count[infection_count>1]),
left_on='PatientID', right_index=True)
Number with bacterial coinfection via organism code
with_bacterial_coinf = bacterial_coinf.drop_duplicates(cols=['PatientID'])
with_bacterial_coinf.shape
/usr/local/lib/python3.4/site-packages/pandas/util/decorators.py:81: FutureWarning: the 'cols' keyword is deprecated, use 'subset' instead warnings.warn(msg, FutureWarning)
(222, 61)
bacterial_ICD9 = ['033.9 ','033 ','036.2 ','038.0 ','038.1','038.11','038.12','038.19',
'038.2','038.3','038.4','038.41','038.42','03.43','038.44','038.49',
'040.82','040.89','081','100.9','481']
with_bacterial_ICD9 = flu_ICD9[flu_ICD9.ICD9Code.apply(
lambda x: x.startswith('041') or x.startswith('320') or x.startswith('482') or
bool(bacterial_ICD9.count(x)))]
Number with bacterial coinfection via ICD9
with_bacterial_ICD9.PatientID.drop_duplicates().shape
(180,)
Comparison with flu-only:
influenza_only['AgeYears'].hist(alpha=0.3, grid=False)
with_bacterial_coinf['AgeYears'].hist(alpha=0.3, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x114efce10>
print('u={0:.0f}, p={1:.3f}'.format(*stats.mannwhitneyu(influenza_only['AgeYears'].dropna().values,
with_bacterial_coinf['AgeYears'].dropna().values)))
u=29942, p=0.021
with_bacterial_coinf['AgeYears'].describe()
count 222.000000 mean 25.738517 std 19.239357 min 0.000000 25% 8.606164 50% 24.983562 75% 40.582877 max 76.326027 Name: AgeYears, dtype: float64
Survival
flu_bacterial_survivors = with_bacterial_coinf['DischargedAlive']
flu_bacterial_survivors.mean()
0.62612612612612617
binomial_hpdr(flu_bacterial_survivors)
[0.63, 0.56000000000000005, 0.68999999999999995]
Days on ECMO
lognorm_interval(with_bacterial_coinf.HoursECMO/24., draws=100000)
[2.96, 2.0600000000000001, 3.8700000000000001]
race_counts_bacterial = with_bacterial_coinf['Race'].value_counts()
race_counts_bacterial/float(race_counts_bacterial.sum())
W 0.615385 A 0.135747 H 0.108597 B 0.072398 O 0.067873 dtype: float64
Posterior interval for proportion white
binomial_hpdr(with_bacterial_coinf.Race=='W')
[0.60999999999999999, 0.55000000000000004, 0.67000000000000004]
Proportion female
binomial_hpdr((with_bacterial_coinf.Sex=='F'))
[0.42999999999999999, 0.35999999999999999, 0.48999999999999999]
Complications
complication_intervals(with_flu[with_flu.PatientID.isin(with_bacterial_coinf.PatientID)])
Neurologic: 0.14 (0.1, 0.19) Renal: 0.63 (0.57, 0.69) Hemorrhagic: 0.55 (0.48, 0.61) Cardiovascular: 0.65 (0.59, 0.71) Pulmonary: 0.33 (0.27, 0.4)
# Interval on log scale
lognorm_interval(with_bacterial_coinf.HoursVent.dropna()/24)
[3.4100000000000001, 2.4500000000000002, 4.3899999999999997]
race, age, gender
pre-ECMO lab values (blood gas (pH, pCO2, HCO3, Oxygen Index) type of ECMO, ionotrope/pressor support, nitric, CVVH, oscillator type of co-infection
Race distribution
flu_organism['fate'] = flu_organism.DischargedAlive.replace({1: 'Survived', 0: 'Died'})
by_fate = flu_organism.drop_duplicates('PatientID').groupby("fate")
by_fate['Race'].apply(lambda x: (x.value_counts()/float(x.value_counts().sum())).round(3))
fate Died W 0.642 A 0.100 B 0.094 H 0.089 O 0.075 Survived W 0.626 A 0.136 H 0.094 B 0.090 O 0.055 dtype: float64
by_fate['Race'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x114fc8e48>
Age distribution
by_fate['AgeYears'].describe()
fate Died count 368.000000 mean 23.698295 std 20.250845 min 0.002740 25% 4.233562 50% 17.271233 75% 40.534247 max 76.454795 Survived count 554.000000 mean 25.520444 std 19.208551 min 0.000000 25% 6.821918 50% 26.312329 75% 39.902055 max 81.312329 dtype: float64
Age distribution is not normally distributed.
age_dist = by_fate['AgeYears']
age_dist.hist(alpha=0.3, grid=False)
fate Died Axes(0.125,0.125;0.775x0.775) Survived Axes(0.125,0.125;0.775x0.775) Name: AgeYears, dtype: object
_ = by_fate.boxplot(column='AgeYears')
/usr/local/lib/python3.4/site-packages/pandas/tools/plotting.py:2633: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. warnings.warn(msg, FutureWarning)
Mann-Whitney U-test (non-parametric)
age_data = list(age_dist)
print('u={0:.0f}, p={1:.3f}'.format(*stats.mannwhitneyu(age_data[0][1].dropna().values, age_data[1][1].dropna().values)))
u=95086, p=0.042
Sex proportions
by_fate['Sex'].apply(lambda x: x.value_counts()/float(x.value_counts().sum()))
fate Died M 0.552198 F 0.447802 Survived M 0.528131 F 0.471869 dtype: float64
sex_dist = by_fate['Sex'].apply(lambda x: x.value_counts())
ztest(sex_dist.unstack()['M'].values, sex_dist.unstack().sum(1))
{'p': array([ 0.23740493]), 'z': array([-0.71467505])}
Gases
gases = ['pH', 'PCO2', 'HCO3', 'PO2', 'SaO2']
by_fate.describe()[gases]
pH PCO2 HCO3 PO2 SaO2 fate Died count 328.000000 331.000000 299.000000 332.000000 304.000000 mean 7.217500 62.947432 24.470234 61.249699 76.943750 std 0.162997 26.688554 8.677740 50.421004 18.561107 min 6.560000 7.000000 2.400000 4.700000 9.000000 25% 7.127500 45.000000 18.950000 39.000000 70.000000 50% 7.225000 58.300000 23.200000 52.000000 82.000000 75% 7.332500 76.500000 29.000000 65.000000 90.000000 max 7.570000 190.500000 60.000000 512.000000 100.000000 Survived count 495.000000 502.000000 448.000000 505.000000 465.000000 mean 7.253152 60.037649 25.810491 68.051683 82.342796 std 0.162546 27.480175 8.036870 66.519264 15.281890 min 6.290000 6.000000 3.000000 4.000000 4.000000 25% 7.160000 42.000000 20.100000 44.200000 78.000000 50% 7.280000 55.200000 24.650000 56.000000 86.000000 75% 7.370000 72.000000 30.000000 69.700000 92.000000 max 7.670000 213.000000 60.000000 707.000000 100.000000
for g in gases:
by_fate.boxplot(column=g)
/usr/local/lib/python3.4/site-packages/pandas/tools/plotting.py:2633: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. warnings.warn(msg, FutureWarning)
for i in gases:
data = list(flu[i].groupby(flu['DischargedAlive']))
print('{0}: u={1:.0f}, p={2:.3f}'.format(i, *stats.mannwhitneyu(data[0][1].dropna().values, data[1][1].dropna().values)))
pH: u=74566, p=0.001 PCO2: u=81282, p=0.038 HCO3: u=62278, p=0.003 PO2: u=77355, p=0.001 SaO2: u=60212, p=0.000
Prevalence of each type of confection:
Bacterial
bacterial = flu_organism[flu_organism.Type=='bacterial'].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
bacterial
DischargedAlive 0 116 1 164 Name: PatientID, dtype: int64
bacterial_prop = bacterial/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
bacterial_prop
DischargedAlive 0 0.361371 1 0.286213 Name: PatientID, dtype: float64
ztest(bacterial, bacterial / bacterial_prop)
{'p': array([ 0.01005223]), 'z': array([-2.32439245])}
Fungal
fungal = flu_organism[flu_organism.Type=='fungal'].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
fungal
DischargedAlive 0 42 1 39 Name: PatientID, dtype: int64
fungal_prop = fungal/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
fungal_prop
DischargedAlive 0 0.130841 1 0.068063 Name: PatientID, dtype: float64
ztest(fungal, fungal / fungal_prop)
{'p': array([ 0.00085331]), 'z': array([-3.13704187])}
Viral
viral = flu_organism[(flu_organism.Type=='viral') & (flu_organism.OrganismName.str.startswith('Influenza')==False)].drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count()
viral
DischargedAlive 0 18 1 24 Name: PatientID, dtype: int64
viral_prop = viral/multiple_infection.groupby('DischargedAlive')['PatientID'].count().astype(float)
viral_prop
DischargedAlive 0 0.056075 1 0.041885 Name: PatientID, dtype: float64
ztest(viral, viral / viral_prop)
{'p': array([ 0.1680471]), 'z': array([-0.96191123])}
Type of ECMO
ecmo_type = (flu.Mode=='VA').replace({True:'VA', False:'VV'})
ecmo_type.value_counts() / float(ecmo_type.notnull().sum())
VV 0.725449 VA 0.274551 dtype: float64
flu['ecmo_type'] = (flu.Mode=='VA').replace({True:'VA', False:'VV'})
flu.ecmo_type.groupby(flu['DischargedAlive']).apply(lambda x: x.value_counts()/float(x.notnull().sum()))
DischargedAlive 0 VV 0.656992 VA 0.343008 1 VV 0.771127 VA 0.228873 dtype: float64
Merge Flu with Pre-ECLS Support
flu_support = flu.merge(support, left_index=True, right_on='RunID', how='left')
support.Description.value_counts()
Narcotics 924 Neuromuscular blockers 835 Vasopressor/inotropic drugs 791 Norepinephrine 471 Nitric oxide 437 High frequency ventilation/oscillation 333 Epinephrine 259 Bicarbonate 222 Dopamine 184 Vasodilator drugs 167 Steroids 158 Milrinone 73 Dobutamine 65 CVVH 61 Epoprostenol 34 THAM 34 Surfactant 26 Hyperventilation 17 Intra-aortic balloon 16 Nitroprusside 11 Cardiopulmonary bypass 11 Inhaled anesthetic 10 Abdominal compression 8 Sildenafil 6 AVCO2R 5 Cardiac pacemaker 4 LVAD 4 Hypothermia 3 Liquid ventilation 3 Plasmapheresis 3 Berlin Heart 1 Inamrinone 1 dtype: int64
n = float(flu_support.PatientID.drop_duplicates().count())
n
922.0
n_alive = flu_support.drop_duplicates('PatientID').groupby('DischargedAlive')['PatientID'].count().astype(float)
n_alive
DischargedAlive 0 368 1 554 Name: PatientID, dtype: float64
Nitric oxide
flu_support.PatientID[flu_support.Description=='Nitric oxide'].drop_duplicates().count()/n
0.31670281995661603
nitric_oxide = flu_support[flu_support.Description=='Nitric oxide'].drop_duplicates('PatientID').groupby(
'DischargedAlive')['PatientID'].count()
nitric_oxide/n_alive
DischargedAlive 0 0.366848 1 0.283394 Name: PatientID, dtype: float64
CVVH
flu_support.PatientID[flu_support.Description=='CVVH'].drop_duplicates().count()/n
0.032537960954446853
cvvh = flu_support[flu_support.Description=='CVVH'].drop_duplicates('PatientID').groupby(
'DischargedAlive')['PatientID'].count()
cvvh/n_alive
DischargedAlive 0 0.043478 1 0.025271 Name: PatientID, dtype: float64
High frequency ventilation/oscillation
flu_support.PatientID[flu_support.Description=='High frequency ventilation/oscillation'].drop_duplicates().count()/n
0.2646420824295011
hfvo = flu_support[flu_support.Description=='High frequency ventilation/oscillation'].drop_duplicates('PatientID').groupby(
'DischargedAlive')['PatientID'].count()
hfvo/n_alive
DischargedAlive 0 0.320652 1 0.227437 Name: PatientID, dtype: float64
Ionotrope/pressor support
pressor_set = ['Dobutamine', 'Vasopressor/inotropic drugs', 'Norepinephrine', 'Epinephrine', 'Dopamine', 'Milrinone']
ionotrope_pressor = flu_support[flu_support.Description.isin(pressor_set)].drop_duplicates('PatientID').groupby(
'DischargedAlive')['PatientID'].count()
ionotrope_pressor/n_alive
DischargedAlive 0 0.801630 1 0.743682 Name: PatientID, dtype: float64
All ECMO
# Age
fig, axes = plt.subplots(ncols=2, figsize=(14,4))
flu.AgeDays.hist(ax=axes[0])
axes[1].boxplot(flu.AgeDays.values);
flu.AgeDays.describe()
count 947.000000 mean 9090.066526 std 7164.891484 min 0.000000 25% 2259.000000 50% 8428.000000 75% 14682.500000 max 29679.000000 Name: AgeDays, dtype: float64
# Race
flu.Race.value_counts()
W 587 A 115 H 87 B 84 O 57 dtype: int64
flu.Race.value_counts()/flu.Race.value_counts().sum()
W 0.631183 A 0.123656 H 0.093548 B 0.090323 O 0.061290 dtype: float64
# Gender
flu.Sex.value_counts()
M 508 F 432 dtype: int64
flu.Sex.value_counts()/flu.Sex.value_counts().sum()
M 0.540426 F 0.459574 dtype: float64
# Death
flu.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
Survived 568 Died 379 dtype: int64
(flu.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
/flu.DischargedAlive.value_counts().sum())
Survived 0.599789 Died 0.400211 dtype: float64
TODO: Break down by Mode
# ECMO Type
flu.Mode.unique()
array(['VV', 'VA', 'VV-VA', 'Other', 'VVDL', 'VA-VV', 'VVDL+V', 'VA+V', 'VVA', nan], dtype=object)
# ECMO Type
flu['VA'] = flu.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu.VA[flu.Mode=='Other'] = np.nan
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is the proportion that is VA
flu.VA.mean()
0.3236870310825295
# Support type
flu.SupportType.value_counts()
1 817 2 97 3 33 dtype: int64
Proportion CPR
(flu.SupportType==3).mean()
0.034846884899683211
# Complications
ecmo_complications = flu_organism.merge(complications, on='RunID')
ecmo_complications.complication_type.value_counts()
Cardiovascular 1525 Renal 1315 Mechanical 1150 Hemorrhagic 1018 Metabolic 603 Infectious 538 Pulmonary 463 Neurologic 241 dtype: int64
Influenza only
# Age
fig, axes = plt.subplots(ncols=2, figsize=(14,4))
influenza_only.AgeDays.hist(ax=axes[0])
axes[1].boxplot(influenza_only.AgeDays.values);
influenza_only.AgeDays.describe()
count 301.000000 mean 8177.807309 std 6917.405295 min 2.000000 25% 1709.000000 50% 6775.000000 75% 12929.000000 max 25021.000000 Name: AgeDays, dtype: float64
# Race
influenza_only.Race.value_counts()
W 183 B 34 H 32 A 24 O 19 dtype: int64
influenza_only.Race.value_counts()/influenza_only.Race.value_counts().sum()
W 0.626712 B 0.116438 H 0.109589 A 0.082192 O 0.065068 dtype: float64
# Gender
influenza_only.Sex.value_counts()
M 159 F 140 dtype: int64
influenza_only.Sex.value_counts()/influenza_only.Sex.value_counts().sum()
M 0.531773 F 0.468227 dtype: float64
# Death
influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
Survived 180 Died 121 dtype: int64
(influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
/influenza_only.DischargedAlive.value_counts().sum())
Survived 0.598007 Died 0.401993 dtype: float64
# ECMO Type
influenza_only.Mode.unique()
array(['VV', 'VV-VA', 'VA', 'VVDL+V', 'VVDL', 'VA-VV', 'VA+V', 'Other'], dtype=object)
influenza_only['VA'] = influenza_only.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
influenza_only.VA[influenza_only.Mode=='Other'] = np.nan
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__': /usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /usr/local/lib/python3.4/site-packages/pandas/core/generic.py:3572: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self._update_inplace(new_data) /usr/local/lib/python3.4/site-packages/IPython/core/interactiveshell.py:3035: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy exec(code_obj, self.user_global_ns, self.user_ns)
This is the proportion that is VA
influenza_only.VA.mean()
0.32214765100671139
# Support type
influenza_only.SupportType.value_counts()
1 255 2 36 3 10 dtype: int64
(influenza_only.SupportType==3).mean()
0.033222591362126248
# Complications
influenza_complications = influenza_only.merge(complications, on='RunID')
influenza_complications.complication_type.value_counts()
Cardiovascular 255 Renal 212 Mechanical 184 Hemorrhagic 165 Metabolic 104 Pulmonary 68 Neurologic 55 Infectious 31 dtype: int64
Severity of illness in patients with influenza alone on ECMO versus patients with bacterial co-infection
def normal_post(y, pct=0.95, mu_0=0., var_0=1e6, draws=1000, roundto=2):
s2 = y.var()
ybar = y.mean()
n = len(y)
mu = (n*ybar/s2 + mu_0/var_0)/(n*1./s2 + 1./var_0)
sigma = np.sqrt(1./(n/s2 + 1./var_0))
y_post = np.sort(np.random.normal(mu, sigma, size=draws))
c = int(draws*(1-pct)/2.)
return([np.round(v, roundto) for v in (mu, y_post[c], y_post[-c])])
Days on ECMO
flu_hours_ecmo = (flu.drop_duplicates(cols=['PatientID']).HoursECMO/24.)
flu_hours_ecmo.describe()
/usr/local/lib/python3.4/site-packages/pandas/util/decorators.py:81: FutureWarning: the 'cols' keyword is deprecated, use 'subset' instead warnings.warn(msg, FutureWarning)
count 918.000000 mean 12.699165 std 11.819622 min 0.000000 25% 5.510417 50% 9.604167 75% 16.354167 max 125.750000 Name: HoursECMO, dtype: float64
np.log(flu_hours_ecmo+0.01).hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x11530e978>
flu_days_ECMO = with_flu.HoursECMO/24.
flu_days_ECMO.describe()
count 943.000000 mean 12.760251 std 11.806250 min 0.000000 25% 5.541667 50% 9.625000 75% 16.520833 max 125.750000 Name: HoursECMO, dtype: float64
flu_days_ECMO.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x11580ba20>
(influenza_only.HoursECMO/24.).describe()
count 301.000000 mean 9.511766 std 7.765752 min 0.041667 25% 4.583333 50% 7.375000 75% 12.666667 max 53.833333 Name: HoursECMO, dtype: float64
(influenza_only.HoursECMO/24.).hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x111b44860>
lognorm_interval(influenza_only.HoursECMO/24., draws=100000)
[2.3999999999999999, 1.3999999999999999, 3.4100000000000001]
multiple_infection['VA'] = multiple_infection.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
multiple_infection.VA[multiple_infection.Mode=='Other'] = np.nan
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance()
has_flu = multiple_infection[multiple_infection.OrganismName.str.startswith('Influenza')==True].drop_duplicates('PatientID')
has_mrsa = multiple_infection[multiple_infection.OrganismName=='Staphylococcus aureus, meth resist'].drop_duplicates('PatientID')
has_mssa = multiple_infection[multiple_infection.OrganismName=='Staphylococcus aureus'].drop_duplicates('PatientID')
flu_with_mrsa = has_flu[has_flu.PatientID.isin(has_mrsa.PatientID)]
(flu_with_mrsa.HoursECMO/24.).describe()
count 28.000000 mean 13.831845 std 9.215900 min 1.250000 25% 7.322917 50% 12.895833 75% 17.822917 max 34.666667 Name: HoursECMO, dtype: float64
flu_with_mssa = has_flu[has_flu.PatientID.isin(has_mssa.PatientID)]
(flu_with_mssa.HoursECMO/24.).describe()
count 33.000000 mean 15.878788 std 15.318045 min 0.333333 25% 6.500000 50% 12.166667 75% 20.583333 max 64.375000 Name: HoursECMO, dtype: float64
Proportion VA (vs. VV)
influenza_only.VA.mean()
0.32214765100671139
flu_coinfection['VA'] = flu_coinfection.Mode.isin(['VA', 'VV-VA'])
# Set "Other" type to NA (there are only a couple)
flu_coinfection.loc[flu_coinfection.Mode=='Other', 'VA'] = np.nan
/usr/local/lib/python3.4/site-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__': /usr/local/lib/python3.4/site-packages/pandas/core/indexing.py:407: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[item] = s
flu_coinfection.VA.mean()
0.25824175824175827
flu_with_mrsa.VA.mean()
0.38461538461538464
flu_with_mssa.VA.mean()
0.34375
Death
influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
Survived 180 Died 121 dtype: int64
(influenza_only.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
/influenza_only.DischargedAlive.value_counts().sum())
Survived 0.598007 Died 0.401993 dtype: float64
flu_coinfection.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
Survived 239 Died 133 dtype: int64
(flu_coinfection.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
/flu_coinfection.DischargedAlive.value_counts().sum())
Survived 0.642473 Died 0.357527 dtype: float64
flu_with_mrsa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
Died 16 Survived 12 dtype: int64
(flu_with_mrsa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
/flu_with_mrsa.DischargedAlive.value_counts().sum())
Died 0.571429 Survived 0.428571 dtype: float64
flu_with_mssa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
Survived 19 Died 14 dtype: int64
(flu_with_mssa.DischargedAlive.value_counts().rename({1: 'Survived', 0: 'Died'})
/flu_with_mssa.DischargedAlive.value_counts().sum())
Survived 0.575758 Died 0.424242 dtype: float64
complications.complication_type.value_counts()
Cardiovascular 1471 Renal 1368 Mechanical 1142 Hemorrhagic 922 Metabolic 612 Infectious 478 Pulmonary 399 Neurologic 275 Limb 28 dtype: int64
complication_list = ['Mechanical', 'Hemorrhagic', 'Renal', 'Cardiovascular', 'Pulmonary']
has_complication = pd.DataFrame(complications.groupby('RunID').apply(
lambda x: x['complication_type'].isin(complication_list).any()))
has_complication.columns = ['complication']
flu_with_comp = pd.merge(flu_organism, has_complication, left_on='RunID', right_index=True, how='left')
flu_with_comp.shape
(1444, 61)
flu_with_comp.PatientID.unique().shape
(922,)
flu_with_comp.complication.mean()
0.9724907063197026
flu_organism.RunID.isin(has_complication[has_complication.complication].index).mean()
0.90581717451523547
flu_organism['has_complication'] = flu_organism.RunID.isin(has_complication[has_complication.complication].index)
flu_organism = flu_organism.drop('has_complication', axis=1)
Merge with S. Aureus diagnoses from ICD9
flu_organism_icd9 = pd.merge(flu_organism, s_aureus_diagnoses, on='RunID', how='left')
flu_organism_icd9['s_aureus_icd9'] = flu_organism_icd9.ICD9Code.notnull()
flu_organism_icd9 = flu_organism_icd9.drop(['ICD9Code', 'PrimaryDiagnosis'], axis=1 )
flu_organism.PatientID.unique().shape
(922,)
flu_organism_icd9.head()
PatientID RunNo AgeDays HoursECMO \ 0 89C16594-7B2B-42A4-81FF-002B2E92CA75 1 8992 192 1 F54AD8CA-5FEF-4724-A89D-0061A3C51519 1 79 96 2 F54AD8CA-5FEF-4724-A89D-0061A3C51519 1 79 96 3 78545F85-C3BC-47EF-967A-00D941B25CF8 1 3062 102 4 78545F85-C3BC-47EF-967A-00D941B25CF8 1 3062 102 SupportType PrimaryDx Mode Discontinuation DischargedAlive \ 0 1 488.10 VV 1 1 1 2 746.11 VA 1 0 2 2 746.11 VA 1 0 3 2 422.90 VA 1 1 4 2 422.90 VA 1 1 DischargeLocation YearECLS VentType Rate FiO2 PIP PEEP MAP \ 0 1 2009 2 5 100 66 NaN 26 1 NaN 2000 2 9 88 44 NaN NaN 2 NaN 2000 2 9 88 44 NaN NaN 3 3 1997 2 20 70 22 2 NaN 4 3 1997 2 20 70 22 2 NaN HandBagging pH PCO2 ... AdmitToTimeOnHours \ 0 0 7.39 32.0 ... 39 1 0 7.42 35.0 ... 132 2 0 7.42 35.0 ... 132 3 0 7.39 31.6 ... 30 4 0 7.39 31.6 ... 30 TimeOffToExtubationDateHours TimeOffToDeathDateHours \ 0 NaN NaN 1 NaN 296 2 NaN 296 3 132 NaN 4 132 NaN TimeOffToDCDateHours ExtubationToDCDateHours ExtubationToDeathDateHours \ 0 609 NaN NaN 1 NaN NaN NaN 2 NaN NaN NaN 3 204 72 NaN 4 204 72 NaN year HoursVent AgeYears age_class \ 0 2009 192 24.635616 adult 1 2000 96 0.216438 pediatric 2 2000 96 0.216438 pediatric 3 1997 234 8.389041 pediatric 4 1997 234 8.389041 pediatric RunID OrganismNo OrganismName \ 0 4BA2348E-18D8-44AE-8E7A-0EC2748E0AFE 63 Influenza A 1 4247D183-3A46-4945-ADE6-81FE57E97688 63 Influenza A 2 4247D183-3A46-4945-ADE6-81FE57E97688 63 Influenza A 3 70DC2DC7-B983-40B0-81B4-2998E80F7163 63 Influenza A 4 70DC2DC7-B983-40B0-81B4-2998E80F7163 19 Gram negative, other CultureSite CultureTimeIsApproximate OrganismTiming Type \ 0 Unknown 1 Pre-ECLS viral 1 Unknown 1 Pre-ECLS viral 2 Unknown 1 On-ECLS viral 3 Unknown 1 Pre-ECLS viral 4 Unknown 1 On-ECLS bacterial has_flu fate s_aureus_icd9 0 ECMO Flu Survived False 1 ECMO Flu Died False 2 ECMO Flu Died False 3 ECMO Flu Survived False 4 ECMO No Flu Survived False [5 rows x 61 columns]
flu_organism_icd9.to_csv("data/flu_organism.csv")
flu_with_comp.to_csv("data/flu.csv")
flu_with_comp.head()
PatientID RunNo AgeDays HoursECMO \ 41 89C16594-7B2B-42A4-81FF-002B2E92CA75 1 8992 192 496 F54AD8CA-5FEF-4724-A89D-0061A3C51519 1 79 96 497 F54AD8CA-5FEF-4724-A89D-0061A3C51519 1 79 96 128 78545F85-C3BC-47EF-967A-00D941B25CF8 1 3062 102 1212 78545F85-C3BC-47EF-967A-00D941B25CF8 1 3062 102 SupportType PrimaryDx Mode Discontinuation DischargedAlive \ 41 1 488.10 VV 1 1 496 2 746.11 VA 1 0 497 2 746.11 VA 1 0 128 2 422.90 VA 1 1 1212 2 422.90 VA 1 1 DischargeLocation YearECLS VentType Rate FiO2 PIP PEEP MAP \ 41 1 2009 2 5 100 66 NaN 26 496 NaN 2000 2 9 88 44 NaN NaN 497 NaN 2000 2 9 88 44 NaN NaN 128 3 1997 2 20 70 22 2 NaN 1212 3 1997 2 20 70 22 2 NaN HandBagging pH PCO2 ... AdmitToTimeOnHours \ 41 0 7.39 32.0 ... 39 496 0 7.42 35.0 ... 132 497 0 7.42 35.0 ... 132 128 0 7.39 31.6 ... 30 1212 0 7.39 31.6 ... 30 TimeOffToExtubationDateHours TimeOffToDeathDateHours \ 41 NaN NaN 496 NaN 296 497 NaN 296 128 132 NaN 1212 132 NaN TimeOffToDCDateHours ExtubationToDCDateHours \ 41 609 NaN 496 NaN NaN 497 NaN NaN 128 204 72 1212 204 72 ExtubationToDeathDateHours year HoursVent AgeYears age_class \ 41 NaN 2009 192 24.635616 adult 496 NaN 2000 96 0.216438 pediatric 497 NaN 2000 96 0.216438 pediatric 128 NaN 1997 234 8.389041 pediatric 1212 NaN 1997 234 8.389041 pediatric RunID OrganismNo OrganismName \ 41 4BA2348E-18D8-44AE-8E7A-0EC2748E0AFE 63 Influenza A 496 4247D183-3A46-4945-ADE6-81FE57E97688 63 Influenza A 497 4247D183-3A46-4945-ADE6-81FE57E97688 63 Influenza A 128 70DC2DC7-B983-40B0-81B4-2998E80F7163 63 Influenza A 1212 70DC2DC7-B983-40B0-81B4-2998E80F7163 19 Gram negative, other CultureSite CultureTimeIsApproximate OrganismTiming Type \ 41 Unknown 1 Pre-ECLS viral 496 Unknown 1 Pre-ECLS viral 497 Unknown 1 On-ECLS viral 128 Unknown 1 Pre-ECLS viral 1212 Unknown 1 On-ECLS bacterial has_flu fate complication 41 ECMO Flu Survived True 496 ECMO Flu Died True 497 ECMO Flu Died True 128 ECMO Flu Survived True 1212 ECMO No Flu Survived True [5 rows x 61 columns]