%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
Correlate RSV severity with vitamin D levels (at hospital)
Covariates:
Set flag for extracting the MPV-positive subset
MPV_only = True
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()
/usr/local/lib/python2.7/site-packages/pandas/io/parsers.py:1130: DtypeWarning: Columns (140,142,144,146,148,181,206,207,212,213,261,280,281,282,296,297) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
greater_48hrs | fever_neutropenia | never_left | written_consent | child_name | mother_name | mother_birth_date | mother_record | mother_nationality | other_mother_nationality | ... | date | whole_blood_complete | age_months | length_of_stay | gest_age | death | hospitalized_vitamin_d | wheezing_ind | sex | z_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
case_id | |||||||||||||||||||||
A0001 | 0 | 0 | 0 | 1 | Remas Mahmoud Jbarah | Huda Katalo | 1976-01-21 | NaN | 3 | NaN | ... | NaN | 0 | 1 | 6 | 40 | False | 3 | 0 | F | 0.68 |
A0002 | 0 | 0 | 0 | 1 | Majed Abdel Kareem Majed | Noor SHa'aban Mahmood | 1989-09-09 | NaN | 3 | NaN | ... | NaN | 0 | 1 | 5 | 40 | False | 4 | 1 | M | -0.64 |
A0003 | 0 | 0 | 0 | 1 | Rayyan Jamal Muhyi Al.Deen | SAra Hussein Muhyi Al.Deen | 1965-01-01 | NaN | 1 | NaN | ... | NaN | 0 | 11 | 10 | 40 | False | 35 | 0 | F | -7.59 |
A0004 | 0 | 0 | 0 | 1 | Hanan Mohd Mustapha Abu Othman | Kawla Abu Shanab | 1983-10-31 | NaN | 1 | NaN | ... | NaN | 0 | 7 | 3 | 38 | False | 2 | 1 | F | -0.84 |
A0005 | 0 | 0 | 0 | 1 | Yara Mahmoud Azmi Ismael | Suha Abdel Aziz | 1986-02-28 | NaN | 1 | NaN | ... | NaN | 0 | 2 | 1 | 39 | False | 6 | 0 | F | -1.68 |
5 rows × 412 columns
hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date)
hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date)
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date)
hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1)
[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]
['diabetes_hx', 'heart_hx', 'kidney_hx', 'downs_hx', 'sickle_hx', 'cancer_hx', 'cf_hx', 'genetic_hx', 'cp_hx', 'seizure_hx', 'neuro_hx', 'mr_hx', 'asthma_hx', 'immuno_hx', 'liver_hx', 'gerd_hx', 'diarrhea_hx', 'other_hx', 'spec_hx']
Calculate age in months from birth and enrollment date:
from calendar import monthrange
from datetime import timedelta
def monthdelta(d1, d2):
delta = 0
while True:
mdays = monthrange(d1.year, d1.month)[1]
d1 += timedelta(days=mdays)
if d1 <= d2:
delta += 1
else:
break
return delta
if MPV_only:
hospitalized = hospitalized[hospitalized.pcr_result___2==1]
Either ICU or direct admission to ICU
hospitalized['icu_any'] = (hospitalized.icu + hospitalized.dir_icu).astype(bool)
Severity score
hospitalized.flaring.value_counts()
0 133 1 107 2 33 dtype: int64
hospitalized.cyanosis[hospitalized.cyanosis==4] = np.nan
hospitalized.cyanosis.value_counts()
0 217 1 50 2 4 3 1 dtype: int64
hospitalized.wheezing[hospitalized.wheezing==4] = np.nan
hospitalized.wheezing.value_counts()
1 157 0 82 2 33 dtype: int64
hospitalized.respiratory_rate = hospitalized.respiratory_rate.replace({'-': np.nan}).astype(float)
hospitalized['respiratory_class'] = 0
hospitalized.respiratory_class[(hospitalized.respiratory_rate>30) & (hospitalized.respiratory_rate<=45)] = 1
hospitalized.respiratory_class[(hospitalized.respiratory_rate>45) & (hospitalized.respiratory_rate<=60)] = 2
hospitalized.respiratory_class[hospitalized.respiratory_rate>60] = 3
hospitalized.respiratory_class.value_counts()
1 160 0 67 2 40 3 6 dtype: int64
hospitalized['sats_class'] = 0
hospitalized.sats_class[(hospitalized.sats_number>=90) & (hospitalized.sats_number<95)] = 1
hospitalized.sats_class[(hospitalized.sats_number>=85) & (hospitalized.sats_number<90)] = 2
hospitalized.sats_class[hospitalized.sats_number<85] = 3
hospitalized.sats_class.value_counts()
0 253 1 16 2 3 3 1 dtype: int64
hospitalized['severity_score'] = hospitalized[['flaring', 'cyanosis', 'wheezing', 'respiratory_class', 'sats_class']].sum(1)
hospitalized['admission_month'] = hospitalized.admission_date.apply(lambda x: x.month)
Distribution of smoking exposure
hospitalized.cigarette_smokers.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1116ab490>
Proportion and of children exposed to cigarette smoking
(hospitalized.cigarette_smokers > 0).mean()
0.74358974358974361
(hospitalized.cigarette_smokers > 0).sum()
203
Distribution of nargila exposure
hospitalized.nargila_smokers.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x11155bfd0>
Proportion and number of children exposed to nargila
(hospitalized.nargila_smokers > 0).mean()
0.17216117216117216
(hospitalized.nargila_smokers > 0).sum()
47
Proportion of children with mothers who smoked during pregnancy
hospitalized.cigarette_preg.mean()
0.062271062271062272
hospitalized.nargila_preg.mean()
0.029304029304029304
Days of symptoms before admission by gender
days_by_sex = hospitalized.groupby('sex_child')
_ = days_by_sex.boxplot(column='days_symptoms')
/usr/local/lib/python2.7/site-packages/pandas/tools/plotting.py:2421: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. warnings.warn(msg, FutureWarning)
Vitamin D levels by breastfeeding status
hospitalized.breastfed = hospitalized.breastfed.replace({0: 'Not breastfed', 1: 'Breastfed'})
# _ = hospitalized.groupby('breastfed').boxplot(column='hospitalized_vitamin_d', grid=False, fontsize=0)
# plt.gca().set_ylabel('Vitamin D levels')
for index, group in hospitalized.groupby(['breastfed']):
group.boxplot(column='hospitalized_vitamin_d')
_ = hospitalized.groupby('breastfed')['oxygen'].hist(alpha=0.3)
RSV count by oxygen status
_ = hospitalized.groupby('oxygen').boxplot(column='rsv_count')
hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under 2 months', '2-11 months', '12-23 months'
hospitalized = hospitalized.join(age_groups)
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian',
6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)
hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized['vitamin D < 20'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized['vitamin D < 11'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)
hospitalized['breastfed'] = (hospitalized.breastfed=='Breastfed').astype(int)
groupby_icu = hospitalized.groupby('icu_any')
def make_table(groupby, table_vars, replace_dict={}):
table = np.round(groupby[table_vars].mean(), 2).T
ratios = [calc_or(groupby, v) for v in table.index]
table['OR'] = pd.Series([r[0] for r in ratios], index=table.index)
table['Interval'] = [r[1] for r in ratios]
table['N'] = [r[2] for r in ratios]
table.rename(columns=replace_dict, inplace=True)
table.columns.name = None
return(table)
table_vars = ['male', 'under 2 months', '2-11 months', '12-23 months',
'Jordanian', 'Palestinian', 'vitamin D < 20', 'vitamin D < 11',
'prev_cond', 'heart_hx', 'breastfed', 'premature',
'adm_pneumo', 'adm_bronchopneumo', 'adm_sepsis', 'adm_bronchiolitis']
Function for calculating odds ratios and simulation-based intervals.
se = lambda p, n: np.sqrt(p * (1. - p) / n)
def odds_ratio(x, y, n_sim=10000, alpha=0.05):
try:
n_x, n_y = len(x.dropna()), len(y.dropna())
p_x, p_y = x.mean(), y.mean()
se_x = se(p_x, n_x)
se_y = se(p_y, n_y)
p_x_sim = np.random.normal(p_x, se_x, n_sim)
p_y_sim = np.random.normal(p_y, se_y, n_sim)
ratio = ((p_x_sim / (1. - p_x_sim)) /
(p_y_sim / (1. - p_y_sim)))
interval = np.percentile(ratio, [100*(alpha/2.), 100*(1. - alpha/2.)])
return np.round(np.median(ratio), 2), np.round(interval, 2).tolist(), (n_y, n_x)
except ValueError:
return np.nan, np.nan, np.nan
def calc_or(groupby, var):
data = list(groupby[var])
return odds_ratio(data[1][1], data[0][1])
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}
hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['Influenza'] = (hospitalized['pcr_result___3'] | hospitalized['pcr_result___4']).astype(int)
hospitalized['HMPV'] = hospitalized['pcr_result___2']
hospitalized['Rhino'] = hospitalized['pcr_result___5']
hospitalized['vitamin_d_norm'] = ((hospitalized.hospitalized_vitamin_d - hospitalized.hospitalized_vitamin_d.mean())
/ hospitalized.hospitalized_vitamin_d.std())
hospitalized['age_centered'] = hospitalized.age_months/hospitalized.age_months.mean()
pd.__version__
'0.10.0-5065-gfb38fb6'
groupby_o2 = hospitalized.groupby('oxygen')
virus_vars = ['RSV', 'Influenza', 'HMPV', 'Rhino']
make_table(groupby_o2, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})
No Oxygen | Oxygen | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.57 | 0.59 | 1.12 | [0.66, 1.93] | (189, 81) |
under 2 months | 0.06 | 0.23 | 4.98 | [2.26, 13.81] | (189, 81) |
2-11 months | 0.66 | 0.59 | 0.75 | [0.44, 1.29] | (189, 81) |
12-23 months | 0.23 | 0.10 | 0.36 | [0.11, 0.72] | (189, 81) |
Jordanian | 0.90 | 0.91 | 1.10 | [0.46, 4.11] | (189, 81) |
Palestinian | 0.05 | 0.06 | 1.17 | [0.14, 3.8] | (189, 81) |
vitamin D < 20 | 0.55 | 0.53 | 0.92 | [0.52, 1.6] | (164, 73) |
vitamin D < 11 | 0.40 | 0.38 | 0.95 | [0.52, 1.65] | (164, 73) |
prev_cond | 0.11 | 0.11 | 1.05 | [0.35, 2.32] | (189, 81) |
heart_hx | 0.05 | 0.04 | 0.77 | [-0.08, 2.81] | (189, 81) |
breastfed | 0.68 | 0.56 | 0.59 | [0.35, 1.02] | (189, 81) |
premature | 0.13 | 0.23 | 2.02 | [0.99, 4.01] | (189, 81) |
adm_pneumo | 0.09 | 0.25 | 3.33 | [1.6, 7.38] | (189, 81) |
adm_bronchopneumo | 0.52 | 0.28 | 0.37 | [0.2, 0.63] | (189, 81) |
adm_sepsis | 0.11 | 0.16 | 1.61 | [0.68, 3.51] | (189, 81) |
adm_bronchiolitis | 0.21 | 0.21 | 1.02 | [0.5, 1.9] | (189, 81) |
RSV | 0.23 | 0.35 | 1.80 | [0.99, 3.18] | (189, 81) |
Influenza | 0.03 | 0.02 | 0.78 | [-0.39, 3.91] | (189, 81) |
HMPV | 1.00 | 1.00 | NaN | NaN | NaN |
Rhino | 0.28 | 0.23 | 0.81 | [0.4, 1.44] | (189, 81) |
groupby_vent = hospitalized.groupby('vent')
make_table(groupby_vent, table_vars=table_vars+virus_vars,
replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})
No Ventilator | Ventilator | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.57 | 0.57 | 0.96 | [0.15, 7.47] | (263, 7) |
under 2 months | 0.10 | 0.43 | 6.55 | [0.57, 36.55] | (263, 7) |
2-11 months | 0.65 | 0.43 | 0.41 | [0.04, 2.12] | (263, 7) |
12-23 months | 0.19 | 0.14 | 0.69 | [-0.44, 2.98] | (263, 7) |
Jordanian | 0.90 | 1.00 | NaN | NaN | NaN |
Palestinian | 0.06 | 0.00 | NaN | NaN | NaN |
vitamin D < 20 | 0.55 | 0.43 | 0.61 | [0.05, 3.27] | (230, 7) |
vitamin D < 11 | 0.39 | 0.43 | 1.14 | [0.09, 5.81] | (230, 7) |
prev_cond | 0.10 | 0.29 | 3.51 | [-0.4, 15.66] | (263, 7) |
heart_hx | 0.04 | 0.14 | 3.86 | [-2.46, 19.82] | (263, 7) |
breastfed | 0.63 | 0.86 | 2.42 | [-40.14, 37.63] | (263, 7) |
premature | 0.16 | 0.14 | 0.87 | [-0.52, 3.65] | (263, 7) |
adm_pneumo | 0.13 | 0.43 | 5.12 | [0.42, 27.12] | (263, 7) |
adm_bronchopneumo | 0.45 | 0.29 | 0.48 | [-0.05, 2.03] | (263, 7) |
adm_sepsis | 0.12 | 0.14 | 1.22 | [-0.8, 5.26] | (263, 7) |
adm_bronchiolitis | 0.21 | 0.14 | 0.62 | [-0.41, 2.65] | (263, 7) |
RSV | 0.26 | 0.43 | 2.14 | [0.18, 11.73] | (263, 7) |
Influenza | 0.03 | 0.00 | NaN | NaN | NaN |
HMPV | 1.00 | 1.00 | NaN | NaN | NaN |
Rhino | 0.26 | 0.43 | 2.17 | [0.18, 12.07] | (263, 7) |
groupby_death = hospitalized.groupby('death')
make_table(groupby_death, table_vars=table_vars+virus_vars, replace_dict={False: 'Survived', True: 'Die'})
Survived | Die | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.58 | 0.5 | 0.57 | [-12.01, 14.33] | (271, 2) |
under 2 months | 0.11 | 0.5 | 6.31 | [-133.47, 138.76] | (271, 2) |
2-11 months | 0.65 | 0.0 | NaN | NaN | NaN |
12-23 months | 0.19 | 0.0 | NaN | NaN | NaN |
Jordanian | 0.91 | 0.5 | 0.07 | [-1.64, 1.74] | (271, 2) |
Palestinian | 0.06 | 0.0 | NaN | NaN | NaN |
vitamin D < 20 | 0.55 | 0.0 | NaN | NaN | NaN |
vitamin D < 11 | 0.39 | 0.0 | NaN | NaN | NaN |
prev_cond | 0.11 | 0.0 | NaN | NaN | NaN |
heart_hx | 0.04 | 0.0 | NaN | NaN | NaN |
breastfed | 0.64 | 0.5 | 0.42 | [-8.49, 11.28] | (271, 2) |
premature | 0.16 | 0.0 | NaN | NaN | NaN |
adm_pneumo | 0.13 | 0.5 | 4.80 | [-96.47, 118.21] | (271, 2) |
adm_bronchopneumo | 0.45 | 0.0 | NaN | NaN | NaN |
adm_sepsis | 0.12 | 0.5 | 5.46 | [-113.86, 119.82] | (271, 2) |
adm_bronchiolitis | 0.21 | 0.0 | NaN | NaN | NaN |
RSV | 0.26 | 0.5 | 2.16 | [-43.18, 55.72] | (271, 2) |
Influenza | 0.03 | 0.0 | NaN | NaN | NaN |
HMPV | 1.00 | 1.0 | NaN | NaN | NaN |
Rhino | 0.26 | 0.5 | 2.12 | [-46.95, 55.01] | (271, 2) |
groupby_breastfed = hospitalized.groupby('breastfed')
table_vars.pop(table_vars.index('breastfed'))
'breastfed'
make_table(groupby_breastfed, table_vars=table_vars+virus_vars, replace_dict={0: 'Not breastfed', 1: 'Breastfed'})
Not breastfed | Breastfed | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.52 | 0.61 | 1.42 | [0.86, 2.35] | (98, 175) |
under 2 months | 0.12 | 0.10 | 0.81 | [0.37, 2.01] | (98, 175) |
2-11 months | 0.64 | 0.65 | 1.02 | [0.59, 1.72] | (98, 175) |
12-23 months | 0.13 | 0.22 | 1.86 | [1.0, 4.18] | (98, 175) |
Jordanian | 0.92 | 0.90 | 0.83 | [0.24, 1.94] | (98, 175) |
Palestinian | 0.03 | 0.07 | 2.21 | [-9.65, 18.01] | (98, 175) |
vitamin D < 20 | 0.26 | 0.71 | 7.03 | [4.0, 13.42] | (88, 152) |
vitamin D < 11 | 0.09 | 0.56 | 12.77 | [6.48, 40.19] | (88, 152) |
prev_cond | 0.18 | 0.06 | 0.30 | [0.11, 0.66] | (98, 175) |
heart_hx | 0.06 | 0.03 | 0.54 | [0.09, 2.42] | (98, 175) |
premature | 0.34 | 0.06 | 0.13 | [0.05, 0.26] | (98, 175) |
adm_pneumo | 0.16 | 0.12 | 0.70 | [0.34, 1.51] | (98, 175) |
adm_bronchopneumo | 0.37 | 0.50 | 1.71 | [1.04, 2.91] | (98, 175) |
adm_sepsis | 0.16 | 0.10 | 0.59 | [0.27, 1.31] | (98, 175) |
adm_bronchiolitis | 0.23 | 0.19 | 0.75 | [0.41, 1.43] | (98, 175) |
RSV | 0.23 | 0.27 | 1.23 | [0.7, 2.29] | (98, 175) |
Influenza | 0.01 | 0.04 | 2.78 | [-37.71, 42.96] | (98, 175) |
HMPV | 1.00 | 1.00 | NaN | NaN | NaN |
Rhino | 0.23 | 0.28 | 1.28 | [0.73, 2.39] | (98, 175) |
Number of vitamin D records
hospitalized.hospitalized_vitamin_d.notnull().sum()
240
Median vitaimin D
hospitalized.hospitalized_vitamin_d.median()
16.549999999999997
Proportion of subjects with vitamin D < 20
(hospitalized.hospitalized_vitamin_d<20).mean()
0.47985347985347987
Number and proportion died:
hospitalized.death.value_counts().rename({False: 'Survived', True: 'Died'})
Survived 271 Died 2 dtype: int64
hospitalized.death.mean()
0.007326007326007326
groupby_death = hospitalized.groupby('death')
Median z-score
groupby_death['z_score'].median().rename({False: 'Survived', True: 'Died'})
death Survived 0.060 Died -0.825 Name: z_score, dtype: float64
Gestational age
groupby_death['gest_age'].median().rename({False: 'Survived', True: 'Died'})
death Survived 40.0 Died 37.5 Name: gest_age, dtype: float64
Vitamin D
groupby_death['hospitalized_vitamin_d'].median().rename({False: 'Survived', True: 'Died'})
death Survived 16.2 Died 21.1 Name: hospitalized_vitamin_d, dtype: float64
Previous conditions
groupby_death[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].mean().rename({False: 'Survived', True: 'Died'})
diabetes_hx | heart_hx | kidney_hx | downs_hx | sickle_hx | cancer_hx | cf_hx | genetic_hx | cp_hx | seizure_hx | neuro_hx | mr_hx | asthma_hx | immuno_hx | liver_hx | gerd_hx | diarrhea_hx | other_hx | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
death | ||||||||||||||||||
Survived | 0 | 0.04428 | 0.00369 | 0.01845 | 0 | 0 | 0 | 0.00369 | 0.00369 | 0.00369 | 0.01107 | 0 | 0.00738 | 0 | 0 | 0 | 0 | 0.01107 |
Died | 0 | 0.00000 | 0.00000 | 0.00000 | 0 | 0 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0 | 0.00000 | 0 | 0 | 0 | 0 | 0.00000 |
hospitalized[hospitalized.icu_any==1]['adm_pneumo'].sum()
7
hospitalized[hospitalized.icu_any==1]['adm_bronchopneumo'].sum()
2
hospitalized[hospitalized.icu_any==1]['adm_bronchiolitis'].sum()
1