%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
pd.__version__
'0.10.0-5065-gfb38fb6'
Correlate RSV severity with vitamin D levels (at hospital)
Covariates:
Set flag for extracting the RSV-positive subset
RSV_only = False
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()
/usr/local/lib/python2.7/site-packages/pandas/io/parsers.py:1130: DtypeWarning: Columns (140,142,144,146,148,181,206,207,212,213,261,280,281,282,296,297) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
greater_48hrs | fever_neutropenia | never_left | written_consent | child_name | mother_name | mother_birth_date | mother_record | mother_nationality | other_mother_nationality | ... | date | whole_blood_complete | age_months | length_of_stay | gest_age | death | hospitalized_vitamin_d | wheezing_ind | sex | z_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
case_id | |||||||||||||||||||||
A0001 | 0 | 0 | 0 | 1 | Remas Mahmoud Jbarah | Huda Katalo | 1976-01-21 | NaN | 3 | NaN | ... | NaN | 0 | 1 | 6 | 40 | False | 3 | 0 | F | 0.68 |
A0002 | 0 | 0 | 0 | 1 | Majed Abdel Kareem Majed | Noor SHa'aban Mahmood | 1989-09-09 | NaN | 3 | NaN | ... | NaN | 0 | 1 | 5 | 40 | False | 4 | 1 | M | -0.64 |
A0003 | 0 | 0 | 0 | 1 | Rayyan Jamal Muhyi Al.Deen | SAra Hussein Muhyi Al.Deen | 1965-01-01 | NaN | 1 | NaN | ... | NaN | 0 | 11 | 10 | 40 | False | 35 | 0 | F | -7.59 |
A0004 | 0 | 0 | 0 | 1 | Hanan Mohd Mustapha Abu Othman | Kawla Abu Shanab | 1983-10-31 | NaN | 1 | NaN | ... | NaN | 0 | 7 | 3 | 38 | False | 2 | 1 | F | -0.84 |
A0005 | 0 | 0 | 0 | 1 | Yara Mahmoud Azmi Ismael | Suha Abdel Aziz | 1986-02-28 | NaN | 1 | NaN | ... | NaN | 0 | 2 | 1 | 39 | False | 6 | 0 | F | -1.68 |
5 rows × 412 columns
hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date)
hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date)
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date)
hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1)
[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]
['diabetes_hx', 'heart_hx', 'kidney_hx', 'downs_hx', 'sickle_hx', 'cancer_hx', 'cf_hx', 'genetic_hx', 'cp_hx', 'seizure_hx', 'neuro_hx', 'mr_hx', 'asthma_hx', 'immuno_hx', 'liver_hx', 'gerd_hx', 'diarrhea_hx', 'other_hx', 'spec_hx']
Plot z-scores by year of admission
hospitalized['year_admission'] = hospitalized.admission_date.apply(lambda x: x.year)
hospitalized.groupby('year_admission')['z_score'].hist(alpha=0.2, bins=20)
year_admission 2010 Axes(0.125,0.125;0.775x0.775) 2011 Axes(0.125,0.125;0.775x0.775) 2012 Axes(0.125,0.125;0.775x0.775) 2013 Axes(0.125,0.125;0.775x0.775) Name: z_score, dtype: object
Calculate age in months from birth and enrollment date:
from calendar import monthrange
from datetime import timedelta
def monthdelta(d1, d2):
delta = 0
while True:
mdays = monthrange(d1.year, d1.month)[1]
d1 += timedelta(days=mdays)
if d1 <= d2:
delta += 1
else:
break
return delta
if RSV_only:
hospitalized = hospitalized[hospitalized.pcr_result___1==1]
Either ICU or direct admission to ICU
hospitalized['icu_any'] = (hospitalized.icu + hospitalized.dir_icu).astype(bool)
Severity score
hospitalized.flaring.value_counts()
0 1879 1 1009 2 264 3 17 dtype: int64
hospitalized.cyanosis[hospitalized.cyanosis==4] = np.nan
hospitalized.cyanosis.value_counts()
0 2549 1 558 2 51 3 8 dtype: int64
hospitalized.wheezing[hospitalized.wheezing==4] = np.nan
hospitalized.wheezing.value_counts()
1 1475 0 1412 2 270 3 10 dtype: int64
hospitalized.respiratory_rate = hospitalized.respiratory_rate.replace({'-': np.nan}).astype(float)
hospitalized['respiratory_class'] = 0
hospitalized.respiratory_class[(hospitalized.respiratory_rate>30) & (hospitalized.respiratory_rate<=45)] = 1
hospitalized.respiratory_class[(hospitalized.respiratory_rate>45) & (hospitalized.respiratory_rate<=60)] = 2
hospitalized.respiratory_class[hospitalized.respiratory_rate>60] = 3
hospitalized.respiratory_class.value_counts()
1 1648 0 764 2 636 3 121 dtype: int64
hospitalized['sats_class'] = 0
hospitalized.sats_class[(hospitalized.sats_number>=90) & (hospitalized.sats_number<95)] = 1
hospitalized.sats_class[(hospitalized.sats_number>=85) & (hospitalized.sats_number<90)] = 2
hospitalized.sats_class[hospitalized.sats_number<85] = 3
hospitalized.sats_class.value_counts()
0 3020 1 131 2 15 3 3 dtype: int64
hospitalized['severity_score'] = hospitalized[['flaring', 'cyanosis', 'wheezing', 'respiratory_class', 'sats_class']].sum(1)
hospitalized.severity_score.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x106303c50>
hospitalized.icu_any[hospitalized.sats_number<85]
case_id A0226 False C2028 False D3055 False Name: icu_any, dtype: bool
Month of admission
hospitalized['admission_month'] = hospitalized.admission_date.apply(lambda x: x.month)
covariates = hospitalized[["daycare",
"hospitalized_vitamin_d",
"gest_age",
"heart_hx",
"prev_cond",
"cigarette_smokers",
"cigarette_preg",
"sex_child",
"birth_wt_child",
"breastfed",
"severity_score",
"z_score",
"age_months",
"wheezing",
"admission_month"
]]
covariates.birth_wt_child.hist(bins=np.sqrt(len(covariates)))
<matplotlib.axes._subplots.AxesSubplot at 0x1062b3f10>
covariates.hospitalized_vitamin_d.hist(bins=np.sqrt(len(covariates)))
<matplotlib.axes._subplots.AxesSubplot at 0x106d84d50>
covariates.head()
daycare | hospitalized_vitamin_d | gest_age | heart_hx | prev_cond | cigarette_smokers | cigarette_preg | sex_child | birth_wt_child | breastfed | severity_score | z_score | age_months | wheezing | admission_month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
case_id | |||||||||||||||
A0001 | 0 | 3 | 40 | 0 | 0 | 0 | 0 | 0 | 2.95 | 1 | 6 | 0.68 | 1 | 0 | 3 |
A0002 | 0 | 4 | 40 | 0 | 0 | 1 | 0 | 1 | 3.00 | 1 | 4 | -0.64 | 1 | 1 | 3 |
A0003 | 0 | 35 | 40 | 0 | 1 | 0 | 0 | 0 | 2.00 | 0 | 3 | -7.59 | 11 | 0 | 3 |
A0004 | 0 | 2 | 38 | 0 | 0 | 1 | 0 | 0 | 2.70 | 1 | 2 | -0.84 | 7 | 1 | 3 |
A0005 | 0 | 6 | 39 | 0 | 0 | 1 | 0 | 0 | 3.00 | 1 | 2 | -1.68 | 2 | 0 | 3 |
covariates.daycare.mean()
0.015782828282828284
covariates.heart_hx.mean()
0.046071315872514992
covariates.cigarette_preg.mean()
0.076364783843483747
#axes = pd.scatter_matrix(covariates, figsize=(16,12))
severity = hospitalized[["oxygen", "days_oxygen",
"vent", "days_vent",
"icu_any",
"length_of_stay",
"death"]]
Some measures of severity are rare
severity.icu_any.mean()
0.098453770905648469
severity.oxygen.mean()
0.32281708094327599
severity.vent.mean()
0.035384124960153016
Histogram of days on oxygen
severity.days_oxygen.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x106e29090>
severity.death.mean()
0.0097822656989586618
Attributes of non-survivors
hospitalized.age_months[hospitalized.death].hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x106eabed0>
hospitalized.length_of_stay[hospitalized.death].hist(bins=15)
<matplotlib.axes._subplots.AxesSubplot at 0x109eea4d0>
hospitalized.gest_age[hospitalized.death].hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x109fd5150>
(hospitalized.gest_age<37).mean()
0.14200063111391606
(hospitalized[hospitalized.death].gest_age<37).mean()
0.16129032258064516
Proportion positive blood culture
hospitalized.blood_neg[hospitalized.blood_neg<2].mean()
0.12827225130890052
hospitalized.blood_neg[hospitalized.blood_neg<2][hospitalized.death].mean()
0.23076923076923078
hospitalized.blood_pneumo.mean()
0.021276595744680851
hospitalized.blood_pneumo[hospitalized.death].mean()
0.0
Histogram of days on ventilation
severity.days_vent.hist(bins=15)
<matplotlib.axes._subplots.AxesSubplot at 0x10a09e210>
hospitalized[hospitalized.death].vent.median()
0.0
hospitalized.death[hospitalized.vent.astype(bool)].mean()
0.055944055944055944
outcome = 'oxygen'
complete = (covariates.isnull().sum(axis=1).astype(bool)==False) & (severity[outcome].isnull().astype(bool)==False)
covariates_complete = covariates[complete]
y_complete = severity[outcome][complete]
variables = pd.concat([covariates, severity], axis=1)
Drop rows not worth imputing
variables[['cigarette_smokers', 'birth_wt_child',
'oxygen', 'length_of_stay', 'hospitalized_vitamin_d', 'breastfed']].isnull().sum()
cigarette_smokers 1 birth_wt_child 2 oxygen 31 length_of_stay 29 hospitalized_vitamin_d 480 breastfed 0 dtype: int64
variables = variables.dropna(subset=['cigarette_smokers', 'birth_wt_child',
'oxygen', 'length_of_stay', 'hospitalized_vitamin_d', 'breastfed', 'z_score'])
Standardize vitamin D, age in months, and birth weight variables
variables['vitamin_d_norm'] = ((variables.hospitalized_vitamin_d - variables.hospitalized_vitamin_d.mean())
/ variables.hospitalized_vitamin_d.std())
variables['wt_norm'] = ((variables.birth_wt_child - variables.birth_wt_child.mean()) /
variables.birth_wt_child.std())
variables['age_norm'] = ((variables.age_months - variables.age_months.mean()) /
variables.age_months.std())
Convert gestational age to number of weeks less than 37.
variables['premature'] = np.maximum(0, 37 - variables.gest_age)
variables.premature.hist(bins=20, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x109f54bd0>
Extract usable subset of covariates and response
variables_glm = variables[
['vitamin_d_norm', 'premature', 'cigarette_smokers', 'sex_child',
'wt_norm', 'oxygen', 'breastfed', 'z_score', 'age_norm']]
Confirm no nulls
variables_glm.isnull().sum(0)
vitamin_d_norm 0 premature 0 cigarette_smokers 0 sex_child 0 wt_norm 0 oxygen 0 breastfed 0 z_score 0 age_norm 0 dtype: int64
data_glm = variables_glm.to_dict(outtype='list')
from pymc import Model, glm
import theano.tensor as t
from pymc import sample, Slice, NUTS
from pymc import forestplot, traceplot, summary
def tinvlogit(x):
return t.exp(x) / (1 + t.exp(x))
with Model() as interactive_model:
formula = 'oxygen ~ (vitamin_d_norm + premature + cigarette_smokers + sex_child '
formula += '+ breastfed + z_score + age_norm) ** 2'
glm.glm(formula, data_glm,
family=glm.families.Binomial(link=glm.links.Logit))
with interactive_model:
trace_int = sample(2000, NUTS(interactive_model.vars), progressbar=False)
#_ = traceplot(trace_int)
Forest plot of parameter estimates. Thick line is the interquartile range of the estimate, thin line the 95% posterior interval. Positive values mean an increase in the probability of ending up on oxygen when the value of that parameter increasees by one unit.
So, for example, the z-score main effect is negative, so it decreases the probability of oxygen when it increases (which makes sense). The interaction of z-score with normalized age (bottom of plot) is negative, meaning that individuals that are older with higher z-scores even further reduce the probability of being on oxygen (non-additive effect).
plt.figure(figsize=(14,8))
forestplot(trace_int, vars=trace_int.varnames[1:])
<matplotlib.gridspec.GridSpec at 0x14d355410>
variables_rsv = variables.ix[hospitalized.pcr_result___1==1,
['vitamin_d_norm', 'premature', 'cigarette_smokers', 'sex_child', 'wt_norm',
'oxygen', 'breastfed', 'z_score', 'age_norm']].to_dict(outtype='list')
with Model() as rsv_model:
formula = 'oxygen ~ (vitamin_d_norm + premature + cigarette_smokers + sex_child + breastfed + z_score + age_norm) ** 2'
glm.glm(formula, variables_rsv,
family=glm.families.Binomial(link=glm.links.Logit))
with rsv_model:
trace_rsv = sample(2000, NUTS(), progressbar=False)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-260-7051395c008c> in <module>() 1 with rsv_model: ----> 2 trace_rsv = sample(2000, NUTS(), progressbar=False) /Users/fonnescj/GitHub/pymc/pymc/step_methods/nuts.pyc in __init__(self, vars, scaling, step_scale, is_cov, state, Emax, target_accept, gamma, k, t0, model) 57 58 if isinstance(scaling, dict): ---> 59 scaling = guess_scaling(Point(scaling, model=model), model=model, vars = vars) 60 61 /Users/fonnescj/GitHub/pymc/pymc/tuning/scaling.pyc in guess_scaling(point, vars, model) 77 def guess_scaling(point, vars=None, model=None): 78 model = modelcontext(model) ---> 79 h = find_hessian_diag(point, vars, model=model) 80 return adjust_scaling(h) 81 /Users/fonnescj/GitHub/pymc/pymc/tuning/scaling.pyc in find_hessian_diag(point, vars, model) 72 """ 73 model = modelcontext(model) ---> 74 H = model.fastfn(hessian_diag(model.logpt, vars)) 75 return H(Point(point, model=model)) 76 /Users/fonnescj/GitHub/pymc/pymc/memoize.pyc in memoizer(*args, **kwargs) 12 13 if key not in cache: ---> 14 cache[key] = obj(*args, **kwargs) 15 16 return cache[key] /Users/fonnescj/GitHub/pymc/pymc/theanof.pyc in hessian_diag(f, vars) 92 vars = cont_inputs(f) 93 ---> 94 return -t.concatenate([hessian_diag1(f, v) for v in vars], axis=0) 95 96 /Users/fonnescj/GitHub/pymc/pymc/theanof.pyc in hessian_diag1(f, v) 78 def hessian_diag1(f, v): 79 ---> 80 g = gradient1(f, v) 81 idx = t.arange(g.shape[0]) 82 /Users/fonnescj/GitHub/pymc/pymc/theanof.pyc in gradient1(f, v) 41 def gradient1(f, v): 42 """flat gradient of f wrt v""" ---> 43 return t.flatten(t.grad(f, v, disconnected_inputs='warn')) 44 45 /usr/local/lib/python2.7/site-packages/theano/gradient.pyc in grad(cost, wrt, consider_constant, disconnected_inputs, add_names, known_grads, return_disconnected) 527 528 rval = _populate_grad_dict(var_to_app_to_idx, --> 529 grad_dict, wrt, cost_name) 530 531 for i in xrange(len(rval)): /usr/local/lib/python2.7/site-packages/theano/gradient.pyc in _populate_grad_dict(var_to_app_to_idx, grad_dict, wrt, cost_name) 1205 return grad_dict[var] 1206 -> 1207 rval = [access_grad_cache(elem) for elem in wrt] 1208 1209 return rval /usr/local/lib/python2.7/site-packages/theano/gradient.pyc in access_grad_cache(var) 1165 for idx in node_to_idx[node]: 1166 -> 1167 term = access_term_cache(node)[idx] 1168 1169 if not isinstance(term, gof.Variable): /usr/local/lib/python2.7/site-packages/theano/gradient.pyc in access_term_cache(node) 1026 str(g_shape)) 1027 -> 1028 input_grads = node.op.grad(inputs, new_output_grads) 1029 1030 if input_grads is None: /usr/local/lib/python2.7/site-packages/theano/tensor/opt.pyc in grad(self, inputs, output_gradients) 628 grads = [] 629 for i, inp in enumerate(inputs): --> 630 grads.append(output_gradients[0][i]) 631 return grads 632 /usr/local/lib/python2.7/site-packages/theano/tensor/var.pyc in __getitem__(self, args) 408 return theano.tensor.subtensor.Subtensor(args)( 409 self, *theano.tensor.subtensor.Subtensor.collapse(args, --> 410 lambda entry: isinstance(entry, Variable))) 411 412 def take(self, indices, axis=None, mode='raise'): /usr/local/lib/python2.7/site-packages/theano/gof/op.pyc in __call__(self, *inputs, **kwargs) 458 # compute output value once with test inputs to validate graph 459 thunk = node.op.make_thunk(node, storage_map, compute_map, --> 460 no_recycling=[]) 461 thunk.inputs = [storage_map[v] for v in node.inputs] 462 thunk.outputs = [storage_map[v] for v in node.outputs] /usr/local/lib/python2.7/site-packages/theano/gof/op.pyc in make_thunk(self, node, storage_map, compute_map, no_recycling) 606 if self._op_use_c_code: 607 try: --> 608 e = FunctionGraph(node.inputs, node.outputs) 609 610 e_no_recycling = [new_o /usr/local/lib/python2.7/site-packages/theano/gof/fg.pyc in __init__(self, inputs, outputs, features, clone) 91 """ 92 if clone: ---> 93 inputs, outputs = graph.clone(inputs, outputs) 94 95 self.execute_callbacks_time = 0 /usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in clone(i, o, copy_inputs) 659 Returns the inputs and outputs of that copy. 660 """ --> 661 equiv = clone_get_equiv(i, o, copy_inputs) 662 return [equiv[input] for input in i], [equiv[output] for output in o] 663 /usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in clone_get_equiv(inputs, outputs, copy_inputs_and_orphans, memo) 703 704 # go through the inputs -> outputs graph cloning as we go --> 705 for apply in io_toposort(inputs, outputs): 706 for input in apply.inputs: 707 if input not in memo: /usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in io_toposort(inputs, outputs, orderings) 814 return rval 815 --> 816 topo = general_toposort(outputs, deps) 817 return [o for o in topo if isinstance(o, Apply)] 818 /usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in general_toposort(r_out, deps, debug_print) 767 for client in clients.get(node, []): 768 deps_cache[client] = [a for a in deps_cache[client] if a is not node] --> 769 if not deps_cache[client]: 770 sources.append(client) 771 KeyboardInterrupt:
Traceback (most recent call last): File "/usr/local/lib/python2.7/site-packages/IPython/kernel/zmq/ipkernel.py", line 416, in execute_request shell.run_cell(code, store_history=store_history, silent=silent) File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2754, in run_cell self.events.trigger('post_execute') File "/usr/local/lib/python2.7/site-packages/IPython/core/events.py", line 82, in trigger func(*args, **kwargs) File "/usr/local/lib/python2.7/site-packages/IPython/kernel/zmq/pylab/backend_inline.py", line 118, in flush_figures return show(True) File "/usr/local/lib/python2.7/site-packages/IPython/kernel/zmq/pylab/backend_inline.py", line 47, in show matplotlib.pyplot.close('all') File "/usr/local/lib/python2.7/site-packages/matplotlib/pyplot.py", line 517, in close _pylab_helpers.Gcf.destroy_all() File "/usr/local/lib/python2.7/site-packages/matplotlib/_pylab_helpers.py", line 93, in destroy_all gc.collect() KeyboardInterrupt
plt.figure(figsize=(14,8))
forestplot(trace_rsv, vars=trace_rsv.varnames[1:])
Distribution of smoking exposure
hospitalized.cigarette_smokers.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10a227190>
Proportion and of children exposed to cigarette smoking
(hospitalized.cigarette_smokers > 0).mean()
0.72578100347112651
(hospitalized.cigarette_smokers > 0).sum()
2300
Distribution of nargila exposure
hospitalized.nargila_smokers.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10a4617d0>
Proportion and number of children exposed to nargila
(hospitalized.nargila_smokers > 0).mean()
0.17828968128747238
(hospitalized.nargila_smokers > 0).sum()
565
Proportion of children with mothers who smoked during pregnancy
hospitalized.cigarette_preg.mean()
0.076364783843483747
hospitalized.nargila_preg.mean()
0.028400126222783213
Days of symptoms before admission by gender
days_by_sex = hospitalized.groupby('sex_child')
_ = days_by_sex.boxplot(column='days_symptoms')
/usr/local/lib/python2.7/site-packages/pandas/tools/plotting.py:2421: FutureWarning: The default value for 'return_type' will change to 'axes' in a future release. To use the future behavior now, set return_type='axes'. To keep the previous behavior and silence this warning, set return_type='dict'. warnings.warn(msg, FutureWarning)
Vitamin D levels by breastfeeding status
TODO: clean up plot for presentation
hospitalized.breastfed = hospitalized.breastfed.replace({0: 'Not breastfed', 1: 'Breastfed'})
# _ = hospitalized.groupby('breastfed').boxplot(column='hospitalized_vitamin_d', grid=False, fontsize=0)
# plt.gca().set_ylabel('Vitamin D levels')
for index, group in hospitalized.groupby(['breastfed']):
group.boxplot(column='hospitalized_vitamin_d')
_ = hospitalized.groupby('breastfed')['oxygen'].hist(alpha=0.3)
RSV count by oxygen status
_ = hospitalized.groupby('oxygen').boxplot(column='rsv_count')
rsv_only = hospitalized[hospitalized.pcr_result___1==1]
rsv_only['vitd_lt_20'] = rsv_only.hospitalized_vitamin_d<20
rsv_only.groupby('vitd_lt_20')['oxygen'].describe()
vitd_lt_20 False count 663.000000 mean 0.380090 std 0.485775 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 True count 719.000000 mean 0.436718 std 0.496324 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 dtype: float64
Proportion on oxygen and vitamin D, by month
groupby_month = hospitalized.groupby('admission_month')
ax = groupby_month['oxygen'].mean().plot(grid=False, color='c', figsize=(14,8))
groupby_month['hospitalized_vitamin_d'].mean().plot(secondary_y=True, color='r')
ax.set_ylabel('Proportion on oxygen', color='c')
ax.right_ax.set_ylabel('Mean vitamin D', color='r')
ax.set_xlabel('Admission month')
ax.set_xticklabels(['Jan','Feb','Apr','Jun','Aug','Oct','Dec'])
#ax.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
_ = plt.xlim(1,12)
ply = plotly.plotly.sign_in('jordan_ari', 'ixur4zqdut')
c = ['#1f77b4', # muted blue
'#ff7f0e', # safety orange
'#2ca02c' # cooked asparagus green
]
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
groupby_month = hospitalized.groupby('admission_month')
data = [{'y': list(groupby_month['oxygen'].mean()),
'x': months,
'name': 'oxygen'},
{'y': list(groupby_month['hospitalized_vitamin_d'].mean()),
'x': months,
'name': 'vitamin D',
'yaxis': 'y2'},
{'y': list(groupby_month['pcr_result___1'].mean()),
'x': months,
'name': 'percent RSV',
'yaxis': 'y3'},
# {'y': list(groupby_month['pcr_result___5'].mean()),
# 'x': months,
# 'name': 'percent rhino',
# 'yaxis': 'y4'},
]
layout = {
'showlegend': False,
"xaxis":{
"domain":[0.2, 0.9]
},
"yaxis":{
"title": 'Proportion on oxygen',
"position": 0.1,
"range":[0,1],
"titlefont":{
"color":c[0]
},
"tickfont":{
"color":c[0]
},
},
"yaxis2":{
"overlaying":"y",
"side":"right",
"anchor":"x",
"title": 'Mean vitamin D',
"titlefont":{
"color":c[1]
},
"tickfont":{
"color":c[1]
},
},
"yaxis3":{
"overlaying":"y",
"side":"left",
"anchor":"free",
"position": -0.15,
"range":[0,1],
"title": "Proportion RSV",
"titlefont":{
"color":c[2]
},
"tickfont":{
"color":c[2]
},
},
# "yaxis4":{
# "overlaying":"y",
# "side":"left",
# "anchor":"free",
# "position": -0.3,
# "range":[0,1],
# "title": "Proportion rhino",
# "titlefont":{
# "color":"#969696"
# },
# "tickfont":{
# "color":"#969696"
# },
# }
}
#ply.plot(data, layout=layout, width=750,height=500)
hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under_2_months', 'months_2_11', 'months_12_23'
hospitalized = hospitalized.join(age_groups)
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian',
6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)
hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized['vitamin D < 20'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized['vitamin D < 11'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)
hospitalized['breastfed'] = (hospitalized.breastfed=='Breastfed').astype(int)
#hospitalized.replace({'icu_any': {True: 'ICU', False: 'No ICU'},
# 'oxygen': {0: 'No oxygen', 1: 'Oxygen'}})['icu_any']
groupby_icu = hospitalized.groupby('icu_any')
def make_table(groupby, table_vars, replace_dict={}):
table = np.round(groupby[table_vars].mean(), 2).T
ratios = [calc_or(groupby, v) for v in table.index]
table['OR'] = [r[0] for r in ratios]
table['Interval'] = [r[1] for r in ratios]
table['N'] = [r[2] for r in ratios]
table.rename(columns=replace_dict, inplace=True)
table.columns.name = None
return(table)
table_vars = ['male', 'under_2_months', 'months_2_11', 'months_12_23',
'Jordanian', 'Palestinian', 'vitamin D < 20', 'vitamin D < 11',
'prev_cond', 'heart_hx', 'breastfed', 'premature',
'adm_pneumo', 'adm_bronchopneumo', 'adm_sepsis', 'adm_bronchiolitis']
Function for calculating odds ratios and simulation-based intervals.
se = lambda p, n: np.sqrt(p * (1. - p) / n)
def odds_ratio(x, y, n_sim=10000, alpha=0.05):
try:
n_x, n_y = len(x.dropna()), len(y.dropna())
p_x, p_y = x.mean(), y.mean()
se_x = se(p_x, n_x)
se_y = se(p_y, n_y)
p_x_sim = np.random.normal(p_x, se_x, n_sim)
p_y_sim = np.random.normal(p_y, se_y, n_sim)
ratio = ((p_x_sim / (1. - p_x_sim)) /
(p_y_sim / (1. - p_y_sim)))
interval = np.percentile(ratio, [100*(alpha/2.), 100*(1. - alpha/2.)])
return np.round(np.median(ratio), 2), np.round(interval, 2).tolist(), (n_y, n_x)
except ValueError:
return np.nan, np.nan, np.nan
def calc_or(groupby, var):
data = list(groupby[var])
return odds_ratio(data[1][1], data[0][1])
virus_vars = ['RSV', 'Influenza', 'HMPV', 'Rhino']
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}
hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['Influenza'] = (hospitalized['pcr_result___3'] | hospitalized['pcr_result___4']).astype(int)
hospitalized['HMPV'] = hospitalized['pcr_result___2']
hospitalized['Rhino'] = hospitalized['pcr_result___5']
hospitalized['vitamin_d_norm'] = ((hospitalized.hospitalized_vitamin_d - hospitalized.hospitalized_vitamin_d.mean())
/ hospitalized.hospitalized_vitamin_d.std())
hospitalized['age_centered'] = hospitalized.age_months/hospitalized.age_months.mean()
bronchiolitis_subset = hospitalized[hospitalized.adm_bronchiolitis==1]
bronchiolitis_subset.shape
(547, 435)
bronchiolitis_subset.severity_score.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1144dd050>
bronchiolitis_subset = bronchiolitis_subset.dropna(subset=['vitamin_d_norm', 'male'] + virus_vars)
import theano.tensor as T
from pymc import Bound, Normal, Uniform
BoundedNormal = Bound(Normal, 0, 15)
with Model() as bronchiolitis_severity_model:
beta = Normal('beta', 0, 0.1, shape=6)
mu = BoundedNormal('mu', 4, 0.1)
theta = mu + T.dot(bronchiolitis_subset[['vitamin_d_norm', 'male'] + virus_vars], beta)
sigma = Uniform('sigma', 0, 4)
tau = sigma ** -2
score = BoundedNormal('score', theta, tau, observed=bronchiolitis_subset.severity_score)
from pymc import sample
with bronchiolitis_severity_model:
trace_severity = sample(2000, NUTS(), progressbar=False)
from pymc import forestplot
_ = forestplot(trace_severity, vars=['beta'], ylabels=['vitamin_d_norm', 'male'] + virus_vars)
posterior_samples = {v: trace_severity.traces[0][v] for v in ['mu', 'beta']}
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-106-9cf9e8fda0d7> in <module>() ----> 1 posterior_samples = {v: trace_severity.traces[0][v] for v in ['mu', 'beta']} <ipython-input-106-9cf9e8fda0d7> in <dictcomp>((v,)) ----> 1 posterior_samples = {v: trace_severity.traces[0][v] for v in ['mu', 'beta']} AttributeError: 'NpTrace' object has no attribute 'traces'
bronchiolitis_subset.index = range(bronchiolitis_subset.shape[0])
def posterior_check(i, dataset=bronchiolitis_subset, samples=trace_severity):
row = dataset.ix[i]
pred = samples['mu']
pred += row[['vitamin_d_norm', 'male'] + virus_vars].values.dot(samples['beta'].T)
plt.hist(pred)
print(row.severity_score)
plt.axvline(row.severity_score, color='r')
posterior_check(250)
4.0
rsv_subset = hospitalized[hospitalized.RSV==1]
rsv_subset.shape
(1397, 435)
rsv_subset.severity_score.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10a461290>
rsv_vars = ['vitamin_d_norm'] + table_vars[:]
rsv_vars.pop(rsv_vars.index('vitamin D < 20'))
rsv_vars.pop(rsv_vars.index('vitamin D < 11'))
rsv_vars.pop(rsv_vars.index('Palestinian'))
rsv_vars.pop(rsv_vars.index('Jordanian'))
'Jordanian'
Drop missing values
rsv_subset[rsv_vars].isnull().sum()
vitamin_d_norm 204 male 0 under_2_months 0 months_2_11 0 months_12_23 0 prev_cond 0 heart_hx 0 breastfed 0 premature 0 adm_pneumo 0 adm_bronchopneumo 0 adm_sepsis 0 adm_bronchiolitis 0 dtype: int64
rsv_subset = rsv_subset.dropna(subset=['vitamin_d_norm', 'male'] + virus_vars)
with Model() as rsv_severity_model:
beta = Normal('beta', 0, 0.1, shape=len(rsv_vars))
mu = BoundedNormal('mu', 4, 0.1)
theta = mu + T.dot(rsv_subset[rsv_vars], beta)
sigma = Uniform('sigma', 0, 4)
tau = sigma ** -2
score = BoundedNormal('score', theta, tau, observed=rsv_subset.severity_score)
with rsv_severity_model:
trace_rsv_severity = sample(2000, NUTS())
[-----------------100%-----------------] 2000 of 2000 complete in 13.0 sec
_ = forestplot(trace_rsv_severity, vars=['beta'], ylabels=rsv_vars)
'+'.join(rsv_vars)
'vitamin_d_norm+male+under_2_months+months_2_11+months_12_23+prev_cond+heart_hx+breastfed+premature+adm_pneumo+adm_bronchopneumo+adm_sepsis+adm_bronchiolitis'
with Model() as rsv_oxygen_model:
formula = 'oxygen ~ ' + '+'.join(rsv_vars)
glm.glm(formula, rsv_subset,
family=glm.families.Binomial(link=glm.links.Logit))
with rsv_oxygen_model:
trace_int = sample(2000, NUTS())
/usr/local/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:85: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility from scan_perform.scan_perform import * /usr/local/lib/python2.7/site-packages/theano/tensor/subtensor.py:128: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future. start in [None, 0] or
[-----------------100%-----------------] 2000 of 2000 complete in 37.0 sec
_ = forestplot(trace_int, vars=rsv_vars, ylabels=rsv_vars)
bronchiolitis_subset.replace({'RSV': {1: 'RSV', 0: 'No RSV'}}).groupby('RSV').boxplot(column='hospitalized_vitamin_d');
pneumonia_subset = hospitalized[hospitalized.adm_pneumo==1]
pneumonia_subset.shape
(394, 435)
pneumonia_subset.replace({'RSV': {1: 'RSV', 0: 'No RSV'}}).groupby('RSV').boxplot(column='hospitalized_vitamin_d');
bronchopneumo_subset = hospitalized[hospitalized.adm_bronchopneumo==1]
bronchopneumo_subset.shape
(1020, 435)
bronchopneumo_subset.replace({'RSV': {1: 'RSV', 0: 'No RSV'}}).groupby('RSV').boxplot(column='hospitalized_vitamin_d');
groupby_o2 = hospitalized.groupby('oxygen')
make_table(groupby_o2, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})
No Oxygen | Oxygen | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.62 | 0.58 | 0.86 | [0.74, 1.0] | (2125, 1013) |
under 2 months | 0.17 | 0.23 | 1.47 | [1.22, 1.76] | (2125, 1013) |
2-11 months | 0.54 | 0.53 | 0.96 | [0.82, 1.11] | (2125, 1013) |
12-23 months | 0.18 | 0.11 | 0.57 | [0.44, 0.71] | (2125, 1013) |
Jordanian | 0.89 | 0.91 | 1.21 | [0.94, 1.59] | (2125, 1013) |
Palestinian | 0.08 | 0.06 | 0.76 | [0.54, 1.02] | (2125, 1013) |
vitamin D < 20 | 0.57 | 0.60 | 1.13 | [0.96, 1.34] | (1807, 858) |
vitamin D < 11 | 0.36 | 0.43 | 1.30 | [1.1, 1.54] | (1807, 858) |
prev_cond | 0.09 | 0.18 | 2.16 | [1.73, 2.69] | (2125, 1013) |
heart_hx | 0.03 | 0.07 | 2.07 | [1.46, 2.9] | (2125, 1013) |
breastfed | 0.62 | 0.59 | 0.88 | [0.76, 1.03] | (2125, 1013) |
premature | 0.13 | 0.17 | 1.38 | [1.11, 1.71] | (2125, 1013) |
adm_pneumo | 0.09 | 0.21 | 2.78 | [2.24, 3.45] | (2125, 1013) |
adm_bronchopneumo | 0.34 | 0.28 | 0.73 | [0.62, 0.86] | (2125, 1013) |
adm_sepsis | 0.33 | 0.20 | 0.51 | [0.43, 0.61] | (2125, 1013) |
adm_bronchiolitis | 0.15 | 0.21 | 1.49 | [1.23, 1.8] | (2125, 1013) |
RSV | 0.38 | 0.56 | 2.03 | [1.75, 2.37] | (2125, 1013) |
Influenza | 0.04 | 0.03 | 0.73 | [0.43, 1.11] | (2125, 1013) |
HMPV | 0.09 | 0.08 | 0.89 | [0.67, 1.16] | (2125, 1013) |
Rhino | 0.39 | 0.38 | 0.95 | [0.82, 1.11] | (2125, 1013) |
groupby_vent = hospitalized.groupby('vent')
make_table(groupby_vent, table_vars=table_vars+virus_vars,
replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})
No Ventilator | Ventilator | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.60 | 0.59 | 0.93 | [0.63, 1.37] | (3026, 111) |
under 2 months | 0.19 | 0.24 | 1.36 | [0.83, 2.04] | (3026, 111) |
2-11 months | 0.54 | 0.47 | 0.77 | [0.51, 1.12] | (3026, 111) |
12-23 months | 0.15 | 0.14 | 0.93 | [0.47, 1.46] | (3026, 111) |
Jordanian | 0.89 | 0.94 | 1.75 | [0.95, 6.11] | (3026, 111) |
Palestinian | 0.07 | 0.04 | 0.49 | [0.02, 1.0] | (3026, 111) |
vitamin D < 20 | 0.58 | 0.50 | 0.70 | [0.47, 1.03] | (2557, 107) |
vitamin D < 11 | 0.39 | 0.35 | 0.84 | [0.54, 1.24] | (2557, 107) |
prev_cond | 0.12 | 0.28 | 2.98 | [1.83, 4.43] | (3026, 111) |
heart_hx | 0.04 | 0.14 | 3.51 | [1.66, 5.81] | (3026, 111) |
breastfed | 0.61 | 0.59 | 0.95 | [0.65, 1.43] | (3026, 111) |
premature | 0.14 | 0.18 | 1.35 | [0.76, 2.11] | (3026, 111) |
adm_pneumo | 0.12 | 0.23 | 2.23 | [1.34, 3.4] | (3026, 111) |
adm_bronchopneumo | 0.32 | 0.33 | 1.06 | [0.69, 1.56] | (3026, 111) |
adm_sepsis | 0.29 | 0.22 | 0.68 | [0.4, 1.03] | (3026, 111) |
adm_bronchiolitis | 0.17 | 0.19 | 1.13 | [0.63, 1.73] | (3026, 111) |
RSV | 0.44 | 0.47 | 1.12 | [0.76, 1.64] | (3026, 111) |
Influenza | 0.03 | 0.02 | 0.53 | [-0.2, 1.32] | (3026, 111) |
HMPV | 0.09 | 0.06 | 0.71 | [0.19, 1.29] | (3026, 111) |
Rhino | 0.39 | 0.37 | 0.92 | [0.61, 1.34] | (3026, 111) |
groupby_death = hospitalized.groupby('death')
make_table(groupby_death, table_vars=table_vars+virus_vars, replace_dict={False: 'Survived', True: 'Die'})
Survived | Die | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.60 | 0.48 | 0.61 | [0.29, 1.26] | (3138, 31) |
under 2 months | 0.19 | 0.39 | 2.68 | [1.18, 5.39] | (3138, 31) |
2-11 months | 0.53 | 0.39 | 0.55 | [0.25, 1.1] | (3138, 31) |
12-23 months | 0.15 | 0.06 | 0.38 | [-0.12, 0.98] | (3138, 31) |
Jordanian | 0.90 | 0.87 | 0.78 | [0.33, 4.82] | (3138, 31) |
Palestinian | 0.07 | 0.10 | 1.40 | [-0.11, 3.37] | (3138, 31) |
vitamin D < 20 | 0.58 | 0.44 | 0.56 | [0.23, 1.26] | (2664, 25) |
vitamin D < 11 | 0.39 | 0.28 | 0.62 | [0.18, 1.36] | (2664, 25) |
prev_cond | 0.12 | 0.52 | 8.05 | [3.84, 17.18] | (3138, 31) |
heart_hx | 0.05 | 0.13 | 3.13 | [0.24, 7.17] | (3138, 31) |
breastfed | 0.61 | 0.42 | 0.46 | [0.21, 0.94] | (3138, 31) |
premature | 0.14 | 0.16 | 1.17 | [0.2, 2.53] | (3138, 31) |
adm_pneumo | 0.12 | 0.32 | 3.42 | [1.37, 6.95] | (3138, 31) |
adm_bronchopneumo | 0.32 | 0.13 | 0.31 | [0.02, 0.69] | (3138, 31) |
adm_sepsis | 0.28 | 0.52 | 2.71 | [1.3, 5.77] | (3138, 31) |
adm_bronchiolitis | 0.17 | 0.03 | 0.16 | [-0.14, 0.51] | (3138, 31) |
RSV | 0.44 | 0.23 | 0.37 | [0.11, 0.74] | (3138, 31) |
Influenza | 0.03 | 0.13 | 4.52 | [0.31, 10.28] | (3138, 31) |
HMPV | 0.09 | 0.06 | 0.72 | [-0.22, 1.89] | (3138, 31) |
Rhino | 0.39 | 0.42 | 1.13 | [0.5, 2.31] | (3138, 31) |
The following calculations repeat the above tables, but for the RSV subset.
rsv_only = hospitalized[hospitalized.pcr_result___1==1]
rsv_icu = rsv_only.groupby('icu_any')
virus_vars
['RSV', 'Influenza', 'HMPV', 'Rhino']
make_table(rsv_icu, table_vars=table_vars+virus_vars[1:], replace_dict={False: 'No ICU', True: 'ICU'})
No ICU | ICU | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.60 | 0.56 | 0.86 | [0.61, 1.22] | (1255, 142) |
under 2 months | 0.19 | 0.32 | 2.09 | [1.41, 3.0] | (1255, 142) |
2-11 months | 0.61 | 0.38 | 0.39 | [0.27, 0.56] | (1255, 142) |
12-23 months | 0.12 | 0.06 | 0.48 | [0.17, 0.84] | (1255, 142) |
Jordanian | 0.90 | 0.89 | 0.87 | [0.54, 1.75] | (1255, 142) |
Palestinian | 0.07 | 0.08 | 1.18 | [0.47, 2.06] | (1255, 142) |
vitamin D < 20 | 0.60 | 0.66 | 1.28 | [0.88, 1.97] | (1072, 121) |
vitamin D < 11 | 0.44 | 0.50 | 1.24 | [0.84, 1.8] | (1072, 121) |
prev_cond | 0.07 | 0.12 | 1.88 | [0.94, 3.1] | (1255, 142) |
heart_hx | 0.03 | 0.03 | 0.99 | [0.05, 2.23] | (1255, 142) |
breastfed | 0.64 | 0.58 | 0.77 | [0.55, 1.1] | (1255, 142) |
premature | 0.12 | 0.20 | 1.89 | [1.16, 2.84] | (1255, 142) |
adm_pneumo | 0.14 | 0.32 | 2.89 | [1.92, 4.18] | (1255, 142) |
adm_bronchopneumo | 0.36 | 0.14 | 0.29 | [0.16, 0.44] | (1255, 142) |
adm_sepsis | 0.15 | 0.42 | 4.04 | [2.74, 5.84] | (1255, 142) |
adm_bronchiolitis | 0.28 | 0.17 | 0.53 | [0.31, 0.79] | (1255, 142) |
Influenza | 0.02 | 0.02 | 0.91 | [-0.11, 2.27] | (1255, 142) |
HMPV | 0.05 | 0.07 | 1.49 | [0.56, 2.71] | (1255, 142) |
Rhino | 0.33 | 0.32 | 0.97 | [0.65, 1.37] | (1255, 142) |
rsv_o2 = rsv_only.groupby('oxygen')
make_table(rsv_o2, table_vars=table_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})
No Oxygen | Oxygen | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.61 | 0.57 | 0.82 | [0.66, 1.02] | (816, 566) |
under 2 months | 0.15 | 0.28 | 2.27 | [1.74, 3.0] | (816, 566) |
2-11 months | 0.63 | 0.52 | 0.62 | [0.5, 0.77] | (816, 566) |
12-23 months | 0.16 | 0.06 | 0.34 | [0.22, 0.5] | (816, 566) |
Jordanian | 0.90 | 0.90 | 0.95 | [0.67, 1.39] | (816, 566) |
Palestinian | 0.07 | 0.07 | 0.97 | [0.61, 1.49] | (816, 566) |
vitamin D < 20 | 0.58 | 0.65 | 1.34 | [1.05, 1.71] | (697, 483) |
vitamin D < 11 | 0.41 | 0.51 | 1.49 | [1.18, 1.89] | (697, 483) |
prev_cond | 0.06 | 0.08 | 1.32 | [0.86, 2.03] | (816, 566) |
heart_hx | 0.03 | 0.03 | 1.12 | [0.54, 2.17] | (816, 566) |
breastfed | 0.65 | 0.62 | 0.90 | [0.72, 1.12] | (816, 566) |
premature | 0.11 | 0.15 | 1.45 | [1.05, 1.98] | (816, 566) |
adm_pneumo | 0.12 | 0.23 | 2.20 | [1.63, 2.93] | (816, 566) |
adm_bronchopneumo | 0.42 | 0.22 | 0.40 | [0.31, 0.5] | (816, 566) |
adm_sepsis | 0.15 | 0.23 | 1.71 | [1.3, 2.27] | (816, 566) |
adm_bronchiolitis | 0.27 | 0.27 | 0.98 | [0.77, 1.25] | (816, 566) |
rsv_vent = rsv_only.groupby('vent')
make_table(rsv_vent, table_vars=table_vars,
replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})
No Ventilator | Ventilator | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.60 | 0.56 | 0.85 | [0.49, 1.56] | (1329, 52) |
under 2 months | 0.20 | 0.33 | 2.00 | [1.01, 3.51] | (1329, 52) |
2-11 months | 0.59 | 0.46 | 0.59 | [0.33, 1.03] | (1329, 52) |
12-23 months | 0.12 | 0.08 | 0.62 | [0.03, 1.33] | (1329, 52) |
Jordanian | 0.90 | 0.94 | 1.75 | [-7.45, 15.22] | (1329, 52) |
Palestinian | 0.07 | 0.04 | 0.54 | [-0.19, 1.44] | (1329, 52) |
vitamin D < 20 | 0.62 | 0.48 | 0.58 | [0.32, 1.04] | (1127, 52) |
vitamin D < 11 | 0.45 | 0.38 | 0.75 | [0.41, 1.3] | (1127, 52) |
prev_cond | 0.07 | 0.10 | 1.40 | [0.22, 2.9] | (1329, 52) |
heart_hx | 0.03 | 0.00 | NaN | NaN | NaN |
breastfed | 0.64 | 0.52 | 0.61 | [0.35, 1.07] | (1329, 52) |
premature | 0.13 | 0.13 | 1.08 | [0.31, 2.1] | (1329, 52) |
adm_pneumo | 0.16 | 0.27 | 1.96 | [0.94, 3.48] | (1329, 52) |
adm_bronchopneumo | 0.34 | 0.25 | 0.64 | [0.28, 1.13] | (1329, 52) |
adm_sepsis | 0.18 | 0.21 | 1.25 | [0.52, 2.25] | (1329, 52) |
adm_bronchiolitis | 0.27 | 0.27 | 1.00 | [0.46, 1.78] | (1329, 52) |
rsv_death = rsv_only.groupby('death')
make_table(rsv_death, table_vars=table_vars, replace_dict={False: 'Survived', True: 'Died'})
Survived | Died | OR | Interval | N | |
---|---|---|---|---|---|
male | 0.60 | 0.43 | 0.51 | [0.05, 2.6] | (1390, 7) |
under 2 months | 0.20 | 0.43 | 2.97 | [0.24, 15.23] | (1390, 7) |
2-11 months | 0.59 | 0.29 | 0.28 | [-0.03, 1.14] | (1390, 7) |
12-23 months | 0.12 | 0.14 | 1.24 | [-0.76, 5.04] | (1390, 7) |
Jordanian | 0.90 | 0.71 | 0.25 | [-1.94, 2.86] | (1390, 7) |
Palestinian | 0.07 | 0.29 | 5.53 | [-0.69, 22.24] | (1390, 7) |
vitamin D < 20 | 0.61 | 0.71 | 1.44 | [-13.49, 15.56] | (1186, 7) |
vitamin D < 11 | 0.45 | 0.43 | 0.92 | [0.07, 5.0] | (1186, 7) |
prev_cond | 0.07 | 0.29 | 5.16 | [-0.53, 21.8] | (1390, 7) |
heart_hx | 0.03 | 0.00 | NaN | NaN | NaN |
breastfed | 0.63 | 0.57 | 0.75 | [0.11, 5.65] | (1390, 7) |
premature | 0.13 | 0.29 | 2.73 | [-0.36, 11.4] | (1390, 7) |
adm_pneumo | 0.16 | 0.57 | 6.97 | [1.01, 53.4] | (1390, 7) |
adm_bronchopneumo | 0.34 | 0.00 | NaN | NaN | NaN |
adm_sepsis | 0.18 | 0.43 | 3.52 | [0.3, 19.53] | (1390, 7) |
adm_bronchiolitis | 0.27 | 0.00 | NaN | NaN | NaN |
rsv_only.premature.value_counts()
0 1218 1 179 dtype: int64
Number of vitamin D records
hospitalized.hospitalized_vitamin_d.notnull().sum()
2689
Median vitaimin D
hospitalized.hospitalized_vitamin_d.median()
16.52601
Proportion of subjects with vitamin D < 20
(hospitalized.hospitalized_vitamin_d<20).mean()
0.4925844114862733
Create subset of RSV patients
rsv_only = hospitalized[hospitalized.pcr_result___1==1]
Number of RSV subjects
rsv_only.hospitalized_vitamin_d.notnull().sum()
1193
Median vitamin D of RSV subset
rsv_only.hospitalized_vitamin_d.median()
14.3
Proportion of RSV subjects with vitamin D < 20
(rsv_only.hospitalized_vitamin_d<20).mean()
0.51968503937007871
non_rsv = hospitalized[hospitalized.pcr_result___1==0]
This model estimates the difference in having vitamin D less than 20 for RSV-only versus non-RSV.
from pymc import Model, Beta, Deterministic, Binomial
n = non_rsv.shape[0]
n_rsv = rsv_only.shape[0]
with Model() as model:
p = Beta('p', 1, 1)
p_rsv = Beta('p_rsv', 1, 1)
p_diff = Deterministic('p_diff', p_rsv - p)
p_less = Deterministic('p_less', p_rsv < p)
y = Binomial('y', n=n, p=p, observed=(non_rsv.hospitalized_vitamin_d<20).sum())
y_rsv = Binomial('y_rsv', n=n_rsv, p=p_rsv, observed=(rsv_only.hospitalized_vitamin_d<20).sum())
with model:
step = NUTS()
trace = sample(2000, step)
[-----------------100%-----------------] 2000 of 2000 complete in 1.7 sec
_ = traceplot(trace, vars=[p, p_rsv])
The difference is positive, with an expected value of approximately 5% higher for the RSV subset.
_ = traceplot(trace, vars=[p_diff])
summary(trace, vars=[p_diff, p_less])
p_diff: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- 0.048 0.018 0.000 [0.014, 0.084] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| 0.012 0.037 0.048 0.059 0.084 p_less: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- 0.004 0.059 0.001 [0.000, 0.000] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| 0.000 0.000 0.000 0.000 0.000
Number and proportion died:
hospitalized.death.value_counts().rename({False: 'Survived', True: 'Died'})
Survived 3138 Died 31 dtype: int64
hospitalized.death.mean()
0.0097822656989586618
groupby_death = hospitalized.groupby('death')
Median z-score
groupby_death['z_score'].median().rename({False: 'Survived', True: 'Died'})
death Survived 0.01 Died -2.53 Name: z_score, dtype: float64
Gestational age
groupby_death['gest_age'].median().rename({False: 'Survived', True: 'Died'})
death Survived 40 Died 40 Name: gest_age, dtype: int64
Vitamin D
groupby_death['hospitalized_vitamin_d'].median().rename({False: 'Survived', True: 'Died'})
death Survived 16.5 Died 20.4 Name: hospitalized_vitamin_d, dtype: float64
Previous conditions
groupby_death[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].mean().rename({False: 'Survived', True: 'Died'})
diabetes_hx | heart_hx | kidney_hx | downs_hx | sickle_hx | cancer_hx | cf_hx | genetic_hx | cp_hx | seizure_hx | neuro_hx | mr_hx | asthma_hx | immuno_hx | liver_hx | gerd_hx | diarrhea_hx | other_hx | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
death | ||||||||||||||||||
Survived | 0.001912 | 0.045252 | 0.003824 | 0.012747 | 0.000319 | 0.000319 | 0.000637 | 0.005736 | 0.003824 | 0.003505 | 0.008604 | 0.000637 | 0.007967 | 0.000319 | 0.001593 | 0.000637 | 0 | 0.019758 |
Died | 0.000000 | 0.129032 | 0.032258 | 0.064516 | 0.000000 | 0.000000 | 0.000000 | 0.096774 | 0.064516 | 0.032258 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | 0.096774 |
Organism positive culture
groupby_death[pcr_lookup.keys()].mean().rename(columns=pcr_lookup)
flu B | rhino | PIV1 | PIV2 | RSV | HMPV | flu A | PIV3 | Adeno | Swine H1 | flu C | H3N2 | Swine | H1N1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
death | ||||||||||||||
False | 0.009242 | 0.390376 | 0.010835 | 0.004143 | 0.442957 | 0.086361 | 0.022945 | 0.04079 | 0.150414 | 0.010835 | 0.006055 | 0.009879 | 0.010198 | 0 |
True | 0.000000 | 0.419355 | 0.000000 | 0.000000 | 0.225806 | 0.064516 | 0.129032 | 0.00000 | 0.096774 | 0.032258 | 0.000000 | 0.064516 | 0.032258 | 0 |
direct to icu versus during hospitalization?? compare ourcome such as survival, days on the ventilator, duration of hospitalization and age group as well as z score
any predictors on those who were first admitted to the floor and then transferred, namely should they have been sent there in the first place??
what is the difference in the age, sex and diagnosis of the two groups and in comparison with the total group
what about the blood culture positivity in the two groups and versus the whole cohort
duration of hospitalization in the two groups and the whole cohort
comparison of these with the whole cohort is needed in order to determine the following from our study
hospitalized[hospitalized.icu_any==1]['adm_pneumo'].sum()
82
hospitalized[hospitalized.icu_any==1]['adm_bronchopneumo'].sum()
66
hospitalized[hospitalized.icu_any==1]['adm_bronchiolitis'].sum()
37
rsv_only.groupby('oxygen')['hospitalized_vitamin_d'].hist()
oxygen 0 Axes(0.125,0.125;0.775x0.775) 1 Axes(0.125,0.125;0.775x0.775) Name: hospitalized_vitamin_d, dtype: object
rsv_only.groupby('oxygen')['hospitalized_vitamin_d'].mean()
oxygen 0 16.528299 1 14.086796 Name: hospitalized_vitamin_d, dtype: float64
rsv_only['los5'] = (rsv_only.length_of_stay>=5).astype(int)
rsv_only.groupby('los5')['hospitalized_vitamin_d'].mean()
los5 0 16.505338 1 14.688211 Name: hospitalized_vitamin_d, dtype: float64
rsv_only.groupby('adm_pneumo')['hospitalized_vitamin_d'].mean()
adm_pneumo 0 16.250651 1 11.975369 Name: hospitalized_vitamin_d, dtype: float64
The following calculates the estimated median difference between the means of each group, along with a 95% bootstrap confidence interval. These can be interpreted as the difference in vitamin D levels between the two groups, and it is robust to the fact that vitamin D levels are not normally distributed within groups.
def bootstrap_difference(df, var, group, alpha=0.05, samples=10000):
grouped_var = df.groupby(group)[var]
diffs = [np.diff(grouped_var.apply(lambda x: x.iloc[np.random.randint(len(x), size=len(x))].mean())) for i in range(samples)]
interval = np.percentile(diffs, [100*(alpha/2.), 100*(1. - alpha/2.)])
return np.round(np.median(diffs), 2), np.round(interval, 2).tolist()
bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_pneumo')
(-4.2800000000000002, [-6.03, -2.51])
bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_bronchiolitis')
(0.059999999999999998, [-1.58, 1.73])
bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_bronchopneumo')
(4.8499999999999996, [3.38, 6.32])
bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_sepsis')
(-4.6399999999999997, [-6.37, -2.89])
rsv_only.groupby('adm_sepsis')['hospitalized_vitamin_d'].mean()
adm_sepsis 0 16.332006 1 11.701480 Name: hospitalized_vitamin_d, dtype: float64
bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'oxygen')
(-2.4500000000000002, [-3.88, -1.0])
bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'los5')
(-1.8100000000000001, [-3.24, -0.37])
Thus, none of the intervals above include zero, making it unlikely that the underlying populations are the same.
Predicting RSV infection from diagnoses
# Create indicator for LRTI
hospitalized['ltri'] = (hospitalized.adm_bronchopneumo + hospitalized.adm_bronchiolitis + hospitalized.adm_pneumo +
(hospitalized.wheezing>0) + (hospitalized.flaring>1)).astype(bool)
hospitalized.ltri.mean()
0.71252761123382768
Proportion with LTRI by virus
for p in pcr_lookup:
print('{0}:\t{1}'.format(pcr_lookup[p], np.round(hospitalized[hospitalized[p]==1].ltri.mean(), 2)))
flu B: 0.72 rhino: 0.7 PIV1: 0.82 PIV2: 0.69 RSV: 0.86 HMPV: 0.89 flu A: 0.7 PIV3: 0.68 Adeno: 0.74 Swine H1: 0.77 flu C: 0.84 H3N2: 0.61 Swine: 0.73 H1N1: nan