In [1]:

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly

pd.__version__

Out[1]:

'0.10.0-5065-gfb38fb6'

Correlate RSV severity with vitamin D levels (at hospital)

severity = oxygen, mech. vent, ICU, length of stay, death

Covariates:

Prematurity
low vitamin D
Congenital heart disease
any previous medical condition
Smoke exposure
And maternal smoking during pregnancy.
<10 weeks of age at time of season (or we can just say younger age if this is better)
Daycare
Siblings school age
Low birth weight
Male sex
high viral load (low Ct number)
young maternal age

Set flag for extracting the RSV-positive subset

In [2]:

RSV_only = False

In [3]:

hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()

/usr/local/lib/python2.7/site-packages/pandas/io/parsers.py:1130: DtypeWarning: Columns (140,142,144,146,148,181,206,207,212,213,261,280,281,282,296,297) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

Out[3]:

	greater_48hrs	fever_neutropenia	never_left	written_consent	child_name	mother_name	mother_birth_date	mother_record	mother_nationality	other_mother_nationality	...	date	whole_blood_complete	age_months	length_of_stay	gest_age	death	hospitalized_vitamin_d	wheezing_ind	sex	z_score
case_id
A0001	0	0	0	1	Remas Mahmoud Jbarah	Huda Katalo	1976-01-21	NaN	3	NaN	...	NaN	0	1	6	40	False	3	0	F	0.68
A0002	0	0	0	1	Majed Abdel Kareem Majed	Noor SHa'aban Mahmood	1989-09-09	NaN	3	NaN	...	NaN	0	1	5	40	False	4	1	M	-0.64
A0003	0	0	0	1	Rayyan Jamal Muhyi Al.Deen	SAra Hussein Muhyi Al.Deen	1965-01-01	NaN	1	NaN	...	NaN	0	11	10	40	False	35	0	F	-7.59
A0004	0	0	0	1	Hanan Mohd Mustapha Abu Othman	Kawla Abu Shanab	1983-10-31	NaN	1	NaN	...	NaN	0	7	3	38	False	2	1	F	-0.84
A0005	0	0	0	1	Yara Mahmoud Azmi Ismael	Suha Abdel Aziz	1986-02-28	NaN	1	NaN	...	NaN	0	2	1	39	False	6	0	F	-1.68

5 rows × 412 columns

In [4]:

hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date)
hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date)
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date)

In [5]:

hospitalized["prev_cond"] = hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1)

In [6]:

[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]

Out[6]:

['diabetes_hx',
 'heart_hx',
 'kidney_hx',
 'downs_hx',
 'sickle_hx',
 'cancer_hx',
 'cf_hx',
 'genetic_hx',
 'cp_hx',
 'seizure_hx',
 'neuro_hx',
 'mr_hx',
 'asthma_hx',
 'immuno_hx',
 'liver_hx',
 'gerd_hx',
 'diarrhea_hx',
 'other_hx',
 'spec_hx']

Plot z-scores by year of admission

In [7]:

hospitalized['year_admission'] = hospitalized.admission_date.apply(lambda x: x.year)

In [8]:

hospitalized.groupby('year_admission')['z_score'].hist(alpha=0.2, bins=20)

Out[8]:

year_admission
2010              Axes(0.125,0.125;0.775x0.775)
2011              Axes(0.125,0.125;0.775x0.775)
2012              Axes(0.125,0.125;0.775x0.775)
2013              Axes(0.125,0.125;0.775x0.775)
Name: z_score, dtype: object

Calculate age in months from birth and enrollment date:

In [9]:

from calendar import monthrange
from datetime import timedelta

def monthdelta(d1, d2):
    delta = 0
    while True:
        mdays = monthrange(d1.year, d1.month)[1]
        d1 += timedelta(days=mdays)
        if d1 <= d2:
            delta += 1
        else:
            break
    return delta

In [10]:

if RSV_only:
    hospitalized = hospitalized[hospitalized.pcr_result___1==1]

Covariates¶

Either ICU or direct admission to ICU

In [11]:

hospitalized['icu_any'] = (hospitalized.icu + hospitalized.dir_icu).astype(bool)

Severity score

In [12]:

hospitalized.flaring.value_counts()

Out[12]:

0    1879
1    1009
2     264
3      17
dtype: int64

In [13]:

hospitalized.cyanosis[hospitalized.cyanosis==4] = np.nan
hospitalized.cyanosis.value_counts()

Out[13]:

0    2549
1     558
2      51
3       8
dtype: int64

In [14]:

hospitalized.wheezing[hospitalized.wheezing==4] = np.nan
hospitalized.wheezing.value_counts()

Out[14]:

1    1475
0    1412
2     270
3      10
dtype: int64

In [15]:

hospitalized.respiratory_rate = hospitalized.respiratory_rate.replace({'-': np.nan}).astype(float)

In [16]:

hospitalized['respiratory_class'] = 0
hospitalized.respiratory_class[(hospitalized.respiratory_rate>30) & (hospitalized.respiratory_rate<=45)] = 1
hospitalized.respiratory_class[(hospitalized.respiratory_rate>45) & (hospitalized.respiratory_rate<=60)] = 2
hospitalized.respiratory_class[hospitalized.respiratory_rate>60] = 3
hospitalized.respiratory_class.value_counts()

Out[16]:

1    1648
0     764
2     636
3     121
dtype: int64

In [17]:

hospitalized['sats_class'] = 0
hospitalized.sats_class[(hospitalized.sats_number>=90) & (hospitalized.sats_number<95)] = 1
hospitalized.sats_class[(hospitalized.sats_number>=85) & (hospitalized.sats_number<90)] = 2
hospitalized.sats_class[hospitalized.sats_number<85] = 3
hospitalized.sats_class.value_counts()

Out[17]:

0    3020
1     131
2      15
3       3
dtype: int64

In [18]:

hospitalized['severity_score'] = hospitalized[['flaring', 'cyanosis', 'wheezing', 'respiratory_class', 'sats_class']].sum(1)

In [19]:

hospitalized.severity_score.hist()

Out[19]:

<matplotlib.axes._subplots.AxesSubplot at 0x106303c50>

In [20]:

hospitalized.icu_any[hospitalized.sats_number<85]

Out[20]:

case_id
A0226      False
C2028      False
D3055      False
Name: icu_any, dtype: bool

Month of admission

In [21]:

hospitalized['admission_month'] = hospitalized.admission_date.apply(lambda x: x.month)

In [22]:

covariates = hospitalized[["daycare", 
                           "hospitalized_vitamin_d", 
                           "gest_age", 
                           "heart_hx",
                           "prev_cond",
                           "cigarette_smokers",
                           "cigarette_preg",
                           "sex_child",
                           "birth_wt_child",
                           "breastfed",
                           "severity_score",
                           "z_score",
                           "age_months",
                           "wheezing",
                           "admission_month"
                           ]]

In [23]:

covariates.birth_wt_child.hist(bins=np.sqrt(len(covariates)))

Out[23]:

<matplotlib.axes._subplots.AxesSubplot at 0x1062b3f10>

In [24]:

covariates.hospitalized_vitamin_d.hist(bins=np.sqrt(len(covariates)))

Out[24]:

<matplotlib.axes._subplots.AxesSubplot at 0x106d84d50>

In [25]:

covariates.head()

Out[25]:

	daycare	hospitalized_vitamin_d	gest_age	heart_hx	prev_cond	cigarette_smokers	cigarette_preg	sex_child	birth_wt_child	breastfed	severity_score	z_score	age_months	wheezing	admission_month
case_id
A0001	0	3	40	0	0	0	0	0	2.95	1	6	0.68	1	0	3
A0002	0	4	40	0	0	1	0	1	3.00	1	4	-0.64	1	1	3
A0003	0	35	40	0	1	0	0	0	2.00	0	3	-7.59	11	0	3
A0004	0	2	38	0	0	1	0	0	2.70	1	2	-0.84	7	1	3
A0005	0	6	39	0	0	1	0	0	3.00	1	2	-1.68	2	0	3

In [26]:

covariates.daycare.mean()

Out[26]:

0.015782828282828284

In [27]:

covariates.heart_hx.mean()

Out[27]:

0.046071315872514992

In [28]:

covariates.cigarette_preg.mean()

Out[28]:

0.076364783843483747

In [29]:

#axes = pd.scatter_matrix(covariates, figsize=(16,12))

In [30]:

severity = hospitalized[["oxygen", "days_oxygen", 
                         "vent", "days_vent", 
                         "icu_any", 
                         "length_of_stay",
                         "death"]]

Some measures of severity are rare

In [31]:

severity.icu_any.mean()

Out[31]:

0.098453770905648469

In [32]:

severity.oxygen.mean()

Out[32]:

0.32281708094327599

In [33]:

severity.vent.mean()

Out[33]:

0.035384124960153016

Histogram of days on oxygen

In [34]:

severity.days_oxygen.hist(bins=25)

Out[34]:

<matplotlib.axes._subplots.AxesSubplot at 0x106e29090>

In [35]:

severity.death.mean()

Out[35]:

0.0097822656989586618

Attributes of non-survivors

In [36]:

hospitalized.age_months[hospitalized.death].hist(bins=25)

Out[36]:

<matplotlib.axes._subplots.AxesSubplot at 0x106eabed0>

In [37]:

hospitalized.length_of_stay[hospitalized.death].hist(bins=15)

Out[37]:

<matplotlib.axes._subplots.AxesSubplot at 0x109eea4d0>

In [38]:

hospitalized.gest_age[hospitalized.death].hist(bins=25)

Out[38]:

<matplotlib.axes._subplots.AxesSubplot at 0x109fd5150>

In [39]:

(hospitalized.gest_age<37).mean()

Out[39]:

0.14200063111391606

In [40]:

(hospitalized[hospitalized.death].gest_age<37).mean()

Out[40]:

0.16129032258064516

Proportion positive blood culture

In [41]:

hospitalized.blood_neg[hospitalized.blood_neg<2].mean()

Out[41]:

0.12827225130890052

In [42]:

hospitalized.blood_neg[hospitalized.blood_neg<2][hospitalized.death].mean()

Out[42]:

0.23076923076923078

In [43]:

hospitalized.blood_pneumo.mean()

Out[43]:

0.021276595744680851

In [44]:

hospitalized.blood_pneumo[hospitalized.death].mean()

Out[44]:

0.0

Histogram of days on ventilation

In [45]:

severity.days_vent.hist(bins=15)

Out[45]:

<matplotlib.axes._subplots.AxesSubplot at 0x10a09e210>

In [46]:

hospitalized[hospitalized.death].vent.median()

Out[46]:

0.0

In [47]:

hospitalized.death[hospitalized.vent.astype(bool)].mean()

Out[47]:

0.055944055944055944

In [48]:

outcome = 'oxygen'
complete = (covariates.isnull().sum(axis=1).astype(bool)==False) & (severity[outcome].isnull().astype(bool)==False)
covariates_complete = covariates[complete]
y_complete = severity[outcome][complete]

In [49]:

variables = pd.concat([covariates, severity], axis=1)

Drop rows not worth imputing

In [50]:

variables[['cigarette_smokers', 'birth_wt_child', 
                                     'oxygen', 'length_of_stay', 'hospitalized_vitamin_d', 'breastfed']].isnull().sum()

Out[50]:

cigarette_smokers           1
birth_wt_child              2
oxygen                     31
length_of_stay             29
hospitalized_vitamin_d    480
breastfed                   0
dtype: int64

In [51]:

variables = variables.dropna(subset=['cigarette_smokers', 'birth_wt_child', 
                                     'oxygen', 'length_of_stay', 'hospitalized_vitamin_d', 'breastfed', 'z_score'])

Standardize vitamin D, age in months, and birth weight variables

In [52]:

variables['vitamin_d_norm'] = ((variables.hospitalized_vitamin_d - variables.hospitalized_vitamin_d.mean()) 
                               / variables.hospitalized_vitamin_d.std())

variables['wt_norm'] = ((variables.birth_wt_child - variables.birth_wt_child.mean()) /
                        variables.birth_wt_child.std())

variables['age_norm'] = ((variables.age_months - variables.age_months.mean()) /
                        variables.age_months.std())

Convert gestational age to number of weeks less than 37.

In [53]:

variables['premature'] = np.maximum(0, 37 - variables.gest_age)

In [54]:

variables.premature.hist(bins=20, grid=False)

Out[54]:

<matplotlib.axes._subplots.AxesSubplot at 0x109f54bd0>

Extract usable subset of covariates and response

In [55]:

variables_glm = variables[
    ['vitamin_d_norm', 'premature', 'cigarette_smokers', 'sex_child', 
     'wt_norm', 'oxygen', 'breastfed', 'z_score', 'age_norm']]

Confirm no nulls

In [56]:

variables_glm.isnull().sum(0)

Out[56]:

vitamin_d_norm       0
premature            0
cigarette_smokers    0
sex_child            0
wt_norm              0
oxygen               0
breastfed            0
z_score              0
age_norm             0
dtype: int64

In [57]:

data_glm = variables_glm.to_dict(outtype='list')

In [99]:

from pymc import Model, glm
import theano.tensor as t
from pymc import sample, Slice, NUTS
from pymc import forestplot, traceplot, summary

def tinvlogit(x):
    return t.exp(x) / (1 + t.exp(x))

In [254]:

with Model() as interactive_model:
    
    formula = 'oxygen ~ (vitamin_d_norm + premature + cigarette_smokers + sex_child '
    formula += '+ breastfed + z_score + age_norm) ** 2'
    
    glm.glm(formula, data_glm,
            family=glm.families.Binomial(link=glm.links.Logit))

In [255]:

with interactive_model:
    trace_int = sample(2000, NUTS(interactive_model.vars), progressbar=False) 

In [256]:

#_ = traceplot(trace_int)

Forest plot of parameter estimates. Thick line is the interquartile range of the estimate, thin line the 95% posterior interval. Positive values mean an increase in the probability of ending up on oxygen when the value of that parameter increasees by one unit.

So, for example, the z-score main effect is negative, so it decreases the probability of oxygen when it increases (which makes sense). The interaction of z-score with normalized age (bottom of plot) is negative, meaning that individuals that are older with higher z-scores even further reduce the probability of being on oxygen (non-additive effect).

In [257]:

plt.figure(figsize=(14,8))
forestplot(trace_int, vars=trace_int.varnames[1:])

Out[257]:

<matplotlib.gridspec.GridSpec at 0x14d355410>

In [258]:

variables_rsv = variables.ix[hospitalized.pcr_result___1==1,
    ['vitamin_d_norm', 'premature', 'cigarette_smokers', 'sex_child', 'wt_norm', 
     'oxygen', 'breastfed', 'z_score', 'age_norm']].to_dict(outtype='list')

In [259]:

with Model() as rsv_model:
    
    formula = 'oxygen ~ (vitamin_d_norm + premature + cigarette_smokers + sex_child + breastfed + z_score + age_norm) ** 2'
    
    glm.glm(formula, variables_rsv,
            family=glm.families.Binomial(link=glm.links.Logit))

In [259]:

with rsv_model:
    trace_rsv = sample(2000, NUTS(), progressbar=False) 

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-260-7051395c008c> in <module>()
      1 with rsv_model:
----> 2     trace_rsv = sample(2000, NUTS(), progressbar=False)

/Users/fonnescj/GitHub/pymc/pymc/step_methods/nuts.pyc in __init__(self, vars, scaling, step_scale, is_cov, state, Emax, target_accept, gamma, k, t0, model)
     57 
     58         if isinstance(scaling, dict):
---> 59             scaling = guess_scaling(Point(scaling, model=model), model=model, vars = vars)
     60 
     61 

/Users/fonnescj/GitHub/pymc/pymc/tuning/scaling.pyc in guess_scaling(point, vars, model)
     77 def guess_scaling(point, vars=None, model=None):
     78     model = modelcontext(model)
---> 79     h = find_hessian_diag(point, vars, model=model)
     80     return adjust_scaling(h)
     81 

/Users/fonnescj/GitHub/pymc/pymc/tuning/scaling.pyc in find_hessian_diag(point, vars, model)
     72     """
     73     model = modelcontext(model)
---> 74     H = model.fastfn(hessian_diag(model.logpt, vars))
     75     return H(Point(point, model=model))
     76 

/Users/fonnescj/GitHub/pymc/pymc/memoize.pyc in memoizer(*args, **kwargs)
     12 
     13         if key not in cache:
---> 14             cache[key] = obj(*args, **kwargs)
     15 
     16         return cache[key]

/Users/fonnescj/GitHub/pymc/pymc/theanof.pyc in hessian_diag(f, vars)
     92         vars = cont_inputs(f)
     93 
---> 94     return -t.concatenate([hessian_diag1(f, v) for v in vars], axis=0)
     95 
     96 

/Users/fonnescj/GitHub/pymc/pymc/theanof.pyc in hessian_diag1(f, v)
     78 def hessian_diag1(f, v):
     79 
---> 80     g = gradient1(f, v)
     81     idx = t.arange(g.shape[0])
     82 

/Users/fonnescj/GitHub/pymc/pymc/theanof.pyc in gradient1(f, v)
     41 def gradient1(f, v):
     42     """flat gradient of f wrt v"""
---> 43     return t.flatten(t.grad(f, v, disconnected_inputs='warn'))
     44 
     45 

/usr/local/lib/python2.7/site-packages/theano/gradient.pyc in grad(cost, wrt, consider_constant, disconnected_inputs, add_names, known_grads, return_disconnected)
    527 
    528     rval = _populate_grad_dict(var_to_app_to_idx,
--> 529             grad_dict, wrt, cost_name)
    530 
    531     for i in xrange(len(rval)):

/usr/local/lib/python2.7/site-packages/theano/gradient.pyc in _populate_grad_dict(var_to_app_to_idx, grad_dict, wrt, cost_name)
   1205         return grad_dict[var]
   1206 
-> 1207     rval = [access_grad_cache(elem) for elem in wrt]
   1208 
   1209     return rval

/usr/local/lib/python2.7/site-packages/theano/gradient.pyc in access_grad_cache(var)
   1165                     for idx in node_to_idx[node]:
   1166 
-> 1167                         term = access_term_cache(node)[idx]
   1168 
   1169                         if not isinstance(term, gof.Variable):

/usr/local/lib/python2.7/site-packages/theano/gradient.pyc in access_term_cache(node)
   1026                                 str(g_shape))
   1027 
-> 1028                 input_grads = node.op.grad(inputs, new_output_grads)
   1029 
   1030                 if input_grads is None:

/usr/local/lib/python2.7/site-packages/theano/tensor/opt.pyc in grad(self, inputs, output_gradients)
    628         grads = []
    629         for i, inp in enumerate(inputs):
--> 630             grads.append(output_gradients[0][i])
    631         return grads
    632 

/usr/local/lib/python2.7/site-packages/theano/tensor/var.pyc in __getitem__(self, args)
    408                 return theano.tensor.subtensor.Subtensor(args)(
    409                     self, *theano.tensor.subtensor.Subtensor.collapse(args,
--> 410                     lambda entry: isinstance(entry, Variable)))
    411 
    412     def take(self, indices, axis=None, mode='raise'):

/usr/local/lib/python2.7/site-packages/theano/gof/op.pyc in __call__(self, *inputs, **kwargs)
    458                 # compute output value once with test inputs to validate graph
    459                 thunk = node.op.make_thunk(node, storage_map, compute_map,
--> 460                         no_recycling=[])
    461                 thunk.inputs = [storage_map[v] for v in node.inputs]
    462                 thunk.outputs = [storage_map[v] for v in node.outputs]

/usr/local/lib/python2.7/site-packages/theano/gof/op.pyc in make_thunk(self, node, storage_map, compute_map, no_recycling)
    606         if self._op_use_c_code:
    607             try:
--> 608                 e = FunctionGraph(node.inputs, node.outputs)
    609 
    610                 e_no_recycling = [new_o

/usr/local/lib/python2.7/site-packages/theano/gof/fg.pyc in __init__(self, inputs, outputs, features, clone)
     91         """
     92         if clone:
---> 93             inputs, outputs = graph.clone(inputs, outputs)
     94 
     95         self.execute_callbacks_time = 0

/usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in clone(i, o, copy_inputs)
    659     Returns the inputs and outputs of that copy.
    660     """
--> 661     equiv = clone_get_equiv(i, o, copy_inputs)
    662     return [equiv[input] for input in i], [equiv[output] for output in o]
    663 

/usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in clone_get_equiv(inputs, outputs, copy_inputs_and_orphans, memo)
    703 
    704     # go through the inputs -> outputs graph cloning as we go
--> 705     for apply in io_toposort(inputs, outputs):
    706         for input in apply.inputs:
    707             if input not in memo:

/usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in io_toposort(inputs, outputs, orderings)
    814         return rval
    815 
--> 816     topo = general_toposort(outputs, deps)
    817     return [o for o in topo if isinstance(o, Apply)]
    818 

/usr/local/lib/python2.7/site-packages/theano/gof/graph.pyc in general_toposort(r_out, deps, debug_print)
    767             for client in clients.get(node, []):
    768                 deps_cache[client] = [a for a in deps_cache[client] if a is not node]
--> 769                 if not deps_cache[client]:
    770                     sources.append(client)
    771 

KeyboardInterrupt:

Traceback (most recent call last):

  File "/usr/local/lib/python2.7/site-packages/IPython/kernel/zmq/ipkernel.py", line 416, in execute_request
    shell.run_cell(code, store_history=store_history, silent=silent)

  File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2754, in run_cell
    self.events.trigger('post_execute')

  File "/usr/local/lib/python2.7/site-packages/IPython/core/events.py", line 82, in trigger
    func(*args, **kwargs)

  File "/usr/local/lib/python2.7/site-packages/IPython/kernel/zmq/pylab/backend_inline.py", line 118, in flush_figures
    return show(True)

  File "/usr/local/lib/python2.7/site-packages/IPython/kernel/zmq/pylab/backend_inline.py", line 47, in show
    matplotlib.pyplot.close('all')

  File "/usr/local/lib/python2.7/site-packages/matplotlib/pyplot.py", line 517, in close
    _pylab_helpers.Gcf.destroy_all()

  File "/usr/local/lib/python2.7/site-packages/matplotlib/_pylab_helpers.py", line 93, in destroy_all
    gc.collect()

KeyboardInterrupt

In [ ]:

plt.figure(figsize=(14,8))
forestplot(trace_rsv, vars=trace_rsv.varnames[1:])

Additional analyses¶

Distribution of smoking exposure

In [58]:

hospitalized.cigarette_smokers.hist()

Out[58]:

<matplotlib.axes._subplots.AxesSubplot at 0x10a227190>

Proportion and of children exposed to cigarette smoking

In [59]:

(hospitalized.cigarette_smokers > 0).mean()

Out[59]:

0.72578100347112651

In [60]:

(hospitalized.cigarette_smokers > 0).sum()

Out[60]:

Distribution of nargila exposure

In [61]:

hospitalized.nargila_smokers.hist()

Out[61]:

<matplotlib.axes._subplots.AxesSubplot at 0x10a4617d0>

Proportion and number of children exposed to nargila

In [62]:

(hospitalized.nargila_smokers > 0).mean()

Out[62]:

0.17828968128747238

In [63]:

(hospitalized.nargila_smokers > 0).sum()

Out[63]:

Proportion of children with mothers who smoked during pregnancy

In [64]:

hospitalized.cigarette_preg.mean()

Out[64]:

0.076364783843483747

In [65]:

hospitalized.nargila_preg.mean()

Out[65]:

0.028400126222783213

Days of symptoms before admission by gender

In [66]:

days_by_sex = hospitalized.groupby('sex_child')
_ = days_by_sex.boxplot(column='days_symptoms')

/usr/local/lib/python2.7/site-packages/pandas/tools/plotting.py:2421: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.
  warnings.warn(msg, FutureWarning)

Vitamin D levels by breastfeeding status

TODO: clean up plot for presentation

In [67]:

hospitalized.breastfed = hospitalized.breastfed.replace({0: 'Not breastfed', 1: 'Breastfed'})
# _ = hospitalized.groupby('breastfed').boxplot(column='hospitalized_vitamin_d', grid=False, fontsize=0)
# plt.gca().set_ylabel('Vitamin D levels')
for index, group in hospitalized.groupby(['breastfed']):
    group.boxplot(column='hospitalized_vitamin_d')

In [68]:

_ = hospitalized.groupby('breastfed')['oxygen'].hist(alpha=0.3)

RSV count by oxygen status

In [69]:

_ = hospitalized.groupby('oxygen').boxplot(column='rsv_count')

In [70]:

rsv_only = hospitalized[hospitalized.pcr_result___1==1]
rsv_only['vitd_lt_20'] = rsv_only.hospitalized_vitamin_d<20
rsv_only.groupby('vitd_lt_20')['oxygen'].describe()

Out[70]:

vitd_lt_20       
False       count    663.000000
            mean       0.380090
            std        0.485775
            min        0.000000
            25%        0.000000
            50%        0.000000
            75%        1.000000
            max        1.000000
True        count    719.000000
            mean       0.436718
            std        0.496324
            min        0.000000
            25%        0.000000
            50%        0.000000
            75%        1.000000
            max        1.000000
dtype: float64

Proportion on oxygen and vitamin D, by month

In [71]:

groupby_month = hospitalized.groupby('admission_month')
ax = groupby_month['oxygen'].mean().plot(grid=False, color='c', figsize=(14,8))
groupby_month['hospitalized_vitamin_d'].mean().plot(secondary_y=True, color='r')
ax.set_ylabel('Proportion on oxygen', color='c')
ax.right_ax.set_ylabel('Mean vitamin D', color='r')
ax.set_xlabel('Admission month')
ax.set_xticklabels(['Jan','Feb','Apr','Jun','Aug','Oct','Dec'])
#ax.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
_ = plt.xlim(1,12)

In [ ]:

ply = plotly.plotly.sign_in('jordan_ari', 'ixur4zqdut')

In [ ]:

c = ['#1f77b4', # muted blue
    '#ff7f0e', # safety orange
    '#2ca02c' # cooked asparagus green
    ]

months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

groupby_month = hospitalized.groupby('admission_month')

data = [{'y': list(groupby_month['oxygen'].mean()), 
         'x': months,
         'name': 'oxygen'},
        {'y': list(groupby_month['hospitalized_vitamin_d'].mean()), 
         'x': months,
         'name': 'vitamin D',
         'yaxis': 'y2'},
        {'y': list(groupby_month['pcr_result___1'].mean()), 
         'x': months,
         'name': 'percent RSV',
         'yaxis': 'y3'},
#         {'y': list(groupby_month['pcr_result___5'].mean()), 
#          'x': months,
#          'name': 'percent rhino',
#          'yaxis': 'y4'},
        
        ]

layout = {
    'showlegend': False,
    "xaxis":{
        "domain":[0.2, 0.9]
    },

   "yaxis":{
      "title": 'Proportion on oxygen',
      "position": 0.1,
"range":[0,1],
      "titlefont":{
         "color":c[0]
      },
      "tickfont":{
         "color":c[0]
      },
   },

   "yaxis2":{     
      "overlaying":"y",
      "side":"right",
      "anchor":"x",
      
      "title": 'Mean vitamin D',
      "titlefont":{
         "color":c[1]
      },
      "tickfont":{
         "color":c[1]
      },      
   },

   "yaxis3":{     
      "overlaying":"y",
      "side":"left",
      "anchor":"free",
      "position": -0.15,
"range":[0,1],

      "title": "Proportion RSV",
      "titlefont":{
         "color":c[2]
      },
      "tickfont":{
         "color":c[2]
      },      
   },

#    "yaxis4":{     
#       "overlaying":"y",
#       "side":"left",
#       "anchor":"free",
#       "position": -0.3,
# "range":[0,1],

#       "title": "Proportion rhino",
#       "titlefont":{
#          "color":"#969696"
#       },
#       "tickfont":{
#          "color":"#969696"
#       },      
#    }

}

#ply.plot(data, layout=layout, width=750,height=500)

Risk factor tables¶

In [72]:

hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under_2_months', 'months_2_11', 'months_12_23'
hospitalized = hospitalized.join(age_groups)

In [73]:

nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 
                      6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)

In [74]:

hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized['vitamin D < 20'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized['vitamin D < 11'][hospitalized.hospitalized_vitamin_d.isnull()] = np.nan

In [75]:

hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)

In [76]:

hospitalized['breastfed'] = (hospitalized.breastfed=='Breastfed').astype(int)

ICU status¶

In [77]:

#hospitalized.replace({'icu_any': {True: 'ICU', False: 'No ICU'}, 
#                      'oxygen': {0: 'No oxygen', 1: 'Oxygen'}})['icu_any']

In [78]:

groupby_icu = hospitalized.groupby('icu_any')

In [79]:

def make_table(groupby, table_vars, replace_dict={}):
    table = np.round(groupby[table_vars].mean(), 2).T
    ratios = [calc_or(groupby, v) for v in table.index]
    table['OR'] = [r[0] for r in ratios]
    table['Interval'] = [r[1] for r in ratios]
    table['N'] = [r[2] for r in ratios]
    table.rename(columns=replace_dict, inplace=True)
    table.columns.name = None
    
    return(table)

In [92]:

table_vars = ['male', 'under_2_months', 'months_2_11', 'months_12_23', 
                'Jordanian', 'Palestinian', 'vitamin D < 20', 'vitamin D < 11', 
                'prev_cond', 'heart_hx', 'breastfed', 'premature', 
                'adm_pneumo', 'adm_bronchopneumo', 'adm_sepsis', 'adm_bronchiolitis']

Function for calculating odds ratios and simulation-based intervals.

In [81]:

se = lambda p, n: np.sqrt(p * (1. - p) / n)

def odds_ratio(x, y, n_sim=10000, alpha=0.05):

    try:
        n_x, n_y = len(x.dropna()), len(y.dropna())
        p_x, p_y = x.mean(), y.mean()

        se_x = se(p_x, n_x)
        se_y = se(p_y, n_y)

        p_x_sim = np.random.normal(p_x, se_x, n_sim)
        p_y_sim = np.random.normal(p_y, se_y, n_sim)

        ratio = ((p_x_sim / (1. - p_x_sim)) /
                    (p_y_sim / (1. - p_y_sim)))

        interval = np.percentile(ratio, [100*(alpha/2.), 100*(1. - alpha/2.)])

        return np.round(np.median(ratio), 2), np.round(interval, 2).tolist(), (n_y, n_x)
    
    except ValueError:
        
        return np.nan, np.nan, np.nan

In [82]:

def calc_or(groupby, var):
    data = list(groupby[var])
    return odds_ratio(data[1][1], data[0][1])

In [83]:

virus_vars = ['RSV', 'Influenza', 'HMPV', 'Rhino']

In [84]:

pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}

In [85]:

hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['Influenza'] = (hospitalized['pcr_result___3'] | hospitalized['pcr_result___4']).astype(int)
hospitalized['HMPV'] = hospitalized['pcr_result___2']
hospitalized['Rhino'] = hospitalized['pcr_result___5']

In [86]:

hospitalized['vitamin_d_norm'] = ((hospitalized.hospitalized_vitamin_d - hospitalized.hospitalized_vitamin_d.mean()) 
                               / hospitalized.hospitalized_vitamin_d.std())

In [87]:

hospitalized['age_centered'] = hospitalized.age_months/hospitalized.age_months.mean()

Severity score model for bronchiolitis¶

In [105]:

bronchiolitis_subset = hospitalized[hospitalized.adm_bronchiolitis==1]
bronchiolitis_subset.shape

Out[105]:

(547, 435)

In [106]:

bronchiolitis_subset.severity_score.hist()

Out[106]:

<matplotlib.axes._subplots.AxesSubplot at 0x1144dd050>

In [ ]:

bronchiolitis_subset = bronchiolitis_subset.dropna(subset=['vitamin_d_norm', 'male'] + virus_vars)

In [102]:

import theano.tensor as T
from pymc import Bound, Normal, Uniform

BoundedNormal = Bound(Normal, 0, 15)

with Model() as bronchiolitis_severity_model:
    
    beta = Normal('beta', 0, 0.1, shape=6)
    mu = BoundedNormal('mu', 4, 0.1)
    
    theta = mu + T.dot(bronchiolitis_subset[['vitamin_d_norm', 'male'] + virus_vars], beta)
    sigma = Uniform('sigma', 0, 4)
    tau = sigma ** -2
    
    score = BoundedNormal('score', theta, tau, observed=bronchiolitis_subset.severity_score)

In [103]:

from pymc import sample
with bronchiolitis_severity_model:
    trace_severity = sample(2000, NUTS(), progressbar=False) 

In [104]:

from pymc import forestplot

In [105]:

_ = forestplot(trace_severity, vars=['beta'], ylabels=['vitamin_d_norm', 'male'] + virus_vars)

In [106]:

posterior_samples = {v: trace_severity.traces[0][v] for v in ['mu', 'beta']}

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-106-9cf9e8fda0d7> in <module>()
----> 1 posterior_samples = {v: trace_severity.traces[0][v] for v in ['mu', 'beta']}

<ipython-input-106-9cf9e8fda0d7> in <dictcomp>((v,))
----> 1 posterior_samples = {v: trace_severity.traces[0][v] for v in ['mu', 'beta']}

AttributeError: 'NpTrace' object has no attribute 'traces'

In [107]:

bronchiolitis_subset.index = range(bronchiolitis_subset.shape[0])

In [110]:

def posterior_check(i, dataset=bronchiolitis_subset, samples=trace_severity):
    
    row = dataset.ix[i]
    
    pred = samples['mu']
    pred += row[['vitamin_d_norm', 'male'] + virus_vars].values.dot(samples['beta'].T)

    plt.hist(pred)
    print(row.severity_score)
    plt.axvline(row.severity_score, color='r')

In [113]:

posterior_check(250)

4.0

Severity score model for RSV¶

In [88]:

rsv_subset = hospitalized[hospitalized.RSV==1]
rsv_subset.shape

Out[88]:

(1397, 435)

In [89]:

rsv_subset.severity_score.hist()

Out[89]:

<matplotlib.axes._subplots.AxesSubplot at 0x10a461290>

In [94]:

rsv_vars = ['vitamin_d_norm'] + table_vars[:]
rsv_vars.pop(rsv_vars.index('vitamin D < 20'))
rsv_vars.pop(rsv_vars.index('vitamin D < 11'))
rsv_vars.pop(rsv_vars.index('Palestinian'))
rsv_vars.pop(rsv_vars.index('Jordanian'))

Out[94]:

'Jordanian'

Drop missing values

In [95]:

rsv_subset[rsv_vars].isnull().sum()

Out[95]:

vitamin_d_norm       204
male                   0
under_2_months         0
months_2_11            0
months_12_23           0
prev_cond              0
heart_hx               0
breastfed              0
premature              0
adm_pneumo             0
adm_bronchopneumo      0
adm_sepsis             0
adm_bronchiolitis      0
dtype: int64

In [96]:

rsv_subset = rsv_subset.dropna(subset=['vitamin_d_norm', 'male'] + virus_vars)

In [185]:

with Model() as rsv_severity_model:
    
    beta = Normal('beta', 0, 0.1, shape=len(rsv_vars))
    mu = BoundedNormal('mu', 4, 0.1)
    
    theta = mu + T.dot(rsv_subset[rsv_vars], beta)
    sigma = Uniform('sigma', 0, 4)
    tau = sigma ** -2
    
    score = BoundedNormal('score', theta, tau, observed=rsv_subset.severity_score)

In [186]:

with rsv_severity_model:
    trace_rsv_severity = sample(2000, NUTS()) 

 [-----------------100%-----------------] 2000 of 2000 complete in 13.0 sec

In [187]:

_ = forestplot(trace_rsv_severity, vars=['beta'], ylabels=rsv_vars)

Oxygen model for RSV subset¶

In [97]:

'+'.join(rsv_vars)

Out[97]:

'vitamin_d_norm+male+under_2_months+months_2_11+months_12_23+prev_cond+heart_hx+breastfed+premature+adm_pneumo+adm_bronchopneumo+adm_sepsis+adm_bronchiolitis'

In [100]:

with Model() as rsv_oxygen_model:
    
    formula = 'oxygen ~ ' + '+'.join(rsv_vars)
    
    glm.glm(formula, rsv_subset,
            family=glm.families.Binomial(link=glm.links.Logit))

In [101]:

with rsv_oxygen_model:
    trace_int = sample(2000, NUTS()) 

/usr/local/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:85: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility
  from scan_perform.scan_perform import *
/usr/local/lib/python2.7/site-packages/theano/tensor/subtensor.py:128: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  start in [None, 0] or

 [-----------------100%-----------------] 2000 of 2000 complete in 37.0 sec

In [104]:

_ = forestplot(trace_int, vars=rsv_vars, ylabels=rsv_vars)

Vitamin D levels for diagnoses with and without RSV¶

In [122]:

bronchiolitis_subset.replace({'RSV': {1: 'RSV', 0: 'No RSV'}}).groupby('RSV').boxplot(column='hospitalized_vitamin_d');

In [123]:

pneumonia_subset = hospitalized[hospitalized.adm_pneumo==1]
pneumonia_subset.shape

Out[123]:

(394, 435)

In [124]:

pneumonia_subset.replace({'RSV': {1: 'RSV', 0: 'No RSV'}}).groupby('RSV').boxplot(column='hospitalized_vitamin_d');

In [125]:

bronchopneumo_subset = hospitalized[hospitalized.adm_bronchopneumo==1]
bronchopneumo_subset.shape

Out[125]:

(1020, 435)

In [126]:

bronchopneumo_subset.replace({'RSV': {1: 'RSV', 0: 'No RSV'}}).groupby('RSV').boxplot(column='hospitalized_vitamin_d');

Oxygen¶

In [127]:

groupby_o2 = hospitalized.groupby('oxygen')

In [128]:

make_table(groupby_o2, table_vars=table_vars+virus_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})

Out[128]:

	No Oxygen	Oxygen	OR	Interval	N
male	0.62	0.58	0.86	[0.74, 1.0]	(2125, 1013)
under 2 months	0.17	0.23	1.47	[1.22, 1.76]	(2125, 1013)
2-11 months	0.54	0.53	0.96	[0.82, 1.11]	(2125, 1013)
12-23 months	0.18	0.11	0.57	[0.44, 0.71]	(2125, 1013)
Jordanian	0.89	0.91	1.21	[0.94, 1.59]	(2125, 1013)
Palestinian	0.08	0.06	0.76	[0.54, 1.02]	(2125, 1013)
vitamin D < 20	0.57	0.60	1.13	[0.96, 1.34]	(1807, 858)
vitamin D < 11	0.36	0.43	1.30	[1.1, 1.54]	(1807, 858)
prev_cond	0.09	0.18	2.16	[1.73, 2.69]	(2125, 1013)
heart_hx	0.03	0.07	2.07	[1.46, 2.9]	(2125, 1013)
breastfed	0.62	0.59	0.88	[0.76, 1.03]	(2125, 1013)
premature	0.13	0.17	1.38	[1.11, 1.71]	(2125, 1013)
adm_pneumo	0.09	0.21	2.78	[2.24, 3.45]	(2125, 1013)
adm_bronchopneumo	0.34	0.28	0.73	[0.62, 0.86]	(2125, 1013)
adm_sepsis	0.33	0.20	0.51	[0.43, 0.61]	(2125, 1013)
adm_bronchiolitis	0.15	0.21	1.49	[1.23, 1.8]	(2125, 1013)
RSV	0.38	0.56	2.03	[1.75, 2.37]	(2125, 1013)
Influenza	0.04	0.03	0.73	[0.43, 1.11]	(2125, 1013)
HMPV	0.09	0.08	0.89	[0.67, 1.16]	(2125, 1013)
Rhino	0.39	0.38	0.95	[0.82, 1.11]	(2125, 1013)

Mechanical ventilation¶

In [129]:

groupby_vent = hospitalized.groupby('vent')

In [130]:

make_table(groupby_vent, table_vars=table_vars+virus_vars, 
           replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})

Out[130]:

	No Ventilator	Ventilator	OR	Interval	N
male	0.60	0.59	0.93	[0.63, 1.37]	(3026, 111)
under 2 months	0.19	0.24	1.36	[0.83, 2.04]	(3026, 111)
2-11 months	0.54	0.47	0.77	[0.51, 1.12]	(3026, 111)
12-23 months	0.15	0.14	0.93	[0.47, 1.46]	(3026, 111)
Jordanian	0.89	0.94	1.75	[0.95, 6.11]	(3026, 111)
Palestinian	0.07	0.04	0.49	[0.02, 1.0]	(3026, 111)
vitamin D < 20	0.58	0.50	0.70	[0.47, 1.03]	(2557, 107)
vitamin D < 11	0.39	0.35	0.84	[0.54, 1.24]	(2557, 107)
prev_cond	0.12	0.28	2.98	[1.83, 4.43]	(3026, 111)
heart_hx	0.04	0.14	3.51	[1.66, 5.81]	(3026, 111)
breastfed	0.61	0.59	0.95	[0.65, 1.43]	(3026, 111)
premature	0.14	0.18	1.35	[0.76, 2.11]	(3026, 111)
adm_pneumo	0.12	0.23	2.23	[1.34, 3.4]	(3026, 111)
adm_bronchopneumo	0.32	0.33	1.06	[0.69, 1.56]	(3026, 111)
adm_sepsis	0.29	0.22	0.68	[0.4, 1.03]	(3026, 111)
adm_bronchiolitis	0.17	0.19	1.13	[0.63, 1.73]	(3026, 111)
RSV	0.44	0.47	1.12	[0.76, 1.64]	(3026, 111)
Influenza	0.03	0.02	0.53	[-0.2, 1.32]	(3026, 111)
HMPV	0.09	0.06	0.71	[0.19, 1.29]	(3026, 111)
Rhino	0.39	0.37	0.92	[0.61, 1.34]	(3026, 111)

Death¶

In [131]:

groupby_death = hospitalized.groupby('death')

In [132]:

make_table(groupby_death, table_vars=table_vars+virus_vars, replace_dict={False: 'Survived', True: 'Die'})

Out[132]:

	Survived	Die	OR	Interval	N
male	0.60	0.48	0.61	[0.29, 1.26]	(3138, 31)
under 2 months	0.19	0.39	2.68	[1.18, 5.39]	(3138, 31)
2-11 months	0.53	0.39	0.55	[0.25, 1.1]	(3138, 31)
12-23 months	0.15	0.06	0.38	[-0.12, 0.98]	(3138, 31)
Jordanian	0.90	0.87	0.78	[0.33, 4.82]	(3138, 31)
Palestinian	0.07	0.10	1.40	[-0.11, 3.37]	(3138, 31)
vitamin D < 20	0.58	0.44	0.56	[0.23, 1.26]	(2664, 25)
vitamin D < 11	0.39	0.28	0.62	[0.18, 1.36]	(2664, 25)
prev_cond	0.12	0.52	8.05	[3.84, 17.18]	(3138, 31)
heart_hx	0.05	0.13	3.13	[0.24, 7.17]	(3138, 31)
breastfed	0.61	0.42	0.46	[0.21, 0.94]	(3138, 31)
premature	0.14	0.16	1.17	[0.2, 2.53]	(3138, 31)
adm_pneumo	0.12	0.32	3.42	[1.37, 6.95]	(3138, 31)
adm_bronchopneumo	0.32	0.13	0.31	[0.02, 0.69]	(3138, 31)
adm_sepsis	0.28	0.52	2.71	[1.3, 5.77]	(3138, 31)
adm_bronchiolitis	0.17	0.03	0.16	[-0.14, 0.51]	(3138, 31)
RSV	0.44	0.23	0.37	[0.11, 0.74]	(3138, 31)
Influenza	0.03	0.13	4.52	[0.31, 10.28]	(3138, 31)
HMPV	0.09	0.06	0.72	[-0.22, 1.89]	(3138, 31)
Rhino	0.39	0.42	1.13	[0.5, 2.31]	(3138, 31)

RSV-only subset¶

The following calculations repeat the above tables, but for the RSV subset.

In [133]:

rsv_only = hospitalized[hospitalized.pcr_result___1==1]

In [134]:

rsv_icu = rsv_only.groupby('icu_any')

In [135]:

virus_vars

Out[135]:

['RSV', 'Influenza', 'HMPV', 'Rhino']

In [136]:

make_table(rsv_icu, table_vars=table_vars+virus_vars[1:], replace_dict={False: 'No ICU', True: 'ICU'})

Out[136]:

	No ICU	ICU	OR	Interval	N
male	0.60	0.56	0.86	[0.61, 1.22]	(1255, 142)
under 2 months	0.19	0.32	2.09	[1.41, 3.0]	(1255, 142)
2-11 months	0.61	0.38	0.39	[0.27, 0.56]	(1255, 142)
12-23 months	0.12	0.06	0.48	[0.17, 0.84]	(1255, 142)
Jordanian	0.90	0.89	0.87	[0.54, 1.75]	(1255, 142)
Palestinian	0.07	0.08	1.18	[0.47, 2.06]	(1255, 142)
vitamin D < 20	0.60	0.66	1.28	[0.88, 1.97]	(1072, 121)
vitamin D < 11	0.44	0.50	1.24	[0.84, 1.8]	(1072, 121)
prev_cond	0.07	0.12	1.88	[0.94, 3.1]	(1255, 142)
heart_hx	0.03	0.03	0.99	[0.05, 2.23]	(1255, 142)
breastfed	0.64	0.58	0.77	[0.55, 1.1]	(1255, 142)
premature	0.12	0.20	1.89	[1.16, 2.84]	(1255, 142)
adm_pneumo	0.14	0.32	2.89	[1.92, 4.18]	(1255, 142)
adm_bronchopneumo	0.36	0.14	0.29	[0.16, 0.44]	(1255, 142)
adm_sepsis	0.15	0.42	4.04	[2.74, 5.84]	(1255, 142)
adm_bronchiolitis	0.28	0.17	0.53	[0.31, 0.79]	(1255, 142)
Influenza	0.02	0.02	0.91	[-0.11, 2.27]	(1255, 142)
HMPV	0.05	0.07	1.49	[0.56, 2.71]	(1255, 142)
Rhino	0.33	0.32	0.97	[0.65, 1.37]	(1255, 142)

In [137]:

rsv_o2 = rsv_only.groupby('oxygen')

In [138]:

make_table(rsv_o2, table_vars=table_vars, replace_dict={0.0: 'No Oxygen', 1.0: 'Oxygen'})

Out[138]:

	No Oxygen	Oxygen	OR	Interval	N
male	0.61	0.57	0.82	[0.66, 1.02]	(816, 566)
under 2 months	0.15	0.28	2.27	[1.74, 3.0]	(816, 566)
2-11 months	0.63	0.52	0.62	[0.5, 0.77]	(816, 566)
12-23 months	0.16	0.06	0.34	[0.22, 0.5]	(816, 566)
Jordanian	0.90	0.90	0.95	[0.67, 1.39]	(816, 566)
Palestinian	0.07	0.07	0.97	[0.61, 1.49]	(816, 566)
vitamin D < 20	0.58	0.65	1.34	[1.05, 1.71]	(697, 483)
vitamin D < 11	0.41	0.51	1.49	[1.18, 1.89]	(697, 483)
prev_cond	0.06	0.08	1.32	[0.86, 2.03]	(816, 566)
heart_hx	0.03	0.03	1.12	[0.54, 2.17]	(816, 566)
breastfed	0.65	0.62	0.90	[0.72, 1.12]	(816, 566)
premature	0.11	0.15	1.45	[1.05, 1.98]	(816, 566)
adm_pneumo	0.12	0.23	2.20	[1.63, 2.93]	(816, 566)
adm_bronchopneumo	0.42	0.22	0.40	[0.31, 0.5]	(816, 566)
adm_sepsis	0.15	0.23	1.71	[1.3, 2.27]	(816, 566)
adm_bronchiolitis	0.27	0.27	0.98	[0.77, 1.25]	(816, 566)

In [139]:

rsv_vent = rsv_only.groupby('vent')

In [140]:

make_table(rsv_vent, table_vars=table_vars, 
           replace_dict={0.0: 'No Ventilator', 1.0: 'Ventilator'})

Out[140]:

	No Ventilator	Ventilator	OR	Interval	N
male	0.60	0.56	0.85	[0.49, 1.56]	(1329, 52)
under 2 months	0.20	0.33	2.00	[1.01, 3.51]	(1329, 52)
2-11 months	0.59	0.46	0.59	[0.33, 1.03]	(1329, 52)
12-23 months	0.12	0.08	0.62	[0.03, 1.33]	(1329, 52)
Jordanian	0.90	0.94	1.75	[-7.45, 15.22]	(1329, 52)
Palestinian	0.07	0.04	0.54	[-0.19, 1.44]	(1329, 52)
vitamin D < 20	0.62	0.48	0.58	[0.32, 1.04]	(1127, 52)
vitamin D < 11	0.45	0.38	0.75	[0.41, 1.3]	(1127, 52)
prev_cond	0.07	0.10	1.40	[0.22, 2.9]	(1329, 52)
heart_hx	0.03	0.00	NaN	NaN	NaN
breastfed	0.64	0.52	0.61	[0.35, 1.07]	(1329, 52)
premature	0.13	0.13	1.08	[0.31, 2.1]	(1329, 52)
adm_pneumo	0.16	0.27	1.96	[0.94, 3.48]	(1329, 52)
adm_bronchopneumo	0.34	0.25	0.64	[0.28, 1.13]	(1329, 52)
adm_sepsis	0.18	0.21	1.25	[0.52, 2.25]	(1329, 52)
adm_bronchiolitis	0.27	0.27	1.00	[0.46, 1.78]	(1329, 52)

In [141]:

rsv_death = rsv_only.groupby('death')

In [142]:

make_table(rsv_death, table_vars=table_vars, replace_dict={False: 'Survived', True: 'Died'})

Out[142]:

	Survived	Died	OR	Interval	N
male	0.60	0.43	0.51	[0.05, 2.6]	(1390, 7)
under 2 months	0.20	0.43	2.97	[0.24, 15.23]	(1390, 7)
2-11 months	0.59	0.29	0.28	[-0.03, 1.14]	(1390, 7)
12-23 months	0.12	0.14	1.24	[-0.76, 5.04]	(1390, 7)
Jordanian	0.90	0.71	0.25	[-1.94, 2.86]	(1390, 7)
Palestinian	0.07	0.29	5.53	[-0.69, 22.24]	(1390, 7)
vitamin D < 20	0.61	0.71	1.44	[-13.49, 15.56]	(1186, 7)
vitamin D < 11	0.45	0.43	0.92	[0.07, 5.0]	(1186, 7)
prev_cond	0.07	0.29	5.16	[-0.53, 21.8]	(1390, 7)
heart_hx	0.03	0.00	NaN	NaN	NaN
breastfed	0.63	0.57	0.75	[0.11, 5.65]	(1390, 7)
premature	0.13	0.29	2.73	[-0.36, 11.4]	(1390, 7)
adm_pneumo	0.16	0.57	6.97	[1.01, 53.4]	(1390, 7)
adm_bronchopneumo	0.34	0.00	NaN	NaN	NaN
adm_sepsis	0.18	0.43	3.52	[0.3, 19.53]	(1390, 7)
adm_bronchiolitis	0.27	0.00	NaN	NaN	NaN

In [143]:

rsv_only.premature.value_counts()

Out[143]:

0    1218
1     179
dtype: int64

Vitamin D¶

Number of vitamin D records

In [144]:

hospitalized.hospitalized_vitamin_d.notnull().sum()

Out[144]:

Median vitaimin D

In [145]:

hospitalized.hospitalized_vitamin_d.median()

Out[145]:

16.52601

Proportion of subjects with vitamin D < 20

In [146]:

(hospitalized.hospitalized_vitamin_d<20).mean()

Out[146]:

0.4925844114862733

Create subset of RSV patients

In [147]:

rsv_only = hospitalized[hospitalized.pcr_result___1==1]

Number of RSV subjects

In [148]:

rsv_only.hospitalized_vitamin_d.notnull().sum()

Out[148]:

Median vitamin D of RSV subset

In [149]:

rsv_only.hospitalized_vitamin_d.median()

Out[149]:

14.3

Proportion of RSV subjects with vitamin D < 20

In [150]:

(rsv_only.hospitalized_vitamin_d<20).mean()

Out[150]:

0.51968503937007871

In [151]:

non_rsv = hospitalized[hospitalized.pcr_result___1==0]

This model estimates the difference in having vitamin D less than 20 for RSV-only versus non-RSV.

In [152]:

from pymc import Model, Beta, Deterministic, Binomial

n = non_rsv.shape[0]
n_rsv = rsv_only.shape[0]

with Model() as model:
    
    p = Beta('p', 1, 1)
    p_rsv = Beta('p_rsv', 1, 1)
    
    p_diff = Deterministic('p_diff', p_rsv - p)
    p_less = Deterministic('p_less', p_rsv < p)
    
    y = Binomial('y', n=n, p=p, observed=(non_rsv.hospitalized_vitamin_d<20).sum())
    y_rsv = Binomial('y_rsv', n=n_rsv, p=p_rsv, observed=(rsv_only.hospitalized_vitamin_d<20).sum())
    

In [153]:

with model:
    
    step = NUTS()
    trace = sample(2000, step)

 [-----------------100%-----------------] 2000 of 2000 complete in 1.7 sec

In [154]:

_ = traceplot(trace, vars=[p, p_rsv])

The difference is positive, with an expected value of approximately 5% higher for the RSV subset.

In [155]:

_ = traceplot(trace, vars=[p_diff])

In [156]:

summary(trace, vars=[p_diff, p_less])

p_diff:
 
  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.048            0.018            0.000            [0.014, 0.084]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  0.012          0.037          0.048          0.059          0.084


p_less:
 
  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  0.004            0.059            0.001            [0.000, 0.000]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  0.000          0.000          0.000          0.000          0.000

Factors associated with death¶

Number and proportion died:

In [157]:

hospitalized.death.value_counts().rename({False: 'Survived', True: 'Died'})

Out[157]:

Survived    3138
Died          31
dtype: int64

In [158]:

hospitalized.death.mean()

Out[158]:

0.0097822656989586618

In [159]:

groupby_death = hospitalized.groupby('death')

Median z-score

In [160]:

groupby_death['z_score'].median().rename({False: 'Survived', True: 'Died'})

Out[160]:

death
Survived    0.01
Died       -2.53
Name: z_score, dtype: float64

Gestational age

In [161]:

groupby_death['gest_age'].median().rename({False: 'Survived', True: 'Died'})

Out[161]:

death
Survived    40
Died        40
Name: gest_age, dtype: int64

Vitamin D

In [162]:

groupby_death['hospitalized_vitamin_d'].median().rename({False: 'Survived', True: 'Died'})

Out[162]:

death
Survived    16.5
Died        20.4
Name: hospitalized_vitamin_d, dtype: float64

Previous conditions

In [163]:

groupby_death[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].mean().rename({False: 'Survived', True: 'Died'})

Out[163]:

	diabetes_hx	heart_hx	kidney_hx	downs_hx	sickle_hx	cancer_hx	cf_hx	genetic_hx	cp_hx	seizure_hx	neuro_hx	mr_hx	asthma_hx	immuno_hx	liver_hx	gerd_hx	diarrhea_hx	other_hx
death
Survived	0.001912	0.045252	0.003824	0.012747	0.000319	0.000319	0.000637	0.005736	0.003824	0.003505	0.008604	0.000637	0.007967	0.000319	0.001593	0.000637	0	0.019758
Died	0.000000	0.129032	0.032258	0.064516	0.000000	0.000000	0.000000	0.096774	0.064516	0.032258	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0	0.096774

Organism positive culture

In [164]:

groupby_death[pcr_lookup.keys()].mean().rename(columns=pcr_lookup)

Out[164]:

	flu B	rhino	PIV1	PIV2	RSV	HMPV	flu A	PIV3	Adeno	Swine H1	flu C	H3N2	Swine	H1N1
death
False	0.009242	0.390376	0.010835	0.004143	0.442957	0.086361	0.022945	0.04079	0.150414	0.010835	0.006055	0.009879	0.010198	0
True	0.000000	0.419355	0.000000	0.000000	0.225806	0.064516	0.129032	0.00000	0.096774	0.032258	0.000000	0.064516	0.032258	0

direct to icu versus during hospitalization?? compare ourcome such as survival, days on the ventilator, duration of hospitalization and age group as well as z score

any predictors on those who were first admitted to the floor and then transferred, namely should they have been sent there in the first place??

what is the difference in the age, sex and diagnosis of the two groups and in comparison with the total group

what about the blood culture positivity in the two groups and versus the whole cohort

duration of hospitalization in the two groups and the whole cohort

comparison of these with the whole cohort is needed in order to determine the following from our study

why are some kids sicker than others?
what are the characteristics that a doctor should look for (predictors) in order to send them directly to the ICU
such characterisitcs should include, age, premature status, history of prior hospitalization ( if we have it) history of chronic disease or other medical conditions, history of having been on bronchodilators
younger age
low birth weight
and low z score (look at less than one and less than two)
duration of illness prior to admission
cyanosis
low O2 saturation on admission
septic looking?? dont know if we have that on record
vitamin d values
actual organism ( virus )
viral load
antibiotic used especialy when compared with organism cultured in case of positive blood culture
medical diagnosis, pneumonia, bronchopneumonia etc

ICU by diagnosis¶

In [165]:

hospitalized[hospitalized.icu_any==1]['adm_pneumo'].sum()

Out[165]:

In [166]:

hospitalized[hospitalized.icu_any==1]['adm_bronchopneumo'].sum()

Out[166]:

In [167]:

hospitalized[hospitalized.icu_any==1]['adm_bronchiolitis'].sum()

Out[167]:

Comparison of vitamin D levels¶

In [168]:

rsv_only.groupby('oxygen')['hospitalized_vitamin_d'].hist()

Out[168]:

oxygen
0         Axes(0.125,0.125;0.775x0.775)
1         Axes(0.125,0.125;0.775x0.775)
Name: hospitalized_vitamin_d, dtype: object

In [169]:

rsv_only.groupby('oxygen')['hospitalized_vitamin_d'].mean()

Out[169]:

oxygen
0         16.528299
1         14.086796
Name: hospitalized_vitamin_d, dtype: float64

In [170]:

rsv_only['los5'] = (rsv_only.length_of_stay>=5).astype(int)

In [171]:

rsv_only.groupby('los5')['hospitalized_vitamin_d'].mean()

Out[171]:

los5
0       16.505338
1       14.688211
Name: hospitalized_vitamin_d, dtype: float64

In [172]:

rsv_only.groupby('adm_pneumo')['hospitalized_vitamin_d'].mean()

Out[172]:

adm_pneumo
0             16.250651
1             11.975369
Name: hospitalized_vitamin_d, dtype: float64

The following calculates the estimated median difference between the means of each group, along with a 95% bootstrap confidence interval. These can be interpreted as the difference in vitamin D levels between the two groups, and it is robust to the fact that vitamin D levels are not normally distributed within groups.

In [173]:

def bootstrap_difference(df, var, group, alpha=0.05, samples=10000):
    
    grouped_var = df.groupby(group)[var]
    diffs = [np.diff(grouped_var.apply(lambda x: x.iloc[np.random.randint(len(x), size=len(x))].mean())) for i in range(samples)]
    
    interval = np.percentile(diffs, [100*(alpha/2.), 100*(1. - alpha/2.)])
    
    return np.round(np.median(diffs), 2), np.round(interval, 2).tolist()

In [174]:

bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_pneumo')

Out[174]:

(-4.2800000000000002, [-6.03, -2.51])

In [175]:

bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_bronchiolitis')

Out[175]:

(0.059999999999999998, [-1.58, 1.73])

In [176]:

bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_bronchopneumo')

Out[176]:

(4.8499999999999996, [3.38, 6.32])

In [177]:

bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'adm_sepsis')

Out[177]:

(-4.6399999999999997, [-6.37, -2.89])

In [178]:

rsv_only.groupby('adm_sepsis')['hospitalized_vitamin_d'].mean()

Out[178]:

adm_sepsis
0             16.332006
1             11.701480
Name: hospitalized_vitamin_d, dtype: float64

In [179]:

bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'oxygen')

Out[179]:

(-2.4500000000000002, [-3.88, -1.0])

In [180]:

bootstrap_difference(rsv_only, 'hospitalized_vitamin_d', 'los5')

Out[180]:

(-1.8100000000000001, [-3.24, -0.37])

Thus, none of the intervals above include zero, making it unlikely that the underlying populations are the same.

LRTI Subset analysis¶

Predicting RSV infection from diagnoses

In [115]:

# Create indicator for LRTI
hospitalized['ltri'] = (hospitalized.adm_bronchopneumo + hospitalized.adm_bronchiolitis + hospitalized.adm_pneumo + 
 (hospitalized.wheezing>0) + (hospitalized.flaring>1)).astype(bool)

In [118]:

hospitalized.ltri.mean()

Out[118]:

0.71252761123382768

Proportion with LTRI by virus

In [127]:

for p in pcr_lookup:
    print('{0}:\t{1}'.format(pcr_lookup[p], np.round(hospitalized[hospitalized[p]==1].ltri.mean(), 2)))

flu B:	0.72
rhino:	0.7
PIV1:	0.82
PIV2:	0.69
RSV:	0.86
HMPV:	0.89
flu A:	0.7
PIV3:	0.68
Adeno:	0.74
Swine H1:	0.77
flu C:	0.84
H3N2:	0.61
Swine:	0.73
H1N1:	nan