%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
sb.set_style("whitegrid")
Import data
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
greater_48hrs | fever_neutropenia | never_left | written_consent | child_name | mother_name | mother_birth_date | mother_record | mother_nationality | other_mother_nationality | ... | was_whole_blood_obtained_f | date | whole_blood_complete | age_months | length_of_stay | gest_age | death | hospitalized_vitamin_d | wheezing_ind | sex | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
case_id | |||||||||||||||||||||
A0001 | 0 | 0 | 0 | 1 | Remas Mahmoud Jbarah | Huda Katalo | 1976-01-21 | NaN | 3 | NaN | ... | NaN | NaN | 0 | 1 | 6 | 40 | False | 3 | 0 | F |
A0002 | 0 | 0 | 0 | 1 | Majed Abdel Kareem Majed | Noor SHa'aban Mahmood | 1989-09-09 | NaN | 3 | NaN | ... | NaN | NaN | 0 | 1 | 5 | 40 | False | 4 | 1 | M |
A0003 | 0 | 0 | 0 | 1 | Rayyan Jamal Muhyi Al.Deen | SAra Hussein Muhyi Al.Deen | 1965-01-01 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 11 | 10 | 40 | False | 35 | 0 | F |
A0004 | 0 | 0 | 0 | 1 | Hanan Mohd Mustapha Abu Othman | Kawla Abu Shanab | 1983-10-31 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 7 | 3 | 38 | False | 2 | 1 | F |
A0005 | 0 | 0 | 0 | 1 | Yara Mahmoud Azmi Ismael | Suha Abdel Aziz | 1986-02-28 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 2 | 1 | 39 | False | 6 | 0 | F |
5 rows × 414 columns
Breakdown by:
Severity measures:
Columns:
[v for v in hospitalized.columns if v.endswith('hx')]
['no_hx', 'diabetes_hx', 'heart_hx', 'kidney_hx', 'downs_hx', 'sickle_hx', 'cancer_hx', 'cf_hx', 'genetic_hx', 'cp_hx', 'seizure_hx', 'neuro_hx', 'mr_hx', 'asthma_hx', 'immuno_hx', 'liver_hx', 'gerd_hx', 'diarrhea_hx', 'other_hx', 'spec_hx', 'no_asthma_hx', 'no_allergy_hx', 'no_eczema_hx']
hospitalized['asthma'] = (hospitalized.child_asthma==1) | hospitalized.asthma_hx
hospitalized.asthma.sum()
81
hospitalized.no_hx.value_counts()
1 2848 0 320 dtype: int64
((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))).sum()
55
Past medical history calculated as those who claim to have past medical history and those who claim to have no past medical history and reported asthma.
hospitalized['past medical history'] = (~hospitalized.no_hx.astype(bool) |
((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool)))
hospitalized['past medical history'].sum()
375
# Positive culture lookup
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}
hospitalized['RSV'] = hospitalized.pcr_result___1.astype(bool)
hospitalized['HMPV'] = hospitalized.pcr_result___2.astype(bool)
hospitalized['Rhino'] = hospitalized.pcr_result___5.astype(bool)
hospitalized['Influenza'] = (hospitalized.pcr_result___3 | hospitalized.pcr_result___4 |
hospitalized.pcr_result___17)
hospitalized['Adeno'] = hospitalized.pcr_result___18.astype(bool)
hospitalized['PIV'] = (hospitalized.pcr_result___6 | hospitalized.pcr_result___7 |
hospitalized.pcr_result___8)
hospitalized['No virus'] = hospitalized[list(pcr_lookup.keys())].sum(1) == 0
hospitalized['All'] = True
viruses = ['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']
non_rsv_lookup = pcr_lookup.copy()
non_rsv_lookup.pop('pcr_result___1')
'RSV'
Identify individuals with coinfection
hospitalized['coinfection'] = hospitalized[list(pcr_lookup.keys())].sum(1) > 1
Virus frequency by coinfection status
hospitalized[(~hospitalized.coinfection) & (hospitalized.RSV)].shape[0]/hospitalized.shape[0]
0.2297979797979798
virus_by_coinf = hospitalized.groupby('coinfection')[viruses[:-2]].sum()/hospitalized.shape[0]
virus_by_coinf.T.plot(kind='bar', stacked=True, grid=False,
color=sb.color_palette('Paired', 2)[::-1])
<matplotlib.axes._subplots.AxesSubplot at 0x1098b2fd0>
hospitalized[viruses].sum(0)
RSV 1397 HMPV 273 Rhino 1238 Influenza 119 Adeno 475 PIV 175 No virus 587 All 3168 dtype: int64
other_virus_index = (hospitalized[list(non_rsv_lookup.keys())].sum(1) > 0).astype(int)
hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['non-RSV virus'] = (hospitalized['pcr_result___1']==0) & other_virus_index
hospitalized['no virus'] = (hospitalized['pcr_result___1']==0) & (other_virus_index==0)
hospitalized["prev_cond"] = (hospitalized[[c for c in hospitalized.columns if c.endswith('hx')
and not c.startswith('no_')]].sum(1) > 0)
hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under 2 months', '2-11 months', '12-23 months'
hospitalized = hospitalized.join(age_groups)
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian',
6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)
hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan
hospitalized['any_cigarette'] = (hospitalized.cigarette_smokers > 0).astype(int)
hospitalized['any_smoke'] = (hospitalized.cigarette_smokers.astype(bool) |
hospitalized.nargila_smokers.astype(bool))
hospitalized.any_smoke.mean()
0.76546717171717171
hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)
hospitalized.length_of_stay.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x10c3996d8>
Diagnosis
hospitalized['ros'] = hospitalized.adm_sepsis | hospitalized.adm_febrile
hospitalized['pertussis-like cough'] = hospitalized.adm_pertussis | hospitalized.adm_cough
hospitalized['brochopneumonia'] = hospitalized.adm_bronchopneumo
hospitalized['bronchiolitis'] = hospitalized.adm_bronchiolitis
hospitalized['pneumonia'] = hospitalized.adm_pneumo
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.admission_date.describe()
count 3168 unique 794 top 2011-02-07 00:00:00 freq 17 first 2010-03-15 00:00:00 last 2013-03-30 00:00:00 Name: admission_date, dtype: object
Age groups:
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,5,11,24], include_lowest=True))
age_groups.index = hospitalized.index
age_groups.columns = 'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'
hospitalized = hospitalized.join(age_groups)
age_group_lookup = {'age_under_2': '<2',
'age_2_5': '2-5',
'age_6_11': '6-11',
'age_over_11': '>11'}
customcmap = sb.color_palette("coolwarm", 6)
fig, axes = plt.subplots(2,2, figsize=(10,6), sharey=True)
for i,ax in enumerate(np.ravel(axes)):
age_group = age_groups.columns[i]
age_subset = hospitalized[hospitalized[age_group].astype(bool)]
age_virus = age_subset[viruses[:-2]].mean()
age_virus.T.plot(kind='bar', grid=False, ax=ax,
color=customcmap)
ax.set_title(age_group_lookup[age_group])
hosp_age_counts = hospitalized.groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum').values
pre_2013 = hospitalized.admission_date < datetime(2013, 1, 1)
age_counts = hospitalized[pre_2013].groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum')
age_counts
age_under_2 | age_2_5 | age_6_11 | age_over_11 | |
---|---|---|---|---|
admission_date | ||||
2010-03-31 | 15 | 16 | 14 | 3 |
2010-04-30 | 17 | 23 | 14 | 7 |
2010-05-31 | 13 | 17 | 12 | 2 |
2010-06-30 | 21 | 6 | 7 | 3 |
2010-07-31 | 19 | 6 | 2 | 2 |
2010-08-31 | 8 | 9 | 3 | 3 |
2010-09-30 | 11 | 5 | 5 | 2 |
2010-10-31 | 5 | 5 | 4 | 5 |
2010-11-30 | 5 | 6 | 8 | 7 |
2010-12-31 | 7 | 8 | 5 | 10 |
2011-01-31 | 32 | 62 | 35 | 23 |
2011-02-28 | 56 | 69 | 40 | 24 |
2011-03-31 | 41 | 34 | 19 | 23 |
2011-04-30 | 23 | 39 | 28 | 12 |
2011-05-31 | 25 | 34 | 11 | 17 |
2011-06-30 | 25 | 19 | 13 | 9 |
2011-07-31 | 23 | 16 | 12 | 10 |
2011-08-31 | 24 | 7 | 0 | 3 |
2011-09-30 | 19 | 8 | 5 | 9 |
2011-10-31 | 20 | 10 | 7 | 10 |
2011-11-30 | 11 | 15 | 8 | 14 |
2011-12-31 | 25 | 22 | 21 | 19 |
2012-01-31 | 63 | 70 | 39 | 37 |
2012-02-29 | 58 | 89 | 41 | 26 |
2012-03-31 | 49 | 74 | 39 | 33 |
2012-04-30 | 37 | 46 | 32 | 23 |
2012-05-31 | 35 | 27 | 16 | 12 |
2012-06-30 | 24 | 22 | 15 | 11 |
2012-07-31 | 31 | 21 | 8 | 14 |
2012-08-31 | 22 | 12 | 7 | 5 |
2012-09-30 | 14 | 18 | 16 | 10 |
2012-10-31 | 16 | 20 | 4 | 9 |
2012-11-30 | 11 | 17 | 6 | 9 |
2012-12-31 | 45 | 48 | 32 | 19 |
Rates and demographics (via World Bank) and 2004 census data (via Jordan Department of Statistics).
# Jordan population, 2008-2012
population = 5786000, 5915000, 6046000, 6181000, 6318000
# Interpolated population by gender and age, 2010-2012
female_0 = 93649, 95739, 96486
male_0 = 98435, 100525, 101160
female_1 = 87941, 93511, 93067
male_1 = 92548, 98306, 97763
kids_0 = np.array(female_0) + np.array(male_0)
kids_1 = np.array(female_1) + np.array(male_1)
kids = kids_0 + kids_1
# Proportion in Amman
amman_urban_2004 = 1784502.
jordan_2004 = 5103639.
amman_prop = amman_urban_2004 / jordan_2004
# Birth rates (per 1000)
birth_rate = 29.665, 29.322, 28.869, 28.317, 27.699
# Neonatal mortality (per 1000)
neonatal_mort = 12.7, 12.4, 12.1, 11.8, 11.5
# Infant mortality (per 1000)
infant_mort = 18.4, 17.9, 17.3, 16.8, 16.4
amman_prop
0.3496528653378501
admissions_by_month = hospitalized.groupby('admission_date')['child_name'].count().resample('1M', how='sum')
admissions_by_month.head()
admission_date 2010-03-31 48 2010-04-30 61 2010-05-31 44 2010-06-30 37 2010-07-31 29 Freq: M, Name: child_name, dtype: int64
# Dict to return pcr_result variable corresponding to virus
virus_lookup = {pcr_lookup[k]: k for k in pcr_lookup.keys()}
# Enrolling 5 days per week
p_enroll = 5./7
rsv_subset = hospitalized[hospitalized.RSV==1]
rsv_subset.shape
(1397, 441)
kids_1
array([180489, 191817, 190830])
Import hospitalization data
spreadsheets = !ls data/Al-Bashir*
data = []
col_names = 'date', 'admissions', 'admissions_2', 'resp_admissions', 'resp_admissions_2'
for spreadsheet in spreadsheets:
this_file = pd.ExcelFile(spreadsheet)
# Need to fix data entry error in years > 2011
dot = spreadsheet.find('.')
fix_columns = int(spreadsheet[dot-4:dot]) > 2011
for name in this_file.sheet_names:
d = this_file.parse(name)
d.columns = col_names
if fix_columns:
total = d['resp_admissions'] + d['resp_admissions_2']
d['resp_admissions_2'] = d['resp_admissions']
d['resp_admissions'] = total
data.append(d)
hospitalizations = pd.concat(data).set_index('date')
hospitalizations.head()
admissions | admissions_2 | resp_admissions | resp_admissions_2 | |
---|---|---|---|---|
date | ||||
2010-03-14 | 23 | 21 | 15 | 14 |
2010-03-15 | 27 | 23 | 19 | 18 |
2010-03-16 | 11 | 8 | 6 | 3 |
2010-03-17 | 17 | 10 | 10 | 8 |
2010-03-18 | 23 | 17 | 12 | 10 |
under_2_hosp = hospitalizations[hospitalizations.index < datetime(2013, 1, 1)].resample('M', how=sum)['admissions_2']
under_2_hosp.head()
date 2010-03-31 239 2010-04-30 354 2010-05-31 334 2010-06-30 284 2010-07-31 232 Freq: M, Name: admissions_2, dtype: int64
rsv_by_date = rsv_subset.groupby('admission_date')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
rsv_counts = rsv_by_date.sum().resample('1M', how='sum').fillna(0)
rsv_counts.head()
age_under_2 | age_2_5 | age_6_11 | age_over_11 | |
---|---|---|---|---|
admission_date | ||||
2010-03-31 | 12 | 8 | 7 | 1 |
2010-04-30 | 8 | 8 | 5 | 0 |
2010-05-31 | 1 | 1 | 1 | 0 |
2010-06-30 | 1 | 1 | 0 | 0 |
2010-07-31 | 0 | 0 | 0 | 0 |
under2_rsv = pd.concat([rsv_counts, under_2_hosp], axis=1).dropna()
under2_rsv.head()
age_under_2 | age_2_5 | age_6_11 | age_over_11 | admissions_2 | |
---|---|---|---|---|---|
admission_date | |||||
2010-03-31 | 12 | 8 | 7 | 1 | 239 |
2010-04-30 | 8 | 8 | 5 | 0 | 354 |
2010-05-31 | 1 | 1 | 1 | 0 | 334 |
2010-06-30 | 1 | 1 | 0 | 0 | 284 |
2010-07-31 | 0 | 0 | 0 | 0 | 232 |
def extract_virus(virus):
virus_subset = hospitalized[hospitalized[virus]==1]
virus_by_date = virus_subset.groupby('admission_date')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
virus_counts = virus_by_date.sum().resample('1M', how='sum').fillna(0)
return pd.concat([virus_counts, under_2_hosp], axis=1).dropna()
[(x*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int) for x,d in zip(adm,data_subset)]
[array([102, 68, 59, 8]), array([134, 134, 84, 0]), array([83, 83, 83, 83]), array([71, 71, 71, 71]), array([58, 58, 58, 58]), array([54, 54, 54, 54]), array([53, 53, 53, 53]), array([55, 55, 55, 55]), array([93, 93, 93, 93]), array([118, 89, 148, 59]), array([115, 230, 123, 75]), array([118, 158, 79, 31]), array([123, 123, 37, 30]), array([134, 156, 0, 22]), array([83, 83, 83, 83]), array([79, 79, 79, 79]), array([68, 68, 68, 68]), array([41, 41, 41, 41]), array([56, 56, 56, 56]), array([62, 62, 62, 62]), array([79, 79, 79, 79]), array([ 60, 120, 135, 45]), array([138, 158, 72, 60]), array([109, 212, 67, 47]), array([112, 167, 83, 65]), array([ 92, 116, 87, 58]), array([163, 59, 44, 44]), array([116, 93, 70, 46]), array([70, 70, 70, 70]), array([53, 53, 53, 53]), array([62, 62, 62, 62]), array([69, 69, 69, 69]), array([68, 68, 68, 68]), array([ 93, 118, 71, 53])]
import pymc as pm
def hosp_subset(virus):
data_subset = extract_virus(virus)
admissions = data_subset.pop('admissions_2').values
data_subset = data_subset.values
n_months, n_ages = data_subset.shape
# Estimate age distribution of admissions
p_age = pm.Dirichlet('p_age', np.ones(4), value=data_subset.sum(0)[:-1]/data_subset.sum())
counts = pm.Multinomial('counts', data_subset.sum(1), p_age, value=data_subset, observed=True)
# Estimate denominators
n_hosp = []
for i,ni in enumerate(admissions):
d = data_subset[i]
n_init = (ni*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int)[:-1]
n_init = np.append(n_init, ni - n_init.sum())
n_hosp.append(pm.Multinomial('n_hosp_{}'.format(i), ni, p_age, value=n_init))
n_hosp_t = pm.Lambda('n_hosp_t', lambda x=n_hosp: np.array([[xij for xij in xi] for xi in x]).T)
# Virus rates by age and time
mu_alpha = pm.Normal('mu_alpha', 0, 0.01, value=[0]*n_ages)
sigma_alpha = pm.Uniform('sigma_alpha', 0, 100, value=[10]*n_ages)
rho = pm.Uniform('rho', -1, 1, value=0)
mu = [pm.Lambda('mu_{}'.format(i), lambda mu=mu_alpha: np.array([mu[i]]*n_months)) for i in range(n_ages)]
off_diag = np.eye(n_months, k=1)
Sigma = [pm.Lambda('Sigma_{}'.format(i), lambda s=sigma_alpha, r=rho:
(np.eye(n_months) + off_diag*r + off_diag.T*r)*(s[i]**2)) for i in range(n_ages)]
alpha = [pm.MvNormalCov('alpha_{}'.format(i), mu[i], Sigma[i], value=[0]*n_months) for i in range(n_ages)]
p_virus = [pm.Lambda('p_virus_{}'.format(i), lambda a=a: pm.invlogit(a)) for i,a in enumerate(alpha)]
# Viral rate likelihood
@pm.observed
def rate_like(value=data_subset.T, p=p_virus, n=n_hosp_t):
return np.sum([pm.binomial_like(value[i], n[i], p[i]) for i in range(n_ages)])
return(locals())
M = pm.MCMC(hosp_subset('RSV'))
M.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 147.1 sec
pm.Matplot.summary_plot(M.p_age)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
pm.Matplot.summary_plot(M.p_virus[0])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
import seaborn as sb
M.p_virus[0].trace().shape
(10000, 34)
date_labels = [pd.to_datetime(str(d)).strftime('%b %d') for d in rsv_counts.index.values]
# date_labels = rsv_counts.index.values.astype('M8[D]')
# for i in range(4):
ax = sb.tsplot(M.p_virus[0].trace()[-1000:], err_style="unit_traces")
ax.set_xticklabels(date_labels, rotation='vertical')
ax.set_ylabel('Rate');
sb.set(style="white", palette="muted")
f, axes = plt.subplots(2, 2, figsize=(10, 8))
for i,axis in enumerate(np.ravel(axes)):
sb.tsplot(M.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis)
axis.set_title(age_groups.columns[i])
axis.set_xticklabels(date_labels, rotation='vertical')
axis.set_ylabel('Rate');
M_rhino = pm.MCMC(hosp_subset('Rhino'))
M_rhino.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 145.6 sec
age_group_labels = {'age_under_2':'Under 2 yrs', 'age_2_5':'2-5 yrs', 'age_6_11':'6-11 yrs', 'age_over_11':'Over 11 yrs'}
sb.set(style="white", palette="muted")
f, axes = plt.subplots(2, 2, figsize=(10, 8))
for i,axis in enumerate(np.ravel(axes)):
sb.tsplot(M_rhino.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis)
axis.set_title(age_group_labels[age_groups.columns[i]])
axis.set_xticklabels(date_labels, rotation='vertical')
axis.set_ylabel('Rate');
f.suptitle('Rhino', fontsize=14)
<matplotlib.text.Text at 0x1336d2630>
for virus in viruses:
print('\nRunning', virus)
M = pm.MCMC(hosp_subset(virus))
M.sample(50000, 40000)
sb.set(style="white", palette="muted")
f, axes = plt.subplots(2, 2, figsize=(10, 8))
for i,axis in enumerate(np.ravel(axes)):
sb.tsplot(M.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis)
axis.set_title(age_group_labels[age_groups.columns[i]])
axis.set_xticklabels(date_labels, rotation='vertical')
axis.set_ylabel('Rate');
f.suptitle(virus, fontsize=14)
f.savefig(virus)
M.write_csv(virus, variables=[x.__name__ for x in M.p_virus])
Running RSV [-----------------100%-----------------] 50000 of 50000 complete in 385.2 sec Running HMPV [-----------------100%-----------------] 50000 of 50000 complete in 9824.9 sec Running Rhino [-----------------100%-----------------] 50000 of 50000 complete in 376.2 sec Running Influenza [-----------------100%-----------------] 50000 of 50000 complete in 373.6 sec Running Adeno [-----------------100%-----------------] 50000 of 50000 complete in 382.3 sec Running PIV [-----------------100%-----------------] 50000 of 50000 complete in 385.2 sec Running No virus [-----------------100%-----------------] 50000 of 50000 complete in 375.8 sec Running All [-----------------100%-----------------] 50000 of 50000 complete in 365.7 sec