In [2]:

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
import pymc as pm
sb.set_style("white")

Import data

In [3]:

hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()

/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

Out[3]:

	greater_48hrs	fever_neutropenia	never_left	written_consent	child_name	mother_name	mother_birth_date	mother_record	mother_nationality	other_mother_nationality	...	was_whole_blood_obtained_f	date	whole_blood_complete	age_months	length_of_stay	gest_age	death	hospitalized_vitamin_d	wheezing_ind	sex
case_id
A0001	0	0	0	1	Remas Mahmoud Jbarah	Huda Katalo	1976-01-21	NaN	3	NaN	...	NaN	NaN	0	1	6	40	False	3	0	F
A0002	0	0	0	1	Majed Abdel Kareem Majed	Noor SHa'aban Mahmood	1989-09-09	NaN	3	NaN	...	NaN	NaN	0	1	5	40	False	4	1	M
A0003	0	0	0	1	Rayyan Jamal Muhyi Al.Deen	SAra Hussein Muhyi Al.Deen	1965-01-01	NaN	1	NaN	...	NaN	NaN	0	11	10	40	False	35	0	F
A0004	0	0	0	1	Hanan Mohd Mustapha Abu Othman	Kawla Abu Shanab	1983-10-31	NaN	1	NaN	...	NaN	NaN	0	7	3	38	False	2	1	F
A0005	0	0	0	1	Yara Mahmoud Azmi Ismael	Suha Abdel Aziz	1986-02-28	NaN	1	NaN	...	NaN	NaN	0	2	1	39	False	6	0	F

5 rows × 414 columns

Breakdown by:

month of age (and greater than 1 year)
sex
birthweight
prematurity
feeding: % with any breastfeeding
maternal ed
number in household
daycare
smoking exposure: cigarette, maternal during pregnacy, nargalia
% with comorbidity: past medical history
nationality: jordanian, palestinean, syrian, egptian, other
diagnosis: pneumo, bronchiolitis, bronchopnewumonia

Severity measures:

length of stay
ICU
O2
Mechanical vent.
days of symptoms before admission
% with antibiotics prior
% with antibiotics during

Columns:

RSV, MPV, Rhino, Influenza A/B/C, Adeno, Parinfluenza, Hospitalzied pop, no virus

In [4]:

[v for v in hospitalized.columns if v.endswith('hx')]

Out[4]:

['no_hx',
 'diabetes_hx',
 'heart_hx',
 'kidney_hx',
 'downs_hx',
 'sickle_hx',
 'cancer_hx',
 'cf_hx',
 'genetic_hx',
 'cp_hx',
 'seizure_hx',
 'neuro_hx',
 'mr_hx',
 'asthma_hx',
 'immuno_hx',
 'liver_hx',
 'gerd_hx',
 'diarrhea_hx',
 'other_hx',
 'spec_hx',
 'no_asthma_hx',
 'no_allergy_hx',
 'no_eczema_hx']

In [5]:

hospitalized['asthma'] = (hospitalized.child_asthma==1) | hospitalized.asthma_hx
hospitalized.asthma.sum()

Out[5]:

In [6]:

hospitalized.no_hx.value_counts()

Out[6]:

1    2848
0     320
dtype: int64

In [7]:

((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))).sum()

Out[7]:

Past medical history calculated as those who claim to have past medical history and those who claim to have no past medical history and reported asthma.

In [8]:

hospitalized['past medical history'] = (~hospitalized.no_hx.astype(bool) | 
                                ((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))))
hospitalized['past medical history'].sum()

Out[8]:

In [9]:

# Positive culture lookup
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}

In [10]:

hospitalized['RSV'] = hospitalized.pcr_result___1.astype(bool)
hospitalized['HMPV'] = hospitalized.pcr_result___2.astype(bool)
hospitalized['Rhino'] = hospitalized.pcr_result___5.astype(bool)
hospitalized['Influenza'] = (hospitalized.pcr_result___3 | hospitalized.pcr_result___4 |
                             hospitalized.pcr_result___17)
hospitalized['Adeno'] = hospitalized.pcr_result___18.astype(bool)
hospitalized['PIV'] = (hospitalized.pcr_result___6 | hospitalized.pcr_result___7 | 
                       hospitalized.pcr_result___8)
hospitalized['No virus'] = hospitalized[list(pcr_lookup.keys())].sum(1) == 0
hospitalized['All'] = True

In [11]:

viruses = ['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']

In [12]:

non_rsv_lookup = pcr_lookup.copy()
non_rsv_lookup.pop('pcr_result___1')

Out[12]:

'RSV'

Identify individuals with coinfection

In [13]:

hospitalized['coinfection'] = hospitalized[list(pcr_lookup.keys())].sum(1) > 1

Virus frequency by coinfection status

In [14]:

hospitalized[(~hospitalized.coinfection) & (hospitalized.RSV)].shape[0]/hospitalized.shape[0]

Out[14]:

0.2297979797979798

In [15]:

virus_by_coinf = hospitalized.groupby('coinfection')[viruses[:-2]].sum()/hospitalized.shape[0]
virus_by_coinf.T.plot(kind='bar', stacked=True, grid=False, 
                      color=sb.color_palette('Paired', 2)[::-1])

Out[15]:

<matplotlib.axes._subplots.AxesSubplot at 0x1135d17f0>

In [16]:

hospitalized[viruses].sum(0)

Out[16]:

RSV          1397
HMPV          273
Rhino        1238
Influenza     119
Adeno         475
PIV           175
No virus      587
All          3168
dtype: float64

In [17]:

other_virus_index = (hospitalized[list(non_rsv_lookup.keys())].sum(1) > 0).astype(int)

In [18]:

hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['non-RSV virus'] = (hospitalized['pcr_result___1']==0) & other_virus_index
hospitalized['no virus'] = (hospitalized['pcr_result___1']==0) & (other_virus_index==0)

In [19]:

hospitalized["prev_cond"] = (hospitalized[[c for c in hospitalized.columns if c.endswith('hx') 
                                           and not c.startswith('no_')]].sum(1) > 0)

In [20]:

hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under 2 months', '2-11 months', '12-23 months'
hospitalized = hospitalized.join(age_groups)

In [21]:

nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 
                      6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)

In [22]:

hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan

In [23]:

hospitalized['any_cigarette'] = (hospitalized.cigarette_smokers > 0).astype(int)

In [24]:

hospitalized['any_smoke'] = (hospitalized.cigarette_smokers.astype(bool) | 
                             hospitalized.nargila_smokers.astype(bool))

In [25]:

hospitalized.any_smoke.mean()

Out[25]:

0.76546717171717171

In [26]:

hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)

In [27]:

hospitalized.length_of_stay.hist(bins=20)

Out[27]:

<matplotlib.axes._subplots.AxesSubplot at 0x11380f0b8>

Diagnosis

In [28]:

hospitalized['ros'] = hospitalized.adm_sepsis | hospitalized.adm_febrile

In [29]:

hospitalized['pertussis-like cough'] = hospitalized.adm_pertussis | hospitalized.adm_cough

In [30]:

hospitalized['brochopneumonia'] = hospitalized.adm_bronchopneumo

In [31]:

hospitalized['bronchiolitis'] = hospitalized.adm_bronchiolitis

In [32]:

hospitalized['pneumonia'] = hospitalized.adm_pneumo

Rate Estimation¶

In [33]:

hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.admission_date.describe()

Out[33]:

count                    3168
unique                    794
top       2011-02-07 00:00:00
freq                       17
first     2010-03-15 00:00:00
last      2013-03-30 00:00:00
Name: admission_date, dtype: object

Age groups:

Under 2 months
2-5 mo.
6-11 mo.
Over 11 mo.

In [34]:

age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,5,11,24], include_lowest=True))
age_groups.index = hospitalized.index
age_groups.columns = 'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'
hospitalized = hospitalized.join(age_groups)

In [35]:

age_group_lookup = {'age_under_2': '<2', 
                    'age_2_5': '2-5', 
                    'age_6_11': '6-11', 
                    'age_over_11': '>11'}

In [36]:

customcmap = sb.color_palette("coolwarm", 6)

In [37]:

fig, axes = plt.subplots(2,2, figsize=(10,6), sharey=True)
for i,ax in enumerate(np.ravel(axes)):
    age_group = age_groups.columns[i]

    age_subset = hospitalized[hospitalized[age_group].astype(bool)]
    age_virus = age_subset[viruses[:-2]].mean()
    age_virus.T.plot(kind='bar', grid=False, ax=ax, 
                     color=customcmap)
    ax.set_title(age_group_lookup[age_group])

Population rate estimation¶

Recode year to virus season:

2011: March 2010 - March 2011
2012: Apr 2011 - Mar 2012
2013: April 2012 - Mar 2013

In [38]:

hospitalized['virus_year'] = 2011
hospitalized.loc[(hospitalized.admission_date >= '2011-03-31') 
                 & (hospitalized.admission_date <= '2012-03-31'), 'virus_year'] = 2012
hospitalized.loc[hospitalized.admission_date > '2012-03-31', 'virus_year'] = 2013

hospitalized.virus_year.value_counts()

Out[38]:

2012    1191
2013    1179
2011     798
dtype: int64

Recode zones

In [39]:

zone_string = "1, Amman Zone 1 | 2, Abdoun Zone 1 | 3, Abu Alanda Zone 3 | 4, Abu Nusair Zone 2 | 5, Airport street Zone 4 | 6, Al.Ashrafeyeh Zone 5 | 7, Al.Badya Zone 31 | 8, Al.Baqa'a Zone 22 | 9, Al.Ghour Zone 32 | 10, Al.Hashmi Zone 6 | 11, Al.Hezam Zone 29 | 12, Al.Hussein Camping Zone 1 | 13, Al.Istiklal Zone 1 | 14, Al.Jeeza Zone 8 | 15, Al.Joufeh Zone 5 | 16, Al.Karak Zone 35 | 17, Al.Lubban Zone 16 | 18, Al.Madenah Al.Reyadeyeh Zone 1 | 19, Al.Mahatta Zone 6 | 20, Al.Manarah Zone 5 | 21, Al.Mareikh Zone 5 | 22, Al.Marqab Zone 6 | 23, Al.Muhajreen Zone 5 | 24, Al.Musdar Zone 5 | 25, Al.Musherfeh Zone 15 | 26, Al.Naser Zone 6 | 27, Al.Natheef Zone 5 | 28, Al.Nuzha Zone 1 | 29, Al.Qastal Zone 10 | 30, Al.Qwesmeh Zone 5 | 31, Al.Shouneh Zone 32 | 32, Al.Taj Zone 5 | 33, Al.Taybeh Zone 7 | 34, Aqaba Zone 30 | 35, Arjan Zone 1 | 36, Bayader Wadi AL.Seer Zone 20 | 37, Bnayyat Zone 12 | 38, D.Al.Ameer Ali Zone 13 | 39, D.Al.Aqsa Zone 9 | 40, D.Haj Hasan Zone 9 | 41, Daheyet Al.Rasheed Zone 17 | 42, Daheyet al.Yasmeen Zone 1 | 43, Dead Sea Zone 32 | 44, Deer.Al.Ghbar Zone 1 | 45, Down Town (Al.Balad) Zone 1 | 46, Dra'a al.Qarbi Zone 1 | 47, Ein Al.Basha Zone 22 | 48, Ein Ghazal Zone 6 | 49, Eskan Al.Ameer Hashem Zone 29 | 50, Eskan Al.Ameer Talal Zone 29 | 51, Etha'a wal Telvesion Zone 9 | 52, Hay Al.Dabaybeh Zone 5 | 53, Hay Al.Tafayleh Zone 5 | 54, Hay Nazzal Zone 1 | 55, Huttein (shneler ) Refugee camping Zone 6 | 56, Iraq Al.Ameer Zone 23 | 57, Jabal AL.Akhdar Zone 1 | 58, Jabal Al.Ameer Faisal Zone 29 | 59, Jabal AL.Hadeed Zone 5 | 60, Jabal Al.Hussein Zone 1 | 61, Jabal Al.Qosoor Zone 1 | 62, Jabal Amman Zone 1 | 63, Jarash Zone 36 | 64, Jawa Zone 7 | 65, Juwaydeh Zone 14 | 66, Khalda Zone 18 | 67, Khreibt Al.Souk Zone 7 | 68, Ma'an Zone 31 | 69, Madaba Zone 34 | 70, Mafraq Zone 33 | 71, Marj Al.Hamam Zone 19 | 72, Marka Zone 6 | 73, Muqableen Zone 9 | 74, Muwaqqar Zone 28 | 75, Nadi Al.Sebaq Zone 6 | 76, Naur Zone 19 | 77, Petra Zone 30 | 78, Qatranah Zone 30 | 79, Raghadan Zone 1 | 80, Ras Al.Ein Zone 1 | 81, Rusayfah Zone 29 | 82, Saffout Zone 22 | 83, Sahab Zone 11 | 84, Salheyet Al.Abed Zone 6 | 85, Shafa Badran Zone 25 | 86, Sharq Al.Awsat Zone 11 | 87, Shemasani Zone 1 | 88, Street 30 Zone 5 | 89, Summaya Street Zone 5 | 90, Suweileh Zone 27 | 91, Tabarbour Zone 21 | 92, Tla'a al Ali Zone 17 | 93, Um Al.Heran Zone 11 | 94, Um Al.Summaq Zone 17 | 95, Um Nuwwara and Adan Zone 5 | 96, Um Uthayna Zone 1 | 97, Wadi Abdoun Zone 1 | 98, Wadi AL.Haddadeh Zone 1 | 99, Wadi AL.Hajar Zone 29 | 100, Wadi Al.Remam Zone 5 | 101, Wadi AL.Seer Zone 20 | 102, Wadi Saqra Zone 1 | 103, Wehdat Zone 11 | 104, Yadoudeh Zone 13 | 105, Yajouz Zone 26 | 106, Zarqa Zone 29 | 107, Zezya Zone 24"

In [40]:

zones = [z.strip().split(',') for z in zone_string.split('|')]

In [41]:

zone_dict = {int(n):int(s.strip().split(' ')[-1]) for n,s in zones}

In [42]:

hospitalized['zone'] = hospitalized.city_zone.replace(zone_dict)

Define Amman zone

In [43]:

hospitalized['amman_zone'] = hospitalized.zone<28

Hospitalized in Amman

In [44]:

hospitalized_amman = hospitalized[hospitalized.amman_zone]

In [45]:

hospitalized_amman.shape

Out[45]:

(3048, 451)

In [46]:

hospitalized_amman.index.is_unique

Out[46]:

True

In [47]:

hosp_age_counts = hospitalized_amman.groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum').values

In [48]:

pre_2013 = hospitalized_amman.admission_date <  datetime(2013, 1, 1)

In [49]:

age_counts = hospitalized_amman[pre_2013].groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum')
age_counts

Out[49]:

	age_under_2	age_2_5	age_6_11	age_over_11
admission_date
2010-03-31	15	16	14	3
2010-04-30	17	22	14	6
2010-05-31	12	16	11	2
2010-06-30	21	6	7	3
2010-07-31	19	6	2	2
2010-08-31	7	8	3	3
2010-09-30	10	5	4	2
2010-10-31	5	5	4	4
2010-11-30	5	5	8	7
2010-12-31	7	8	5	10
2011-01-31	32	58	34	21
2011-02-28	54	67	39	24
2011-03-31	40	31	18	23
2011-04-30	23	37	26	12
2011-05-31	24	34	11	17
2011-06-30	25	17	13	9
2011-07-31	20	16	10	10
2011-08-31	21	7	0	3
2011-09-30	17	8	5	8
2011-10-31	20	9	6	10
2011-11-30	10	14	7	14
2011-12-31	23	21	21	18
2012-01-31	60	68	37	37
2012-02-29	55	83	39	25
2012-03-31	49	71	37	33
2012-04-30	34	43	32	23
2012-05-31	34	27	15	11
2012-06-30	23	21	15	10
2012-07-31	29	18	8	14
2012-08-31	21	11	7	5
2012-09-30	13	17	15	10
2012-10-31	16	20	4	9
2012-11-30	11	17	5	9
2012-12-31	43	46	31	17

Rates and demographics (via World Bank) and 2004 census data (via Jordan Department of Statistics).

In [50]:

# Jordan population, 2008-2012
population = 5786000, 5915000, 6046000, 6181000, 6318000

# Interpolated population by gender and age, 2010-2012
female_0 = 93649, 95739, 96486
male_0 = 98435, 100525, 101160
female_1 = 87941, 93511, 93067
male_1 = 92548, 98306, 97763

kids_0 = np.array(female_0) + np.array(male_0)
kids_1 = np.array(female_1) + np.array(male_1)
kids = kids_0 + kids_1

kids_under6mo = kids_6to12mo = kids_0/2.

# Proportion in Amman
amman_urban_2004 = 1784502.
jordan_2004 = 5103639.
amman_prop = amman_urban_2004 / jordan_2004

# Birth rates (per 1000)
birth_rate = 29.665, 29.322, 28.869, 28.317, 27.699

# Neonatal mortality (per 1000)
neonatal_mort = 12.7, 12.4, 12.1, 11.8, 11.5

# Infant mortality (per 1000)
infant_mort = 18.4, 17.9, 17.3, 16.8, 16.4

In [51]:

births = np.array(population[-3:])/1000. * birth_rate[-3:]
births

Out[51]:

array([ 174541.974,  175027.377,  175002.282])

In [52]:

births_6m = births/2.
births_6m

Out[52]:

array([ 87270.987 ,  87513.6885,  87501.141 ])

In [53]:

deaths_6m = births_6m/1000. * infant_mort[-3:]
deaths_6m

Out[53]:

array([ 1509.7880751,  1470.2299668,  1435.0187124])

In [54]:

amman_prop

Out[54]:

0.3496528653378501

In [55]:

(births_6m - deaths_6m)*amman_prop

Out[55]:

array([ 29986.6489389 ,  30085.34181971,  30093.26626638])

In [56]:

n = np.array((kids_under6mo, kids_6to12mo, kids_1, kids))
n

Out[56]:

array([[  96042.,   98132.,   98823.],
       [  96042.,   98132.,   98823.],
       [ 180489.,  191817.,  190830.],
       [ 372573.,  388081.,  388476.]])

In [57]:

kids_0

Out[57]:

array([192084, 196264, 197646])

In [58]:

kids_1/kids_0

Out[58]:

array([ 0.93963578,  0.97734174,  0.9655141 ])

In [59]:

amman_prop

Out[59]:

0.3496528653378501

In [60]:

n_amman = np.floor(n*amman_prop).astype(int)

In [61]:

n_amman

Out[61]:

array([[ 33581,  34312,  34553],
       [ 33581,  34312,  34553],
       [ 63108,  67069,  66724],
       [130271, 135693, 135831]])

Monthly admissions in Ammaon

In [62]:

admissions_by_month = hospitalized_amman.groupby('admission_date')['child_name'].count().resample('1M', how='sum')

In [63]:

admissions_by_month.head()

Out[63]:

admission_date
2010-03-31    48
2010-04-30    59
2010-05-31    41
2010-06-30    37
2010-07-31    29
Freq: M, Name: child_name, dtype: int64

In [64]:

# Dict to return pcr_result variable corresponding to virus
virus_lookup = {pcr_lookup[k]: k for k in pcr_lookup.keys()}

In [65]:

# Enrolling 5 days per week
p_enroll = 5./7

In [66]:

rsv_subset = hospitalized_amman[hospitalized_amman.RSV==1]
rsv_subset.shape

Out[66]:

(1354, 451)

'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'

In [67]:

viruses

Out[67]:

['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']

In [68]:

virus_subset = hospitalized_amman[hospitalized['RSV']==1].copy()
virus_subset['admission_year'] = virus_subset.admission_date.apply(lambda x: x.year)
_under_6 = virus_subset[virus_subset.age_under_2.astype(bool) | virus_subset.age_2_5].groupby('admission_year')['child_name'].count()

/usr/local/lib/python3.4/site-packages/pandas/core/frame.py:1819: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)

In [69]:

_6_11 = virus_subset[virus_subset.age_6_11.astype(bool)].groupby('admission_year')['child_name'].count()

In [70]:

_over_11 = virus_subset[virus_subset.age_over_11.astype(bool)].groupby('admission_year')['child_name'].count()

In [71]:

_all = virus_subset.groupby('admission_year')['child_name'].count()

In [72]:

virus_subset.set_index(virus_subset.admission_date).groupby([pd.TimeGrouper('M')]).count()['mother_name']

Out[72]:

admission_date
2010-03-31     28
2010-04-30     20
2010-05-31      3
2010-06-30      2
2010-09-30      1
2010-10-31      1
2010-11-30      4
2010-12-31     14
2011-01-31    131
2011-02-28    145
2011-03-31     48
2011-04-30     13
2011-05-31      4
2011-06-30      1
2011-09-30      1
2011-10-31      1
2011-11-30      2
2011-12-31     24
2012-01-31    145
2012-02-29    138
2012-03-31    116
2012-04-30     71
2012-05-31     20
2012-06-30     14
2012-07-31      1
2012-10-31      1
2012-11-30      3
2012-12-31     89
2013-01-31    153
2013-02-28    114
2013-03-31     46
Name: mother_name, dtype: int64

In [73]:

virus_df = pd.concat([_under_6, _6_11, _over_11, _all], axis=1)
virus_df.columns = ('under 6 mo.', '6-11 mo.', '11-23 mo.', 'all under 2 yr.')
virus_df

Out[73]:

	under 6 mo.	6-11 mo.	11-23 mo.	all under 2 yr.
admission_year
2010	49	20	4	73
2011	255	77	38	370
2012	400	113	85	598
2013	219	62	32	313

In [74]:

virus_df.values.ravel()

Out[74]:

array([ 49,  20,   4,  73, 255,  77,  38, 370, 400, 113,  85, 598, 219,
        62,  32, 313])

Use proportion in Amman to calculate proportion of kids in each age group and year in Amman

In [79]:

def rate_model(virus):
    
    # Extract data subset for passed virus
    virus_subset = hospitalized_amman[hospitalized_amman[virus]==1].copy()
    
    # Create data frame of age x year counts
    u6 = virus_subset.age_under_2.astype(bool) | virus_subset.age_2_5
    _under_6 = virus_subset[u6].groupby('virus_year')['child_name'].count()
    _6_11 = virus_subset[virus_subset.age_6_11.astype(bool)].groupby('virus_year')['child_name'].count()
    _over_11 = virus_subset[virus_subset.age_over_11.astype(bool)].groupby('virus_year')['child_name'].count()
    _all = virus_subset.groupby('virus_year')['child_name'].count()
    
    virus_df = pd.concat([_under_6, _6_11, _over_11, _all], axis=1)
    virus_df.columns = ('under 6 mo.', '6-11 mo.', '11-23 mo.', 'all ages')
    
    # Al Bashir hospital market share
    market_share = pm.Uniform('market_share', 0.5, 0.6)
    
    # Prior probability
    prev_virus = [pm.Beta('prev_virus_%i' %i, 1, 5) for i in range(virus_df.size)]
    per_1000 = pm.Lambda('per_1000', lambda p=prev_virus: np.array(p)*1000)
        
    # Correct for 5 days of enrollment per week
    p_hosp = market_share * (5./7)
    
    # RSV in Amman
    n_hosp = pm.Binomial('n_hosp', n_amman.T.ravel(), p_hosp, value=n_amman.T.ravel()*0.2)

    # Likelihood for number with RSV in hospital (assumes Pr(hosp | RSV) = 1)
    y_hosp = pm.Binomial('y_hosp', n_hosp, prev_virus, value=virus_df.values.ravel(), observed=True)
    
    return locals()

Influenza rates¶

In [80]:

M = pm.MCMC(rate_model('Influenza'))

In [81]:

M.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 79.8 sec

In [82]:

M.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 72.9 sec

In [94]:

prevalence_labels = ['%s %s' % (year, age) for year in ('2011', '2012', '2013') 
                     for age in ('under 6 mo.', '6-11 mo.', '12-23 mo.', 'all ages')]

In [95]:

pm.Matplot.summary_plot(M.per_1000, custom_labels=prevalence_labels, main='Influenza (per 1000)')

In [96]:

M.per_1000.summary()

per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	0.857            0.261            0.006            [ 0.381  1.352]
	0.627            0.227            0.006            [ 0.228  1.069]
	0.332            0.119            0.003            [ 0.131  0.578]
	0.504            0.102            0.003            [ 0.314  0.706]
	1.524            0.357            0.01             [ 0.889  2.23 ]
	0.534            0.207            0.004            [ 0.163  0.937]
	0.435            0.133            0.003            [ 0.186  0.695]
	0.696            0.121            0.004            [ 0.476  0.937]
	2.507            0.459            0.015            [ 1.633  3.396]
	0.837            0.254            0.006            [ 0.366  1.334]
	0.594            0.157            0.004            [ 0.304  0.905]
	1.103            0.157            0.006            [ 0.822  1.419]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	0.438            0.673           0.825          1.013         1.44
	0.266            0.464           0.601          0.763         1.126
	0.141            0.246           0.318          0.405         0.6
	0.323            0.432           0.497          0.567         0.721
	0.929            1.261           1.492          1.756         2.293
	0.213            0.387           0.503          0.648         1.017
	0.214            0.34            0.418          0.517         0.738
	0.49             0.612           0.688          0.771         0.957
	1.708            2.179           2.483          2.8           3.487
	0.406            0.66            0.811          0.99          1.41
	0.328            0.483           0.582          0.693         0.936
	0.824            0.992           1.098          1.206         1.423

In [116]:

M.write_csv('Influenza_per_1000', variables=['per_1000'])

HMPV rates¶

In [83]:

M_hmpv = pm.MCMC(rate_model('HMPV'))
# M_hmpv.
M_hmpv.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 72.2 sec

In [84]:

M_hmpv.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 66.1 sec

In [97]:

pm.Matplot.summary_plot(M_hmpv.per_1000, custom_labels=prevalence_labels, main='HMPV (per 1000)')

In [98]:

M_hmpv.per_1000.summary()

per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	1.514            0.344            0.009            [ 0.88   2.205]
	0.843            0.255            0.005            [ 0.372  1.339]
	0.361            0.121            0.003            [ 0.148  0.599]
	0.747            0.125            0.004            [ 0.528  1.021]
	7.233            0.791            0.033            [ 5.727  8.785]
	2.913            0.479            0.015            [ 2.016  3.893]
	1.232            0.224            0.006            [ 0.821  1.673]
	3.134            0.275            0.014            [ 2.602  3.661]
	1.927            0.394            0.011            [ 1.208  2.722]
	1.935            0.394            0.01             [ 1.204  2.729]
	0.501            0.143            0.003            [ 0.235  0.774]
	1.189            0.16             0.006            [ 0.879  1.487]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	0.908            1.277           1.491          1.72          2.267
	0.412            0.662           0.817          0.998         1.397
	0.171            0.271           0.349          0.433         0.633
	0.527            0.658           0.741          0.827         1.021
	5.776            6.675           7.201          7.742         8.857
	2.037            2.582           2.884          3.227         3.935
	0.831            1.074           1.216          1.377         1.692
	2.626            2.936           3.126          3.316         3.703
	1.226            1.646           1.902          2.176         2.756
	1.245            1.662           1.917          2.179         2.793
	0.263            0.4             0.489          0.588         0.823
	0.901            1.077           1.18           1.292         1.522

In [115]:

M_hmpv.write_csv('HMPV_per_1000', variables=['per_1000'])

Rhino rates¶

In [104]:

M_rhino = pm.MCMC(rate_model('Rhino'))
M_rhino.sample(150000, 140000)

 [-----------------100%-----------------] 150000 of 150000 complete in 97.0 sec

In [105]:

M_rhino.sample(150000, 140000)

 [-----------------100%-----------------] 150000 of 150000 complete in 95.9 sec

In [106]:

pm.Matplot.summary_plot(M_rhino.per_1000, custom_labels=prevalence_labels, main='Rhino (per 1000)')

In [108]:

M_rhino.per_1000.summary()

per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	12.805           1.126            0.051          [ 10.72   15.081]
	3.505            0.55             0.016            [ 2.494  4.58 ]
	1.281            0.238            0.006            [ 0.848  1.759]
	4.806            0.359            0.019            [ 4.124  5.505]
	20.541           1.461            0.078          [ 17.563  23.244]
	5.739            0.689            0.024            [ 4.343  7.034]
	2.654            0.335            0.011            [ 2.027  3.312]
	7.909            0.482            0.029            [ 7.019  8.907]
	27.348           1.741            0.105          [ 23.963  30.637]
	8.21             0.831            0.032            [ 6.674  9.902]
	3.025            0.357            0.013              [ 2.32  3.71]
	10.494           0.59             0.038          [  9.355  11.603]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	10.736           12.006          12.788         13.541        15.105
	2.533            3.125           3.479          3.858         4.645
	0.868            1.109           1.271          1.431         1.79
	4.135            4.559           4.793          5.043         5.538
	17.808           19.486          20.539         21.543        23.54
	4.435            5.27            5.728          6.188         7.173
	2.057            2.419           2.642          2.874         3.345
	6.988            7.585           7.887          8.222         8.891
	24.089           26.128          27.335         28.504        30.801
	6.674            7.623           8.194          8.748         9.911
	2.35             2.772           3.008          3.257         3.758
	9.394            10.085          10.466         10.911        11.66

In [114]:

M_rhino.write_csv('Rhino_per_1000', variables=['per_1000'])

In [96]:

# The data
M_rhino.virus_df

Out[96]:

	under 6 mo.	6-11 mo.	11-23 mo.
virus_year
2011	163	44	30
2012	267	74	67
2013	359	107	76

RSV rates¶

In [87]:

M_rsv = pm.MCMC(rate_model('RSV'))
M_rsv.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 72.2 sec

In [88]:

M_rsv.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 72.6 sec

In [102]:

pm.Matplot.summary_plot(M_rsv.per_1000, custom_labels=prevalence_labels, main='RSV (per 1000)')

In [100]:

M_rsv.per_1000.summary()

per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	21.12            1.705            0.118          [ 17.914  24.408]
	6.82             0.805            0.038            [ 5.35   8.415]
	1.604            0.266            0.01             [ 1.112  2.117]
	23.477           1.859            0.132          [ 19.802  26.795]
	6.046            0.742            0.034            [ 4.659  7.5  ]
	2.29             0.32             0.013            [ 1.689  2.909]
	25.899           2.009            0.148          [ 22.041  29.442]
	8.049            0.895            0.046            [ 6.453  9.859]
	2.495            0.339            0.015            [ 1.85   3.163]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	17.979           19.899          21.076         22.3          24.517
	5.363            6.256           6.796          7.367         8.446
	1.131            1.421           1.584          1.768         2.17
	20.054           22.132          23.405         24.851        27.108
	4.692            5.523           6.001          6.54          7.563
	1.714            2.067           2.272          2.499         2.954
	22.313           24.397          25.845         27.345        29.759
	6.446            7.416           8.008          8.637         9.853
	1.88             2.257           2.485          2.711         3.216

In [117]:

M_rsv.write_csv('RSV_per_1000', variables=['per_1000'])

In [101]:

M_rsv.virus_df

Out[101]:

	under 6 mo.	6-11 mo.	11-23 mo.
virus_year
2011	272	87	38
2012	308	79	58
2013	343	106	63

Adeno rates¶

In [89]:

M_adeno = pm.MCMC(rate_model('Adeno'))
M_adeno.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 71.6 sec

In [90]:

M_adeno.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 72.6 sec

In [119]:

pm.Matplot.summary_plot(M_adeno.per_1000, custom_labels=prevalence_labels, main='Adeno (per 1000)')

In [120]:

M_adeno.per_1000.summary()

per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	2.672            0.471            0.011            [ 1.774  3.582]
	2.097            0.412            0.009            [ 1.333  2.925]
	0.94             0.203            0.004            [ 0.57   1.338]
	1.643            0.19             0.005            [ 1.289  2.018]
	5.585            0.677            0.017            [ 4.31   6.928]
	3.228            0.494            0.011            [ 2.34   4.224]
	1.776            0.274            0.007            [ 1.271  2.338]
	3.065            0.26             0.008            [ 2.565  3.571]
	10.74            0.936            0.031          [  8.993  12.639]
	3.83             0.546            0.011            [ 2.754  4.895]
	1.781            0.271            0.006            [ 1.259  2.303]
	4.539            0.32             0.013            [ 3.903  5.15 ]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	1.825            2.339           2.647          2.965         3.672
	1.37             1.805           2.06           2.349         2.974
	0.6              0.794           0.92           1.064         1.377
	1.296            1.512           1.633          1.769         2.032
	4.368            5.101           5.565          6.023         7.002
	2.345            2.879           3.211          3.537         4.239
	1.28             1.59            1.758          1.95          2.358
	2.57             2.883           3.062          3.239         3.585
	8.965            10.12           10.72          11.318        12.631
	2.836            3.44            3.802          4.18          5.0
	1.293            1.59            1.767          1.956         2.346
	3.919            4.325           4.54           4.749         5.174

In [118]:

M_adeno.write_csv('Adeno_per_1000', variables=['per_1000'])

In [106]:

M_adeno.virus_df

Out[106]:

	under 6 mo.	6-11 mo.	11-23 mo.
virus_year
2011	32	25	21
2012	70	40	43
2013	136	48	43

PIV rates¶

In [91]:

M_piv = pm.MCMC(rate_model('PIV'))
M_piv.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 71.6 sec

In [92]:

M_piv.sample(100000, 90000)

 [-----------------100%-----------------] 100000 of 100000 complete in 66.9 sec

In [121]:

pm.Matplot.summary_plot(M_piv.per_1000, custom_labels=prevalence_labels, main='PIV (per 1000)')

In [110]:

M_piv.per_1000.summary()

per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	1.37             0.332            0.006            [ 0.776  2.032]
	0.561            0.214            0.004            [ 0.193  0.989]
	0.385            0.128            0.002            [ 0.159  0.646]
	2.91             0.471            0.01             [ 2.015  3.87 ]
	0.946            0.276            0.005            [ 0.466  1.524]
	0.887            0.186            0.004            [ 0.571  1.287]
	3.66             0.542            0.011            [ 2.663  4.798]
	1.48             0.347            0.007            [ 0.811  2.141]
	0.402            0.126            0.002            [ 0.171  0.65 ]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	0.801            1.132           1.338          1.58          2.088
	0.227            0.406           0.535          0.684         1.063
	0.176            0.293           0.375          0.462         0.678
	2.05             2.581           2.884          3.207         3.924
	0.476            0.747           0.92           1.111         1.547
	0.567            0.75            0.875          1.004         1.285
	2.664            3.297           3.629          4.01          4.8
	0.886            1.229           1.448          1.693         2.26
	0.195            0.314           0.386          0.477         0.686

In [122]:

M_piv.write_csv('PIV_per_1000', variables=['per_1000'])

In [111]:

M_piv.virus_df

Out[111]:

	under 6 mo.	6-11 mo.	11-23 mo.
virus_year
2011	16	6	8
2012	36	11	21
2013	46	18	9

Hospitalization¶

Import hospitalization data

In [123]:

spreadsheets = !ls data/Al-Bashir*

data = []
col_names = 'date', 'admissions', 'admissions_2', 'resp_admissions', 'resp_admissions_2'
for spreadsheet in spreadsheets:
    this_file = pd.ExcelFile(spreadsheet)
    # Need to fix data entry error in years > 2011
    dot = spreadsheet.find('.')
    fix_columns = int(spreadsheet[dot-4:dot]) > 2011
    for name in this_file.sheet_names:
        d = this_file.parse(name)
        d.columns = col_names
        if fix_columns:
            total = d['resp_admissions'] + d['resp_admissions_2']
            d['resp_admissions_2'] = d['resp_admissions']
            d['resp_admissions'] = total
        data.append(d)
        
hospitalizations = pd.concat(data).set_index('date')

In [124]:

hospitalizations.head()

Out[124]:

	admissions	admissions_2	resp_admissions	resp_admissions_2
date
2010-03-14	23	21	15	14
2010-03-15	27	23	19	18
2010-03-16	11	8	6	3
2010-03-17	17	10	10	8
2010-03-18	23	17	12	10

In [125]:

hospitalizations['virus_year'] = 2011
hospitalizations.loc[(hospitalizations.index >= datetime(2011, 3, 31)) 
                 & (hospitalizations.index <= datetime(2012, 3, 31)), 'virus_year'] = 2012
hospitalizations.loc[hospitalizations.index > datetime(2012, 3, 31), 'virus_year'] = 2013

In [126]:

under_2_hosp = hospitalizations.groupby('virus_year').sum()['admissions_2']
under_2_hosp

Out[126]:

virus_year
2011    4123
2012    3865
2013    3776
Name: admissions_2, dtype: int64

In [127]:

rsv_by_year = rsv_subset.groupby('virus_year')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
rsv_counts = rsv_by_year.sum()

In [128]:

rsv_counts

Out[128]:

	age_under_2	age_2_5	age_6_11	age_over_11
virus_year
2011	119	153	87	38
2012	124	184	79	58
2013	159	184	106	63

In [129]:

under2_rsv = pd.concat([rsv_counts, under_2_hosp], axis=1).dropna()
under2_rsv

Out[129]:

	age_under_2	age_2_5	age_6_11	age_over_11	admissions_2
virus_year
2011	119	153	87	38	4123
2012	124	184	79	58	3865
2013	159	184	106	63	3776

In [130]:

def extract_virus(virus):
    
    virus_subset = hospitalized[hospitalized[virus]==1]
    
    virus_by_year = virus_subset.groupby('virus_year')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
    virus_counts = virus_by_year.sum()
    
    return pd.concat([virus_counts, under_2_hosp], axis=1).dropna()

In [131]:

def hosp_subset(virus):
    
    data_subset = extract_virus(virus)
    
    admissions = data_subset.pop('admissions_2').values
    data_subset = data_subset.values
    
    n_months, n_ages = data_subset.shape
    
    # Estimate age distribution of admissions
    p_age = pm.Dirichlet('p_age', np.ones(4), value=data_subset.sum(0)[:-1]/data_subset.sum())    
    counts = pm.Multinomial('counts', data_subset.sum(1), p_age, value=data_subset, observed=True)
    
    # Estimate denominators   
    n_hosp = []
    for i,ni in enumerate(admissions):
        
        d = data_subset[i]
        
        n_init = (ni*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int)[:-1]
        n_init = np.append(n_init, ni - n_init.sum())
        
        n_hosp.append(pm.Multinomial('n_hosp_{}'.format(i), ni, p_age, value=n_init))
    
    n_hosp_t = pm.Lambda('n_hosp_t', lambda x=n_hosp: np.array([[xij for xij in xi] for xi in x]).T)
    
    # Virus rates by age and time    
    mu_alpha = pm.Normal('mu_alpha', 0, 0.01, value=[0]*n_ages)
    sigma_alpha = pm.Uniform('sigma_alpha', 0, 100, value=[10]*n_ages)
    rho = pm.Uniform('rho', -1, 1, value=0)

    mu =  [pm.Lambda('mu_{}'.format(i), lambda mu=mu_alpha: np.array([mu[i]]*n_months)) for i in range(n_ages)]
    off_diag = np.eye(n_months, k=1)
    Sigma = [pm.Lambda('Sigma_{}'.format(i), lambda s=sigma_alpha, r=rho: 
                       (np.eye(n_months) + off_diag*r + off_diag.T*r)*(s[i]**2)) for i in range(n_ages)]

    alpha = [pm.MvNormalCov('alpha_{}'.format(i), mu[i], Sigma[i], value=[0]*n_months) for i in range(n_ages)]
    
    p_virus = [pm.Lambda('p_virus_{}'.format(i), lambda a=a: pm.invlogit(a)) for i,a in enumerate(alpha)]
    
    # Viral rate likelihood
    @pm.observed
    def rate_like(value=data_subset.T, p=p_virus, n=n_hosp_t):
        return np.sum([pm.binomial_like(value[i], n[i], p[i]) for i in range(n_ages)])
    
    return(locals())

In [132]:

M = pm.MCMC(hosp_subset('RSV'))

In [133]:

M.sample(20000, 10000)

 [-----------------100%-----------------] 20000 of 20000 complete in 32.0 sec

In [134]:

pm.Matplot.summary_plot(M.p_age, custom_labels=['age_under_2', 'age_2_5', 'age_6_11'])

Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.

In [135]:

date_labels = ['2011', '2012', '2013']
pm.Matplot.summary_plot(M.p_virus[0], custom_labels=date_labels, main='RSV rates')

Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.

In [136]:

import seaborn as sb

In [137]:

age_group_labels = ['Under 2 mo.', '2-5 mo.', '6-11 mo.', '12-23 mo.']

In [138]:

rsv_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t], 
                                   'age': age_group_labels[i], 
                                   'year': date_labels[t]}) for i,p in enumerate(M.p_virus) for t in range(3)])

In [139]:

sb.factorplot('year', 'rate', 'age', rsv_rates, kind='box', aspect=1.75)

Out[139]:

<seaborn.axisgrid.FacetGrid at 0x11a7a8780>

In [140]:

M_rhino = pm.MCMC(hosp_subset('Rhino'))
M_rhino.sample(20000, 10000)

 [-----------------100%-----------------] 20000 of 20000 complete in 31.5 sec

In [141]:

rhino_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t], 
                                   'age': age_group_labels[i], 
                                   'year': date_labels[t]}) for i,p in enumerate(M_rhino.p_virus) for t in range(3)])

In [142]:

sb.set(style="white", palette="muted")

fg = sb.factorplot('year', 'rate', 'age', rhino_rates, kind='box', aspect=1.75)

Production runs¶

In [143]:

for virus in viruses:
    
    print('\nRunning', virus)
    M = pm.MCMC(hosp_subset(virus))
    M.sample(50000, 40000)
    
    sb.set(style="white", palette="muted")
    
    virus_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t], 
                                   'age': age_group_labels[i], 
                                   'year': date_labels[t]}) for i,p in enumerate(M.p_virus) for t in range(3)])
    
    fg = sb.factorplot('year', 'rate', 'age', virus_rates, kind='box', aspect=1.75)
    fg.savefig(virus)
    
    M.write_csv(virus, variables=[x.__name__ for x in M.p_virus])

Running RSV
 [-----------------100%-----------------] 50000 of 50000 complete in 77.9 sec
Running HMPV
 [-----------------100%-----------------] 50000 of 50000 complete in 83.4 sec
Running Rhino
 [-----------------100%-----------------] 50000 of 50000 complete in 79.5 sec
Running Influenza
 [-----------------100%-----------------] 50000 of 50000 complete in 78.9 sec
Running Adeno
 [-----------------100%-----------------] 50000 of 50000 complete in 78.6 sec
Running PIV
 [-----------------100%-----------------] 50000 of 50000 complete in 78.0 sec
Running No virus
 [-----------------100%-----------------] 50000 of 50000 complete in 78.0 sec
Running All
 [-----------------100%-----------------] 50000 of 50000 complete in 78.2 sec