In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
import pymc as pm
sb.set_style("white")

Import data

In [3]:
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
Out[3]:
greater_48hrs fever_neutropenia never_left written_consent child_name mother_name mother_birth_date mother_record mother_nationality other_mother_nationality ... was_whole_blood_obtained_f date whole_blood_complete age_months length_of_stay gest_age death hospitalized_vitamin_d wheezing_ind sex
case_id
A0001 0 0 0 1 Remas Mahmoud Jbarah Huda Katalo 1976-01-21 NaN 3 NaN ... NaN NaN 0 1 6 40 False 3 0 F
A0002 0 0 0 1 Majed Abdel Kareem Majed Noor SHa'aban Mahmood 1989-09-09 NaN 3 NaN ... NaN NaN 0 1 5 40 False 4 1 M
A0003 0 0 0 1 Rayyan Jamal Muhyi Al.Deen SAra Hussein Muhyi Al.Deen 1965-01-01 NaN 1 NaN ... NaN NaN 0 11 10 40 False 35 0 F
A0004 0 0 0 1 Hanan Mohd Mustapha Abu Othman Kawla Abu Shanab 1983-10-31 NaN 1 NaN ... NaN NaN 0 7 3 38 False 2 1 F
A0005 0 0 0 1 Yara Mahmoud Azmi Ismael Suha Abdel Aziz 1986-02-28 NaN 1 NaN ... NaN NaN 0 2 1 39 False 6 0 F

5 rows × 414 columns

Breakdown by:

  • month of age (and greater than 1 year)
  • sex
  • birthweight
  • prematurity
  • feeding: % with any breastfeeding
  • maternal ed
  • number in household
  • daycare
  • smoking exposure: cigarette, maternal during pregnacy, nargalia
  • % with comorbidity: past medical history
  • nationality: jordanian, palestinean, syrian, egptian, other
  • diagnosis: pneumo, bronchiolitis, bronchopnewumonia

Severity measures:

  • length of stay
  • ICU
  • O2
  • Mechanical vent.
  • days of symptoms before admission
  • % with antibiotics prior
  • % with antibiotics during

Columns:

  • RSV, MPV, Rhino, Influenza A/B/C, Adeno, Parinfluenza, Hospitalzied pop, no virus
In [4]:
[v for v in hospitalized.columns if v.endswith('hx')]
Out[4]:
['no_hx',
 'diabetes_hx',
 'heart_hx',
 'kidney_hx',
 'downs_hx',
 'sickle_hx',
 'cancer_hx',
 'cf_hx',
 'genetic_hx',
 'cp_hx',
 'seizure_hx',
 'neuro_hx',
 'mr_hx',
 'asthma_hx',
 'immuno_hx',
 'liver_hx',
 'gerd_hx',
 'diarrhea_hx',
 'other_hx',
 'spec_hx',
 'no_asthma_hx',
 'no_allergy_hx',
 'no_eczema_hx']
In [5]:
hospitalized['asthma'] = (hospitalized.child_asthma==1) | hospitalized.asthma_hx
hospitalized.asthma.sum()
Out[5]:
81
In [6]:
hospitalized.no_hx.value_counts()
Out[6]:
1    2848
0     320
dtype: int64
In [7]:
((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))).sum()
Out[7]:
55

Past medical history calculated as those who claim to have past medical history and those who claim to have no past medical history and reported asthma.

In [8]:
hospitalized['past medical history'] = (~hospitalized.no_hx.astype(bool) | 
                                ((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))))
hospitalized['past medical history'].sum()
Out[8]:
375
In [9]:
# Positive culture lookup
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}
In [10]:
hospitalized['RSV'] = hospitalized.pcr_result___1.astype(bool)
hospitalized['HMPV'] = hospitalized.pcr_result___2.astype(bool)
hospitalized['Rhino'] = hospitalized.pcr_result___5.astype(bool)
hospitalized['Influenza'] = (hospitalized.pcr_result___3 | hospitalized.pcr_result___4 |
                             hospitalized.pcr_result___17)
hospitalized['Adeno'] = hospitalized.pcr_result___18.astype(bool)
hospitalized['PIV'] = (hospitalized.pcr_result___6 | hospitalized.pcr_result___7 | 
                       hospitalized.pcr_result___8)
hospitalized['No virus'] = hospitalized[list(pcr_lookup.keys())].sum(1) == 0
hospitalized['All'] = True
In [11]:
viruses = ['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']
In [12]:
non_rsv_lookup = pcr_lookup.copy()
non_rsv_lookup.pop('pcr_result___1')
Out[12]:
'RSV'

Identify individuals with coinfection

In [13]:
hospitalized['coinfection'] = hospitalized[list(pcr_lookup.keys())].sum(1) > 1

Virus frequency by coinfection status

In [14]:
hospitalized[(~hospitalized.coinfection) & (hospitalized.RSV)].shape[0]/hospitalized.shape[0]
Out[14]:
0.2297979797979798
In [15]:
virus_by_coinf = hospitalized.groupby('coinfection')[viruses[:-2]].sum()/hospitalized.shape[0]
virus_by_coinf.T.plot(kind='bar', stacked=True, grid=False, 
                      color=sb.color_palette('Paired', 2)[::-1])
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1135d17f0>
In [16]:
hospitalized[viruses].sum(0)
Out[16]:
RSV          1397
HMPV          273
Rhino        1238
Influenza     119
Adeno         475
PIV           175
No virus      587
All          3168
dtype: float64
In [17]:
other_virus_index = (hospitalized[list(non_rsv_lookup.keys())].sum(1) > 0).astype(int)
In [18]:
hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['non-RSV virus'] = (hospitalized['pcr_result___1']==0) & other_virus_index
hospitalized['no virus'] = (hospitalized['pcr_result___1']==0) & (other_virus_index==0)
In [19]:
hospitalized["prev_cond"] = (hospitalized[[c for c in hospitalized.columns if c.endswith('hx') 
                                           and not c.startswith('no_')]].sum(1) > 0)
In [20]:
hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under 2 months', '2-11 months', '12-23 months'
hospitalized = hospitalized.join(age_groups)
In [21]:
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 
                      6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)
In [22]:
hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan
In [23]:
hospitalized['any_cigarette'] = (hospitalized.cigarette_smokers > 0).astype(int)
In [24]:
hospitalized['any_smoke'] = (hospitalized.cigarette_smokers.astype(bool) | 
                             hospitalized.nargila_smokers.astype(bool))
In [25]:
hospitalized.any_smoke.mean()
Out[25]:
0.76546717171717171
In [26]:
hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)
In [27]:
hospitalized.length_of_stay.hist(bins=20)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x11380f0b8>

Diagnosis

In [28]:
hospitalized['ros'] = hospitalized.adm_sepsis | hospitalized.adm_febrile
In [29]:
hospitalized['pertussis-like cough'] = hospitalized.adm_pertussis | hospitalized.adm_cough
In [30]:
hospitalized['brochopneumonia'] = hospitalized.adm_bronchopneumo
In [31]:
hospitalized['bronchiolitis'] = hospitalized.adm_bronchiolitis
In [32]:
hospitalized['pneumonia'] = hospitalized.adm_pneumo

Rate Estimation

In [33]:
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.admission_date.describe()
Out[33]:
count                    3168
unique                    794
top       2011-02-07 00:00:00
freq                       17
first     2010-03-15 00:00:00
last      2013-03-30 00:00:00
Name: admission_date, dtype: object

Age groups:

  • Under 2 months
  • 2-5 mo.
  • 6-11 mo.
  • Over 11 mo.
In [34]:
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,5,11,24], include_lowest=True))
age_groups.index = hospitalized.index
age_groups.columns = 'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'
hospitalized = hospitalized.join(age_groups)
In [35]:
age_group_lookup = {'age_under_2': '<2', 
                    'age_2_5': '2-5', 
                    'age_6_11': '6-11', 
                    'age_over_11': '>11'}
In [36]:
customcmap = sb.color_palette("coolwarm", 6)
In [37]:
fig, axes = plt.subplots(2,2, figsize=(10,6), sharey=True)
for i,ax in enumerate(np.ravel(axes)):
    age_group = age_groups.columns[i]

    age_subset = hospitalized[hospitalized[age_group].astype(bool)]
    age_virus = age_subset[viruses[:-2]].mean()
    age_virus.T.plot(kind='bar', grid=False, ax=ax, 
                     color=customcmap)
    ax.set_title(age_group_lookup[age_group])

Population rate estimation

Recode year to virus season:

  • 2011: March 2010 - March 2011
  • 2012: Apr 2011 - Mar 2012
  • 2013: April 2012 - Mar 2013
In [38]:
hospitalized['virus_year'] = 2011
hospitalized.loc[(hospitalized.admission_date >= '2011-03-31') 
                 & (hospitalized.admission_date <= '2012-03-31'), 'virus_year'] = 2012
hospitalized.loc[hospitalized.admission_date > '2012-03-31', 'virus_year'] = 2013

hospitalized.virus_year.value_counts()
Out[38]:
2012    1191
2013    1179
2011     798
dtype: int64

Recode zones

In [39]:
zone_string = "1, Amman Zone 1 | 2, Abdoun Zone 1 | 3, Abu Alanda Zone 3 | 4, Abu Nusair Zone 2 | 5, Airport street Zone 4 | 6, Al.Ashrafeyeh Zone 5 | 7, Al.Badya Zone 31 | 8, Al.Baqa'a Zone 22 | 9, Al.Ghour Zone 32 | 10, Al.Hashmi Zone 6 | 11, Al.Hezam Zone 29 | 12, Al.Hussein Camping Zone 1 | 13, Al.Istiklal Zone 1 | 14, Al.Jeeza Zone 8 | 15, Al.Joufeh Zone 5 | 16, Al.Karak Zone 35 | 17, Al.Lubban Zone 16 | 18, Al.Madenah Al.Reyadeyeh Zone 1 | 19, Al.Mahatta Zone 6 | 20, Al.Manarah Zone 5 | 21, Al.Mareikh Zone 5 | 22, Al.Marqab Zone 6 | 23, Al.Muhajreen Zone 5 | 24, Al.Musdar Zone 5 | 25, Al.Musherfeh Zone 15 | 26, Al.Naser Zone 6 | 27, Al.Natheef Zone 5 | 28, Al.Nuzha Zone 1 | 29, Al.Qastal Zone 10 | 30, Al.Qwesmeh Zone 5 | 31, Al.Shouneh Zone 32 | 32, Al.Taj Zone 5 | 33, Al.Taybeh Zone 7 | 34, Aqaba Zone 30 | 35, Arjan Zone 1 | 36, Bayader Wadi AL.Seer Zone 20 | 37, Bnayyat Zone 12 | 38, D.Al.Ameer Ali Zone 13 | 39, D.Al.Aqsa Zone 9 | 40, D.Haj Hasan Zone 9 | 41, Daheyet Al.Rasheed Zone 17 | 42, Daheyet al.Yasmeen Zone 1 | 43, Dead Sea Zone 32 | 44, Deer.Al.Ghbar Zone 1 | 45, Down Town (Al.Balad) Zone 1 | 46, Dra'a al.Qarbi Zone 1 | 47, Ein Al.Basha Zone 22 | 48, Ein Ghazal Zone 6 | 49, Eskan Al.Ameer Hashem Zone 29 | 50, Eskan Al.Ameer Talal Zone 29 | 51, Etha'a wal Telvesion Zone 9 | 52, Hay Al.Dabaybeh Zone 5 | 53, Hay Al.Tafayleh Zone 5 | 54, Hay Nazzal Zone 1 | 55, Huttein (shneler ) Refugee camping Zone 6 | 56, Iraq Al.Ameer Zone 23 | 57, Jabal AL.Akhdar Zone 1 | 58, Jabal Al.Ameer Faisal Zone 29 | 59, Jabal AL.Hadeed Zone 5 | 60, Jabal Al.Hussein Zone 1 | 61, Jabal Al.Qosoor Zone 1 | 62, Jabal Amman Zone 1 | 63, Jarash Zone 36 | 64, Jawa Zone 7 | 65, Juwaydeh Zone 14 | 66, Khalda Zone 18 | 67, Khreibt Al.Souk Zone 7 | 68, Ma'an Zone 31 | 69, Madaba Zone 34 | 70, Mafraq Zone 33 | 71, Marj Al.Hamam Zone 19 | 72, Marka Zone 6 | 73, Muqableen Zone 9 | 74, Muwaqqar Zone 28 | 75, Nadi Al.Sebaq Zone 6 | 76, Naur Zone 19 | 77, Petra Zone 30 | 78, Qatranah Zone 30 | 79, Raghadan Zone 1 | 80, Ras Al.Ein Zone 1 | 81, Rusayfah Zone 29 | 82, Saffout Zone 22 | 83, Sahab Zone 11 | 84, Salheyet Al.Abed Zone 6 | 85, Shafa Badran Zone 25 | 86, Sharq Al.Awsat Zone 11 | 87, Shemasani Zone 1 | 88, Street 30 Zone 5 | 89, Summaya Street Zone 5 | 90, Suweileh Zone 27 | 91, Tabarbour Zone 21 | 92, Tla'a al Ali Zone 17 | 93, Um Al.Heran Zone 11 | 94, Um Al.Summaq Zone 17 | 95, Um Nuwwara and Adan Zone 5 | 96, Um Uthayna Zone 1 | 97, Wadi Abdoun Zone 1 | 98, Wadi AL.Haddadeh Zone 1 | 99, Wadi AL.Hajar Zone 29 | 100, Wadi Al.Remam Zone 5 | 101, Wadi AL.Seer Zone 20 | 102, Wadi Saqra Zone 1 | 103, Wehdat Zone 11 | 104, Yadoudeh Zone 13 | 105, Yajouz Zone 26 | 106, Zarqa Zone 29 | 107, Zezya Zone 24"
In [40]:
zones = [z.strip().split(',') for z in zone_string.split('|')]
In [41]:
zone_dict = {int(n):int(s.strip().split(' ')[-1]) for n,s in zones}
In [42]:
hospitalized['zone'] = hospitalized.city_zone.replace(zone_dict)

Define Amman zone

In [43]:
hospitalized['amman_zone'] = hospitalized.zone<28

Hospitalized in Amman

In [44]:
hospitalized_amman = hospitalized[hospitalized.amman_zone]
In [45]:
hospitalized_amman.shape
Out[45]:
(3048, 451)
In [46]:
hospitalized_amman.index.is_unique
Out[46]:
True
In [47]:
hosp_age_counts = hospitalized_amman.groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum').values
In [48]:
pre_2013 = hospitalized_amman.admission_date <  datetime(2013, 1, 1)
In [49]:
age_counts = hospitalized_amman[pre_2013].groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum')
age_counts
Out[49]:
age_under_2 age_2_5 age_6_11 age_over_11
admission_date
2010-03-31 15 16 14 3
2010-04-30 17 22 14 6
2010-05-31 12 16 11 2
2010-06-30 21 6 7 3
2010-07-31 19 6 2 2
2010-08-31 7 8 3 3
2010-09-30 10 5 4 2
2010-10-31 5 5 4 4
2010-11-30 5 5 8 7
2010-12-31 7 8 5 10
2011-01-31 32 58 34 21
2011-02-28 54 67 39 24
2011-03-31 40 31 18 23
2011-04-30 23 37 26 12
2011-05-31 24 34 11 17
2011-06-30 25 17 13 9
2011-07-31 20 16 10 10
2011-08-31 21 7 0 3
2011-09-30 17 8 5 8
2011-10-31 20 9 6 10
2011-11-30 10 14 7 14
2011-12-31 23 21 21 18
2012-01-31 60 68 37 37
2012-02-29 55 83 39 25
2012-03-31 49 71 37 33
2012-04-30 34 43 32 23
2012-05-31 34 27 15 11
2012-06-30 23 21 15 10
2012-07-31 29 18 8 14
2012-08-31 21 11 7 5
2012-09-30 13 17 15 10
2012-10-31 16 20 4 9
2012-11-30 11 17 5 9
2012-12-31 43 46 31 17

Rates and demographics (via World Bank) and 2004 census data (via Jordan Department of Statistics).

In [50]:
# Jordan population, 2008-2012
population = 5786000, 5915000, 6046000, 6181000, 6318000

# Interpolated population by gender and age, 2010-2012
female_0 = 93649, 95739, 96486
male_0 = 98435, 100525, 101160
female_1 = 87941, 93511, 93067
male_1 = 92548, 98306, 97763

kids_0 = np.array(female_0) + np.array(male_0)
kids_1 = np.array(female_1) + np.array(male_1)
kids = kids_0 + kids_1

kids_under6mo = kids_6to12mo = kids_0/2.

# Proportion in Amman
amman_urban_2004 = 1784502.
jordan_2004 = 5103639.
amman_prop = amman_urban_2004 / jordan_2004

# Birth rates (per 1000)
birth_rate = 29.665, 29.322, 28.869, 28.317, 27.699

# Neonatal mortality (per 1000)
neonatal_mort = 12.7, 12.4, 12.1, 11.8, 11.5

# Infant mortality (per 1000)
infant_mort = 18.4, 17.9, 17.3, 16.8, 16.4
In [51]:
births = np.array(population[-3:])/1000. * birth_rate[-3:]
births
Out[51]:
array([ 174541.974,  175027.377,  175002.282])
In [52]:
births_6m = births/2.
births_6m
Out[52]:
array([ 87270.987 ,  87513.6885,  87501.141 ])
In [53]:
deaths_6m = births_6m/1000. * infant_mort[-3:]
deaths_6m
Out[53]:
array([ 1509.7880751,  1470.2299668,  1435.0187124])
In [54]:
amman_prop
Out[54]:
0.3496528653378501
In [55]:
(births_6m - deaths_6m)*amman_prop
Out[55]:
array([ 29986.6489389 ,  30085.34181971,  30093.26626638])
In [56]:
n = np.array((kids_under6mo, kids_6to12mo, kids_1, kids))
n
Out[56]:
array([[  96042.,   98132.,   98823.],
       [  96042.,   98132.,   98823.],
       [ 180489.,  191817.,  190830.],
       [ 372573.,  388081.,  388476.]])
In [57]:
kids_0
Out[57]:
array([192084, 196264, 197646])
In [58]:
kids_1/kids_0
Out[58]:
array([ 0.93963578,  0.97734174,  0.9655141 ])
In [59]:
amman_prop
Out[59]:
0.3496528653378501
In [60]:
n_amman = np.floor(n*amman_prop).astype(int)
In [61]:
n_amman
Out[61]:
array([[ 33581,  34312,  34553],
       [ 33581,  34312,  34553],
       [ 63108,  67069,  66724],
       [130271, 135693, 135831]])

Monthly admissions in Ammaon

In [62]:
admissions_by_month = hospitalized_amman.groupby('admission_date')['child_name'].count().resample('1M', how='sum')
In [63]:
admissions_by_month.head()
Out[63]:
admission_date
2010-03-31    48
2010-04-30    59
2010-05-31    41
2010-06-30    37
2010-07-31    29
Freq: M, Name: child_name, dtype: int64
In [64]:
# Dict to return pcr_result variable corresponding to virus
virus_lookup = {pcr_lookup[k]: k for k in pcr_lookup.keys()}
In [65]:
# Enrolling 5 days per week
p_enroll = 5./7
In [66]:
rsv_subset = hospitalized_amman[hospitalized_amman.RSV==1]
rsv_subset.shape
Out[66]:
(1354, 451)

'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'

In [67]:
viruses
Out[67]:
['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']
In [68]:
virus_subset = hospitalized_amman[hospitalized['RSV']==1].copy()
virus_subset['admission_year'] = virus_subset.admission_date.apply(lambda x: x.year)
_under_6 = virus_subset[virus_subset.age_under_2.astype(bool) | virus_subset.age_2_5].groupby('admission_year')['child_name'].count()
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py:1819: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
In [69]:
_6_11 = virus_subset[virus_subset.age_6_11.astype(bool)].groupby('admission_year')['child_name'].count()
In [70]:
_over_11 = virus_subset[virus_subset.age_over_11.astype(bool)].groupby('admission_year')['child_name'].count()
In [71]:
_all = virus_subset.groupby('admission_year')['child_name'].count()
In [72]:
virus_subset.set_index(virus_subset.admission_date).groupby([pd.TimeGrouper('M')]).count()['mother_name']
Out[72]:
admission_date
2010-03-31     28
2010-04-30     20
2010-05-31      3
2010-06-30      2
2010-09-30      1
2010-10-31      1
2010-11-30      4
2010-12-31     14
2011-01-31    131
2011-02-28    145
2011-03-31     48
2011-04-30     13
2011-05-31      4
2011-06-30      1
2011-09-30      1
2011-10-31      1
2011-11-30      2
2011-12-31     24
2012-01-31    145
2012-02-29    138
2012-03-31    116
2012-04-30     71
2012-05-31     20
2012-06-30     14
2012-07-31      1
2012-10-31      1
2012-11-30      3
2012-12-31     89
2013-01-31    153
2013-02-28    114
2013-03-31     46
Name: mother_name, dtype: int64
In [73]:
virus_df = pd.concat([_under_6, _6_11, _over_11, _all], axis=1)
virus_df.columns = ('under 6 mo.', '6-11 mo.', '11-23 mo.', 'all under 2 yr.')
virus_df
Out[73]:
under 6 mo. 6-11 mo. 11-23 mo. all under 2 yr.
admission_year
2010 49 20 4 73
2011 255 77 38 370
2012 400 113 85 598
2013 219 62 32 313
In [74]:
virus_df.values.ravel()
Out[74]:
array([ 49,  20,   4,  73, 255,  77,  38, 370, 400, 113,  85, 598, 219,
        62,  32, 313])

Use proportion in Amman to calculate proportion of kids in each age group and year in Amman

In [79]:
def rate_model(virus):
    
    # Extract data subset for passed virus
    virus_subset = hospitalized_amman[hospitalized_amman[virus]==1].copy()
    
    # Create data frame of age x year counts
    u6 = virus_subset.age_under_2.astype(bool) | virus_subset.age_2_5
    _under_6 = virus_subset[u6].groupby('virus_year')['child_name'].count()
    _6_11 = virus_subset[virus_subset.age_6_11.astype(bool)].groupby('virus_year')['child_name'].count()
    _over_11 = virus_subset[virus_subset.age_over_11.astype(bool)].groupby('virus_year')['child_name'].count()
    _all = virus_subset.groupby('virus_year')['child_name'].count()
    
    virus_df = pd.concat([_under_6, _6_11, _over_11, _all], axis=1)
    virus_df.columns = ('under 6 mo.', '6-11 mo.', '11-23 mo.', 'all ages')
    
    # Al Bashir hospital market share
    market_share = pm.Uniform('market_share', 0.5, 0.6)
    
    # Prior probability
    prev_virus = [pm.Beta('prev_virus_%i' %i, 1, 5) for i in range(virus_df.size)]
    per_1000 = pm.Lambda('per_1000', lambda p=prev_virus: np.array(p)*1000)
        
    # Correct for 5 days of enrollment per week
    p_hosp = market_share * (5./7)
    
    # RSV in Amman
    n_hosp = pm.Binomial('n_hosp', n_amman.T.ravel(), p_hosp, value=n_amman.T.ravel()*0.2)

    # Likelihood for number with RSV in hospital (assumes Pr(hosp | RSV) = 1)
    y_hosp = pm.Binomial('y_hosp', n_hosp, prev_virus, value=virus_df.values.ravel(), observed=True)
    
    return locals()

Influenza rates

In [80]:
M = pm.MCMC(rate_model('Influenza'))
In [81]:
M.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 79.8 sec
In [82]:
M.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 72.9 sec
In [94]:
prevalence_labels = ['%s %s' % (year, age) for year in ('2011', '2012', '2013') 
                     for age in ('under 6 mo.', '6-11 mo.', '12-23 mo.', 'all ages')]
In [95]:
pm.Matplot.summary_plot(M.per_1000, custom_labels=prevalence_labels, main='Influenza (per 1000)')
In [96]:
M.per_1000.summary()
per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	0.857            0.261            0.006            [ 0.381  1.352]
	0.627            0.227            0.006            [ 0.228  1.069]
	0.332            0.119            0.003            [ 0.131  0.578]
	0.504            0.102            0.003            [ 0.314  0.706]
	1.524            0.357            0.01             [ 0.889  2.23 ]
	0.534            0.207            0.004            [ 0.163  0.937]
	0.435            0.133            0.003            [ 0.186  0.695]
	0.696            0.121            0.004            [ 0.476  0.937]
	2.507            0.459            0.015            [ 1.633  3.396]
	0.837            0.254            0.006            [ 0.366  1.334]
	0.594            0.157            0.004            [ 0.304  0.905]
	1.103            0.157            0.006            [ 0.822  1.419]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	0.438            0.673           0.825          1.013         1.44
	0.266            0.464           0.601          0.763         1.126
	0.141            0.246           0.318          0.405         0.6
	0.323            0.432           0.497          0.567         0.721
	0.929            1.261           1.492          1.756         2.293
	0.213            0.387           0.503          0.648         1.017
	0.214            0.34            0.418          0.517         0.738
	0.49             0.612           0.688          0.771         0.957
	1.708            2.179           2.483          2.8           3.487
	0.406            0.66            0.811          0.99          1.41
	0.328            0.483           0.582          0.693         0.936
	0.824            0.992           1.098          1.206         1.423
	
In [116]:
M.write_csv('Influenza_per_1000', variables=['per_1000'])

HMPV rates

In [83]:
M_hmpv = pm.MCMC(rate_model('HMPV'))
# M_hmpv.
M_hmpv.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 72.2 sec
In [84]:
M_hmpv.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 66.1 sec
In [97]:
pm.Matplot.summary_plot(M_hmpv.per_1000, custom_labels=prevalence_labels, main='HMPV (per 1000)')
In [98]:
M_hmpv.per_1000.summary()
per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	1.514            0.344            0.009            [ 0.88   2.205]
	0.843            0.255            0.005            [ 0.372  1.339]
	0.361            0.121            0.003            [ 0.148  0.599]
	0.747            0.125            0.004            [ 0.528  1.021]
	7.233            0.791            0.033            [ 5.727  8.785]
	2.913            0.479            0.015            [ 2.016  3.893]
	1.232            0.224            0.006            [ 0.821  1.673]
	3.134            0.275            0.014            [ 2.602  3.661]
	1.927            0.394            0.011            [ 1.208  2.722]
	1.935            0.394            0.01             [ 1.204  2.729]
	0.501            0.143            0.003            [ 0.235  0.774]
	1.189            0.16             0.006            [ 0.879  1.487]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	0.908            1.277           1.491          1.72          2.267
	0.412            0.662           0.817          0.998         1.397
	0.171            0.271           0.349          0.433         0.633
	0.527            0.658           0.741          0.827         1.021
	5.776            6.675           7.201          7.742         8.857
	2.037            2.582           2.884          3.227         3.935
	0.831            1.074           1.216          1.377         1.692
	2.626            2.936           3.126          3.316         3.703
	1.226            1.646           1.902          2.176         2.756
	1.245            1.662           1.917          2.179         2.793
	0.263            0.4             0.489          0.588         0.823
	0.901            1.077           1.18           1.292         1.522
	
In [115]:
M_hmpv.write_csv('HMPV_per_1000', variables=['per_1000'])

Rhino rates

In [104]:
M_rhino = pm.MCMC(rate_model('Rhino'))
M_rhino.sample(150000, 140000)
 [-----------------100%-----------------] 150000 of 150000 complete in 97.0 sec
In [105]:
M_rhino.sample(150000, 140000)
 [-----------------100%-----------------] 150000 of 150000 complete in 95.9 sec
In [106]:
pm.Matplot.summary_plot(M_rhino.per_1000, custom_labels=prevalence_labels, main='Rhino (per 1000)')
In [108]:
M_rhino.per_1000.summary()
per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	12.805           1.126            0.051          [ 10.72   15.081]
	3.505            0.55             0.016            [ 2.494  4.58 ]
	1.281            0.238            0.006            [ 0.848  1.759]
	4.806            0.359            0.019            [ 4.124  5.505]
	20.541           1.461            0.078          [ 17.563  23.244]
	5.739            0.689            0.024            [ 4.343  7.034]
	2.654            0.335            0.011            [ 2.027  3.312]
	7.909            0.482            0.029            [ 7.019  8.907]
	27.348           1.741            0.105          [ 23.963  30.637]
	8.21             0.831            0.032            [ 6.674  9.902]
	3.025            0.357            0.013              [ 2.32  3.71]
	10.494           0.59             0.038          [  9.355  11.603]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	10.736           12.006          12.788         13.541        15.105
	2.533            3.125           3.479          3.858         4.645
	0.868            1.109           1.271          1.431         1.79
	4.135            4.559           4.793          5.043         5.538
	17.808           19.486          20.539         21.543        23.54
	4.435            5.27            5.728          6.188         7.173
	2.057            2.419           2.642          2.874         3.345
	6.988            7.585           7.887          8.222         8.891
	24.089           26.128          27.335         28.504        30.801
	6.674            7.623           8.194          8.748         9.911
	2.35             2.772           3.008          3.257         3.758
	9.394            10.085          10.466         10.911        11.66
	
In [114]:
M_rhino.write_csv('Rhino_per_1000', variables=['per_1000'])
In [96]:
# The data
M_rhino.virus_df
Out[96]:
under 6 mo. 6-11 mo. 11-23 mo.
virus_year
2011 163 44 30
2012 267 74 67
2013 359 107 76

RSV rates

In [87]:
M_rsv = pm.MCMC(rate_model('RSV'))
M_rsv.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 72.2 sec
In [88]:
M_rsv.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 72.6 sec
In [102]:
pm.Matplot.summary_plot(M_rsv.per_1000, custom_labels=prevalence_labels, main='RSV (per 1000)')
In [100]:
M_rsv.per_1000.summary()
per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	21.12            1.705            0.118          [ 17.914  24.408]
	6.82             0.805            0.038            [ 5.35   8.415]
	1.604            0.266            0.01             [ 1.112  2.117]
	23.477           1.859            0.132          [ 19.802  26.795]
	6.046            0.742            0.034            [ 4.659  7.5  ]
	2.29             0.32             0.013            [ 1.689  2.909]
	25.899           2.009            0.148          [ 22.041  29.442]
	8.049            0.895            0.046            [ 6.453  9.859]
	2.495            0.339            0.015            [ 1.85   3.163]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	17.979           19.899          21.076         22.3          24.517
	5.363            6.256           6.796          7.367         8.446
	1.131            1.421           1.584          1.768         2.17
	20.054           22.132          23.405         24.851        27.108
	4.692            5.523           6.001          6.54          7.563
	1.714            2.067           2.272          2.499         2.954
	22.313           24.397          25.845         27.345        29.759
	6.446            7.416           8.008          8.637         9.853
	1.88             2.257           2.485          2.711         3.216
	
In [117]:
M_rsv.write_csv('RSV_per_1000', variables=['per_1000'])
In [101]:
M_rsv.virus_df
Out[101]:
under 6 mo. 6-11 mo. 11-23 mo.
virus_year
2011 272 87 38
2012 308 79 58
2013 343 106 63

Adeno rates

In [89]:
M_adeno = pm.MCMC(rate_model('Adeno'))
M_adeno.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 71.6 sec
In [90]:
M_adeno.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 72.6 sec
In [119]:
pm.Matplot.summary_plot(M_adeno.per_1000, custom_labels=prevalence_labels, main='Adeno (per 1000)')
In [120]:
M_adeno.per_1000.summary()
per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	2.672            0.471            0.011            [ 1.774  3.582]
	2.097            0.412            0.009            [ 1.333  2.925]
	0.94             0.203            0.004            [ 0.57   1.338]
	1.643            0.19             0.005            [ 1.289  2.018]
	5.585            0.677            0.017            [ 4.31   6.928]
	3.228            0.494            0.011            [ 2.34   4.224]
	1.776            0.274            0.007            [ 1.271  2.338]
	3.065            0.26             0.008            [ 2.565  3.571]
	10.74            0.936            0.031          [  8.993  12.639]
	3.83             0.546            0.011            [ 2.754  4.895]
	1.781            0.271            0.006            [ 1.259  2.303]
	4.539            0.32             0.013            [ 3.903  5.15 ]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	1.825            2.339           2.647          2.965         3.672
	1.37             1.805           2.06           2.349         2.974
	0.6              0.794           0.92           1.064         1.377
	1.296            1.512           1.633          1.769         2.032
	4.368            5.101           5.565          6.023         7.002
	2.345            2.879           3.211          3.537         4.239
	1.28             1.59            1.758          1.95          2.358
	2.57             2.883           3.062          3.239         3.585
	8.965            10.12           10.72          11.318        12.631
	2.836            3.44            3.802          4.18          5.0
	1.293            1.59            1.767          1.956         2.346
	3.919            4.325           4.54           4.749         5.174
	
In [118]:
M_adeno.write_csv('Adeno_per_1000', variables=['per_1000'])
In [106]:
M_adeno.virus_df
Out[106]:
under 6 mo. 6-11 mo. 11-23 mo.
virus_year
2011 32 25 21
2012 70 40 43
2013 136 48 43

PIV rates

In [91]:
M_piv = pm.MCMC(rate_model('PIV'))
M_piv.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 71.6 sec
In [92]:
M_piv.sample(100000, 90000)
 [-----------------100%-----------------] 100000 of 100000 complete in 66.9 sec
In [121]:
pm.Matplot.summary_plot(M_piv.per_1000, custom_labels=prevalence_labels, main='PIV (per 1000)')
In [110]:
M_piv.per_1000.summary()
per_1000:
 
	Mean             SD               MC Error        95% HPD interval
	------------------------------------------------------------------
	1.37             0.332            0.006            [ 0.776  2.032]
	0.561            0.214            0.004            [ 0.193  0.989]
	0.385            0.128            0.002            [ 0.159  0.646]
	2.91             0.471            0.01             [ 2.015  3.87 ]
	0.946            0.276            0.005            [ 0.466  1.524]
	0.887            0.186            0.004            [ 0.571  1.287]
	3.66             0.542            0.011            [ 2.663  4.798]
	1.48             0.347            0.007            [ 0.811  2.141]
	0.402            0.126            0.002            [ 0.171  0.65 ]
	
	
	Posterior quantiles:
	
	2.5             25              50              75             97.5
	 |---------------|===============|===============|---------------|
	0.801            1.132           1.338          1.58          2.088
	0.227            0.406           0.535          0.684         1.063
	0.176            0.293           0.375          0.462         0.678
	2.05             2.581           2.884          3.207         3.924
	0.476            0.747           0.92           1.111         1.547
	0.567            0.75            0.875          1.004         1.285
	2.664            3.297           3.629          4.01          4.8
	0.886            1.229           1.448          1.693         2.26
	0.195            0.314           0.386          0.477         0.686
	
In [122]:
M_piv.write_csv('PIV_per_1000', variables=['per_1000'])
In [111]:
M_piv.virus_df
Out[111]:
under 6 mo. 6-11 mo. 11-23 mo.
virus_year
2011 16 6 8
2012 36 11 21
2013 46 18 9

Hospitalization

Import hospitalization data

In [123]:
spreadsheets = !ls data/Al-Bashir*

data = []
col_names = 'date', 'admissions', 'admissions_2', 'resp_admissions', 'resp_admissions_2'
for spreadsheet in spreadsheets:
    this_file = pd.ExcelFile(spreadsheet)
    # Need to fix data entry error in years > 2011
    dot = spreadsheet.find('.')
    fix_columns = int(spreadsheet[dot-4:dot]) > 2011
    for name in this_file.sheet_names:
        d = this_file.parse(name)
        d.columns = col_names
        if fix_columns:
            total = d['resp_admissions'] + d['resp_admissions_2']
            d['resp_admissions_2'] = d['resp_admissions']
            d['resp_admissions'] = total
        data.append(d)
        
hospitalizations = pd.concat(data).set_index('date')
In [124]:
hospitalizations.head()
Out[124]:
admissions admissions_2 resp_admissions resp_admissions_2
date
2010-03-14 23 21 15 14
2010-03-15 27 23 19 18
2010-03-16 11 8 6 3
2010-03-17 17 10 10 8
2010-03-18 23 17 12 10
In [125]:
hospitalizations['virus_year'] = 2011
hospitalizations.loc[(hospitalizations.index >= datetime(2011, 3, 31)) 
                 & (hospitalizations.index <= datetime(2012, 3, 31)), 'virus_year'] = 2012
hospitalizations.loc[hospitalizations.index > datetime(2012, 3, 31), 'virus_year'] = 2013
In [126]:
under_2_hosp = hospitalizations.groupby('virus_year').sum()['admissions_2']
under_2_hosp
Out[126]:
virus_year
2011    4123
2012    3865
2013    3776
Name: admissions_2, dtype: int64
In [127]:
rsv_by_year = rsv_subset.groupby('virus_year')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
rsv_counts = rsv_by_year.sum()
In [128]:
rsv_counts
Out[128]:
age_under_2 age_2_5 age_6_11 age_over_11
virus_year
2011 119 153 87 38
2012 124 184 79 58
2013 159 184 106 63
In [129]:
under2_rsv = pd.concat([rsv_counts, under_2_hosp], axis=1).dropna()
under2_rsv
Out[129]:
age_under_2 age_2_5 age_6_11 age_over_11 admissions_2
virus_year
2011 119 153 87 38 4123
2012 124 184 79 58 3865
2013 159 184 106 63 3776
In [130]:
def extract_virus(virus):
    
    virus_subset = hospitalized[hospitalized[virus]==1]
    
    virus_by_year = virus_subset.groupby('virus_year')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
    virus_counts = virus_by_year.sum()
    
    return pd.concat([virus_counts, under_2_hosp], axis=1).dropna()
In [131]:
def hosp_subset(virus):
    
    data_subset = extract_virus(virus)
    
    admissions = data_subset.pop('admissions_2').values
    data_subset = data_subset.values
    
    n_months, n_ages = data_subset.shape
    
    # Estimate age distribution of admissions
    p_age = pm.Dirichlet('p_age', np.ones(4), value=data_subset.sum(0)[:-1]/data_subset.sum())    
    counts = pm.Multinomial('counts', data_subset.sum(1), p_age, value=data_subset, observed=True)
    
    # Estimate denominators   
    n_hosp = []
    for i,ni in enumerate(admissions):
        
        d = data_subset[i]
        
        n_init = (ni*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int)[:-1]
        n_init = np.append(n_init, ni - n_init.sum())
        
        n_hosp.append(pm.Multinomial('n_hosp_{}'.format(i), ni, p_age, value=n_init))
    
    n_hosp_t = pm.Lambda('n_hosp_t', lambda x=n_hosp: np.array([[xij for xij in xi] for xi in x]).T)
    
    # Virus rates by age and time    
    mu_alpha = pm.Normal('mu_alpha', 0, 0.01, value=[0]*n_ages)
    sigma_alpha = pm.Uniform('sigma_alpha', 0, 100, value=[10]*n_ages)
    rho = pm.Uniform('rho', -1, 1, value=0)

    mu =  [pm.Lambda('mu_{}'.format(i), lambda mu=mu_alpha: np.array([mu[i]]*n_months)) for i in range(n_ages)]
    off_diag = np.eye(n_months, k=1)
    Sigma = [pm.Lambda('Sigma_{}'.format(i), lambda s=sigma_alpha, r=rho: 
                       (np.eye(n_months) + off_diag*r + off_diag.T*r)*(s[i]**2)) for i in range(n_ages)]

    alpha = [pm.MvNormalCov('alpha_{}'.format(i), mu[i], Sigma[i], value=[0]*n_months) for i in range(n_ages)]
    
    p_virus = [pm.Lambda('p_virus_{}'.format(i), lambda a=a: pm.invlogit(a)) for i,a in enumerate(alpha)]
    
    # Viral rate likelihood
    @pm.observed
    def rate_like(value=data_subset.T, p=p_virus, n=n_hosp_t):
        return np.sum([pm.binomial_like(value[i], n[i], p[i]) for i in range(n_ages)])
    
    return(locals())
In [132]:
M = pm.MCMC(hosp_subset('RSV'))
In [133]:
M.sample(20000, 10000)
 [-----------------100%-----------------] 20000 of 20000 complete in 32.0 sec
In [134]:
pm.Matplot.summary_plot(M.p_age, custom_labels=['age_under_2', 'age_2_5', 'age_6_11'])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
In [135]:
date_labels = ['2011', '2012', '2013']
pm.Matplot.summary_plot(M.p_virus[0], custom_labels=date_labels, main='RSV rates')
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
In [136]:
import seaborn as sb
In [137]:
age_group_labels = ['Under 2 mo.', '2-5 mo.', '6-11 mo.', '12-23 mo.']
In [138]:
rsv_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t], 
                                   'age': age_group_labels[i], 
                                   'year': date_labels[t]}) for i,p in enumerate(M.p_virus) for t in range(3)])
In [139]:
sb.factorplot('year', 'rate', 'age', rsv_rates, kind='box', aspect=1.75)
Out[139]:
<seaborn.axisgrid.FacetGrid at 0x11a7a8780>
In [140]:
M_rhino = pm.MCMC(hosp_subset('Rhino'))
M_rhino.sample(20000, 10000)
 [-----------------100%-----------------] 20000 of 20000 complete in 31.5 sec
In [141]:
rhino_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t], 
                                   'age': age_group_labels[i], 
                                   'year': date_labels[t]}) for i,p in enumerate(M_rhino.p_virus) for t in range(3)])
In [142]:
sb.set(style="white", palette="muted")

fg = sb.factorplot('year', 'rate', 'age', rhino_rates, kind='box', aspect=1.75)

Production runs

In [143]:
for virus in viruses:
    
    print('\nRunning', virus)
    M = pm.MCMC(hosp_subset(virus))
    M.sample(50000, 40000)
    
    sb.set(style="white", palette="muted")
    
    virus_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t], 
                                   'age': age_group_labels[i], 
                                   'year': date_labels[t]}) for i,p in enumerate(M.p_virus) for t in range(3)])
    
    fg = sb.factorplot('year', 'rate', 'age', virus_rates, kind='box', aspect=1.75)
    fg.savefig(virus)
    
    M.write_csv(virus, variables=[x.__name__ for x in M.p_virus])
Running RSV
 [-----------------100%-----------------] 50000 of 50000 complete in 77.9 sec
Running HMPV
 [-----------------100%-----------------] 50000 of 50000 complete in 83.4 sec
Running Rhino
 [-----------------100%-----------------] 50000 of 50000 complete in 79.5 sec
Running Influenza
 [-----------------100%-----------------] 50000 of 50000 complete in 78.9 sec
Running Adeno
 [-----------------100%-----------------] 50000 of 50000 complete in 78.6 sec
Running PIV
 [-----------------100%-----------------] 50000 of 50000 complete in 78.0 sec
Running No virus
 [-----------------100%-----------------] 50000 of 50000 complete in 78.0 sec
Running All
 [-----------------100%-----------------] 50000 of 50000 complete in 78.2 sec