%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
import pymc as pm
sb.set_style("white")
Import data
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
greater_48hrs | fever_neutropenia | never_left | written_consent | child_name | mother_name | mother_birth_date | mother_record | mother_nationality | other_mother_nationality | ... | was_whole_blood_obtained_f | date | whole_blood_complete | age_months | length_of_stay | gest_age | death | hospitalized_vitamin_d | wheezing_ind | sex | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
case_id | |||||||||||||||||||||
A0001 | 0 | 0 | 0 | 1 | Remas Mahmoud Jbarah | Huda Katalo | 1976-01-21 | NaN | 3 | NaN | ... | NaN | NaN | 0 | 1 | 6 | 40 | False | 3 | 0 | F |
A0002 | 0 | 0 | 0 | 1 | Majed Abdel Kareem Majed | Noor SHa'aban Mahmood | 1989-09-09 | NaN | 3 | NaN | ... | NaN | NaN | 0 | 1 | 5 | 40 | False | 4 | 1 | M |
A0003 | 0 | 0 | 0 | 1 | Rayyan Jamal Muhyi Al.Deen | SAra Hussein Muhyi Al.Deen | 1965-01-01 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 11 | 10 | 40 | False | 35 | 0 | F |
A0004 | 0 | 0 | 0 | 1 | Hanan Mohd Mustapha Abu Othman | Kawla Abu Shanab | 1983-10-31 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 7 | 3 | 38 | False | 2 | 1 | F |
A0005 | 0 | 0 | 0 | 1 | Yara Mahmoud Azmi Ismael | Suha Abdel Aziz | 1986-02-28 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 2 | 1 | 39 | False | 6 | 0 | F |
5 rows × 414 columns
Breakdown by:
Severity measures:
Columns:
[v for v in hospitalized.columns if v.endswith('hx')]
['no_hx', 'diabetes_hx', 'heart_hx', 'kidney_hx', 'downs_hx', 'sickle_hx', 'cancer_hx', 'cf_hx', 'genetic_hx', 'cp_hx', 'seizure_hx', 'neuro_hx', 'mr_hx', 'asthma_hx', 'immuno_hx', 'liver_hx', 'gerd_hx', 'diarrhea_hx', 'other_hx', 'spec_hx', 'no_asthma_hx', 'no_allergy_hx', 'no_eczema_hx']
hospitalized['asthma'] = (hospitalized.child_asthma==1) | hospitalized.asthma_hx
hospitalized.asthma.sum()
81
hospitalized.no_hx.value_counts()
1 2848 0 320 dtype: int64
((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))).sum()
55
Past medical history calculated as those who claim to have past medical history and those who claim to have no past medical history and reported asthma.
hospitalized['past medical history'] = (~hospitalized.no_hx.astype(bool) |
((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))))
hospitalized['past medical history'].sum()
375
# Positive culture lookup
pcr_lookup = {'pcr_result___1': 'RSV',
'pcr_result___2': 'HMPV',
'pcr_result___3': 'flu A',
'pcr_result___4': 'flu B',
'pcr_result___5': 'rhino',
'pcr_result___6': 'PIV1',
'pcr_result___7': 'PIV2',
'pcr_result___8': 'PIV3',
'pcr_result___13': 'H1N1',
'pcr_result___14': 'H3N2',
'pcr_result___15': 'Swine',
'pcr_result___16': 'Swine H1',
'pcr_result___17': 'flu C',
'pcr_result___18': 'Adeno'}
hospitalized['RSV'] = hospitalized.pcr_result___1.astype(bool)
hospitalized['HMPV'] = hospitalized.pcr_result___2.astype(bool)
hospitalized['Rhino'] = hospitalized.pcr_result___5.astype(bool)
hospitalized['Influenza'] = (hospitalized.pcr_result___3 | hospitalized.pcr_result___4 |
hospitalized.pcr_result___17)
hospitalized['Adeno'] = hospitalized.pcr_result___18.astype(bool)
hospitalized['PIV'] = (hospitalized.pcr_result___6 | hospitalized.pcr_result___7 |
hospitalized.pcr_result___8)
hospitalized['No virus'] = hospitalized[list(pcr_lookup.keys())].sum(1) == 0
hospitalized['All'] = True
viruses = ['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']
non_rsv_lookup = pcr_lookup.copy()
non_rsv_lookup.pop('pcr_result___1')
'RSV'
Identify individuals with coinfection
hospitalized['coinfection'] = hospitalized[list(pcr_lookup.keys())].sum(1) > 1
Virus frequency by coinfection status
hospitalized[(~hospitalized.coinfection) & (hospitalized.RSV)].shape[0]/hospitalized.shape[0]
0.2297979797979798
virus_by_coinf = hospitalized.groupby('coinfection')[viruses[:-2]].sum()/hospitalized.shape[0]
virus_by_coinf.T.plot(kind='bar', stacked=True, grid=False,
color=sb.color_palette('Paired', 2)[::-1])
<matplotlib.axes._subplots.AxesSubplot at 0x1135d17f0>
hospitalized[viruses].sum(0)
RSV 1397 HMPV 273 Rhino 1238 Influenza 119 Adeno 475 PIV 175 No virus 587 All 3168 dtype: float64
other_virus_index = (hospitalized[list(non_rsv_lookup.keys())].sum(1) > 0).astype(int)
hospitalized['RSV'] = hospitalized['pcr_result___1']
hospitalized['non-RSV virus'] = (hospitalized['pcr_result___1']==0) & other_virus_index
hospitalized['no virus'] = (hospitalized['pcr_result___1']==0) & (other_virus_index==0)
hospitalized["prev_cond"] = (hospitalized[[c for c in hospitalized.columns if c.endswith('hx')
and not c.startswith('no_')]].sum(1) > 0)
hospitalized['male'] = (hospitalized.sex=='M').astype(int)
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23]))
age_groups.index = hospitalized.index
age_groups.columns = 'under 2 months', '2-11 months', '12-23 months'
hospitalized = hospitalized.join(age_groups)
nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian',
6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'}
hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup)
hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int)
hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int)
hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan
hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int)
hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan
hospitalized['any_cigarette'] = (hospitalized.cigarette_smokers > 0).astype(int)
hospitalized['any_smoke'] = (hospitalized.cigarette_smokers.astype(bool) |
hospitalized.nargila_smokers.astype(bool))
hospitalized.any_smoke.mean()
0.76546717171717171
hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int)
hospitalized.length_of_stay.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x11380f0b8>
Diagnosis
hospitalized['ros'] = hospitalized.adm_sepsis | hospitalized.adm_febrile
hospitalized['pertussis-like cough'] = hospitalized.adm_pertussis | hospitalized.adm_cough
hospitalized['brochopneumonia'] = hospitalized.adm_bronchopneumo
hospitalized['bronchiolitis'] = hospitalized.adm_bronchiolitis
hospitalized['pneumonia'] = hospitalized.adm_pneumo
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.admission_date.describe()
count 3168 unique 794 top 2011-02-07 00:00:00 freq 17 first 2010-03-15 00:00:00 last 2013-03-30 00:00:00 Name: admission_date, dtype: object
Age groups:
age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,5,11,24], include_lowest=True))
age_groups.index = hospitalized.index
age_groups.columns = 'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'
hospitalized = hospitalized.join(age_groups)
age_group_lookup = {'age_under_2': '<2',
'age_2_5': '2-5',
'age_6_11': '6-11',
'age_over_11': '>11'}
customcmap = sb.color_palette("coolwarm", 6)
fig, axes = plt.subplots(2,2, figsize=(10,6), sharey=True)
for i,ax in enumerate(np.ravel(axes)):
age_group = age_groups.columns[i]
age_subset = hospitalized[hospitalized[age_group].astype(bool)]
age_virus = age_subset[viruses[:-2]].mean()
age_virus.T.plot(kind='bar', grid=False, ax=ax,
color=customcmap)
ax.set_title(age_group_lookup[age_group])
Recode year to virus season:
hospitalized['virus_year'] = 2011
hospitalized.loc[(hospitalized.admission_date >= '2011-03-31')
& (hospitalized.admission_date <= '2012-03-31'), 'virus_year'] = 2012
hospitalized.loc[hospitalized.admission_date > '2012-03-31', 'virus_year'] = 2013
hospitalized.virus_year.value_counts()
2012 1191 2013 1179 2011 798 dtype: int64
Recode zones
zone_string = "1, Amman Zone 1 | 2, Abdoun Zone 1 | 3, Abu Alanda Zone 3 | 4, Abu Nusair Zone 2 | 5, Airport street Zone 4 | 6, Al.Ashrafeyeh Zone 5 | 7, Al.Badya Zone 31 | 8, Al.Baqa'a Zone 22 | 9, Al.Ghour Zone 32 | 10, Al.Hashmi Zone 6 | 11, Al.Hezam Zone 29 | 12, Al.Hussein Camping Zone 1 | 13, Al.Istiklal Zone 1 | 14, Al.Jeeza Zone 8 | 15, Al.Joufeh Zone 5 | 16, Al.Karak Zone 35 | 17, Al.Lubban Zone 16 | 18, Al.Madenah Al.Reyadeyeh Zone 1 | 19, Al.Mahatta Zone 6 | 20, Al.Manarah Zone 5 | 21, Al.Mareikh Zone 5 | 22, Al.Marqab Zone 6 | 23, Al.Muhajreen Zone 5 | 24, Al.Musdar Zone 5 | 25, Al.Musherfeh Zone 15 | 26, Al.Naser Zone 6 | 27, Al.Natheef Zone 5 | 28, Al.Nuzha Zone 1 | 29, Al.Qastal Zone 10 | 30, Al.Qwesmeh Zone 5 | 31, Al.Shouneh Zone 32 | 32, Al.Taj Zone 5 | 33, Al.Taybeh Zone 7 | 34, Aqaba Zone 30 | 35, Arjan Zone 1 | 36, Bayader Wadi AL.Seer Zone 20 | 37, Bnayyat Zone 12 | 38, D.Al.Ameer Ali Zone 13 | 39, D.Al.Aqsa Zone 9 | 40, D.Haj Hasan Zone 9 | 41, Daheyet Al.Rasheed Zone 17 | 42, Daheyet al.Yasmeen Zone 1 | 43, Dead Sea Zone 32 | 44, Deer.Al.Ghbar Zone 1 | 45, Down Town (Al.Balad) Zone 1 | 46, Dra'a al.Qarbi Zone 1 | 47, Ein Al.Basha Zone 22 | 48, Ein Ghazal Zone 6 | 49, Eskan Al.Ameer Hashem Zone 29 | 50, Eskan Al.Ameer Talal Zone 29 | 51, Etha'a wal Telvesion Zone 9 | 52, Hay Al.Dabaybeh Zone 5 | 53, Hay Al.Tafayleh Zone 5 | 54, Hay Nazzal Zone 1 | 55, Huttein (shneler ) Refugee camping Zone 6 | 56, Iraq Al.Ameer Zone 23 | 57, Jabal AL.Akhdar Zone 1 | 58, Jabal Al.Ameer Faisal Zone 29 | 59, Jabal AL.Hadeed Zone 5 | 60, Jabal Al.Hussein Zone 1 | 61, Jabal Al.Qosoor Zone 1 | 62, Jabal Amman Zone 1 | 63, Jarash Zone 36 | 64, Jawa Zone 7 | 65, Juwaydeh Zone 14 | 66, Khalda Zone 18 | 67, Khreibt Al.Souk Zone 7 | 68, Ma'an Zone 31 | 69, Madaba Zone 34 | 70, Mafraq Zone 33 | 71, Marj Al.Hamam Zone 19 | 72, Marka Zone 6 | 73, Muqableen Zone 9 | 74, Muwaqqar Zone 28 | 75, Nadi Al.Sebaq Zone 6 | 76, Naur Zone 19 | 77, Petra Zone 30 | 78, Qatranah Zone 30 | 79, Raghadan Zone 1 | 80, Ras Al.Ein Zone 1 | 81, Rusayfah Zone 29 | 82, Saffout Zone 22 | 83, Sahab Zone 11 | 84, Salheyet Al.Abed Zone 6 | 85, Shafa Badran Zone 25 | 86, Sharq Al.Awsat Zone 11 | 87, Shemasani Zone 1 | 88, Street 30 Zone 5 | 89, Summaya Street Zone 5 | 90, Suweileh Zone 27 | 91, Tabarbour Zone 21 | 92, Tla'a al Ali Zone 17 | 93, Um Al.Heran Zone 11 | 94, Um Al.Summaq Zone 17 | 95, Um Nuwwara and Adan Zone 5 | 96, Um Uthayna Zone 1 | 97, Wadi Abdoun Zone 1 | 98, Wadi AL.Haddadeh Zone 1 | 99, Wadi AL.Hajar Zone 29 | 100, Wadi Al.Remam Zone 5 | 101, Wadi AL.Seer Zone 20 | 102, Wadi Saqra Zone 1 | 103, Wehdat Zone 11 | 104, Yadoudeh Zone 13 | 105, Yajouz Zone 26 | 106, Zarqa Zone 29 | 107, Zezya Zone 24"
zones = [z.strip().split(',') for z in zone_string.split('|')]
zone_dict = {int(n):int(s.strip().split(' ')[-1]) for n,s in zones}
hospitalized['zone'] = hospitalized.city_zone.replace(zone_dict)
Define Amman zone
hospitalized['amman_zone'] = hospitalized.zone<28
Hospitalized in Amman
hospitalized_amman = hospitalized[hospitalized.amman_zone]
hospitalized_amman.shape
(3048, 451)
hospitalized_amman.index.is_unique
True
hosp_age_counts = hospitalized_amman.groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum').values
pre_2013 = hospitalized_amman.admission_date < datetime(2013, 1, 1)
age_counts = hospitalized_amman[pre_2013].groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum')
age_counts
age_under_2 | age_2_5 | age_6_11 | age_over_11 | |
---|---|---|---|---|
admission_date | ||||
2010-03-31 | 15 | 16 | 14 | 3 |
2010-04-30 | 17 | 22 | 14 | 6 |
2010-05-31 | 12 | 16 | 11 | 2 |
2010-06-30 | 21 | 6 | 7 | 3 |
2010-07-31 | 19 | 6 | 2 | 2 |
2010-08-31 | 7 | 8 | 3 | 3 |
2010-09-30 | 10 | 5 | 4 | 2 |
2010-10-31 | 5 | 5 | 4 | 4 |
2010-11-30 | 5 | 5 | 8 | 7 |
2010-12-31 | 7 | 8 | 5 | 10 |
2011-01-31 | 32 | 58 | 34 | 21 |
2011-02-28 | 54 | 67 | 39 | 24 |
2011-03-31 | 40 | 31 | 18 | 23 |
2011-04-30 | 23 | 37 | 26 | 12 |
2011-05-31 | 24 | 34 | 11 | 17 |
2011-06-30 | 25 | 17 | 13 | 9 |
2011-07-31 | 20 | 16 | 10 | 10 |
2011-08-31 | 21 | 7 | 0 | 3 |
2011-09-30 | 17 | 8 | 5 | 8 |
2011-10-31 | 20 | 9 | 6 | 10 |
2011-11-30 | 10 | 14 | 7 | 14 |
2011-12-31 | 23 | 21 | 21 | 18 |
2012-01-31 | 60 | 68 | 37 | 37 |
2012-02-29 | 55 | 83 | 39 | 25 |
2012-03-31 | 49 | 71 | 37 | 33 |
2012-04-30 | 34 | 43 | 32 | 23 |
2012-05-31 | 34 | 27 | 15 | 11 |
2012-06-30 | 23 | 21 | 15 | 10 |
2012-07-31 | 29 | 18 | 8 | 14 |
2012-08-31 | 21 | 11 | 7 | 5 |
2012-09-30 | 13 | 17 | 15 | 10 |
2012-10-31 | 16 | 20 | 4 | 9 |
2012-11-30 | 11 | 17 | 5 | 9 |
2012-12-31 | 43 | 46 | 31 | 17 |
Rates and demographics (via World Bank) and 2004 census data (via Jordan Department of Statistics).
# Jordan population, 2008-2012
population = 5786000, 5915000, 6046000, 6181000, 6318000
# Interpolated population by gender and age, 2010-2012
female_0 = 93649, 95739, 96486
male_0 = 98435, 100525, 101160
female_1 = 87941, 93511, 93067
male_1 = 92548, 98306, 97763
kids_0 = np.array(female_0) + np.array(male_0)
kids_1 = np.array(female_1) + np.array(male_1)
kids = kids_0 + kids_1
kids_under6mo = kids_6to12mo = kids_0/2.
# Proportion in Amman
amman_urban_2004 = 1784502.
jordan_2004 = 5103639.
amman_prop = amman_urban_2004 / jordan_2004
# Birth rates (per 1000)
birth_rate = 29.665, 29.322, 28.869, 28.317, 27.699
# Neonatal mortality (per 1000)
neonatal_mort = 12.7, 12.4, 12.1, 11.8, 11.5
# Infant mortality (per 1000)
infant_mort = 18.4, 17.9, 17.3, 16.8, 16.4
births = np.array(population[-3:])/1000. * birth_rate[-3:]
births
array([ 174541.974, 175027.377, 175002.282])
births_6m = births/2.
births_6m
array([ 87270.987 , 87513.6885, 87501.141 ])
deaths_6m = births_6m/1000. * infant_mort[-3:]
deaths_6m
array([ 1509.7880751, 1470.2299668, 1435.0187124])
amman_prop
0.3496528653378501
(births_6m - deaths_6m)*amman_prop
array([ 29986.6489389 , 30085.34181971, 30093.26626638])
n = np.array((kids_under6mo, kids_6to12mo, kids_1, kids))
n
array([[ 96042., 98132., 98823.], [ 96042., 98132., 98823.], [ 180489., 191817., 190830.], [ 372573., 388081., 388476.]])
kids_0
array([192084, 196264, 197646])
kids_1/kids_0
array([ 0.93963578, 0.97734174, 0.9655141 ])
amman_prop
0.3496528653378501
n_amman = np.floor(n*amman_prop).astype(int)
n_amman
array([[ 33581, 34312, 34553], [ 33581, 34312, 34553], [ 63108, 67069, 66724], [130271, 135693, 135831]])
Monthly admissions in Ammaon
admissions_by_month = hospitalized_amman.groupby('admission_date')['child_name'].count().resample('1M', how='sum')
admissions_by_month.head()
admission_date 2010-03-31 48 2010-04-30 59 2010-05-31 41 2010-06-30 37 2010-07-31 29 Freq: M, Name: child_name, dtype: int64
# Dict to return pcr_result variable corresponding to virus
virus_lookup = {pcr_lookup[k]: k for k in pcr_lookup.keys()}
# Enrolling 5 days per week
p_enroll = 5./7
rsv_subset = hospitalized_amman[hospitalized_amman.RSV==1]
rsv_subset.shape
(1354, 451)
'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11'
viruses
['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All']
virus_subset = hospitalized_amman[hospitalized['RSV']==1].copy()
virus_subset['admission_year'] = virus_subset.admission_date.apply(lambda x: x.year)
_under_6 = virus_subset[virus_subset.age_under_2.astype(bool) | virus_subset.age_2_5].groupby('admission_year')['child_name'].count()
/usr/local/lib/python3.4/site-packages/pandas/core/frame.py:1819: UserWarning: Boolean Series key will be reindexed to match DataFrame index. "DataFrame index.", UserWarning)
_6_11 = virus_subset[virus_subset.age_6_11.astype(bool)].groupby('admission_year')['child_name'].count()
_over_11 = virus_subset[virus_subset.age_over_11.astype(bool)].groupby('admission_year')['child_name'].count()
_all = virus_subset.groupby('admission_year')['child_name'].count()
virus_subset.set_index(virus_subset.admission_date).groupby([pd.TimeGrouper('M')]).count()['mother_name']
admission_date 2010-03-31 28 2010-04-30 20 2010-05-31 3 2010-06-30 2 2010-09-30 1 2010-10-31 1 2010-11-30 4 2010-12-31 14 2011-01-31 131 2011-02-28 145 2011-03-31 48 2011-04-30 13 2011-05-31 4 2011-06-30 1 2011-09-30 1 2011-10-31 1 2011-11-30 2 2011-12-31 24 2012-01-31 145 2012-02-29 138 2012-03-31 116 2012-04-30 71 2012-05-31 20 2012-06-30 14 2012-07-31 1 2012-10-31 1 2012-11-30 3 2012-12-31 89 2013-01-31 153 2013-02-28 114 2013-03-31 46 Name: mother_name, dtype: int64
virus_df = pd.concat([_under_6, _6_11, _over_11, _all], axis=1)
virus_df.columns = ('under 6 mo.', '6-11 mo.', '11-23 mo.', 'all under 2 yr.')
virus_df
under 6 mo. | 6-11 mo. | 11-23 mo. | all under 2 yr. | |
---|---|---|---|---|
admission_year | ||||
2010 | 49 | 20 | 4 | 73 |
2011 | 255 | 77 | 38 | 370 |
2012 | 400 | 113 | 85 | 598 |
2013 | 219 | 62 | 32 | 313 |
virus_df.values.ravel()
array([ 49, 20, 4, 73, 255, 77, 38, 370, 400, 113, 85, 598, 219, 62, 32, 313])
Use proportion in Amman to calculate proportion of kids in each age group and year in Amman
def rate_model(virus):
# Extract data subset for passed virus
virus_subset = hospitalized_amman[hospitalized_amman[virus]==1].copy()
# Create data frame of age x year counts
u6 = virus_subset.age_under_2.astype(bool) | virus_subset.age_2_5
_under_6 = virus_subset[u6].groupby('virus_year')['child_name'].count()
_6_11 = virus_subset[virus_subset.age_6_11.astype(bool)].groupby('virus_year')['child_name'].count()
_over_11 = virus_subset[virus_subset.age_over_11.astype(bool)].groupby('virus_year')['child_name'].count()
_all = virus_subset.groupby('virus_year')['child_name'].count()
virus_df = pd.concat([_under_6, _6_11, _over_11, _all], axis=1)
virus_df.columns = ('under 6 mo.', '6-11 mo.', '11-23 mo.', 'all ages')
# Al Bashir hospital market share
market_share = pm.Uniform('market_share', 0.5, 0.6)
# Prior probability
prev_virus = [pm.Beta('prev_virus_%i' %i, 1, 5) for i in range(virus_df.size)]
per_1000 = pm.Lambda('per_1000', lambda p=prev_virus: np.array(p)*1000)
# Correct for 5 days of enrollment per week
p_hosp = market_share * (5./7)
# RSV in Amman
n_hosp = pm.Binomial('n_hosp', n_amman.T.ravel(), p_hosp, value=n_amman.T.ravel()*0.2)
# Likelihood for number with RSV in hospital (assumes Pr(hosp | RSV) = 1)
y_hosp = pm.Binomial('y_hosp', n_hosp, prev_virus, value=virus_df.values.ravel(), observed=True)
return locals()
M = pm.MCMC(rate_model('Influenza'))
M.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 79.8 sec
M.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 72.9 sec
prevalence_labels = ['%s %s' % (year, age) for year in ('2011', '2012', '2013')
for age in ('under 6 mo.', '6-11 mo.', '12-23 mo.', 'all ages')]
pm.Matplot.summary_plot(M.per_1000, custom_labels=prevalence_labels, main='Influenza (per 1000)')
M.per_1000.summary()
per_1000: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.857 0.261 0.006 [ 0.381 1.352] 0.627 0.227 0.006 [ 0.228 1.069] 0.332 0.119 0.003 [ 0.131 0.578] 0.504 0.102 0.003 [ 0.314 0.706] 1.524 0.357 0.01 [ 0.889 2.23 ] 0.534 0.207 0.004 [ 0.163 0.937] 0.435 0.133 0.003 [ 0.186 0.695] 0.696 0.121 0.004 [ 0.476 0.937] 2.507 0.459 0.015 [ 1.633 3.396] 0.837 0.254 0.006 [ 0.366 1.334] 0.594 0.157 0.004 [ 0.304 0.905] 1.103 0.157 0.006 [ 0.822 1.419] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.438 0.673 0.825 1.013 1.44 0.266 0.464 0.601 0.763 1.126 0.141 0.246 0.318 0.405 0.6 0.323 0.432 0.497 0.567 0.721 0.929 1.261 1.492 1.756 2.293 0.213 0.387 0.503 0.648 1.017 0.214 0.34 0.418 0.517 0.738 0.49 0.612 0.688 0.771 0.957 1.708 2.179 2.483 2.8 3.487 0.406 0.66 0.811 0.99 1.41 0.328 0.483 0.582 0.693 0.936 0.824 0.992 1.098 1.206 1.423
M.write_csv('Influenza_per_1000', variables=['per_1000'])
M_hmpv = pm.MCMC(rate_model('HMPV'))
# M_hmpv.
M_hmpv.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 72.2 sec
M_hmpv.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 66.1 sec
pm.Matplot.summary_plot(M_hmpv.per_1000, custom_labels=prevalence_labels, main='HMPV (per 1000)')
M_hmpv.per_1000.summary()
per_1000: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 1.514 0.344 0.009 [ 0.88 2.205] 0.843 0.255 0.005 [ 0.372 1.339] 0.361 0.121 0.003 [ 0.148 0.599] 0.747 0.125 0.004 [ 0.528 1.021] 7.233 0.791 0.033 [ 5.727 8.785] 2.913 0.479 0.015 [ 2.016 3.893] 1.232 0.224 0.006 [ 0.821 1.673] 3.134 0.275 0.014 [ 2.602 3.661] 1.927 0.394 0.011 [ 1.208 2.722] 1.935 0.394 0.01 [ 1.204 2.729] 0.501 0.143 0.003 [ 0.235 0.774] 1.189 0.16 0.006 [ 0.879 1.487] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.908 1.277 1.491 1.72 2.267 0.412 0.662 0.817 0.998 1.397 0.171 0.271 0.349 0.433 0.633 0.527 0.658 0.741 0.827 1.021 5.776 6.675 7.201 7.742 8.857 2.037 2.582 2.884 3.227 3.935 0.831 1.074 1.216 1.377 1.692 2.626 2.936 3.126 3.316 3.703 1.226 1.646 1.902 2.176 2.756 1.245 1.662 1.917 2.179 2.793 0.263 0.4 0.489 0.588 0.823 0.901 1.077 1.18 1.292 1.522
M_hmpv.write_csv('HMPV_per_1000', variables=['per_1000'])
M_rhino = pm.MCMC(rate_model('Rhino'))
M_rhino.sample(150000, 140000)
[-----------------100%-----------------] 150000 of 150000 complete in 97.0 sec
M_rhino.sample(150000, 140000)
[-----------------100%-----------------] 150000 of 150000 complete in 95.9 sec
pm.Matplot.summary_plot(M_rhino.per_1000, custom_labels=prevalence_labels, main='Rhino (per 1000)')
M_rhino.per_1000.summary()
per_1000: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 12.805 1.126 0.051 [ 10.72 15.081] 3.505 0.55 0.016 [ 2.494 4.58 ] 1.281 0.238 0.006 [ 0.848 1.759] 4.806 0.359 0.019 [ 4.124 5.505] 20.541 1.461 0.078 [ 17.563 23.244] 5.739 0.689 0.024 [ 4.343 7.034] 2.654 0.335 0.011 [ 2.027 3.312] 7.909 0.482 0.029 [ 7.019 8.907] 27.348 1.741 0.105 [ 23.963 30.637] 8.21 0.831 0.032 [ 6.674 9.902] 3.025 0.357 0.013 [ 2.32 3.71] 10.494 0.59 0.038 [ 9.355 11.603] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 10.736 12.006 12.788 13.541 15.105 2.533 3.125 3.479 3.858 4.645 0.868 1.109 1.271 1.431 1.79 4.135 4.559 4.793 5.043 5.538 17.808 19.486 20.539 21.543 23.54 4.435 5.27 5.728 6.188 7.173 2.057 2.419 2.642 2.874 3.345 6.988 7.585 7.887 8.222 8.891 24.089 26.128 27.335 28.504 30.801 6.674 7.623 8.194 8.748 9.911 2.35 2.772 3.008 3.257 3.758 9.394 10.085 10.466 10.911 11.66
M_rhino.write_csv('Rhino_per_1000', variables=['per_1000'])
# The data
M_rhino.virus_df
under 6 mo. | 6-11 mo. | 11-23 mo. | |
---|---|---|---|
virus_year | |||
2011 | 163 | 44 | 30 |
2012 | 267 | 74 | 67 |
2013 | 359 | 107 | 76 |
M_rsv = pm.MCMC(rate_model('RSV'))
M_rsv.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 72.2 sec
M_rsv.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 72.6 sec
pm.Matplot.summary_plot(M_rsv.per_1000, custom_labels=prevalence_labels, main='RSV (per 1000)')
M_rsv.per_1000.summary()
per_1000: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 21.12 1.705 0.118 [ 17.914 24.408] 6.82 0.805 0.038 [ 5.35 8.415] 1.604 0.266 0.01 [ 1.112 2.117] 23.477 1.859 0.132 [ 19.802 26.795] 6.046 0.742 0.034 [ 4.659 7.5 ] 2.29 0.32 0.013 [ 1.689 2.909] 25.899 2.009 0.148 [ 22.041 29.442] 8.049 0.895 0.046 [ 6.453 9.859] 2.495 0.339 0.015 [ 1.85 3.163] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 17.979 19.899 21.076 22.3 24.517 5.363 6.256 6.796 7.367 8.446 1.131 1.421 1.584 1.768 2.17 20.054 22.132 23.405 24.851 27.108 4.692 5.523 6.001 6.54 7.563 1.714 2.067 2.272 2.499 2.954 22.313 24.397 25.845 27.345 29.759 6.446 7.416 8.008 8.637 9.853 1.88 2.257 2.485 2.711 3.216
M_rsv.write_csv('RSV_per_1000', variables=['per_1000'])
M_rsv.virus_df
under 6 mo. | 6-11 mo. | 11-23 mo. | |
---|---|---|---|
virus_year | |||
2011 | 272 | 87 | 38 |
2012 | 308 | 79 | 58 |
2013 | 343 | 106 | 63 |
M_adeno = pm.MCMC(rate_model('Adeno'))
M_adeno.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 71.6 sec
M_adeno.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 72.6 sec
pm.Matplot.summary_plot(M_adeno.per_1000, custom_labels=prevalence_labels, main='Adeno (per 1000)')
M_adeno.per_1000.summary()
per_1000: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 2.672 0.471 0.011 [ 1.774 3.582] 2.097 0.412 0.009 [ 1.333 2.925] 0.94 0.203 0.004 [ 0.57 1.338] 1.643 0.19 0.005 [ 1.289 2.018] 5.585 0.677 0.017 [ 4.31 6.928] 3.228 0.494 0.011 [ 2.34 4.224] 1.776 0.274 0.007 [ 1.271 2.338] 3.065 0.26 0.008 [ 2.565 3.571] 10.74 0.936 0.031 [ 8.993 12.639] 3.83 0.546 0.011 [ 2.754 4.895] 1.781 0.271 0.006 [ 1.259 2.303] 4.539 0.32 0.013 [ 3.903 5.15 ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 1.825 2.339 2.647 2.965 3.672 1.37 1.805 2.06 2.349 2.974 0.6 0.794 0.92 1.064 1.377 1.296 1.512 1.633 1.769 2.032 4.368 5.101 5.565 6.023 7.002 2.345 2.879 3.211 3.537 4.239 1.28 1.59 1.758 1.95 2.358 2.57 2.883 3.062 3.239 3.585 8.965 10.12 10.72 11.318 12.631 2.836 3.44 3.802 4.18 5.0 1.293 1.59 1.767 1.956 2.346 3.919 4.325 4.54 4.749 5.174
M_adeno.write_csv('Adeno_per_1000', variables=['per_1000'])
M_adeno.virus_df
under 6 mo. | 6-11 mo. | 11-23 mo. | |
---|---|---|---|
virus_year | |||
2011 | 32 | 25 | 21 |
2012 | 70 | 40 | 43 |
2013 | 136 | 48 | 43 |
M_piv = pm.MCMC(rate_model('PIV'))
M_piv.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 71.6 sec
M_piv.sample(100000, 90000)
[-----------------100%-----------------] 100000 of 100000 complete in 66.9 sec
pm.Matplot.summary_plot(M_piv.per_1000, custom_labels=prevalence_labels, main='PIV (per 1000)')
M_piv.per_1000.summary()
per_1000: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 1.37 0.332 0.006 [ 0.776 2.032] 0.561 0.214 0.004 [ 0.193 0.989] 0.385 0.128 0.002 [ 0.159 0.646] 2.91 0.471 0.01 [ 2.015 3.87 ] 0.946 0.276 0.005 [ 0.466 1.524] 0.887 0.186 0.004 [ 0.571 1.287] 3.66 0.542 0.011 [ 2.663 4.798] 1.48 0.347 0.007 [ 0.811 2.141] 0.402 0.126 0.002 [ 0.171 0.65 ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.801 1.132 1.338 1.58 2.088 0.227 0.406 0.535 0.684 1.063 0.176 0.293 0.375 0.462 0.678 2.05 2.581 2.884 3.207 3.924 0.476 0.747 0.92 1.111 1.547 0.567 0.75 0.875 1.004 1.285 2.664 3.297 3.629 4.01 4.8 0.886 1.229 1.448 1.693 2.26 0.195 0.314 0.386 0.477 0.686
M_piv.write_csv('PIV_per_1000', variables=['per_1000'])
M_piv.virus_df
under 6 mo. | 6-11 mo. | 11-23 mo. | |
---|---|---|---|
virus_year | |||
2011 | 16 | 6 | 8 |
2012 | 36 | 11 | 21 |
2013 | 46 | 18 | 9 |
Import hospitalization data
spreadsheets = !ls data/Al-Bashir*
data = []
col_names = 'date', 'admissions', 'admissions_2', 'resp_admissions', 'resp_admissions_2'
for spreadsheet in spreadsheets:
this_file = pd.ExcelFile(spreadsheet)
# Need to fix data entry error in years > 2011
dot = spreadsheet.find('.')
fix_columns = int(spreadsheet[dot-4:dot]) > 2011
for name in this_file.sheet_names:
d = this_file.parse(name)
d.columns = col_names
if fix_columns:
total = d['resp_admissions'] + d['resp_admissions_2']
d['resp_admissions_2'] = d['resp_admissions']
d['resp_admissions'] = total
data.append(d)
hospitalizations = pd.concat(data).set_index('date')
hospitalizations.head()
admissions | admissions_2 | resp_admissions | resp_admissions_2 | |
---|---|---|---|---|
date | ||||
2010-03-14 | 23 | 21 | 15 | 14 |
2010-03-15 | 27 | 23 | 19 | 18 |
2010-03-16 | 11 | 8 | 6 | 3 |
2010-03-17 | 17 | 10 | 10 | 8 |
2010-03-18 | 23 | 17 | 12 | 10 |
hospitalizations['virus_year'] = 2011
hospitalizations.loc[(hospitalizations.index >= datetime(2011, 3, 31))
& (hospitalizations.index <= datetime(2012, 3, 31)), 'virus_year'] = 2012
hospitalizations.loc[hospitalizations.index > datetime(2012, 3, 31), 'virus_year'] = 2013
under_2_hosp = hospitalizations.groupby('virus_year').sum()['admissions_2']
under_2_hosp
virus_year 2011 4123 2012 3865 2013 3776 Name: admissions_2, dtype: int64
rsv_by_year = rsv_subset.groupby('virus_year')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
rsv_counts = rsv_by_year.sum()
rsv_counts
age_under_2 | age_2_5 | age_6_11 | age_over_11 | |
---|---|---|---|---|
virus_year | ||||
2011 | 119 | 153 | 87 | 38 |
2012 | 124 | 184 | 79 | 58 |
2013 | 159 | 184 | 106 | 63 |
under2_rsv = pd.concat([rsv_counts, under_2_hosp], axis=1).dropna()
under2_rsv
age_under_2 | age_2_5 | age_6_11 | age_over_11 | admissions_2 | |
---|---|---|---|---|---|
virus_year | |||||
2011 | 119 | 153 | 87 | 38 | 4123 |
2012 | 124 | 184 | 79 | 58 | 3865 |
2013 | 159 | 184 | 106 | 63 | 3776 |
def extract_virus(virus):
virus_subset = hospitalized[hospitalized[virus]==1]
virus_by_year = virus_subset.groupby('virus_year')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']]
virus_counts = virus_by_year.sum()
return pd.concat([virus_counts, under_2_hosp], axis=1).dropna()
def hosp_subset(virus):
data_subset = extract_virus(virus)
admissions = data_subset.pop('admissions_2').values
data_subset = data_subset.values
n_months, n_ages = data_subset.shape
# Estimate age distribution of admissions
p_age = pm.Dirichlet('p_age', np.ones(4), value=data_subset.sum(0)[:-1]/data_subset.sum())
counts = pm.Multinomial('counts', data_subset.sum(1), p_age, value=data_subset, observed=True)
# Estimate denominators
n_hosp = []
for i,ni in enumerate(admissions):
d = data_subset[i]
n_init = (ni*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int)[:-1]
n_init = np.append(n_init, ni - n_init.sum())
n_hosp.append(pm.Multinomial('n_hosp_{}'.format(i), ni, p_age, value=n_init))
n_hosp_t = pm.Lambda('n_hosp_t', lambda x=n_hosp: np.array([[xij for xij in xi] for xi in x]).T)
# Virus rates by age and time
mu_alpha = pm.Normal('mu_alpha', 0, 0.01, value=[0]*n_ages)
sigma_alpha = pm.Uniform('sigma_alpha', 0, 100, value=[10]*n_ages)
rho = pm.Uniform('rho', -1, 1, value=0)
mu = [pm.Lambda('mu_{}'.format(i), lambda mu=mu_alpha: np.array([mu[i]]*n_months)) for i in range(n_ages)]
off_diag = np.eye(n_months, k=1)
Sigma = [pm.Lambda('Sigma_{}'.format(i), lambda s=sigma_alpha, r=rho:
(np.eye(n_months) + off_diag*r + off_diag.T*r)*(s[i]**2)) for i in range(n_ages)]
alpha = [pm.MvNormalCov('alpha_{}'.format(i), mu[i], Sigma[i], value=[0]*n_months) for i in range(n_ages)]
p_virus = [pm.Lambda('p_virus_{}'.format(i), lambda a=a: pm.invlogit(a)) for i,a in enumerate(alpha)]
# Viral rate likelihood
@pm.observed
def rate_like(value=data_subset.T, p=p_virus, n=n_hosp_t):
return np.sum([pm.binomial_like(value[i], n[i], p[i]) for i in range(n_ages)])
return(locals())
M = pm.MCMC(hosp_subset('RSV'))
M.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 32.0 sec
pm.Matplot.summary_plot(M.p_age, custom_labels=['age_under_2', 'age_2_5', 'age_6_11'])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
date_labels = ['2011', '2012', '2013']
pm.Matplot.summary_plot(M.p_virus[0], custom_labels=date_labels, main='RSV rates')
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
import seaborn as sb
age_group_labels = ['Under 2 mo.', '2-5 mo.', '6-11 mo.', '12-23 mo.']
rsv_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t],
'age': age_group_labels[i],
'year': date_labels[t]}) for i,p in enumerate(M.p_virus) for t in range(3)])
sb.factorplot('year', 'rate', 'age', rsv_rates, kind='box', aspect=1.75)
<seaborn.axisgrid.FacetGrid at 0x11a7a8780>
M_rhino = pm.MCMC(hosp_subset('Rhino'))
M_rhino.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 31.5 sec
rhino_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t],
'age': age_group_labels[i],
'year': date_labels[t]}) for i,p in enumerate(M_rhino.p_virus) for t in range(3)])
sb.set(style="white", palette="muted")
fg = sb.factorplot('year', 'rate', 'age', rhino_rates, kind='box', aspect=1.75)
for virus in viruses:
print('\nRunning', virus)
M = pm.MCMC(hosp_subset(virus))
M.sample(50000, 40000)
sb.set(style="white", palette="muted")
virus_rates = pd.concat([pd.DataFrame({'rate':p.trace().T[t],
'age': age_group_labels[i],
'year': date_labels[t]}) for i,p in enumerate(M.p_virus) for t in range(3)])
fg = sb.factorplot('year', 'rate', 'age', virus_rates, kind='box', aspect=1.75)
fg.savefig(virus)
M.write_csv(virus, variables=[x.__name__ for x in M.p_virus])
Running RSV [-----------------100%-----------------] 50000 of 50000 complete in 77.9 sec Running HMPV [-----------------100%-----------------] 50000 of 50000 complete in 83.4 sec Running Rhino [-----------------100%-----------------] 50000 of 50000 complete in 79.5 sec Running Influenza [-----------------100%-----------------] 50000 of 50000 complete in 78.9 sec Running Adeno [-----------------100%-----------------] 50000 of 50000 complete in 78.6 sec Running PIV [-----------------100%-----------------] 50000 of 50000 complete in 78.0 sec Running No virus [-----------------100%-----------------] 50000 of 50000 complete in 78.0 sec Running All [-----------------100%-----------------] 50000 of 50000 complete in 78.2 sec