#!/usr/bin/env python # coding: utf-8 # In[101]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np import matplotlib.pyplot as plt from datetime import datetime import seaborn as sb sb.set_style("whitegrid") # Import data # In[102]: hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0) hospitalized.head() # Breakdown by: # # - month of age (and greater than 1 year) # - sex # - birthweight # - prematurity # - feeding: % with any breastfeeding # - maternal ed # - number in household # - daycare # - smoking exposure: cigarette, maternal during pregnacy, nargalia # - % with comorbidity: past medical history # - nationality: jordanian, palestinean, syrian, egptian, other # - diagnosis: pneumo, bronchiolitis, bronchopnewumonia # # Severity measures: # # - length of stay # - ICU # - O2 # - Mechanical vent. # - days of symptoms before admission # - % with antibiotics prior # - % with antibiotics during # # Columns: # # - RSV, MPV, Rhino, Influenza A/B/C, Adeno, Parinfluenza, Hospitalzied pop, no virus # In[152]: [v for v in hospitalized.columns if v.endswith('hx')] # In[165]: hospitalized['asthma'] = (hospitalized.child_asthma==1) | hospitalized.asthma_hx hospitalized.asthma.sum() # In[185]: hospitalized.no_hx.value_counts() # In[189]: ((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))).sum() # Past medical history calculated as those who claim to have past medical history and those who claim to have no past medical history and reported asthma. # In[192]: hospitalized['past medical history'] = (~hospitalized.no_hx.astype(bool) | ((hospitalized.no_hx==1) & (hospitalized.asthma.astype(bool))) hospitalized['past medical history'].sum() # In[103]: # Positive culture lookup pcr_lookup = {'pcr_result___1': 'RSV', 'pcr_result___2': 'HMPV', 'pcr_result___3': 'flu A', 'pcr_result___4': 'flu B', 'pcr_result___5': 'rhino', 'pcr_result___6': 'PIV1', 'pcr_result___7': 'PIV2', 'pcr_result___8': 'PIV3', 'pcr_result___13': 'H1N1', 'pcr_result___14': 'H3N2', 'pcr_result___15': 'Swine', 'pcr_result___16': 'Swine H1', 'pcr_result___17': 'flu C', 'pcr_result___18': 'Adeno'} # In[104]: hospitalized['RSV'] = hospitalized.pcr_result___1.astype(bool) hospitalized['HMPV'] = hospitalized.pcr_result___2.astype(bool) hospitalized['Rhino'] = hospitalized.pcr_result___5.astype(bool) hospitalized['Influenza'] = (hospitalized.pcr_result___3 | hospitalized.pcr_result___4 | hospitalized.pcr_result___17) hospitalized['Adeno'] = hospitalized.pcr_result___18.astype(bool) hospitalized['PIV'] = (hospitalized.pcr_result___6 | hospitalized.pcr_result___7 | hospitalized.pcr_result___8) hospitalized['No virus'] = hospitalized[list(pcr_lookup.keys())].sum(1) == 0 hospitalized['All'] = True # In[105]: viruses = ['RSV', 'HMPV', 'Rhino', 'Influenza', 'Adeno', 'PIV', 'No virus', 'All'] # In[106]: non_rsv_lookup = pcr_lookup.copy() non_rsv_lookup.pop('pcr_result___1') # Identify individuals with coinfection # In[107]: hospitalized['coinfection'] = hospitalized[list(pcr_lookup.keys())].sum(1) > 1 # Virus frequency by coinfection status # In[108]: hospitalized[(~hospitalized.coinfection) & (hospitalized.RSV)].shape[0]/hospitalized.shape[0] # In[109]: virus_by_coinf = hospitalized.groupby('coinfection')[viruses[:-2]].sum()/hospitalized.shape[0] virus_by_coinf.T.plot(kind='bar', stacked=True, grid=False, color=sb.color_palette('Paired', 2)[::-1]) # In[110]: hospitalized[viruses].sum(0) # In[111]: other_virus_index = (hospitalized[list(non_rsv_lookup.keys())].sum(1) > 0).astype(int) # In[112]: hospitalized['RSV'] = hospitalized['pcr_result___1'] hospitalized['non-RSV virus'] = (hospitalized['pcr_result___1']==0) & other_virus_index hospitalized['no virus'] = (hospitalized['pcr_result___1']==0) & (other_virus_index==0) # In[113]: hospitalized["prev_cond"] = (hospitalized[[c for c in hospitalized.columns if c.endswith('hx') and not c.startswith('no_')]].sum(1) > 0) # In[114]: hospitalized['male'] = (hospitalized.sex=='M').astype(int) age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,11,23])) age_groups.index = hospitalized.index age_groups.columns = 'under 2 months', '2-11 months', '12-23 months' hospitalized = hospitalized.join(age_groups) # In[115]: nationality_lookup = {1: 'Jordanian', 2: 'Egyptian', 3: 'Palestinian', 4: 'Iraqi', 5: 'Syrian', 6: 'Sudanese', 7: 'Russian', 8: 'Asian', 9: 'Other'} hospitalized['nationality'] = hospitalized.mother_nationality.replace(nationality_lookup) hospitalized['Jordanian'] = (hospitalized.nationality=='Jordanian').astype(int) hospitalized['Palestinian'] = (hospitalized.nationality=='Palestinian').astype(int) # In[116]: hospitalized['vitamin D < 20'] = (hospitalized.hospitalized_vitamin_d < 20).astype(int) hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 20'] = np.nan hospitalized['vitamin D < 11'] = (hospitalized.hospitalized_vitamin_d < 11).astype(int) hospitalized.loc[hospitalized.hospitalized_vitamin_d.isnull(), 'vitamin D < 11'] = np.nan # In[117]: hospitalized['any_cigarette'] = (hospitalized.cigarette_smokers > 0).astype(int) # In[194]: hospitalized['any_smoke'] = (hospitalized.cigarette_smokers.astype(bool) | hospitalized.nargila_smokers.astype(bool)) # In[196]: hospitalized.any_smoke.mean() # In[118]: hospitalized['premature'] = (hospitalized.gest_age < 37).astype(int) # In[198]: hospitalized.length_of_stay.hist(bins=20) # Diagnosis # In[143]: hospitalized['ros'] = hospitalized.adm_sepsis | hospitalized.adm_febrile # In[145]: hospitalized['pertussis-like cough'] = hospitalized.adm_pertussis | hospitalized.adm_cough # In[146]: hospitalized['brochopneumonia'] = hospitalized.adm_bronchopneumo # In[147]: hospitalized['bronchiolitis'] = hospitalized.adm_bronchiolitis # In[148]: hospitalized['pneumonia'] = hospitalized.adm_pneumo # ## Rate Estimation # In[119]: hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date) hospitalized.admission_date.describe() # Age groups: # # - Under 2 months # - 2-5 mo. # - 6-11 mo. # - Over 11 mo. # In[120]: age_groups = pd.get_dummies(pd.cut(hospitalized.age_months, [0,1,5,11,24], include_lowest=True)) age_groups.index = hospitalized.index age_groups.columns = 'age_under_2', 'age_2_5', 'age_6_11', 'age_over_11' hospitalized = hospitalized.join(age_groups) # In[121]: age_group_lookup = {'age_under_2': '<2', 'age_2_5': '2-5', 'age_6_11': '6-11', 'age_over_11': '>11'} # In[122]: customcmap = sb.color_palette("coolwarm", 6) # In[123]: fig, axes = plt.subplots(2,2, figsize=(10,6), sharey=True) for i,ax in enumerate(np.ravel(axes)): age_group = age_groups.columns[i] age_subset = hospitalized[hospitalized[age_group].astype(bool)] age_virus = age_subset[viruses[:-2]].mean() age_virus.T.plot(kind='bar', grid=False, ax=ax, color=customcmap) ax.set_title(age_group_lookup[age_group]) # ### Estimation of age group distribution # In[124]: hosp_age_counts = hospitalized.groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum').values # In[125]: pre_2013 = hospitalized.admission_date < datetime(2013, 1, 1) # In[126]: age_counts = hospitalized[pre_2013].groupby('admission_date')[age_groups.columns].sum().resample('M', how='sum') age_counts # Rates and demographics (via [World Bank](http://databank.worldbank.org)) and 2004 census data (via Jordan [Department of Statistics](http://www.dos.gov.jo/dos_home_e/main/population/census2004/group3/table_31.pdf)). # In[127]: # Jordan population, 2008-2012 population = 5786000, 5915000, 6046000, 6181000, 6318000 # Interpolated population by gender and age, 2010-2012 female_0 = 93649, 95739, 96486 male_0 = 98435, 100525, 101160 female_1 = 87941, 93511, 93067 male_1 = 92548, 98306, 97763 kids_0 = np.array(female_0) + np.array(male_0) kids_1 = np.array(female_1) + np.array(male_1) kids = kids_0 + kids_1 # Proportion in Amman amman_urban_2004 = 1784502. jordan_2004 = 5103639. amman_prop = amman_urban_2004 / jordan_2004 # Birth rates (per 1000) birth_rate = 29.665, 29.322, 28.869, 28.317, 27.699 # Neonatal mortality (per 1000) neonatal_mort = 12.7, 12.4, 12.1, 11.8, 11.5 # Infant mortality (per 1000) infant_mort = 18.4, 17.9, 17.3, 16.8, 16.4 # In[128]: amman_prop # In[129]: admissions_by_month = hospitalized.groupby('admission_date')['child_name'].count().resample('1M', how='sum') # In[130]: admissions_by_month.head() # In[131]: # Dict to return pcr_result variable corresponding to virus virus_lookup = {pcr_lookup[k]: k for k in pcr_lookup.keys()} # In[132]: # Enrolling 5 days per week p_enroll = 5./7 # In[133]: rsv_subset = hospitalized[hospitalized.RSV==1] rsv_subset.shape # In[134]: kids_1 # Import hospitalization data # In[135]: spreadsheets = get_ipython().getoutput('ls data/Al-Bashir*') data = [] col_names = 'date', 'admissions', 'admissions_2', 'resp_admissions', 'resp_admissions_2' for spreadsheet in spreadsheets: this_file = pd.ExcelFile(spreadsheet) # Need to fix data entry error in years > 2011 dot = spreadsheet.find('.') fix_columns = int(spreadsheet[dot-4:dot]) > 2011 for name in this_file.sheet_names: d = this_file.parse(name) d.columns = col_names if fix_columns: total = d['resp_admissions'] + d['resp_admissions_2'] d['resp_admissions_2'] = d['resp_admissions'] d['resp_admissions'] = total data.append(d) hospitalizations = pd.concat(data).set_index('date') # In[136]: hospitalizations.head() # In[137]: under_2_hosp = hospitalizations[hospitalizations.index < datetime(2013, 1, 1)].resample('M', how=sum)['admissions_2'] under_2_hosp.head() # In[138]: rsv_by_date = rsv_subset.groupby('admission_date')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']] rsv_counts = rsv_by_date.sum().resample('1M', how='sum').fillna(0) # In[139]: rsv_counts.head() # In[140]: under2_rsv = pd.concat([rsv_counts, under_2_hosp], axis=1).dropna() under2_rsv.head() # In[141]: def extract_virus(virus): virus_subset = hospitalized[hospitalized[virus]==1] virus_by_date = virus_subset.groupby('admission_date')[['age_under_2', 'age_2_5', 'age_6_11', 'age_over_11']] virus_counts = virus_by_date.sum().resample('1M', how='sum').fillna(0) return pd.concat([virus_counts, under_2_hosp], axis=1).dropna() # In[39]: [(x*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int) for x,d in zip(adm,data_subset)] # In[40]: import pymc as pm def hosp_subset(virus): data_subset = extract_virus(virus) admissions = data_subset.pop('admissions_2').values data_subset = data_subset.values n_months, n_ages = data_subset.shape # Estimate age distribution of admissions p_age = pm.Dirichlet('p_age', np.ones(4), value=data_subset.sum(0)[:-1]/data_subset.sum()) counts = pm.Multinomial('counts', data_subset.sum(1), p_age, value=data_subset, observed=True) # Estimate denominators n_hosp = [] for i,ni in enumerate(admissions): d = data_subset[i] n_init = (ni*(d/d.sum() if (d.sum()>10) else np.ones(4)*0.25)).astype(int)[:-1] n_init = np.append(n_init, ni - n_init.sum()) n_hosp.append(pm.Multinomial('n_hosp_{}'.format(i), ni, p_age, value=n_init)) n_hosp_t = pm.Lambda('n_hosp_t', lambda x=n_hosp: np.array([[xij for xij in xi] for xi in x]).T) # Virus rates by age and time mu_alpha = pm.Normal('mu_alpha', 0, 0.01, value=[0]*n_ages) sigma_alpha = pm.Uniform('sigma_alpha', 0, 100, value=[10]*n_ages) rho = pm.Uniform('rho', -1, 1, value=0) mu = [pm.Lambda('mu_{}'.format(i), lambda mu=mu_alpha: np.array([mu[i]]*n_months)) for i in range(n_ages)] off_diag = np.eye(n_months, k=1) Sigma = [pm.Lambda('Sigma_{}'.format(i), lambda s=sigma_alpha, r=rho: (np.eye(n_months) + off_diag*r + off_diag.T*r)*(s[i]**2)) for i in range(n_ages)] alpha = [pm.MvNormalCov('alpha_{}'.format(i), mu[i], Sigma[i], value=[0]*n_months) for i in range(n_ages)] p_virus = [pm.Lambda('p_virus_{}'.format(i), lambda a=a: pm.invlogit(a)) for i,a in enumerate(alpha)] # Viral rate likelihood @pm.observed def rate_like(value=data_subset.T, p=p_virus, n=n_hosp_t): return np.sum([pm.binomial_like(value[i], n[i], p[i]) for i in range(n_ages)]) return(locals()) # In[41]: M = pm.MCMC(hosp_subset('RSV')) # In[42]: M.sample(20000, 10000) # In[43]: pm.Matplot.summary_plot(M.p_age) # In[44]: pm.Matplot.summary_plot(M.p_virus[0]) # In[45]: import seaborn as sb # In[46]: M.p_virus[0].trace().shape # In[47]: date_labels = [pd.to_datetime(str(d)).strftime('%b %d') for d in rsv_counts.index.values] # date_labels = rsv_counts.index.values.astype('M8[D]') # In[48]: # for i in range(4): ax = sb.tsplot(M.p_virus[0].trace()[-1000:], err_style="unit_traces") ax.set_xticklabels(date_labels, rotation='vertical') ax.set_ylabel('Rate'); # In[49]: sb.set(style="white", palette="muted") f, axes = plt.subplots(2, 2, figsize=(10, 8)) for i,axis in enumerate(np.ravel(axes)): sb.tsplot(M.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis) axis.set_title(age_groups.columns[i]) axis.set_xticklabels(date_labels, rotation='vertical') axis.set_ylabel('Rate'); # In[50]: M_rhino = pm.MCMC(hosp_subset('Rhino')) M_rhino.sample(20000, 10000) # In[51]: age_group_labels = {'age_under_2':'Under 2 yrs', 'age_2_5':'2-5 yrs', 'age_6_11':'6-11 yrs', 'age_over_11':'Over 11 yrs'} # In[52]: sb.set(style="white", palette="muted") f, axes = plt.subplots(2, 2, figsize=(10, 8)) for i,axis in enumerate(np.ravel(axes)): sb.tsplot(M_rhino.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis) axis.set_title(age_group_labels[age_groups.columns[i]]) axis.set_xticklabels(date_labels, rotation='vertical') axis.set_ylabel('Rate'); f.suptitle('Rhino', fontsize=14) # ### Production runs # In[53]: for virus in viruses: print('\nRunning', virus) M = pm.MCMC(hosp_subset(virus)) M.sample(50000, 40000) sb.set(style="white", palette="muted") f, axes = plt.subplots(2, 2, figsize=(10, 8)) for i,axis in enumerate(np.ravel(axes)): sb.tsplot(M.p_virus[i].trace()[-1000:], err_style="unit_traces", ax=axis) axis.set_title(age_group_labels[age_groups.columns[i]]) axis.set_xticklabels(date_labels, rotation='vertical') axis.set_ylabel('Rate'); f.suptitle(virus, fontsize=14) f.savefig(virus) M.write_csv(virus, variables=[x.__name__ for x in M.p_virus])