#!/usr/bin/env python # coding: utf-8 # # Disease Outbreak Response Decision-making Under Uncertainty: A retrospective analysis of measles in Sao Paulo # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np import numpy.ma as ma from datetime import datetime import matplotlib.pyplot as plt import pdb from IPython.core.display import HTML def css_styling(): styles = open("styles/custom.css", "r").read() return HTML(styles) css_styling() # In[3]: data_dir = "data/" # Import outbreak data # In[4]: measles_data = pd.read_csv(data_dir+"measles.csv", index_col=0) measles_data.NOTIFICATION = pd.to_datetime(measles_data.NOTIFICATION) measles_data.BIRTH = pd.to_datetime(measles_data.BIRTH) measles_data.ONSET = pd.to_datetime(measles_data.ONSET) # In[5]: measles_data = measles_data.replace({'DISTRICT': {'BRASILANDIA':'BRAZILANDIA'}}) # Sao Paulo population by district # In[6]: sp_pop = pd.read_csv(data_dir+'sp_pop.csv', index_col=0) # In[7]: _names = sp_pop.index.values _names[_names=='BRASILANDIA'] = 'BRAZILANDIA' sp_pop.set_index(_names, inplace = True) # In[8]: sp_pop.head() # Plot of cumulative cases by district # In[9]: measles_onset_dist = measles_data.groupby(['DISTRICT','ONSET']).size().unstack(level=0).fillna(0) measles_onset_dist.cumsum().plot(legend=False, grid=False) # In[10]: total_district_cases = measles_onset_dist.sum() # Top 5 districts by number of cases # In[11]: totals = measles_onset_dist.sum() totals.sort(ascending=False) totals[:5] # Age distribution of cases, by confirmation status # In[12]: by_conclusion = measles_data.groupby(["YEAR_AGE", "CONCLUSION"]) counts_by_cause = by_conclusion.size().unstack().fillna(0) ax = counts_by_cause.plot(kind='bar', stacked=True, xlim=(0,50), figsize=(15,5)) # ## Vaccination data # In[13]: vaccination_data = pd.read_csv('data/BrazilVaxRecords.csv', index_col=0) vaccination_data.head() # In[14]: vaccination_data.VAX[:18] # In[15]: vax_97 = np.r_[[0]*(1979-1921+1), vaccination_data.VAX[:17]] n = len(vax_97) FOI_mat = np.resize((1 - vax_97*0.9), (n,n)).T # In[16]: # Mean age of infection for those born prior to vaccination coverage, assuming R0=16 A = 4.37 # In[17]: (1 - vax_97*0.9)[:-1] # In[18]: np.tril(FOI_mat).sum(0) # In[19]: natural_susc = np.exp((-1/A) * np.tril(FOI_mat).sum(0))[::-1] vacc_susc = (1 - vax_97*0.9)[::-1] vacc_susc[0] = 0.5 vacc_susc # In[20]: sia_susc = np.ones(len(vax_97)) birth_year = np.arange(1922, 1998)[::-1] by_mask = (birth_year > 1983) & (birth_year < 1992) sia_susc[by_mask] *= 0.2 # ## Stochastic Disease Transmission Model # # As a baseline for comparison, we can fit a model to all the clinically-confirmed cases, regardless of lab confirmation status. For this, we will use a simple SIR disease model, which will be fit using MCMC. # # This model fits the series of 2-week infection totals in each district $i$ as a set of Poisson models: # # \\[Pr(I(t)_{i} | \lambda(t)_i) = \text{Poisson}(\lambda(t)_i) \\] # # Where the outbreak intensity is modeled as: # # \\[\lambda(t)_i = \beta [I^{(w)}(t-1)_i]^{\alpha} S(t-1)_i\\] # # \\[\alpha \sim \text{Exp}(1)\\] # # We will assume here that the transmission rate is constant over time (and across districts): # # \\[\beta \sim \text{Gamma}(1, 0.1)\\] # # To account for the influence of infected individuals from neighboring districts on new infections, the outbreak intensity was modeled using a spatial-weighted average of infecteds across districts, where populations were weighted as an exponential function of the distance between district centroids: # # \\[w_{d} = \text{exp}(-\theta d)\\] # # \\[\theta \sim \text{Exp}(1)\\] # # ### Confirmation Sub-model # # Rather than assume all clinical cases are true cases, we can adjust the model to account for lab confirmation probability. This is done by including a sub-model that estimates age group-specific probabilities of confirmation, and using these probabilities to estimate the number of lab-confirmed cases. These estimates are then plugged into the model in place of the clinically-confirmed cases. # # We specified a structured confirmation model to retrospectively determine the age group-specific probabilities of lab confirmation for measles, conditional on clinical diagnosis. Individual lab confirmation events $c_i$ were modeled as Bernoulli random variables, with the probability of confirmation being allowed to vary by age group: # # $$c_i \sim \text{Bernoulli}(p_{a(i)})$$ # # where $a(i)$ denotes the appropriate age group for the individual indexed by i. There were 16 age groups, the first 15 of which were 5-year age intervals $[0,5), [5, 10), \ldots , [70, 75)$, with the 16th interval including all individuals 75 years and older. # # Since the age interval choices were arbitrary, and the confirmation probabilities of adjacent groups likely correlated, we modeled the correlation structure directly, using a multivariate logit-normal model. Specifically, we allowed first-order autocorrelation among the age groups, whereby the variance-covariance matrix retained a tridiagonal structure. # # $$\begin{aligned} # \Sigma = \left[{ # \begin{array}{c} # {\sigma^2} & {\sigma^2 \rho} & 0& \ldots & {0} & {0} \\ # {\sigma^2 \rho} & {\sigma^2} & \sigma^2 \rho & \ldots & {0} & {0} \\ # {0} & \sigma^2 \rho & {\sigma^2} & \ldots & {0} & {0} \\ # \vdots & \vdots & \vdots & & \vdots & \vdots\\ # {0} & {0} & 0 & \ldots & {\sigma^2} & \sigma^2 \rho \\ # {0} & {0} & 0 & \ldots & \sigma^2 \rho & {\sigma^2} # \end{array} # }\right] # \end{aligned}$$ # # From this, the confirmation probabilities were specified as multivariate normal on the inverse-logit scale. # # $$ \text{logit}(p_a) = \{a\} \sim N(\mu, \Sigma)$$ # # Priors for the confirmation sub-model were specified by: # # $$\begin{aligned} # \mu_i &\sim N(0, 100) \\ # \sigma &\sim \text{HalfCauchy}(25) \\ # \rho &\sim U(-1, 1) # \end{aligned}$$ # Age classes are defined in 5-year intervals. # In[21]: age_classes = [0,5,10,15,20,25,30,35,40,100] measles_data.dropna(subset=['YEAR_AGE'], inplace=True) measles_data['YEAR_AGE'] = measles_data.YEAR_AGE.astype(int) measles_data['AGE_GROUP'] = pd.cut(measles_data.AGE, age_classes, right=False) # Lab-checked observations are extracted for use in estimating lab confirmation probability. # In[22]: CONFIRMED = measles_data.CONCLUSION == 'CONFIRMED' CLINICAL = measles_data.CONCLUSION == 'CLINICAL' DISCARDED = measles_data.CONCLUSION == 'DISCARDED' # Extract confirmed and clinical subset, with no missing county information. # In[23]: lab_subset = measles_data[(CONFIRMED | CLINICAL) & measles_data.COUNTY.notnull()].copy() # In[24]: age = lab_subset.YEAR_AGE.values ages = lab_subset.YEAR_AGE.unique() counties = lab_subset.COUNTY.unique() y = (lab_subset.CONCLUSION=='CONFIRMED').values # In[25]: _lab_subset = lab_subset.replace({"CONCLUSION": {"CLINICAL": "UNCONFIRMED"}}) by_conclusion = _lab_subset.groupby(["YEAR_AGE", "CONCLUSION"]) counts_by_cause = by_conclusion.size().unstack().fillna(0) ax = counts_by_cause.plot(kind='bar', stacked=True, xlim=(0,50), figsize=(15,5), grid=False) # In[26]: lab_subset.shape # In[27]: y.sum() # Proportion of lab-confirmed cases older than 20 years # In[28]: (measles_data[CONFIRMED].YEAR_AGE>20).mean() # In[29]: age_classes # In[30]: #Extract cases by age and time. age_group = pd.cut(age, age_classes, right=False) age_index = np.array([age_group.categories.tolist().index(i) for i in age_group]) age_groups = age_group.categories age_groups # In[31]: age_slice_endpoints = [g[1:-1].split(',') for g in age_groups] age_slices = [slice(int(i[0]), int(i[1])) for i in age_slice_endpoints] # In[32]: # Get index from full crosstabulation to use as index for each district dates_index = measles_data.groupby( ['ONSET', 'AGE_GROUP']).size().unstack().index # In[33]: unique_districts = measles_data.DISTRICT.dropna().unique() # In[34]: excludes = ['BOM RETIRO'] # In[35]: N = sp_pop.drop(excludes).ix[unique_districts].sum().drop('Total') N # In[36]: N_age = N.iloc[:8] N_age.index = age_groups[:-1] N_age[age_groups[-1]] = N.iloc[8:].sum() N_age # Compile bi-weekly confirmed and unconfirmed data by Sao Paulo district # In[37]: sp_counts_2w = lab_subset.groupby( ['ONSET', 'AGE_GROUP']).size().unstack().reindex(dates_index).fillna(0).resample('2W', how='sum') # All confirmed cases, by district confirmed_data = lab_subset[lab_subset.CONCLUSION=='CONFIRMED'] confirmed_counts = confirmed_data.groupby( ['ONSET', 'AGE_GROUP']).size().unstack().reindex(dates_index).fillna(0).sum() all_confirmed_cases = confirmed_counts.reindex_axis(measles_data['AGE_GROUP'].unique()).fillna(0) # In[38]: # Ensure the age groups are ordered I_obs = sp_counts_2w.reindex_axis(measles_data['AGE_GROUP'].unique(), axis=1).fillna(0).values.astype(int) # Check shape of data frame # # - 28 bi-monthly intervals, 9 age groups # In[39]: assert I_obs.shape == (28, len(age_groups)) # Prior distribution on susceptible proportion: # # $$p_s \sim \text{Beta}(2, 100)$$ # In[40]: from pymc import rbeta plt.hist(rbeta(2, 100, 10000)) # In[41]: I_obs # In[42]: obs_date = '1997-12-01' #'1997-06-15' obs_index = sp_counts_2w.index <= obs_date I_obs_t = I_obs[obs_index] # In[43]: np.sum(I_obs_t, (0)) / float(I_obs_t.sum()) # In[44]: from pymc import rgamma # In[45]: plt.hist(rgamma(16,1,size=10000)) # In[46]: 75./age.mean() # In[86]: from pymc import MCMC, Matplot, AdaptiveMetropolis, Slicer, MAP from pymc import (Uniform, DiscreteUniform, Beta, Binomial, Normal, CompletedDirichlet, Poisson, NegativeBinomial, negative_binomial_like, poisson_like, Lognormal, Exponential, binomial_like, TruncatedNormal, Binomial, Gamma, HalfCauchy, normal_like, MvNormalCov, Bernoulli, Uninformative, Multinomial, rmultinomial, rbinomial, Dirichlet, multinomial_like) from pymc import (Lambda, observed, invlogit, deterministic, potential, stochastic,) def measles_model(obs_date, confirmation=True, spatial_weighting=False, all_traces=True): n_periods, n_age_groups = I_obs.shape ### Confirmation sub-model if confirmation: # Specify priors on age-specific means age_classes = np.unique(age_index) mu = Normal("mu", mu=0, tau=0.0001, value=[0]*len(age_classes)) sig = HalfCauchy('sig', 0, 25, value=1) var = sig**2 cor = Uniform('cor', -1, 1, value=0) # Build variance-covariance matrix with first-order correlation # among age classes @deterministic def Sigma(var=var, cor=cor): I = np.eye(len(age_classes))*var E = np.diag(np.ones(len(age_classes)-1), k=-1)*var*cor return I + E + E.T # Age-specific probabilities of confirmation as multivariate normal # random variables beta_age = MvNormalCov("beta_age", mu=mu, C=Sigma, value=[1]*len(age_classes)) p_age = Lambda('p_age', lambda t=beta_age: invlogit(t)) @deterministic(trace=False) def p_confirm(beta=beta_age): return invlogit(beta[age_index]) # Confirmation likelihood lab_confirmed = Bernoulli('lab_confirmed', p=p_confirm, value=y, observed=True) ''' Truncate data at observation period ''' obs_index = sp_counts_2w.index <= obs_date I_obs_t = I_obs[obs_index] # Index for observation date, used to index out values of interest # from the model. t_obs = obs_index.sum() - 1 if confirmation: @stochastic(trace=all_traces, dtype=int) def I(value=(I_obs_t).astype(int), n=I_obs_t, p=p_age): # Binomial confirmation process return np.sum([binomial_like(x, x.sum(), p) for x in value]) else: I = I_obs_t assert I.shape == (t_obs +1, n_age_groups) # Transmission parameter beta = HalfCauchy('beta', 0, 25, value=[8]*n_age_groups) decay = Beta('decay', 1, 5, value=0.9) @deterministic def B(b=beta, d=decay): B = b*np.eye(n_age_groups) for i in range(1, n_age_groups): B += np.diag(np.ones(n_age_groups-i)*b[i:]*d**i, k=-i) B += np.diag(np.ones(n_age_groups-i)*b[:-i]*d**i, k=i) return B # Downsample annual series to observed age groups downsample = lambda x: np.array([x[s].mean() for s in age_slices]) A = Lambda('A', lambda beta=beta: 75./(beta - 1)) lt_sum = downsample(np.tril(FOI_mat).sum(0)[::-1]) natural_susc = Lambda('natural_susc', lambda A=A: np.exp((-1/A) * lt_sum)) # natural_susc = Beta('natural_susc', 1, 1, value=[0.02]*n_age_groups) @deterministic def p_susceptible(natural_susc=natural_susc): return downsample(sia_susc) * downsample(vacc_susc) * natural_susc # Estimated total initial susceptibles S_0 = Binomial('S_0', n=N_age.astype(int), p=p_susceptible) S = Lambda('S', lambda I=I, S_0=S_0: S_0 - I.cumsum(0)) # Check shape assert S.value.shape == (t_obs+1., n_age_groups) S_t = Lambda('S_t', lambda S=S: S[-1]) @deterministic def R(B=B, S=S): return (S.dot(B) / N_age.values).T # Force of infection @deterministic def lam(B=B, I=I, S=S): return (I.sum(1) * (S.dot(B) / N_age.values).T).T # Check shape assert lam.value.shape == (t_obs+1, n_age_groups) # Poisson likelihood for observed cases @potential def new_cases(I=I, lam=lam): return poisson_like(I[1:], lam[:-1]) return locals() # In[87]: n_age_groups = 8 beta = np.arange(2, n_age_groups+2) B = beta*np.eye(n_age_groups) d = 0.9 for i in range(1, n_age_groups): B += np.diag(np.ones(n_age_groups-i)*beta[i:]*d**i, k=-i) B += np.diag(np.ones(n_age_groups-i)*beta[:-i]*d**i, k=i) B.round(2) # In[88]: S = np.ones(n_age_groups)*1000 # In[89]: B.dot(S) # In[99]: iterations = 100000 burn = 90000 # In[100]: M = MCMC(measles_model('1997-06-15', confirmation=True)) M.sample(iterations, burn) # In[101]: M.sample(iterations, burn) # In[106]: chain = -1 # In[111]: Matplot.summary_plot(M.p_susceptible, chain=chain, custom_labels=age_groups) # In[112]: Matplot.summary_plot(M.natural_susc, chain=chain, custom_labels=age_groups) # In[113]: Matplot.summary_plot(M.beta, chain=chain, custom_labels=age_groups) # Transmission decay parameter (for modeling decay in beta to other age groups) # In[110]: Matplot.plot(M.decay) # Confirmation probabilities # In[114]: Matplot.summary_plot(M.p_age, chain=chain, custom_labels=age_groups)