%matplotlib inline
import pandas as pd
import numpy as np
import numpy.ma as ma
from datetime import datetime
import matplotlib.pyplot as plt
import pdb
from IPython.core.display import HTML
def css_styling():
styles = open("styles/custom.css", "r").read()
return HTML(styles)
css_styling()
data_dir = "data/"
Import outbreak data
measles_data = pd.read_csv(data_dir+"measles.csv", index_col=0)
measles_data.NOTIFICATION = pd.to_datetime(measles_data.NOTIFICATION)
measles_data.BIRTH = pd.to_datetime(measles_data.BIRTH)
measles_data.ONSET = pd.to_datetime(measles_data.ONSET)
measles_data = measles_data.replace({'DISTRICT': {'BRASILANDIA':'BRAZILANDIA'}})
Sao Paulo population by district
sp_pop = pd.read_csv(data_dir+'sp_pop.csv', index_col=0)
_names = sp_pop.index.values
_names[_names=='BRASILANDIA'] = 'BRAZILANDIA'
sp_pop.set_index(_names, inplace = True)
sp_pop.head()
0 a 4 anos | 5 a 9 anos | 10 a 14 anos | 15 a 19 anos | 20 a 24 anos | 25 a 29 anos | 30 a 34 anos | 35 a 39 anos | 40 a 44 anos | 45 a 49 anos | 50 a 54 anos | 55 a 59 anos | 60 a 64 anos | 65 a 69 anos | 70 a 74 anos | 75 anos e + | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AGUA RASA | 5411 | 5750 | 6450 | 7122 | 7621 | 7340 | 6999 | 6984 | 6346 | 5608 | 4987 | 4212 | 4152 | 3595 | 2937 | 3637 | 89151 |
ALTO DE PINHEIROS | 2070 | 2369 | 2953 | 3661 | 4612 | 4190 | 3539 | 3633 | 3448 | 3289 | 3040 | 2533 | 2298 | 1732 | 1305 | 1823 | 46495 |
ANHANGUERA | 3068 | 3006 | 2755 | 2431 | 2426 | 2636 | 2695 | 2308 | 1653 | 1107 | 753 | 509 | 352 | 217 | 162 | 171 | 26249 |
ARICANDUVA | 7732 | 7730 | 8373 | 8956 | 9182 | 8531 | 7813 | 7365 | 6551 | 5554 | 4887 | 3858 | 3320 | 2449 | 1611 | 1723 | 95635 |
ARTUR ALVIM | 9031 | 9078 | 10000 | 11058 | 11387 | 10347 | 9125 | 8658 | 7830 | 7055 | 5919 | 4612 | 3756 | 2633 | 1727 | 1724 | 113940 |
Plot of cumulative cases by district
measles_onset_dist = measles_data.groupby(['DISTRICT','ONSET']).size().unstack(level=0).fillna(0)
measles_onset_dist.cumsum().plot(legend=False, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x10ef5ef60>
total_district_cases = measles_onset_dist.sum()
Top 5 districts by number of cases
totals = measles_onset_dist.sum()
totals.sort(ascending=False)
totals[:5]
DISTRICT GRAJAU 1074 JARDIM ANGELA 944 CAPAO REDONDO 849 JARDIM SAO LUIZ 778 CAMPO LIMPO 692 dtype: float64
Age distribution of cases, by confirmation status
by_conclusion = measles_data.groupby(["YEAR_AGE", "CONCLUSION"])
counts_by_cause = by_conclusion.size().unstack().fillna(0)
ax = counts_by_cause.plot(kind='bar', stacked=True, xlim=(0,50), figsize=(15,5))
As a baseline for comparison, we can fit a model to all the clinically-confirmed cases, regardless of lab confirmation status. For this, we will use a simple SIR disease model, which will be fit using MCMC.
This model fits the series of 2-week infection totals in each district $i$ as a set of Poisson models:
$$Pr(I(t)_{i} | \lambda(t)_i) = \text{Poisson}(\lambda(t)_i) $$Where the outbreak intensity is modeled as:
$$\lambda(t)_i = \beta [I^{(w)}(t-1)_i]^{\alpha} S(t-1)_i$$$$\alpha \sim \text{Exp}(1)$$We will assume here that the transmission rate is constant over time (and across districts):
$$\beta \sim \text{Gamma}(1, 0.1)$$To account for the influence of infected individuals from neighboring districts on new infections, the outbreak intensity was modeled using a spatial-weighted average of infecteds across districts, where populations were weighted as an exponential function of the distance between district centroids:
$$w_{d} = \text{exp}(-\theta d)$$$$\theta \sim \text{Exp}(1)$$Rather than assume all clinical cases are true cases, we can adjust the model to account for lab confirmation probability. This is done by including a sub-model that estimates age group-specific probabilities of confirmation, and using these probabilities to estimate the number of lab-confirmed cases. These estimates are then plugged into the model in place of the clinically-confirmed cases.
We specified a structured confirmation model to retrospectively determine the age group-specific probabilities of lab confirmation for measles, conditional on clinical diagnosis. Individual lab confirmation events $c_i$ were modeled as Bernoulli random variables, with the probability of confirmation being allowed to vary by age group:
$$c_i \sim \text{Bernoulli}(p_{a(i)})$$where $a(i)$ denotes the appropriate age group for the individual indexed by i. There were 16 age groups, the first 15 of which were 5-year age intervals $[0,5), [5, 10), \ldots , [70, 75)$, with the 16th interval including all individuals 75 years and older.
Since the age interval choices were arbitrary, and the confirmation probabilities of adjacent groups likely correlated, we modeled the correlation structure directly, using a multivariate logit-normal model. Specifically, we allowed first-order autocorrelation among the age groups, whereby the variance-covariance matrix retained a tridiagonal structure.
$$\begin{aligned} \Sigma = \left[{ \begin{array}{c} {\sigma^2} & {\sigma^2 \rho} & 0& \ldots & {0} & {0} \\ {\sigma^2 \rho} & {\sigma^2} & \sigma^2 \rho & \ldots & {0} & {0} \\ {0} & \sigma^2 \rho & {\sigma^2} & \ldots & {0} & {0} \\ \vdots & \vdots & \vdots & & \vdots & \vdots\\ {0} & {0} & 0 & \ldots & {\sigma^2} & \sigma^2 \rho \\ {0} & {0} & 0 & \ldots & \sigma^2 \rho & {\sigma^2} \end{array} }\right] \end{aligned}$$From this, the confirmation probabilities were specified as multivariate normal on the inverse-logit scale.
$$ \text{logit}(p_a) = \{a\} \sim N(\mu, \Sigma)$$Priors for the confirmation sub-model were specified by:
$$\begin{aligned} \mu_i &\sim N(0, 100) \\ \sigma &\sim \text{HalfCauchy}(25) \\ \rho &\sim U(-1, 1) \end{aligned}$$Age classes are defined in 5-year intervals.
age_classes = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,100]
measles_data.dropna(subset=['YEAR_AGE'], inplace=True)
measles_data['YEAR_AGE'] = measles_data.YEAR_AGE.astype(int)
measles_data['AGE_GROUP'] = pd.cut(measles_data.AGE, age_classes, right=False)
Lab-checked observations are extracted for use in estimating lab confirmation probability.
CONFIRMED = measles_data.CONCLUSION == 'CONFIRMED'
CLINICAL = measles_data.CONCLUSION == 'CLINICAL'
DISCARDED = measles_data.CONCLUSION == 'DISCARDED'
Extract confirmed and clinical subset, with no missing county information.
lab_subset = measles_data[(CONFIRMED | CLINICAL) & measles_data.COUNTY.notnull()].copy()
age = lab_subset.YEAR_AGE.values
ages = lab_subset.YEAR_AGE.unique()
counties = lab_subset.COUNTY.unique()
y = (lab_subset.CONCLUSION=='CONFIRMED').values
_lab_subset = lab_subset.replace({"CONCLUSION": {"CLINICAL": "UNCONFIRMED"}})
by_conclusion = _lab_subset.groupby(["YEAR_AGE", "CONCLUSION"])
counts_by_cause = by_conclusion.size().unstack().fillna(0)
ax = counts_by_cause.plot(kind='bar', stacked=True, xlim=(0,50), figsize=(15,5), grid=False)
lab_subset.shape
(39982, 16)
y.sum()
22097
Proportion of lab-confirmed cases older than 20 years
(measles_data[CONFIRMED].YEAR_AGE>20).mean()
0.60257048468117846
#Extract cases by age and time.
age_group = pd.cut(age, age_classes, right=False)
age_index = np.array([age_group.categories.tolist().index(i) for i in age_group])
age_group.categories
Index(['[0, 5)', '[5, 10)', '[10, 15)', '[15, 20)', '[20, 25)', '[25, 30)', '[30, 35)', '[35, 40)', '[40, 45)', '[45, 50)', '[50, 55)', '[55, 60)', '[60, 65)', '[65, 70)', '[70, 75)', '[75, 100)'], dtype='object')
# Get index from full crosstabulation to use as index for each district
dates_index = measles_data.groupby(
['ONSET', 'AGE_GROUP']).size().unstack().index
unique_districts = measles_data.DISTRICT.dropna().unique()
excludes = ['BOM RETIRO']
N = sp_pop.ix[unique_districts, 'Total'].dropna()
N = N.drop(excludes)
sp_districts = N.index.values
len(sp_districts)
92
Compile bi-weekly confirmed and unconfirmed data by Sao Paulo district
all_district_data = []
all_confirmed_cases = []
for d in sp_districts:
# All bi-weekly unconfirmed and confirmed cases
district_data = lab_subset[lab_subset.DISTRICT==d]
district_counts_2w = district_data.groupby(
['ONSET', 'AGE_GROUP']).size().unstack().reindex(dates_index).fillna(0).resample('2W', how='sum')
all_district_data.append(district_counts_2w)
# All confirmed cases, by district
confirmed_data = district_data[district_data.CONCLUSION=='CONFIRMED']
confirmed_counts = confirmed_data.groupby(
['ONSET', 'AGE_GROUP']).size().unstack().reindex(dates_index).fillna(0).sum()
all_confirmed_cases.append(confirmed_counts.reindex_axis(measles_data['AGE_GROUP'].unique()).fillna(0))
Time series of cases by district, summarized in 2-week intervals
# Sum over ages for susceptibles
sp_cases_2w = [dist.sum(1) for dist in all_district_data]
len(sp_cases_2w)
92
# Ensure the age groups are ordered
I_obs = np.array([dist.reindex_axis(measles_data['AGE_GROUP'].unique(),
axis=1).fillna(0).values.astype(int) for dist in all_district_data])
I_obs.max()
46
I_obs.sum()
16640
age_groups = measles_data['AGE_GROUP'].unique()
age_groups
array(['[20, 25)', '[5, 10)', '[25, 30)', '[0, 5)', '[35, 40)', '[30, 35)', '[50, 55)', '[15, 20)', '[10, 15)', '[40, 45)', '[45, 50)', '[75, 100)', '[65, 70)', '[55, 60)', '[70, 75)', '[60, 65)'], dtype=object)
Check shape of data frame
assert I_obs.shape == (92, 28, 16)
import geopandas as gpd
shp = gpd.GeoDataFrame.from_file("Sao Paulo/Brazil_full/BRA_adm3.shp")
district_names = N.index.unique()
import trans
shp['district_name'] = shp.NAME_3.apply(
lambda x: trans.trans(x).upper())
sp_shp = shp[shp.NAME_2=='São Paulo'].set_index('district_name')
centroids = sp_shp.geometry.centroid
distance_matrix = pd.concat([sp_shp.geometry.distance(o) for o in sp_shp.geometry],
axis=1)
distance_matrix.columns = sp_shp.index
assert (distance_matrix.index == centroids.index).all()
distance_matrix = distance_matrix.ix[sp_districts, sp_districts]
assert not distance_matrix.isnull().values.sum()
min_x, min_y = sp_shp.bounds.min()[:2]
max_x, max_y = sp_shp.bounds.max()[2:]
centroid_xy = np.array([[c.x, c.y] for c in sp_shp.geometry.centroid])
Here is an arbitrary distance metric for an arbitrary district, as an example.
_beta = -100
np.exp(_beta*distance_matrix).values.round(2)[0]
array([ 1. , 0.04, 0. , 1. , 0.01, 0. , 0.27, 0.05, 0. , 0.13, 0. , 0. , 0. , 0. , 1. , 0. , 0.01, 0.23, 0. , 0. , 0.1 , 0. , 0. , 0. , 0. , 1. , 0. , 0.06, 0. , 0.01, 0. , 0. , 0. , 0. , 0. , 0.01, 0.02, 0. , 0.01, 0. , 0. , 0.03, 1. , 0.48, 0. , 0. , 0.05, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.01, 0. , 0.02, 0.02, 0.8 , 0.09, 0.01, 0.28, 0. , 0. , 0. , 0.11, 0.03, 0. , 0. , 0.2 , 0.26, 0. , 0. , 0. , 0. , 0.01, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.01, 0.31, 0.02, 0. , 0. , 0. , 0. , 1. ])
Specifying a neighborhood, based on shared borders. This can be used to develop a conditional autoregressive (CAR) model.
neighbors = np.array([sp_shp.geometry.touches(v).values for i, v in sp_shp.geometry.iteritems()])
Prior distribution on susceptible proportion:
$$p_s \sim \text{Beta}(2, 100)$$from pymc import rbeta
plt.hist(rbeta(2, 100, 10000))
(array([ 3.12300000e+03, 3.52900000e+03, 1.94100000e+03, 8.50000000e+02, 3.50000000e+02, 1.40000000e+02, 5.20000000e+01, 1.10000000e+01, 3.00000000e+00, 1.00000000e+00]), array([ 0.00024003, 0.01133855, 0.02243708, 0.0335356 , 0.04463413, 0.05573266, 0.06683118, 0.07792971, 0.08902823, 0.10012676, 0.11122528]), <a list of 10 Patch objects>)
infected_by_age = I_obs.sum(0).sum(0)
infected_dist = infected_by_age/float(infected_by_age.sum())
from pymc import MCMC, Matplot
from pymc import (Uniform, DiscreteUniform, Beta, Lambda, Binomial, Normal,
Poisson, NegativeBinomial, observed, negative_binomial_like, poisson_like,
Lognormal, Exponential, binomial_like, stochastic, potential,
invlogit, TruncatedNormal, Binomial, Gamma, HalfCauchy, normal_like,
deterministic, MvNormalCov, Bernoulli, potential, Uninformative,
Multinomial, rmultinomial, rbinomial, AdaptiveMetropolis,
Dirichlet, multinomial_like)
def measles_model(obs_date, confirmation=True, all_traces=False):
n_districts, n_periods, n_age_groups = I_obs.shape
### Confirmation sub-model
if confirmation:
# Specify priors on age-specific means
age_classes = np.unique(age_index)
mu = Normal("mu", mu=0, tau=0.0001, value=[0]*len(age_classes))
sig = HalfCauchy('sig', 0, 25, value=1)
var = sig**2
cor = Uniform('cor', -1, 1, value=0)
# Build variance-covariance matrix with first-order correlation
# among age classes
@deterministic
def Sigma(var=var, cor=cor):
I = np.eye(len(age_classes))*var
E = np.diag(np.ones(len(age_classes)-1), k=-1)*var*cor
return I + E + E.T
# Age-specific probabilities of confirmation as multivariate normal
# random variables
beta_age = MvNormalCov("beta_age", mu=mu, C=Sigma,
value=[1]*len(age_classes))
p_age = Lambda('p_age', lambda t=beta_age: invlogit(t))
@deterministic(trace=False)
def p_confirm(beta=beta_age):
return invlogit(beta[age_index])
# Confirmation likelihood
lab_confirmed = Bernoulli('lab_confirmed', p=p_confirm, value=y,
observed=True)
'''
Truncate data at observation period
'''
obs_index = all_district_data[0].index <= obs_date
I_obs_t = np.array([I_dist[obs_index] for I_dist in I_obs])
# Index for observation date, used to index out values of interest
# from the model.
t_obs = obs_index.sum() - 1
### Chain binomial model for disease dynamics
if confirmation:
@deterministic(trace=all_traces, dtype=int)
def I_age(n=I_obs_t, p=p_age):
# Binomial confirmation process: confirm by age
return np.array([[rbinomial(n_period, p) for n_period in n_dist] for n_dist in n])
# Aggregate by age
I = Lambda('I_age', lambda I=I_age: I.sum(2))
else:
# No confirmation
I_age = I_obs_t
I = I_age.sum(2)
# Estimage age distribution from observed distribution of infecteds to date
age_dist = Dirichlet('age_dist', np.ones(n_age_groups))
@potential
def I_age_like(x=I_age, p=age_dist):
return multinomial_like(np.sum(x, (0,1)), n=x.sum(), p=np.append(p, 1-p.sum()))
# Weakly-informative prior on proportion susceptible being
# between 0 and 0.07
p_susceptible = Beta('p_susceptible', 2, 100, size=n_districts)
# Estimated total susceptibles by district
S_0 = Binomial('S_0', n=N.values.astype(int), p=p_susceptible)
@deterministic(trace=all_traces)
def S(I=I, S_0=S_0):
# Calculate susceptibles from total number of infections
return np.array([S_0[d] - I[d].cumsum() for d in range(n_districts)])
# Susceptibles at time t, by age
@deterministic
def S_age(S=S, p=age_dist):
return np.array([rmultinomial(S_dist[-1], np.append(p, 1-p.sum())) for S_dist in S])
# Transmission parameter
β = Gamma('β', 1, 0.1, value=0.001)
θ = Exponential('θ', 1, value=1)
@deterministic
def Iw(I=I, θ=θ):
# Distance-weighted infecteds
return np.transpose([np.exp(-θ*distance_matrix.values).dot(It) for It in I.T])
α = Exponential('α', 1, value=1)
# Force of infection
@deterministic
def λ(β=β, I=Iw, S=S, α=α):
return np.array([β*(I_d[:-1]**α)*S_d[:-1] for I_d, S_d in zip(I,S)])
# FOI in observation period
λ_t = Lambda('λ_t', lambda λ=λ: λ[:, -1])
# Poisson likelihood for observed cases
@potential
def new_cases(I=I, λ=λ):
return poisson_like(I[:,1:], λ)
'''
Vaccination targets
'''
@deterministic
def vacc_5(S=S_age):
# Vaccination of 15 and under
p = [0.95] + [0]*15
return rbinomial(S.sum(0), p)
# Proportion of susceptibles vaccinated
pct_5 = Lambda('pct_5',
lambda V=vacc_5, S=S_age: float(V.sum())/S.sum())
@deterministic
def vacc_15(S=S_age):
# Vaccination of 15 and under
p = [0.95]*3 + [0]*13
return rbinomial(S.sum(0), p)
# Proportion of susceptibles vaccinated
pct_15 = Lambda('pct_15',
lambda V=vacc_15, S=S_age: float(V.sum())/S.sum())
@deterministic
def vacc_30(S=S_age):
# Vaccination of 30 and under
p = [0.95]*6 + [0]*10
return rbinomial(S.sum(0), p)
# Proportion of 30 and under susceptibles vaccinated
pct_30 = Lambda('pct_30',
lambda V=vacc_30, S=S_age: float(V.sum())/S.sum())
@deterministic
def vacc_adult(S=S_age):
# Vaccination of adults under 30 (and young kids)
p = [0.95, 0, 0, 0, 0.95, 0.95] + [0]*10
return rbinomial(S.sum(0), p)
# Proportion of adults under 30 (and young kids)
pct_adult = Lambda('pct_adult',
lambda V=vacc_adult, S=S_age: float(V.sum())/S.sum())
return locals()
Run models for June 15 and July 15 observation points, both with and without clinical confirmation.
db = 'ram'
n_iterations = 50000
n_burn = 40000
June 15, with lab confirmation
model = measles_model
model_june = MCMC(model('1997-06-15'), db=db, dbname='model_june')
# model_june.use_step_method(AdaptiveMetropolis, model_june.S_0)
model_june.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50000 of 50000 complete in 2186.6 sec
model_june.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50000 of 50000 complete in 3026.9 sec
July 15, with lab confirmation
model_july = MCMC(model('1997-07-15'), db=db, dbname='model_july')
# model_july.use_step_method(AdaptiveMetropolis, model_july.S_0)
model_july.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50000 of 50000 complete in 2498.9 sec
June 15, no lab confirmation
model_june_noconf = MCMC(model('1997-06-15',
confirmation=False),
db=db, dbname='model_june_noconf')
model_june_noconf.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50000 of 50000 complete in 231.9 sec
July 15, no lab confirmation
model_july_noconf = MCMC(model('1997-07-15',
confirmation=False),
db=db, dbname='model_july_noconf')
model_july_noconf.sample(50000, 40000)
[-----------------100%-----------------] 50000 of 50000 complete in 258.5 sec
Matplot.plot(model_june.β)
Plotting β
Lab confirmation rates, June model
import seaborn as sb
p_age = pd.DataFrame(model_june.p_age.trace(), columns=age_groups)
f, axes = plt.subplots(figsize=(14,6))
sb.boxplot(data=p_age, linewidth=0.3, fliersize=0, ax=axes,
color=sb.color_palette("coolwarm", 5)[0])
axes.set_ylabel('Confirmation rate')
axes.set_xlabel('Age group')
<matplotlib.text.Text at 0x10eca3a90>
Proportion of population susceptible, June model.
Matplot.summary_plot(model_june.p_susceptible)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Proportion of population susceptible, June model with no confirmation correction
Matplot.summary_plot(model_june_noconf.p_susceptible)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Matplot.summary_plot(model_june.λ_t)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Matplot.summary_plot(model_july.λ_t)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Epidemic intensity for lab- versus clinical-confirmation models
lam_june = model_june.λ.stats()
fig, axes = plt.subplots(2, 1, sharey=True)
axes[0].plot(lam_june['quantiles'][50].T, 'b-', alpha=0.4)
axes[0].set_ylabel('Epidemic intensity')
axes[0].set_xlabel('time (2-week periods)')
axes[0].set_title('Lab confirmation')
lam_june_noconf = model_june_noconf.λ.stats()
axes[1].plot(lam_june_noconf['quantiles'][50].T, 'b-', alpha=0.4)
axes[1].set_ylabel('Epidemic intensity')
axes[1].set_xlabel('time (2-week periods)')
axes[1].set_title('Clinical confirmation')
<matplotlib.text.Text at 0x1a86c4c18>
S_age_june = pd.DataFrame(model_june.S_age.trace()[:, -1], columns=age_groups).unstack().reset_index()
S_age_june.columns = 'Age', 'Iteration', 'S'
S_age_june['Confirmation'] = 'Lab'
S_age_june = pd.DataFrame(model_june.S_age.trace()[:, -1], columns=age_groups).unstack().reset_index()
S_age_june.columns = 'Age', 'Iteration', 'S'
S_age_june['Confirmation'] = 'Lab'
S_age_june_noconf = pd.DataFrame(model_june_noconf.S_age.trace()[:, -1], columns=age_groups).unstack().reset_index()
S_age_june_noconf.columns = 'Age', 'Iteration', 'S'
S_age_june_noconf['Confirmation'] = 'Clinical'
S_age_june = pd.concat([S_age_june, S_age_june_noconf], ignore_index=True)
S_age_july = pd.DataFrame(model_july.S_age.trace()[:, -1], columns=age_groups).unstack().reset_index()
S_age_july.columns = 'Age', 'Iteration', 'S'
S_age_july['Confirmation'] = 'Lab'
S_age_july_noconf = pd.DataFrame(model_july_noconf.S_age.trace()[:, -1], columns=age_groups).unstack().reset_index()
S_age_july_noconf.columns = 'Age', 'Iteration', 'S'
S_age_july_noconf['Confirmation'] = 'Clinical'
S_age_july = pd.concat([S_age_july, S_age_july_noconf], ignore_index=True)
Numbers of suscepibles in each age group, under lab vs clinical confirmation
import seaborn as sb
sb.set_context("talk", font_scale=0.8)
sb.set_style("white")
g = sb.factorplot("Age", "S", "Confirmation", S_age_june, kind="box",
palette="hls", size=6, aspect=2, linewidth=0.3, fliersize=0,
order=np.sort(S_age_june.Age.unique()))
g.despine(offset=10, trim=True)
g.set_axis_labels("Age Group", "Susceptibles");
june_lam = pd.DataFrame(model_june.λ_t.trace()).unstack().reset_index()
june_lam.columns = ('district', 'iteration', 'λ')
june_lam['month'] = 'June'
june_lam_noconf = pd.DataFrame(model_june_noconf.λ_t.trace()).unstack().reset_index()
june_lam_noconf.columns = ('district', 'iteration', 'λ')
june_lam_noconf['month'] = 'June'
july_lam = pd.DataFrame(model_july.λ_t.trace()).unstack().reset_index()
july_lam.columns = ('district', 'iteration', 'λ')
july_lam['month'] = 'July'
model_july.S.value.min()
70
july_lam_noconf = pd.DataFrame(model_july_noconf.λ_t.trace()).unstack().reset_index()
july_lam_noconf.columns = ('district', 'iteration', 'λ')
july_lam_noconf['month'] = 'July'
confirmed_lam = june_lam.append(july_lam, ignore_index=True)
june_means = june_lam.groupby('district')['λ'].mean()
june_means.sort(ascending=False)
july_means = july_lam.groupby('district')['λ'].mean()
july_means.sort(ascending=False)
sorted_districts = june_means.index.values
$R_{eff}$ by district in June and July (with lab confirmation), sorted by June means.
sb.set_context("talk", font_scale=0.8)
f, (ax_1, ax_2) = plt.subplots(2, 1, figsize=(12,6), sharey=True, sharex=True)
sb.boxplot('district', 'λ', data=june_lam, ax=ax_1, linewidth=0.5,
fliersize=0, color='r', order=sorted_districts)
# ax_1.hlines(1, xmin=0, xmax=93, linestyles='dashed', linewidth=0.2)
ax_1.set_xticks([])
ax_1.set_xlabel('')
ax_1.set_ylabel('June')
ax_1.set_title(r'Epidemic intensity (λ) estimates, ordered by June means')
sb.boxplot('district', 'λ', data=july_lam, ax=ax_2, linewidth=0.5,
fliersize=0, color='r', order=sorted_districts)
# ax_2.hlines(1, xmin=0, xmax=93, linestyles='dashed', linewidth=0.2)
ax_2.set_xticks([])
ax_2.set_ylabel('July')
f.tight_layout()
$R_{eff}$ by district in June for lab-confirmed and clinical-confirmed, sorted by lab-confirmed means.
f, (ax_1, ax_2) = plt.subplots(2, 1, figsize=(12,6), sharey=True, sharex=True)
sb.boxplot('district', 'λ', data=june_lam, ax=ax_1, linewidth=0.5,
fliersize=0, color='r', order=june_means.index.values)
# ax_1.hlines(1, xmin=0, xmax=93, linestyles='dotted', linewidth=0.75)
ax_1.set_xticks([])
ax_1.set_xlabel('')
ax_1.set_ylabel('Lab')
ax_1.set_title(r'June epidemic intensity (λ) estimates, ordered by lab-confirmed means')
sb.boxplot('district', 'λ', data=june_lam_noconf, ax=ax_2, linewidth=0.5,
fliersize=0, color='r', order=june_means.index.values)
# ax_2.hlines(1, xmin=0, xmax=93, linestyles='dotted', linewidth=0.75)
ax_2.set_xticks([])
ax_2.set_ylabel('Clinical')
f.tight_layout()
$R_{eff}$ by district in July for lab-confirmed and clinical-confirmed, sorted by lab-confirmed means.
july_means = july_lam.groupby('district')['λ'].mean()
july_means.sort(ascending=False)
f, (ax_1, ax_2) = plt.subplots(2, 1, figsize=(12,6), sharey=True, sharex=True)
sb.boxplot('district', 'λ', data=july_lam, ax=ax_1, linewidth=0.5,
fliersize=0, color='r', order=july_means.index.values)
# ax_1.hlines(1, xmin=0, xmax=93, linestyles='dotted', linewidth=0.75)
ax_1.set_xticks([])
ax_1.set_xlabel('')
ax_1.set_ylabel('Lab')
# ax_1.set_yticks(np.arange(13, step=2))
ax_1.set_title(r'July epidemic intensity (λ) estimates, ordered by lab-confirmed means')
sb.boxplot('district', 'λ', data=july_lam_noconf, ax=ax_2, linewidth=0.5,
fliersize=0, color='r', order=sorted_districts)
# ax_2.hlines(1, xmin=0, xmax=93, linestyles='dotted', linewidth=0.75)
ax_2.set_xticks([])
ax_2.set_ylabel('Clinical')
f.tight_layout()
model_june.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.27 0.001 0.0 [ 0.268 0.273] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.268 0.27 0.27 0.271 0.273 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.474 0.001 0.0 [ 0.471 0.476] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.471 0.473 0.474 0.475 0.476 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.696 0.001 0.0 [ 0.693 0.698] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.693 0.695 0.696 0.696 0.698 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.419 0.001 0.0 [ 0.416 0.421] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.416 0.418 0.419 0.419 0.421
june_coverage = pd.DataFrame({name: model_june.trace(name)[:] for name in ['pct_5', 'pct_15', 'pct_30', 'pct_adult']})
june_coverage['Month'] = 'June'
june_coverage['Confirmation'] = 'Lab'
june_noconf_coverage = pd.DataFrame({name: model_june_noconf.trace(name)[:] for name in ['pct_5', 'pct_15', 'pct_30', 'pct_adult']})
june_noconf_coverage['Month'] = 'June'
june_noconf_coverage['Confirmation'] = 'Clinical'
july_coverage = pd.DataFrame({name: model_july.trace(name)[:] for name in ['pct_5', 'pct_15', 'pct_30', 'pct_adult']})
july_coverage['Month'] = 'July'
july_coverage['Confirmation'] = 'Lab'
july_noconf_coverage = pd.DataFrame({name: model_july_noconf.trace(name)[:] for name in ['pct_5', 'pct_15', 'pct_30', 'pct_adult']})
july_noconf_coverage['Month'] = 'July'
july_noconf_coverage['Confirmation'] = 'Clinical'
coverage = pd.concat([june_coverage, june_noconf_coverage, july_coverage, july_noconf_coverage],
ignore_index=True)
sb.factorplot(row="Month", col="Confirmation", data=coverage, kind='box',
row_order=['June', 'July'],
order=['pct_5', 'pct_15', 'pct_30', 'pct_adult'],
palette="YlGnBu_d", linewidth=0.7, fliersize=0, aspect=1.25).despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x1c0c58438>
sb.factorplot(row="Month", col="Confirmation", data=coverage, kind='box',
row_order=['June', 'July'],
order=['pct_5', 'pct_15', 'pct_30', 'pct_adult'],
palette="YlGnBu_d", linewidth=0.7, fliersize=0, aspect=1.25).despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x1ad8f2d30>
axes = sb.boxplot(data=june_coverage, order=['pct_5', 'pct_15', 'pct_30', 'pct_adult'],
color=sb.color_palette("coolwarm", 5)[0])
axes.set_xticklabels(['Under 5', 'Under 15', 'Under 30', 'Under 5 + 20-30'])
axes.set_ylabel('% susceptibles vaccinated')
sb.despine(offset=10, trim=True)
model_june_noconf.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.231 0.003 0.0 [ 0.225 0.236] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.225 0.229 0.231 0.233 0.236 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.596 0.006 0.001 [ 0.588 0.609] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.589 0.592 0.595 0.598 0.61 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.787 0.005 0.0 [ 0.776 0.796] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.775 0.785 0.788 0.791 0.796 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.313 0.004 0.0 [ 0.306 0.322] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.306 0.31 0.312 0.315 0.322
model_july.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.056 0.001 0.0 [ 0.055 0.057] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.055 0.055 0.056 0.056 0.057 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.389 0.001 0.0 [ 0.386 0.392] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.386 0.388 0.389 0.39 0.392 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.643 0.001 0.0 [ 0.64 0.646] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.64 0.642 0.643 0.644 0.646 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.158 0.001 0.0 [ 0.155 0.16 ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.156 0.157 0.158 0.158 0.16
model_july_noconf.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.009 0.0 0.0 [ 0.008 0.009] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.008 0.009 0.009 0.009 0.009 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.468 0.005 0.0 [ 0.458 0.477] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.458 0.465 0.469 0.472 0.477 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.768 0.005 0.0 [ 0.761 0.778] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.76 0.764 0.767 0.771 0.778 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.154 0.004 0.0 [ 0.146 0.16 ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.146 0.151 0.154 0.156 0.161
lam_july = model_july.λ.stats()
plt.plot(lam_july['quantiles'][50].T, 'b-', alpha=0.4)
plt.ylabel('λ')
plt.xlabel('time (2-week periods)')
<matplotlib.text.Text at 0x1b5eb8d30>
lam_june_noconf = model_june_noconf.λ.stats()
plt.plot(lam_june_noconf['quantiles'][50].T, 'b-', alpha=0.4)
plt.ylabel('λ')
plt.xlabel('time (2-week periods)')
<matplotlib.text.Text at 0x1bf9372e8>
from mpl_toolkits.basemap import Basemap
import geopandas as gpd
lllat=-24
urlat=-23.3
lllon=-47
urlon=-46.3
SP_base = Basemap(ax=None, lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
llcrnrlat=lllat, urcrnrlat=urlat, llcrnrlon=lllon, urcrnrlon=urlon,
resolution='i',
epsg='4326')
SP_dist = gpd.GeoDataFrame.from_file('Sao Paulo/Brazil_full/BRA_adm3.shp').to_crs({'proj': 'longlat',
'ellps': 'WGS84',
'datum': 'WGS84'})
SP_dist['DIST_NAME'] = [trans.trans(_).upper() for _ in SP_dist.NAME_3]
λ_june = pd.Series(model_june.λ_t.stats()['mean'], index=sp_districts)
λ_june
BRAS 0.062417 BARRA FUNDA 0.324059 FREGUESIA DO O 2.107492 CAMBUCI 0.603248 PENHA 0.743912 BRAZILANDIA 2.696296 SANTA CECILIA 0.211299 CASA VERDE 1.952935 CAPAO REDONDO 9.130688 CONSOLACAO 0.544182 JAGUARE 2.080369 CIDADE ADEMAR 2.306163 CIDADE TIRADENTES 1.738719 SAPOPEMBA 0.653725 MOOCA 0.920267 CANGAIBA 3.490634 SAUDE 2.662397 SANTANA 2.445026 JARDIM ANGELA 7.469341 CAMPO LIMPO 8.008153 VILA MARIANA 4.988978 VILA CURUCA 1.266134 CIDADE DUTRA 6.302569 ARTUR ALVIM 2.081722 JARDIM HELENA 0.898627 SE 0.967562 LAPA 0.608799 JARDIM PAULISTA 2.341109 JABAQUARA 3.355646 PINHEIROS 3.130386 ... BUTANTA 1.113759 SAO RAFAEL 1.206678 MORUMBI 0.328959 AGUA RASA 0.955486 MOEMA 0.250341 VILA ANDRADE 0.938559 ANHANGUERA 0.039575 VILA MARIA 2.183739 IPIRANGA 1.990685 SOCORRO 0.601724 CACHOEIRINHA 2.390857 ARICANDUVA 0.921606 CAMPO GRANDE 2.948286 MANDAQUI 0.338616 ITAQUERA 2.483490 SAO MIGUEL 1.003908 JAGUARA 0.172167 PARQUE DO CARMO 0.839247 JACANA 0.603664 CIDADE LIDER 1.134638 CAMPO BELO 1.602751 VILA JACUI 1.204909 ITAIM BIBI 0.461258 VILA GUILHERME 1.354458 CURSINO 1.673891 MARSILAC 0.068277 GUAIANASES 1.118360 VILA MATILDE 1.210700 PONTE RASA 1.042228 PARI 1.261043 dtype: float64
SP_dist_merged = SP_dist.merge(pd.DataFrame(λ_june, columns=['λ']), left_on='DIST_NAME', right_index=True)
Map of outbreak intensity by district
from matplotlib.pyplot import cm
map_fig = plt.figure(figsize=(16,12))
map_ax = plt.gca()
SP_base.drawcoastlines()
SP_base.drawrivers()
SP_dist_merged.plot(column='λ', colormap=cm.Reds, axes=map_ax)
<matplotlib.axes._subplots.AxesSubplot at 0x1fc140ba8>
measles_onset_conf = measles_data[CONFIRMED].groupby(['DISTRICT','ONSET']).size().unstack(level=0).fillna(0).sum()
measles_onset_conf
DISTRICT AGUA RASA 37 ALTO PINHEIROS 19 ANHANGUERA 22 ARICANDUVA 32 ARTUR ALVIM 68 BARRA FUNDA 37 BELA VISTA 83 BELEM 16 BOM RETIRO 40 BRAS 33 BRAZILANDIA 186 BUTANTA 86 C REDONDO 1 CACHOEIRINHA 132 CAJAMAR 1 CAMBUCI 26 CAMPINAS 1 CAMPO BELO 19 CAMPO GRANDE 30 CAMPO LIMPO 279 CANGAIBA 65 CAPAO REDONDO 342 CAPELA DO SOCOR 1 CAPELA SOCORRO 1 CARRAO 34 CASA VERDE 50 CERQUEIRA CESAR 1 CIDADE ADEMAR 134 CIDADE DUTRA 104 CIDADE LIDER 53 ... TATUAPE 36 TREMEMBE 117 TUCURUVI 54 VILA ANDRADE 107 VILA CACHOEIRINHA 1 VILA CURUCA 97 VILA DALVILAA 1 VILA FORMOSA 32 VILA GUILHERME 24 VILA JACUI 48 VILA LEOPOLDINA 34 VILA MARIA 88 VILA MARIANA 78 VILA MATILDE 35 VILA MEDEIROS 45 VILA PRUDENTE 72 VILA SONIA 120 VILA. FORMOSA 1 VILA. PRUDENTE 2 VILA.CARRAO 1 VILA.JAGUARA 1 VILA.LEOPOLDINA 1 VILAILA ANDRADE 1 VILAILA CURUCA 1 VILAILA FORMOSA 1 VILAILA MARIA 1 VILAILA MATILDE 1 VILAILA MEDEIROS 1 VILAILA SONIA 3 VILAL FORMOSA 1 dtype: float64
_rates = measles_onset_conf/sp_pop.sum(1)
SP_dist_conf = SP_dist.merge(pd.DataFrame(_rates, columns=['rate']), left_on='DIST_NAME', right_index=True)
For comparison, map of empirical incidence by district
map_fig = plt.figure(figsize=(16,12))
map_ax = plt.gca()
SP_base.drawcoastlines()
SP_base.drawrivers()
SP_dist_conf.plot(column='rate', colormap=cm.Reds, axes=map_ax)
<matplotlib.axes._subplots.AxesSubplot at 0x1fbc30eb8>