%matplotlib inline
import pandas as pd
import numpy as np
import numpy.ma as ma
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('max_columns', 20)
pd.set_option('max_rows', 25)
from IPython.core.display import HTML
def css_styling():
    """Load the notebook's custom stylesheet and return it as an HTML display object."""
    # Use a context manager so the file handle is closed promptly
    # (the original relied on the garbage collector to close it).
    with open("styles/custom.css", "r") as f:
        styles = f.read()
    return HTML(styles)
css_styling()
data_dir = "data/"
Import outbreak data
# Load the measles line listing; parse each date column to datetime.
measles_data = pd.read_csv(data_dir + "measles.csv", index_col=0)
for date_col in ('NOTIFICATION', 'BIRTH', 'ONSET'):
    measles_data[date_col] = pd.to_datetime(measles_data[date_col])
# Harmonize the district spelling with the one used by the population table
measles_data = measles_data.replace({'DISTRICT': {'BRASILANDIA':'BRAZILANDIA'}})
Sao Paulo population by district
# Sao Paulo district-level population counts, indexed by district name.
sp_pop = pd.read_csv(data_dir+'sp_pop.csv', index_col=0)
# Use the same 'BRAZILANDIA' spelling as the case data
sp_pop = sp_pop.rename(index={'BRASILANDIA': 'BRAZILANDIA'})
sp_pop.head()
0 a 4 anos | 5 a 9 anos | 10 a 14 anos | 15 a 19 anos | 20 a 24 anos | 25 a 29 anos | 30 a 34 anos | 35 a 39 anos | 40 a 44 anos | 45 a 49 anos | 50 a 54 anos | 55 a 59 anos | 60 a 64 anos | 65 a 69 anos | 70 a 74 anos | 75 anos e + | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AGUA RASA | 5411 | 5750 | 6450 | 7122 | 7621 | 7340 | 6999 | 6984 | 6346 | 5608 | 4987 | 4212 | 4152 | 3595 | 2937 | 3637 | 89151 |
ALTO DE PINHEIROS | 2070 | 2369 | 2953 | 3661 | 4612 | 4190 | 3539 | 3633 | 3448 | 3289 | 3040 | 2533 | 2298 | 1732 | 1305 | 1823 | 46495 |
ANHANGUERA | 3068 | 3006 | 2755 | 2431 | 2426 | 2636 | 2695 | 2308 | 1653 | 1107 | 753 | 509 | 352 | 217 | 162 | 171 | 26249 |
ARICANDUVA | 7732 | 7730 | 8373 | 8956 | 9182 | 8531 | 7813 | 7365 | 6551 | 5554 | 4887 | 3858 | 3320 | 2449 | 1611 | 1723 | 95635 |
ARTUR ALVIM | 9031 | 9078 | 10000 | 11058 | 11387 | 10347 | 9125 | 8658 | 7830 | 7055 | 5919 | 4612 | 3756 | 2633 | 1727 | 1724 | 113940 |
Plot of cumulative cases by district
# Case counts by onset date and district, then plotted as cumulative curves.
measles_onset_dist = (measles_data
                      .groupby(['DISTRICT', 'ONSET'])
                      .size()
                      .unstack(level=0)
                      .fillna(0))
measles_onset_dist.cumsum().plot(legend=False, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x11114df98>
total_district_cases = measles_onset_dist.sum()
Top 5 districts by number of cases
# Top 5 districts by total case count.
# In-place Series.sort was deprecated and later removed from pandas;
# sort_values is the supported equivalent (returns the sorted Series).
totals = measles_onset_dist.sum()
totals = totals.sort_values(ascending=False)
totals[:5]
DISTRICT GRAJAU 1074 JARDIM ANGELA 944 CAPAO REDONDO 849 JARDIM SAO LUIZ 778 CAMPO LIMPO 692 dtype: float64
Age distribution of cases, by confirmation status
# Cross-tabulate case counts by single year of age and diagnostic conclusion,
# then show them as a stacked bar chart.
by_conclusion = measles_data.groupby(["YEAR_AGE", "CONCLUSION"])
conclusion_counts = by_conclusion.size()
counts_by_cause = conclusion_counts.unstack().fillna(0)
ax = counts_by_cause.plot(kind='bar', stacked=True, xlim=(0,50), figsize=(15,5))
As a baseline for comparison, we can fit a model to all the clinically-confirmed cases, regardless of lab confirmation status. For this, we will use a simple chain binomial model, which will be fit using MCMC.
This model fits the series of 2-week infection totals as a set of Binomial models:
$$Pr(I_{t+1} | S_t, p_t) = \text{Bin}(S_t, p_t) $$Where the binomial probability is modeled as:
$$p_t = 1 - \exp(-\lambda_t)$$$$\lambda_t = \frac{\beta_t I_t}{N}$$We will assume here that the transmission rate is constant over time (and across districts):
$$\beta_t = \beta_0$$which allows the effective reproductive number to be calculated as:
$$R_t = \frac{\beta S_t}{N}$$Rather than assume all clinical cases are true cases, we can adjust the model to account for lab confirmation probability. This is done by including a sub-model that estimates age group-specific probabilities of confirmation, and using these probabilities to estimate the number of lab-confirmed cases. These estimates are then plugged into the model in place of the clinically-confirmed cases.
We specified a structured confirmation model to retrospectively determine the age group-specific probabilities of lab confirmation for measles, conditional on clinical diagnosis. Individual lab confirmation events $c_i$ were modeled as Bernoulli random variables, with the probability of confirmation being allowed to vary by age group:
$$c_i \sim \text{Bernoulli}(p_{a(i)})$$where $a(i)$ denotes the appropriate age group for the individual indexed by i. There were 16 age groups, the first 15 of which were 5-year age intervals $[0,5), [5, 10), \ldots , [70, 75)$, with the 16th interval including all individuals 75 years and older.
Since the age interval choices were arbitrary, and the confirmation probabilities of adjacent groups likely correlated, we modeled the correlation structure directly, using a multivariate logit-normal model. Specifically, we allowed first-order autocorrelation among the age groups, whereby the variance-covariance matrix retained a tridiagonal structure.
$$\begin{aligned} \Sigma = \left[{ \begin{array}{c} {\sigma^2} & {\sigma^2 \rho} & 0& \ldots & {0} & {0} \\ {\sigma^2 \rho} & {\sigma^2} & \sigma^2 \rho & \ldots & {0} & {0} \\ {0} & \sigma^2 \rho & {\sigma^2} & \ldots & {0} & {0} \\ \vdots & \vdots & \vdots & & \vdots & \vdots\\ {0} & {0} & 0 & \ldots & {\sigma^2} & \sigma^2 \rho \\ {0} & {0} & 0 & \ldots & \sigma^2 \rho & {\sigma^2} \end{array} }\right] \end{aligned}$$From this, the confirmation probabilities were specified as multivariate normal on the inverse-logit scale.
$$\text{logit}(p_a) \sim N(\mu, \Sigma)$$Priors for the confirmation sub-model were specified by:
$$\begin{aligned} \mu_i &\sim N(0, 100) \\ \sigma &\sim \text{HalfCauchy}(25) \\ \rho &\sim U(-1, 1) \end{aligned}$$Age classes are defined in 5-year intervals.
# Five-year age intervals [0,5), ..., [70,75), plus a single open-ended 75+ bin.
age_classes = list(range(0, 80, 5)) + [100]
# Drop records with unknown age before binning
measles_data.dropna(subset=['AGE'], inplace=True)
measles_data['AGE_GROUP'] = pd.cut(measles_data.AGE, age_classes, right=False)
Lab-checked observations are extracted for use in estimating lab confirmation probability.
# Boolean indicators for each diagnostic conclusion
CONFIRMED = measles_data.CONCLUSION == 'CONFIRMED'
CLINICAL = measles_data.CONCLUSION == 'CLINICAL'
DISCARDED = measles_data.CONCLUSION == 'DISCARDED'
# Keep only lab-checked records (confirmed or discarded) with known age and county
lab_checked = (CONFIRMED | DISCARDED) & measles_data.YEAR_AGE.notnull() & measles_data.COUNTY.notnull()
lab_subset = measles_data[lab_checked].copy()
# Truncate ages at 75, the open-ended top age class
lab_subset.loc[lab_subset.YEAR_AGE > 75, 'YEAR_AGE'] = 75
integer_ages = lab_subset.YEAR_AGE.astype(int)
age = integer_ages.values
ages = integer_ages.unique()
counties = lab_subset.COUNTY.unique()
# Outcome indicator: True where the lab confirmed the case
y = (lab_subset.CONCLUSION == 'CONFIRMED').values
Proportion of lab-confirmed cases older than 20 years
(measles_data[CONFIRMED].AGE>20).mean()
0.6034981469764078
Extract cases by age and time.
# Bin each lab-checked individual's age into the 5-year age classes
age_group = pd.cut(age, age_classes, right=False)
# Categorical codes give each observation's age-class index directly,
# replacing the original O(n^2) per-element list.index() scan (and avoiding
# the deprecated 'levels' access that triggered the FutureWarning below).
age_index = np.asarray(age_group.codes, dtype=int)
/usr/local/lib/python3.4/site-packages/pandas/core/categorical.py:462: FutureWarning: Accessing 'levels' is deprecated, use 'categories' warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning)
# Get index from full crosstabulation to use as index for each district
dates_index = measles_data.groupby(
    ['ONSET', 'AGE_GROUP']).size().unstack().index
unique_districts = measles_data.DISTRICT.dropna().unique()
# .ix was deprecated and removed from pandas; reindex reproduces its
# label-based lookup (NaN for districts absent from sp_pop, then dropped).
N = sp_pop['Total'].reindex(unique_districts).dropna()
sp_districts = N.index.values
len(sp_districts)
93
all_district_data = []
all_confirmed_cases = []
for d in sp_districts:
    # All bi-weekly unconfirmed and confirmed cases
    district_data = measles_data[measles_data.DISTRICT==d]
    district_counts_2w = district_data.groupby(
        ['ONSET', 'AGE_GROUP']).size().unstack().reindex(dates_index).fillna(0).resample('2W', how='sum')
    all_district_data.append(district_counts_2w)
    # All confirmed cases, by district
    # BUG FIX: the original filtered CONCLUSION != 'CONFIRMED', which selected
    # everything EXCEPT confirmed cases, contradicting both the comment above
    # and the variable name.
    confirmed_data = district_data[district_data.CONCLUSION=='CONFIRMED']
    confirmed_counts = confirmed_data.groupby(
        ['ONSET', 'AGE_GROUP']).size().unstack().reindex(dates_index).fillna(0).sum()
    all_confirmed_cases.append(confirmed_counts.reindex_axis(measles_data['AGE_GROUP'].unique()).fillna(0))
Time series of cases by district, summarized in 2-week intervals
# Sum over ages for susceptibles
sp_cases_2w = [dist.sum(1) for dist in all_district_data]
# Ensure the age groups are ordered identically across districts
age_order = measles_data['AGE_GROUP'].unique()
I_obs = np.array([dist.reindex_axis(age_order, axis=1).fillna(0).values.astype(int)
                  for dist in all_district_data])
measles_data['AGE_GROUP'].unique()
array(['[0, 5)', '[5, 10)', '[10, 15)', '[15, 20)', '[20, 25)', '[25, 30)', '[30, 35)', '[35, 40)', '[40, 45)', '[45, 50)', '[50, 55)', '[55, 60)', '[60, 65)', '[65, 70)', '[70, 75)', '[75, 100)'], dtype=object)
Check shape of data frame
assert I_obs.shape == (93, 28, 16)
import geopandas as gpd
# Administrative level-3 (district) boundaries for all of Brazil
shp = gpd.GeoDataFrame.from_file("Sao Paulo/Brazil_full/BRA_adm3.shp")
district_names = N.index.unique()
import trans
# Transliterate accented names to uppercase ASCII to match the case data
shp['district_name'] = shp.NAME_3.apply(
    lambda x: trans.trans(x).upper())
# Keep only Sao Paulo municipality districts, keyed by normalized name
sp_shp = shp[shp.NAME_2=='São Paulo'].set_index('district_name')
centroids = sp_shp.geometry.centroid
# Pairwise distance matrix between district geometries
distance_matrix = pd.concat([sp_shp.geometry.distance(o) for o in sp_shp.geometry],
                            axis=1)
distance_matrix.columns = sp_shp.index
# Bounding box of the city, used later for the GP mesh
min_x, min_y = sp_shp.bounds.min()[:2]
max_x, max_y = sp_shp.bounds.max()[2:]
assert (distance_matrix.index == centroids.index).all()
# District centroid coordinates as an (n_districts, 2) array
centroid_xy = np.array([[c.x, c.y] for c in sp_shp.geometry.centroid])
Here is an arbitrary distance metric for an arbitrary district, as an example.
# Example exponential distance-decay weights for the first district
_beta = -100
np.exp(distance_matrix * _beta).values.round(2)[0]
array([ 1. , 0. , 0. , 0.01, 0. , 0.12, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.04, 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0.01, 0.01, 0. , 0. , 0. , 0.01, 0. , 0. , 0. , 0. , 0.18, 0. , 0. , 0. , 0. , 0.04, 1. , 0. , 0. , 0. , 0.1 , 0.01, 0. , 0. , 0.19, 0. , 0. , 0. , 0. , 0.04, 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0.11, 0. , 1. , 0.02, 0. , 1. , 0. , 1. , 0. , 0.01, 0. , 0.02, 0.03, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1 , 0. , 0. , 0. , 0. , 0.65, 0. , 0.01, 0. , 0. , 0. , 1. ])
We attempt to estimate $R_t$ for a truncated subset of the data, to simulate a decision-maker's information during (rather than after) an outbreak. This essentially involves turning part of the time series into missing data, and running the model.
This is an example of creating a mask for data not observed by the decision date.
np.array([np.resize(all_district_data[0].index > '1997-06-15', I_obs[0].T.shape).T]*len(all_district_data))
array([[[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]], [[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]], [[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]], ..., [[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]], [[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]], [[False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], [False, False, False, ..., False, False, False], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]]], dtype=bool)
from pymc import MCMC, Matplot
from pymc import (Uniform, DiscreteUniform, Beta, Lambda, Binomial, Normal, Poisson,
NegativeBinomial, observed, negative_binomial_like, Lognormal, Exponential, binomial_like,
stochastic, potential, invlogit, TruncatedNormal, Binomial, Gamma)
from pymc import (HalfCauchy, deterministic, MvNormalCov, Bernoulli, potential, Uninformative, Multinomial,
rmultinomial, rbinomial, AdaptiveMetropolis)
from pymc.gp import *
from pymc.gp.cov_funs import matern
def measles_model(obs_date, confirmation=True, spatially_weighted=True, all_traces=False):
    """Build the PyMC (2.x) node set for the chain-binomial measles model.

    Parameters
    ----------
    obs_date : str
        Decision date ('YYYY-MM-DD'); case counts after this date are
        treated as missing data.
    confirmation : bool
        If True, include the age-specific lab-confirmation sub-model and
        thin the clinical case counts by the estimated confirmation
        probabilities.
    spatially_weighted : bool
        If True, add a Gaussian-process spatial effect over district
        centroids to the transmission term.
    all_traces : bool
        If True, keep traces for the large latent arrays (I, S).

    Returns
    -------
    dict
        ``locals()`` -- every node defined here, suitable for ``pymc.MCMC``.

    Notes
    -----
    Relies on module-level data prepared earlier in the script: ``age_index``,
    ``y``, ``all_district_data``, ``I_obs``, ``N``, ``min_x``/``max_x``,
    ``min_y``/``max_y`` and ``centroid_xy``.
    """
    ### Confirmation sub-model
    if confirmation:
        # Specify priors on age-specific means
        age_classes = np.unique(age_index)
        mu = Normal("mu", mu=0, tau=0.0001, value=[0]*len(age_classes))
        sig = HalfCauchy('sig', 0, 25, value=1)
        var = sig**2
        cor = Uniform('cor', -1, 1, value=0)

        # Build variance-covariance matrix with first-order correlation among age classes
        @deterministic
        def Sigma(var=var, cor=cor):
            # Tridiagonal: variance on the diagonal, var*cor on both off-diagonals
            I = np.eye(len(age_classes))*var
            E = np.diag(np.ones(len(age_classes)-1), k=-1)*var*cor
            return I + E + E.T

        # Age-specific probabilities of confirmation as multivariate normal random variables
        beta_age = MvNormalCov("beta_age", mu=mu, C=Sigma, value=[1]*len(age_classes))
        p_age = Lambda('p_age', lambda t=beta_age: invlogit(t))

        @deterministic(trace=False)
        def p_confirm(beta=beta_age):
            # Per-individual confirmation probability via each case's age class
            return invlogit(beta[age_index])

        # Confirmation likelihood
        lab_confirmed = Bernoulli('lab_confirmed', p=p_confirm, value=y, observed=True)

    '''
    Missing data sub-model

    We treat observations later than the decision date as missing data. This is
    implemented as a `masked_array` in NumPy, which requires a boolean mask to identify missing values.
    '''
    missing_mask = all_district_data[0].index > obs_date
    district_mask = np.resize(missing_mask, I_obs[0].T.shape).T
    # Index to the observation period
    current_index = (~missing_mask).sum()-1
    I_obs_masked = ma.masked_array(I_obs,
                                   mask=np.array([district_mask]*len(all_district_data)),
                                   fill_value=1)
    # Index for observation date, used to index out values of interest from the model.
    t_obs = (~missing_mask).sum() - 1
    # Imputed infecteds
    I_imp = DiscreteUniform('I_imp', 0, 2000, value=I_obs_masked, observed=True)

    ### Chain binomial model for disease dynamics
    if confirmation:
        # Confirmed cases: thin the clinical counts by the age-specific
        # confirmation probabilities, one binomial term per age class
        @stochastic(trace=all_traces, dtype=int)
        def I(value=(I_imp.value*0.7).astype(int), n=I_imp, p=p_age):
            return np.sum([np.sum([binomial_like(vj[:,i], nj[:,i], p[i]) for i in range(len(p))]).sum(0)
                           for vj, nj in zip(value, n)])
    else:
        I = I_imp

    # Infecteds at time $t_{obs}$
    It = Lambda('It', lambda I=I: I.sum(0)[t_obs])

    # Calculate susceptibles from total number of infections
    @deterministic(trace=all_traces)
    def S(I=I):
        # Susceptibles = (eventual total infections) - (cumulative infections to date)
        return np.array([Ij.sum() - np.array([Ij[:i].sum() for i in range(len(Ij))]) for Ij in I])

    # Total infecteds until time t_obs, by age
    I_total = Lambda('I_total', lambda I=I: I[:,:t_obs].sum(0).sum(0))
    # Age distribution of infecteds
    age_dist = Lambda('age_dist', lambda I=I_total: I/I.sum())

    # Susceptibles at time t
    S_t = Lambda('S_t', lambda S=S: S[:, t_obs])

    # Susceptibles at time t, by age: allocated multinomially using the
    # age distribution of infecteds as a proxy
    @deterministic
    def S_age(S=S_t, p=age_dist):
        return np.array([rmultinomial(si, p) for si in S])

    @deterministic
    def vacc_5(S=S_age):
        # Vaccination of the under-5 class only, 95% coverage.
        # NOTE(review): the original comment said "15 and under", but the
        # probability vector targets only the first ([0, 5)) age class.
        p = [0.95] + [0]*15
        return rbinomial(S[current_index], p)

    # Proportion of susceptibles vaccinated
    pct_5 = Lambda('pct_5', lambda V=vacc_5, S=S_age: float(V.sum())/S[current_index].sum())

    @deterministic
    def vacc_15(S=S_age):
        # Vaccination of 15 and under (first three age classes)
        p = [0.95]*3 + [0]*13
        return rbinomial(S[current_index], p)

    # Proportion of susceptibles vaccinated
    pct_15 = Lambda('pct_15', lambda V=vacc_15, S=S_age: float(V.sum())/S[current_index].sum())

    @deterministic
    def vacc_30(S=S_age):
        # Vaccination of 30 and under (first six age classes)
        p = [0.95]*6 + [0]*10
        return rbinomial(S[current_index], p)

    # Proportion of 30 and under susceptibles vaccinated
    pct_30 = Lambda('pct_30', lambda V=vacc_30, S=S_age: float(V.sum())/S[current_index].sum())

    @deterministic
    def vacc_adult(S=S_age):
        # Vaccination of adults under 30 (and young kids): classes [0,5), [20,25), [25,30)
        p = [0.95, 0, 0, 0, 0.95, 0.95] + [0]*10
        return rbinomial(S[current_index], p)

    # Proportion of adults under 30 (and young kids)
    pct_adult = Lambda('pct_adult', lambda V=vacc_adult, S=S_age: float(V.sum())/S[current_index].sum())

    # Transmission parameter
    beta = Gamma('beta', 1, 0.1, value=10)

    '''
    Calculation of the effective reproduction number depends on whether we believe that the districts
    are independent or not. If not (`spatially_weighted = True`), both the number of susceptibles and the
    denominator population are calculated as a distance-weighted average of all the districts in Sao Paulo.
    '''
    if spatially_weighted:
        # Regular mesh spanning the city's bounding box, for the GP
        geo_mesh = np.transpose([np.linspace(min_x, max_x),
                                 np.linspace(min_y, max_y)])

        # Vague prior for the overall mean
        m = Uninformative('m', value=0)

        def constant(x, val):
            # Constant mean function for the GP
            return np.zeros(x.shape[:-1],dtype=float) + val

        @deterministic
        def M(m=m):
            return Mean(constant, val=m)

        # ==================
        # = The covariance =
        # ==================
        # Vague priors for the amplitude and scale
        amp = Exponential('amp', 1e-5, value=1)
        scale = Exponential('scale',1e-5, value=0.1)

        @deterministic
        def C(amp=amp, scale=scale):
            return FullRankCovariance(exponential.euclidean, amp = amp, scale = scale)

        # ===================
        # = The GP submodel =
        # ===================
        zeta = GPSubmodel('zeta', M, C, geo_mesh)
        # GP realization evaluated at the district centroids
        z_eval = Lambda('z_eval', lambda z=zeta.f(centroid_xy): z)

    if spatially_weighted:
        alpha = Exponential('alpha', 1, value=0.0001)
        Rt = Lambda('Rt', lambda B=beta, S=S, z=z_eval, a=alpha: ((B * (1 + a*z) * S.T) / N.values).T)
    else:
        Rt = Lambda('Rt', lambda B=beta, S=S: ((B * S).T / N.values).T)

    # Effective reproduction number at time of observation, and implied vaccination target
    Rt_obs = Lambda('Rt_obs', lambda r=Rt: r[:, t_obs])
    vaccination_target = Lambda('vaccination_target', lambda r=Rt_obs: np.maximum(1-1./r, 0))

    # Force of infection, assuming mass action transmission
    if spatially_weighted:
        lam = Lambda('lam', lambda B=beta, I=I, z=z_eval, a=alpha:
                     np.array([(B * (1 + zj) * Ij) / nj for Ij,nj,zj in zip(I, N.values, a*z)]),
                     trace=False)
    else:
        lam = Lambda('lam', lambda B=beta, I=I: np.array([(B * Ij) / nj for Ij,nj in zip(I, N.values)]),
                     trace=False)

    # 2-week infection probabilities (small epsilon keeps p strictly positive)
    p = Lambda('p', lambda lam=lam: 1. - np.exp(-lam) + 1e-6, trace=False)

    # Binomial likelihood for observed cases
    @potential
    def new_cases(p=p, I=I, S=S):
        return np.sum([[binomial_like(i, s, pt) for pt,i,s in zip(pj[:t_obs], Ij[:t_obs], Sj[:t_obs])]
                       for pj,Ij,Sj in zip(p, I, S)])

    return locals()
Run models for June 15 and July 15 observation points, both with and without clinical confirmation.
# MCMC settings shared by all four model runs
db = 'txt'              # persist traces as plain-text files on disk
n_iterations = 50000    # total MCMC iterations
n_burn = 40000          # burn-in iterations discarded from summaries
June 15, with lab confirmation
# June 15 decision date, with the lab-confirmation sub-model
model_june = MCMC(measles_model('1997-06-15', spatially_weighted=False),
                  db=db, dbname='model_june')
model_june.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50001 of 50000 complete in 6400.6 sec
July 15, with lab confirmation
# July 15 decision date, with the lab-confirmation sub-model
model_july = MCMC(measles_model('1997-07-15', spatially_weighted=False),
                  db=db, dbname='model_july')
model_july.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50000 of 50000 complete in 1974.3 sec
June 15, no lab confirmation
# June 15 decision date, treating all clinical cases as true cases
model_june_noconf = MCMC(measles_model('1997-06-15',
                                       spatially_weighted=False,
                                       confirmation=False),
                         db=db, dbname='model_june_noconf')
model_june_noconf.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50001 of 50000 complete in 884.4 sec
July 15, no lab confirmation
# July 15 decision date, treating all clinical cases as true cases
model_july_noconf = MCMC(measles_model('1997-07-15', spatially_weighted=False,
                                       confirmation=False),
                         db=db, dbname='model_july_noconf')
# Use the shared settings instead of repeating the literals 50000/40000,
# keeping this run consistent with the other three models.
model_july_noconf.sample(n_iterations, n_burn)
[-----------------100%-----------------] 50000 of 50000 complete in 998.2 sec
# Posterior median R(t) trajectory for every district (June model)
Rt_june = model_june.Rt.stats()
june_medians = Rt_june['quantiles'][50]
plt.plot(june_medians.T, 'b-', alpha=0.4)
plt.ylabel('R(t)')
plt.xlabel('time (2-week periods)')
<matplotlib.text.Text at 0x11307dc50>
Matplot.summary_plot(model_june.vacc_30)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Matplot.plot(model_june.pct_15)
Plotting pct_15
Matplot.plot(model_june.pct_30)
Plotting pct_30
# Long-format posterior samples of district-level Rt at the observation date,
# for the June, June-no-confirmation and July models.
june_Rt = pd.DataFrame(model_june.Rt_obs.trace()).unstack().reset_index()
june_Rt.columns = ('district', 'iteration', 'Rt')
june_Rt['month'] = 'June'
june_Rt_noconf = pd.DataFrame(model_june_noconf.Rt_obs.trace()).unstack().reset_index()
june_Rt_noconf.columns = ('district', 'iteration', 'Rt')
june_Rt_noconf['month'] = 'June'
july_Rt = pd.DataFrame(model_july.Rt_obs.trace()).unstack().reset_index()
july_Rt.columns = ('district', 'iteration', 'Rt')
july_Rt['month'] = 'July'
# DataFrame.append was deprecated and removed; pd.concat is the supported equivalent
confirmed_Rt = pd.concat([june_Rt, july_Rt], ignore_index=True)
# In-place Series.sort was removed; sort_values returns the sorted Series
june_means = june_Rt.groupby('district')['Rt'].mean().sort_values(ascending=False)
# District ordering (descending June mean Rt) reused by the plots below
sorted_districts = june_means.index.values
import seaborn as sb
sb.set_context("talk", font_scale=1.1)
# Stacked panels of per-district Rt boxplots: June (top) vs July (bottom),
# districts ordered by their June posterior means
f, (ax_1, ax_2) = plt.subplots(2, 1, figsize=(12,6), sharey=True, sharex=True)
sb.boxplot('district', 'Rt', data=june_Rt, ax=ax_1, linewidth=0.5,
           fliersize=0, color='r', order=sorted_districts)
# Reference line at the epidemic threshold Rt = 1
ax_1.hlines(1, xmin=0, xmax=93, linestyles='dashed', linewidth=0.2)
ax_1.set_xticks([])
ax_1.set_xlabel('')
ax_1.set_ylabel('June')
ax_1.set_ylim(0, 6)
ax_1.set_yticks(range(7))
ax_1.set_title(r'$R_t$ estimates, ordered by June means')
sb.boxplot('district', 'Rt', data=july_Rt, ax=ax_2, linewidth=0.5,
           fliersize=0, color='r', order=sorted_districts)
ax_2.hlines(1, xmin=0, xmax=93, linestyles='dashed', linewidth=0.2)
ax_2.set_xticks([])
ax_2.set_ylabel('July')
f.tight_layout()
sb.set_context("talk", font_scale=1.1)
# Compare June Rt estimates with (top) and without (bottom) the
# lab-confirmation sub-model, districts ordered by confirmed-model means
f, (ax_1, ax_2) = plt.subplots(2, 1, figsize=(12,6), sharey=True, sharex=True)
sb.boxplot('district', 'Rt', data=june_Rt, ax=ax_1, linewidth=0.5,
           fliersize=0, color='r', order=sorted_districts)
# Reference line at the epidemic threshold Rt = 1
ax_1.hlines(1, xmin=0, xmax=93, linestyles='dotted', linewidth=0.75)
ax_1.set_xticks([])
ax_1.set_xlabel('')
ax_1.set_ylabel('Lab')
ax_1.set_yticks(np.arange(13, step=2))
ax_1.set_title(r'June $R_t$ estimates, ordered by confirmed means')
sb.boxplot('district', 'Rt', data=june_Rt_noconf, ax=ax_2, linewidth=0.5,
           fliersize=0, color='r', order=sorted_districts)
ax_2.hlines(1, xmin=0, xmax=93, linestyles='dotted', linewidth=0.75)
ax_2.set_xticks([])
ax_2.set_ylabel('Clinical')
f.tight_layout()
model_june.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.102 0.056 0.001 [ 0. 0.2] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.0 0.062 0.097 0.129 0.226 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.369 0.087 0.002 [ 0.219 0.556] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.194 0.312 0.37 0.424 0.548 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.922 0.047 0.001 [ 0.839 1. ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.815 0.893 0.926 0.963 1.0 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.593 0.09 0.003 [ 0.407 0.75 ] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.407 0.531 0.593 0.656 0.758
# Gather the posterior coverage traces into one DataFrame for plotting
june_coverage = pd.DataFrame({name: model_june.trace(name)[:] for name in ['pct_5', 'pct_15', 'pct_30', 'pct_adult']})
axes = sb.boxplot(june_coverage, order=['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
axes.set_xticklabels(['Under 5', 'Under 15', 'Under 30', 'Under 5 + 20-30'])
plt.ylabel('% susceptibles vaccinated')
sb.despine(offset=10, trim=True);
/usr/local/lib/python3.4/site-packages/seaborn/categorical.py:1587: UserWarning: The boxplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info. warnings.warn(msg, UserWarning)
model_june_noconf.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.163 0.001 0.0 [ 0.162 0.165] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.161 0.163 0.163 0.164 0.165 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.464 0.001 0.0 [ 0.462 0.466] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.462 0.463 0.464 0.465 0.466 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.872 0.001 0.0 [ 0.871 0.874] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.871 0.872 0.872 0.873 0.874 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.502 0.001 0.0 [ 0.499 0.504] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.499 0.501 0.502 0.502 0.504
model_july.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.11 0.05 0.001 [ 0.026 0.206] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.026 0.079 0.108 0.146 0.212 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.292 0.073 0.002 [ 0.146 0.425] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.154 0.242 0.293 0.342 0.441 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.899 0.049 0.001 [ 0.805 0.976] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.794 0.868 0.897 0.939 0.974 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.67 0.076 0.002 [ 0.526 0.818] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.514 0.618 0.667 0.727 0.816
model_july_noconf.summary(['pct_5', 'pct_15', 'pct_30', 'pct_adult'])
pct_5: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.133 0.001 0.0 [ 0.131 0.135] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.131 0.132 0.133 0.133 0.135 pct_15: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.365 0.001 0.0 [ 0.362 0.367] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.362 0.364 0.365 0.366 0.367 pct_30: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.861 0.001 0.0 [ 0.859 0.863] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.859 0.86 0.861 0.862 0.863 pct_adult: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------ 0.568 0.001 0.0 [ 0.566 0.571] Posterior quantiles: 2.5 25 50 75 97.5 |---------------|===============|===============|---------------| 0.565 0.567 0.568 0.569 0.57
# Posterior median R(t) trajectory for every district (July model)
Rt_july = model_july.Rt.stats()
july_medians = Rt_july['quantiles'][50]
plt.plot(july_medians.T, 'b-', alpha=0.4)
plt.ylabel('R(t)')
plt.xlabel('time (2-week periods)')
<matplotlib.text.Text at 0x1172d5630>
# Posterior median R(t) trajectory per district (June, no-confirmation model)
Rt_june_noconf = model_june_noconf.Rt.stats()
noconf_medians = Rt_june_noconf['quantiles'][50]
plt.plot(noconf_medians.T, 'b-', alpha=0.4)
plt.ylabel('R(t)')
plt.xlabel('time (2-week periods)')
<matplotlib.text.Text at 0x117602c88>
Matplot.summary_plot(model_june.p_age)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# Posterior samples of the age distribution of infecteds (June model)
june_age_dist = model_june.age_dist.trace()[:]
june_age_dist.shape
(10000, 16)
june_age_dist = pd.DataFrame(june_age_dist, columns=measles_data['AGE_GROUP'].unique())
# Forest plot of the district-level vaccination targets (July model)
plt.figure(figsize=(8,12))
Matplot.summary_plot(model_july.vaccination_target)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# Repeated forest plot of the July vaccination targets (duplicate cell)
plt.figure(figsize=(8,12))
Matplot.summary_plot(model_july.vaccination_target)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Matplot.plot(model_june.alpha)
Plotting alpha
from mpl_toolkits.basemap import Basemap
import geopandas as gpd
# Lat/lon bounding box around the city of Sao Paulo
lllat=-24
urlat=-23.3
lllon=-47
urlon=-46.3
# Base map centered on the bounding box, plotted in WGS84 (EPSG 4326)
SP_base = Basemap(ax=None, lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                  llcrnrlat=lllat, urcrnrlat=urlat, llcrnrlon=lllon, urcrnrlon=urlon,
                  resolution='i',
                  epsg='4326')
# Re-project the district polygons to plain long/lat on the WGS84 ellipsoid
SP_dist = gpd.GeoDataFrame.from_file('Sao Paulo/Brazil_full/BRA_adm3.shp').to_crs({'proj': 'longlat',
                                                                                   'ellps': 'WGS84',
                                                                                   'datum': 'WGS84'})
# Normalized uppercase ASCII district names, for joining with the model output
SP_dist['DIST_NAME'] = [trans.trans(_).upper() for _ in SP_dist.NAME_3]
SP_dist.head()
ENGTYPE_3 | HASC_3 | ID_0 | ID_1 | ID_2 | ID_3 | ISO | NAME_0 | NAME_1 | NAME_2 | ... | NL_NAME_3 | REMARKS_3 | Shape_Area | Shape_Leng | TYPE_3 | VALIDFR_3 | VALIDTO_3 | VARNAME_3 | geometry | DIST_NAME | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | District | None | 32 | 434 | 4112 | 4868 | BRA | Brazil | Bahia | Piritiba | ... | None | None | 0.041872 | 1.033621 | Distrito | Unknown | Present | None | POLYGON ((-40.38711999999998 -11.6031739999999... | PIRITIBA |
1 | District | None | 32 | 434 | 4112 | 4869 | BRA | Brazil | Bahia | Piritiba | ... | None | None | 0.008318 | 0.375198 | Distrito | Unknown | Present | None | POLYGON ((-40.62181199999998 -11.7807669999999... | PORTO FELIZ |
2 | District | None | 32 | 434 | 4113 | 4870 | BRA | Brazil | Bahia | Planaltino | ... | None | None | 0.024796 | 0.751948 | Distrito | Unknown | Present | None | POLYGON ((-40.11320899999998 -13.3009799999998... | IBITIGUIRA |
3 | District | None | 32 | 434 | 4113 | 4871 | BRA | Brazil | Bahia | Planaltino | ... | None | None | 0.027577 | 0.776489 | Distrito | Unknown | Present | None | POLYGON ((-40.15503999999993 -13.1252719999998... | NOVA ITAIPE |
4 | District | None | 32 | 434 | 4113 | 4872 | BRA | Brazil | Bahia | Planaltino | ... | None | None | 0.025845 | 1.052880 | Distrito | Unknown | Present | None | POLYGON ((-40.38104599999997 -13.2141219999999... | PLANALTINO |
5 rows × 21 columns
SP_dist.NAME_3.unique()
array(['Piritiba', 'Porto Feliz', 'Ibitiguira', ..., 'Tupiratins', 'Wanderlândia', 'Xambioá'], dtype=object)
district_names
array(['BRAS', 'BARRA FUNDA', 'FREGUESIA DO O', 'CAMBUCI', 'PENHA', 'BRAZILANDIA', 'SANTA CECILIA', 'CASA VERDE', 'CAPAO REDONDO', 'CONSOLACAO', 'JAGUARE', 'CIDADE ADEMAR', 'CIDADE TIRADENTES', 'SAPOPEMBA', 'MOOCA', 'CANGAIBA', 'SAUDE', 'SANTANA', 'JARDIM ANGELA', 'CAMPO LIMPO', 'VILA MARIANA', 'VILA CURUCA', 'CIDADE DUTRA', 'ARTUR ALVIM', 'JARDIM HELENA', 'SE', 'LAPA', 'JARDIM PAULISTA', 'JABAQUARA', 'PINHEIROS', 'RIO PEQUENO', 'GRAJAU', 'VILA SONIA', 'IGUATEMI', 'SANTO AMARO', 'LIMAO', 'PERDIZES', 'SAO DOMINGOS', 'SAO LUCAS', 'PIRITUBA', 'RAPOSO TAVARES', 'VILA FORMOSA', 'BELEM', 'REPUBLICA', 'SAO MATEUS', 'TREMEMBE', 'VILA PRUDENTE', 'VILA LEOPOLDINA', 'BOM RETIRO', 'JOSE BONIFACIO', 'PARELHEIROS', 'PERUS', 'PEDREIRA', 'JARAGUA', 'LAJEADO', 'SACOMA', 'ITAIM PAULISTA', 'TUCURUVI', 'VILA MEDEIROS', 'LIBERDADE', 'TATUAPE', 'CARRAO', 'BELA VISTA', 'BUTANTA', 'SAO RAFAEL', 'MORUMBI', 'AGUA RASA', 'MOEMA', 'VILA ANDRADE', 'ANHANGUERA', 'VILA MARIA', 'IPIRANGA', 'SOCORRO', 'CACHOEIRINHA', 'ARICANDUVA', 'CAMPO GRANDE', 'MANDAQUI', 'ITAQUERA', 'SAO MIGUEL', 'JAGUARA', 'PARQUE DO CARMO', 'JACANA', 'CIDADE LIDER', 'CAMPO BELO', 'VILA JACUI', 'ITAIM BIBI', 'VILA GUILHERME', 'CURSINO', 'MARSILAC', 'GUAIANASES', 'VILA MATILDE', 'PONTE RASA', 'PARI'], dtype=object)
# Posterior mean Rt at the observation date, indexed by district name
Rt_june = pd.Series(model_june.Rt_obs.stats()['mean'], index=sp_districts)
Rt_june
BRAS 1.027835 BARRA FUNDA 1.777664 FREGUESIA DO O 0.160857 CAMBUCI 0.897236 PENHA 0.167268 BRAZILANDIA 0.181805 SANTA CECILIA 0.423634 CASA VERDE 0.305393 CAPAO REDONDO 0.267271 CONSOLACAO 0.383250 ... CIDADE LIDER 0.216627 CAMPO BELO 0.375009 VILA JACUI 0.148078 ITAIM BIBI 0.424089 VILA GUILHERME 0.283753 CURSINO 0.177329 MARSILAC 3.106626 GUAIANASES 0.316312 VILA MATILDE 0.207121 PONTE RASA 0.205254 PARI 1.745154 Length: 93, dtype: float64
# Join the posterior-mean Rt estimates onto the district shapefile by name.
# BUG FIX: the original cell immediately re-ran the merge against an undefined
# name `foo` (a NameError); that duplicate, broken statement is removed.
SP_dist_merged = SP_dist.merge(pd.DataFrame(Rt_june, columns=['Rt']), left_on='DIST_NAME', right_index=True)
# Total lab-confirmed case count per district
measles_onset_conf = measles_data[CONFIRMED].groupby(['DISTRICT','ONSET']).size().unstack(level=0).fillna(0).sum()
measles_onset_conf
DISTRICT AGUA RASA 37 ALTO PINHEIROS 19 AMERICANOPOLIS 1 ANHANGUERA 22 ARICANDUVA 32 ARTUR ALVIM 68 BARRA FUNDA 37 BELA VISTA 83 BELEM 16 BOM RETIRO 40 ... VILA.CARRAO 1 VILA.JAGUARA 1 VILA.LEOPOLDINA 1 VILAILA ANDRADE 1 VILAILA CURUCA 1 VILAILA FORMOSA 1 VILAILA MARIA 1 VILAILA MATILDE 1 VILAILA MEDEIROS 1 VILAILA SONIA 3 VILAL FORMOSA 1 Length: 153, dtype: float64
# Per-capita confirmed-case rates (confirmed cases / district population),
# joined onto the district shapefile by normalized name
_rates = measles_onset_conf/sp_pop.sum(1)
SP_dist_conf = SP_dist.merge(pd.DataFrame(_rates, columns=['rate']), left_on='DIST_NAME', right_index=True)
from matplotlib.pyplot import cm
# Choropleth of the June posterior-mean R(t) by district
map_fig = plt.figure(figsize=(16,12))
map_ax = plt.gca()
SP_base.drawcoastlines()
SP_base.drawrivers()
# BUG FIX: SP_dist_merged's estimate column is named 'Rt' (see the merge
# above); the original plotted column='cases', which does not exist there.
SP_dist_merged.plot(column='Rt', colormap=cm.Reds, axes=map_ax)
<matplotlib.axes._subplots.AxesSubplot at 0x4723f3be0>