%matplotlib inline
import pandas as pd
import numpy as np
import trans
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('max_columns', 20)
pd.set_option('max_rows', 25)
from __future__ import division
measles_data = pd.read_csv("measles.csv", index_col=0)
measles_data.NOTIFICATION = pd.to_datetime(measles_data.NOTIFICATION)
measles_data.BIRTH = pd.to_datetime(measles_data.BIRTH)
measles_data.ONSET = pd.to_datetime(measles_data.ONSET)
sp_pop = pd.read_csv('sp_pop.csv', index_col=0)
Cases by district
# Count cases per (district, onset date), pivot districts to columns;
# (district, date) pairs with no cases become 0.
measles_onset_dist = measles_data.groupby(['DISTRICT','ONSET']).size().unstack(level=0).fillna(0)
# Cumulative epidemic curves, one line per district.
measles_onset_dist.cumsum().plot(legend=False, grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1126697d0>
# Total cases per district.
totals = measles_onset_dist.sum()
# NOTE(review): Series.sort() sorts in place and was removed in modern pandas
# (use sort_values); kept as-is for the pandas version this notebook targets.
totals.sort(ascending=False)
totals[:5]
DISTRICT GRAJAU 1074 JARDIM ANGELA 944 CAPAO REDONDO 849 JARDIM SAO LUIZ 778 CAMPO LIMPO 692 dtype: float64
# Names of the five districts with the most cases.
top5 = pd.Series(totals[:5].index)
# Restrict the line list to those five districts.
measles_top5 = measles_data[measles_data.DISTRICT.isin(top5)]
District population sizes
# Census population totals for the top-5 districts.
# NOTE(review): the measles data spells 'JARDIM SAO LUIZ' while the census
# table uses 'JARDIM SAO LUIS'; replace() reconciles the two spellings.
# .ix is deprecated in modern pandas (use .loc); kept for the version used here.
N = sp_pop.ix[top5.replace({'JARDIM SAO LUIZ': 'JARDIM SAO LUIS'}), 'Total']
N
GRAJAU 278052 JARDIM ANGELA 221037 CAPAO REDONDO 224155 JARDIM SAO LUIS 227290 CAMPO LIMPO 180454 Name: Total, dtype: int64
Time series of cases by district
measles_groupby = measles_top5.groupby(['DISTRICT', 'ONSET']).size()
cases_by_ditrict = measles_groupby.unstack(0).fillna(0).resample('2W', how='sum').fillna(0)
cases_by_ditrict
DISTRICT | CAMPO LIMPO | CAPAO REDONDO | GRAJAU | JARDIM ANGELA | JARDIM SAO LUIZ |
---|---|---|---|---|---|
ONSET | |||||
1997-01-19 | 0 | 0 | 1 | 0 | 0 |
1997-02-02 | 0 | 0 | 0 | 0 | 0 |
1997-02-16 | 0 | 0 | 0 | 0 | 0 |
1997-03-02 | 0 | 4 | 0 | 0 | 1 |
1997-03-16 | 1 | 4 | 1 | 2 | 0 |
1997-03-30 | 2 | 4 | 1 | 1 | 2 |
1997-04-13 | 1 | 0 | 1 | 4 | 1 |
1997-04-27 | 0 | 8 | 3 | 3 | 2 |
1997-05-11 | 3 | 15 | 9 | 19 | 2 |
1997-05-25 | 21 | 26 | 13 | 15 | 8 |
1997-06-08 | 36 | 36 | 34 | 34 | 15 |
1997-06-22 | 45 | 77 | 43 | 78 | 63 |
... | ... | ... | ... | ... | ... |
1997-08-03 | 118 | 140 | 158 | 132 | 138 |
1997-08-17 | 88 | 114 | 111 | 129 | 140 |
1997-08-31 | 81 | 91 | 139 | 74 | 72 |
1997-09-14 | 54 | 61 | 120 | 76 | 50 |
1997-09-28 | 25 | 30 | 53 | 29 | 22 |
1997-10-12 | 9 | 13 | 43 | 7 | 14 |
1997-10-26 | 12 | 9 | 23 | 8 | 13 |
1997-11-09 | 6 | 13 | 22 | 8 | 4 |
1997-11-23 | 2 | 1 | 5 | 11 | 4 |
1997-12-07 | 3 | 10 | 3 | 2 | 2 |
1997-12-21 | 1 | 1 | 0 | 1 | 4 |
1998-01-04 | 2 | 1 | 3 | 0 | 1 |
26 rows × 5 columns
# Cases
I = cases_by_ditrict.values
I.shape
(26, 5)
S = (I.sum(0) - I.cumsum(0)).astype(int)
S.shape
(26, 5)
# Time steps
periods = S.shape[0]
periods
26
Get districts from shapefile
# Geometry tooling: shapely for polygon operations, fiona to read the shapefile.
from shapely.geometry import Point, mapping, shape, MultiPolygon
from fiona import collection
import fiona
# GADM level-3 administrative boundaries for Brazil.
shp = fiona.open("Sao Paulo/Brazil_full/BRA_adm3.shp")
district_names = measles_data.DISTRICT.unique()
# Might need this: .decode('latin1').encode('trans')
# Normalize shapefile district names to match the measles data: the 'trans'
# codec (registered by the `trans` package) transliterates accented
# characters to ASCII. Python 2 `unicode` — this cell is Python 2 only.
get_name = lambda x: unicode(x['properties']['NAME_3']).encode('trans').upper()
# Keep only polygons whose normalized name appears in the measles data.
district_polygons = {get_name(pol): shape(pol['geometry']) for pol in shp
                     if get_name(pol) in district_names}
-c:4: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
Number of matches between shapefile districts and districts from measles data
len(district_polygons)
109
Calculate distances between polygons
# Pairwise separation between every pair of district polygons, collected
# into a symmetric DataFrame whose rows and columns are district names.
distances = {}
for name_a in district_polygons:
    row = {}
    poly_a = district_polygons[name_a]
    for name_b in district_polygons:
        row[name_b] = poly_a.distance(district_polygons[name_b])
    distances[name_a] = row
distance_matrix = pd.DataFrame(distances)
Convert these to weights for populations, so that we can use them to estimate an "effective N"
# Exponential distance-decay kernel: weight 1 for a district with itself,
# decaying toward 0 with distance. NOTE(review): scale_factor=120 is in units
# of 1/distance, where distance is in the shapefile's coordinate units
# (presumably decimal degrees) — confirm against the shapefile CRS.
scale_factor = 120
distance_weights = np.exp(-scale_factor*distance_matrix)
For example, we can use this to calculate the effective N for cases in Grajau:
# Distance-weighted sum of all district populations around Grajau.
(sp_pop.Total*distance_weights.GRAJAU).dropna().sum()
681889.54880176287
Which compares to the actual population of Sao Paulo:
sp_pop.Total.sum()
20268440
from pymc import MCMC, Matplot
from pymc import (Uniform, DiscreteUniform, Beta, Lambda, Binomial, Normal, Poisson,
NegativeBinomial, observed, negative_binomial_like, Lognormal, Exponential, binomial_like,
stochastic, potential, invlogit, TruncatedNormal, Binomial, Gamma)
def chain_binomial(district='GRAJAU'):
    '''
    Chain-binomial model of measles transmission for a single district.

    Builds the PyMC2 data arrays and model nodes, then returns locals()
    so the caller can hand the whole namespace to MCMC(). Because node
    discovery is by local-variable name, these names are part of the
    model's interface and must not be renamed.

    :param district: district name used to subset measles_data and sp_pop
                     (both read from the enclosing notebook scope)
    :return: dict of all local variables (data arrays and PyMC nodes)
    '''
    # Extract cases for the requested district from the line list.
    district_measles = measles_data[measles_data.DISTRICT==district]
    # District population size
    N = sp_pop.ix[district]['Total']
    # Extract cases in 2-week intervals
    sp_cases_2w = district_measles.groupby('ONSET').size().fillna(0).resample('2W', how='sum')
    # Cases by 2-week interval (bins with no reported cases become 0)
    I = sp_cases_2w.fillna(0).values
    # Susceptibles by 2-week interval: total eventual cases minus the
    # cumulative count so far (only ever-infected individuals counted).
    S = (sp_cases_2w.fillna(0).values.sum() - I.cumsum()).astype(int)
    '''
    Model specification
    '''
    # Transmission parameter with a vague Gamma(1, 0.1) prior.
    beta = Gamma('beta', 1, 0.1, value=10)
    # Effective reproduction number
    Rt = Lambda('Rt', lambda B=beta: (B * S) / N)
    # Force of infection (driven by cases in the preceding interval)
    lam = Lambda('lam', lambda B=beta: (B * I[:-1]) / N)
    # 2-week infection probabilities (+1e-6 keeps p strictly positive)
    p = Lambda('p', lambda lam=lam: 1. - np.exp(-lam) + 1e-6)
    # Likelihood: cases in interval t+1 ~ Binomial(S[t], p[t])
    new_infections = Binomial('new_infections', S[:-1], p, value=I[1:], observed=True)
    # Unobserved replicate of the case series for posterior prediction.
    new_infections_pred = Binomial('new_infections_pred', S[:-1], p)
    return(locals())
# Fit the district model: 20,000 iterations, first 10,000 discarded as burn-in.
M = MCMC(chain_binomial())
M.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 3.0 sec
Estimates of effective reproductive number for clinical cases
# Posterior summaries (point estimate + interval) of Rt for each interval.
Matplot.summary_plot([M.Rt])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
# Same for the force of infection.
Matplot.summary_plot([M.lam])
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Create dummy variables corresponding to age groups in census data
# Five-year age bands matching the census data: [0,5), [5,10), ..., [70,75),
# plus an open-ended [75,100) top band.
# NOTE(review): the original list had 66 in place of 65, breaking the regular
# five-year spacing (..., 60, 66, 70, ...); corrected to 65.
age_classes = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,100]
# Left-closed/right-open bins, so age 5 falls in [5,10), not [0,5).
measles_data['AGE_GROUP'] = pd.cut(measles_data.AGE, age_classes, right=False)
Turn individual ages into age classes, and retrieve age class index
# Boolean masks over case conclusions.
CONFIRMED = measles_data.CONCLUSION == 'CONFIRMED'
CLINICAL = measles_data.CONCLUSION == 'CLINICAL'
DISCARDED = measles_data.CONCLUSION == 'DISCARDED'
# Lab subset: confirmed-or-discarded cases with known age and county.
lab_subset = measles_data[(CONFIRMED | DISCARDED) & measles_data.YEAR_AGE.notnull() & measles_data.COUNTY.notnull()].copy()
# Truncate ages above 75 into the open-ended top age class.
lab_subset.loc[lab_subset.YEAR_AGE > 75, 'YEAR_AGE'] = 75
age = lab_subset.YEAR_AGE.astype(int).values
ages = lab_subset.YEAR_AGE.astype(int).unique()
counties = lab_subset.COUNTY.unique()
# Outcome indicator: True where the case was lab-confirmed.
y = (lab_subset.CONCLUSION=='CONFIRMED').values
# Position of each case's county within the `counties` array.
county_index = [np.where(counties==c)[0][0] for c in lab_subset.COUNTY]
# Map each individual age onto its age class, then to a class index.
age_group = pd.cut(age, age_classes, right=False)
age_index = np.array([age_group.categories.tolist().index(i) for i in age_group])
# City-wide case counts per 2-week interval (all districts combined).
sp_cases_2w = measles_data.groupby('ONSET').size().fillna(0).resample('2W', how='sum')
sp_cases_2w
ONSET 1997-01-05 12 1997-01-19 85 1997-02-02 73 1997-02-16 74 1997-03-02 126 1997-03-16 234 1997-03-30 200 1997-04-13 206 1997-04-27 256 1997-05-11 373 ... 1997-08-31 8704 1997-09-14 8256 1997-09-28 5391 1997-10-12 3366 1997-10-26 2049 1997-11-09 1462 1997-11-23 939 1997-12-07 570 1997-12-21 465 1998-01-04 232 1998-01-18 14 Freq: 2W-SUN, Length: 28
# Example: Grajau cases cross-tabulated by 2-week interval and age group.
district_measles = measles_data[measles_data.DISTRICT=='GRAJAU']
sp_cases_2w_age = district_measles.groupby(['ONSET', 'AGE_GROUP']).size().unstack().fillna(0).resample('2W', how='sum')
# Reindex columns to the full set of age groups so absent groups appear as 0;
# reindex_axis is deprecated in modern pandas (use reindex(..., axis=1)).
sp_cases_2w_age.reindex_axis(measles_data['AGE_GROUP'].unique(), axis=1).fillna(0).shape
(26, 16)
sp_cases_2w_age
AGE_GROUP | [0, 5) | [10, 15) | [15, 20) | [20, 25) | [25, 30) | [30, 35) | [35, 40) | [40, 45) | [45, 50) | [5, 10) | [75, 100) |
---|---|---|---|---|---|---|---|---|---|---|---|
ONSET | |||||||||||
1997-01-19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1997-02-02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1997-02-16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1997-03-02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1997-03-16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1997-03-30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1997-04-13 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1997-04-27 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
1997-05-11 | 1 | 1 | 1 | 4 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1997-05-25 | 3 | 0 | 1 | 6 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
1997-06-08 | 8 | 5 | 2 | 2 | 6 | 2 | 0 | 1 | 0 | 8 | 0 |
1997-06-22 | 9 | 2 | 1 | 10 | 8 | 3 | 0 | 1 | 0 | 8 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1997-08-03 | 18 | 9 | 5 | 34 | 45 | 8 | 6 | 0 | 0 | 32 | 0 |
1997-08-17 | 16 | 8 | 8 | 26 | 22 | 4 | 2 | 0 | 2 | 22 | 0 |
1997-08-31 | 23 | 9 | 9 | 33 | 28 | 7 | 4 | 1 | 0 | 23 | 0 |
1997-09-14 | 22 | 6 | 10 | 29 | 16 | 12 | 1 | 1 | 0 | 21 | 1 |
1997-09-28 | 8 | 4 | 3 | 14 | 12 | 2 | 0 | 1 | 1 | 8 | 0 |
1997-10-12 | 9 | 7 | 4 | 8 | 4 | 0 | 1 | 1 | 0 | 9 | 0 |
1997-10-26 | 8 | 1 | 0 | 2 | 3 | 1 | 1 | 0 | 0 | 6 | 0 |
1997-11-09 | 10 | 0 | 0 | 3 | 0 | 0 | 0 | 1 | 0 | 8 | 0 |
1997-11-23 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 |
1997-12-07 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1997-12-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1998-01-04 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 |
26 rows × 11 columns
from pymc import HalfCauchy, deterministic, MvNormalCov, Bernoulli, potential
def SIR_confirmed(district='GRAJAU'):
    '''
    Chain-binomial SIR model with an age/county lab-confirmation submodel.

    Combines (1) a logistic model predicting lab confirmation from age
    class and county, fitted to the lab subset, and (2) a chain-binomial
    transmission model for the chosen district in which the true
    (confirmed) case counts are latent, thinned from observed counts by
    the age-specific confirmation probabilities. Returns locals() so
    MCMC() discovers nodes by local-variable name — do not rename locals.

    :param district: district used to subset measles_data and sp_pop
                     (read from the enclosing notebook scope)
    :return: dict of all local variables (data and PyMC nodes)
    '''
    # ---- Confirmation submodel ----
    age_classes = np.unique(age_index)
    # County random effect
    sig_space = HalfCauchy("sig_space", 0, 25, value=10)
    tau_space = sig_space**-2
    beta_county = Normal("beta_county", mu=0, tau=tau_space, value=np.zeros(len(counties)))
    # Means by age
    mu = Normal("mu", mu=0, tau=0.0001, value=[0]*len(age_classes))
    sig = HalfCauchy('sig', 0, 25, value=1)
    var = sig**2
    cov = Uniform('cov', -1000, 1000, value=0)
    @deterministic
    def Sigma(var=var, cov=cov):
        # Variance-covariance matrix with first-order correlation
        I = np.eye(len(age_classes))*var
        E = np.diag(np.ones(len(age_classes)-1), k=-1)*cov
        return I + E + E.T
    # Correlated age effects on the logit scale.
    beta_age = MvNormalCov("beta_age", mu=mu, C=Sigma, value=[1]*len(age_classes))
    # Age-specific confirmation probabilities.
    p_age = Lambda('p_age', lambda t=beta_age: invlogit(t))
    @deterministic
    def p_confirm(theta=beta_age, beta=beta_county):
        # Probability of confirmation as a function of age and county
        return invlogit(theta[age_index] + beta[county_index])
    # Likelihood of the observed lab-confirmation outcomes.
    lab_confirmed = Bernoulli('lab_confirmed', p=p_confirm, value=y, observed=True)
    '''
    Extract data for district
    '''
    district_measles = measles_data[measles_data.DISTRICT==district]
    # District population size
    N = sp_pop.ix[district]['Total']
    # Cases by 2-week interval and age
    sp_cases_2w_age = district_measles.groupby(['ONSET', 'AGE_GROUP']).size().unstack().fillna(0).resample('2W', how='sum')
    # Sum over ages for susceptibles
    sp_cases_2w = sp_cases_2w_age.sum(1)
    # Observed cases: (interval x age-group) integer matrix over all groups.
    I_obs = sp_cases_2w_age.reindex_axis(measles_data['AGE_GROUP'].unique(), axis=1).fillna(0).values.astype(int)
    # Latent confirmed cases: binomial thinning of observed counts by the
    # age-specific confirmation probability; initialized at 70% of observed.
    @stochastic
    def I(value=(I_obs*0.7).astype(int), n=I_obs, p=p_age):
        # NOTE(review): np.sum(...) already yields a scalar log-likelihood,
        # so the trailing .sum(0) looks redundant — confirm it is a no-op
        # under the numpy version this notebook ran against.
        return np.sum([binomial_like(value[:,i], n[:,i], p[i]) for i in range(len(p))]).sum(0)
    # Susceptibles by 2-week interval: remaining eventual cases at each step.
    @deterministic
    def S(I=I):
        I_total = I.sum()
        return I_total - np.array([I[:i].sum() for i in range(len(I))])
    '''
    Model specification
    '''
    # Transmission parameter with a vague Gamma(1, 0.1) prior.
    beta = Gamma('beta', 1, 0.1, value=10)
    # Effective reproduction number
    Rt = Lambda('Rt', lambda B=beta, S=S: (B * S) / N)
    # Force of infection
    lam = Lambda('lam', lambda B=beta, I=I: (B * I) / N)
    # 2-week infection probabilities (+1e-6 keeps p strictly positive)
    p = Lambda('p', lambda lam=lam: 1. - np.exp(-lam) + 1e-6)
    # Binomial likelihood for observed cases
    @potential
    def new_cases(p=p, I=I, S=S):
        return np.sum([binomial_like(i, s, pt) for pt,i,s in zip(p, I, S)])
    return(locals())
# Fit the combined model (slow: also samples the latent confirmed-case matrix).
M = MCMC(SIR_confirmed())
M.sample(20000, 10000)
[-----------------100%-----------------] 20000 of 20000 complete in 282.8 sec
Estimates of effective reproductive number for confirmed cases
# Posterior summaries of Rt for each 2-week interval.
Matplot.summary_plot(M.Rt)
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.
Estimates of age-specific confirmation probabilities
# Posterior summaries of p_age, labelled by age group.
Matplot.summary_plot(M.p_age, custom_labels=measles_data['AGE_GROUP'].unique())
Could not calculate Gelman-Rubin statistics. Requires multiple chains of equal length.