Import data
rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique
True
Rename columns and recode data
race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]
(rotavirus_data[race_cols]=='Checked').sum()
Race (choice=White) 2249 Race (choice=Black/African American) 1545 Race (choice=American Indian/Alaskan Native) 5 Race (choice=Asian) 86 Race (choice=Native Hawaiian/Other Pacific Islander) 5 Race (choice=Other) 18 Race (choice=Unknown) 3 Race (choice=None) 1 dtype: int64
rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)
rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)
rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)
rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovorus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)
rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns if col.startswith('Result')], axis=1)
rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])
pd.crosstab(rotavirus_data.black, rotavirus_data.hispanic)
hispanic | 0 | 1 |
---|---|---|
black | ||
0 | 719 | 1359 |
1 | 1357 | 17 |
Calculate age, and compare to given age.
rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
- rotavirus_data['Date of Birth'] + timedelta(1)).astype('timedelta64[M]')
# Sanity check: count rows where calculated age disagrees with the recorded
# age by more than one month.
# BUG FIX: the original wrote np.abs((a - b) > 1).sum(), taking the absolute
# value of the boolean comparison — which only counts differences > +1 and
# silently ignores calculated ages more than a month *below* the given age.
# The absolute difference must be taken before comparing to the tolerance.
(np.abs(rotavirus_data['Age in months (auto calculated) '] - rotavirus_data['age_months']) > 1).sum()
0
Group into ages under 5 and 5 or over
rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)
Derive the study year index (0 = 2012/13 enrollment season) from the recorded Year
rotavirus_data['Enrollment Date'].sort_values(ascending=False).tail()
1291 2012-12-02 1295 2012-12-02 1293 2012-12-02 1292 2012-12-02 1290 2012-12-02 Name: Enrollment Date, dtype: datetime64[ns]
rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()
0 1215 1 1178 2 1059 Name: year, dtype: int64
rotavirus_data.setting.value_counts()
0 2117 2 1190 1 145 Name: setting, dtype: int64
Recode sex and stool
rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'
# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth',
'Age in months (auto calculated) ', 'Sex',
'Sex updated', 'Was stool collected?',
'Ethnicity', 'Year'], axis=1)
rotavirus_data = pd.concat([rotavirus_data, pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)
Summary of counts by setting
rotavirus_data.groupby(['year', 'setting'])['Record ID'].count()
year setting 0 0 716 1 58 2 441 1 0 736 1 49 2 393 2 0 665 1 38 2 356 Name: Record ID, dtype: int64
rotavirus_data.shape
(3452, 18)
rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()
0 2117 2 1190 1 145 Name: setting, dtype: int64
rotavirus_data.shape
(3452, 18)
rotavirus_data.head()
Record ID | Enrollment Date | white | black | hispanic | rotavirus | sapovirus | astrovorus | norovirus | age_months | under_5 | year | male | stool_collected | ED | IP | OP | setting | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | EN1C-0001 | 2012-12-03 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 77.0 | 0 | 0 | True | True | 0.0 | 0.0 | 1.0 | 2 |
1 | EN1C-0002 | 2012-12-03 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 87.0 | 0 | 0 | True | False | 0.0 | 0.0 | 1.0 | 2 |
2 | EN1C-0003 | 2012-12-03 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 63.0 | 0 | 0 | False | False | 0.0 | 0.0 | 1.0 | 2 |
3 | EN1C-0004 | 2012-12-04 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 36.0 | 1 | 0 | False | True | 0.0 | 0.0 | 1.0 | 2 |
4 | EN1C-0006 | 2012-12-04 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 64.0 | 0 | 0 | False | False | 0.0 | 0.0 | 1.0 | 2 |
rotavirus_data.isnull().sum()
Record ID 0 Enrollment Date 0 white 0 black 0 hispanic 0 rotavirus 0 sapovirus 0 astrovorus 0 norovirus 0 age_months 0 under_5 0 year 0 male 0 stool_collected 0 ED 0 IP 0 OP 0 setting 0 dtype: int64
rotavirus_data.describe()
white | black | hispanic | rotavirus | sapovirus | astrovorus | norovirus | age_months | under_5 | year | ED | IP | OP | setting | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 |
mean | 0.601970 | 0.398030 | 0.398610 | 0.108633 | 0.081112 | 0.037080 | 0.152665 | 44.912804 | 0.715527 | 0.954809 | 0.613268 | 0.042005 | 0.344728 | 0.731460 |
std | 0.489563 | 0.489563 | 0.489683 | 0.311223 | 0.273047 | 0.188985 | 0.359717 | 45.485356 | 0.451228 | 0.810492 | 0.487072 | 0.200629 | 0.475348 | 0.941349 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 11.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 27.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 67.000000 | 1.000000 | 2.000000 | 1.000000 | 0.000000 | 1.000000 | 2.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 215.000000 | 1.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 |
Aggregate data by age group and setting
rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
.groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
.reset_index()
.rename(columns={'Record ID':'cases'}))
rotavirus_summary.head()
under_5 | year | setting | black | hispanic | cases | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 57 |
1 | 0 | 0 | 0 | 0 | 1 | 62 |
2 | 0 | 0 | 0 | 1 | 0 | 99 |
3 | 0 | 0 | 0 | 1 | 1 | 1 |
4 | 0 | 0 | 1 | 0 | 0 | 4 |
The AGEGRP column encodes 5-year age bins; per the code below, group 1 covers children under 5 and groups 1–4 together cover children under 20.
davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()
SUMLEV | STATE | COUNTY | STNAME | CTYNAME | YEAR2008 | YEAR | AGEGRP | TOT_POP | TOT_MALE | ... | HWAC_MALE | HWAC_FEMALE | HBAC_MALE | HBAC_FEMALE | HIAC_MALE | HIAC_FEMALE | HAAC_MALE | HAAC_FEMALE | HNAC_MALE | HNAC_FEMALE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 0 | 626681 | 303540 | ... | 30400 | 24829 | 2178 | 2006 | 1501 | 1183 | 381 | 374 | 271 | 211 |
1 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 1 | 44691 | 22868 | ... | 3838 | 3664 | 302 | 303 | 185 | 189 | 63 | 53 | 26 | 35 |
2 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 2 | 37613 | 18977 | ... | 2910 | 2826 | 266 | 205 | 145 | 140 | 45 | 50 | 22 | 23 |
3 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 3 | 33904 | 17479 | ... | 2154 | 1890 | 166 | 170 | 105 | 91 | 25 | 29 | 18 | 16 |
4 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 4 | 38999 | 19674 | ... | 2029 | 1783 | 166 | 174 | 111 | 82 | 29 | 29 | 22 | 16 |
5 rows × 81 columns
Identify racial/ethnic subgroup populations
census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)
census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
.groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK',
'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
.sum()).reset_index()
census_data['YEAR'] = census_data.YEAR - 2011
census_data
under_5 | YEAR | HISPANIC_WHITE | HISPANIC_BLACK | NONHISPANIC_WHITE | NONHISPANIC_BLACK | |
---|---|---|---|---|---|---|
0 | False | 0 | 13889 | 891 | 47077 | 40334 |
1 | False | 1 | 14692 | 914 | 47527 | 40656 |
2 | False | 2 | 15653 | 983 | 47487 | 40556 |
3 | True | 0 | 7134 | 533 | 19290 | 13591 |
4 | True | 1 | 6994 | 566 | 19942 | 13511 |
5 | True | 2 | 6909 | 552 | 20479 | 13471 |
Reshape data
melted_census = (pd.melt(census_data, id_vars=['under_5', 'YEAR'],
value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK',
'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']))
census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
black=melted_census.variable.apply(lambda x: x.split('_')[1]))
.drop('variable', axis=1)
.replace({'black':{'BLACK':1, 'WHITE':0},
'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
.rename(columns={'YEAR':'year', 'value':'n'}))
census_clean
under_5 | year | n | black | hispanic | |
---|---|---|---|---|---|
0 | False | 0 | 13889 | 0 | 1 |
1 | False | 1 | 14692 | 0 | 1 |
2 | False | 2 | 15653 | 0 | 1 |
3 | True | 0 | 7134 | 0 | 1 |
4 | True | 1 | 6994 | 0 | 1 |
5 | True | 2 | 6909 | 0 | 1 |
6 | False | 0 | 891 | 1 | 1 |
7 | False | 1 | 914 | 1 | 1 |
8 | False | 2 | 983 | 1 | 1 |
9 | True | 0 | 533 | 1 | 1 |
10 | True | 1 | 566 | 1 | 1 |
11 | True | 2 | 552 | 1 | 1 |
12 | False | 0 | 47077 | 0 | 0 |
13 | False | 1 | 47527 | 0 | 0 |
14 | False | 2 | 47487 | 0 | 0 |
15 | True | 0 | 19290 | 0 | 0 |
16 | True | 1 | 19942 | 0 | 0 |
17 | True | 2 | 20479 | 0 | 0 |
18 | False | 0 | 40334 | 1 | 0 |
19 | False | 1 | 40656 | 1 | 0 |
20 | False | 2 | 40556 | 1 | 0 |
21 | True | 0 | 13591 | 1 | 0 |
22 | True | 1 | 13511 | 1 | 0 |
23 | True | 2 | 13471 | 1 | 0 |
census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])
merged_data = (rotavirus_summary.merge(census_full, on=['under_5', 'year', 'black', 'hispanic','setting'], how='right')
.fillna(0).astype(int))
merged_data.head()
under_5 | year | setting | black | hispanic | cases | n | |
---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 57 | 47077 |
1 | 0 | 0 | 0 | 0 | 1 | 62 | 13889 |
2 | 0 | 0 | 0 | 1 | 0 | 99 | 40334 |
3 | 0 | 0 | 0 | 1 | 1 | 1 | 891 |
4 | 0 | 0 | 1 | 0 | 0 | 4 | 47077 |
VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health
inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92
Y2012, Y2013, Y2014 = 0, 1, 2
Will estimate 2014 market shares by extrapolating a linear fit to the previous years.
from sklearn import linear_model
# Fit a linear trend to the three observed inpatient market shares.
inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
# BUG FIX: Series.reshape was deprecated in pandas 0.19 and removed in 0.24;
# reshape the underlying ndarray instead.
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1), y=inpatient_df.y)
# Shift the window forward one year and extrapolate 2014/15 with the fitted slope.
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver. warnings.warn(mesg, RuntimeWarning)
[0.9, 0.83, 0.77500000000000002]
# Fit a linear trend to the three observed ED market shares.
ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
# BUG FIX: Series.reshape was deprecated in pandas 0.19 and removed in 0.24;
# reshape the underlying ndarray instead.
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y)
# Shift the window forward one year and extrapolate 2014/15 with the fitted slope.
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share
[0.94, 0.92, 0.90500000000000014]
Enrollment days/week for ED, inpatient, outpatient
# Enrollment days per week for each setting (ED, inpatient, outpatient).
monitoring_days = 4, 7, 4
# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2
# Fraction of each week under surveillance, per setting.
# np.array(monitoring_days)/7 already yields an ndarray, so the original's
# outer np.array(...) wrapper was redundant and has been dropped.
monitoring_rate = np.array(monitoring_days) / 7
monitoring_rate
array([ 0.57142857, 1. , 0.57142857])
rotavirus_data[(rotavirus_data.hispanic==1) & (rotavirus_data.white==1)
& (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
(rotavirus_data.under_5==1)].shape
(157, 18)
setting_data = dict(list(merged_data.groupby('setting')))
viruses = ['rotavirus','sapovirus','astrovorus','norovirus']
rotavirus_data.groupby(['under_5', 'year', 'setting']).stool_collected.sum().astype(int)
under_5 year setting 0 0 0 142 1 8 2 110 1 0 150 1 4 2 82 2 0 125 1 1 2 63 1 0 0 406 1 44 2 249 1 0 439 1 42 2 220 2 0 365 1 35 2 209 Name: stool_collected, dtype: int64
rotavirus_data[rotavirus_data.stool_collected].groupby(['under_5', 'year', 'setting'])[viruses].mean().round(2)
rotavirus | sapovirus | astrovorus | norovirus | |||
---|---|---|---|---|---|---|
under_5 | year | setting | ||||
0 | 0 | 0 | 0.24 | 0.11 | 0.04 | 0.09 |
1 | 0.12 | 0.00 | 0.00 | 0.12 | ||
2 | 0.12 | 0.09 | 0.05 | 0.12 | ||
1 | 0 | 0.14 | 0.07 | 0.03 | 0.18 | |
1 | 0.25 | 0.00 | 0.00 | 0.25 | ||
2 | 0.07 | 0.09 | 0.02 | 0.15 | ||
2 | 0 | 0.19 | 0.05 | 0.02 | 0.11 | |
1 | 1.00 | 0.00 | 0.00 | 0.00 | ||
2 | 0.11 | 0.03 | 0.02 | 0.22 | ||
1 | 0 | 0 | 0.16 | 0.13 | 0.03 | 0.16 |
1 | 0.23 | 0.02 | 0.02 | 0.14 | ||
2 | 0.15 | 0.12 | 0.08 | 0.18 | ||
1 | 0 | 0.12 | 0.12 | 0.06 | 0.23 | |
1 | 0.19 | 0.17 | 0.17 | 0.31 | ||
2 | 0.08 | 0.16 | 0.08 | 0.25 | ||
2 | 0 | 0.15 | 0.09 | 0.04 | 0.22 | |
1 | 0.17 | 0.06 | 0.00 | 0.23 | ||
2 | 0.10 | 0.08 | 0.03 | 0.29 |
from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary
import theano.tensor as tt
def generate_inpatient_model(inpatient_data, pool_years=False):
    """Build a PyMC3 model of inpatient rotavirus rates.

    Parameters
    ----------
    inpatient_data : pandas.DataFrame
        One row per stratum with 'under_5', 'cases', 'n' columns and a
        'year' column used to index the market-share adjustment.
    pool_years : bool
        When True, estimate one rate per stratum pooled across the three
        study years; rows must come in year-triples (len divisible by 3).

    Returns
    -------
    pymc3.Model (unsampled).
    """
    with Model() as model:
        # Observed cases and population denominators per stratum.
        # NOTE(review): under_5 is unpacked but never used below.
        under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T
        # Detection probability: enrollment-day coverage times the VUMC
        # inpatient market share for each row's study year.
        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]
        # Latent true case counts, bounded below by observed cases and above
        # by the population size; initialized at the naive cases/weights.
        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=inpatient_data.shape[0],
                                         testval=(cases/weights).astype(int))
        # Thinning model: observed cases are a binomial draw from the
        # latent counts with detection probability `weights`.
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
        if pool_years:
            # Rows must form year-triples per stratum for pooling.
            assert not len(cases) % 3
            pooled_shape = int(len(cases)/3)
            # Sum the latent counts across the three years of each stratum.
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            # Log-linear rate per pooled stratum with a vague normal prior.
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)
            λ = Deterministic('λ', tt.exp(θ))
            # NOTE(review): only the FIRST year's denominator n is used for
            # each pooled stratum ([:, 0]) — confirm this is intended rather
            # than the summed or mean population.
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
        else:
            # One rate per row; same vague log-linear prior.
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])
            λ = Deterministic('λ', tt.exp(θ))
            # Poisson likelihood on the latent (detection-corrected) counts.
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
    return model
inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))
with inpatient_model_full:
start = find_MAP()
step = NUTS(scaling=start)
inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 564.3 sec
plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x11bd5e6d8>
label_map = {'under_5':{0:'5+', 1:'<5'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
'black':{0:'white', 1:'black'},
'hispanic':{0:'non-hispanic', 1:'hispanic'},
'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}
inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))
Rate factor for population
rate_factor = 10000
rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)
inpatient_data_labels
age | year | race | ethnicity | |
---|---|---|---|---|
0 | 5+ | 2012/13 | white | non-hispanic |
1 | 5+ | 2012/13 | white | hispanic |
2 | 5+ | 2012/13 | black | non-hispanic |
3 | 5+ | 2013/14 | white | non-hispanic |
4 | 5+ | 2013/14 | white | hispanic |
5 | 5+ | 2014/15 | white | hispanic |
6 | 5+ | 2014/15 | black | non-hispanic |
7 | <5 | 2012/13 | white | non-hispanic |
8 | <5 | 2012/13 | white | hispanic |
9 | <5 | 2012/13 | black | non-hispanic |
10 | <5 | 2013/14 | white | non-hispanic |
11 | <5 | 2013/14 | white | hispanic |
12 | <5 | 2013/14 | black | non-hispanic |
13 | <5 | 2014/15 | white | non-hispanic |
14 | <5 | 2014/15 | white | hispanic |
15 | <5 | 2014/15 | black | non-hispanic |
16 | 5+ | 2012/13 | black | hispanic |
17 | 5+ | 2013/14 | black | hispanic |
18 | 5+ | 2014/15 | black | hispanic |
19 | <5 | 2012/13 | black | hispanic |
20 | <5 | 2013/14 | black | hispanic |
21 | <5 | 2014/15 | black | hispanic |
22 | 5+ | 2014/15 | white | non-hispanic |
23 | 5+ | 2013/14 | black | non-hispanic |
from pymc3 import hpd
def _hpd_df(x, alpha):
    """Return the 100*(1-alpha)% HPD interval of ``x`` as a DataFrame.

    Column names encode the bounds, e.g. alpha=0.5 yields
    ['hpd_25', 'hpd_75'].
    """
    lower_pct = 100 * alpha / 2
    upper_pct = 100 * (1 - alpha / 2)
    bound_labels = ['hpd_{0:g}'.format(lower_pct),
                    'hpd_{0:g}'.format(upper_pct)]
    return pd.DataFrame(hpd(x, alpha), columns=bound_labels)
estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
pd.concat([inpatient_data_labels, estimates], axis=1)
age | year | race | ethnicity | mean | sd | mc_error | hpd_2.5 | hpd_97.5 | median | hpd_25 | hpd_75 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5+ | 2012/13 | white | non-hispanic | 0.845 | 0.430 | 0.009 | 0.205 | 1.772 | 0.773 | 0.396 | 0.895 |
1 | 5+ | 2012/13 | white | hispanic | 1.429 | 0.984 | 0.020 | 0.094 | 3.421 | 1.197 | 0.271 | 1.329 |
2 | 5+ | 2012/13 | black | non-hispanic | 1.004 | 0.500 | 0.012 | 0.172 | 1.917 | 0.915 | 0.422 | 1.034 |
3 | 5+ | 2013/14 | white | non-hispanic | 0.418 | 0.299 | 0.007 | 0.017 | 1.024 | 0.352 | 0.063 | 0.376 |
4 | 5+ | 2013/14 | white | hispanic | 2.060 | 1.180 | 0.028 | 0.259 | 4.427 | 1.834 | 0.937 | 2.254 |
5 | 5+ | 2014/15 | white | hispanic | 0.605 | 0.641 | 0.014 | 0.000 | 1.775 | 0.405 | 0.000 | 0.405 |
6 | 5+ | 2014/15 | black | non-hispanic | 0.253 | 0.247 | 0.006 | 0.000 | 0.738 | 0.186 | 0.000 | 0.186 |
7 | <5 | 2012/13 | white | non-hispanic | 11.429 | 2.432 | 0.047 | 7.174 | 16.208 | 11.257 | 9.176 | 12.480 |
8 | <5 | 2012/13 | white | hispanic | 22.628 | 5.570 | 0.139 | 12.106 | 33.637 | 22.098 | 16.911 | 24.050 |
9 | <5 | 2012/13 | black | non-hispanic | 10.254 | 2.636 | 0.051 | 5.234 | 15.231 | 10.017 | 7.652 | 11.122 |
10 | <5 | 2013/14 | white | non-hispanic | 14.066 | 2.614 | 0.051 | 9.129 | 19.126 | 13.886 | 12.318 | 15.786 |
11 | <5 | 2013/14 | white | hispanic | 17.146 | 4.843 | 0.098 | 8.274 | 26.776 | 16.796 | 12.789 | 19.115 |
12 | <5 | 2013/14 | black | non-hispanic | 8.824 | 2.480 | 0.050 | 4.343 | 13.940 | 8.641 | 7.006 | 10.218 |
13 | <5 | 2014/15 | white | non-hispanic | 6.849 | 1.850 | 0.036 | 3.491 | 10.579 | 6.633 | 5.111 | 7.435 |
14 | <5 | 2014/15 | white | hispanic | 30.361 | 6.633 | 0.141 | 18.231 | 43.795 | 29.874 | 25.511 | 34.154 |
15 | <5 | 2014/15 | black | non-hispanic | 7.424 | 2.293 | 0.044 | 3.092 | 11.894 | 7.274 | 5.583 | 8.557 |
16 | 5+ | 2012/13 | black | hispanic | 0.005 | 0.151 | 0.003 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
17 | 5+ | 2013/14 | black | hispanic | 0.006 | 0.135 | 0.004 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
18 | 5+ | 2014/15 | black | hispanic | 0.004 | 0.165 | 0.004 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
19 | <5 | 2012/13 | black | hispanic | 0.029 | 0.910 | 0.020 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
20 | <5 | 2013/14 | black | hispanic | 0.013 | 0.388 | 0.009 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
21 | <5 | 2014/15 | black | hispanic | 0.005 | 0.107 | 0.003 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
22 | 5+ | 2014/15 | white | non-hispanic | 0.000 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
23 | 5+ | 2013/14 | black | non-hispanic | 0.000 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
trace, labels = inpatient_full_trace, inpatient_data_labels
index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']
# rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
# names=labels.columns)
# rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
# rate_df.index = rate_df_index
# pivot_all = pd.pivot_table(rate_df
# .reset_index(),
# index=index, columns=columns)
# value_iterator = zip(pivot_all['mean'].values.flatten(),
# pivot_all['hpd_2.5'].values.flatten(),
# pivot_all['hpd_97.5'].values.flatten())
# rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
# pivot_all['mean'].shape)
# pd.DataFrame(rate_strings, index=pivot_mean.index, columns=pivot_mean.columns)
def generate_table(trace, labels, index, columns, varnames=['λ']):
    """Tabulate posterior rates as 'mean (hpd_2.5, hpd_97.5)' strings.

    Parameters
    ----------
    trace : MultiTrace; the first 1000 samples are discarded as burn-in.
    labels : DataFrame of stratum labels whose row order must match the
        flattened posterior of `varnames`.
    index : column name(s) for the pivot-table rows.
    columns : column name(s) for the pivot-table columns.
    varnames : variables to summarize. The mutable list default is safe
        here because it is never mutated.

    Returns a DataFrame of formatted rate strings, scaled by the
    module-level `rate_factor`.
    """
    # Stratum labels become a MultiIndex keyed the same way as the summary.
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
                                              names=labels.columns)
    # Posterior summary scaled to rates per `rate_factor` population.
    rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    # Pivot all three statistics at once; stats become the outer column level.
    pivot_all = pd.pivot_table(rate_df
                               .reset_index(),
                               index=index, columns=columns)
    # Walk the three statistics in lockstep, cell by cell.
    value_iterator = zip(pivot_all['mean'].values.flatten(),
                         pivot_all['hpd_2.5'].values.flatten(),
                         pivot_all['hpd_97.5'].values.flatten())
    # Render each cell as "mean (lower, upper)" and restore the pivot shape.
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
                              pivot_all['mean'].shape)
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)
generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], columns=['race', 'ethnicity'])
race | black | white | |||
---|---|---|---|---|---|
ethnicity | hispanic | non-hispanic | hispanic | non-hispanic | |
age | year | ||||
5+ | 2012/13 | 0.01 (0.00, 0.00) | 1.00 (0.17, 1.92) | 1.43 (0.09, 3.42) | 0.85 (0.20, 1.77) |
2013/14 | 0.01 (0.00, 0.00) | 0.00 (0.00, 0.00) | 2.06 (0.26, 4.43) | 0.42 (0.02, 1.02) | |
2014/15 | 0.00 (0.00, 0.00) | 0.25 (0.00, 0.74) | 0.60 (0.00, 1.77) | 0.00 (0.00, 0.00) | |
<5 | 2012/13 | 0.03 (0.00, 0.00) | 10.25 (5.23, 15.23) | 22.63 (12.11, 33.64) | 11.43 (7.17, 16.21) |
2013/14 | 0.01 (0.00, 0.00) | 8.82 (4.34, 13.94) | 17.15 (8.27, 26.78) | 14.07 (9.13, 19.13) | |
2014/15 | 0.01 (0.00, 0.00) | 7.42 (3.09, 11.89) | 30.36 (18.23, 43.80) | 6.85 (3.49, 10.58) |
inpatient_rate_samples = pd.concat(
[inpatient_data_labels,
pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)
inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race', 'ethnicity']))
inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)
Plot of rates broken down by year/age/race/ethnicity.
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race',
col='ethnicity',
data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x114c90128>
inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
.groupby(['under_5', 'year'])[['cases','n']]
.sum()
.reset_index())
inpatient_pool_race_data
under_5 | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 10 | 102191 |
1 | 0 | 1 | 5 | 103789 |
2 | 0 | 2 | 2 | 104679 |
3 | 1 | 0 | 48 | 40548 |
4 | 1 | 1 | 44 | 41013 |
5 | 1 | 2 | 36 | 41411 |
inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)
with inpatient_model_pool_race:
start = find_MAP()
step = NUTS(scaling=start)
inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 3.1 sec
inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'under_5':'age'}))
generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels, index=['year'], columns=['age'])
age | 5+ | <5 |
---|---|---|
year | ||
2012/13 | 1.06 (0.41, 1.70) | 13.09 (9.32, 16.38) |
2013/14 | 0.58 (0.14, 1.05) | 13.21 (9.34, 17.32) |
2014/15 | 0.25 (0.01, 0.60) | 11.43 (7.74, 15.17) |
inpatient_pool_race_samples = pd.concat(
[inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True),
pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)
inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples,
id_vars=['under_5', 'year'])
.replace({'under_5':{0:'5+', 1:'<5'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
.rename(columns={'under_5':'age group'}))
Plot of rates pooled over race and ethnicity
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x11ada9ef0>
# Order strata for the pooled-by-year model (descending so under-5 rows
# come first, with each stratum's three years adjacent).
# BUG FIX: DataFrame.sort(columns=...) was deprecated in pandas 0.17 and
# later removed; sort_values is the supported equivalent.
inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
                            .sort_values(['under_5', 'black', 'hispanic', 'year'], ascending=False)).reset_index(drop=True)
inpatient_pool_year_data
under_5 | year | black | hispanic | cases | n | |
---|---|---|---|---|---|---|
0 | 1 | 2 | 1 | 1 | 0 | 552 |
1 | 1 | 1 | 1 | 1 | 0 | 566 |
2 | 1 | 0 | 1 | 1 | 0 | 533 |
3 | 1 | 2 | 1 | 0 | 8 | 13471 |
4 | 1 | 1 | 1 | 0 | 10 | 13511 |
5 | 1 | 0 | 1 | 0 | 13 | 13591 |
6 | 1 | 2 | 0 | 1 | 17 | 6909 |
7 | 1 | 1 | 0 | 1 | 10 | 6994 |
8 | 1 | 0 | 0 | 1 | 15 | 7134 |
9 | 1 | 2 | 0 | 0 | 11 | 20479 |
10 | 1 | 1 | 0 | 0 | 24 | 19942 |
11 | 1 | 0 | 0 | 0 | 20 | 19290 |
12 | 0 | 2 | 1 | 1 | 0 | 983 |
13 | 0 | 1 | 1 | 1 | 0 | 914 |
14 | 0 | 0 | 1 | 1 | 0 | 891 |
15 | 0 | 2 | 1 | 0 | 1 | 40556 |
16 | 0 | 1 | 1 | 0 | 0 | 40656 |
17 | 0 | 0 | 1 | 0 | 4 | 40334 |
18 | 0 | 2 | 0 | 1 | 1 | 15653 |
19 | 0 | 1 | 0 | 1 | 3 | 14692 |
20 | 0 | 0 | 0 | 1 | 2 | 13889 |
21 | 0 | 2 | 0 | 0 | 0 | 47487 |
22 | 0 | 1 | 0 | 0 | 2 | 47527 |
23 | 0 | 0 | 0 | 0 | 4 | 47077 |
inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)
with inpatient_model_pool_year:
start = find_MAP()
step = NUTS(scaling=start)
inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 30.1 sec
inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
.drop_duplicates().reset_index(drop=True))
estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)
age | race | ethnicity | mean | sd | mc_error | hpd_2.5 | hpd_97.5 | median | hpd_25 | hpd_75 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | <5 | black | hispanic | 0.003 | 0.107 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
1 | <5 | black | non-hispanic | 26.644 | 4.522 | 0.130 | 18.073 | 35.547 | 26.388 | 23.132 | 29.001 |
2 | <5 | white | hispanic | 70.787 | 10.213 | 0.353 | 50.471 | 90.144 | 70.350 | 63.174 | 76.382 |
3 | <5 | white | non-hispanic | 31.384 | 3.838 | 0.089 | 24.137 | 38.477 | 31.189 | 27.996 | 33.245 |
4 | 5+ | black | hispanic | 0.001 | 0.017 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
5 | 5+ | black | non-hispanic | 1.219 | 0.540 | 0.015 | 0.323 | 2.292 | 1.145 | 0.727 | 1.400 |
6 | 5+ | white | hispanic | 3.804 | 1.572 | 0.041 | 1.113 | 6.877 | 3.589 | 2.288 | 4.234 |
7 | 5+ | white | non-hispanic | 1.267 | 0.507 | 0.014 | 0.440 | 2.268 | 1.206 | 0.758 | 1.410 |
generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels, index=['race','ethnicity'], columns=['age'])
age | 5+ | <5 | |
---|---|---|---|
race | ethnicity | ||
black | hispanic | 0.00 (0.00, 0.00) | 0.00 (0.00, 0.00) |
non-hispanic | 1.22 (0.32, 2.29) | 26.64 (18.07, 35.55) | |
white | hispanic | 3.80 (1.11, 6.88) | 70.79 (50.47, 90.14) |
non-hispanic | 1.27 (0.44, 2.27) | 31.38 (24.14, 38.48) |
inpatient_pool_year_samples = pd.concat(
[inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']],
pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)
inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
.replace({'under_5':{0:'5+', 1:'<5'},
'black':{0:'white', 1:'black'},
'hispanic':{0:'non-hispanic', 1:'hispanic'}})
.rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))
inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)
Plot of rates pooled by year
sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group",
data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
palette="gray_r", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x120133ba8>
Import outpatient denominator information.
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()
outpatient_project = Project(api_url, api_key)
outpatients = outpatient_project.export_records(format='df')
outpatients.groupby('year').year.count()
year 2012 15969 2015 17562 Name: year, dtype: int64
outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape
(393, 15)
outpatients_kids = outpatients[outpatients.ptage<=18].copy()
outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)
outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
.groupby(['year', 'under_5'])
.n.count()
.reset_index())
outpatient_n
year | under_5 | n | |
---|---|---|---|
0 | 2012 | 0 | 13913 |
1 | 2012 | 1 | 1422 |
2 | 2015 | 0 | 11003 |
3 | 2015 | 1 | 6135 |
interp_2013 = (outpatient_n.groupby(['under_5'])
.apply(lambda x: np.interp(2013, x.year, x.n))
.astype(int)
.reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013
under_5 | n | year | |
---|---|---|---|
0 | 0 | 12943 | 2013 |
1 | 1 | 2993 | 2013 |
outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n
n | under_5 | year | |
---|---|---|---|
0 | 13913 | 0 | 0 |
1 | 1422 | 1 | 0 |
0 | 12943 | 0 | 1 |
1 | 2993 | 1 | 1 |
outpatient_data_full = setting_data[OUTPATIENT].groupby(['under_5','year']).cases.sum().reset_index()
outpatient_data_full
under_5 | year | cases | |
---|---|---|---|
0 | 0 | 0 | 148 |
1 | 0 | 1 | 120 |
2 | 0 | 2 | 87 |
3 | 1 | 0 | 293 |
4 | 1 | 1 | 273 |
5 | 1 | 2 | 269 |
outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['under_5','year'])
outpatient_merged
under_5 | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 148 | 13913 |
1 | 0 | 1 | 120 | 12943 |
2 | 1 | 0 | 293 | 1422 |
3 | 1 | 1 | 273 | 2993 |
def generate_outpatient_model(outpatient_data, pool_years=False):
    """Build a binomial model of outpatient AGE rates.

    Parameters
    ----------
    outpatient_data : DataFrame
        Must contain integer columns ``cases`` and ``n``.
    pool_years : bool
        If True, sum cases (and denominators) over consecutive groups of
        three rows before modelling; requires the row count to be a
        multiple of 3.

    Returns
    -------
    pymc3.Model (unsampled)
    """
    with Model() as model:
        cases, n = outpatient_data[['cases', 'n']].values.T
        # Probability a true case is captured by outpatient monitoring.
        ω = monitoring_rate[OUTPATIENT]
        shape = outpatient_data.shape[0]
        if pool_years:
            assert not len(cases) % 3
            shape = int(len(cases) / 3)
            # BUG FIX: was `pooled_shape`, an undefined name that raised
            # NameError whenever pool_years=True; the pooled row count is
            # `shape`. The denominators must be pooled the same way, or the
            # Binomial below fails to broadcast against the pooled cases.
            cases = cases.reshape((shape, 3)).sum(-1)
            n = n.reshape((shape, 3)).sum(-1)
        # Latent outpatient AGE probability for each row.
        π = Beta('π', 1, 10, shape=shape)
        # Observed cases are the true rate thinned by the monitoring rate.
        AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)
    return model
outpatient_model_full = generate_outpatient_model(outpatient_merged)
Applied logodds-transform to π and added transformed π_logodds to model.
with outpatient_model_full:
    # MAP estimate doubles as the NUTS scaling and the chains' start point.
    start = find_MAP()
    step = NUTS(scaling=start)
    # Two chains with fixed seeds for reproducibility.
    outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
[-----------------100%-----------------] 2000 of 2000 complete in 1.7 sec
forestplot(outpatient_full_trace[1000:], varnames=['π'])
<matplotlib.gridspec.GridSpec at 0x12765d978>
# Human-readable (age, year) labels for the summary table.
# NOTE(review): labels come from inpatient_pool_race_data rather than
# outpatient_merged -- presumably their (under_5, year) row ordering is
# identical; verify, since a mismatch would silently mislabel the table.
outpatient_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                          .reset_index(drop=True)
                          .replace(label_map)
                          .rename(columns={'under_5':'age'}))
# Only two study years have outpatient denominators.
outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])]
generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'],
               index=['age'], columns=['year'])
year | 2012/13 | 2013/14 |
---|---|---|
age | ||
5+ | 187.53 (159.72, 219.80) | 163.25 (133.51, 192.79) |
<5 | 3563.82 (3196.31, 3912.01) | 1593.31 (1421.97, 1781.77) |
# Attach the last 1000 posterior draws of π to their (age, year) rows,
# then melt to long format with readable labels for plotting.
outpatient_full_samples = pd.concat(
    [outpatient_merged[['under_5', 'year']].reset_index(drop=True),
     pd.DataFrame(outpatient_full_trace['π'][-1000:]).T], axis=1)
outpatient_full_samples = (pd.melt(outpatient_full_samples,
                                   id_vars=['under_5', 'year'])
                           .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                           .rename(columns={'under_5':'age group'}))
sns.set_context(font_scale=1.5)
# Rates are scaled to cases per `rate_factor` population.
sns.boxplot(x="year", y="rate", hue="age group",
            data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x12585b390>
# Daily emergency-department census spreadsheets for study years 2 and 3.
ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
# Harmonize headers: the year-2 file's column labels differ slightly.
ED2.columns = ED3.columns
ED2_total = ED2.sum()
# Eligible / visit totals split into the under-5 band and the 5-17 bands.
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
# Stack both years and compute the daily total of enrollment-eligible
# ED patients (all age bands) -- the 24-hour denominator.
ED_all = pd.concat([ED2, ED3])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]
ED_counts = ED_all[eligible_cols].sum(1).astype(int)
# ED-only enrollees, used for the 8-hour surveillance counts.
rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]
# BUG FIX: the daily 8-hour surveillance count was previously computed from
# the full enrollment data (all settings), even though `rotavirus_ed` was
# created immediately above for exactly this purpose and then never used.
# Count only ED enrollees so the 8h/24h comparison is like-for-like.
surveillance_counts = rotavirus_ed.groupby(['Enrollment Date'])['Record ID'].count()
ED_counts.name = 'count24'
surveillance_counts.name = 'count8'
# Align the two daily series; drop days present in only one of them.
surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]
Here is what the 8-hour/24-hour capture proportion looks like when resampled to weekly counts.
# Weekly resampling of the 8h/24h capture ratio, for visual inspection.
weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()
(weekly_surveillance.count8 / weekly_surveillance.count24).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x101bca7f0>
We will just use the proportion of the pooled surveillance.
# Pooled totals across all surveillance days.
surveillance_total = surveillance.sum()
surveillance_total
count24 3486.0 count8 1849.0 dtype: float64
# Fraction of 24-hour ED volume captured by the 8-hour surveillance window.
surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion
0.53040734366035569
def generate_ed_model(ED_data):
    """Build a model of ED AGE rates with under-ascertainment correction.

    ``ED_data`` must contain integer columns ``cases``, ``n`` and ``year``.
    The observed cases are treated as a binomial thinning of a latent true
    case count, with capture probability combining the ED monitoring rate,
    the hospital's ED market share for the row's year, and the 8h/24h
    surveillance proportion.

    Returns the (unsampled) pymc3 Model.
    """
    with Model() as model:
        data_len = ED_data.shape[0]
        cases, n = ED_data[['cases','n']].values.T
        # Per-row capture probability.
        # NOTE(review): ed_market_share[:-1] drops the final year's share
        # here, while the virus ED model below indexes the full list --
        # confirm which indexing is intended.
        weights = (np.array(monitoring_rate)[ED]
                   * np.array(ed_market_share[:-1])[ED_data.year]
                   * surveillance_proportion)
        # Latent true case count, bounded below by the observed count and
        # above by the population denominator.
        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                         testval=(cases/weights).astype(int))
        # Observed cases are a thinned sample of the latent counts.
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
        θ = Normal('θ', 0, sd=1000, shape=data_len)
        # Per-capita rate.
        λ = Deterministic('λ', tt.exp(θ))
        age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)
    return model
# ED case counts pooled over race for the first two study years only.
ed_pool_race_data = (setting_data[ED]
                     .drop('setting', axis=1)
                     .loc[lambda df: df.year < 2]
                     .groupby(['under_5', 'year'])[['cases', 'n']]
                     .sum()
                     .reset_index())
ed_pool_race_data
under_5 | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 219 | 102191 |
1 | 0 | 1 | 203 | 103789 |
2 | 1 | 0 | 497 | 40548 |
3 | 1 | 1 | 533 | 41013 |
ed_model = generate_ed_model(ed_pool_race_data)
with ed_model:
    # MAP start + NUTS for the continuous parameters; pymc3 automatically
    # assigns Metropolis to the discrete weighted_cases variable.
    start = find_MAP()
    step = NUTS(scaling=start)
    ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 2.2 sec
forestplot(ed_trace[1000:], varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x128896320>
# Readable (age, year) labels matching the row order of ed_pool_race_data.
ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
                  .reset_index(drop=True)
                  .replace(label_map)
                  .rename(columns={'under_5':'age'}))
generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])
age | 5+ | <5 |
---|---|---|
year | ||
2012/13 | 75.48 (66.17, 86.08) | 432.16 (393.73, 468.47) |
2013/14 | 69.52 (60.21, 78.79) | 467.24 (428.29, 506.03) |
Create data frame of MCMC samples for plotting
# Attach the last 1000 posterior draws of λ to their (age, year) rows
# (ed_pool_race_data already has a 0..3 RangeIndex, so concat aligns).
ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']],
                             pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)
ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
                   .replace({'under_5':{0:'5+', 1:'<5'},
                             'year':{0:'2012/13', 1:'2013/14'}})
                   .rename(columns={'under_5':'age group'}))
sns.set_context(font_scale=1.5)
# Rates are scaled to cases per `rate_factor` population.
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x1272e4278>
# Census denominators by age band and study year for the virus models.
# AGEGRP == 1 is the under-5 band; note under_5 is boolean here, whereas
# the case data codes it 0/1.
census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
                     .groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()
# Re-code to study-year index. NOTE(review): the offset is 2011 here but
# 2012 for the outpatient denominators above -- presumably the census YEAR
# field uses a different coding than calendar year; verify.
census_data_virus['YEAR'] = census_data_virus.YEAR - 2011
census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()
under_5 | year | n | |
---|---|---|---|
0 | False | 0 | 111216 |
1 | False | 1 | 113190 |
2 | False | 2 | 114362 |
3 | True | 0 | 45013 |
4 | True | 1 | 45582 |
rotavirus_data[['rotavirus','norovirus']].sum(1).value_counts()
0 2598 1 806 2 48 dtype: int64
# Case counts for every combination of age group, year, setting and the
# four virus indicators. The misspelled 'astrovorus' column (typo made when
# the indicator was created) is corrected to 'astrovirus' in the rename.
virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting', 'rotavirus', 'sapovirus',
                                         'astrovorus', 'norovirus'])
                 ['Record ID'].count()
                 .reset_index()
                 .rename(columns={'Record ID':'cases', 'astrovorus':'astrovirus'}))
virus_summary.head()
under_5 | year | setting | rotavirus | sapovirus | astrovirus | norovirus | cases | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 158 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 10 |
2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 |
3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 12 |
4 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 3 |
# Attach census denominators to each (under_5, year) stratum.
# NOTE(review): census under_5 is boolean while virus_summary's is 0/1 int;
# the displayed output shows the keys matched here, but an explicit cast
# would be safer against pandas version changes.
merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)
under_5 | year | setting | rotavirus | sapovirus | astrovirus | norovirus | cases | n | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 158 | 111216 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 10 | 111216 |
2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 2 | 111216 |
3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 12 | 111216 |
4 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 3 | 111216 |
5 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 30 | 111216 |
6 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 3 | 111216 |
7 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 111216 |
8 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | 111216 |
9 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 111216 |
# Per-virus totals: sum cases over co-infection combinations within each
# (age group, year, setting) stratum; 'n' is constant within a stratum, so
# max() simply carries the denominator through.
rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max}))
# The rarer viruses can have strata with no positives; reindex each to the
# full rotavirus index, zero-fill missing case counts, and forward-fill the
# missing denominators.
# NOTE(review): the pad fill assumes the preceding row's n belongs to the
# same (under_5, year) stratum -- it does here because n varies only by
# those keys -- but verify if the data change.
sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max})
             .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
              .groupby(['under_5', 'year', 'setting'])
              .agg({'cases':sum, 'n':max})
              .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max})
             .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
# Stack into one long table labeled by virus; the resulting row order is
# virus-major (all astrovirus rows, then norovirus, etc.).
virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
                          noro_data.reset_index().assign(virus='norovirus'),
                          rota_data.reset_index().assign(virus='rotavirus'),
                          sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)
virus_totals.head(10)
under_5 | year | setting | cases | n | virus | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 5 | 111216 | astrovirus |
1 | 0 | 0 | 1 | 0 | 111216 | astrovirus |
2 | 0 | 0 | 2 | 6 | 111216 | astrovirus |
3 | 0 | 1 | 0 | 4 | 113190 | astrovirus |
4 | 0 | 1 | 1 | 0 | 113190 | astrovirus |
5 | 0 | 1 | 2 | 2 | 113190 | astrovirus |
6 | 0 | 2 | 0 | 3 | 114362 | astrovirus |
7 | 0 | 2 | 1 | 0 | 114362 | astrovirus |
8 | 0 | 2 | 2 | 1 | 114362 | astrovirus |
9 | 1 | 0 | 0 | 13 | 45013 | astrovirus |
# Denominator lookup keyed by (under_5, year, virus). Beware: groupby sorts
# its keys, so this table is ordered under_5-major -- NOT the virus-major
# row order of virus_totals itself.
n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup
n | |||
---|---|---|---|
under_5 | year | virus | |
0 | 0 | astrovirus | 111216 |
norovirus | 111216 | ||
rotavirus | 111216 | ||
sapovirus | 111216 | ||
1 | astrovirus | 113190 | |
norovirus | 113190 | ||
rotavirus | 113190 | ||
sapovirus | 113190 | ||
2 | astrovirus | 114362 | |
norovirus | 114362 | ||
rotavirus | 114362 | ||
sapovirus | 114362 | ||
1 | 0 | astrovirus | 45013 |
norovirus | 45013 | ||
rotavirus | 45013 | ||
sapovirus | 45013 | ||
1 | astrovirus | 45582 | |
norovirus | 45582 | ||
rotavirus | 45582 | ||
sapovirus | 45582 | ||
2 | astrovirus | 46003 | |
norovirus | 46003 | ||
rotavirus | 46003 | ||
sapovirus | 46003 |
# Inpatient rows only, with virus names recoded to integer codes.
virus_inpatient_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                        'rotavirus':1,
                                                        'sapovirus':2,
                                                        'astrovirus':3}}))[virus_totals.setting==INPATIENT]
with Model() as virus_inpatient_model:
    cases, year, n = virus_inpatient_data[['cases', 'year', 'n']].values.T
    # Capture probability: inpatient monitoring rate times the hospital's
    # inpatient market share for each row's year.
    weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year]
    # Latent true case counts, bounded by the observed count and the
    # population denominator.
    weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=virus_inpatient_data.shape[0],
                                     testval=(cases/weights).astype(int))
    # Observed cases are a binomial thinning of the latent counts.
    weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
    θ = Normal('θ', 0, sd=1000, shape=virus_inpatient_data.shape[0])
    λ = Deterministic('λ', tt.exp(θ))
    # Poisson likelihood on the latent counts, expressed as a Potential
    # because the "observed" values here are themselves random variables.
    virus_obs = Potential('virus_obs', Poisson.dist(λ*n).logp(weighted_cases))
from pymc3 import Metropolis
with virus_inpatient_model:
    # Metropolis throughout (longer run than the NUTS models): the
    # likelihood is attached to discrete latent counts via a Potential.
    virus_inpatient_trace = sample(20000, step=Metropolis(), njobs=2, random_seed=seed_numbers)
[-----------------100%-----------------] 20000 of 20000 complete in 6.6 sec
# Attach posterior draws of λ to their (age, year, virus) labels.
# BUG FIX: labels were previously taken from n_lookup.reset_index(), which
# is ordered (under_5, year, virus) by the groupby sort, while the λ
# columns follow the row order of virus_inpatient_data (virus-major, from
# the concat that built virus_totals) -- so the samples were mislabeled.
# Take the labels from the model's own data frame and map the integer
# virus codes back to names.
virus_samples = pd.concat(
    [virus_inpatient_data[['under_5', 'year', 'virus']].reset_index(drop=True),
     pd.DataFrame(virus_inpatient_trace['λ'][-1000:]).T], axis=1)
virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
                 .replace({'under_5':{0:'5+', 1:'<5'},
                           'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                           'virus':{0:'norovirus', 1:'rotavirus',
                                    2:'sapovirus', 3:'astrovirus'}})
                 .rename(columns={'under_5':'age group'}))
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                          data=virus_samples.assign(rate=virus_samples.value*rate_factor),
                          palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
                          col_wrap=2)
rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x128302278>
# Outpatient rows only, with virus names recoded to integer codes.
virus_outpatient_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                         'rotavirus':1,
                                                         'sapovirus':2,
                                                         'astrovirus':3}}))[virus_totals.setting==OUTPATIENT]
with Model() as virus_outpatient_model:
    cases, n = virus_outpatient_data[['cases','n']].values.T
    # Outpatient monitoring rate only -- no market-share or surveillance
    # correction, mirroring the all-cause outpatient model above.
    ω = monitoring_rate[OUTPATIENT]
    π = Beta('π', 1, 10, shape=virus_outpatient_data.shape[0])
    virus_out_obs = Binomial('virus_out_obs', n, π*ω, observed=cases)
Applied logodds-transform to π and added transformed π_logodds to model.
with virus_outpatient_model:
    # Default step assignment (NUTS on the log-odds-transformed π).
    virus_outpatient_trace = sample(2000, njobs=2, random_seed=seed_numbers)
Assigned NUTS to π_logodds [-----------------100%-----------------] 2000 of 2000 complete in 7.4 sec
# Attach posterior draws of π to their (age, year, virus) labels.
# BUG FIX: labels were previously taken from n_lookup.reset_index()
# ((under_5, year, virus) sort order), but the π columns follow the
# virus-major row order of virus_outpatient_data, so the samples were
# mislabeled. Label from the model's own data frame and map the integer
# virus codes back to names.
virus_out_samples = pd.concat(
    [virus_outpatient_data[['under_5', 'year', 'virus']].reset_index(drop=True),
     pd.DataFrame(virus_outpatient_trace['π'][-1000:]).T], axis=1)
virus_out_samples = (pd.melt(virus_out_samples, id_vars=['under_5', 'year', 'virus'])
                     .replace({'under_5':{0:'5+', 1:'<5'},
                               'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                               'virus':{0:'norovirus', 1:'rotavirus',
                                        2:'sapovirus', 3:'astrovirus'}})
                     .rename(columns={'under_5':'age group'}))
sns.set(style="ticks")
sns.set_context(font_scale=1.5)
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                          data=virus_out_samples.assign(rate=virus_out_samples.value*rate_factor),
                          palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
                          col_wrap=2)
rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x12b235198>
# ED rows only, with virus names recoded to integer codes.
virus_ed_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                 'rotavirus':1,
                                                 'sapovirus':2,
                                                 'astrovirus':3}}))[virus_totals.setting==ED]
with Model() as virus_ed_model:
    data_len = virus_ed_data.shape[0]
    cases, year, n = virus_ed_data[['cases', 'year', 'n']].values.T
    # Capture probability: monitoring rate x ED market share for the row's
    # year x 8h/24h surveillance proportion.
    # NOTE(review): this indexes the full ed_market_share list, whereas
    # generate_ed_model above uses ed_market_share[:-1] -- confirm which
    # indexing is correct.
    weights = (np.array(monitoring_rate)[ED]
               * np.array(ed_market_share)[year]
               * surveillance_proportion)
    # Latent true case counts, bounded by the observed count and the
    # population denominator.
    weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                     testval=(cases/weights).astype(int))
    weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
    θ = Normal('θ', 0, sd=1000, shape=data_len)
    λ = Deterministic('λ', tt.exp(θ))
    virus_ed_obs = Poisson('virus_ed_obs', λ * n, observed=weighted_cases)
with virus_ed_model:
    # Automatic step assignment: Metropolis for the discrete weighted_cases,
    # NUTS for θ.
    virus_ed_trace = sample(2000, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases Assigned NUTS to θ [-----------------100%-----------------] 2000 of 2000 complete in 10.4 sec
# Attach posterior draws of λ to their (age, year, virus) labels.
# BUG FIX: labels were previously taken from n_lookup.reset_index()
# ((under_5, year, virus) sort order), but the λ columns follow the
# virus-major row order of virus_ed_data, so the samples were mislabeled.
# Label from the model's own data frame and map the integer virus codes
# back to names.
virus_ed_samples = pd.concat(
    [virus_ed_data[['under_5', 'year', 'virus']].reset_index(drop=True),
     pd.DataFrame(virus_ed_trace['λ'][-1000:]).T], axis=1)
virus_ed_samples = (pd.melt(virus_ed_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                              'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                              'virus':{0:'norovirus', 1:'rotavirus',
                                       2:'sapovirus', 3:'astrovirus'}})
                    .rename(columns={'under_5':'age group'}))
sns.set(style="ticks")
sns.set_context(font_scale=1.5)
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                          data=virus_ed_samples.assign(rate=virus_ed_samples.value*rate_factor),
                          palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
                          col_wrap=2)
rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x12d316550>