%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')
seed_numbers = 20090425, 19700903
from datetime import timedelta, datetime
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
Import data
rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique
True
Rename columns and recode data
race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]
(rotavirus_data[race_cols]=='Checked').sum()
Race (choice=White) 2249 Race (choice=Black/African American) 1545 Race (choice=American Indian/Alaskan Native) 5 Race (choice=Asian) 86 Race (choice=Native Hawaiian/Other Pacific Islander) 5 Race (choice=Other) 18 Race (choice=Unknown) 3 Race (choice=None) 1 dtype: int64
rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)
rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)
rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)
rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovorus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)
rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns if col.startswith('Result')], axis=1)
rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])
Calculate age, and compare to given age.
rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
- rotavirus_data['Date of Birth'] + timedelta(1)).astype('timedelta64[M]')
np.abs((rotavirus_data['Age in months (auto calculated) '] - rotavirus_data['age_months']) > 1).sum()
0
Group into ages under 5 and 5 or over
rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)
Calculate natural year
rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()
0 1215 1 1178 2 1059 Name: year, dtype: int64
Recode sex and stool
rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'
# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth',
'Age in months (auto calculated) ', 'Sex',
'Sex updated', 'Was stool collected?',
'Ethnicity', 'Year'], axis=1)
rotavirus_data = pd.concat([rotavirus_data, pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)
rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()
0 2117 2 1190 1 145 Name: setting, dtype: int64
rotavirus_data.shape
(3452, 18)
rotavirus_data.head()
Record ID | Enrollment Date | white | black | hispanic | rotavirus | sapovirus | astrovorus | norovirus | age_months | under_5 | year | male | stool_collected | ED | IP | OP | setting | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | EN1C-0001 | 2012-12-03 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 77.0 | 0 | 0 | True | True | 0.0 | 0.0 | 1.0 | 2 |
1 | EN1C-0002 | 2012-12-03 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 87.0 | 0 | 0 | True | False | 0.0 | 0.0 | 1.0 | 2 |
2 | EN1C-0003 | 2012-12-03 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 63.0 | 0 | 0 | False | False | 0.0 | 0.0 | 1.0 | 2 |
3 | EN1C-0004 | 2012-12-04 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 36.0 | 1 | 0 | False | True | 0.0 | 0.0 | 1.0 | 2 |
4 | EN1C-0006 | 2012-12-04 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 64.0 | 0 | 0 | False | False | 0.0 | 0.0 | 1.0 | 2 |
rotavirus_data.isnull().sum()
Record ID 0 Enrollment Date 0 white 0 black 0 hispanic 0 rotavirus 0 sapovirus 0 astrovorus 0 norovirus 0 age_months 0 under_5 0 year 0 male 0 stool_collected 0 ED 0 IP 0 OP 0 setting 0 dtype: int64
rotavirus_data.describe()
white | black | hispanic | rotavirus | sapovirus | astrovorus | norovirus | age_months | under_5 | year | ED | IP | OP | setting | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 | 3452.000000 |
mean | 0.601970 | 0.398030 | 0.398610 | 0.108633 | 0.081112 | 0.037080 | 0.152665 | 44.912804 | 0.715527 | 0.954809 | 0.613268 | 0.042005 | 0.344728 | 0.731460 |
std | 0.489563 | 0.489563 | 0.489683 | 0.311223 | 0.273047 | 0.188985 | 0.359717 | 45.485356 | 0.451228 | 0.810492 | 0.487072 | 0.200629 | 0.475348 | 0.941349 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 11.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 27.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 67.000000 | 1.000000 | 2.000000 | 1.000000 | 0.000000 | 1.000000 | 2.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 215.000000 | 1.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 |
Aggregate data by age group and setting
rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
.groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
.reset_index()
.rename(columns={'Record ID':'cases'}))
rotavirus_summary.head()
under_5 | year | setting | black | hispanic | cases | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 57 |
1 | 0 | 0 | 0 | 0 | 1 | 62 |
2 | 0 | 0 | 0 | 1 | 0 | 99 |
3 | 0 | 0 | 0 | 1 | 1 | 1 |
4 | 0 | 0 | 1 | 0 | 0 | 4 |
The key for AGEGRP is as follows:
davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()
SUMLEV | STATE | COUNTY | STNAME | CTYNAME | YEAR2008 | YEAR | AGEGRP | TOT_POP | TOT_MALE | ... | HWAC_MALE | HWAC_FEMALE | HBAC_MALE | HBAC_FEMALE | HIAC_MALE | HIAC_FEMALE | HAAC_MALE | HAAC_FEMALE | HNAC_MALE | HNAC_FEMALE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 0 | 626681 | 303540 | ... | 30400 | 24829 | 2178 | 2006 | 1501 | 1183 | 381 | 374 | 271 | 211 |
1 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 1 | 44691 | 22868 | ... | 3838 | 3664 | 302 | 303 | 185 | 189 | 63 | 53 | 26 | 35 |
2 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 2 | 37613 | 18977 | ... | 2910 | 2826 | 266 | 205 | 145 | 140 | 45 | 50 | 22 | 23 |
3 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 3 | 33904 | 17479 | ... | 2154 | 1890 | 166 | 170 | 105 | 91 | 25 | 29 | 18 | 16 |
4 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 4 | 38999 | 19674 | ... | 2029 | 1783 | 166 | 174 | 111 | 82 | 29 | 29 | 22 | 16 |
5 rows × 81 columns
Identify racial/ethnic subgroup populations
census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)
census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
.groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK',
'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
.sum()).reset_index()
census_data['YEAR'] = census_data.YEAR - 2011
census_data
under_5 | YEAR | HISPANIC_WHITE | HISPANIC_BLACK | NONHISPANIC_WHITE | NONHISPANIC_BLACK | |
---|---|---|---|---|---|---|
0 | False | 0 | 13889 | 891 | 47077 | 40334 |
1 | False | 1 | 14692 | 914 | 47527 | 40656 |
2 | False | 2 | 15653 | 983 | 47487 | 40556 |
3 | True | 0 | 7134 | 533 | 19290 | 13591 |
4 | True | 1 | 6994 | 566 | 19942 | 13511 |
5 | True | 2 | 6909 | 552 | 20479 | 13471 |
Reshape data
melted_census = (pd.melt(census_data, id_vars=['under_5', 'YEAR'],
value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK',
'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']))
census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
black=melted_census.variable.apply(lambda x: x.split('_')[1]))
.drop('variable', axis=1)
.replace({'black':{'BLACK':1, 'WHITE':0},
'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
.rename(columns={'YEAR':'year', 'value':'n'}))
census_clean
under_5 | year | n | black | hispanic | |
---|---|---|---|---|---|
0 | False | 0 | 13889 | 0 | 1 |
1 | False | 1 | 14692 | 0 | 1 |
2 | False | 2 | 15653 | 0 | 1 |
3 | True | 0 | 7134 | 0 | 1 |
4 | True | 1 | 6994 | 0 | 1 |
5 | True | 2 | 6909 | 0 | 1 |
6 | False | 0 | 891 | 1 | 1 |
7 | False | 1 | 914 | 1 | 1 |
8 | False | 2 | 983 | 1 | 1 |
9 | True | 0 | 533 | 1 | 1 |
10 | True | 1 | 566 | 1 | 1 |
11 | True | 2 | 552 | 1 | 1 |
12 | False | 0 | 47077 | 0 | 0 |
13 | False | 1 | 47527 | 0 | 0 |
14 | False | 2 | 47487 | 0 | 0 |
15 | True | 0 | 19290 | 0 | 0 |
16 | True | 1 | 19942 | 0 | 0 |
17 | True | 2 | 20479 | 0 | 0 |
18 | False | 0 | 40334 | 1 | 0 |
19 | False | 1 | 40656 | 1 | 0 |
20 | False | 2 | 40556 | 1 | 0 |
21 | True | 0 | 13591 | 1 | 0 |
22 | True | 1 | 13511 | 1 | 0 |
23 | True | 2 | 13471 | 1 | 0 |
census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])
merged_data = (rotavirus_summary.merge(census_full, on=['under_5', 'year', 'black', 'hispanic','setting'], how='right')
.fillna(0).astype(int))
merged_data.head()
under_5 | year | setting | black | hispanic | cases | n | |
---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 57 | 47077 |
1 | 0 | 0 | 0 | 0 | 1 | 62 | 13889 |
2 | 0 | 0 | 0 | 1 | 0 | 99 | 40334 |
3 | 0 | 0 | 0 | 1 | 1 | 1 | 891 |
4 | 0 | 0 | 1 | 0 | 0 | 4 | 47077 |
VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health
inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92
Y2012, Y2013, Y2014 = 0, 1, 2
Will estimate 2014 market shares by extrapolating a linear fit to the previous years.
from sklearn import linear_model
inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver. warnings.warn(mesg, RuntimeWarning)
[0.9, 0.83, 0.77500000000000002]
ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share
[0.94, 0.92, 0.90500000000000014]
Enrollment days/week for ED, inpatient, outpatient
monitoring_days = 4, 7, 4
# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2
monitoring_rate = np.array(np.array(monitoring_days)/7)
monitoring_rate
array([ 0.57142857, 1. , 0.57142857])
rotavirus_data[(rotavirus_data.hispanic==1) & (rotavirus_data.white==1)
& (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
(rotavirus_data.under_5==1)].shape
(157, 18)
setting_data = dict(list(merged_data.groupby('setting')))
from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary
import theano.tensor as tt
Couldn't import dot_parser, loading of dot files will not be possible.
def generate_inpatient_model(inpatient_data, pool_years=False):
with Model() as model:
under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T
weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]
weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=inpatient_data.shape[0],
testval=(cases/weights).astype(int))
weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
if pool_years:
assert not len(cases) % 3
pooled_shape = int(len(cases)/3)
pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
θ = Normal('θ', 0, sd=1000, shape=pooled_shape)
λ = Deterministic('λ', tt.exp(θ))
AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
else:
θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])
λ = Deterministic('λ', tt.exp(θ))
AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
return model
inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))
with inpatient_model_full:
start = find_MAP()
step = NUTS(scaling=start)
inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 97.1 sec
plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x1198dec50>
label_map = {'under_5':{0:'5+', 1:'<5'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
'black':{0:'white', 1:'black'},
'hispanic':{0:'non-hispanic', 1:'hispanic'},
'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}
inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))
Rate factor for population
rate_factor = 10000
rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)
inpatient_data_labels
age | year | race | ethnicity | |
---|---|---|---|---|
0 | 5+ | 2012/13 | white | non-hispanic |
1 | 5+ | 2012/13 | white | hispanic |
2 | 5+ | 2012/13 | black | non-hispanic |
3 | 5+ | 2013/14 | white | non-hispanic |
4 | 5+ | 2013/14 | white | hispanic |
5 | 5+ | 2014/15 | white | hispanic |
6 | 5+ | 2014/15 | black | non-hispanic |
7 | <5 | 2012/13 | white | non-hispanic |
8 | <5 | 2012/13 | white | hispanic |
9 | <5 | 2012/13 | black | non-hispanic |
10 | <5 | 2013/14 | white | non-hispanic |
11 | <5 | 2013/14 | white | hispanic |
12 | <5 | 2013/14 | black | non-hispanic |
13 | <5 | 2014/15 | white | non-hispanic |
14 | <5 | 2014/15 | white | hispanic |
15 | <5 | 2014/15 | black | non-hispanic |
16 | 5+ | 2012/13 | black | hispanic |
17 | 5+ | 2013/14 | black | hispanic |
18 | 5+ | 2014/15 | black | hispanic |
19 | <5 | 2012/13 | black | hispanic |
20 | <5 | 2013/14 | black | hispanic |
21 | <5 | 2014/15 | black | hispanic |
22 | 5+ | 2014/15 | white | non-hispanic |
23 | 5+ | 2013/14 | black | non-hispanic |
from pymc3 import hpd
def _hpd_df(x, alpha):
cnames = ['hpd_{0:g}'.format(100 * alpha/2),
'hpd_{0:g}'.format(100 * (1 - alpha/2))]
return pd.DataFrame(hpd(x, alpha), columns=cnames)
estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
pd.concat([inpatient_data_labels, estimates], axis=1)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
age | year | race | ethnicity | mean | sd | mc_error | hpd_2.5 | hpd_97.5 | median | hpd_25 | hpd_75 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5+ | 2012/13 | white | non-hispanic | 0.814 | 0.438 | 0.031 | 0.140 | 1.656 | 0.720 | 0.363 | 0.859 |
1 | 5+ | 2012/13 | white | hispanic | 1.283 | 0.930 | 0.068 | 0.037 | 3.083 | 1.123 | 0.104 | 1.168 |
2 | 5+ | 2012/13 | black | non-hispanic | 1.025 | 0.495 | 0.031 | 0.195 | 1.988 | 0.966 | 0.471 | 1.083 |
3 | 5+ | 2013/14 | white | non-hispanic | 0.384 | 0.273 | 0.017 | 0.003 | 0.894 | 0.318 | 0.083 | 0.363 |
4 | 5+ | 2013/14 | white | hispanic | 2.152 | 1.267 | 0.088 | 0.203 | 4.546 | 1.927 | 0.791 | 2.192 |
5 | 5+ | 2014/15 | white | hispanic | 0.661 | 0.608 | 0.042 | 0.001 | 1.951 | 0.478 | 0.001 | 0.479 |
6 | 5+ | 2014/15 | black | non-hispanic | 0.246 | 0.241 | 0.017 | 0.004 | 0.720 | 0.170 | 0.004 | 0.171 |
7 | <5 | 2012/13 | white | non-hispanic | 11.252 | 2.339 | 0.144 | 7.025 | 16.002 | 11.011 | 9.193 | 12.169 |
8 | <5 | 2012/13 | white | hispanic | 23.234 | 5.438 | 0.351 | 13.545 | 34.042 | 22.690 | 19.251 | 26.387 |
9 | <5 | 2012/13 | black | non-hispanic | 10.256 | 2.640 | 0.178 | 5.582 | 15.543 | 10.108 | 7.967 | 11.511 |
10 | <5 | 2013/14 | white | non-hispanic | 14.167 | 2.759 | 0.175 | 9.420 | 19.676 | 13.953 | 12.353 | 15.856 |
11 | <5 | 2013/14 | white | hispanic | 17.910 | 4.991 | 0.326 | 8.796 | 28.690 | 17.573 | 14.587 | 20.601 |
12 | <5 | 2013/14 | black | non-hispanic | 8.902 | 2.495 | 0.151 | 4.272 | 13.772 | 8.717 | 6.978 | 10.325 |
13 | <5 | 2014/15 | white | non-hispanic | 6.928 | 1.980 | 0.137 | 3.343 | 10.624 | 6.732 | 5.372 | 7.968 |
14 | <5 | 2014/15 | white | hispanic | 30.327 | 6.354 | 0.398 | 19.451 | 43.707 | 29.692 | 24.363 | 32.647 |
15 | <5 | 2014/15 | black | non-hispanic | 7.642 | 2.316 | 0.158 | 3.258 | 11.928 | 7.531 | 5.498 | 8.595 |
16 | 5+ | 2012/13 | black | hispanic | 0.009 | 0.188 | 0.004 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
17 | 5+ | 2013/14 | black | hispanic | 0.027 | 0.477 | 0.016 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
18 | 5+ | 2014/15 | black | hispanic | 0.018 | 0.338 | 0.008 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
19 | <5 | 2012/13 | black | hispanic | 0.018 | 0.439 | 0.010 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
20 | <5 | 2013/14 | black | hispanic | 0.004 | 0.113 | 0.003 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
21 | <5 | 2014/15 | black | hispanic | 0.039 | 0.704 | 0.020 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
22 | 5+ | 2014/15 | white | non-hispanic | 0.000 | 0.007 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
23 | 5+ | 2013/14 | black | non-hispanic | 0.001 | 0.011 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
trace, labels = inpatient_full_trace, inpatient_data_labels
index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']
# rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
# names=labels.columns)
# rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
# rate_df.index = rate_df_index
# pivot_all = pd.pivot_table(rate_df
# .reset_index(),
# index=index, columns=columns)
# value_iterator = zip(pivot_all['mean'].values.flatten(),
# pivot_all['hpd_2.5'].values.flatten(),
# pivot_all['hpd_97.5'].values.flatten())
# rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
# pivot_all['mean'].shape)
# pd.DataFrame(rate_strings, index=pivot_mean.index, columns=pivot_mean.columns)
def generate_table(trace, labels, index, columns, varnames=['λ']):
rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
names=labels.columns)
rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
rate_df.index = rate_df_index
pivot_all = pd.pivot_table(rate_df
.reset_index(),
index=index, columns=columns)
value_iterator = zip(pivot_all['mean'].values.flatten(),
pivot_all['hpd_2.5'].values.flatten(),
pivot_all['hpd_97.5'].values.flatten())
rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
pivot_all['mean'].shape)
return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)
generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], columns=['race', 'ethnicity'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
race | black | white | |||
---|---|---|---|---|---|
ethnicity | hispanic | non-hispanic | hispanic | non-hispanic | |
age | year | ||||
5+ | 2012/13 | 0.01 (0.00, 0.00) | 1.02 (0.19, 1.99) | 1.28 (0.04, 3.08) | 0.81 (0.14, 1.66) |
2013/14 | 0.03 (0.00, 0.00) | 0.00 (0.00, 0.00) | 2.15 (0.20, 4.55) | 0.38 (0.00, 0.89) | |
2014/15 | 0.02 (0.00, 0.00) | 0.25 (0.00, 0.72) | 0.66 (0.00, 1.95) | 0.00 (0.00, 0.00) | |
<5 | 2012/13 | 0.02 (0.00, 0.00) | 10.26 (5.58, 15.54) | 23.23 (13.54, 34.04) | 11.25 (7.02, 16.00) |
2013/14 | 0.00 (0.00, 0.00) | 8.90 (4.27, 13.77) | 17.91 (8.80, 28.69) | 14.17 (9.42, 19.68) | |
2014/15 | 0.04 (0.00, 0.00) | 7.64 (3.26, 11.93) | 30.33 (19.45, 43.71) | 6.93 (3.34, 10.62) |
inpatient_rate_samples = pd.concat(
[inpatient_data_labels,
pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)
inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race', 'ethnicity']))
inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)
Plot of rates broken down by year/age/race/ethnicity.
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race',
col='ethnicity',
data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x11ad19710>
inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
.groupby(['under_5', 'year'])[['cases','n']]
.sum()
.reset_index())
inpatient_pool_race_data
under_5 | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 10 | 102191 |
1 | 0 | 1 | 5 | 103789 |
2 | 0 | 2 | 2 | 104679 |
3 | 1 | 0 | 48 | 40548 |
4 | 1 | 1 | 44 | 41013 |
5 | 1 | 2 | 36 | 41411 |
inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)
with inpatient_model_pool_race:
start = find_MAP()
step = NUTS(scaling=start)
inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 3.1 sec
inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'under_5':'age'}))
generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels, index=['year'], columns=['age'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
age | 5+ | <5 |
---|---|---|
year | ||
2012/13 | 1.08 (0.48, 1.72) | 12.98 (9.51, 16.87) |
2013/14 | 0.57 (0.14, 1.08) | 13.38 (9.14, 17.50) |
2014/15 | 0.23 (0.00, 0.55) | 11.19 (7.56, 14.86) |
inpatient_pool_race_samples = pd.concat(
[inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True),
pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)
inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples,
id_vars=['under_5', 'year'])
.replace({'under_5':{0:'5+', 1:'<5'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
.rename(columns={'under_5':'age group'}))
Plot of rates pooled by ethnicity
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x11c735080>
inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
.sort(columns=['under_5', 'black', 'hispanic', 'year'], ascending=False)).reset_index(drop=True)
inpatient_pool_year_data
under_5 | year | black | hispanic | cases | n | |
---|---|---|---|---|---|---|
0 | 1 | 2 | 1 | 1 | 0 | 552 |
1 | 1 | 1 | 1 | 1 | 0 | 566 |
2 | 1 | 0 | 1 | 1 | 0 | 533 |
3 | 1 | 2 | 1 | 0 | 8 | 13471 |
4 | 1 | 1 | 1 | 0 | 10 | 13511 |
5 | 1 | 0 | 1 | 0 | 13 | 13591 |
6 | 1 | 2 | 0 | 1 | 17 | 6909 |
7 | 1 | 1 | 0 | 1 | 10 | 6994 |
8 | 1 | 0 | 0 | 1 | 15 | 7134 |
9 | 1 | 2 | 0 | 0 | 11 | 20479 |
10 | 1 | 1 | 0 | 0 | 24 | 19942 |
11 | 1 | 0 | 0 | 0 | 20 | 19290 |
12 | 0 | 2 | 1 | 1 | 0 | 983 |
13 | 0 | 1 | 1 | 1 | 0 | 914 |
14 | 0 | 0 | 1 | 1 | 0 | 891 |
15 | 0 | 2 | 1 | 0 | 1 | 40556 |
16 | 0 | 1 | 1 | 0 | 0 | 40656 |
17 | 0 | 0 | 1 | 0 | 4 | 40334 |
18 | 0 | 2 | 0 | 1 | 1 | 15653 |
19 | 0 | 1 | 0 | 1 | 3 | 14692 |
20 | 0 | 0 | 0 | 1 | 2 | 13889 |
21 | 0 | 2 | 0 | 0 | 0 | 47487 |
22 | 0 | 1 | 0 | 0 | 2 | 47527 |
23 | 0 | 0 | 0 | 0 | 4 | 47077 |
inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)
with inpatient_model_pool_year:
start = find_MAP()
step = NUTS(scaling=start)
inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 105.1 sec
inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
.drop_duplicates().reset_index(drop=True))
estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
age | race | ethnicity | mean | sd | mc_error | hpd_2.5 | hpd_97.5 | median | hpd_25 | hpd_75 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | <5 | black | hispanic | 0.012 | 0.318 | 0.007 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
1 | <5 | black | non-hispanic | 26.795 | 4.774 | 0.225 | 17.589 | 36.073 | 26.370 | 23.418 | 29.633 |
2 | <5 | white | hispanic | 71.156 | 10.641 | 0.409 | 50.333 | 91.705 | 70.631 | 61.987 | 76.206 |
3 | <5 | white | non-hispanic | 31.473 | 3.894 | 0.139 | 23.883 | 38.592 | 31.386 | 29.269 | 34.573 |
4 | 5+ | black | hispanic | 0.003 | 0.079 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
5 | 5+ | black | non-hispanic | 1.267 | 0.550 | 0.021 | 0.273 | 2.238 | 1.189 | 0.749 | 1.422 |
6 | 5+ | white | hispanic | 3.815 | 1.511 | 0.066 | 1.351 | 6.796 | 3.527 | 2.091 | 3.913 |
7 | 5+ | white | non-hispanic | 1.283 | 0.534 | 0.023 | 0.410 | 2.338 | 1.193 | 0.754 | 1.419 |
generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels, index=['race','ethnicity'], columns=['age'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
age | 5+ | <5 | |
---|---|---|---|
race | ethnicity | ||
black | hispanic | 0.00 (0.00, 0.00) | 0.01 (0.00, 0.00) |
non-hispanic | 1.27 (0.27, 2.24) | 26.80 (17.59, 36.07) | |
white | hispanic | 3.81 (1.35, 6.80) | 71.16 (50.33, 91.71) |
non-hispanic | 1.28 (0.41, 2.34) | 31.47 (23.88, 38.59) |
inpatient_pool_year_samples = pd.concat(
[inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']],
pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)
inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
.replace({'under_5':{0:'5+', 1:'<5'},
'black':{0:'white', 1:'black'},
'hispanic':{0:'non-hispanic', 1:'hispanic'}})
.rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))
inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)
Plot of rates pooled by year
sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group",
data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
palette="gray_r", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x11c9eb4a8>
Import outpatient denominator information.
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()
outpatient_project = Project(api_url, api_key)
outpatients = outpatient_project.export_records(format='df')
outpatients.groupby('year').year.count()
year 2012 15969 2015 17562 Name: year, dtype: int64
outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape
(393, 15)
outpatients_kids = outpatients[(outpatients.ptage<=18)
& (outpatients.ptrace.isin(['B','W']))
& (outpatients.ptethnicity.isin(['NH', 'HL']))].copy()
outpatients_kids.ptrace.value_counts()
B 12119 W 10179 Name: ptrace, dtype: int64
outpatients_kids.ptethnicity.value_counts()
NH 18859 HL 3439 Name: ptethnicity, dtype: int64
outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)
outpatients_kids['black'] = (outpatients_kids.ptrace=='B').astype(int)
outpatients_kids['hispanic'] = (outpatients_kids.ptethnicity=='HL').astype(int)
outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
.groupby(['year', 'black', 'hispanic', 'under_5'])
.n.count()
.reset_index())
outpatient_n
year | black | hispanic | under_5 | n | |
---|---|---|---|---|---|
0 | 2012 | 0 | 0 | 0 | 2692 |
1 | 2012 | 0 | 0 | 1 | 379 |
2 | 2012 | 0 | 1 | 0 | 1262 |
3 | 2012 | 0 | 1 | 1 | 176 |
4 | 2012 | 1 | 0 | 0 | 5081 |
5 | 2012 | 1 | 0 | 1 | 455 |
6 | 2012 | 1 | 1 | 0 | 88 |
7 | 2012 | 1 | 1 | 1 | 14 |
8 | 2015 | 0 | 0 | 0 | 2276 |
9 | 2015 | 0 | 0 | 1 | 1629 |
10 | 2015 | 0 | 1 | 0 | 1129 |
11 | 2015 | 0 | 1 | 1 | 636 |
12 | 2015 | 1 | 0 | 0 | 4301 |
13 | 2015 | 1 | 0 | 1 | 2046 |
14 | 2015 | 1 | 1 | 0 | 84 |
15 | 2015 | 1 | 1 | 1 | 50 |
interp_2013 = (outpatient_n.groupby(['black','hispanic','under_5'])
.apply(lambda x: np.interp(2013, x.year, x.n))
.astype(int)
.reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013
black | hispanic | under_5 | n | year | |
---|---|---|---|---|---|
0 | 0 | 0 | 0 | 2553 | 2013 |
1 | 0 | 0 | 1 | 795 | 2013 |
2 | 0 | 1 | 0 | 1217 | 2013 |
3 | 0 | 1 | 1 | 329 | 2013 |
4 | 1 | 0 | 0 | 4821 | 2013 |
5 | 1 | 0 | 1 | 985 | 2013 |
6 | 1 | 1 | 0 | 86 | 2013 |
7 | 1 | 1 | 1 | 26 | 2013 |
outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n
black | hispanic | n | under_5 | year | |
---|---|---|---|---|---|
0 | 0 | 0 | 2692 | 0 | 0 |
1 | 0 | 0 | 379 | 1 | 0 |
2 | 0 | 1 | 1262 | 0 | 0 |
3 | 0 | 1 | 176 | 1 | 0 |
4 | 1 | 0 | 5081 | 0 | 0 |
5 | 1 | 0 | 455 | 1 | 0 |
6 | 1 | 1 | 88 | 0 | 0 |
7 | 1 | 1 | 14 | 1 | 0 |
0 | 0 | 0 | 2553 | 0 | 1 |
1 | 0 | 0 | 795 | 1 | 1 |
2 | 0 | 1 | 1217 | 0 | 1 |
3 | 0 | 1 | 329 | 1 | 1 |
4 | 1 | 0 | 4821 | 0 | 1 |
5 | 1 | 0 | 985 | 1 | 1 |
6 | 1 | 1 | 86 | 0 | 1 |
7 | 1 | 1 | 26 | 1 | 1 |
outpatient_data_full = setting_data[OUTPATIENT].drop(['setting','n'], axis=1)
outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['black','hispanic','under_5','year'])
outpatient_merged
under_5 | year | black | hispanic | cases | n | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 28 | 2692 |
1 | 0 | 0 | 0 | 1 | 68 | 1262 |
2 | 0 | 0 | 1 | 0 | 52 | 5081 |
3 | 0 | 1 | 0 | 0 | 22 | 2553 |
4 | 0 | 1 | 0 | 1 | 63 | 1217 |
5 | 0 | 1 | 1 | 0 | 35 | 4821 |
6 | 1 | 0 | 0 | 0 | 40 | 379 |
7 | 1 | 0 | 0 | 1 | 144 | 176 |
8 | 1 | 0 | 1 | 0 | 109 | 455 |
9 | 1 | 1 | 0 | 0 | 38 | 795 |
10 | 1 | 1 | 0 | 1 | 138 | 329 |
11 | 1 | 1 | 1 | 0 | 96 | 985 |
12 | 1 | 1 | 1 | 1 | 1 | 26 |
13 | 0 | 0 | 1 | 1 | 0 | 88 |
14 | 0 | 1 | 1 | 1 | 0 | 86 |
15 | 1 | 0 | 1 | 1 | 0 | 14 |
def generate_outpatient_model(outpatient_data, pool_years=False):
with Model() as model:
cases,n = outpatient_data[['cases','n']].values.T
ω = monitoring_rate[INPATIENT]
shape = outpatient_data.shape[0]
if pool_years:
assert not len(cases) % 3
shape = int(len(cases)/3)
cases = cases.reshape((pooled_shape, 3)).sum(-1)
π = Beta('π', 1, 10, shape=shape)
AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)
return model
outpatient_model_full = generate_outpatient_model(outpatient_merged)
Applied logodds-transform to π and added transformed π_logodds to model.
with outpatient_model_full:
start = find_MAP()
step = NUTS(scaling=start)
outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
[-----------------100%-----------------] 2000 of 2000 complete in 3.9 sec
plt.subplots(figsize=(8,12))
forestplot(outpatient_full_trace[1000:], varnames=['π'])
<matplotlib.gridspec.GridSpec at 0x1221b8be0>
outpatient_data_labels = inpatient_data_labels[inpatient_data_labels.year.isin(['2012/13','2013/14'])]
generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'],
index=['age', 'year'], columns=['race', 'ethnicity'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
race | black | white | |||
---|---|---|---|---|---|
ethnicity | hispanic | non-hispanic | hispanic | non-hispanic | |
age | year | ||||
5+ | 2012/13 | 973.57 (804.49, 1149.48) | 103.82 (78.73, 131.44) | 542.07 (411.50, 646.74) | 107.32 (71.36, 149.99) |
2013/14 | 539.15 (27.72, 1234.00) | 396.80 (0.10, 1126.13) | 521.78 (405.35, 652.50) | 90.36 (55.57, 128.20) | |
<5 | 2012/13 | 105.93 (0.10, 322.00) | 7748.09 (7142.60, 8349.21) | 1051.22 (756.89, 1333.33) | 74.36 (51.22, 98.51) |
2013/14 | 102.21 (0.01, 308.45) | 4084.32 (3576.56, 4616.85) | 486.26 (338.46, 627.66) | 2363.01 (1974.50, 2774.73) |
ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns
ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
ED_all = pd.concat([ED2, ED3])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]
ED_counts = ED_all[eligible_cols].sum(1).astype(int)
rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]
surveillance_counts = rotavirus_data.groupby(['Enrollment Date'])['Record ID'].count()
ED_counts.name = 'count24'
surveillance_counts.name = 'count8'
surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]
Here is what it looks like if you resample to weekly
weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()
(weekly_surveillance.count8 / weekly_surveillance.count24).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x12332b940>
We will just use the proportion of the pooled suveillance
surveillance_total = surveillance.sum()
surveillance_total
count24 3486.0 count8 1849.0 dtype: float64
surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion
0.53040734366035569
def generate_ed_model(ED_data):
with Model() as model:
data_len = ED_data.shape[0]
cases, n = ED_data[['cases','n']].values.T
weights = (np.array(monitoring_rate)[ED]
* np.array(ed_market_share[:-1])[ED_data.year]
* surveillance_proportion)
weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
testval=(cases/weights).astype(int))
weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
θ = Normal('θ', 0, sd=1000, shape=data_len)
λ = Deterministic('λ', tt.exp(θ))
age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)
return model
ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
.query('year<2')
.groupby(['under_5', 'year'])[['cases','n']]
.sum()
.reset_index())
ed_pool_race_data
under_5 | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 219 | 102191 |
1 | 0 | 1 | 203 | 103789 |
2 | 1 | 0 | 497 | 40548 |
3 | 1 | 1 | 533 | 41013 |
ed_model = generate_ed_model(ed_pool_race_data)
with ed_model:
start = find_MAP()
step = NUTS(scaling=start)
ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases [-----------------100%-----------------] 2000 of 2000 complete in 3.7 sec
forestplot(ed_trace[1000:], varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x126433f28>
ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'under_5':'age'}))
generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future return reshape(newshape, order=order)
age | 5+ | <5 |
---|---|---|
year | ||
2012/13 | 75.29 (66.11, 84.55) | 429.88 (396.08, 469.63) |
2013/14 | 70.02 (61.57, 79.29) | 464.98 (418.34, 507.64) |
Create data frame of MCMC samples for plotting
ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']],
pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)
ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
.replace({'under_5':{0:'5+', 1:'<5'},
'year':{0:'2012/13', 1:'2013/14'}})
.rename(columns={'under_5':'age group'}))
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
data=ed_rate_samples.assign(rate=ed_rate_samples.value*10000),
palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x122de23c8>
census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
.groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()
census_data_virus['YEAR'] = census_data_virus.YEAR - 2011
census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()
virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting', 'rotavirus', 'sapovirus',
'astrovorus', 'norovirus'])
['Record ID'].count()
.reset_index()
.rename(columns={'Record ID':'cases', 'astrovorus':'astrovirus'}))
virus_summary.head()
merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)
rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
.groupby(['under_5', 'year', 'setting'])
.agg({'cases':sum, 'n':max}))
sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
.groupby(['under_5', 'year', 'setting'])
.agg({'cases':sum, 'n':max})
.reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
.groupby(['under_5', 'year', 'setting'])
.agg({'cases':sum, 'n':max})
.reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
.groupby(['under_5', 'year', 'setting'])
.agg({'cases':sum, 'n':max})
.reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
noro_data.reset_index().assign(virus='norovirus'),
rota_data.reset_index().assign(virus='rotavirus'),
sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)
virus_totals.head(10)
n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup
with Model() as virus_model:
n_total = int(n_lookup.shape[0])
p_enroll = Beta('p_enroll', 1, 1, testval=0.9)
enrolled = Binomial('enrolled', n=rotavirus_data.shape[0], p=p_enroll,
observed=rotavirus_data.stool_collected.sum())
weights = (np.array(monitoring_rate)[virus_totals.setting]
* np.array(market_share)[virus_totals.year])
weighted_cases = DiscreteUniform('weighted_cases', virus_totals.cases,
virus_totals.n, shape=virus_totals.shape[0],
testval=np.maximum((virus_totals.cases.values/weights).astype(int), 1))
p_weight = Deterministic('p_weight', weights*p_enroll)
weighting = Binomial('weighting', n=weighted_cases, p=p_weight, observed=virus_totals.cases)
weighted_cases_total = Deterministic('weighted_cases_total',
weighted_cases.reshape((n_total, 3)).sum(1))
θ = Normal('θ', 0, sd=1000, shape=n_total, testval=np.ones(n_total))
λ = Deterministic('λ', tt.exp(θ))
virus_like = Potential('virus_like',
Poisson.dist(λ * (n_lookup.n/10000.)).logp(weighted_cases_total))
with virus_model:
virus_trace = sample(20000, njobs=2, random_seed=seed_numbers)
traceplot(virus_trace[10000:], varnames=['p_enroll'])
forestplot(virus_trace[10000:], varnames=['weighted_cases_total'])
virus_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']],
pd.DataFrame(virus_trace['λ'][-1000:]).T], axis=1)
virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
.replace({'under_5':{0:'5+', 1:'<5'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
.rename(columns={'under_5':'age group'}))
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
data=virus_samples.rename(columns={'value':'rate'}),
palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
col_wrap=2)
rateplot.despine(left=True)