%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')
seed_numbers = 20090425, 19700903
from datetime import timedelta, datetime
# rotavirus_data = pd.read_csv('data/AGE Year 2-4 with Final Rota.csv', low_memory=False)
rotavirus_data = pd.read_csv('data/AGE Cases and Healthy Controls Merged with Lab Results Year 2 to 4 v05.23.2017.csv')
rotavirus_data.index.is_unique
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (142,143,159,160,161,208,209,226,229,230,234,235,237,242,243,246,251,252,253,258,262,266,279,280,281,285,286,288,289,290,291,294,295,296,297,298,299,302,304,305,307,312,313,320,321,322,323,325,329,334,335,336,341,342,343,344,347,351,357,358,360,361,362,366,371,372,378,382,383,384,385,389,390,391,395,400,401,402,415,416,417,420,421,423,425,430,431,433,435,436,438,439,440,442,443,444,448,452,458,460,461,462,466,470,472,474,486,498,508,512,520,540,549,555,558,577,584,591,598,606,607,677,678,679,680,681,682,683,684,685,688,689,690,691,692,693,694,695,696,697) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
True
rotavirus_data.head()
caseid | case | settingnew | rota_od | vaxstrain | rota_genotype | noro_type | noroseq_gii_c | noroseq_gii_d | noroseq_gi_c | ... | ddxdioten1 | ddxdioten2 | ddxdioten3 | ddxdioten4 | ddxdioten5 | ddxdioten6 | ddxdioten7 | ddxdioten8 | ddxdioten9 | ddxdioten10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | EN1C0001 | 1 | 3 | 0.088 | NaN | NEG | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | EN1C0002 | 1 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | EN1C0003 | 1 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | EN1C0004 | 1 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | EN1C0006 | 1 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 698 columns
Rename columns and recode data
rotavirus_data.race.value_counts()
1 2766 2 1968 6 411 4 105 7 10 5 3 3 1 8 1 Name: race, dtype: int64
rotavirus_data['white'] = rotavirus_data['race'] == 1
rotavirus_data['black'] = rotavirus_data['race'] == 2
rotavirus_data['astro'].value_counts().index
Float64Index([0.0, 1.0], dtype='float64')
# rotavirus_data['rotavirus'] = (rotavirus_data['rotacdc'] == 'Positive').astype(float)
# rotavirus_data['sapovirus'] = (rotavirus_data['sapo'] == 'Positive').astype(float)
# rotavirus_data['astrovirus'] = (rotavirus_data['astro'] == 'Positive').astype(float)
# rotavirus_data['norovirus'] = (rotavirus_data['noro'] == 'Positive').astype(float)
# rotavirus_data.loc[rotavirus_data['rotacdc'].isnull() | (rotavirus_data.rotacdc=='Inhibition') | (rotavirus_data.specimencol=='No'),
# 'rotavirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['sapo'].isnull() | (rotavirus_data.sapo=='Inhibition') | (rotavirus_data.specimencol=='No'),
# 'sapovirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['astro'].isnull() | (rotavirus_data.astro=='Inhibition') | (rotavirus_data.specimencol=='No'),
# 'astrovirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['noro'].isnull() | (rotavirus_data.noro=='Inhibition') | (rotavirus_data.specimencol=='No'),
# 'norovirus'] = np.nan
# Use the raw lab-result columns directly as virus indicators, renamed for
# readability (the string-recode approach above is kept commented out for reference).
rotavirus_data=rotavirus_data.rename(columns={'rotacdc':'rotavirus',
                                              'sapo':'sapovirus',
                                              'astro':'astrovirus',
                                              'noro':'norovirus'})
rotavirus_data.rotavirus.value_counts()
0.0 3722 1.0 290 Name: rotavirus, dtype: int64
rotavirus_data.rotavu.value_counts()
0.0 3574 1.0 441 Name: rotavu, dtype: int64
virus_cols = ['rotavirus','sapovirus','astrovirus','norovirus']
Proportion of test results missing
rotavirus_data[virus_cols].isnull().mean()
rotavirus 0.237987 sapovirus 0.237417 astrovirus 0.237417 norovirus 0.237417 dtype: float64
rotavirus_data['scrdate'] = pd.to_datetime(rotavirus_data['scrdate'])
rotavirus_data['dob'] = pd.to_datetime(rotavirus_data['dob'])
astro_data = rotavirus_data[rotavirus_data.astrovirus==1].dropna(subset=['astro_ct'])
astro_data.shape
(159, 700)
noro_data = rotavirus_data[rotavirus_data.norovirus==1].dropna(subset=['noro_ct'])
noro_data.shape
(729, 700)
noro_data['coinfect'] = noro_data[['rotavirus', 'sapovirus', 'astrovirus']].sum(1)
astro_data['coinfect'] = astro_data[['rotavirus', 'sapovirus', 'norovirus']].sum(1)
with_coinf = noro_data.noro_ct[noro_data.coinfect==1].astype(float)
no_coinf = noro_data.noro_ct[(noro_data.coinfect==0)].astype(float)
# with_coinf = astro_data.astro_ct[astro_data.coinfect==1].astype(float)
# no_coinf = astro_data.astro_ct[(astro_data.coinfect==0)].astype(float)
from scipy.stats import ranksums
ranksums(with_coinf, no_coinf)
RanksumsResult(statistic=3.6398985691965766, pvalue=0.00027274545863325173)
rotavirus_data.sapo_ct.dropna().astype(float).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x115755438>
Calculate age, and compare to given age.
rotavirus_data['age_months'] = (rotavirus_data['scrdate']
- rotavirus_data['dob'] + timedelta(1)).astype('timedelta64[M]')
rotavirus_data.age_months.hist();
Group into ages under 5 and 5 or over
rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)
rotavirus_data['under_2'] = ((rotavirus_data.age_months / 12) < 2).astype(int)
rotavirus_data.loc[rotavirus_data.caseid=='EN1I0141', ['under_5', 'under_2']]
under_5 | under_2 | |
---|---|---|
5192 | 1 | 0 |
rotavirus_data['age_group'] = 2
rotavirus_data.loc[rotavirus_data.under_5==1, 'age_group'] = 1
rotavirus_data.loc[rotavirus_data.under_2==1, 'age_group'] = 0
rotavirus_data.age_group.value_counts()
0 2462 2 1459 1 1344 Name: age_group, dtype: int64
rotavirus_data.year.value_counts()
2 1989 3 1740 4 1536 Name: year, dtype: int64
Recode the study-year variable (coded 2–4) to a zero-based index (0–2)
rotavirus_data['year'] = rotavirus_data.year - 2
Rename setting variable
assert not rotavirus_data.settingnew.isnull().sum()
rotavirus_data['setting'] = rotavirus_data.settingnew - 1
# Indices to each value
INPATIENT, ED, OUTPATIENT, HC = 0, 1, 2, 3
# rotavirus_data['setting'] = rotavirus_data.provider.replace({1:INPATIENT, 2:ED, 3:OUTPATIENT, 4:HC})
Recode sex and stool
assert not rotavirus_data['specimencol'].isnull().sum()
rotavirus_data['male'] = rotavirus_data['sex']==1
rotavirus_data['stool_collected'] = rotavirus_data['specimencol']==1
Recode insurance
rotavirus_data.insurance.value_counts()
1 4599 2 500 3 84 8 44 4 38 Name: insurance, dtype: int64
# Insurance recode — presumably 1=public, 2=private, 3=both, 8=unknown
# (inferred from the overlapping isin sets below; TODO confirm against the codebook).
rotavirus_data['public_ins'] = rotavirus_data.insurance.isin({1, 3})
rotavirus_data['private_ins'] = rotavirus_data.insurance.isin({2, 3})
# Treat unknown insurance (code 8) as missing. NOTE: assigning NaN into these
# boolean columns upcasts them from bool to object dtype.
rotavirus_data.loc[rotavirus_data.insurance==8, ['public_ins', 'private_ins']] = np.nan
# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['intvwdate', 'dob', 'provider', 'coldat', 'tdat',
'insurance', 'sex', 'hispcr', 'mdegree_7', 'rvacver_7',
'rvacver_7r', 'race', 'race2_8'], axis=1)
Summary of counts by setting
rotavirus_data.shape
(5265, 696)
Summary of missing values
rotavirus_data.isnull().sum()
caseid 0 case 0 settingnew 0 rota_od 5085 vaxstrain 5264 rota_genotype 4826 noro_type 4535 noroseq_gii_c 4772 noroseq_gii_d 4772 noroseq_gi_c 5022 noroseq_gi_d 5037 year 0 rotavirus 1253 rotavu 1250 sapovirus 1250 sapo_ct 4917 astrovirus 1250 astro_ct 5106 norovirus 1250 noro_ct 4536 studysite 0 hospital 0 abiniscr 0 deiniscr 0 scrdate 0 admitdate 0 admittime 13 agedays 0 agemonths 0 ageyears 0 ... ddxdcten2 5198 ddxdcten3 5235 ddxdcten4 5254 ddxdcten5 5259 ddxdcten6 5263 ddxdcten7 5264 ddxdcten8 5264 ddxdcten9 5265 ddxdcten10 5265 ddxdioten1 5258 ddxdioten2 5259 ddxdioten3 5261 ddxdioten4 5261 ddxdioten5 5263 ddxdioten6 5263 ddxdioten7 5263 ddxdioten8 5264 ddxdioten9 5264 ddxdioten10 5264 white 0 black 0 age_months 0 under_5 0 under_2 0 age_group 0 setting 0 male 0 stool_collected 0 public_ins 44 private_ins 44 Length: 696, dtype: int64
Aggregate data by age group and setting
rotavirus_summary = (rotavirus_data.drop('scrdate', axis=1)
.groupby(['age_group', 'year', 'setting', 'black'])['caseid'].count()
.reset_index()
.rename(columns={'caseid':'cases'}))
rotavirus_summary.head()
age_group | year | setting | black | cases | |
---|---|---|---|---|---|
0 | 0 | 0 | 0 | False | 35 |
1 | 0 | 0 | 0 | True | 16 |
2 | 0 | 0 | 1 | False | 173 |
3 | 0 | 0 | 1 | True | 137 |
4 | 0 | 0 | 2 | False | 119 |
The key for AGEGRP is as follows:
davidson_census = pd.read_csv('data/davidson_county.csv')
davidson_census.head()
SUMLEV | STATE | COUNTY | STNAME | CTYNAME | YEAR2008 | YEAR | AGEGRP | TOT_POP | TOT_MALE | ... | HWAC_MALE | HWAC_FEMALE | HBAC_MALE | HBAC_FEMALE | HIAC_MALE | HIAC_FEMALE | HAAC_MALE | HAAC_FEMALE | HNAC_MALE | HNAC_FEMALE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 0 | 626681 | 303540 | ... | 30400 | 24829 | 2178 | 2006 | 1501 | 1183 | 381 | 374 | 271 | 211 |
1 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 1 | 44691 | 22868 | ... | 3838 | 3664 | 302 | 303 | 185 | 189 | 63 | 53 | 26 | 35 |
2 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 2 | 37613 | 18977 | ... | 2910 | 2826 | 266 | 205 | 145 | 140 | 45 | 50 | 22 | 23 |
3 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 3 | 33904 | 17479 | ... | 2154 | 1890 | 166 | 170 | 105 | 91 | 25 | 29 | 18 | 16 |
4 | 50 | 47 | 37 | Tennessee | Davidson County | 1 | 2008 | 4 | 38999 | 19674 | ... | 2029 | 1783 | 166 | 174 | 111 | 82 | 29 | 29 | 22 | 16 |
5 rows × 81 columns
Number under 18 and under 5 from 2010 census:
under_18_2010 = 136612
under_5_2010 = 44493
over_4_under_18 = under_18_2010 - under_5_2010
over_4_under_18
92119
Children 5-19 from census dataset:
under_20 = davidson_census.loc[(davidson_census.YEAR==2010) & davidson_census.AGEGRP.isin([2,3,4]), 'TOT_POP'].sum()
under_20
110614
Proportion over 4 but also under 18:
pct_under_18 = over_4_under_18 / under_20
pct_under_18
0.8327969334803913
Identify racial/ethnic subgroup populations
census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['WHITE'] = census_data_child[['WA_MALE', 'WA_FEMALE']].sum(axis=1)
census_data_child['BLACK'] = census_data_child[['BA_MALE', 'BA_FEMALE']].sum(axis=1)
census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
.groupby(['under_5','YEAR'])[['WHITE', 'BLACK']]
.sum()).reset_index()
census_data['year'] = census_data.YEAR - 2011
census_data
under_5 | YEAR | WHITE | BLACK | year | |
---|---|---|---|---|---|
0 | False | 2011 | 60966 | 41225 | 0 |
1 | False | 2012 | 62219 | 41570 | 1 |
2 | False | 2013 | 63140 | 41539 | 2 |
3 | True | 2011 | 26424 | 14124 | 0 |
4 | True | 2012 | 26936 | 14077 | 1 |
5 | True | 2013 | 27388 | 14023 | 2 |
Correct over-4 counts
census_data.loc[~census_data.under_5, ['WHITE', 'BLACK']] *= pct_under_18
Add "under 2" category that is 40% of under_5
census_data['under_2'] = False
census_under_2 = census_data[census_data.under_5].copy()
census_under_2.under_2 = True
census_under_2.under_5 = False
census_under_2.WHITE = (census_under_2.WHITE*0.4).astype(int)
census_under_2.BLACK = (census_under_2.BLACK*0.4).astype(int)
census_under_2
under_5 | YEAR | WHITE | BLACK | year | under_2 | |
---|---|---|---|---|---|---|
3 | False | 2011 | 10569 | 5649 | 0 | True |
4 | False | 2012 | 10774 | 5630 | 1 | True |
5 | False | 2013 | 10955 | 5609 | 2 | True |
Subtract under-2's from under-5's
census_data.loc[census_data.under_5, ['WHITE', 'BLACK']] -= census_under_2[['WHITE', 'BLACK']]
census_data = (pd.concat([census_data, census_under_2])
.reset_index(drop=True)[['under_2', 'under_5', 'year', 'WHITE', 'BLACK']]
.rename(columns={'under_5':'2_to_4'}))
census_data
under_2 | 2_to_4 | year | WHITE | BLACK | |
---|---|---|---|---|---|
0 | False | False | 0 | 50772.297847 | 34332.053583 |
1 | False | False | 1 | 51815.792404 | 34619.368525 |
2 | False | False | 2 | 52582.798380 | 34593.551820 |
3 | False | True | 0 | 15855.000000 | 8475.000000 |
4 | False | True | 1 | 16162.000000 | 8447.000000 |
5 | False | True | 2 | 16433.000000 | 8414.000000 |
6 | True | False | 0 | 10569.000000 | 5649.000000 |
7 | True | False | 1 | 10774.000000 | 5630.000000 |
8 | True | False | 2 | 10955.000000 | 5609.000000 |
census_data[['WHITE', 'BLACK']] = census_data[['WHITE', 'BLACK']].astype(int)
census_data
under_2 | 2_to_4 | year | WHITE | BLACK | |
---|---|---|---|---|---|
0 | False | False | 0 | 50772 | 34332 |
1 | False | False | 1 | 51815 | 34619 |
2 | False | False | 2 | 52582 | 34593 |
3 | False | True | 0 | 15855 | 8475 |
4 | False | True | 1 | 16162 | 8447 |
5 | False | True | 2 | 16433 | 8414 |
6 | True | False | 0 | 10569 | 5649 |
7 | True | False | 1 | 10774 | 5630 |
8 | True | False | 2 | 10955 | 5609 |
census_data['age_group'] = 2
census_data.loc[census_data['under_2'], 'age_group'] = 0
census_data.loc[census_data['2_to_4'], 'age_group'] = 1
census_data
under_2 | 2_to_4 | year | WHITE | BLACK | age_group | |
---|---|---|---|---|---|---|
0 | False | False | 0 | 50772 | 34332 | 2 |
1 | False | False | 1 | 51815 | 34619 | 2 |
2 | False | False | 2 | 52582 | 34593 | 2 |
3 | False | True | 0 | 15855 | 8475 | 1 |
4 | False | True | 1 | 16162 | 8447 | 1 |
5 | False | True | 2 | 16433 | 8414 | 1 |
6 | True | False | 0 | 10569 | 5649 | 0 |
7 | True | False | 1 | 10774 | 5630 | 0 |
8 | True | False | 2 | 10955 | 5609 | 0 |
Reshape data
melted_census = (pd.melt(census_data, id_vars=['age_group', 'year'],
value_vars=['WHITE', 'BLACK']))
census_clean = (melted_census.assign(black=(melted_census.variable=='BLACK').astype(int))
.drop('variable', axis=1)
.rename(columns={'value':'n'}))
census_clean
age_group | year | n | black | |
---|---|---|---|---|
0 | 2 | 0 | 50772 | 0 |
1 | 2 | 1 | 51815 | 0 |
2 | 2 | 2 | 52582 | 0 |
3 | 1 | 0 | 15855 | 0 |
4 | 1 | 1 | 16162 | 0 |
5 | 1 | 2 | 16433 | 0 |
6 | 0 | 0 | 10569 | 0 |
7 | 0 | 1 | 10774 | 0 |
8 | 0 | 2 | 10955 | 0 |
9 | 2 | 0 | 34332 | 1 |
10 | 2 | 1 | 34619 | 1 |
11 | 2 | 2 | 34593 | 1 |
12 | 1 | 0 | 8475 | 1 |
13 | 1 | 1 | 8447 | 1 |
14 | 1 | 2 | 8414 | 1 |
15 | 0 | 0 | 5649 | 1 |
16 | 0 | 1 | 5630 | 1 |
17 | 0 | 2 | 5609 | 1 |
census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])
merged_data = (rotavirus_summary.merge(census_full, on=['age_group', 'year', 'black', 'setting'], how='right')
.fillna(0).astype(int))
VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health
inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.60, 0.59, 0.62
Y2012, Y2013, Y2014 = 0, 1, 2
Will estimate 2014 market shares by extrapolating a linear fit to the previous years.
from sklearn import linear_model
inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share
[0.9, 0.83, 0.77500000000000002]
ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share
[0.59, 0.62, 0.63]
Enrollment days/week for inpatient, outpatient, ED
# Enrollment days/week. The heading above says the order is
# (inpatient, outpatient, ED), and the 5.5-day intake divisor sits at index 1
# (outpatient under that ordering). NOTE(review): settings elsewhere are
# indexed INPATIENT=0, ED=1, OUTPATIENT=2, so monitoring_rate[OUTPATIENT]
# picks 4/7 rather than 4/5.5 — confirm which ordering is intended.
monitoring_days = 7, 4, 4
# Only 5.5 days of intake on outpatient
monitoring_rate = np.array(monitoring_days)/np.array([7, 5.5, 7])
monitoring_rate
array([ 1. , 0.72727273, 0.57142857])
rotavirus_data[(rotavirus_data.white==1) & (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
(rotavirus_data.under_5==1)].shape
(202, 696)
setting_data = dict(list(merged_data.groupby('setting')))
from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential, Metropolis
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform, Uniform, Flat, Gamma
from pymc3 import traceplot, forestplot, summary, df_summary
import theano.tensor as tt
setting_data[INPATIENT].drop('setting', axis=1).head()
age_group | year | black | cases | n | |
---|---|---|---|---|---|
0 | 0 | 0 | 0 | 35 | 10569 |
1 | 0 | 0 | 1 | 16 | 5649 |
6 | 0 | 1 | 0 | 40 | 10774 |
7 | 0 | 1 | 1 | 13 | 5630 |
12 | 0 | 2 | 0 | 40 | 10955 |
inpatient_data = setting_data[INPATIENT].drop('setting', axis=1)
inpatient_data[['cases', 'n']]
cases | n | |
---|---|---|
0 | 35 | 10569 |
1 | 16 | 5649 |
6 | 40 | 10774 |
7 | 13 | 5630 |
12 | 40 | 10955 |
13 | 13 | 5609 |
18 | 18 | 15855 |
19 | 7 | 8475 |
24 | 15 | 16162 |
25 | 3 | 8447 |
30 | 7 | 16433 |
31 | 3 | 8414 |
36 | 12 | 50772 |
37 | 8 | 34332 |
42 | 14 | 51815 |
43 | 1 | 34619 |
48 | 5 | 52582 |
49 | 2 | 34593 |
def generate_inpatient_model(inpatient_data, pool_years=False):
    """Build a PyMC3 model of inpatient AGE case rates.

    Parameters
    ----------
    inpatient_data : DataFrame with 'cases' (observed enrolled cases),
        'n' (population denominator) and 'year' columns, one row per stratum.
    pool_years : bool
        If True, sum the latent weighted cases over the 3 study years within
        each stratum and estimate one rate per pooled stratum.

    Returns
    -------
    The (unsampled) pymc3 Model; per-stratum rates are exposed as 'λ'.
    """
    # Fail fast on missing values — the likelihood terms below cannot take NaN.
    assert not inpatient_data.isnull().sum().any()
    with Model() as model:
        cases, n = inpatient_data[['cases', 'n']].values.T
        # Cases weighted by monitoring effort and market share
        # (market share is indexed by each row's year)
        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]
        # Uniform prior on weighted (latent true) cases: at least the observed
        # count, at most the whole population
        weighted_cases = Uniform('weighted_cases', cases, n, shape=inpatient_data.shape[0])
        # testval=(cases/weights).astype(int))
        # Likelihood of observed cases: a thinned binomial draw from the latent cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
        if pool_years:
            # Rows must come in consecutive groups of 3 (one per study year)
            assert not len(cases) % 3
            pooled_shape = int(len(cases)/3)
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            # Log-rate with a vague normal prior
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)
            λ = Deterministic('λ', tt.exp(θ))
            # NOTE(review): the denominator uses only the first year's n per
            # stratum ([:, 0]), not the 3-year sum — confirm this is intended.
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
        else:
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])
            # Poisson rate parameter
            λ = Deterministic('λ', tt.exp(θ))
            # Poisson likelihood on the latent weighted cases
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
    return model
inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))
iterations = 1000
tune = 1000
with inpatient_model_full:
inpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler... Initializing NUTS using advi+adapt_diag... Average Loss = 29,588: 6%|▌ | 12270/200000 [00:01<00:21, 8539.43it/s] Convergence archived at 12900 Interrupted at 12,900 [6%]: Average Loss = 1.7097e+05 100%|██████████| 2000/2000 [00:05<00:00, 398.56it/s]
forestplot(inpatient_full_trace, varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x11e9b5400>
label_map = {'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
'black':{0:'white', 1:'black'},
'setting':{0:'inpatient', 1:'ED', 2:'outpatient'}}
inpatient_data_labels = (setting_data[INPATIENT][['age_group', 'year', 'black']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'black':'race', 'age_group':'age'}))
Rate factor for population
rate_factor = 10000
rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)
from pymc3 import hpd
def _hpd_df(x, alpha):
    """Return the (1 - alpha) HPD interval of *x* as a two-column DataFrame.

    Column names encode the tail percentiles, e.g. alpha=0.5 gives
    'hpd_25' and 'hpd_75'.
    """
    lower_pct = 100 * alpha / 2
    upper_pct = 100 * (1 - alpha / 2)
    interval_labels = ['hpd_{0:g}'.format(lower_pct),
                       'hpd_{0:g}'.format(upper_pct)]
    return pd.DataFrame(hpd(x, alpha), columns=interval_labels)
estimates = (df_summary(inpatient_full_trace, varnames=['λ'],
stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
trace, labels = inpatient_full_trace, inpatient_data_labels
def generate_table(trace, labels, index, columns, varnames=None):
    """Summarize posterior rates as formatted 'mean (lo, hi)' strings.

    Parameters
    ----------
    trace : PyMC3 MultiTrace containing the rate variable(s).
    labels : DataFrame of category labels, one row per element of the traced
        rate vector, in the same order.
    index, columns : column-name lists forwarded to pandas.pivot_table.
    varnames : list of variable names to summarize; defaults to ['λ'].

    Returns
    -------
    DataFrame of strings "mean (hpd_2.5, hpd_97.5)", scaled by the global
    rate_factor.
    """
    # Sentinel instead of a mutable default argument (shared across calls).
    if varnames is None:
        varnames = ['λ']
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
                                              names=labels.columns)
    # Posterior summary scaled to rates per rate_factor population
    rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                               .reset_index(),
                               index=index, columns=columns)
    # Interleave the point estimate with its HPD bounds, cell by cell
    value_iterator = zip(pivot_all['mean'].values.flatten(),
                         pivot_all['hpd_2.5'].values.flatten(),
                         pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
                              pivot_all['mean'].shape)
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)
inpatient_full_table = generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'],
columns=['race']).sort_index()
inpatient_full_table.to_excel('results/inpatient_full.xlsx')
inpatient_full_table
race | black | white | |
---|---|---|---|
age | year | ||
0-1 | 2012/13 | 31.52 (18.20, 45.72) | 36.87 (25.41, 49.71) |
2013/14 | 28.04 (14.19, 42.66) | 44.75 (31.68, 58.47) | |
2014/15 | 30.38 (15.63, 47.14) | 46.84 (31.71, 60.66) | |
2-4 | 2012/13 | 9.55 (3.55, 16.46) | 12.79 (6.98, 18.59) |
2013/14 | 4.74 (0.68, 9.93) | 11.22 (5.79, 16.97) | |
2014/15 | 4.97 (0.67, 10.40) | 5.61 (2.11, 9.92) | |
5+ | 2012/13 | 2.65 (1.04, 4.32) | 2.66 (1.32, 4.13) |
2013/14 | 0.47 (0.00, 1.32) | 3.29 (1.73, 5.00) | |
2014/15 | 0.86 (0.02, 1.98) | 1.27 (0.37, 2.36) |
inpatient_rate_samples = pd.concat(
[inpatient_data_labels,
pd.DataFrame(inpatient_full_trace['λ']).T], axis=1)
inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race']))
Plot of rates broken down by year/age/race/ethnicity.
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", col='race',
hue_order=['0-1','2-4','5+'],
data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*rate_factor),
palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x1174e4fd0>
inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
.groupby(['age_group', 'year'])[['cases','n']]
.sum()
.reset_index())
inpatient_pool_race_data
age_group | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 51 | 16218 |
1 | 0 | 1 | 53 | 16404 |
2 | 0 | 2 | 53 | 16564 |
3 | 1 | 0 | 25 | 24330 |
4 | 1 | 1 | 18 | 24609 |
5 | 1 | 2 | 10 | 24847 |
6 | 2 | 0 | 20 | 85104 |
7 | 2 | 1 | 15 | 86434 |
8 | 2 | 2 | 7 | 87175 |
inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)
with inpatient_model_pool_race:
inpatient_pool_race_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler... Initializing NUTS using advi+adapt_diag... Average Loss = 35,642: 6%|▌ | 11602/200000 [00:01<00:24, 7698.74it/s] Convergence archived at 12500 Interrupted at 12,500 [6%]: Average Loss = 1.7517e+05 100%|██████████| 2000/2000 [00:03<00:00, 513.05it/s]
inpatient_pool_race_data_labels = (inpatient_pool_race_data[['age_group', 'year']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'age_group':'age'}))
inpatient_pool_race_table = generate_table(inpatient_pool_race_trace,
inpatient_pool_race_data_labels,
index=['year'], columns=['age'])
inpatient_pool_race_table.to_excel('results/inpatient_pool_race.xlsx')
inpatient_pool_race_table
age | 0-1 | 2-4 | 5+ |
---|---|---|---|
year | |||
2012/13 | 35.01 (25.59, 44.07) | 11.47 (7.67, 15.95) | 2.58 (1.55, 3.81) |
2013/14 | 38.83 (29.17, 49.64) | 8.91 (4.83, 13.09) | 2.08 (1.15, 3.22) |
2014/15 | 41.46 (31.25, 53.61) | 5.19 (2.13, 8.26) | 1.06 (0.39, 1.83) |
inpatient_pool_race_samples = pd.concat(
[inpatient_pool_race_data[['age_group', 'year']].reset_index(drop=True),
pd.DataFrame(inpatient_pool_race_trace['λ']).T], axis=1)
inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples,
id_vars=['age_group', 'year'])
.replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
.rename(columns={'age_group':'age group'}))
Plot of rates pooled by ethnicity
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*rate_factor),
palette="gray", linewidth=0.6, fliersize=0).set_xlabel('year')
<matplotlib.text.Text at 0x1209bada0>
inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
.sort_values(by=['age_group', 'black', 'year'], ascending=False)).reset_index(drop=True)
inpatient_pool_year_data
age_group | year | black | cases | n | |
---|---|---|---|---|---|
0 | 2 | 2 | 1 | 2 | 34593 |
1 | 2 | 1 | 1 | 1 | 34619 |
2 | 2 | 0 | 1 | 8 | 34332 |
3 | 2 | 2 | 0 | 5 | 52582 |
4 | 2 | 1 | 0 | 14 | 51815 |
5 | 2 | 0 | 0 | 12 | 50772 |
6 | 1 | 2 | 1 | 3 | 8414 |
7 | 1 | 1 | 1 | 3 | 8447 |
8 | 1 | 0 | 1 | 7 | 8475 |
9 | 1 | 2 | 0 | 7 | 16433 |
10 | 1 | 1 | 0 | 15 | 16162 |
11 | 1 | 0 | 0 | 18 | 15855 |
12 | 0 | 2 | 1 | 13 | 5609 |
13 | 0 | 1 | 1 | 13 | 5630 |
14 | 0 | 0 | 1 | 16 | 5649 |
15 | 0 | 2 | 0 | 40 | 10955 |
16 | 0 | 1 | 0 | 40 | 10774 |
17 | 0 | 0 | 0 | 35 | 10569 |
inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)
with inpatient_model_pool_year:
inpatient_pool_year_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler... Initializing NUTS using advi+adapt_diag... Average Loss = 10,723: 8%|▊ | 15014/200000 [00:01<00:19, 9712.66it/s] Convergence archived at 15900 Interrupted at 15,900 [7%]: Average Loss = 1.1632e+05 100%|██████████| 2000/2000 [00:11<00:00, 174.10it/s]
inpatient_pool_year_labels = (inpatient_pool_year_data[['age_group', 'black']]
.reset_index(drop=True)
.replace(label_map)
.rename(columns={'black':'race', 'age_group':'age'})
.drop_duplicates().reset_index(drop=True))
estimates_pooled = (df_summary(inpatient_pool_year_trace, varnames=['λ'],
stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)
age | race | mean | sd | mc_error | hpd_2.5 | hpd_97.5 | median | hpd_25 | hpd_75 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 5+ | black | 4.085 | 1.166 | 0.022 | 1.902 | 6.384 | 3.978 | 3.185 | 4.703 |
1 | 5+ | white | 7.109 | 1.272 | 0.028 | 4.769 | 9.627 | 7.055 | 6.281 | 7.990 |
2 | 2-4 | black | 19.782 | 5.080 | 0.118 | 10.649 | 29.775 | 19.245 | 15.546 | 22.065 |
3 | 2-4 | white | 29.073 | 4.485 | 0.090 | 20.202 | 37.472 | 28.845 | 25.940 | 31.733 |
4 | 0-1 | black | 90.983 | 14.169 | 0.337 | 64.011 | 119.772 | 90.464 | 80.813 | 98.960 |
5 | 0-1 | white | 126.747 | 11.787 | 0.277 | 106.351 | 151.540 | 126.593 | 120.010 | 136.382 |
inpatient_pool_year_table = generate_table(inpatient_pool_year_trace,
inpatient_pool_year_labels,
index=['race'], columns=['age'])
inpatient_pool_year_table.to_excel('results/inpatient_pool_year.xlsx')
inpatient_pool_year_table
age | 0-1 | 2-4 | 5+ |
---|---|---|---|
race | |||
black | 90.98 (64.01, 119.77) | 19.78 (10.65, 29.78) | 4.08 (1.90, 6.38) |
white | 126.75 (106.35, 151.54) | 29.07 (20.20, 37.47) | 7.11 (4.77, 9.63) |
inpatient_pool_year_samples = pd.concat(
[inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['age_group', 'black']],
pd.DataFrame(inpatient_pool_year_trace['λ']).T], axis=1)
inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['age_group', 'black'])
.replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
'black':{0:'white', 1:'black'}})
.rename(columns={'black':'race', 'age_group':'age group'}))
Plot of rates pooled by year
sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race", y="rate", hue="age group",
data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*rate_factor),
palette="gray_r", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x11004c828>
Import outpatient denominator information from REDCap
import redcap
import getpass
local_auth = True
api_url = 'https://redcap.vanderbilt.edu/api/'
if local_auth:
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()
else:
api_key = getpass.getpass()
outpatient_project = redcap.Project(api_url, api_key)
outpatients = outpatient_project.export_records(format='df')
outpatients.groupby('year').year.count()
year 2012 15969 2015 17562 Name: year, dtype: int64
outpatients.shape
(33531, 15)
outpatients.head()
mrn | year | dateextracted | timeextracted | ptname | ptdob | ptsex | ptrace | ptethnicity | ptage | ptcounty | datetosubstract | ageinjan | datetosubstract2 | ageinnov | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
record_id | |||||||||||||||
A1 | 9030842 | 2012 | 4/29/2016 | 10:16 | VEACH, ELIJAH | 1993-11-29 | M | W | NH | 22.42 | Davidson-TN | 1/1/2012 | 18.088980 | 11/30/2012 | 19.003422 |
A2 | 11416534 | 2012 | 4/29/2016 | 10:16 | MCGRAW, TRINEN TYRONE | 1998-09-04 | M | B | NH | 17.65 | Davidson-TN | 1/1/2012 | 13.325120 | 11/30/2012 | 14.239562 |
A3 | 12265690 | 2012 | 4/29/2016 | 10:16 | RANDOLPH, JONATHAN THOMAS | 1996-07-30 | M | W | NH | 19.75 | Robertson - TN | 1/1/2012 | 15.422313 | 11/30/2012 | 16.336756 |
A4 | 12270971 | 2012 | 4/29/2016 | 10:16 | POINTER, LAQUINCAS L | 1997-02-08 | M | B | NaN | 19.22 | Davidson-TN | 1/1/2012 | 14.893908 | 11/30/2012 | 15.808350 |
A5 | 12750493 | 2012 | 4/29/2016 | 10:16 | HUMPHREY, NORMA | 1995-12-15 | F | B | NaN | 20.37 | Davidson-TN | 1/1/2012 | 16.046543 | 11/30/2012 | 16.960986 |
outpatients[['ageinjan', 'ageinnov']].hist(bins=20)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12b8f0550>, <matplotlib.axes._subplots.AxesSubplot object at 0x128b06be0>]], dtype=object)
Going to assume that the outlier ages above were mistakenly entered as months.
outliers = outpatients.ageinjan > 20
outpatients.loc[outliers, ['ageinjan', 'ageinnov']] = outpatients.loc[outliers, ['ageinjan', 'ageinnov']] / 12
outpatients[['ageinjan', 'ageinnov']].hist(bins=20)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x128a12160>, <matplotlib.axes._subplots.AxesSubplot object at 0x1289ed898>]], dtype=object)
outpatients['age'] = np.maximum(outpatients[['ageinjan', 'ageinnov']].mean(1), 0.1)
outpatients_kids = outpatients[outpatients.age<18].copy()
outpatients_kids['under_5'] = (outpatients_kids.age<5).astype(int)
outpatients_kids['under_2'] = (outpatients_kids.age<2).astype(int)
outpatients_kids['age_group'] = 2
outpatients_kids.loc[outpatients_kids.under_5==1, 'age_group'] = 1
outpatients_kids.loc[outpatients_kids.under_2==1, 'age_group'] = 0
outpatients_kids.groupby('year').age.hist(alpha=0.3)
year 2012 Axes(0.125,0.125;0.775x0.755) 2015 Axes(0.125,0.125;0.775x0.755) Name: age, dtype: object
outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
.groupby(['year', 'age_group'])
.n.count()
.reset_index())
outpatient_n
year | age_group | n | |
---|---|---|---|
0 | 2012 | 0 | 2889 |
1 | 2012 | 1 | 4446 |
2 | 2012 | 2 | 8605 |
3 | 2015 | 0 | 3054 |
4 | 2015 | 1 | 4307 |
5 | 2015 | 2 | 9870 |
Interpolate 2013
interp_2013 = (outpatient_n.groupby(['age_group'])
.apply(lambda x: np.interp(2013, x.year, x.n))
.astype(int)
.reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013
age_group | n | year | |
---|---|---|---|
0 | 0 | 2944 | 2013 |
1 | 1 | 4399 | 2013 |
2 | 2 | 9026 | 2013 |
Drop 2015 and recode year
outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n
age_group | n | year | |
---|---|---|---|
0 | 0 | 2889 | 0 |
1 | 1 | 4446 | 0 |
2 | 2 | 8605 | 0 |
0 | 0 | 2944 | 1 |
1 | 1 | 4399 | 1 |
2 | 2 | 9026 | 1 |
outpatient_data_full = setting_data[OUTPATIENT].groupby(['age_group','year']).cases.sum().reset_index()
outpatient_data_full
age_group | year | cases | |
---|---|---|---|
0 | 0 | 0 | 187 |
1 | 0 | 1 | 185 |
2 | 0 | 2 | 179 |
3 | 1 | 0 | 120 |
4 | 1 | 1 | 119 |
5 | 1 | 2 | 116 |
6 | 2 | 0 | 152 |
7 | 2 | 1 | 124 |
8 | 2 | 2 | 92 |
outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['age_group','year'])
outpatient_merged
age_group | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 187 | 2889 |
1 | 0 | 1 | 185 | 2944 |
2 | 1 | 0 | 120 | 4446 |
3 | 1 | 1 | 119 | 4399 |
4 | 2 | 0 | 152 | 8605 |
5 | 2 | 1 | 124 | 9026 |
def generate_outpatient_model(outpatient_data, pool_years=False):
    """Build (and return, unsampled) a PyMC3 model of outpatient AGE rates.

    outpatient_data : DataFrame with one row per stratum, containing
        integer 'cases' (observed enrolled cases) and 'n' (population
        denominator) columns; must contain no missing values.
    pool_years : when True, weighted cases are summed across consecutive
        groups of 3 rows (assumed to be the 3 study years of one stratum)
        and a single rate is estimated per pooled stratum.
    """
    assert not outpatient_data.isnull().sum().any()
    with Model() as model:
        cases,n = outpatient_data[['cases','n']].values.T
        shape = outpatient_data.shape[0]
        # Cases weighted by monitoring effort
        weights = monitoring_rate[OUTPATIENT]
        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=shape)
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
        if pool_years:
            # Rows must come in complete 3-year groups for pooling
            assert not len(cases) % 3
            pooled_shape = int(len(cases)/3)
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)
            λ = Deterministic('λ', tt.exp(θ))
            # NOTE(review): the denominator uses only the first year's n of
            # each triple — confirm this is intended rather than a summed n
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
        else:
            θ = Normal('θ', 0, sd=1000, shape=outpatient_data.shape[0])
            # Poisson rate parameter
            λ = Deterministic('λ', tt.exp(θ))
            # Poisson likelihood
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
    return model
# Fit the full (unpooled) outpatient model
outpatient_model_full = generate_outpatient_model(outpatient_merged)
with outpatient_model_full:
    outpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler... Initializing NUTS using advi+adapt_diag... Average Loss = 1,053: 6%|▌ | 11993/200000 [00:01<00:15, 12142.93it/s] Convergence archived at 12800 Interrupted at 12,800 [6%]: Average Loss = 7,340.5 100%|██████████| 2000/2000 [00:03<00:00, 535.76it/s]
# Human-readable stratum labels for the outpatient table.
# NOTE(review): labels are derived from inpatient_pool_race_data and then
# filtered to the two years present in outpatient_merged — confirm the row
# order matches outpatient_merged's strata.
outpatient_data_labels = (inpatient_pool_race_data[['age_group', 'year']]
                          .reset_index(drop=True)
                          .replace(label_map)
                          .rename(columns={'age_group':'age'}))
outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])]
outpatient_data_labels.columns = ['age', 'year']
outpatient_data_labels
age | year | |
---|---|---|
0 | 0-1 | 2012/13 |
1 | 0-1 | 2013/14 |
3 | 2-4 | 2012/13 |
4 | 2-4 | 2013/14 |
6 | 5+ | 2012/13 |
7 | 5+ | 2013/14 |
# Posterior summary table of outpatient rates (λ), written to Excel
outpatient_full_table = generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['λ'],
                                       index=['age'], columns=['year'])
outpatient_full_table.to_excel('results/outpatient_pool_race.xlsx')
outpatient_full_table
year | 2012/13 | 2013/14 |
---|---|---|
age | ||
0-1 | 1132.37 (975.65, 1297.42) | 1097.63 (949.24, 1257.14) |
2-4 | 472.92 (391.60, 562.46) | 472.22 (393.97, 549.70) |
5+ | 308.94 (261.60, 358.79) | 240.12 (199.26, 288.89) |
# Long-format λ samples per stratum, relabeled for plotting
outpatient_full_samples = pd.concat(
    [outpatient_merged[['age_group', 'year']].reset_index(drop=True),
     pd.DataFrame(outpatient_full_trace['λ']).T], axis=1)
outpatient_full_samples = (pd.melt(outpatient_full_samples,
                                   id_vars=['age_group', 'year'])
                           .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                           .rename(columns={'age_group':'age group'}))
sns.set_context(font_scale=1.5)
# Boxplot of per-capita rates (λ scaled by rate_factor)
sns.boxplot(x="year", y="rate", hue="age group",
            data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x128642710>
# Daily ED census denominators for study years 2-4 (Excel exports; layouts
# differ per year, so columns are aligned to the year-3 names)
ED2 = pd.read_excel('data/FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('data/Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns
ED4 = pd.read_excel('data/Year 4 denominators.xlsx', index_col=0)
# Impute missing year-4 days with the column means
ED4 = ED4.fillna(ED4.mean())
ED4.columns = ED3.columns
# Year-2 totals, split into an under-5 scalar and a 5-17 two-column sub-Series
ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
# Year-3 totals, same under-5 / 5-17 split as year 2
ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
# Year-4 totals, same under-5 / 5-17 split as years 2 and 3.
ED4_total = ED4.sum()
# BUG FIX: the 5-17 eligibility columns were previously taken from ED3_total
# (copy-paste error); they now come from ED4_total like the rest of this cell.
ED4_eligible = (ED4_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED4_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED4_visits = (ED4_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED4_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
# Pool the three years of ED census; daily 24-hour eligible count is the sum
# across all 'Eligible' age columns
ED_all = pd.concat([ED2, ED3, ED4])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]
ED_counts = ED_all[eligible_cols].sum(1).astype(int)
rotavirus_ed = rotavirus_data[rotavirus_data.setting==ED]
# Daily enrollment counts (8-hour surveillance window) by screening date
surveillance_counts = rotavirus_data.groupby(['scrdate'])['caseid'].count()
ED_counts.name = 'count24'
surveillance_counts.name = 'count8'
surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]
Here is what it looks like if you resample to weekly
# Weekly ratio of 8-hour to 24-hour counts, for comparison with the pooled
# proportion used below.
# NOTE(review): reindex only matches labels that already compare equal to the
# converted datetimes — if the original index is string dates this produces
# NaNs; confirm set_index(pd.to_datetime(...)) was not intended instead.
weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W')
(weekly_surveillance.count8.mean() / weekly_surveillance.count24.mean()).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x12a308a20>
We will just use the proportion of the pooled surveillance
surveillance_total = surveillance.sum()
surveillance_total
count24 3935.0 count8 2493.0 dtype: float64
# Fraction of 24-hour ED volume captured by the 8-hour surveillance window
surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion
0.63354510800508257
def generate_ed_model(ED_data):
    """Build (and return, unsampled) a PyMC3 model of ED AGE rates.

    ED_data : DataFrame with one row per stratum, containing integer 'cases',
        'n' (population denominator) and a 0-based 'year' column used to
        index ed_market_share; must contain no missing values.
    """
    assert not ED_data.isnull().sum().any()
    with Model() as model:
        data_len = ED_data.shape[0]
        cases, n = ED_data[['cases','n']].values.T
        # Weights according to ED monitoring intensity, market share and 8-hour surveillance
        weights = (np.array(monitoring_rate)[ED]
                   * np.array(ed_market_share)[ED_data.year]
                   * surveillance_proportion)
        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=data_len)
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)
        θ = Normal('θ', 0, sd=1000, shape=data_len)
        # Poisson rate parameter
        λ = Deterministic('λ', tt.exp(θ))
        # Poisson likelihood
        AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
    return model
# Pool ED cases and denominators over race, leaving one row per
# (age_group, year) stratum.
_ed_strata = setting_data[ED].drop('setting', axis=1)
ed_pool_race_data = (_ed_strata
                     .groupby(['age_group', 'year'])[['cases', 'n']]
                     .sum()
                     .reset_index())
ed_pool_race_data
age_group | year | cases | n | |
---|---|---|---|---|
0 | 0 | 0 | 310 | 16218 |
1 | 0 | 1 | 387 | 16404 |
2 | 0 | 2 | 324 | 16564 |
3 | 1 | 0 | 199 | 24330 |
4 | 1 | 1 | 173 | 24609 |
5 | 1 | 2 | 165 | 24847 |
6 | 2 | 0 | 219 | 85104 |
7 | 2 | 1 | 206 | 86434 |
8 | 2 | 2 | 196 | 87175 |
Back-of-the-envelope rate calculation
# Back-of-the-envelope empirical rate: observed cases over the effective
# denominator (population x monitoring x surveillance x market share)
ed_pool_race_data.assign(empirical_rate=rate_factor * ed_pool_race_data.cases/(ed_pool_race_data.n
                                        * np.array(monitoring_rate)[ED]
                                        * surveillance_proportion
                                        * np.array(ed_market_share)[ed_pool_race_data.year]))
age_group | year | cases | n | empirical_rate | |
---|---|---|---|---|---|
0 | 0 | 0 | 310 | 16218 | 703.133099 |
1 | 0 | 1 | 387 | 16404 | 825.837636 |
2 | 0 | 2 | 324 | 16564 | 673.851804 |
3 | 1 | 0 | 199 | 24330 | 300.873621 |
4 | 1 | 1 | 173 | 24609 | 246.085261 |
5 | 1 | 2 | 165 | 24847 | 228.767640 |
6 | 2 | 0 | 219 | 85104 | 94.660171 |
7 | 2 | 1 | 206 | 86434 | 83.428815 |
8 | 2 | 2 | 196 | 87175 | 77.454869 |
# Fit the ED model.
# NOTE(review): 1000 draws hard-coded here vs the `iterations` variable used
# for the other models — confirm intended.
ed_model = generate_ed_model(ed_pool_race_data)
with ed_model:
    ed_trace = sample(1000, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler... Initializing NUTS using advi+adapt_diag... Average Loss = 8,128.4: 8%|▊ | 15674/200000 [00:01<00:16, 11297.71it/s] Convergence archived at 16800 Interrupted at 16,800 [8%]: Average Loss = 53,496 100%|██████████| 2000/2000 [00:05<00:00, 398.10it/s]
forestplot(ed_trace, varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x12dd7e3c8>
# Human-readable stratum labels, then the posterior summary table to Excel
ed_data_labels = (ed_pool_race_data[['age_group', 'year']]
                  .reset_index(drop=True)
                  .replace(label_map)
                  .rename(columns={'age_group':'age'}))
ed_table = generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])
ed_table.to_excel('results/ed_pool_race.xlsx')
ed_table
age | 0-1 | 2-4 | 5+ |
---|---|---|---|
year | |||
2012/13 | 703.48 (631.69, 782.60) | 301.48 (263.52, 344.45) | 94.73 (82.72, 107.80) |
2013/14 | 826.17 (742.86, 904.69) | 245.95 (208.05, 282.66) | 83.29 (72.36, 94.93) |
2014/15 | 673.15 (602.54, 749.37) | 228.58 (191.78, 261.12) | 77.28 (67.22, 87.96) |
Create data frame of MCMC samples for plotting
# Long-format λ samples per stratum, relabeled for plotting.
ed_rate_samples = pd.concat([ed_pool_race_data[['age_group', 'year']],
                             pd.DataFrame(ed_trace['λ']).T], axis=1)
# BUG FIX: the replace() previously targeted a stale 'under_5' column (which
# does not exist after the melt) with a 2-level age mapping, and omitted
# 2014/15 — so the plot showed raw integer codes. Recode age_group and year
# consistently with the other sample plots in this notebook.
ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['age_group', 'year'])
                   .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                   .rename(columns={'age_group':'age group'}))
sns.set_context(font_scale=1.5)
# Boxplot of per-capita rates (λ scaled by rate_factor)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
<matplotlib.axes._subplots.AxesSubplot at 0x12c54cbe0>
# Per-stratum stool collection: sum = number collected, count = number enrolled
collected_stools = (rotavirus_data.groupby(['age_group', 'black', 'year', 'setting'])
                    .agg({'stool_collected':['sum', 'count']}))
# NOTE(review): levels/labels appear unused below, and MultiIndex.labels is a
# deprecated attribute — candidates for removal
levels = collected_stools.columns.levels
labels = collected_stools.columns.labels
# Flatten the (stool_collected, sum/count) MultiIndex columns
collected_stools.columns = ['stool_collected', 'enrolled']
collected_stools = collected_stools.reset_index()
collected_stools.head()
age_group | black | year | setting | stool_collected | enrolled | |
---|---|---|---|---|---|---|
0 | 0 | False | 0 | 0 | 35.0 | 35 |
1 | 0 | False | 0 | 1 | 154.0 | 173 |
2 | 0 | False | 0 | 2 | 112.0 | 119 |
3 | 0 | False | 0 | 3 | 157.0 | 174 |
4 | 0 | False | 1 | 0 | 39.0 | 40 |
# Case counts per stratum and per combination of virus-positivity flags;
# rows without complete lab results (NaN in any virus column) are dropped
virus_summary = (rotavirus_data.dropna(subset=virus_cols)
                 .groupby(['age_group', 'black', 'year', 'setting', 'rotavirus', 'sapovirus',
                           'astrovirus', 'norovirus'])['caseid'].count()
                 .reset_index()
                 .rename(columns={'caseid':'cases'}))
virus_summary.head()
age_group | black | year | setting | rotavirus | sapovirus | astrovirus | norovirus | cases | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | False | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 19 |
1 | 0 | False | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 6 |
2 | 0 | False | 0 | 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 |
3 | 0 | False | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 8 |
4 | 0 | False | 0 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 |
# Attach stool-collection counts and census denominators (n) to each
# virus-positivity stratum
merged_virus_data = (virus_summary.merge(collected_stools,
                                         on=['age_group', 'black', 'year', 'setting'])
                     .merge(census_clean,
                            on=['age_group', 'year', 'black'],
                            how='left'))
merged_virus_data.head(10)
age_group | black | year | setting | rotavirus | sapovirus | astrovirus | norovirus | cases | stool_collected | enrolled | n | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | False | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 19 | 35.0 | 35 | 10569 |
1 | 0 | False | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 6 | 35.0 | 35 | 10569 |
2 | 0 | False | 0 | 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 35.0 | 35 | 10569 |
3 | 0 | False | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 8 | 35.0 | 35 | 10569 |
4 | 0 | False | 0 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 1 | 35.0 | 35 | 10569 |
5 | 0 | False | 0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 91 | 154.0 | 173 | 10569 |
6 | 0 | False | 0 | 1 | 0.0 | 0.0 | 0.0 | 1.0 | 25 | 154.0 | 173 | 10569 |
7 | 0 | False | 0 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 4 | 154.0 | 173 | 10569 |
8 | 0 | False | 0 | 1 | 0.0 | 0.0 | 1.0 | 1.0 | 1 | 154.0 | 173 | 10569 |
9 | 0 | False | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 | 16 | 154.0 | 173 | 10569 |
# Totals for any virus: sum cases over positivity combinations within each
# race stratum, then pool over race by summing out the 'black' level
any_virus = (merged_virus_data.groupby(['black','age_group', 'year', 'setting'])
             .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})).sum(level=['age_group', 'year', 'setting'])
any_virus
cases | stool_collected | enrolled | n | |||
---|---|---|---|---|---|---|
age_group | year | setting | ||||
0 | 0 | 0 | 51 | 51.0 | 51 | 16218 |
1 | 266 | 268.0 | 310 | 16218 | ||
2 | 172 | 172.0 | 187 | 16218 | ||
3 | 249 | 250.0 | 301 | 16218 | ||
1 | 0 | 52 | 52.0 | 53 | 16404 | |
1 | 334 | 334.0 | 387 | 16404 | ||
2 | 157 | 157.0 | 185 | 16404 | ||
3 | 174 | 174.0 | 230 | 16404 | ||
2 | 0 | 50 | 50.0 | 53 | 16564 | |
1 | 267 | 268.0 | 324 | 16564 | ||
2 | 157 | 158.0 | 179 | 16564 | ||
3 | 159 | 159.0 | 202 | 16564 | ||
1 | 0 | 0 | 21 | 21.0 | 25 | 24330 |
1 | 150 | 150.0 | 199 | 24330 | ||
2 | 91 | 92.0 | 120 | 24330 | ||
3 | 130 | 130.0 | 195 | 24330 | ||
1 | 0 | 15 | 15.0 | 18 | 24609 | |
1 | 119 | 119.0 | 173 | 24609 | ||
2 | 82 | 82.0 | 119 | 24609 | ||
3 | 71 | 71.0 | 104 | 24609 | ||
2 | 0 | 5 | 5.0 | 10 | 24847 | |
1 | 107 | 107.0 | 165 | 24847 | ||
2 | 76 | 76.0 | 116 | 24847 | ||
3 | 62 | 62.0 | 100 | 24847 | ||
2 | 0 | 0 | 13 | 13.0 | 20 | 85104 |
1 | 141 | 141.0 | 219 | 85104 | ||
2 | 115 | 115.0 | 152 | 85104 | ||
3 | 137 | 137.0 | 210 | 85104 | ||
1 | 0 | 13 | 13.0 | 15 | 86434 | |
1 | 148 | 148.0 | 206 | 86434 | ||
2 | 84 | 84.0 | 124 | 86434 | ||
3 | 80 | 80.0 | 126 | 86434 | ||
2 | 0 | 4 | 4.0 | 5 | 52582 | |
1 | 130 | 131.0 | 196 | 87175 | ||
2 | 66 | 66.0 | 92 | 87175 | ||
3 | 64 | 64.0 | 92 | 87175 |
All possible combinations of age, year and setting for index.
# Full cross-product of 3 age groups x 3 years x 4 settings, so strata with
# zero cases still appear after reindexing
complete_index = pd.MultiIndex.from_product([range(3), range(3), range(4)], names=['age_group', 'year', 'setting'])
def _virus_subset(virus):
    """Per-stratum case counts for one virus column, reindexed to the
    complete (age_group, year, setting) grid, with shared denominators
    taken from any_virus and zero-filled cases for empty strata."""
    return (merged_virus_data[merged_virus_data[virus] == 1]
            .groupby(['age_group', 'year', 'setting'])
            .agg({'cases': sum, 'stool_collected': max, 'enrolled': max, 'n': max})
            .reindex(complete_index)
            .assign(n=any_virus.n, enrolled=any_virus.enrolled,
                    stool_collected=any_virus.stool_collected)
            .fillna(0).astype(int))

# Consolidates four identical copy-pasted pipelines into one helper;
# outputs are unchanged.
sapo_data = _virus_subset('sapovirus')
rota_data = _virus_subset('rotavirus')
astro_data = _virus_subset('astrovirus')
noro_data = _virus_subset('norovirus')
# Stack the per-virus tables into long format, tagged with the virus name
virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
                          noro_data.reset_index().assign(virus='norovirus'),
                          rota_data.reset_index().assign(virus='rotavirus'),
                          sapo_data.reset_index().assign(virus='sapovirus')])
# Ensure integer dtypes after the concat
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)
# Integer virus codes -> names; the order fixes the 0..3 coding used in
# the tables below.
virus_lookup = dict(enumerate(['norovirus', 'rotavirus',
                               'sapovirus', 'astrovirus']))
# Inpatient-setting rows with virus codes mapped to names
virus_inpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==INPATIENT].reset_index(drop=True)
# NOTE(review): this unpacking is repeated inside the model context below;
# this copy is redundant
cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T
# Hierarchical model of inpatient virus-specific rates: stool-collection
# probability per age group, latent true enrolled cases, an effective
# sampled population, and a log-linear Poisson rate per stratum.
with Model() as virus_inpatient_model:
    groups = virus_inpatient_data.shape[0]
    # Probability of stool collection, one per age group (Beta(10,1) prior
    # favors high collection rates)
    ψ = Beta('ψ', 10, 1, shape=3)
    # Age-group specific enrolled inpatients, and inpatients with collected stool
    inpatient_enrolled = rotavirus_data.groupby('age_group').apply(lambda x: (x.setting==INPATIENT).sum()).values
    collected = (rotavirus_data.groupby('age_group')
                 .apply(lambda x: x[x.setting==INPATIENT].stool_collected.sum())
                 .values)
    stool_likelihood = Binomial('stool_likelihood', n=inpatient_enrolled, p=ψ,
                                observed=collected)
    # Extract data columns
    cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T
    # Latent true enrolled cases per stratum, started at cases/ψ
    enrolled_cases = DiscreteUniform('enrolled_cases', lower=cases, upper=enrolled, shape=groups,
                                     testval=(cases/ψ[age_group.astype(int)]).astype('int32'))
    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood',
                                         n=enrolled_cases,
                                         p=ψ[age_group.astype(int)],
                                         observed=cases)
    # Calculate an "effective N" that corresponds to sampled population
    weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year.astype(int)]
    n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, shape=groups)
    davidson_cases_likelihood = Potential('davidson_cases_likelihood',
                                          Binomial.dist(n=n, p=weights).logp(n_eff.astype('int32')).sum())
    # Population rate (log scale)
    θ = Normal('θ', 0, sd=10, shape=groups)
    λ = Deterministic('λ', tt.exp(θ))
    # Data likelihood
    virus_obs = Potential('virus_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases))
# Sample the inpatient virus model (mixed NUTS/Metropolis step assignment
# because of the discrete latent variables)
with virus_inpatient_model:
    virus_inpatient_trace = sample(iterations, tune=tune, njobs=2,
                                   random_seed=seed_numbers)
Assigned NUTS to ψ_logodds__ Assigned Metropolis to enrolled_cases Assigned Metropolis to n_eff Assigned NUTS to θ 100%|██████████| 2000/2000 [00:52<00:00, 37.85it/s] /Users/fonnescj/Repos/pymc3/pymc3/step_methods/hmc/nuts.py:463: UserWarning: Chain 1 contains 2 diverging samples after tuning. If increasing `target_accept` does not help try to reparameterize. % (self._chain_id, n_diverging))
Posterior distribution of stool sampling rate
forestplot(virus_inpatient_trace, varnames=['ψ'])
<matplotlib.gridspec.GridSpec at 0x133073fd0>
traceplot(virus_inpatient_trace, varnames=['ψ'])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12c2eb860>, <matplotlib.axes._subplots.AxesSubplot object at 0x12c62ed68>]], dtype=object)
forestplot(virus_inpatient_trace, varnames=['λ'])
<matplotlib.gridspec.GridSpec at 0x12c50c860>
# Merge λ samples with their stratum labels for plotting
inpatient_sample_melted = pd.melt(pd.DataFrame(virus_inpatient_trace['λ']))
inpatient_virus_samples = (virus_inpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                                     axis=1).reset_index(drop=True)
                           .merge(inpatient_sample_melted, left_index=True, right_on='variable')
                           .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                                     'virus':virus_lookup})
                           .rename(columns={'age_group':'age group'}))
sns.set(style="ticks")
# One boxplot panel per virus, rates scaled by rate_factor
inpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                                    data=inpatient_virus_samples.assign(rate=inpatient_virus_samples.value*rate_factor),
                                    palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
                                    fliersize=0)
inpatient_rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x1332aa550>
# Export the raw inpatient virus data, then build stratum labels for tables
virus_inpatient_data.to_excel('inpatient_virus_data.xlsx')
inpatient_virus_labels = (virus_inpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                                    axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                    'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                                    'virus':virus_lookup})
                          .rename(columns={'age_group':'age group'}))
# NOTE(review): scratch aliases apparently left over from prototyping the
# table code; `labels` also shadows the earlier MultiIndex labels variable
labels = inpatient_virus_labels
trace = virus_inpatient_trace
varnames=['λ']
labels.head()
age group | year | virus | |
---|---|---|---|
0 | 0-1 | 2012/13 | astrovirus |
1 | 0-1 | 2013/14 | astrovirus |
2 | 0-1 | 2014/15 | astrovirus |
3 | 2-4 | 2012/13 | astrovirus |
4 | 2-4 | 2013/14 | astrovirus |
# Prototype of the summary-table construction (index assignment left
# disabled; superseded by generate_table below)
rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
                                          names=labels.columns)
rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
#rate_df.index = rate_df_index
# Posterior summary table of inpatient rates by age group, year and virus
virus_inpatient_table = generate_table(virus_inpatient_trace, inpatient_virus_labels,
                                       index=['age group', 'year'],
                                       columns=['virus']).sort_index()
virus_inpatient_table.to_excel('results/virus_inpatient_table.xlsx')
virus_inpatient_table
virus | astrovirus | norovirus | rotavirus | sapovirus | |
---|---|---|---|---|---|
age group | year | ||||
0-1 | 2012/13 | 2.52 (0.09, 5.94) | 8.63 (3.20, 15.26) | 14.45 (6.59, 22.23) | 2.49 (0.12, 5.86) |
2013/14 | 8.51 (2.58, 14.60) | 24.01 (13.66, 34.35) | 7.19 (2.11, 13.11) | 8.61 (2.81, 14.42) | |
2014/15 | 0.18 (0.00, 1.06) | 14.39 (6.80, 22.29) | 7.13 (1.76, 12.72) | 2.48 (0.11, 5.98) | |
2-4 | 2012/13 | 0.13 (0.00, 0.67) | 3.32 (0.65, 6.44) | 2.52 (0.26, 5.17) | 0.88 (0.00, 2.56) |
2013/14 | 0.87 (0.00, 2.53) | 0.87 (0.00, 2.53) | 0.90 (0.00, 2.71) | 0.88 (0.00, 2.64) | |
2014/15 | 0.87 (0.00, 2.49) | 1.67 (0.09, 4.00) | 0.12 (0.00, 0.62) | 0.12 (0.00, 0.64) | |
5+ | 2012/13 | 0.04 (0.00, 0.21) | 0.26 (0.00, 0.78) | 0.25 (0.00, 0.74) | 0.03 (0.00, 0.17) |
2013/14 | 0.04 (0.00, 0.21) | 0.72 (0.07, 1.50) | 0.25 (0.00, 0.74) | 0.48 (0.02, 1.11) | |
2014/15 | 0.06 (0.00, 0.32) | 0.06 (0.00, 0.34) | 0.41 (0.00, 1.19) | 0.06 (0.00, 0.34) |
# Outpatient-setting rows with virus codes mapped to names
virus_outpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==OUTPATIENT]
# Outpatient model: unlike the inpatient/ED models this one estimates a
# binomial proportion π per stratum rather than a Poisson rate λ.
with Model() as virus_outpatient_model:
    groups = virus_outpatient_data.shape[0]
    # Probability of stool collection (flat prior, pooled across age groups)
    ψ = Beta('ψ', 1, 1)
    outpatient_enrolled = (rotavirus_data.setting==OUTPATIENT).sum()
    collected = rotavirus_data[rotavirus_data.setting==OUTPATIENT].stool_collected.sum()
    stool = Binomial('stool', n=outpatient_enrolled, p=ψ, observed=collected)
    cases, enrolled, n = virus_outpatient_data[['cases', 'enrolled', 'n']].values.T
    # NOTE(review): continuous Uniform here vs DiscreteUniform in the
    # inpatient model — confirm a non-integer binomial n is intended
    enrolled_cases_out = Uniform('enrolled_cases_out', lower=cases, upper=enrolled,
                                 shape=groups)
    # Detection probability: monitoring effort times stool-collection rate
    p = monitoring_rate[OUTPATIENT]*ψ
    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood',
                                         n=enrolled_cases_out,
                                         p=p,
                                         observed=cases)
    # Population proportion per stratum
    π = Beta('π', 1, 10, shape=groups)
    # Data likelihood
    virus_out_obs = Potential('virus_out_obs',
                              Binomial.dist(n=n, p=π).logp(enrolled_cases_out).sum())
# Sample the outpatient virus model
with virus_outpatient_model:
    virus_outpatient_trace = sample(iterations, tune=tune,
                                    njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler... Initializing NUTS using advi+adapt_diag... Average Loss = 463.64: 25%|██▍ | 49397/200000 [00:08<00:23, 6289.74it/s] Convergence archived at 49700 Interrupted at 49,700 [24%]: Average Loss = 10,288 100%|██████████| 2000/2000 [00:09<00:00, 210.43it/s]
Posterior distribution of stool sampling rate
traceplot(virus_outpatient_trace, varnames=['ψ'])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12e03c0f0>, <matplotlib.axes._subplots.AxesSubplot object at 0x12dd58358>]], dtype=object)
# Merge π samples (proportions, not λ rates) with stratum labels for plotting
outpatient_sample_melted = pd.melt(pd.DataFrame(virus_outpatient_trace['π']))
outpatient_virus_samples = (virus_outpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                                       axis=1).reset_index(drop=True)
                            .merge(outpatient_sample_melted, left_index=True, right_on='variable')
                            .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                      'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                                      'virus':virus_lookup})
                            .rename(columns={'age_group':'age group'}))
outpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                                     data=outpatient_virus_samples.assign(rate=outpatient_virus_samples.value*rate_factor),
                                     palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
                                     fliersize=0)
outpatient_rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x12ded8320>
# Stratum labels and posterior summary table (of π) for outpatient viruses
outpatient_virus_labels = (virus_outpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                                      axis=1).reset_index(drop=True)
                           .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                                     'virus':virus_lookup})
                           .rename(columns={'age_group':'age group'}))
virus_outpatient_table = generate_table(virus_outpatient_trace, outpatient_virus_labels,
                                        index=['age group', 'year'], varnames=['π'],
                                        columns=['virus']).sort_index()
virus_outpatient_table.to_excel('results/virus_outpatient_table.xlsx')
virus_outpatient_table
virus | astrovirus | norovirus | rotavirus | sapovirus | |
---|---|---|---|---|---|
age group | year | ||||
0-1 | 2012/13 | 15.19 (6.44, 24.09) | 48.36 (33.08, 64.25) | 29.08 (17.63, 42.37) | 30.54 (17.76, 43.59) |
2013/14 | 17.79 (8.59, 27.31) | 60.16 (42.72, 78.12) | 19.22 (9.65, 29.94) | 35.58 (22.51, 48.96) | |
2014/15 | 5.42 (1.03, 10.70) | 76.13 (56.90, 96.09) | 18.90 (9.87, 28.98) | 19.00 (10.33, 29.45) | |
2-4 | 2012/13 | 12.01 (6.50, 18.58) | 15.66 (8.28, 23.27) | 12.95 (6.97, 19.95) | 10.15 (4.38, 15.94) |
2013/14 | 8.28 (3.42, 14.06) | 17.40 (9.79, 25.14) | 3.65 (0.79, 7.29) | 11.89 (6.04, 18.44) | |
2014/15 | 4.47 (0.96, 8.30) | 19.03 (11.46, 27.08) | 8.19 (3.67, 13.92) | 4.56 (1.18, 8.47) | |
5+ | 2012/13 | 1.84 (0.59, 3.19) | 4.49 (2.54, 6.75) | 1.85 (0.60, 3.18) | 2.93 (1.39, 4.69) |
2013/14 | 0.81 (0.08, 1.66) | 4.16 (2.23, 6.20) | 1.06 (0.20, 2.06) | 2.35 (0.84, 3.86) | |
2014/15 | 0.56 (0.04, 1.33) | 5.44 (3.21, 7.87) | 1.29 (0.32, 2.47) | 0.80 (0.12, 1.67) |
# ED-setting rows with virus codes mapped to names
virus_ed_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==ED]
# ED virus model: same structure as the inpatient model, with ED-specific
# weights (monitoring x market share x 8-hour surveillance proportion).
with Model() as virus_ed_model:
    # Probability of stool collection among enrolled ED cases
    ψ = Beta('ψ', 1, 1)
    ed_enrolled = (rotavirus_data.setting==ED).sum()
    collected = rotavirus_data[rotavirus_data.setting==ED].stool_collected.sum()
    stool = Binomial('stool', n=ed_enrolled, p=ψ, observed=collected)
    data_len = virus_ed_data.shape[0]
    cases, enrolled, year, n = virus_ed_data[['cases', 'enrolled', 'year', 'n']].values.T
    # BUG FIX: the shapes below previously used `groups`, a leftover global
    # from the outpatient model cell; they now use this dataset's own row
    # count (data_len), which was computed but never used.
    enrolled_cases = Uniform('enrolled_cases', lower=cases, upper=enrolled,
                             shape=data_len)
    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood',
                                         n=enrolled_cases,
                                         p=ψ,
                                         observed=cases)
    # Detection weight per stratum
    ω = (np.array(monitoring_rate)[ED]
         * np.array(ed_market_share)[year.astype(int)]
         * surveillance_proportion)
    # Effective sampled population
    n_eff = DiscreteUniform('n_eff', lower=cases, upper=n,
                            shape=data_len)
    n_eff_likelihood = Potential('davidson_cases_likelihood',
                                 Binomial.dist(n=n,
                                               p=ω).logp(n_eff).sum())
    # Population rate (log scale)
    θ = Normal('θ', 0, sd=10, shape=data_len)
    λ = Deterministic('λ', tt.exp(θ))
    # Data likelihood
    virus_ed_obs = Potential('virus_ed_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases))
# Sample the ED virus model
with virus_ed_model:
    virus_ed_trace = sample(iterations, tune=tune,
                            njobs=2, random_seed=seed_numbers)
Assigned NUTS to ψ_logodds__ Assigned NUTS to enrolled_cases_interval__ Assigned Metropolis to n_eff Assigned NUTS to θ 100%|██████████| 2000/2000 [00:33<00:00, 58.93it/s]
Posterior distribution of stool sampling rate
traceplot(virus_ed_trace, varnames=['ψ'])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x133ee56a0>, <matplotlib.axes._subplots.AxesSubplot object at 0x1312b8710>]], dtype=object)
# Merge λ samples with stratum labels for plotting.
ed_sample_melted = pd.melt(pd.DataFrame(virus_ed_trace['λ']))
# NOTE(review): reset_index() here keeps the old index as columns, unlike the
# reset_index(drop=True) used for the inpatient/outpatient plots — confirm
# the merge keys still line up as intended
ed_virus_samples = (virus_ed_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                       axis=1).reset_index()
                    .merge(ed_sample_melted, left_index=True, right_on='variable')
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                              'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                              'virus':virus_lookup})
                    .rename(columns={'age_group':'age group'}))
ed_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                             data=ed_virus_samples.assign(rate=ed_virus_samples.value*rate_factor),
                             palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
                             fliersize=0)
ed_rateplot.despine(left=True)
<seaborn.axisgrid.FacetGrid at 0x12c19be10>
# Stratum labels and posterior summary table for ED viruses
ed_virus_labels = (virus_ed_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                      axis=1).reset_index(drop=True)
                   .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                             'virus':virus_lookup})
                   .rename(columns={'age_group':'age group'}))
virus_ed_table = generate_table(virus_ed_trace, ed_virus_labels,
                                index=['age group', 'year'],
                                columns=['virus']).sort_index()
virus_ed_table.to_excel('results/virus_ed_table.xlsx')
virus_ed_table
virus | astrovirus | norovirus | rotavirus | sapovirus | |
---|---|---|---|---|---|
age group | year | ||||
0-1 | 2012/13 | 18.19 (9.34, 29.89) | 79.84 (56.37, 101.89) | 42.82 (27.23, 59.08) | 54.46 (35.18, 72.40) |
2013/14 | 34.47 (20.02, 49.14) | 136.46 (106.58, 165.19) | 32.58 (18.74, 47.54) | 62.20 (43.43, 81.33) | |
2014/15 | 11.64 (4.02, 20.53) | 112.74 (87.71, 140.87) | 35.49 (20.15, 49.75) | 45.06 (29.65, 62.13) | |
2-4 | 2012/13 | 4.68 (0.97, 8.60) | 30.53 (19.09, 41.69) | 31.88 (20.76, 44.30) | 26.27 (17.12, 38.00) |
2013/14 | 8.75 (3.79, 14.98) | 29.34 (19.34, 40.34) | 8.86 (3.40, 14.84) | 13.04 (6.29, 21.07) | |
2014/15 | 6.59 (1.98, 11.60) | 26.67 (16.87, 37.48) | 13.00 (6.54, 20.60) | 10.87 (4.68, 17.59) | |
5+ | 2012/13 | 1.62 (0.39, 2.91) | 5.27 (2.77, 7.70) | 7.42 (4.63, 10.30) | 4.98 (2.85, 7.52) |
2013/14 | 1.30 (0.28, 2.52) | 9.16 (5.81, 12.42) | 2.49 (0.94, 4.15) | 2.76 (1.27, 4.73) | |
2014/15 | 1.02 (0.13, 2.09) | 6.68 (4.06, 9.46) | 3.95 (2.04, 6.18) | 1.58 (0.46, 2.98) |