In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context('notebook')

seed_numbers = 20090425, 19700903

from datetime import timedelta, datetime

Data import and cleanup

In [2]:
# rotavirus_data = pd.read_csv('data/AGE Year 2-4 with Final Rota.csv', low_memory=False)
rotavirus_data = pd.read_csv('data/AGE Cases and Healthy Controls Merged with Lab Results Year 2 to 4 v05.23.2017.csv')
rotavirus_data.index.is_unique
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (142,143,159,160,161,208,209,226,229,230,234,235,237,242,243,246,251,252,253,258,262,266,279,280,281,285,286,288,289,290,291,294,295,296,297,298,299,302,304,305,307,312,313,320,321,322,323,325,329,334,335,336,341,342,343,344,347,351,357,358,360,361,362,366,371,372,378,382,383,384,385,389,390,391,395,400,401,402,415,416,417,420,421,423,425,430,431,433,435,436,438,439,440,442,443,444,448,452,458,460,461,462,466,470,472,474,486,498,508,512,520,540,549,555,558,577,584,591,598,606,607,677,678,679,680,681,682,683,684,685,688,689,690,691,692,693,694,695,696,697) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[2]:
True
In [3]:
rotavirus_data.head()
Out[3]:
caseid case settingnew rota_od vaxstrain rota_genotype noro_type noroseq_gii_c noroseq_gii_d noroseq_gi_c ... ddxdioten1 ddxdioten2 ddxdioten3 ddxdioten4 ddxdioten5 ddxdioten6 ddxdioten7 ddxdioten8 ddxdioten9 ddxdioten10
0 EN1C0001 1 3 0.088 NaN NEG NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 EN1C0002 1 3 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 EN1C0003 1 3 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 EN1C0004 1 3 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 EN1C0006 1 3 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 698 columns

Rename columns and recode data

In [4]:
rotavirus_data.race.value_counts()
Out[4]:
1    2766
2    1968
6     411
4     105
7      10
5       3
3       1
8       1
Name: race, dtype: int64
In [5]:
rotavirus_data['white'] = rotavirus_data['race'] == 1
rotavirus_data['black'] = rotavirus_data['race'] == 2
In [6]:
rotavirus_data['astro'].value_counts().index
Out[6]:
Float64Index([0.0, 1.0], dtype='float64')
In [7]:
# rotavirus_data['rotavirus'] = (rotavirus_data['rotacdc'] == 'Positive').astype(float)
# rotavirus_data['sapovirus'] = (rotavirus_data['sapo'] == 'Positive').astype(float)
# rotavirus_data['astrovirus'] = (rotavirus_data['astro'] == 'Positive').astype(float)
# rotavirus_data['norovirus'] = (rotavirus_data['noro'] == 'Positive').astype(float)

# rotavirus_data.loc[rotavirus_data['rotacdc'].isnull() | (rotavirus_data.rotacdc=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'rotavirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['sapo'].isnull() | (rotavirus_data.sapo=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'sapovirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['astro'].isnull() | (rotavirus_data.astro=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'astrovirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['noro'].isnull() | (rotavirus_data.noro=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'norovirus'] = np.nan
In [8]:
rotavirus_data=rotavirus_data.rename(columns={'rotacdc':'rotavirus',
                       'sapo':'sapovirus',
                       'astro':'astrovirus',
                       'noro':'norovirus'})
In [9]:
rotavirus_data.rotavirus.value_counts()
Out[9]:
0.0    3722
1.0     290
Name: rotavirus, dtype: int64
In [10]:
rotavirus_data.rotavu.value_counts()
Out[10]:
0.0    3574
1.0     441
Name: rotavu, dtype: int64
In [11]:
virus_cols = ['rotavirus','sapovirus','astrovirus','norovirus']

Proportion of test results missing

In [12]:
rotavirus_data[virus_cols].isnull().mean()
Out[12]:
rotavirus     0.237987
sapovirus     0.237417
astrovirus    0.237417
norovirus     0.237417
dtype: float64
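
A follow-up worth checking (a sketch): most of this missingness should come from cases for which no stool specimen was collected.

In [ ]:
# Missingness of the virus test columns split by specimen collection status
# (specimencol coding assumed; it is recoded to a boolean later in the notebook).
rotavirus_data[virus_cols].isnull().groupby(rotavirus_data.specimencol).mean()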
In [13]:
rotavirus_data['scrdate'] = pd.to_datetime(rotavirus_data['scrdate'])
rotavirus_data['dob'] = pd.to_datetime(rotavirus_data['dob'])
In [14]:
astro_data = rotavirus_data[rotavirus_data.astrovirus==1].dropna(subset=['astro_ct'])
astro_data.shape
Out[14]:
(159, 700)
In [15]:
noro_data = rotavirus_data[rotavirus_data.norovirus==1].dropna(subset=['noro_ct'])
noro_data.shape
Out[15]:
(729, 700)
In [16]:
noro_data['coinfect'] = noro_data[['rotavirus', 'sapovirus', 'astrovirus']].sum(1)
In [17]:
astro_data['coinfect'] = astro_data[['rotavirus', 'sapovirus', 'norovirus']].sum(1)
In [18]:
with_coinf = noro_data.noro_ct[noro_data.coinfect==1].astype(float)
no_coinf = noro_data.noro_ct[(noro_data.coinfect==0)].astype(float)

# with_coinf = astro_data.astro_ct[astro_data.coinfect==1].astype(float)
# no_coinf = astro_data.astro_ct[(astro_data.coinfect==0)].astype(float)
In [19]:
from scipy.stats import ranksums

ranksums(with_coinf, no_coinf)
Out[19]:
RanksumsResult(statistic=3.6398985691965766, pvalue=0.00027274545863325173)
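
The rank-sum test suggests norovirus Ct values differ between co-infected and singly infected cases. A quick visual companion to that test (a sketch):

In [ ]:
# Side-by-side boxplots of norovirus Ct values with and without co-infection.
fig, ax = plt.subplots()
ax.boxplot([no_coinf.values, with_coinf.values], labels=['no co-infection', 'co-infection'])
ax.set_ylabel('norovirus Ct');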
In [20]:
rotavirus_data.sapo_ct.dropna().astype(float).hist()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x115755438>

Calculate age, and compare to given age.

In [21]:
rotavirus_data['age_months'] = (rotavirus_data['scrdate']
                              - rotavirus_data['dob'] + timedelta(1)).astype('timedelta64[M]') 
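
As a consistency check (a sketch), the computed age can be compared with the age in months recorded at screening; large discrepancies would flag data-entry issues.

In [ ]:
# Difference between the computed age (months) and the recorded agemonths column.
(rotavirus_data.age_months - pd.to_numeric(rotavirus_data.agemonths, errors='coerce')).describe()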
In [22]:
rotavirus_data.age_months.hist();

Group into age categories: under 2, 2 to 4, and 5 or over

In [23]:
rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)
In [24]:
rotavirus_data['under_2'] = ((rotavirus_data.age_months / 12) < 2).astype(int)
In [25]:
rotavirus_data.loc[rotavirus_data.caseid=='EN1I0141', ['under_5', 'under_2']]
Out[25]:
under_5 under_2
5192 1 0
In [26]:
rotavirus_data['age_group'] = 2
rotavirus_data.loc[rotavirus_data.under_5==1, 'age_group'] = 1
rotavirus_data.loc[rotavirus_data.under_2==1, 'age_group'] = 0
In [27]:
rotavirus_data.age_group.value_counts()
Out[27]:
0    2462
2    1459
1    1344
Name: age_group, dtype: int64
In [28]:
rotavirus_data.year.value_counts()
Out[28]:
2    1989
3    1740
4    1536
Name: year, dtype: int64

Recode study year to start at zero

In [29]:
rotavirus_data['year'] = rotavirus_data.year - 2

Recode setting variable to be zero-based

In [30]:
assert not rotavirus_data.settingnew.isnull().sum()
In [31]:
rotavirus_data['setting'] = rotavirus_data.settingnew - 1
In [32]:
# Indices to each value
INPATIENT, ED, OUTPATIENT, HC = 0, 1, 2, 3

# rotavirus_data['setting'] = rotavirus_data.provider.replace({1:INPATIENT, 2:ED, 3:OUTPATIENT, 4:HC})

Recode sex and stool collection

In [33]:
assert not rotavirus_data['specimencol'].isnull().sum()
In [34]:
rotavirus_data['male'] = rotavirus_data['sex']==1
rotavirus_data['stool_collected'] = rotavirus_data['specimencol']==1

Recode insurance

In [35]:
rotavirus_data.insurance.value_counts()
Out[35]:
1    4599
2     500
3      84
8      44
4      38
Name: insurance, dtype: int64
In [36]:
rotavirus_data['public_ins'] = rotavirus_data.insurance.isin({1, 3})
rotavirus_data['private_ins'] = rotavirus_data.insurance.isin({2, 3})
rotavirus_data.loc[rotavirus_data.insurance==8, ['public_ins', 'private_ins']] = np.nan
In [37]:
# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['intvwdate', 'dob', 'provider', 'coldat', 'tdat', 
       'insurance', 'sex', 'hispcr', 'mdegree_7', 'rvacver_7',
       'rvacver_7r', 'race', 'race2_8'], axis=1)

Dimensions of the dataset after dropping extraneous columns

In [38]:
rotavirus_data.shape
Out[38]:
(5265, 696)

Summary of missing values

In [39]:
rotavirus_data.isnull().sum()
Out[39]:
caseid                0
case                  0
settingnew            0
rota_od            5085
vaxstrain          5264
rota_genotype      4826
noro_type          4535
noroseq_gii_c      4772
noroseq_gii_d      4772
noroseq_gi_c       5022
noroseq_gi_d       5037
year                  0
rotavirus          1253
rotavu             1250
sapovirus          1250
sapo_ct            4917
astrovirus         1250
astro_ct           5106
norovirus          1250
noro_ct            4536
studysite             0
hospital              0
abiniscr              0
deiniscr              0
scrdate               0
admitdate             0
admittime            13
agedays               0
agemonths             0
ageyears              0
                   ... 
ddxdcten2          5198
ddxdcten3          5235
ddxdcten4          5254
ddxdcten5          5259
ddxdcten6          5263
ddxdcten7          5264
ddxdcten8          5264
ddxdcten9          5265
ddxdcten10         5265
ddxdioten1         5258
ddxdioten2         5259
ddxdioten3         5261
ddxdioten4         5261
ddxdioten5         5263
ddxdioten6         5263
ddxdioten7         5263
ddxdioten8         5264
ddxdioten9         5264
ddxdioten10        5264
white                 0
black                 0
age_months            0
under_5               0
under_2               0
age_group             0
setting               0
male                  0
stool_collected       0
public_ins           44
private_ins          44
Length: 696, dtype: int64

Aggregate case counts by age group, year, setting, and race

In [40]:
rotavirus_summary = (rotavirus_data.drop('scrdate', axis=1)
             .groupby(['age_group', 'year', 'setting', 'black'])['caseid'].count()
             .reset_index()
             .rename(columns={'caseid':'cases'}))

rotavirus_summary.head()
Out[40]:
age_group year setting black cases
0 0 0 0 False 35
1 0 0 0 True 16
2 0 0 1 False 173
3 0 0 1 True 137
4 0 0 2 False 119

Census data

The key for AGEGRP is as follows:

  • 0 = Total
  • 1 = Age 0 to 4 years
  • 2 = Age 5 to 9 years
  • 3 = Age 10 to 14 years
  • 4 = Age 15 to 19 years
  • 5 = Age 20 to 24 years
  • 6 = Age 25 to 29 years
  • 7 = Age 30 to 34 years
  • 8 = Age 35 to 39 years
  • 9 = Age 40 to 44 years
  • 10 = Age 45 to 49 years
  • 11 = Age 50 to 54 years
  • 12 = Age 55 to 59 years
  • 13 = Age 60 to 64 years
  • 14 = Age 65 to 69 years
  • 15 = Age 70 to 74 years
  • 16 = Age 75 to 79 years
  • 17 = Age 80 to 84 years
  • 18 = Age 85 years or older
In [41]:
davidson_census = pd.read_csv('data/davidson_county.csv')
davidson_census.head()
Out[41]:
SUMLEV STATE COUNTY STNAME CTYNAME YEAR2008 YEAR AGEGRP TOT_POP TOT_MALE ... HWAC_MALE HWAC_FEMALE HBAC_MALE HBAC_FEMALE HIAC_MALE HIAC_FEMALE HAAC_MALE HAAC_FEMALE HNAC_MALE HNAC_FEMALE
0 50 47 37 Tennessee Davidson County 1 2008 0 626681 303540 ... 30400 24829 2178 2006 1501 1183 381 374 271 211
1 50 47 37 Tennessee Davidson County 1 2008 1 44691 22868 ... 3838 3664 302 303 185 189 63 53 26 35
2 50 47 37 Tennessee Davidson County 1 2008 2 37613 18977 ... 2910 2826 266 205 145 140 45 50 22 23
3 50 47 37 Tennessee Davidson County 1 2008 3 33904 17479 ... 2154 1890 166 170 105 91 25 29 18 16
4 50 47 37 Tennessee Davidson County 1 2008 4 38999 19674 ... 2029 1783 166 174 111 82 29 29 22 16

5 rows × 81 columns

Number under 18 and under 5 from 2010 census:

In [42]:
under_18_2010 = 136612
under_5_2010 = 44493
over_4_under_18 = under_18_2010 - under_5_2010

over_4_under_18
Out[42]:
92119

Children 5-19 from census dataset:

In [43]:
under_20 = davidson_census.loc[(davidson_census.YEAR==2010) & davidson_census.AGEGRP.isin([2,3,4]), 'TOT_POP'].sum()

under_20
Out[43]:
110614

Proportion of the 5-19 census group that is under 18:

In [44]:
pct_under_18 = over_4_under_18 / under_20

pct_under_18
Out[44]:
0.8327969334803913

Identify racial/ethnic subgroup populations

In [45]:
census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['WHITE'] = census_data_child[['WA_MALE', 'WA_FEMALE']].sum(axis=1)
census_data_child['BLACK'] = census_data_child[['BA_MALE', 'BA_FEMALE']].sum(axis=1)
In [46]:
census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])[['WHITE', 'BLACK']]
    .sum()).reset_index()

census_data['year'] = census_data.YEAR - 2011
census_data
Out[46]:
under_5 YEAR WHITE BLACK year
0 False 2011 60966 41225 0
1 False 2012 62219 41570 1
2 False 2013 63140 41539 2
3 True 2011 26424 14124 0
4 True 2012 26936 14077 1
5 True 2013 27388 14023 2

Scale the 5-19 counts to approximate the under-18 (5-17) population

In [47]:
census_data.loc[~census_data.under_5, ['WHITE', 'BLACK']] *= pct_under_18
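
As a quick check of this scaling (a sketch): applying the correction to the 2011 white 5-19 count from the table above should give approximately the corrected value shown a few cells below.

In [ ]:
# 60966 children aged 5-19 (white, 2011) scaled by the under-18 proportion;
# expected to be close to the 50772.3 in the corrected table below.
60966 * pct_under_18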

Add "under 2" category that is 40% of under_5

In [48]:
census_data['under_2'] = False
In [49]:
census_under_2 = census_data[census_data.under_5].copy()
census_under_2.under_2 = True
census_under_2.under_5 = False
census_under_2.WHITE = (census_under_2.WHITE*0.4).astype(int)
census_under_2.BLACK = (census_under_2.BLACK*0.4).astype(int)

census_under_2
Out[49]:
under_5 YEAR WHITE BLACK year under_2
3 False 2011 10569 5649 0 True
4 False 2012 10774 5630 1 True
5 False 2013 10955 5609 2 True

Subtract under-2's from under-5's

In [50]:
census_data.loc[census_data.under_5, ['WHITE', 'BLACK']] -= census_under_2[['WHITE', 'BLACK']]
In [51]:
census_data = (pd.concat([census_data, census_under_2])
                   .reset_index(drop=True)[['under_2', 'under_5', 'year', 'WHITE', 'BLACK']]
                   .rename(columns={'under_5':'2_to_4'}))

census_data
Out[51]:
under_2 2_to_4 year WHITE BLACK
0 False False 0 50772.297847 34332.053583
1 False False 1 51815.792404 34619.368525
2 False False 2 52582.798380 34593.551820
3 False True 0 15855.000000 8475.000000
4 False True 1 16162.000000 8447.000000
5 False True 2 16433.000000 8414.000000
6 True False 0 10569.000000 5649.000000
7 True False 1 10774.000000 5630.000000
8 True False 2 10955.000000 5609.000000
In [52]:
census_data[['WHITE', 'BLACK']] = census_data[['WHITE', 'BLACK']].astype(int)

census_data
Out[52]:
under_2 2_to_4 year WHITE BLACK
0 False False 0 50772 34332
1 False False 1 51815 34619
2 False False 2 52582 34593
3 False True 0 15855 8475
4 False True 1 16162 8447
5 False True 2 16433 8414
6 True False 0 10569 5649
7 True False 1 10774 5630
8 True False 2 10955 5609
In [53]:
census_data['age_group'] = 2
census_data.loc[census_data['under_2'], 'age_group'] = 0
census_data.loc[census_data['2_to_4'], 'age_group'] = 1

census_data
Out[53]:
under_2 2_to_4 year WHITE BLACK age_group
0 False False 0 50772 34332 2
1 False False 1 51815 34619 2
2 False False 2 52582 34593 2
3 False True 0 15855 8475 1
4 False True 1 16162 8447 1
5 False True 2 16433 8414 1
6 True False 0 10569 5649 0
7 True False 1 10774 5630 0
8 True False 2 10955 5609 0

Reshape data

In [54]:
melted_census = (pd.melt(census_data, id_vars=['age_group', 'year'], 
                    value_vars=['WHITE', 'BLACK']))

census_clean = (melted_census.assign(black=(melted_census.variable=='BLACK').astype(int))
                 .drop('variable', axis=1)
                 .rename(columns={'value':'n'}))
                    
census_clean
Out[54]:
age_group year n black
0 2 0 50772 0
1 2 1 51815 0
2 2 2 52582 0
3 1 0 15855 0
4 1 1 16162 0
5 1 2 16433 0
6 0 0 10569 0
7 0 1 10774 0
8 0 2 10955 0
9 2 0 34332 1
10 2 1 34619 1
11 2 2 34593 1
12 1 0 8475 1
13 1 1 8447 1
14 1 2 8414 1
15 0 0 5649 1
16 0 1 5630 1
17 0 2 5609 1
In [55]:
census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])
In [56]:
merged_data = (rotavirus_summary.merge(census_full, on=['age_group', 'year', 'black', 'setting'], how='right')
                       .fillna(0).astype(int))

VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

In [57]:
inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.60, 0.59, 0.62
Y2012, Y2013, Y2014 = 0, 1, 2

We will estimate the 2014 market shares by extrapolating a linear fit to the previous years.

In [58]:
from sklearn import linear_model
In [59]:
inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share
Out[59]:
[0.9, 0.83, 0.77500000000000002]
In [60]:
ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share
Out[60]:
[0.59, 0.62, 0.63]
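
The slope used in these one-step extrapolations can be cross-checked with a plain least-squares fit (a sketch using the inpatient shares):

In [ ]:
# np.polyfit on the three observed inpatient shares recovers the same slope
# (about -0.055) that the sklearn fit above adds to the final observed share.
slope, intercept = np.polyfit(range(3), [0.94, 0.90, 0.83], 1)
slope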

Enrollment days/week for inpatient, outpatient, ED

In [61]:
monitoring_days = 7, 4, 4

# Only 5.5 days of intake on outpatient
monitoring_rate = np.array(monitoring_days)/np.array([7, 5.5, 7])
monitoring_rate
Out[61]:
array([ 1.        ,  0.72727273,  0.57142857])
In [62]:
rotavirus_data[(rotavirus_data.white==1) & (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
              (rotavirus_data.under_5==1)].shape
Out[62]:
(202, 696)
In [63]:
setting_data = dict(list(merged_data.groupby('setting')))

Inpatient AGE Model

In [64]:
from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential, Metropolis
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform, Uniform, Flat, Gamma
from pymc3 import traceplot, forestplot, summary, df_summary

import theano.tensor as tt
In [65]:
setting_data[INPATIENT].drop('setting', axis=1).head()
Out[65]:
age_group year black cases n
0 0 0 0 35 10569
1 0 0 1 16 5649
6 0 1 0 40 10774
7 0 1 1 13 5630
12 0 2 0 40 10955
In [66]:
inpatient_data = setting_data[INPATIENT].drop('setting', axis=1)
inpatient_data[['cases', 'n']]
Out[66]:
cases n
0 35 10569
1 16 5649
6 40 10774
7 13 5630
12 40 10955
13 13 5609
18 18 15855
19 7 8475
24 15 16162
25 3 8447
30 7 16433
31 3 8414
36 12 50772
37 8 34332
42 14 51815
43 1 34619
48 5 52582
49 2 34593
In [67]:
def generate_inpatient_model(inpatient_data, pool_years=False):
    
    assert not inpatient_data.isnull().sum().any()

    with Model() as model:

        cases, n = inpatient_data[['cases', 'n']].values.T

        # Cases weighted by monitoring effort and market share
        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=inpatient_data.shape[0])
#                                          testval=(cases/weights).astype(int))
        
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])

            # Poisson rate parameter
            λ = Deterministic('λ', tt.exp(θ))
                      
            # Poisson likelihood
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model
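
Before fitting, a back-of-the-envelope check of the implied rates (a sketch, mirroring the calculation done later for the ED data): dividing observed cases by the effective denominator weight × n gives rough point estimates to compare against the posterior summaries below.

In [ ]:
# Rough empirical rate per 10,000: cases / (monitoring rate * market share * n).
ipd = setting_data[INPATIENT].drop('setting', axis=1)
w = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[ipd.year]
ipd.assign(empirical_rate=10000 * ipd.cases / (w * ipd.n)).head()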

Full (unpooled) model

In [68]:
inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))
In [69]:
iterations = 1000
tune = 1000
In [70]:
with inpatient_model_full:
    inpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 29,588:   6%|▌         | 12270/200000 [00:01<00:21, 8539.43it/s]   
Convergence archived at 12900
Interrupted at 12,900 [6%]: Average Loss = 1.7097e+05
100%|██████████| 2000/2000 [00:05<00:00, 398.56it/s]
In [71]:
forestplot(inpatient_full_trace, varnames=['λ'])
Out[71]:
<matplotlib.gridspec.GridSpec at 0x11e9b5400>
In [72]:
label_map = {'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'setting':{0:'inpatient', 1:'ED', 2:'outpatient'}}
In [73]:
inpatient_data_labels = (setting_data[INPATIENT][['age_group', 'year', 'black']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'black':'race', 'age_group':'age'}))

Rate factor for population (rates reported per 10,000)

In [74]:
rate_factor = 10000
In [75]:
rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)
In [76]:
from pymc3 import hpd
def _hpd_df(x, alpha):
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)
In [77]:
estimates = (df_summary(inpatient_full_trace, varnames=['λ'],
                stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                           lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
In [78]:
trace, labels = inpatient_full_trace, inpatient_data_labels
In [79]:
def generate_table(trace, labels, index, columns, varnames=['λ']):
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                          names=labels.columns)
    rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                     .reset_index(), 
                               index=index, columns=columns)
    value_iterator = zip(pivot_all['mean'].values.flatten(), 
                     pivot_all['hpd_2.5'].values.flatten(), 
                     pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
                                  pivot_all['mean'].shape)
    
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)
In [80]:
inpatient_full_table = generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], 
               columns=['race']).sort_index()

inpatient_full_table.to_excel('results/inpatient_full.xlsx')
inpatient_full_table
Out[80]:
race black white
age year
0-1 2012/13 31.52 (18.20, 45.72) 36.87 (25.41, 49.71)
2013/14 28.04 (14.19, 42.66) 44.75 (31.68, 58.47)
2014/15 30.38 (15.63, 47.14) 46.84 (31.71, 60.66)
2-4 2012/13 9.55 (3.55, 16.46) 12.79 (6.98, 18.59)
2013/14 4.74 (0.68, 9.93) 11.22 (5.79, 16.97)
2014/15 4.97 (0.67, 10.40) 5.61 (2.11, 9.92)
5+ 2012/13 2.65 (1.04, 4.32) 2.66 (1.32, 4.13)
2013/14 0.47 (0.00, 1.32) 3.29 (1.73, 5.00)
2014/15 0.86 (0.02, 1.98) 1.27 (0.37, 2.36)
In [81]:
inpatient_rate_samples = pd.concat(
        [inpatient_data_labels, 
           pd.DataFrame(inpatient_full_trace['λ']).T], axis=1)
In [82]:
inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race']))

Plot of rates broken down by year/age/race/ethnicity.

In [83]:
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", col='race', 
                          hue_order=['0-1','2-4','5+'],
            data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*rate_factor),
            palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)
Out[83]:
<seaborn.axisgrid.FacetGrid at 0x1174e4fd0>

Pooled across race

In [84]:
inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                                    .groupby(['age_group', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
inpatient_pool_race_data
Out[84]:
age_group year cases n
0 0 0 51 16218
1 0 1 53 16404
2 0 2 53 16564
3 1 0 25 24330
4 1 1 18 24609
5 1 2 10 24847
6 2 0 20 85104
7 2 1 15 86434
8 2 2 7 87175
In [85]:
inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)
In [86]:
with inpatient_model_pool_race:
    inpatient_pool_race_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 35,642:   6%|▌         | 11602/200000 [00:01<00:24, 7698.74it/s]   
Convergence archived at 12500
Interrupted at 12,500 [6%]: Average Loss = 1.7517e+05
100%|██████████| 2000/2000 [00:03<00:00, 513.05it/s]
In [87]:
inpatient_pool_race_data_labels = (inpatient_pool_race_data[['age_group', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'age_group':'age'}))
In [88]:
inpatient_pool_race_table = generate_table(inpatient_pool_race_trace, 
                                           inpatient_pool_race_data_labels, 
                                           index=['year'], columns=['age'])

inpatient_pool_race_table.to_excel('results/inpatient_pool_race.xlsx')
inpatient_pool_race_table
Out[88]:
age 0-1 2-4 5+
year
2012/13 35.01 (25.59, 44.07) 11.47 (7.67, 15.95) 2.58 (1.55, 3.81)
2013/14 38.83 (29.17, 49.64) 8.91 (4.83, 13.09) 2.08 (1.15, 3.22)
2014/15 41.46 (31.25, 53.61) 5.19 (2.13, 8.26) 1.06 (0.39, 1.83)
In [89]:
inpatient_pool_race_samples = pd.concat(
        [inpatient_pool_race_data[['age_group', 'year']].reset_index(drop=True), 
           pd.DataFrame(inpatient_pool_race_trace['λ']).T], axis=1)
In [90]:
inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, 
                                       id_vars=['age_group', 'year'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'age_group':'age group'}))

Plot of rates pooled across race

In [91]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0).set_xlabel('year')
Out[91]:
<matplotlib.text.Text at 0x1209bada0>

Pooled across years

In [92]:
inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
 .sort_values(by=['age_group', 'black', 'year'], ascending=False)).reset_index(drop=True)

inpatient_pool_year_data
Out[92]:
age_group year black cases n
0 2 2 1 2 34593
1 2 1 1 1 34619
2 2 0 1 8 34332
3 2 2 0 5 52582
4 2 1 0 14 51815
5 2 0 0 12 50772
6 1 2 1 3 8414
7 1 1 1 3 8447
8 1 0 1 7 8475
9 1 2 0 7 16433
10 1 1 0 15 16162
11 1 0 0 18 15855
12 0 2 1 13 5609
13 0 1 1 13 5630
14 0 0 1 16 5649
15 0 2 0 40 10955
16 0 1 0 40 10774
17 0 0 0 35 10569
In [93]:
inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)
In [94]:
with inpatient_model_pool_year:
    inpatient_pool_year_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 10,723:   8%|▊         | 15014/200000 [00:01<00:19, 9712.66it/s]    
Convergence archived at 15900
Interrupted at 15,900 [7%]: Average Loss = 1.1632e+05
100%|██████████| 2000/2000 [00:11<00:00, 174.10it/s]
In [95]:
inpatient_pool_year_labels = (inpatient_pool_year_data[['age_group', 'black']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'age_group':'age'})
                         .drop_duplicates().reset_index(drop=True))
In [96]:
estimates_pooled = (df_summary(inpatient_pool_year_trace, varnames=['λ'],
            stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                       lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)
Out[96]:
age race mean sd mc_error hpd_2.5 hpd_97.5 median hpd_25 hpd_75
0 5+ black 4.085 1.166 0.022 1.902 6.384 3.978 3.185 4.703
1 5+ white 7.109 1.272 0.028 4.769 9.627 7.055 6.281 7.990
2 2-4 black 19.782 5.080 0.118 10.649 29.775 19.245 15.546 22.065
3 2-4 white 29.073 4.485 0.090 20.202 37.472 28.845 25.940 31.733
4 0-1 black 90.983 14.169 0.337 64.011 119.772 90.464 80.813 98.960
5 0-1 white 126.747 11.787 0.277 106.351 151.540 126.593 120.010 136.382

Inpatient AGE estimates

In [97]:
inpatient_pool_year_table = generate_table(inpatient_pool_year_trace, 
                                           inpatient_pool_year_labels, 
                                           index=['race'], columns=['age'])

inpatient_pool_year_table.to_excel('results/inpatient_pool_year.xlsx')
inpatient_pool_year_table
Out[97]:
age 0-1 2-4 5+
race
black 90.98 (64.01, 119.77) 19.78 (10.65, 29.78) 4.08 (1.90, 6.38)
white 126.75 (106.35, 151.54) 29.07 (20.20, 37.47) 7.11 (4.77, 9.63)
In [98]:
inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['age_group', 'black']], 
    pd.DataFrame(inpatient_pool_year_trace['λ']).T], axis=1)
In [99]:
inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['age_group', 'black'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'black':{0:'white', 1:'black'}})
                    .rename(columns={'black':'race', 'age_group':'age group'}))

Plot of rates pooled across years

In [100]:
sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race", y="rate", hue="age group", 
        data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*rate_factor),
        palette="gray_r", linewidth=0.6, fliersize=0)
Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x11004c828>

Outpatient AGE Model

Import outpatient denominator information from REDCap

In [101]:
import redcap
import getpass

local_auth = True

api_url = 'https://redcap.vanderbilt.edu/api/'

if local_auth:
    api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()

else:
    api_key = getpass.getpass()
In [102]:
outpatient_project = redcap.Project(api_url, api_key)
In [103]:
outpatients = outpatient_project.export_records(format='df')
In [104]:
outpatients.groupby('year').year.count()
Out[104]:
year
2012    15969
2015    17562
Name: year, dtype: int64
In [105]:
outpatients.shape
Out[105]:
(33531, 15)
In [106]:
outpatients.head()
Out[106]:
mrn year dateextracted timeextracted ptname ptdob ptsex ptrace ptethnicity ptage ptcounty datetosubstract ageinjan datetosubstract2 ageinnov
record_id
A1 9030842 2012 4/29/2016 10:16 VEACH, ELIJAH 1993-11-29 M W NH 22.42 Davidson-TN 1/1/2012 18.088980 11/30/2012 19.003422
A2 11416534 2012 4/29/2016 10:16 MCGRAW, TRINEN TYRONE 1998-09-04 M B NH 17.65 Davidson-TN 1/1/2012 13.325120 11/30/2012 14.239562
A3 12265690 2012 4/29/2016 10:16 RANDOLPH, JONATHAN THOMAS 1996-07-30 M W NH 19.75 Robertson - TN 1/1/2012 15.422313 11/30/2012 16.336756
A4 12270971 2012 4/29/2016 10:16 POINTER, LAQUINCAS L 1997-02-08 M B NaN 19.22 Davidson-TN 1/1/2012 14.893908 11/30/2012 15.808350
A5 12750493 2012 4/29/2016 10:16 HUMPHREY, NORMA 1995-12-15 F B NaN 20.37 Davidson-TN 1/1/2012 16.046543 11/30/2012 16.960986
In [107]:
outpatients[['ageinjan', 'ageinnov']].hist(bins=20)
Out[107]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12b8f0550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x128b06be0>]], dtype=object)

We assume the outlier ages above were mistakenly entered as months, and rescale them to years.

In [108]:
outliers = outpatients.ageinjan > 20
outpatients.loc[outliers, ['ageinjan', 'ageinnov']] = outpatients.loc[outliers, ['ageinjan', 'ageinnov']] / 12
In [109]:
outpatients[['ageinjan', 'ageinnov']].hist(bins=20)
Out[109]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x128a12160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1289ed898>]], dtype=object)
In [110]:
outpatients['age'] = np.maximum(outpatients[['ageinjan', 'ageinnov']].mean(1), 0.1)
In [111]:
outpatients_kids = outpatients[outpatients.age<18].copy()
In [112]:
outpatients_kids['under_5'] = (outpatients_kids.age<5).astype(int)
outpatients_kids['under_2'] = (outpatients_kids.age<2).astype(int)
In [113]:
outpatients_kids['age_group'] = 2
outpatients_kids.loc[outpatients_kids.under_5==1, 'age_group'] = 1
outpatients_kids.loc[outpatients_kids.under_2==1, 'age_group'] = 0
In [114]:
outpatients_kids.groupby('year').age.hist(alpha=0.3)
Out[114]:
year
2012    Axes(0.125,0.125;0.775x0.755)
2015    Axes(0.125,0.125;0.775x0.755)
Name: age, dtype: object
In [115]:
outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                        .groupby(['year', 'age_group'])
                        .n.count()
                        .reset_index())
outpatient_n
Out[115]:
year age_group n
0 2012 0 2889
1 2012 1 4446
2 2012 2 8605
3 2015 0 3054
4 2015 1 4307
5 2015 2 9870

Interpolate 2013 counts from the 2012 and 2015 extracts

In [116]:
interp_2013 = (outpatient_n.groupby(['age_group'])
                           .apply(lambda x: np.interp(2013, x.year, x.n))
                           .astype(int)
                           .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013
Out[116]:
age_group n year
0 0 2944 2013
1 1 4399 2013
2 2 9026 2013
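
With only the 2012 and 2015 extracts available, np.interp places 2013 one third of the way from 2012 to 2015; a quick arithmetic check for age group 0 (a sketch):

In [ ]:
# One third of the way from the 2012 count (2889) to the 2015 count (3054).
(2 * 2889 + 3054) / 3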

Drop 2015 and recode year

In [117]:
outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n
Out[117]:
age_group n year
0 0 2889 0
1 1 4446 0
2 2 8605 0
0 0 2944 1
1 1 4399 1
2 2 9026 1
In [118]:
outpatient_data_full = setting_data[OUTPATIENT].groupby(['age_group','year']).cases.sum().reset_index()
In [119]:
outpatient_data_full
Out[119]:
age_group year cases
0 0 0 187
1 0 1 185
2 0 2 179
3 1 0 120
4 1 1 119
5 1 2 116
6 2 0 152
7 2 1 124
8 2 2 92
In [120]:
outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['age_group','year'])
In [121]:
outpatient_merged
Out[121]:
age_group year cases n
0 0 0 187 2889
1 0 1 185 2944
2 1 0 120 4446
3 1 1 119 4399
4 2 0 152 8605
5 2 1 124 9026
In [122]:
def generate_outpatient_model(outpatient_data, pool_years=False):
    
    assert not outpatient_data.isnull().sum().any()

    with Model() as model:

        cases,n = outpatient_data[['cases','n']].values.T
        
        shape = outpatient_data.shape[0]
        
        # Cases weighted by monitoring effort
        weights = monitoring_rate[OUTPATIENT] 

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=shape)
        
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        

        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=outpatient_data.shape[0])

            # Poisson rate parameter
            λ = Deterministic('λ', tt.exp(θ))
                      
            # Poisson likelihood
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model
In [123]:
outpatient_model_full = generate_outpatient_model(outpatient_merged)
In [124]:
with outpatient_model_full:
    outpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 1,053:   6%|▌         | 11993/200000 [00:01<00:15, 12142.93it/s]  
Convergence archived at 12800
Interrupted at 12,800 [6%]: Average Loss = 7,340.5
100%|██████████| 2000/2000 [00:03<00:00, 535.76it/s]
In [125]:
outpatient_data_labels = (inpatient_pool_race_data[['age_group', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'age_group':'age'}))
outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])]
outpatient_data_labels.columns = ['age', 'year']
In [126]:
outpatient_data_labels
Out[126]:
age year
0 0-1 2012/13
1 0-1 2013/14
3 2-4 2012/13
4 2-4 2013/14
6 5+ 2012/13
7 5+ 2013/14

Outpatient AGE estimates

In [127]:
outpatient_full_table = generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['λ'], 
               index=['age'], columns=['year'])

outpatient_full_table.to_excel('results/outpatient_pool_race.xlsx')
outpatient_full_table
Out[127]:
year 2012/13 2013/14
age
0-1 1132.37 (975.65, 1297.42) 1097.63 (949.24, 1257.14)
2-4 472.92 (391.60, 562.46) 472.22 (393.97, 549.70)
5+ 308.94 (261.60, 358.79) 240.12 (199.26, 288.89)
In [128]:
outpatient_full_samples = pd.concat(
        [outpatient_merged[['age_group', 'year']].reset_index(drop=True), 
           pd.DataFrame(outpatient_full_trace['λ']).T], axis=1)
In [129]:
outpatient_full_samples = (pd.melt(outpatient_full_samples, 
                                       id_vars=['age_group', 'year'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'age_group':'age group'}))
In [130]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x128642710>

ED AGE Model

In [131]:
ED2 = pd.read_excel('data/FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('data/Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns
In [132]:
ED4 = pd.read_excel('data/Year 4 denominators.xlsx', index_col=0)
ED4 = ED4.fillna(ED4.mean())
ED4.columns = ED3.columns
In [133]:
ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [134]:
ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [135]:
ED4_total = ED4.sum()
ED4_eligible = (ED4_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED4_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED4_visits = (ED4_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED4_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [136]:
ED_all = pd.concat([ED2, ED3, ED4])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]
In [137]:
ED_counts = ED_all[eligible_cols].sum(1).astype(int)
In [138]:
rotavirus_ed = rotavirus_data[rotavirus_data.setting==ED]
surveillance_counts = rotavirus_data.groupby(['scrdate'])['caseid'].count()
In [139]:
ED_counts.name = 'count24'
surveillance_counts.name = 'count8'
In [140]:
surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]

Here is what the ratio of 8-hour to 24-hour counts looks like when resampled to weekly:

In [141]:
weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W')
(weekly_surveillance.count8.mean() / weekly_surveillance.count24.mean()).hist()
Out[141]:
<matplotlib.axes._subplots.AxesSubplot at 0x12a308a20>

We will simply use the proportion from the pooled surveillance counts.

In [142]:
surveillance_total = surveillance.sum()
surveillance_total
Out[142]:
count24    3935.0
count8     2493.0
dtype: float64
In [143]:
surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion
Out[143]:
0.63354510800508257
In [144]:
def generate_ed_model(ED_data):
    
    assert not ED_data.isnull().sum().any()

    with Model() as model:
    
        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        # Weights according to ED monitoring intensity, market share and 8-hour surveillance
        weights = (np.array(monitoring_rate)[ED] 
                         * np.array(ed_market_share)[ED_data.year]
                         * surveillance_proportion)

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=data_len)
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)


        θ = Normal('θ', 0, sd=1000, shape=data_len)

        # Poisson rate parameter
        λ = Deterministic('λ', tt.exp(θ))
        
        # Poisson likelihood
        AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model
In [145]:
ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                                    .groupby(['age_group', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
ed_pool_race_data
Out[145]:
age_group year cases n
0 0 0 310 16218
1 0 1 387 16404
2 0 2 324 16564
3 1 0 199 24330
4 1 1 173 24609
5 1 2 165 24847
6 2 0 219 85104
7 2 1 206 86434
8 2 2 196 87175

Back-of-the-envelope rate calculation

In [146]:
ed_pool_race_data.assign(empirical_rate=rate_factor * ed_pool_race_data.cases/(ed_pool_race_data.n 
                                                                 * np.array(monitoring_rate)[ED] 
                                                                 * surveillance_proportion
                                                                 * np.array(ed_market_share)[ed_pool_race_data.year]))
Out[146]:
age_group year cases n empirical_rate
0 0 0 310 16218 703.133099
1 0 1 387 16404 825.837636
2 0 2 324 16564 673.851804
3 1 0 199 24330 300.873621
4 1 1 173 24609 246.085261
5 1 2 165 24847 228.767640
6 2 0 219 85104 94.660171
7 2 1 206 86434 83.428815
8 2 2 196 87175 77.454869
In [147]:
ed_model = generate_ed_model(ed_pool_race_data)
In [148]:
with ed_model:
    ed_trace = sample(1000, tune=tune, njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 8,128.4:   8%|▊         | 15674/200000 [00:01<00:16, 11297.71it/s]  
Convergence archived at 16800
Interrupted at 16,800 [8%]: Average Loss = 53,496
100%|██████████| 2000/2000 [00:05<00:00, 398.10it/s]
In [149]:
forestplot(ed_trace, varnames=['λ'])
Out[149]:
<matplotlib.gridspec.GridSpec at 0x12dd7e3c8>
In [150]:
ed_data_labels = (ed_pool_race_data[['age_group', 'year']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'age_group':'age'}))

ED AGE estimates

In [151]:
ed_table = generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])

ed_table.to_excel('results/ed_pool_race.xlsx')
ed_table
Out[151]:
age 0-1 2-4 5+
year
2012/13 703.48 (631.69, 782.60) 301.48 (263.52, 344.45) 94.73 (82.72, 107.80)
2013/14 826.17 (742.86, 904.69) 245.95 (208.05, 282.66) 83.29 (72.36, 94.93)
2014/15 673.15 (602.54, 749.37) 228.58 (191.78, 261.12) 77.28 (67.22, 87.96)

Create data frame of MCMC samples for plotting

In [152]:
ed_rate_samples = pd.concat([ed_pool_race_data[['age_group', 'year']], 
           pd.DataFrame(ed_trace['λ']).T], axis=1)
In [153]:
ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['age_group', 'year'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'age_group':'age group'}))
In [154]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
Out[154]:
<matplotlib.axes._subplots.AxesSubplot at 0x12c54cbe0>

Virus Rate Models

In [155]:
collected_stools = (rotavirus_data.groupby(['age_group', 'black', 'year', 'setting'])
                        .agg({'stool_collected':['sum', 'count']}))

levels = collected_stools.columns.levels
labels = collected_stools.columns.labels
collected_stools.columns = ['stool_collected', 'enrolled']

collected_stools = collected_stools.reset_index()
collected_stools.head()
Out[155]:
age_group black year setting stool_collected enrolled
0 0 False 0 0 35.0 35
1 0 False 0 1 154.0 173
2 0 False 0 2 112.0 119
3 0 False 0 3 157.0 174
4 0 False 1 0 39.0 40
In [156]:
virus_summary = (rotavirus_data.dropna(subset=virus_cols)
                 
                 .groupby(['age_group', 'black', 'year', 'setting', 'rotavirus', 'sapovirus',
                                        'astrovirus', 'norovirus'])['caseid'].count()
                 .reset_index()
                 .rename(columns={'caseid':'cases'}))

virus_summary.head()
Out[156]:
age_group black year setting rotavirus sapovirus astrovirus norovirus cases
0 0 False 0 0 0.0 0.0 0.0 0.0 19
1 0 False 0 0 0.0 0.0 0.0 1.0 6
2 0 False 0 0 0.0 1.0 0.0 0.0 1
3 0 False 0 0 1.0 0.0 0.0 0.0 8
4 0 False 0 0 1.0 0.0 1.0 0.0 1
In [157]:
merged_virus_data = (virus_summary.merge(collected_stools, 
                                         on=['age_group', 'black', 'year', 'setting'])
                                  .merge(census_clean, 
                                         on=['age_group', 'year', 'black'], 
                                        how='left'))
merged_virus_data.head(10)
Out[157]:
age_group black year setting rotavirus sapovirus astrovirus norovirus cases stool_collected enrolled n
0 0 False 0 0 0.0 0.0 0.0 0.0 19 35.0 35 10569
1 0 False 0 0 0.0 0.0 0.0 1.0 6 35.0 35 10569
2 0 False 0 0 0.0 1.0 0.0 0.0 1 35.0 35 10569
3 0 False 0 0 1.0 0.0 0.0 0.0 8 35.0 35 10569
4 0 False 0 0 1.0 0.0 1.0 0.0 1 35.0 35 10569
5 0 False 0 1 0.0 0.0 0.0 0.0 91 154.0 173 10569
6 0 False 0 1 0.0 0.0 0.0 1.0 25 154.0 173 10569
7 0 False 0 1 0.0 0.0 1.0 0.0 4 154.0 173 10569
8 0 False 0 1 0.0 0.0 1.0 1.0 1 154.0 173 10569
9 0 False 0 1 0.0 1.0 0.0 0.0 16 154.0 173 10569
In [158]:
any_virus = (merged_virus_data.groupby(['black','age_group', 'year', 'setting'])
     .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})).sum(level=['age_group', 'year', 'setting'])
any_virus
Out[158]:
cases stool_collected enrolled n
age_group year setting
0 0 0 51 51.0 51 16218
1 266 268.0 310 16218
2 172 172.0 187 16218
3 249 250.0 301 16218
1 0 52 52.0 53 16404
1 334 334.0 387 16404
2 157 157.0 185 16404
3 174 174.0 230 16404
2 0 50 50.0 53 16564
1 267 268.0 324 16564
2 157 158.0 179 16564
3 159 159.0 202 16564
1 0 0 21 21.0 25 24330
1 150 150.0 199 24330
2 91 92.0 120 24330
3 130 130.0 195 24330
1 0 15 15.0 18 24609
1 119 119.0 173 24609
2 82 82.0 119 24609
3 71 71.0 104 24609
2 0 5 5.0 10 24847
1 107 107.0 165 24847
2 76 76.0 116 24847
3 62 62.0 100 24847
2 0 0 13 13.0 20 85104
1 141 141.0 219 85104
2 115 115.0 152 85104
3 137 137.0 210 85104
1 0 13 13.0 15 86434
1 148 148.0 206 86434
2 84 84.0 124 86434
3 80 80.0 126 86434
2 0 4 4.0 5 52582
1 130 131.0 196 87175
2 66 66.0 92 87175
3 64 64.0 92 87175

All possible combinations of age, year and setting for index.

In [159]:
complete_index = pd.MultiIndex.from_product([range(3), range(3), range(4)], names=['age_group', 'year', 'setting'])
In [160]:
sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))

rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))
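
The four blocks above are nearly identical; an equivalent, more compact construction (a sketch) builds them in a loop:

In [ ]:
# Equivalent construction of the per-virus tables in a loop (sketch).
def virus_counts(virus):
    return (merged_virus_data[merged_virus_data[virus] == 1]
                .groupby(['age_group', 'year', 'setting'])
                .agg({'cases': sum, 'stool_collected': max, 'enrolled': max, 'n': max})
                .reindex(complete_index)
                .assign(n=any_virus.n, enrolled=any_virus.enrolled,
                        stool_collected=any_virus.stool_collected)
                .fillna(0).astype(int))

sapo_data, rota_data, astro_data, noro_data = [
    virus_counts(v) for v in ('sapovirus', 'rotavirus', 'astrovirus', 'norovirus')]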
In [161]:
virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
           noro_data.reset_index().assign(virus='norovirus'),
           rota_data.reset_index().assign(virus='rotavirus'),
           sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)

Inpatient virus rate model

In [205]:
virus_lookup = {0:'norovirus',
                1:'rotavirus',
                2:'sapovirus',
                3:'astrovirus'}
In [162]:
virus_inpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==INPATIENT].reset_index(drop=True)
In [163]:
cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T
In [164]:
with Model() as virus_inpatient_model:
    
    groups = virus_inpatient_data.shape[0]
    
    # Probability of stool collection
    ψ = Beta('ψ', 10, 1, shape=3)
    
    # Age-group specific enrolled inpatients, and inpatients with collected stool
    inpatient_enrolled = rotavirus_data.groupby('age_group').apply(lambda x: (x.setting==INPATIENT).sum()).values

    collected = (rotavirus_data.groupby('age_group')
                               .apply(lambda x: x[x.setting==INPATIENT].stool_collected.sum())
                               .values)

    stool_likelihood = Binomial('stool_likelihood', n=inpatient_enrolled, p=ψ, 
                                observed=collected)

    # Extract data columns
    cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T

    enrolled_cases = DiscreteUniform('enrolled_cases', lower=cases, upper=enrolled, shape=groups,
                                    testval=(cases/ψ[age_group.astype(int)]).astype('int32'))  

    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', 
                                         n=enrolled_cases, 
                                         p=ψ[age_group.astype(int)], 
                                         observed=cases)

    # Calculate an "effective N" that corresponds to sampled population
    weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year.astype(int)]

    n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, shape=groups)
    
    davidson_cases_likelihood = Potential('davidson_cases_likelihood', 
                          Binomial.dist(n=n, p=weights).logp(n_eff.astype('int32')).sum())

    # Population rate
    θ = Normal('θ', 0, sd=10, shape=groups)

    λ = Deterministic('λ', tt.exp(θ))

    # Data likelihood
    virus_obs = Potential('virus_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases))
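
For reference, the empirical stool-collection proportions among enrolled inpatients by age group (a sketch), which the posterior of ψ below should roughly track:

In [ ]:
# Observed proportion of inpatient enrollees with a stool specimen, by age group.
(rotavirus_data[rotavirus_data.setting == INPATIENT]
     .groupby('age_group').stool_collected.mean())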
In [165]:
with virus_inpatient_model:
    virus_inpatient_trace = sample(iterations, tune=tune, njobs=2,  
                                   random_seed=seed_numbers)
Assigned NUTS to ψ_logodds__
Assigned Metropolis to enrolled_cases
Assigned Metropolis to n_eff
Assigned NUTS to θ
100%|██████████| 2000/2000 [00:52<00:00, 37.85it/s]
/Users/fonnescj/Repos/pymc3/pymc3/step_methods/hmc/nuts.py:463: UserWarning: Chain 1 contains 2 diverging samples after tuning. If increasing `target_accept` does not help try to reparameterize.
  % (self._chain_id, n_diverging))

Posterior distribution of stool sampling rate

In [166]:
forestplot(virus_inpatient_trace, varnames=['ψ'])
Out[166]:
<matplotlib.gridspec.GridSpec at 0x133073fd0>
In [167]:
traceplot(virus_inpatient_trace, varnames=['ψ'])
Out[167]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12c2eb860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12c62ed68>]], dtype=object)
In [168]:
forestplot(virus_inpatient_trace, varnames=['λ'])
Out[168]:
<matplotlib.gridspec.GridSpec at 0x12c50c860>

Inpatient virus rates, plotted and tabulated

In [169]:
inpatient_sample_melted = pd.melt(pd.DataFrame(virus_inpatient_trace['λ']))
In [170]:
inpatient_virus_samples = (virus_inpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                 .merge(inpatient_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))
In [171]:
sns.set(style="ticks")

inpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=inpatient_virus_samples.assign(rate=inpatient_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

inpatient_rateplot.despine(left=True)
Out[171]:
<seaborn.axisgrid.FacetGrid at 0x1332aa550>
In [172]:
virus_inpatient_data.to_excel('inpatient_virus_data.xlsx')
In [173]:
inpatient_virus_labels = (virus_inpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))
In [174]:
labels = inpatient_virus_labels
trace = virus_inpatient_trace
varnames=['λ']
In [175]:
labels.head()
Out[175]:
age group year virus
0 0-1 2012/13 astrovirus
1 0-1 2013/14 astrovirus
2 0-1 2014/15 astrovirus
3 2-4 2012/13 astrovirus
4 2-4 2013/14 astrovirus
In [176]:
rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                      names=labels.columns)
rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
#rate_df.index = rate_df_index
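
The cell above assembles the ingredients that `generate_table` (defined earlier in the notebook) presumably combines: scaled posterior summaries plus the group labels. A hypothetical sketch of that formatting step, for illustration only (the function body below is an assumption, not the notebook's actual helper):

# Hypothetical sketch of a generate_table-style helper: format each group's
# scaled posterior as "mean (lower, upper)" and pivot viruses into columns.
def format_rate_table(trace, labels, varnames=['λ'],
                      index=['age group', 'year'], columns=['virus']):
    summary = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    summary.index = pd.MultiIndex.from_tuples(labels.values.tolist(), names=list(labels.columns))
    cells = summary.apply(lambda r: '{:.2f} ({:.2f}, {:.2f})'.format(
        r['mean'], r['hpd_2.5'], r['hpd_97.5']), axis=1)
    # The levels listed in `columns` become table columns; the remaining
    # label levels (the `index` levels) stay as the row index.
    return cells.unstack(columns)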
In [177]:
virus_inpatient_table = generate_table(virus_inpatient_trace, inpatient_virus_labels, 
                                       index=['age group', 'year'], 
                                   columns=['virus']).sort_index()
virus_inpatient_table.to_excel('results/virus_inpatient_table.xlsx')
In [178]:
virus_inpatient_table
Out[178]:
virus astrovirus norovirus rotavirus sapovirus
age group year
0-1 2012/13 2.52 (0.09, 5.94) 8.63 (3.20, 15.26) 14.45 (6.59, 22.23) 2.49 (0.12, 5.86)
2013/14 8.51 (2.58, 14.60) 24.01 (13.66, 34.35) 7.19 (2.11, 13.11) 8.61 (2.81, 14.42)
2014/15 0.18 (0.00, 1.06) 14.39 (6.80, 22.29) 7.13 (1.76, 12.72) 2.48 (0.11, 5.98)
2-4 2012/13 0.13 (0.00, 0.67) 3.32 (0.65, 6.44) 2.52 (0.26, 5.17) 0.88 (0.00, 2.56)
2013/14 0.87 (0.00, 2.53) 0.87 (0.00, 2.53) 0.90 (0.00, 2.71) 0.88 (0.00, 2.64)
2014/15 0.87 (0.00, 2.49) 1.67 (0.09, 4.00) 0.12 (0.00, 0.62) 0.12 (0.00, 0.64)
5+ 2012/13 0.04 (0.00, 0.21) 0.26 (0.00, 0.78) 0.25 (0.00, 0.74) 0.03 (0.00, 0.17)
2013/14 0.04 (0.00, 0.21) 0.72 (0.07, 1.50) 0.25 (0.00, 0.74) 0.48 (0.02, 1.11)
2014/15 0.06 (0.00, 0.32) 0.06 (0.00, 0.34) 0.41 (0.00, 1.19) 0.06 (0.00, 0.34)

Outpatient virus rate model

In [179]:
virus_outpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==OUTPATIENT]
In [180]:
with Model() as virus_outpatient_model:
    
    groups = virus_outpatient_data.shape[0]
    
    # Probability of stool collection
    ψ = Beta('ψ', 1, 1)
    outpatient_enrolled = (rotavirus_data.setting==OUTPATIENT).sum()
    collected = rotavirus_data[rotavirus_data.setting==OUTPATIENT].stool_collected.sum()
    stool = Binomial('stool', n=outpatient_enrolled, p=ψ, observed=collected)

    cases, enrolled, n = virus_outpatient_data[['cases', 'enrolled', 'n']].values.T
    
    enrolled_cases_out = Uniform('enrolled_cases_out', lower=cases, upper=enrolled, 
                                     shape=groups)
    
    p = monitoring_rate[OUTPATIENT]*ψ

    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', 
                                         n=enrolled_cases_out, 
                                         p=p,
                                         observed=cases)
    

    π = Beta('π', 1, 10, shape=groups)

    # Data likeihood
    virus_out_obs = Potential('virus_out_obs', 
              Binomial.dist(n=n, p=π).logp(enrolled_cases_out).sum())
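
As with the inpatient model, here is a rough sketch of this cell's structure (added as a reading aid, not part of the original notebook); the outpatient model works with a per-group proportion rather than a population rate,

$$
\begin{aligned}
\psi &\sim \mathrm{Beta}(1, 1), \qquad \mathrm{collected} \sim \mathrm{Binomial}(\mathrm{enrolled}_{\mathrm{outpatient}},\, \psi) \\
c_g &\sim \mathrm{Uniform}(\mathrm{cases}_g,\, \mathrm{enrolled}_g), \qquad \mathrm{cases}_g \sim \mathrm{Binomial}(c_g,\, m_{\mathrm{out}}\, \psi) \\
\pi_g &\sim \mathrm{Beta}(1, 10), \qquad c_g \sim \mathrm{Binomial}(n_g,\, \pi_g)
\end{aligned}
$$

where $c_g$ is the latent enrolled-case count (`enrolled_cases_out`) and $m_{\mathrm{out}}$ is the outpatient monitoring rate.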
        
In [181]:
with virus_outpatient_model:
    virus_outpatient_trace = sample(iterations, tune=tune,
                                    njobs=2, random_seed=seed_numbers)
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = 463.64:  25%|██▍       | 49397/200000 [00:08<00:23, 6289.74it/s]   
Convergence archived at 49700
Interrupted at 49,700 [24%]: Average Loss = 10,288
100%|██████████| 2000/2000 [00:09<00:00, 210.43it/s]

Posterior distribution of stool sampling rate

In [182]:
traceplot(virus_outpatient_trace, varnames=['ψ'])
Out[182]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12e03c0f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12dd58358>]], dtype=object)

Outpatient virus rates, plotted and tabulated

In [183]:
outpatient_sample_melted = pd.melt(pd.DataFrame(virus_outpatient_trace['π']))
In [184]:
outpatient_virus_samples = (virus_outpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                 .merge(outpatient_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))
In [185]:
outpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=outpatient_virus_samples.assign(rate=outpatient_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

outpatient_rateplot.despine(left=True)
Out[185]:
<seaborn.axisgrid.FacetGrid at 0x12ded8320>
In [186]:
outpatient_virus_labels = (virus_outpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))
In [187]:
virus_outpatient_table = generate_table(virus_outpatient_trace, outpatient_virus_labels, 
                                        index=['age group', 'year'], varnames=['π'],
               columns=['virus']).sort_index()
virus_outpatient_table.to_excel('results/virus_outpatient_table.xlsx')
In [188]:
virus_outpatient_table
Out[188]:
virus astrovirus norovirus rotavirus sapovirus
age group year
0-1 2012/13 15.19 (6.44, 24.09) 48.36 (33.08, 64.25) 29.08 (17.63, 42.37) 30.54 (17.76, 43.59)
2013/14 17.79 (8.59, 27.31) 60.16 (42.72, 78.12) 19.22 (9.65, 29.94) 35.58 (22.51, 48.96)
2014/15 5.42 (1.03, 10.70) 76.13 (56.90, 96.09) 18.90 (9.87, 28.98) 19.00 (10.33, 29.45)
2-4 2012/13 12.01 (6.50, 18.58) 15.66 (8.28, 23.27) 12.95 (6.97, 19.95) 10.15 (4.38, 15.94)
2013/14 8.28 (3.42, 14.06) 17.40 (9.79, 25.14) 3.65 (0.79, 7.29) 11.89 (6.04, 18.44)
2014/15 4.47 (0.96, 8.30) 19.03 (11.46, 27.08) 8.19 (3.67, 13.92) 4.56 (1.18, 8.47)
5+ 2012/13 1.84 (0.59, 3.19) 4.49 (2.54, 6.75) 1.85 (0.60, 3.18) 2.93 (1.39, 4.69)
2013/14 0.81 (0.08, 1.66) 4.16 (2.23, 6.20) 1.06 (0.20, 2.06) 2.35 (0.84, 3.86)
2014/15 0.56 (0.04, 1.33) 5.44 (3.21, 7.87) 1.29 (0.32, 2.47) 0.80 (0.12, 1.67)

ED virus rate model

In [189]:
virus_ed_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==ED]
In [190]:
with Model() as virus_ed_model:
        
    # Probability of stool collection
    ψ = Beta('ψ', 1, 1)
    ed_enrolled = (rotavirus_data.setting==ED).sum()
    collected = rotavirus_data[rotavirus_data.setting==ED].stool_collected.sum()
    stool = Binomial('stool', n=ed_enrolled, p=ψ, observed=collected)

    # One group per age group x year x virus combination
    groups = virus_ed_data.shape[0]

    cases, enrolled, year, n = virus_ed_data[['cases', 'enrolled', 'year', 'n']].values.T
    
    enrolled_cases = Uniform('enrolled_cases', lower=cases, upper=enrolled, 
                                     shape=groups)

    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', 
                                         n=enrolled_cases, 
                                         p=ψ, 
                                         observed=cases)
        
    ω = (np.array(monitoring_rate)[ED] 
                     * np.array(ed_market_share)[year.astype(int)]
                     * surveillance_proportion)

    n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, 
                                     shape=groups)
    n_eff_likelihood = Potential('davidson_cases_likelihood', 
                          Binomial.dist(n=n, 
                                        p=ω).logp(n_eff).sum())


    # Population rate
    θ = Normal('θ', 0, sd=10, shape=groups)

    λ = Deterministic('λ', tt.exp(θ))

    # Data likelihood
    virus_ed_obs = Potential('virus_ed_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases))
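
For reference (again a reading aid, not part of the original notebook), the sampling weight used above is the product of the ED monitoring rate, the year-specific ED market share, and the surveillance proportion,

$$
\omega_g = m_{\mathrm{ED}} \times s_{\mathrm{ED},\,\mathrm{year}(g)} \times p_{\mathrm{surveillance}},
$$

and the remainder of the model mirrors the inpatient one: $n^{\mathrm{eff}}_g \sim \mathrm{Binomial}(n_g,\, \omega_g)$ restricted to $[\mathrm{cases}_g, n_g]$, $\theta_g \sim \mathrm{Normal}(0,\, 10^2)$ with $\lambda_g = e^{\theta_g}$, and a $\mathrm{Poisson}(\lambda_g\, n^{\mathrm{eff}}_g)$ likelihood for the latent enrolled cases.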
In [191]:
with virus_ed_model:
    virus_ed_trace = sample(iterations, tune=tune,
                            njobs=2, random_seed=seed_numbers)
Assigned NUTS to ψ_logodds__
Assigned NUTS to enrolled_cases_interval__
Assigned Metropolis to n_eff
Assigned NUTS to θ
100%|██████████| 2000/2000 [00:33<00:00, 58.93it/s]
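
Because `n_eff` is discrete and sampled with Metropolis, it is worth confirming that the continuous parameters mixed well across the two chains. A minimal check, assuming the diagnostic function names of the PyMC3 3.x series used here:

# R-hat near 1 and reasonably large effective sample sizes are reassuring;
# gelman_rubin/effective_n are the PyMC3 3.x names for these diagnostics.
from pymc3 import gelman_rubin, effective_n

print(gelman_rubin(virus_ed_trace))
print(effective_n(virus_ed_trace))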

Posterior distribution of stool sampling rate

In [192]:
traceplot(virus_ed_trace, varnames=['ψ'])
Out[192]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x133ee56a0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1312b8710>]], dtype=object)

ED virus rate estimates, plotted and tabulated

In [193]:
ed_sample_melted = pd.melt(pd.DataFrame(virus_ed_trace['λ']))
In [194]:
ed_virus_samples = (virus_ed_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                 .merge(ed_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))
In [195]:
ed_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=ed_virus_samples.assign(rate=ed_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

ed_rateplot.despine(left=True)
Out[195]:
<seaborn.axisgrid.FacetGrid at 0x12c19be10>
In [196]:
ed_virus_labels = (virus_ed_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))
In [197]:
virus_ed_table = generate_table(virus_ed_trace, ed_virus_labels, 
                                        index=['age group', 'year'], 
               columns=['virus']).sort_index()
virus_ed_table.to_excel('results/virus_ed_table.xlsx')
In [198]:
virus_ed_table
Out[198]:
virus astrovirus norovirus rotavirus sapovirus
age group year
0-1 2012/13 18.19 (9.34, 29.89) 79.84 (56.37, 101.89) 42.82 (27.23, 59.08) 54.46 (35.18, 72.40)
2013/14 34.47 (20.02, 49.14) 136.46 (106.58, 165.19) 32.58 (18.74, 47.54) 62.20 (43.43, 81.33)
2014/15 11.64 (4.02, 20.53) 112.74 (87.71, 140.87) 35.49 (20.15, 49.75) 45.06 (29.65, 62.13)
2-4 2012/13 4.68 (0.97, 8.60) 30.53 (19.09, 41.69) 31.88 (20.76, 44.30) 26.27 (17.12, 38.00)
2013/14 8.75 (3.79, 14.98) 29.34 (19.34, 40.34) 8.86 (3.40, 14.84) 13.04 (6.29, 21.07)
2014/15 6.59 (1.98, 11.60) 26.67 (16.87, 37.48) 13.00 (6.54, 20.60) 10.87 (4.68, 17.59)
5+ 2012/13 1.62 (0.39, 2.91) 5.27 (2.77, 7.70) 7.42 (4.63, 10.30) 4.98 (2.85, 7.52)
2013/14 1.30 (0.28, 2.52) 9.16 (5.81, 12.42) 2.49 (0.94, 4.15) 2.76 (1.27, 4.73)
2014/15 1.02 (0.13, 2.09) 6.68 (4.06, 9.46) 3.95 (2.04, 6.18) 1.58 (0.46, 2.98)