%matplotlib inline import numpy as np import pandas as pd import seaborn as sns import matplotlib.pylab as plt sns.set_context('notebook') seed_numbers = 20090425, 19700903 from datetime import timedelta, datetime

Import data

In [2]:
rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique
Out[2]:
True

Rename columns and recode data

In [3]:
race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]

(rotavirus_data[race_cols]=='Checked').sum()
Out[3]:
Race (choice=White)                                     2249
Race (choice=Black/African American)                    1545
Race (choice=American Indian/Alaskan Native)               5
Race (choice=Asian)                                       86
Race (choice=Native Hawaiian/Other Pacific Islander)       5
Race (choice=Other)                                       18
Race (choice=Unknown)                                      3
Race (choice=None)                                         1
dtype: int64
In [4]:
rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)

rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)
In [5]:
rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)
In [6]:
rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovorus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)

rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns if col.startswith('Result')], axis=1)
In [7]:
rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])
In [162]:
pd.crosstab(rotavirus_data.black, rotavirus_data.hispanic)
Out[162]:
hispanic 0 1
black
0 719 1359
1 1357 17

Calculate age, and compare to given age.

In [8]:
rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
                              - rotavirus_data['Date of Birth'] + timedelta(1)).astype('timedelta64[M]') 
In [9]:
np.abs((rotavirus_data['Age in months (auto calculated)  '] - rotavirus_data['age_months']) > 1).sum()
Out[9]:
0

Group into ages under 5 and 5 or over

In [10]:
rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)

Calculate natural year

In [152]:
rotavirus_data['Enrollment Date'].sort_values(ascending=False).tail()
Out[152]:
1291   2012-12-02
1295   2012-12-02
1293   2012-12-02
1292   2012-12-02
1290   2012-12-02
Name: Enrollment Date, dtype: datetime64[ns]
In [11]:
rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()
Out[11]:
0    1215
1    1178
2    1059
Name: year, dtype: int64
In [154]:
rotavirus_data.setting.value_counts()
Out[154]:
0    2117
2    1190
1     145
Name: setting, dtype: int64

Recode sex and stool

In [12]:
rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'
In [13]:
# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth', 
                                      'Age in months (auto calculated)  ', 'Sex',
                                     'Sex updated', 'Was stool collected?',
                                     'Ethnicity', 'Year'], axis=1)
In [14]:
rotavirus_data = pd.concat([rotavirus_data, pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)

Summary of counts by setting

In [158]:
rotavirus_data.groupby(['year', 'setting'])['Record ID'].count()
Out[158]:
year  setting
0     0          716
      1           58
      2          441
1     0          736
      1           49
      2          393
2     0          665
      1           38
      2          356
Name: Record ID, dtype: int64
In [159]:
rotavirus_data.shape
Out[159]:
(3452, 18)
In [15]:
rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()
Out[15]:
0    2117
2    1190
1     145
Name: setting, dtype: int64
In [16]:
rotavirus_data.shape
Out[16]:
(3452, 18)
In [17]:
rotavirus_data.head()
Out[17]:
Record ID Enrollment Date white black hispanic rotavirus sapovirus astrovorus norovirus age_months under_5 year male stool_collected ED IP OP setting
0 EN1C-0001 2012-12-03 1 0 1 1 0 0 0 77.0 0 0 True True 0.0 0.0 1.0 2
1 EN1C-0002 2012-12-03 0 1 0 0 0 0 0 87.0 0 0 True False 0.0 0.0 1.0 2
2 EN1C-0003 2012-12-03 0 1 0 0 0 0 0 63.0 0 0 False False 0.0 0.0 1.0 2
3 EN1C-0004 2012-12-04 1 0 1 0 0 0 0 36.0 1 0 False True 0.0 0.0 1.0 2
4 EN1C-0006 2012-12-04 0 1 0 0 0 0 0 64.0 0 0 False False 0.0 0.0 1.0 2
In [18]:
rotavirus_data.isnull().sum()
Out[18]:
Record ID          0
Enrollment Date    0
white              0
black              0
hispanic           0
rotavirus          0
sapovirus          0
astrovorus         0
norovirus          0
age_months         0
under_5            0
year               0
male               0
stool_collected    0
ED                 0
IP                 0
OP                 0
setting            0
dtype: int64
In [19]:
rotavirus_data.describe()
Out[19]:
white black hispanic rotavirus sapovirus astrovorus norovirus age_months under_5 year ED IP OP setting
count 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000
mean 0.601970 0.398030 0.398610 0.108633 0.081112 0.037080 0.152665 44.912804 0.715527 0.954809 0.613268 0.042005 0.344728 0.731460
std 0.489563 0.489563 0.489683 0.311223 0.273047 0.188985 0.359717 45.485356 0.451228 0.810492 0.487072 0.200629 0.475348 0.941349
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 11.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 27.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 67.000000 1.000000 2.000000 1.000000 0.000000 1.000000 2.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 215.000000 1.000000 2.000000 1.000000 1.000000 1.000000 2.000000

Aggregate data by age group and setting

In [20]:
rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
             .groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
             .reset_index()
             .rename(columns={'Record ID':'cases'}))

rotavirus_summary.head()
Out[20]:
under_5 year setting black hispanic cases
0 0 0 0 0 0 57
1 0 0 0 0 1 62
2 0 0 0 1 0 99
3 0 0 0 1 1 1
4 0 0 1 0 0 4

Census data

The key for AGEGRP is as follows:

  • 0 = Total
  • 1 = Age 0 to 4 years
  • 2 = Age 5 to 9 years
  • 3 = Age 10 to 14 years
  • 4 = Age 15 to 19 years
  • 5 = Age 20 to 24 years
  • 6 = Age 25 to 29 years
  • 7 = Age 30 to 34 years
  • 8 = Age 35 to 39 years
  • 9 = Age 40 to 44 years
  • 10 = Age 45 to 49 years
  • 11 = Age 50 to 54 years
  • 12 = Age 55 to 59 years
  • 13 = Age 60 to 64 years
  • 14 = Age 65 to 69 years
  • 15 = Age 70 to 74 years
  • 16 = Age 75 to 79 years
  • 17 = Age 80 to 84 years
  • 18 = Age 85 years or older
In [21]:
davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()
Out[21]:
SUMLEV STATE COUNTY STNAME CTYNAME YEAR2008 YEAR AGEGRP TOT_POP TOT_MALE ... HWAC_MALE HWAC_FEMALE HBAC_MALE HBAC_FEMALE HIAC_MALE HIAC_FEMALE HAAC_MALE HAAC_FEMALE HNAC_MALE HNAC_FEMALE
0 50 47 37 Tennessee Davidson County 1 2008 0 626681 303540 ... 30400 24829 2178 2006 1501 1183 381 374 271 211
1 50 47 37 Tennessee Davidson County 1 2008 1 44691 22868 ... 3838 3664 302 303 185 189 63 53 26 35
2 50 47 37 Tennessee Davidson County 1 2008 2 37613 18977 ... 2910 2826 266 205 145 140 45 50 22 23
3 50 47 37 Tennessee Davidson County 1 2008 3 33904 17479 ... 2154 1890 166 170 105 91 25 29 18 16
4 50 47 37 Tennessee Davidson County 1 2008 4 38999 19674 ... 2029 1783 166 174 111 82 29 29 22 16

5 rows × 81 columns

Identify racial/ethnic subgroup populations

In [22]:
census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)
In [23]:
census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK', 
                                 'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
    .sum()).reset_index()

census_data['YEAR'] = census_data.YEAR - 2011
census_data
Out[23]:
under_5 YEAR HISPANIC_WHITE HISPANIC_BLACK NONHISPANIC_WHITE NONHISPANIC_BLACK
0 False 0 13889 891 47077 40334
1 False 1 14692 914 47527 40656
2 False 2 15653 983 47487 40556
3 True 0 7134 533 19290 13591
4 True 1 6994 566 19942 13511
5 True 2 6909 552 20479 13471

Reshape data

In [24]:
melted_census = (pd.melt(census_data, id_vars=['under_5', 'YEAR'], 
                    value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK', 
                            'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']))

census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
                    black=melted_census.variable.apply(lambda x: x.split('_')[1]))
                 .drop('variable', axis=1)
                 .replace({'black':{'BLACK':1, 'WHITE':0},
                          'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
                 .rename(columns={'YEAR':'year', 'value':'n'}))
                    
census_clean
Out[24]:
under_5 year n black hispanic
0 False 0 13889 0 1
1 False 1 14692 0 1
2 False 2 15653 0 1
3 True 0 7134 0 1
4 True 1 6994 0 1
5 True 2 6909 0 1
6 False 0 891 1 1
7 False 1 914 1 1
8 False 2 983 1 1
9 True 0 533 1 1
10 True 1 566 1 1
11 True 2 552 1 1
12 False 0 47077 0 0
13 False 1 47527 0 0
14 False 2 47487 0 0
15 True 0 19290 0 0
16 True 1 19942 0 0
17 True 2 20479 0 0
18 False 0 40334 1 0
19 False 1 40656 1 0
20 False 2 40556 1 0
21 True 0 13591 1 0
22 True 1 13511 1 0
23 True 2 13471 1 0
In [25]:
census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])
In [26]:
merged_data = (rotavirus_summary.merge(census_full, on=['under_5', 'year', 'black', 'hispanic','setting'], how='right')
                       .fillna(0).astype(int))
In [27]:
merged_data.head()
Out[27]:
under_5 year setting black hispanic cases n
0 0 0 0 0 0 57 47077
1 0 0 0 0 1 62 13889
2 0 0 0 1 0 99 40334
3 0 0 0 1 1 1 891
4 0 0 1 0 0 4 47077

VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

In [28]:
inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92
Y2012, Y2013, Y2014 = 0, 1, 2

Will estimate 2014 market shares by extrapolating a linear fit to the previous years.

In [29]:
from sklearn import linear_model
In [30]:
inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)
Out[30]:
[0.9, 0.83, 0.77500000000000002]
In [31]:
ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share
Out[31]:
[0.94, 0.92, 0.90500000000000014]

Enrollment days/week for ED, inpatient, outpatient

In [32]:
monitoring_days = 4, 7, 4

# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2

monitoring_rate = np.array(np.array(monitoring_days)/7)
monitoring_rate
Out[32]:
array([ 0.57142857,  1.        ,  0.57142857])
In [33]:
rotavirus_data[(rotavirus_data.hispanic==1) & (rotavirus_data.white==1) 
               & (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
              (rotavirus_data.under_5==1)].shape
Out[33]:
(157, 18)
In [34]:
setting_data = dict(list(merged_data.groupby('setting')))

Virus data

In [35]:
viruses = ['rotavirus','sapovirus','astrovorus','norovirus']
In [36]:
rotavirus_data.groupby(['under_5', 'year', 'setting']).stool_collected.sum().astype(int)
Out[36]:
under_5  year  setting
0        0     0          142
               1            8
               2          110
         1     0          150
               1            4
               2           82
         2     0          125
               1            1
               2           63
1        0     0          406
               1           44
               2          249
         1     0          439
               1           42
               2          220
         2     0          365
               1           35
               2          209
Name: stool_collected, dtype: int64
In [37]:
rotavirus_data[rotavirus_data.stool_collected].groupby(['under_5', 'year', 'setting'])[viruses].mean().round(2)
Out[37]:
rotavirus sapovirus astrovorus norovirus
under_5 year setting
0 0 0 0.24 0.11 0.04 0.09
1 0.12 0.00 0.00 0.12
2 0.12 0.09 0.05 0.12
1 0 0.14 0.07 0.03 0.18
1 0.25 0.00 0.00 0.25
2 0.07 0.09 0.02 0.15
2 0 0.19 0.05 0.02 0.11
1 1.00 0.00 0.00 0.00
2 0.11 0.03 0.02 0.22
1 0 0 0.16 0.13 0.03 0.16
1 0.23 0.02 0.02 0.14
2 0.15 0.12 0.08 0.18
1 0 0.12 0.12 0.06 0.23
1 0.19 0.17 0.17 0.31
2 0.08 0.16 0.08 0.25
2 0 0.15 0.09 0.04 0.22
1 0.17 0.06 0.00 0.23
2 0.10 0.08 0.03 0.29

Inpatient Model

In [38]:
from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary

import theano.tensor as tt
In [39]:
def generate_inpatient_model(inpatient_data, pool_years=False):

    with Model() as model:

        under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T

        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=inpatient_data.shape[0],
                                         testval=(cases/weights).astype(int))
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])

            λ = Deterministic('λ', tt.exp(θ))
                      
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model

Full (unpooled) model

In [40]:
inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))
In [41]:
with inpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 564.3 sec
In [42]:
plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])
Out[42]:
<matplotlib.gridspec.GridSpec at 0x11bd5e6d8>
In [43]:
label_map = {'under_5':{0:'5+', 1:'<5'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'hispanic':{0:'non-hispanic', 1:'hispanic'},
             'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}
In [44]:
inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))

Rate factor for population

In [45]:
rate_factor = 10000
In [46]:
rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)
In [47]:
inpatient_data_labels
Out[47]:
age year race ethnicity
0 5+ 2012/13 white non-hispanic
1 5+ 2012/13 white hispanic
2 5+ 2012/13 black non-hispanic
3 5+ 2013/14 white non-hispanic
4 5+ 2013/14 white hispanic
5 5+ 2014/15 white hispanic
6 5+ 2014/15 black non-hispanic
7 <5 2012/13 white non-hispanic
8 <5 2012/13 white hispanic
9 <5 2012/13 black non-hispanic
10 <5 2013/14 white non-hispanic
11 <5 2013/14 white hispanic
12 <5 2013/14 black non-hispanic
13 <5 2014/15 white non-hispanic
14 <5 2014/15 white hispanic
15 <5 2014/15 black non-hispanic
16 5+ 2012/13 black hispanic
17 5+ 2013/14 black hispanic
18 5+ 2014/15 black hispanic
19 <5 2012/13 black hispanic
20 <5 2013/14 black hispanic
21 <5 2014/15 black hispanic
22 5+ 2014/15 white non-hispanic
23 5+ 2013/14 black non-hispanic
In [48]:
from pymc3 import hpd
def _hpd_df(x, alpha):
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)
In [49]:
estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
                stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                           lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
pd.concat([inpatient_data_labels, estimates], axis=1)
Out[49]:
age year race ethnicity mean sd mc_error hpd_2.5 hpd_97.5 median hpd_25 hpd_75
0 5+ 2012/13 white non-hispanic 0.845 0.430 0.009 0.205 1.772 0.773 0.396 0.895
1 5+ 2012/13 white hispanic 1.429 0.984 0.020 0.094 3.421 1.197 0.271 1.329
2 5+ 2012/13 black non-hispanic 1.004 0.500 0.012 0.172 1.917 0.915 0.422 1.034
3 5+ 2013/14 white non-hispanic 0.418 0.299 0.007 0.017 1.024 0.352 0.063 0.376
4 5+ 2013/14 white hispanic 2.060 1.180 0.028 0.259 4.427 1.834 0.937 2.254
5 5+ 2014/15 white hispanic 0.605 0.641 0.014 0.000 1.775 0.405 0.000 0.405
6 5+ 2014/15 black non-hispanic 0.253 0.247 0.006 0.000 0.738 0.186 0.000 0.186
7 <5 2012/13 white non-hispanic 11.429 2.432 0.047 7.174 16.208 11.257 9.176 12.480
8 <5 2012/13 white hispanic 22.628 5.570 0.139 12.106 33.637 22.098 16.911 24.050
9 <5 2012/13 black non-hispanic 10.254 2.636 0.051 5.234 15.231 10.017 7.652 11.122
10 <5 2013/14 white non-hispanic 14.066 2.614 0.051 9.129 19.126 13.886 12.318 15.786
11 <5 2013/14 white hispanic 17.146 4.843 0.098 8.274 26.776 16.796 12.789 19.115
12 <5 2013/14 black non-hispanic 8.824 2.480 0.050 4.343 13.940 8.641 7.006 10.218
13 <5 2014/15 white non-hispanic 6.849 1.850 0.036 3.491 10.579 6.633 5.111 7.435
14 <5 2014/15 white hispanic 30.361 6.633 0.141 18.231 43.795 29.874 25.511 34.154
15 <5 2014/15 black non-hispanic 7.424 2.293 0.044 3.092 11.894 7.274 5.583 8.557
16 5+ 2012/13 black hispanic 0.005 0.151 0.003 0.000 0.000 0.000 0.000 0.000
17 5+ 2013/14 black hispanic 0.006 0.135 0.004 0.000 0.000 0.000 0.000 0.000
18 5+ 2014/15 black hispanic 0.004 0.165 0.004 0.000 0.000 0.000 0.000 0.000
19 <5 2012/13 black hispanic 0.029 0.910 0.020 0.000 0.000 0.000 0.000 0.000
20 <5 2013/14 black hispanic 0.013 0.388 0.009 0.000 0.000 0.000 0.000 0.000
21 <5 2014/15 black hispanic 0.005 0.107 0.003 0.000 0.000 0.000 0.000 0.000
22 5+ 2014/15 white non-hispanic 0.000 0.002 0.000 0.000 0.000 0.000 0.000 0.000
23 5+ 2013/14 black non-hispanic 0.000 0.002 0.000 0.000 0.000 0.000 0.000 0.000
In [50]:
trace, labels = inpatient_full_trace, inpatient_data_labels
In [51]:
index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']
In [52]:
# rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
#                                       names=labels.columns)
# rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
# rate_df.index = rate_df_index
# pivot_all = pd.pivot_table(rate_df
#                  .reset_index(), 
#                            index=index, columns=columns)
# value_iterator = zip(pivot_all['mean'].values.flatten(), 
#                      pivot_all['hpd_2.5'].values.flatten(), 
#                      pivot_all['hpd_97.5'].values.flatten())
# rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
#                               pivot_all['mean'].shape)

# pd.DataFrame(rate_strings, index=pivot_mean.index, columns=pivot_mean.columns)
In [53]:
def generate_table(trace, labels, index, columns, varnames=['λ']):
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                          names=labels.columns)
    rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                     .reset_index(), 
                               index=index, columns=columns)
    value_iterator = zip(pivot_all['mean'].values.flatten(), 
                     pivot_all['hpd_2.5'].values.flatten(), 
                     pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
                                  pivot_all['mean'].shape)
    
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)
In [54]:
generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], columns=['race', 'ethnicity'])
Out[54]:
race black white
ethnicity hispanic non-hispanic hispanic non-hispanic
age year
5+ 2012/13 0.01 (0.00, 0.00) 1.00 (0.17, 1.92) 1.43 (0.09, 3.42) 0.85 (0.20, 1.77)
2013/14 0.01 (0.00, 0.00) 0.00 (0.00, 0.00) 2.06 (0.26, 4.43) 0.42 (0.02, 1.02)
2014/15 0.00 (0.00, 0.00) 0.25 (0.00, 0.74) 0.60 (0.00, 1.77) 0.00 (0.00, 0.00)
<5 2012/13 0.03 (0.00, 0.00) 10.25 (5.23, 15.23) 22.63 (12.11, 33.64) 11.43 (7.17, 16.21)
2013/14 0.01 (0.00, 0.00) 8.82 (4.34, 13.94) 17.15 (8.27, 26.78) 14.07 (9.13, 19.13)
2014/15 0.01 (0.00, 0.00) 7.42 (3.09, 11.89) 30.36 (18.23, 43.80) 6.85 (3.49, 10.58)
In [55]:
inpatient_rate_samples = pd.concat(
        [inpatient_data_labels, 
           pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)
In [56]:
inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race', 'ethnicity']))

inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)

Plot of rates broken down by year/age/race/ethnicity.

In [57]:
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race', 
                          col='ethnicity', 
            data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
            palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)
Out[57]:
<seaborn.axisgrid.FacetGrid at 0x114c90128>

Pooled across race/ethnicity

In [58]:
inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                                    .groupby(['under_5', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
inpatient_pool_race_data
Out[58]:
under_5 year cases n
0 0 0 10 102191
1 0 1 5 103789
2 0 2 2 104679
3 1 0 48 40548
4 1 1 44 41013
5 1 2 36 41411
In [59]:
inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)
In [60]:
with inpatient_model_pool_race:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 3.1 sec
In [61]:
inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'under_5':'age'}))
In [62]:
generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels, index=['year'], columns=['age'])
Out[62]:
age 5+ <5
year
2012/13 1.06 (0.41, 1.70) 13.09 (9.32, 16.38)
2013/14 0.58 (0.14, 1.05) 13.21 (9.34, 17.32)
2014/15 0.25 (0.01, 0.60) 11.43 (7.74, 15.17)
In [63]:
inpatient_pool_race_samples = pd.concat(
        [inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True), 
           pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)
In [64]:
inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, 
                                       id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

Plot of rates pooled by ethnicity

In [65]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ada9ef0>

Pooled across years

In [66]:
inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
 .sort(columns=['under_5', 'black', 'hispanic', 'year'], ascending=False)).reset_index(drop=True)

inpatient_pool_year_data
Out[66]:
under_5 year black hispanic cases n
0 1 2 1 1 0 552
1 1 1 1 1 0 566
2 1 0 1 1 0 533
3 1 2 1 0 8 13471
4 1 1 1 0 10 13511
5 1 0 1 0 13 13591
6 1 2 0 1 17 6909
7 1 1 0 1 10 6994
8 1 0 0 1 15 7134
9 1 2 0 0 11 20479
10 1 1 0 0 24 19942
11 1 0 0 0 20 19290
12 0 2 1 1 0 983
13 0 1 1 1 0 914
14 0 0 1 1 0 891
15 0 2 1 0 1 40556
16 0 1 1 0 0 40656
17 0 0 1 0 4 40334
18 0 2 0 1 1 15653
19 0 1 0 1 3 14692
20 0 0 0 1 2 13889
21 0 2 0 0 0 47487
22 0 1 0 0 2 47527
23 0 0 0 0 4 47077
In [67]:
inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)
In [68]:
with inpatient_model_pool_year:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 30.1 sec
In [69]:
inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
                         .drop_duplicates().reset_index(drop=True))
In [70]:
estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
            stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                       lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)
Out[70]:
age race ethnicity mean sd mc_error hpd_2.5 hpd_97.5 median hpd_25 hpd_75
0 <5 black hispanic 0.003 0.107 0.002 0.000 0.000 0.000 0.000 0.000
1 <5 black non-hispanic 26.644 4.522 0.130 18.073 35.547 26.388 23.132 29.001
2 <5 white hispanic 70.787 10.213 0.353 50.471 90.144 70.350 63.174 76.382
3 <5 white non-hispanic 31.384 3.838 0.089 24.137 38.477 31.189 27.996 33.245
4 5+ black hispanic 0.001 0.017 0.000 0.000 0.000 0.000 0.000 0.000
5 5+ black non-hispanic 1.219 0.540 0.015 0.323 2.292 1.145 0.727 1.400
6 5+ white hispanic 3.804 1.572 0.041 1.113 6.877 3.589 2.288 4.234
7 5+ white non-hispanic 1.267 0.507 0.014 0.440 2.268 1.206 0.758 1.410
In [71]:
generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels, index=['race','ethnicity'], columns=['age'])
Out[71]:
age 5+ <5
race ethnicity
black hispanic 0.00 (0.00, 0.00) 0.00 (0.00, 0.00)
non-hispanic 1.22 (0.32, 2.29) 26.64 (18.07, 35.55)
white hispanic 3.80 (1.11, 6.88) 70.79 (50.47, 90.14)
non-hispanic 1.27 (0.44, 2.27) 31.38 (24.14, 38.48)
In [72]:
inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']], 
    pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)
In [73]:
inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'black':{0:'white', 1:'black'},
                                     'hispanic':{0:'non-hispanic', 1:'hispanic'}})
                    .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))

inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)

Plot of rates pooled by year

In [74]:
sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group", 
        data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
        palette="gray_r", linewidth=0.6, fliersize=0)
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x120133ba8>

Outpatient Model

Import outpatient denominator information.

In [75]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()
In [76]:
outpatient_project = Project(api_url, api_key)
In [77]:
outpatients = outpatient_project.export_records(format='df')
In [78]:
outpatients.groupby('year').year.count()
Out[78]:
year
2012    15969
2015    17562
Name: year, dtype: int64
In [79]:
outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape
Out[79]:
(393, 15)
In [80]:
outpatients_kids = outpatients[outpatients.ptage<=18].copy()
In [81]:
outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)
In [82]:
outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                        .groupby(['year', 'under_5'])
                        .n.count()
                        .reset_index())
outpatient_n
Out[82]:
year under_5 n
0 2012 0 13913
1 2012 1 1422
2 2015 0 11003
3 2015 1 6135
In [83]:
interp_2013 = (outpatient_n.groupby(['under_5'])
                           .apply(lambda x: np.interp(2013, x.year, x.n))
                           .astype(int)
                           .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013
Out[83]:
under_5 n year
0 0 12943 2013
1 1 2993 2013
In [84]:
outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n
Out[84]:
n under_5 year
0 13913 0 0
1 1422 1 0
0 12943 0 1
1 2993 1 1
In [85]:
outpatient_data_full = setting_data[OUTPATIENT].groupby(['under_5','year']).cases.sum().reset_index()
In [86]:
outpatient_data_full
Out[86]:
under_5 year cases
0 0 0 148
1 0 1 120
2 0 2 87
3 1 0 293
4 1 1 273
5 1 2 269
In [87]:
outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['under_5','year'])
In [88]:
outpatient_merged
Out[88]:
under_5 year cases n
0 0 0 148 13913
1 0 1 120 12943
2 1 0 293 1422
3 1 1 273 2993
In [89]:
def generate_outpatient_model(outpatient_data, pool_years=False):

    with Model() as model:

        cases,n = outpatient_data[['cases','n']].values.T

        ω = monitoring_rate[OUTPATIENT]

        shape = outpatient_data.shape[0]

        if pool_years:
            
            assert not len(cases) % 3
            
            shape = int(len(cases)/3)
                                    
            cases = cases.reshape((pooled_shape, 3)).sum(-1)
                      
        π = Beta('π', 1, 10, shape=shape)

        AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)
        
    return model
In [90]:
outpatient_model_full = generate_outpatient_model(outpatient_merged)
Applied logodds-transform to π and added transformed π_logodds to model.
In [91]:
with outpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
 [-----------------100%-----------------] 2000 of 2000 complete in 1.7 sec
In [92]:
forestplot(outpatient_full_trace[1000:], varnames=['π'])
Out[92]:
<matplotlib.gridspec.GridSpec at 0x12765d978>
In [93]:
outpatient_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'under_5':'age'}))
outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])]
In [94]:
generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'], 
               index=['age'], columns=['year'])
Out[94]:
year 2012/13 2013/14
age
5+ 187.53 (159.72, 219.80) 163.25 (133.51, 192.79)
<5 3563.82 (3196.31, 3912.01) 1593.31 (1421.97, 1781.77)
In [95]:
outpatient_full_samples = pd.concat(
        [outpatient_merged[['under_5', 'year']].reset_index(drop=True), 
           pd.DataFrame(outpatient_full_trace['π'][-1000:]).T], axis=1)
In [96]:
outpatient_full_samples = (pd.melt(outpatient_full_samples, 
                                       id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))
In [97]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x12585b390>

ED Model

In [98]:
ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns
In [99]:
ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [100]:
ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [101]:
ED_all = pd.concat([ED2, ED3])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]
In [102]:
ED_counts = ED_all[eligible_cols].sum(1).astype(int)
In [103]:
rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]
surveillance_counts = rotavirus_data.groupby(['Enrollment Date'])['Record ID'].count()
In [104]:
ED_counts.name = 'count24'
surveillance_counts.name = 'count8'
In [105]:
surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]

Here is what it looks like if you resample to weekly

In [106]:
weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()
(weekly_surveillance.count8 / weekly_surveillance.count24).hist()
Out[106]:
<matplotlib.axes._subplots.AxesSubplot at 0x101bca7f0>

We will just use the proportion of the pooled suveillance

In [107]:
surveillance_total = surveillance.sum()
surveillance_total
Out[107]:
count24    3486.0
count8     1849.0
dtype: float64
In [108]:
surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion
Out[108]:
0.53040734366035569
In [109]:
def generate_ed_model(ED_data):

    with Model() as model:
    
        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        weights = (np.array(monitoring_rate)[ED] 
                         * np.array(ed_market_share[:-1])[ED_data.year]
                         * surveillance_proportion)

        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                         testval=(cases/weights).astype(int))
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)


        θ = Normal('θ', 0, sd=1000, shape=data_len)

        λ = Deterministic('λ', tt.exp(θ))

        age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)
        
    return model
In [110]:
ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                                    .query('year<2')
                                    .groupby(['under_5', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
ed_pool_race_data
Out[110]:
under_5 year cases n
0 0 0 219 102191
1 0 1 203 103789
2 1 0 497 40548
3 1 1 533 41013
In [111]:
ed_model = generate_ed_model(ed_pool_race_data)
In [112]:
with ed_model:
    start = find_MAP()
    step = NUTS(scaling=start)
    ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 2.2 sec
In [113]:
forestplot(ed_trace[1000:], varnames=['λ'])
Out[113]:
<matplotlib.gridspec.GridSpec at 0x128896320>
In [114]:
ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'under_5':'age'}))
In [115]:
generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])
Out[115]:
age 5+ <5
year
2012/13 75.48 (66.17, 86.08) 432.16 (393.73, 468.47)
2013/14 69.52 (60.21, 78.79) 467.24 (428.29, 506.03)

Create data frame of MCMC samples for plotting

In [116]:
ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']], 
           pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)
In [117]:
ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14'}})
                    .rename(columns={'under_5':'age group'}))
In [118]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)
Out[118]:
<matplotlib.axes._subplots.AxesSubplot at 0x1272e4278>

Virus rates

In [119]:
census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()

census_data_virus['YEAR'] = census_data_virus.YEAR - 2011
In [120]:
census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()
Out[120]:
under_5 year n
0 False 0 111216
1 False 1 113190
2 False 2 114362
3 True 0 45013
4 True 1 45582
In [166]:
rotavirus_data[['rotavirus','norovirus']].sum(1).value_counts()
Out[166]:
0    2598
1     806
2      48
dtype: int64
In [121]:
virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting', 'rotavirus', 'sapovirus',
                                        'astrovorus', 'norovirus'])
             ['Record ID'].count()
             .reset_index()
             .rename(columns={'Record ID':'cases', 'astrovorus':'astrovirus'}))

virus_summary.head()
Out[121]:
under_5 year setting rotavirus sapovirus astrovirus norovirus cases
0 0 0 0 0 0 0 0 158
1 0 0 0 0 0 0 1 10
2 0 0 0 0 0 1 0 2
3 0 0 0 0 1 0 0 12
4 0 0 0 0 1 1 0 3
In [122]:
merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)
Out[122]:
under_5 year setting rotavirus sapovirus astrovirus norovirus cases n
0 0 0 0 0 0 0 0 158 111216
1 0 0 0 0 0 0 1 10 111216
2 0 0 0 0 0 1 0 2 111216
3 0 0 0 0 1 0 0 12 111216
4 0 0 0 0 1 1 0 3 111216
5 0 0 0 1 0 0 0 30 111216
6 0 0 0 1 0 0 1 3 111216
7 0 0 0 1 1 0 0 1 111216
8 0 0 1 0 0 0 0 8 111216
9 0 0 1 0 0 0 1 1 111216
In [123]:
rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max}))

sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
In [124]:
virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
           noro_data.reset_index().assign(virus='norovirus'),
           rota_data.reset_index().assign(virus='rotavirus'),
           sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)
virus_totals.head(10)
Out[124]:
under_5 year setting cases n virus
0 0 0 0 5 111216 astrovirus
1 0 0 1 0 111216 astrovirus
2 0 0 2 6 111216 astrovirus
3 0 1 0 4 113190 astrovirus
4 0 1 1 0 113190 astrovirus
5 0 1 2 2 113190 astrovirus
6 0 2 0 3 114362 astrovirus
7 0 2 1 0 114362 astrovirus
8 0 2 2 1 114362 astrovirus
9 1 0 0 13 45013 astrovirus
In [125]:
n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup
Out[125]:
n
under_5 year virus
0 0 astrovirus 111216
norovirus 111216
rotavirus 111216
sapovirus 111216
1 astrovirus 113190
norovirus 113190
rotavirus 113190
sapovirus 113190
2 astrovirus 114362
norovirus 114362
rotavirus 114362
sapovirus 114362
1 0 astrovirus 45013
norovirus 45013
rotavirus 45013
sapovirus 45013
1 astrovirus 45582
norovirus 45582
rotavirus 45582
sapovirus 45582
2 astrovirus 46003
norovirus 46003
rotavirus 46003
sapovirus 46003

Inpatient virus rate mode

In [127]:
virus_inpatient_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                               'rotavirus':1,
                                                               'sapovirus':2,
                                                               'astrovirus':3}}))[virus_totals.setting==INPATIENT]
In [128]:
with Model() as virus_inpatient_model:

    cases, year, n = virus_inpatient_data[['cases', 'year', 'n']].values.T

    weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year]

    weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=virus_inpatient_data.shape[0],
                                     testval=(cases/weights).astype(int))
    weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)


    θ = Normal('θ', 0, sd=1000, shape=virus_inpatient_data.shape[0])

    λ = Deterministic('λ', tt.exp(θ))

    virus_obs = Potential('virus_obs', Poisson.dist(λ*n).logp(weighted_cases))
In [129]:
from pymc3 import Metropolis

with virus_inpatient_model:
    virus_inpatient_trace = sample(20000, step=Metropolis(), njobs=2, random_seed=seed_numbers)
 [-----------------100%-----------------] 20000 of 20000 complete in 6.6 sec
In [130]:
virus_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_inpatient_trace['λ'][-1000:]).T], axis=1)
In [131]:
virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))
In [132]:
sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_samples.assign(rate=virus_samples.value*rate_factor),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)
Out[132]:
<seaborn.axisgrid.FacetGrid at 0x128302278>

Outpatient virus rate model

In [133]:
virus_outpatient_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                               'rotavirus':1,
                                                               'sapovirus':2,
                                                               'astrovirus':3}}))[virus_totals.setting==OUTPATIENT]
In [134]:
with Model() as virus_outpatient_model:

    cases,n = virus_outpatient_data[['cases','n']].values.T

    ω = monitoring_rate[OUTPATIENT]

    π = Beta('π', 1, 10, shape=virus_outpatient_data.shape[0])

    virus_out_obs = Binomial('virus_out_obs', n, π*ω, observed=cases)
        
Applied logodds-transform to π and added transformed π_logodds to model.
In [135]:
with virus_outpatient_model:
    virus_outpatient_trace = sample(2000, njobs=2, random_seed=seed_numbers)
Assigned NUTS to π_logodds
 [-----------------100%-----------------] 2000 of 2000 complete in 7.4 sec
In [136]:
virus_out_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_outpatient_trace['π'][-1000:]).T], axis=1)
In [137]:
virus_out_samples = (pd.melt(virus_out_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))
In [138]:
sns.set(style="ticks")

sns.set_context(font_scale=1.5)
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_out_samples.assign(rate=virus_out_samples.value*rate_factor),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)
Out[138]:
<seaborn.axisgrid.FacetGrid at 0x12b235198>

ED virus rate model

In [139]:
virus_ed_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                               'rotavirus':1,
                                                               'sapovirus':2,
                                                               'astrovirus':3}}))[virus_totals.setting==ED]
In [140]:
with Model() as virus_ed_model:

    data_len = virus_ed_data.shape[0]

    cases, year, n = virus_ed_data[['cases', 'year', 'n']].values.T

    weights = (np.array(monitoring_rate)[ED] 
                     * np.array(ed_market_share)[year]
                     * surveillance_proportion)

    weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                     testval=(cases/weights).astype(int))
    weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)


    θ = Normal('θ', 0, sd=1000, shape=data_len)

    λ = Deterministic('λ', tt.exp(θ))

    virus_ed_obs = Poisson('virus_ed_obs', λ * n, observed=weighted_cases)
        
In [141]:
with virus_ed_model:
    virus_ed_trace = sample(2000, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
Assigned NUTS to θ
 [-----------------100%-----------------] 2000 of 2000 complete in 10.4 sec
In [142]:
virus_ed_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_ed_trace['λ'][-1000:]).T], axis=1)
In [143]:
virus_ed_samples = (pd.melt(virus_ed_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))
In [144]:
sns.set(style="ticks")

sns.set_context(font_scale=1.5)
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_ed_samples.assign(rate=virus_ed_samples.value*rate_factor),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)
Out[144]:
<seaborn.axisgrid.FacetGrid at 0x12d316550>