In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')

seed_numbers = 20090425, 19700903

from datetime import timedelta, datetime
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Import data

In [2]:
rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique
Out[2]:
True

Rename columns and recode data

In [3]:
race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]

(rotavirus_data[race_cols]=='Checked').sum()
Out[3]:
Race (choice=White)                                     2249
Race (choice=Black/African American)                    1545
Race (choice=American Indian/Alaskan Native)               5
Race (choice=Asian)                                       86
Race (choice=Native Hawaiian/Other Pacific Islander)       5
Race (choice=Other)                                       18
Race (choice=Unknown)                                      3
Race (choice=None)                                         1
dtype: int64
In [4]:
rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)

rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)
In [5]:
rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)
In [6]:
rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovorus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)

rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns if col.startswith('Result')], axis=1)
In [7]:
rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])

Calculate age, and compare to given age.

In [8]:
rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
                              - rotavirus_data['Date of Birth'] + timedelta(1)).astype('timedelta64[M]') 
In [9]:
np.abs((rotavirus_data['Age in months (auto calculated)  '] - rotavirus_data['age_months']) > 1).sum()
Out[9]:
0

Group into ages under 5 and 5 or over

In [10]:
rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)

Calculate natural year

In [11]:
rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()
Out[11]:
0    1215
1    1178
2    1059
Name: year, dtype: int64

Recode sex and stool

In [12]:
rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'
In [13]:
# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth', 
                                      'Age in months (auto calculated)  ', 'Sex',
                                     'Sex updated', 'Was stool collected?',
                                     'Ethnicity', 'Year'], axis=1)
In [14]:
rotavirus_data = pd.concat([rotavirus_data, pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)
In [15]:
rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()
Out[15]:
0    2117
2    1190
1     145
Name: setting, dtype: int64
In [16]:
rotavirus_data.shape
Out[16]:
(3452, 18)
In [17]:
rotavirus_data.head()
Out[17]:
Record ID Enrollment Date white black hispanic rotavirus sapovirus astrovorus norovirus age_months under_5 year male stool_collected ED IP OP setting
0 EN1C-0001 2012-12-03 1 0 1 1 0 0 0 77.0 0 0 True True 0.0 0.0 1.0 2
1 EN1C-0002 2012-12-03 0 1 0 0 0 0 0 87.0 0 0 True False 0.0 0.0 1.0 2
2 EN1C-0003 2012-12-03 0 1 0 0 0 0 0 63.0 0 0 False False 0.0 0.0 1.0 2
3 EN1C-0004 2012-12-04 1 0 1 0 0 0 0 36.0 1 0 False True 0.0 0.0 1.0 2
4 EN1C-0006 2012-12-04 0 1 0 0 0 0 0 64.0 0 0 False False 0.0 0.0 1.0 2
In [18]:
rotavirus_data.isnull().sum()
Out[18]:
Record ID          0
Enrollment Date    0
white              0
black              0
hispanic           0
rotavirus          0
sapovirus          0
astrovorus         0
norovirus          0
age_months         0
under_5            0
year               0
male               0
stool_collected    0
ED                 0
IP                 0
OP                 0
setting            0
dtype: int64
In [19]:
rotavirus_data.describe()
Out[19]:
white black hispanic rotavirus sapovirus astrovorus norovirus age_months under_5 year ED IP OP setting
count 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000 3452.000000
mean 0.601970 0.398030 0.398610 0.108633 0.081112 0.037080 0.152665 44.912804 0.715527 0.954809 0.613268 0.042005 0.344728 0.731460
std 0.489563 0.489563 0.489683 0.311223 0.273047 0.188985 0.359717 45.485356 0.451228 0.810492 0.487072 0.200629 0.475348 0.941349
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 11.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 27.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000
75% 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 67.000000 1.000000 2.000000 1.000000 0.000000 1.000000 2.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 215.000000 1.000000 2.000000 1.000000 1.000000 1.000000 2.000000

Aggregate data by age group and setting

In [20]:
rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
             .groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
             .reset_index()
             .rename(columns={'Record ID':'cases'}))

rotavirus_summary.head()
Out[20]:
under_5 year setting black hispanic cases
0 0 0 0 0 0 57
1 0 0 0 0 1 62
2 0 0 0 1 0 99
3 0 0 0 1 1 1
4 0 0 1 0 0 4

Census data

The key for AGEGRP is as follows:

  • 0 = Total
  • 1 = Age 0 to 4 years
  • 2 = Age 5 to 9 years
  • 3 = Age 10 to 14 years
  • 4 = Age 15 to 19 years
  • 5 = Age 20 to 24 years
  • 6 = Age 25 to 29 years
  • 7 = Age 30 to 34 years
  • 8 = Age 35 to 39 years
  • 9 = Age 40 to 44 years
  • 10 = Age 45 to 49 years
  • 11 = Age 50 to 54 years
  • 12 = Age 55 to 59 years
  • 13 = Age 60 to 64 years
  • 14 = Age 65 to 69 years
  • 15 = Age 70 to 74 years
  • 16 = Age 75 to 79 years
  • 17 = Age 80 to 84 years
  • 18 = Age 85 years or older
In [21]:
davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()
Out[21]:
SUMLEV STATE COUNTY STNAME CTYNAME YEAR2008 YEAR AGEGRP TOT_POP TOT_MALE ... HWAC_MALE HWAC_FEMALE HBAC_MALE HBAC_FEMALE HIAC_MALE HIAC_FEMALE HAAC_MALE HAAC_FEMALE HNAC_MALE HNAC_FEMALE
0 50 47 37 Tennessee Davidson County 1 2008 0 626681 303540 ... 30400 24829 2178 2006 1501 1183 381 374 271 211
1 50 47 37 Tennessee Davidson County 1 2008 1 44691 22868 ... 3838 3664 302 303 185 189 63 53 26 35
2 50 47 37 Tennessee Davidson County 1 2008 2 37613 18977 ... 2910 2826 266 205 145 140 45 50 22 23
3 50 47 37 Tennessee Davidson County 1 2008 3 33904 17479 ... 2154 1890 166 170 105 91 25 29 18 16
4 50 47 37 Tennessee Davidson County 1 2008 4 38999 19674 ... 2029 1783 166 174 111 82 29 29 22 16

5 rows × 81 columns

Identify racial/ethnic subgroup populations

In [22]:
census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)
In [23]:
census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK', 
                                 'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
    .sum()).reset_index()

census_data['YEAR'] = census_data.YEAR - 2011
census_data
Out[23]:
under_5 YEAR HISPANIC_WHITE HISPANIC_BLACK NONHISPANIC_WHITE NONHISPANIC_BLACK
0 False 0 13889 891 47077 40334
1 False 1 14692 914 47527 40656
2 False 2 15653 983 47487 40556
3 True 0 7134 533 19290 13591
4 True 1 6994 566 19942 13511
5 True 2 6909 552 20479 13471

Reshape data

In [24]:
melted_census = (pd.melt(census_data, id_vars=['under_5', 'YEAR'], 
                    value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK', 
                            'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']))

census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
                    black=melted_census.variable.apply(lambda x: x.split('_')[1]))
                 .drop('variable', axis=1)
                 .replace({'black':{'BLACK':1, 'WHITE':0},
                          'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
                 .rename(columns={'YEAR':'year', 'value':'n'}))
                    
census_clean
Out[24]:
under_5 year n black hispanic
0 False 0 13889 0 1
1 False 1 14692 0 1
2 False 2 15653 0 1
3 True 0 7134 0 1
4 True 1 6994 0 1
5 True 2 6909 0 1
6 False 0 891 1 1
7 False 1 914 1 1
8 False 2 983 1 1
9 True 0 533 1 1
10 True 1 566 1 1
11 True 2 552 1 1
12 False 0 47077 0 0
13 False 1 47527 0 0
14 False 2 47487 0 0
15 True 0 19290 0 0
16 True 1 19942 0 0
17 True 2 20479 0 0
18 False 0 40334 1 0
19 False 1 40656 1 0
20 False 2 40556 1 0
21 True 0 13591 1 0
22 True 1 13511 1 0
23 True 2 13471 1 0
In [25]:
census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])
In [26]:
merged_data = (rotavirus_summary.merge(census_full, on=['under_5', 'year', 'black', 'hispanic','setting'], how='right')
                       .fillna(0).astype(int))
In [27]:
merged_data.head()
Out[27]:
under_5 year setting black hispanic cases n
0 0 0 0 0 0 57 47077
1 0 0 0 0 1 62 13889
2 0 0 0 1 0 99 40334
3 0 0 0 1 1 1 891
4 0 0 1 0 0 4 47077

VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

In [28]:
inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92
Y2012, Y2013, Y2014 = 0, 1, 2

Will estimate 2014 market shares by extrapolating a linear fit to the previous years.

In [29]:
from sklearn import linear_model
In [30]:
inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)
Out[30]:
[0.9, 0.83, 0.77500000000000002]
In [31]:
ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share
Out[31]:
[0.94, 0.92, 0.90500000000000014]

Enrollment days/week for ED, inpatient, outpatient

In [32]:
monitoring_days = 4, 7, 4

# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2

monitoring_rate = np.array(np.array(monitoring_days)/7)
monitoring_rate
Out[32]:
array([ 0.57142857,  1.        ,  0.57142857])
In [33]:
rotavirus_data[(rotavirus_data.hispanic==1) & (rotavirus_data.white==1) 
               & (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
              (rotavirus_data.under_5==1)].shape
Out[33]:
(157, 18)
In [34]:
setting_data = dict(list(merged_data.groupby('setting')))

Inpatient Model

In [35]:
from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary

import theano.tensor as tt
Couldn't import dot_parser, loading of dot files will not be possible.
In [36]:
def generate_inpatient_model(inpatient_data, pool_years=False):

    with Model() as model:

        under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T

        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=inpatient_data.shape[0],
                                         testval=(cases/weights).astype(int))
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])

            λ = Deterministic('λ', tt.exp(θ))
                      
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model

Full (unpooled) model

In [37]:
inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))
In [38]:
with inpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 97.1 sec
In [39]:
plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])
Out[39]:
<matplotlib.gridspec.GridSpec at 0x1198dec50>
In [40]:
label_map = {'under_5':{0:'5+', 1:'<5'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'hispanic':{0:'non-hispanic', 1:'hispanic'},
             'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}
In [41]:
inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))

Rate factor for population

In [42]:
rate_factor = 10000
In [43]:
rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)
In [44]:
inpatient_data_labels
Out[44]:
age year race ethnicity
0 5+ 2012/13 white non-hispanic
1 5+ 2012/13 white hispanic
2 5+ 2012/13 black non-hispanic
3 5+ 2013/14 white non-hispanic
4 5+ 2013/14 white hispanic
5 5+ 2014/15 white hispanic
6 5+ 2014/15 black non-hispanic
7 <5 2012/13 white non-hispanic
8 <5 2012/13 white hispanic
9 <5 2012/13 black non-hispanic
10 <5 2013/14 white non-hispanic
11 <5 2013/14 white hispanic
12 <5 2013/14 black non-hispanic
13 <5 2014/15 white non-hispanic
14 <5 2014/15 white hispanic
15 <5 2014/15 black non-hispanic
16 5+ 2012/13 black hispanic
17 5+ 2013/14 black hispanic
18 5+ 2014/15 black hispanic
19 <5 2012/13 black hispanic
20 <5 2013/14 black hispanic
21 <5 2014/15 black hispanic
22 5+ 2014/15 white non-hispanic
23 5+ 2013/14 black non-hispanic
In [45]:
from pymc3 import hpd
def _hpd_df(x, alpha):
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)
In [46]:
estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
                stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                           lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
pd.concat([inpatient_data_labels, estimates], axis=1)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[46]:
age year race ethnicity mean sd mc_error hpd_2.5 hpd_97.5 median hpd_25 hpd_75
0 5+ 2012/13 white non-hispanic 0.814 0.438 0.031 0.140 1.656 0.720 0.363 0.859
1 5+ 2012/13 white hispanic 1.283 0.930 0.068 0.037 3.083 1.123 0.104 1.168
2 5+ 2012/13 black non-hispanic 1.025 0.495 0.031 0.195 1.988 0.966 0.471 1.083
3 5+ 2013/14 white non-hispanic 0.384 0.273 0.017 0.003 0.894 0.318 0.083 0.363
4 5+ 2013/14 white hispanic 2.152 1.267 0.088 0.203 4.546 1.927 0.791 2.192
5 5+ 2014/15 white hispanic 0.661 0.608 0.042 0.001 1.951 0.478 0.001 0.479
6 5+ 2014/15 black non-hispanic 0.246 0.241 0.017 0.004 0.720 0.170 0.004 0.171
7 <5 2012/13 white non-hispanic 11.252 2.339 0.144 7.025 16.002 11.011 9.193 12.169
8 <5 2012/13 white hispanic 23.234 5.438 0.351 13.545 34.042 22.690 19.251 26.387
9 <5 2012/13 black non-hispanic 10.256 2.640 0.178 5.582 15.543 10.108 7.967 11.511
10 <5 2013/14 white non-hispanic 14.167 2.759 0.175 9.420 19.676 13.953 12.353 15.856
11 <5 2013/14 white hispanic 17.910 4.991 0.326 8.796 28.690 17.573 14.587 20.601
12 <5 2013/14 black non-hispanic 8.902 2.495 0.151 4.272 13.772 8.717 6.978 10.325
13 <5 2014/15 white non-hispanic 6.928 1.980 0.137 3.343 10.624 6.732 5.372 7.968
14 <5 2014/15 white hispanic 30.327 6.354 0.398 19.451 43.707 29.692 24.363 32.647
15 <5 2014/15 black non-hispanic 7.642 2.316 0.158 3.258 11.928 7.531 5.498 8.595
16 5+ 2012/13 black hispanic 0.009 0.188 0.004 0.000 0.000 0.000 0.000 0.000
17 5+ 2013/14 black hispanic 0.027 0.477 0.016 0.000 0.000 0.000 0.000 0.000
18 5+ 2014/15 black hispanic 0.018 0.338 0.008 0.000 0.000 0.000 0.000 0.000
19 <5 2012/13 black hispanic 0.018 0.439 0.010 0.000 0.000 0.000 0.000 0.000
20 <5 2013/14 black hispanic 0.004 0.113 0.003 0.000 0.000 0.000 0.000 0.000
21 <5 2014/15 black hispanic 0.039 0.704 0.020 0.000 0.000 0.000 0.000 0.000
22 5+ 2014/15 white non-hispanic 0.000 0.007 0.000 0.000 0.000 0.000 0.000 0.000
23 5+ 2013/14 black non-hispanic 0.001 0.011 0.000 0.000 0.000 0.000 0.000 0.000
In [47]:
trace, labels = inpatient_full_trace, inpatient_data_labels
In [48]:
index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']
In [49]:
# rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
#                                       names=labels.columns)
# rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
# rate_df.index = rate_df_index
# pivot_all = pd.pivot_table(rate_df
#                  .reset_index(), 
#                            index=index, columns=columns)
# value_iterator = zip(pivot_all['mean'].values.flatten(), 
#                      pivot_all['hpd_2.5'].values.flatten(), 
#                      pivot_all['hpd_97.5'].values.flatten())
# rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
#                               pivot_all['mean'].shape)

# pd.DataFrame(rate_strings, index=pivot_mean.index, columns=pivot_mean.columns)
In [52]:
def generate_table(trace, labels, index, columns, varnames=['λ']):
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                          names=labels.columns)
    rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                     .reset_index(), 
                               index=index, columns=columns)
    value_iterator = zip(pivot_all['mean'].values.flatten(), 
                     pivot_all['hpd_2.5'].values.flatten(), 
                     pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
                                  pivot_all['mean'].shape)
    
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)
In [53]:
generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], columns=['race', 'ethnicity'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[53]:
race black white
ethnicity hispanic non-hispanic hispanic non-hispanic
age year
5+ 2012/13 0.01 (0.00, 0.00) 1.02 (0.19, 1.99) 1.28 (0.04, 3.08) 0.81 (0.14, 1.66)
2013/14 0.03 (0.00, 0.00) 0.00 (0.00, 0.00) 2.15 (0.20, 4.55) 0.38 (0.00, 0.89)
2014/15 0.02 (0.00, 0.00) 0.25 (0.00, 0.72) 0.66 (0.00, 1.95) 0.00 (0.00, 0.00)
<5 2012/13 0.02 (0.00, 0.00) 10.26 (5.58, 15.54) 23.23 (13.54, 34.04) 11.25 (7.02, 16.00)
2013/14 0.00 (0.00, 0.00) 8.90 (4.27, 13.77) 17.91 (8.80, 28.69) 14.17 (9.42, 19.68)
2014/15 0.04 (0.00, 0.00) 7.64 (3.26, 11.93) 30.33 (19.45, 43.71) 6.93 (3.34, 10.62)
In [54]:
inpatient_rate_samples = pd.concat(
        [inpatient_data_labels, 
           pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)
In [55]:
inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race', 'ethnicity']))

inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)

Plot of rates broken down by year/age/race/ethnicity.

In [56]:
sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race', 
                          col='ethnicity', 
            data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
            palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)
Out[56]:
<seaborn.axisgrid.FacetGrid at 0x11ad19710>

Pooled across race/ethnicity

In [57]:
inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                                    .groupby(['under_5', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
inpatient_pool_race_data
Out[57]:
under_5 year cases n
0 0 0 10 102191
1 0 1 5 103789
2 0 2 2 104679
3 1 0 48 40548
4 1 1 44 41013
5 1 2 36 41411
In [58]:
inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)
In [59]:
with inpatient_model_pool_race:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 3.1 sec
In [60]:
inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'under_5':'age'}))
In [61]:
generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels, index=['year'], columns=['age'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[61]:
age 5+ <5
year
2012/13 1.08 (0.48, 1.72) 12.98 (9.51, 16.87)
2013/14 0.57 (0.14, 1.08) 13.38 (9.14, 17.50)
2014/15 0.23 (0.00, 0.55) 11.19 (7.56, 14.86)
In [62]:
inpatient_pool_race_samples = pd.concat(
        [inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True), 
           pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)
In [63]:
inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, 
                                       id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

Plot of rates pooled by ethnicity

In [64]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c735080>

Pooled across years

In [65]:
inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
 .sort(columns=['under_5', 'black', 'hispanic', 'year'], ascending=False)).reset_index(drop=True)

inpatient_pool_year_data
Out[65]:
under_5 year black hispanic cases n
0 1 2 1 1 0 552
1 1 1 1 1 0 566
2 1 0 1 1 0 533
3 1 2 1 0 8 13471
4 1 1 1 0 10 13511
5 1 0 1 0 13 13591
6 1 2 0 1 17 6909
7 1 1 0 1 10 6994
8 1 0 0 1 15 7134
9 1 2 0 0 11 20479
10 1 1 0 0 24 19942
11 1 0 0 0 20 19290
12 0 2 1 1 0 983
13 0 1 1 1 0 914
14 0 0 1 1 0 891
15 0 2 1 0 1 40556
16 0 1 1 0 0 40656
17 0 0 1 0 4 40334
18 0 2 0 1 1 15653
19 0 1 0 1 3 14692
20 0 0 0 1 2 13889
21 0 2 0 0 0 47487
22 0 1 0 0 2 47527
23 0 0 0 0 4 47077
In [66]:
inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)
In [67]:
with inpatient_model_pool_year:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 105.1 sec
In [68]:
inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
                         .drop_duplicates().reset_index(drop=True))
In [69]:
estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
            stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                       lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[69]:
age race ethnicity mean sd mc_error hpd_2.5 hpd_97.5 median hpd_25 hpd_75
0 <5 black hispanic 0.012 0.318 0.007 0.000 0.000 0.000 0.000 0.000
1 <5 black non-hispanic 26.795 4.774 0.225 17.589 36.073 26.370 23.418 29.633
2 <5 white hispanic 71.156 10.641 0.409 50.333 91.705 70.631 61.987 76.206
3 <5 white non-hispanic 31.473 3.894 0.139 23.883 38.592 31.386 29.269 34.573
4 5+ black hispanic 0.003 0.079 0.002 0.000 0.000 0.000 0.000 0.000
5 5+ black non-hispanic 1.267 0.550 0.021 0.273 2.238 1.189 0.749 1.422
6 5+ white hispanic 3.815 1.511 0.066 1.351 6.796 3.527 2.091 3.913
7 5+ white non-hispanic 1.283 0.534 0.023 0.410 2.338 1.193 0.754 1.419
In [70]:
generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels, index=['race','ethnicity'], columns=['age'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[70]:
age 5+ <5
race ethnicity
black hispanic 0.00 (0.00, 0.00) 0.01 (0.00, 0.00)
non-hispanic 1.27 (0.27, 2.24) 26.80 (17.59, 36.07)
white hispanic 3.81 (1.35, 6.80) 71.16 (50.33, 91.71)
non-hispanic 1.28 (0.41, 2.34) 31.47 (23.88, 38.59)
In [71]:
inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']], 
    pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)
In [72]:
inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'black':{0:'white', 1:'black'},
                                     'hispanic':{0:'non-hispanic', 1:'hispanic'}})
                    .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))

inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)

Plot of rates pooled by year

In [73]:
sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group", 
        data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
        palette="gray_r", linewidth=0.6, fliersize=0)
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c9eb4a8>

Outpatient Model

Import outpatient denominator information.

In [74]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()
In [75]:
outpatient_project = Project(api_url, api_key)
In [76]:
outpatients = outpatient_project.export_records(format='df')
In [77]:
outpatients.groupby('year').year.count()
Out[77]:
year
2012    15969
2015    17562
Name: year, dtype: int64
In [78]:
outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape
Out[78]:
(393, 15)
In [79]:
outpatients_kids = outpatients[(outpatients.ptage<=18) 
                               & (outpatients.ptrace.isin(['B','W'])) 
                               & (outpatients.ptethnicity.isin(['NH', 'HL']))].copy()
In [80]:
outpatients_kids.ptrace.value_counts()
Out[80]:
B    12119
W    10179
Name: ptrace, dtype: int64
In [81]:
outpatients_kids.ptethnicity.value_counts()
Out[81]:
NH    18859
HL     3439
Name: ptethnicity, dtype: int64
In [82]:
outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)
outpatients_kids['black'] = (outpatients_kids.ptrace=='B').astype(int)
outpatients_kids['hispanic'] = (outpatients_kids.ptethnicity=='HL').astype(int)
In [83]:
outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                        .groupby(['year', 'black', 'hispanic', 'under_5'])
                        .n.count()
                        .reset_index())
outpatient_n
Out[83]:
year black hispanic under_5 n
0 2012 0 0 0 2692
1 2012 0 0 1 379
2 2012 0 1 0 1262
3 2012 0 1 1 176
4 2012 1 0 0 5081
5 2012 1 0 1 455
6 2012 1 1 0 88
7 2012 1 1 1 14
8 2015 0 0 0 2276
9 2015 0 0 1 1629
10 2015 0 1 0 1129
11 2015 0 1 1 636
12 2015 1 0 0 4301
13 2015 1 0 1 2046
14 2015 1 1 0 84
15 2015 1 1 1 50
In [84]:
interp_2013 = (outpatient_n.groupby(['black','hispanic','under_5'])
                           .apply(lambda x: np.interp(2013, x.year, x.n))
                           .astype(int)
                           .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013
Out[84]:
black hispanic under_5 n year
0 0 0 0 2553 2013
1 0 0 1 795 2013
2 0 1 0 1217 2013
3 0 1 1 329 2013
4 1 0 0 4821 2013
5 1 0 1 985 2013
6 1 1 0 86 2013
7 1 1 1 26 2013
In [85]:
outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n
Out[85]:
black hispanic n under_5 year
0 0 0 2692 0 0
1 0 0 379 1 0
2 0 1 1262 0 0
3 0 1 176 1 0
4 1 0 5081 0 0
5 1 0 455 1 0
6 1 1 88 0 0
7 1 1 14 1 0
0 0 0 2553 0 1
1 0 0 795 1 1
2 0 1 1217 0 1
3 0 1 329 1 1
4 1 0 4821 0 1
5 1 0 985 1 1
6 1 1 86 0 1
7 1 1 26 1 1
In [86]:
outpatient_data_full = setting_data[OUTPATIENT].drop(['setting','n'], axis=1)
In [87]:
outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['black','hispanic','under_5','year'])
In [88]:
outpatient_merged
Out[88]:
under_5 year black hispanic cases n
0 0 0 0 0 28 2692
1 0 0 0 1 68 1262
2 0 0 1 0 52 5081
3 0 1 0 0 22 2553
4 0 1 0 1 63 1217
5 0 1 1 0 35 4821
6 1 0 0 0 40 379
7 1 0 0 1 144 176
8 1 0 1 0 109 455
9 1 1 0 0 38 795
10 1 1 0 1 138 329
11 1 1 1 0 96 985
12 1 1 1 1 1 26
13 0 0 1 1 0 88
14 0 1 1 1 0 86
15 1 0 1 1 0 14
In [89]:
def generate_outpatient_model(outpatient_data, pool_years=False):

    with Model() as model:

        cases,n = outpatient_data[['cases','n']].values.T

        ω = monitoring_rate[INPATIENT]

        shape = outpatient_data.shape[0]

        if pool_years:
            
            assert not len(cases) % 3
            
            shape = int(len(cases)/3)
                                    
            cases = cases.reshape((pooled_shape, 3)).sum(-1)
                      
        π = Beta('π', 1, 10, shape=shape)

        AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)
        
    return model
In [90]:
outpatient_model_full = generate_outpatient_model(outpatient_merged)
Applied logodds-transform to π and added transformed π_logodds to model.
In [91]:
with outpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
 [-----------------100%-----------------] 2000 of 2000 complete in 3.9 sec
In [92]:
plt.subplots(figsize=(8,12))
forestplot(outpatient_full_trace[1000:], varnames=['π'])
Out[92]:
<matplotlib.gridspec.GridSpec at 0x1221b8be0>
In [93]:
outpatient_data_labels = inpatient_data_labels[inpatient_data_labels.year.isin(['2012/13','2013/14'])]
In [94]:
generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'], 
               index=['age', 'year'], columns=['race', 'ethnicity'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[94]:
race black white
ethnicity hispanic non-hispanic hispanic non-hispanic
age year
5+ 2012/13 973.57 (804.49, 1149.48) 103.82 (78.73, 131.44) 542.07 (411.50, 646.74) 107.32 (71.36, 149.99)
2013/14 539.15 (27.72, 1234.00) 396.80 (0.10, 1126.13) 521.78 (405.35, 652.50) 90.36 (55.57, 128.20)
<5 2012/13 105.93 (0.10, 322.00) 7748.09 (7142.60, 8349.21) 1051.22 (756.89, 1333.33) 74.36 (51.22, 98.51)
2013/14 102.21 (0.01, 308.45) 4084.32 (3576.56, 4616.85) 486.26 (338.46, 627.66) 2363.01 (1974.50, 2774.73)

ED Model

In [95]:
ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns
In [96]:
ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [97]:
ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])
In [98]:
ED_all = pd.concat([ED2, ED3])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]
In [99]:
ED_counts = ED_all[eligible_cols].sum(1).astype(int)
In [100]:
rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]
surveillance_counts = rotavirus_data.groupby(['Enrollment Date'])['Record ID'].count()
In [101]:
ED_counts.name = 'count24'
surveillance_counts.name = 'count8'
In [102]:
surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]

Here is what it looks like if you resample to weekly

In [103]:
weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()
(weekly_surveillance.count8 / weekly_surveillance.count24).hist()
Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x12332b940>

We will just use the proportion of the pooled suveillance

In [104]:
surveillance_total = surveillance.sum()
surveillance_total
Out[104]:
count24    3486.0
count8     1849.0
dtype: float64
In [105]:
surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion
Out[105]:
0.53040734366035569
In [106]:
def generate_ed_model(ED_data):

    with Model() as model:
    
        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        weights = (np.array(monitoring_rate)[ED] 
                         * np.array(ed_market_share[:-1])[ED_data.year]
                         * surveillance_proportion)

        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                         testval=(cases/weights).astype(int))
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)


        θ = Normal('θ', 0, sd=1000, shape=data_len)

        λ = Deterministic('λ', tt.exp(θ))

        age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)
        
    return model
In [107]:
ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                                    .query('year<2')
                                    .groupby(['under_5', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
ed_pool_race_data
Out[107]:
under_5 year cases n
0 0 0 219 102191
1 0 1 203 103789
2 1 0 497 40548
3 1 1 533 41013
In [108]:
ed_model = generate_ed_model(ed_pool_race_data)
In [109]:
with ed_model:
    start = find_MAP()
    step = NUTS(scaling=start)
    ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)
Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 3.7 sec
In [110]:
forestplot(ed_trace[1000:], varnames=['λ'])
Out[110]:
<matplotlib.gridspec.GridSpec at 0x126433f28>
In [111]:
ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'under_5':'age'}))
In [112]:
generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:225: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  return reshape(newshape, order=order)
Out[112]:
age 5+ <5
year
2012/13 75.29 (66.11, 84.55) 429.88 (396.08, 469.63)
2013/14 70.02 (61.57, 79.29) 464.98 (418.34, 507.64)

Create data frame of MCMC samples for plotting

In [120]:
ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']], 
           pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)
In [121]:
ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14'}})
                    .rename(columns={'under_5':'age group'}))
In [122]:
sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x122de23c8>

Virus rates

In [ ]:
census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()

census_data_virus['YEAR'] = census_data_virus.YEAR - 2011
In [ ]:
census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()
In [ ]:
virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting', 'rotavirus', 'sapovirus',
                                        'astrovorus', 'norovirus'])
             ['Record ID'].count()
             .reset_index()
             .rename(columns={'Record ID':'cases', 'astrovorus':'astrovirus'}))

virus_summary.head()
In [ ]:
merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)
In [ ]:
rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max}))

sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))
In [ ]:
virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
           noro_data.reset_index().assign(virus='norovirus'),
           rota_data.reset_index().assign(virus='rotavirus'),
           sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)
virus_totals.head(10)
In [ ]:
n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup
In [ ]:
with Model() as virus_model:
    
    n_total = int(n_lookup.shape[0])
        
    p_enroll = Beta('p_enroll', 1, 1, testval=0.9)
    enrolled = Binomial('enrolled', n=rotavirus_data.shape[0], p=p_enroll, 
                        observed=rotavirus_data.stool_collected.sum())
    
    weights = (np.array(monitoring_rate)[virus_totals.setting] 
                     * np.array(market_share)[virus_totals.year])
    
    weighted_cases = DiscreteUniform('weighted_cases', virus_totals.cases, 
                            virus_totals.n, shape=virus_totals.shape[0],
                            testval=np.maximum((virus_totals.cases.values/weights).astype(int), 1))

    p_weight = Deterministic('p_weight', weights*p_enroll)
    weighting = Binomial('weighting', n=weighted_cases, p=p_weight, observed=virus_totals.cases)
    
    weighted_cases_total = Deterministic('weighted_cases_total',
                                         weighted_cases.reshape((n_total, 3)).sum(1))
    
    θ = Normal('θ', 0, sd=1000, shape=n_total, testval=np.ones(n_total))
    
    λ = Deterministic('λ', tt.exp(θ))
        
    virus_like = Potential('virus_like', 
                           Poisson.dist(λ * (n_lookup.n/10000.)).logp(weighted_cases_total))
In [ ]:
with virus_model:
    virus_trace = sample(20000, njobs=2, random_seed=seed_numbers)
In [ ]:
traceplot(virus_trace[10000:], varnames=['p_enroll'])
In [ ]:
forestplot(virus_trace[10000:], varnames=['weighted_cases_total'])
In [ ]:
virus_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_trace['λ'][-1000:]).T], axis=1)
In [ ]:
virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))
In [ ]:
sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_samples.rename(columns={'value':'rate'}),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)