Import data

In [2]:

rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique

Out[2]:

True

Rename columns and recode data

In [3]:

race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]

(rotavirus_data[race_cols]=='Checked').sum()

Out[3]:

Race (choice=White)                                     2249
Race (choice=Black/African American)                    1545
Race (choice=American Indian/Alaskan Native)               5
Race (choice=Asian)                                       86
Race (choice=Native Hawaiian/Other Pacific Islander)       5
Race (choice=Other)                                       18
Race (choice=Unknown)                                      3
Race (choice=None)                                         1
dtype: int64

In [4]:

rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)

rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)

In [5]:

rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)

In [6]:

rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovorus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)

rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns if col.startswith('Result')], axis=1)

In [7]:

rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])

In [162]:

pd.crosstab(rotavirus_data.black, rotavirus_data.hispanic)

Out[162]:

hispanic	0	1
black
0	719	1359
1	1357	17

Calculate age, and compare to given age.

In [8]:

rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
                              - rotavirus_data['Date of Birth'] + timedelta(1)).astype('timedelta64[M]') 

In [9]:

np.abs((rotavirus_data['Age in months (auto calculated)  '] - rotavirus_data['age_months']) > 1).sum()

Out[9]:

Group into ages under 5 and 5 or over

In [10]:

rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)

Calculate natural year

In [152]:

rotavirus_data['Enrollment Date'].sort_values(ascending=False).tail()

Out[152]:

1291   2012-12-02
1295   2012-12-02
1293   2012-12-02
1292   2012-12-02
1290   2012-12-02
Name: Enrollment Date, dtype: datetime64[ns]

In [11]:

rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()

Out[11]:

0    1215
1    1178
2    1059
Name: year, dtype: int64

In [154]:

rotavirus_data.setting.value_counts()

Out[154]:

0    2117
2    1190
1     145
Name: setting, dtype: int64

Recode sex and stool

In [12]:

rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'

In [13]:

# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth', 
                                      'Age in months (auto calculated)  ', 'Sex',
                                     'Sex updated', 'Was stool collected?',
                                     'Ethnicity', 'Year'], axis=1)

In [14]:

rotavirus_data = pd.concat([rotavirus_data, pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)

Summary of counts by setting

In [158]:

rotavirus_data.groupby(['year', 'setting'])['Record ID'].count()

Out[158]:

year  setting
0     0          716
      1           58
      2          441
1     0          736
      1           49
      2          393
2     0          665
      1           38
      2          356
Name: Record ID, dtype: int64

In [159]:

rotavirus_data.shape

Out[159]:

(3452, 18)

In [15]:

rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()

Out[15]:

0    2117
2    1190
1     145
Name: setting, dtype: int64

In [16]:

rotavirus_data.shape

Out[16]:

(3452, 18)

In [17]:

rotavirus_data.head()

Out[17]:

	Record ID	Enrollment Date	white	black	hispanic	rotavirus	age_months	under_5	male	stool_collected	OP	setting
0	EN1C-0001	2012-12-03	1	0	1	1	77.0	0	True	True	1.0	2
1	EN1C-0002	2012-12-03	0	1	0	0	87.0	0	True	False	1.0	2
2	EN1C-0003	2012-12-03	0	1	0	0	63.0	0	False	False	1.0	2
3	EN1C-0004	2012-12-04	1	0	1	0	36.0	1	False	True	1.0	2
4	EN1C-0006	2012-12-04	0	1	0	0	64.0	0	False	False	1.0	2

In [18]:

rotavirus_data.isnull().sum()

Out[18]:

Record ID          0
Enrollment Date    0
white              0
black              0
hispanic           0
rotavirus          0
sapovirus          0
astrovorus         0
norovirus          0
age_months         0
under_5            0
year               0
male               0
stool_collected    0
ED                 0
IP                 0
OP                 0
setting            0
dtype: int64

In [19]:

rotavirus_data.describe()

Out[19]:

	white	black	hispanic	rotavirus	sapovirus	astrovorus	norovirus	age_months	under_5	year	ED	IP	OP	setting
count	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000	3452.000000
mean	0.601970	0.398030	0.398610	0.108633	0.081112	0.037080	0.152665	44.912804	0.715527	0.954809	0.613268	0.042005	0.344728	0.731460
std	0.489563	0.489563	0.489683	0.311223	0.273047	0.188985	0.359717	45.485356	0.451228	0.810492	0.487072	0.200629	0.475348	0.941349
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	11.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	27.000000	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000
75%	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	67.000000	1.000000	2.000000	1.000000	0.000000	1.000000	2.000000
max	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	215.000000	1.000000	2.000000	1.000000	1.000000	1.000000	2.000000

Aggregate data by age group and setting

In [20]:

rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
             .groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
             .reset_index()
             .rename(columns={'Record ID':'cases'}))

rotavirus_summary.head()

Out[20]:

	setting	black	hispanic	cases
0	0	0	0	57
1	0	0	1	62
2	0	1	0	99
3	0	1	1	1
4	1	0	0	4

Census data¶

The key for AGEGRP is as follows:

0 = Total
1 = Age 0 to 4 years
2 = Age 5 to 9 years
3 = Age 10 to 14 years
4 = Age 15 to 19 years
5 = Age 20 to 24 years
6 = Age 25 to 29 years
7 = Age 30 to 34 years
8 = Age 35 to 39 years
9 = Age 40 to 44 years
10 = Age 45 to 49 years
11 = Age 50 to 54 years
12 = Age 55 to 59 years
13 = Age 60 to 64 years
14 = Age 65 to 69 years
15 = Age 70 to 74 years
16 = Age 75 to 79 years
17 = Age 80 to 84 years
18 = Age 85 years or older

In [21]:

davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()

Out[21]:

	SUMLEV	STATE	COUNTY	STNAME	CTYNAME	YEAR2008	YEAR	AGEGRP	TOT_POP	TOT_MALE	...	HWAC_MALE	HWAC_FEMALE	HBAC_MALE	HBAC_FEMALE	HIAC_MALE	HIAC_FEMALE	HAAC_MALE	HAAC_FEMALE	HNAC_MALE	HNAC_FEMALE
0	50	47	37	Tennessee	Davidson County	1	2008	0	626681	303540	...	30400	24829	2178	2006	1501	1183	381	374	271	211
1	50	47	37	Tennessee	Davidson County	1	2008	1	44691	22868	...	3838	3664	302	303	185	189	63	53	26	35
2	50	47	37	Tennessee	Davidson County	1	2008	2	37613	18977	...	2910	2826	266	205	145	140	45	50	22	23
3	50	47	37	Tennessee	Davidson County	1	2008	3	33904	17479	...	2154	1890	166	170	105	91	25	29	18	16
4	50	47	37	Tennessee	Davidson County	1	2008	4	38999	19674	...	2029	1783	166	174	111	82	29	29	22	16

5 rows × 81 columns

Identify racial/ethnic subgroup populations

In [22]:

census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)

In [23]:

census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK', 
                                 'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
    .sum()).reset_index()

census_data['YEAR'] = census_data.YEAR - 2011
census_data

Out[23]:

	under_5	YEAR	HISPANIC_WHITE	HISPANIC_BLACK	NONHISPANIC_WHITE	NONHISPANIC_BLACK
0	False	0	13889	891	47077	40334
1	False	1	14692	914	47527	40656
2	False	2	15653	983	47487	40556
3	True	0	7134	533	19290	13591
4	True	1	6994	566	19942	13511
5	True	2	6909	552	20479	13471

Reshape data

In [24]:

melted_census = (pd.melt(census_data, id_vars=['under_5', 'YEAR'], 
                    value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK', 
                            'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']))

census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
                    black=melted_census.variable.apply(lambda x: x.split('_')[1]))
                 .drop('variable', axis=1)
                 .replace({'black':{'BLACK':1, 'WHITE':0},
                          'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
                 .rename(columns={'YEAR':'year', 'value':'n'}))
                    
census_clean

Out[24]:

	under_5	year	n	black	hispanic
0	False	0	13889	0	1
1	False	1	14692	0	1
2	False	2	15653	0	1
3	True	0	7134	0	1
4	True	1	6994	0	1
5	True	2	6909	0	1
6	False	0	891	1	1
7	False	1	914	1	1
8	False	2	983	1	1
9	True	0	533	1	1
10	True	1	566	1	1
11	True	2	552	1	1
12	False	0	47077	0	0
13	False	1	47527	0	0
14	False	2	47487	0	0
15	True	0	19290	0	0
16	True	1	19942	0	0
17	True	2	20479	0	0
18	False	0	40334	1	0
19	False	1	40656	1	0
20	False	2	40556	1	0
21	True	0	13591	1	0
22	True	1	13511	1	0
23	True	2	13471	1	0

In [25]:

census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])

In [26]:

merged_data = (rotavirus_summary.merge(census_full, on=['under_5', 'year', 'black', 'hispanic','setting'], how='right')
                       .fillna(0).astype(int))

In [27]:

merged_data.head()

Out[27]:

	setting	black	hispanic	cases	n
0	0	0	0	57	47077
1	0	0	1	62	13889
2	0	1	0	99	40334
3	0	1	1	1	891
4	1	0	0	4	47077

VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

In [28]:

inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92
Y2012, Y2013, Y2014 = 0, 1, 2

Will estimate 2014 market shares by extrapolating a linear fit to the previous years.

In [29]:

from sklearn import linear_model

In [30]:

inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share

/Users/fonnescj/anaconda3/lib/python3.5/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)

Out[30]:

[0.9, 0.83, 0.77500000000000002]

In [31]:

ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share

Out[31]:

[0.94, 0.92, 0.90500000000000014]

Enrollment days/week for ED, inpatient, outpatient

In [32]:

monitoring_days = 4, 7, 4

# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2

monitoring_rate = np.array(np.array(monitoring_days)/7)
monitoring_rate

Out[32]:

array([ 0.57142857,  1.        ,  0.57142857])

In [33]:

rotavirus_data[(rotavirus_data.hispanic==1) & (rotavirus_data.white==1) 
               & (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
              (rotavirus_data.under_5==1)].shape

Out[33]:

(157, 18)

In [34]:

setting_data = dict(list(merged_data.groupby('setting')))

Virus data¶

In [35]:

viruses = ['rotavirus','sapovirus','astrovorus','norovirus']

In [36]:

rotavirus_data.groupby(['under_5', 'year', 'setting']).stool_collected.sum().astype(int)

Out[36]:

under_5  year  setting
0        0     0          142
               1            8
               2          110
         1     0          150
               1            4
               2           82
         2     0          125
               1            1
               2           63
1        0     0          406
               1           44
               2          249
         1     0          439
               1           42
               2          220
         2     0          365
               1           35
               2          209
Name: stool_collected, dtype: int64

In [37]:

rotavirus_data[rotavirus_data.stool_collected].groupby(['under_5', 'year', 'setting'])[viruses].mean().round(2)

Out[37]:

			rotavirus	sapovirus	astrovorus	norovirus
under_5	year	setting
0	0	0	0.24	0.11	0.04	0.09
		1	0.12	0.00	0.00	0.12
		2	0.12	0.09	0.05	0.12
	1	0	0.14	0.07	0.03	0.18
		1	0.25	0.00	0.00	0.25
		2	0.07	0.09	0.02	0.15
	2	0	0.19	0.05	0.02	0.11
		1	1.00	0.00	0.00	0.00
		2	0.11	0.03	0.02	0.22
1	0	0	0.16	0.13	0.03	0.16
		1	0.23	0.02	0.02	0.14
		2	0.15	0.12	0.08	0.18
	1	0	0.12	0.12	0.06	0.23
		1	0.19	0.17	0.17	0.31
		2	0.08	0.16	0.08	0.25
	2	0	0.15	0.09	0.04	0.22
		1	0.17	0.06	0.00	0.23
		2	0.10	0.08	0.03	0.29

Inpatient Model¶

In [38]:

from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary

import theano.tensor as tt

In [39]:

def generate_inpatient_model(inpatient_data, pool_years=False):

    with Model() as model:

        under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T

        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=inpatient_data.shape[0],
                                         testval=(cases/weights).astype(int))
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])

            λ = Deterministic('λ', tt.exp(θ))
                      
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model

In [ ]:

Full (unpooled) model¶

In [40]:

inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))

In [41]:

with inpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)

Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 564.3 sec

In [42]:

plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])

Out[42]:

<matplotlib.gridspec.GridSpec at 0x11bd5e6d8>

In [43]:

label_map = {'under_5':{0:'5+', 1:'<5'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'hispanic':{0:'non-hispanic', 1:'hispanic'},
             'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}

In [44]:

inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))

Rate factor for population

In [45]:

rate_factor = 10000

In [46]:

rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)

In [47]:

inpatient_data_labels

Out[47]:

	age	year	race	ethnicity
0	5+	2012/13	white	non-hispanic
1	5+	2012/13	white	hispanic
2	5+	2012/13	black	non-hispanic
3	5+	2013/14	white	non-hispanic
4	5+	2013/14	white	hispanic
5	5+	2014/15	white	hispanic
6	5+	2014/15	black	non-hispanic
7	<5	2012/13	white	non-hispanic
8	<5	2012/13	white	hispanic
9	<5	2012/13	black	non-hispanic
10	<5	2013/14	white	non-hispanic
11	<5	2013/14	white	hispanic
12	<5	2013/14	black	non-hispanic
13	<5	2014/15	white	non-hispanic
14	<5	2014/15	white	hispanic
15	<5	2014/15	black	non-hispanic
16	5+	2012/13	black	hispanic
17	5+	2013/14	black	hispanic
18	5+	2014/15	black	hispanic
19	<5	2012/13	black	hispanic
20	<5	2013/14	black	hispanic
21	<5	2014/15	black	hispanic
22	5+	2014/15	white	non-hispanic
23	5+	2013/14	black	non-hispanic

In [48]:

from pymc3 import hpd
def _hpd_df(x, alpha):
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)

In [49]:

estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
                stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                           lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index
pd.concat([inpatient_data_labels, estimates], axis=1)

Out[49]:

	age	year	race	ethnicity	mean	sd	mc_error	hpd_2.5	hpd_97.5	median	hpd_25	hpd_75
0	5+	2012/13	white	non-hispanic	0.845	0.430	0.009	0.205	1.772	0.773	0.396	0.895
1	5+	2012/13	white	hispanic	1.429	0.984	0.020	0.094	3.421	1.197	0.271	1.329
2	5+	2012/13	black	non-hispanic	1.004	0.500	0.012	0.172	1.917	0.915	0.422	1.034
3	5+	2013/14	white	non-hispanic	0.418	0.299	0.007	0.017	1.024	0.352	0.063	0.376
4	5+	2013/14	white	hispanic	2.060	1.180	0.028	0.259	4.427	1.834	0.937	2.254
5	5+	2014/15	white	hispanic	0.605	0.641	0.014	0.000	1.775	0.405	0.000	0.405
6	5+	2014/15	black	non-hispanic	0.253	0.247	0.006	0.000	0.738	0.186	0.000	0.186
7	<5	2012/13	white	non-hispanic	11.429	2.432	0.047	7.174	16.208	11.257	9.176	12.480
8	<5	2012/13	white	hispanic	22.628	5.570	0.139	12.106	33.637	22.098	16.911	24.050
9	<5	2012/13	black	non-hispanic	10.254	2.636	0.051	5.234	15.231	10.017	7.652	11.122
10	<5	2013/14	white	non-hispanic	14.066	2.614	0.051	9.129	19.126	13.886	12.318	15.786
11	<5	2013/14	white	hispanic	17.146	4.843	0.098	8.274	26.776	16.796	12.789	19.115
12	<5	2013/14	black	non-hispanic	8.824	2.480	0.050	4.343	13.940	8.641	7.006	10.218
13	<5	2014/15	white	non-hispanic	6.849	1.850	0.036	3.491	10.579	6.633	5.111	7.435
14	<5	2014/15	white	hispanic	30.361	6.633	0.141	18.231	43.795	29.874	25.511	34.154
15	<5	2014/15	black	non-hispanic	7.424	2.293	0.044	3.092	11.894	7.274	5.583	8.557
16	5+	2012/13	black	hispanic	0.005	0.151	0.003	0.000	0.000	0.000	0.000	0.000
17	5+	2013/14	black	hispanic	0.006	0.135	0.004	0.000	0.000	0.000	0.000	0.000
18	5+	2014/15	black	hispanic	0.004	0.165	0.004	0.000	0.000	0.000	0.000	0.000
19	<5	2012/13	black	hispanic	0.029	0.910	0.020	0.000	0.000	0.000	0.000	0.000
20	<5	2013/14	black	hispanic	0.013	0.388	0.009	0.000	0.000	0.000	0.000	0.000
21	<5	2014/15	black	hispanic	0.005	0.107	0.003	0.000	0.000	0.000	0.000	0.000
22	5+	2014/15	white	non-hispanic	0.000	0.002	0.000	0.000	0.000	0.000	0.000	0.000
23	5+	2013/14	black	non-hispanic	0.000	0.002	0.000	0.000	0.000	0.000	0.000	0.000

In [50]:

trace, labels = inpatient_full_trace, inpatient_data_labels

In [51]:

index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']

In [52]:

# rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
#                                       names=labels.columns)
# rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
# rate_df.index = rate_df_index
# pivot_all = pd.pivot_table(rate_df
#                  .reset_index(), 
#                            index=index, columns=columns)
# value_iterator = zip(pivot_all['mean'].values.flatten(), 
#                      pivot_all['hpd_2.5'].values.flatten(), 
#                      pivot_all['hpd_97.5'].values.flatten())
# rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
#                               pivot_all['mean'].shape)

# pd.DataFrame(rate_strings, index=pivot_mean.index, columns=pivot_mean.columns)

In [53]:

def generate_table(trace, labels, index, columns, varnames=['λ']):
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                          names=labels.columns)
    rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                     .reset_index(), 
                               index=index, columns=columns)
    value_iterator = zip(pivot_all['mean'].values.flatten(), 
                     pivot_all['hpd_2.5'].values.flatten(), 
                     pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
                                  pivot_all['mean'].shape)
    
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)

In [54]:

generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], columns=['race', 'ethnicity'])

Out[54]:

	race	black		white
	ethnicity	hispanic	non-hispanic	hispanic	non-hispanic
age	year
5+	2012/13	0.01 (0.00, 0.00)	1.00 (0.17, 1.92)	1.43 (0.09, 3.42)	0.85 (0.20, 1.77)
	2013/14	0.01 (0.00, 0.00)	0.00 (0.00, 0.00)	2.06 (0.26, 4.43)	0.42 (0.02, 1.02)
	2014/15	0.00 (0.00, 0.00)	0.25 (0.00, 0.74)	0.60 (0.00, 1.77)	0.00 (0.00, 0.00)
<5	2012/13	0.03 (0.00, 0.00)	10.25 (5.23, 15.23)	22.63 (12.11, 33.64)	11.43 (7.17, 16.21)
	2013/14	0.01 (0.00, 0.00)	8.82 (4.34, 13.94)	17.15 (8.27, 26.78)	14.07 (9.13, 19.13)
	2014/15	0.01 (0.00, 0.00)	7.42 (3.09, 11.89)	30.36 (18.23, 43.80)	6.85 (3.49, 10.58)

In [55]:

inpatient_rate_samples = pd.concat(
        [inpatient_data_labels, 
           pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)

In [56]:

inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race', 'ethnicity']))

inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)

Plot of rates broken down by year/age/race/ethnicity.

In [57]:

sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race', 
                          col='ethnicity', 
            data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
            palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)

Out[57]:

<seaborn.axisgrid.FacetGrid at 0x114c90128>

Pooled across race/ethnicity¶

In [58]:

inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                                    .groupby(['under_5', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
inpatient_pool_race_data

Out[58]:

	under_5	year	cases	n
0	0	0	10	102191
1	0	1	5	103789
2	0	2	2	104679
3	1	0	48	40548
4	1	1	44	41013
5	1	2	36	41411

In [59]:

inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)

In [60]:

with inpatient_model_pool_race:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)

Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 3.1 sec

In [61]:

inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'under_5':'age'}))

In [62]:

generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels, index=['year'], columns=['age'])

Out[62]:

age	5+	<5
year
2012/13	1.06 (0.41, 1.70)	13.09 (9.32, 16.38)
2013/14	0.58 (0.14, 1.05)	13.21 (9.34, 17.32)
2014/15	0.25 (0.01, 0.60)	11.43 (7.74, 15.17)

In [63]:

inpatient_pool_race_samples = pd.concat(
        [inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True), 
           pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)

In [64]:

inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, 
                                       id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

Plot of rates pooled by ethnicity

In [65]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)

Out[65]:

<matplotlib.axes._subplots.AxesSubplot at 0x11ada9ef0>

Pooled across years¶

In [66]:

inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
 .sort(columns=['under_5', 'black', 'hispanic', 'year'], ascending=False)).reset_index(drop=True)

inpatient_pool_year_data

Out[66]:

	under_5	year	black	hispanic	cases	n
0	1	2	1	1	0	552
1	1	1	1	1	0	566
2	1	0	1	1	0	533
3	1	2	1	0	8	13471
4	1	1	1	0	10	13511
5	1	0	1	0	13	13591
6	1	2	0	1	17	6909
7	1	1	0	1	10	6994
8	1	0	0	1	15	7134
9	1	2	0	0	11	20479
10	1	1	0	0	24	19942
11	1	0	0	0	20	19290
12	0	2	1	1	0	983
13	0	1	1	1	0	914
14	0	0	1	1	0	891
15	0	2	1	0	1	40556
16	0	1	1	0	0	40656
17	0	0	1	0	4	40334
18	0	2	0	1	1	15653
19	0	1	0	1	3	14692
20	0	0	0	1	2	13889
21	0	2	0	0	0	47487
22	0	1	0	0	2	47527
23	0	0	0	0	4	47077

In [67]:

inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)

In [68]:

with inpatient_model_pool_year:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)

Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 30.1 sec

In [69]:

inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
                         .drop_duplicates().reset_index(drop=True))

In [70]:

estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
            stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                       lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)

Out[70]:

	age	race	ethnicity	mean	sd	mc_error	hpd_2.5	hpd_97.5	median	hpd_25	hpd_75
0	<5	black	hispanic	0.003	0.107	0.002	0.000	0.000	0.000	0.000	0.000
1	<5	black	non-hispanic	26.644	4.522	0.130	18.073	35.547	26.388	23.132	29.001
2	<5	white	hispanic	70.787	10.213	0.353	50.471	90.144	70.350	63.174	76.382
3	<5	white	non-hispanic	31.384	3.838	0.089	24.137	38.477	31.189	27.996	33.245
4	5+	black	hispanic	0.001	0.017	0.000	0.000	0.000	0.000	0.000	0.000
5	5+	black	non-hispanic	1.219	0.540	0.015	0.323	2.292	1.145	0.727	1.400
6	5+	white	hispanic	3.804	1.572	0.041	1.113	6.877	3.589	2.288	4.234
7	5+	white	non-hispanic	1.267	0.507	0.014	0.440	2.268	1.206	0.758	1.410

In [71]:

generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels, index=['race','ethnicity'], columns=['age'])

Out[71]:

	age	5+	<5
race	ethnicity
black	hispanic	0.00 (0.00, 0.00)	0.00 (0.00, 0.00)
black	non-hispanic	1.22 (0.32, 2.29)	26.64 (18.07, 35.55)
white	hispanic	3.80 (1.11, 6.88)	70.79 (50.47, 90.14)
white	non-hispanic	1.27 (0.44, 2.27)	31.38 (24.14, 38.48)

In [72]:

inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']], 
    pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)

In [73]:

inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'black':{0:'white', 1:'black'},
                                     'hispanic':{0:'non-hispanic', 1:'hispanic'}})
                    .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))

inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)

Plot of rates pooled by year

In [74]:

sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group", 
        data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
        palette="gray_r", linewidth=0.6, fliersize=0)

Out[74]:

<matplotlib.axes._subplots.AxesSubplot at 0x120133ba8>

Outpatient Model¶

Import outpatient denominator information.

In [75]:

from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()

In [76]:

outpatient_project = Project(api_url, api_key)

In [77]:

outpatients = outpatient_project.export_records(format='df')

In [78]:

outpatients.groupby('year').year.count()

Out[78]:

year
2012    15969
2015    17562
Name: year, dtype: int64

In [79]:

outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape

Out[79]:

(393, 15)

In [80]:

outpatients_kids = outpatients[outpatients.ptage<=18].copy()

In [81]:

outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)

In [82]:

outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                        .groupby(['year', 'under_5'])
                        .n.count()
                        .reset_index())
outpatient_n

Out[82]:

	year	under_5	n
0	2012	0	13913
1	2012	1	1422
2	2015	0	11003
3	2015	1	6135

In [83]:

interp_2013 = (outpatient_n.groupby(['under_5'])
                           .apply(lambda x: np.interp(2013, x.year, x.n))
                           .astype(int)
                           .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013

Out[83]:

	under_5	n	year
0	0	12943	2013
1	1	2993	2013

In [84]:

outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n

Out[84]:

	n	under_5	year
0	13913	0	0
1	1422	1	0
0	12943	0	1
1	2993	1	1

In [85]:

outpatient_data_full = setting_data[OUTPATIENT].groupby(['under_5','year']).cases.sum().reset_index()

In [86]:

outpatient_data_full

Out[86]:

	under_5	year	cases
0	0	0	148
1	0	1	120
2	0	2	87
3	1	0	293
4	1	1	273
5	1	2	269

In [87]:

outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['under_5','year'])

In [88]:

outpatient_merged

Out[88]:

	under_5	year	cases	n
0	0	0	148	13913
1	0	1	120	12943
2	1	0	293	1422
3	1	1	273	2993

In [89]:

def generate_outpatient_model(outpatient_data, pool_years=False):

    with Model() as model:

        cases,n = outpatient_data[['cases','n']].values.T

        ω = monitoring_rate[OUTPATIENT]

        shape = outpatient_data.shape[0]

        if pool_years:
            
            assert not len(cases) % 3
            
            shape = int(len(cases)/3)
                                    
            cases = cases.reshape((pooled_shape, 3)).sum(-1)
                      
        π = Beta('π', 1, 10, shape=shape)

        AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)
        
    return model

In [90]:

outpatient_model_full = generate_outpatient_model(outpatient_merged)

Applied logodds-transform to π and added transformed π_logodds to model.

In [91]:

with outpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)

 [-----------------100%-----------------] 2000 of 2000 complete in 1.7 sec

In [92]:

forestplot(outpatient_full_trace[1000:], varnames=['π'])

Out[92]:

<matplotlib.gridspec.GridSpec at 0x12765d978>

In [93]:

outpatient_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'under_5':'age'}))
outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])]

In [94]:

generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'], 
               index=['age'], columns=['year'])

Out[94]:

year	2012/13	2013/14
age
5+	187.53 (159.72, 219.80)	163.25 (133.51, 192.79)
<5	3563.82 (3196.31, 3912.01)	1593.31 (1421.97, 1781.77)

In [95]:

outpatient_full_samples = pd.concat(
        [outpatient_merged[['under_5', 'year']].reset_index(drop=True), 
           pd.DataFrame(outpatient_full_trace['π'][-1000:]).T], axis=1)

In [96]:

outpatient_full_samples = (pd.melt(outpatient_full_samples, 
                                       id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

In [97]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)

Out[97]:

<matplotlib.axes._subplots.AxesSubplot at 0x12585b390>

ED Model¶

In [98]:

ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns

In [99]:

ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])

In [100]:

ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])

In [101]:

ED_all = pd.concat([ED2, ED3])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]

In [102]:

ED_counts = ED_all[eligible_cols].sum(1).astype(int)

In [103]:

rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]
surveillance_counts = rotavirus_data.groupby(['Enrollment Date'])['Record ID'].count()

In [104]:

ED_counts.name = 'count24'
surveillance_counts.name = 'count8'

In [105]:

surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]

Here is what it looks like if you resample to weekly

In [106]:

weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()
(weekly_surveillance.count8 / weekly_surveillance.count24).hist()

Out[106]:

<matplotlib.axes._subplots.AxesSubplot at 0x101bca7f0>

We will just use the proportion of the pooled suveillance

In [107]:

surveillance_total = surveillance.sum()
surveillance_total

Out[107]:

count24    3486.0
count8     1849.0
dtype: float64

In [108]:

surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion

Out[108]:

0.53040734366035569

In [109]:

def generate_ed_model(ED_data):

    with Model() as model:
    
        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        weights = (np.array(monitoring_rate)[ED] 
                         * np.array(ed_market_share[:-1])[ED_data.year]
                         * surveillance_proportion)

        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                         testval=(cases/weights).astype(int))
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        θ = Normal('θ', 0, sd=1000, shape=data_len)

        λ = Deterministic('λ', tt.exp(θ))

        age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)
        
    return model

In [110]:

ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                                    .query('year<2')
                                    .groupby(['under_5', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
ed_pool_race_data

Out[110]:

	under_5	year	cases	n
0	0	0	219	102191
1	0	1	203	103789
2	1	0	497	40548
3	1	1	533	41013

In [111]:

ed_model = generate_ed_model(ed_pool_race_data)

In [112]:

with ed_model:
    start = find_MAP()
    step = NUTS(scaling=start)
    ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)

Assigned Metropolis to weighted_cases
 [-----------------100%-----------------] 2000 of 2000 complete in 2.2 sec

In [113]:

forestplot(ed_trace[1000:], varnames=['λ'])

Out[113]:

<matplotlib.gridspec.GridSpec at 0x128896320>

In [114]:

ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'under_5':'age'}))

In [115]:

generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])

Out[115]:

age	5+	<5
year
2012/13	75.48 (66.17, 86.08)	432.16 (393.73, 468.47)
2013/14	69.52 (60.21, 78.79)	467.24 (428.29, 506.03)

Create data frame of MCMC samples for plotting

In [116]:

ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']], 
           pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)

In [117]:

ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14'}})
                    .rename(columns={'under_5':'age group'}))

In [118]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)

Out[118]:

<matplotlib.axes._subplots.AxesSubplot at 0x1272e4278>

Virus rates¶

In [119]:

census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()

census_data_virus['YEAR'] = census_data_virus.YEAR - 2011

In [120]:

census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()

Out[120]:

	under_5	year	n
0	False	0	111216
1	False	1	113190
2	False	2	114362
3	True	0	45013
4	True	1	45582

In [166]:

rotavirus_data[['rotavirus','norovirus']].sum(1).value_counts()

Out[166]:

0    2598
1     806
2      48
dtype: int64

In [121]:

virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting', 'rotavirus', 'sapovirus',
                                        'astrovorus', 'norovirus'])
             ['Record ID'].count()
             .reset_index()
             .rename(columns={'Record ID':'cases', 'astrovorus':'astrovirus'}))

virus_summary.head()

Out[121]:

	sapovirus	astrovirus	norovirus	cases
0	0	0	0	158
1	0	0	1	10
2	0	1	0	2
3	1	0	0	12
4	1	1	0	3

In [122]:

merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)

Out[122]:

	setting	rotavirus	sapovirus	astrovirus	norovirus	cases	n
0	0	0	0	0	0	158	111216
1	0	0	0	0	1	10	111216
2	0	0	0	1	0	2	111216
3	0	0	1	0	0	12	111216
4	0	0	1	1	0	3	111216
5	0	1	0	0	0	30	111216
6	0	1	0	0	1	3	111216
7	0	1	1	0	0	1	111216
8	1	0	0	0	0	8	111216
9	1	0	0	0	1	1	111216

In [123]:

rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max}))

sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
                 .groupby(['under_5', 'year', 'setting'])
                 .agg({'cases':sum, 'n':max})
                 .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

In [124]:

virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
           noro_data.reset_index().assign(virus='norovirus'),
           rota_data.reset_index().assign(virus='rotavirus'),
           sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)
virus_totals.head(10)

Out[124]:

	under_5	year	setting	cases	n	virus
0	0	0	0	5	111216	astrovirus
1	0	0	1	0	111216	astrovirus
2	0	0	2	6	111216	astrovirus
3	0	1	0	4	113190	astrovirus
4	0	1	1	0	113190	astrovirus
5	0	1	2	2	113190	astrovirus
6	0	2	0	3	114362	astrovirus
7	0	2	1	0	114362	astrovirus
8	0	2	2	1	114362	astrovirus
9	1	0	0	13	45013	astrovirus

In [125]:

n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup

Out[125]:

			n
under_5	year	virus
0	0	astrovirus	111216
		norovirus	111216
		rotavirus	111216
		sapovirus	111216
	1	astrovirus	113190
		norovirus	113190
		rotavirus	113190
		sapovirus	113190
	2	astrovirus	114362
		norovirus	114362
		rotavirus	114362
		sapovirus	114362
1	0	astrovirus	45013
		norovirus	45013
		rotavirus	45013
		sapovirus	45013
	1	astrovirus	45582
		norovirus	45582
		rotavirus	45582
		sapovirus	45582
	2	astrovirus	46003
		norovirus	46003
		rotavirus	46003
		sapovirus	46003

Inpatient virus rate mode¶

In [127]:

virus_inpatient_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                               'rotavirus':1,
                                                               'sapovirus':2,
                                                               'astrovirus':3}}))[virus_totals.setting==INPATIENT]

In [128]:

with Model() as virus_inpatient_model:

    cases, year, n = virus_inpatient_data[['cases', 'year', 'n']].values.T

    weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year]

    weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=virus_inpatient_data.shape[0],
                                     testval=(cases/weights).astype(int))
    weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

    θ = Normal('θ', 0, sd=1000, shape=virus_inpatient_data.shape[0])

    λ = Deterministic('λ', tt.exp(θ))

    virus_obs = Potential('virus_obs', Poisson.dist(λ*n).logp(weighted_cases))

In [129]:

from pymc3 import Metropolis

with virus_inpatient_model:
    virus_inpatient_trace = sample(20000, step=Metropolis(), njobs=2, random_seed=seed_numbers)

 [-----------------100%-----------------] 20000 of 20000 complete in 6.6 sec

In [130]:

virus_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_inpatient_trace['λ'][-1000:]).T], axis=1)

In [131]:

virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

In [132]:

sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_samples.assign(rate=virus_samples.value*rate_factor),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)

Out[132]:

<seaborn.axisgrid.FacetGrid at 0x128302278>

Outpatient virus rate model¶

In [133]:

virus_outpatient_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                               'rotavirus':1,
                                                               'sapovirus':2,
                                                               'astrovirus':3}}))[virus_totals.setting==OUTPATIENT]

In [134]:

with Model() as virus_outpatient_model:

    cases,n = virus_outpatient_data[['cases','n']].values.T

    ω = monitoring_rate[OUTPATIENT]

    π = Beta('π', 1, 10, shape=virus_outpatient_data.shape[0])

    virus_out_obs = Binomial('virus_out_obs', n, π*ω, observed=cases)

Applied logodds-transform to π and added transformed π_logodds to model.

In [135]:

with virus_outpatient_model:
    virus_outpatient_trace = sample(2000, njobs=2, random_seed=seed_numbers)

Assigned NUTS to π_logodds
 [-----------------100%-----------------] 2000 of 2000 complete in 7.4 sec

In [136]:

virus_out_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_outpatient_trace['π'][-1000:]).T], axis=1)

In [137]:

virus_out_samples = (pd.melt(virus_out_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

In [138]:

sns.set(style="ticks")

sns.set_context(font_scale=1.5)
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_out_samples.assign(rate=virus_out_samples.value*rate_factor),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)

Out[138]:

<seaborn.axisgrid.FacetGrid at 0x12b235198>

ED virus rate model¶

In [139]:

virus_ed_data = (virus_totals.replace({'virus': {'norovirus':0,
                                                               'rotavirus':1,
                                                               'sapovirus':2,
                                                               'astrovirus':3}}))[virus_totals.setting==ED]

In [140]:

with Model() as virus_ed_model:

    data_len = virus_ed_data.shape[0]

    cases, year, n = virus_ed_data[['cases', 'year', 'n']].values.T

    weights = (np.array(monitoring_rate)[ED] 
                     * np.array(ed_market_share)[year]
                     * surveillance_proportion)

    weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                     testval=(cases/weights).astype(int))
    weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

    θ = Normal('θ', 0, sd=1000, shape=data_len)

    λ = Deterministic('λ', tt.exp(θ))

    virus_ed_obs = Poisson('virus_ed_obs', λ * n, observed=weighted_cases)

In [141]:

with virus_ed_model:
    virus_ed_trace = sample(2000, njobs=2, random_seed=seed_numbers)

Assigned Metropolis to weighted_cases
Assigned NUTS to θ
 [-----------------100%-----------------] 2000 of 2000 complete in 10.4 sec

In [142]:

virus_ed_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']], 
           pd.DataFrame(virus_ed_trace['λ'][-1000:]).T], axis=1)

In [143]:

virus_ed_samples = (pd.melt(virus_ed_samples, id_vars=['under_5', 'year', 'virus'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'under_5':'age group'}))

In [144]:

sns.set(style="ticks")

sns.set_context(font_scale=1.5)
rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=virus_ed_samples.assign(rate=virus_ed_samples.value*rate_factor),
        palette="PRGn", size=6, aspect=.75, kind='box', linewidth=0.6,
        col_wrap=2)

rateplot.despine(left=True)

Out[144]:

<seaborn.axisgrid.FacetGrid at 0x12d316550>