#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')

seed_numbers = 20090425, 19700903

from datetime import timedelta, datetime


# # Data import and cleanup

# In[2]:


# rotavirus_data = pd.read_csv('data/AGE Year 2-4 with Final Rota.csv', low_memory=False)
rotavirus_data = pd.read_csv('data/AGE Cases and Healthy Controls Merged with Lab Results Year 2 to 4 v05.23.2017.csv')
rotavirus_data.index.is_unique


# In[3]:


rotavirus_data.head()


# Rename columns and recode data

# In[4]:


rotavirus_data.race.value_counts()


# In[5]:


rotavirus_data['white'] = rotavirus_data['race'] == 1
rotavirus_data['black'] = rotavirus_data['race'] == 2


# In[6]:


rotavirus_data['astro'].value_counts().index


# In[7]:


# rotavirus_data['rotavirus'] = (rotavirus_data['rotacdc'] == 'Positive').astype(float)
# rotavirus_data['sapovirus'] = (rotavirus_data['sapo'] == 'Positive').astype(float)
# rotavirus_data['astrovirus'] = (rotavirus_data['astro'] == 'Positive').astype(float)
# rotavirus_data['norovirus'] = (rotavirus_data['noro'] == 'Positive').astype(float)

# rotavirus_data.loc[rotavirus_data['rotacdc'].isnull() | (rotavirus_data.rotacdc=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'rotavirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['sapo'].isnull() | (rotavirus_data.sapo=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'sapovirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['astro'].isnull() | (rotavirus_data.astro=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'astrovirus'] = np.nan
# rotavirus_data.loc[rotavirus_data['noro'].isnull() | (rotavirus_data.noro=='Inhibition') | (rotavirus_data.specimencol=='No'), 
#                    'norovirus'] = np.nan


# In[8]:


rotavirus_data=rotavirus_data.rename(columns={'rotacdc':'rotavirus',
                       'sapo':'sapovirus',
                       'astro':'astrovirus',
                       'noro':'norovirus'})


# In[9]:


rotavirus_data.rotavirus.value_counts()


# In[10]:


rotavirus_data.rotavu.value_counts()


# In[11]:


virus_cols = ['rotavirus','sapovirus','astrovirus','norovirus']


# Proportion of test results missing

# In[12]:


rotavirus_data[virus_cols].isnull().mean()


# In[13]:


rotavirus_data['scrdate'] = pd.to_datetime(rotavirus_data['scrdate'])
rotavirus_data['dob'] = pd.to_datetime(rotavirus_data['dob'])


# In[14]:


astro_data = rotavirus_data[rotavirus_data.astrovirus==1].dropna(subset=['astro_ct'])
astro_data.shape


# In[15]:


noro_data = rotavirus_data[rotavirus_data.norovirus==1].dropna(subset=['noro_ct'])
noro_data.shape


# In[16]:


noro_data['coinfect'] = noro_data[['rotavirus', 'sapovirus', 'astrovirus']].sum(1)


# In[17]:


astro_data['coinfect'] = astro_data[['rotavirus', 'sapovirus', 'norovirus']].sum(1)


# In[18]:


with_coinf = noro_data.noro_ct[noro_data.coinfect==1].astype(float)
no_coinf = noro_data.noro_ct[(noro_data.coinfect==0)].astype(float)

# with_coinf = astro_data.astro_ct[astro_data.coinfect==1].astype(float)
# no_coinf = astro_data.astro_ct[(astro_data.coinfect==0)].astype(float)


# In[19]:


from scipy.stats import ranksums

ranksums(with_coinf, no_coinf)


# In[20]:


rotavirus_data.sapo_ct.dropna().astype(float).hist()


# Calculate age, and compare to given age.

# In[21]:


rotavirus_data['age_months'] = (rotavirus_data['scrdate']
                              - rotavirus_data['dob'] + timedelta(1)).astype('timedelta64[M]') 


# In[22]:


rotavirus_data.age_months.hist();


# Group into ages under 5 and 5 or over

# In[23]:


rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)


# In[24]:


rotavirus_data['under_2'] = ((rotavirus_data.age_months / 12) < 2).astype(int)


# In[25]:


rotavirus_data.loc[rotavirus_data.caseid=='EN1I0141', ['under_5', 'under_2']]


# In[26]:


rotavirus_data['age_group'] = 2
rotavirus_data.loc[rotavirus_data.under_5==1, 'age_group'] = 1
rotavirus_data.loc[rotavirus_data.under_2==1, 'age_group'] = 0


# In[27]:


rotavirus_data.age_group.value_counts()


# In[28]:


rotavirus_data.year.value_counts()


# Calculate natural year

# In[29]:


rotavirus_data['year'] = rotavirus_data.year - 2


# Rename setting variable

# In[30]:


assert not rotavirus_data.settingnew.isnull().sum()


# In[31]:


rotavirus_data['setting'] = rotavirus_data.settingnew - 1


# In[32]:


# Indices to each value
INPATIENT, ED, OUTPATIENT, HC = 0, 1, 2, 3

# rotavirus_data['setting'] = rotavirus_data.provider.replace({1:INPATIENT, 2:ED, 3:OUTPATIENT, 4:HC})


# Recode sex and stool

# In[33]:


assert not rotavirus_data['specimencol'].isnull().sum()


# In[34]:


rotavirus_data['male'] = rotavirus_data['sex']==1
rotavirus_data['stool_collected'] = rotavirus_data['specimencol']==1


# Recode insurance

# In[35]:


rotavirus_data.insurance.value_counts()


# In[36]:


rotavirus_data['public_ins'] = rotavirus_data.insurance.isin({1, 3})
rotavirus_data['private_ins'] = rotavirus_data.insurance.isin({2, 3})
rotavirus_data.loc[rotavirus_data.insurance==8, ['public_ins', 'private_ins']] = np.nan


# In[37]:


# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['intvwdate', 'dob', 'provider', 'coldat', 'tdat', 
       'insurance', 'sex', 'hispcr', 'mdegree_7', 'rvacver_7',
       'rvacver_7r', 'race', 'race2_8'], axis=1)


# Summary of counts by setting

# In[38]:


rotavirus_data.shape


# Summary of missing values

# In[39]:


rotavirus_data.isnull().sum()


# Aggregate data by age group and setting

# In[40]:


rotavirus_summary = (rotavirus_data.drop('scrdate', axis=1)
             .groupby(['age_group', 'year', 'setting', 'black'])['caseid'].count()
             .reset_index()
             .rename(columns={'caseid':'cases'}))

rotavirus_summary.head()


# ### Census data
# 
# The key for AGEGRP is as follows: 
# 
# + 0 = Total
# + 1 = Age 0 to 4 years
# + 2 = Age 5 to 9 years
# + 3 = Age 10 to 14 years
# + 4 = Age 15 to 19 years
# + 5 = Age 20 to 24 years
# + 6 = Age 25 to 29 years
# + 7 = Age 30 to 34 years
# + 8 = Age 35 to 39 years
# + 9 = Age 40 to 44 years
# + 10 = Age 45 to 49 years 
# + 11 = Age 50 to 54 years 
# + 12 = Age 55 to 59 years 
# + 13 = Age 60 to 64 years 
# + 14 = Age 65 to 69 years 
# + 15 = Age 70 to 74 years 
# + 16 = Age 75 to 79 years 
# + 17 = Age 80 to 84 years 
# + 18 = Age 85 years or older

# In[41]:


davidson_census = pd.read_csv('data/davidson_county.csv')
davidson_census.head()


# Number under 18 and under 5 from [2010 census](http://www.census.gov/quickfacts/table/PST045215/47037,47):

# In[42]:


under_18_2010 = 136612
under_5_2010 = 44493
over_4_under_18 = under_18_2010 - under_5_2010

over_4_under_18


# Children 5-19 from census dataset:

# In[43]:


under_20 = davidson_census.loc[(davidson_census.YEAR==2010) & davidson_census.AGEGRP.isin([2,3,4]), 'TOT_POP'].sum()

under_20


# Proportion over 4 but also under 18:

# In[44]:


pct_under_18 = over_4_under_18 / under_20

pct_under_18


# Identify racial/ethnic subgroup populations

# In[45]:


census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy()
census_data_child['WHITE'] = census_data_child[['WA_MALE', 'WA_FEMALE']].sum(axis=1)
census_data_child['BLACK'] = census_data_child[['BA_MALE', 'BA_FEMALE']].sum(axis=1)


# In[46]:


census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
    .groupby(['under_5','YEAR'])[['WHITE', 'BLACK']]
    .sum()).reset_index()

census_data['year'] = census_data.YEAR - 2011
census_data


# Correct over-4 counts

# In[47]:


census_data.loc[~census_data.under_5, ['WHITE', 'BLACK']] *= pct_under_18


# Add "under 2" category that is 40% of `under_5`

# In[48]:


census_data['under_2'] = False


# In[49]:


census_under_2 = census_data[census_data.under_5].copy()
census_under_2.under_2 = True
census_under_2.under_5 = False
census_under_2.WHITE = (census_under_2.WHITE*0.4).astype(int)
census_under_2.BLACK = (census_under_2.BLACK*0.4).astype(int)

census_under_2


# Subtract under-2's from under-5's

# In[50]:


census_data.loc[census_data.under_5, ['WHITE', 'BLACK']] -= census_under_2[['WHITE', 'BLACK']]


# In[51]:


census_data = (pd.concat([census_data, census_under_2])
                   .reset_index(drop=True)[['under_2', 'under_5', 'year', 'WHITE', 'BLACK']]
                   .rename(columns={'under_5':'2_to_4'}))

census_data


# In[52]:


census_data[['WHITE', 'BLACK']] = census_data[['WHITE', 'BLACK']].astype(int)

census_data


# In[53]:


census_data['age_group'] = 2
census_data.loc[census_data['under_2'], 'age_group'] = 0
census_data.loc[census_data['2_to_4'], 'age_group'] = 1

census_data


# Reshape data

# In[54]:


melted_census = (pd.melt(census_data, id_vars=['age_group', 'year'], 
                    value_vars=['WHITE', 'BLACK']))

census_clean = (melted_census.assign(black=(melted_census.variable=='BLACK').astype(int))
                 .drop('variable', axis=1)
                 .rename(columns={'value':'n'}))
                    
census_clean


# In[55]:


census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])


# In[56]:


merged_data = (rotavirus_summary.merge(census_full, on=['age_group', 'year', 'black', 'setting'], how='right')
                       .fillna(0).astype(int))


# VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

# In[57]:


inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.60, 0.59, 0.62
Y2012, Y2013, Y2014 = 0, 1, 2


# Will estimate 2014 market shares by extrapolating a linear fit to the previous years.

# In[58]:


from sklearn import linear_model


# In[59]:


inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))
inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1), y=inpatient_df.y)
inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]]
inpatient_market_share


# In[60]:


ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))
ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y)
ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share


# Enrollment days/week for inpatient, outpatient, ED

# In[61]:


monitoring_days = 7, 4, 4

# Only 5.5 days of intake on outpatient
monitoring_rate = np.array(monitoring_days)/np.array([7, 5.5, 7])
monitoring_rate


# In[62]:


rotavirus_data[(rotavirus_data.white==1) & (rotavirus_data.setting==2) & (rotavirus_data.year==2) &
              (rotavirus_data.under_5==1)].shape


# In[63]:


setting_data = dict(list(merged_data.groupby('setting')))


# # Inpatient AGE Model

# In[64]:


from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential, Metropolis
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform, Uniform, Flat, Gamma
from pymc3 import traceplot, forestplot, summary, df_summary

import theano.tensor as tt


# In[65]:


setting_data[INPATIENT].drop('setting', axis=1).head()


# In[66]:


inpatient_data = setting_data[INPATIENT].drop('setting', axis=1)
inpatient_data[['cases', 'n']]


# In[67]:


def generate_inpatient_model(inpatient_data, pool_years=False):
    
    assert not inpatient_data.isnull().sum().any()

    with Model() as model:

        cases, n = inpatient_data[['cases', 'n']].values.T

        # Cases weighted by monitoring effort and market share
        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=inpatient_data.shape[0])
#                                          testval=(cases/weights).astype(int))
        
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])

            # Poisson rate parameter
            λ = Deterministic('λ', tt.exp(θ))
                      
            # Poisson likelihood
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model


# ### Full (unpooled) model

# In[68]:


inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))


# In[69]:


iterations = 1000
tune = 1000


# In[70]:


with inpatient_model_full:
    inpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)


# In[71]:


forestplot(inpatient_full_trace, varnames=['λ'])


# In[72]:


label_map = {'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'setting':{0:'inpatient', 1:'ED', 2:'outpatient'}}


# In[73]:


inpatient_data_labels = (setting_data[INPATIENT][['age_group', 'year', 'black']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'black':'race', 'age_group':'age'}))


# Rate factor for population 

# In[74]:


rate_factor = 10000


# In[75]:


rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns)


# In[76]:


from pymc3 import hpd
def _hpd_df(x, alpha):
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)


# In[77]:


estimates = (df_summary(inpatient_full_trace, varnames=['λ'],
                stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                           lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates.index = inpatient_data_labels.index


# In[78]:


trace, labels = inpatient_full_trace, inpatient_data_labels


# In[79]:


def generate_table(trace, labels, index, columns, varnames=['λ']):
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                          names=labels.columns)
    rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                     .reset_index(), 
                               index=index, columns=columns)
    value_iterator = zip(pivot_all['mean'].values.flatten(), 
                     pivot_all['hpd_2.5'].values.flatten(), 
                     pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
                                  pivot_all['mean'].shape)
    
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)


# In[80]:


inpatient_full_table = generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], 
               columns=['race']).sort_index()

inpatient_full_table.to_excel('results/inpatient_full.xlsx')
inpatient_full_table


# In[81]:


inpatient_rate_samples = pd.concat(
        [inpatient_data_labels, 
           pd.DataFrame(inpatient_full_trace['λ']).T], axis=1)


# In[82]:


inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race']))


# Plot of rates broken down by year/age/race/ethnicity.

# In[83]:


sns.set(style="ticks")
rateplot = sns.factorplot(x="year", y="rate", hue="age", col='race', 
                          hue_order=['0-1','2-4','5+'],
            data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*rate_factor),
            palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0)
rateplot.despine(left=True)


# ### Pooled across race

# In[84]:


inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                                    .groupby(['age_group', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
inpatient_pool_race_data


# In[85]:


inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)


# In[86]:


with inpatient_model_pool_race:
    inpatient_pool_race_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)


# In[87]:


inpatient_pool_race_data_labels = (inpatient_pool_race_data[['age_group', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'age_group':'age'}))


# In[88]:


inpatient_pool_race_table = generate_table(inpatient_pool_race_trace, 
                                           inpatient_pool_race_data_labels, 
                                           index=['year'], columns=['age'])

inpatient_pool_race_table.to_excel('results/inpatient_pool_race.xlsx')
inpatient_pool_race_table


# In[89]:


inpatient_pool_race_samples = pd.concat(
        [inpatient_pool_race_data[['age_group', 'year']].reset_index(drop=True), 
           pd.DataFrame(inpatient_pool_race_trace['λ']).T], axis=1)


# In[90]:


inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, 
                                       id_vars=['age_group', 'year'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'age_group':'age group'}))


# Plot of rates pooled by ethnicity

# In[91]:


sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0).set_xlabel('year')


# ### Pooled across years

# In[92]:


inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
 .sort_values(by=['age_group', 'black', 'year'], ascending=False)).reset_index(drop=True)

inpatient_pool_year_data


# In[93]:


inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)


# In[94]:


with inpatient_model_pool_year:
    inpatient_pool_year_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)


# In[95]:


inpatient_pool_year_labels = (inpatient_pool_year_data[['age_group', 'black']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'age_group':'age'})
                         .drop_duplicates().reset_index(drop=True))


# In[96]:


estimates_pooled = (df_summary(inpatient_pool_year_trace, varnames=['λ'],
            stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                       lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3)
estimates_pooled.index = inpatient_pool_year_labels.index
pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)


# ### Inpatient AGE estimates

# In[97]:


inpatient_pool_year_table = generate_table(inpatient_pool_year_trace, 
                                           inpatient_pool_year_labels, 
                                           index=['race'], columns=['age'])

inpatient_pool_year_table.to_excel('results/inpatient_pool_year.xlsx')
inpatient_pool_year_table


# In[98]:


inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['age_group', 'black']], 
    pd.DataFrame(inpatient_pool_year_trace['λ']).T], axis=1)


# In[99]:


inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['age_group', 'black'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'black':{0:'white', 1:'black'}})
                    .rename(columns={'black':'race', 'age_group':'age group'}))


# Plot of rates pooled by year

# In[100]:


sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race", y="rate", hue="age group", 
        data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*rate_factor),
        palette="gray_r", linewidth=0.6, fliersize=0)


# # Outpatient AGE Model
# 
# Import outpatient denominator information from REDCap

# In[101]:


import redcap
import getpass

local_auth = True

api_url = 'https://redcap.vanderbilt.edu/api/'

if local_auth:
    api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()

else:
    api_key = getpass.getpass()


# In[102]:


outpatient_project = redcap.Project(api_url, api_key)


# In[103]:


outpatients = outpatient_project.export_records(format='df')


# In[104]:


outpatients.groupby('year').year.count()


# In[105]:


outpatients.shape


# In[106]:


outpatients.head()


# In[107]:


outpatients[['ageinjan', 'ageinnov']].hist(bins=20)


# Going to assume that the outlier ages above were mistakenly entered as months.

# In[108]:


outliers = outpatients.ageinjan > 20
outpatients.loc[outliers, ['ageinjan', 'ageinnov']] = outpatients.loc[outliers, ['ageinjan', 'ageinnov']] / 12


# In[109]:


outpatients[['ageinjan', 'ageinnov']].hist(bins=20)


# In[110]:


outpatients['age'] = np.maximum(outpatients[['ageinjan', 'ageinnov']].mean(1), 0.1)


# In[111]:


outpatients_kids = outpatients[outpatients.age<18].copy()


# In[112]:


outpatients_kids['under_5'] = (outpatients_kids.age<5).astype(int)
outpatients_kids['under_2'] = (outpatients_kids.age<2).astype(int)


# In[113]:


outpatients_kids['age_group'] = 2
outpatients_kids.loc[outpatients_kids.under_5==1, 'age_group'] = 1
outpatients_kids.loc[outpatients_kids.under_2==1, 'age_group'] = 0


# In[114]:


outpatients_kids.groupby('year').age.hist(alpha=0.3)


# In[115]:


outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                        .groupby(['year', 'age_group'])
                        .n.count()
                        .reset_index())
outpatient_n


# Interpolate 2013

# In[116]:


interp_2013 = (outpatient_n.groupby(['age_group'])
                           .apply(lambda x: np.interp(2013, x.year, x.n))
                           .astype(int)
                           .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013


# Drop 2015 and recode year

# In[117]:


outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n


# In[118]:


outpatient_data_full = setting_data[OUTPATIENT].groupby(['age_group','year']).cases.sum().reset_index()


# In[119]:


outpatient_data_full


# In[120]:


outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['age_group','year'])


# In[121]:


outpatient_merged


# In[122]:


def generate_outpatient_model(outpatient_data, pool_years=False):
    
    assert not outpatient_data.isnull().sum().any()

    with Model() as model:

        cases,n = outpatient_data[['cases','n']].values.T
        
        shape = outpatient_data.shape[0]
        
        # Cases weighted by monitoring effort
        weights = monitoring_rate[OUTPATIENT] 

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=shape)
        
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        
        if pool_years:
            
            assert not len(cases) % 3
            
            pooled_shape = int(len(cases)/3)
                                    
            pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))
            
            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)

            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))
                                
        else:
                      
            θ = Normal('θ', 0, sd=1000, shape=outpatient_data.shape[0])

            # Poisson rate parameter
            λ = Deterministic('λ', tt.exp(θ))
                      
            # Poisson likelihood
            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model


# In[123]:


outpatient_model_full = generate_outpatient_model(outpatient_merged)


# In[124]:


with outpatient_model_full:
    outpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)


# In[125]:


outpatient_data_labels = (inpatient_pool_race_data[['age_group', 'year']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'age_group':'age'}))
outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])]
outpatient_data_labels.columns = ['age', 'year']


# In[126]:


outpatient_data_labels


# ### Outpatient AGE estimates

# In[127]:


outpatient_full_table = generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['λ'], 
               index=['age'], columns=['year'])

outpatient_full_table.to_excel('results/outpatient_pool_race.xlsx')
outpatient_full_table


# In[128]:


outpatient_full_samples = pd.concat(
        [outpatient_merged[['age_group', 'year']].reset_index(drop=True), 
           pd.DataFrame(outpatient_full_trace['λ']).T], axis=1)


# In[129]:


outpatient_full_samples = (pd.melt(outpatient_full_samples, 
                                       id_vars=['age_group', 'year'])
                    .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                                     'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                    .rename(columns={'age_group':'age group'}))


# In[130]:


sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)


# ## ED AGE Model

# In[131]:


ED2 = pd.read_excel('data/FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('data/Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED2.columns = ED3.columns


# In[132]:


ED4 = pd.read_excel('data/Year 4 denominators.xlsx', index_col=0)
ED4 = ED4.fillna(ED4.mean())
ED4.columns = ED3.columns


# In[133]:


ED2_total = ED2.sum()
ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[134]:


ED3_total = ED3.sum()
ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[135]:


ED4_total = ED4.sum()
ED4_eligible = (ED4_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], 
                        ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                                'Total Eligible on Date (Age > 11 Years and < 18 Years)']])
ED4_visits = (ED4_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
                        ED4_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                                 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[136]:


ED_all = pd.concat([ED2, ED3, ED4])
eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]


# In[137]:


ED_counts = ED_all[eligible_cols].sum(1).astype(int)


# In[138]:


rotavirus_ed = rotavirus_data[rotavirus_data.setting==ED]
surveillance_counts = rotavirus_data.groupby(['scrdate'])['caseid'].count()


# In[139]:


ED_counts.name = 'count24'
surveillance_counts.name = 'count8'


# In[140]:


surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()
# Drop days where 8-hour count exceeds 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]


# Here is what it looks like if you resample to weekly

# In[141]:


weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W')
(weekly_surveillance.count8.mean() / weekly_surveillance.count24.mean()).hist()


# We will just use the proportion of the pooled suveillance

# In[142]:


surveillance_total = surveillance.sum()
surveillance_total


# In[143]:


surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion


# In[144]:


def generate_ed_model(ED_data):
    
    assert not ED_data.isnull().sum().any()

    with Model() as model:
    
        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        # Weights according to ED monitoring intensity, market share and 8-hour surveillance
        weights = (np.array(monitoring_rate)[ED] 
                         * np.array(ed_market_share)[ED_data.year]
                         * surveillance_proportion)

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', cases, n, shape=data_len)
        # Likelihood of observed cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)


        θ = Normal('θ', 0, sd=1000, shape=data_len)

        # Poisson rate parametera
        λ = Deterministic('λ', tt.exp(θ))
        
        # Poisson likelihood
        AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))
        
    return model


# In[145]:


ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                                    .groupby(['age_group', 'year'])[['cases','n']]
                                    .sum()
                                    .reset_index())
ed_pool_race_data


# Back-of-the-envelope rate calculation

# In[146]:


ed_pool_race_data.assign(empirical_rate=rate_factor * ed_pool_race_data.cases/(ed_pool_race_data.n 
                                                                 * np.array(monitoring_rate)[ED] 
                                                                 * surveillance_proportion
                                                                 * np.array(ed_market_share)[ed_pool_race_data.year]))


# In[147]:


ed_model = generate_ed_model(ed_pool_race_data)


# In[148]:


with ed_model:
    ed_trace = sample(1000, tune=tune, njobs=2, random_seed=seed_numbers)


# In[149]:


forestplot(ed_trace, varnames=['λ'])


# In[150]:


ed_data_labels = (ed_pool_race_data[['age_group', 'year']]
                         .reset_index(drop=True)
                        .replace(label_map)
                    .rename(columns={'age_group':'age'}))


# ### ED AGE estimates

# In[151]:


ed_table = generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])

ed_table.to_excel('results/ed_pool_race.xlsx')
ed_table


# Create data frame of MCMC samples for plotting

# In[152]:


ed_rate_samples = pd.concat([ed_pool_race_data[['age_group', 'year']], 
           pd.DataFrame(ed_trace['λ']).T], axis=1)


# In[153]:


ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['age_group', 'year'])
                    .replace({'under_5':{0:'5+', 1:'<5'},
                                     'year':{0:'2012/13', 1:'2013/14'}})
                    .rename(columns={'age_group':'age group'}))


# In[154]:


sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor),
            palette="gray", linewidth=0.6, fliersize=0)


# # Virus Rate Models

# In[155]:


collected_stools = (rotavirus_data.groupby(['age_group', 'black', 'year', 'setting'])
                        .agg({'stool_collected':['sum', 'count']}))

levels = collected_stools.columns.levels
labels = collected_stools.columns.labels
collected_stools.columns = ['stool_collected', 'enrolled']

collected_stools = collected_stools.reset_index()
collected_stools.head()


# In[156]:


virus_summary = (rotavirus_data.dropna(subset=virus_cols)
                 
                 .groupby(['age_group', 'black', 'year', 'setting', 'rotavirus', 'sapovirus',
                                        'astrovirus', 'norovirus'])['caseid'].count()
                 .reset_index()
                 .rename(columns={'caseid':'cases'}))

virus_summary.head()


# In[157]:


merged_virus_data = (virus_summary.merge(collected_stools, 
                                         on=['age_group', 'black', 'year', 'setting'])
                                  .merge(census_clean, 
                                         on=['age_group', 'year', 'black'], 
                                        how='left'))
merged_virus_data.head(10)


# In[158]:


any_virus = (merged_virus_data.groupby(['black','age_group', 'year', 'setting'])
     .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})).sum(level=['age_group', 'year', 'setting'])
any_virus


# All possible combinations of age, year and setting for index.

# In[159]:


complete_index = pd.MultiIndex.from_product([range(3), range(3), range(4)], names=['age_group', 'year', 'setting'])


# In[160]:


sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))

rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
                 .groupby(['age_group', 'year', 'setting'])
                 .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})
                 .reindex(complete_index)
                 .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected)
                 .fillna(0).astype(int))


# In[161]:


virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
           noro_data.reset_index().assign(virus='norovirus'),
           rota_data.reset_index().assign(virus='rotavirus'),
           sapo_data.reset_index().assign(virus='sapovirus')])
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)


# ## Inpatient virus rate model

# In[205]:


virus_lookup = {0:'norovirus',
                1:'rotavirus',
                2:'sapovirus',
                3:'astrovirus'}


# In[162]:


virus_inpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==INPATIENT].reset_index(drop=True)


# In[163]:


cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T


# In[164]:


with Model() as virus_inpatient_model:
    
    groups = virus_inpatient_data.shape[0]
    
    # Probability of stool collection
    ψ = Beta('ψ', 10, 1, shape=3)
    
    # Age-group specific enrolled inpatients, and inpatients with collected stool
    inpatient_enrolled = rotavirus_data.groupby('age_group').apply(lambda x: (x.setting==INPATIENT).sum()).values

    collected = (rotavirus_data.groupby('age_group')
                               .apply(lambda x: x[x.setting==INPATIENT].stool_collected.sum())
                               .values)

    stool_likelihood = Binomial('stool_likelihood', n=inpatient_enrolled, p=ψ, 
                                observed=collected)

    # Extract data columns
    cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T

    enrolled_cases = DiscreteUniform('enrolled_cases', lower=cases, upper=enrolled, shape=groups,
                                    testval=(cases/ψ[age_group.astype(int)]).astype('int32'))  

    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', 
                                         n=enrolled_cases, 
                                         p=ψ[age_group.astype(int)], 
                                         observed=cases)

    # Calculate an "effective N" that corresponds to sampled population
    weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year.astype(int)]

    n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, shape=groups)
    
    davidson_cases_likelihood = Potential('davidson_cases_likelihood', 
                          Binomial.dist(n=n, p=weights).logp(n_eff.astype('int32')).sum())

    # Population rate
    θ = Normal('θ', 0, sd=10, shape=groups)

    λ = Deterministic('λ', tt.exp(θ))

    # Data likelihood
    virus_obs = Potential('virus_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases))


# In[165]:


with virus_inpatient_model:
    virus_inpatient_trace = sample(iterations, tune=tune, njobs=2,  
                                   random_seed=seed_numbers)


# Posterior distribution of stool sampling rate

# In[166]:


forestplot(virus_inpatient_trace, varnames=['ψ'])


# In[167]:


traceplot(virus_inpatient_trace, varnames=['ψ'])


# In[168]:


forestplot(virus_inpatient_trace, varnames=['λ'])


# ### Inpatient virus rates, plotted and tabulated

# In[169]:


inpatient_sample_melted = pd.melt(pd.DataFrame(virus_inpatient_trace['λ']))


# In[170]:


inpatient_virus_samples = (virus_inpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                 .merge(inpatient_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[171]:


sns.set(style="ticks")

inpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=inpatient_virus_samples.assign(rate=inpatient_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

inpatient_rateplot.despine(left=True)


# In[172]:


virus_inpatient_data.to_excel('inpatient_virus_data.xlsx')


# In[173]:


inpatient_virus_labels = (virus_inpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[174]:


labels = inpatient_virus_labels
trace = virus_inpatient_trace
varnames=['λ']


# In[175]:


labels.head()


# In[176]:


rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                      names=labels.columns)
rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
#rate_df.index = rate_df_index


# In[177]:


virus_inpatient_table = generate_table(virus_inpatient_trace, inpatient_virus_labels, 
                                       index=['age group', 'year'], 
                                   columns=['virus']).sort_index()
virus_inpatient_table.to_excel('results/virus_inpatient_table.xlsx')


# In[178]:


virus_inpatient_table


# ## Outpatient virus rate model

# In[179]:


virus_outpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==OUTPATIENT]


# In[180]:


with Model() as virus_outpatient_model:
    
    groups = virus_outpatient_data.shape[0]
    
    # Probability of stool collection
    ψ = Beta('ψ', 1, 1)
    outpatient_enrolled = (rotavirus_data.setting==OUTPATIENT).sum()
    collected = rotavirus_data[rotavirus_data.setting==OUTPATIENT].stool_collected.sum()
    stool = Binomial('stool', n=outpatient_enrolled, p=ψ, observed=collected)

    cases, enrolled, n = virus_outpatient_data[['cases', 'enrolled', 'n']].values.T
    
    enrolled_cases_out = Uniform('enrolled_cases_out', lower=cases, upper=enrolled, 
                                     shape=groups)
    
    p = monitoring_rate[OUTPATIENT]*ψ

    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', 
                                         n=enrolled_cases_out, 
                                         p=p,
                                         observed=cases)
    

    π = Beta('π', 1, 10, shape=groups)

    # Data likeihood
    virus_out_obs = Potential('virus_out_obs', 
              Binomial.dist(n=n, p=π).logp(enrolled_cases_out).sum())
        

# In[181]:


with virus_outpatient_model:
    virus_outpatient_trace = sample(iterations, tune=tune,
                                    njobs=2, random_seed=seed_numbers)


# Posterior distribution of stool sampling rate

# In[182]:


traceplot(virus_outpatient_trace, varnames=['ψ'])


# ### Outpatient virus rates, plotted and tabulated

# In[183]:


outpatient_sample_melted = pd.melt(pd.DataFrame(virus_outpatient_trace['π']))


# In[184]:


outpatient_virus_samples = (virus_outpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                 .merge(outpatient_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[185]:


outpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=outpatient_virus_samples.assign(rate=outpatient_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

outpatient_rateplot.despine(left=True)


# In[186]:


outpatient_virus_labels = (virus_outpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[187]:


virus_outpatient_table = generate_table(virus_outpatient_trace, outpatient_virus_labels, 
                                        index=['age group', 'year'], varnames=['π'],
               columns=['virus']).sort_index()
virus_outpatient_table.to_excel('results/virus_outpatient_table.xlsx')


# In[188]:


virus_outpatient_table


# ## ED virus rate model

# In[189]:


virus_ed_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==ED]


# In[190]:


with Model() as virus_ed_model:
        
    # Probability of stool collection
    ψ = Beta('ψ', 1, 1)
    ed_enrolled = (rotavirus_data.setting==ED).sum()
    collected = rotavirus_data[rotavirus_data.setting==ED].stool_collected.sum()
    stool = Binomial('stool', n=ed_enrolled, p=ψ, observed=collected)

    data_len = virus_ed_data.shape[0]

    cases, enrolled, year, n = virus_ed_data[['cases', 'enrolled', 'year', 'n']].values.T
    
    enrolled_cases = Uniform('enrolled_cases', lower=cases, upper=enrolled, 
                                     shape=groups)

    # Estimate total VUMC cases in setting
    enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', 
                                         n=enrolled_cases, 
                                         p=ψ, 
                                         observed=cases)
        
    ω = (np.array(monitoring_rate)[ED] 
                     * np.array(ed_market_share)[year.astype(int)]
                     * surveillance_proportion)

    n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, 
                                     shape=groups)
    n_eff_likelihood = Potential('davidson_cases_likelihood', 
                          Binomial.dist(n=n, 
                                        p=ω).logp(n_eff).sum())


    # Population rate
    θ = Normal('θ', 0, sd=10, shape=groups)

    λ = Deterministic('λ', tt.exp(θ))

    # Data likelihood
    virus_ed_obs = Potential('virus_ed_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases))


# In[191]:


with virus_ed_model:
    virus_ed_trace = sample(iterations, tune=tune,
                            njobs=2, random_seed=seed_numbers)


# Posterior distribution of stool sampling rate

# In[192]:


traceplot(virus_ed_trace, varnames=['ψ'])


# ### ED virus rate estimates, plotted and tabulated

# In[193]:


ed_sample_melted = pd.melt(pd.DataFrame(virus_ed_trace['λ']))


# In[194]:


ed_virus_samples = (virus_ed_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index()
                 .merge(ed_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[195]:


ed_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=ed_virus_samples.assign(rate=ed_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

ed_rateplot.despine(left=True)


# In[196]:


ed_virus_labels = (virus_ed_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[197]:


virus_ed_table = generate_table(virus_ed_trace, ed_virus_labels, 
                                        index=['age group', 'year'], 
               columns=['virus']).sort_index()
virus_ed_table.to_excel('results/virus_ed_table.xlsx')


# In[198]:


virus_ed_table