#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
import itertools
sns.set_context('notebook')

seed_numbers = 20090425, 19700903

from datetime import timedelta, datetime


# # Data import and cleanup

# In[2]:


# rotavirus_data = pd.read_csv('data/AGE Year 2-4 with Final Rota.csv', low_memory=False)
cases_file = 'data/AGE Cases and Healthy Controls Merged with Lab Results Year 2 to 4 v05.23.2017.csv'
rotavirus_data = pd.read_csv(cases_file, low_memory=False, parse_dates=['scrdate', 'dob'])
rotavirus_data.index.is_unique


# In[3]:


rotavirus_data.head()


# In[4]:


rotavirus_data.query('year==3').sort_values('scrdate').scrdate


# Import eligibles

# In[5]:


eligible_file = 'data/AGE Outpatient Eligibility Data Year 2-4.csv'
eligible_data = pd.read_csv(eligible_file, low_memory=False, index_col=0, parse_dates=['scrdate'])
eligible_data.index.is_unique


# In[6]:


eligible_data.head()


# In[7]:


eligible_data.scrdate.min(), eligible_data.scrdate.max()


# In[8]:


eligible_data['year'] = 1
eligible_data.loc[eligible_data.scrdate < datetime(2013,12,1), 'year'] = 0
eligible_data.loc[eligible_data.scrdate >= datetime(2014,12,1), 'year'] = 2


# In[9]:


eligible_data['under_5'] = ((eligible_data.agemonths) < 60).astype(int)
eligible_data['under_2'] = ((eligible_data.agemonths) < 24).astype(int)
eligible_data['under_1'] = ((eligible_data.agemonths) < 12).astype(int)


# In[10]:


eligible_data['age_group'] = 3
eligible_data.loc[eligible_data.under_5==1, 'age_group'] = 2
eligible_data.loc[eligible_data.under_2==1, 'age_group'] = 1
eligible_data.loc[eligible_data.under_1==1, 'age_group'] = 0


# In[11]:


eligible_counts = eligible_data.groupby(['age_group', 'year']).size()
eligible_counts.name = 'eligible'
eligible_counts


# In[12]:


eligible_data.shape


# Rename columns and recode data

# In[13]:


rotavirus_data.race.value_counts()


# In[14]:


rotavirus_data['white'] = rotavirus_data['race'] == 1
rotavirus_data['black'] = rotavirus_data['race'] == 2


# In[15]:


rotavirus_data['astro'].value_counts().index


# In[16]:


rotavirus_data=rotavirus_data.rename(columns={'rotacdc':'rotavirus',
                       'sapo':'sapovirus',
                       'astro':'astrovirus',
                       'noro':'norovirus'})


# In[17]:


rotavirus_data.rotavirus.value_counts()


# In[18]:


rotavirus_data.rotavu.value_counts()


# In[19]:


virus_cols = ['rotavirus','sapovirus','astrovirus','norovirus']


# Proportion of test results missing

# In[20]:


rotavirus_data[virus_cols].isnull().mean()


# In[21]:


astro_data = rotavirus_data[rotavirus_data.astrovirus==1].dropna(subset=['astro_ct'])
astro_data.shape


# In[22]:


noro_data = rotavirus_data[rotavirus_data.norovirus==1].dropna(subset=['noro_ct'])
noro_data.shape


# In[23]:


noro_data['coinfect'] = noro_data[['rotavirus', 'sapovirus', 'astrovirus']].sum(1)


# In[24]:


astro_data['coinfect'] = astro_data[['rotavirus', 'sapovirus', 'norovirus']].sum(1)


# In[25]:


with_coinf = noro_data.noro_ct[noro_data.coinfect==1].astype(float)
no_coinf = noro_data.noro_ct[(noro_data.coinfect==0)].astype(float)

# with_coinf = astro_data.astro_ct[astro_data.coinfect==1].astype(float)
# no_coinf = astro_data.astro_ct[(astro_data.coinfect==0)].astype(float)


# In[26]:


from scipy.stats import ranksums

ranksums(with_coinf, no_coinf)


# In[27]:


rotavirus_data.sapo_ct.dropna().astype(float).hist()


# Calculate age, and compare to given age.

# In[28]:


rotavirus_data['age_months'] = (rotavirus_data['scrdate']
                              - rotavirus_data['dob'] + timedelta(1)).astype('timedelta64[M]') 


# In[29]:


rotavirus_data.age_months.hist();


# Group into ages under 5 and 5 or over

# In[30]:


rotavirus_data['under_5'] = ((rotavirus_data.age_months) < 60).astype(int)


# In[31]:


rotavirus_data['under_2'] = ((rotavirus_data.age_months) < 24).astype(int)


# In[32]:


rotavirus_data['under_1'] = ((rotavirus_data.age_months) < 12).astype(int)


# In[33]:


rotavirus_data['age_group'] = 3
rotavirus_data.loc[rotavirus_data.under_5==1, 'age_group'] = 2
rotavirus_data.loc[rotavirus_data.under_2==1, 'age_group'] = 1
rotavirus_data.loc[rotavirus_data.under_1==1, 'age_group'] = 0


# In[34]:


rotavirus_data.age_group.value_counts()


# In[35]:


rotavirus_data.year.value_counts()


# Calculate natural year

# In[36]:


rotavirus_data['year'] = rotavirus_data.year - 2


# Rename setting variable

# In[37]:


assert not rotavirus_data.settingnew.isnull().sum()


# In[38]:


rotavirus_data['setting'] = rotavirus_data.settingnew - 1


# In[39]:


# Indices to each value
INPATIENT, ED, OUTPATIENT, HC = 0, 1, 2, 3


# Recode sex and stool

# In[40]:


assert not rotavirus_data['specimencol'].isnull().sum()


# In[41]:


rotavirus_data['male'] = rotavirus_data['sex']==1
rotavirus_data['stool_collected'] = rotavirus_data['specimencol']==1


# Recode insurance

# In[42]:


rotavirus_data.insurance.value_counts()


# In[43]:


rotavirus_data['public_ins'] = rotavirus_data.insurance.isin({1, 3})
rotavirus_data['private_ins'] = rotavirus_data.insurance.isin({2, 3})
rotavirus_data.loc[rotavirus_data.insurance==8, ['public_ins', 'private_ins']] = np.nan


# In[44]:


# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['intvwdate', 'dob', 'provider', 'coldat', 'tdat', 
       'insurance', 'sex', 'hispcr', 'mdegree_7', 'rvacver_7',
       'rvacver_7r', 'race', 'race2_8'], axis=1)


# Summary of missing values

# In[45]:


rotavirus_data.isnull().sum()


# Aggregate data by age group and setting

# In[46]:


rotavirus_summary = (rotavirus_data.drop('scrdate', axis=1)
             .groupby(['age_group', 'year', 'setting', 'black'])['caseid'].count()
             .reset_index()
             .rename(columns={'caseid':'cases'}))

rotavirus_summary.head()


# Enrollment days/week for inpatient, outpatient, ED

# In[47]:


monitoring_days = 7, 4, 4

# Only 5.5 days of intake on outpatient
monitoring_rate = np.array(monitoring_days)/np.array([7, 7, 5.5])
monitoring_rate


# # Outpatient AGE Model

# In[155]:


from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential, Metropolis
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform, Uniform, Flat, Gamma
from pymc3 import traceplot, forestplot, summary, df_summary, energyplot

import theano.tensor as tt


# In[49]:


iterations = 1000
tune = 1000


# In[50]:


label_map = {'age_group':{0:'<1', 1:'1', 2:'2-4', 3:'5+'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'setting':{0:'inpatient', 1:'ED', 2:'outpatient'}}


# Rate factor for population 

# In[51]:


rate_factor = 1000


# In[52]:


from pymc3 import hpd
def _hpd_df(x, alpha):
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)


# In[53]:


def generate_table(trace, labels, index, columns, varnames=['λ']):
    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), 
                                          names=labels.columns)
    rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index
    pivot_all = pd.pivot_table(rate_df
                     .reset_index(), 
                               index=index, columns=columns)
    value_iterator = zip(pivot_all['mean'].values.flatten(), 
                     pivot_all['hpd_2.5'].values.flatten(), 
                     pivot_all['hpd_97.5'].values.flatten())
    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], 
                                  pivot_all['mean'].shape)
    
    return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns)


# In[54]:


import redcap
import getpass

local_auth = True

api_url = 'https://redcap.vanderbilt.edu/api/'

if local_auth:
    api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()

else:
    api_key = getpass.getpass()


# In[55]:


outpatient_project = redcap.Project(api_url, api_key)
outpatients = outpatient_project.export_records(format='df')


# In[56]:


outpatients.groupby('year').year.count()


# In[57]:


outpatients.shape


# In[58]:


outpatients[['ageinjan', 'ageinnov']].hist(bins=20)


# Going to assume that the outlier ages above were mistakenly entered as months.

# In[59]:


outliers = outpatients.ageinjan > 20
outpatients.loc[outliers, ['ageinjan', 'ageinnov']] = outpatients.loc[outliers, ['ageinjan', 'ageinnov']] / 12


# In[60]:


outpatients[['ageinjan', 'ageinnov']].hist(bins=20)


# In[61]:


outpatients['age'] = np.maximum(outpatients[['ageinjan', 'ageinnov']].mean(1), 0.1)


# In[62]:


outpatients_kids = outpatients[outpatients.age<18].copy()


# In[63]:


outpatients_kids['5_and_older'] = (outpatients_kids.age>=5).astype(int)
outpatients_kids['under_5'] = (outpatients_kids.age<5).astype(int)
outpatients_kids['under_2'] = (outpatients_kids.age<2).astype(int)
outpatients_kids['under_1'] = (outpatients_kids.age<1).astype(int)


# In[64]:


outpatients_kids['age_group'] = 3
outpatients_kids.loc[outpatients_kids.under_5==1, 'age_group'] = 2
outpatients_kids.loc[outpatients_kids.under_2==1, 'age_group'] = 1
outpatients_kids.loc[outpatients_kids.under_1==1, 'age_group'] = 0


# In[65]:


outpatients_kids.groupby('year').age.hist(alpha=0.3)


# In[66]:


outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                        .groupby(['year', 'age_group'])
                        .n.count())
outpatient_n


# In[67]:


rotavirus_data.head()


# In[68]:


eligible_data.head()


# In[69]:


eligible_counts.sum()


# Average denominators

# In[70]:


outpatient_n.head()


# In[71]:


outpatient_n_mean = outpatient_n.groupby('age_group').mean().astype(int)


# In[72]:


outpatient_data_full = (rotavirus_summary[rotavirus_summary.setting==OUTPATIENT].groupby(['age_group','year'])
                        .cases.sum().reset_index())


# In[73]:


outpatient_data_full


# In[74]:


outpatient_merged = (outpatient_data_full.merge(outpatient_n_mean.reset_index(), on=['age_group'])
                                         .merge(pd.DataFrame(eligible_counts).reset_index(), on=['age_group', 'year']))


# In[78]:


outpatient_merged


# In[76]:


outpatient_under_5 = outpatient_merged[outpatient_merged.age_group<3]


# In[79]:


def outpatient_model(dataset):
    
    age_group, cases, n, eligible = dataset[['age_group','cases','n', 'eligible']].values.T

    shape = dataset.shape[0]
    n_groups = len(set(age_group))
    
    # Cases weighted by monitoring effort
    weights = monitoring_rate[OUTPATIENT]
    
    with Model() as mod:

        # Uniform prior on weighted cases
        weighted_cases = Uniform('weighted_cases', eligible, n, shape=shape)
        
        # Adjust for enrollment
        enrollment_rate = Beta('p_enrolled', 1, 1, shape=shape)
        eligible_cases = Binomial('eligible_cases', eligible, enrollment_rate, observed=cases)

        # Likelihood of eligible cases
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=eligible)

        θ = Normal('θ', 0, sd=1000, shape=n_groups)

        λ = Deterministic('λ', tt.exp(θ))

        AGE_obs = Potential('AGE_obs', Poisson.dist(λ[age_group]*n).logp(weighted_cases))

    return mod


# In[80]:


outpatient_age = outpatient_model(outpatient_under_5)


# In[81]:


with outpatient_age:
    outpatient_age_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)


# In[139]:


age_labels = ['<1', '1', '2-4']
forestplot(outpatient_age_trace, varnames=['λ'], transform=lambda x: x*rate_factor, ylabels=age_labels)


# ## Pooled AGE model

# In[83]:


outpatient_pooled = outpatient_merged.assign(age_group=(outpatient_merged.age_group==3).astype(int))
outpatient_pooled.head()


# In[84]:


outpatient_age_pooled = outpatient_model(outpatient_pooled)


# In[85]:


with outpatient_age_pooled:
    outpatient_age_pooled_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers)


# In[86]:


age_labels_pooled = ['<5', '5+']
forestplot(outpatient_age_pooled_trace, varnames=['λ'], transform=lambda x: x*rate_factor, ylabels=age_labels_pooled)


# ### Outpatient AGE estimates

# In[90]:


outpatient_age_estimates


# In[92]:


outpatient_age_estimates = (df_summary(outpatient_age_trace, varnames=['λ'], transform=lambda x: x*rate_factor)
                            .drop(['sd', 'mc_error', 'n_eff', 'Rhat'], axis=1)
                            .round(1))
outpatient_age_estimates.index = ['<1', '1', '2-4']
outpatient_age_estimates.columns = 'rate esimate', 'lower 95%', 'upper 95%'
outpatient_age_estimates


# In[94]:


pooled_estimate = (df_summary(outpatient_age_pooled_trace, varnames=['λ'], transform=lambda x: x*rate_factor)
                            .drop(['sd', 'mc_error', 'n_eff', 'Rhat'], axis=1)
                            .round(1))
pooled_estimate.columns = outpatient_age_estimates.columns
pooled_estimate.index = ['<5', '5+']
outpatient_age_estimates.append(pooled_estimate)


# In[95]:


outpatient_age_estimates.to_excel('results/outpatient_age.xlsx')


# In[96]:


mean_data = (outpatient_under_5.groupby('age_group')[['cases', 'n']].mean()
                 .rename(columns={'cases':'mean cases', 'n':'mean n'})
                 .round(1)
                .set_index(outpatient_age_estimates.index))

mean_data.join(outpatient_age_estimates)
mean_data.to_excel('results/outpatient_mean_data.xlsx')


# # Virus Rate Models

# In[97]:


collected_stools = (rotavirus_data[rotavirus_data.setting==OUTPATIENT].groupby(['age_group', 'year'])
                        .agg({'stool_collected':['sum', 'count']}))

levels = collected_stools.columns.levels
labels = collected_stools.columns.labels
collected_stools.columns = ['stool_collected', 'enrolled']

collected_stools = collected_stools.reset_index()
collected_stools


# In[98]:


outpatient_n_mean.reset_index()


# All possible combinations of age, year and setting for index.

# In[99]:


astro_cases = (rotavirus_data[(rotavirus_data.astrovirus==1) & (rotavirus_data.setting==OUTPATIENT)]
                         .groupby(['age_group', 'year'])[['stool_collected']].count())
astro_cases.columns = ['cases']
astro_cases = astro_cases.reset_index()


# In[100]:


eligible_counts


# In[102]:


astro_data = (astro_cases.merge(collected_stools, 
                                         on=['age_group', 'year'])
                                  .merge(outpatient_n_mean.reset_index(), 
                                         on=['age_group'], 
                                        how='left')
                                  .merge(pd.DataFrame(eligible_counts).reset_index(), 
                                         on=['age_group', 'year']))
astro_data


# In[103]:


sapo_cases = (rotavirus_data[(rotavirus_data.sapovirus==1) & (rotavirus_data.setting==OUTPATIENT)]
                         .groupby(['age_group', 'year'])[['stool_collected']].count())
sapo_cases.columns = ['cases']
sapo_cases = sapo_cases.reset_index()


# In[104]:


sapo_data = (sapo_cases.merge(collected_stools, 
                                         on=['age_group', 'year'])
                                  .merge(outpatient_n_mean.reset_index(), 
                                         on=['age_group'], 
                                        how='left')
                                  .merge(pd.DataFrame(eligible_counts).reset_index(), 
                                         on=['age_group', 'year']))
sapo_data.head(10)


# In[105]:


rota_cases = (rotavirus_data[(rotavirus_data.rotavirus==1) & (rotavirus_data.setting==OUTPATIENT)]
                         .groupby(['age_group', 'year'])[['stool_collected']].count())
rota_cases.columns = ['cases']
rota_cases = rota_cases.reset_index()


# In[106]:


rota_data = (rota_cases.merge(collected_stools, 
                                         on=['age_group', 'year'])
                                  .merge(outpatient_n_mean.reset_index(), 
                                         on=['age_group'], 
                                        how='left')
                                  .merge(pd.DataFrame(eligible_counts).reset_index(), 
                                         on=['age_group', 'year']))
rota_data.head(10)


# In[108]:


noro_cases = (rotavirus_data[(rotavirus_data.norovirus==1) & (rotavirus_data.setting==OUTPATIENT)]
                         .groupby(['age_group', 'year'])[['stool_collected']].count())
noro_cases.columns = ['cases']
noro_cases = noro_cases.reset_index()


# In[109]:


noro_data = (noro_cases.merge(collected_stools, 
                                         on=['age_group', 'year'])
                                  .merge(outpatient_n_mean.reset_index(), 
                                         on=['age_group'], 
                                        how='left')
                                  .merge(pd.DataFrame(eligible_counts).reset_index(), 
                                         on=['age_group', 'year']))
noro_data.head(10)


# In[110]:


virus_totals = pd.concat([astro_data.assign(virus='astrovirus'),
           noro_data.assign(virus='norovirus'),
           rota_data.assign(virus='rotavirus'),
           sapo_data.assign(virus='sapovirus')]).reset_index()
virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)


# ## Outpatient virus rate model

# In[111]:


virus_lookup = {0:'norovirus',
                1:'rotavirus',
                2:'sapovirus',
                3:'astrovirus'}


# In[112]:


virus_outpatient_data = (virus_totals.replace({'virus': virus_lookup}))


# In[113]:


cases, enrolled, n = virus_outpatient_data[['cases', 'enrolled', 'n']].values.T


# In[114]:


virus_outpatient_under_5 = virus_outpatient_data[virus_outpatient_data.age_group<3].drop('index', axis=1)


# In[117]:


virus_outpatient_pooled = virus_outpatient_under_5.groupby(['virus', 'year']).agg({'cases':sum, 'stool_collected':sum,
                                                                     'enrolled':sum, 'n':sum, 'eligible':sum}).reset_index()


# In[118]:


virus_outpatient_pooled['age_group'] = 3
virus_outpatient_pooled[virus_outpatient_under_5.columns]


# In[119]:


virus_outpatient_5_over = (virus_outpatient_data[virus_outpatient_data.age_group==3]
                               .drop('index', axis=1)
                               .groupby(['virus', 'year'])
                               .agg({'cases':sum, 'stool_collected':sum,
                                        'enrolled':sum, 'n':sum, 'eligible':sum})
                               .reset_index())


# In[120]:


virus_outpatient_5_over['age_group'] = 4
virus_outpatient_5_over[virus_outpatient_under_5.columns]


# In[121]:


virus_outpatient_merged = pd.concat([virus_outpatient_under_5, 
                                     virus_outpatient_pooled[virus_outpatient_under_5.columns],
                                    virus_outpatient_5_over[virus_outpatient_under_5.columns]])


# In[124]:


with Model() as virus_outpatient_model:
    
    groups = virus_outpatient_merged.shape[0]
    
    # Probability of stool collection
    ψ = Beta('ψ', 1, 1)
    outpatient_enrolled = (rotavirus_data.setting==OUTPATIENT).sum()
    collected = rotavirus_data[rotavirus_data.setting==OUTPATIENT].stool_collected.sum()
    stool = Binomial('stool', n=outpatient_enrolled, p=ψ, observed=collected)

    cases, enrolled, n, eligible = virus_outpatient_merged[['cases', 'enrolled', 'n', 'eligible']].values.T
    
    # Adjust for enrollment
    enrollment_rate = Beta('p_enrolled', 1, 1, shape=groups)
    enrolled_cases = Binomial('enrolled_cases', eligible, enrollment_rate, observed=enrolled)
    
    eligible_cases = Uniform('eligible_cases', lower=cases, upper=eligible, 
                                     shape=groups)
    
    p = monitoring_rate[OUTPATIENT]*ψ*enrollment_rate

    # Estimate total VUMC cases in setting
    eligible_cases_likelihood = Binomial('eligible_cases_likelihood', 
                                         n=eligible_cases, 
                                         p=p,
                                         observed=cases)
    

    π = Beta('π', 1, 10, shape=groups)

    # Data likeihood
    virus_out_obs = Potential('virus_out_obs', 
              Binomial.dist(n=n, p=π).logp(eligible_cases).sum())
        

# In[125]:


with virus_outpatient_model:
    virus_outpatient_trace = sample(iterations, tune=tune,
                                    njobs=2, random_seed=seed_numbers)


# MCMC diagnostic plot

# In[157]:


energyplot(virus_outpatient_trace)


# Posterior distribution of stool sampling rate

# In[126]:


traceplot(virus_outpatient_trace, varnames=['ψ'])


# ### Outpatient virus rates, plotted and tabulated

# In[127]:


year_labels = ['2012/13','2013/14','2014/15']
pd.DataFrame(list(itertools.product(age_labels + ['<5', '5+'], year_labels)), columns=['age', 'year'])


# In[128]:


outpatient_sample_melted = pd.melt(pd.DataFrame(virus_outpatient_trace['π']))


# In[129]:


outpatient_virus_samples = (virus_outpatient_merged.drop(['enrolled', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                 .merge(outpatient_sample_melted, left_index=True, right_on='variable')
                 .replace({'age_group':{0:'<1', 1:'1', 2:'2-4', 3:'<5', 4:'5+'},
                            'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[130]:


outpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', 
        data=outpatient_virus_samples.assign(rate=outpatient_virus_samples.value*rate_factor),
        palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2,
        fliersize=0)

outpatient_rateplot.despine(left=True)


# In[134]:


outpatient_virus_labels = (virus_outpatient_merged.drop(['enrolled', 'n', 'stool_collected', 'cases', 'eligible'],
                                          axis=1).reset_index(drop=True)
                          .replace({'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[135]:


virus_outpatient_table = generate_table(virus_outpatient_trace, outpatient_virus_labels, 
                                        index=['age group', 'year'], varnames=['π'],
               columns=['virus']).sort_index()
virus_outpatient_table.to_excel('results/virus_outpatient_table.xlsx')


# In[138]:


virus_outpatient_table


# In[140]:


virus_outpatient_table.index = virus_outpatient_table.index.set_levels(age_labels + ['<5', '5+'], level=0)


# In[141]:


virus_outpatient_table


# In[142]:


virus_outpatient_table[['astrovirus']]


# In[143]:


def create_virus_table(virus_name):
    estimates = virus_outpatient_table[[virus_name]].rename(columns={virus_name:'estimate'})
    virus_data = (virus_outpatient_merged[virus_outpatient_merged.virus==virus_name]
                          .set_index(['age_group', 'year']) [['cases', 'stool_collected', 'enrolled', 'n']]
                          .set_index(estimates.index))
    return estimates.join(virus_data)


# In[144]:


astro_table = create_virus_table('astrovirus')
astro_table.to_excel('results/outpatient_astro.xlsx')
astro_table


# In[145]:


noro_table = create_virus_table('norovirus')
noro_table.to_excel('results/outpatient_noro.xlsx')
noro_table


# In[146]:


sapo_table = create_virus_table('sapovirus')
sapo_table.to_excel('results/outpatient_sapo.xlsx')
sapo_table


# In[147]:


rota_table = create_virus_table('rotavirus')
rota_table.to_excel('results/outpatient_rota.xlsx')
rota_table


# ### Pooled (over years) virus rate model

# In[148]:


virus_outpatient_merged


# In[151]:


virus_outpatient_pooled = virus_outpatient_merged.groupby(['age_group', 'virus']).agg({'n':lambda x: max(x)*3, 
                                                           'cases':sum, 
                                                           'stool_collected':sum, 
                                                           'enrolled':sum, 'eligible':sum}).reset_index()


# In[153]:


with Model() as virus_pooled_model:
    
    groups = virus_outpatient_pooled.shape[0]
    
    # Probability of stool collection
    ψ = Beta('ψ', 1, 1)
    outpatient_enrolled = (rotavirus_data.setting==OUTPATIENT).sum()
    collected = rotavirus_data[rotavirus_data.setting==OUTPATIENT].stool_collected.sum()
    stool = Binomial('stool', n=outpatient_enrolled, p=ψ, observed=collected)

    cases, enrolled, n, eligible = virus_outpatient_pooled[['cases', 'enrolled', 'n', 'eligible']].values.T
    
    # Adjust for enrollment
    enrollment_rate = Beta('p_enrolled', 1, 1, shape=groups)
    enrolled_cases = Binomial('enrolled_cases', eligible, enrollment_rate, observed=enrolled)
    
    eligible_cases = Uniform('eligible_cases', lower=cases, upper=eligible, 
                                     shape=groups)
    
    p = monitoring_rate[OUTPATIENT]*ψ*enrollment_rate

    # Estimate total VUMC cases in setting
    eligible_cases_likelihood = Binomial('eligible_cases_likelihood', 
                                         n=eligible_cases, 
                                         p=p,
                                         observed=cases)
    

    π = Beta('π', 1, 10, shape=groups)

    # Data likeihood
    virus_out_obs = Potential('virus_out_obs', 
              Binomial.dist(n=n, p=π).logp(eligible_cases).sum())
        
        
# In[154]:


with virus_pooled_model:
    virus_outpatient_pooled_trace = sample(iterations, tune=tune,
                                    njobs=2, random_seed=seed_numbers)


# MCMC diagnostic plot

# In[156]:


energyplot(virus_outpatient_pooled_trace)


# In[158]:


outpatient_virus_pooled_labels = (virus_outpatient_pooled.drop(['enrolled', 'n', 'stool_collected', 'cases'],
                                          axis=1).reset_index(drop=True)
                          .replace({'age_group':{0:'<1', 1:'1', 2:'2-4', 3:'<5', 4:'5+'},
                          'virus':virus_lookup})
                .rename(columns={'age_group':'age group'}))


# In[159]:


virus_outpatient_pooled_table = generate_table(virus_outpatient_pooled_trace, outpatient_virus_pooled_labels, 
                                        index=['age group'], varnames=['π'],
               columns=['virus']).sort_index()
# virus_outpatient_table.to_excel('results/virus_outpatient_table.xlsx')


# In[160]:


virus_outpatient_pooled_table


# In[161]:


def create_pooled_table(virus_name):
    estimates = virus_outpatient_pooled_table[[virus_name]].rename(columns={virus_name:'estimate'})
    virus_data = (virus_outpatient_pooled[virus_outpatient_pooled.virus==virus_name]
                          .set_index(['age_group']) [['cases', 'stool_collected', 'enrolled', 'n']]
                          .set_index(estimates.index))
    return estimates.join(virus_data).loc[['<1', '1', '2-4', '<5', '5+']]


# In[162]:


astro_pooled = create_pooled_table('astrovirus')
astro_pooled.to_excel('results/outpatient_astro_pooled.xlsx')
astro_pooled


# In[163]:


noro_pooled = create_pooled_table('norovirus')
noro_pooled.to_excel('results/outpatient_noro_pooled.xlsx')
noro_pooled


# In[164]:


sapo_pooled = create_pooled_table('sapovirus')
sapo_pooled.to_excel('results/outpatient_sapo_pooled.xlsx')
sapo_pooled


# In[165]:


rota_pooled = create_pooled_table('rotavirus')
rota_pooled.to_excel('results/outpatient_rota_pooled.xlsx')
rota_pooled