#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

sns.set_context('notebook')

seed_numbers = 20090425, 19700903

from datetime import timedelta, datetime


# Import data

# In[2]:

rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique


# Rename columns and recode data

# In[3]:

race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]
(rotavirus_data[race_cols]=='Checked').sum()


# In[4]:

rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)

rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)


# In[5]:

rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)


# In[6]:

rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovirus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)

rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns
                                      if col.startswith('Result')], axis=1)


# In[7]:

rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])


# Calculate age, and compare to given age.

# In[8]:

rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
                                - rotavirus_data['Date of Birth']
                                + timedelta(1)).astype('timedelta64[M]')


# In[9]:

(np.abs(rotavirus_data['Age in months (auto calculated) ']
        - rotavirus_data['age_months']) > 1).sum()


# Group into ages under 5 and 5 or over

# In[10]:

rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)


# Recode study year so that the first study year is zero

# In[11]:

rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()


# Recode sex and stool collection

# In[12]:

rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'


# In[13]:

# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth', 'Age in months (auto calculated) ',
                                      'Sex', 'Sex updated', 'Was stool collected?',
                                      'Ethnicity', 'Year'], axis=1)


# In[14]:

rotavirus_data = pd.concat([rotavirus_data,
                            pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)


# In[15]:

rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()


# In[16]:

rotavirus_data.shape


# In[17]:

rotavirus_data.head()


# In[18]:

rotavirus_data.isnull().sum()


# In[19]:

rotavirus_data.describe()


# Aggregate data by age group and setting

# In[20]:

rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
                     .groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
                     .reset_index()
                     .rename(columns={'Record ID':'cases'}))
rotavirus_summary.head()
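# A quick sanity check on the aggregation above (illustrative only): the summed case counts
# should match the number of enrolled records remaining after recoding.

# In[ ]:

rotavirus_summary.cases.sum(), rotavirus_data.shape[0]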
# ### Census data
# 
# The key for AGEGRP is as follows:
# 
# + 0 = Total
# + 1 = Age 0 to 4 years
# + 2 = Age 5 to 9 years
# + 3 = Age 10 to 14 years
# + 4 = Age 15 to 19 years
# + 5 = Age 20 to 24 years
# + 6 = Age 25 to 29 years
# + 7 = Age 30 to 34 years
# + 8 = Age 35 to 39 years
# + 9 = Age 40 to 44 years
# + 10 = Age 45 to 49 years
# + 11 = Age 50 to 54 years
# + 12 = Age 55 to 59 years
# + 13 = Age 60 to 64 years
# + 14 = Age 65 to 69 years
# + 15 = Age 70 to 74 years
# + 16 = Age 75 to 79 years
# + 17 = Age 80 to 84 years
# + 18 = Age 85 years or older

# In[21]:

davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()


# Identify racial/ethnic subgroup populations

# In[22]:

census_data_child = davidson_census.loc[(davidson_census.YEAR>2010)
                                        & davidson_census.AGEGRP.isin([1,2,3,4])].copy()

census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)


# In[23]:

census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
               .groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK',
                                             'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
               .sum()).reset_index()

census_data['YEAR'] = census_data.YEAR - 2011

census_data


# Reshape data

# In[24]:

melted_census = pd.melt(census_data, id_vars=['under_5', 'YEAR'],
                        value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK',
                                    'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK'])

census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
                                     black=melted_census.variable.apply(lambda x: x.split('_')[1]))
                .drop('variable', axis=1)
                .replace({'black':{'BLACK':1, 'WHITE':0},
                          'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
                .rename(columns={'YEAR':'year', 'value':'n'}))

census_clean


# In[25]:

census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])


# In[26]:

merged_data = (rotavirus_summary.merge(census_full,
                                       on=['under_5', 'year', 'black', 'hispanic', 'setting'],
                                       how='right')
               .fillna(0).astype(int))


# In[27]:

merged_data.head()


# VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

# In[28]:

inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92

Y2012, Y2013, Y2014 = 0, 1, 2


# Will estimate 2014 market shares by extrapolating a linear fit to the previous years.
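# Before fitting, the slope of such a linear trend can be previewed with `np.polyfit`
# (a quick cross-check of the extrapolation under the same straight-line assumption;
# illustrative only).

# In[ ]:

slope, _ = np.polyfit(range(3), inpatient_market_share, 1)
slope, inpatient_market_share[-1] + slope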
# In[29]:

from sklearn import linear_model


# In[30]:

inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))

inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1),
                                                    y=inpatient_df.y)

inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1]
                                                             + inpatient_fit.coef_[0]]
inpatient_market_share


# In[31]:

ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))

ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y)

ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share


# Enrollment days/week for ED, inpatient, outpatient

# In[32]:

monitoring_days = 4, 7, 4

# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2

monitoring_rate = np.array(monitoring_days) / 7
monitoring_rate


# In[33]:

rotavirus_data[(rotavirus_data.hispanic==1)
               & (rotavirus_data.white==1)
               & (rotavirus_data.setting==2)
               & (rotavirus_data.year==2)
               & (rotavirus_data.under_5==1)].shape


# In[34]:

setting_data = dict(list(merged_data.groupby('setting')))


# ## Inpatient Model

# In[35]:

from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary
import theano.tensor as tt


# In[36]:

def generate_inpatient_model(inpatient_data, pool_years=False):

    with Model() as model:

        under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T

        # Probability that a true case is captured, given enrollment days/week and market share
        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        # Latent total case counts, bounded below by the observed cases and above by the population
        weighted_cases = DiscreteUniform('weighted_cases', cases, n,
                                         shape=inpatient_data.shape[0],
                                         testval=(cases/weights).astype(int))

        # Observed cases arise as a binomial sample of the latent counts
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:

            assert not len(cases) % 3
            pooled_shape = int(len(cases)/3)

            pooled_cases = Deterministic('pooled_cases',
                                         tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))

            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)
            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs',
                                Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))

        else:

            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])
            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))

    return model


# ### Full (unpooled) model

# In[37]:

inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))


# In[38]:

with inpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[39]:

plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])


# In[40]:

label_map = {'under_5':{0:'5+', 1:'<5'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'hispanic':{0:'non-hispanic', 1:'hispanic'},
             'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}


# In[41]:

inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))


# Rate factor for population (rates are reported per 10,000)

# In[42]:

rate_factor = 10000


# In[43]:

rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(),
                                          names=inpatient_data_labels.columns)


# In[44]:

inpatient_data_labels


# In[45]:

from pymc3 import hpd

def _hpd_df(x, alpha):
    # Endpoints of the 100*(1-alpha)% highest posterior density interval
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)
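# A minimal illustration of the helper above on simulated draws (for exposition only):
# with alpha=0.5 it returns the endpoints of the 50% HPD interval in columns named
# 'hpd_25' and 'hpd_75'.

# In[ ]:

_hpd_df(np.random.randn(1000, 2), 0.5)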
# In[46]:

estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
                        stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                                    lambda x: _hpd_df(x, 0.5)],
                        extend=True) * rate_factor).round(3)

estimates.index = inpatient_data_labels.index

pd.concat([inpatient_data_labels, estimates], axis=1)


# In[47]:

trace, labels = inpatient_full_trace, inpatient_data_labels


# In[48]:

index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']


# In[52]:

def generate_table(trace, labels, index, columns, varnames=['λ']):

    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
                                              names=labels.columns)

    rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index

    pivot_all = pd.pivot_table(rate_df.reset_index(),
                               index=index, columns=columns)

    value_iterator = zip(pivot_all['mean'].values.flatten(),
                         pivot_all['hpd_2.5'].values.flatten(),
                         pivot_all['hpd_97.5'].values.flatten())

    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
                              pivot_all['mean'].shape)

    return pd.DataFrame(rate_strings,
                        index=pivot_all['mean'].index,
                        columns=pivot_all['mean'].columns)


# In[53]:

generate_table(inpatient_full_trace, inpatient_data_labels,
               index=['age', 'year'], columns=['race', 'ethnicity'])


# In[54]:

inpatient_rate_samples = pd.concat(
    [inpatient_data_labels, pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)


# In[55]:

inpatient_rate_samples = pd.melt(inpatient_rate_samples,
                                 id_vars=['age', 'year', 'race', 'ethnicity'])

inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)
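# The combined race/ethnicity label can equivalently be built with vectorized string
# concatenation instead of a row-wise `apply` (an alternative sketch; either form yields
# the same labels).

# In[ ]:

inpatient_rate_samples['race/ethnicity'] = (inpatient_rate_samples.race
                                            .str.cat(inpatient_rate_samples.ethnicity, sep=', '))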
# Plot of rates broken down by year/age/race/ethnicity.

# In[56]:

sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race', col='ethnicity',
                          data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
                          palette="gray", size=5, aspect=.8, kind='box',
                          linewidth=0.6, fliersize=0)
rateplot.despine(left=True)


# ### Pooled across race/ethnicity

# In[57]:

inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                            .groupby(['under_5', 'year'])[['cases','n']]
                            .sum()
                            .reset_index())
inpatient_pool_race_data


# In[58]:

inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)


# In[59]:

with inpatient_model_pool_race:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[60]:

inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                                   .reset_index(drop=True)
                                   .replace(label_map)
                                   .rename(columns={'under_5':'age'}))


# In[61]:

generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels,
               index=['year'], columns=['age'])


# In[62]:

inpatient_pool_race_samples = pd.concat(
    [inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True),
     pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)


# In[63]:

inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, id_vars=['under_5', 'year'])
                               .replace({'under_5':{0:'5+', 1:'<5'},
                                         'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                               .rename(columns={'under_5':'age group'}))


# Plot of rates pooled across race/ethnicity

# In[64]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)


# ### Pooled across years

# In[65]:

inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
                            .sort_values(by=['under_5', 'black', 'hispanic', 'year'],
                                         ascending=False)).reset_index(drop=True)
inpatient_pool_year_data


# In[66]:

inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)


# In[67]:

with inpatient_model_pool_year:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[68]:

inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
                              .reset_index(drop=True)
                              .replace(label_map)
                              .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
                              .drop_duplicates().reset_index(drop=True))


# In[69]:

estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
                               stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                                           lambda x: _hpd_df(x, 0.5)],
                               extend=True) * rate_factor).round(3)

estimates_pooled.index = inpatient_pool_year_labels.index

pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)


# In[70]:

generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels,
               index=['race','ethnicity'], columns=['age'])
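# The pooled model estimates one rate per age/race/ethnicity stratum, so the de-duplicated
# label frame should have one row per element of λ (a quick consistency check; illustrative
# only).

# In[ ]:

len(inpatient_pool_year_labels), inpatient_pool_year_trace['λ'].shape[1]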
# In[71]:

inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']],
     pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)


# In[72]:

inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
                               .replace({'under_5':{0:'5+', 1:'<5'},
                                         'black':{0:'white', 1:'black'},
                                         'hispanic':{0:'non-hispanic', 1:'hispanic'}})
                               .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))

inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)


# Plot of rates pooled by year

# In[73]:

sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group",
            data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
            palette="gray_r", linewidth=0.6, fliersize=0)


# ## Outpatient Model
# 
# Import outpatient denominator information.

# In[74]:

from redcap import Project

api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()


# In[75]:

outpatient_project = Project(api_url, api_key)


# In[76]:

outpatients = outpatient_project.export_records(format='df')


# In[77]:

outpatients.groupby('year').year.count()


# In[78]:

outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape


# In[79]:

outpatients_kids = outpatients[(outpatients.ptage<=18)
                               & (outpatients.ptrace.isin(['B','W']))
                               & (outpatients.ptethnicity.isin(['NH', 'HL']))].copy()


# In[80]:

outpatients_kids.ptrace.value_counts()


# In[81]:

outpatients_kids.ptethnicity.value_counts()


# In[82]:

outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)
outpatients_kids['black'] = (outpatients_kids.ptrace=='B').astype(int)
outpatients_kids['hispanic'] = (outpatients_kids.ptethnicity=='HL').astype(int)


# In[83]:

outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                .groupby(['year', 'black', 'hispanic', 'under_5'])
                .n.count()
                .reset_index())
outpatient_n


# Interpolate outpatient denominators for 2013

# In[84]:

interp_2013 = (outpatient_n.groupby(['black','hispanic','under_5'])
               .apply(lambda x: np.interp(2013, x.year, x.n))
               .astype(int)
               .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013


# In[85]:

outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n


# In[86]:

outpatient_data_full = setting_data[OUTPATIENT].drop(['setting','n'], axis=1)


# In[87]:

outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['black','hispanic','under_5','year'])


# In[88]:

outpatient_merged


# In[89]:

def generate_outpatient_model(outpatient_data, pool_years=False):

    with Model() as model:

        cases, n = outpatient_data[['cases','n']].values.T

        # Enrollment-coverage adjustment
        ω = monitoring_rate[INPATIENT]

        shape = outpatient_data.shape[0]

        if pool_years:

            assert not len(cases) % 3
            shape = int(len(cases)/3)
            cases = cases.reshape((shape, 3)).sum(-1)

        # Per-visit positivity, thinned by the enrollment-coverage adjustment
        π = Beta('π', 1, 10, shape=shape)

        AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)

    return model


# In[90]:

outpatient_model_full = generate_outpatient_model(outpatient_merged)


# In[91]:

with outpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[92]:

plt.subplots(figsize=(8,12))
forestplot(outpatient_full_trace[1000:], varnames=['π'])


# In[93]:

outpatient_data_labels = inpatient_data_labels[inpatient_data_labels.year.isin(['2012/13','2013/14'])]


# In[94]:

generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'],
               index=['age', 'year'], columns=['race', 'ethnicity'])
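# The Beta(1, 10) prior on π in the outpatient model has prior mean 1/(1+10) ≈ 0.09,
# so it gently favours low per-visit positivity; a quick numerical check of that prior
# mean (illustrative only).

# In[ ]:

from scipy import stats

stats.beta(1, 10).mean()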
# ## ED Model

# In[95]:

ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)

ED2.columns = ED3.columns


# In[96]:

ED2_total = ED2.sum()

ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])

ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[97]:

ED3_total = ED3.sum()

ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])

ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[98]:

ED_all = pd.concat([ED2, ED3])

eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]


# In[99]:

ED_counts = ED_all[eligible_cols].sum(1).astype(int)


# In[100]:

rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]

surveillance_counts = rotavirus_data.groupby(['Enrollment Date'])['Record ID'].count()


# In[101]:

ED_counts.name = 'count24'
surveillance_counts.name = 'count8'


# In[102]:

surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()

# Drop days where the 8-hour count exceeds the 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]


# Here is what it looks like if you resample to weekly

# In[103]:

weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()

(weekly_surveillance.count8 / weekly_surveillance.count24).hist()


# We will just use the proportion of the pooled surveillance

# In[104]:

surveillance_total = surveillance.sum()
surveillance_total


# In[105]:

surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion


# In[106]:

def generate_ed_model(ED_data):

    with Model() as model:

        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        # Probability that a true case is captured, given enrollment days/week,
        # ED market share, and the 8-hour/24-hour surveillance proportion
        weights = (np.array(monitoring_rate)[ED]
                   * np.array(ed_market_share[:-1])[ED_data.year]
                   * surveillance_proportion)

        # Latent total case counts, bounded below by the observed cases
        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                         testval=(cases/weights).astype(int))

        # Observed cases arise as a binomial sample of the latent counts
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        θ = Normal('θ', 0, sd=1000, shape=data_len)
        λ = Deterministic('λ', tt.exp(θ))

        age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)

    return model


# In[107]:

ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                     .query('year<2')
                     .groupby(['under_5', 'year'])[['cases','n']]
                     .sum()
                     .reset_index())
ed_pool_race_data


# In[108]:

ed_model = generate_ed_model(ed_pool_race_data)


# In[109]:

with ed_model:
    start = find_MAP()
    step = NUTS(scaling=start)
    ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[110]:

forestplot(ed_trace[1000:], varnames=['λ'])


# In[111]:

ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
                  .reset_index(drop=True)
                  .replace(label_map)
                  .rename(columns={'under_5':'age'}))


# In[112]:

generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])


# Create data frame of MCMC samples for plotting

# In[120]:

ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']],
                             pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)


# In[121]:

ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
                   .replace({'under_5':{0:'5+', 1:'<5'},
                             'year':{0:'2012/13', 1:'2013/14'}})
                   .rename(columns={'under_5':'age group'}))
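# Posterior median ED rates per 10,000, summarised directly from the samples as a quick
# numerical companion to the table above (illustrative only).

# In[ ]:

(ed_rate_samples.assign(rate=ed_rate_samples.value * rate_factor)
 .groupby(['age group', 'year']).rate.median())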
# In[122]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)


# ## Virus rates

# In[ ]:

census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
                     .groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()

census_data_virus['YEAR'] = census_data_virus.YEAR - 2011


# In[ ]:

census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()


# In[ ]:

virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting',
                                         'rotavirus', 'sapovirus', 'astrovirus', 'norovirus'])['Record ID'].count()
                 .reset_index()
                 .rename(columns={'Record ID':'cases'}))
virus_summary.head()


# In[ ]:

merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)


# In[ ]:

rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max}))

sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max})
             .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
              .groupby(['under_5', 'year', 'setting'])
              .agg({'cases':sum, 'n':max})
              .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max})
             .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))


# In[ ]:

virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
                          noro_data.reset_index().assign(virus='norovirus'),
                          rota_data.reset_index().assign(virus='rotavirus'),
                          sapo_data.reset_index().assign(virus='sapovirus')])

virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)

virus_totals.head(10)


# In[ ]:

n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup


# In[ ]:

with Model() as virus_model:

    n_total = int(n_lookup.shape[0])

    # Probability that a stool sample was collected from an enrolled child
    p_enroll = Beta('p_enroll', 1, 1, testval=0.9)

    enrolled = Binomial('enrolled', n=rotavirus_data.shape[0], p=p_enroll,
                        observed=rotavirus_data.stool_collected.sum())

    # Probability that a true case is captured, by setting and year
    weights = (np.array(monitoring_rate)[virus_totals.setting]
               * np.array(market_share)[virus_totals.year])

    weighted_cases = DiscreteUniform('weighted_cases', virus_totals.cases, virus_totals.n,
                                     shape=virus_totals.shape[0],
                                     testval=np.maximum((virus_totals.cases.values/weights).astype(int), 1))

    p_weight = Deterministic('p_weight', weights*p_enroll)

    weighting = Binomial('weighting', n=weighted_cases, p=p_weight, observed=virus_totals.cases)

    # Sum the latent counts over the three settings within each stratum
    weighted_cases_total = Deterministic('weighted_cases_total',
                                         weighted_cases.reshape((n_total, 3)).sum(1))

    θ = Normal('θ', 0, sd=1000, shape=n_total, testval=np.ones(n_total))
    λ = Deterministic('λ', tt.exp(θ))

    # λ is a rate per 10,000 population, since the Poisson mean is λ·(n/10,000)
    virus_like = Potential('virus_like',
                           Poisson.dist(λ * (n_lookup.n/10000.)).logp(weighted_cases_total))


# In[ ]:

with virus_model:
    virus_trace = sample(20000, njobs=2, random_seed=seed_numbers)


# In[ ]:

traceplot(virus_trace[10000:], varnames=['p_enroll'])


# In[ ]:

forestplot(virus_trace[10000:], varnames=['weighted_cases_total'])


# In[ ]:

virus_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']],
                           pd.DataFrame(virus_trace['λ'][-1000:]).T], axis=1)
# In[ ]:

virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
                 .replace({'under_5':{0:'5+', 1:'<5'},
                           'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                 .rename(columns={'under_5':'age group'}))


# In[ ]:

sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                          data=virus_samples.rename(columns={'value':'rate'}),
                          palette="PRGn", size=6, aspect=.75, kind='box',
                          linewidth=0.6, col_wrap=2)
rateplot.despine(left=True)