#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import seaborn as sns
import geopandas as gpd
import pymc3 as pm
import matplotlib.pylab as plt
import theano.tensor as tt
from theano import shared, function
from theano.tensor.shared_randomstreams import RandomStreams
sns.set_style('white')

DATA = '../data/clean/'


# Check for the number of available compute cores

# In[2]:


import multiprocessing

multiprocessing.cpu_count()


# ## Data import
# 
# Line list to extract cases

# In[3]:


line_list = pd.read_csv(DATA + 'line_list.csv')
line_list.head()


# Extract and plot confirmed counts

# In[4]:


confirmed_counts = pd.read_csv(DATA + 'confirmed_counts.csv', index_col=0)
confirmed_counts.tail()


# Plot of confirmed counts by age group.

# In[5]:


confirmed_counts.plot(cmap='viridis');


# In order to account for change in the underlying population, we need the number of births in 2015 and 2016 (Source: [Wikipedia](https://en.wikipedia.org/wiki/Demographics_of_Mongolia#Registered_births_and_deaths))

# In[6]:


births = 82130, 79920


# We will employ the contact matrix for Mongolia, as estimated by [Prem et al. 2017](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005697#sec020)

# In[7]:


B_prem = pd.read_csv(DATA + 'mongolia_prem.csv').values[:9, :9]
B_prem


# In[8]:


B_prem.shape


# Expand contact matrix to include infants; treat them the same as 1-4 year-olds.

# In[9]:


B_prem_exp = np.zeros((10, 10))
B_prem_exp[1:, 1:] = B_prem.copy()
B_prem_exp[:2, :2] = B_prem_exp[1, 1]
B_prem_exp[0, 2:] = B_prem[0, 1:]
B_prem_exp[2:, 0] = B_prem[1:, 0]
B_prem_exp


# $R_0$ as estimated from this contact matrix

# In[10]:


evs = np.linalg.eigvals(B_prem).real
max(evs[np.isreal(evs)])


# Import the underlying population sizes, by province:

# In[11]:


population = pd.read_csv(DATA + '../clean/denominators.csv', index_col=0)
population.head()


# Age classes above 40 will be aggregated into a single group.

# In[12]:


pop_classes = population.iloc[:, 1:9].copy()
pop_classes['40+'] = population.iloc[:, 9:].sum(1)
pop_classes.head()


# In[13]:


age_group_index = pop_classes.columns.str.strip()
age_group_index


# In[14]:


pop_classes_total = pop_classes.sum().values


# Break out infants from 0-4 group. Will use average of 2015 and 2016 births, and subtract them from 0-4's.

# In[15]:


mean_births = int(np.mean(births))
N = [mean_births, pop_classes_total[0] - mean_births]
N.extend(pop_classes_total[1:])
N = np.array(N)


# Here is the assumed population size by age group, with infants as their own group.

# In[16]:


N


# Try to obtain susceptible proportion from coverage history

# In[17]:


coverage = pd.read_csv(DATA + 'coverage.csv', index_col=0)


# In[18]:


coverage.tail(20)


# In[19]:


age_classes = [0,1,5,10,15,20,25,30,35,40,100]
age_slices = [slice(age_classes[i], age_classes[i+1]) for i in range(len(age_classes)-1)]


# In[20]:


n_age_classes = len(age_classes) - 1


# In[21]:


efficacy = 0.85


# Dependent MCV immunization

# In[22]:


immune_MCV1 = coverage[::-1].MCV1 * efficacy
immune_MCV2 = (1 - immune_MCV1) * coverage[::-1].MCV2 * efficacy


# In[23]:


MCV_immunity = immune_MCV2 + immune_MCV1


# In[24]:


downsample_mean = lambda x: np.array([x[s].mean() for s in age_slices])
downsample_sum = lambda x: np.array([x[s].sum() for s in age_slices])


# In[25]:


group_coverage = downsample_mean(coverage[::-1])


# In[26]:


downsample_mean(immune_MCV1)


# In[27]:


susceptibility = (1 - coverage*efficacy).prod(axis=1).round(3).sort_index()
susceptibility.plot()
plt.ylabel('Susceptibility')


# In[28]:


p_susc = susceptibility.groupby(pd.cut(susceptibility.index, age_classes)).mean()


# Down-sample susceptibility to age groups.

# In[30]:


residual_susceptibility = downsample_mean(susceptibility)


# In[32]:


age_labels = ['0', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29', 
                    '30-34', '35-39', '40+']
plt.bar(range(len(age_slices)), downsample_mean(susceptibility), 
        tick_label=age_labels);


# *These susceptibility rates can't be true given the distribution of cases that were observed!*

# ### SIA
# 
# Create a time x age matrix of SIA vaccinations to account for intervention

# In[33]:


sia_2015 = pd.read_csv(DATA + 'sia_2015.csv', index_col=0)
sia_2016 = pd.read_csv(DATA + 'sia_2016.csv', index_col=0)
sia_2015.head()


# In[34]:


sia_2015_total = (sia_2015.sum()
                          .reindex(pd.Index(age_labels))
                          .fillna(0)
                          .astype(int))
sia_2015_total


# In[35]:


sia_2016.sum()


# In[36]:


sia_2016_total = (sia_2016.sum()
                          .reindex(pd.Index(age_labels))
                          .fillna(0)
                          .astype(int))
sia_2016_total


# In[37]:


sia = pd.DataFrame(index=confirmed_counts.index, columns=pd.Index(age_labels)).fillna(0)


# In[38]:


sia.loc['2016-05-22'] = sia_2016_total
sia.loc[['2015-05-24', '2015-06-07', '2015-06-21']] = [sia_2015_total/4,
                                                       sia_2015_total/2,
                                                       sia_2015_total/4]


# In[39]:


sia


# In[40]:


sia_vacc = sia.cumsum(0).values


# ## Stochastic Disease Transmission Model
# 
# We will extend a simple SIR disease model, to account for confirmation status, which will be fit using MCMC.
# 
# This model fits the series of 2-week infection totals for each age group $a$ as a set of Poisson random variables:
# 
# \\[Pr(I_{a}(t) | \lambda_a(t)) = \text{Poisson}(\lambda_a(t)) \\]
# 
# Where the age-specific outbreak intensity at time $t$ is modeled as:
# 
# \\[\lambda_a(t) = S_a(t-1) \frac{I(t-1)\mathbf{B}}{N_a} \\]
# 
# where $S_a(t-1)$ is the number of susceptibles in age group $a$ in the previous time period, $I(t-1)$ an age-specific vector of the number of infected individuals in the previous time period, $\mathbf{B}$ a matrix of transmission coefficients (from Prem et al., specified above), and $N_a$ an estimate of the population of age-$a$ people in Mongolia.
# 
# The basic reproductive number $R_0$ was calculated as the largest real-valued eigenvalue of the matrix $\mathbf{B}$. To impose a mild constraint on $R_0$, we applied a Gaussian prior distribution whose 1st and 99th quantiles are 8 and 24, respectively, a reasonable range for a measles outbreak:

# ### Input data
# 
# Cases

# In[41]:


I = confirmed_counts.values.astype(int)
I.shape


# In[42]:


I_age = I.sum(0)


# In[43]:


lab_samples = line_list.dropna(subset=['date_lab']).copy()
lab_samples.head()


# In[44]:


clinic_confirmed = lab_samples.query('confirmed==1').dropna(subset=['age_years']).copy()
clinic_confirmed.shape


# In[45]:


clinic_confirmed['age_group'] = pd.cut(clinic_confirmed.age_years, age_classes, labels=np.arange(len(age_slices)))


# In[46]:


clinic_confirmed.dropna(subset=['age_years']).groupby('age_group')['labconfirmed'].mean()


# In[47]:


age_midpoints = np.array([0.5, 3, 7.5, 12.5, 17.5, 22.5, 27.5, 32.5, 37.5, 70])


# In[48]:


confirmed, age, age_group = clinic_confirmed[['labconfirmed', 'age_years', 'age_group']].values.T


# In[49]:


age_group = age_group.astype(int)
age = age.astype(float)


# We use a Gaussian random walk to model the variation in confirmation rate across age groups. These functions support the random walk model.

# In[50]:


n_knots = 20
knots = np.linspace(age.min(), age.max(), n_knots)


# In[51]:


def interpolate(x0, y0, x):
    
    idx = np.searchsorted(x0, x)
    dl = np.array(x - x0[idx - 1])
    dr = np.array(x0[idx] - x)
    d = dl + dr
    wl = dr / d

    return wl * y0[idx - 1] + (1 - wl) * y0[idx]


# Generates mcv2 effective coverage under assumption that second dose is given only to those children who received the first dose

# ```R
# routine_coverage_bias <- 0.95
# mcv1 <- read.csv("input-mcv1.csv",row.names = 1)  # read in first dose coverage
# m1 <- unlist(mcv1[which(rownames(mcv1)=="MNG"),1:35]) # select out MNG
# mcv2 <- read.csv("input-mcv2.csv",row.names = 1)  # read in second dose coverage
# m2 <- unlist(mcv2[which(rownames(mcv2)=="MNG"),1:35]) # select out MNG
# m1 <- c(rep(0,15),m1) * routine_coverage_bias
# m2 <- c(rep(0,15),m2) * routine_coverage_bias
# 
# L <- 60                                           # life expectancy
# R <- 25                                           # R0 -- in practice this is fit, this is just a test value for illustration
# 
# # Expected force of natural infection, discounted by vaccination, per SP paper
# foi <- rev(1-m1*.85) * rev(1-mcv2_dep(m1,m2,0.85)) *(R/L)  # here scaling effective MCV2 by mcv2_dep function above
# 
# #########################################
# #SIAs
# SIA.coverage <- 0.5                              # immunization due to sias
# sia1 <- rep(0,35)                                # 2012 SIA just illustrating ages 1-35 here, since historical coverage above only goes that far back.  Reasonable to assume no vaccination before 1980 -- if there was any it was trivial
# sia1[7:18] <- SIA.coverage                                 # current age classes that were targeted by SIA
# sia2 <- rep(0,35)                                # same for 2007 SIA 
# sia2[11:19] <- SIA.coverage
# sia3 <- rep(0,35)                               # same for 2000 SIA 
# sia3[15:29] <- SIA.coverage
# # note that in the calculation below, I assume that these are all independent, which is the most optimistic interpretation of impact
# 
# sia1 <- c(sia1,rep(0,15))
# sia2 <- c(sia2,rep(0,15))
# sia3 <- c(sia3,rep(0,15))
# #########################################
# # The 4hree terms here are:
# # susceptibles after the first dose
# # suceptibles after the second dose
# # susceptibles after natural infection
# # susceptibles after sias
# 
# plot((1-rev(m1)*.85) * rev(1-mcv2_dep(m1,m2,0.85)) * exp(-cumsum(foi)) *(1-sia1)*(1-sia2)*(1-sia3),ylim=c(0,.25),type="l",
#      xlab="current age (years)",
#      ylab="fraction susceptible")
# lines((1-rev(m1)*.85) * 1 * exp(-cumsum(foi)) *(1-sia1)*(1-sia2)*(1-sia3),ylim=c(0,.05),col=2)
# legend(0,.25,legend=c("with mcv2","set mcv2 to 0"),lty=1,col=1:2,bty="n")
# 
# # for the first case (black line)
# initial_susceptible_profile <- (1-rev(m1)*.85) * rev(1-mcv2_dep(m1,m2,0.85)) * exp(-cumsum(foi)) *(1-sia1)*(1-sia2)*(1-sia3)
# 
# ##############################################################
# # Initial susceptibles in numbers -- assume that that population in each age class is equal to the birth cohort in that year of birth
# # then the estiamte of the absolute number of susceptible in each age class is initial_susceptible_profile (above) times the rev()
# # of the birth cohorts.  This ignores any mortality, but that isn't terrible, since it presumes that you're likely to move from S->R 
# # prior to death
# births <- read.csv("input-births.csv",row.names = 1)
# births <- unlist(births[which(rownames(births)=="MNG"),1:35]) # select out MNG
# # This only gives the birth cohorts through 35 years.  It would be reasonable to assume that the prior birth cohorts
# # were the same size; e.g. c() births[1] as many times as needed to births.
# births <- c(rep(births[1],15),births)
# 
# # The first dose of measles vaccine is delivered between 9-12m of age. So most of the first age cohort is not yet eligible.
# # So we model the 0-1 class separately and assume that 25% are susceptible (older than 6m, when they lose maternal immunity on average, 
# # and younger than 9m). The 25% between 9m-12m are vaccinated at mcv1 level
# # 
# barplot(rev(births)*c(.25 * ( 1 + (1-rev(m1)*.85)[1]), initial_susceptible_profile[-1]), 
#         xlab = "birth year",
#         ylab="number susceptible")
# 
# expected_susceptibles <- rev(births)*c(.25 * ( 1 + (1-rev(m1)*.85)[1]), initial_susceptible_profile[-1])
# ```

# In[52]:


def mcv2_dep(mcv1, mcv2, ve):
    return (1 - ve) * mcv2 / (1 - mcv1 * ve)


# In[53]:


routine_coverage_bias = 0.95


# In[72]:


MCV1 = pd.read_csv(DATA + '../raw/input-mcv1.csv', index_col=0).loc['MNG', :'2014'].values
MCV2 = pd.read_csv(DATA + '../raw/input-mcv2.csv', index_col=0).loc['MNG', :'2014'].values

MCV1 = np.concatenate([np.zeros(15), MCV1]) * routine_coverage_bias
#MCV2 = np.concatenate([np.zeros(15), MCV2]) * routine_coverage_bias
MCV2 = np.zeros_like(MCV1)


# In[107]:


foi = (1 - MCV1 * .85)[::-1] * (1 - mcv2_dep(MCV1, MCV2, 0.85))[::-1] * (16 / 60)


# In[108]:


sia_coverage = 0.3
sia1 = np.zeros(50)
sia1[5:14] = sia_coverage
sia2 = np.zeros(50)
sia2[10:19] = sia_coverage
sia3 = np.zeros(50)
sia3[15:23] = sia_coverage
sia4 = np.zeros(50)
sia4[19:29] = sia_coverage


# In[109]:


initial_susceptible_profile = ((1 - MCV1 * .85)[::-1] 
                         * (1 - mcv2_dep(MCV1, MCV2, 0.85))[::-1] 
                         * np.exp(-np.cumsum(foi)) 
                         * (1 - sia1) * (1 - sia2) * (1 - sia3) * (1 - sia4))


# In[110]:


plt.plot(initial_susceptible_profile)


# In[111]:


birth_counts = pd.read_csv(DATA + '../raw/input-births.csv', index_col=0).loc['MNG', :'2014'].values
birth_counts = np.concatenate([np.ones(15)*birth_counts[0], birth_counts])


# In[112]:


initial_susceptible_profile[0] += 1
initial_susceptible_profile[0] *= 0.25


# In[113]:


expected_susceptibles = birth_counts[::-1] * initial_susceptible_profile


# In[114]:


S_exp = downsample_sum(expected_susceptibles)


# Difference between expected susceptibles and total cases by age group

# In[115]:


S_exp - I.cumsum(axis=0)[-1]


# In[116]:


plt.bar(np.arange(len(expected_susceptibles)), expected_susceptibles)


# In[117]:


def tdownsample(x): 
    return tt.stack([x[s].sum() for s in age_slices])


# In[120]:


with pm.Model() as uncertain_vacc_model:
    
    # Confirmation sub-model
    σ = pm.HalfNormal('σ', 5)
    y = pm.GaussianRandomWalk('y', sd=σ, shape=n_knots)
    α = interpolate(knots, y, age)
    π = pm.invlogit(α)
        
    confirmation = pm.Bernoulli('confirmation', π, observed=confirmed)
    
    # Calculate age group probabilities of confirmation
    α_group = interpolate(knots, y, age_midpoints)
    p_confirm = pm.Deterministic('p_confirm', pm.invlogit(α_group))
    
    # Basic reproduction number centered on the dominant eigenvector of the contact matrix
    R_0 = pm.Normal('R_0', max(evs[np.isreal(evs)]), sd=3.5)
    
    # Force of infection based on dependent MCV efficacy
    FOI = (1 - MCV1)[::-1] * (1 - mcv2_dep(MCV1, MCV2, efficacy))[::-1] * R_0 / 60
    # Susceptibility prior to outbreaks
    susceptible = ((1 - MCV1 * .85)[::-1] 
                         * (1 - mcv2_dep(MCV1, MCV2, efficacy))[::-1] 
                         * tt.exp(-tt.cumsum(FOI)) 
                         * (1 - sia1) * (1 - sia2) * (1 - sia3) * (1 - sia3))
    susceptible = tt.concatenate([tt.stack(0.25*(1 + susceptible[0])), susceptible[1:]])
    # Initial susceptibles
    S_0 = pm.Deterministic('S_0', tdownsample(birth_counts[::-1] * susceptible))
    
    # Susceptibles over time, removing individuals vaccinated by SIA
#     S = S_0 - sia_vacc*efficacy*0.5 - shared(I.cumsum(axis=0))
    S = S_0 - shared(I.cumsum(axis=0))

    # Force of infection
    λ = S * (I.dot(B_prem_exp) / N.sum()) + 0.001
    
    # Effective reproductive number
    R_t = pm.Deterministic('R_t', S.sum(1) * R_0 / N.sum())
    
    # Adjust for confirmation bias
    λ_apparent = λ / p_confirm
    
    # Infections likelihood
    infections = pm.Poisson('infections', λ_apparent[:-1], observed=I[1:])


# In[121]:


uncertain_vacc_model.check_test_point()


# In[122]:


with uncertain_vacc_model:
    
    trace = pm.sample(1000, tune=2000, cores=2)


# In[123]:


pm.energyplot(trace);


# In[124]:


if 'p_vacc_interval__' in [v.name for v in uncertain_vacc_model.free_RVs]:
    pm.forestplot(trace, varnames=['p_vacc'], ylabels=age_labels, rhat=False)


# In[125]:


pm.traceplot(trace, varnames=['R_0'])


# However, this model estimates $R_e$ values that are too high.

# In[126]:


pm.forestplot(trace, varnames=['R_t'])