#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import statsmodels as sm
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_context('notebook')


# In[2]:


np.random.seed(42)


# Data import

# In[3]:


ts_data = pd.read_excel('Trend study data_DEC2012-2015.xlsx', index_col=0)
ts_data.head()


# Plot of incidence time series (blue line) and the weekly outbreak records (green dots).

# In[4]:


ts_data.Percent_incident.plot(label='incidence')
ax = ts_data.Outbreaks.plot(secondary_y=True, style='o', label='outbreaks')
ax.set_yticks(range(3))
ax.set_ylim(0, 2.5);


# Autocorrelation function and partial autocorrelation function for observed incidence. The partial autocorrelation shows the autocorrelation at a particular time lag, controlling for the autocorrelation of other periods.
# 
# This shows that there is probably information going back 2 or 3 weeks that can help us inform the prediction of incidence.

# In[5]:


from statsmodels.graphics.tsaplots import acf, pacf

fig, axes = plt.subplots(1, 2)
a = acf(ts_data.Percent_incident)
axes[0].bar(range(len(a)), a)
axes[0].set_ylim(-1,1)
axes[0].set_title('ACF')

p = pacf(ts_data.Percent_incident)
axes[1].bar(range(len(p)), p)
axes[1].set_ylim(-1,1)
axes[1].set_title('Partial ACF')


# This model fits a second order autoregressive model to the incidence time series to check how well current incidence can be predicted from the previous two weeks. That is, we fit the following relationship:
# 
# $$\theta_t = \mu + \rho_1 y_{t-1} + \rho_2 y_{t-2}$$
# 
# where the $\rho_1, \rho_2$ parameters are the influence of information one week and two weeks prior, respectively, on the current expected incidence ($\theta_t$).
# 
# We then use this value in a normal likelihood to predict the observed value $y_t$:
# 
# $$y_t \sim N(\theta_t, \tau$$

# In[6]:


import pymc3 as pm

y = ts_data.Percent_incident.values

with pm.Model() as ar2:
    
    mu = pm.Normal('mu', 0, 1e-6)
    
    # Autoregressive parameters
    ar = 2
    rho = pm.Normal('rho', 0, 1e-3, shape=ar)
    
    sigma = pm.Uniform('sigma', 0, 100)
    tau = sigma ** -2
    
    # Expected values
    theta = pm.Deterministic('theta', mu + rho[0]*y[:-2] + rho[1]*y[1:-1])
    
    incid = pm.Normal('incid', theta, tau, observed=y[ar:])
    incid_pred = pm.Normal('incid_pred', theta, tau, shape=y[ar:].shape)


# In[7]:


with ar2:
    
    tr = pm.sample(2000)


# In[8]:


pm.traceplot(tr[1000:], vars=['rho', 'sigma']);


# Expected incidence

# In[9]:


ax = ts_data.Percent_incident.plot()
ax.plot(tr[-100:]['theta'].T, alpha=0.02, color='r');


# Predicted incidence, that is, including sampling variation as well as parametric uncertainty.

# In[10]:


ax = ts_data.Percent_incident.plot()
ax.plot(tr[-100:]['incid_pred'].T, alpha=0.02, color='r');


# Now that we have shown that we can predict current incidence from past incidence, we extend the model by attempting to predict outbreaks based on the predicted incidence.
# 
# Specifically, we model the observed number of outbreaks in week $t$ as a negative binomial random variable. The negative binomial is a generalization of the Poisson distribution that can be used to flexibly predict counts.
# 
# $$x_t \sim NB(\phi_t, \alpha)$$
# 
# where the expected value $\phi_t$ is a function of the expected incidence that we predicted from past incidence.
# 
# $$\phi_t = \beta_0 + \beta_1 \theta_t$$
# 
# For this model, the prediction is based on the previous **three weeks** of incidence, rather than just 2 in the previous model.

# In[23]:


import pymc3 as pm
import theano.tensor as tt

y = ts_data.Percent_incident.values
count = ts_data.Outbreaks

with pm.Model() as ar2_pred:
    
    
    mu = pm.Normal('mu', 0, 1e-6)
    
    # Autoregressive parameters
    ar = 3
    rho = pm.Normal('rho', 0, 1e-3, shape=ar)
    
    # Expected values
    theta = pm.Deterministic('theta', mu + rho[0]*y[:-ar] + rho[1]*y[ar-2:-(ar-1)] 
                                             + rho[2]*y[ar-1:-(ar-2)])
    
    sigma = pm.Uniform('sigma', 0, 100)
    tau = sigma ** -2
    
    incid = pm.Normal('incid', theta, tau, observed=y[ar:])
    
    beta = pm.Normal('beta', 0, 0.01, shape=2)
    
    expected_outbreaks = pm.Deterministic('expected_outbreaks', np.exp(beta[0] + beta[1]*theta))
    # Probability of one or more outbreaks
    pred_prob_outbreak = pm.Deterministic('pred_prob_outbreak', expected_outbreaks>1)
    
    alpha = pm.Exponential('alpha', 0.1, testval=1)
    
    outbreaks = pm.NegativeBinomial('outbreaks', expected_outbreaks, alpha=alpha,
                                    observed=count[ar:])


# In[24]:


iterations = 10000
burn = 5000


# In[25]:


with ar2_pred:
    start = pm.find_MAP()
    trace = pm.sample(iterations, step=pm.NUTS(scaling=start), start=start)


# In[26]:


pm.traceplot(trace[burn:], vars=['beta', 'alpha']);


# In[27]:


pm.summary(trace[burn:], vars=['rho', 'beta'])


# In[28]:


ax = ts_data.Outbreaks.plot(style='bo')
ax.plot(trace[-100:]['expected_outbreaks'].T, alpha=0.02, color='r');


# This plots the posterior probability of 1 or more outbreaks, based on incidence information in the previous 3 weeks. The labels indicate the year and week of times where the probability of 1 or more outbreaks is predicted to be greater than 5%.

# In[38]:


probs = trace[burn:]['pred_prob_outbreak'].mean(0)
label_mask = probs>0.05

ax = pd.Series(probs, index=ts_data.index[3:]).plot()
ax.set_ylabel('Probability of 1+ outbreaks')
ax.set_xlabel('Week')
ax.set_xticklabels(ts_data.index[2:], rotation=45)

for i,x,y in zip(np.arange(len(probs))[label_mask], ts_data.index[3:][label_mask], probs[label_mask]):
    ax.annotate(str(x), xy=(i,y))


# In[ ]:


get_ipython().system('gist -u 670e777406a2f2bfb67e "Norovirus prediction.ipynb"')