#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import statsmodels as sm import seaborn as sns import matplotlib.pyplot as plt sns.set_context('notebook') # In[2]: np.random.seed(42) # Data import # In[3]: ts_data = pd.read_excel('Trend study data_DEC2012-2015.xlsx', index_col=0) ts_data.head() # Plot of incidence time series (blue line) and the weekly outbreak records (green dots). # In[4]: ts_data.Percent_incident.plot(label='incidence') ax = ts_data.Outbreaks.plot(secondary_y=True, style='o', label='outbreaks') ax.set_yticks(range(3)) ax.set_ylim(0, 2.5); # Autocorrelation function and partial autocorrelation function for observed incidence. The partial autocorrelation shows the autocorrelation at a particular time lag, controlling for the autocorrelation of other periods. # # This shows that there is probably information going back 2 or 3 weeks that can help us inform the prediction of incidence. # In[5]: from statsmodels.graphics.tsaplots import acf, pacf fig, axes = plt.subplots(1, 2) a = acf(ts_data.Percent_incident) axes[0].bar(range(len(a)), a) axes[0].set_ylim(-1,1) axes[0].set_title('ACF') p = pacf(ts_data.Percent_incident) axes[1].bar(range(len(p)), p) axes[1].set_ylim(-1,1) axes[1].set_title('Partial ACF') # This model fits a second order autoregressive model to the incidence time series to check how well current incidence can be predicted from the previous two weeks. That is, we fit the following relationship: # # $$\theta_t = \mu + \rho_1 y_{t-1} + \rho_2 y_{t-2}$$ # # where the $\rho_1, \rho_2$ parameters are the influence of information one week and two weeks prior, respectively, on the current expected incidence ($\theta_t$). # # We then use this value in a normal likelihood to predict the observed value $y_t$: # # $$y_t \sim N(\theta_t, \tau$$ # In[6]: import pymc3 as pm y = ts_data.Percent_incident.values with pm.Model() as ar2: mu = pm.Normal('mu', 0, 1e-6) # Autoregressive parameters ar = 2 rho = pm.Normal('rho', 0, 1e-3, shape=ar) sigma = pm.Uniform('sigma', 0, 100) tau = sigma ** -2 # Expected values theta = pm.Deterministic('theta', mu + rho[0]*y[:-2] + rho[1]*y[1:-1]) incid = pm.Normal('incid', theta, tau, observed=y[ar:]) incid_pred = pm.Normal('incid_pred', theta, tau, shape=y[ar:].shape) # In[7]: with ar2: tr = pm.sample(2000) # In[8]: pm.traceplot(tr[1000:], vars=['rho', 'sigma']); # Expected incidence # In[9]: ax = ts_data.Percent_incident.plot() ax.plot(tr[-100:]['theta'].T, alpha=0.02, color='r'); # Predicted incidence, that is, including sampling variation as well as parametric uncertainty. # In[10]: ax = ts_data.Percent_incident.plot() ax.plot(tr[-100:]['incid_pred'].T, alpha=0.02, color='r'); # Now that we have shown that we can predict current incidence from past incidence, we extend the model by attempting to predict outbreaks based on the predicted incidence. # # Specifically, we model the observed number of outbreaks in week $t$ as a negative binomial random variable. The negative binomial is a generalization of the Poisson distribution that can be used to flexibly predict counts. # # $$x_t \sim NB(\phi_t, \alpha)$$ # # where the expected value $\phi_t$ is a function of the expected incidence that we predicted from past incidence. # # $$\phi_t = \beta_0 + \beta_1 \theta_t$$ # # For this model, the prediction is based on the previous **three weeks** of incidence, rather than just 2 in the previous model. # In[23]: import pymc3 as pm import theano.tensor as tt y = ts_data.Percent_incident.values count = ts_data.Outbreaks with pm.Model() as ar2_pred: mu = pm.Normal('mu', 0, 1e-6) # Autoregressive parameters ar = 3 rho = pm.Normal('rho', 0, 1e-3, shape=ar) # Expected values theta = pm.Deterministic('theta', mu + rho[0]*y[:-ar] + rho[1]*y[ar-2:-(ar-1)] + rho[2]*y[ar-1:-(ar-2)]) sigma = pm.Uniform('sigma', 0, 100) tau = sigma ** -2 incid = pm.Normal('incid', theta, tau, observed=y[ar:]) beta = pm.Normal('beta', 0, 0.01, shape=2) expected_outbreaks = pm.Deterministic('expected_outbreaks', np.exp(beta[0] + beta[1]*theta)) # Probability of one or more outbreaks pred_prob_outbreak = pm.Deterministic('pred_prob_outbreak', expected_outbreaks>1) alpha = pm.Exponential('alpha', 0.1, testval=1) outbreaks = pm.NegativeBinomial('outbreaks', expected_outbreaks, alpha=alpha, observed=count[ar:]) # In[24]: iterations = 10000 burn = 5000 # In[25]: with ar2_pred: start = pm.find_MAP() trace = pm.sample(iterations, step=pm.NUTS(scaling=start), start=start) # In[26]: pm.traceplot(trace[burn:], vars=['beta', 'alpha']); # In[27]: pm.summary(trace[burn:], vars=['rho', 'beta']) # In[28]: ax = ts_data.Outbreaks.plot(style='bo') ax.plot(trace[-100:]['expected_outbreaks'].T, alpha=0.02, color='r'); # This plots the posterior probability of 1 or more outbreaks, based on incidence information in the previous 3 weeks. The labels indicate the year and week of times where the probability of 1 or more outbreaks is predicted to be greater than 5%. # In[38]: probs = trace[burn:]['pred_prob_outbreak'].mean(0) label_mask = probs>0.05 ax = pd.Series(probs, index=ts_data.index[3:]).plot() ax.set_ylabel('Probability of 1+ outbreaks') ax.set_xlabel('Week') ax.set_xticklabels(ts_data.index[2:], rotation=45) for i,x,y in zip(np.arange(len(probs))[label_mask], ts_data.index[3:][label_mask], probs[label_mask]): ax.annotate(str(x), xy=(i,y)) # In[ ]: get_ipython().system('gist -u 670e777406a2f2bfb67e "Norovirus prediction.ipynb"')