#!/usr/bin/env python
# coding: utf-8

# # Inferring parameters of SDEs using an Euler-Maruyama scheme
#
# _This notebook is derived from a presentation prepared for the Theoretical Neuroscience Group, Institute of Systems Neuroscience at Aix-Marseille University._

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')
import pymc3 as pm
import theano.tensor as tt
import scipy
from pymc3.distributions.timeseries import EulerMaruyama


# ## Toy model 1
#
# Here's a scalar linear SDE in symbolic form
#
# $ dX_t = \lambda X_t + \sigma^2 dW_t $
#
# discretized with the Euler-Maruyama scheme, which predicts the next state from the current one:
#
# $ x_{i+1} = x_i + \lambda x_i \, \Delta t + \sigma^2 \sqrt{\Delta t} \, \epsilon_i, \quad \epsilon_i \sim N(0, 1) $

# In[2]:

# parameters
λ = -0.78
σ2 = 5e-3
N = 200
dt = 1e-1

# time series
x = 0.1
x_t = []

# simulate
for i in range(N):
    x += dt * λ * x + sqrt(dt) * σ2 * randn()
    x_t.append(x)

x_t = array(x_t)

# z_t noisy observation
z_t = x_t + randn(x_t.size) * 5e-3


# In[3]:

figure(figsize=(10, 3))
subplot(121)
plot(x_t[:30], 'k', label='$x(t)$', alpha=0.5), plot(z_t[:30], 'r', label='$z(t)$', alpha=0.5)
title('Transient'), legend()
subplot(122)
plot(x_t[30:], 'k', label='$x(t)$', alpha=0.5), plot(z_t[30:], 'r', label='$z(t)$', alpha=0.5)
title('All time');
tight_layout()


# What is the inference we want to make? Since we've made a noisy observation of the generated time series, we need to estimate both $x(t)$ and $\lambda$.

# First, we rewrite our SDE as a function returning a tuple of the drift and diffusion coefficients

# In[4]:

def lin_sde(x, lam):
    return lam * x, σ2


# Next, we describe the probability model as a set of three stochastic variables, `lam`, `xh`, and `zh`:

# In[5]:

with pm.Model() as model:
    # uniform prior, but we know it must be negative
    lam = pm.Flat('lam')

    # "hidden states" following a linear SDE distribution
    # parametrized by time step (det. variable) and lam (random variable)
    xh = EulerMaruyama('xh', dt, lin_sde, (lam, ), shape=N, testval=x_t)

    # predicted observation
    zh = pm.Normal('zh', mu=xh, sd=5e-3, observed=z_t)


# Once the model is constructed, we perform inference, i.e. sample from the posterior distribution, in the following steps:

# In[6]:

with model:
    # optimize to find the mode of the posterior as starting point for prob. mass
    start = pm.find_MAP(vars=[xh], fmin=scipy.optimize.fmin_l_bfgs_b)

    # "warm up" to transition from mode to prob. mass
    step = pm.NUTS(scaling=start)
    trace = pm.sample(1000, step, progressbar=True)

    # sample from the prob. mass
    step = pm.NUTS(scaling=trace[-1], gamma=.25)
    trace = pm.sample(2000, step, start=trace[-1], progressbar=True)


# Next, we plot some basic statistics on the samples from the posterior,

# In[7]:

figure(figsize=(10, 3))
subplot(121)
plot(percentile(trace[xh], [2.5, 97.5], axis=0).T, 'k', label='$\hat{x}_{95\%}(t)$')
plot(x_t, 'r', label='$x(t)$')
legend()
subplot(122)
hist(trace[lam], 30, label='$\hat{\lambda}$', alpha=0.5)
axvline(λ, color='r', label='$\lambda$', alpha=0.5)
legend();
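# As a quick numerical complement to the histogram above (a minimal sketch, not part of the original analysis; `lam_samples` is a name introduced here), we can summarise the posterior of $\lambda$ directly from the samples:

# In[ ]:

lam_samples = trace[lam]  # posterior draws of λ, as plotted above
print('posterior mean of λ: %.3f' % lam_samples.mean())
print('posterior sd of λ:   %.3f' % lam_samples.std())
print('95%% interval: [%.3f, %.3f]' % tuple(percentile(lam_samples, [2.5, 97.5])))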
# A model can fit the data precisely and still be wrong; we need to use _posterior predictive checks_ to assess whether, under our fitted model, the data are likely.
#
# In other words, we
# - assume the model is correct
# - simulate new observations
# - check that the new observations fit with the original data

# In[8]:

# generate trace from posterior
ppc_trace = pm.sample_posterior_predictive(trace, model=model)

# plot with data
figure(figsize=(10, 3))
plot(percentile(ppc_trace['zh'], [2.5, 97.5], axis=0).T, 'k', label=r'$z_{95\% PP}(t)$')
plot(z_t, 'r', label='$z(t)$')
legend()


# Note that
#
# - inference also estimates the initial conditions
# - the observed data $z(t)$ lies fully within the 95% interval of the PPC
# - there are many other ways of evaluating fit

# ## Toy model 2
#
# As the next model, let's use a 2D deterministic oscillator,
# \begin{align}
# \dot{x} &= \tau (x - x^3/3 + y) \\
# \dot{y} &= \frac{1}{\tau} (a - x)
# \end{align}
#
# with noisy observation $z(t) = m x + (1 - m) y + N(0, 0.1)$.

# In[9]:

N, τ, a, m, σ2 = 200, 3.0, 1.05, 0.2, 1e-1
xs, ys = [0.0], [1.0]
for i in range(N):
    x, y = xs[-1], ys[-1]
    dx = τ * (x - x**3.0/3.0 + y)
    dy = (1.0 / τ) * (a - x)
    xs.append(x + dt * dx + sqrt(dt) * σ2 * randn())
    ys.append(y + dt * dy + sqrt(dt) * σ2 * randn())
xs, ys = array(xs), array(ys)
zs = m * xs + (1 - m) * ys + randn(xs.size) * 0.1

figure(figsize=(10, 2))
plot(xs, label='$x(t)$')
plot(ys, label='$y(t)$')
plot(zs, label='$z(t)$')
legend()


# Now we estimate the hidden states $x(t)$ and $y(t)$, as well as the parameters $\tau$, $a$ and $m$.
#
# As before, we rewrite our SDE as a function returning the drift and diffusion coefficients:

# In[10]:

def osc_sde(xy, τ, a):
    x, y = xy[:, 0], xy[:, 1]
    dx = τ * (x - x**3.0/3.0 + y)
    dy = (1.0 / τ) * (a - x)
    dxy = tt.stack([dx, dy], axis=0).T
    return dxy, σ2


# As before, the Euler-Maruyama discretization of the SDE is written as a prediction of the state at step $i+1$ based on the state at step $i$; a small sketch of such a step follows.
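# To make that prediction concrete, here is a minimal NumPy sketch of a single Euler-Maruyama step for this oscillator (it mirrors the simulation loop above; `euler_step` is a helper name introduced here for illustration and is not part of the PyMC3 API):

# In[ ]:

def euler_step(xy, dt, τ, a, σ2):
    """Predict the state at step i+1 from the state at step i."""
    x, y = xy
    dx = τ * (x - x**3.0 / 3.0 + y)   # drift, x component
    dy = (1.0 / τ) * (a - x)          # drift, y component
    return xy + dt * array([dx, dy]) + sqrt(dt) * σ2 * randn(2)

# one predicted step from the oscillator's initial state
print(euler_step(array([0.0, 1.0]), dt, τ, a, σ2))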
# We can now write our statistical model as before, with uninformative priors on $\tau$, $a$ and $m$:

# In[11]:

xys = c_[xs, ys]

with pm.Model() as model:
    τh = pm.Uniform('τh', lower=0.1, upper=5.0)
    ah = pm.Uniform('ah', lower=0.5, upper=1.5)
    mh = pm.Uniform('mh', lower=0.0, upper=1.0)
    xyh = EulerMaruyama('xyh', dt, osc_sde, (τh, ah), shape=xys.shape, testval=xys)
    zh = pm.Normal('zh', mu=mh * xyh[:, 0] + (1 - mh) * xyh[:, 1], sd=0.1, observed=zs)


# As with the linear SDE, we 1) find a MAP estimate, 2) warm up and 3) sample from the probability mass:

# In[12]:

with model:
    # optimize to find the mode of the posterior as starting point for prob. mass
    start = pm.find_MAP(vars=[xyh], fmin=scipy.optimize.fmin_l_bfgs_b)

    # "warm up" to transition from mode to prob. mass
    step = pm.NUTS(scaling=start)
    trace = pm.sample(100, step, progressbar=True)

    # sample from the prob. mass
    step = pm.NUTS(scaling=trace[-1], gamma=.25)
    trace = pm.sample(2000, step, start=trace[-1], progressbar=True)


# Again, the result is a set of samples from the posterior, including our parameters of interest but also the hidden states

# In[13]:

figure(figsize=(10, 6))
subplot(211)
plot(percentile(trace[xyh][..., 0], [2.5, 97.5], axis=0).T, 'k', label='$\hat{x}_{95\%}(t)$')
plot(xs, 'r', label='$x(t)$')
legend(loc=0)
subplot(234), hist(trace['τh']), axvline(τ), xlim([1.0, 4.0]), title('τ')
subplot(235), hist(trace['ah']), axvline(a), xlim([0, 2.0]), title('a')
subplot(236), hist(trace['mh']), axvline(m), xlim([0, 1]), title('m')
tight_layout()


# Again, we can perform a posterior predictive check to verify that our data are likely given the fitted model

# In[14]:

# generate trace from posterior
ppc_trace = pm.sample_posterior_predictive(trace, model=model)

# plot with data
figure(figsize=(10, 3))
plot(percentile(ppc_trace['zh'], [2.5, 97.5], axis=0).T, 'k', label=r'$z_{95\% PP}(t)$')
plot(zs, 'r', label='$z(t)$')
legend()
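# As a rough quantitative complement to the plot (a sketch, not part of the original notebook; `lo`, `hi` and `covered` are names introduced here), we can count how many observed points fall inside the 95% posterior predictive band:

# In[ ]:

lo, hi = percentile(ppc_trace['zh'], [2.5, 97.5], axis=0)
covered = ((zs >= lo) & (zs <= hi)).mean()
print('fraction of z(t) inside the 95%% PP interval: %.2f' % covered)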