#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')
rc('text', usetex=True)
rc('axes', labelsize=15, titlesize=15)


# In[2]:

import seaborn as sns
sns.set_style("white")
np.random.seed(10)


# # ABC PMC on a 2D gaussian example
#
# In this example we look at a dataset that has been drawn from a 2D gaussian distribution. We assume that we don't have a proper likelihood, but that we know the covariance matrix $\Sigma$ of the distribution. Using the ABC PMC algorithm we will approximate the posterior of the distribution of the mean values.
#
# First we generate a new dataset by drawing random variables from a multivariate gaussian around mean=[1.1, 1.5]. This is going to be our observed data set.

# In[3]:

samples_size = 1000
sigma = np.eye(2) * 0.25
means = [1.1, 1.5]
data = np.random.multivariate_normal(means, sigma, samples_size)

matshow(sigma)
title("covariance matrix sigma")
colorbar()


# Then we need to define our model/simulation. In this case this is simple: we again draw random variables from a multivariate gaussian distribution, using the given mean and the sigma from above.

# In[4]:

def create_new_sample(theta):
    return np.random.multivariate_normal(theta, sigma, samples_size)


# Next, we need to define a distance measure. We will use the sum of the absolute differences of the means of the simulated and the observed data.

# In[5]:

def dist_measure(x, y):
    return np.sum(np.abs(np.mean(x, axis=0) - np.mean(y, axis=0)))


# ### Verification
#
# To verify that everything works and to see the effect of the randomness in the simulation, we compute the distance for 1000 simulations at the true mean values.

# In[6]:

distances = [dist_measure(data, create_new_sample(means)) for _ in range(1000)]


# In[7]:

sns.distplot(distances, axlabel="distances")
title("Variability of distance from simulations")


# ### Setup
#
# Now we're going to set up the ABC PMC sampling.

# In[8]:

import abcpmc


# As a prior we're going to use a gaussian centered on our best guess of the distribution of the means.

# In[9]:

prior = abcpmc.GaussianPrior(mu=[1.0, 1.0], sigma=np.eye(2) * 0.5)


# As threshold $\epsilon$ we're going to use the $\alpha^{th}$ percentile of the sorted distances of the particles of the current iteration. The simplest way to do this is to define a constant $\epsilon$ and adapt the threshold iteratively. As starting value we define a sufficiently high value so that the acceptance ratio is reasonable, and we will sample for T iterations.

# In[10]:

alpha = 75
T = 20
eps_start = 1.0
eps = abcpmc.ConstEps(T, eps_start)


# Finally, we create an instance of the sampler. We want to use 5000 particles and the functions we defined above. Additionally, we're going to make use of the built-in parallelization and use 7 cores for the sampling.

# In[11]:

sampler = abcpmc.Sampler(N=5000, Y=data, postfn=create_new_sample, dist=dist_measure, threads=7)


# Optionally, we can customize the proposal creation. Here we're going to use an "Optimal Local Covariance Matrix" (OLCM) kernel, as proposed by Filippi et al. (2012). This has been shown to yield a high acceptance ratio together with a faster decrease of the threshold.

# In[12]:

sampler.particle_proposal_cls = abcpmc.OLCMParticleProposal
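# The percentile-based threshold adaptation used in the sampling loop below can be previewed on the verification distances computed above. This is an added, minimal numpy-only sketch (not part of the original notebook); it assumes `distances` from In[6] and `alpha` from In[10] are still in scope.

# In[ ]:

preview_eps = np.percentile(distances, alpha)  # alpha-th percentile of the verification distances
print("example adapted threshold: {0:>.4f}".format(preview_eps))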
# ### Sampling
#
# Now we're ready to sample. All we need to do is iterate over the values yielded by the sampler instance. The sample function returns a namedtuple per iteration that contains all the information we're interested in.

# In[13]:

def launch():
    eps = abcpmc.ConstEps(T, eps_start)

    pools = []
    for pool in sampler.sample(prior, eps):
        print("T: {0}, eps: {1:>.4f}, ratio: {2:>.4f}".format(pool.t, eps(pool.eps), pool.ratio))

        for i, (mean, std) in enumerate(zip(*abcpmc.weighted_avg_and_std(pool.thetas, pool.ws, axis=0))):
            print(u"    theta[{0}]: {1:>.4f} \u00B1 {2:>.4f}".format(i, mean, std))

        eps.eps = np.percentile(pool.dists, alpha)  # reduce eps value
        pools.append(pool)
    sampler.close()
    return pools


# In[14]:

import time
t0 = time.time()
pools = launch()
print("took", (time.time() - t0))


# ### Postprocessing
#
# How did the sampled values evolve over the iterations? As the threshold decreases, we expect the errors to shrink while the means converge to the true means.

# In[15]:

for i in range(len(means)):
    moments = np.array([abcpmc.weighted_avg_and_std(pool.thetas[:, i], pool.ws, axis=0) for pool in pools])
    errorbar(range(T), moments[:, 0], moments[:, 1])
hlines(means, 0, T, linestyle="dotted", linewidth=0.7)
_ = xlim([-.5, T])


# What does the distribution of the distances look like after we have approximated the posterior? If we're close to the true posterior, we expect a high bin count around the values we found in the earlier distribution plot.

# In[16]:

distances = np.array([pool.dists for pool in pools]).flatten()
sns.distplot(distances, axlabel="distance")


# How did our $\epsilon$ values behave over the iterations? Using the $\alpha^{th}$ percentile causes the threshold to decrease relatively fast in the beginning and to plateau later on.

# In[17]:

eps_values = np.array([pool.eps for pool in pools])
plot(eps_values, label=r"$\epsilon$ values")
xlabel("Iteration")
ylabel(r"$\epsilon$")
legend(loc="best")


# What about the acceptance ratio? ABC PMC with the OLCM kernel gives us a relatively high acceptance ratio.

# In[18]:

acc_ratios = np.array([pool.ratio for pool in pools])
plot(acc_ratios, label="Acceptance ratio")
ylim([0, 1])
xlabel("Iteration")
ylabel("Acceptance ratio")
legend(loc="best")


# Finally, what does our posterior look like? For the visualization we're using triangle.py (https://github.com/dfm/triangle.py).

# In[20]:

import triangle
samples = np.vstack([pool.thetas for pool in pools])
fig = triangle.corner(samples, truths=means)


# Omitting the first couple of iterations and keeping only the last pool..

# In[21]:

idx = -1
samples = pools[idx].thetas
fig = triangle.corner(samples, weights=pools[idx].ws, truths=means)

for mean, std in zip(*abcpmc.weighted_avg_and_std(samples, pools[idx].ws, axis=0)):
    print(u"mean: {0:>.4f} \u00B1 {1:>.4f}".format(mean, std))
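# As an added cross-check (not part of the original notebook), the weighted posterior covariance of the last pool can be estimated with plain numpy. This is a minimal sketch; it assumes `samples` and the weights `pools[idx].ws` from the cell above are still in scope.

# In[ ]:

w = pools[idx].ws / np.sum(pools[idx].ws)           # normalized importance weights
post_mean = np.sum(w[:, None] * samples, axis=0)    # weighted posterior mean
diff = samples - post_mean
post_cov = np.dot((w[:, None] * diff).T, diff)      # weighted posterior covariance estimate
print("weighted posterior covariance estimate:")
print(post_cov)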