#!/usr/bin/env python
# coding: utf-8

# # Aggregating and weighting diverse data
#
# In this notebook, we illustrate how to aggregate diverse data, and how to combine this with an adaptive scheme for computing weights.

# ## Aggregating diverse distance functions
#
# We want to combine different distance metrics operating on subsets of the data into one distance value. As a toy model, assume we want to combine a Laplace and a Normal distance.

# In[ ]:


# install if not done yet
get_ipython().system('pip install pyabc --quiet')


# In[1]:


import os
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import pyabc

p_true = {'p0': 0, 'p1': 0}


def model(p):
    return {
        's0': p['p0'] + 0.1 * np.random.normal(),
        's1': p['p1'] + 0.1 * np.random.normal(),
    }


observation = {'s0': 0, 's1': 0}


def distance0(x, x_0):
    return abs(x['s0'] - x_0['s0'])


def distance1(x, x_0):
    return (x['s1'] - x_0['s1']) ** 2


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)


# The key is now to use `pyabc.distance.AggregatedDistance` to combine both.

# In[2]:


distance = pyabc.AggregatedDistance([distance0, distance1])

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history1 = abc.run(max_nr_populations=6)


# plotting
def plot_history(history):
    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p0',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()

    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p1',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()


plot_history(history1)
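# Conceptually, the aggregated distance reduces the individual distance values to a single number, essentially as a (weighted) sum. The following plain-Python sketch only illustrates this idea with unit weights (it is a conceptual sketch, not pyabc's implementation); the next section discusses how the weights can be chosen.

# In[ ]:


# Illustration only: combine the individual distance values into one number
# as a weighted sum (with unit weights, this is just distance0 + distance1).
def aggregate(distances, weights, x, x_0):
    return sum(w * d(x, x_0) for w, d in zip(weights, distances))


sample = model(p_true)
print(aggregate([distance0, distance1], [1.0, 1.0], sample, observation))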
# ## Weighted aggregation
#
# A problem with the previous aggregation of distance functions is that they usually vary on different scales. To account for all of them in a similar manner, one can weight them.
#
# Let us look at a simple example of two summary statistics which vary on very different scales:

# In[3]:


import os
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import pyabc

p_true = {'p0': 0, 'p1': 0}


def model(p):
    return {
        's0': p['p0'] + 0.1 * np.random.normal(),
        's1': p['p1'] + 100 * np.random.normal(),
    }


observation = {'s0': 0, 's1': 0}


def distance0(x, x_0):
    return abs(x['s0'] - x_0['s0'])


def distance1(x, x_0):
    return (x['s1'] - x_0['s1']) ** 2


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)

distance = pyabc.AggregatedDistance([distance0, distance1])

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history1 = abc.run(max_nr_populations=6)


# plotting
def plot_history(history):
    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p0',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()

    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p1',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()


plot_history(history1)


# The algorithm has difficulty extracting information about the first parameter from the first summary statistic, because the second summary statistic is on a much larger scale. Let us use `pyabc.distance.AdaptiveAggregatedDistance` instead, which tries to find good weights itself (and even adapts these weights over time):

# In[4]:


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)

distance = pyabc.AdaptiveAggregatedDistance([distance0, distance1])

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history2 = abc.run(max_nr_populations=6)

plot_history(history2)


# The result is much better. We can also compute the weights only once, in an initial calibration step, by setting `adaptive=False`:

# In[5]:


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)

distance = pyabc.AdaptiveAggregatedDistance(
    [distance0, distance1], adaptive=False
)

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history3 = abc.run(max_nr_populations=6)

plot_history(history3)


# Here, pre-calibration performs comparably to adaptation, because the weights do not change much over time. Note that one can also specify other scale functions by passing a ``scale_function`` argument to ``AdaptiveAggregatedDistance``, e.g. ``pyabc.distance.mean``, ``pyabc.distance.median``, or ``pyabc.distance.span``.
#
# The following plots demonstrate that the second and third runs not only yield a much better posterior approximation than the first after the same number of iterations, but also require far fewer samples.

# In[6]:


histories = [history1, history2, history3]
labels = ["Standard", "Adaptive", "Pre-Calibrated"]

pyabc.visualization.plot_sample_numbers(histories, labels, rotation=45)
pyabc.visualization.plot_total_sample_numbers(
    histories, labels, yscale='log10', rotation=45
)
pyabc.visualization.plot_effective_sample_sizes(histories, labels)
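# As mentioned above, other scale functions can be used when computing the weights. The following is a minimal sketch, assuming the ``scale_function`` keyword argument and ``pyabc.distance.mean`` are available in the installed pyabc version; it simply repeats the adaptive run with a different scale function.

# In[ ]:


# Sketch: adaptive aggregation, using the mean of the sampled distance values
# (instead of the default scale function) when computing the weights.
# Assumes the `scale_function` keyword and `pyabc.distance.mean` exist in
# the installed pyabc version.
distance_mean_scale = pyabc.AdaptiveAggregatedDistance(
    [distance0, distance1],
    scale_function=pyabc.distance.mean,
)

abc = pyabc.ABCSMC(model, prior, distance_mean_scale)
abc.new(db_path, observation)
history4 = abc.run(max_nr_populations=6)

plot_history(history4)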