#!/usr/bin/env python
# coding: utf-8

# # Aggregating and weighting diverse data
#
# In this notebook, we illustrate how to aggregate diverse data, and how to combine this with an adaptive scheme for computing weights.

# ## Aggregating diverse distance functions
#
# We want to combine different distance metrics operating on subsets of the data into one distance value. As a toy model, assume we want to combine a Laplace and a Normal distance.

# In[ ]:


# install if not done yet
get_ipython().system('pip install pyabc --quiet')


# In[1]:


import os
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import pyabc

p_true = {'p0': 0, 'p1': 0}


def model(p):
    return {
        's0': p['p0'] + 0.1 * np.random.normal(),
        's1': p['p1'] + 0.1 * np.random.normal(),
    }


observation = {'s0': 0, 's1': 0}


def distance0(x, x_0):
    return abs(x['s0'] - x_0['s0'])


def distance1(x, x_0):
    return (x['s1'] - x_0['s1']) ** 2


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)


# The key is now to use `pyabc.distance.AggregatedDistance` to combine both.

# In[2]:


distance = pyabc.AggregatedDistance([distance0, distance1])

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history1 = abc.run(max_nr_populations=6)


# plotting
def plot_history(history):
    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p0',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()

    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p1',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()


plot_history(history1)
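# Conceptually, the aggregated distance reduces the individual distance values to a single number, essentially as a (weighted) sum. The following plain-Python sketch only illustrates this idea with unit weights (it is a conceptual sketch, not pyabc's implementation); the next section discusses how the weights can be chosen.

# In[ ]:


# Illustration only: combine the individual distance values into one number
# as a weighted sum (with unit weights, this is just distance0 + distance1).
def aggregate(distances, weights, x, x_0):
    return sum(w * d(x, x_0) for w, d in zip(weights, distances))


sample = model(p_true)
print(aggregate([distance0, distance1], [1.0, 1.0], sample, observation))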
# ## Weighted aggregation
#
# A problem with the previous aggregation of distance functions is that they usually vary on different scales. To account for all of them in a similar manner, one can weight them.
#
# Let us look at a simple example of two summary statistics which vary on very different scales:

# In[3]:


import os
import tempfile

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import pyabc

p_true = {'p0': 0, 'p1': 0}


def model(p):
    return {
        's0': p['p0'] + 0.1 * np.random.normal(),
        's1': p['p1'] + 100 * np.random.normal(),
    }


observation = {'s0': 0, 's1': 0}


def distance0(x, x_0):
    return abs(x['s0'] - x_0['s0'])


def distance1(x, x_0):
    return (x['s1'] - x_0['s1']) ** 2


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)

distance = pyabc.AggregatedDistance([distance0, distance1])

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history1 = abc.run(max_nr_populations=6)


# plotting
def plot_history(history):
    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p0',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()

    fig, ax = plt.subplots()
    for t in range(history.max_t + 1):
        df, w = history.get_distribution(m=0, t=t)
        pyabc.visualization.plot_kde_1d(
            df,
            w,
            xmin=-1,
            xmax=1,
            x='p1',
            ax=ax,
            label=f"PDF t={t}",
            refval=p_true,
        )
    ax.legend()


plot_history(history1)


# The algorithm has difficulty extracting information about the first parameter from the first summary statistic, because the second summary statistic is on a much larger scale. Let us use `pyabc.distance.AdaptiveAggregatedDistance` instead, which tries to find good weights itself (and even adapts these weights over time):

# In[4]:


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)

distance = pyabc.AdaptiveAggregatedDistance([distance0, distance1])

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history2 = abc.run(max_nr_populations=6)

plot_history(history2)


# The result is much better. We can also compute the weights only once, in an initial calibration step, by setting `adaptive=False`:

# In[5]:


# prior
prior = pyabc.Distribution(
    p0=pyabc.RV("uniform", -1, 2), p1=pyabc.RV("uniform", -1, 2)
)

distance = pyabc.AdaptiveAggregatedDistance(
    [distance0, distance1], adaptive=False
)

abc = pyabc.ABCSMC(model, prior, distance)
db_path = "sqlite:///" + os.path.join(tempfile.gettempdir(), "tmp.db")
abc.new(db_path, observation)
history3 = abc.run(max_nr_populations=6)

plot_history(history3)


# Here, pre-calibration performs comparably to adaptation, because the weights do not change much over time. Note that one can also specify other scale functions by passing a ``scale_function`` argument to ``AdaptiveAggregatedDistance``, e.g. ``pyabc.distance.mean``, ``pyabc.distance.median``, or ``pyabc.distance.span``.
#
# The following plots demonstrate that the second and third runs not only yield a much better posterior approximation than the first after the same number of iterations, but also require far fewer samples.

# In[6]:


histories = [history1, history2, history3]
labels = ["Standard", "Adaptive", "Pre-Calibrated"]

pyabc.visualization.plot_sample_numbers(histories, labels, rotation=45)
pyabc.visualization.plot_total_sample_numbers(
    histories, labels, yscale='log10', rotation=45
)
pyabc.visualization.plot_effective_sample_sizes(histories, labels)
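# As mentioned above, other scale functions can be used when computing the weights. The following is a minimal sketch, assuming the ``scale_function`` keyword argument and ``pyabc.distance.mean`` are available in the installed pyabc version; it simply repeats the adaptive run with a different scale function.

# In[ ]:


# Sketch: adaptive aggregation, using the mean of the sampled distance values
# (instead of the default scale function) when computing the weights.
# Assumes the `scale_function` keyword and `pyabc.distance.mean` exist in
# the installed pyabc version.
distance_mean_scale = pyabc.AdaptiveAggregatedDistance(
    [distance0, distance1],
    scale_function=pyabc.distance.mean,
)

abc = pyabc.ABCSMC(model, prior, distance_mean_scale)
abc.new(db_path, observation)
history4 = abc.run(max_nr_populations=6)

plot_history(history4)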