Chapter 10 - Model Comparison and Hierarchical Modeling

In [1]:
import pandas as pd
import numpy as np
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

from theano.tensor import eq
from IPython.display import Image
from matplotlib import gridspec

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'
In [2]:
%load_ext watermark
%watermark -p pandas,numpy,pymc3,arviz,matplotlib,seaborn,theano
pandas 1.1.1
numpy 1.19.1
pymc3 3.8
arviz 0.9.0
matplotlib 3.3.1
seaborn 0.10.1
theano 1.0.4

10.3.2 - Hierarchical MCMC computation of relative model probability

Model (Kruschke, 2015)

In [3]:
Image('images/fig10_2.png', width=300)
Out[3]:

Model 1 - One theta variable

The coin is flipped nine times, resulting in six heads.

In [4]:
with pm.Model() as hierarchical_model:
    # Model index m: prior probability 0.5 for each of the two models
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Concentration of the beta prior on theta
    kappa = 12
    
    # Mode of the beta prior depends on the model index: 0.25 under m=0, 0.75 under m=1
    omega = pm.math.switch(eq(m, 0), .25, .75)
    
    # Beta prior expressed via mode omega and concentration kappa
    theta = pm.Beta('theta', omega*(kappa-2)+1, (1-omega)*(kappa-2)+1)
    
    # Nine coin flips: six heads, three tails
    y = pm.Bernoulli('y', theta, observed=np.r_[6*[1], 3*[0]])

pm.model_to_graphviz(hierarchical_model)
Out[4]:
[Model graph: m ~ Categorical → theta ~ Beta → y ~ Bernoulli, plate of 9 observations]
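The Beta arguments above use Kruschke's mode/concentration parameterization: a beta density with mode omega and concentration kappa > 2 has shape parameters a = omega*(kappa-2)+1 and b = (1-omega)*(kappa-2)+1. A minimal sketch of the conversion (the helper beta_shapes is ours, not part of the notebook):

def beta_shapes(omega, kappa):
    # Beta shape parameters from mode omega and concentration kappa (kappa > 2)
    return omega * (kappa - 2) + 1, (1 - omega) * (kappa - 2) + 1

beta_shapes(.25, 12), beta_shapes(.75, 12)  # ((3.5, 8.5), (8.5, 3.5))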
In [5]:
with hierarchical_model:
    trace = pm.sample(5000, cores=4)    
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta]
Sampling 4 chains, 0 divergences: 100%|██████████| 22000/22000 [00:05<00:00, 4211.51draws/s]
The acceptance probability does not match the target. It is 0.8794756253739936, but should be close to 0.8. Try to increase the number of tuning steps.
The number of effective samples is smaller than 25% for some parameters.
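The flagged effective sample sizes can be inspected directly with ArviZ (a quick check, not part of the original notebook):

az.summary(az.from_pymc3(trace, model=hierarchical_model), var_names=['m', 'theta'])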
In [6]:
pm.plot_trace(az.from_pymc3(trace, model=hierarchical_model));
In [7]:
trace_df = (pm.trace_to_dataframe(trace)
            .set_index('m'))
trace_df.head()
Out[7]:
theta
m
0 0.488826
1 0.624282
1 0.583721
1 0.669008
1 0.693736

Figure 10.4 (lower frame)

Note that the models are indexed starting at 0 rather than 1 as in Kruschke (2015), so the posterior mean of the parameter m is shifted down by 1 relative to the book.
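Because the beta priors are conjugate, the result can be cross-checked analytically: for each model the marginal likelihood of z = 6 heads in N = 9 flips is p(D|m) = B(z + a_m, N - z + b_m) / B(a_m, b_m). A sketch of this check (scipy is an extra import, not used elsewhere in the notebook):

from scipy.special import betaln

z, N = 6, 9

def log_marginal(omega, kappa=12):
    # Log marginal likelihood of the beta-Bernoulli model with mode omega
    a, b = omega * (kappa - 2) + 1, (1 - omega) * (kappa - 2) + 1
    return betaln(z + a, N - z + b) - betaln(a, b)

# Posterior P(m=1 | D) under the 50/50 prior on the model index;
# this should approximately match the posterior mean of m above
lm0, lm1 = log_marginal(.25), log_marginal(.75)
p_m1 = np.exp(lm1) / (np.exp(lm0) + np.exp(lm1))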

In [8]:
fig = plt.figure(figsize=(10,5))

font_d = {'size':16}

# Define gridspec
gs = gridspec.GridSpec(2, 4)
ax1 = plt.subplot(gs[0:,1])
ax2 = plt.subplot(gs[0,2:])
ax3 = plt.subplot(gs[1,2:])

# Posterior of the model index m
pm.plot_posterior(np.asarray(trace_df.index), ax=ax1, color=color, hdi_prob=0.95)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

# Posterior of theta conditional on m=0 and m=1
for model, ax in zip((0,1), (ax2, ax3)):
    pm.plot_posterior(trace_df.loc[model].values.ravel(), point_estimate='mode',
                      ax=ax, color=color, kind='hist', hdi_prob=0.95)
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)    
    ax.set(xlim=(0,1), xlabel=r'$\theta$')

fig.suptitle('Posterior distribution', size=16, y=1.05)
    
fig.tight_layout(w_pad=2);

Model 2 - Two theta variables without pseudo-priors

The coin is flipped nine times, resulting in six heads.

In [9]:
with pm.Model() as hierarchical_model2:
    # Model index m: prior probability 0.5 for each of the two models
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta under model 0: beta prior with mode 0.25, concentration 12
    omega_0 = .25
    kappa_0 = 12
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta under model 1: beta prior with mode 0.75, concentration 12
    omega_1 = .75
    kappa_1 = 12
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    # The model index selects which theta generates the data;
    # the unselected theta is still sampled, but only from its prior
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    # Nine coin flips: six heads, three tails
    y2 = pm.Bernoulli('y2', theta, observed=np.r_[6*[1], 3*[0]])

pm.model_to_graphviz(hierarchical_model2)
Out[9]:
[Model graph: m ~ Categorical, theta_0 ~ Beta, theta_1 ~ Beta all feed y2 ~ Bernoulli, plate of 9 observations]
In [10]:
with hierarchical_model2:
    trace2 = pm.sample(5000, cores=4)
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains, 0 divergences: 100%|██████████| 22000/22000 [00:06<00:00, 3462.01draws/s]
In [11]:
pm.traceplot(az.from_pymc3(trace2, model=hierarchical_model2));
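The trace of m above mixes poorly: while m points at one model, the other theta is sampled from its diffuse prior, so proposals to jump to the other model are rarely accepted. A rough diagnostic is the per-chain rate at which m switches value (a sketch):

m_chains = trace2.get_values('m', combine=False)
[np.mean(np.diff(chain) != 0) for chain in m_chains]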

Model 3 - Two theta variables with pseudo-priors equal to the true priors

The coin is flipped 30 times, resulting in 17 heads.

In [12]:
with pm.Model() as hierarchical_model3:
    # Model index m: prior probability 0.5 for each of the two models
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta0: the true prior is used when m=0, the pseudo-prior otherwise.
    # Here the pseudo-prior equals the true prior, so this serves as a baseline.
    kappa_0_true_p = 20
    kappa_0_pseudo_p = 20
    kappa_0 = pm.math.switch(eq(m, 0), kappa_0_true_p, kappa_0_pseudo_p)
    omega_0_true_p = .10
    omega_0_pseudo_p = .10
    omega_0 = pm.math.switch(eq(m, 0), omega_0_true_p, omega_0_pseudo_p)
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta1: true prior when m=1, pseudo-prior (here identical) otherwise
    kappa_1_true_p = 20
    kappa_1_pseudo_p = 20 
    kappa_1 = pm.math.switch(eq(m, 1), kappa_1_true_p, kappa_1_pseudo_p)
    omega_1_true_p = .90
    omega_1_pseudo_p = .90
    omega_1 = pm.math.switch(eq(m, 1), omega_1_true_p, omega_1_pseudo_p)
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    # The model index selects which theta generates the data
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    # 30 coin flips: 17 heads, 13 tails
    y3 = pm.Bernoulli('y3', theta, observed=np.r_[17*[1], 13*[0]])

pm.model_to_graphviz(hierarchical_model3)
Out[12]:
[Model graph: m ~ Categorical → theta_0 ~ Beta, theta_1 ~ Beta; m, theta_0, theta_1 → y3 ~ Bernoulli, plate of 30 observations]
In [13]:
with hierarchical_model3:
    trace3 = pm.sample(5000, cores=4)
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains, 0 divergences: 100%|██████████| 22000/22000 [00:06<00:00, 3333.93draws/s]
The acceptance probability does not match the target. It is 0.8901158140779728, but should be close to 0.8. Try to increase the number of tuning steps.
The number of effective samples is smaller than 10% for some parameters.
In [14]:
pm.traceplot(az.from_pymc3(trace=trace3, model=hierarchical_model3));
In [15]:
trace3_df = (pm.trace_to_dataframe(trace3)
             .set_index('m')[['theta_0', 'theta_1']])
trace3_df.head()
Out[15]:
theta_0 theta_1
m
1 0.153181 0.583654
1 0.030741 0.781029
1 0.096638 0.816370
1 0.134458 0.681352
1 0.101300 0.595565
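The posterior probability of each model index is simply the relative frequency of m in the chain (a sketch):

m3 = trace3.get_values('m')
p_m1 = (m3 == 1).mean()  # posterior P(m = 1 | D)
p_m1 / (1 - p_m1)        # posterior odds; equals the Bayes factor under the 50/50 prior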

Figure 10.5 (lower part)

In [16]:
fig = plt.figure(figsize=(10,8))

# Define gridspec
gs = gridspec.GridSpec(3,2)
ax1 = plt.subplot(gs[0,:])
ax2 = plt.subplot(gs[1,0])
ax3 = plt.subplot(gs[1,1])
ax4 = plt.subplot(gs[2,0])
ax5 = plt.subplot(gs[2,1])

pm.plot_posterior(np.asarray(trace3_df.index), ax=ax1, color=color)
ax1.set_xlabel('m')
ax1.set_title('Model Index')

for model, theta, ax in zip((0,0,1,1), (0,1,0,1), (ax2, ax3, ax4, ax5)):
    pm.plot_posterior(trace3_df.loc[model, 'theta_{}'.format(theta)].values,
                      ax=ax, color=color, hdi_prob=0.95, kind='hist')
    ax.set_title(r'$m = {}$'.format(model), fontdict=font_d)
    ax.set_xlim(0,1)
    ax.set_xlabel(r'$\theta_{}$'.format(theta), fontdict=font_d)
    
fig.tight_layout();

Model 4 - Two theta variables with pseudo-priors that mimic the posteriors

The coin is flipped 30 times, resulting in 17 heads.

In [17]:
with pm.Model() as hierarchical_model4:
    # Model index m: prior probability 0.5 for each of the two models
    m = pm.Categorical('m', np.asarray([.5, .5]))
    
    # Theta0: true prior when m=0; otherwise a pseudo-prior chosen to
    # resemble theta_0's posterior (narrower and shifted toward the data)
    kappa_0_true_p = 20
    kappa_0_pseudo_p = 50
    kappa_0 = pm.math.switch(eq(m, 0), kappa_0_true_p, kappa_0_pseudo_p)
    omega_0_true_p = .10
    omega_0_pseudo_p = .40
    omega_0 = pm.math.switch(eq(m, 0), omega_0_true_p, omega_0_pseudo_p)
    theta_0 = pm.Beta('theta_0', omega_0*(kappa_0-2)+1, (1-omega_0)*(kappa_0-2)+1)
    
    # Theta1: true prior when m=1; otherwise a pseudo-prior mimicking its posterior
    kappa_1_true_p = 20
    kappa_1_pseudo_p = 50 
    kappa_1 = pm.math.switch(eq(m, 1), kappa_1_true_p, kappa_1_pseudo_p)
    omega_1_true_p = .90
    omega_1_pseudo_p = .70
    omega_1 = pm.math.switch(eq(m, 1), omega_1_true_p, omega_1_pseudo_p)
    theta_1 = pm.Beta('theta_1', omega_1*(kappa_1-2)+1, (1-omega_1)*(kappa_1-2)+1)
    
    # The model index selects which theta generates the data
    theta = pm.math.switch(eq(m, 0), theta_0, theta_1)
    
    # 30 coin flips: 17 heads, 13 tails
    y4 = pm.Bernoulli('y4', theta, observed=np.r_[17*[1], 13*[0]])

pm.model_to_graphviz(hierarchical_model4)
Out[17]:
[Model graph: m ~ Categorical → theta_0 ~ Beta, theta_1 ~ Beta; m, theta_0, theta_1 → y4 ~ Bernoulli, plate of 30 observations]
In [18]:
with hierarchical_model4:
    trace4 = pm.sample(5000, cores=4)
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>BinaryGibbsMetropolis: [m]
>NUTS: [theta_1, theta_0]
Sampling 4 chains, 0 divergences: 100%|██████████| 22000/22000 [00:06<00:00, 3319.90draws/s]
In [19]:
pm.traceplot(az.from_pymc3(trace=trace4, model=hierarchical_model4));
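Models 3 and 4 use the same data, so their m-chains can be compared directly; with pseudo-priors that mimic the posteriors, m should switch models more readily. Reusing the switch-rate diagnostic from above (a sketch):

for label, tr in [('pseudo = true prior', trace3), ('pseudo mimics posterior', trace4)]:
    chains = tr.get_values('m', combine=False)
    print(label, [round(float(np.mean(np.diff(c) != 0)), 3) for c in chains])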