Chapter 17 - Metric Predicted Variable with One Metric Predictor

In [1]:
import pandas as pd
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from matplotlib import gridspec
from IPython.display import Image
import theano.tensor as tt

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'

f_dict = {'size':16}
In [2]:
%load_ext watermark
%watermark -p pandas,numpy,pymc3,theano,matplotlib,seaborn,scipy
pandas 0.23.4
numpy 1.15.1
pymc3 3.5
theano 1.0.2
matplotlib 2.2.3
seaborn 0.9.0
scipy 1.1.0
In [3]:
def plot_grid(trace, data, sd_h, sd_w, mean_h, mean_w):
    """This function creates plots like figures 17.3 and 17.4 in the book."""
    
    plt.figure(figsize=(13,13))
    
    # Define gridspec
    gs = gridspec.GridSpec(4, 6)
    ax1 = plt.subplot(gs[:2,1:5])
    ax2 = plt.subplot(gs[2,:2])
    ax3 = plt.subplot(gs[2,2:4])
    ax4 = plt.subplot(gs[2,4:6])
    ax5 = plt.subplot(gs[3,:2])
    ax6 = plt.subplot(gs[3,2:4])                     
    ax7 = plt.subplot(gs[3,4:6])
        
    # Scatter plot of the observed data
    ax1.scatter(data.height, data.weight, s=40, linewidths=1, facecolor='none', edgecolor='k', zorder=10)
    ax1.set_xlabel('height', fontdict=f_dict)
    ax1.set_ylabel('weight', fontdict=f_dict)
    ax1.set(xlim=(0,80), ylim=(-350,250))

    # Convert parameters to original scale
    beta0 = trace['beta0']*sd_w+mean_w-trace['beta1']*mean_h*sd_w/sd_h
    beta1 = trace['beta1']*(sd_w/sd_h)
    sigma = trace['sigma']*sd_w
    B = pd.DataFrame(np.c_[beta0, beta1], columns=['beta0', 'beta1'])
       
    # Credible regression lines from posterior
    hpd_interval = np.round(pm.hpd(B.values, alpha=0.05), decimals=3)
    B_hpd = B[B.beta0.between(*hpd_interval[0,:]) & B.beta1.between(*hpd_interval[1,:])] 
    xrange = np.arange(0, data.height.max()*1.05)
    for i in np.random.randint(0, len(B_hpd), 30):
        ax1.plot(xrange, B_hpd.iloc[i,0]+B_hpd.iloc[i,1]*xrange, c=color, alpha=.6, zorder=0)    
        
    # Intercept
    pm.plot_posterior(beta0, point_estimate='mode', ax=ax2, color=color)
    ax2.set_xlabel(r'$\beta_0$', fontdict=f_dict)
    ax2.set_title('Intercept', fontdict={'weight':'bold'})

    # Slope
    pm.plot_posterior(beta1, point_estimate='mode', ax=ax3, color=color, ref_val=0)
    ax3.set_xlabel(r'$\beta_1$', fontdict=f_dict)
    ax3.set_title('Slope', fontdict={'weight':'bold'})
    
    # Scatter plot beta1, beta0
    ax4.scatter(beta1, beta0, edgecolor=color, facecolor='none', alpha=.6)
    ax4.set_xlabel(r'$\beta_1$', fontdict=f_dict)
    ax4.set_ylabel(r'$\beta_0$', fontdict=f_dict)
    
    # Scale
    pm.plot_posterior(sigma, point_estimate='mode', ax=ax5, color=color)
    ax5.set_xlabel(r'$\sigma$', fontdict=f_dict)
    ax5.set_title('Scale', fontdict={'weight':'bold'})

    # Normality
    pm.plot_posterior(np.log10(trace['nu']), point_estimate='mode', ax=ax6, color=color)
    ax6.set_xlabel(r'log10($\nu$)', fontdict=f_dict)
    ax6.set_title('Normality', fontdict={'weight':'bold'})
    
    # Scatter plot normality, sigma
    ax7.scatter(np.log10(trace['nu']), sigma,
                edgecolor=color, facecolor='none', alpha=.6)
    ax7.set_xlabel(r'log10($\nu$)', fontdict=f_dict)
    ax7.set_ylabel(r'$\sigma$', fontdict=f_dict)
    
    plt.tight_layout()
    
    return plt.gcf()
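The back-transformation above follows directly from the standardization: writing the sampled standardized coefficients as $\beta_0^z, \beta_1^z$ and the standardized variables as $z_x = (x - m_x)/s_x$ and $z_y = (y - m_y)/s_y$, substituting into $z_y = \beta_0^z + \beta_1^z z_x$ and solving for $y$ gives

$$\beta_1 = \beta_1^z \frac{s_y}{s_x}, \qquad \beta_0 = \beta_0^z s_y + m_y - \beta_1^z \frac{m_x s_y}{s_x}, \qquad \sigma = \sigma^z s_y,$$

which is exactly what the code computes, with $x$ = height and $y$ = weight.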
In [4]:
def plot_cred_lines(dist, x, sd_x, sd_y, mean_x, mean_y, ax):
    """This function plots credibility lines."""
    # Convert parameters to original scale
    beta0 = dist[:,0]*sd_y+mean_y-dist[:,1]*mean_x*sd_y/sd_x
    beta1 = dist[:,1]*(sd_y/sd_x)
    B = pd.DataFrame(np.c_[beta0, beta1], columns=['beta0', 'beta1'])

    # Credible regression lines from posterior
    hpd_interval = pm.hpd(B.values, alpha=0.05)
    B_hpd = B[B.beta0.between(*hpd_interval[0,:]) & B.beta1.between(*hpd_interval[1,:])] 
    xrange = np.arange(x.min()*.95, x.max()*1.05)
    for i in np.random.randint(0, len(B_hpd), 30):
        ax.plot(xrange, B_hpd.iloc[i,0]+B_hpd.iloc[i,1]*xrange, c=color, alpha=.6, zorder=0)    
In [5]:
def plot_quad_credlines(dist, x, sd_x, sd_y, mean_x, mean_y, ax):
    """This function plots quadratic credibility lines."""
    # Convert parameters to original scale
    beta0 = dist[:,0]*sd_y+mean_y-dist[:,1]*mean_x*sd_y/sd_x + dist[:,2]*mean_x**2*sd_y/sd_x**2
    beta1 = dist[:,1]*sd_y/sd_x - 2*dist[:,2]*mean_x*sd_y/sd_x**2
    beta2 = dist[:,2]*sd_y/sd_x**2
    B = pd.DataFrame(np.c_[beta0, beta1, beta2], columns=['beta0', 'beta1', 'beta2'])

    # Credible regression lines from posterior
    hpd_interval = pm.hpd(B.values, alpha=0.05)
    B_hpd = B[B.beta0.between(*hpd_interval[0,:]) & B.beta1.between(*hpd_interval[1,:]) & B.beta2.between(*hpd_interval[2,:])] 
    xrange = np.arange(x.min()-1, x.max()+2)
    for i in np.random.randint(0, len(B_hpd), 30):
        ax.plot(xrange, B_hpd.iloc[i,0]+B_hpd.iloc[i,1]*xrange+B_hpd.iloc[i,2]*xrange**2, c=color, alpha=.6, zorder=0)    
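The same substitution handles the quadratic trend: expanding $z_y = \beta_0^z + \beta_1^z z_x + \beta_2^z z_x^2$ in terms of $x$ yields

$$\beta_2 = \beta_2^z \frac{s_y}{s_x^2}, \qquad \beta_1 = \beta_1^z \frac{s_y}{s_x} - 2\beta_2^z \frac{m_x s_y}{s_x^2}, \qquad \beta_0 = \beta_0^z s_y + m_y - \beta_1^z \frac{m_x s_y}{s_x} + \beta_2^z \frac{m_x^2 s_y}{s_x^2}.$$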

17.2 - Robust Linear Regression

Model (Kruschke, 2015)

In [6]:
Image('images/fig17_2.png', width=400)
Out[6]:

N = 30

In [7]:
df_n30 = pd.read_csv('data/HtWtData30.csv')
df_n30.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
male      30 non-null int64
height    30 non-null float64
weight    30 non-null float64
dtypes: float64(2), int64(1)
memory usage: 800.0 bytes
In [8]:
df_n30.head()
Out[8]:
male height weight
0 0 64.0 136.4
1 0 62.3 215.1
2 1 67.9 173.6
3 0 64.2 117.3
4 0 64.8 123.3
In [9]:
# Standardize the data
sd_h = df_n30.height.std()
mean_h = df_n30.height.mean()
zheight = (df_n30.height - mean_h)/sd_h

sd_w = df_n30.weight.std()
mean_w = df_n30.weight.mean()
zy = (df_n30.weight - mean_w)/sd_w

Model

In [10]:
with pm.Model() as model:
    
    # Vague normal priors on the standardized intercept and slope
    beta0 = pm.Normal('beta0', mu=0, tau=1/10**2)
    beta1 = pm.Normal('beta1', mu=0, tau=1/10**2)
    mu = beta0 + beta1*zheight.ravel()
    
    # Broad uniform prior on the scale; mean-29 exponential prior on the
    # Student-t normality parameter, as in Kruschke (2015)
    sigma = pm.Uniform('sigma', 10**-3, 10**3)
    nu = pm.Exponential('nu', 1/29.)
    
    # Heavy-tailed likelihood makes the regression robust to outliers
    likelihood = pm.StudentT('likelihood', nu, mu=mu, sd=sigma, observed=zy.ravel())

pm.model_to_graphviz(model)
Out[10]:
[Model graph: beta0, beta1 ~ Normal, sigma ~ Uniform, and nu ~ Exponential all feed likelihood ~ StudentT, inside a plate of 30 observations]
In [11]:
with model:
    trace = pm.sample(3000, cores=4, nuts_kwargs={'target_accept': 0.95})
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, beta1, beta0]
Sampling 4 chains: 100%|██████████| 14000/14000 [00:05<00:00, 2419.29draws/s]
In [12]:
pm.traceplot(trace);
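Before the full grid of plots, it can help to summarize the slope on the original scale (pounds per inch). A minimal check, reusing the back-transformation from plot_grid (the name beta1_orig is ours):

# Back-transform the standardized slope to lb per inch and summarize it
beta1_orig = trace['beta1'] * (sd_w / sd_h)
print('slope mean: {:.2f} lb/inch'.format(beta1_orig.mean()))
print('95% HPD: {}'.format(pm.hpd(beta1_orig, alpha=0.05)))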

Figure 17.3

In [13]:
plot_grid(trace, df_n30, sd_h, sd_w, mean_h, mean_w);

N = 300

In [14]:
df_n300 = pd.read_csv('data/HtWtData300.csv')
df_n300.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
male      300 non-null int64
height    300 non-null float64
weight    300 non-null float64
dtypes: float64(2), int64(1)
memory usage: 7.1 KB
In [15]:
# Standardize the data
sd_h2 = df_n300.height.std()
mean_h2 = df_n300.height.mean()
zheight2 = (df_n300.height - mean_h2)/sd_h2

sd_w2 = df_n300.weight.std()
mean_w2 = df_n300.weight.mean()
zy2 = (df_n300.weight - mean_w2)/sd_w2

Model

In [16]:
with pm.Model() as model2:
    
    beta0 = pm.Normal('beta0', mu=0, tau=1/10**2)
    beta1 = pm.Normal('beta1', mu=0, tau=1/10**2)
    mu = beta0 + beta1*zheight2.ravel()
    
    sigma = pm.Uniform('sigma', 10**-3, 10**3)
    nu = pm.Exponential('nu', 1/29.)
    
    likelihood = pm.StudentT('likelihood', nu, mu=mu, sd=sigma, observed=zy2.ravel())  

pm.model_to_graphviz(model2)
Out[16]:
[Model graph: beta0, beta1 ~ Normal, sigma ~ Uniform, and nu ~ Exponential all feed likelihood ~ StudentT, inside a plate of 300 observations]
In [17]:
with model2:
    trace2 = pm.sample(3000, cores=4, nuts_kwargs={'target_accept': 0.95})
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, beta1, beta0]
Sampling 4 chains: 100%|██████████| 14000/14000 [00:07<00:00, 1975.42draws/s]
In [18]:
pm.traceplot(trace2);

Figure 17.4

In [19]:
grid = plot_grid(trace2, df_n300, sd_h2, sd_w2, mean_h2, mean_w2)
grid.axes[0].set_xlim(50,80)
grid.axes[0].set_ylim(0,400);

17.3 - Hierarchical Regression on Individuals within Groups

Model (Kruschke, 2015)

In [20]:
Image('images/fig17_6.png', width=500)
Out[20]:
In [21]:
df_HRegr = pd.read_csv('data/HierLinRegressData.csv')
df_HRegr.Subj = df_HRegr.Subj.astype('category')
df_HRegr.Subj = df_HRegr.Subj.cat.as_ordered()
df_HRegr.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 3 columns):
Subj    132 non-null category
X       132 non-null float64
Y       132 non-null float64
dtypes: category(1), float64(2)
memory usage: 3.1 KB
In [22]:
df_HRegr.head()
Out[22]:
Subj X Y
0 1 60.2 145.6
1 1 61.5 157.3
2 1 61.7 165.6
3 1 62.3 158.8
4 1 67.6 196.1
In [23]:
subj_idx = df_HRegr.Subj.cat.codes.values
subj_codes = df_HRegr.Subj.cat.categories
n_subj = len(subj_codes)

print('Number of groups: {}'.format(n_subj))
Number of groups: 25
In [24]:
# Standardize the data
sd_x3 = df_HRegr.X.std()
mean_x3 = df_HRegr.X.mean()
zx3 = (df_HRegr.X - mean_x3)/sd_x3

sd_y3 = df_HRegr.Y.std()
mean_y3 = df_HRegr.Y.mean()
zy3 = (df_HRegr.Y - mean_y3)/sd_y3

Model

A non-centered reparameterization of hierarchical models generally results in much more efficient and faster sampling.
See http://twiecki.github.io/blog/2017/02/08/bayesian-hierchical-non-centered/ and
http://pymc3.readthedocs.io/en/latest/notebooks/Diagnosing_biased_Inference_with_Divergences.html
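The trick rests on a simple identity: if $\epsilon \sim \mathrm{Normal}(0, 1)$, then $\beta + \sigma\epsilon \sim \mathrm{Normal}(\beta, \sigma)$. Sampling the unit-normal offset $\epsilon$ instead of the coefficient itself decouples the group-level scale from the per-subject coefficients, which removes the funnel-shaped posterior geometry that produces divergences in the centered form.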

In [25]:
with pm.Model() as model3:
    
    beta0 = pm.Normal('beta0', mu=0, tau=1/10**2)
    beta1 = pm.Normal('beta1', mu=0, tau=1/10**2)
    sigma0 = pm.Uniform('sigma0', 10**-3, 10**3)
    sigma1 = pm.Uniform('sigma1', 10**-3, 10**3)
    
    # The below parameterization resulted in a lot of divergences.
    #beta0_s = pm.Normal('beta0_s', mu=beta0, sd=sigma0, shape=n_subj)
    #beta1_s = pm.Normal('beta1_s', mu=beta1, sd=sigma1, shape=n_subj)
    
    beta0_s_offset = pm.Normal('beta0_s_offset', mu=0, sd=1, shape=n_subj)
    beta0_s = pm.Deterministic('beta0_s', beta0 + beta0_s_offset * sigma0)
    
    beta1_s_offset = pm.Normal('beta1_s_offset', mu=0, sd=1, shape=n_subj)
    beta1_s = pm.Deterministic('beta1_s', beta1 + beta1_s_offset * sigma1)
        
    # Select each observation's own intercept and slope via the subject index
    mu = beta0_s[subj_idx] + beta1_s[subj_idx] * zx3
    
    sigma = pm.Uniform('sigma', 10**-3, 10**3)
    nu = pm.Exponential('nu', 1/29.)
    
    likelihood = pm.StudentT('likelihood', nu, mu=mu, sd=sigma, observed=zy3)  

pm.model_to_graphviz(model3)
Out[25]:
[Model graph: beta0, beta1 ~ Normal, sigma0, sigma1 ~ Uniform, and beta0_s_offset, beta1_s_offset ~ Normal feed beta0_s, beta1_s ~ Deterministic in a plate of 25 subjects; together with sigma ~ Uniform and nu ~ Exponential these feed likelihood ~ StudentT in a plate of 132 observations]
In [26]:
with model3:
    trace3 = pm.sample(3000, cores=4, nuts_kwargs={'target_accept': 0.95})
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, beta1_s_offset, beta0_s_offset, sigma1, sigma0, beta1, beta0]
Sampling 4 chains: 100%|██████████| 14000/14000 [00:26<00:00, 193.48draws/s]
The number of effective samples is smaller than 25% for some parameters.
In [27]:
pm.traceplot(trace3, ['beta0', 'beta1', 'sigma0', 'sigma1', 'sigma', 'nu']);
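Given the effective-sample-size warning above, a quick follow-up (a sketch, assuming pm.effective_n as available in PyMC3 3.5) lists which parameters mix slowly:

# Parameters whose minimum effective sample size falls below 3000,
# i.e. 25% of the 12000 post-warmup draws (4 chains x 3000 draws)
eff_n = pm.effective_n(trace3)
print({k: v for k, v in eff_n.items() if np.min(v) < 3000})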