import pandas as pd
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from IPython.display import Image
from matplotlib import gridspec
%matplotlib inline
plt.style.use('seaborn-white')
color = '#87ceeb'
%load_ext watermark
%watermark -p pandas,numpy,pymc3,matplotlib,seaborn
df = pd.read_csv('data/TherapeuticTouchData.csv', dtype={'s':'category'})
df.info()
df.head()
df_proportions = df.groupby('s')['y'].apply(lambda x: x.sum()/len(x))
ax = sns.distplot(df_proportions, bins=8, kde=False, color='gray')
ax.set(xlabel='Proportion Correct', ylabel='# Practitioners')
sns.despine(ax=ax);
Image('images/fig9_7.png', width=200)
practitioner_idx = df.s.cat.codes.values
practitioner_codes = df.s.cat.categories
n_practitioners = practitioner_codes.size
with pm.Model() as hierarchical_model:
    # Group-level mode (omega) and concentration (kappa) of the practitioner thetas
    omega = pm.Beta('omega', 1., 1.)
    kappa_minus2 = pm.Gamma('kappa_minus2', 0.01, 0.01)
    kappa = pm.Deterministic('kappa', kappa_minus2 + 2)

    # Per-practitioner probability of a correct response
    theta = pm.Beta('theta', alpha=omega*(kappa-2)+1, beta=(1-omega)*(kappa-2)+1, shape=n_practitioners)

    # Bernoulli likelihood for the individual trials
    y = pm.Bernoulli('y', theta[practitioner_idx], observed=df.y)
pm.model_to_graphviz(hierarchical_model)
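The theta prior uses Kruschke's mode/concentration parameterization of the Beta distribution: with a = omega*(kappa-2)+1 and b = (1-omega)*(kappa-2)+1, omega is the mode and kappa the concentration. A quick numeric sanity check (illustrative only, not part of the original notebook; the example values below are arbitrary):
# Illustrative check: the Beta mode (a-1)/(a+b-2) recovers omega under the
# mode/concentration parameterization used above (example values are arbitrary).
omega_ex, kappa_ex = 0.4, 20.0
a_ex = omega_ex * (kappa_ex - 2) + 1
b_ex = (1 - omega_ex) * (kappa_ex - 2) + 1
print((a_ex - 1) / (a_ex + b_ex - 2))  # -> 0.4, i.e. omega_ex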
with hierarchical_model:
    trace = pm.sample(5000, cores=4, nuts_kwargs={'target_accept': 0.95})
pm.traceplot(trace, ['omega','kappa', 'theta']);
pm.summary(trace)
# Note that theta is indexed starting at 0 here, whereas Kruschke (2015) starts at 1.
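A quick follow-up check (not one of the original cells): the posterior probability that the group-level mode omega exceeds chance performance (0.5).
# Posterior probability that the group-level mode exceeds chance (0.5);
# illustrative follow-up, not part of the original analysis cells.
print((trace['omega'] > 0.5).mean())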
plt.figure(figsize=(10,12))
# Define gridspec
gs = gridspec.GridSpec(4, 6)
ax1 = plt.subplot(gs[0,:3])
ax2 = plt.subplot(gs[0,3:])
ax3 = plt.subplot(gs[1,:2])
ax4 = plt.subplot(gs[1,2:4])
ax5 = plt.subplot(gs[1,4:6])
ax6 = plt.subplot(gs[2,:2])
ax7 = plt.subplot(gs[2,2:4])
ax8 = plt.subplot(gs[2,4:6])
ax9 = plt.subplot(gs[3,:2])
ax10 = plt.subplot(gs[3,2:4])
ax11 = plt.subplot(gs[3,4:6])
# thetas and theta pairs to plot
thetas = (0, 13, 27)
theta_pairs = ((0,13),(0,27),(13,27))
font_d = {'size':14}
# kappa & omega posterior plots
for var, ax in zip(['kappa', 'omega'], [ax1, ax2]):
    pm.plot_posterior(trace[var], point_estimate='mode', ax=ax, color=color, round_to=2)
    ax.set_xlabel('$\\{}$'.format(var), fontdict={'size':20, 'weight':'bold'})
ax1.set(xlim=(0,500))
# theta posterior plots
for var, ax in zip(thetas, [ax3, ax7, ax11]):
    pm.plot_posterior(trace['theta'][:,var], point_estimate='mode', ax=ax, color=color)
    ax.set_xlabel('theta[{}]'.format(var), fontdict=font_d)
# theta scatter plots
for var, ax in zip(theta_pairs, [ax6, ax9, ax10]):
    ax.scatter(trace['theta'][::10, var[0]], trace['theta'][::10, var[1]], alpha=0.75, color=color, facecolor='none')
    ax.plot([0, 1], [0, 1], ':k', transform=ax.transAxes, alpha=0.5)
    ax.set_xlabel('theta[{}]'.format(var[0]), fontdict=font_d)
    ax.set_ylabel('theta[{}]'.format(var[1]), fontdict=font_d)
    ax.set(xlim=(0,1), ylim=(0,1), aspect='equal')
# theta posterior differences plots
for var, ax in zip(theta_pairs, [ax4, ax5, ax8]):
    pm.plot_posterior(trace['theta'][:,var[0]] - trace['theta'][:,var[1]], point_estimate='mode', ax=ax, color=color)
    ax.set_xlabel('theta[{}] - theta[{}]'.format(*var), fontdict=font_d)
plt.tight_layout()
Let's create a model that estimates theta for each practitioner separately, without the influence of a higher-level distribution. We can then compare these unpooled theta estimates with those from the hierarchical model above.
with pm.Model() as unpooled_model:
    theta = pm.Beta('theta', 1, 1, shape=n_practitioners)
    y = pm.Bernoulli('y', theta[practitioner_idx], observed=df.y)
pm.model_to_graphviz(unpooled_model)
with unpooled_model:
    unpooled_trace = pm.sample(5000, cores=4)
Here we concatenate the theta estimates from both traces into a single DataFrame, then reshape the data into a format that works with Seaborn's pointplot.
df_shrinkage = (pd.concat([pm.summary(unpooled_trace).iloc[:,0],
                           pm.summary(trace).iloc[3:,0]],
                          axis=1)
                  .reset_index())
df_shrinkage.columns = ['theta', 'unpooled', 'hierarchical']
df_shrinkage = pd.melt(df_shrinkage, 'theta', ['unpooled', 'hierarchical'], var_name='Model')
df_shrinkage.head()
The plot below shows that the per-practitioner theta estimates of the hierarchical model are pulled towards the group mean.
plt.figure(figsize=(10,9))
plt.scatter(1, pm.summary(trace).iloc[0,0], s=100, c='r', marker='x', zorder=999, label='Group mean')
sns.pointplot(x='Model', y='value', hue='theta', data=df_shrinkage);
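To quantify the shrinkage visible in the plot, we can compare the spread of the per-practitioner estimates under the two models; a minimal sketch using the df_shrinkage frame built above:
# Shrinkage shows up as a smaller standard deviation of the hierarchical estimates.
df_shrinkage.groupby('Model')['value'].agg(['mean', 'std'])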
df2 = pd.read_csv('data/BattingAverage.csv', usecols=[0,1,2,3], dtype={'PriPos':'category'})
df2.info()
The DataFrame contains records for 948 players in the 2012 regular season of Major League Baseball.
df2['BatAv'] = df2.Hits.divide(df2.AtBats)
df2.head(10)
# Batting average by primary field positions calculated from the data
df2.groupby('PriPos')[['Hits','AtBats']].sum().pipe(lambda x: x.Hits/x.AtBats)
Image('images/fig9_13.png', width=300)
pripos_idx = df2.PriPos.cat.codes.values
pripos_codes = df2.PriPos.cat.categories
n_pripos = pripos_codes.size
# df2 contains one entry per player
n_players = df2.index.size
with pm.Model() as hierarchical_model2:
    # Hyperparameters
    omega = pm.Beta('omega', 1, 1)
    kappa_minus2 = pm.Gamma('kappa_minus2', 0.01, 0.01)
    kappa = pm.Deterministic('kappa', kappa_minus2 + 2)

    # Parameters for categories (primary field positions)
    omega_c = pm.Beta('omega_c',
                      omega*(kappa-2)+1, (1-omega)*(kappa-2)+1,
                      shape=n_pripos)
    kappa_c_minus2 = pm.Gamma('kappa_c_minus2',
                              0.01, 0.01,
                              shape=n_pripos)
    kappa_c = pm.Deterministic('kappa_c', kappa_c_minus2 + 2)

    # Parameters for individual players
    theta = pm.Beta('theta',
                    omega_c[pripos_idx]*(kappa_c[pripos_idx]-2)+1,
                    (1-omega_c[pripos_idx])*(kappa_c[pripos_idx]-2)+1,
                    shape=n_players)

    # Binomial likelihood: hits out of at-bats for each player
    y2 = pm.Binomial('y2', n=df2.AtBats.values, p=theta, observed=df2.Hits)
pm.model_to_graphviz(hierarchical_model2)
with hierarchical_model2:
    trace2 = pm.sample(3000, cores=4)
pm.traceplot(trace2, ['omega', 'kappa', 'omega_c', 'kappa_c']);
pm.plot_posterior(trace2['omega'], point_estimate='mode', color=color)
plt.title('Overall', fontdict={'fontsize':16, 'fontweight':'bold'})
plt.xlabel('omega', fontdict={'fontsize':14});
fig, axes = plt.subplots(3,3, figsize=(14,8))
for i, ax in enumerate(axes.T.flatten()):
    pm.plot_posterior(trace2['omega_c'][:,i], ax=ax, point_estimate='mode', color=color)
    ax.set_title(pripos_codes[i], fontdict={'fontsize':16, 'fontweight':'bold'})
    ax.set_xlabel('omega_c__{}'.format(i), fontdict={'fontsize':14})
    ax.set_xlim(0.10, 0.30)
plt.tight_layout(h_pad=3)
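As a follow-up, the same contrast technique used above for practitioner theta pairs can be applied to the position-level modes. The sketch below compares pitchers and catchers; the 'Pitcher' and 'Catcher' labels are assumed to match the PriPos categories in the CSV.
# Posterior difference between two position-level modes (illustrative sketch;
# assumes 'Pitcher' and 'Catcher' appear among the PriPos categories).
ia = list(pripos_codes).index('Pitcher')
ib = list(pripos_codes).index('Catcher')
pm.plot_posterior(trace2['omega_c'][:, ia] - trace2['omega_c'][:, ib],
                  point_estimate='mode', color=color)
plt.xlabel('omega_c[Pitcher] - omega_c[Catcher]', fontdict={'fontsize':14});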