#!/usr/bin/env python
# coding: utf-8

# ## Chapter 9 - Hierarchical Models

# - [9.2.4 - Example: Therapeutic touch](#9.2.4---Example:-Therapeutic-touch)
# - [Shrinkage](#Shrinkage)
# - [9.5.1 - Example: Baseball batting abilities by position (subjects within categories)](#9.5.1---Example:-Baseball-batting-abilities-by-position)

# In[1]:

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns

#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

from IPython.display import Image
from matplotlib import gridspec

plt.style.use('seaborn-white')
color = '#87ceeb'

# In[2]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-p arviz,matplotlib,numpy,pandas,pymc,seaborn')

# ### 9.2.3 - Example: Multiple Coins form a Single Mint

# In[3]:

# data from coin #1
n_heads1 = 3
n_tails1 = 12
n_flips1 = n_heads1 + n_tails1
data1 = np.repeat([1, 0], [n_heads1, n_tails1])

# coin #2
n_heads2 = 4
n_tails2 = 1
n_flips2 = n_heads2 + n_tails2
data2 = np.repeat([1, 0], [n_heads2, n_tails2])

# now pack all those outcomes into a single vector
outcomes = np.hstack((data1, data2))

# and create a vector of 'coin IDs' so that we can connect each flip
# with the corresponding coin's value of theta
coin = np.hstack((np.zeros_like(data1), np.ones_like(data2)))

# In[4]:

outcomes

# In[5]:

# build the model
with pm.Model() as oneminttwocoins_model:
    omega = pm.Beta("omega", 2.0, 2.0)

    # Kruschke uses these parameters in the chapter, but they yield very poor sampling
    # kappa_minus2 = pm.Gamma('kappa_minus2', 0.01, 0.01)
    kappa_minus2 = pm.Gamma("kappa_minus2", 0.5, 0.5)
    kappa = pm.Deterministic("kappa", kappa_minus2 + 2)

    theta = pm.Beta(
        "theta",
        alpha=omega * (kappa - 2) + 1,
        beta=(1 - omega) * (kappa - 2) + 1,
        shape=2,
    )

    y = pm.Bernoulli("y", theta[coin], observed=outcomes)

# In[6]:

pm.model_to_graphviz(oneminttwocoins_model)

# In[7]:

# sample!
with oneminttwocoins_model:
    idata_1m2c = pm.sample(5000, target_accept=0.9)

# In[8]:

az.plot_trace(idata_1m2c, var_names=["~kappa_minus2"])
plt.tight_layout();

# In[9]:

az.summary(idata_1m2c)

# In[10]:

idata_1m2c.posterior['omega'].sel(draw=0)

# Given that this is our first legitimate MCMC trace (stored as an ArviZ `InferenceData` object), let's see how we can interrogate it to extract information we might want.

# In[11]:

idata_1m2c

# In[12]:

# let's look at the first MCMC sample
idata_1m2c.posterior.sel(draw=0, chain=0)

# Note that there are **2** values of theta in this first step (one for each coin). They are stored together because we asked for an array of theta parameters. Let's calculate the mean value of the $\theta$ parameter associated with the first coin.

# In[13]:

idata_1m2c.posterior['theta'].mean(dim=['draw', 'chain'])[0]

# And for the second coin.

# In[14]:

idata_1m2c.posterior['theta'].mean(dim=['draw', 'chain'])[1]

# What if we have 'reference' values against which we wish to compare our parameter estimates? We can, among other things, ask what proportion of samples took on a value greater than (or less than) that reference value. In the case of coins, we might wish to compare $\theta$ to a value of 0.5 (i.e., to ask how likely it is that each coin is 'fair').

# In[15]:

# coin 1
(idata_1m2c.posterior['theta'] > .5).mean(dim=['draw', 'chain'])[0]

# In[16]:

# coin 2
(idata_1m2c.posterior['theta'] > .5).mean(dim=['draw', 'chain'])[1]
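# Since both coins' thetas live in one array, we can also compare the coins directly, draw by draw. As a quick sketch (this cell is not in Kruschke's text), here is the posterior probability that coin 2 has a higher bias than coin 1.

# In[ ]:

theta_post = idata_1m2c.posterior["theta"]
# proportion of posterior draws in which coin 2's theta exceeds coin 1's
(theta_post.sel(theta_dim_0=1) > theta_post.sel(theta_dim_0=0)).mean(dim=["draw", "chain"])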
# Now let's ask about our mint parameter, $\omega$. Here too we calculate the mean posterior estimate.

# In[17]:

idata_1m2c.posterior['omega'].mean(dim=['draw', 'chain'])

# We can again compare $\omega$ against our 'reference' value of 0.5. The interpretation of $\omega$ is somewhat different from the interpretation of $\theta$: asking about $p(\omega > 0.5)$ tells us whether the _mint tends to generate fair coins_.

# In[18]:

(idata_1m2c.posterior['omega'] > .5).mean(dim=['draw', 'chain'])

# We can also inspect $\kappa$, which tells us how consistent the mint is in generating coins with values of $\theta$ near $\omega$.

# In[19]:

idata_1m2c.posterior['kappa'].mean(dim=['draw', 'chain'])

# ### 9.2.4 - Example: Therapeutic touch

# In[20]:

df = pd.read_csv("data/TherapeuticTouchData.csv", dtype={"s": "category"})
df.info()

# In[21]:

df.head()

# #### Figure 9.9

# In[22]:

df_proportions = df.groupby("s")["y"].apply(lambda x: x.sum() / len(x))

# sns.displot returns a FacetGrid, so name it accordingly
g = sns.displot(df_proportions, bins=8, color="gray")
g.set(xlabel="Proportion Correct", ylabel="# Practitioners");

# #### Model (Kruschke, 2015)

# In[23]:

Image('images/fig9_7.png', width=200)

# In[24]:

# create a factor for practitioner ID
practitioner_idx, practitioner_codes = pd.factorize(df["s"])
n_practitioners = df["s"].nunique()

with pm.Model() as therapeutictouch_model:
    omega = pm.Beta("omega", 1.0, 1.0)
    kappa_minus2 = pm.Gamma("kappa_minus2", 0.05, 0.05)
    kappa = pm.Deterministic("kappa", kappa_minus2 + 2)

    theta = pm.Beta(
        "theta",
        alpha=omega * (kappa - 2) + 1,
        beta=(1 - omega) * (kappa - 2) + 1,
        shape=n_practitioners,
    )

    y = pm.Bernoulli("y", theta[practitioner_idx], observed=df["y"])

# In[25]:

pm.model_to_graphviz(therapeutictouch_model)

# In[26]:

with therapeutictouch_model:
    idata_tt = pm.sample(5000)

# In[27]:

az.plot_trace(idata_tt, var_names=['omega', 'kappa', 'theta'])
plt.tight_layout();

# In[28]:

az.summary(idata_tt)
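# The question of interest here is whether any practitioner performs better than chance. As a quick sketch (this cell is not in the original notebook), we can compute $p(\theta > 0.5)$ for all practitioners at once, just as we did for each coin above.

# In[ ]:

# proportion of posterior draws above chance performance, per practitioner
(idata_tt.posterior["theta"] > 0.5).mean(dim=["chain", "draw"])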
# #### Figure 9.10 - Marginal posterior distributions

# In[29]:

idata_tt.posterior["theta"].mean(dim=["chain", "draw"]).sel(theta_dim_0=0)

# In[30]:

plt.figure(figsize=(10, 12))

# Define gridspec
gs = gridspec.GridSpec(4, 6)
ax1 = plt.subplot(gs[0, :3])
ax2 = plt.subplot(gs[0, 3:])
ax3 = plt.subplot(gs[1, :2])
ax4 = plt.subplot(gs[1, 2:4])
ax5 = plt.subplot(gs[1, 4:6])
ax6 = plt.subplot(gs[2, :2])
ax7 = plt.subplot(gs[2, 2:4])
ax8 = plt.subplot(gs[2, 4:6])
ax9 = plt.subplot(gs[3, :2])
ax10 = plt.subplot(gs[3, 2:4])
ax11 = plt.subplot(gs[3, 4:6])

# thetas and theta pairs to plot
thetas = (0, 13, 27)
theta_pairs = ((0, 13), (0, 27), (13, 27))

font_d = {"size": 14}

# kappa & omega posterior plots
for var, ax in zip(["kappa", "omega"], [ax1, ax2]):
    az.plot_posterior(
        idata_tt.posterior[var], point_estimate="mode", ax=ax, color=color, round_to=2
    )
    ax.set_xlabel(r"$\{}$".format(var), fontdict={"size": 20, "weight": "bold"})
ax1.set(xlim=(0, 500))

# theta posterior plots
for var, ax in zip(thetas, [ax3, ax7, ax11]):
    az.plot_posterior(
        idata_tt.posterior["theta"].sel(theta_dim_0=var),
        point_estimate="mode",
        ax=ax,
        color=color,
    )
    ax.set_xlabel("theta[{}]".format(var), fontdict=font_d)

# theta scatter plots
for var, ax in zip(theta_pairs, [ax6, ax9, ax10]):
    ax.scatter(
        idata_tt.posterior["theta"].sel(theta_dim_0=var[0]),
        idata_tt.posterior["theta"].sel(theta_dim_0=var[1]),
        alpha=0.02,
        color=color,
        facecolor="none",
    )
    ax.plot([0, 1], [0, 1], ":k", transform=ax.transAxes, alpha=0.5)
    ax.set_xlabel("theta[{}]".format(var[0]), fontdict=font_d)
    ax.set_ylabel("theta[{}]".format(var[1]), fontdict=font_d)
    ax.set(xlim=(0, 1), ylim=(0, 1), aspect="equal")

# theta posterior difference plots
for var, ax in zip(theta_pairs, [ax4, ax5, ax8]):
    az.plot_posterior(
        idata_tt.posterior["theta"].sel(theta_dim_0=var[0])
        - idata_tt.posterior["theta"].sel(theta_dim_0=var[1]),
        point_estimate="mode",
        ax=ax,
        color=color,
    )
    ax.set_xlabel("theta[{}] - theta[{}]".format(*var), fontdict=font_d)

plt.tight_layout()

# ### Shrinkage

# Let's create a model with a separate theta estimate per practitioner, without the influence of a higher-level distribution. Then we can compare its theta values with those from the hierarchical model above.

# In[31]:

with pm.Model() as unpooled_model:
    theta = pm.Beta("theta", 1, 1, shape=n_practitioners)
    y = pm.Bernoulli("y", theta[practitioner_idx], observed=df["y"])

# In[32]:

pm.model_to_graphviz(unpooled_model)

# In[33]:

with unpooled_model:
    unpooled_idata = pm.sample(5000)

# Here we concatenate the theta results from both traces into a DataFrame. Next we reshape the data into a format that we can use with Seaborn's pointplot.

# In[34]:

# rows 2:-1 of the hierarchical summary are the thetas
# (omega and kappa_minus2 come first, kappa last)
df_shrinkage = pd.concat(
    [az.summary(unpooled_idata).iloc[:, 0], az.summary(idata_tt).iloc[2:-1, 0]], axis=1
).reset_index()
df_shrinkage.columns = ["theta", "unpooled", "hierarchical"]
df_shrinkage = pd.melt(
    df_shrinkage, "theta", ["unpooled", "hierarchical"], var_name="Model"
)
df_shrinkage.head()

# The plot below shows that the practitioner-level theta estimates are pulled towards the group mean of the hierarchical model.

# In[35]:

plt.figure(figsize=(10, 9))

# mark the posterior mean of omega (first row of the summary) as the group mean
plt.scatter(
    1,
    az.summary(idata_tt).iloc[0, 0],
    s=100,
    c="r",
    marker="x",
    zorder=999,
    label="Group mean",
)
sns.pointplot(x="Model", y="value", hue="theta", data=df_shrinkage)

# ### 9.5.1 - Example: Baseball batting abilities by position

# In[36]:

df2 = pd.read_csv(
    "data/BattingAverage.csv", usecols=[0, 1, 2, 3], dtype={"PriPos": "category"}
)
df2.info()

# The DataFrame contains records for 948 players in the 2012 regular season of Major League Baseball.
# - One record per player
# - 9 primary field positions

# In[37]:

df2["BatAv"] = df2.Hits.divide(df2.AtBats)
df2.head(10)

# In[38]:

# batting average by primary field position, calculated from the data
df2.groupby("PriPos")[["Hits", "AtBats"]].sum().pipe(lambda x: x.Hits / x.AtBats)

# #### Model (Kruschke, 2015)

# In[39]:

Image('images/fig9_13.png', width=300)

# In[46]:

pripos_idx, pripos_codes = pd.factorize(df2["PriPos"])
n_pripos = df2["PriPos"].nunique()
n_players = df2["Player"].nunique()

with pm.Model() as hierarchical_model2:
    # hyperparameters
    omega = pm.Beta("omega", 1, 1)
    kappa_minus2 = pm.Gamma("kappa_minus2", 0.1, 0.1)
    kappa = pm.Deterministic("kappa", kappa_minus2 + 2)

    # parameters for the categories (primary field positions)
    omega_c = pm.Beta(
        "omega_c",
        omega * (kappa - 2) + 1,
        (1 - omega) * (kappa - 2) + 1,
        shape=n_pripos,
    )
    kappa_c_minus2 = pm.Gamma("kappa_c_minus2", 0.01, 0.01, shape=n_pripos)
    kappa_c = pm.Deterministic("kappa_c", kappa_c_minus2 + 2)

    # parameters for the individual players
    theta = pm.Beta(
        "theta",
        omega_c[pripos_idx] * (kappa_c[pripos_idx] - 2) + 1,
        (1 - omega_c[pripos_idx]) * (kappa_c[pripos_idx] - 2) + 1,
        shape=n_players,
    )

    y2 = pm.Binomial("y2", n=df2.AtBats.values, p=theta, observed=df2["Hits"])

# In[41]:

pm.model_to_graphviz(hierarchical_model2)

# In[47]:

with hierarchical_model2:
    idata2 = pm.sample(5000)

# In[48]:

az.plot_trace(idata2, var_names=["omega", "kappa", "omega_c", "kappa_c"])
plt.tight_layout()
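# A contrast Kruschke highlights for this example is pitchers versus catchers. As a hedged sketch (the position labels are assumed to match the strings in BattingAverage.csv; adjust them if they differ), we can plot the posterior difference of the two positions' omega_c parameters; a distribution clearly above zero favors catchers.

# In[ ]:

# look up the factorized indices of the two positions (labels assumed from the CSV)
pitcher = list(pripos_codes).index("Pitcher")
catcher = list(pripos_codes).index("Catcher")

# posterior of the difference in position-level batting ability
diff = (
    idata2.posterior["omega_c"].sel(omega_c_dim_0=catcher)
    - idata2.posterior["omega_c"].sel(omega_c_dim_0=pitcher)
)
az.plot_posterior(diff, point_estimate="mode", ref_val=0, color=color);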
# #### Figure 9.17

# #### Posterior distribution of the hyperparameter omega after sampling

# In[49]:

az.plot_posterior(idata2.posterior["omega"], point_estimate="mode", color=color)
plt.title("Overall", fontdict={"fontsize": 16, "fontweight": "bold"})
plt.xlabel("omega", fontdict={"fontsize": 14});

# #### Posterior distributions of the omega_c parameters after sampling

# In[50]:

fig, axes = plt.subplots(3, 3, figsize=(14, 8))

for i, ax in enumerate(axes.T.flatten()):
    az.plot_posterior(
        idata2.posterior["omega_c"].sel(omega_c_dim_0=i),
        ax=ax,
        point_estimate="mode",
        color=color,
    )
    ax.set_title(pripos_codes[i], fontdict={"fontsize": 16, "fontweight": "bold"})
    ax.set_xlabel("omega_c__{}".format(i), fontdict={"fontsize": 14})
    ax.set_xlim(0.10, 0.30)

plt.tight_layout(h_pad=3);
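# As a closing sketch (not one of the book's figures), shrinkage also operates at the player level: the theta of a player with few at-bats is pulled strongly toward his position's omega_c. Because the data hold one record per player, theta aligns row-wise with df2, so we can compare raw batting averages with posterior means for the players with the fewest at-bats.

# In[ ]:

# posterior mean batting ability per player, aligned with the rows of df2
theta_mean = idata2.posterior["theta"].mean(dim=["chain", "draw"]).values
pd.DataFrame(
    {
        "Player": df2["Player"],
        "PriPos": df2["PriPos"],
        "AtBats": df2["AtBats"],
        "BatAv": df2["BatAv"],
        "theta_mean": theta_mean,
    }
).sort_values("AtBats").head(10)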