#!/usr/bin/env python
# coding: utf-8

# ## Chapter 9 - Hierarchical Models

# - [9.2.4 - Example: Therapeutic touch](#9.2.4---Example:-Therapeutic-touch)
# - [Shrinkage](#Shrinkage)
# - [9.5.1 - Example: Baseball batting abilities by position (subjects within categories)](#9.5.1---Example:-Baseball-batting-abilities-by-position)

# In[1]:

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns

#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

from IPython.display import Image
from matplotlib import gridspec

plt.style.use('seaborn-white')
color = '#87ceeb'

# In[2]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-p arviz,matplotlib,numpy,pandas,pymc,seaborn')

# ### 9.2.3 - Example: Multiple Coins form a Single Mint

# In[3]:

# data from coin #1
n_heads1 = 3
n_tails1 = 12
n_flips1 = n_heads1 + n_tails1
data1 = np.repeat([1, 0], [n_heads1, n_tails1])

# coin #2
n_heads2 = 4
n_tails2 = 1
n_flips2 = n_heads2 + n_tails2
data2 = np.repeat([1, 0], [n_heads2, n_tails2])

# now pack all those outcomes into a single vector
outcomes = np.hstack((data1, data2))

# and create a vector of 'coin IDs' so that we can connect each flip
# with the corresponding coin's value of theta
coin = np.hstack((np.zeros_like(data1), np.ones_like(data2)))

# In[4]:

outcomes

# In[5]:

# build the model
with pm.Model() as oneminttwocoins_model:
    omega = pm.Beta("omega", 2.0, 2.0)

    # Kruschke uses these parameters in the chapter, but they yield very poor sampling
    # kappa_minus2 = pm.Gamma('kappa_minus2', 0.01, 0.01)
    kappa_minus2 = pm.Gamma("kappa_minus2", 0.5, 0.5)
    kappa = pm.Deterministic("kappa", kappa_minus2 + 2)

    theta = pm.Beta(
        "theta",
        alpha=omega * (kappa - 2) + 1,
        beta=(1 - omega) * (kappa - 2) + 1,
        shape=2,
    )

    y = pm.Bernoulli("y", theta[coin], observed=outcomes)

# In[6]:

pm.model_to_graphviz(oneminttwocoins_model)

# In[7]:

# sample!
with oneminttwocoins_model:
    idata_1m2c = pm.sample(5000, target_accept=0.9)

# In[8]:

az.plot_trace(idata_1m2c, var_names=["~kappa_minus2"])
plt.tight_layout();

# In[9]:

az.summary(idata_1m2c)

# In[10]:

idata_1m2c.posterior['omega'].sel(draw=0)

# Given that this is our first legitimate MCMC trace (stored as an ArviZ `InferenceData` object), let's see how we can interrogate it to extract information we might want.

# In[11]:

idata_1m2c

# In[12]:

# let's look at the first MCMC sample
idata_1m2c.posterior.sel(draw=0, chain=0)

# Note that there are **2** values of theta in this first step (one for each coin). They are stored together because we asked for an array of theta parameters. Let's calculate the mean value of the $\theta$ parameter associated with the first coin.

# In[13]:

idata_1m2c.posterior['theta'].mean(dim=['draw', 'chain'])[0]

# And for the second coin.

# In[14]:

idata_1m2c.posterior['theta'].mean(dim=['draw', 'chain'])[1]

# What if we have 'reference' values against which we wish to compare our parameter estimates? We can, among other things, ask what proportion of samples took on a value greater than (or less than) that reference value. In the case of coins, we might wish to compare $\theta$ to a value of 0.5 (i.e., to ask how likely it is that each coin is 'fair').

# In[15]:

# coin 1
(idata_1m2c.posterior['theta'] > .5).mean(dim=['draw', 'chain'])[0]

# In[16]:

# coin 2
(idata_1m2c.posterior['theta'] > .5).mean(dim=['draw', 'chain'])[1]
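# Since both coins' thetas live in one array, we can also compare the coins directly, draw by draw. As a quick sketch (this cell is not in Kruschke's text), here is the posterior probability that coin 2 has a higher bias than coin 1.

# In[ ]:

theta_post = idata_1m2c.posterior["theta"]
# proportion of posterior draws in which coin 2's theta exceeds coin 1's
(theta_post.sel(theta_dim_0=1) > theta_post.sel(theta_dim_0=0)).mean(dim=["draw", "chain"])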
# Now let's ask about our mint parameter, $\omega$. Here too we calculate the mean posterior estimate.

# In[17]:

idata_1m2c.posterior['omega'].mean(dim=['draw', 'chain'])

# We can again compare $\omega$ against our 'reference' value of 0.5. The interpretation of $\omega$ is somewhat different from the interpretation of $\theta$: asking about $p(\omega > 0.5)$ tells us whether the _mint tends to generate fair coins_.

# In[18]:

(idata_1m2c.posterior['omega'] > .5).mean(dim=['draw', 'chain'])

# We can also inspect $\kappa$, which tells us how consistent the mint is in generating coins with values of $\theta$ near $\omega$.

# In[19]:

idata_1m2c.posterior['kappa'].mean(dim=['draw', 'chain'])

# ### 9.2.4 - Example: Therapeutic touch

# In[20]:

df = pd.read_csv("data/TherapeuticTouchData.csv", dtype={"s": "category"})
df.info()

# In[21]:

df.head()

# #### Figure 9.9

# In[22]:

df_proportions = df.groupby("s")["y"].apply(lambda x: x.sum() / len(x))

# sns.displot returns a FacetGrid, so name it accordingly
g = sns.displot(df_proportions, bins=8, color="gray")
g.set(xlabel="Proportion Correct", ylabel="# Practitioners");

# #### Model (Kruschke, 2015)

# In[23]:

Image('images/fig9_7.png', width=200)

# In[24]:

# create a factor for practitioner ID
practitioner_idx, practitioner_codes = pd.factorize(df["s"])
n_practitioners = df["s"].nunique()

with pm.Model() as therapeutictouch_model:
    omega = pm.Beta("omega", 1.0, 1.0)
    kappa_minus2 = pm.Gamma("kappa_minus2", 0.05, 0.05)
    kappa = pm.Deterministic("kappa", kappa_minus2 + 2)

    theta = pm.Beta(
        "theta",
        alpha=omega * (kappa - 2) + 1,
        beta=(1 - omega) * (kappa - 2) + 1,
        shape=n_practitioners,
    )

    y = pm.Bernoulli("y", theta[practitioner_idx], observed=df["y"])

# In[25]:

pm.model_to_graphviz(therapeutictouch_model)

# In[26]:

with therapeutictouch_model:
    idata_tt = pm.sample(5000)

# In[27]:

az.plot_trace(idata_tt, var_names=['omega', 'kappa', 'theta'])
plt.tight_layout();

# In[28]:

az.summary(idata_tt)
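# The question of interest here is whether any practitioner performs better than chance. As a quick sketch (this cell is not in the original notebook), we can compute $p(\theta > 0.5)$ for all practitioners at once, just as we did for each coin above.

# In[ ]:

# proportion of posterior draws above chance performance, per practitioner
(idata_tt.posterior["theta"] > 0.5).mean(dim=["chain", "draw"])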
# #### Figure 9.10 - Marginal posterior distributions

# In[29]:

idata_tt.posterior["theta"].mean(dim=["chain", "draw"]).sel(theta_dim_0=0)

# In[30]:

plt.figure(figsize=(10, 12))

# Define gridspec
gs = gridspec.GridSpec(4, 6)
ax1 = plt.subplot(gs[0, :3])
ax2 = plt.subplot(gs[0, 3:])
ax3 = plt.subplot(gs[1, :2])
ax4 = plt.subplot(gs[1, 2:4])
ax5 = plt.subplot(gs[1, 4:6])
ax6 = plt.subplot(gs[2, :2])
ax7 = plt.subplot(gs[2, 2:4])
ax8 = plt.subplot(gs[2, 4:6])
ax9 = plt.subplot(gs[3, :2])
ax10 = plt.subplot(gs[3, 2:4])
ax11 = plt.subplot(gs[3, 4:6])

# thetas and theta pairs to plot
thetas = (0, 13, 27)
theta_pairs = ((0, 13), (0, 27), (13, 27))

font_d = {"size": 14}

# kappa & omega posterior plots
for var, ax in zip(["kappa", "omega"], [ax1, ax2]):
    az.plot_posterior(
        idata_tt.posterior[var], point_estimate="mode", ax=ax, color=color, round_to=2
    )
    ax.set_xlabel(r"$\{}$".format(var), fontdict={"size": 20, "weight": "bold"})
ax1.set(xlim=(0, 500))

# theta posterior plots
for var, ax in zip(thetas, [ax3, ax7, ax11]):
    az.plot_posterior(
        idata_tt.posterior["theta"].sel(theta_dim_0=var),
        point_estimate="mode",
        ax=ax,
        color=color,
    )
    ax.set_xlabel("theta[{}]".format(var), fontdict=font_d)

# theta scatter plots
for var, ax in zip(theta_pairs, [ax6, ax9, ax10]):
    ax.scatter(
        idata_tt.posterior["theta"].sel(theta_dim_0=var[0]),
        idata_tt.posterior["theta"].sel(theta_dim_0=var[1]),
        alpha=0.02,
        color=color,
        facecolor="none",
    )
    ax.plot([0, 1], [0, 1], ":k", transform=ax.transAxes, alpha=0.5)
    ax.set_xlabel("theta[{}]".format(var[0]), fontdict=font_d)
    ax.set_ylabel("theta[{}]".format(var[1]), fontdict=font_d)
    ax.set(xlim=(0, 1), ylim=(0, 1), aspect="equal")

# theta posterior difference plots
for var, ax in zip(theta_pairs, [ax4, ax5, ax8]):
    az.plot_posterior(
        idata_tt.posterior["theta"].sel(theta_dim_0=var[0])
        - idata_tt.posterior["theta"].sel(theta_dim_0=var[1]),
        point_estimate="mode",
        ax=ax,
        color=color,
    )
    ax.set_xlabel("theta[{}] - theta[{}]".format(*var), fontdict=font_d)

plt.tight_layout()

# ### Shrinkage

# Let's create a model with a separate theta estimate per practitioner, without the influence of a higher-level distribution. Then we can compare its theta values with those from the hierarchical model above.

# In[31]:

with pm.Model() as unpooled_model:
    theta = pm.Beta("theta", 1, 1, shape=n_practitioners)
    y = pm.Bernoulli("y", theta[practitioner_idx], observed=df["y"])

# In[32]:

pm.model_to_graphviz(unpooled_model)

# In[33]:

with unpooled_model:
    unpooled_idata = pm.sample(5000)

# Here we concatenate the theta results from both traces into a DataFrame. Next we reshape the data into a format that we can use with Seaborn's pointplot.

# In[34]:

# rows 2:-1 of the hierarchical summary are the thetas
# (omega and kappa_minus2 come first, kappa last)
df_shrinkage = pd.concat(
    [az.summary(unpooled_idata).iloc[:, 0], az.summary(idata_tt).iloc[2:-1, 0]], axis=1
).reset_index()
df_shrinkage.columns = ["theta", "unpooled", "hierarchical"]
df_shrinkage = pd.melt(
    df_shrinkage, "theta", ["unpooled", "hierarchical"], var_name="Model"
)
df_shrinkage.head()

# The plot below shows that the practitioner-level theta estimates are pulled towards the group mean of the hierarchical model.

# In[35]:

plt.figure(figsize=(10, 9))

# mark the posterior mean of omega (first row of the summary) as the group mean
plt.scatter(
    1,
    az.summary(idata_tt).iloc[0, 0],
    s=100,
    c="r",
    marker="x",
    zorder=999,
    label="Group mean",
)
sns.pointplot(x="Model", y="value", hue="theta", data=df_shrinkage)

# ### 9.5.1 - Example: Baseball batting abilities by position

# In[36]:

df2 = pd.read_csv(
    "data/BattingAverage.csv", usecols=[0, 1, 2, 3], dtype={"PriPos": "category"}
)
df2.info()

# The DataFrame contains records for 948 players in the 2012 regular season of Major League Baseball.
# - One record per player
# - 9 primary field positions

# In[37]:

df2["BatAv"] = df2.Hits.divide(df2.AtBats)
df2.head(10)

# In[38]:

# batting average by primary field position, calculated from the data
df2.groupby("PriPos")[["Hits", "AtBats"]].sum().pipe(lambda x: x.Hits / x.AtBats)

# #### Model (Kruschke, 2015)

# In[39]:

Image('images/fig9_13.png', width=300)

# In[46]:

pripos_idx, pripos_codes = pd.factorize(df2["PriPos"])
n_pripos = df2["PriPos"].nunique()
n_players = df2["Player"].nunique()

with pm.Model() as hierarchical_model2:
    # hyperparameters
    omega = pm.Beta("omega", 1, 1)
    kappa_minus2 = pm.Gamma("kappa_minus2", 0.1, 0.1)
    kappa = pm.Deterministic("kappa", kappa_minus2 + 2)

    # parameters for the categories (primary field positions)
    omega_c = pm.Beta(
        "omega_c",
        omega * (kappa - 2) + 1,
        (1 - omega) * (kappa - 2) + 1,
        shape=n_pripos,
    )
    kappa_c_minus2 = pm.Gamma("kappa_c_minus2", 0.01, 0.01, shape=n_pripos)
    kappa_c = pm.Deterministic("kappa_c", kappa_c_minus2 + 2)

    # parameters for the individual players
    theta = pm.Beta(
        "theta",
        omega_c[pripos_idx] * (kappa_c[pripos_idx] - 2) + 1,
        (1 - omega_c[pripos_idx]) * (kappa_c[pripos_idx] - 2) + 1,
        shape=n_players,
    )

    y2 = pm.Binomial("y2", n=df2.AtBats.values, p=theta, observed=df2["Hits"])

# In[41]:

pm.model_to_graphviz(hierarchical_model2)

# In[47]:

with hierarchical_model2:
    idata2 = pm.sample(5000)

# In[48]:

az.plot_trace(idata2, var_names=["omega", "kappa", "omega_c", "kappa_c"])
plt.tight_layout()
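# A contrast Kruschke highlights for this example is pitchers versus catchers. As a hedged sketch (the position labels are assumed to match the strings in BattingAverage.csv; adjust them if they differ), we can plot the posterior difference of the two positions' omega_c parameters; a distribution clearly above zero favors catchers.

# In[ ]:

# look up the factorized indices of the two positions (labels assumed from the CSV)
pitcher = list(pripos_codes).index("Pitcher")
catcher = list(pripos_codes).index("Catcher")

# posterior of the difference in position-level batting ability
diff = (
    idata2.posterior["omega_c"].sel(omega_c_dim_0=catcher)
    - idata2.posterior["omega_c"].sel(omega_c_dim_0=pitcher)
)
az.plot_posterior(diff, point_estimate="mode", ref_val=0, color=color);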
# #### Figure 9.17

# #### Posterior distribution of the hyperparameter omega after sampling

# In[49]:

az.plot_posterior(idata2.posterior["omega"], point_estimate="mode", color=color)
plt.title("Overall", fontdict={"fontsize": 16, "fontweight": "bold"})
plt.xlabel("omega", fontdict={"fontsize": 14});

# #### Posterior distributions of the omega_c parameters after sampling

# In[50]:

fig, axes = plt.subplots(3, 3, figsize=(14, 8))

for i, ax in enumerate(axes.T.flatten()):
    az.plot_posterior(
        idata2.posterior["omega_c"].sel(omega_c_dim_0=i),
        ax=ax,
        point_estimate="mode",
        color=color,
    )
    ax.set_title(pripos_codes[i], fontdict={"fontsize": 16, "fontweight": "bold"})
    ax.set_xlabel("omega_c__{}".format(i), fontdict={"fontsize": 14})
    ax.set_xlim(0.10, 0.30)

plt.tight_layout(h_pad=3);
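# As a closing sketch (not one of the book's figures), shrinkage also operates at the player level: the theta of a player with few at-bats is pulled strongly toward his position's omega_c. Because the data hold one record per player, theta aligns row-wise with df2, so we can compare raw batting averages with posterior means for the players with the fewest at-bats.

# In[ ]:

# posterior mean batting ability per player, aligned with the rows of df2
theta_mean = idata2.posterior["theta"].mean(dim=["chain", "draw"]).values
pd.DataFrame(
    {
        "Player": df2["Player"],
        "PriPos": df2["PriPos"],
        "AtBats": df2["AtBats"],
        "BatAv": df2["BatAv"],
        "theta_mean": theta_mean,
    }
).sort_values("AtBats").head(10)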