In [23]:
# %load jupyter_default.py
import pandas as pd
import numpy as np
import os
import re
import datetime
import time
import glob
from tqdm import tqdm_notebook
from colorama import Fore, Style

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns

%config InlineBackend.figure_format='retina'
sns.set() # Revert to matplotlib defaults
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['legend.fancybox'] = True
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'Marion'

SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 14, 16, 20
plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=SMALL_SIZE)
plt.rc('axes', labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=SMALL_SIZE)
plt.rc('ytick', labelsize=SMALL_SIZE)
plt.rc('legend', fontsize=MEDIUM_SIZE)
plt.rc('axes', titlesize=BIGGER_SIZE)

plt.rcParams['grid.alpha'] = 0.2
plt.rcParams['axes.labelpad'] = 10
plt.rcParams['axes.labelpad'] = 20
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['savefig.facecolor'] = 'white'
plt.rcParams['xtick.major.pad'] = 15
plt.rcParams['xtick.minor.pad'] = 15
plt.rcParams['ytick.major.pad'] = 10
plt.rcParams['ytick.minor.pad'] = 10

def savefig(name: str, dpi: int = 300):
    """Save the current matplotlib figure to the project figures directory as a PNG.

    Parameters
    ----------
    name : str
        File stem; the figure is written to ../../figures/{name}.png.
    dpi : int, default 300
        Output resolution (was hard-coded; default preserves old behavior).
    """
    plt.savefig(f'../../figures/{name}.png', bbox_inches='tight', dpi=dpi)
In [3]:
# Record Python/OS/package versions for reproducibility
%load_ext version_information
%version_information pandas, numpy
Out[3]:
SoftwareVersion
Python3.6.8 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython7.2.0
OSDarwin 18.7.0 x86_64 i386 64bit
pandas0.23.4
numpy1.15.4
Thu Jan 14 23:56:45 2021 EST
In [4]:
# Watermark: author, interpreter version and key package versions
%load_ext watermark
%watermark -a "Alex Galea" -v -p pymc3,matplotlib,seaborn,pandas
Alex Galea 

CPython 3.6.8
IPython 7.2.0

pymc3 3.6
matplotlib 3.0.2
seaborn 0.9.0
pandas 0.23.4

Bayesian Modeling Discussion

We can model the probability of an outcome $y$ as $P_t(y)$ using a Gamma distribution where time $t$ is defined below.

The probability density function (PDF) is given by

$$ P_t(\alpha, \beta) = \frac{\beta^\alpha t^{\alpha-1}e^{-\beta t}}{\Gamma (\alpha)} $$

where $t$ is the time metric and $\alpha$ & $\beta$ are solved for with MCMC.

Based on a set of goalie pull observations $X$ from 2003-2019 NHL games, we'll solve for the posterior distribution $P_t(y|X)$, the probability of the outcome $y$, given the observations. This is done computationally using Markov chain Monte Carlo (MCMC) and the pymc3 library.

The outcomes we're interested in are $y = \big\{\mathrm{goal\;for}, \mathrm{goal\;against}, \mathrm{no\;goal}\big\}$.

We'll use a uniform prior over the domain of times (last 5mins). Note: when gathering the observations, we throw out goalie pulls greater than 5 minutes from the end of the game (due to high likelihood of false positives when parsing goalie pulls from the raw game table).

Once we find the posteriors discussed above, we can study the risk reward of pulling a goalie. We'll compare posteriors to find the odds of scoring a goal (and the odds of getting scored on) over time $t$ where:

  • t = Time elapsed e.g. if there's 3 minutes left, what is the chance that pulling the goalie will result in a goal for?
  • t = Time since goalie pull e.g. after the goalie has been pulled for 1 minute, what is the chance of getting a goal?
In [5]:
HALF_SPACE = u'\u2000'
In [6]:
import pymc3 as pm

Load the training data

In [7]:
ls ../../data/processed/pkl/
20032004_goalie_pulls_2019-03-01.pkl  20122013_goalie_pulls_2019-04-25.pkl
20052006_goalie_pulls_2019-03-01.pkl  20132014_goalie_pulls_2019-04-25.pkl
20062007_goalie_pulls_2019-03-01.pkl  20142015_goalie_pulls_2019-04-25.pkl
20072008_goalie_pulls_2019-04-25.pkl  20152016_goalie_pulls_2019-04-25.pkl
20082009_goalie_pulls_2019-04-25.pkl  20162017_goalie_pulls_2019-04-25.pkl
20092010_goalie_pulls_2019-04-25.pkl  20172018_goalie_pulls_2019-04-25.pkl
20102011_goalie_pulls_2019-04-25.pkl  20182019_goalie_pulls_2019-04-25.pkl
20112012_goalie_pulls_2019-04-25.pkl  tmp/
In [8]:
def load_data():
    '''
    Read every processed goalie-pull pickle (one file per season) and
    stack them into a single DataFrame with a fresh index.
    '''
    paths = sorted(glob.glob('../../data/processed/pkl/*.pkl'))
    print(paths)
    frames = (pd.read_pickle(path) for path in paths)
    return pd.concat(frames, sort=False, ignore_index=True)

def clean_df(df, cutoff_minutes=15):
    '''
    Remove rows with implausibly early event times (likely parsing
    false positives — see the data-gathering notes in the markdown).

    Rows where ``goal_for_time`` (and, when present, ``game_end_time``)
    are earlier than ``cutoff_minutes`` are dropped. NaT values are kept:
    ``NaT < cutoff`` is False, so ``~(... < cutoff)`` is True for them.

    Parameters
    ----------
    df : pd.DataFrame
        Goalie-pull observations with timedelta event-time columns.
    cutoff_minutes : int, default 15
        Minimum allowed event time in minutes (was hard-coded as 15).

    Returns
    -------
    pd.DataFrame
        A filtered copy; the input frame is not modified.
    '''
    cutoff = datetime.timedelta(seconds=cutoff_minutes * 60)
    _df = df.copy()

    len_0 = _df.shape[0]
    print(f'Removing goal_for_time < {cutoff_minutes} mins')
    _df = _df[~(_df.goal_for_time < cutoff)]
    print(f'Removed {len_0 - _df.shape[0]} total rows')

    # Check the working copy's columns (original checked `df.columns`;
    # same result since _df is a copy, but this is self-consistent)
    if 'game_end_time' in _df.columns:
        len_0 = _df.shape[0]
        print(f'Removing game_end_time < {cutoff_minutes} mins')
        _df = _df[~(_df.game_end_time < cutoff)]
        print(f'Removed {len_0 - _df.shape[0]} total rows')

    return _df
In [9]:
# Build the training DataFrame and drop early-game false positives
df = load_data()
df = clean_df(df)
['../../data/processed/pkl/20032004_goalie_pulls_2019-03-01.pkl', '../../data/processed/pkl/20052006_goalie_pulls_2019-03-01.pkl', '../../data/processed/pkl/20062007_goalie_pulls_2019-03-01.pkl', '../../data/processed/pkl/20072008_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20082009_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20092010_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20102011_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20112012_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20122013_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20132014_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20142015_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20152016_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20162017_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20172018_goalie_pulls_2019-04-25.pkl', '../../data/processed/pkl/20182019_goalie_pulls_2019-04-25.pkl']
Removing goal_for_time < 15 mins
Removed 115 total rows
Removing game_end_time < 15 mins
Removed 0 total rows
In [10]:
def load_training_samples(
    df,
    cols,
    masks=None,
    dtype='timedelta64[s]'
) -> np.ndarray:
    '''
    Return buckets of training data, one bucket per requested column.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame containing the sample columns.
    cols : list of str
        Columns to extract.
    masks : list, optional
        Boolean masks aligned with ``cols``; a ``None`` entry (or
        omitting the argument) selects all rows. The default was a
        mutable ``[]``; ``None`` is the safe idiom and call sites are
        unaffected.
    dtype : str, default 'timedelta64[s]'
        dtype each column is cast to after dropping NaNs.

    Returns
    -------
    np.ndarray
        1-D object array of length ``len(cols)``; element ``i`` holds
        the samples for ``cols[i]``. Built explicitly as an object
        array because bare ``np.array(list_of_arrays)`` raises on
        ragged input in numpy >= 1.24 (and silently produced a 2-D
        array for equal-length buckets before that).
    '''
    if not masks:
        masks = [None] * len(cols)
    buckets = []
    for col, m in zip(cols, masks):
        if m is None:
            d = df[col].dropna().astype(dtype).values
        else:
            d = df[col][m].dropna().astype(dtype).values
        buckets.append(d)
        print(f'Loaded {len(d)} samples for col {col}')

    out = np.empty(len(buckets), dtype=object)
    for i, d in enumerate(buckets):
        out[i] = d
    print(f'Training data shape = {out.shape}')
    return out

Model 1 - Time elapsed

Load data

In [11]:
# Load time of pull for eventual outcomes:
feature_names = ['goal_for', 'goal_against', 'no_goals']

# Logic for loading the data: the same pull_time column is sampled three
# times, masked by which outcome column is populated for that row
features = ['pull_time', 'pull_time', 'pull_time']
masks = [
    ~(df.goal_for_time.isnull()),       # rows where a goal-for time exists
    ~(df.goal_against_time.isnull()),   # rows where a goal-against time exists
    ~(df.game_end_timedelta.isnull()),  # rows where the game ended with no goal
]
training_samples = load_training_samples(df, features, masks)
Loaded 1496 samples for col pull_time
Loaded 3736 samples for col pull_time
Loaded 5937 samples for col pull_time
Training data shape = (3,)

We actually want to fit the mirrored version of the Gamma distribution (reflected about the y-axis). In order to do this with PyMC3, we will transform the data.

In [12]:
# Mirror the samples about the y-axis: express each pull time as seconds
# remaining from the 20-minute mark instead of seconds elapsed.
MAX_TIME_SECONDS = 20 * 60
for i in range(3):
    training_samples[i] = MAX_TIME_SECONDS - training_samples[i]
In [13]:
# Peek at the first few transformed samples for each outcome bucket
(training_samples[0][:10],
training_samples[1][:10],
training_samples[2][:10],)
Out[13]:
(array([ 81.,  86.,  91.,  89., 119.,  79., 100.,  97.,  55.,  58.]),
 array([16., 57., 32., 67., 67., 60., 82., 70., 52., 96.]),
 array([  2.,  75., 132.,  49.,  81.,  63.,  69.,  60.,   8.,  63.]))
In [14]:
feature_names
Out[14]:
['goal_for', 'goal_against', 'no_goals']
In [ ]:
 
In [ ]:
 
In [ ]:
 

PyMC3 Model

In [15]:
from typing import Tuple
from pymc3.backends.base import MultiTrace

def bayes_model(training_samples) -> Tuple[pm.model.Model, MultiTrace]:
    """
    Solve for posterior distributions using pymc3.

    A Gamma likelihood is fit independently for each of the three goalie
    pull outcomes, with uniform priors on each alpha and beta. The
    ``p_*`` deterministic variables sample the fitted Gamma and map it
    back from "time remaining" to "time elapsed" via
    ``MAX_TIME_SECONDS - t``, undoing the mirror transform applied to
    the training data.

    Parameters
    ----------
    training_samples : np.ndarray
        Three sample buckets ordered (goal_for, goal_against, no_goal),
        matching ``outcomes`` below.

    Returns
    -------
    (pm.model.Model, MultiTrace)
        The model and its sampled trace.
    """
    # Order matters: bucket i of training_samples corresponds to outcomes[i]
    outcomes = ['goal_for', 'goal_against', 'no_goal']
    with pm.Model() as model:

        # Priors. The original wrote each RV out by hand; looping keeps
        # the three outcomes guaranteed-identical. Registration order
        # matches the original exactly: all betas, then all alphas.
        beta_range = (0, 100)
        alpha_range = (0.001, 10)
        betas = {o: pm.Uniform(f'beta_{o}', *beta_range) for o in outcomes}
        alphas = {o: pm.Uniform(f'alpha_{o}', *alpha_range) for o in outcomes}

        # Observations to train the model on
        for o, samples in zip(outcomes, training_samples):
            pm.Gamma(
                f'obs_{o}',
                alpha=alphas[o],
                beta=betas[o],
                observed=samples,
            )

        # Outcome probabilities: draw from the fitted Gamma, un-mirrored
        # back into elapsed time (gamma_* registered before p_*, as before)
        for o in outcomes:
            pm.Deterministic(
                f'p_{o}',
                MAX_TIME_SECONDS - pm.Gamma(
                    f'gamma_{o}',
                    alpha=alphas[o],
                    beta=betas[o],
                ),
            )

        # Fit model with Metropolis sampling
        step = pm.Metropolis()
        trace = pm.sample(18000, step=step)

    return model, trace

model, trace = bayes_model(training_samples)
model
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [gamma_no_goal]
>Metropolis: [gamma_goal_against]
>Metropolis: [gamma_goal_for]
>Metropolis: [alpha_no_goal]
>Metropolis: [alpha_goal_against]
>Metropolis: [alpha_goal_for]
>Metropolis: [beta_no_goal]
>Metropolis: [beta_goal_against]
>Metropolis: [beta_goal_for]
Sampling 4 chains: 100%|██████████| 74000/74000 [01:04<00:00, 1147.70draws/s]
The estimated number of effective samples is smaller than 200 for some parameters.
Out[15]:
$$ \begin{array}{rcl} \text{beta_goal_for} &\sim & \text{Uniform}(\mathit{lower}=0.0,~\mathit{upper}=100.0)\\\text{beta_goal_against} &\sim & \text{Uniform}(\mathit{lower}=0.0,~\mathit{upper}=100.0)\\\text{beta_no_goal} &\sim & \text{Uniform}(\mathit{lower}=0.0,~\mathit{upper}=100.0)\\\text{alpha_goal_for} &\sim & \text{Uniform}(\mathit{lower}=0.001,~\mathit{upper}=10.0)\\\text{alpha_goal_against} &\sim & \text{Uniform}(\mathit{lower}=0.001,~\mathit{upper}=10.0)\\\text{alpha_no_goal} &\sim & \text{Uniform}(\mathit{lower}=0.001,~\mathit{upper}=10.0)\\\text{gamma_goal_for} &\sim & \text{Gamma}(\mathit{alpha}=\text{alpha_goal_for},~\mathit{beta}=\text{beta_goal_for})\\\text{p_goal_for} &\sim & \text{Deterministic}(\text{Constant},~\text{gamma_goal_for_log__})\\\text{gamma_goal_against} &\sim & \text{Gamma}(\mathit{alpha}=\text{alpha_goal_against},~\mathit{beta}=\text{beta_goal_against})\\\text{p_goal_against} &\sim & \text{Deterministic}(\text{Constant},~\text{gamma_goal_against_log__})\\\text{gamma_no_goal} &\sim & \text{Gamma}(\mathit{alpha}=\text{alpha_no_goal},~\mathit{beta}=\text{beta_no_goal})\\\text{p_no_goal} &\sim & \text{Deterministic}(\text{Constant},~\text{gamma_no_goal_log__})\\\text{obs_goal_for} &\sim & \text{Gamma}(\mathit{alpha}=\text{alpha_goal_for},~\mathit{beta}=\text{beta_goal_for})\\\text{obs_goal_against} &\sim & \text{Gamma}(\mathit{alpha}=\text{alpha_goal_against},~\mathit{beta}=\text{beta_goal_against})\\\text{obs_no_goal} &\sim & \text{Gamma}(\mathit{alpha}=\text{alpha_no_goal},~\mathit{beta}=\text{beta_no_goal}) \end{array} $$

Save / Load the model

Note: using pickle on untrusted files is dangerous. Use at your own risk. No warranty as per licence.

In [16]:
SAVE_MODEL = True
LOAD_MODEL = True
# Paths are stamped with today's date, so each run saves a new artifact
MODEL_PATH = ('../../models/gamma_mcmc_{}.pymc3.pkl'
    .format(datetime.datetime.now().strftime('%Y-%m-%d'))
)
TRACE_PATH = ('../../models/gamma_mcmc_trace_{}.pymc3.dat'
    .format(datetime.datetime.now().strftime('%Y-%m-%d'))
)

import pickle

if SAVE_MODEL:
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump({'model': model, 'trace': trace}, f)

# NOTE(review): this hard-coded date overrides the path just saved to —
# everything below runs on the 2019-07-07 artifact, NOT the trace sampled
# above. Presumably intentional (reusing a known-good run); confirm.
MODEL_PATH = "../../models/gamma_mcmc_2019-07-07.pymc3.pkl"

if LOAD_MODEL:
    # WARNING: pickle.load executes arbitrary code; only load trusted files
    with open(MODEL_PATH, 'rb') as f:
        data = pickle.load(f)

    model = data['model']
    trace = data['trace']
    del data  # drop the duplicate references

MCMC Samples

In [19]:
# Discard the first N_burn draws from the trace as burn-in
N_burn = 10000
burned_trace = trace[N_burn:]
In [20]:
from typing import Tuple
from scipy.stats import gamma

def gamma_posterior(
    alpha=None,
    beta=None,
    norm_factors=None,
    max_time_seconds=None,
) -> Tuple[np.ndarray, ...]:
    '''
    Evaluate the fitted Gamma PDFs on a 1-second grid over the last
    5 minutes of play.

    Parameters
    ----------
    alpha, beta : sequence of float, optional
        Gamma parameters for (goal_for, goal_against, no_goal). If
        either is None, only the x-grid (in minutes, untransformed) is
        returned as a 1-tuple.
    norm_factors : sequence of float, optional
        Per-curve scale factors applied to the densities.
    max_time_seconds : int, optional
        Mirror point for converting "time remaining" back to "time
        elapsed". Defaults to the notebook-level MAX_TIME_SECONDS
        (previously this global was the only option).

    Returns
    -------
    tuple of np.ndarray
        ``(x,)`` or ``(x, y_goal_for, y_goal_against, y_no_goal)`` with
        x in minutes of elapsed 3rd-period time.
        (Annotation fixed: the original ``Tuple[np.ndarray]`` declared a
        1-tuple only.)
    '''
    x = np.arange(0 * 60, 5 * 60, 1)
    if alpha is None or beta is None:
        return (x / 60,)

    # scipy parameterizes Gamma with a scale = 1/beta (rate)
    p = gamma.pdf
    y_goal_for = p(x, alpha[0], scale=1/beta[0])
    y_goal_against = p(x, alpha[1], scale=1/beta[1])
    y_no_goal = p(x, alpha[2], scale=1/beta[2])

    if norm_factors is not None:
        y_goal_for = y_goal_for * norm_factors[0]
        y_goal_against = y_goal_against * norm_factors[1]
        y_no_goal = y_no_goal * norm_factors[2]

    if max_time_seconds is None:
        max_time_seconds = MAX_TIME_SECONDS

    # Un-mirror x back to elapsed time, then convert into minutes
    x = (max_time_seconds - x) / 60

    return x, y_goal_for, y_goal_against, y_no_goal
In [29]:
ALPHA = 0.6
LW = 3

''' Plot MCMC samples '''

# density expects a bool; the string 'normed' only worked because any
# non-empty string is truthy — use density=True explicitly
plt.hist(burned_trace['p_goal_for']/60, bins=50,
         color='green', label='p_goal_for samples',
         density=True,
         histtype='stepfilled', alpha=ALPHA)

plt.hist(burned_trace['p_goal_against']/60, bins=50,
         color='red', label='p_goal_against samples',
         density=True,
         histtype='stepfilled', alpha=ALPHA)

plt.hist(burned_trace['p_no_goal']/60, bins=50,
         color='orange', label='p_no_goal samples',
         density=True,
         histtype='stepfilled', alpha=ALPHA)

''' Plot modeled distributions using the posterior-mean parameters '''
x, y_goal_for, y_goal_against, y_no_goal = gamma_posterior(
    alpha=[
        burned_trace['alpha_goal_for'].mean(),
        burned_trace['alpha_goal_against'].mean(),
        burned_trace['alpha_no_goal'].mean(),
    ],
    beta=[
        burned_trace['beta_goal_for'].mean(),
        burned_trace['beta_goal_against'].mean(),
        burned_trace['beta_no_goal'].mean(),
    ],
)

# Rescale height by arbitrary amount to fit chart
scale_frac = 0.5
y_goal_for = y_goal_for / y_goal_for.max() * scale_frac
y_goal_against = y_goal_against / y_goal_against.max() * scale_frac
scale_frac = 0.65
y_no_goal = y_no_goal / y_no_goal.max() * scale_frac

plt.plot(x, y_goal_for, label=r'$P(\rm{goal\;for})$', color='green', lw=LW)
plt.plot(x, y_goal_against, label=r'$P(\rm{goal\;against})$', color='red', lw=LW)
plt.plot(x, y_no_goal, label=r'$P(\rm{no\;goal})$', color='orange', lw=LW)

''' Clean up the chart '''

plt.ylabel('Counts')
ax = plt.gca()
ax.set_yticklabels([])

plt.xlabel('Time elapsed in 3rd period (minutes)')
plt.legend()

plt.text(x=11.6, y=0.770,
    s='Goalie Pull Outcome Models',
    fontsize=24, color='black', weight='bold')

# Typo fixed in rendered caption: 'possilbe' -> 'possible'
plt.text(x=11.6, y=0.71,
    s='MCMC gamma posterior models and samples\nfor the three possible goalie pull outcomes.',
    fontsize=14, color='black', style='italic')

plt.text(x=19.5, y=0.71,
    s='alexgalea.ca',
    fontsize=14, color='black', style='italic')


savefig('time_elapsed_gamma_mcmc_samples')

plt.show()
In [30]:
# Diagnostic trace plots for all sampled parameters
fig = pm.traceplot(trace);
savefig('time_elapsed_gamma_mcmc_traceplot')