#!/usr/bin/env python
# coding: utf-8
# A Dockerfile that will produce a container with all the dependencies necessary to run this notebook is available [here](https://github.com/AustinRochford/notebooks).
# In[1]:
# Run the IPython `%matplotlib inline` line magic so that figures are rendered
# in the notebook output; this line only works inside an IPython kernel.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[2]:
import datetime
import logging
from warnings import filterwarnings
# In[3]:
import arviz as az
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import numpy as np
import pandas as pd
import pymc3 as pm
import scipy as sp
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from theano import pprint
# In[4]:
# Tick formatter that renders an axis value x as a percentage with one decimal.
pct_formatter = StrMethodFormatter('{x:.1%}')

# Apply seaborn's default styling, with its colour shorthand codes enabled.
sns.set(color_codes=True)
# In[5]:
# configure pyplot for readability when rendered as a slideshow and projected
FIG_WIDTH, FIG_HEIGHT = 8, 6
LABELSIZE = 14

# One rc call per group; every text element shares the same font size.
plt.rc('figure', figsize=(FIG_WIDTH, FIG_HEIGHT), titlesize=LABELSIZE)
plt.rc('axes', labelsize=LABELSIZE, titlesize=LABELSIZE)
plt.rc('legend', fontsize=LABELSIZE)
plt.rc(('xtick', 'ytick'), labelsize=LABELSIZE)
# In[6]:
# Silence warnings whose message starts with one of these known-noisy prefixes.
for _noisy_message in (
    'findfont',
    "Conversion of the second argument of issubdtype",
    "Set changed size during iteration",
):
    filterwarnings('ignore', _noisy_message)

# keep theano from complaining about compile locks for small models
_compilelock_logger = logging.getLogger('theano.gof.compilelock')
_compilelock_logger.setLevel(logging.CRITICAL)
# In[7]:
# Fixed seed (drawn from random.org) so that reruns are reproducible.
SEED = 54902
np.random.seed(seed=SEED)
# # NBA Foul Calls
#
#
#
# **Question:** Is (not) committing and/or drawing fouls a measurable player skill?
# See this [talk](http://austinrochford.com/resources/talks/nba-fouls-pydata-nyc-2017.slides.html) or this [post](http://austinrochford.com/posts/2018-02-04-nba-irt-2.html) for more information on the data, expanded models, and conclusions from this case study.
# In[8]:
# Download the pinned snapshot of the last-two-minute-report data to /tmp unless
# a usable copy is already there.
#
# `wget -O` creates its output file even when the download fails, so the file is
# fetched under a scratch name and only renamed to its final name on success,
# and the cache check uses `-s` (exists and is non-empty) instead of `-e`.
# Otherwise one failed run would leave an empty/partial CSV that every later
# run silently treats as cached.
get_ipython().run_cell_magic(
    'bash', '',
    'DATA_URI=https://raw.githubusercontent.com/polygraph-cool/last-two-minute-report/32f1c43dfa06c2e7652cc51ea65758007f2a1a01/output/all_games.csv\n'
    'DATA_DEST=/tmp/all_games.csv\n'
    '\n'
    'if [[ ! -s $DATA_DEST ]];\n'
    'then\n'
    '    wget -q -O $DATA_DEST.part $DATA_URI && mv $DATA_DEST.part $DATA_DEST\n'
    'fi\n'
)
# In[9]:
# Columns of all_games.csv that the analysis reads; everything else is skipped
# at load time.
USECOLS = [
    # when the play happened and what was called
    'period', 'seconds_left', 'call_type',
    # who was involved and how the review ruled
    'committing_player', 'disadvantaged_player', 'review_decision',
    # play / game identification
    'play_id', 'away', 'home', 'date',
    # score and team context
    'score_away', 'score_home', 'disadvantaged_team', 'committing_team',
]
# In[10]:
# Load the cached CSV, keeping only USECOLS, indexing rows by play_id and
# parsing the `date` column as datetimes.
orig_df = pd.read_csv('/tmp/all_games.csv',
                      index_col='play_id',
                      parse_dates=['date'],
                      usecols=USECOLS)
# In[11]:
# Preview the first two plays, transposed so each column reads as a row.
orig_df.head(2).T
# In[12]:
# Keep only plays whose call type starts with "Foul"; a missing call type is
# replaced by a placeholder first so that it compares as "not a foul".
is_foul = orig_df['call_type'].fillna("UNKNOWN").str.startswith("Foul")
foul_df = orig_df.loc[is_foul]
# In[13]:
# Full `call_type` labels of the foul categories kept for modelling.
FOULS = list(map(
    "Foul: {}".format,
    ("Personal", "Shooting", "Offensive", "Loose Ball", "Away from Play"),
))
# In[14]:
# Misspelled team codes mapped to the code they are replaced with.
TEAM_MAP = {
    "NKY": "NYK", "COS": "BOS",
    "SAT": "SAS", "CHi": "CHI",
    "LA)": "LAC", "AT)": "ATL",
    "ARL": "ATL",
}


def correct_team_name(col):
    """Build a frame -> Series callable that fixes team codes in column `col`.

    The returned callable takes a DataFrame and returns `df[col]` with every
    value found in TEAM_MAP replaced by its mapped code; all other values pass
    through unchanged. Its shape makes it usable as a `DataFrame.assign` value.
    """
    def _fix_code(team_name):
        return TEAM_MAP.get(team_name, team_name)

    def _correct_team_name(df):
        return df[col].apply(_fix_code)

    return _correct_team_name
# In[15]:
def date_to_season(date):
    """Return the season label ('YYYY-YYYY') for `date`.

    `date` is compared against three season start dates, newest first; the
    label of the first start date it is on or after is returned. Anything
    earlier than the oldest start date is labelled '2014-2015'.
    """
    season_starts = (
        (datetime.datetime(2017, 10, 17), '2017-2018'),
        (datetime.datetime(2016, 10, 25), '2016-2017'),
        (datetime.datetime(2015, 10, 27), '2015-2016'),
    )
    for first_day, label in season_starts:
        if date >= first_day:
            return label
    return '2014-2015'
# In[16]:
# Build the cleaned foul-call table.
#
# Filtering is done with `.where(...)` followed by one `.dropna()`: a row that
# fails any predicate is blanked to NaN and is then dropped together with rows
# that have missing fields of their own.  `review_decision` is filled with
# "INC" before the drop, so a missing review decision alone does not discard a
# play.
clean_df = (
    foul_df
    # fourth-quarter plays only
    .where(lambda df: df.period == "Q4")
    # plays dated inside one of the two date windows
    .where(lambda df: (
        df.date.between(datetime.datetime(2016, 10, 25),
                        datetime.datetime(2017, 4, 12))
        | df.date.between(datetime.datetime(2015, 10, 27),
                          datetime.datetime(2016, 5, 30))
    ))
    .assign(
        review_decision=lambda df: df.review_decision.fillna("INC"),
        committing_team=correct_team_name('committing_team'),
        # This keyword was misspelled `disadvantged_team`, which wrote the
        # corrected codes to a stray column and left `disadvantaged_team`
        # uncorrected for every later comparison against `home`.
        disadvantaged_team=correct_team_name('disadvantaged_team'),
        away=correct_team_name('away'),
        home=correct_team_name('home'),
        season=lambda df: df.date.apply(date_to_season)
    )
    # keep only the foul categories listed in FOULS
    .where(lambda df: df.call_type.isin(FOULS))
    .dropna()
    .drop('period', axis=1)
    # strip the "Foul: " prefix, leaving just the category name
    .assign(call_type=lambda df: (df.call_type
                                    .str.split(': ', expand=True)
                                    .iloc[:, 1]))
)
# In[17]:
# One shared integer encoding for players, fitted on the committing and the
# disadvantaged columns together so that either role maps to the same id.
all_players = np.concatenate([
    clean_df.committing_player,
    clean_df.disadvantaged_player,
])
player_enc = LabelEncoder()
player_enc.fit(all_players)
n_player = player_enc.classes_.size

# Integer encoding for the season labels.
season_enc = LabelEncoder()
season_enc.fit(clean_df.season)
n_season = season_enc.classes_.size
# In[18]:
def _score_of(team):
    """Score of `team` per play: the home score where `team` equals the home
    side, otherwise the away score."""
    return clean_df.score_home.where(team == clean_df.home, clean_df.score_away)


# Model table: rounded seconds left, a 0./1. foul indicator, integer-encoded
# players and season, and the score of each side involved in the play.
df = (
    clean_df[['seconds_left']]
    .round(0)
    .assign(
        # 1.0 when the review decision is CC or INC, else 0.0
        foul_called=clean_df.review_decision.isin(['CC', 'INC']).astype(float),
        player_committing=player_enc.transform(clean_df.committing_player),
        player_disadvantaged=player_enc.transform(clean_df.disadvantaged_player),
        score_committing=_score_of(clean_df.committing_team),
        score_disadvantaged=_score_of(clean_df.disadvantaged_team),
        season=season_enc.transform(clean_df.season),
    )
)
# In[19]:
df.head()
# In[20]:
# Plain arrays of the encoded ids, used to index the model's parameter vectors.
player_committing = df['player_committing'].values
player_disadvantaged = df['player_disadvantaged'].values
season = df['season'].values
# In[21]:
def hierarchical_normal(name, shape):
    """Register a scaled-normal vector called `name` in the active PyMC3 model.

    Creates unit-normal offsets `Δ_<name>` with the given `shape` and a single
    half-normal scale `σ_<name>` (scale parameter 5), and returns their product
    as a Deterministic named `name` (a non-centered parameterization).
    Must be called inside a `pm.Model` context.
    """
    offsets = pm.Normal(f'Δ_{name}', 0., 1., shape=shape)
    scale = pm.HalfNormal(f'σ_{name}', 5.)
    return pm.Deterministic(name, offsets * scale)
# ## Model outline
#
# $$
# \operatorname{log-odds}(\textrm{Foul}) \
# \sim \textrm{Season factor} + \left(\textrm{Disadvantaged skill} - \textrm{Committing skill}\right)
# $$
# In[22]:
with pm.Model() as irt_model:
    # one baseline term per season
    β_season = pm.Normal('β_season', 0., 2.5, shape=n_season)

    # per-player latent parameters
    θ = hierarchical_normal('θ', n_player)  # disadvantaged skill
    b = hierarchical_normal('b', n_player)  # committing skill

    # log-odds of a foul call for each observed play
    η = β_season[season] + θ[player_disadvantaged] - b[player_committing]
    p = pm.math.sigmoid(η)

    obs = pm.Bernoulli('obs', p, observed=df.foul_called.values)
# In[23]:
# Sampler settings shared by every `pm.sample` call: one core per chain and a
# distinct seed per chain (SEED, SEED + 1, ...).
CHAINS = 3
SAMPLE_KWARGS = dict(
    chains=CHAINS,
    cores=CHAINS,
    random_seed=list(np.arange(CHAINS) + SEED),
)
# In[24]:
# Draw 500 samples per chain from the IRT model using the shared settings.
with irt_model:
    trace = pm.sample(draws=500, **SAMPLE_KWARGS)
# In[25]:
# Sampler diagnostic: ArviZ energy plot for the trace (trailing `;` suppresses
# the returned object's repr in the notebook output).
az.plot_energy(trace);
# In[26]:
# Convergence diagnostic: the largest R-hat computed by ArviZ, per variable.
az.rhat(trace).max()
# ## Basketball strategy leads to more complexity
#
#
#
#
# In[27]:
# 1 when the committing team's score is strictly below the disadvantaged
# team's score, else 0.  NOTE: this adds a column to `df` in place.
df['trailing_committing'] = (
    (df.score_committing < df.score_disadvantaged)
    .astype(np.int64)
)
# In[28]:
def make_foul_rate_yaxis(ax, label="Observed foul call rate"):
    """Label the y-axis of `ax` with `label` and format its major ticks with
    the module-level `pct_formatter`.

    Returns the same `ax` so that calls can be chained.
    """
    ax.set_ylabel(label)
    ax.yaxis.set_major_formatter(pct_formatter)
    return ax
# In[29]:
def make_time_axes(ax,
                   xlabel="Seconds remaining in game",
                   ylabel="Observed foul call rate"):
    """Style `ax` for a rate-versus-time-remaining plot.

    Sets the x label, inverts the x-axis, and delegates the y-axis label and
    tick formatting to `make_foul_rate_yaxis`. Returns the axes object that
    `make_foul_rate_yaxis` returns.
    """
    ax.set_xlabel(xlabel)
    ax.invert_xaxis()
    styled_ax = make_foul_rate_yaxis(ax, label=ylabel)
    return styled_ax
# In[30]:
# Mean of `foul_called` per second remaining, one column per value of
# `trailing_committing`, smoothed with a 20-row rolling mean and relabelled
# for the legend.
foul_rate_by_time = (
    df.pivot_table(values='foul_called',
                   index='seconds_left',
                   columns='trailing_committing')
    .rolling(20)
    .mean()
    .rename(columns={0: "No", 1: "Yes"})
    .rename_axis("Committing team is trailing", axis=1)
)
fig = make_time_axes(foul_rate_by_time.plot()).figure
# In[31]:
fig
# ### The shot clock
#
#
#
#
# Full model reference
#
# ### Are we measuring a skill?
#
#
#
#
#
#
#
#
#
# |
#
#
# |
#
#
#
#
#
#
# # Future Work
#
# * Advanced baseball stats
# * OPS
# * WAR
# * Park factors
# * Hockey
# * Faceoff skills
# * Fighting ability
# ## Thank you!
#
#
#
#
#
# ### [@AustinRochford](https://twitter.com/AustinRochford) • [arochford@monetate.com](mailto:arochford@monetate.com) • [austin.rochford@gmail.com](mailto:austin.rochford@gmail.com)
# In[33]:
# Export ./basketball-irt.ipynb as reveal.js slides via `jupyter nbconvert`,
# loading reveal.js 3.2.0 from the cdnjs CDN and writing the result under the
# output name `pp-sports-analytics-part-2`.
# NOTE(review): the output name does not match the notebook name; confirm that
# is intentional.
get_ipython().run_cell_magic('bash', '', 'jupyter nbconvert \\\n --to=slides \\\n --reveal-prefix=https://cdnjs.cloudflare.com/ajax/libs/reveal.js/3.2.0/ \\\n --output=pp-sports-analytics-part-2 \\\n ./basketball-irt.ipynb\n')
# In[ ]: