%matplotlib inline
import pymc3 as pm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
Import team data for 2016/2017 from Sports Reference.
# Load the 2016/17 team-level season statistics (Sports Reference export).
# Replace the file's own index with a plain 0..n-1 integer range so that a
# team's row position can later double as its integer code in the model.
team_data = pd.read_csv('data/teams2017.csv', index_col=0)
team_data.index = np.arange(len(team_data))
team_data.head()
School | G | W | L | W-L% | SRS | SOS | W.1 | L.1 | W.2 | ... | FT | FTA | FT% | ORB | TRB | AST | STL | BLK | TOV | PF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Abilene Christian | 29 | 13 | 16 | 0.448 | -12.00 | -7.24 | 7 | 11 | 9 | ... | 360 | 543 | 0.663 | 221 | 904 | 408 | 210 | 83 | 412 | 631 |
1 | Air Force | 33 | 12 | 21 | 0.364 | -2.92 | 1.14 | 4 | 14 | 11 | ... | 513 | 715 | 0.717 | 308 | 1149 | 494 | 197 | 57 | 397 | 594 |
2 | Akron | 34 | 26 | 8 | 0.765 | 3.97 | -1.51 | 14 | 4 | 13 | ... | 470 | 691 | 0.680 | 334 | 1173 | 499 | 185 | 110 | 396 | 616 |
3 | Alabama A&M | 29 | 2 | 27 | 0.069 | -26.73 | -10.83 | 2 | 16 | 1 | ... | 411 | 614 | 0.669 | 292 | 914 | 326 | 100 | 50 | 383 | 526 |
4 | Alabama-Birmingham | 33 | 17 | 16 | 0.515 | -1.05 | -1.71 | 9 | 9 | 11 | ... | 496 | 674 | 0.736 | 322 | 1188 | 503 | 157 | 160 | 444 | 527 |
5 rows × 33 columns
Clean up school labels
# Strip every '*' (Sports Reference's tournament marker) and any trailing
# whitespace from the school names.
team_data['School'] = [name.replace('*', '').rstrip() for name in team_data.School]
# Reconcile two schools whose names are spelled differently in the
# game-level dataset so the two tables can be joined on school name.
name_fixes = {'VMI': 'Virginia Military Institute',
              'SIU Edwardsville': 'Southern Illinois-Edwardsville'}
team_data['School'] = team_data['School'].replace(name_fixes)
Import individual game data for 2016/17 from Spreadsheet Sports.
# Load game-by-game results for the 2016/17 season (Spreadsheet Sports
# export): one row per team per game, so each game appears twice — once
# from each team's perspective.
game_data = pd.read_excel('data/2017 Game Results Data.xlsx')
game_data.head()
Date | Team | Team Location | Team Score | Opponent | Opponent Score | Opponent Location | Neutral Site | Team Result | Team Margin | Team Differential | Opponent Differential | Game Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-11-11 | Pepperdine | Home | 77 | Cal Poly | 68 | Away | NaN | Win | 9 | -11.290323 | -5.161290 | Division 1 |
1 | 2016-11-11 | Eastern Michigan | Away | 90 | Pittsburgh | 93 | Home | NaN | Loss | -3 | 4.757576 | -1.818182 | Division 1 |
2 | 2016-11-11 | Penn State | Home | 81 | Albany (NY) | 87 | Away | NaN | Loss | -6 | -1.000000 | 5.852941 | Division 1 |
3 | 2016-11-11 | Cal Poly | Away | 68 | Pepperdine | 77 | Home | NaN | Loss | -9 | -5.161290 | -11.290323 | Division 1 |
4 | 2016-11-11 | Portland | Home | 71 | UC-Riverside | 55 | Away | NaN | Win | 16 | -5.090909 | -4.172414 | Division 1 |
Identify all teams in the game data dataset.
all_teams = set(game_data.Team).union(set(game_data.Opponent))
See the difference between the team-data and game-data team names.
diff_teams = all_teams.difference(team_data.School)
Remove teams from the game_data dataset that are not in team_data (mostly non-Division I teams).
game_data = game_data[~game_data.Opponent.isin(diff_teams)]
Histogram of game margins
game_data['Team Margin'].hist(bins=40)
<matplotlib.axes._subplots.AxesSubplot at 0x1168dc080>
Merge the "Team Differential" column from game_data into team_data.
# Attach each school's season-long "Team Differential" to the team table.
# The game file repeats the same value on every one of a team's rows, so a
# single deduplicated row per team is all the merge needs.
team_differentials = (game_data[['Team', 'Team Differential']]
                      .drop_duplicates(subset='Team'))
team_data = team_data.merge(team_differentials,
                            left_on='School', right_on='Team')
team_data = (team_data.drop('Team', axis=1)
             .rename(columns={'Team Differential': 'TD'}))
team_data.head()
School | G | W | L | W-L% | SRS | SOS | W.1 | L.1 | W.2 | ... | FTA | FT% | ORB | TRB | AST | STL | BLK | TOV | PF | TD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Abilene Christian | 29 | 13 | 16 | 0.448 | -12.00 | -7.24 | 7 | 11 | 9 | ... | 543 | 0.663 | 221 | 904 | 408 | 210 | 83 | 412 | 631 | -2.206897 |
1 | Air Force | 33 | 12 | 21 | 0.364 | -2.92 | 1.14 | 4 | 14 | 11 | ... | 715 | 0.717 | 308 | 1149 | 494 | 197 | 57 | 397 | 594 | -2.000000 |
2 | Akron | 34 | 26 | 8 | 0.765 | 3.97 | -1.51 | 14 | 4 | 13 | ... | 691 | 0.680 | 334 | 1173 | 499 | 185 | 110 | 396 | 616 | 6.911765 |
3 | Alabama A&M | 29 | 2 | 27 | 0.069 | -26.73 | -10.83 | 2 | 16 | 1 | ... | 614 | 0.669 | 292 | 914 | 326 | 100 | 50 | 383 | 526 | -15.965517 |
4 | Alabama-Birmingham | 33 | 17 | 16 | 0.515 | -1.05 | -1.71 | 9 | 9 | 11 | ... | 674 | 0.736 | 322 | 1188 | 503 | 157 | 160 | 444 | 527 | 1.606061 |
5 rows × 34 columns
We will normalize our predictors.
normalize = lambda x: (x - x.mean())/x.std()
# Columns used as team-strength predictors: shooting, rebounding, ball
# control, defensive stats, and the game-derived scoring differential (TD).
predictor_cols = ['FG%','3P','3P%','FT%','ORB','TRB','AST','STL','BLK','TOV','PF','TD']
# Standardized design matrix: one row per team, one column per predictor.
X = normalize(team_data[predictor_cols])
# Pairwise scatter plots to eyeball collinearity among the predictors.
# NOTE: pd.scatter_matrix was deprecated in pandas 0.20 and later removed;
# the supported entry point is pandas.plotting.scatter_matrix.
pd.plotting.scatter_matrix(X);
Lookup table for encoding schools as integers
# Integer code -> school name: a team's code is its row position in
# team_data (the index was set to 0..n-1 when the table was loaded).
team_lookup = dict(enumerate(team_data.School))
# School name -> integer code, for encoding the game table.
reverse_team_lookup = {school: code for code, school in team_lookup.items()}
game_data['team_ind'] = game_data.Team.replace(reverse_team_lookup)
game_data['opponent_ind'] = game_data.Opponent.replace(reverse_team_lookup)
# Observed outcome: the margin of each game from the Team's perspective.
y = game_data['Team Margin'].values
Specify model
# Linear model of latent team strength: each team's strength θ is a linear
# combination of its standardized season statistics, and a game's expected
# margin is the strength gap between the two teams involved.
with pm.Model() as model:
    # Predictor coefficients: one weight per column of X, weakly
    # informative prior (sd=100 on standardized predictors)
    β = pm.Normal('β', 0, sd=100, shape=len(predictor_cols))
    # Observation error: game-to-game noise around the expected margin
    σ = pm.HalfCauchy('σ', 3)
    # Team strength parameters: θ[i] = β · x_i, deterministic given β,
    # recorded in the trace for the per-team forest plot below
    θ = pm.Deterministic('θ', β.dot(X.T))
    # Expected game margin: team strength minus opponent strength,
    # indexed by the integer team codes built above
    δ = θ[game_data['team_ind'].values] - θ[game_data['opponent_ind'].values]
    # Likelihood: observed margins are Normal around the expected margin
    pm.Normal('outcome', δ, sd=σ, observed=y)
# Draw 5000 posterior samples; PyMC3 auto-assigns NUTS (initialized via
# ADVI, per the log output below).
with model:
    trace = pm.sample(5000)
Auto-assigning NUTS sampler... Initializing NUTS using advi... 9%|▊ | 17482/200000 [00:06<01:03, 2851.85it/s]Median ELBO converged. Finished [100%]: Average ELBO = -1.1587e+05 Evidence of divergence detected, inspect ELBO. 100%|██████████| 5000/5000 [01:02<00:00, 79.58it/s]
pm.forestplot(trace[1000:], varnames=['β'], ylabels=predictor_cols)
<matplotlib.gridspec.GridSpec at 0x121c9a208>
pm.forestplot(trace[1000:], varnames=['β'], ylabels=predictor_cols)
<matplotlib.gridspec.GridSpec at 0x11f2c8780>
Team strength parameters
# Forest plot of the per-team strength parameters θ, one row per school;
# a very tall figure so all ~350 team labels remain readable.
plt.figure(figsize=(10,48))
pm.forestplot(trace[1000:], varnames=['θ'], ylabels=team_data.School.values)
<matplotlib.gridspec.GridSpec at 0x121326278>
Posterior predictive checks
# Posterior predictive check: simulate 500 replicated sets of game margins
# from the fitted model.
with model:
    ppc = pm.sample_ppc(trace, samples=500)
100%|██████████| 500/500 [00:03<00:00, 131.76it/s]
# Simulated margins: one row per posterior predictive draw, one column per
# game (shape shown in the output below).
result = ppc['outcome']
result.shape
(500, 10790)
plt.hist((result < y).mean(0));