In [1]:
%matplotlib inline
import pymc3 as pm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

Import team data for 2016/2017 from Sports Reference.

In [2]:
team_data = pd.read_csv('data/teams2017.csv', index_col=0)
team_data.index = np.arange(team_data.shape[0])
team_data.head()
School G W L W-L% SRS SOS W.1 L.1 W.2 ... FT FTA FT% ORB TRB AST STL BLK TOV PF
0 Abilene Christian 29 13 16 0.448 -12.00 -7.24 7 11 9 ... 360 543 0.663 221 904 408 210 83 412 631
1 Air Force 33 12 21 0.364 -2.92 1.14 4 14 11 ... 513 715 0.717 308 1149 494 197 57 397 594
2 Akron 34 26 8 0.765 3.97 -1.51 14 4 13 ... 470 691 0.680 334 1173 499 185 110 396 616
3 Alabama A&M 29 2 27 0.069 -26.73 -10.83 2 16 1 ... 411 614 0.669 292 914 326 100 50 383 526
4 Alabama-Birmingham 33 17 16 0.515 -1.05 -1.71 9 9 11 ... 496 674 0.736 322 1188 503 157 160 444 527

5 rows × 33 columns

Clean up school labels

In [3]:
team_data['School'] = [''.join(x).rstrip() for x in team_data.School.str.split('*')]
team_data.replace({'School':{'VMI':'Virginia Military Institute',
                            'SIU Edwardsville':'Southern Illinois-Edwardsville'}}, 
                  inplace=True)

Import individual game data for 2016/17 from Spreadsheet Sports.

In [4]:
game_data = pd.read_excel('data/2017 Game Results Data.xlsx')
game_data.head()
Date Team Team Location Team Score Opponent Opponent Score Opponent Location Neutral Site Team Result Team Margin Team Differential Opponent Differential Game Type
0 2016-11-11 Pepperdine Home 77 Cal Poly 68 Away NaN Win 9 -11.290323 -5.161290 Division 1
1 2016-11-11 Eastern Michigan Away 90 Pittsburgh 93 Home NaN Loss -3 4.757576 -1.818182 Division 1
2 2016-11-11 Penn State Home 81 Albany (NY) 87 Away NaN Loss -6 -1.000000 5.852941 Division 1
3 2016-11-11 Cal Poly Away 68 Pepperdine 77 Home NaN Loss -9 -5.161290 -11.290323 Division 1
4 2016-11-11 Portland Home 71 UC-Riverside 55 Away NaN Win 16 -5.090909 -4.172414 Division 1

Identify all teams in the game data dataset.

In [5]:
all_teams = set(game_data.Team).union(set(game_data.Opponent))

See differene between team data and game data teams

In [6]:
diff_teams = all_teams.difference(team_data.School)

Remove teams in the game_data dataset that are not in team_data (mostly non-Division I teams).

In [7]:
game_data = game_data[~game_data.Opponent.isin(diff_teams)]

Histogram of game margins

In [8]:
game_data['Team Margin'].hist(bins=40)
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x1168dc080>

Merge "Team Differential" column from game_data.

In [9]:
team_data = (team_data.merge(game_data[['Team', 'Team Differential']].drop_duplicates(subset='Team'), 
                left_on='School', right_on='Team')
                     .drop('Team', axis=1)
                     .rename(columns={'Team Differential':'TD'}))
In [10]:
team_data.head()
School G W L W-L% SRS SOS W.1 L.1 W.2 ... FTA FT% ORB TRB AST STL BLK TOV PF TD
0 Abilene Christian 29 13 16 0.448 -12.00 -7.24 7 11 9 ... 543 0.663 221 904 408 210 83 412 631 -2.206897
1 Air Force 33 12 21 0.364 -2.92 1.14 4 14 11 ... 715 0.717 308 1149 494 197 57 397 594 -2.000000
2 Akron 34 26 8 0.765 3.97 -1.51 14 4 13 ... 691 0.680 334 1173 499 185 110 396 616 6.911765
3 Alabama A&M 29 2 27 0.069 -26.73 -10.83 2 16 1 ... 614 0.669 292 914 326 100 50 383 526 -15.965517
4 Alabama-Birmingham 33 17 16 0.515 -1.05 -1.71 9 9 11 ... 674 0.736 322 1188 503 157 160 444 527 1.606061

5 rows × 34 columns

We will normalize our predictors.

In [11]:
normalize = lambda x: (x - x.mean())/x.std()
In [12]:
predictor_cols = ['FG%','3P','3P%','FT%','ORB','TRB','AST','STL','BLK','TOV','PF','TD']
X = normalize(team_data[predictor_cols])
pd.scatter_matrix(X);

Lookup table for encoding schools as integers

In [13]:
team_lookup = dict(team_data.School)
reverse_team_lookup = {team_lookup[k]:k for k in team_lookup}
In [14]:
game_data['team_ind'] = game_data.Team.replace(reverse_team_lookup)
game_data['opponent_ind'] = game_data.Opponent.replace(reverse_team_lookup)
In [15]:
y = game_data['Team Margin'].values

Specify model

In [16]:
with pm.Model() as model:
    
    # Predictor coefficients
    β = pm.Normal('β', 0, sd=100, shape=len(predictor_cols))
    # Observation error
    σ = pm.HalfCauchy('σ', 3)
    
    # Team strength parameters
    θ = pm.Deterministic('θ', β.dot(X.T))
    
    # Expected game margin
    δ = θ[game_data['team_ind'].values] - θ[game_data['opponent_ind'].values]
    
    # Likelihood
    pm.Normal('outcome', δ, sd=σ, observed=y)
In [17]:
with model:
    trace = pm.sample(5000)
Auto-assigning NUTS sampler...
Initializing NUTS using advi...
  9%|▊         | 17482/200000 [00:06<01:03, 2851.85it/s]Median ELBO converged.
Finished [100%]: Average ELBO = -1.1587e+05

Evidence of divergence detected, inspect ELBO.
100%|██████████| 5000/5000 [01:02<00:00, 79.58it/s]
In [18]:
pm.forestplot(trace[1000:], varnames=['β'], ylabels=predictor_cols)
Out[18]:
<matplotlib.gridspec.GridSpec at 0x121c9a208>
In [19]:
pm.forestplot(trace[1000:], varnames=['β'], ylabels=predictor_cols)
Out[19]:
<matplotlib.gridspec.GridSpec at 0x11f2c8780>

Team strength parameters

In [20]:
plt.figure(figsize=(10,48))
pm.forestplot(trace[1000:], varnames=['θ'], ylabels=team_data.School.values)
Out[20]:
<matplotlib.gridspec.GridSpec at 0x121326278>

Posterior predictive checks

In [21]:
with model:
    ppc = pm.sample_ppc(trace, samples=500)
100%|██████████| 500/500 [00:03<00:00, 131.76it/s]
In [22]:
result = ppc['outcome']
result.shape
Out[22]:
(500, 10790)
In [23]:
plt.hist((result < y).mean(0));