#!/usr/bin/env python
# coding: utf-8

# Notebook export: fits a linear team-strength model to 2016/17 game margins
# with PyMC3.  Each team's strength θ is a linear combination of its
# normalized season statistics, and the expected margin of a game is the
# difference between the two teams' strengths.

# In[1]:


try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # `get_ipython` only exists inside IPython/Jupyter; when this file is run
    # as a plain script there is no inline backend to enable.
    pass

import pymc3 as pm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

# Number of leading samples discarded from the trace before plotting.
BURN_IN = 1000


# Import team data for 2016/2017 from [Sports Reference](http://www.sports-reference.com/cbb/seasons/2017-school-stats.html).

# In[2]:


team_data = pd.read_csv('data/teams2017.csv', index_col=0)
# Replace the index read from the file with consecutive integers 0..n-1.
team_data.index = np.arange(team_data.shape[0])
team_data.head()


# Clean up school labels

# In[3]:


# Drop every '*' from the school names and strip trailing whitespace.
team_data['School'] = [''.join(x).rstrip()
                       for x in team_data.School.str.split('*')]
# Rename the schools whose labels differ between the two data sources.
team_data.replace(
    {'School': {'VMI': 'Virginia Military Institute',
                'SIU Edwardsville': 'Southern Illinois-Edwardsville'}},
    inplace=True)


# Import individual game data for 2016/17 from [Spreadsheet Sports](https://www.spreadsheetsports.com).

# In[4]:


game_data = pd.read_excel('data/2017 Game Results Data.xlsx')
game_data.head()


# Identify all teams in the game data dataset.

# In[5]:


all_teams = set(game_data.Team).union(set(game_data.Opponent))


# See difference between team data and game data teams

# In[6]:


diff_teams = all_teams.difference(team_data.School)


# Remove teams in the `game_data` dataset that are not in `team_data` (mostly non-Division I teams).

# In[7]:


# Check both sides of every game, not just the opponent, and take an explicit
# copy so that columns added further down do not write into a slice.
involves_unknown_team = (game_data.Team.isin(diff_teams)
                         | game_data.Opponent.isin(diff_teams))
game_data = game_data[~involves_unknown_team].copy()


# Histogram of game margins

# In[8]:


game_data['Team Margin'].hist(bins=40)


# Merge "Team Differential" column from `game_data`.

# In[9]:


# One 'Team Differential' value per team (first occurrence wins).
team_differential = (game_data[['Team', 'Team Differential']]
                     .drop_duplicates(subset='Team'))
# NOTE: this is an inner merge, so schools that never appear in the `Team`
# column of `game_data` are dropped from `team_data` here.
team_data = (team_data.merge(team_differential,
                             left_on='School', right_on='Team')
             .drop('Team', axis=1)
             .rename(columns={'Team Differential': 'TD'}))


# In[10]:


team_data.head()


# We will normalize our predictors.

# In[11]:


def normalize(x):
    """Return `x` centered on its mean and scaled by its standard deviation."""
    return (x - x.mean()) / x.std()


# In[12]:


predictor_cols = ['FG%', '3P', '3P%', 'FT%', 'ORB', 'TRB', 'AST', 'STL',
                  'BLK', 'TOV', 'PF', 'TD']
X = normalize(team_data[predictor_cols])
pd.plotting.scatter_matrix(X);


# Lookup table for encoding schools as integers

# In[13]:


# Row index -> school name, and the inverse school name -> row index.
team_lookup = dict(team_data.School)
reverse_team_lookup = {school: idx for idx, school in team_lookup.items()}


# In[14]:


# Keep only games whose two teams both survived the merge above; otherwise
# the integer encoding below would contain missing values and could not be
# used to index θ.
both_teams_known = (game_data.Team.isin(team_data.School)
                    & game_data.Opponent.isin(team_data.School))
game_data = game_data[both_teams_known].copy()
game_data['team_ind'] = game_data.Team.map(reverse_team_lookup)
game_data['opponent_ind'] = game_data.Opponent.map(reverse_team_lookup)


# In[15]:


y = game_data['Team Margin'].values


# Specify model

# In[16]:


with pm.Model() as model:

    # Predictor coefficients
    β = pm.Normal('β', 0, sd=100, shape=len(predictor_cols))

    # Observation error
    σ = pm.HalfCauchy('σ', 3)

    # Team strength parameters: one value per row of `team_data`
    θ = pm.Deterministic('θ', β.dot(X.values.T))

    # Expected game margin: strength of `Team` minus strength of `Opponent`
    δ = θ[game_data['team_ind'].values] - θ[game_data['opponent_ind'].values]

    # Likelihood
    pm.Normal('outcome', δ, sd=σ, observed=y)


# In[17]:


with model:
    trace = pm.sample(5000)


# In[18]:


pm.forestplot(trace[BURN_IN:], varnames=['β'], ylabels=predictor_cols)


# Team strength parameters

# In[20]:


plt.figure(figsize=(10, 48))
pm.forestplot(trace[BURN_IN:], varnames=['θ'],
              ylabels=team_data.School.values)


# Posterior predictive checks

# In[21]:


with model:
    ppc = pm.sample_ppc(trace, samples=500)


# In[22]:


result = ppc['outcome']
result.shape


# In[23]:


# Averaging over axis 0 gives, for each observed margin, the fraction of
# simulated outcomes that fall below it (presumably axis 0 indexes the
# posterior predictive draws; confirm against `result.shape`).
plt.hist((result < y).mean(0));