#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Notebook setup. `get_ipython` is neither defined nor imported in this file; it is
# expected to be injected by IPython/Jupyter, so this line fails under a plain
# Python interpreter.
get_ipython().run_line_magic('matplotlib', 'inline')
import pymc3 as pm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'notebook' plotting context to the figures drawn below.
sns.set_context('notebook')


# Import team data for 2016/2017 from [Sports Reference](http://www.sports-reference.com/cbb/seasons/2017-school-stats.html).

# In[2]:


# Read the per-school season statistics. The first CSV column is consumed as the
# index and then thrown away in favour of a plain 0..n-1 positional index.
team_data = pd.read_csv('data/teams2017.csv', index_col=0).reset_index(drop=True)
team_data.head()


# Clean up school labels

# In[3]:


# Remove every '*' from the school names and trim trailing whitespace.
team_data['School'] = [name.replace('*', '').rstrip() for name in team_data.School]

# Spell out two abbreviated school names (presumably so they match the names
# used in the game data -- confirm against that dataset).
school_aliases = {'VMI': 'Virginia Military Institute',
                  'SIU Edwardsville': 'Southern Illinois-Edwardsville'}
team_data['School'] = team_data['School'].replace(school_aliases)


# Import individual game data for 2016/17 from [Spreadsheet Sports](https://www.spreadsheetsports.com).

# In[4]:


# Read the game-by-game results workbook using pandas' default sheet selection.
game_data = pd.read_excel(io='data/2017 Game Results Data.xlsx')
game_data.head()


# Identify all teams in the game data dataset.

# In[5]:


# Every distinct name that appears on either side of a game.
all_teams = set(game_data.Team) | set(game_data.Opponent)


# Find the teams that appear in the game data but not in the team data

# In[6]:


# Names that occur in the game data but have no row in `team_data`.
diff_teams = all_teams - set(team_data.School)


# Remove teams in the `game_data` dataset that are not in `team_data` (mostly non-Division I teams).

# In[7]:


# Keep only games in which BOTH sides have a row in `team_data`. Checking the
# opponent alone would let through rows whose `Team` is unknown, and those names
# cannot be turned into integer team indices later on.
has_unknown_side = game_data.Team.isin(diff_teams) | game_data.Opponent.isin(diff_teams)
# `.copy()` makes the result an independent frame, so columns can be added to it
# later without pandas' SettingWithCopyWarning.
game_data = game_data[~has_unknown_side].copy()


# Histogram of game margins

# In[8]:


# Distribution of the per-game 'Team Margin' values, drawn with 40 bins.
game_data.loc[:, 'Team Margin'].hist(bins=40)


# Merge the "Team Differential" column from `game_data` into `team_data` (renamed to `TD`).

# In[9]:


# One 'Team Differential' value per team: the first row seen for each team wins.
team_differential = game_data[['Team', 'Team Differential']].drop_duplicates(subset='Team')

# Default (inner) merge on the school name: schools with no matching `Team` row
# are dropped and the result gets a fresh 0..n-1 index.
merged = team_data.merge(team_differential, left_on='School', right_on='Team')

# The join key from the right-hand side is redundant with 'School'.
team_data = merged.drop('Team', axis=1).rename(columns={'Team Differential': 'TD'})


# In[10]:


team_data.head()


# We will normalize our predictors.

# In[11]:


def normalize(x):
    """Return *x* centred on its mean and scaled by its standard deviation.

    *x* may be any object exposing ``mean()`` and ``std()`` and supporting
    arithmetic with their results (e.g. a pandas Series or DataFrame, in
    which case each column is standardized independently). The standard
    deviation is whatever ``x.std()`` returns for that type; no guard is
    applied for a zero standard deviation.
    """
    return (x - x.mean()) / x.std()


# In[12]:


# Columns of `team_data` used as team-level predictors in the model.
predictor_cols = ['FG%', '3P', '3P%', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'TD']
# Standardize each predictor column.
X = normalize(team_data[predictor_cols])
# `pd.scatter_matrix` was a deprecated top-level alias that later pandas releases
# removed; the supported location is `pd.plotting.scatter_matrix`.
pd.plotting.scatter_matrix(X);


# Lookup table for encoding schools as integers

# In[13]:


# Row index -> school name, taken straight from the `School` column.
team_lookup = team_data.School.to_dict()
# School name -> row index (the inverse mapping).
reverse_team_lookup = {school: row for row, school in team_lookup.items()}


# In[14]:


def _encode_teams(names, lookup):
    """Translate a Series of school names into integer codes via *lookup*.

    Raises ValueError listing every name that has no entry in *lookup*,
    instead of silently leaving the name in place as a string.
    """
    codes = names.map(lookup)
    unmapped = sorted(map(str, set(names[codes.isnull()])))
    if unmapped:
        raise ValueError('No team index found for: ' + ', '.join(unmapped))
    return codes.astype(int)


# Integer positions of each game's team and opponent, used to index the
# per-team strength vector in the model.
game_data['team_ind'] = _encode_teams(game_data.Team, reverse_team_lookup)
game_data['opponent_ind'] = _encode_teams(game_data.Opponent, reverse_team_lookup)


# In[15]:


# Observed response: the 'Team Margin' column as a NumPy array.
y = np.asarray(game_data['Team Margin'])


# Specify model

# In[16]:


with pm.Model() as model:

    # Predictor coefficients: one weight per entry of `predictor_cols`
    β = pm.Normal('β', mu=0, sd=100, shape=len(predictor_cols))
    # Observation error: scale of the noise around the expected margin
    σ = pm.HalfCauchy('σ', beta=3)

    # Team strength parameters: the coefficients applied to each row of X
    θ = pm.Deterministic('θ', β.dot(X.T))

    # Expected game margin: strength of the row's team minus its opponent's
    team_idx = game_data['team_ind'].values
    opponent_idx = game_data['opponent_ind'].values
    δ = θ[team_idx] - θ[opponent_idx]

    # Likelihood of the observed margins
    pm.Normal('outcome', mu=δ, sd=σ, observed=y)


# In[17]:


# Draw 5000 posterior samples; step method and initialisation are left to
# pymc3's defaults.
with model:
    trace = pm.sample(draws=5000)


# In[18]:


# Forest plot of the predictor coefficients, skipping the first 1000 draws
# (treated here as burn-in).
kept_draws = trace[1000:]
pm.forestplot(kept_draws, varnames=['β'], ylabels=predictor_cols)


# In[19]:


# Same plot again (this cell repeats the previous one).
kept_draws = trace[1000:]
pm.forestplot(kept_draws, varnames=['β'], ylabels=predictor_cols)


# Team strength parameters

# In[20]:


# Tall figure so that one label per school fits on the y-axis.
plt.figure(figsize=(10, 48))
school_labels = team_data.School.values
# Forest plot of the team strengths, skipping the first 1000 draws.
pm.forestplot(trace[1000:], varnames=['θ'], ylabels=school_labels)


# Posterior predictive checks

# In[21]:


# Simulate 500 posterior predictive data sets. The first 1000 draws are skipped
# so the predictive check uses the same portion of the trace as the forest
# plots, which all use `trace[1000:]`.
with model:
    ppc = pm.sample_ppc(trace[1000:], samples=500)


# In[22]:


# Simulated values for the 'outcome' variable.
result = ppc['outcome']
np.shape(result)


# In[23]:


# For each column of `result`, the share of simulated values that fall below the
# observed margin (assumes one row per simulated data set -- confirm via the
# shape above); the histogram shows how those shares are distributed.
below_observed = result < y
plt.hist(below_observed.mean(axis=0));