import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import sys
# Load the modeled shot data and split the training seasons into three
# shot-type buckets (restricted area, other 2pt, 3pt jump shots).
all_data = pd.read_csv('data_for_model.csv')

# For the 2016-17 season, train on 2012-13 through 2015-16.
training_seasons = [12, 13, 14, 15]
testing_season = 16

all_model_features = [
    'Period', 'StartScoreDifferential', 'Time', 'Putback', 'IsRegularSeason',
    'SecondsSincePlayStarted', 'ShotDistance', 'ShotAngle', 'is_OffDeadball',
    'is_OffFTMake', 'is_OffFTMiss', 'is_OffFTOreb', 'is_OffLiveBallTurnover',
    'is_OffTeamBlockedOreb', 'is_OffTeamOreb', 'is_OffTimeout',
    'is_OffBlockedOreb', 'is_OffBlock', 'is_OffMadeFG', 'is_OffOreb',
    'is_OffMissedFG',
]

training_data = all_data[all_data.Season.isin(training_seasons)]
# Shots inside 5 feet get their own model; threes beyond 35 ft (heaves)
# are excluded entirely.
at_rim = training_data[training_data.ShotDistance < 5]
non_rim_2pt = training_data[(training_data.ShotValue == 2) & (training_data.ShotDistance >= 5)]
jump_shots_3pt = training_data[(training_data.ShotValue == 3) & (training_data.ShotDistance < 35)]
def _make_shot_grid(xgb_model, catboost_model):
    """Wrap an XGBoost/CatBoost soft-voting ensemble in a 10-fold GridSearchCV.

    The parameter grid contains only the trivial equal-weight setting,
    so the grid search is used purely to obtain a cross-validated
    neg-log-loss score for the ensemble.

    Parameters: the two (unfitted) base classifiers.
    Returns: an unfitted GridSearchCV over the voting pipeline.
    """
    pipe = Pipeline([
        ('model', VotingClassifier(
            estimators=[
                ('xgb', xgb_model),
                ('catboost', catboost_model)
            ],
            voting='soft'
        ))
    ])
    # Single candidate: equal weighting of the two base models.
    param_grid = {'model__weights': [[1, 1]]}
    return GridSearchCV(pipe, cv=10, scoring='neg_log_loss', n_jobs=1,
                        param_grid=param_grid)


# --- Restricted-area (< 5 ft) model -------------------------------------
at_rim_xgboost_model = XGBClassifier(
    random_state=909,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=3
)
at_rim_catboost_classifier = CatBoostClassifier(
    bagging_temperature=1,
    rsm=0.1,
    learning_rate=0.1,
    depth=7,
    verbose=False,
    random_seed=909
)
at_rim_grid = _make_shot_grid(at_rim_xgboost_model, at_rim_catboost_classifier)
X_at_rim = at_rim[all_model_features]
y_at_rim = at_rim.Made
at_rim_grid.fit(X_at_rim.values, y_at_rim.values)
print('RA training log loss: %.4f' % at_rim_grid.best_score_)

# --- Non-restricted-area 2pt model --------------------------------------
non_rim_2pt_xgboost_model = XGBClassifier(
    random_state=909,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=5
)
non_rim_2pt_catboost_classifier = CatBoostClassifier(
    bagging_temperature=1,
    rsm=0.1,
    learning_rate=0.04,
    depth=5,
    verbose=False,
    random_seed=909
)
non_rim_2pt_grid = _make_shot_grid(non_rim_2pt_xgboost_model, non_rim_2pt_catboost_classifier)
X_non_rim_2pt = non_rim_2pt[all_model_features]
y_non_rim_2pt = non_rim_2pt.Made
non_rim_2pt_grid.fit(X_non_rim_2pt.values, y_non_rim_2pt.values)
print('2pt non-RA training log loss: %.4f' % non_rim_2pt_grid.best_score_)

# --- 3pt jump-shot model ------------------------------------------------
jump_shots_3pt_xgboost_model = XGBClassifier(
    random_state=909,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=3
)
jump_shots_3pt_catboost_classifier = CatBoostClassifier(
    bagging_temperature=1,
    rsm=0.1,
    learning_rate=0.08,
    depth=5,
    verbose=False,
    random_seed=909
)
jump_shots_3pt_grid = _make_shot_grid(jump_shots_3pt_xgboost_model, jump_shots_3pt_catboost_classifier)
X_jump_shots_3pt = jump_shots_3pt[all_model_features]
y_jump_shots_3pt = jump_shots_3pt.Made
jump_shots_3pt_grid.fit(X_jump_shots_3pt.values, y_jump_shots_3pt.values)
print('3pt training log loss: %.4f' % jump_shots_3pt_grid.best_score_)
# Hold-out evaluation on the test season, per shot type and overall.
testing_data = all_data[all_data.Season == testing_season]

# Same three shot buckets as training.
testing_at_rim = testing_data[testing_data.ShotDistance < 5]
testing_non_rim_2pt = testing_data[(testing_data.ShotValue == 2) & (testing_data.ShotDistance >= 5)]
testing_jump_shots_3pt = testing_data[(testing_data.ShotValue == 3) & (testing_data.ShotDistance < 35)]

X_test_at_rim = testing_at_rim[all_model_features]
y_test_at_rim = testing_at_rim.Made
X_test_non_rim_2pt = testing_non_rim_2pt[all_model_features]
y_test_non_rim_2pt = testing_non_rim_2pt.Made
X_test_jump_shots_3pt = testing_jump_shots_3pt[all_model_features]
y_test_jump_shots_3pt = testing_jump_shots_3pt.Made

# predict_proba returns one column per class: [P(miss), P(make)].
at_rim_predictions = at_rim_grid.predict_proba(X_test_at_rim.values)
at_rim_predictions_df = pd.DataFrame(at_rim_predictions, columns=['miss', 'make'])
print("RA testing log loss: %.4f" % log_loss(y_test_at_rim, at_rim_predictions_df.make.values))

non_rim_2pt_predictions = non_rim_2pt_grid.predict_proba(X_test_non_rim_2pt.values)
non_rim_2pt_predictions_df = pd.DataFrame(non_rim_2pt_predictions, columns=['miss', 'make'])
print("2pt non-RA testing log loss: %.4f" % log_loss(y_test_non_rim_2pt, non_rim_2pt_predictions_df.make.values))

jump_shot_3pt_predictions = jump_shots_3pt_grid.predict_proba(X_test_jump_shots_3pt.values)
jump_shot_3pt_predictions_df = pd.DataFrame(jump_shot_3pt_predictions, columns=['miss', 'make'])
print("3pt testing log loss: %.4f" % log_loss(y_test_jump_shots_3pt, jump_shot_3pt_predictions_df.make.values))

# Combined log loss across every modeled shot in the test season;
# predictions and labels are concatenated in the same bucket order.
test_prediction_df = pd.concat(
    [at_rim_predictions_df, non_rim_2pt_predictions_df, jump_shot_3pt_predictions_df],
    ignore_index=True
)
y_test = pd.concat([y_test_at_rim, y_test_non_rim_2pt, y_test_jump_shots_3pt])
print("All shots testing log loss: %.4f" % log_loss(y_test, test_prediction_df.make.values))
# Output: RA training log loss: -0.6247 2pt non-RA training log loss: -0.6685 3pt training log loss: -0.6486 RA testing log loss: 0.6197 2pt non-RA testing log loss: 0.6739 3pt testing log loss: 0.6503 All shots testing log loss: 0.6478
# SHAP explanations for the 3pt ensemble: explain each base model
# separately, then average the attributions
# (per https://github.com/slundberg/shap/issues/112).
import shap

shap.initjs()

# The fitted VotingClassifier holds its base estimators in declaration
# order: XGBoost first, CatBoost second.
xgb_3pt_model, catboost_3pt_model = jump_shots_3pt_grid.best_estimator_.named_steps['model'].estimators_
xgb_3pt_explainer = shap.TreeExplainer(xgb_3pt_model)
catboost_3pt_explainer = shap.TreeExplainer(catboost_3pt_model)

xgb_3pt_shap_values = xgb_3pt_explainer.shap_values(X_test_jump_shots_3pt.values)
catboost_3pt_shap_values = catboost_3pt_explainer.shap_values(X_test_jump_shots_3pt.values)

# Element-wise mean of per-model SHAP values / expected values stands in
# for the equal-weight ensemble's attribution.
shap_values_3pt = np.mean([xgb_3pt_shap_values, catboost_3pt_shap_values], axis=0)
mean_ev_3pt = np.mean([xgb_3pt_explainer.expected_value, catboost_3pt_explainer.expected_value])

# Row with the highest predicted make probability (notebook displays it).
max_index_3pt = jump_shot_3pt_predictions_df.make.idxmax()
jump_shot_3pt_predictions_df.iloc[max_index_3pt]
# Output: miss 0.41225 make 0.58775 Name: 43539, dtype: float64
# Visualize the max prediction's explanation - link='logit' shows probability values above graph
shap.force_plot(mean_ev_3pt, shap_values_3pt[max_index_3pt,:], X_test_jump_shots_3pt.iloc[max_index_3pt,:], link='logit')
# Beeswarm summary of per-feature SHAP values across all 3pt test shots.
shap.summary_plot(shap_values_3pt, X_test_jump_shots_3pt.values, feature_names=all_model_features)
# Bar chart of mean |SHAP| per feature: global feature importance for the 3pt model.
shap.summary_plot(shap_values_3pt, X_test_jump_shots_3pt.values, plot_type="bar", feature_names=all_model_features)
# SHAP explanations for the non-RA 2pt ensemble, same averaging scheme as
# the 3pt section (https://github.com/slundberg/shap/issues/112).
xgb_2pt_non_rim_model, catboost_2pt_non_rim_model = non_rim_2pt_grid.best_estimator_.named_steps['model'].estimators_
xgb_2pt_non_rim_explainer = shap.TreeExplainer(xgb_2pt_non_rim_model)
catboost_2pt_non_rim_explainer = shap.TreeExplainer(catboost_2pt_non_rim_model)

xgb_2pt_non_rim_shap_values = xgb_2pt_non_rim_explainer.shap_values(X_test_non_rim_2pt.values)
catboost_2pt_non_rim_shap_values = catboost_2pt_non_rim_explainer.shap_values(X_test_non_rim_2pt.values)

# Element-wise mean across the two base models approximates the
# equal-weight ensemble's attribution.
shap_values_2pt_non_rim = np.mean([xgb_2pt_non_rim_shap_values, catboost_2pt_non_rim_shap_values], axis=0)
mean_ev_2pt_non_rim = np.mean([xgb_2pt_non_rim_explainer.expected_value, catboost_2pt_non_rim_explainer.expected_value])

# Row with the highest predicted make probability (notebook displays it).
max_index_2pt_non_rim = non_rim_2pt_predictions_df.make.idxmax()
non_rim_2pt_predictions_df.iloc[max_index_2pt_non_rim]
# Output: miss 0.396427 make 0.603573 Name: 15281, dtype: float64
# Explanation of the single most confident make - link='logit' shows probabilities above the graph.
shap.force_plot(mean_ev_2pt_non_rim, shap_values_2pt_non_rim[max_index_2pt_non_rim,:], X_test_non_rim_2pt.iloc[max_index_2pt_non_rim,:], link='logit')
# Beeswarm summary of per-feature SHAP values across all non-RA 2pt test shots.
shap.summary_plot(shap_values_2pt_non_rim, X_test_non_rim_2pt.values, feature_names=all_model_features)
# Bar chart of mean |SHAP| per feature: global importance for the non-RA 2pt model.
shap.summary_plot(shap_values_2pt_non_rim, X_test_non_rim_2pt.values, plot_type="bar", feature_names=all_model_features)
# SHAP explanations for the restricted-area ensemble, same averaging
# scheme as above (https://github.com/slundberg/shap/issues/112).
xgb_at_rim_model, catboost_at_rim_model = at_rim_grid.best_estimator_.named_steps['model'].estimators_
xgb_at_rim_explainer = shap.TreeExplainer(xgb_at_rim_model)
catboost_at_rim_explainer = shap.TreeExplainer(catboost_at_rim_model)

xgb_at_rim_shap_values = xgb_at_rim_explainer.shap_values(X_test_at_rim.values)
catboost_at_rim_shap_values = catboost_at_rim_explainer.shap_values(X_test_at_rim.values)

# Element-wise mean across the two base models approximates the
# equal-weight ensemble's attribution.
shap_values_at_rim = np.mean([xgb_at_rim_shap_values, catboost_at_rim_shap_values], axis=0)
mean_ev_at_rim = np.mean([xgb_at_rim_explainer.expected_value, catboost_at_rim_explainer.expected_value])

# Row with the highest predicted make probability (notebook displays it).
max_index_at_rim = at_rim_predictions_df.make.idxmax()
at_rim_predictions_df.iloc[max_index_at_rim]
# Output: miss 0.017759 make 0.982241 Name: 51120, dtype: float64
# Explanation of the single most confident make - link='logit' shows probabilities above the graph.
shap.force_plot(mean_ev_at_rim, shap_values_at_rim[max_index_at_rim,:], X_test_at_rim.iloc[max_index_at_rim,:], link='logit')
# Beeswarm summary of per-feature SHAP values across all restricted-area test shots.
shap.summary_plot(shap_values_at_rim, X_test_at_rim.values, feature_names=all_model_features)
# Bar chart of mean |SHAP| per feature: global importance for the at-rim model.
shap.summary_plot(shap_values_at_rim, X_test_at_rim.values, plot_type="bar", feature_names=all_model_features)
# Record the environment used to produce the results above.  The values
# captured from the original notebook run are preserved as trailing
# comments (they were raw output lines, which made the script invalid
# Python when run outside the notebook).
print(pd.__version__)        # 0.23.3
print(np.__version__)        # 1.14.5
import sklearn
print(sklearn.__version__)   # 0.19.2
import catboost
print(catboost.__version__)  # 0.9.1.1
import xgboost
print(xgboost.__version__)   # 0.72.1
print(sys.version_info)      # sys.version_info(major=3, minor=6, micro=1, releaselevel='final', serial=0)