%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
import xgboost
import shap
import pandas as pd
import numpy as np
# load JS visualization code to notebook
shap.initjs()
# train an XGBoost model on the Boston housing data (used again for the comparison at the end)
X,y = shap.datasets.boston()
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 1000)
PATH = "data/bulldozers/"
df_raw = pd.read_feather('tmp/bulldozers-raw')
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
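proc_df (from fastai.structured) numericalizes the categorical columns, fills missing numeric values, and splits off the dependent variable; the `nas` dict records which columns were imputed, which is why the summary table below ends with *_na indicator columns. You can inspect it directly:
nas  # maps each imputed column to the fill value proc_df used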
def split_vals(a,n): return a[:n], a[n:]
n_valid = 12000
n_trn = len(df_trn)-n_valid
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)
def rmse(x,y): return math.sqrt(((x-y)**2).mean())
def print_score(m):
    # [train RMSE, valid RMSE, train R^2, valid R^2, (OOB R^2 if available)]
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
df_trn.describe(include='all').T
  | count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|---|---|---|
SalesID | 401125 | NaN | NaN | NaN | 1.91971e+06 | 909021 | 1.13925e+06 | 1.41837e+06 | 1.63942e+06 | 2.24271e+06 | 6.33334e+06 |
MachineID | 401125 | NaN | NaN | NaN | 1.2179e+06 | 440992 | 0 | 1.0887e+06 | 1.27949e+06 | 1.46807e+06 | 2.48633e+06 |
ModelID | 401125 | NaN | NaN | NaN | 6889.7 | 6221.78 | 28 | 3259 | 4604 | 8724 | 37198 |
datasource | 401125 | NaN | NaN | NaN | 134.666 | 8.96224 | 121 | 132 | 132 | 136 | 172 |
auctioneerID | 401125 | NaN | NaN | NaN | 6.32733 | 16.5751 | 0 | 1 | 2 | 4 | 99 |
YearMade | 401125 | NaN | NaN | NaN | 1899.16 | 291.797 | 1000 | 1985 | 1995 | 2000 | 2013 |
MachineHoursCurrentMeter | 401125 | NaN | NaN | NaN | 1230.73 | 16542.9 | 0 | 0 | 0 | 0 | 2.4833e+06 |
UsageBand | 401125 | NaN | NaN | NaN | -0.623898 | 0.870291 | -1 | -1 | -1 | -1 | 2 |
fiModelDesc | 401125 | NaN | NaN | NaN | 1673.71 | 1263.33 | 1 | 631 | 1395 | 2292 | 4999 |
fiBaseModel | 401125 | NaN | NaN | NaN | 559.165 | 469.31 | 1 | 206 | 406 | 704 | 1950 |
fiSecondaryDesc | 401125 | NaN | NaN | NaN | 36.685 | 38.2282 | 0 | 0 | 29 | 57 | 175 |
fiModelSeries | 401125 | NaN | NaN | NaN | 9.19269 | 27.0069 | 0 | 0 | 0 | 0 | 122 |
fiModelDescriptor | 401125 | NaN | NaN | NaN | 12.2334 | 29.0419 | 0 | 0 | 0 | 0 | 139 |
ProductSize | 401125 | NaN | NaN | NaN | 1.81822 | 2.10783 | 0 | 0 | 0 | 4 | 6 |
fiProductClassDesc | 401125 | NaN | NaN | NaN | 32.2622 | 22.5966 | 1 | 11 | 35 | 52 | 74 |
state | 401125 | NaN | NaN | NaN | 23.5083 | 15.7329 | 1 | 9 | 22 | 41 | 53 |
ProductGroup | 401125 | NaN | NaN | NaN | 3.72552 | 1.72577 | 1 | 2 | 4 | 5 | 6 |
ProductGroupDesc | 401125 | NaN | NaN | NaN | 3.72552 | 1.72577 | 1 | 2 | 4 | 5 | 6 |
Drive_System | 401125 | NaN | NaN | NaN | 0.809299 | 1.43685 | 0 | 0 | 0 | 2 | 4 |
Enclosure | 401125 | NaN | NaN | NaN | 3.6052 | 2.22035 | 0 | 1 | 3 | 6 | 6 |
Forks | 401125 | NaN | NaN | NaN | 0.513192 | 0.564375 | 0 | 0 | 0 | 1 | 2 |
Pad_Type | 401125 | NaN | NaN | NaN | 0.422315 | 0.873469 | 0 | 0 | 0 | 0 | 4 |
Ride_Control | 401125 | NaN | NaN | NaN | 0.566736 | 0.822794 | 0 | 0 | 0 | 1 | 3 |
Stick | 401125 | NaN | NaN | NaN | 0.31901 | 0.678752 | 0 | 0 | 0 | 0 | 2 |
Transmission | 401125 | NaN | NaN | NaN | 3.3725 | 3.76405 | 0 | 0 | 0 | 8 | 8 |
Turbocharged | 401125 | NaN | NaN | NaN | 0.20706 | 0.428657 | 0 | 0 | 0 | 0 | 2 |
Blade_Extension | 401125 | NaN | NaN | NaN | 0.0641845 | 0.250385 | 0 | 0 | 0 | 0 | 2 |
Blade_Width | 401125 | NaN | NaN | NaN | 0.235228 | 1.02571 | 0 | 0 | 0 | 0 | 6 |
Enclosure_Type | 401125 | NaN | NaN | NaN | 0.178436 | 0.697879 | 0 | 0 | 0 | 0 | 3 |
Engine_Horsepower | 401125 | NaN | NaN | NaN | 0.0660667 | 0.260948 | 0 | 0 | 0 | 0 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Coupler | 401125 | NaN | NaN | NaN | 1.51179 | 1.45032 | 0 | 0 | 2 | 3 | 3 |
Coupler_System | 401125 | NaN | NaN | NaN | 0.115889 | 0.342865 | 0 | 0 | 0 | 0 | 2 |
Grouser_Tracks | 401125 | NaN | NaN | NaN | 0.115199 | 0.340769 | 0 | 0 | 0 | 0 | 2 |
Hydraulics_Flow | 401125 | NaN | NaN | NaN | 0.321483 | 0.926401 | 0 | 0 | 0 | 0 | 3 |
Track_Type | 401125 | NaN | NaN | NaN | 0.458792 | 0.819459 | 0 | 0 | 0 | 0 | 2 |
Undercarriage_Pad_Width | 401125 | NaN | NaN | NaN | 4.3119 | 7.74189 | 0 | 0 | 0 | 0 | 19 |
Stick_Length | 401125 | NaN | NaN | NaN | 6.53556 | 11.9233 | 0 | 0 | 0 | 0 | 29 |
Thumb | 401125 | NaN | NaN | NaN | 0.685153 | 1.2269 | 0 | 0 | 0 | 0 | 3 |
Pattern_Changer | 401125 | NaN | NaN | NaN | 0.516719 | 0.912594 | 0 | 0 | 0 | 0 | 3 |
Grouser_Type | 401125 | NaN | NaN | NaN | 0.319479 | 0.659002 | 0 | 0 | 0 | 0 | 3 |
Backhoe_Mounting | 401125 | NaN | NaN | NaN | 0.196178 | 0.397231 | 0 | 0 | 0 | 0 | 2 |
Blade_Type | 401125 | NaN | NaN | NaN | 1.27813 | 2.649 | 0 | 0 | 0 | 0 | 10 |
Travel_Controls | 401125 | NaN | NaN | NaN | 1.13201 | 2.3071 | 0 | 0 | 0 | 0 | 7 |
Differential_Type | 401125 | NaN | NaN | NaN | 0.683189 | 1.502 | 0 | 0 | 0 | 0 | 4 |
Steering_Controls | 401125 | NaN | NaN | NaN | 0.344987 | 0.755775 | 0 | 0 | 0 | 0 | 5 |
saleYear | 401125 | NaN | NaN | NaN | 2004.1 | 5.75419 | 1989 | 2000 | 2006 | 2009 | 2011 |
saleMonth | 401125 | NaN | NaN | NaN | 6.40704 | 3.42458 | 1 | 3 | 6 | 9 | 12 |
saleWeek | 401125 | NaN | NaN | NaN | 26.1799 | 14.7881 | 1 | 13 | 25 | 39 | 53 |
saleDay | 401125 | NaN | NaN | NaN | 16.1104 | 8.42732 | 1 | 9 | 16 | 23 | 31 |
saleDayofweek | 401125 | NaN | NaN | NaN | 2.60097 | 1.40576 | 0 | 2 | 3 | 3 | 6 |
saleDayofyear | 401125 | NaN | NaN | NaN | 179.978 | 103.56 | 2 | 84 | 168 | 271 | 365 |
saleIs_month_end | 401125 | 2 | False | 387586 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saleIs_month_start | 401125 | 2 | False | 390968 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saleIs_quarter_end | 401125 | 2 | False | 394985 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saleIs_quarter_start | 401125 | 2 | False | 398606 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saleIs_year_end | 401125 | 2 | False | 401124 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saleIs_year_start | 401125 | 1 | False | 401125 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saleElapsed | 401125 | NaN | NaN | NaN | 1.09143e+09 | 1.81698e+08 | 6.00998e+08 | 9.70877e+08 | 1.14307e+09 | 1.23785e+09 | 1.3252e+09 |
auctioneerID_na | 401125 | 2 | False | 380989 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
MachineHoursCurrentMeter_na | 401125 | 2 | True | 258360 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
66 rows × 11 columns
def rf_feat_importance(m, df):
    # sklearn impurity-based importances, with the std across the forest's trees as an error bar
    return pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_,
                         'std': np.std([tree.feature_importances_ for tree in m.estimators_], axis=0)}
                       ).sort_values('imp', ascending=False)
def plot_fi(fi, std=True, feature_importance_type=''):
    if std:
        ax = fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False, xerr='std')
    else:
        ax = fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)
    ax.set_xlabel(f"{feature_importance_type} Feature Importance")
    return ax
set_rf_samples(50000)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.20607516700458903, 0.2504207366152017, 0.91124636542035586, 0.88800760008028823, 0.89416603927838945]
For the sake of interpretation, we use a random 1,000-row subset of the validation set to speed up computation.
n = 1000
sample_idx = np.random.permutation(len(X_valid))[:n]
X_valid_sample = X_valid.iloc[sample_idx].copy()
y_valid_sample = y_valid[sample_idx].copy()
X_valid_sample.shape, y_valid_sample.shape
((1000, 66), (1000,))
fi_sklearn = rf_feat_importance(m, X_valid_sample); fi_sklearn[:10]
  | cols | imp | std |
---|---|---|---|
5 | YearMade | 0.170900 | 0.022382 |
37 | Coupler_System | 0.107031 | 0.099127 |
13 | ProductSize | 0.094212 | 0.047293 |
14 | fiProductClassDesc | 0.078732 | 0.039969 |
2 | ModelID | 0.054950 | 0.021318 |
63 | saleElapsed | 0.049756 | 0.005392 |
38 | Grouser_Tracks | 0.049630 | 0.086771 |
10 | fiSecondaryDesc | 0.044861 | 0.020753 |
39 | Hydraulics_Flow | 0.041284 | 0.082109 |
19 | Enclosure | 0.039521 | 0.040465 |
plot_fi(fi_sklearn[:30],False, 'Sklearn');
from rfpimp import importances
fi_permutation = importances(m, X_valid_sample, y_valid_sample) # permutation
fi_permutation.sum()
Importance    1.218516
dtype: float64
The rfpimp permutation importances are raw score drops rather than proportions, so they do not sum to 1; normalize them here so they are comparable with the other methods.
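For intuition, permutation importance is the drop in the validation score when a single column is shuffled, and those raw drops have no reason to sum to 1. A minimal sketch of the idea (a hypothetical helper, not rfpimp's exact implementation; assumes m, X_valid_sample and y_valid_sample from above):
from sklearn.metrics import r2_score

def permutation_importance_sketch(model, X, y):
    # baseline score on the untouched validation sample
    baseline = r2_score(y, model.predict(X))
    imps = {}
    for col in X.columns:
        X_perm = X.copy()
        # shuffle one column to break its relationship with the target
        X_perm[col] = np.random.permutation(X_perm[col].values)
        imps[col] = baseline - r2_score(y, model.predict(X_perm))
    return pd.Series(imps).sort_values(ascending=False)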
fi_permutation['Importance'] = fi_permutation['Importance'] / fi_permutation['Importance'].sum()
fi_permutation.sum()
Importance    1.0
dtype: float64
fi_permutation = (fi_permutation
.reset_index()
.rename({'Feature':'cols', 'Importance':'imp'},axis=1))
plot_fi(fi_permutation[:30], False, 'Permutation');
Note that computing SHAP values is quite slow for a scikit-learn RandomForest; it tends to run much faster on XGBoost, LightGBM, etc. (see the timing comparison at the end).
explainer = shap.TreeExplainer(m)
shap_values = explainer.shap_values(X_valid_sample.values.astype(int))  # cast to int (presumably to avoid a dtype issue); note this truncates float columns
explainer.expected_value
10.105362202272051
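As a sanity check, TreeExplainer's SHAP values are additive: each row's prediction should equal expected_value plus the sum of that row's SHAP values, up to floating-point tolerance. A quick check, using the same astype(int) cast as above:
preds = m.predict(X_valid_sample.values.astype(int))
assert np.allclose(preds, explainer.expected_value + shap_values.sum(axis=1))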
shap.summary_plot(shap_values, X_valid_sample)
The https://github.com/slundberg/shap README suggests plotting the mean absolute SHAP value per feature, so it seems reasonable to normalize the absolute SHAP values and make a plot comparable to the ones above.
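For reference, shap can draw this mean-|SHAP| bar chart directly; below I compute the values manually so they can be normalized like the other importance scores.
shap.summary_plot(shap_values, X_valid_sample, plot_type="bar")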
# absolute SHAP values summed over rows, giving one score per feature
fi_shap = abs(shap_values).sum(0)
# normalize so the scores sum to 1
fi_shap = fi_shap/fi_shap.sum()
fi_shap.sum(), fi_shap.shape
(1.0, (66,))
fi_shap = (pd.DataFrame({'cols':X_valid_sample.columns, 'imp':fi_shap })
.sort_values('imp', ascending=False))
fi_shap.head(3)
  | cols | imp |
---|---|---|
5 | YearMade | 0.210351 |
37 | Coupler_System | 0.121832 |
13 | ProductSize | 0.112231 |
plot_fi(fi_shap[:30], False, 'SHAP');
plot_fi(fi_sklearn[:30],False, "Sklearn");
plot_fi(fi_permutation[:30],False, "Permutation");
plot_fi(fi_shap[:30],False, "SHAP");
fi_df = fi_sklearn.sort_values('cols').copy()
fi_df.rename(columns={'imp': 'Sklearn imp'}, inplace=True)
# use .values so the assignment is positional: the three frames carry different indexes
fi_df['permutation imp'] = fi_permutation.sort_values('cols')['imp'].values
fi_df['SHAP imp'] = fi_shap.sort_values('cols')['imp'].values
fi_df.drop('std', axis=1, inplace=True)
fi_df.sort_values('SHAP imp', ascending=False, inplace=True)
fi_df.head(10)
  | cols | Sklearn imp | permutation imp | SHAP imp |
---|---|---|---|---|
5 | YearMade | 0.175381 | 0.033108 | 0.210351 |
37 | Coupler_System | 0.115121 | 0.000276 | 0.121832 |
13 | ProductSize | 0.103851 | 0.004242 | 0.112231 |
63 | saleElapsed | 0.051154 | -0.000183 | 0.076429 |
14 | fiProductClassDesc | 0.077293 | 0.003973 | 0.060901 |
10 | fiSecondaryDesc | 0.044643 | 0.011632 | 0.043858 |
39 | Hydraulics_Flow | 0.041484 | 0.000253 | 0.041667 |
19 | Enclosure | 0.030680 | 0.002077 | 0.039908 |
38 | Grouser_Tracks | 0.032391 | 0.000255 | 0.033907 |
2 | ModelID | 0.057929 | 0.209657 | 0.033187 |
from treeinterpreter import treeinterpreter as ti
from waterfall import waterfall_chart  # https://github.com/chrispaulca/waterfall, cloned as a local repo
row = X_valid_sample.values[None,0]; row  # first validation row, kept 2-D for ti.predict
prediction, bias, contributions = ti.predict(m, row)
contribution_df = pd.DataFrame([X_valid_sample.iloc[0].values, contributions[0]],
                               columns=X_valid_sample.columns,
                               index=['Feature_values', 'TreeInterpreterContribution']).T
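treeinterpreter decomposes each prediction exactly as bias + sum(contributions), so a quick consistency check (using the variables from the cell above) is:
# prediction should equal bias plus the per-feature contributions
assert np.allclose(prediction.squeeze(), bias + contributions.sum(axis=1))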
ti_plot = waterfall_chart.plot(X_valid_sample.columns.values, contributions[0],
                               rotation_value=90, threshold=0.1, formatting='{:,.3f}', figsize=(8,6))
plt.ylabel('Change on y');
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X_valid_sample.iloc[0,:])
contribution_df['SHAPContribution'] = np.array(shap_values[0,:],dtype=object)
#### The 10 most negative SHAP contributions
contribution_df.sort_values('SHAPContribution')['SHAPContribution'].head(10).index
Index(['ProductSize', 'saleElapsed', 'fiProductClassDesc', 'saleYear', 'SalesID', 'fiModelDescriptor', 'Enclosure', 'fiBaseModel', 'Tire_Size', 'fiModelDesc'], dtype='object')
#### The 10 most negative treeinterpreter contributions
contribution_df.sort_values('TreeInterpreterContribution')['TreeInterpreterContribution'].head(10).index
Index(['ProductSize', 'saleElapsed', 'fiProductClassDesc', 'saleYear', 'SalesID', 'fiModelDescriptor', 'Enclosure', 'fiBaseModel', 'Tire_Size', 'fiModelDesc'], dtype='object')
The ranking of feature contributions looks similar, but the values are not always close.
contribution_df.sort_values('SHAPContribution').head(5)
  | Feature_values | TreeInterpreterContribution | SHAPContribution |
---|---|---|---|
ProductSize | 5 | -0.610784 | -0.440641 |
saleElapsed | 1.2846e+09 | -0.0600961 | -0.0998432 |
fiProductClassDesc | 17 | -0.0304155 | -0.0471628 |
saleYear | 2010 | -0.0239873 | -0.0449892 |
SalesID | 4.36475e+06 | -0.0798219 | -0.0372894 |
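To quantify how well the two methods agree on ranking, a Spearman correlation between the two contribution columns works (a quick check; assumes scipy is installed):
from scipy.stats import spearmanr
rho, p = spearmanr(contribution_df['TreeInterpreterContribution'].astype(float),
                   contribution_df['SHAPContribution'].astype(float))
rho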
It is easier to compare SHAP and treeinterpreter with waterfall charts drawn in sorted order.
idxs = contributions[0].argsort()  # sort features by contribution so the charts are comparable
ti_plot = waterfall_chart.plot(X_valid_sample.columns[idxs], contributions[0][idxs],
                               rotation_value=90, threshold=0.1, formatting='{:,.3f}', figsize=(8,6))
plt.ylabel('Change on y')
plt.title('Tree Interpreter');
idxs = shap_values[0,:].argsort()
shap_plot = waterfall_chart.plot(X_valid_sample.columns[idxs], shap_values[0,:][idxs],
                                 rotation_value=90, threshold=0.1, formatting='{:,.3f}', figsize=(8,6))
plt.ylabel('Change on y')
plt.title('SHAP');
from pdpbox import pdp
from plotnine import *
def plot_pdp(feat, df, clusters=None, feat_name=None):
    feat_name = feat_name or feat
    p = pdp.pdp_isolate(m, df, feat)
    return pdp.pdp_plot(p, feat_name, plot_lines=True,
                        cluster=clusters is not None, n_cluster_centers=clusters)
X_train.shape, X_valid.shape
To remove outliers in the "YearMade" feature (the minimum value of 1000 is clearly a placeholder), we only plot rows with YearMade > 1930 to get a better view.
plot_pdp('YearMade', X_valid_sample[X_valid_sample['YearMade']>1930], clusters=5)
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("YearMade", shap_values[X_valid_sample['YearMade']>1930],
X_valid_sample[X_valid_sample['YearMade']>1930])
Inspired by SHAP values, where summing each row's absolute contributions yields a feature importance, I attempt the same for treeinterpreter.
rows = X_valid_sample.values
prediction, bias, contributions = ti.predict(m, rows)
# sum the absolute per-row contributions to get one score per feature
contributions = abs(contributions).sum(0)
contribution_df = pd.DataFrame([X_valid_sample.columns, contributions], index=['cols','imp']).T
contribution_df['imp'] = contribution_df['imp']/contribution_df['imp'].sum()  # normalize
Notice that the first graph below is generated by summing up individual row contributions from treeinterpreter, while the second is sklearn's default impurity-based feature importance; they are very similar.
plot_fi(contribution_df.sort_values('imp',ascending=False).head(30),False)
plt.title('Summing up contributions of treeinterpreter');
# Recall the sklearn feature importance...
plot_fi(fi_sklearn[:30],False, "Sklearn");
plt.title('sklearn default impurity-based feature importance');
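The similarity can also be checked numerically by aligning the two importance tables on the feature name and correlating them (a sketch, reusing fi_sklearn from above):
merged = contribution_df.merge(fi_sklearn, on='cols', suffixes=('_ti', '_sklearn'))
# correlation between the summed-treeinterpreter and sklearn impurity importances
merged[['imp_ti', 'imp_sklearn']].astype(float).corr()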
Repeat the process with an XGBoost model on the Boston housing data to confirm the behavior is consistent across different models and losses.
X_train.iloc[0:1]
# train XGBoost model
X,y = shap.datasets.boston()
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 100)
# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(m)
shap_values = explainer.shap_values(X_train.iloc[0:1])
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])
X.dtypes
# fit a scikit-learn forest on the same Boston data for the timing comparison
tree = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
tree.fit(X, y)
X_xgb = xgboost.DMatrix(X)
%time model.predict(X_xgb,ntree_limit=100)
%time tree.predict(X)
# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])
# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(tree)
shap_values = explainer.shap_values(X)
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])