import os, sys

# Make the parent and grandparent directories importable (e.g. to pick up a
# local AIX360 checkout).  Use os.pardir instead of the hard-coded "..\\":
# Windows-style separators produce useless sys.path entries on POSIX hosts,
# and the printed sys.path below shows this notebook ran on macOS.
sys.path.append(os.pardir)
sys.path.append(os.path.join(os.pardir, os.pardir))
print(sys.path)
['/Users/ppedemon/git.repos/AIX360/examples/metrics', '/Users/ppedemon/miniconda3/envs/aix360/lib/python37.zip', '/Users/ppedemon/miniconda3/envs/aix360/lib/python3.7', '/Users/ppedemon/miniconda3/envs/aix360/lib/python3.7/lib-dynload', '', '/Users/ppedemon/miniconda3/envs/aix360/lib/python3.7/site-packages', '/Users/ppedemon/miniconda3/envs/aix360/lib/python3.7/site-packages/IPython/extensions', '/Users/ppedemon/.ipython', '..\\', '..\\..\\']
# NOTE(review): in a plain .py module `from __future__ import ...` must be the
# first statement in the file; this only works here because notebook cells are
# executed independently.  It is also a no-op on Python 3.
from __future__ import print_function
from IPython.display import Markdown, display
from matplotlib import pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.ensemble
import numpy as np
import lime
# Seed NumPy's global RNG so the random test-instance choice below is
# reproducible (presumably also intended to stabilize the split/forest,
# which draw from the same global state when no random_state is given --
# TODO confirm for the installed sklearn version).
np.random.seed(1)
from lime.lime_tabular import LimeTabularExplainer
from aix360.metrics import faithfulness_metric, monotonicity_metric
# `import sklearn` alone does not guarantee submodules such as
# `model_selection` are reachable as attributes; import it explicitly
# instead of relying on another import having pulled it in.
import sklearn.model_selection

# Load the iris dataset and hold out 20% of the rows for evaluation.
iris = sklearn.datasets.load_iris()
train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(
    iris.data, iris.target, train_size=0.80)

# Fit the black-box model whose explanations we will evaluate.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, labels_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False)
# `sklearn.metrics` is likewise not guaranteed to be loaded by a bare
# `import sklearn`; import it explicitly before use.
import sklearn.metrics

# Accuracy of the random forest on the held-out test rows.
sklearn.metrics.accuracy_score(labels_test, rf.predict(test))
0.9666666666666667
# Build a LIME explainer over the training data.  Discretizing continuous
# features makes each explanation read as interval rules on the features.
explainer = LimeTabularExplainer(
    train,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    discretize_continuous=True,
)
print(type(explainer))
<class 'lime.lime_tabular.LimeTabularExplainer'>
# Pick one test row at random (reproducible thanks to np.random.seed above)
# and explain the model's prediction for it: report the 4 feature weights
# for the single most probable class.
i = np.random.randint(0, test.shape[0])
exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=4, top_labels=1)
exp.show_in_notebook(show_table=True, show_all=False)
Get the local explanation and find the weights assigned to the features. Create an array of base (don't-care) values for comparison. For the iris dataset, we assume a base value of 0 for each attribute.
# Recover the LIME weights for the class the forest actually predicts for
# this instance.
predicted_class = rf.predict(test[i].reshape(1, -1))[0]
le = exp.local_exp[predicted_class]

x = test[i]
# Scatter the (feature_index, weight) pairs into a dense coefficient vector.
coefs = np.zeros(x.shape[0])
for idx, weight in le:
    coefs[idx] = weight

# Base ("don't care") values for comparison: 0 for every attribute.
base = np.zeros(x.shape[0])

print("Faithfulness: ", faithfulness_metric(rf, x, coefs, base))
print("Monotonicity: ", monotonicity_metric(rf, x, coefs, base))
Faithfulness: 0.9850836218974064 Monotonity: True
The Faithfulness metric deems this explanation to be strong (a value of about 0.985, close to the maximum of 1.0), and it is also considered good using the Monotonicity metric.
Let's explore further by evaluating these metrics on the entire test set.
# Evaluate monotonicity of the LIME explanation for every test instance.
ncases = test.shape[0]
# Define the "don't care" baseline here rather than relying on a variable
# left over from an earlier cell: 0 for every attribute.
base = np.zeros(test.shape[1])
mon = np.zeros(ncases)
for i in range(ncases):
    x = test[i]
    predicted_class = rf.predict(x.reshape(1, -1))[0]
    exp = explainer.explain_instance(x, rf.predict_proba, num_features=4, top_labels=1)
    # Scatter (feature_index, weight) pairs into a dense coefficient vector.
    coefs = np.zeros(x.shape[0])
    for idx, weight in exp.local_exp[predicted_class]:
        coefs[idx] = weight
    mon[i] = monotonicity_metric(rf, x, coefs, base)
print("% of test records where explanation is monotonic", np.mean(mon))
% of test records where explanation is monotonic 0.8333333333333334
More than 80% of the explanations are monotonic. Hence, the LIME explanations are fairly good using this measure.
# Evaluate faithfulness of the LIME explanation for every test instance.
# Define the "don't care" baseline here rather than relying on a variable
# left over from an earlier cell: 0 for every attribute.
base = np.zeros(test.shape[1])
fait = np.zeros(ncases)
for i in range(ncases):
    x = test[i]
    predicted_class = rf.predict(x.reshape(1, -1))[0]
    exp = explainer.explain_instance(x, rf.predict_proba, num_features=4, top_labels=1)
    # Scatter (feature_index, weight) pairs into a dense coefficient vector.
    coefs = np.zeros(x.shape[0])
    for idx, weight in exp.local_exp[predicted_class]:
        coefs[idx] = weight
    fait[i] = faithfulness_metric(rf, x, coefs, base)
print("Faithfulness metric mean: ", np.mean(fait))
print("Faithfulness metric std. dev.:", np.std(fait))
Faithfulness metric mean: 0.47354600998894836 Faithfulness metric std. dev.: 0.6256893306331033
The value of the faithfulness metric can be between -1.0 and 1.0, so a mean value of around 0.5 shows that the LIME explanations are fairly good by this metric. However, the large standard deviation indicates that the scores are widely spread across the cases, so we look at a histogram of the faithfulness metric values for all the cases.
# Distribution of faithfulness scores over the test set, bucketed into
# four half-unit bins spanning the metric's [-1, 1] range.
bin_edges = [-1.0, -0.5, 0, 0.5, 1.0]
plt.hist(fait, bins=bin_edges)
plt.title("histogram")
plt.show()
This shows that most of the explanations produced by LIME are 'faithful'. Only a few of the explanations score poorly on this metric.