In this analysis I am trying to fit a predictive model to predict whether a bank customer will subscribe to a term deposit and then try to report the results in a clear and concise manner using the package 'modelplotpy'.
We will also look at how the predictive model is doing locally using the package 'lime'.
The dataset has various kinds of features associated with 1 row ( = 1 customer). Let's explore.
Importing required libraries
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score, accuracy_score, precision_score, confusion_matrix
import seaborn as sns
import modelplotpy as mp
import numpy as np
import warnings
warnings.filterwarnings("ignore")
C:\Users\cgokh\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release. from numpy.core.umath_tests import inner1d
# Load the UCI "bank-additional-full" marketing dataset; the file is semicolon-separated.
# Each row is one customer; the target column 'y' says whether they subscribed to a term deposit.
data = pd.read_csv("../bank-additional/bank-additional-full.csv", sep=";")
data.head()
age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
5 rows × 21 columns
# Rename the positive target level 'yes' to the more descriptive 'term_deposit'.
# (Only the level is renamed; the column keeps its original name 'y'.)
data['y'].replace({'yes': 'term_deposit'}, inplace=True)
# Drop 'duration' (per the dataset documentation it is only known after the call,
# so it would leak the outcome) and the contact-timing columns.
data.drop(['month', 'day_of_week', 'duration'], axis=1, inplace=True)
# Separate the target from the features and one-hot encode all categorical columns.
y = data['y']
data.drop('y', axis=1, inplace=True)
data = pd.get_dummies(data)
# Hold out 10% of the rows as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.1, random_state=123)
# Random forest: 500 gini trees capped at depth 5; min_samples_split=0.1 is a
# fraction, i.e. a split needs at least 10% of the training samples — a strongly
# regularized, conservative model.
rf_classifier = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=5, min_samples_split=0.1,
max_features='auto')
rf_classifier.fit(X_train, y_train)
# Gradient boosting: deeper trees (depth 10) with 85% row subsampling per tree.
# NOTE(review): 500 depth-10 boosted trees can overfit — confirm with validation curves.
gbm_classifier = GradientBoostingClassifier(learning_rate=0.1, n_estimators=500, subsample=0.85, max_depth=10)
gbm_classifier.fit(X_train, y_train)
GradientBoostingClassifier(criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=10, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, presort='auto', random_state=None, subsample=0.85, verbose=0, warm_start=False)
# Hard-label predictions on the held-out test set for both models.
gbm_preds = gbm_classifier.predict(X_test)
rf_preds = rf_classifier.predict(X_test)
# Rows = true labels, columns = predicted labels (sorted: 'no', 'term_deposit').
confusion_matrix(y_test, rf_preds)
array([[3666, 5], [ 439, 9]], dtype=int64)
# Probability of the positive class ('term_deposit') for each test row.
# predict_proba returns an (n_samples, 2) ndarray; slice column 1 directly
# instead of looping over the rows in a Python list comprehension.
rf_probs = rf_classifier.predict_proba(X_test)[:, 1]
# Distribution of the RF scores.
# NOTE(review): seaborn.distplot is deprecated in newer seaborn; histplot/displot is the replacement.
sns.distplot(rf_probs, kde=True, color='r')
<matplotlib.axes._subplots.AxesSubplot at 0x275b34bb048>
# GBM confusion matrix — it predicts 'term_deposit' far more often than the RF,
# which made only a handful of positive predictions above.
confusion_matrix(y_test, gbm_preds)
array([[3516, 155], [ 309, 139]], dtype=int64)
# Positive-class probabilities for the GBM — slice the (n_samples, 2) ndarray
# returned by predict_proba instead of looping over its rows in Python.
gbm_probs = gbm_classifier.predict_proba(X_test)[:, 1]
# Distribution of the GBM scores.
sns.distplot(gbm_probs, kde=True, color='b')
<matplotlib.axes._subplots.AxesSubplot at 0x275b35967b8>
# Build a modelplotpy object that scores both models on the train and test sets.
obj = mp.modelplotpy(feature_data = [X_train, X_test]
, label_data = [y_train, y_test]
, dataset_labels = ['train data', 'test data']
, models = [rf_classifier, gbm_classifier]
, model_labels = ['random forest', 'gradient boosting']
)
# Transform the prepared scores/deciles into aggregated data for the chosen
# plotting scope: random forest on the test set only.
ps = obj.plotting_scope(select_model_label = ['random forest'], select_dataset_label = ['test data'])
# Cumulative gains curve, highlighting decile 2 (the top 20% of scored customers).
mp.plot_cumgains(ps, highlight_decile=2)
Default scope value no_comparison selected, single evaluation line will be plotted. The label with smallest class is term_deposit Target class term_deposit, dataset test data and model random forest. When we select 20% with the highest probability according to model random forest, this selection holds 59% of all term_deposit cases in dataset test data. The cumulative gains plot is saved in C:\Github\JupyterNotebooks\Notebooks/Cumulative gains plot.png
<Figure size 432x288 with 0 Axes>
<matplotlib.axes._subplots.AxesSubplot at 0x275b37f6748>
# Indices of the RF probabilities above their 80th percentile (the top ~20% of scores).
rf_probs_top20percent_indices = np.where(rf_probs > np.quantile(rf_probs, 0.8))
# True labels for that top-scoring slice of the test set.
rf_probs_top20percent_y_test = np.array(y_test)[rf_probs_top20percent_indices]
# The selected probabilities themselves.
rf_probs_top20percent_probs = rf_probs[rf_probs_top20percent_indices]
# Mark every selected customer as a positive prediction ('term_deposit') —
# i.e. "contact everyone in the top 20%"; the > 0 test is always true here.
rf_probs_top20percent_preds = np.where(rf_probs_top20percent_probs > 0, 'term_deposit', 'no')
confusion_matrix(rf_probs_top20percent_y_test, rf_probs_top20percent_preds)
array([[ 0, 559], [ 0, 265]], dtype=int64)
# Same aggregation as above, but scoped to the gradient boosting model on the test set.
ps_gbm = obj.plotting_scope(select_model_label = ['gradient boosting'], select_dataset_label = ['test data'])
# Cumulative gains curve for the GBM, highlighting decile 3 (the top 30%).
mp.plot_cumgains(ps_gbm, highlight_decile=3)
Default scope value no_comparison selected, single evaluation line will be plotted. The label with smallest class is term_deposit Target class term_deposit, dataset test data and model gradient boosting. When we select 30% with the highest probability according to model gradient boosting, this selection holds 62% of all term_deposit cases in dataset test data. The cumulative gains plot is saved in C:\Github\JupyterNotebooks\Notebooks/Cumulative gains plot.png
<Figure size 432x288 with 0 Axes>
<matplotlib.axes._subplots.AxesSubplot at 0x275b389cb00>
# Indices of the GBM probabilities above their 70th percentile (the top ~30%
# of scores, matching highlight_decile=3 above). The copied "80 percentile /
# top 20%" wording was stale.
gbm_probs_top20percent_indices = np.where(gbm_probs > np.quantile(gbm_probs, 0.7))
# True labels for that top-scoring slice of the test set.
gbm_probs_top20percent_y_test = np.array(y_test)[gbm_probs_top20percent_indices]
# The selected probabilities themselves.
gbm_probs_top20percent_probs = gbm_probs[gbm_probs_top20percent_indices]
# Mark every selected customer as a positive prediction ('term_deposit').
gbm_probs_top20percent_preds = np.where(gbm_probs_top20percent_probs > 0, 'term_deposit', 'no')
confusion_matrix(gbm_probs_top20percent_y_test, gbm_probs_top20percent_preds)
array([[ 0, 956], [ 0, 280]], dtype=int64)
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
lbl_enc = LabelEncoder()
y_test_encoded = lbl_enc.fit_transform(y_test)
precision, recall, thresholds = precision_recall_curve(y_test_encoded, rf_probs, pos_label=1)
fig=plt.figure(figsize=(10, 8), dpi= 80, facecolor='w', edgecolor='k')
thresholds = np.append(thresholds, 1)
queue_rate = []
for threshold in thresholds:
queue_rate.append((rf_probs >= threshold).mean())
plt.plot(thresholds, precision, color=sns.color_palette()[0])
plt.plot(thresholds, recall, color=sns.color_palette()[1])
plt.plot(thresholds, queue_rate, color=sns.color_palette()[2])
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
Text(0,0.5,'%')
# Select test customers whose RF probability exceeds a fixed 0.3 cutoff — a
# threshold read off the precision/recall curve, not a percentile cut (the
# copied "top 20%" wording in the variable names no longer applies).
rf_probs_top20percent_indices = np.where(rf_probs > 0.3)
# True labels for the selected customers.
rf_probs_top20percent_y_test = np.array(y_test)[rf_probs_top20percent_indices]
# The selected probabilities themselves.
rf_probs_top20percent_probs = rf_probs[rf_probs_top20percent_indices]
# Mark every selected customer as a positive prediction ('term_deposit').
rf_probs_top20percent_preds = np.where(rf_probs_top20percent_probs > 0, 'term_deposit', 'no')
confusion_matrix(rf_probs_top20percent_y_test, rf_probs_top20percent_preds)
array([[ 0, 259], [ 0, 191]], dtype=int64)
# Very permissive fixed cutoff of 0.05: selects many customers (high recall,
# low precision). This selection feeds the profit calculation that follows.
rf_probs_top20percent_indices = np.where(rf_probs > 0.05)
# True labels for the selected customers.
rf_probs_top20percent_y_test = np.array(y_test)[rf_probs_top20percent_indices]
# The selected probabilities themselves.
rf_probs_top20percent_probs = rf_probs[rf_probs_top20percent_indices]
# Mark every selected customer as a positive prediction ('term_deposit').
rf_probs_top20percent_preds = np.where(rf_probs_top20percent_probs > 0, 'term_deposit', 'no')
confusion_matrix(rf_probs_top20percent_y_test, rf_probs_top20percent_preds)
array([[ 0, 2613], [ 0, 415]], dtype=int64)
Total costs = 2 x 2,613 = 5,226
Total gains = 415 x 25 = 10,375
Profit = 10,375 - 5,226 = 5149
Profit increased. So, by weighing our average gains against our average costs, we can make an informed decision about the precision/recall trade-off.
# Strict fixed cutoff of 0.475: few customers selected, higher precision at
# the cost of recall (per the output, 59 of the 85 selected actually subscribed).
rf_probs_top20percent_indices = np.where(rf_probs > 0.475)
# True labels for the selected customers.
rf_probs_top20percent_y_test = np.array(y_test)[rf_probs_top20percent_indices]
# The selected probabilities themselves.
rf_probs_top20percent_probs = rf_probs[rf_probs_top20percent_indices]
# Mark every selected customer as a positive prediction ('term_deposit').
rf_probs_top20percent_preds = np.where(rf_probs_top20percent_probs > 0, 'term_deposit', 'no')
confusion_matrix(rf_probs_top20percent_y_test, rf_probs_top20percent_preds)
array([[ 0, 26], [ 0, 59]], dtype=int64)
# Same precision / recall / queue-rate plot, this time for the gradient boosting model.
lbl_enc = LabelEncoder()
y_test_encoded = lbl_enc.fit_transform(y_test)
precision, recall, thresholds = precision_recall_curve(y_test_encoded, gbm_probs, pos_label=1)
fig = plt.figure(figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
# Pad thresholds with 1 so all three series share the same length.
thresholds = np.append(thresholds, 1)
# Fraction of customers scored at or above each threshold.
queue_rate = [(gbm_probs >= t).mean() for t in thresholds]
palette = sns.color_palette()
plt.plot(thresholds, precision, color=palette[0])
plt.plot(thresholds, recall, color=palette[1])
plt.plot(thresholds, queue_rate, color=palette[2])
leg = plt.legend(('precision', 'recall', 'queue_rate'), frameon=True)
leg.get_frame().set_edgecolor('k')
plt.xlabel('threshold')
plt.ylabel('%')
Text(0,0.5,'%')
# Fixed 0.23 cutoff on the GBM scores (a threshold picked from the curve above,
# not a percentile cut — the copied "top 20%" wording no longer applies).
gbm_probs_top20percent_indices = np.where(gbm_probs > 0.23)
# True labels for the selected customers.
gbm_probs_top20percent_y_test = np.array(y_test)[gbm_probs_top20percent_indices]
# The selected probabilities themselves.
gbm_probs_top20percent_probs = gbm_probs[gbm_probs_top20percent_indices]
# Mark every selected customer as a positive prediction ('term_deposit').
gbm_probs_top20percent_preds = np.where(gbm_probs_top20percent_probs > 0, 'term_deposit', 'no')
confusion_matrix(gbm_probs_top20percent_y_test, gbm_probs_top20percent_preds)
array([[ 0, 275], [ 0, 179]], dtype=int64)
1. We validated our profit results on only one test set, but the results might change on a different test set. So can we create a system that gives us robust profit values? (Cross-validated profit values)
2. Use LIME to build trust in the predictive model we decide to put into production.
3. Use LIME to improve the model results, if possible, by looking at feature-to-response relationships locally.