#!/usr/bin/env python
# coding: utf-8
#
# Example-Dependent Cost-Sensitive Credit Scoring using CostCla
#
#
# Alejandro Correa Bahnsen
#
#
#
#
#
#
#
#
#
#
#
# PyData Berlin, May 2015
#
# About Me
# %%html
#
# ###A brief bio:
#
# * Last year (month) PhD Student at Luxembourg University
# * Work part time as a fraud data scientist at CETREL, a SIX Company
# * Worked for +5 years as a data scientist at GE Money and Scotiabank
# * Previously, six sigma intern at Dow Chemical
# * Bachelor in Industrial Engineering and Master in Financial Engineering
# * Organizer of Data Science Luxembourg and recently of Big Data Science Bogotá
# * Sport addict, love to swim, play tennis, squash, and volleyball, among others.
#
#
#
#
# # Agenda
#
# * Quick Intro to Credit Scoring
# * Example of Credit Scoring
# * Financial Evaluation of a Credit Scorecard
# * Example-Dependent Classification
# * CostCla Library
# * Conclusion and Future Work
# Credit Scoring
# # To whom would you grant a loan?
# | | |
# |:-:|:-:|
# | Just fund a bank | Just quit college |
#
#
# # Nice guess!
#
# | | |
# |:-:|:-:|
# | Biggest Ponzi scheme | Now a Billionaire |
#
#
# # Credit Scoring
# - Mitigate the impact of **credit risk** and make more objective
# and accurate decisions
# - Estimate the **risk of a customer defaulting** on his contracted
# financial obligation if a loan is granted, based on past
# experiences
# - Different machine learning methods are used in practice, and in the
# literature: logistic regression, neural networks, discriminant
# analysis, genetic programming, decision trees, random forests, among others
# # Credit Scoring
# Formally, a credit score is a statistical model that allows the estimation of the probability of a customer $i$ defaulting on a contracted debt ($y_i=1$)
#
# $$\hat p_i=P(y_i=1|\mathbf{x}_i)$$
# Example: Kaggle Credit Competition
#
#
#
#
#
# Improve on the state of the art in credit scoring by predicting the probability that somebody will experience financial distress in the next two years.
#
# https://www.kaggle.com/c/GiveMeSomeCredit
# # Load dataset from CostCla package
# In[1]:
import pandas as pd
import numpy as np
# In[1]:
from costcla.datasets import load_creditscoring1

# Load the credit scoring dataset bundled with CostCla. The returned object
# exposes keys() plus the .data, .target, .cost_mat and .feature_names
# attributes that the rest of this script reads.
data = load_creditscoring1()
# ### Data file
# In[2]:
# print() is called with exactly one argument so the output is identical
# under the Python 2 print statement and the Python 3 print function.
# list() keeps the Python 2 list formatting of the keys on Python 3 as well.
print(list(data.keys()))
print('Number of examples  {0}'.format(data.target.shape[0]))
# ### Class Label
# In[14]:
# Count the examples of each class. sort_index() orders the rows by class
# value (0 first, then 1) so the labels assigned below always line up with
# the right class; value_counts() alone orders by frequency, which would
# swap the labels whenever positives outnumber negatives.
class_counts = pd.Series(data.target).value_counts().sort_index()
# Build the frame from the raw count values instead of passing the Series
# together with columns=...: on recent pandas value_counts() returns a named
# Series, and a name that differs from the requested column drops the data.
target = pd.DataFrame({'Frequency': class_counts.values},
                      index=class_counts.index)
# Share of each class over the total number of examples (a fraction in 0-1).
target['Percentage'] = target['Frequency'] / target['Frequency'].sum()
# Relabel the two rows; this requires exactly two classes in data.target.
target.index = ['Negative (Good Customers)', 'Positive (Bad Customers)']
print(target)
# ### Features
# In[6]:
# Single-column table of the feature names. The value is not bound to a name,
# so it is only shown when this statement ends a notebook cell.
pd.DataFrame(data=data.feature_names, columns=['Features'])
# # Credit scoring as a classification problem
# ### Split in training and testing
# In[7]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the sklearn.cross_validation module was removed in 0.20. Fall back to the
# old location only for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the features, the labels and the per-example cost matrix in a single
# call so that the three pieces stay aligned row by row.
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = \
    train_test_split(data.data, data.target, data.cost_mat)
# # Credit scoring as a classification problem
# ### Fit models
# In[8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# One entry per cost-insensitive model. Each entry is a dict that starts with
# the estimator under "f" and is extended below with the estimator's outputs.
classifiers = {
    "RF": {"f": RandomForestClassifier()},
    "DT": {"f": DecisionTreeClassifier()},
    "LR": {"f": LogisticRegression()},
}

# Fit every estimator on the training split and store next to it:
#   "c"       - predicted class labels for the test split
#   "p"       - predicted class probabilities for the test split
#   "p_train" - predicted class probabilities for the training split
for entry in classifiers.values():
    estimator = entry["f"]
    estimator.fit(X_train, y_train)
    entry["c"] = estimator.predict(X_test)
    entry["p"] = estimator.predict_proba(X_test)
    entry["p_train"] = estimator.predict_proba(X_train)
# # Models performance
# ### Evaluate metrics and plot results
# In[9]:
# get_ipython() is only defined inside an IPython/Jupyter session. When this
# file is run as a plain script the name does not exist, so the inline-plot
# magic is skipped on purpose instead of crashing with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
import seaborn as sns

# Default figure size (width, height) for the plots drawn below.
figsize(12, 8)
# In[10]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Display name -> scoring function, each called as score(y_true, y_pred).
measures = {
    "F1Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "Accuracy": accuracy_score,
}

# One row per model and one column per measure, scored on the test split.
# The row values are produced by walking `measures` in the same order that
# defined the columns.
results = pd.DataFrame(columns=measures.keys())
for model, entry in classifiers.items():
    predicted = entry["c"]
    results.loc[model] = [score(y_test, predicted) for score in measures.values()]
# In[10]:
def fig1():
    """Plot every column of ``results`` as one line across the models.

    The x axis gets one tick per row of ``results``, labelled with the row
    index, and the legend (anchored to the right edge of the axes) names the
    columns. Reads the module-level ``results`` frame and shows the figure.
    """
    n_models = results.shape[0]
    positions = range(n_models)
    plt.figure()
    lines = plt.plot(positions, results, "-o", linewidth=7, markersize=15)
    plt.legend(iter(lines), results.columns.tolist(), loc='center left',
               bbox_to_anchor=(1, 0.5), fontsize=22)
    # Leave a quarter-unit margin outside the first and last markers.
    plt.xlim([-0.25, n_models - 1 + .25])
    plt.xticks(positions, results.index)
    plt.tick_params(labelsize=22)
    plt.show()
# # Models performance
# In[11]:
fig1()
# # Models performance
# - None of these measures takes into account the **business and economic realities** that take place in credit scoring.
# - Costs that the financial institution had incurred to acquire customers, or the **expected profit** due to a particular client, are not considered in the evaluation of the different models.
# Financial Evaluation of a Credit Scorecard
# # Motivation
#
# - Typically, a credit risk model is evaluated using standard **cost-insensitive measures**.
# - However, in practice, the cost associated with **approving a bad customer** (False Negative) is quite different from the cost associated with **declining a good customer** (False Positive).
# - Furthermore, the costs are **not constant** among customers.
# # Cost Matrix
#
#
# | | Actual Positive ($y_i=1$) | Actual Negative ($y_i=0$)|
# |--- |:-: |:-: |
# | Pred. Positive ($c_i=1$) | $C_{TP_i}=0$ | $C_{FP_i}=r_i+C^a_{FP}$ |
# | Pred. Negative ($c_i=0$) | $C_{FN_i}=Cl_i \cdot L_{gd}$ | $C_{TN_i}=0$ |
#
# Where:
#
# - $C_{FN_i}$ = losses if the customer $i$ defaults
# - $Cl_i$ is the credit line of customer $i$
# - $L_{gd}$ is the loss given default. Percentage of loss over the total credit line when the customer defaulted
# # Cost Matrix
#
# - $C_{FP_i}=r_i+C^a_{FP}$
# - $r_i$ is the loss in profit by rejecting what would have been a good customer.
# - $C^a_{FP}$ is related to the assumption that the financial institution will not keep the money of the declined customer idle, but instead it will give
# a loan to an alternative customer.
#
# For more info see [Correa Bahnsen et al., 2014]
# # Parameters for the Kaggle Credit Database
#
# Assuming the database belongs to an average European financial institution, we find the different parameters needed to calculate the cost measure
#
# | Parameter | Value |
# |--- |:-: |
# |Interest rate ($int_r$) | 4.79% |
# | Cost of funds ($int_{cf}$) | 2.94% |
# | Term ($l$) in months | 24 |
# | Loss given default ($L_{gd}$) | 75% |
# | Times income ($q$) | 3 |
# | Maximum credit line ($Cl_{max}$) | 25,000|
# In[12]:
# The cost matrix is already calculated for the dataset
# cost_mat[C_FP,C_FN,C_TP,C_TN]
# Show the cost rows at positions 10, 17 and 50. print() with a single
# argument behaves the same on Python 2 and Python 3.
print(data.cost_mat[[10, 17, 50]])
# # Financial savings
#
# The financial cost of using a classifier $f$ on $\mathcal{S}$ is calculated by
#
# $$ Cost(f(\mathcal{S})) = \sum_{i=1}^N y_i(1-c_i)C_{FN_i} + (1-y_i)c_i C_{FP_i}.$$
#
# Then the financial savings are defined as the cost of the algorithm versus the cost of using no algorithm at all.
#
# $$ Savings(f(\mathcal{S})) = \frac{ Cost_l(\mathcal{S}) - Cost(f(\mathcal{S}))} {Cost_l(\mathcal{S})},$$
#
# where $Cost_l(\mathcal{S})$ is the cost of the costless class
# # Models Savings
# ## costcla.metrics.savings_score(y_true, y_pred, cost_mat)
# In[13]:
# Calculation of the cost and savings
from costcla.metrics import savings_score

# Evaluate the savings for each model
# Start the column at zero for every existing row, then fill it per model.
results["Savings"] = np.zeros(results.shape[0])
for model in classifiers.keys():
    # Assign through a single results.loc[row, column] step. The chained form
    # results["Savings"].loc[row] = ... may write to a temporary copy and
    # leave `results` unchanged.
    results.loc[model, "Savings"] = savings_score(
        y_test, classifiers[model]["c"], cost_mat_test)
# In[13]:
# Plot the results
# Palette shared by the savings plots.
colors = sns.color_palette()


def fig2():
    """Plot F1Score (line) against Savings (bars) for every model.

    Reads the module-level ``results`` frame, which must contain the
    ``F1Score`` and ``Savings`` columns. There is one x position per row,
    labelled with the row index, and the legend is anchored to the right edge
    of the axes. Shows the figure and returns nothing.
    """
    n_models = results.shape[0]
    positions = np.arange(n_models)
    fig, ax = plt.subplots()
    ax.plot(positions, results["F1Score"], "-o", label='F1Score',
            color=colors[2], linewidth=7, markersize=15)
    # Centre each bar on its tick explicitly. The previous manual -0.3 offset
    # assumed edge-aligned bars and shifts the bars off their ticks on
    # matplotlib versions whose default alignment is 'center'.
    ax.bar(positions, results['Savings'], 0.6, align='center',
           label='Savings', color=colors[0])
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=22)
    # Half-unit margin so the outermost bars are fully visible.
    ax.set_xlim([-0.5, n_models - 1 + .5])
    ax.set_xticks(positions)
    ax.set_xticklabels(results.index)
    plt.tick_params(labelsize=22)
    plt.show()
# # Models Savings
# In[14]:
fig2()
# - There are significant differences in the results when evaluating a model using traditional cost-insensitive measures
# - ~17% of savings is very bad!
# - Train models that take into account the different financial costs
# Example-Dependent Cost-Sensitive Classification
# ## Why "Example-Dependent"?
# Cost-sensitive classification usually refers to class-dependent costs, where the cost depends on the class but is assumed constant across examples.
# In credit scoring, different customers have different credit lines, which implies that the costs are not constant
# # Bayes Minimum Risk (BMR)
# The BMR classifier is a decision model based on quantifying tradeoffs between various decisions using probabilities and the costs that accompany such decisions.
# In particular:
#
# $$ R(c_i=0|\mathbf{x}_i)=C_{TN_i}(1-\hat p_i)+C_{FN_i} \cdot \hat p_i, $$
# and
# $$ R(c_i=1|\mathbf{x}_i)=C_{TP_i} \cdot \hat p_i + C_{FP_i}(1- \hat p_i), $$
# # BMR Code
# ```
# costcla.models.BayesMinimumRiskClassifier(calibration=True)
# ```
# ```
# fit(y_true_cal=None, y_prob_cal=None)
# ```
# - Parameters
# - **y_true_cal** : True class
# - **y_prob_cal** : Predicted probabilities
# ```
# predict(y_prob,cost_mat)
# ```
# - Parameters
# - **y_prob** : Predicted probabilities
# - **cost_mat** : Cost matrix of the classification problem.
#
# - Returns
# - **y_pred** : Predicted class
# # BMR Code
# In[15]:
from costcla.models import BayesMinimumRiskClassifier

# Snapshot the model names before looping: the loop adds "<name>-BMR" entries
# to `classifiers`, and on Python 3 keys() is a live view that must not change
# size while it is being iterated.
ci_models = list(classifiers.keys())
for model in ci_models:
    bmr_name = model + "-BMR"
    bmr = BayesMinimumRiskClassifier()
    classifiers[bmr_name] = {"f": bmr}
    # Fit
    # NOTE(review): the calibration is fitted on the same test-set labels and
    # probabilities that are scored below, so the reported numbers are
    # optimistic. Calibration must be made in a validation set
    bmr.fit(y_test, classifiers[model]["p"])
    # Predict
    classifiers[bmr_name]["c"] = bmr.predict(classifiers[model]["p"], cost_mat_test)
# In[15]:
for model in ci_models:
    bmr_name = model + "-BMR"
    bmr_pred = classifiers[bmr_name]["c"]
    # Evaluate
    # Create the row first, then fill the measure columns and the savings.
    results.loc[bmr_name] = 0
    # list(measures) gives a plain list of column labels in the same order
    # as the scores computed on the right-hand side.
    results.loc[bmr_name, list(measures)] = \
        [measures[measure](y_test, bmr_pred) for measure in measures]
    # Single-step .loc assignment; the chained results["Savings"].loc[...]
    # form may write to a temporary copy and leave `results` unchanged.
    results.loc[bmr_name, "Savings"] = savings_score(y_test, bmr_pred, cost_mat_test)
# # BMR Results
# In[16]:
fig2()
# # BMR Results
#
# - Bayes Minimum Risk increases the savings by using a cost-insensitive method and then introducing the costs
# - Why not introduce the costs during the estimation of the methods?
# ## Cost-Sensitive Decision Trees (CSDT)
#
# A new cost-based impurity measure taking into account the costs when all the examples in a leaf are classified as either negative or positive
#
# ```
# costcla.models.CostSensitiveDecisionTreeClassifier(criterion='direct_cost', criterion_weight=False, pruned=True)
# ```
# ## Cost-Sensitive Random Patches (CSRP)
#
# Ensemble of CSDT
#
# ```
# costcla.models.CostSensitiveRandomPatchesClassifier(n_estimators=10, max_samples=0.5, max_features=0.5, combination='majority_voting')
# ```
# # CSDT & CSRP Code
# In[24]:
from costcla.models import CostSensitiveDecisionTreeClassifier
from costcla.models import CostSensitiveRandomPatchesClassifier

# Rebind `classifiers` to the two cost-sensitive models. Each entry holds the
# estimator under "f" and receives its test-split predictions under "c".
classifiers = {
    "CSDT": {"f": CostSensitiveDecisionTreeClassifier()},
    "CSRP": {"f": CostSensitiveRandomPatchesClassifier()},
}

# Fit the classifiers using the training dataset; unlike the cost-insensitive
# models, fit() also receives the training cost matrix.
for entry in classifiers.values():
    estimator = entry["f"]
    estimator.fit(X_train, y_train, cost_mat_train)
    entry["c"] = estimator.predict(X_test)
# In[24]:
for model in classifiers.keys():
    predicted = classifiers[model]["c"]
    # Evaluate
    # Create the row first, then fill the measure columns and the savings.
    results.loc[model] = 0
    # list(measures) gives a plain list of column labels in the same order
    # as the scores computed on the right-hand side.
    results.loc[model, list(measures)] = \
        [measures[measure](y_test, predicted) for measure in measures]
    # Single-step .loc assignment; the chained results["Savings"].loc[...]
    # form may write to a temporary copy and leave `results` unchanged.
    results.loc[model, "Savings"] = savings_score(y_test, predicted, cost_mat_test)
# # CSDT & CSRP Results
# In[25]:
fig2()
# # Lessons Learned (so far ...)
# - Selecting models based on traditional statistics does not give the best results in terms of cost
# - Models should be evaluated taking into account real financial costs of the application
# - Algorithms should be developed to incorporate those financial costs
#
#
#
#
# # CostCla Library
#
# - **CostCla** is a Python open source cost-sensitive classification library built on top of Scikit-learn, Pandas and Numpy.
#
# - Source code, binaries and documentation are distributed under 3-Clause BSD license in the website http://albahnsen.com/CostSensitiveClassification/
# # CostCla Algorithms
#
# - Cost-proportionate over-sampling [Elkan, 2001]
#
# - SMOTE [Chawla et al., 2002]
#
# - Cost-proportionate rejection-sampling [Zadrozny et al., 2003]
#
# - Thresholding optimization [Sheng and Ling, 2006]
#
# - Bayes minimum risk [Correa Bahnsen et al., 2014a]
#
# - Cost-sensitive logistic regression [Correa Bahnsen et al., 2014b]
#
# - Cost-sensitive decision trees [Correa Bahnsen et al., 2015a]
#
# - Cost-sensitive ensemble methods: cost-sensitive bagging, cost-sensitive pasting, cost-sensitive random forest and cost-sensitive random patches [Correa Bahnsen et al., 2015c]
# # CostCla Databases
#
# - Credit Scoring1 - Kaggle credit competition [Data], cost matrix: [Correa Bahnsen et al., 2014]
#
# - Credit Scoring 2 - PAKDD2009 Credit [Data], cost matrix: [Correa Bahnsen et al., 2014a]
#
# - Direct Marketing - PAKDD2009 Credit [Data], cost matrix: [Correa Bahnsen et al., 2014b]
#
# - Churn Modeling, June 2015
# # Future Work
#
# - CSDT in Cython
# - Cost-sensitive class-dependent algorithms
# - Sampling algorithms
# - Probability calibration (Only ROCCH)
# - Compatibility with Python $\ge$ 3.4
# - Other algorithms
# - More databases
# You can find the presentation and the IPython Notebook here:
#
# * http://nbviewer.ipython.org/format/slides/github/
# albahnsen/CostSensitiveClassification/blob/
# master/doc/tutorials/slides_edcs_credit_scoring.ipynb#/
# * https://github.com/albahnsen/CostSensitiveClassification/blob/master/doc/tutorials/slides_edcs_credit_scoring.ipynb
#
# These slides are a short version of this tutorial:
#
# * http://nbviewer.ipython.org/github/albahnsen/CostSensitiveClassification/blob/master/doc/tutorials/tutorial_edcs_credit_scoring.ipynb
# Thanks!
#
#
#
# In[3]:
#Format from https://github.com/ellisonbg/talk-2013-scipy
from IPython.display import display, HTML

# The HTML payload is a single newline; no styling markup is present here.
s = "\n"
display(HTML(s))