#!/usr/bin/env python
# coding: utf-8
#
# # Example-Dependent Cost-Sensitive Fraud Detection using CostCla
#
#
# Alejandro Correa Bahnsen, PhD
#
#
# Data Scientist
#
#
#
#
#
# PyCaribbean, Santo Domingo, Dominican Republic, Feb 2016
#
# # About Me
#
# ### A brief bio:
#
# * PhD in **Machine Learning** at the University of Luxembourg
# * Data Scientist at Easy Solutions
# * Worked for more than 8 years as a data scientist at GE Money, Scotiabank, and SIX Financial Services
# * Bachelor's in Industrial Engineering and Master's in Financial Engineering
# * Organizer of the Big Data & Data Science Bogota Meetup
# * Sports addict: I love to swim and play tennis, squash, and volleyball, among others
#
#
#
#
# # Agenda
#
# * Quick Intro to Fraud Detection
# * Financial Evaluation of a Fraud Detection Model
# * Example-Dependent Classification
# * CostCla Library
# * Conclusion and Future Work
# # Fraud Detection
# Estimate the **probability** of a transaction being **fraudulent** by analyzing customer patterns and recent fraudulent behavior
#
#
#
#
#
# # Fraud Detection
# Issues when constructing a fraud detection system:
#
# * Skewness of the data
# * **Cost-sensitivity**
# * Short time response of the system
# * Dimensionality of the search space
# * Feature preprocessing
# * Model selection
# Different machine learning methods are used in practice and in the
# literature: logistic regression, neural networks, discriminant
# analysis, genetic programming, decision trees, and random forests, among others
# # Fraud Detection
# Formally, a fraud detection system is a statistical model that estimates the probability of transaction $i$ being a fraud ($y_i=1$):
#
# $$\hat p_i=P(y_i=1|\mathbf{x}_i)$$
# # Data!
#
#
#
# # Load dataset from CostCla package
# In[1]:
import pandas as pd
import numpy as np
from costcla import datasets
# In[2]:
from costcla.datasets.base import Bunch
def load_fraud(cost_mat_parameters=dict(Ca=10)):
    # data_ = pd.read_pickle("trx_fraud_data.pk")
    data_ = pd.read_pickle("/home/al/DriveAl/EasySol/Projects/DetectTA/Tests/trx_fraud_data_v3_agg.pk")
    target = data_['fraud'].values
    data = data_.drop('fraud', axis=1)
    n_samples = data.shape[0]
    # Cost matrix: one row per transaction, columns [C_FP, C_FN, C_TP, C_TN]
    cost_mat = np.zeros((n_samples, 4))
    cost_mat[:, 0] = cost_mat_parameters['Ca']  # false positive: administrative cost
    cost_mat[:, 1] = data['amount']             # false negative: transaction amount
    cost_mat[:, 2] = cost_mat_parameters['Ca']  # true positive: administrative cost
    cost_mat[:, 3] = 0.0                        # true negative: no cost
    return Bunch(data=data.values, target=target, cost_mat=cost_mat,
                 target_names=['Legitimate Trx', 'Fraudulent Trx'], DESCR='',
                 feature_names=data.columns.values, name='FraudDetection')
datasets.load_fraud = load_fraud
# In[3]:
data = datasets.load_fraud()
# ### Data file
# In[4]:
print(data.keys())
print('Number of examples ', data.target.shape[0])
# ### Class Label
# In[5]:
target = pd.DataFrame(pd.Series(data.target).value_counts(), columns=('Frequency',))
target['Percentage'] = (target['Frequency'] / target['Frequency'].sum()) * 100
target.index = ['Negative (Legitimate Trx)', 'Positive (Fraud Trx)']
target.loc['Total Trx'] = [data.target.shape[0], 100.]
print(target)
# ### Features
# In[6]:
pd.DataFrame(data.feature_names[:4], columns=('Features',))
# ### Features
# In[7]:
df = pd.DataFrame(data.data[:, :4], columns=data.feature_names[:4])
df.head(10)
# ### Aggregated Features
# In[8]:
df = pd.DataFrame(data.data[:, 4:], columns=data.feature_names[4:])
df.head(10)
# # Fraud Detection as a classification problem
# ### Split in training and testing
# In[9]:
from sklearn.model_selection import train_test_split
X = data.data[:, [2, 3] + list(range(4, data.data.shape[1]))].astype(float)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = \
train_test_split(X, data.target, data.cost_mat, test_size=0.33, random_state=10)
# # Fraud Detection as a classification problem
# ### Fit models
# In[10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
classifiers = {"RF": {"f": RandomForestClassifier()},
"DT": {"f": DecisionTreeClassifier()}}
ci_models = ['DT', 'RF']
# Fit the classifiers using the training dataset
for model in classifiers.keys():
    classifiers[model]["f"].fit(X_train, y_train)
    classifiers[model]["c"] = classifiers[model]["f"].predict(X_test)
    classifiers[model]["p"] = classifiers[model]["f"].predict_proba(X_test)
    classifiers[model]["p_train"] = classifiers[model]["f"].predict_proba(X_train)
# # Models performance
# ### Evaluate metrics and plot results
# In[11]:
import warnings
warnings.filterwarnings('ignore')
# In[12]:
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
import seaborn as sns
colors = sns.color_palette()
figsize(12, 8)
# In[13]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
measures = {"F1Score": f1_score, "Precision": precision_score,
"Recall": recall_score, "Accuracy": accuracy_score}
results = pd.DataFrame(columns=measures.keys())
for model in ci_models:
    results.loc[model] = [measures[measure](y_test, classifiers[model]["c"])
                          for measure in measures.keys()]
# # Models performance
# In[14]:
def fig_acc():
    plt.bar(np.arange(results.shape[0]) - 0.3, results['Accuracy'], 0.6, label='Accuracy', color=colors[0])
    plt.xticks(range(results.shape[0]), results.index)
    plt.tick_params(labelsize=22)
    plt.title('Accuracy', size=30)
    plt.show()
# In[15]:
fig_acc()
# # Models performance
# In[16]:
def fig_f1():
    plt.bar(np.arange(results.shape[0]) - 0.3, results['Precision'], 0.2, label='Precision', color=colors[0])
    plt.bar(np.arange(results.shape[0]) - 0.1, results['Recall'], 0.2, label='Recall', color=colors[1])
    plt.bar(np.arange(results.shape[0]) + 0.1, results['F1Score'], 0.2, label='F1Score', color=colors[2])
    plt.xticks(range(results.shape[0]), results.index)
    plt.tick_params(labelsize=22)
    plt.ylim([0, 1])
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=22)
    plt.show()
# In[17]:
fig_f1()
# # Models performance
# - None of these measures takes into account the **business and economic realities** of fraud detection.
# - Losses due to fraud and customer-satisfaction costs are not considered when evaluating the different models.
# # Financial Evaluation of a Fraud Detection Model
# # Motivation
#
# - Typically, a fraud model is evaluated using standard **cost-insensitive measures**.
# - However, in practice, the cost associated with **approving a fraudulent transaction** (False Negative) is quite different from the cost associated with **declining a legitimate transaction** (False Positive).
# - Furthermore, the costs are **not constant** among transactions.
# # Cost Matrix
#
#
# | | Actual Positive ($y_i=1$) | Actual Negative ($y_i=0$)|
# |--- |:-: |:-: |
# | Pred. Positive ($c_i=1$) | $C_{TP_i}=C_a$ | $C_{FP_i}=C_a$ |
# | Pred. Negative ($c_i=0$) | $C_{FN_i}=Amt_i$ | $C_{TN_i}=0$ |
#
# Where:
#
# - $C_{FN_i}$ = Amount of the transaction $i$
# - $C_a$ is the administrative cost of dealing with an alert
#
# For more info see [Correa Bahnsen et al., 2014]
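#
# As a minimal sketch (made-up `amounts` and `Ca`, not a library call), the per-transaction cost matrix can be assembled directly from the amounts:
#
# ```
# import numpy as np
#
# amounts = np.array([120.0, 35.5, 980.0])  # hypothetical transaction amounts
# Ca = 10.0                                 # assumed administrative cost per alert
#
# # One row per transaction: [C_FP, C_FN, C_TP, C_TN]
# cost_mat = np.zeros((len(amounts), 4))
# cost_mat[:, 0] = Ca       # false positive: administrative cost
# cost_mat[:, 1] = amounts  # false negative: the transaction amount is lost
# cost_mat[:, 2] = Ca       # true positive: administrative cost is still paid
# cost_mat[:, 3] = 0.0      # true negative: no cost
# ```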
# In[18]:
# The cost matrix is already calculated for the dataset
# cost_mat[C_FP,C_FN,C_TP,C_TN]
print(data.cost_mat[[10, 17, 50]])
# # Financial savings
#
# The financial cost of using a classifier $f$ on $\mathcal{S}$ is calculated by
#
# $$ Cost(f(\mathcal{S})) = \sum_{i=1}^N y_i \big( c_i C_{TP_i} + (1-c_i) C_{FN_i} \big) + (1-y_i) \big( c_i C_{FP_i} + (1-c_i) C_{TN_i} \big). $$
#
# Then the financial savings are defined as the cost saved by the classifier, relative to the cost of using no algorithm at all.
#
# $$ Savings(f(\mathcal{S})) = \frac{ Cost_l(\mathcal{S}) - Cost(f(\mathcal{S}))} {Cost_l(\mathcal{S})},$$
#
# where $Cost_l(\mathcal{S})$ is the cost of the costless class, i.e., of always predicting the cheaper constant class.
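#
# A minimal numeric sketch of these two formulas on made-up values (four transactions, $C_a=10$); CostCla's `savings_score` computes the same quantity from the cost matrix:
#
# ```
# import numpy as np
# from costcla.metrics import savings_score
#
# y_true = np.array([1, 0, 1, 0])          # toy labels
# y_pred = np.array([1, 0, 1, 1])          # toy predictions
# C_FN = np.array([500., 200., 80., 50.])  # per-transaction amounts
# C_FP = np.full(4, 10.0)                  # administrative cost C_a
# C_TP = np.full(4, 10.0)
# C_TN = np.zeros(4)
#
# def total_cost(y, c):
#     return np.sum(y * (c * C_TP + (1 - c) * C_FN)
#                   + (1 - y) * (c * C_FP + (1 - c) * C_TN))
#
# # Cost of the costless class: always predict the cheaper constant class
# cost_l = min(total_cost(y_true, np.zeros(4)), total_cost(y_true, np.ones(4)))
# savings = (cost_l - total_cost(y_true, y_pred)) / cost_l  # 0.25 here
#
# # Same result with the library:
# cost_mat = np.column_stack([C_FP, C_FN, C_TP, C_TN])
# assert np.isclose(savings, savings_score(y_true, y_pred, cost_mat))
# ```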
# # Models Savings
# ## costcla.metrics.savings_score(y_true, y_pred, cost_mat)
# In[19]:
# Calculation of the cost and savings
from costcla.metrics import savings_score, cost_loss
# In[20]:
# Evaluate the savings for each model
results["Savings"] = np.zeros(results.shape[0])
for model in ci_models:
    results.loc[model, "Savings"] = savings_score(y_test, classifiers[model]["c"], cost_mat_test)
# In[21]:
# Plot the results
def fig_sav():
    plt.bar(np.arange(results.shape[0]) - 0.4, results['Precision'], 0.2, label='Precision', color=colors[0])
    plt.bar(np.arange(results.shape[0]) - 0.2, results['Recall'], 0.2, label='Recall', color=colors[1])
    plt.bar(np.arange(results.shape[0]), results['F1Score'], 0.2, label='F1Score', color=colors[2])
    plt.bar(np.arange(results.shape[0]) + 0.2, results['Savings'], 0.2, label='Savings', color=colors[3])
    plt.xticks(range(results.shape[0]), results.index)
    plt.tick_params(labelsize=22)
    plt.ylim([0, 1])
    plt.xlim([-0.5, results.shape[0] - 0.5])
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=22)
    plt.show()
# # Models Savings
# In[22]:
fig_sav()
# # Threshold Optimization
# Make a classifier cost-sensitive by selecting the decision threshold
# that maximizes the savings on the training instances
# $$ t^* = \operatorname{argmax}_t \; Savings(c(t), y) $$
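#
# A brute-force sketch of this search (not the internals of `ThresholdingOptimization`): sweep a grid of thresholds over the training probabilities and keep the one with the highest savings. `p_train` stands for the positive-class probabilities of any fitted model:
#
# ```
# import numpy as np
# from costcla.metrics import savings_score
#
# def best_threshold(y_train, p_train, cost_mat_train,
#                    grid=np.linspace(0, 1, 101)):
#     # Savings obtained by thresholding the probabilities at each candidate t
#     savings = [savings_score(y_train, (p_train >= t).astype(int), cost_mat_train)
#                for t in grid]
#     return grid[int(np.argmax(savings))]
# ```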
# # Threshold Optimization - Code
# ```
# costcla.models.ThresholdingOptimization(calibration=True)
# ```
# ```
# fit(y_prob_train, cost_mat, y_true_train)
# ```
# - Parameters
# - **y_prob_train** : Predicted probabilities of the training set
# - **cost_mat** : Cost matrix of the classification problem.
#     - **y_true_train** : True classes of the training set
# ```
# predict(y_prob)
# ```
# - Parameters
# - **y_prob** : Predicted probabilities
#
# - Returns
# - **y_pred** : Predicted class
# # Threshold Optimization
# In[23]:
from costcla.models import ThresholdingOptimization
for model in ci_models:
    classifiers[model+"-TO"] = {"f": ThresholdingOptimization()}
    # Fit on the training probabilities
    classifiers[model+"-TO"]["f"].fit(classifiers[model]["p_train"], cost_mat_train, y_train)
    # Predict on the test probabilities
    classifiers[model+"-TO"]["c"] = classifiers[model+"-TO"]["f"].predict(classifiers[model]["p"])
# In[24]:
print('New thresholds')
for model in ci_models:
    print(model + '-TO - ' + str(classifiers[model+'-TO']['f'].threshold_))
# In[25]:
for model in ci_models:
    # Evaluate
    results.loc[model+"-TO"] = 0
    results.loc[model+"-TO", list(measures.keys())] = \
        [measures[measure](y_test, classifiers[model+"-TO"]["c"]) for measure in measures.keys()]
    results.loc[model+"-TO", "Savings"] = savings_score(y_test, classifiers[model+"-TO"]["c"], cost_mat_test)
# # Threshold Optimization
# In[26]:
fig_sav()
# # Models Savings
# - There are significant differences in the results when a model is evaluated with the savings instead of traditional cost-insensitive measures
# - The next step is to train models that take the different financial costs into account
# # Example-Dependent Cost-Sensitive Classification
# ## Why "Example-Dependent"?
# Cost-sensitive classification usually refers to class-dependent costs, where the cost depends on the class but is assumed constant across examples.
# In fraud detection, different transactions have different amounts, which implies that the costs are not constant across examples, as illustrated below.
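#
# For instance (made-up amounts): missing a \$20 fraud and missing a \$5,000 fraud are both false negatives, yet a single class-level cost cannot represent both:
#
# ```
# import numpy as np
#
# amounts = np.array([20.0, 5000.0])         # two frauds, hypothetical amounts
# class_dependent_C_FN = np.full(2, 2510.0)  # one constant cost (the mean)
# example_dependent_C_FN = amounts           # C_FN_i = Amt_i
#
# missed = np.array([0, 1])                  # only the second fraud is missed
# print(np.sum(missed * class_dependent_C_FN))    # 2510.0 -> misleading
# print(np.sum(missed * example_dependent_C_FN))  # 5000.0 -> the actual loss
# ```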
# # Bayes Minimum Risk (BMR)
# The BMR classifier is a decision model based on quantifying tradeoffs between various decisions using probabilities and the costs that accompany such decisions.
# In particular:
#
# $$ R(c_i=0|\mathbf{x}_i)=C_{TN_i}(1-\hat p_i)+C_{FN_i} \cdot \hat p_i, $$
# and
# $$ R(c_i=1|\mathbf{x}_i)=C_{TP_i} \cdot \hat p_i + C_{FP_i}(1- \hat p_i), $$
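#
# The transaction is classified as fraud whenever $R(c_i=1|\mathbf{x}_i) \le R(c_i=0|\mathbf{x}_i)$. A minimal sketch of this rule written directly from the two formulas above (toy inputs, not the library implementation):
#
# ```
# import numpy as np
#
# def bmr_predict(p_hat, cost_mat):
#     # cost_mat columns: [C_FP, C_FN, C_TP, C_TN], as in data.cost_mat
#     C_FP, C_FN, C_TP, C_TN = cost_mat.T
#     risk_neg = C_TN * (1 - p_hat) + C_FN * p_hat  # R(c_i=0 | x_i)
#     risk_pos = C_TP * p_hat + C_FP * (1 - p_hat)  # R(c_i=1 | x_i)
#     return (risk_pos <= risk_neg).astype(int)
#
# # A \$10 alert only pays off when amount * p_hat exceeds the admin cost
# p_hat = np.array([0.01, 0.30])
# cost_mat = np.tile([10.0, 500.0, 10.0, 0.0], (2, 1))
# print(bmr_predict(p_hat, cost_mat))  # -> [0 1]
# ```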
# # BMR Code
# ```
# costcla.models.BayesMinimumRiskClassifier(calibration=True)
# ```
# ```
# fit(y_true_cal=None, y_prob_cal=None)
# ```
# - Parameters
# - **y_true_cal** : True class
# - **y_prob_cal** : Predicted probabilities
# ```
# predict(y_prob,cost_mat)
# ```
# - Parameters
# - **y_prob** : Predicted probabilities
# - **cost_mat** : Cost matrix of the classification problem.
#
# - Returns
# - **y_pred** : Predicted class
# # BMR Code
# In[27]:
from costcla.models import BayesMinimumRiskClassifier
for model in ci_models:
    classifiers[model+"-BMR"] = {"f": BayesMinimumRiskClassifier()}
    # Fit (probability calibration); this should really be done on a separate validation set
    classifiers[model+"-BMR"]["f"].fit(y_test, classifiers[model]["p"])
    # Predict using the probabilities and the cost matrix
    classifiers[model+"-BMR"]["c"] = classifiers[model+"-BMR"]["f"].predict(classifiers[model]["p"], cost_mat_test)
# In[28]:
for model in ci_models:
    # Evaluate
    results.loc[model+"-BMR"] = 0
    results.loc[model+"-BMR", list(measures.keys())] = \
        [measures[measure](y_test, classifiers[model+"-BMR"]["c"]) for measure in measures.keys()]
    results.loc[model+"-BMR", "Savings"] = savings_score(y_test, classifiers[model+"-BMR"]["c"], cost_mat_test)
# # BMR Results
# In[29]:
fig_sav()
# # BMR Results
# Why is it so important to focus on the recall?
# * Average cost of a False Negative
# In[30]:
print(data.data[data.target == 1, 2].mean())
# * Average cost of a False Positive
# In[31]:
print(data.cost_mat[:,0].mean())
# # BMR Results
#
# - Bayes Minimum Risk increases the savings by taking a cost-insensitive method and introducing the costs only at prediction time
# - Why not introduce the costs during the estimation of the methods?
# ## Cost-Sensitive Decision Trees (CSDT)
#
# A new cost-based impurity measure that takes into account the costs of classifying all the examples in a leaf (see the sketch after the following snippet)
#
# ```
# costcla.models.CostSensitiveDecisionTreeClassifier(criterion='direct_cost', criterion_weight=False, pruned=True)
# ```
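#
# A sketch of the idea behind such an impurity measure (my paraphrase under stated assumptions, not the library's exact code): the cost of a node is the cheaper of predicting all of its examples as legitimate (pay the false-negative amounts) or as fraud (pay $C_a$ for each example), and splits are chosen to reduce this cost:
#
# ```
# import numpy as np
#
# def node_cost(y, cost_mat):
#     # cost_mat columns: [C_FP, C_FN, C_TP, C_TN]
#     cost_all_neg = np.sum(y * cost_mat[:, 1] + (1 - y) * cost_mat[:, 3])
#     cost_all_pos = np.sum(y * cost_mat[:, 2] + (1 - y) * cost_mat[:, 0])
#     return min(cost_all_neg, cost_all_pos)
# ```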
# ## Cost-Sensitive Random Forest (CSRF)
#
# Ensemble of CSDT
#
# ```
# ```
# costcla.models.CostSensitiveRandomForestClassifier(n_estimators=10, max_samples=0.5, max_features=0.5, combination='majority_voting')
# ```
# # CSDT & CSRF Code
# In[33]:
from costcla.models import CostSensitiveDecisionTreeClassifier
from costcla.models import CostSensitiveRandomForestClassifier
classifiers = {"CSDT": {"f": CostSensitiveDecisionTreeClassifier()},
"CSRF": {"f": CostSensitiveRandomForestClassifier(combination='majority_bmr')}}
# Fit the classifiers using the training dataset
for model in classifiers.keys():
    classifiers[model]["f"].fit(X_train, y_train, cost_mat_train)
    if model == "CSRF":
        # The 'majority_bmr' combination needs the cost matrix at prediction time
        classifiers[model]["c"] = classifiers[model]["f"].predict(X_test, cost_mat_test)
    else:
        classifiers[model]["c"] = classifiers[model]["f"].predict(X_test)
# In[34]:
for model in ['CSDT', 'CSRF']:
    # Evaluate
    results.loc[model] = 0
    results.loc[model, list(measures.keys())] = \
        [measures[measure](y_test, classifiers[model]["c"]) for measure in measures.keys()]
    results.loc[model, "Savings"] = savings_score(y_test, classifiers[model]["c"], cost_mat_test)
# # CSDT & CSRF Results
# In[35]:
fig_sav()
# # Lessons Learned (so far ...)
# - Selecting models based on traditional statistics does not give the best results in terms of cost
# - Models should be evaluated taking into account real financial costs of the application
# - Algorithms should be developed to incorporate those financial costs
#
#
#
#
# # CostCla Library
#
# - **CostCla** is a Python open source cost-sensitive classification library built on top of Scikit-learn, Pandas and Numpy.
#
# - Source code, binaries and documentation are distributed under the 3-Clause BSD license at http://albahnsen.com/CostSensitiveClassification/
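#
# CostCla is available on PyPI, so it can be installed with pip:
#
# ```
# pip install costcla
# ```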
# # CostCla Algorithms
#
# - Cost-proportionate over-sampling [Elkan, 2001]
#
# - SMOTE [Chawla et al., 2002]
#
# - Cost-proportionate rejection-sampling [Zadrozny et al., 2003]
#
# - Thresholding optimization [Sheng and Ling, 2006]
#
# - Bayes minimum risk [Correa Bahnsen et al., 2014a]
#
# - Cost-sensitive logistic regression [Correa Bahnsen et al., 2014b]
#
# - Cost-sensitive decision trees [Correa Bahnsen et al., 2015a]
#
# - Cost-sensitive ensemble methods: cost-sensitive bagging, cost-sensitive pasting, cost-sensitive random forest and cost-sensitive random patches [Correa Bahnsen et al., 2015c]
# # CostCla Databases
#
# - Credit Scoring 1 - Kaggle credit competition [Data], cost matrix: [Correa Bahnsen et al., 2014]
#
# - Credit Scoring 2 - PAKDD2009 Credit [Data], cost matrix: [Correa Bahnsen et al., 2014a]
#
# - Direct Marketing - PAKDD2009 Credit [Data], cost matrix: [Correa Bahnsen et al., 2014b]
#
# - Churn Modeling, soon
#
# - Fraud Detection, soon
# # Future Work
#
# - CSDT in Cython
# - Cost-sensitive class-dependent algorithms
# - Sampling algorithms
# - Probability calibration (currently only ROCCH)
# - Other algorithms
# - More databases
# You can find the presentation and the IPython Notebook here:
#
# * http://nbviewer.ipython.org/format/slides/github/albahnsen/CostSensitiveClassification/blob/master/doc/tutorials/slides_edcs_fraud_detection.ipynb#/
# * https://github.com/albahnsen/CostSensitiveClassification/blob/master/doc/tutorials/slides_edcs_fraud_detection.ipynb
#
# Thanks!
#
#
#
# In[36]:
#Format from https://github.com/ellisonbg/talk-2013-scipy
from IPython.display import display, HTML
s = """
"""
display(HTML(s))