#!/usr/bin/env python
# coding: utf-8
#
# # Example-Dependent Cost-Sensitive Fraud Detection using CostCla
#
#
# Alejandro Correa Bahnsen, PhD
#
#
# Data Scientist
#
#
#
#
#
# PyCaribbean, Santo Domingo, Dominican Republic, Feb 2016
#
# # About Me
#
# ### A brief bio:
#
# * PhD in **Machine Learning** at the University of Luxembourg
# * Data Scientist at Easy Solutions
# * Worked for more than 8 years as a data scientist at GE Money, Scotiabank, and SIX Financial Services
# * Bachelor's in Industrial Engineering and Master's in Financial Engineering
# * Organizer of the Big Data & Data Science Bogota Meetup
# * Sports addict: I love to swim and play tennis, squash, and volleyball, among others
#
#
#
#
# # Agenda
#
# * Quick Intro to Fraud Detection
# * Financial Evaluation of a Fraud Detection Model
# * Example-Dependent Classification
# * CostCla Library
# * Conclusion and Future Work
# # Fraud Detection
# Estimate the **probability** of a transaction being **fraudulent** by analyzing customer patterns and recent fraudulent behavior
#
#
#
#
#
# # Fraud Detection
# Issues when constructing a fraud detection system:
#
# * Skewness of the data
# * **Cost-sensitivity**
# * Short time response of the system
# * Dimensionality of the search space
# * Feature preprocessing
# * Model selection
# Different machine learning methods are used in practice and in the
# literature: logistic regression, neural networks, discriminant
# analysis, genetic programming, decision trees, and random forests, among others
# # Fraud Detection
# Formally, a fraud detection system is a statistical model that estimates the probability of transaction $i$ being a fraud ($y_i=1$):
#
# $$\hat p_i=P(y_i=1|\mathbf{x}_i)$$
# # Data!
#
#
#
# # Load dataset from CostCla package
# In[1]:
import pandas as pd
import numpy as np
from costcla import datasets
# In[2]:
from costcla.datasets.base import Bunch
def load_fraud(cost_mat_parameters=dict(Ca=10)):
    # data_ = pd.read_pickle("trx_fraud_data.pk")
    data_ = pd.read_pickle("/home/al/DriveAl/EasySol/Projects/DetectTA/Tests/trx_fraud_data_v3_agg.pk")
    target = data_['fraud'].values
    data = data_.drop('fraud', axis=1)
    n_samples = data.shape[0]
    # Cost matrix: one row per transaction, columns [C_FP, C_FN, C_TP, C_TN]
    cost_mat = np.zeros((n_samples, 4))
    cost_mat[:, 0] = cost_mat_parameters['Ca']  # false positive: administrative cost
    cost_mat[:, 1] = data['amount']             # false negative: transaction amount
    cost_mat[:, 2] = cost_mat_parameters['Ca']  # true positive: administrative cost
    cost_mat[:, 3] = 0.0                        # true negative: no cost
    return Bunch(data=data.values, target=target, cost_mat=cost_mat,
                 target_names=['Legitimate Trx', 'Fraudulent Trx'], DESCR='',
                 feature_names=data.columns.values, name='FraudDetection')
datasets.load_fraud = load_fraud
# In[3]:
data = datasets.load_fraud()
# ### Data file
# In[4]:
print(data.keys())
print('Number of examples ', data.target.shape[0])
# ### Class Label
# In[5]:
target = pd.DataFrame(pd.Series(data.target).value_counts(), columns=('Frequency',))
target['Percentage'] = (target['Frequency'] / target['Frequency'].sum()) * 100
target.index = ['Negative (Legitimate Trx)', 'Positive (Fraud Trx)']
target.loc['Total Trx'] = [data.target.shape[0], 100.]
print(target)
# ### Features
# In[6]:
pd.DataFrame(data.feature_names[:4], columns=('Features',))
# ### Features
# In[7]:
df = pd.DataFrame(data.data[:, :4], columns=data.feature_names[:4])
df.head(10)
# ### Aggregated Features
# In[8]:
df = pd.DataFrame(data.data[:, 4:], columns=data.feature_names[4:])
df.head(10)
# # Fraud Detection as a classification problem
# ### Split in training and testing
# In[9]:
from sklearn.model_selection import train_test_split
X = data.data[:, [2, 3] + list(range(4, data.data.shape[1]))].astype(float)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = \
train_test_split(X, data.target, data.cost_mat, test_size=0.33, random_state=10)
# # Fraud Detection as a classification problem
# ### Fit models
# In[10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
classifiers = {"RF": {"f": RandomForestClassifier()},
"DT": {"f": DecisionTreeClassifier()}}
ci_models = ['DT', 'RF']
# Fit the classifiers using the training dataset
for model in classifiers.keys():
    classifiers[model]["f"].fit(X_train, y_train)
    classifiers[model]["c"] = classifiers[model]["f"].predict(X_test)
    classifiers[model]["p"] = classifiers[model]["f"].predict_proba(X_test)
    classifiers[model]["p_train"] = classifiers[model]["f"].predict_proba(X_train)
# # Models performance
# ### Evaluate metrics and plot results
# In[11]:
import warnings
warnings.filterwarnings('ignore')
# In[12]:
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
import seaborn as sns
colors = sns.color_palette()
figsize(12, 8)
# In[13]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
measures = {"F1Score": f1_score, "Precision": precision_score,
"Recall": recall_score, "Accuracy": accuracy_score}
results = pd.DataFrame(columns=measures.keys())
for model in ci_models:
    results.loc[model] = [measures[measure](y_test, classifiers[model]["c"])
                          for measure in measures.keys()]
# # Models performance
# In[14]:
def fig_acc():
    plt.bar(np.arange(results.shape[0]) - 0.3, results['Accuracy'], 0.6, label='Accuracy', color=colors[0])
    plt.xticks(range(results.shape[0]), results.index)
    plt.tick_params(labelsize=22)
    plt.title('Accuracy', size=30)
    plt.show()
# In[15]:
fig_acc()
# # Models performance
# In[16]:
def fig_f1():
    plt.bar(np.arange(results.shape[0]) - 0.3, results['Precision'], 0.2, label='Precision', color=colors[0])
    plt.bar(np.arange(results.shape[0]) - 0.1, results['Recall'], 0.2, label='Recall', color=colors[1])
    plt.bar(np.arange(results.shape[0]) + 0.1, results['F1Score'], 0.2, label='F1Score', color=colors[2])
    plt.xticks(range(results.shape[0]), results.index)
    plt.tick_params(labelsize=22)
    plt.ylim([0, 1])
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=22)
    plt.show()
# In[17]:
fig_f1()
# # Models performance
# - None of these measures takes into account the **business and economic realities** of fraud detection.
# - Losses due to fraud and customer-satisfaction costs are not considered when evaluating the different models.
# # Financial Evaluation of a Fraud Detection Model
# # Motivation
#
# - Typically, a fraud model is evaluated using standard **cost-insensitive measures**.
# - However, in practice, the cost associated with **approving a fraudulent transaction** (False Negative) is quite different from the cost associated with **declining a legitimate transaction** (False Positive).
# - Furthermore, the costs are **not constant** among transactions.
# # Cost Matrix
#
#
# | | Actual Positive ($y_i=1$) | Actual Negative ($y_i=0$)|
# |--- |:-: |:-: |
# | Pred. Positive ($c_i=1$) | $C_{TP_i}=C_a$ | $C_{FP_i}=C_a$ |
# | Pred. Negative ($c_i=0$) | $C_{FN_i}=Amt_i$ | $C_{TN_i}=0$ |
#
# Where:
#
# - $C_{FN_i}$ = Amount of the transaction $i$
# - $C_a$ is the administrative cost of dealing with an alert
#
# For more info see [Correa Bahnsen et al., 2014]
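#
# As a minimal sketch (made-up `amounts` and `Ca`, not a library call), the per-transaction cost matrix can be assembled directly from the amounts:
#
# ```
# import numpy as np
#
# amounts = np.array([120.0, 35.5, 980.0])  # hypothetical transaction amounts
# Ca = 10.0                                 # assumed administrative cost per alert
#
# # One row per transaction: [C_FP, C_FN, C_TP, C_TN]
# cost_mat = np.zeros((len(amounts), 4))
# cost_mat[:, 0] = Ca       # false positive: administrative cost
# cost_mat[:, 1] = amounts  # false negative: the transaction amount is lost
# cost_mat[:, 2] = Ca       # true positive: administrative cost is still paid
# cost_mat[:, 3] = 0.0      # true negative: no cost
# ```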
# In[18]:
# The cost matrix is already calculated for the dataset
# cost_mat[C_FP,C_FN,C_TP,C_TN]
print(data.cost_mat[[10, 17, 50]])
# # Financial savings
#
# The financial cost of using a classifier $f$ on $\mathcal{S}$ is calculated by
#
# $$ Cost(f(\mathcal{S})) = \sum_{i=1}^N y_i \big( c_i C_{TP_i} + (1-c_i) C_{FN_i} \big) + (1-y_i) \big( c_i C_{FP_i} + (1-c_i) C_{TN_i} \big). $$
#
# Then the financial savings are defined as the cost saved by the classifier, relative to the cost of using no algorithm at all.
#
# $$ Savings(f(\mathcal{S})) = \frac{ Cost_l(\mathcal{S}) - Cost(f(\mathcal{S}))} {Cost_l(\mathcal{S})},$$
#
# where $Cost_l(\mathcal{S})$ is the cost of the costless class, i.e., of always predicting the cheaper constant class.
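#
# A minimal numeric sketch of these two formulas on made-up values (four transactions, $C_a=10$); CostCla's `savings_score` computes the same quantity from the cost matrix:
#
# ```
# import numpy as np
# from costcla.metrics import savings_score
#
# y_true = np.array([1, 0, 1, 0])          # toy labels
# y_pred = np.array([1, 0, 1, 1])          # toy predictions
# C_FN = np.array([500., 200., 80., 50.])  # per-transaction amounts
# C_FP = np.full(4, 10.0)                  # administrative cost C_a
# C_TP = np.full(4, 10.0)
# C_TN = np.zeros(4)
#
# def total_cost(y, c):
#     return np.sum(y * (c * C_TP + (1 - c) * C_FN)
#                   + (1 - y) * (c * C_FP + (1 - c) * C_TN))
#
# # Cost of the costless class: always predict the cheaper constant class
# cost_l = min(total_cost(y_true, np.zeros(4)), total_cost(y_true, np.ones(4)))
# savings = (cost_l - total_cost(y_true, y_pred)) / cost_l  # 0.25 here
#
# # Same result with the library:
# cost_mat = np.column_stack([C_FP, C_FN, C_TP, C_TN])
# assert np.isclose(savings, savings_score(y_true, y_pred, cost_mat))
# ```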
# # Models Savings
# ## costcla.metrics.savings_score(y_true, y_pred, cost_mat)
# In[19]:
# Calculation of the cost and savings
from costcla.metrics import savings_score, cost_loss
# In[20]:
# Evaluate the savings for each model
results["Savings"] = np.zeros(results.shape[0])
for model in ci_models:
    results.loc[model, "Savings"] = savings_score(y_test, classifiers[model]["c"], cost_mat_test)
# In[21]:
# Plot the results
def fig_sav():
    plt.bar(np.arange(results.shape[0]) - 0.4, results['Precision'], 0.2, label='Precision', color=colors[0])
    plt.bar(np.arange(results.shape[0]) - 0.2, results['Recall'], 0.2, label='Recall', color=colors[1])
    plt.bar(np.arange(results.shape[0]), results['F1Score'], 0.2, label='F1Score', color=colors[2])
    plt.bar(np.arange(results.shape[0]) + 0.2, results['Savings'], 0.2, label='Savings', color=colors[3])
    plt.xticks(range(results.shape[0]), results.index)
    plt.tick_params(labelsize=22)
    plt.ylim([0, 1])
    plt.xlim([-0.5, results.shape[0] - 0.5])
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=22)
    plt.show()
# # Models Savings
# In[22]:
fig_sav()
# # Threshold Optimization
# Make a classifier cost-sensitive by selecting the decision threshold
# that maximizes the savings on the training instances
# $$ t^* = \operatorname{argmax}_t \; Savings(c(t), y) $$
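#
# A brute-force sketch of this search (not the internals of `ThresholdingOptimization`): sweep a grid of thresholds over the training probabilities and keep the one with the highest savings. `p_train` stands for the positive-class probabilities of any fitted model:
#
# ```
# import numpy as np
# from costcla.metrics import savings_score
#
# def best_threshold(y_train, p_train, cost_mat_train,
#                    grid=np.linspace(0, 1, 101)):
#     # Savings obtained by thresholding the probabilities at each candidate t
#     savings = [savings_score(y_train, (p_train >= t).astype(int), cost_mat_train)
#                for t in grid]
#     return grid[int(np.argmax(savings))]
# ```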
# # Threshold Optimization - Code
# ```
# costcla.models.ThresholdingOptimization(calibration=True)
# ```
# ```
# fit(y_prob_train, cost_mat, y_true_train)
# ```
# - Parameters
# - **y_prob_train** : Predicted probabilities of the training set
# - **cost_mat** : Cost matrix of the classification problem.
#     - **y_true_train** : True classes of the training set
# ```
# predict(y_prob)
# ```
# - Parameters
# - **y_prob** : Predicted probabilities
#
# - Returns
# - **y_pred** : Predicted class
# # Threshold Optimization
# In[23]:
from costcla.models import ThresholdingOptimization
for model in ci_models:
    classifiers[model+"-TO"] = {"f": ThresholdingOptimization()}
    # Fit on the training probabilities
    classifiers[model+"-TO"]["f"].fit(classifiers[model]["p_train"], cost_mat_train, y_train)
    # Predict on the test probabilities
    classifiers[model+"-TO"]["c"] = classifiers[model+"-TO"]["f"].predict(classifiers[model]["p"])
# In[24]:
print('New thresholds')
for model in ci_models:
    print(model + '-TO - ' + str(classifiers[model+'-TO']['f'].threshold_))
# In[25]:
for model in ci_models:
    # Evaluate
    results.loc[model+"-TO"] = 0
    results.loc[model+"-TO", list(measures.keys())] = \
        [measures[measure](y_test, classifiers[model+"-TO"]["c"]) for measure in measures.keys()]
    results.loc[model+"-TO", "Savings"] = savings_score(y_test, classifiers[model+"-TO"]["c"], cost_mat_test)
# # Threshold Optimization
# In[26]:
fig_sav()
# # Models Savings
# - There are significant differences in the results when a model is evaluated with the savings instead of traditional cost-insensitive measures
# - The next step is to train models that take the different financial costs into account
# # Example-Dependent Cost-Sensitive Classification
# ## Why "Example-Dependent"?
# Cost-sensitive classification usually refers to class-dependent costs, where the cost depends on the class but is assumed constant across examples.
# In fraud detection, different transactions have different amounts, which implies that the costs are not constant across examples, as illustrated below.
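#
# For instance (made-up amounts): missing a \$20 fraud and missing a \$5,000 fraud are both false negatives, yet a single class-level cost cannot represent both:
#
# ```
# import numpy as np
#
# amounts = np.array([20.0, 5000.0])         # two frauds, hypothetical amounts
# class_dependent_C_FN = np.full(2, 2510.0)  # one constant cost (the mean)
# example_dependent_C_FN = amounts           # C_FN_i = Amt_i
#
# missed = np.array([0, 1])                  # only the second fraud is missed
# print(np.sum(missed * class_dependent_C_FN))    # 2510.0 -> misleading
# print(np.sum(missed * example_dependent_C_FN))  # 5000.0 -> the actual loss
# ```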
# # Bayes Minimum Risk (BMR)
# The BMR classifier is a decision model based on quantifying tradeoffs between various decisions using probabilities and the costs that accompany such decisions.
# In particular:
#
# $$ R(c_i=0|\mathbf{x}_i)=C_{TN_i}(1-\hat p_i)+C_{FN_i} \cdot \hat p_i, $$
# and
# $$ R(c_i=1|\mathbf{x}_i)=C_{TP_i} \cdot \hat p_i + C_{FP_i}(1- \hat p_i), $$
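#
# The transaction is classified as fraud whenever $R(c_i=1|\mathbf{x}_i) \le R(c_i=0|\mathbf{x}_i)$. A minimal sketch of this rule written directly from the two formulas above (toy inputs, not the library implementation):
#
# ```
# import numpy as np
#
# def bmr_predict(p_hat, cost_mat):
#     # cost_mat columns: [C_FP, C_FN, C_TP, C_TN], as in data.cost_mat
#     C_FP, C_FN, C_TP, C_TN = cost_mat.T
#     risk_neg = C_TN * (1 - p_hat) + C_FN * p_hat  # R(c_i=0 | x_i)
#     risk_pos = C_TP * p_hat + C_FP * (1 - p_hat)  # R(c_i=1 | x_i)
#     return (risk_pos <= risk_neg).astype(int)
#
# # A \$10 alert only pays off when amount * p_hat exceeds the admin cost
# p_hat = np.array([0.01, 0.30])
# cost_mat = np.tile([10.0, 500.0, 10.0, 0.0], (2, 1))
# print(bmr_predict(p_hat, cost_mat))  # -> [0 1]
# ```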
# # BMR Code
# ```
# costcla.models.BayesMinimumRiskClassifier(calibration=True)
# ```
# ```
# fit(y_true_cal=None, y_prob_cal=None)
# ```
# - Parameters
# - **y_true_cal** : True class
# - **y_prob_cal** : Predicted probabilities
# ```
# predict(y_prob,cost_mat)
# ```
# - Parameters
# - **y_prob** : Predicted probabilities
# - **cost_mat** : Cost matrix of the classification problem.
#
# - Returns
# - **y_pred** : Predicted class
# # BMR Code
# In[27]:
from costcla.models import BayesMinimumRiskClassifier
for model in ci_models:
    classifiers[model+"-BMR"] = {"f": BayesMinimumRiskClassifier()}
    # Fit (probability calibration); this should really be done on a separate validation set
    classifiers[model+"-BMR"]["f"].fit(y_test, classifiers[model]["p"])
    # Predict using the probabilities and the cost matrix
    classifiers[model+"-BMR"]["c"] = classifiers[model+"-BMR"]["f"].predict(classifiers[model]["p"], cost_mat_test)
# In[28]:
for model in ci_models:
    # Evaluate
    results.loc[model+"-BMR"] = 0
    results.loc[model+"-BMR", list(measures.keys())] = \
        [measures[measure](y_test, classifiers[model+"-BMR"]["c"]) for measure in measures.keys()]
    results.loc[model+"-BMR", "Savings"] = savings_score(y_test, classifiers[model+"-BMR"]["c"], cost_mat_test)
# # BMR Results
# In[29]:
fig_sav()
# # BMR Results
# Why is it so important to focus on the recall?
# * Average cost of a False Negative
# In[30]:
print(data.data[data.target == 1, 2].mean())
# * Average cost of a False Positive
# In[31]:
print(data.cost_mat[:,0].mean())
# # BMR Results
#
# - Bayes Minimum Risk increases the savings by taking a cost-insensitive method and introducing the costs only at prediction time
# - Why not introduce the costs during the estimation of the methods?
# ## Cost-Sensitive Decision Trees (CSDT)
#
# A new cost-based impurity measure that takes into account the costs of classifying all the examples in a leaf (see the sketch after the following snippet)
#
# ```
# costcla.models.CostSensitiveDecisionTreeClassifier(criterion='direct_cost', criterion_weight=False, pruned=True)
# ```
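#
# A sketch of the idea behind such an impurity measure (my paraphrase under stated assumptions, not the library's exact code): the cost of a node is the cheaper of predicting all of its examples as legitimate (pay the false-negative amounts) or as fraud (pay $C_a$ for each example), and splits are chosen to reduce this cost:
#
# ```
# import numpy as np
#
# def node_cost(y, cost_mat):
#     # cost_mat columns: [C_FP, C_FN, C_TP, C_TN]
#     cost_all_neg = np.sum(y * cost_mat[:, 1] + (1 - y) * cost_mat[:, 3])
#     cost_all_pos = np.sum(y * cost_mat[:, 2] + (1 - y) * cost_mat[:, 0])
#     return min(cost_all_neg, cost_all_pos)
# ```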
# ## Cost-Sensitive Random Forest (CSRF)
#
# Ensemble of CSDT
#
# ```
# ```
# costcla.models.CostSensitiveRandomForestClassifier(n_estimators=10, max_samples=0.5, max_features=0.5, combination='majority_voting')
# ```
# # CSDT & CSRF Code
# In[33]:
from costcla.models import CostSensitiveDecisionTreeClassifier
from costcla.models import CostSensitiveRandomForestClassifier
classifiers = {"CSDT": {"f": CostSensitiveDecisionTreeClassifier()},
"CSRF": {"f": CostSensitiveRandomForestClassifier(combination='majority_bmr')}}
# Fit the classifiers using the training dataset
for model in classifiers.keys():
    classifiers[model]["f"].fit(X_train, y_train, cost_mat_train)
    if model == "CSRF":
        # The 'majority_bmr' combination needs the cost matrix at prediction time
        classifiers[model]["c"] = classifiers[model]["f"].predict(X_test, cost_mat_test)
    else:
        classifiers[model]["c"] = classifiers[model]["f"].predict(X_test)
# In[34]:
for model in ['CSDT', 'CSRF']:
    # Evaluate
    results.loc[model] = 0
    results.loc[model, list(measures.keys())] = \
        [measures[measure](y_test, classifiers[model]["c"]) for measure in measures.keys()]
    results.loc[model, "Savings"] = savings_score(y_test, classifiers[model]["c"], cost_mat_test)
# # CSDT & CSRF Results
# In[35]:
fig_sav()
# # Lessons Learned (so far ...)
# - Selecting models based on traditional statistics does not give the best results in terms of cost
# - Models should be evaluated taking into account real financial costs of the application
# - Algorithms should be developed to incorporate those financial costs
#
#
#
#
# # CostCla Library
#
# - **CostCla** is a Python open source cost-sensitive classification library built on top of Scikit-learn, Pandas and Numpy.
#
# - Source code, binaries and documentation are distributed under the 3-Clause BSD license at http://albahnsen.com/CostSensitiveClassification/
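#
# CostCla is available on PyPI, so it can be installed with pip:
#
# ```
# pip install costcla
# ```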
# # CostCla Algorithms
#
# - Cost-proportionate over-sampling [Elkan, 2001]
#
# - SMOTE [Chawla et al., 2002]
#
# - Cost-proportionate rejection-sampling [Zadrozny et al., 2003]
#
# - Thresholding optimization [Sheng and Ling, 2006]
#
# - Bayes minimum risk [Correa Bahnsen et al., 2014a]
#
# - Cost-sensitive logistic regression [Correa Bahnsen et al., 2014b]
#
# - Cost-sensitive decision trees [Correa Bahnsen et al., 2015a]
#
# - Cost-sensitive ensemble methods: cost-sensitive bagging, cost-sensitive pasting, cost-sensitive random forest and cost-sensitive random patches [Correa Bahnsen et al., 2015c]
# # CostCla Databases
#
# - Credit Scoring 1 - Kaggle credit competition [Data], cost matrix: [Correa Bahnsen et al., 2014]
#
# - Credit Scoring 2 - PAKDD2009 Credit [Data], cost matrix: [Correa Bahnsen et al., 2014a]
#
# - Direct Marketing - PAKDD2009 Credit [Data], cost matrix: [Correa Bahnsen et al., 2014b]
#
# - Churn Modeling, soon
#
# - Fraud Detection, soon
# # Future Work
#
# - CSDT in Cython
# - Cost-sensitive class-dependent algorithms
# - Sampling algorithms
# - Probability calibration (currently only ROCCH)
# - Other algorithms
# - More databases
# You can find the presentation and the IPython Notebook here:
#
# * http://nbviewer.ipython.org/format/slides/github/albahnsen/CostSensitiveClassification/blob/master/doc/tutorials/slides_edcs_fraud_detection.ipynb#/
# * https://github.com/albahnsen/CostSensitiveClassification/blob/master/doc/tutorials/slides_edcs_fraud_detection.ipynb
#
# Thanks!
#
#
#
# In[36]:
#Format from https://github.com/ellisonbg/talk-2013-scipy
from IPython.display import display, HTML
s = """
"""
display(HTML(s))