import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, classification_report
import h2o
from h2o.frame import H2OFrame
from h2o.estimators.random_forest import H2ORandomForestEstimator
%matplotlib inline
data = pd.read_csv('./data/Fraud/Fraud_Data.csv', parse_dates=['signup_time', 'purchase_time'])
data.head()
  | user_id | signup_time | purchase_time | purchase_value | device_id | source | browser | sex | age | ip_address | class
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22058 | 2015-02-24 22:55:49 | 2015-04-18 02:47:11 | 34 | QVPSPJUOCKZAR | SEO | Chrome | M | 39 | 7.327584e+08 | 0 |
1 | 333320 | 2015-06-07 20:39:50 | 2015-06-08 01:38:54 | 16 | EOGFQPIZPYXFZ | Ads | Chrome | F | 53 | 3.503114e+08 | 0 |
2 | 1359 | 2015-01-01 18:52:44 | 2015-01-01 18:52:45 | 15 | YSSKYOSJHPPLJ | SEO | Opera | M | 53 | 2.621474e+09 | 1 |
3 | 150084 | 2015-04-28 21:13:25 | 2015-05-04 13:54:50 | 44 | ATGTXKYKUDUQN | SEO | Safari | M | 41 | 3.840542e+09 | 0 |
4 | 221365 | 2015-07-21 07:09:52 | 2015-09-09 18:40:53 | 39 | NAUITBZFJKHWW | Ads | Safari | M | 45 | 4.155831e+08 | 0 |
address2country = pd.read_csv('./data/Fraud/IpAddress_to_Country.csv')
address2country.head()
  | lower_bound_ip_address | upper_bound_ip_address | country
---|---|---|---|
0 | 16777216.0 | 16777471 | Australia |
1 | 16777472.0 | 16777727 | China |
2 | 16777728.0 | 16778239 | China |
3 | 16778240.0 | 16779263 | Australia |
4 | 16779264.0 | 16781311 | China |
# Map each transaction's IP address to its country via the range lookup table
countries = []
for i in range(len(data)):
    ip_address = data.loc[i, 'ip_address']
    tmp = address2country[(address2country['lower_bound_ip_address'] <= ip_address) &
                          (address2country['upper_bound_ip_address'] >= ip_address)]
    if len(tmp) == 1:
        countries.append(tmp['country'].values[0])
    else:
        countries.append('NA')
data['country'] = countries
data.head()
  | user_id | signup_time | purchase_time | purchase_value | device_id | source | browser | sex | age | ip_address | class | country
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22058 | 2015-02-24 22:55:49 | 2015-04-18 02:47:11 | 34 | QVPSPJUOCKZAR | SEO | Chrome | M | 39 | 7.327584e+08 | 0 | Japan |
1 | 333320 | 2015-06-07 20:39:50 | 2015-06-08 01:38:54 | 16 | EOGFQPIZPYXFZ | Ads | Chrome | F | 53 | 3.503114e+08 | 0 | United States |
2 | 1359 | 2015-01-01 18:52:44 | 2015-01-01 18:52:45 | 15 | YSSKYOSJHPPLJ | SEO | Opera | M | 53 | 2.621474e+09 | 1 | United States |
3 | 150084 | 2015-04-28 21:13:25 | 2015-05-04 13:54:50 | 44 | ATGTXKYKUDUQN | SEO | Safari | M | 41 | 3.840542e+09 | 0 | NA |
4 | 221365 | 2015-07-21 07:09:52 | 2015-09-09 18:40:53 | 39 | NAUITBZFJKHWW | Ads | Safari | M | 45 | 4.155831e+08 | 0 | United States |
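The loop above scans the whole lookup table for each of the ~150k transactions, which is slow. A vectorized equivalent (run in place of the loop) is sketched below using `pd.merge_asof`, assuming the IP ranges are sorted and non-overlapping, as they are in this lookup table: match each IP to the last range whose lower bound does not exceed it, then reject matches that overshoot the upper bound.
ranges = address2country.sort_values('lower_bound_ip_address')
merged = pd.merge_asof(
    data.sort_values('ip_address'), ranges,
    left_on='ip_address', right_on='lower_bound_ip_address',
    direction='backward')  # last range starting at or below each IP
# Reject matches where the IP overshoots the matched range's upper bound,
# and fill IPs that fall below the very first range
merged.loc[merged['ip_address'] > merged['upper_bound_ip_address'], 'country'] = 'NA'
merged['country'] = merged['country'].fillna('NA')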
# Time elapsed between signup and purchase. Note that Timedelta.seconds is only
# the seconds component within the final day (0-86399), not the total gap; the
# frame summary below (max 86399) reflects this. Use .dt.total_seconds() for the full gap.
data['time_diff'] = (data['purchase_time'] - data['signup_time']).dt.seconds
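To see the distinction on row 0 above (signup 2015-02-24 22:55:49, purchase 2015-04-18 02:47:11):
td = pd.Timestamp('2015-04-18 02:47:11') - pd.Timestamp('2015-02-24 22:55:49')
td.seconds          # 13882 -- seconds within the final partial day (what is used here)
td.total_seconds()  # 4506682.0 -- the full 52-day gap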
# Number of users sharing each device_id
device_num = data[['user_id', 'device_id']].groupby('device_id').count().reset_index()
device_num = device_num.rename(columns={'user_id': 'device_num'})
data = data.merge(device_num, how='left', on='device_id')
# Number of users sharing each ip_address
ip_num = data[['user_id', 'ip_address']].groupby('ip_address').count().reset_index()
ip_num = ip_num.rename(columns={'user_id': 'ip_num'})
data = data.merge(ip_num, how='left', on='ip_address')
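The same two features can also be computed without the intermediate frames and merges via `groupby(...).transform`; a sketch that should yield identical columns:
data['device_num'] = data.groupby('device_id')['user_id'].transform('count')
data['ip_num'] = data.groupby('ip_address')['user_id'].transform('count')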
# Signup day of week and week of year
data['signup_day'] = data['signup_time'].dt.dayofweek
data['signup_week'] = data['signup_time'].dt.week
# Purchase day of week and week of year
data['purchase_day'] = data['purchase_time'].dt.dayofweek
data['purchase_week'] = data['purchase_time'].dt.week
data.head()
  | user_id | signup_time | purchase_time | purchase_value | device_id | source | browser | sex | age | ip_address | class | country | time_diff | device_num | ip_num | signup_day | signup_week | purchase_day | purchase_week
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22058 | 2015-02-24 22:55:49 | 2015-04-18 02:47:11 | 34 | QVPSPJUOCKZAR | SEO | Chrome | M | 39 | 7.327584e+08 | 0 | Japan | 13882 | 1 | 1 | 1 | 9 | 5 | 16 |
1 | 333320 | 2015-06-07 20:39:50 | 2015-06-08 01:38:54 | 16 | EOGFQPIZPYXFZ | Ads | Chrome | F | 53 | 3.503114e+08 | 0 | United States | 17944 | 1 | 1 | 6 | 23 | 0 | 24 |
2 | 1359 | 2015-01-01 18:52:44 | 2015-01-01 18:52:45 | 15 | YSSKYOSJHPPLJ | SEO | Opera | M | 53 | 2.621474e+09 | 1 | United States | 1 | 12 | 12 | 3 | 1 | 3 | 1 |
3 | 150084 | 2015-04-28 21:13:25 | 2015-05-04 13:54:50 | 44 | ATGTXKYKUDUQN | SEO | Safari | M | 41 | 3.840542e+09 | 0 | NA | 60085 | 1 | 1 | 1 | 18 | 0 | 19 |
4 | 221365 | 2015-07-21 07:09:52 | 2015-09-09 18:40:53 | 39 | NAUITBZFJKHWW | Ads | Safari | M | 45 | 4.155831e+08 | 0 | United States | 41461 | 1 | 1 | 1 | 30 | 2 | 37 |
# Define features and target to be used
columns = ['signup_day', 'signup_week', 'purchase_day', 'purchase_week', 'purchase_value', 'source',
'browser', 'sex', 'age', 'country', 'time_diff', 'device_num', 'ip_num', 'class']
data = data[columns]
data.head()
  | signup_day | signup_week | purchase_day | purchase_week | purchase_value | source | browser | sex | age | country | time_diff | device_num | ip_num | class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 9 | 5 | 16 | 34 | SEO | Chrome | M | 39 | Japan | 13882 | 1 | 1 | 0 |
1 | 6 | 23 | 0 | 24 | 16 | Ads | Chrome | F | 53 | United States | 17944 | 1 | 1 | 0 |
2 | 3 | 1 | 3 | 1 | 15 | SEO | Opera | M | 53 | United States | 1 | 12 | 12 | 1 |
3 | 1 | 18 | 0 | 19 | 44 | SEO | Safari | M | 41 | NA | 60085 | 1 | 1 | 0 |
4 | 1 | 30 | 2 | 37 | 39 | Ads | Safari | M | 45 | United States | 41461 | 1 | 1 | 0 |
# Initialize H2O cluster
h2o.init()
h2o.remove_all()
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-macosx) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-macosx) (build 25.121-b15, mixed mode)
Starting server from /anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
Ice root: /var/folders/gx/b15jqbt1567grsfr7l_znn2h0000gn/T/tmpq14c42t4
JVM stdout: /var/folders/gx/b15jqbt1567grsfr7l_znn2h0000gn/T/tmpq14c42t4/h2o_jifu_started_from_python.out
JVM stderr: /var/folders/gx/b15jqbt1567grsfr7l_znn2h0000gn/T/tmpq14c42t4/h2o_jifu_started_from_python.err
Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.
H2O cluster uptime: 03 secs
H2O cluster timezone: America/Chicago
H2O data parsing timezone: UTC
H2O cluster version: 3.18.0.2
H2O cluster version age: 3 months
H2O cluster name: H2O_from_python_jifu_3z98p9
H2O cluster total nodes: 1
H2O cluster free memory: 3.556 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://127.0.0.1:54321
H2O connection proxy: None
H2O internal security: False
H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
Python version: 3.5.4 final
# Transform to H2O Frame, and make sure the target variable is categorical
h2o_df = H2OFrame(data)
for name in ['signup_day', 'purchase_day', 'source', 'browser', 'sex', 'country', 'class']:
    h2o_df[name] = h2o_df[name].asfactor()
h2o_df.summary()
Parse progress: |█████████████████████████████████████████████████████████| 100%
  | signup_day | signup_week | purchase_day | purchase_week | purchase_value | source | browser | sex | age | country | time_diff | device_num | ip_num | class
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
type | enum | int | enum | int | int | enum | enum | enum | int | enum | int | int | int | enum
mins | | 1.0 | | 1.0 | 9.0 | | | | 18.0 | | 1.0 | 1.0 | 1.0 |
mean | | 16.501740430938508 | | 24.658571126052173 | 36.93537243898567 | | | | 33.14070358409671 | | 40942.584427444956 | 1.6843665625496433 | 1.6027185134205137 |
maxs | | 34.0 | | 51.0 | 154.0 | | | | 76.0 | | 86399.0 | 20.0 | 20.0 |
sigma | | 9.814287461798854 | | 11.651556782719474 | 18.32276214866212 | | | | 8.617733490961495 | | 26049.661902118358 | 2.6169536028041693 | 2.596239527375835 |
zeros | | 0 | | 0 | 0 | | | | 0 | | 0 | 0 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
0 | 1 | 9.0 | 5 | 16.0 | 34.0 | SEO | Chrome | M | 39.0 | Japan | 13882.0 | 1.0 | 1.0 | 0 |
1 | 6 | 23.0 | 0 | 24.0 | 16.0 | Ads | Chrome | F | 53.0 | United States | 17944.0 | 1.0 | 1.0 | 0 |
2 | 3 | 1.0 | 3 | 1.0 | 15.0 | SEO | Opera | M | 53.0 | United States | 1.0 | 12.0 | 12.0 | 1 |
3 | 1 | 18.0 | 0 | 19.0 | 44.0 | SEO | Safari | M | 41.0 | NA | 60085.0 | 1.0 | 1.0 | 0 |
4 | 1 | 30.0 | 2 | 37.0 | 39.0 | Ads | Safari | M | 45.0 | United States | 41461.0 | 1.0 | 1.0 | 0 |
5 | 3 | 21.0 | 3 | 28.0 | 42.0 | Ads | Chrome | M | 18.0 | Canada | 7331.0 | 1.0 | 1.0 | 0 |
6 | 5 | 31.0 | 3 | 35.0 | 11.0 | Ads | Chrome | F | 19.0 | NA | 17825.0 | 1.0 | 1.0 | 0 |
7 | 0 | 15.0 | 0 | 22.0 | 27.0 | Ads | Opera | M | 34.0 | United States | 35129.0 | 1.0 | 1.0 | 0 |
8 | 1 | 17.0 | 1 | 23.0 | 30.0 | SEO | IE | F | 43.0 | China | 51800.0 | 1.0 | 1.0 | 0 |
9 | 6 | 4.0 | 0 | 13.0 | 62.0 | Ads | IE | M | 31.0 | United States | 18953.0 | 1.0 | 1.0 | 0 |
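The `asfactor()` casts matter: H2O parses integer-coded columns as numeric, and with a numeric target `H2ORandomForestEstimator` would fit a regression rather than a classifier. A quick sanity check (`types` is the frame's column-to-type dict):
assert h2o_df.types['class'] == 'enum'  # classification, not regression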
# Split into 70% training and 30% test dataset
strat_split = h2o_df['class'].stratified_split(test_frac=0.3, seed=42)
train = h2o_df[strat_split == 'train']
test = h2o_df[strat_split == 'test']
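Only ~9% of transactions are fraudulent, which is why the split is stratified on `class` rather than purely random. A quick check that both splits kept the same fraud rate (`table()` returns the row count per factor level):
print(train['class'].table())
print(test['class'].table())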
# Define features and target
feature = ['signup_day', 'signup_week', 'purchase_day', 'purchase_week', 'purchase_value',
'source', 'browser', 'sex', 'age', 'country', 'time_diff', 'device_num', 'ip_num']
target = 'class'
# Build random forest model
model = H2ORandomForestEstimator(balance_classes=True, ntrees=100, mtries=-1, stopping_rounds=5,
stopping_metric='auc', score_each_iteration=True, seed=42)
model.train(x=feature, y=target, training_frame=train, validation_frame=test)
drf Model Build progress: |███████████████████████████████████████████████| 100%
model.score_history()
  | timestamp | duration | number_of_trees | training_rmse | training_logloss | training_auc | training_lift | training_classification_error | validation_rmse | validation_logloss | validation_auc | validation_lift | validation_classification_error
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-05-20 20:25:44 | 0.100 sec | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
1 | 2018-05-20 20:25:45 | 1.181 sec | 1.0 | 0.329237 | 0.763404 | 0.956487 | 1.930543 | 0.103585 | 0.257295 | 1.705777 | 0.780964 | 7.318324 | 0.064986 | |
2 | 2018-05-20 20:25:46 | 1.837 sec | 2.0 | 0.337703 | 0.666988 | 0.958739 | 1.945962 | 0.100437 | 0.211104 | 0.612876 | 0.808146 | 10.238311 | 0.045618 | |
3 | 2018-05-20 20:25:47 | 2.924 sec | 3.0 | 0.342474 | 0.585470 | 0.962420 | 1.959389 | 0.096241 | 0.205624 | 0.416203 | 0.820663 | 10.566838 | 0.044361 | |
4 | 2018-05-20 20:25:48 | 4.475 sec | 4.0 | 0.341220 | 0.530667 | 0.967584 | 1.967190 | 0.088710 | 0.204276 | 0.340733 | 0.823580 | 10.641350 | 0.044140 | |
5 | 2018-05-20 20:25:49 | 4.799 sec | 5.0 | 0.339991 | 0.494876 | 0.970585 | 1.973005 | 0.085154 | 0.203786 | 0.294007 | 0.825970 | 10.669638 | 0.044162 | |
6 | 2018-05-20 20:25:49 | 5.173 sec | 6.0 | 0.338576 | 0.454248 | 0.974636 | 1.979192 | 0.081914 | 0.203566 | 0.268536 | 0.828691 | 10.674389 | 0.044096 | |
7 | 2018-05-20 20:25:49 | 5.534 sec | 7.0 | 0.337354 | 0.427348 | 0.977835 | 1.983172 | 0.076219 | 0.203257 | 0.251817 | 0.830573 | 10.679152 | 0.044096 | |
8 | 2018-05-20 20:25:50 | 5.923 sec | 8.0 | 0.337933 | 0.405616 | 0.980248 | 1.987241 | 0.072625 | 0.203124 | 0.243703 | 0.831204 | 10.679152 | 0.044118 | |
9 | 2018-05-20 20:25:50 | 6.339 sec | 9.0 | 0.336345 | 0.385657 | 0.982926 | 1.990128 | 0.067415 | 0.203066 | 0.231357 | 0.832550 | 10.679152 | 0.044118 | |
10 | 2018-05-20 20:25:51 | 6.749 sec | 10.0 | 0.334526 | 0.366322 | 0.985589 | 1.992683 | 0.061423 | 0.203097 | 0.227298 | 0.832316 | 10.679152 | 0.044427 | |
11 | 2018-05-20 20:25:51 | 7.162 sec | 11.0 | 0.335901 | 0.357655 | 0.986884 | 1.994899 | 0.058755 | 0.203096 | 0.220205 | 0.833527 | 10.679152 | 0.044228 | |
12 | 2018-05-20 20:25:51 | 7.571 sec | 12.0 | 0.332650 | 0.341858 | 0.989303 | 1.996165 | 0.052590 | 0.203156 | 0.218058 | 0.833289 | 10.679152 | 0.044162 | |
13 | 2018-05-20 20:25:52 | 8.018 sec | 13.0 | 0.332947 | 0.337632 | 0.990264 | 1.996937 | 0.050032 | 0.203230 | 0.213603 | 0.833152 | 10.679152 | 0.044118 | |
14 | 2018-05-20 20:25:52 | 8.467 sec | 14.0 | 0.334967 | 0.338206 | 0.990769 | 1.997604 | 0.049137 | 0.203201 | 0.209188 | 0.834405 | 10.679152 | 0.044471 | |
15 | 2018-05-20 20:25:53 | 8.945 sec | 15.0 | 0.334957 | 0.335644 | 0.991332 | 1.998106 | 0.047514 | 0.203163 | 0.207753 | 0.834603 | 10.679152 | 0.044361 | |
16 | 2018-05-20 20:25:53 | 9.403 sec | 16.0 | 0.334266 | 0.331837 | 0.992059 | 1.998392 | 0.045455 | 0.203135 | 0.204570 | 0.835777 | 10.679152 | 0.044118 | |
17 | 2018-05-20 20:25:54 | 9.872 sec | 17.0 | 0.332295 | 0.324349 | 0.993148 | 1.998720 | 0.041518 | 0.203177 | 0.203516 | 0.836050 | 10.679152 | 0.044228 | |
18 | 2018-05-20 20:25:54 | 10.358 sec | 18.0 | 0.331662 | 0.320754 | 0.993694 | 1.999123 | 0.039879 | 0.203168 | 0.202761 | 0.836510 | 10.679152 | 0.044625 | |
19 | 2018-05-20 20:25:55 | 10.920 sec | 19.0 | 0.330554 | 0.316733 | 0.994235 | 1.999381 | 0.038212 | 0.203183 | 0.202061 | 0.836516 | 10.679152 | 0.044493 | |
20 | 2018-05-20 20:25:55 | 11.517 sec | 20.0 | 0.331997 | 0.319453 | 0.994307 | 1.999455 | 0.037777 | 0.203136 | 0.198964 | 0.837179 | 10.679152 | 0.044846 | |
21 | 2018-05-20 20:25:56 | 12.045 sec | 21.0 | 0.332174 | 0.319514 | 0.994558 | 1.999454 | 0.036471 | 0.203140 | 0.198079 | 0.837936 | 10.679152 | 0.044272 | |
22 | 2018-05-20 20:25:56 | 12.578 sec | 22.0 | 0.331975 | 0.318617 | 0.994821 | 1.999529 | 0.035568 | 0.203130 | 0.196660 | 0.838474 | 10.679152 | 0.044272 | |
23 | 2018-05-20 20:25:57 | 13.145 sec | 23.0 | 0.331260 | 0.316874 | 0.995103 | 1.999529 | 0.034353 | 0.203161 | 0.196128 | 0.838535 | 10.679152 | 0.044383 | |
24 | 2018-05-20 20:25:58 | 13.706 sec | 24.0 | 0.333191 | 0.320829 | 0.995035 | 1.999566 | 0.034467 | 0.203116 | 0.194919 | 0.839078 | 10.679152 | 0.044780 | |
25 | 2018-05-20 20:25:58 | 14.287 sec | 25.0 | 0.333100 | 0.320371 | 0.995231 | 1.999566 | 0.033742 | 0.203085 | 0.194157 | 0.839023 | 10.679152 | 0.044493 | |
26 | 2018-05-20 20:25:59 | 14.897 sec | 26.0 | 0.334370 | 0.322707 | 0.995213 | 1.999566 | 0.033783 | 0.203029 | 0.192694 | 0.838821 | 10.679152 | 0.044493 | |
27 | 2018-05-20 20:25:59 | 15.503 sec | 27.0 | 0.334643 | 0.322935 | 0.995278 | 1.999566 | 0.033309 | 0.202999 | 0.189930 | 0.839291 | 10.679152 | 0.044603 | |
28 | 2018-05-20 20:26:00 | 16.160 sec | 28.0 | 0.334961 | 0.323930 | 0.995309 | 1.999566 | 0.033121 | 0.202994 | 0.189183 | 0.839264 | 10.679152 | 0.044427 | |
29 | 2018-05-20 20:26:01 | 16.805 sec | 29.0 | 0.335004 | 0.323465 | 0.995424 | 1.999566 | 0.032500 | 0.202957 | 0.188449 | 0.839421 | 10.679152 | 0.044449 | |
30 | 2018-05-20 20:26:01 | 17.456 sec | 30.0 | 0.334927 | 0.323080 | 0.995487 | 1.999604 | 0.032255 | 0.202957 | 0.187168 | 0.839391 | 10.679152 | 0.044692 |
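Note that training stopped after 30 trees, well short of `ntrees=100`: with `stopping_rounds=5` on validation AUC, H2O ended the run once five consecutive scoring rounds showed no improvement. The converged metrics can be read off the model directly:
print('Train AUC: %.3f' % model.auc(train=True))
print('Test AUC:  %.3f' % model.auc(valid=True))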
# Feature importance
importance = model.varimp(use_pandas=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='scaled_importance', y='variable', data=importance)
plt.show()
# Make predictions
train_true = train.as_data_frame()['class'].values
test_true = test.as_data_frame()['class'].values
train_pred = model.predict(train).as_data_frame()['p1'].values
test_pred = model.predict(test).as_data_frame()['p1'].values
train_fpr, train_tpr, _ = roc_curve(train_true, train_pred)
test_fpr, test_tpr, _ = roc_curve(test_true, test_pred)
train_auc = np.round(auc(train_fpr, train_tpr), 3)
test_auc = np.round(auc(test_fpr, test_tpr), 3)
drf prediction progress: |████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%
# Classification report
print(classification_report(y_true=test_true, y_pred=(test_pred > 0.5).astype(int)))
             precision    recall  f1-score   support

          0       0.95      1.00      0.98     41088
          1       1.00      0.53      0.69      4245

avg / total       0.96      0.96      0.95     45333
train_fpr = np.insert(train_fpr, 0, 0)
train_tpr = np.insert(train_tpr, 0, 0)
test_fpr = np.insert(test_fpr, 0, 0)
test_tpr = np.insert(test_tpr, 0, 0)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_fpr, train_tpr, label='Train AUC: ' + str(train_auc))
ax.plot(test_fpr, test_tpr, label='Test AUC: ' + str(test_auc))
ax.plot(train_fpr, train_fpr, 'k--', label='Chance Curve')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.grid(True)
ax.legend(fontsize=12)
plt.show()
Based on the ROC, if we care about minimizing false positives, we would choose a cut-off that gives a true positive rate of ~0.5 at a false positive rate of almost zero (essentially what the random forest's default 0.5 cut-off produced: recall 0.53 on class "1" in the report above). However, if we care about maximizing true positives, we have to lower the cut-off. That way we classify more events as "1": some will be true positives (so the true positive rate goes up), but many, unfortunately, will be false positives (so the false positive rate goes up as well).
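One way to operationalize this trade-off is to read the cut-off straight from the ROC arrays. A minimal sketch, assuming an illustrative budget of at most 5% false positives on the test set (`max_fpr` is a made-up constraint, not something derived above):
fpr, tpr, thresholds = roc_curve(test_true, test_pred)
max_fpr = 0.05  # hypothetical false-positive budget
idx = np.searchsorted(fpr, max_fpr, side='right') - 1  # fpr is nondecreasing
print('Cut-off %.3f gives TPR %.3f at FPR %.3f' % (thresholds[idx], tpr[idx], fpr[idx]))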
cols = ['device_num', 'time_diff', 'purchase_week', 'country', 'ip_num']
_ = model.partial_plot(data=train, cols=cols, nbins=200, figsize=(18, 20))
PartialDependencePlot progress: |█████████████████████████████████████████| 100%
# Shutdown h2o instance
h2o.cluster().shutdown()
H2O session _sid_a179 closed.
Regarding “how to use this from a product perspective”: we now have a model that assigns each user a probability of committing fraud, so we can design different user experiences around that score. For instance:

- If the predicted fraud probability is below some threshold X, give the user the normal experience (the large majority of users should fall here).
- If the probability is between X and a higher threshold Z, the user is at risk but not certainly fraudulent: add an extra verification step, such as confirming a code sent by SMS.
- If the probability is at or above Z, fraud is very likely: put the session on hold and send the user's information to an analyst for manual review.
This is just one example; there are many ways to build a product around a fraud score. The important point is that an ML model is often most useful when combined with a product that can take advantage of its strengths and minimize its drawbacks (like false positives).