version 0.1, May 2016
This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US)
pip install tqdm
import pandas as pd
import zipfile
# Read the transactions CSV straight out of the zip archive, without
# extracting it to disk first.
with zipfile.ZipFile('../datasets/fraud_transactions_kaggle.csv.zip', 'r') as z:
    # Open the archive member in its own context manager so the member
    # handle is closed too, not only the enclosing archive.
    with z.open('fraud_transactions_kaggle.csv') as f:
        # The first CSV column is used as the DataFrame index.
        data = pd.read_csv(f, index_col=0)
data.head()
date | card_number | type | merchant | amount | fraud | |
---|---|---|---|---|---|---|
ID | ||||||
0 | 2011-01-01 08:00:06 | 1942 | 2 | 8328 | 65.16 | 0.0 |
1 | 2011-01-01 08:00:16 | 5629 | 2 | 42588 | 260.84 | 0.0 |
2 | 2011-01-01 08:01:28 | 408 | 2 | 15622 | 6010.05 | 0.0 |
3 | 2011-01-01 08:01:43 | 859 | 2 | 45192 | 348.46 | 0.0 |
4 | 2011-01-01 08:01:48 | 3786 | 2 | 35549 | 1160.35 | 0.0 |
# Show the last rows of the dataset.
data.tail()
date | card_number | type | merchant | amount | fraud | |
---|---|---|---|---|---|---|
ID | ||||||
199995 | 2012-12-31 17:04:18 | 4069 | 2 | 35828 | 91.22 | NaN |
199996 | 2012-12-31 17:04:51 | 9 | 2 | 46923 | 390.95 | NaN |
199997 | 2012-12-31 17:05:38 | 1481 | 1 | -1 | 0.65 | NaN |
199998 | 2012-12-31 17:05:55 | 1481 | 1 | 4535 | 390.04 | NaN |
199999 | 2012-12-31 17:25:02 | 0 | 1 | 8322 | 308.44 | NaN |
# Count each value of the `fraud` label; dropna=False keeps the rows whose
# label is missing (NaN) in the tally instead of silently omitting them.
data.fraud.value_counts(dropna=False)
0.0 171048 NaN 27909 1.0 1043 Name: fraud, dtype: int64
from datetime import datetime, timedelta
from tqdm import tqdm
Split the transactions by account (card number) and use the transaction date as the index
# Distinct card numbers, in order of first appearance.
card_numbers = data['card_number'].unique()

# Keep the original row id as a column before replacing the index with the
# transaction timestamp (needed for time-window slicing later on).
data['trx_id'] = data.index
data.index = pd.DatetimeIndex(data['date'])

# Split the transactions into one DataFrame per card with a single groupby
# pass instead of one full-table `query` scan per card. `sort=False` keeps
# the groups in order of first appearance (the same order as
# `card_numbers`), and rows inside each group keep their original order.
data_ = [card_trx for _, card_trx in tqdm(data.groupby('card_number', sort=False))]
100%|██████████| 8087/8087 [00:20<00:00, 390.15it/s]
Create Aggregated Features for one account
# One row per transaction id; the columns are filled in by the loop below.
res_agg = pd.DataFrame(index=data['trx_id'].values,
                       columns=['Trx_sum_7D', 'Trx_count_1D'])

# Look-back window sizes, computed once. `pd.Timedelta` replaces
# `pd.datetools.to_offset(...).delta`, which no longer exists in pandas.
window_7d = pd.Timedelta('7D')
window_1d = pd.Timedelta('1D')
# Label slicing on a DatetimeIndex includes both end points, so the window
# ends one microsecond before the current transaction to exclude it.
one_microsecond = timedelta(microseconds=1)

# Aggregate the history of the first account only.
trx = data_[0]
# Walk the timestamps and transaction ids together (replaces the removed
# positional `.ix[i, 'trx_id']` lookup).
for date, trx_id in zip(trx.index, trx['trx_id']):
    trx_id = int(trx_id)
    window_end = date - one_microsecond
    # Sum 7 D: total amount spent on this card during the previous 7 days.
    agg_ = trx.loc[date - window_7d:window_end]
    res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
    # Count 1D: number of transactions on this card during the previous day.
    agg_ = trx.loc[date - window_1d:window_end]
    res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]
res_agg.mean()
Trx_sum_7D 1054.881429 Trx_count_1D 0.640693 dtype: float64
Create aggregated features for all accounts
# Look-back window sizes, computed once. `pd.Timedelta` replaces
# `pd.datetools.to_offset(...).delta`, which no longer exists in pandas.
window_7d = pd.Timedelta('7D')
window_1d = pd.Timedelta('1D')
# Label slicing on a DatetimeIndex includes both end points, so each window
# ends one microsecond before the current transaction to exclude it.
one_microsecond = timedelta(microseconds=1)

# Fill the aggregated features for every account, one per-card frame at a time.
for trx in tqdm(data_):
    # Walk the timestamps and transaction ids together (replaces the removed
    # positional `.ix[i, 'trx_id']` lookup).
    for date, trx_id in zip(trx.index, trx['trx_id']):
        trx_id = int(trx_id)
        window_end = date - one_microsecond
        # Sum 7 D: total amount spent on this card during the previous 7 days.
        agg_ = trx.loc[date - window_7d:window_end]
        res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
        # Count 1D: number of transactions on this card during the previous day.
        agg_ = trx.loc[date - window_1d:window_end]
        res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]
100%|██████████| 8087/8087 [04:26<00:00, 30.33it/s]
# Preview the first rows of the aggregated-feature table.
res_agg.head()
Trx_sum_7D | Trx_count_1D | |
---|---|---|
0 | 0 | 0 |
1 | 0 | 0 |
2 | 0 | 0 |
3 | 0 | 0 |
4 | 0 | 0 |
# Re-key the transactions by their id (the `trx_id` column is kept as well)
# and attach the aggregated features, which are indexed by the same ids.
data = data.set_index('trx_id', drop=False).join(res_agg)

# Show a reproducible random sample of 15 rows, ordered by transaction id.
data.sample(15, random_state=42).sort_index()
date | card_number | type | merchant | amount | fraud | trx_id | Trx_sum_7D | Trx_count_1D | |
---|---|---|---|---|---|---|---|---|---|
trx_id | |||||||||
4082 | 2011-01-16 16:26:53 | 3558 | 2 | 13505 | 528.82 | 0.0 | 4082 | 307.85 | 0 |
23677 | 2011-04-04 08:13:41 | 1162 | 2 | 9417 | 117.29 | 0.0 | 23677 | 0 | 0 |
30074 | 2011-04-29 13:09:07 | 0 | 1 | 56997 | 21.29 | 0.0 | 30074 | 14171.9 | 2 |
65426 | 2011-09-09 10:11:24 | 4420 | 2 | 57849 | 29.70 | 0.0 | 65426 | 0 | 0 |
72272 | 2011-10-04 10:43:00 | 2114 | 2 | 5109 | 2170.65 | 0.0 | 72272 | 131020 | 7 |
74456 | 2011-10-11 17:17:22 | 2148 | 2 | 1341 | 2150.19 | 0.0 | 74456 | 0 | 0 |
84660 | 2011-11-19 17:06:58 | 1521 | 1 | 35294 | 651.59 | 0.0 | 84660 | 0 | 0 |
117167 | 2012-04-01 12:33:33 | 1471 | 1 | -1 | 650.94 | 0.0 | 117167 | 4381.21 | 1 |
119737 | 2012-04-09 14:27:12 | 2723 | 1 | 38616 | 13.03 | 0.0 | 119737 | 13614.2 | 0 |
132467 | 2012-05-27 16:43:11 | 4857 | 2 | 45373 | 41.70 | 0.0 | 132467 | 634.13 | 10 |
134858 | 2012-06-03 17:05:21 | 2114 | 1 | 18692 | 26.06 | 0.0 | 134858 | 175202 | 7 |
142133 | 2012-06-29 16:21:37 | 7588 | 2 | 35991 | 92.53 | 0.0 | 142133 | 1151.21 | 4 |
158154 | 2012-08-20 10:55:23 | 4420 | 2 | 53353 | 182.65 | 0.0 | 158154 | 121.77 | 0 |
176418 | 2012-10-16 14:23:04 | 1595 | 2 | 25985 | 15397.58 | NaN | 176418 | 0 | 0 |
186433 | 2012-11-20 11:04:00 | 4923 | 2 | 36010 | 217.89 | NaN | 186433 | 573.4 | 0 |
# Rows with a known `fraud` label form the training set; rows whose label
# is missing form the set to predict on.
missing_label = data['fraud'].isnull()
non_feature_cols = ['fraud', 'date', 'card_number']

# Target vector for the labeled rows.
y = data.loc[~missing_label, 'fraud']

# Feature matrices: same columns for both sets, without the label, the raw
# date and the card number.
X = data.loc[~missing_label].drop(non_feature_cols, axis=1)
X_kaggle = data.loc[missing_label].drop(non_feature_cols, axis=1)
X_kaggle.head()
type | merchant | amount | trx_id | Trx_sum_7D | Trx_count_1D | |
---|---|---|---|---|---|---|
trx_id | ||||||
172091 | 2 | 13273 | 208.51 | 172091 | 120165 | 14 |
172092 | 2 | 34472 | 525.05 | 172092 | 71042.4 | 0 |
172093 | 2 | 37909 | 802.24 | 172093 | 120374 | 15 |
172094 | 2 | 35167 | 130.32 | 172094 | 90638.1 | 9 |
172095 | 2 | 35073 | 9696.96 | 172095 | 0 | 0 |
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees, trained in parallel (n_jobs=-1).
# NOTE(review): class_weight='balanced' is presumably there to offset how
# rare the fraud=1 label is in this dataset - confirm.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced')
from sklearn.metrics import fbeta_score
KFold cross-validation
# `sklearn.cross_validation` has been removed from scikit-learn; KFold now
# lives in `sklearn.model_selection` and takes the number of splits instead
# of the number of samples.
from sklearn.model_selection import KFold

# 5 consecutive, unshuffled folds.
kf = KFold(n_splits=5)

# F2 score obtained on each held-out fold.
res = []
for train, test in kf.split(X):
    # `train` / `test` are positional row indices, hence `.iloc`.
    X_train, X_test = X.iloc[train], X.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]
    clf.fit(X_train, y_train)
    # Column 1 of predict_proba holds the score of the second class.
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    # Flag a transaction as soon as its score exceeds 0.05.
    y_pred = (y_pred_proba > 0.05).astype(int)
    res.append(fbeta_score(y_test, y_pred, beta=2))

# Summary statistics of the per-fold scores.
pd.Series(res).describe()
count 5.000000 mean 0.078145 std 0.032472 min 0.054945 25% 0.057692 50% 0.062500 75% 0.082713 max 0.132877 dtype: float64
Train with all the labeled data
# Refit the classifier on every labeled transaction (no hold-out).
clf.fit(X, y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Score the unlabeled transactions, flag those whose score exceeds 0.05,
# and keep the result as a Series named `fraud` keyed by transaction id.
y_pred = pd.Series(
    (clf.predict_proba(X_kaggle)[:, 1] > 0.05).astype(int),
    name='fraud',
    index=X_kaggle.index,
)
y_pred.head(10)
trx_id 172091 0 172092 1 172093 1 172094 0 172095 1 172096 1 172097 1 172098 0 172099 1 172100 0 Name: fraud, dtype: int64
# Write the predictions to CSV with a header row; the index column is
# written under the name 'ID'.
y_pred.to_csv('fraud_transactions_kaggle_1.csv', header=True, index_label='ID')