# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# List the files available in the Kaggle input directory.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
from tqdm import tqdm
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
    title:        the text to display at the top of the matrix
    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
    normalize:    If False, plot the raw numbers
                  If True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,  # confusion matrix created by
                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,           # show proportions
                          target_names = y_labels_vals,  # list of names of the classes
                          title        = best_estimator_name)  # title of graph

    Citation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Overall accuracy = trace / total; guard against an all-zero matrix
    # (the original divided unconditionally).
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        # Row-normalise; replace zero row sums with 1 so classes that never
        # occur in y_true do not cause a division by zero.
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / np.where(row_sums == 0, 1, row_sums)

    # Colour threshold for the cell labels (normalised matrices cluster near 1).
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Proportions get 4 decimals, raw counts get thousands separators.
        label = "{:0.4f}".format(cm[i, j]) if normalize else "{:,}".format(cm[i, j])
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.savefig('confusion.png')
    plt.show()
# Load the PaySim transaction log and normalise the inconsistent
# balance-column names (the raw file mixes 'Org'/'Orig' spellings).
df = pd.read_csv('../input/PS_20174392719_1491204439457_log.csv')
df = df.rename(columns={'oldbalanceOrg':'oldBalanceOrig', 'newbalanceOrig':'newBalanceOrig', \
'oldbalanceDest':'oldBalanceDest', 'newbalanceDest':'newBalanceDest'})
df.head()
step | type | amount | nameOrig | oldBalanceOrig | newBalanceOrig | nameDest | oldBalanceDest | newBalanceDest | isFraud | isFlaggedFraud | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | PAYMENT | 9839.64 | C1231006815 | 170136.0 | 160296.36 | M1979787155 | 0.0 | 0.0 | 0 | 0 |
1 | 1 | PAYMENT | 1864.28 | C1666544295 | 21249.0 | 19384.72 | M2044282225 | 0.0 | 0.0 | 0 | 0 |
2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.0 | 0.00 | C553264065 | 0.0 | 0.0 | 1 | 0 |
3 | 1 | CASH_OUT | 181.00 | C840083671 | 181.0 | 0.00 | C38997010 | 21182.0 | 0.0 | 1 | 0 |
4 | 1 | PAYMENT | 11668.14 | C2048537720 | 41554.0 | 29885.86 | M1230701703 | 0.0 | 0.0 | 0 | 0 |
# Which transaction types ever appear among fraudulent rows?
df.loc[df.isFraud == 1].type.drop_duplicates().values
array(['TRANSFER', 'CASH_OUT'], dtype=object)
# Fraud only occurs in TRANSFER and CASH_OUT transactions, so restrict
# the working set to those two types.
df = df[df.type.isin(['TRANSFER', 'CASH_OUT'])]
len(df)
2770409
# Median amount of fraudulent TRANSFERs.
df.loc[(df.isFraud == 1) & (df.type == 'TRANSFER')].amount.median()
445705.76
# Median amount of genuine TRANSFERs — similar scale to the fraudulent ones.
df.loc[(df.isFraud == 0) & (df.type == 'TRANSFER')].amount.median()
486521.91000000003
# Rule-based baseline: flag every large TRANSFER (> 200k) as fraud.
is_big_transfer = (df['type'] == 'TRANSFER') & (df['amount'] > 200000)
df['Fraud_Heuristic'] = is_big_transfer.astype(int)
df['Fraud_Heuristic'].sum()
409110
# F1 score of the rule-based baseline against the true labels.
from sklearn.metrics import f1_score
f1_score(y_pred=df['Fraud_Heuristic'],y_true=df['isFraud'])
0.013131315551742895
# Confusion matrix of the heuristic baseline, plotted with raw counts.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_pred=df['Fraud_Heuristic'],y_true=df['isFraud'])
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)
# Rows x columns after filtering to TRANSFER/CASH_OUT and adding the flag.
df.shape
(2770409, 12)
# Derive an hour-of-day feature from the simulation step (one step per hour).
df['hour'] = df['step'] % 24

# Count fraudulent and genuine transactions for each hour of the day.
frauds = [len(df[(df['hour'] == h) & (df['isFraud'] == 1)]) for h in range(24)]
genuine = [len(df[(df['hour'] == h) & (df['isFraud'] == 0)]) for h in range(24)]

# Plot each class's hourly distribution as a share of its own total.
sns.set_style("white")
fig, ax = plt.subplots(figsize=(10, 6))
gen = ax.plot(np.divide(genuine, np.sum(genuine)), label='Genuine')
fr = ax.plot(np.divide(frauds, np.sum(frauds)), dashes=[5, 2], label='Fraud')
plt.xticks(np.arange(24))
legend = ax.legend(loc='upper center', shadow=True)
fig.savefig('time.png')
# Plot the fraction of each hour's transactions that are fraudulent.
sns.set_style("white")
fig, ax = plt.subplots(figsize=(10,6))
#gen = ax.plot(genuine/np.sum(genuine), label='Genuine')
#fr = ax.plot(frauds/np.sum(frauds),dashes=[5, 2], label='Fraud')
frgen = ax.plot(np.divide(frauds,np.add(genuine,frauds)), label='Share of fraud')
plt.xticks(np.arange(24))
legend = ax.legend(loc='upper center', shadow=True)
fig.savefig('time_comp.png')
# Do fraudulent TRANSFER destinations later appear as originators of
# fraudulent CASH_OUTs (i.e. a labelled two-step laundering chain)?
dfFraudTransfer = df[(df.isFraud == 1) & (df.type == 'TRANSFER')]
dfFraudCashOut = df[(df.isFraud == 1) & (df.type == 'CASH_OUT')]
dfFraudTransfer.nameDest.isin(dfFraudCashOut.nameOrig).any()
False
# ...or do fraudulent TRANSFER destinations instead show up as originators
# of *genuine* cash-outs?
dfNotFraud = df[(df.isFraud == 0)]
dfFraud = df[(df.isFraud == 1)]
dfFraudTransfer.loc[dfFraudTransfer.nameDest.isin(
dfNotFraud.loc[dfNotFraud.type == 'CASH_OUT'].nameOrig.drop_duplicates())]
step | type | amount | nameOrig | oldBalanceOrig | newBalanceOrig | nameDest | oldBalanceDest | newBalanceDest | isFraud | isFlaggedFraud | Fraud_Heuristic | hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1030443 | 65 | TRANSFER | 1282971.57 | C1175896731 | 1282971.57 | 0.0 | C1714931087 | 0.0 | 0.0 | 1 | 0 | 1 | 17 |
6039814 | 486 | TRANSFER | 214793.32 | C2140495649 | 214793.32 | 0.0 | C423543548 | 0.0 | 0.0 | 1 | 0 | 1 | 6 |
6362556 | 738 | TRANSFER | 814689.88 | C2029041842 | 814689.88 | 0.0 | C1023330867 | 0.0 | 0.0 | 1 | 0 | 1 | 18 |
# Share of fraud rows whose destination balances are both zero despite a
# non-zero amount being moved. NOTE(review): bare `dfFraud.amount` relies
# on float truthiness as a boolean mask — `dfFraud.amount != 0` would be
# explicit; verify this behaves identically on the installed pandas.
len(dfFraud[(dfFraud.oldBalanceDest == 0) & (dfFraud.newBalanceDest == 0) & (dfFraud.amount)]) / (1.0 * len(dfFraud))
0.4955558261293072
# Same ratio for genuine rows — orders of magnitude rarer, so the
# zero-destination pattern is a strong fraud signal. NOTE(review): as
# above, `dfNotFraud.amount` is used as a truthiness mask.
len(dfNotFraud[(dfNotFraud.oldBalanceDest == 0) & (dfNotFraud.newBalanceDest == 0) & (dfNotFraud.amount)]) / (1.0 * len(dfNotFraud))
0.0006176245277308345
# "Odd" transactions: both destination balances are zero even though a
# non-zero amount was moved.
# FIX: test the amount explicitly (`!= 0`) instead of passing a float
# Series as a boolean mask — same selection, but valid and unambiguous
# under modern pandas boolean-operator rules.
dfOdd = df[(df.oldBalanceDest == 0) &
           (df.newBalanceDest == 0) &
           (df.amount != 0)]
# Share of the odd transactions that are actually fraudulent.
len(dfOdd[(dfOdd.isFraud == 1)]) / len(dfOdd)
0.7046398891966759
# Share of odd rows where the origin balance could not cover the amount.
len(dfOdd[(dfOdd.oldBalanceOrig <= dfOdd.amount)]) / len(dfOdd)
0.8966412742382271
# Same condition restricted to the fraudulent odd rows.
len(dfOdd[(dfOdd.oldBalanceOrig <= dfOdd.amount) & (dfOdd.isFraud == 1)]) / len(dfOdd[(dfOdd.isFraud == 1)])
0.9636363636363636
# Columns currently carried by the odd-transaction frame.
dfOdd.columns
Index(['step', 'type', 'amount', 'nameOrig', 'oldBalanceOrig', 'newBalanceOrig', 'nameDest', 'oldBalanceDest', 'newBalanceDest', 'isFraud', 'isFlaggedFraud', 'Fraud_Heuristic', 'hour'], dtype='object')
# Inspect the first 20 odd transactions.
dfOdd.head(20)
step | type | amount | nameOrig | oldBalanceOrig | newBalanceOrig | nameDest | oldBalanceDest | newBalanceDest | isFraud | isFlaggedFraud | Fraud_Heuristic | hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.00 | 0.00 | C553264065 | 0.0 | 0.0 | 1 | 0 | 0 | 1 |
251 | 1 | TRANSFER | 2806.00 | C1420196421 | 2806.00 | 0.00 | C972765878 | 0.0 | 0.0 | 1 | 0 | 0 | 1 |
680 | 1 | TRANSFER | 20128.00 | C137533655 | 20128.00 | 0.00 | C1848415041 | 0.0 | 0.0 | 1 | 0 | 0 | 1 |
969 | 1 | TRANSFER | 1277212.77 | C1334405552 | 1277212.77 | 0.00 | C431687661 | 0.0 | 0.0 | 1 | 0 | 1 | 1 |
1115 | 1 | TRANSFER | 35063.63 | C1364127192 | 35063.63 | 0.00 | C1136419747 | 0.0 | 0.0 | 1 | 0 | 0 | 1 |
1248 | 1 | TRANSFER | 271161.74 | C1658487789 | 0.00 | 0.00 | C1219161283 | 0.0 | 0.0 | 0 | 0 | 1 | 1 |
1869 | 1 | TRANSFER | 25071.46 | C669700766 | 25071.46 | 0.00 | C1384210339 | 0.0 | 0.0 | 1 | 0 | 0 | 1 |
2301 | 1 | TRANSFER | 235238.66 | C1872047468 | 235238.66 | 0.00 | C116289363 | 0.0 | 0.0 | 1 | 0 | 1 | 1 |
3059 | 2 | TRANSFER | 1096187.24 | C1093223281 | 1096187.24 | 0.00 | C2063275841 | 0.0 | 0.0 | 1 | 0 | 1 | 2 |
3162 | 2 | TRANSFER | 963532.14 | C1440057381 | 963532.14 | 0.00 | C268086000 | 0.0 | 0.0 | 1 | 0 | 1 | 2 |
3271 | 2 | TRANSFER | 14949.84 | C140702728 | 14949.84 | 0.00 | C634210724 | 0.0 | 0.0 | 1 | 0 | 0 | 2 |
3683 | 2 | TRANSFER | 18627.02 | C1375503918 | 18627.02 | 0.00 | C234430897 | 0.0 | 0.0 | 1 | 0 | 0 | 2 |
4103 | 3 | TRANSFER | 10539.37 | C1134864869 | 10539.37 | 0.00 | C118648358 | 0.0 | 0.0 | 1 | 0 | 0 | 3 |
4260 | 3 | TRANSFER | 22877.00 | C1247938090 | 22877.00 | 0.00 | C1002031672 | 0.0 | 0.0 | 1 | 0 | 0 | 3 |
4440 | 4 | TRANSFER | 10000000.00 | C7162498 | 12930418.44 | 2930418.44 | C945327594 | 0.0 | 0.0 | 1 | 0 | 1 | 4 |
4442 | 4 | TRANSFER | 2930418.44 | C2047521920 | 2930418.44 | 0.00 | C449261773 | 0.0 | 0.0 | 1 | 0 | 1 | 4 |
4667 | 4 | TRANSFER | 169941.73 | C540962910 | 169941.73 | 0.00 | C2127862399 | 0.0 | 0.0 | 1 | 0 | 0 | 4 |
4693 | 4 | TRANSFER | 13707.11 | C17222024 | 13707.11 | 0.00 | C410033330 | 0.0 | 0.0 | 1 | 0 | 0 | 4 |
4775 | 4 | TRANSFER | 86070.17 | C1844941220 | 86070.17 | 0.00 | C1191544932 | 0.0 | 0.0 | 1 | 0 | 0 | 4 |
4857 | 5 | TRANSFER | 120074.73 | C1409933277 | 120074.73 | 0.00 | C162114152 | 0.0 | 0.0 | 1 | 0 | 0 | 5 |
# Current state of the working frame before encoding.
df.head()
step | type | amount | nameOrig | oldBalanceOrig | newBalanceOrig | nameDest | oldBalanceDest | newBalanceDest | isFraud | isFlaggedFraud | Fraud_Heuristic | hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.0 | 0.0 | C553264065 | 0.0 | 0.00 | 1 | 0 | 0 | 1 |
3 | 1 | CASH_OUT | 181.00 | C840083671 | 181.0 | 0.0 | C38997010 | 21182.0 | 0.00 | 1 | 0 | 0 | 1 |
15 | 1 | CASH_OUT | 229133.94 | C905080434 | 15325.0 | 0.0 | C476402209 | 5083.0 | 51513.44 | 0 | 0 | 0 | 1 |
19 | 1 | TRANSFER | 215310.30 | C1670993182 | 705.0 | 0.0 | C1100439041 | 22425.0 | 0.00 | 0 | 0 | 1 | 1 |
24 | 1 | TRANSFER | 311685.89 | C1984094095 | 10835.0 | 0.0 | C932583850 | 6267.0 | 2719172.89 | 0 | 0 | 1 | 1 |
# One-hot encode the transaction type. `pd.get_dummies` with `columns=`
# prefixes the new indicator columns ('type_CASH_OUT', 'type_TRANSFER'),
# appends them, and drops the original 'type' column in one step.
df = pd.get_dummies(df, columns=['type'], prefix='type')
Predictive modeling with Keras
# Drop the account identifiers and the heuristic flag so they cannot
# leak into the model features.
df = df.drop(['nameOrig','nameDest','Fraud_Heuristic'], axis= 1)
# Night-time flag: hours 2-6, where the hourly plot showed the fraud
# share peaking.
df['isNight'] = np.where((2 <= df['hour']) & (df['hour'] <= 6), 1,0)
df[df['isNight'] == 1].isFraud.mean()
0.35705263157894734
# Frame after identifier removal and the isNight feature.
df.head()
step | amount | oldBalanceOrig | newBalanceOrig | oldBalanceDest | newBalanceDest | isFraud | isFlaggedFraud | hour | type_CASH_OUT | type_TRANSFER | isNight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 1 | 181.00 | 181.0 | 0.0 | 0.0 | 0.00 | 1 | 0 | 1 | 0 | 1 | 0 |
3 | 1 | 181.00 | 181.0 | 0.0 | 21182.0 | 0.00 | 1 | 0 | 1 | 1 | 0 | 0 |
15 | 1 | 229133.94 | 15325.0 | 0.0 | 5083.0 | 51513.44 | 0 | 0 | 1 | 1 | 0 | 0 |
19 | 1 | 215310.30 | 705.0 | 0.0 | 22425.0 | 0.00 | 0 | 0 | 1 | 0 | 1 | 0 |
24 | 1 | 311685.89 | 10835.0 | 0.0 | 6267.0 | 2719172.89 | 0 | 0 | 1 | 0 | 1 | 0 |
# Drop the raw time columns now that isNight encodes the useful signal.
df = df.drop(['step','hour'],axis=1)
df.head()
amount | oldBalanceOrig | newBalanceOrig | oldBalanceDest | newBalanceDest | isFraud | isFlaggedFraud | type_CASH_OUT | type_TRANSFER | isNight | |
---|---|---|---|---|---|---|---|---|---|---|
2 | 181.00 | 181.0 | 0.0 | 0.0 | 0.00 | 1 | 0 | 0 | 1 | 0 |
3 | 181.00 | 181.0 | 0.0 | 21182.0 | 0.00 | 1 | 0 | 1 | 0 | 0 |
15 | 229133.94 | 15325.0 | 0.0 | 5083.0 | 51513.44 | 0 | 0 | 1 | 0 | 0 |
19 | 215310.30 | 705.0 | 0.0 | 22425.0 | 0.00 | 0 | 0 | 0 | 1 | 0 |
24 | 311685.89 | 10835.0 | 0.0 | 6267.0 | 2719172.89 | 0 | 0 | 0 | 1 | 0 |
# Final feature/target column set.
df.columns.values
array(['amount', 'oldBalanceOrig', 'newBalanceOrig', 'oldBalanceDest', 'newBalanceDest', 'isFraud', 'isFlaggedFraud', 'type_CASH_OUT', 'type_TRANSFER', 'isNight'], dtype=object)
# Split into target vector y (isFraud) and feature matrix X as numpy arrays.
y_df = df['isFraud']
x_df = df.drop('isFraud',axis=1)
y = y_df.values
X = x_df.values
y.shape
(2770409,)
# 9 feature columns remain.
X.shape
(2770409, 9)
from sklearn.model_selection import train_test_split
# Hold out 33% as the test set, then carve 10% of the remaining training
# data off as a validation set (both with a fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.1,
                                                  random_state=42)
from imblearn.over_sampling import SMOTE, RandomOverSampler
# Oversample the minority (fraud) class so training sees balanced data.
# FIX: `fit_sample` was deprecated in imbalanced-learn 0.4 and removed in
# 0.6 — `fit_resample` is the supported API.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
# Logistic regression expressed as a single-unit Keras model:
# one dense output neuron with a sigmoid activation.
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
# Log reg
model = Sequential()
model.add(Dense(1, input_dim=9))
model.add(Activation('sigmoid'))
model.summary()
_________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_20 (Dense) (None, 1) 10 _________________________________________________________________ activation_20 (Activation) (None, 1) 0 ================================================================= Total params: 10 Trainable params: 10 Non-trainable params: 0 _________________________________________________________________
# Train on the SMOTE-resampled data; validate on the untouched
# (imbalanced) validation split so val metrics reflect reality.
model.compile(loss='binary_crossentropy',
optimizer=SGD(lr=1e-5),
metrics=['acc'])
model.fit(X_train_res,y_train_res,
epochs=5,
batch_size=256,
validation_data=(X_val,y_val))
Train on 3331258 samples, validate on 185618 samples Epoch 1/5 3331258/3331258 [==============================] - 21s 6us/step - loss: 0.7453 - acc: 0.4799 - val_loss: 0.4254 - val_acc: 0.9733 Epoch 2/5 1411840/3331258 [===========>..................] - ETA: 11s - loss: 0.7419 - acc: 0.4797
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-199-7776f19f7d34> in <module>() 2 epochs=5, 3 batch_size=256, ----> 4 validation_data=(X_val_scale,y_val)) /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/models.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs) 964 initial_epoch=initial_epoch, 965 steps_per_epoch=steps_per_epoch, --> 966 validation_steps=validation_steps) 967 968 def evaluate(self, x=None, y=None, /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs) 1671 initial_epoch=initial_epoch, 1672 steps_per_epoch=steps_per_epoch, -> 1673 validation_steps=validation_steps) 1674 1675 def evaluate(self, x=None, y=None, /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps) 1206 ins_batch[i] = ins_batch[i].toarray() 1207 -> 1208 outs = f(ins_batch) 1209 if not isinstance(outs, list): 1210 outs = [outs] /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/backend/tensorflow_backend.py in __call__(self, inputs) 2473 session = get_session() 2474 updated = session.run(fetches=fetches, feed_dict=feed_dict, -> 2475 **self.session_kwargs) 2476 return updated[:len(self.outputs)] 2477 /opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata) 903 try: 904 result = self._run(None, fetches, feed_dict, options_ptr, 
--> 905 run_metadata_ptr) 906 if run_metadata: 907 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr) /opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata) 1135 if final_fetches or final_targets or (handle and feed_dict_tensor): 1136 results = self._do_run(handle, final_targets, final_fetches, -> 1137 feed_dict_tensor, options, run_metadata) 1138 else: 1139 results = [] /opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata) 1353 if handle is None: 1354 return self._do_call(_run_fn, self._session, feeds, fetches, targets, -> 1355 options, run_metadata) 1356 else: 1357 return self._do_call(_prun_fn, self._session, handle, feeds, fetches) /opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args) 1359 def _do_call(self, fn, *args): 1360 try: -> 1361 return fn(*args) 1362 except errors.OpError as e: 1363 message = compat.as_text(e.message) /opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata) 1338 else: 1339 return tf_session.TF_Run(session, options, feed_dict, fetch_list, -> 1340 target_list, status, run_metadata) 1341 1342 def _prun_fn(session, handle, feed_dict, fetch_list): KeyboardInterrupt:
# Score the held-out test set and binarise the sigmoid probabilities.
y_pred = model.predict(X_test)
# BUG FIX: the original used `> 0.5` then `< 0.5`, leaving any prediction
# exactly equal to 0.5 unbinarised; use an inclusive threshold instead.
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0
f1_score(y_pred=y_pred,y_true=y_test)
0.054384286716408395
# Confusion matrix for the logistic-regression baseline.
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)
# One-hidden-layer network: 16 tanh units feeding a sigmoid output,
# trained on the same resampled data.
model = Sequential()
model.add(Dense(16,input_dim=9))
model.add(Activation('tanh'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',optimizer=SGD(lr=1e-4), metrics=['acc'])
model.fit(X_train_res,y_train_res,
epochs=5, batch_size=256,
validation_data=(X_val,y_val))
Train on 3331258 samples, validate on 185618 samples Epoch 1/5 3331258/3331258 [==============================] - 22s 7us/step - loss: 0.6064 - acc: 0.6922 - val_loss: 1.0665 - val_acc: 0.1872 Epoch 2/5 729856/3331258 [=====>........................] - ETA: 17s - loss: 0.6029 - acc: 0.6968
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-205-b2b4ddf5127e> in <module>() 1 model.fit(X_train_res,y_train_res, 2 epochs=5, batch_size=256, ----> 3 validation_data=(X_val_scale,y_val)) /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/models.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs) 964 initial_epoch=initial_epoch, 965 steps_per_epoch=steps_per_epoch, --> 966 validation_steps=validation_steps) 967 968 def evaluate(self, x=None, y=None, /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs) 1671 initial_epoch=initial_epoch, 1672 steps_per_epoch=steps_per_epoch, -> 1673 validation_steps=validation_steps) 1674 1675 def evaluate(self, x=None, y=None, /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps) 1194 ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]] 1195 else: -> 1196 ins_batch = _slice_arrays(ins, batch_ids) 1197 except TypeError: 1198 raise TypeError('TypeError while preparing batch. 
' /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/engine/training.py in _slice_arrays(arrays, start, stop) 382 if hasattr(start, 'shape'): 383 start = start.tolist() --> 384 return [None if x is None else x[start] for x in arrays] 385 else: 386 return [None if x is None else x[start:stop] for x in arrays] /opt/conda/lib/python3.6/site-packages/Keras-2.1.3-py3.6.egg/keras/engine/training.py in <listcomp>(.0) 382 if hasattr(start, 'shape'): 383 start = start.tolist() --> 384 return [None if x is None else x[start] for x in arrays] 385 else: 386 return [None if x is None else x[start:stop] for x in arrays] KeyboardInterrupt:
# Score the hidden-layer model on the test set.
y_pred = model.predict(X_test)
# BUG FIX: as with the logistic model, `> 0.5`/`< 0.5` left predictions
# exactly at 0.5 untouched; threshold inclusively.
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0
f1_score(y_pred=y_pred,y_true=y_test)
0.001674751441885722
# Confusion matrix for the hidden-layer network.
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)
# Fit a plain decision tree for an interpretable comparison point.
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
# Export the fitted tree to Graphviz, render it to PNG, and display it.
# NOTE(review): `sklearn.externals.six` was removed in scikit-learn 0.23
# — confirm the installed version or import `six`/`io.StringIO` directly.
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
from subprocess import check_call
# NOTE(review): this PIL import shadows the IPython `Image` imported two
# lines above; only the `PImage` alias is ultimately used for display.
from PIL import Image, ImageDraw, ImageFont
from IPython.display import Image as PImage
#import pydotplus
# dot_data is left over from the commented-out in-memory export below and
# is unused once the export targets tree1.dot.
dot_data = StringIO()
'''export_graphviz(dtree, out_file=dot_data,
filled=True, rounded=True,
special_characters=True)'''
# Write the top 3 levels of the tree (with impurities) to a .dot file,
# labelling features and classes for readability.
with open("tree1.dot", 'w') as f:
f = export_graphviz(dtree,
out_file=f,
max_depth = 3,
impurity = True,
feature_names = list(df.drop(['isFraud'], axis=1)),
class_names = ['Genuine', 'Fraud'],
rounded = True,
filled= True )
#Convert .dot to .png to allow display in web notebook
check_call(['dot','-Tpng','tree1.dot','-o','tree1.png'])
# Annotating chart with PIL
img = Image.open("tree1.png")
draw = ImageDraw.Draw(img)
font = ImageFont.truetype('/usr/share/fonts/truetype/liberation/LiberationSerif-Bold.ttf', 26)
img.save('sample-out.png')
PImage("sample-out.png")
# Random forest baseline: 10 trees, all CPU cores.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10,n_jobs=-1)
rf.fit(X_train,y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Forest predictions are already hard 0/1 labels — no thresholding needed.
y_pred = rf.predict(X_test)
f1_score(y_pred=y_pred,y_true=y_test)
0.8749502190362406
# Confusion matrix for the random forest.
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)
# Gradient-boosted trees with default hyperparameters for comparison.
import xgboost as xgb
booster = xgb.XGBClassifier(n_jobs=-1)
booster = booster.fit(X_train,y_train)
y_pred = booster.predict(X_test)
f1_score(y_pred=y_pred,y_true=y_test)
0.85572959604286891
# Confusion matrix for the XGBoost model.
cm = confusion_matrix(y_pred=y_pred,y_true=y_test)
plot_confusion_matrix(cm,['Genuine','Fraud'], normalize=False)
# Reload the raw data to start over with a fresh feature set for the
# embedding model, normalising the balance-column names as before.
df = pd.read_csv('../input/PS_20174392719_1491204439457_log.csv')
df = df.rename(columns={'oldbalanceOrg': 'oldBalanceOrig',
                        'newbalanceOrig': 'newBalanceOrig',
                        'oldbalanceDest': 'oldBalanceDest',
                        'newbalanceDest': 'newBalanceDest'})
df.head()
# Identifiers and the raw step are not used by this model.
df = df.drop(['nameDest','nameOrig','step'],axis=1)
df['type'].unique()
# Integer-encode the transaction type: each distinct type gets a token id
# in order of first appearance.
map_dict = {value: token for token, value in enumerate(df['type'].unique())}
map_dict
df["type"].replace(map_dict, inplace=True)
df.head()
# Every feature column except the categorical `type` and the target.
other_cols = [c for c in df.columns if ((c != 'type') and (c != 'isFraud'))]
other_cols
from keras.models import Model
# NOTE(review): `Merge` is unused here and no longer exists in modern
# Keras — drop it from this import if the Keras version complains.
from keras.layers import Embedding, Merge, Dense, Activation, Reshape, Input, Concatenate

# Entity-embedding model: the categorical `type` column goes through a
# small learned embedding, the remaining numeric columns through a dense
# projection; the two branches are concatenated into a sigmoid head.
num_types = len(df['type'].unique())
type_embedding_dim = 3

inputs = []
outputs = []

# --- embedding branch for the transaction type -----------------------
type_in = Input(shape=(1,))
type_embedding = Embedding(num_types, type_embedding_dim, input_length=1)(type_in)
type_out = Reshape(target_shape=(type_embedding_dim,))(type_embedding)
type_model = Model(type_in, type_out)
inputs.append(type_in)
outputs.append(type_out)

# --- dense branch for the remaining numeric columns ------------------
num_rest = len(other_cols)
rest_in = Input(shape=(num_rest,))
rest_out = Dense(16)(rest_in)
rest_model = Model(rest_in, rest_out)
inputs.append(rest_in)
outputs.append(rest_out)

# --- merged classification head --------------------------------------
concatenated = Concatenate()(outputs)
x = Dense(16)(concatenated)
x = Activation('sigmoid')(x)
# BUG FIX: the original wired Dense(1) to `concatenated`, which silently
# bypassed the 16-unit hidden layer above; connect it to `x` instead.
x = Dense(1)(x)
model_out = Activation('sigmoid')(x)

merged_model = Model(inputs, model_out)
merged_model.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

# Train on the full (unsplit) frame for a single epoch as a smoke test.
types = df['type']
rest = df[other_cols]
target = df['isFraud']
history = merged_model.fit([types.values, rest.values], target.values,
                           epochs = 1, batch_size = 128)
merged_model.summary()