#!/usr/bin/env python
# coding: utf-8
#
# GAN comparison on Kaggle Credit Card Fraud Data
# Cody Nash
# Development Notebook
#
#
# This notebook accompanies the Toptal blog found here.
#
#
#
# Table of Contents
#
# Setup
#
# Exploratory Data Analysis (EDA)
# xgboost fraud detection
# Classification of fraud data
#
# GAN setup and training
#
# Compare GAN Output
# Generated Data Testing
# Summary of Training Data
#
# DRAGAN testing
#
# Blog Figures:
#
# Figure 3: Data Distributions by Feature and Class
# Figure 5: Comparison of GAN Outputs
# Figure 6: Accuracy of Generated Data Detection
# Figure 7: Differences in Critic Loss
# Figure 8: Effects of Additional Data
# Setup
#
# Table of contents
#
#
# - Load libraries
# - Load common functions
# - Load stored datasets
# - Use linux for xgboost and tensorflow
# In[1]:
# Load libraries and check memory
# psutil.virtual_memory()[0:2] -> (total, available) bytes; printed before and
# after the imports to show how much memory loading the libraries consumed.
import psutil ; print(list(psutil.virtual_memory())[0:2])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('ggplot')
import xgboost as xgb
import pickle
import gc
gc.collect()
print(list(psutil.virtual_memory())[0:2])
# In[2]:
# Load custom functions
import GAN_171103
# For reloading after making changes
import importlib
importlib.reload(GAN_171103)
from GAN_171103 import *  # star import supplies the GAN helpers used below (adversarial_training_*, get_data_batch, ...)
# In[3]:
# Load engineered dataset from EDA section
# Use a context manager so the file handle is closed (the original passed an
# open() call directly to pickle.load and leaked the handle).
# NOTE: 'credicard' (sic) is the actual filename written by the save cell.
with open('data/' + 'credicard.engineered.pkl', 'rb') as f:
    data = pickle.load(f)
# data columns will be all other columns except class
data_cols = list(data.columns[ data.columns != 'Class' ])
label_cols = ['Class']
print(data_cols)
print('# of data columns: ',len(data_cols))
# In[4]:
# Put columns in order of importance for xgboost fraud detection (from that section)
# sorted_cols = ['V14', 'V4', 'V12', 'V10', 'V26', 'V17', 'Amount', 'V7', 'V21', 'V28', 'V20', 'V3', 'V18', 'V8', 'V13', 'V22', 'V16', 'V11', 'V19', 'V27', 'V5', 'V6', 'V25', 'V15', 'V24', 'V9', 'V1', 'V2', 'V23', 'Class']
# sorted_cols = ['V14', 'V4', 'V12', 'V10', 'Amount', 'V26', 'V17', 'Time', 'V7', 'V28', 'V21', 'V19', 'V8', 'V3', 'V22', 'V20', 'V25', 'V11', 'V6', 'V16', 'V27', 'V5', 'V18', 'V9', 'V1', 'V2', 'V15', 'V23', 'V24', 'V13', 'Class']
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'Time', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
data = data[ sorted_cols ].copy()
# In[5]:
# Add KMeans generated classes to fraud data - see classification section for more details on this
import sklearn.cluster as cluster
# Cluster only the fraud rows (Class==1) into two groups
train = data.loc[ data['Class']==1 ].copy()
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train[ data_cols ])
# Print per-cluster row counts
print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
# Replace the fraud label with the KMeans cluster id (0/1)
fraud_w_classes = train.copy()
fraud_w_classes['Class'] = labels
# In[6]:
# Function to create toy spiral dataset (looks like swiss roll)
def create_toy_spiral_df( n, seed=0):
    """Create a 2-D toy spiral ("swiss roll"-like) dataset.

    Args:
        n: number of points to generate.
        seed: numpy random seed, for reproducibility.

    Returns:
        DataFrame with flat columns ['v1', 'v2'].
    """
    np.random.seed(seed)
    # Angle sampled uniformly over [0, 3*pi); radius grows with the angle.
    toy = np.array([ [ (i/10+1) * np.sin(i), -(i/10+1) * np.cos(i) ]
                     for i in np.random.uniform(0, 3*np.pi, size=n) ])
    # BUG FIX: the original wrapped the column list in an extra list
    # (columns=[[...]]), which produced a MultiIndex and made toy['v1']
    # return a DataFrame instead of a Series. Use a flat column list.
    toy = pd.DataFrame( toy, columns=['v'+str(i+1) for i in range(2)] )
    return toy
# toy = create_toy_spiral_df(1000)
# plt.scatter( toy['v1'], toy['v2'] ) ;
# In[7]:
# Function to create toy dataset of multiple groups of normal distributions in n dimensions
def create_toy_df( n, n_dim, n_classes, seed=0):
    """Create a toy dataset of n_classes groups of normal distributions.

    Each class gets n points; each dimension is drawn from a Normal with a
    random center in [-1, 0.9] and std in {0.1, 0.2}.

    Args:
        n: points per class.
        n_dim: number of feature dimensions.
        n_classes: number of classes/groups.
        seed: numpy random seed, for reproducibility.

    Returns:
        DataFrame with flat columns ['v1', ..., 'vN', 'Class'].
    """
    # BUG FIX: the original built columns from a nested list, creating a
    # MultiIndex; use a flat list so toy['v1'] / toy['Class'] are Series.
    toy_cols = ['v'+str(i+1) for i in range(n_dim)] + ['Class']
    toy = pd.DataFrame(columns=toy_cols)
    np.random.seed(seed)
    for class0 in range(n_classes):
        center0s = np.random.randint(-10,10,size=n_dim)/10
        var0s = np.random.randint(1,3,size=n_dim)/10
        # Start from the class label column, then prepend one feature at a time
        temp = np.array([[class0]]*n)
        for dim0 in range(n_dim):
            temp = np.hstack( [np.random.normal(center0s[dim0],var0s[dim0],n).reshape(-1,1), temp] )
        toy = pd.concat([toy,pd.DataFrame(temp,columns=toy_cols)],axis=0).reset_index(drop=True)
    return toy
# toy = create_toy_df(n=1000,n_dim=2,n_classes=2,seed=0)
# plt.scatter(toy[toy.columns[0]],toy[toy.columns[1]],c=toy['Class'], alpha=0.2) ;
# Exploratory Data Analysis (EDA)
#
# Table of contents
#
# In[8]:
# Load the credit card data
# Original data available from:
# https://www.kaggle.com/dalpozz/creditcardfraud
data = pd.read_csv("data/creditcard.csv.zip")
print(data.shape)
print(data.columns)
data.head(3)  # NOTE: as a plain script (vs notebook cell) this expression's value is discarded
# In[9]:
# data columns will be all other columns except class
label_cols = ['Class']
data_cols = list(data.columns[ data.columns != 'Class' ])
print(data_cols)
print('# of data columns: ',len(data_cols))
# In[10]:
# 284315 normal transactions (class 0)
# 492 fraud transactions (class 1)
data.groupby('Class')['Class'].count()
# In[11]:
# Total nulls in dataset (sum over rows, then over columns)
data.isnull().sum().sum()
# In[12]:
# Duplicates? Yes
# Count exact duplicate rows separately for each class, then combine.
dup_counts = {}
for cls, cls_name in ((0, 'Normal'), (1, 'Fraud')):
    dup_counts[cls_name] = data.loc[data.Class == cls].duplicated().sum()
total_duplicates = dup_counts['Normal'] + dup_counts['Fraud']
print('Normal duplicates', dup_counts['Normal'])
print('Fraud duplicates', dup_counts['Fraud'])
print('Total duplicates', total_duplicates)
print('Fraction duplicated', total_duplicates / len(data))
# In[13]:
# 'Time' is seconds from first transaction in set
# 48 hours worth of data
# Let's convert time to time of day, in hours
print( 'Last time value: {:.2f}'.format( data['Time'].max() / 3600 ) )
data['Time'] = ( data['Time'].values / 3600 ) % 24
# FIX: 'normed' was removed from matplotlib's hist (3.1+); 'density' is the
# equivalent and has been supported since matplotlib 2.1.
plt.hist( [ data.loc[ data['Class']==0, 'Time'], data.loc[ data['Class']==1, 'Time'] ],
          density=True, label=['normal','fraud'], bins=np.linspace(0,24,25))
plt.legend()
plt.show()
# Looks like normal transactions have a bias towards 8am to midnight
# Fraud has spikes at 2-3am and noon
# In[14]:
# several columns heavily skewed, 'Amount' the highest (besides Class)
data.skew()
# In[15]:
# Minimum 'Amount' is 0
# 0's account for 0.6% of the data set
print( data['Amount'].min() )
print( np.sum( data['Amount']==0 ) )
# print( np.sum( data['Amount']<0.01 ) )
print( np.sum( data['Amount']==0 ) / len(data) )
# In[16]:
# Looks like all 'Amount' values are rounded to the hundredths (0.01) place
data['Amount'].mod(0.01).hist() ;
# In[17]:
# Some values are much more frequent than others
# 0.00 comes in 12th in the list
print( data.Amount.value_counts().head(15) )
# In[18]:
# Log transform amount values to give more normal distribution
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
plt.hist(data['Amount'], bins=40)
plt.title('Original Amount Distribution')
plt.subplot(1,2,2)
d0 = np.log10( data['Amount'].values + 1 )  # log10(x+1) keeps zero amounts finite
# d0 = np.log1p( data['Amount'].values ) / np.log(10)
plt.hist( d0, bins=40 )
plt.title('Log10(x+1) Transformed Amount Distribution')
plt.show()
# In[19]:
# Use log transformed data
data['Amount'] = d0  # NOTE: relies on d0 computed in the plotting cell above
# In[21]:
# Center and scale all data, only using the middle 99.8%, so outliers don't pull too much.
# First generate the percentile data for each feature
percentiles = pd.DataFrame( np.array([ np.percentile( data[i], [ 0.1, 99.9 ] ) for i in data_cols ]).T,
                            columns=data_cols, index=['min','max'] )
# NOTE(review): the statements below were garbled in this export; reconstructed
# from context: compute the mean and std of each feature over values strictly
# between its 0.1 and 99.9 percentiles, then center and scale with those
# trimmed statistics.
percentile_means = \
    [ np.mean( data.loc[ (data[i]>percentiles[i]['min']) & (data[i]<percentiles[i]['max']), i ] ) for i in data_cols ]
percentile_stds = \
    [ np.std( data.loc[ (data[i]>percentiles[i]['min']) & (data[i]<percentiles[i]['max']), i ] ) for i in data_cols ]
percentiles.loc['mean'] = percentile_means
percentiles.loc['stddev'] = percentile_stds
# Center and scale using the trimmed mean/std
data[data_cols] = ( data[data_cols] - percentiles.loc['mean', data_cols] ) / percentiles.loc['stddev', data_cols]
# Figure 3: Data Distributions by Feature and Class
# Table of contents
# In[27]:
# Plot the data by each feature
# One histogram panel per feature, normal vs fraud, trimmed to the 0.1-99.9
# percentile range so outliers don't stretch the axes.
axarr = [[]]*len(data_cols)
columns = 4
rows = int( np.ceil( len(data_cols) / columns ) )
f, fig = plt.subplots( figsize=(columns*3.5, rows*2) )
f.suptitle('Data Distributions by Feature and Class', size=16)
for i, col in enumerate(data_cols[:]):
    axarr[i] = plt.subplot2grid( (int(rows), int(columns)), (int(i//columns), int(i%columns)) )
    axarr[i].hist( [ data.loc[ data.Class == 0, col ], data.loc[ data.Class == 1, col ] ], label=['normal','fraud'],
                   bins=np.linspace( np.percentile(data[col],0.1), np.percentile(data[col],99.9), 30 ),
                   density=True )  # FIX: 'normed' removed in matplotlib 3.1; 'density' is the equivalent
    axarr[i].set_xlabel(col, size=12)
    axarr[i].set_ylim([0,0.8])
    axarr[i].tick_params(axis='both', labelsize=10)
    if i == 0:
        legend = axarr[i].legend()
        legend.get_frame().set_facecolor('white')
    if i%4 != 0 :
        # FIX: tick_params expects booleans; the string 'off' form is deprecated
        axarr[i].tick_params(axis='y', left=False, labelleft=False)
    else:
        axarr[i].set_ylabel('Fraction',size=12)
plt.tight_layout(rect=[0,0,1,0.95]) # xmin, ymin, xmax, ymax
# plt.savefig('plots/Engineered_Data_Distributions.png')
plt.show()
# In[28]:
# Save engineered dataset for use in analysis
# Save as pickle for faster reload
# Use a context manager so the handle is flushed and closed (original leaked it).
# NOTE: 'credicard' (sic) matches the load cell - do not "fix" one without the other.
with open('data/' + 'credicard.engineered.pkl', 'wb') as f:
    pickle.dump(data, f)
# In[26]:
# # Save as csv for human readability - much slower save
# data.to_csv('data/' + 'credicard.engineered.csv.zip')
# xgboost fraud detection
#
# Table of contents
#
#
# - Here we'll use the xgboost algorithm to detect fraud cases
# In[56]:
# define the columns we want to test on, in case we want to use less than the full set
test_cols = data.columns  # full pandas Index, label column 'Class' last
# test_cols = data.columns[ data.columns != 'Amount' ]
print(len(test_cols))
print(test_cols)
# In[57]:
# Define some custom metric functions for use with the xgboost algorithm
# https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
from sklearn.metrics import recall_score, precision_score, roc_auc_score

def recall(preds, dtrain):
    """xgboost feval: recall of the rounded (0/1) predictions."""
    return 'recall', recall_score(dtrain.get_label(), np.round(preds))

def precision(preds, dtrain):
    """xgboost feval: precision of the rounded (0/1) predictions."""
    return 'precision', precision_score(dtrain.get_label(), np.round(preds))

def roc_auc(preds, dtrain):
    """xgboost feval: ROC AUC computed on the raw prediction scores."""
    return 'roc_auc', roc_auc_score(dtrain.get_label(), preds)
# In[58]:
# Set up the test and train sets
np.random.seed(0)
n_real = np.sum(data.Class==0) # 200000
n_test = np.sum(data.Class==1) # 492
train_fraction = 0.7
fn_real = int(n_real * train_fraction)  # normal rows used for training
fn_test = int(n_test * train_fraction)  # fraud rows used for training
# Shuffle each class (sampling the full class without replacement) before splitting
real_samples = data.loc[ data.Class==0, test_cols].sample(n_real, replace=False).reset_index(drop=True)
test_samples = data.loc[ data.Class==1, test_cols].sample(n_test, replace=False).reset_index(drop=True)
train_df = pd.concat([real_samples[:fn_real],test_samples[:fn_test]],axis=0,ignore_index=True).reset_index(drop=True)
# train_df = pd.concat([real_samples[:fn_test],test_samples[:fn_test]],axis=0,ignore_index=True).reset_index(drop=True)
test_df = pd.concat([real_samples[fn_real:],test_samples[fn_test:]],axis=0,ignore_index=True).reset_index(drop=True)
print( 'classes 0, 1: ', n_real, n_test )
print( 'train, test: ', len(train_df), len(test_df) )
# Last column is the label ('Class'); everything before it is a feature
X_col = test_df.columns[:-1]
y_col = test_df.columns[-1]
dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=X_col)
dtest = xgb.DMatrix(test_df[X_col], test_df[y_col], feature_names=X_col)
# In[60]:
# Run the xgboost algorithm, maximize recall on the test set
results_dict = {}
xgb_params = {
    # 'max_depth': 4,
    'objective': 'binary:logistic',
    'random_state': 0,
    'eval_metric': 'auc', # auc, error
    # 'tree_method': 'hist'
    # 'grow_policy': 'lossguide' # depthwise, lossguide
}
# NOTE(review): early stopping monitors the last entry in `evals` (the test
# set), so the reported test metrics are optimistically biased.
xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100,
                     verbose_eval=False,
                     early_stopping_rounds=20,
                     evals=[(dtrain,'train'),(dtest,'test')],
                     evals_result = results_dict,
                     feval = recall, maximize=True
                     # feval = roc_auc, maximize=True
                     )
y_pred = xgb_test.predict(dtest, ntree_limit=xgb_test.best_iteration+1)
y_true = test_df['Class'].values
print( 'best iteration: ', xgb_test.best_iteration )
print( recall( y_pred, dtest ) )
print( precision( y_pred, dtest ) )
print( roc_auc( y_pred, dtest ) )
# print( 'Accuracy: {:.3f}'.format(SimpleAccuracy(y_pred, y_true)) )
SimpleMetrics( np.round(y_pred), y_true)
# In[61]:
# Let's look at how the metrics changed on the train and test sets as more trees were added
# One curve per (dataset, metric) pair recorded by xgb.train's evals_result.
for ds_name, metric_dict in results_dict.items():
    for metric_name, series in metric_dict.items():
        plt.plot(series, label=ds_name + ' ' + metric_name)
plt.axvline(xgb_test.best_iteration, c='green', label='best iteration')
plt.xlabel('iteration')
# plt.ylabel(err)
plt.title('xgboost learning curves')
plt.legend()
plt.grid() ;
# In[62]:
# Plot feature importances
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
xgb.plot_importance(xgb_test, max_num_features=10, height=0.5, ax=ax);
# In[65]:
# Generate list of features sorted by importance in detecting fraud
# https://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator
x = xgb_test.get_fscore()  # {feature_name: split count}
sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
# print( 'Top eight features for fraud detection: ', [ i[0] for i in sorted_x[:8] ] )
sorted_cols = [i[0] for i in sorted_x] + ['Class']  # features by importance, label last
print( sorted_cols )
# In[64]:
# Plot all of the training data with paired features sorted by importance
# This takes a while
colors = ['red','blue']
markers = ['o','^']
labels = ['real','fraud']
alphas = [0.7, 0.9]
columns = 4
rows = int( np.ceil( len(data_cols) / columns / 2 ) )  # one panel per feature pair
plt.figure( figsize=(columns*3.5, rows*3) )
plt.suptitle('XGBoost Sorted Data Distributions ', size=16)
train = train_df.copy()
for i in range( int(np.floor(len(sorted_x)/2)) )[:]:
    # Pair consecutive features from the importance ranking
    col1, col2 = sorted_x[i*2][0], sorted_x[i*2+1][0]
    # print(i,col1,col2)
    plt.subplot(rows,columns,i+1)
    for group, color, marker, label, alpha in zip( train.groupby('Class'), colors, markers, labels, alphas ):
        plt.scatter( group[1][col1], group[1][col2],
                     label=label, marker=marker, alpha=alpha,
                     edgecolors=color, facecolors='none' )
    plt.xlabel(col1, size=12)
    plt.ylabel(col2, size=12)
    plt.tick_params(axis='both', labelsize=10)
    if i == 0: plt.legend(fontsize=12, edgecolor='black')
plt.tight_layout(rect=[0,0,1,0.95]) # xmin, ymin, xmax, ymax
# plt.savefig('plots/XGB_Sorted_Data_Distributions.png')
plt.show()
# In[47]:
# Lets look at the effect of the ratio of normal:fraud data in the dataset on recall and roc_auc
# We'll use cross validation to see if differences are significant
np.random.seed(0)
n_real = np.sum(data.Class==0) # 200000
n_test = np.sum(data.Class==1) # 492
real_samples = data.loc[ data.Class==0, test_cols].sample(n_real, replace=False).reset_index(drop=True)
test_samples = data.loc[ data.Class==1, test_cols].sample(n_test, replace=False).reset_index(drop=True)
X_col = data.columns[:-1]
y_col = data.columns[-1]
test_data=[]
# for i in [1]:
# for i in [0.1,0.5,1,2,10]:
# Sweep the normal:fraud ratio logarithmically from 0.1 to 100
for i in np.logspace(-1,2,8):
    print(i)
    train_df = pd.concat([real_samples[:int(n_test*i)],test_samples[:n_test]],axis=0,ignore_index=True).reset_index(drop=True)
    dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=X_col)
    results = xgb.cv(xgb_params, dtrain,
                     nfold=5, num_boost_round=100, early_stopping_rounds=10, seed=0,
                     feval=recall)
    # Record [ratio, best round index, final cv metrics row]
    test_data.append(list([i]) + list(results.tail(1).index) + list(results.tail(1).values[0]))
test_data = pd.DataFrame(test_data, columns=list(['ratio','best'])+list(results.columns))
test_data
# In[52]:
# Recall decreases as more normal data is added
# metric = 'auc'
metric = 'recall'
# xs = test_data['ratio'].values
ratios_log10 = np.log10(test_data['ratio'].values)
mean_vals = test_data['test-'+metric+'-mean'].values
std_vals = test_data['test-'+metric+'-std'].values
plt.plot(ratios_log10, mean_vals, c='C1')
# Dotted +/- one-std band around the cv mean
for band in (mean_vals + std_vals, mean_vals - std_vals):
    plt.plot(ratios_log10, band, linestyle=':', c='C2')
plt.xlabel('log10 ratio of normal:fraud data')
plt.ylabel(metric)
# plt.ylim([0.96,1.01])
plt.show()
# Classification
#
# Table of contents
# In[ ]:
# load clustering libraries
import sklearn.cluster as cluster
# In[67]:
# hdbscan not in kaggle/python at present
get_ipython().system('pip install hdbscan')  # shell escape: installs hdbscan at runtime
import hdbscan
# In[66]:
# Set up training set to consist of only fraud data
train = data.loc[ data['Class']==1 ].copy()
# Print the per-class row count (all rows are Class==1 here)
print( pd.DataFrame( [ [np.sum(train['Class']==i)] for i in np.unique(train['Class']) ], columns=['count'], index=np.unique(train['Class']) ) )
# train = pd.get_dummies(train, columns=['Class'], prefix='Class')
label_cols = [ i for i in train.columns if 'Class' in i ]
data_cols = [ i for i in train.columns if i not in label_cols ]
train_no_label = train[ data_cols ]  # features only, for clustering
# In[68]:
# Timed cell: fit three 2-D TSNE projections of the fraud rows, one per random state
get_ipython().run_cell_magic('time', '', '\n# TSNE is an interesting method to map higher dimensional data into two dimensions\n# http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html\n\n# Note TSNE map may not be what you might think:\n# https://distill.pub/2016/misread-tsne/\n\n# Create multiple projections to compare results from different random states\n\nfrom sklearn.manifold import TSNE\n\nprojections = [ TSNE(random_state=i).fit_transform(train_no_label) for i in range(3) ]\n')
# In[69]:
# Timed cell: compare clustering algorithms on the fraud rows and show each
# labeling on the feature space and the three TSNE projections.
# FIX: the exported line was split in the middle of the string literal (a
# syntax error); the call is rejoined and the cell's internal indentation,
# which the export collapsed to single spaces, is restored.
get_ipython().run_cell_magic('time', '', "\n# Now we'll compare some different clustering algorithms\n# https://github.com/scikit-learn-contrib/hdbscan/blob/master/docs/comparing_clustering_algorithms.rst\n\nalgorithms = [ \n#     [ 'KMeans', cluster.KMeans, (), {'random_state':0} ],\n    [ 'KMeans', cluster.KMeans, (), {'n_clusters':2, 'random_state':0} ],\n#     [ 'KMeans 3', cluster.KMeans, (), {'n_clusters':3, 'random_state':0} ],\n#     [ 'Agglomerative', cluster.AgglomerativeClustering, (), {} ],\n    [ 'Agglomerative', cluster.AgglomerativeClustering, (), {'linkage': 'ward', 'n_clusters': 3} ],\n#     [ 'Agg. Ave 3', cluster.AgglomerativeClustering, (), {'linkage': 'average', 'n_clusters': 3} ],\n#     [ 'Agg. Complete 3', cluster.AgglomerativeClustering, (), {'linkage': 'complete', 'n_clusters': 3} ],\n#     [ 'DBSCAN', cluster.DBSCAN, (), {'eps':0.025} ],\n#     [ 'HDBSCAN', hdbscan.HDBSCAN, (), {} ],\n    [ 'HDBSCAN', hdbscan.HDBSCAN, (), {'min_cluster_size':10, 'min_samples':1, } ],\n#     [ 'HDBSCAN 2 10', hdbscan.HDBSCAN, (), {'min_cluster_size':2, 'min_samples':10, } ],\n#     [ 'HDBSCAN 10 10 ', hdbscan.HDBSCAN, (), {'min_cluster_size':10, 'min_samples':10, } ],\n]\n\nrows = len(algorithms)\ncolumns = 4\nplt.figure(figsize=(columns*3, rows*3))\n\nfor i, [name, algorithm, args, kwds] in enumerate(algorithms):\n    print(i, name)\n\n    labels = algorithm(*args, **kwds).fit_predict(train_no_label)\n#     labels = algorithm(*args, **kwds).fit_predict(projections[0])\n\n#     print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )\n\n    colors = np.clip(labels,-1,9)\n    colors = [ 'C'+str(i) if i>-1 else 'black' for i in colors ]\n\n    plt.subplot(rows,columns,i*columns+1)\n    plt.scatter(train_no_label[data_cols[0]], train_no_label[data_cols[1]], c=colors)\n    plt.xlabel(data_cols[0]), plt.ylabel(data_cols[1])\n    plt.title(name)\n\n    for j in range(3):\n        plt.subplot(rows,columns,i*columns+1+j+1)\n        plt.scatter(*(projections[j].T), c=colors)\n        plt.xlabel('x'), plt.ylabel('y')\n        plt.title('TSNE projection '+str(j+1),size=12)\n\n#     break\n\nplt.suptitle('Comparison of Fraud Clusters', size=16)\nplt.tight_layout(rect=[0,0,1,0.95])\nplt.savefig('plots/Fraud_Cluster_Diagram.png')\nplt.show()\n")
# In[70]:
# Now pick a set of labels and add to the dataset
# Final choice: 2-cluster KMeans on the unlabeled fraud rows.
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
clusterer = algorithm(*args, **kwds)
labels = clusterer.fit_predict(train_no_label)
# labels = clusterer.fit_predict(projections[0])
unique_labels = np.unique(labels)
cluster_counts = [[np.sum(labels == lbl)] for lbl in unique_labels]
print(pd.DataFrame(cluster_counts, columns=['count'], index=unique_labels))
fraud_w_classes = train.copy()
fraud_w_classes['Class'] = labels
# In[76]:
# Let's see which features are most useful for detecting differences between the classes:
# Train a quick xgboost model to predict the KMeans cluster id from the features.
dtrain = xgb.DMatrix(fraud_w_classes[data_cols], fraud_w_classes['Class'], feature_names=data_cols)
xgb_params = {
    'objective': 'binary:logistic',
    'random_state': 0,
    'eval_metric': 'auc', # allows for balanced or unbalanced classes
}
xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10) # limit to ten rounds for fast evaluation
import operator
x = xgb_test.get_fscore()  # {feature_name: split count}
sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
print( 'Top eight features: ', [ [i[0],i[1]] for i in sorted_x[:8] ] )
# GAN setup and training
#
# Table of contents
#
# Resources:
#
# https://github.com/wayaai/GAN-Sandbox
#
# http://blog.richardweiss.org/2017/07/21/conditional-wasserstein-gan.html
#
#
# Conversion of Waya.ai GAN and WGAN
# - Remove convolutional layers
# - Remove ResNeXt
# - Add simple dense networks
# - Convert WGAN back to simple GAN
# - Use common functions as much as possible
#
#
# For these functions I've passed the arguments as lists because it was easy to implement.
#
# Standard solutions (because they are more stable) include using keyword dictionaries or objects, but those create a larger code base.
#
# In[78]:
# reloading the libraries and setting the parameters
import GAN_171103
import importlib
importlib.reload(GAN_171103) # For reloading after making changes
from GAN_171103 import *
rand_dim = 32 # 32 # needs to be ~data_dim
base_n_count = 128 # 128
nb_steps = 500 + 1 # 50000 # Add one for logging of the last interval
batch_size = 128 # 64
k_d = 1 # number of critic network updates per adversarial training step
k_g = 1 # number of generator network updates per adversarial training step
critic_pre_train_steps = 100 # 100 # number of steps to pre-train the critic before starting adversarial training
log_interval = 100 # 100 # interval (in steps) at which to log loss summaries and save plots of image samples to disc
learning_rate = 5e-4 # 5e-5
data_dir = 'cache/'
# None -> train from scratch (no checkpoints to resume from)
generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# show = False
show = True
# train = create_toy_spiral_df(1000)
# train = create_toy_df(n=1000,n_dim=2,n_classes=4,seed=0)
train = fraud_w_classes.copy().reset_index(drop=True) # fraud only with labels from classification
# train = pd.get_dummies(train, columns=['Class'], prefix='Class', drop_first=True)
label_cols = [ i for i in train.columns if 'Class' in i ]
data_cols = [ i for i in train.columns if i not in label_cols ]
train[ data_cols ] = train[ data_cols ] / 10 # scale to random noise size, one less thing to learn
train_no_label = train[ data_cols ]
# In[79]:
# Timed cell: train GAN (unconditioned) and CGAN (label-conditioned)
get_ipython().run_cell_magic('time', '', '\n# Training the vanilla GAN and CGAN architectures\n\nk_d = 1 # number of critic network updates per adversarial training step\nlearning_rate = 5e-4 # 5e-5\narguments = [rand_dim, nb_steps, batch_size, \n k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,\n data_dir, generator_model_path, discriminator_model_path, loss_pickle_path, show ]\n\nadversarial_training_GAN(arguments, train_no_label, data_cols ) # GAN\nadversarial_training_GAN(arguments, train, data_cols=data_cols, label_cols=label_cols ) # CGAN\n')
# In[80]:
# Timed cell: train WGAN and WCGAN (Wasserstein variants, more critic updates)
get_ipython().run_cell_magic('time', '', '\n# Training the WGAN and WCGAN architectures\n\nk_d = 5 # train critic to optimal state each time\nlearning_rate = 1e-4 # 5e-5\narguments = [rand_dim, nb_steps, batch_size, \n k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,\n data_dir, generator_model_path, discriminator_model_path, loss_pickle_path, show ]\n\nadversarial_training_WGAN(arguments, train_no_label, data_cols=data_cols ) # WGAN\nadversarial_training_WGAN(arguments, train, data_cols=data_cols, label_cols=label_cols ) # WCGAN\n')
# In[ ]:
# %%time
# # for continued training
# import GAN_1711103
# import importlib
# importlib.reload(GAN_171103) # For reloading after making changes
# from GAN_171103 import *
# last_step = 1000
# prefix = 'WGAN'
# # data_dir = 'cache lr mix base 128 act mix 171026/'
# data_dir = 'cache/'
# # Choose your learning rate
# # learning_rate = 1e-5 # first 10k
# # learning_rate = 1e-5 # 10-15k
# # learning_rate = 1e-6 # 15-20k
# generator_model_path = data_dir + prefix + '_generator_model_weights_step_' + str(last_step) + '.h5'
# discriminator_model_path = data_dir + prefix + '_discriminator_model_weights_step_' + str(last_step) + '.h5'
# loss_pickle_path = data_dir + prefix + '_losses_step_' + str(last_step) + '.pkl'
# nb_steps = 4000
# arguments = [rand_dim, nb_steps, batch_size,
# k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
# data_dir, generator_model_path, discriminator_model_path, loss_pickle_path, show ]
# # Choose your training algorithm
# # adversarial_training_GAN(arguments, train_no_label, data_cols=data_cols, starting_step=last_step+1 ) # GAN
# # adversarial_training_GAN(arguments, train, data_cols=data_cols, label_cols=label_cols, starting_step=last_step+1 ) # CGAN
# # adversarial_training_WGAN(arguments, train_no_label, data_cols=data_cols, starting_step=last_step+1 ) # WGAN
# adversarial_training_WGAN(arguments, train, data_cols=data_cols, label_cols=label_cols, starting_step=last_step+1 ) # WCGAN
# In[82]:
# For reloading loss data from pickles
prefix = 'WCGAN'
step = 500
# The training loop pickles its losses as a 4-list
[combined_loss, disc_loss_generated, disc_loss_real, xgb_losses] = pickle.load(
    open(data_dir+prefix+'_losses_step_'+str(step)+'.pkl','rb'))
# In[84]:
# plt.plot( xgb_losses[:] ) ;
w = 10  # rolling-mean window
plt.plot( pd.DataFrame(xgb_losses[:]).rolling(w).mean() ) ;
# In[85]:
# Look for the step with the lowest xgboost accuracy, and the lowest step saved (every 100)
# NOTE(review): the *10 implies xgb_losses is recorded every 10 training steps
# - confirm against the logging cadence in GAN_171103.
best_step = list(xgb_losses).index( xgb_losses.min() ) * 10
print( best_step, xgb_losses.min() )
xgb100 = [ xgb_losses[i] for i in range(0, len(xgb_losses), 10) ]
best_step = xgb100.index( min(xgb100) ) * log_interval
print( best_step, min(xgb100) )
# In[86]:
# Look for the step with the lowest critic loss, and the lowest step saved (every 100)
delta_losses = np.array(disc_loss_real) - np.array(disc_loss_generated)
best_step = list(delta_losses).index( delta_losses.min() )
print( best_step, delta_losses.min() )
delta100 = [ delta_losses[i] for i in range(0, len(delta_losses), 100) ]
best_step = delta100.index( min(delta100) ) * log_interval
print( best_step, min(delta100) )
# In[87]:
# plt.plot( (np.array(disc_loss_real) - np.array(disc_loss_generated)) )
w = 50  # wider window for the noisier critic-loss difference
# plt.plot( list(range(0,5001,1)), pd.rolling_mean((np.array(disc_loss_real) - np.array(disc_loss_generated)),w) )
plt.plot( pd.DataFrame(disc_loss_real[:]).rolling(w).mean() - pd.DataFrame(disc_loss_generated[:]).rolling(w).mean() ) ;
# plt.xlim([9000,10000])
# plt.ylim([0.03,0.05])
# In[88]:
# Let's look at some of the generated data
# First create the networks locally and load the weights
import GAN_171103
import importlib
importlib.reload(GAN_171103) # For reloading after making changes
from GAN_171103 import *
seed = 17
train = fraud_w_classes.copy().reset_index(drop=True) # fraud only with labels from classification
# train = pd.get_dummies(train, columns=['Class'], prefix='Class', drop_first=True)
label_cols = [ i for i in train.columns if 'Class' in i ]
data_cols = [ i for i in train.columns if i not in label_cols ]
train[ data_cols ] = train[ data_cols ] / 10 # scale to random noise size, one less thing to learn
train_no_label = train[ data_cols ]
data_dim = len(data_cols)
label_dim = len(label_cols)
# Conditional (CGAN/WCGAN) models are used whenever label columns exist
with_class = False
if label_dim > 0: with_class = True
np.random.seed(seed)
# define network models
# generator_model, discriminator_model, combined_model = define_models_CGAN(rand_dim, data_dim, label_dim, base_n_count, type='Wasserstein')
# generator_model.load_weights('cache/WCGAN_generator_model_weights_step_4800.h5')
generator_model, discriminator_model, combined_model = define_models_CGAN(rand_dim, data_dim, label_dim, base_n_count)
generator_model.load_weights('cache/CGAN_generator_model_weights_step_500.h5')
# with_class = False
# train = train_no_label
# label_cols = []
# # generator_model, discriminator_model, combined_model = define_models_GAN(rand_dim, data_dim, base_n_count, type='Wasserstein')
# # generator_model.load_weights('cache/WGAN_generator_model_weights_step_4800.h5')
# generator_model, discriminator_model, combined_model = define_models_GAN(rand_dim, data_dim, base_n_count)
# generator_model.load_weights('cache/GAN_generator_model_weights_step_5000.h5')
# In[89]:
# Now generate some new data
test_size = 492 # Equal to all of the fraud cases
# BUG FIX: the original passed seed=i+j, silently picking up stale loop
# indices left over from earlier cells; use the cell-level `seed` so the
# sampled batch is reproducible.
x = get_data_batch(train, test_size, seed=seed)
z = np.random.normal(size=(test_size, rand_dim))
if with_class:
    labels = x[:,-label_dim:]  # conditioning labels are the trailing columns
    g_z = generator_model.predict([z, labels])
else:
    g_z = generator_model.predict(z)
# In[93]:
# Check using the same functions used during GAN training
# CheckAccuracy / PlotData come from GAN_171103 (star import above)
print( CheckAccuracy( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim ) )
PlotData( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )
# In[94]:
# Now we can train and test an xgboost classifier on our generated data
# 'syn_label' marks whether a row is real (0) or GAN-generated (1)
real_samples = pd.DataFrame(x, columns=data_cols+label_cols)
test_samples = pd.DataFrame(g_z, columns=data_cols+label_cols)
real_samples['syn_label'] = 0
test_samples['syn_label'] = 1
training_fraction = 0.5
n_real, n_test = int(len(real_samples)*training_fraction), int(len(test_samples)*training_fraction)
train_df = pd.concat([real_samples[:n_real],test_samples[:n_test]],axis=0)
test_df = pd.concat([real_samples[n_real:],test_samples[n_test:]],axis=0)
# X_col = test_df.columns[:-(label_dim+1)]
X_col = test_df.columns[:-1]  # everything except syn_label (includes 'Class')
y_col = test_df.columns[-1]
dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=X_col)
dtest = xgb.DMatrix(test_df[X_col], feature_names=X_col)
y_true = test_df['syn_label']
# dtrain = np.vstack( [ x[:int(len(x)/2)], g_z[:int(len(g_z)/2)] ] )
# dlabels = np.hstack( [ np.zeros(int(len(x)/2)), np.ones(int(len(g_z)/2)) ] )
# dtest = np.vstack( [ x[int(len(x)/2):], g_z[int(len(g_z)/2):] ] )
# dtrain = xgb.DMatrix(dtrain, dlabels, feature_names=data_cols+label_cols)
# dtest = xgb.DMatrix(dtest, feature_names=data_cols+label_cols)
# y_true = dlabels
xgb_params = {
    'max_depth': 4,
    'objective': 'binary:logistic',
    'random_state': 0,
    'eval_metric': 'auc', # allows for balanced or unbalanced classes
}
xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = np.round(xgb_test.predict(dtest))
# Accuracy near 0.5 would mean generated data is indistinguishable from real
print( '{:.2f}'.format(SimpleAccuracy(y_pred, y_true)) )
# In[95]:
# Let's look at how the discrimnator scored real and generated data, visualized along every feature
y_pred0 = xgb_test.predict(dtest)  # raw scores (not rounded) used for the colormap
for i in range(0,len(X_col)-1, 2):
    f, axarr = plt.subplots(1, 2, figsize=(6,2) )
    # First half of test_df is real data, second half generated
    axarr[0].scatter( test_df[:n_real][X_col[i]], test_df[:n_real][X_col[i+1]], c=y_pred0[:n_real], cmap='plasma' )
    axarr[0].set_title('real')
    axarr[0].set_ylabel(X_col[i+1])
    axarr[1].scatter( test_df[n_real:][X_col[i]], test_df[n_real:][X_col[i+1]], c=y_pred0[n_real:], cmap='plasma' )
    axarr[1].set_title('test')
    # Share axis limits so the two panels are directly comparable
    axarr[1].set_xlim(axarr[0].get_xlim()), axarr[1].set_ylim(axarr[0].get_ylim())
    for a in axarr:
        a.set_xlabel(X_col[i])
    plt.show()
# In[99]:
# Now we can compare the real and generated data by class, along every feature
colors = ['red','blue']
markers = ['o','^']
labels = ['real','fraud']
class_label = 'Class'
for i in range(0,len(X_col), 2):
    col1, col2 = i, i+1
    if i+1 >= len(X_col): continue  # odd feature count: skip the unpaired last feature
    f, axarr = plt.subplots(1, 2, figsize=(6,2) )
    # Left panel: real rows (first half of test_df), split by class
    for group, color, marker, label in zip( test_df[:n_real].groupby(class_label), colors, markers, labels ):
        axarr[0].scatter( group[1][X_col[col1]], group[1][X_col[col2]], label=label, c=color, marker=marker, alpha=0.2)
    axarr[0].legend()
    axarr[0].set_title('real')
    axarr[0].set_ylabel(X_col[col2])
    # Right panel: generated rows (second half), same axis limits for comparison
    for group, color, marker, label in zip( test_df[n_real:].groupby(class_label), colors, markers, labels ):
        axarr[1].scatter( group[1][X_col[col1]], group[1][X_col[col2]], label=label, c=color, marker=marker, alpha=0.2)
    axarr[1].set_xlim(axarr[0].get_xlim()), axarr[1].set_ylim(axarr[0].get_ylim())
    axarr[1].legend()
    axarr[1].set_title('generated') ;
    for a in axarr:
        a.set_xlabel(X_col[col1])
    plt.show()
# In[100]:
# Evaluate performance on validation set
SimpleMetrics(y_pred,y_true)
# In[101]:
# Plot feature importances used for identifying generated data
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
xgb.plot_importance(xgb_test, max_num_features=20, height=0.5, ax=ax);
# Compare GAN Output
#
# Table of contents
#
#
#
# It may be best to restart the notebook, run the setup section, and then run this section
#
# In[8]:
# Set up the training dataset
# Fraud-only rows with the KMeans class labels one-hot encoded (first level dropped).
train = fraud_w_classes.copy().reset_index(drop=True)
train = pd.get_dummies(train, columns=['Class'], prefix='Class', drop_first=True)
label_cols, data_cols = [], []
for col in train.columns:
    (label_cols if 'Class' in col else data_cols).append(col)
# Scale features toward the random-noise range: one less thing for the GAN to learn.
train[data_cols] = train[data_cols] / 10
train_no_label = train[data_cols]
data_dim, label_dim = len(data_cols), len(label_cols)
# In[9]:
# Generate one untrained generator per GAN flavour.  Conditional variants
# (CGAN, WCGAN) also take the label dimension; the Wasserstein variants are
# selected through the helper's `type` argument.
rand_dim = 32        # length of the generator's random-noise input
base_n_count = 128   # base layer width for the networks
model_names = ['GAN','CGAN','WGAN','WCGAN']
with_classes = [False, True, False, True]
type0s = [None, None, 'Wasserstein', 'Wasserstein']
models = {}
for model_name, with_class, type0 in zip(model_names, with_classes, type0s):
    if with_class:
        built = define_models_CGAN(rand_dim, data_dim, label_dim, base_n_count, type=type0)
    else:
        built = define_models_GAN(rand_dim, data_dim, base_n_count, type=type0)
    generator_model, discriminator_model, combined_model = built
    # Only the generator is kept; weights are loaded from disk later.
    models[model_name] = [model_name, with_class, type0, generator_model]
# In[10]:
# Setup parameters
seed = 17
test_size = 492 # number of fraud cases
np.random.seed(seed)
# Fixed noise batch and a matching batch of real rows, reused for every model
# so the panels in Figure 5 are directly comparable.
z = np.random.normal(size=(test_size, rand_dim))
x = get_data_batch(train, test_size, seed=seed)
real_samples = pd.DataFrame(x, columns=data_cols+label_cols)
labels = x[:,-label_dim:]  # one-hot class labels for the conditional models
# colors = ['C1','C9']
# colors = ['xkcd:plum', 'xkcd:navy']
colors = ['red','blue']
markers = ['o','^']
class_labels = ['Class 1','Class 2']
col1, col2 = 'V17', 'V10'  # the two features plotted against each other
base_dir = 'cache lr mix base 128 act mix 171026/'  # saved generator weights
# Figure 5: Comparison of GAN outputs
# Table of contents
# In[11]:
# Grid of scatter plots: one row per training step, first column the real
# fraud data, remaining columns the four generators' output at that step.
# model_steps = [500, 5000]
# model_steps = [ 0, 100, 200, 500, 1000 ]
model_steps = [ 0, 100, 200, 500, 1000, 2000, 5000]
rows = len(model_steps)
columns = 5
axarr = [[]]*len(model_steps)  # keeps the first-column axes for limit sharing
fig = plt.figure(figsize=(14,rows*3))
for model_step_ix, model_step in enumerate( model_steps ):
    print(model_step)
    # First column: the actual fraud data, split by class.
    axarr[model_step_ix] = plt.subplot(rows, columns, model_step_ix*columns + 1)
    for group, color, marker, label in zip( real_samples.groupby('Class_1'), colors, markers, class_labels ):
        plt.scatter( group[1][[col1]], group[1][[col2]],
                     label=label, marker=marker, edgecolors=color, facecolors='none' )
    plt.title('Actual Fraud Data')
    plt.ylabel(col2) # Only add y label to left plot
    plt.xlabel(col1)
    # Remember the real-data axis limits so every panel in the row matches.
    xlims, ylims = axarr[model_step_ix].get_xlim(), axarr[model_step_ix].get_ylim()
    if model_step_ix == 0:
        legend = plt.legend()
        legend.get_frame().set_facecolor('white')
    # Remaining columns: each model's generated samples at this step.
    for i, model_name in enumerate( model_names[:] ):
        [ model_name, with_class, type0, generator_model ] = models[model_name]
        generator_model.load_weights( base_dir + model_name + '_generator_model_weights_step_'+str(model_step)+'.h5')
        ax = plt.subplot(rows, columns, model_step_ix*columns + 1 + (i+1) )
        if with_class:
            # Conditional generators take the label batch as a second input.
            g_z = generator_model.predict([z, labels])
            gen_samples = pd.DataFrame(g_z, columns=data_cols+label_cols)
            for group, color, marker, label in zip( gen_samples.groupby('Class_1'), colors, markers, class_labels ):
                plt.scatter( group[1][[col1]], group[1][[col2]],
                             label=label, marker=marker, edgecolors=color, facecolors='none' )
        else:
            g_z = generator_model.predict(z)
            gen_samples = pd.DataFrame(g_z, columns=data_cols)
            plt.scatter( gen_samples[[col1]], gen_samples[[col2]],
                         label=class_labels[0], marker=markers[0], edgecolors=colors[0], facecolors='none' )
        plt.title(model_name)
        plt.xlabel(data_cols[0])
        ax.set_xlim(xlims), ax.set_ylim(ylims)
plt.suptitle('Comparison of GAN outputs', size=16)
plt.tight_layout(rect=[0.075,0,1,0.95])
# Adding text labels for training steps
vpositions = np.array([ i._position.bounds[1] for i in axarr ])
vpositions += ((vpositions[0] - vpositions[1]) * 0.35 )
for model_step_ix, model_step in enumerate( model_steps ):
    fig.text( 0.05, vpositions[model_step_ix], 'training\nstep\n'+str(model_step), ha='center', va='center', size=12)
# plt.savefig('plots/Comparison_of_GAN_outputs.png')
# Generated Data Testing
#
# Table of contents
# In[20]:
# Setup xgboost parameters
# Defaults are kept for everything except the objective and eval metric;
# commented-out entries record settings tried during development.
xgb_params = {
    # 'max_depth': 4,
    'objective': 'binary:logistic',
    'random_state': 0,
    'eval_metric': 'auc', # auc, error
    # 'tree_method': 'hist'
    # 'grow_policy': 'lossguide' # depthwise, lossguide
}
# https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
from sklearn.metrics import recall_score, precision_score, roc_auc_score

def recall(preds, dtrain):
    """Custom xgboost eval: recall of the 0.5-thresholded predictions."""
    y_true = dtrain.get_label()
    return 'recall', recall_score(y_true, np.round(preds))

def precision(preds, dtrain):
    """Custom xgboost eval: precision of the 0.5-thresholded predictions."""
    y_true = dtrain.get_label()
    return 'precision', precision_score(y_true, np.round(preds))

def roc_auc(preds, dtrain):
    """Custom xgboost eval: ROC AUC of the raw prediction scores."""
    y_true = dtrain.get_label()
    return 'roc_auc', roc_auc_score(y_true, preds)
# In[21]:
# Define model parameters
seed = 17
np.random.seed(seed)
data_dim = len(data_cols)
label_dim = len(label_cols)
base_dir = 'cache lr mix base 128 act mix 171026/'  # saved generator weights
rand_dim = 32       # generator noise dimension (must match training)
base_n_count = 128  # network base width (must match training)
# In[22]:
# defined training set parameters
train_fraction = 0.7  # real-data train/test split used by MakeCrossFolds
X_col = data.columns[:-1]  # feature columns
y_col = data.columns[-1]   # 'Class' label column
folds = 5                  # number of cross-validation folds
# In[28]:
# Function to make cross folds with different amounts of an additional dataset added
def MakeCrossFolds( g_z_df=[] ):
    """Build `folds` shuffled train/test splits of the credit-card data.

    Relies on the notebook globals ``data`` (the engineered DataFrame),
    ``folds`` and ``train_fraction``.

    Parameters
    ----------
    g_z_df : DataFrame or empty list, optional
        Extra (e.g. GAN-generated) fraud rows appended to every fold's fraud
        training set.  When empty, the middle slice of real fraud rows (those
        in neither the 100-row train set nor the 148-row test set) is used
        instead, so real and generated additions are compared on an equal
        footing.  The ``[]`` default is only rebound, never mutated, so the
        shared-mutable-default pitfall does not apply here.

    Returns
    -------
    tuple
        (n_train_fraud, train_real_set, test_real_set,
         train_fraud_set, test_fraud_set) — the last four are lists with one
        DataFrame per fold.
    """
    np.random.seed(0)
    train_real_set, test_real_set = [], []
    train_fraud_set, test_fraud_set = [], []
    real_samples = data.loc[ data.Class==0 ].copy()
    fraud_samples = data.loc[ data.Class==1 ].copy()
    n_temp_real = len(real_samples)  # use every real (non-fraud) row
    for seed in range(folds):
        np.random.seed(seed)  # reproducible per-fold shuffles
        fraud_samples = fraud_samples.sample(len(fraud_samples), replace=False).reset_index(drop=True) # shuffle
        n_train_fraud = 100  # fixed, deliberately small fraud training set
        train_fraud_samples = fraud_samples[:n_train_fraud].reset_index(drop=True)
        n_test_fraud = 148 # 30% left out
        test_fraud_samples = fraud_samples[-n_test_fraud:].reset_index(drop=True)
        if len(g_z_df) == 0:
            # No generated data supplied: fall back to the held-out middle
            # slice of real fraud rows (captured on the first fold only).
            g_z_df = fraud_samples[n_train_fraud:-n_test_fraud]
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        train_fraud_samples = pd.concat([train_fraud_samples, g_z_df]).reset_index(drop=True)
        real_samples = real_samples.sample(len(real_samples), replace=False).reset_index(drop=True) # shuffle
        temp_real_samples = real_samples[:n_temp_real]
        n_train_real = int(len(temp_real_samples) * train_fraction)
        train_real_samples = temp_real_samples[:n_train_real].reset_index(drop=True) # with margin
        test_real_samples = temp_real_samples[n_train_real:].reset_index(drop=True) # with margin
        train_real_set.append( train_real_samples )
        test_real_set.append( test_real_samples )
        train_fraud_set.append( train_fraud_samples )
        test_fraud_set.append( test_fraud_samples )
    # Report the sizes of the last fold as a sanity check.
    print( n_train_fraud )
    for i in [ fraud_samples, g_z_df, train_fraud_samples, test_fraud_samples ]: print( len(i) )
    for i in [ real_samples, train_real_samples, test_real_samples ]: print( len(i) )
    return n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set
# In[29]:
# function to run an xgboost classifier on different cross-folds with different amounts of data added
def Run_CV_Xgb(n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set):
    """Train and score xgboost on each fold while growing the fraud train set.

    For each ratio on a log scale (1x up to (492-148)/100 x of the base
    ``n_train_fraud`` rows) and each of the notebook-global ``folds``,
    trains with early stopping and records recall/precision/AUC on the
    held-out fold.  Uses the notebook globals ``folds``, ``X_col``,
    ``y_col``, ``xgb_params`` and the metric helpers recall/precision/roc_auc.

    Returns
    -------
    pd.DataFrame
        One row per (fold, ratio) with columns
        ['k', 'ratio', 'best', 'recall', 'precision', 'auc'].
    """
    test_data = []
    # Largest ratio adds all (492 - 148 - 100) remaining fraud-sized rows.
    for i in np.logspace(0, np.log10((492-148)/100), num=5):
        print('# additional generated data tested: {}'.format(int(n_train_fraud*(i-1))))
        for k in range(folds):
            # Grow the fraud portion of the training set by the current ratio.
            train_df = pd.concat(
                [ train_real_set[k], train_fraud_set[k][:int(n_train_fraud*i)] ],
                axis=0, ignore_index=True).reset_index(drop=True)
            test_df = pd.concat(
                [ test_real_set[k], test_fraud_set[k] ],
                axis=0, ignore_index=True).reset_index(drop=True)
            dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=X_col)
            dtest = xgb.DMatrix(test_df[X_col], test_df[y_col], feature_names=X_col)
            results_dict = {}
            xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100,
                                 verbose_eval=False, early_stopping_rounds=10,
                                 evals=[(dtrain,'train'),(dtest,'test')],
                                 evals_result=results_dict)
            # Predict with the early-stopped tree count.  NOTE(review):
            # ntree_limit is deprecated in newer xgboost (use iteration_range);
            # kept for compatibility with the version this notebook targets.
            y_pred = xgb_test.predict(dtest, ntree_limit=xgb_test.best_iteration+1)
            # (an unused `y_true` local was removed here)
            results = [k, i, xgb_test.best_iteration,
                       recall(y_pred, dtest)[1],
                       precision(y_pred, dtest)[1],
                       roc_auc(y_pred, dtest)[1]]
            test_data.append(results)
    return pd.DataFrame(test_data, columns=['k', 'ratio', 'best', 'recall', 'precision', 'auc'])
# In[ ]:
get_ipython().run_cell_magic('time', '', "\n# Generate and test data with untrained model\n\ngenerator_model, discriminator_model, combined_model = define_models_CGAN(rand_dim, data_dim, label_dim, base_n_count, type='Wasserstein')\ngenerator_model.load_weights( base_dir + 'WCGAN_generator_model_weights_step_0.h5')\n\ntest_size = 492\nx = get_data_batch(fraud_w_classes, test_size, seed=0)\nz = np.random.normal(size=(test_size, rand_dim))\nlabels = x[:,-label_dim:]\ng_z = generator_model.predict([z, labels])\n\n\n# The labels for the generate data will all be 1, as they are supposed to be fraud data\ng_z_df = pd.DataFrame( np.hstack( [g_z[:,:len(data_cols)], np.ones((len(g_z),1))] ), columns=data.columns )\n\nn_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set = MakeCrossFolds(g_z_df)\n \nt_0 = Run_CV_Xgb(n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set)\n")
# In[114]:
get_ipython().run_cell_magic('time', '', "\n# Generate and test data with trained model\n\ngenerator_model, discriminator_model, combined_model = define_models_CGAN(rand_dim, data_dim, label_dim, base_n_count, type='Wasserstein')\ngenerator_model.load_weights( base_dir + 'WCGAN_generator_model_weights_step_4800.h5')\n\ntest_size = 492\nx = get_data_batch(fraud_w_classes, test_size, seed=0)\nz = np.random.normal(size=(test_size, rand_dim))\nlabels = x[:,-label_dim:]\ng_z = generator_model.predict([z, labels])\n\n\n\n# The labels for the generate data will all be 1, as they are supposed to be fraud data\ng_z_df = pd.DataFrame( np.hstack( [g_z[:,:len(data_cols)], np.ones((len(g_z),1))] ), columns=data.columns )\n\nn_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set = MakeCrossFolds(g_z_df)\n \nt_4800 = Run_CV_Xgb(n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set)\n")
# In[115]:
get_ipython().run_cell_magic('time', '', '\n# Generate and test data with additional real data\n\nn_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set = MakeCrossFolds()\n \nt_real = Run_CV_Xgb(n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set)\n')
# In[120]:
# # Save the testing data
# # Run using the WCGAN trained after 0 steps
# pickle.dump( t_0, open('cache/additional untrained generated fraud data test.pkl','wb'))
# # Run using the WCGAN trained after 4800 steps
# pickle.dump( t_4800, open('cache/additional generated fraud data test.pkl','wb'))
# # Run using the real data
# pickle.dump( t_real, open('cache/additional real fraud data test.pkl','wb'))
# In[30]:
# Reload the testing data.
# Context managers close each file promptly instead of leaving the handle
# to the garbage collector, as the old pickle.load(open(...)) form did.
with open('cache/additional untrained generated fraud data test.pkl','rb') as f:
    t_0 = pickle.load(f)
with open('cache/additional generated fraud data test.pkl','rb') as f:
    t_4800 = pickle.load(f)
with open('cache/additional real fraud data test.pkl','rb') as f:
    t_real = pickle.load(f)
# Figure 8: Effects of Additional Data on Fraud Detection
#
# Table of contents
# In[31]:
# Plot the testing data: one panel per data source, showing mean recall
# (+/- 2 std over folds) against the number of extra fraud rows added.
labels = ['WCGAN\ntrained 0 steps','WCGAN\ntrained 4800 steps','Actual Fraud Data']
metric = 'recall'
plt.figure(figsize=(12,3))
for panel_ix, (panel_label, panel_data) in enumerate(zip(labels, [t_0, t_4800, t_real])):
    grouped = panel_data.groupby('ratio')
    # x axis: extra fraud rows added (ratio 1.0 corresponds to 0 extra rows).
    xs = [n_train_fraud * (ratio - 1) for ratio, _ in grouped]
    ys = grouped[metric].mean().values
    stds = 2 * grouped[metric].std().values
    plt.subplot(1, 3, panel_ix + 1)
    plt.axhline(ys[0], linestyle='--', color='red')  # no-added-data baseline
    plt.plot(xs, ys, c='C1', marker='o')
    plt.plot(xs, ys + stds, linestyle=':', c='C2')
    plt.plot(xs, ys - stds, linestyle=':', c='C2')
    if panel_ix == 0:
        plt.ylabel(metric)
    plt.xlabel('# additional data')
    plt.title(panel_label, size=12)
    # plt.xlim([0,11])
    # plt.ylim([0.55,.85])
    plt.ylim([0.6, 1.0])
plt.tight_layout(rect=[0,0,1,0.9])
plt.suptitle('Effects of additional data on fraud detection', size=16)
# plt.savefig('plots/Effects of addtional data on fraud detection.png')
plt.show()
# Summary of Training Data
#
# Table of contents
#
#
# In[32]:
# Load the saved loss data from each model
# Each pickle holds [combined_loss, disc_loss_real, disc_loss_generated,
# xgb_losses] recorded during that model's training run.
# base_dir = 'cache/'
base_dir = 'cache lr mix base 128 act mix 171026/'
suffix = '_step_5000'
GAN_losses = pickle.load(open(base_dir + 'GAN_losses'+suffix+'.pkl','rb'))
# GAN_losses = [combined_loss, disc_loss_real, disc_loss_generated, xgb_losses]
CGAN_losses = pickle.load(open(base_dir + 'CGAN_losses'+suffix+'.pkl','rb'))
WGAN_losses = pickle.load(open(base_dir + 'WGAN_losses'+suffix+'.pkl','rb'))
WCGAN_losses = pickle.load(open(base_dir + 'WCGAN_losses'+suffix+'.pkl','rb'))
# In[34]:
# Find best xgb scores overall and saved (every 100 steps)
data_ix = 3  # index of xgb_losses within each loss bundle
data_sets = [ GAN_losses[data_ix], CGAN_losses[data_ix], WGAN_losses[data_ix], WCGAN_losses[data_ix]]
labels = [ 'GAN','CGAN','WGAN','WCGAN' ]
for label, data_set in zip(labels, data_sets):
    scores = np.array(data_set)
    # Best over every recorded score (scores were recorded every 10 steps).
    best_step = int(scores.argmin()) * 10
    print('{: <5} step {: <4}: {:.4f}'.format(label, best_step, scores.min()))
    # Best restricted to every 10th record, i.e. every 100 training steps
    # (the cadence at which model weights were saved).
    xgb100 = list(data_set)[::10]
    best_step = xgb100.index(min(xgb100)) * 100
    print('{: <5} step {: <4}: {:.4f}\n'.format(label, best_step, np.array(xgb100).min()))
    # print( best_step, min(xgb100) )
# In[35]:
# Look at the unsmoothed losses: one figure per loss field, all four models overlaid.
data_fields = ['combined_losses_', 'real_losses_', 'generated_losses_', 'xgb_losses']
sampling_intervals = [ 1, 1, 1, 10 ]  # xgb losses were recorded every 10 steps
labels = [ 'GAN','CGAN','WGAN','WCGAN' ]
linestyles = ['-', '--', '-.', ':']
for data_ix, field in enumerate(data_fields):
    data_sets = [ GAN_losses[data_ix], CGAN_losses[data_ix], WGAN_losses[data_ix], WCGAN_losses[data_ix]]
    plt.figure(figsize=(10,5))
    for series, name, ls in zip(data_sets, labels, linestyles):
        steps = np.arange(len(series)) * sampling_intervals[data_ix]
        plt.plot(steps, series, label=name, linestyle=ls)
    plt.ylabel(field)
    plt.xlabel('training step')
    plt.legend()
    plt.show()
# In[36]:
# Look at the smoothed losses: as the previous cell, but with a rolling mean.
data_fields = ['combined_losses_', 'real_losses_', 'generated_losses_', 'xgb_losses']
sampling_intervals = [ 1, 1, 1, 10 ]  # xgb losses were recorded every 10 steps
labels = [ 'GAN','CGAN','WGAN','WCGAN' ]
linestyles = ['-', '--', '-.', ':']
w = 100  # rolling-mean window (in recorded samples)
for data_ix, field in enumerate(data_fields):
    data_sets = [ GAN_losses[data_ix], CGAN_losses[data_ix], WGAN_losses[data_ix], WCGAN_losses[data_ix]]
    plt.figure(figsize=(10,5))
    for series, name, ls in zip(data_sets, labels, linestyles):
        steps = np.arange(len(series)) * sampling_intervals[data_ix]
        plt.plot(steps, pd.DataFrame(series).rolling(w).mean(), label=name, linestyle=ls)
    plt.ylabel(field)
    plt.xlabel('training step')
    plt.legend()
    plt.show()
# Figure 6: Accuracy of Generated Data Detection
#
# Table of contents
# In[37]:
# Create a figure for the smoothed xgboost losses only.
data_fields = ['combined_losses_', 'real_losses_', 'generated_losses_', 'xgb_losses']
sampling_intervals = [ 1, 1, 1, 10 ]  # xgb losses were recorded every 10 steps
labels = [ 'GAN','CGAN','WGAN','WCGAN' ]
linestyles = ['-', '--', '-.', ':']
w = 50       # rolling-mean window
data_ix = 3  # plot only the xgb_losses field
data_sets = [ GAN_losses[data_ix], CGAN_losses[data_ix], WGAN_losses[data_ix], WCGAN_losses[data_ix]]
plt.figure(figsize=(10,5))
for series, name, ls in zip(data_sets, labels, linestyles):
    steps = np.arange(len(series)) * sampling_intervals[data_ix]
    plt.plot(steps, pd.DataFrame(series).rolling(w).mean(), label=name, linestyle=ls)
plt.ylabel(data_fields[data_ix])
plt.xlabel('training step')
legend = plt.legend()
legend.get_frame().set_facecolor('white')
plt.title('Accuracy of generated data detection')
plt.ylabel('xgboost accuracy')  # final label; overrides the field name set above
plt.tight_layout()
# plt.savefig('plots/GAN_accuracy.png')
# Figure 7: Differences in Critic Loss
#
# Table of contents
# In[38]:
# Create a figure for the critic losses for the WGAN and WCGAN:
# rolling mean of (generated-sample loss - real-sample loss).
w = 50        # rolling-mean window
data_ix0 = 2  # disc_loss_generated within each loss bundle
data_ix1 = 1  # disc_loss_real within each loss bundle
data_fields = ['combined_losses_', 'real_losses_', 'generated_losses_', 'xgb_losses']
# Slice [2:4] keeps only the Wasserstein models (WGAN, WCGAN).
i1, i2 = 2, 4
labels = [ 'GAN','CGAN','WGAN','WCGAN' ][i1:i2]
data_sets0 = [ GAN_losses[data_ix0], CGAN_losses[data_ix0], WGAN_losses[data_ix0], WCGAN_losses[data_ix0]][i1:i2]
data_sets1 = [ GAN_losses[data_ix1], CGAN_losses[data_ix1], WGAN_losses[data_ix1], WCGAN_losses[data_ix1]][i1:i2]
linestyles = ['-', '--', '-.', ':'][i1:i2]
plt.figure(figsize=(10,5))
for gen_loss, real_loss, name, ls in zip(data_sets0, data_sets1, labels, linestyles):
    delta = pd.DataFrame(np.array(gen_loss) - np.array(real_loss)).rolling(w).mean()
    plt.plot(range(len(gen_loss)), delta, label=name, linestyle=ls)
plt.title('Difference between critic loss (EM distance estimate)\non generated samples and real samples')
plt.xlabel('training step')
plt.ylabel('Gen - Real Critic Loss')
legend = plt.legend()
legend.get_frame().set_facecolor('white')
# plt.savefig('plots/Delta_critic_loss_plot.png')
# DRAGAN Training
#
# Table of contents
#
#
# This section is less developed, and may contain more errors than average
#
# A number of errors in the wiseodd code?:
# - xavier initializer
# - calculation of gradient penalties
#
# Resources:
#
# https://github.com/kodalinaveen3/DRAGAN
#
# https://github.com/wiseodd/generative-models
#
# https://github.com/wiseodd/generative-models/blob/master/GAN/improved_wasserstein_gan/wgan_gp_tensorflow.py
#
# https://github.com/igul222/improved_wgan_training/blob/master/gan_toy.py
#
# In[39]:
# Load engineered dataset from EDA section
data = pickle.load(open('data/' + 'credicard.engineered.pkl','rb'))
# Put columns in order of importance for xgboost fraud detection, from the xgboost section
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'Time', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
data = data[ sorted_cols ].copy()
# data columns will be all other columns except class
data_cols = list(data.columns[ data.columns != 'Class' ])
data_dim = len(data_cols)
# label_cols = ['Class']
# Train on the fraud rows only, features scaled down to noise magnitude
# (same /10 scaling as the Keras GAN section above).
train = data.loc[ data.Class == 1, data_cols ].copy().reset_index(drop=True)
train[ data_cols ] = train[ data_cols ] / 10
with_class = False   # DRAGAN here is unconditioned (no label input)
show = True          # plot generated-vs-real data at each log interval
data_dir = 'cache/'
cache_prefix = 'DRAGAN'
label_cols = []
log_interval = 100   # steps between printed/plotted progress reports
z_dim = 32           # noise vector length
X_dim = data_dim
# In[40]:
# Load (and/or reload functions)
import GAN_171103
import importlib
importlib.reload(GAN_171103) # For reloading after making changes
from GAN_171103 import *
# In[41]:
# Start with a clean tensorflow graph every time
tf.reset_default_graph()
# In[42]:
# Define the DRAGAN network
mb_size = 128 # 128 # minibatch size
h_dim = 128 # 128 # hidden layer dim
lambda0 = 10  # gradient-penalty weight (presumably consumed inside define_DRAGAN_network — TODO confirm)
learning_rate = 1e-4 # 1e-4
global_seed = 0
# define_DRAGAN_network (from GAN_171103) returns the discriminator and
# generator training ops, the graph placeholders, and the sampling op.
[ D_solver, disc_cost, D_loss_real, D_loss_fake,
X, X_p, z,
G_solver, gen_cost, G_sample ] = \
define_DRAGAN_network( X_dim=X_dim, h_dim=h_dim, z_dim=z_dim, learning_rate=learning_rate, mb_size=mb_size, seed=global_seed ) ;
# In[43]:
# Set the tensorflow session, set seeds, create a saver function, and glance at the length of the operation set for the graph
sess = tf.Session()
np.random.seed(global_seed)
tf.set_random_seed(global_seed)
sess.graph.seed = global_seed
sess.run(tf.global_variables_initializer())
# saver = tf.train.Saver(theta_D + theta_G , max_to_keep=10000) # 20 Mb+ for all, 1.5 Mb for thetas only
saver = tf.train.Saver( max_to_keep=10000 ) # 20 Mb+ for all, 1.5 Mb for thetas only
# saver.save(sess, data_dir + cache_prefix + '_model_step_0' )
# saver.save(sess, data_dir + cache_prefix + '-initial_values' )
saver.save(sess, data_dir + cache_prefix )
# Loss histories accumulated by the training cells below.
combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []
best_xgb = 0.7  # checkpoint threshold: only save weights that beat this xgb score
print( len( sess.graph.get_operations() ) )
# In[ ]:
get_ipython().run_cell_magic('time', '', "\n# Now we train the DRAGAN\n# We'll xgboost test it at intervals\n# We'll also save the graph weights and losses at intervals as well\n\nfor it in range(0, 50000+1):\n \n X_mb = get_data_batch(train, mb_size, seed=it)\n X_mb_p = get_perturbed_batch(X_mb) # DRAGAN\n\n _, D_loss_curr, d_l_r, d_l_g = sess.run(\n [D_solver, disc_cost, D_loss_real, D_loss_fake],\n feed_dict={X: X_mb, X_p: X_mb_p, z: sample_z(mb_size, z_dim)}\n )\n\n _, G_loss_curr = sess.run(\n [G_solver, gen_cost],\n feed_dict={z: sample_z(mb_size, z_dim)}\n )\n\n disc_loss_real.append(d_l_r)\n disc_loss_generated.append(d_l_g)\n combined_loss.append(G_loss_curr)\n \n if it % 100 == 0:\n test_size = 492 # The total number of fraud cases\n x = get_data_batch(train, test_size, seed=it)\n g_z = sess.run(G_sample, feed_dict={z: sample_z(test_size, z_dim)})\n xgb_loss = CheckAccuracy( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )\n xgb_losses.append(xgb_loss)\n if xgb_loss < best_xgb: \n best_xgb = xgb_loss\n saver.save(sess, data_dir + cache_prefix, global_step=it, write_meta_graph=False )\n \n if it % log_interval == 0:\n print('Iter: {}; D loss: {:.4}; G_loss: {:.4}; xgb_loss: {:.4}'\n .format(it, D_loss_curr, G_loss_curr, xgb_loss))\n if show:\n PlotData( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim ) \n \n saver.save(sess, data_dir + cache_prefix, global_step=it, write_meta_graph=False )\n\n pickle.dump([combined_loss, disc_loss_generated, disc_loss_real, xgb_losses], \n open( data_dir + cache_prefix + '_losses_step_{}.pkl'.format(it) ,'wb'))\n \n")
# In[ ]:
get_ipython().run_cell_magic('time', '', "\n# for continued training\n\nfor it in range(50001, 100001):\n \n X_mb = get_data_batch(train, mb_size, seed=it)\n X_mb_p = get_perturbed_batch(X_mb) # DRAGAN\n\n _, D_loss_curr, d_l_r, d_l_g = sess.run(\n [D_solver, disc_cost, D_loss_real, D_loss_fake],\n feed_dict={X: X_mb, X_p: X_mb_p, z: sample_z(mb_size, z_dim)}\n )\n\n _, G_loss_curr = sess.run(\n [G_solver, gen_cost],\n feed_dict={z: sample_z(mb_size, z_dim)}\n )\n\n disc_loss_real.append(d_l_r)\n disc_loss_generated.append(d_l_g)\n combined_loss.append(G_loss_curr)\n \n if it % 100 == 0:\n test_size = 492 # mb_size\n x = get_data_batch(train, test_size, seed=it)\n g_z = sess.run(G_sample, feed_dict={z: sample_z(test_size, z_dim)})\n xgb_loss = CheckAccuracy( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )\n xgb_losses.append(xgb_loss)\n# if xgb_loss < best_xgb: best_xgb = xgb_loss\n \n if it % log_interval == 0:\n print('Iter: {}; D loss: {:.4}; G_loss: {:.4}; xgb_loss: {:.4}'\n .format(it, D_loss_curr, G_loss_curr, xgb_loss))\n if show:\n PlotData( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim ) \n \n saver.save(sess, data_dir + cache_prefix, global_step=it, write_meta_graph=False )\n\n pickle.dump([combined_loss, disc_loss_generated, disc_loss_real, xgb_losses], \n open( data_dir + cache_prefix + '_losses_step_{}.pkl'.format(it) ,'wb'))\n \n")
# In[170]:
# # For checking if your graph operations list has changed while you weren't looking
# print( len( sess.graph.get_operations() ) )
# In[171]:
# # We should be able to import the graph and weights this way, we just need to redefine whatever operation we want to perform
# tf.reset_default_graph()
# sess = tf.Session()
# print( len( sess.graph.get_operations() ) )
# saver = tf.train.import_meta_graph(data_dir + cache_prefix + '.meta')
# print( len( sess.graph.get_operations() ) )
# In[49]:
# Or we can just define the network like we did before and just load the weights
# That way we already have all the operations defined
# So, to load stored graph, with operations defined locally:
tf.reset_default_graph()
sess = tf.Session()
print( len( sess.graph.get_operations() ) )
# Rebuild the identical graph so the checkpoint variables line up by name.
[ D_solver, disc_cost, D_loss_real, D_loss_fake,
X, X_p, z,
G_solver, gen_cost, G_sample ] = \
define_DRAGAN_network( X_dim=X_dim, h_dim=h_dim, z_dim=z_dim, learning_rate=learning_rate, mb_size=mb_size, seed=global_seed ) ;
sess = tf.Session()
np.random.seed(global_seed)
tf.set_random_seed(global_seed)
sess.graph.seed = global_seed
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver( max_to_keep=10000 ) # 20 Mb+ for all, 1.5 Mb for thetas only
print( len( sess.graph.get_operations() ) )
# In[50]:
# To load saved weights and test
# '-45400' selects the checkpoint written at training step 45400.
saver.restore(sess, data_dir + cache_prefix + '-45400' )
print( len( sess.graph.get_operations() ) )
g_z = sess.run(G_sample, feed_dict={z: sample_z(test_size, z_dim)})
# g_z
# In[51]:
# %%time
# We can test the generated data using the functions from the 'GAN Data Testing' Section
# You'll need to load the functions from that section for this to work
# The labels for the generated data will all be 1, as they are supposed to be fraud data
g_z_df = pd.DataFrame( np.hstack( [g_z[:,:len(data_cols)], np.ones((len(g_z),1))] ), columns=data.columns )
# Run the same cross-fold xgboost evaluation used for the Keras GANs above.
n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set = MakeCrossFolds(g_z_df)
t_DRAGAN = Run_CV_Xgb(n_train_fraud, train_real_set, test_real_set, train_fraud_set, test_fraud_set)
# In[57]:
# Looks like still no increase in recall
t_DRAGAN.groupby('ratio')[['recall']].aggregate(['mean','std'])
# In[44]:
# To load saved losses
prefix = 'DRAGAN'
[combined_loss, disc_loss_generated, disc_loss_real, xgb_losses] = pickle.load(open(data_dir+prefix+'_losses_step_100000.pkl','rb'))
# In[45]:
# Let's look at the xgboost losses
w = 1  # window of 1: effectively the raw, unsmoothed series
plt.plot( pd.DataFrame(xgb_losses[:]).rolling(w).mean() ) ;
# plt.savefig('plots/171031_xgb_loss_DRAGAN_lr1e-4_mb128_hdim128.png')
# In[46]:
# Let's look at the smoothed xgboost losses
w = 20
plt.plot( pd.DataFrame(xgb_losses[:]).rolling(w).mean() ) ;
# plt.savefig('plots/171031_xgb_loss_DRAGAN_lr1e-4_mb128_hdim128.png')
# In[47]:
# Let's find the best xgboost loss
# xgb losses were recorded every 100 steps (log_interval).
best_step = xgb_losses.index( np.array(xgb_losses).min() ) * 100
print( best_step, np.array(xgb_losses).min() )
xgb100 = [ xgb_losses[i] for i in range(0, len(xgb_losses), 1) ]
best_step = xgb100.index( min(xgb100) ) * log_interval
print( best_step, min(xgb100) )
# In[48]:
# Let's look at training losses
# plt.plot( (np.array(disc_loss_real) - np.array(disc_loss_generated)) )
w = 1000  # heavy smoothing: per-step critic losses are very noisy
# plt.plot( list(range(0,5001,1)), pd.rolling_mean((np.array(disc_loss_real) - np.array(disc_loss_generated)),w) )
plt.plot( pd.DataFrame(disc_loss_real[:]).rolling(w).mean() - pd.DataFrame(disc_loss_generated[:]).rolling(w).mean() ) ;
# plt.ylim([-0.005,0.015])
# plt.xlim([50000,110000])
# plt.ylim([-0.001,0.002])