#!/usr/bin/env python
# coding: utf-8

# # Anomaly Detection using Deep Learning
#
# ## Demo
#
# We'll train a neural network called an autoencoder on a credit card fraud dataset from
# [Kaggle](https://www.kaggle.com/dalpozz/creditcardfraud), without using the labels. Over time it
# will learn how to reconstruct the input data. Then we'll use its learned representations to
# perform binary classification.
#
# ## What's an autoencoder?
#
# ![alt text](https://vanishingcodes.files.wordpress.com/2017/06/stackedae.png "Logo Title Text 1")
#
# An autoencoder is a type of neural network that approximates the identity function:
#
# f(x) = x
#
# Basically, given an input x, the network learns to output f(x) as close as possible to x.
#
# The error between the output and x is commonly measured with the mean squared error (MSE),
#
# mean((f(x) - x) ^ 2),
#
# which is the loss function we try to minimise in our network.
#
# Autoencoders follow a typical feed-forward architecture, except that the output layer has exactly
# the same number of neurons as the input layer, and the network uses the input data itself as the
# target. It is therefore a form of unsupervised learning: it learns without predicting an actual label.
#
# The 'encoder' embeds the input data into a lower-dimensional array.
# The 'decoder' tries to reconstruct the original input from that embedding.
#
# We can use a single hidden layer or, as in the figure above, multiple layers, depending on the
# complexity of our features.
#
# ## What are other anomaly detection methods?
#
# PCA (Principal Component Analysis) - actually a dimensionality reduction technique, but it can be
# used to detect anomalies: once the data is projected onto the learned principal components,
# outliers stand out when visualized.
#
# ![alt text](https://upload.wikimedia.org/wikipedia/commons/6/69/Principal_Component_Analysis_of_the_Italian_population.png "Logo Title Text 1")
#
# K-means - a simple and very popular clustering algorithm. It gives great results on small
# datasets, but its usefulness declines once the data grows past roughly 1,000 points.
#
# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/0/09/ClusterAnalysis_Mouse.svg/450px-ClusterAnalysis_Mouse.svg.png "Logo Title Text 1")
#
# ![alt text](https://blog.keras.io/img/ae/autoencoder_schema.jpg "Logo Title Text 1")
#
# ![alt text](https://cdn-images-1.medium.com/max/1600/1*8ixTe1VHLsmKB3AquWdxpQ.png "Logo Title Text 1")
#
# ## What are the steps?
#
# 1. First part of the forward pass - encode the data points into an encoded representation.
# 2. Second part of the forward pass - decode the data points.
# 3. Measure the reconstruction error (MSE) and minimise it using backpropagation.
# 4. Define a threshold value: if a point's reconstruction error is above that threshold, it is
#    considered an anomaly. Alternatively, rank the errors and treat the top 0.1% as anomalies
#    (see the sketch below).
#
# The reason the data points with the highest reconstruction error are considered anomalies is that
# the error measures the difference between the learned patterns and the anomalous value. Since the
# majority of the data points share similar patterns, the anomalies stick out: they have the highest
# difference.
#
# So we're basically creating a generalised latent representation of all the data points (both fraud
# and valid). Since the valid data points far outweigh the fraud data points, this learned
# representation is effectively one for valid data points. When the difference between the
# reconstruction and a fraud data point is measured, it will be very high. Anomaly spotted!
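# To make step 4 concrete, here is a minimal NumPy sketch of the thresholding / ranking idea. It is
# not part of the original pipeline: `recon_mse` is a placeholder array of per-sample reconstruction
# errors (later in this notebook the equivalent quantity is called `batch_mse`), and the threshold
# and 0.1% fraction are illustrative values only.

# In[ ]:

import numpy as np

# Per-sample reconstruction errors; random placeholder values for illustration only.
recon_mse = np.random.rand(1000)

# Option A: fixed threshold - anything above it is flagged as an anomaly.
THRESHOLD = 0.95
is_anomaly = recon_mse > THRESHOLD

# Option B: rank the errors and flag the top 0.1% (at least one point).
top_k = max(1, int(0.001 * len(recon_mse)))
anomaly_idx = np.argsort(recon_mse)[-top_k:]   # indices of the largest errors

print("Flagged by fixed threshold:", int(is_anomaly.sum()))
print("Flagged by ranking:", len(anomaly_idx))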
# ## How can this be applied at CERN?
#
# Why use deep learning?
# - Particle data is very high dimensional.
# - It works well with big datasets and big computing power.
# - No need to perform manual feature engineering; deep nets learn the most relevant features.
#
# Some applications:
# - Classification of known particles
# - Generative models for simulations (generative adversarial networks, variational autoencoders)
# - Clustering (what we're talking about here)
#
# ![alt text](https://home.cern/sites/home.web.cern.ch/files/image/featured/2014/01/higgs-simulation-3.jpg "Logo Title Text 1")

# ## Load libraries

# In[3]:

import pandas as pd                                # data manipulation
import numpy as np                                 # matrix math
import tensorflow as tf                            # machine learning
import os                                          # saving files
from datetime import datetime                      # logging
from sklearn.metrics import roc_auc_score as auc   # measuring accuracy
import seaborn as sns                              # plotting

# In[4]:

import matplotlib.pyplot as plt                    # plotting
import matplotlib.gridspec as gridspec
get_ipython().run_line_magic('matplotlib', 'inline')

# ## Data Exploration

# In[58]:

data_dir = "C:\\Users\\weimin\\Desktop\\Fraud"
df = pd.read_csv(os.path.join(data_dir, 'creditcard.csv'))

# In[59]:

df.shape

# In[60]:

print("Total time spanned: {:.1f} days".format(df['Time'].max() / (3600 * 24.0)))
print("{:.3f} % of all transactions are fraud.".format(np.sum(df['Class']) / df.shape[0] * 100))

# In[3]:

df.head()

# In[63]:

df.columns

# In[64]:

df.dtypes

# In[65]:

plt.figure(figsize=(12, 5 * 4))
gs = gridspec.GridSpec(5, 1)
for i, cn in enumerate(df.columns[:5]):
    ax = plt.subplot(gs[i])
    sns.distplot(df[cn][df.Class == 1], bins=50)
    sns.distplot(df[cn][df.Class == 0], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()

# ## Train/test split on the time series: first 75% as train/validation, last 25% as test

# In[9]:

TEST_RATIO = 0.25
df.sort_values('Time', inplace=True)
TRA_INDEX = int((1 - TEST_RATIO) * df.shape[0])

# keep V1-V28 as features (dropping Time and Amount); the last column, Class, is the label
train_x = df.iloc[:TRA_INDEX, 1:-2].values
train_y = df.iloc[:TRA_INDEX, -1].values

test_x = df.iloc[TRA_INDEX:, 1:-2].values
test_y = df.iloc[TRA_INDEX:, -1].values

# In[6]:

print("Total train examples: {}, total fraud cases: {}, equal to {:.5f} of total cases.".format(
    train_x.shape[0], np.sum(train_y), np.sum(train_y) / train_x.shape[0]))

# In[7]:

print("Total test examples: {}, total fraud cases: {}, equal to {:.5f} of total cases.".format(
    test_x.shape[0], np.sum(test_y), np.sum(test_y) / test_y.shape[0]))

# There are two common types of standardization: z-score and min-max scaling.
#
# Z-score normalizes each column to have a mean of zero and a standard deviation of one. This is a
# good choice if we use an output activation such as tanh, which produces values on both sides of
# zero. It also leaves extreme values with some of their extremeness after normalization (e.g. still
# more than two standard deviations away from the mean), which can be useful for detecting outliers.
#
# Min-max scaling ensures all values lie within 0 ~ 1, i.e. all positive. This is the default
# approach if we use sigmoid as the output activation.
#
# To recap the difference between sigmoid and tanh: sigmoid squashes values into the range (0, 1),
# whereas tanh (hyperbolic tangent) squashes them into (-1, 1).
#
# ![alt text](https://vanishingcodes.files.wordpress.com/2017/06/tanh-and-sigmoid.png "Logo Title Text 1")
#
# Based on my experiments, tanh performs better than sigmoid when used together with z-score
# normalization.
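# As a quick aside (not part of the original pipeline), both scalings are also available off the
# shelf in scikit-learn. The sketch below uses placeholder arrays `train_demo` / `test_demo` purely
# for illustration; this notebook itself uses the manual column-wise loops further down.

# In[ ]:

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# placeholder data, for illustration only
train_demo = np.random.randn(100, 5)
test_demo = np.random.randn(40, 5)

# z-score: mean 0, std 1 per column (pairs well with tanh outputs)
z = StandardScaler().fit(train_demo)                       # fit on train only
train_z, test_z = z.transform(train_demo), z.transform(test_demo)

# min-max: squashes each column into [0, 1] (pairs well with sigmoid outputs)
mm = MinMaxScaler().fit(train_demo)
train_mm, test_mm = mm.transform(train_demo), mm.transform(test_demo)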
# Therefore, I chose tanh together with z-score normalization.

# ## Feature Normalization 1 - min-max scaling (used for sigmoid activation)

# In[10]:

'''cols_max = []
cols_min = []
for c in range(train_x.shape[1]):
    cols_max.append(train_x[:, c].max())
    cols_min.append(train_x[:, c].min())
    train_x[:, c] = (train_x[:, c] - cols_min[-1]) / (cols_max[-1] - cols_min[-1])
    test_x[:, c] = (test_x[:, c] - cols_min[-1]) / (cols_max[-1] - cols_min[-1])'''

# ## Feature Normalization 2 - z-score (for tanh activation)

# In[10]:

cols_mean = []
cols_std = []
for c in range(train_x.shape[1]):
    cols_mean.append(train_x[:, c].mean())
    cols_std.append(train_x[:, c].std())
    train_x[:, c] = (train_x[:, c] - cols_mean[-1]) / cols_std[-1]
    test_x[:, c] = (test_x[:, c] - cols_mean[-1]) / cols_std[-1]

# ## Modelling and results
#
# ## 1. Autoencoder as unsupervised learning

# ### Parameters

# In[11]:

# Parameters
learning_rate = 0.01     # too low: very slow convergence; too high: overshoot the minimum
training_epochs = 10
batch_size = 256
display_step = 1

# Network parameters (neurons)
n_hidden_1 = 15                 # 1st hidden layer num features
#n_hidden_2 = 15                # 2nd hidden layer num features
n_input = train_x.shape[1]      # number of input features (the 28 PCA components V1-V28)

# ### Train and validate the model (1 hidden layer turned out to be enough)

# In[12]:

X = tf.placeholder("float", [None, n_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    #'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
    #'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    #'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([n_input])),
    #'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}

# Building the encoder
def encoder(x):
    # input times weight, plus bias. Activate! It rhymes.
    # Hidden layer with tanh activation
    layer_1 = tf.nn.tanh(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    return layer_1

# Building the decoder
def decoder(x):
    # Hidden layer with tanh activation
    layer_1 = tf.nn.tanh(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    return layer_1

# Construct the model
# feed the input data into the encoder...
encoder_op = encoder(X)
# ...and the learned representation into the decoder
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (labels) are the input data itself.
y_true = X

# Per-example reconstruction MSE (one value per row); used later as the anomaly score
batch_mse = tf.reduce_mean(tf.pow(y_true - y_pred, 2), 1)

# Define loss and optimizer: minimize the mean squared reconstruction error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
# RMSProp, a stochastic-gradient-descent variant. Backpropagate to update the weights!
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

# TRAIN STARTS
save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    sess.run(init)
    total_batch = int(train_x.shape[0] / batch_size)

    # Training cycle
    for epoch in range(training_epochs):
        # Loop over all batches
        for i in range(total_batch):
            # sample a random mini-batch from the training set
            batch_idx = np.random.choice(train_x.shape[0], batch_size)
            batch_xs = train_x[batch_idx]
            # Run optimization op (backprop) and cost op (to get the loss value);
            # the gradients computed here update the weights.
            _, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})

        # Display logs per epoch step
        if epoch % display_step == 0:
            train_batch_mse = sess.run(batch_mse, feed_dict={X: train_x})
            print("Epoch:", '%04d' % (epoch + 1),
                  "cost=", "{:.9f}".format(c),
                  "Train auc=", "{:.6f}".format(auc(train_y, train_batch_mse)),
                  "Time elapsed=", "{}".format(datetime.now() - now))

    print("Optimization Finished!")

    save_path = saver.save(sess, save_model)
    print("Model saved in file: %s" % save_path)

# ### Test the model - on the last 25% of the data (the test set)

# In[13]:

save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    saver.restore(sess, save_model)

    test_batch_mse = sess.run(batch_mse, feed_dict={X: test_x})

    print("Test auc score: {:.6f}".format(auc(test_y, test_batch_mse)))

# ## Visualize the predictions

# ### 1. Distribution of all MSE values (call them fraud scores if you like)

# In[24]:

plt.hist(test_batch_mse, bins=100)
plt.show()

# ### Zoom into the (0, 20) range

# In[26]:

plt.hist(test_batch_mse[test_batch_mse < 20], bins=100)
plt.show()

# ### 2. Display only the fraud cases

# In[35]:

plt.hist(test_batch_mse[test_y == 1.0], bins=100)
plt.show()

# In[34]:

THRE_TEST = 7
print("Fraud cases above threshold: {}, total cases above threshold: {}, precision among flagged: {}".format(
    np.sum(test_y[test_batch_mse > THRE_TEST]),
    np.sum(test_batch_mse > THRE_TEST),
    np.sum(test_y[test_batch_mse > THRE_TEST]) / np.sum(test_batch_mse > THRE_TEST)))

# ## 2. Build a binary classifier that outputs the probability of fraud, using the previous autoencoder's embedding layer as model input

# ### 1) Get the autoencoder embedding (encoder_op) for both train and test data

# In[16]:

save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    saver.restore(sess, save_model)

    test_encoding = sess.run(encoder_op, feed_dict={X: test_x})
    train_encoding = sess.run(encoder_op, feed_dict={X: train_x})

    print("Dims of test_encoding and train_encoding are: \n", test_encoding.shape, '\n', train_encoding.shape)

# ### 2) Build the graph for the FC layers (the best hidden size based on validation turned out to be 4)

# In[34]:

#n_input = test_encoding.shape[1]
n_input = test_encoding.shape[1]
hidden_size = 4
output_size = 2

X = tf.placeholder(tf.float32, [None, n_input], name='input_x')
y_ = tf.placeholder(tf.int32, shape=[None, output_size], name='target_y')

weights = {
    'W1': tf.Variable(tf.truncated_normal([n_input, hidden_size])),
    'W2': tf.Variable(tf.truncated_normal([hidden_size, output_size])),
}
biases = {
    'b1': tf.Variable(tf.zeros([hidden_size])),
    'b2': tf.Variable(tf.zeros([output_size])),
}

hidden_layer = tf.nn.relu(tf.add(tf.matmul(X, weights['W1']), biases['b1']))
pred_logits = tf.add(tf.matmul(hidden_layer, weights['W2']), biases['b2'])
pred_probs = tf.nn.softmax(pred_logits)

cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=pred_logits))
optimizer = tf.train.AdamOptimizer(2e-4).minimize(cross_entropy)

# ### 3) Prepare the data set. We re-split the original training set into train/val: 80% of it becomes the new train set and the remaining 20% the new val set (train : val : test = 0.75 x 0.8 : 0.75 x 0.2 : 0.25 x 1.0).
# Then we train our binary classifier.

# In[38]:

n_epochs = 70
batch_size = 256

# PREPARE DATA
VAL_PERC = 0.2

# one-hot encode the labels for the whole data set
all_y_bin = np.zeros((df.shape[0], 2))
all_y_bin[range(df.shape[0]), df['Class'].values] = 1

train_enc_x = train_encoding[:int(train_encoding.shape[0] * (1 - VAL_PERC))]
train_enc_y = all_y_bin[:int(train_encoding.shape[0] * (1 - VAL_PERC))]

val_enc_x = train_encoding[int(train_encoding.shape[0] * (1 - VAL_PERC)):]
val_enc_y = all_y_bin[int(train_encoding.shape[0] * (1 - VAL_PERC)):train_encoding.shape[0]]

test_enc_y = all_y_bin[train_encoding.shape[0]:]

print("Num of data for train, val and test are: \n{}, \n{}, \n{}".format(
    train_enc_x.shape[0], val_enc_x.shape[0], test_encoding.shape[0]))

# TRAIN STARTS
save_model = os.path.join(data_dir, 'temp_saved_model_FCLayers.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    sess.run(init)
    total_batch = int(train_enc_x.shape[0] / batch_size)

    # Training cycle
    for epoch in range(n_epochs):
        # Loop over all batches
        for i in range(total_batch):
            batch_idx = np.random.choice(train_enc_x.shape[0], batch_size)
            batch_xs = train_enc_x[batch_idx]
            batch_ys = train_enc_y[batch_idx]
            # Run optimization op (backprop) and cost op (to get the loss value)
            _, c = sess.run([optimizer, cross_entropy], feed_dict={X: batch_xs, y_: batch_ys})

        # Display logs per epoch step
        if epoch % display_step == 0:
            val_probs = sess.run(pred_probs, feed_dict={X: val_enc_x})
            print("Epoch:", '%04d' % (epoch + 1),
                  "cost=", "{:.9f}".format(c),
                  "Val auc=", "{:.6f}".format(auc(val_enc_y[:, 1], val_probs[:, 1])),
                  "Time elapsed=", "{}".format(datetime.now() - now))

    print("Optimization Finished!")

    save_path = saver.save(sess, save_model)
    print("Model saved in file: %s" % save_path)

# ### 4) Test the model on the same test data as before - the AUC improves slightly

# In[39]:

save_model = os.path.join(data_dir, 'temp_saved_model_FCLayers.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    saver.restore(sess, save_model)

    test_probs = sess.run(pred_probs, feed_dict={X: test_encoding})

    print("Test auc score: {}".format(auc(test_enc_y[:, 1], test_probs[:, 1])))
# ## 3. (Optional) Let's also test a simple supervised neural network (two hidden layers), trained from scratch without the autoencoder

# ### 1) Build the graph - 28 (input) -> 8 -> 4 -> 2

# In[44]:

n_epochs = 200
batch_size = 256

#n_input = test_encoding.shape[1]
n_input = train_x.shape[1]
hidden1_size = 8
hidden2_size = 4
output_size = 2

X = tf.placeholder(tf.float32, [None, n_input], name='input_x')
y_ = tf.placeholder(tf.int32, shape=[None, output_size], name='target_y')

weights = {
    'W1': tf.Variable(tf.truncated_normal([n_input, hidden1_size])),
    'W2': tf.Variable(tf.truncated_normal([hidden1_size, hidden2_size])),
    'W3': tf.Variable(tf.truncated_normal([hidden2_size, output_size])),
}
biases = {
    'b1': tf.Variable(tf.zeros([hidden1_size])),
    'b2': tf.Variable(tf.zeros([hidden2_size])),
    'b3': tf.Variable(tf.zeros([output_size])),
}

hidden1_layer = tf.nn.relu(tf.add(tf.matmul(X, weights['W1']), biases['b1']))
hidden2_layer = tf.nn.relu(tf.add(tf.matmul(hidden1_layer, weights['W2']), biases['b2']))
pred_logits = tf.add(tf.matmul(hidden2_layer, weights['W3']), biases['b3'])
pred_probs = tf.nn.softmax(pred_logits)

cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=pred_logits))
optimizer = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# ### 2) Prepare the data set. As before, we re-split the original training set into train/val: 80% of it becomes the new train set and the remaining 20% the new val set (train : val : test = 0.75 x 0.8 : 0.75 x 0.2 : 0.25 x 1.0). Then we train the feed-forward neural network.

# In[45]:

# PREPARE DATA
VAL_PERC = 0.2

# one-hot encode the labels for the whole data set
all_y_bin = np.zeros((df.shape[0], 2))
all_y_bin[range(df.shape[0]), df['Class'].values] = 1

train_enc_x = train_x[:int(train_x.shape[0] * (1 - VAL_PERC))]
train_enc_y = all_y_bin[:int(train_x.shape[0] * (1 - VAL_PERC))]

val_enc_x = train_x[int(train_x.shape[0] * (1 - VAL_PERC)):]
val_enc_y = all_y_bin[int(train_x.shape[0] * (1 - VAL_PERC)):train_x.shape[0]]

test_enc_y = all_y_bin[train_x.shape[0]:]

print("Num of data for train, val and test are: \n{}, \n{}, \n{}".format(
    train_enc_x.shape[0], val_enc_x.shape[0], test_encoding.shape[0]))

# TRAIN STARTS
save_model = os.path.join(data_dir, 'temp_saved_model_FCNNets_raw.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    sess.run(init)
    total_batch = int(train_enc_x.shape[0] / batch_size)

    # Training cycle
    for epoch in range(n_epochs):
        # Loop over all batches
        for i in range(total_batch):
            batch_idx = np.random.choice(train_enc_x.shape[0], batch_size)
            batch_xs = train_enc_x[batch_idx]
            batch_ys = train_enc_y[batch_idx]
            # Run optimization op (backprop) and cost op (to get the loss value)
            _, c = sess.run([optimizer, cross_entropy], feed_dict={X: batch_xs, y_: batch_ys})

        # Display logs per epoch step
        if epoch % display_step == 0:
            val_probs = sess.run(pred_probs, feed_dict={X: val_enc_x})
            print("Epoch:", '%04d' % (epoch + 1),
                  "cost=", "{:.9f}".format(c),
                  "Val auc=", "{:.6f}".format(auc(val_enc_y[:, 1], val_probs[:, 1])),
                  "Time elapsed=", "{}".format(datetime.now() - now))

    print("Optimization Finished!")

    save_path = saver.save(sess, save_model)
    print("Model saved in file: %s" % save_path)

# ### 3) Predict on the test data

# In[47]:

save_model = os.path.join(data_dir, 'temp_saved_model_FCNNets_raw.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    saver.restore(sess, save_model)

    test_probs = sess.run(pred_probs, feed_dict={X: test_x})
    print("Test auc score: {}".format(auc(test_enc_y[:, 1], test_probs[:, 1])))
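# As a closing aside (not part of the original run): AUC summarises ranking quality but says nothing
# about a specific operating point. The sketch below shows one way to inspect precision and recall
# across thresholds for any of the scores above, here using the autoencoder's per-sample MSE
# (`test_batch_mse`) computed earlier; `precision_recall_curve` comes from scikit-learn.

# In[ ]:

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# precision/recall at every candidate threshold over the anomaly scores
precision, recall, thresholds = precision_recall_curve(test_y, test_batch_mse)

plt.plot(thresholds, precision[:-1], label='precision')
plt.plot(thresholds, recall[:-1], label='recall')
plt.xlabel('score threshold')
plt.legend()
plt.show()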