#!/usr/bin/env python
# coding: utf-8
# # Gaussian Processes with Spectral Mixture Kernels to Implicitly Capture Hidden Structure from Data
# ---------------------------------------------------------------------------------------------------
# **(Note: Cross-posted with the [Haystax Technology Blog](https://haystax.com/blog/).)**
#
# Several scientific fields, such as insider-threat detection, often lack sufficient amounts of time-series training data for the purpose of scientific discovery. Moreover, the available limited data are very noisy. For instance, Figure [1](#fig:data-emails) shows monthly attachment size in emails (in Gigabytes) sent by an insider from their employee account to their home account.
#
# ![Monthly email attachment size in Gigabytes\label{fig:data-emails}](https://github.com/Emaasit/long-range-extrapolation/blob/dev/code/notebooks/results/emails/data-email.png?raw=true)
#
# ## The problem
# Such data a major challenge when estimating statistical models to extract hidden patterns and perform accurate forecasting. Most of the current literature in insider-threat detection and highway-safety planning involve visualizing the time-series for noticeable structure, such as periodicity, and hard coding them into pre-specified parametric functions.
#
# ### Source code
# For the impatient reader, two options are provided below to access the source code used for empirical analyses:
#
# 1. Most of the code is not shown here to keep the post concise but the code, data and results can be found [here on GitHub](https://github.com/Emaasit/long-range-extrapolation).
#
# 2. [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/Emaasit/long-range-extrapolation/master?urlpath=lab) Click this icon to open the notebooks in a web browser and explore the code and data without downloading the project and installing any software.
#
#
# ## Related Work and Limitations
# This approach is associated with two limitations. First, given that such trends may not be noticeable in small data, it is difficult to explicitly incorporate expressive structure into the statistical models during formulation. Second, it is difficult to know *a priori* the most appropriate functional form to use.
#
# ## Data Science Questions
#
#
# ## Hypothesis
# To address these limitations, a nonparametric Bayesian approach was proposed to capture hidden structure from limited data and perform accurate long-range forecasting. The proposed model, a Gaussian process with a spectral mixture kernel, precludes the need to pre-specify a functional form and hard code trends. Bayesian modeling was adopted to account for uncertainty.
#
# The mathematical details are described in a corresponding paper that can be found [here on arXiv]().
# ## Experiments
# ### The setup
# Let's first install some python packages that we shall use for our analysis. Also we shall set up our plotting requirements.
# In[2]:
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context('notebook', font_scale = 1.1)
np.random.seed(12345)
rc = {'xtick.labelsize': 40, 'ytick.labelsize': 40, 'axes.labelsize': 40, 'font.size': 40, 'lines.linewidth': 4.0,
'lines.markersize': 40, 'font.family': "serif", 'font.serif': "cm", 'savefig.dpi': 200,
'text.usetex': False, 'legend.fontsize': 40.0, 'axes.titlesize': 40, "figure.figsize": [24, 16]}
sns.set(rc = rc)
sns.set_style("darkgrid")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import gpflow
from gpflowopt.domain import ContinuousParameter
from gpflowopt.bo import BayesianOptimizer
from gpflowopt.acquisition import ExpectedImprovement
from gpflowopt.optim import StagedOptimizer, MCOptimizer, SciPyOptimizer
from gpflowopt.design import LatinHyperCube
# ### Raw data and sample formation
#
# First, let's read in the data using `pandas`, view the first three records and the structure of the resulting `pandas` dataframe.
# In[8]:
email_filtered = pd.read_csv("../data/emails/email_filtered.csv", parse_dates=["date"])
email_filtered.head(n = 3)
# In[9]:
email_filtered.info()
# Let's filter data for a particular known insider with user ID "CDE1846".
# In[10]:
df_insider = email_filtered[email_filtered["user"] == "CDE1846"]
df_insider.shape
# In[11]:
emails_per_month = df_insider.resample(rule = "1M", on = "date").sum().reset_index()
emails_per_month["date"] = pd.to_datetime(emails_per_month["date"], format = "%Y-%m-%d")
emails_per_month.columns = ["ds", "y"]
emails_per_month
# In[12]:
fig, ax = plt.subplots()
sns.barplot(data = emails_per_month, x = "ds", y = "y", ax = ax)
ax.set_xticklabels(labels = emails_per_month["ds"], rotation = 45)
ax.set_xlabel("Months of the Year")
ax.set_ylabel("Number of Emails")
ax.set_title("Number of Emails sent Monthly");
# Here, we look at the case where the insider email IP to their home account. The data is resampled per month and the anomalous behavior is clearly visible
# In[13]:
df_insider_non_org = df_insider[~df_insider['to'].str.contains('dtaa.com')]
df_insider_ewing = df_insider_non_org[df_insider_non_org['to'] == 'Ewing_Carlos@comcast.net']
df = df_insider_ewing.resample('1M', on='date').sum().reset_index()
df.columns = ["ds", "y"]
(df.y/1e6).describe()
df.y = df.y/1e6
# In[14]:
from datetime import datetime
df["ds"] = df.apply(lambda x: datetime.date(x["ds"]), axis = 1)
# In[15]:
fig, ax = plt.subplots()
sns.barplot(data = df, x = "ds", y = "y")
ax.set_xticklabels(labels = df.ds, rotation = 45)
ax.set_xlabel("Time")
ax.set_ylabel("Number of Emails ($10^6$)");
# ax.set_title("Number of Emails sent Monthly");
# In[16]:
df = df.drop([14, 15, 16])
# In[17]:
test_size = 11
X_complete = np.array([df.index]).reshape((df.shape[0], 1)).astype('float64')
X_train = X_complete[0:test_size, ]
X_test = X_complete[test_size:df.shape[0], ]
Y_complete = np.array([df.y]).reshape((df.shape[0], 1)).astype('float64')
Y_train = Y_complete[0:test_size, ]
Y_test = Y_complete[test_size:df.shape[0], ]
D = Y_train.shape[1];
# In[18]:
fig, ax = plt.subplots()
ax.plot(X_train.flatten(),Y_train.flatten(), c ='b', marker = "o", label = "Training data")
ax.plot(X_test.flatten(),Y_test.flatten(), c='r', marker = "o", label = 'Test data')
ax.set_xticklabels(labels = df.ds, rotation = 45)
ax.set_xlabel('Time')
ax.set_ylabel('Total size of emails in GB')
plt.legend(loc = "best");
# ### Empirical analysis
# This study used a Gaussian Process model with a Spectral Mixture (SM) kernel proposed by Wilson (2014). This is because the SM kernel is capable of capturing hidden structure with data without hard cording features in a kernel. Moreover, the SM kernel is capable of performing long-range extrapolation beyond available data.
#
# In[19]:
# Trains a model with a spectral mixture kernel, given an ndarray of
# 2Q frequencies and lengthscales
Q = 10 # nr of terms in the sum
max_iters = 1000
def create_model(hypers):
f = np.clip(hypers[:Q], 0, 5)
weights = np.ones(Q) / Q
lengths = hypers[Q:]
kterms = []
for i in range(Q):
rbf = gpflow.kernels.RBF(D, lengthscales=lengths[i], variance=1./Q)
rbf.lengthscales.transform = gpflow.transforms.Exp()
cos = gpflow.kernels.Cosine(D, lengthscales=f[i])
kterms.append(rbf * cos)
k = np.sum(kterms) + gpflow.kernels.Linear(D) + gpflow.kernels.Bias(D)
m = gpflow.gpr.GPR(X_train, Y_train, kern=k)
return m
m = create_model(np.ones((2*Q,)))
# ### Inference through Optimization of the likelihood
# In[20]:
get_ipython().run_cell_magic('time', '', 'm.optimize(maxiter = max_iters)\n')
# In[21]:
def plotprediction(m):
# Perform prediction
mu, var = m.predict_f(X_complete)
# Plot
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_xticklabels(labels = df.ds, rotation = 45)
ax.set_xlabel('Time')
ax.set_ylabel('Total size of emails in GB');
ax.plot(X_train.flatten(),Y_train.flatten(), c='b', marker = "o", label = 'Training data')
ax.plot(X_test.flatten(),Y_test.flatten(), c='r', marker = "o", label = 'Test data')
ax.plot(X_complete.flatten(), mu.flatten(), c='g', marker = "o", label = "Predicted mean function")
lower = mu - 2*np.sqrt(var)
upper = mu + 2*np.sqrt(var)
ax.plot(X_complete, upper, 'g--', X_complete, lower, 'g--', lw=1.2)
ax.fill_between(X_complete.flatten(), lower.flatten(), upper.flatten(),
color='g', alpha=.1, label = "95% Predicted credible interval")
plt.legend(loc = "best")
plt.tight_layout()
# In[22]:
plotprediction(m);
# In[23]:
## Calculate the RMSE and MAPE
def calculate_rmse(model, X_test, Y_test):
mu, var = model.predict_y(X_test)
rmse = np.sqrt(((mu - Y_test)**2).mean())
return rmse
def calculate_mape(model, X_test, Y_test):
mu, var = model.predict_y(X_test)
mape = (np.absolute(((mu - Y_test)/Y_test)*100)).mean()
return mape
# In[24]:
calculate_rmse(model=m, X_test = X_test, Y_test = Y_test)
calculate_mape(model=m, X_test = X_test, Y_test = Y_test)
# In[ ]:
### Inference thro
# ## Perform hyperparameter tuning using Bayesian Optimization
# Let's use Bayesian Optimization to find the optimal model parameters of the GP model and then use then to estimate the model and prediction.
# In[ ]:
from gpflowopt.objective import batch_apply
# Objective function for our optimization
# Input: N x 2Q ndarray, output: N x 1.
# returns the negative log likelihood obtained by training with given frequencies and rbf lengthscales
# Applies some tricks for stability similar to GPy's jitchol
@batch_apply
def objectivefx(freq):
m = create_model(freq)
for i in [0] + [10**exponent for exponent in range(6,1,-1)]:
try:
mean_diag = np.mean(np.diag(m.kern.compute_K_symm(X_train)))
m.likelihood.variance = 1 + mean_diag * i
m.optimize(maxiter=max_iters)
return -m.compute_log_likelihood()
except:
pass
raise RuntimeError("Frequency combination failed indefinately.")
# Setting up optimization domain.
lower = [0.]*Q
upper = [5.]*int(Q)
df = np.sum([ContinuousParameter('freq{0}'.format(i), l, u) for i, l, u in zip(range(Q), lower, upper)])
lower = [1e-5]*Q
upper = [300]*int(Q)
dl = np.sum([ContinuousParameter('l{0}'.format(i), l, u) for i, l, u in zip(range(Q), lower, upper)])
domain = df + dl
domain
# In[ ]:
get_ipython().run_cell_magic('time', '', 'design = LatinHyperCube(6, domain)\nX = design.generate()\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'Y = objectivefx(X)\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'k_surrogate = gpflow.kernels.Matern52(input_dim = domain.size, ARD = False)\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'model_surrogate = gpflow.gpr.GPR(X, Y, kern = k_surrogate)\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'acq_fn = ExpectedImprovement(model_surrogate)\n# acq_fn = MinValueEntropySearch(model_surrogate, domain = domain)\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'acq_optimizer = StagedOptimizer([MCOptimizer(domain, nsamples = 5000), \n SciPyOptimizer(domain)])\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'optimizer = BayesianOptimizer(domain = domain, \n acquisition = acq_fn, \n optimizer = acq_optimizer)\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'with optimizer.silent():\n result = optimizer.optimize(objectivefx = objectivefx, n_iter = 30)\n')
# In[ ]:
print(result)
# In[ ]:
get_ipython().run_cell_magic('time', '', 'm_opt = create_model(result.x[0,:])\nm_opt.optimize()\n')
# In[ ]:
plotprediction(m_opt)
# plt.savefig('results/emails/model-opt-emails.png');
# In[ ]:
## Inspect the evolution
f, axes = plt.subplots()
f = acq_fn.data[1][:,0]
axes.plot(np.arange(0, acq_fn.data[0].shape[0]), np.minimum.accumulate(f))
axes.set_ylabel('fmin')
axes.set_xlabel('Number of evaluated points');
# plt.savefig('results/emails/iterations-email.png');
# ### References
# 1. Emaasit, D. and Johnson, M.(2018). Capturing Structure Implicitly from Noisy Time-Series having Limited Data. arXiv preprint arXiv:xxxx.xxxx
#
# 2. Wilson
#
# 3. GPflow
# ### Getting Help
#
# Incase you need help running this code or have general questions, don't hesitate to email us at or .
# ## Computing Environment
# In[ ]:
# print system information/setup
get_ipython().run_line_magic('reload_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-v -m -p numpy,pandas,gpflowopt,gpflow,tensorflow,matplotlib,ipywidgets,beakerx,seaborn -g')