# Import Theano to start!
import theano
import GPy
from theano import tensor as T
import numpy as np
# Symbolic two-dimensional input: a theano matrix placeholder (no data yet)
X = T.matrix()
# Scalar objective: sum of squared elements of X (sum of an empty tensor is 0)
f = T.sum(T.square(X))
# Echoing a symbolic variable just prints its graph node (e.g. "Sum.0")
f
Sum.0
# Compile the symbolic graph into a callable: inputs [X] -> output f
my_func = theano.function([X], f)
# Evaluate on a concrete 3x4 array of standard-normal samples
X_values = np.random.randn(3,4)
# Returns a 0-d array holding sum(X_values**2)
my_func(X_values)
array(12.963304040797144)
# Symbolic gradient of f with respect to X (for f = sum(X**2), df/dX = 2*X)
g = theano.grad(f,X)
# Compile a single function that returns both the value and its derivatives
mu_new_func = theano.function([X], [f,g])
# Theano value and gradient at X_values
mu_new_func(X_values)
[array(12.963304040797144), array([[-2.38163752, -2.16098669, -0.87717575, 0.69632923], [ 2.54935613, -1.25319369, 0.35672432, -1.84071505], [ 0.31138535, -2.65571952, 1.61767688, 4.34798378]])]
# Analytic derivative of sum(X**2) is 2*X: should match theano's gradient above
X_values*2
array([[-2.38163752, -2.16098669, -0.87717575, 0.69632923], [ 2.54935613, -1.25319369, 0.35672432, -1.84071505], [ 0.31138535, -2.65571952, 1.61767688, 4.34798378]])
# Symbolic pieces of a linear-regression Gaussian negative log likelihood
w = T.dvector()
Xw = T.dot(X,w)
y = T.dvector()
# Residual sum of squares ||y - Xw||^2
error = T.sum(T.square(y-Xw))
# Noise standard deviation (scalar)
sigma = T.dscalar()
# NLL = 0.5*n*log(2*pi) + 0.5*n*log(sigma^2) + 0.5*RSS/sigma^2
neg_log_lik = 0.5*y.size*np.log(2*np.pi) + 0.5*y.size*T.log(sigma**2) + 0.5*error/sigma**2
# Compile objective plus gradients w.r.t. the weights and the noise scale
my_func = theano.function([X, y, sigma, w], [neg_log_lik, theano.grad(neg_log_lik, w), theano.grad(neg_log_lik, sigma)])
# Test with synthetic data: a known weight vector plus small Gaussian noise
w_true = np.random.randn(2)
X_values = np.random.randn(200,2)
# NOTE(review): the original also assigned this expression to `y`, which
# clobbered the symbolic theano vector defined above (with different noise
# than y_values, since randn was re-drawn). Only y_values is needed.
y_values = np.dot(X_values, w_true) + np.random.randn(200)*0.01
# evaluate likelihood, gradients with respect to w and gradients with respect to sigma
my_func(X_values, y_values , 0.02, np.array([1,1]))
[array(1740553.6503363969), array([ 1130920.6618946 , 756385.18434244]), array(-174105226.72308418)]
# Symbolic inputs for an RBF (squared-exponential) kernel
X = T.matrix()
Z = T.matrix()
len_scale = T.dscalar()
sigma2 = T.dscalar()
# Pairwise squared distances after rescaling by the lengthscale, via
# broadcasting: r2[i,j] = sum_d ((X[i,d] - Z[j,d]) / len_scale)**2
r2 = ((X[:,None,:]/len_scale - Z[None,:,:]/len_scale)**2).sum(2)
# k(x,z) = sigma2 * exp(-0.5 * r2)
rbf_kernel = sigma2*T.exp(-0.5*r2)
# Compile evaluators for the squared distances and the kernel matrix
r2_eval = theano.function([X,Z,len_scale],[r2])
kern_eval = theano.function([X,Z,len_scale,sigma2],[rbf_kernel])
# Random test data: 4 and 6 points in 2-D
X_val = np.random.randn(4,2)
Z_val = np.random.randn(6,2)
sigma2_val = 1
len_scale_val = 2
# Evaluate the distances; positional arguments must match the order
# given to theano.function above
r2_eval(X_val,Z_val,len_scale_val)
[array([[ 0.11363081, 0.02207483, 0.26821606, 0.33116141, 0.28348765, 1.18291004], [ 0.47115938, 0.14625441, 0.84486132, 0.57121556, 0.13850336, 0.89131393], [ 0.21376622, 0.34201811, 0.04655479, 0.6857568 , 0.98337181, 2.25323581], [ 0.3233845 , 0.7621669 , 0.23656762, 0.48814614, 1.35426331, 2.07027349]])]
# 4x6 kernel matrix between X_val and Z_val
kern_eval(X_val,Z_val,len_scale_val,sigma2_val)
[array([[ 0.94476845, 0.98902328, 0.87449559, 0.84740147, 0.86784355, 0.55352131], [ 0.7901127 , 0.92948259, 0.6554517 , 0.75155733, 0.93309181, 0.64040341], [ 0.89863071, 0.84281394, 0.97699143, 0.70972451, 0.61159444, 0.32412763], [ 0.85070297, 0.68312088, 0.88844387, 0.78343039, 0.50807223, 0.35517781]])]
# Cross-check against GPy's reference RBF kernel with the same hyperparameters
kernelGPy = GPy.kern.RBF(2, variance = sigma2_val, lengthscale=len_scale_val)
kernelGPy.K(X_val,Z_val)
array([[ 0.94476845, 0.98902328, 0.87449559, 0.84740147, 0.86784355, 0.55352131], [ 0.7901127 , 0.92948259, 0.6554517 , 0.75155733, 0.93309181, 0.64040341], [ 0.89863071, 0.84281394, 0.97699143, 0.70972451, 0.61159444, 0.32412763], [ 0.85070297, 0.68312088, 0.88844387, 0.78343039, 0.50807223, 0.35517781]])
So, it works!
# Chain-rule trick: for an upstream gradient dL_dK, differentiating
# h = sum(dL_dK * K) w.r.t. a parameter theta yields sum(dL_dK * dK/dtheta),
# exactly what GPy's update_gradients_* API needs.
dL_dK = T.matrix()
h = T.sum(dL_dK * rbf_kernel)
# Gradient w.r.t. the variance sigma2
grads_wrt_sigma2 = theano.grad(h,sigma2)
grads_wrt_sigma2_eval = theano.function([X,Z,len_scale,sigma2,dL_dK],grads_wrt_sigma2)
# Gradient w.r.t. the lengthscale
grads_wrt_lengthscale = theano.grad(h,len_scale)
grads_wrt_lengthscale_eval = theano.function([X,Z,len_scale,sigma2,dL_dK],grads_wrt_lengthscale)
# Gradient w.r.t. the inputs X (first kernel argument only)
grads_wrt_X= theano.grad(h,X)
grads_wrt_X_eval = theano.function([X,Z,len_scale,sigma2,dL_dK],grads_wrt_X)
# One compiled function returning all three gradients in a single call
grads_all_eval = theano.function([X,Z,len_scale,sigma2,dL_dK], [grads_wrt_sigma2, grads_wrt_lengthscale, grads_wrt_X])
# Baseline: fit a plain GPy GP regression on 1-D toy data
import GPy
from matplotlib import pyplot as plt
# 201 evenly spaced inputs on [-pi, pi], shaped (201, 1)
x = np.linspace(-np.pi, np.pi, 201)[:,None]
# NOTE(review): np.random.rand is uniform on [0,1) (mean 0.5, not zero-mean);
# randn was likely intended for additive noise -- confirm
y = np.sin(x) + np.random.rand(201)[:,None]
# plot the model (IPython magic; only meaningful inside a notebook session)
%pylab inline
model = GPy.models.GPRegression(x,y)
model.optimize()
model.plot()
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f'] `%matplotlib` prevents importing * from pylab and numpy
{'dataplot': [<matplotlib.lines.Line2D at 0x5d0c110>], 'gpplot': [[<matplotlib.lines.Line2D at 0x5d0a090>], [<matplotlib.patches.Polygon at 0x5d0a290>], [<matplotlib.lines.Line2D at 0x5d0a7d0>], [<matplotlib.lines.Line2D at 0x5d0abd0>]]}
## Class of kernels for GPy using theano (that uses our grads_all_eval)
class TheanoKern(GPy.kern.Kern):
    """RBF kernel whose covariance and gradients are computed by the theano
    functions compiled above (kern_eval, grads_all_eval, grads_wrt_X_eval)."""

    def __init__(self, input_dim, variance=1., lengthscale=1.):
        GPy.kern.Kern.__init__(self, input_dim, active_dims=None, name='theanokern')
        self.var = GPy.core.Param('variance', variance)
        self.ls = GPy.core.Param('lengthscale', lengthscale)
        # Register the parameters so GPy's optimizer can see them
        self.link_parameters(self.var, self.ls)
        # Add here a function which initializes all the theano symbolic functions

    def K(self, X, X2=None):
        """Covariance matrix K(X, X2); X2=None means K(X, X)."""
        if X2 is None:
            X2 = X
        # kern_eval was compiled with a list output, hence the trailing [0]
        return kern_eval(X, X2, self.ls[0], self.var[0])[0]

    def Kdiag(self, X):
        """Diagonal of K(X, X).

        For this RBF kernel r2 is exactly 0 on the diagonal, so every diagonal
        entry equals the variance; the original built the full N x N matrix
        via np.diag(self.K(X)) just to read it back.
        """
        return self.var[0] * np.ones(X.shape[0])

    def update_gradients_full(self, dL_dK, X, X2=None):
        """Store the objective's gradients w.r.t. variance and lengthscale."""
        if X2 is None:
            X2 = X
        dvar, dl, dX = grads_all_eval(X, X2, self.ls[0], self.var[0], dL_dK)
        self.var.gradient = dvar
        self.ls.gradient = dl

    def gradients_X(self, dL_dK, X, X2=None):
        """Gradient of the objective w.r.t. the inputs X."""
        if X2 is None:
            # Symmetric case K(X, X): X enters through BOTH kernel arguments.
            # grads_wrt_X_eval only differentiates the first argument, so add
            # the second argument's contribution, which equals the first's
            # with dL_dK transposed. The original dropped this term.
            return (grads_wrt_X_eval(X, X, self.ls[0], self.var[0], dL_dK)
                    + grads_wrt_X_eval(X, X, self.ls[0], self.var[0], dL_dK.T))
        return grads_wrt_X_eval(X, X2, self.ls[0], self.var[0], dL_dK)

    def update_gradients_diag(self, dL_dKdiag, X):
        """Gradients of Kdiag: dKdiag/dvariance = 1, dKdiag/dlengthscale = 0.

        The original body was ``pass``, which leaves stale gradients from a
        previous update_gradients_full call in place.
        """
        self.var.gradient = np.sum(dL_dKdiag)
        self.ls.gradient = 0.
# Instantiate our theano-backed kernel in 2-D and evaluate K(X_val, X_val)
k = TheanoKern(2)
k.K(X_val)
array([[ 1. , 0.7246144 , 0.63578887, 0.20724078], [ 0.7246144 , 1. , 0.22850941, 0.04283463], [ 0.63578887, 0.22850941, 1. , 0.3758315 ], [ 0.20724078, 0.04283463, 0.3758315 , 1. ]])
# GP regression on the 1-D toy data, now using the theano kernel
m = GPy.models.GPRegression(x, y, kernel=TheanoKern(1))
# Check analytic gradients against numerical differences (verbose output)
m.checkgrad(1)
Name | Ratio | Difference | Analytical | Numerical | dF_ratio ------------------------------------------------------------------------------------------------------------------------------- GP_regression.theanokern.variance[[0]] | 1.000000 | 0.000000 | 1.896246 | 1.896246 | 2e-08 GP_regression.theanokern.lengthscale[[0]] | 1.000000 | 0.000000 | -5.847597 | -5.847597 | 6e-08 GP_regression.Gaussian_noise.variance[[0]] | 1.000000 | 0.000000 | 56.397305 | 56.397305 | 6e-07
True
Done! :)