#!/usr/bin/env python
# coding: utf-8

# In[8]:


import matplotlib.pyplot as plt
import numpy as np


# # From Gaussian Algebra to Gaussian Processes (Part 2)

# In the previous post, we covered the following topics:
# 
# - A Gaussian process (GP) defines a distribution over functions (i.e. function evaluations). √
# - Marginalizing a Gaussian over a subset of its elements gives another Gaussian (just pluck out the pieces of interest). √
# - Conditioning a subset of the elements of a Gaussian on another subset gives another Gaussian (a simple algebraic formula). √
# - The posterior over functions (the linear map of the posterior over weights onto some matrix $A = \phi(X_{*})^T$). √
# - Covariances (the second thing we need in order to specify a multivariate Gaussian). √
# 
# **If any of the above is still unclear, please go no further: re-visit the [previous post]() first.**
# 
# Conversely, we did not directly cover:
# 
# - Kernels
# - Squared exponentials
# 
# Here, we'll explain these two.

# # The more features we use, the more expressive our model becomes

# We concluded the previous post by plotting posteriors over function evaluations given various `phi_func`s, i.e. functions that create "features" $\phi(X)$ from an input $X$.
# 
# For example:
# 
# ```python
# X_train = np.array([-3, -5, 6, 2, 1])  # 5 inputs
# y_train = np.array([1, 4, 2, 9, 4])    # 5 corresponding outputs, which we'll use below
# 
# def phi_func(x):
#     return np.array([3 * np.cos(x), np.abs(x - np.abs(x - 3))])  # makes D=2 features for each input
# 
# 
# >>> phi_func(X_train).shape
# (2, 5)
# ```
# 
# One common choice of features is given by "radial basis functions", a.k.a. the "squared exponential" function, defined as:
# 
# ```python
# def phi_func(x, D):
#     return np.array([np.exp(-.5 * (x - d)**2) for d in range(int(-D / 2), int(D / 2))])  # phi_x.shape: (D, len(x))
# ```
# 
# Again, the choice of which features to use is ultimately arbitrary, i.e. a choice left to the modeler.
# 
# Throughout the exercise, we saw that the larger the dimensionality $d$ of our feature function `phi_func`, the more expressive our model became, i.e. the less endemically prone it was to underfitting.
# 
# **So, how far can we take this?**

# # Computing features is expensive

# Ideally, we'd compute as many features as possible for each input element, i.e. employ `phi_func(x, D=some_huge_number)`. Tragically, the cost of doing so adds up, and ultimately becomes intractable past meaningfully large values of $d$.
# 
# **Perhaps there's a better way?**

# # How are these things used?

# Let's bring back our GP equations, and prepare ourselves to *squint*! In the previous post, we outlined the following modeling process:
# 
# 1. Define a joint prior distribution over weights and function evaluations, $P(w, y)$.
# 2. Conditioning $P(w, y)$ on some observed function evaluations $y$ (via the Gaussian conditioning formula), compute the posterior distribution over weights, $P(w\vert y)$.
# 3. Applying the linear map $\phi(X_*)^T$ (our transformed test inputs) to $P(w\vert y)$, compute the posterior distribution over function evaluations, $P(y_*\ \vert\ y) = P(\phi(X_{*})^Tw\ \vert\ y)$.
# 
# Now, let's unpack #2 and #3.
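# To make the snippets in the next section concrete and runnable, here's a minimal setup, reusing the training data and the squared-exponential `phi_func` from above; `D = 10` is an arbitrary, illustrative choice of dimensionality.
# 
# ```python
# import numpy as np
# 
# X_train = np.array([-3, -5, 6, 2, 1])  # 5 inputs
# y_train = np.array([1, 4, 2, 9, 4])    # 5 corresponding outputs
# 
# D = 10  # arbitrary, illustrative dimensionality
# 
# def phi_func(x, D=D):
#     # One squared-exponential "bump" centered on each integer in [-D/2, D/2)
#     return np.array([np.exp(-.5 * (x - d)**2) for d in range(int(-D / 2), int(D / 2))])  # shape: (D, len(x))
# ```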
# 
# ### $P(w\vert y)$
# 
# - First, the mathematical equation:
# 
# $$
# \begin{align*}
# P(w\vert y)
# &= \mathcal{N}(\mu_w + \Sigma_{wy}\Sigma_y^{-1}(y - \mu_y),\ \Sigma_w - \Sigma_{wy}\Sigma_y^{-1}\Sigma_{wy}^T)\\
# \\
# &= \mathcal{N}(\mu_w + \Sigma_w\phi(X)(\phi(X)^T\Sigma_w \phi(X))^{-1}(y - \phi(X)^T\mu_w),\ \Sigma_w - \Sigma_w\phi(X)(\phi(X)^T\Sigma_w \phi(X))^{-1}\phi(X)^T\Sigma_w)
# \end{align*}
# $$
# 
# where the second line substitutes $\mu_y = \phi(X)^T\mu_w$, $\Sigma_y = \phi(X)^T\Sigma_w \phi(X)$ and $\Sigma_{wy} = \Sigma_w\phi(X)$.
# 
# - Next, this equation in code:
# 
# ```python
# # Define initial parameters
# D = ...  # dimensionality of `phi_func`
# 
# mu_w = np.zeros(D)  # often a vector of zeros, though it doesn't have to be
# cov_w = np.eye(D)   # often the identity matrix, though it doesn't have to be
# 
# # Featurize `X_train`
# phi_x = phi_func(X_train, D=D)
# 
# # Params of prior distribution over function evals
# mu_y = phi_x.T @ mu_w  # = np.zeros(len(X_train)), since `mu_w` is all zeros
# cov_y = phi_x.T @ cov_w @ phi_x
# 
# # Params of posterior distribution over weights
# mu_w_post = mu_w + cov_w @ phi_x @ np.linalg.inv(cov_y) @ (y_train - mu_y)  # = mu_w + cov_w @ phi_x @ np.linalg.inv(cov_y) @ y_train
# cov_w_post = cov_w - cov_w @ phi_x @ np.linalg.inv(cov_y) @ phi_x.T @ cov_w  # = cov_w - cov_w @ phi_x @ np.linalg.inv(phi_x.T @ cov_w @ phi_x) @ phi_x.T @ cov_w
# ```
# 
# ### $P(y_*\ \vert\ y) = P(\phi(X_{*})^Tw\ \vert\ y)$
# 
# Here, $X_*$ is a set of test points, e.g. `np.linspace(-10, 10, 200)`.
# 
# In addition, let's call $X_* \rightarrow$ `X_test` and $y_* \rightarrow$ `y_test`.
# 
# This posterior is simply the linear map of $P(w\vert y)$ onto $\phi(X_*)^T$. In code, with `phi_x_test = phi_func(X_test, D=D)`, its parameters are `mu_y_test_post = phi_x_test.T @ mu_w_post` and `cov_y_test_post = phi_x_test.T @ cov_w_post @ phi_x_test` (both are written out in full in the "Rewriting our equations" section below).

# # Never alone

# Squinting at the (expanded) equations for `mu_y_test_post` and `cov_y_test_post`, we see that `phi_x` and `phi_x_test` appear **only in the presence of another `phi_x` or `phi_x_test`.**
# 
# These four distinct such terms are:
# 
# ```python
# phi_x_test.T @ cov_w @ phi_x_test
# phi_x_test.T @ cov_w @ phi_x
# phi_x.T @ cov_w @ phi_x
# phi_x.T @ cov_w @ phi_x_test
# ```
# 
# In mathematical notation, they are (respectively):
# 
# - $\phi(X_*)^T\Sigma_w \phi(X_*)$
# - $\phi(X_*)^T\Sigma_w \phi(X)$
# - $\phi(X)^T\Sigma_w \phi(X)$
# - $\phi(X)^T\Sigma_w \phi(X_*)$

# # Simplifying further

# These are nothing more than *scaled* (via the $\Sigma_w$ term) dot products in some expanded feature space $\phi$.
# 
# *Until now, we've explicitly chosen what this $\phi$ function is.*
# 
# If the scaling matrix $\Sigma_w$ is [positive definite](https://en.wikipedia.org/wiki/Positive-definite_matrix), it has a symmetric matrix square root $\sqrt{\Sigma_w}$, and we can state the following, using $\phi(X)^T\Sigma_w \phi(X)$, i.e. `phi_x.T @ cov_w @ phi_x`, as an example:
# 
# $$
# \begin{align*}
# \Sigma_w = (\sqrt{\Sigma_w})^2
# \end{align*}
# $$
# 
# $$
# \begin{align*}
# \phi(X)^T \Sigma_w \phi(X)
# &= \big(\sqrt{\Sigma_w}\phi(X)\big)^T\big(\sqrt{\Sigma_w}\phi(X)\big)\\
# &= \varphi(X)^T\varphi(X)\\
# &= \varphi(X) \cdot \varphi(X)\\
# \end{align*}
# $$
# 
# As such, our four distinct scaled-dot-product terms can be rewritten as:
# 
# - $\phi(X_*)^T\Sigma_w \phi(X_*) = \varphi(X_*) \cdot \varphi(X_*)$
# - $\phi(X_*)^T\Sigma_w \phi(X) = \varphi(X_*) \cdot \varphi(X)$
# - $\phi(X)^T\Sigma_w \phi(X) = \varphi(X) \cdot \varphi(X)$
# - $\phi(X)^T\Sigma_w \phi(X_*) = \varphi(X) \cdot \varphi(X_*)$
# 
# **In other words, these terms can be equivalently written as dot products in some space $\varphi$.**
# 
# *We have **not** explicitly chosen what this $\varphi$ function is.*

# # Kernels

# A "kernel" is a function which gives the similarity between individual elements of two sets; applying it to every pair of elements yields a matrix of similarities, i.e. a Gram matrix.
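# 
# In code, that's the whole job: a function of two elements, applied to every pair. A minimal sketch, with a made-up similarity function:
# 
# ```python
# import numpy as np
# 
# def toy_similarity(a, b):
#     # A made-up similarity: closer numbers score higher (1 when equal, decaying towards 0)
#     return np.exp(-abs(a - b))
# 
# def gram_matrix(xs, ys, kernel):
#     # Evaluate the kernel on every pair of elements from the two lists
#     return np.array([[kernel(x, y) for y in ys] for x in xs])
# 
# 
# >>> gram_matrix([1, 2, 3], [1.5, 4.0], toy_similarity).shape
# (3, 2)
# ```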
# 
# For instance, imagine we have two sets of countries, $\{\text{France}, \text{Germany}, \text{Iceland}\}$ and $\{\text{Morocco}, \text{Denmark}\}$, and that similarity is given by an integer value in $[1, 5]$, where 1 is the least similar, and 5 is the most. Applying a kernel to these sets might give a Gram matrix such as:

# In[2]:


import pandas as pd

pd.DataFrame([[4, 2, 1], [3, 3, 4]], columns=['France', 'Germany', 'Iceland'], index=['Morocco', 'Denmark'])


# **When you hear the term "kernel" in the context of machine learning, think "similarity between things in lists." That's it.**
# 
# NB: A "list" could be a list of vectors, i.e. a matrix. Vectors and matrices are the canonical inputs to a kernel.
# 
# # Mercer's Theorem
# 
# A key result of Mercer's Theorem is that any valid (i.e. positive semi-definite) kernel function can be expressed as a dot product, i.e.
# 
# $$
# K(X, X') = \varphi(X) \cdot \varphi (X')
# $$
# 
# where $\varphi$ is some function that creates $d$ (possibly infinitely many) features out of $X$ (in the same vein as `phi_func` from above).
# 
# ## Example
# 
# To illustrate, I'll borrow an example from [CrossValidated](https://stats.stackexchange.com/questions/152897/how-to-intuitively-explain-what-a-kernel-is):
# 
# "For example, consider a simple polynomial kernel $K(\mathbf x, \mathbf y) = (1 + \mathbf x^T \mathbf y)^2$ with $\mathbf x, \mathbf y \in \mathbb R^2$. This doesn't seem to correspond to any mapping function $\varphi$, it's just a function that returns a real number. Assuming that $\mathbf x = (x_1, x_2)$ and $\mathbf y = (y_1, y_2)$, let's expand this expression:
# 
# $$
# \begin{align}
# K(\mathbf x, \mathbf y)
# &= (1 + \mathbf x^T \mathbf y)^2\\
# &= (1 + x_1 \, y_1 + x_2 \, y_2)^2\\
# &= 1 + x_1^2 y_1^2 + x_2^2 y_2^2 + 2 x_1 y_1 + 2 x_2 y_2 + 2 x_1 x_2 y_1 y_2
# \end{align}
# $$
# 
# Note that this is nothing else but a dot product between two vectors $(1, x_1^2, x_2^2, \sqrt{2} x_1, \sqrt{2} x_2, \sqrt{2} x_1 x_2)$ and $(1, y_1^2, y_2^2, \sqrt{2} y_1, \sqrt{2} y_2, \sqrt{2} y_1 y_2)$, and $\varphi(\mathbf x) = \varphi(x_1, x_2) = (1, x_1^2, x_2^2, \sqrt{2} x_1, \sqrt{2} x_2, \sqrt{2} x_1 x_2)$. So the kernel $K(\mathbf x, \mathbf y) = (1 + \mathbf x^T \mathbf y)^2 = \varphi(\mathbf x) \cdot \varphi(\mathbf y)$ computes a dot product in 6-dimensional space without explicitly visiting this space."
# 
# ## What this means
# 
# ![](./kernels-for-gaussian-processes.svg)
# 
# - We start with inputs $X$ and $Y$.
# - Our goal is to compute the similarity between them, $\text{Sim}(X, Y)$.
# 
# ### Bottom path
# - Lifting these inputs into some feature space, then computing their dot product in that space, i.e. $\varphi(X) \cdot \varphi (Y)$ (where $F = \varphi$, since I couldn't figure out how to draw a $\varphi$ in [draw.io](http://draw.io)), is one strategy for computing this similarity.
# - Unfortunately, this flexibility comes at a cost: **the computation is extremely expensive.**
# 
# ### Top path
# - A valid kernel computes the similarity between inputs directly. The function it employs might be extremely simple, e.g. $(X - Y)^{123}$; **the computation is extremely cheap.**
# 
# ### Mercer!
# - Mercer's Theorem tells us that every valid kernel, i.e. the top path, is *implicitly traversing the bottom path.* **In other words, kernels allow us to directly compute the result of an extremely expensive computation, extremely cheaply.**
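# To see that the two paths really do agree, here's a quick numerical check of the polynomial-kernel example above (the particular values of `x` and `y` are arbitrary, chosen for illustration):
# 
# ```python
# import numpy as np
# 
# x = np.array([1., 2.])
# y = np.array([3., 4.])
# 
# # Top path: the cheap kernel evaluation
# top = (1 + x @ y)**2
# 
# # Bottom path: explicitly lift into the 6-dimensional feature space, then take a dot product
# def varphi(v):
#     v1, v2 = v
#     return np.array([1, v1**2, v2**2, np.sqrt(2) * v1, np.sqrt(2) * v2, np.sqrt(2) * v1 * v2])
# 
# bottom = varphi(x) @ varphi(y)
# 
# 
# >>> np.isclose(top, bottom)  # both equal 144
# True
# ```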
# # How does this help?

# Once more, the Gaussian process equations are littered with the following terms:
# 
# - $\phi(X_*)^T\Sigma_w \phi(X_*) = \varphi(X_*) \cdot \varphi(X_*)$
# - $\phi(X_*)^T\Sigma_w \phi(X) = \varphi(X_*) \cdot \varphi(X)$
# - $\phi(X)^T\Sigma_w \phi(X) = \varphi(X) \cdot \varphi(X)$
# - $\phi(X)^T\Sigma_w \phi(X_*) = \varphi(X) \cdot \varphi(X_*)$
# 
# In addition, we previously established that the more we increase the dimensionality $d$ of our feature function, the more flexible our model becomes.
# 
# Finally, past any meaningfully large value of $d$, and irrespective of what $\varphi$ actually is, **this computation becomes intractably expensive.**
# 
# ## Kernels!
# 
# You know where this is going.
# 
# Given Mercer's Theorem, we can state the following equalities:
# 
# - $\varphi(X_*) \cdot \varphi(X_*) = K(X_*, X_*)$
# - $\varphi(X_*) \cdot \varphi(X) = K(X_*, X)$
# - $\varphi(X) \cdot \varphi(X) = K(X, X)$
# - $\varphi(X) \cdot \varphi(X_*) = K(X, X_*)$

# # Which kernels to choose?

# At the outset, we stated that our primary goal was to increase $d$. As such, **let's pick the kernel whose implicit $\varphi$ has the largest dimensionality possible.**
# 
# In the example above, we saw that the kernel $K(\mathbf x, \mathbf y)$ was implicitly computing a $d=6$-dimensional dot product. Which kernels compute a $d=100$-dimensional dot product? $d=1000$?
# 
# **How about $d=\infty$?**

# # Radial basis function, a.k.a. the "squared-exponential"

# This kernel implicitly computes a $d=\infty$-dimensional dot product. That's it. **That's why it's so ubiquitous in Gaussian processes.**
# 
# (One way to see this, for scalar inputs: $\exp(-\frac{1}{2}(x - y)^2) = \exp(-\frac{x^2}{2})\exp(-\frac{y^2}{2})\exp(xy)$, and Taylor-expanding $\exp(xy) = \sum_{n=0}^{\infty}\frac{x^n y^n}{n!}$ exhibits the whole thing as a dot product over infinitely many scaled polynomial features.)

# # Rewriting our equations

# With all of the above in mind, let's rewrite the equations for the parameters of our posterior distribution over function evaluations.
# 
# ```python
# # The mean of the posterior distribution over function evaluations
# mu_y_test_post = phi_x_test.T @ mu_w_post
# 
# # ...which, expanding `mu_w_post` (and recalling that `mu_w` is all zeros), equals:
# mu_y_test_post = phi_x_test.T @ cov_w @ phi_x @ np.linalg.inv(phi_x.T @ cov_w @ phi_x) @ y_train
# 
# # ...and, substituting in our kernels:
# mu_y_test_post = k(X_test, X_train) @ np.linalg.inv(k(X_train, X_train)) @ y_train
# 
# # The covariance of the posterior distribution over function evaluations
# cov_y_test_post = phi_x_test.T @ cov_w_post @ phi_x_test
# 
# # ...which, expanding `cov_w_post`, equals:
# cov_y_test_post = phi_x_test.T @ cov_w @ phi_x_test - \
#     phi_x_test.T @ cov_w @ phi_x @ np.linalg.inv(phi_x.T @ cov_w @ phi_x) @ \
#     phi_x.T @ cov_w @ phi_x_test
# 
# # ...and, substituting in our kernels:
# cov_y_test_post = k(X_test, X_test) - \
#     k(X_test, X_train) @ np.linalg.inv(k(X_train, X_train)) @ k(X_train, X_test)
# ```

# # Defining the kernel in code

# Mathematically, the RBF kernel is defined as follows:
# 
# $$
# K(X, Y) = \exp(-\frac{1}{2}\vert X - Y \vert ^2)
# $$
# 
# To conclude, let's define a Python function for the parameters of our posterior over function evaluations, using this RBF kernel as `k`, then plot the resulting distribution.
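# Before we do, here's a quick sanity check that the kernel substitution above is purely a rewrite: for a *finite* feature map (the squared-exponential `phi_func`, with an arbitrary, illustrative `D = 10`), the weight-space route and the kernel route produce the same posterior mean.
# 
# ```python
# import numpy as np
# 
# X_train = np.array([-3, -5, 6, 2, 1])
# y_train = np.array([1, 4, 2, 9, 4])
# X_test = np.linspace(-10, 10, 200)
# 
# D = 10  # arbitrary, illustrative dimensionality
# cov_w = np.eye(D)
# 
# def phi_func(x, D=D):
#     return np.array([np.exp(-.5 * (x - d)**2) for d in range(int(-D / 2), int(D / 2))])
# 
# phi_x, phi_x_test = phi_func(X_train), phi_func(X_test)
# 
# # Weight-space route: posterior over weights (with mu_w = 0), then the linear map onto the test features
# mu_w_post = cov_w @ phi_x @ np.linalg.inv(phi_x.T @ cov_w @ phi_x) @ y_train
# mu_via_weights = phi_x_test.T @ mu_w_post
# 
# # Kernel route, where the kernel is *defined* as the scaled dot product of these same features
# def k_finite(a, b):
#     return phi_func(a).T @ cov_w @ phi_func(b)
# 
# mu_via_kernel = k_finite(X_test, X_train) @ np.linalg.inv(k_finite(X_train, X_train)) @ y_train
# 
# 
# >>> np.allclose(mu_via_weights, mu_via_kernel)
# True
# ```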
# In[39]:


X_train = np.array([-3, -5, 6, 2, 1])  # 5 inputs
y_train = np.array([1, 4, 2, 9, 4])  # 5 corresponding outputs, which we'll use below
X_test = np.linspace(-10, 10, 200)  # vector of test inputs


def rbf_kernel(x, y):
    x = np.expand_dims(x, 1)  # shape: (len(x), 1)
    y = np.expand_dims(y, 0)  # shape: (1, len(y))
    return np.exp(-.5 * (x - y)**2)  # shape: (len(x), len(y))


def k(x, y):
    return rbf_kernel(x, y)


# The following quantity is used in both `mu_y_test_post` and `cov_y_test_post`;
# we extract it into a separate variable for readability
A = k(X_test, X_train) @ np.linalg.inv(k(X_train, X_train))

mu_y_test_post = A @ y_train
cov_y_test_post = k(X_test, X_test) - A @ k(X_train, X_test)


# # Plot results

# In[41]:


def plot_gp_posterior(mu_y_post, cov_y_post, x_train, y_train, x_test, n_samples=0, ylim=(-3, 10)):
    plt.figure(figsize=(14, 9))
    plt.ylim(*ylim)
    plt.xlim(-10, 10)
    plt.title('Posterior Distribution over Function Evaluations')

    # Extract the variances, i.e. the diagonal, of our covariance matrix
    var_y_post = np.diag(cov_y_post)

    # Plot the error bars.
    # To do this, we fill the space between `(mu_y_post - var_y_post, mu_y_post + var_y_post)` for each `x`
    plt.fill_between(x_test, mu_y_post - var_y_post, mu_y_post + var_y_post, color='#23AEDB', alpha=.5)

    # Scatter-plot our original `(x, y)` tuples
    plt.plot(x_train, y_train, 'ro', markersize=10)

    # Optionally plot actual samples (function evaluations) from this posterior
    if n_samples > 0:
        for _ in range(n_samples):
            y_pred = np.random.multivariate_normal(mu_y_post, cov_y_post)
            plt.plot(x_test, y_pred, alpha=.2)


plot_gp_posterior(mu_y_test_post, cov_y_test_post, X_train, y_train, X_test, n_samples=0)


# And for good measure, with some samples from the posterior:

# In[38]:


plot_gp_posterior(mu_y_test_post, cov_y_test_post, X_train, y_train, X_test, n_samples=25)


# # In summary

# In this post, we've unpacked the notion of a kernel, and its ubiquitous use in Gaussian Processes.
# 
# In addition, we've introduced the RBF kernel, i.e. "squared exponential" kernel, and motivated its widespread application in these models.