#!/usr/bin/env python
# coding: utf-8

# # Sparse Gaussian Process Regression
#
# Let's set some settings for this Jupyter Notebook.

# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')

from warnings import filterwarnings
filterwarnings("ignore")
import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'
os.environ['THEANO_FLAGS'] = 'device=cpu'

import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(12345)
rc = {'xtick.labelsize': 20,
      'ytick.labelsize': 20,
      'axes.labelsize': 20,
      'font.size': 20,
      'legend.fontsize': 12.0,
      'axes.titlesize': 10,
      "figure.figsize": [12, 6]}
sns.set(rc=rc)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Now, let's import the `SparseGaussianProcessRegressor` algorithm from the `pymc-learn` package.

# In[3]:


import pmlearn
from pmlearn.gaussian_process import SparseGaussianProcessRegressor
print('Running on pymc-learn v{}'.format(pmlearn.__version__))


# ## Step 1: Prepare the data
#
# Generate synthetic data.

# In[4]:


n = 150  # The number of data points
X = np.linspace(start=0, stop=10, num=n)[:, None]  # The inputs to the GP must be arranged as a column vector

# Define the true covariance function and its hyperparameters
length_scale_true = 1.0
signal_variance_true = 3.0
cov_func = signal_variance_true**2 * pm.gp.cov.ExpQuad(1, length_scale_true)

# A constant mean function equal to 10 everywhere
mean_func = pm.gp.mean.Constant(10)

# The latent function values are one sample from a multivariate normal.
# Note that we have to call `eval()` because PyMC3 is built on top of Theano.
f_true = np.random.multivariate_normal(mean_func(X).eval(),
                                       cov_func(X).eval() + 1e-8*np.eye(n), 1).flatten()

# The observed data is the latent function plus a small amount of Gaussian noise
# with standard deviation `noise_variance_true`
noise_variance_true = 2.0
y = f_true + noise_variance_true * np.random.randn(n)

# Plot the data and the unobserved latent function
fig = plt.figure(figsize=(12, 5))
ax = fig.gca()
ax.plot(X, f_true, "dodgerblue", lw=3, label="True f");
ax.plot(X, y, 'ok', ms=3, label="Data");
ax.set_xlabel("X");
ax.set_ylabel("y");
plt.legend();


# In[5]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# ## Step 2: Instantiate a model

# In[6]:


model = SparseGaussianProcessRegressor()


# ## Step 3: Perform inference

# In[7]:


model.fit(X_train, y_train)


# ## Step 4: Diagnose convergence

# In[8]:


model.plot_elbo()


# In[8]:


pm.traceplot(model.trace);


# In[9]:


pm.traceplot(model.trace, lines={"signal_variance": signal_variance_true,
                                 "noise_variance": noise_variance_true,
                                 "length_scale": length_scale_true},
             varnames=["signal_variance", "noise_variance", "length_scale"]);


# In[11]:


pm.forestplot(model.trace, varnames=["signal_variance", "noise_variance", "length_scale"]);


# ## Step 5: Criticize the model

# In[12]:


pm.summary(model.trace, varnames=["signal_variance", "length_scale", "noise_variance"])


# In[13]:


pm.plot_posterior(model.trace, varnames=["signal_variance", "noise_variance", "length_scale"],
                  figsize=[14, 8]);


# In[8]:


# Collect the marginal posterior means into a pandas DataFrame to display
pd.DataFrame({"Parameter": ["length_scale", "signal_variance", "noise_variance"],
              "Predicted Mean Value": [float(model.trace["length_scale"].mean(axis=0)),
                                       float(model.trace["signal_variance"].mean(axis=0)),
                                       float(model.trace["noise_variance"].mean(axis=0))],
              "True value": [length_scale_true, signal_variance_true, noise_variance_true]})
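# As a quick numeric sanity check, the cell below (a minimal sketch, not part of the
# original analysis) reports the relative error of each ADVI posterior mean against
# the true value used to generate the data. It assumes `model.trace` exposes the
# three hyperparameters under exactly the names used above.

# In[ ]:


true_values = {"length_scale": length_scale_true,
               "signal_variance": signal_variance_true,
               "noise_variance": noise_variance_true}
for name, true_value in true_values.items():
    # Posterior mean over the ADVI samples for this hyperparameter
    posterior_mean = float(model.trace[name].mean(axis=0))
    rel_error = abs(posterior_mean - true_value) / true_value
    print("{}: posterior mean = {:.3f}, relative error = {:.1%}".format(
        name, posterior_mean, rel_error))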
float(model.trace["noise_variance"].mean(axis=0))], "True value": [length_scale_true, signal_variance_true, noise_variance_true]}) # ## Step 6: Use the model for prediction # In[ ]: y_predict1 = model.predict(X_test) # In[ ]: y_predict1 # In[ ]: model.score(X_test, y_test) # In[ ]: model.save('pickle_jar/sgpr') # #### Use already trained model for prediction # In[ ]: model_new = SparseGaussianProcessRegressor() model_new.load('pickle_jar/sgpr') model_new.score(X_test, y_test) # ## Multiple Features # In[ ]: num_pred = 2 X = np.random.randn(1000, num_pred) noise = 2 * np.random.randn(1000,) Y = X.dot(np.array([4, 5])) + 3 + noise # In[ ]: y = np.squeeze(Y) # In[ ]: model_big = SparseGaussianProcessRegressor() # In[ ]: model_big.fit(X, y, inference_args={"n" : 1000}) # In[ ]: pm.summary(model_big.trace, varnames=["signal_variance", "length_scale", "noise_variance"]) # ## MCMC # ### Perform inference # In[8]: model2 = SparseGaussianProcessRegressor() model2.fit(X_train, y_train, inference_type='nuts') # ### Diagnose convergence # In[18]: pm.traceplot(model2.trace, lines = {"signal_variance": signal_variance_true, "noise_variance": noise_variance_true, "length_scale": length_scale_true}, varnames=["signal_variance", "noise_variance", "length_scale"]); # In[19]: pm.gelman_rubin(model2.trace, varnames=["signal_variance", "noise_variance", "length_scale"]) # In[22]: pm.energyplot(model2.trace); # In[21]: pm.forestplot(model2.trace, varnames=["signal_variance", "noise_variance", "length_scale"]); # ### Criticize the model # In[10]: pm.summary(model2.trace, varnames=["signal_variance", "length_scale", "noise_variance"]) # In[11]: # collect the results into a pandas dataframe to display # "mp" stands for marginal posterior pd.DataFrame({"Parameter": ["length_scale", "signal_variance", "noise_variance"], "Predicted Mean Value": [float(model2.trace["length_scale"].mean(axis=0)), float(model2.trace["signal_variance"].mean(axis=0)), float(model2.trace["noise_variance"].mean(axis=0))], "True value": [length_scale_true, signal_variance_true, noise_variance_true]}) # In[12]: pm.plot_posterior(model2.trace, varnames=["signal_variance", "noise_variance", "length_scale"], figsize = [14, 8]); # ### Use the model for prediction # In[28]: y_predict2 = model2.predict(X_test) # In[29]: y_predict2 # In[30]: model2.score(X_test, y_test) # In[ ]: model2.save('pickle_jar/') model2_new = SparseGaussianProcessRegressor() model2_new.load('pickle_jar/') model2_new.score(X_test, y_test)