In this lecture we will cover the following topics:
import sys
# Install dependencies if the notebook is running in Colab
if 'google.colab' in sys.modules:
!pip install -U -qq reservoir_computing
# Imports
import numpy as np
import matplotlib.pyplot as plt
import time
import plotly.graph_objects as go
import statsmodels.api as sm
from sklearn.datasets import make_circles
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from reservoir_computing.reservoir import Reservoir
from reservoir_computing.utils import make_forecasting_dataset
from reservoir_computing.datasets import PredLoader
[Figure: panels titled "Temperature and Demand" and "Time and Demand".]
def plot_3D(elev=20, azim=30):
    X, y = make_circles(n_samples=200, factor=.01, noise=.15)
    fig = plt.figure(figsize=(10, 5))
    r = np.exp(-(X ** 2).sum(1))
    ax = fig.add_subplot(1, 2, 2, projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='bwr', alpha=0.5)
    ax.view_init(elev=elev, azim=azim)
    xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
    zz = np.full(xx.shape, 0.6)
    ax.plot_surface(xx, yy, zz, color='grey', alpha=0.7)
    ax.set_xlabel("$z_1$")
    ax.set_ylabel("$z_2$")
    ax.set_zlabel("$z_3$")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_zticks(())
    ax2 = fig.add_subplot(1, 2, 1)
    ax2.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='bwr', alpha=0.5)
    ax2.set_xlabel("$x_1$")
    ax2.set_ylabel("$x_2$")
    ax2.set_xticks(())
    ax2.set_yticks(())
    plt.tight_layout()
    plt.show()
plot_3D()
[Figure: diagram of an MLP with an input layer, a first and second hidden layer, and an output layer.]
We load the time series of the energy load and use only the first 3000 samples.
ts_full = PredLoader().get_data('ElecRome')
# Resample the time series to hourly frequency
ts_hourly = np.mean(ts_full.reshape(-1, 6), axis=1)
# Use only the first 3000 time steps
time_series = ts_hourly[0:3000]
time_steps = np.arange(0, len(time_series))
Loaded ElecRome dataset. Data shape: X: (137376, 1)
# Split the time series into training and test sets
train_size = int(0.9*len(time_series))
tr = time_series[:train_size]
te = time_series[train_size:]
To train the model we create input-output pairs: each input is a window of window_size=12 consecutive values and each target is the following forecast_horizon=12 values. The function create_windows below builds these pairs.
def create_windows(data, window_size, forecast_horizon):
    X, y = [], []
    for i in range(len(data) - window_size - forecast_horizon + 1):
        X.append(data[i:(i + window_size)])
        y.append(data[i + window_size:i + window_size + forecast_horizon])
    return np.array(X), np.array(y)
# Define window size and forecast horizon
window_size = 12
forecast_horizon = 12
# Create input-output pairs
X_train, y_train = create_windows(tr, window_size, forecast_horizon)
X_test, y_test = create_windows(te, window_size, forecast_horizon)
# Plot some pairs of input-output
fig, axes = plt.subplots(5,1,figsize=(10,8))
for i in range(5):
    axes[i].scatter(range(i, window_size+i), X_train[i], color='tab:blue', edgecolor='k')  # Input
    axes[i].scatter(range(window_size+i, i+window_size+forecast_horizon-1), y_train[i,:-1], color='lightgray', edgecolor='k')
    axes[i].scatter(i+window_size+forecast_horizon-1, y_train[i,-1], color='tab:orange', edgecolor='k')
    axes[i].set_xlim(-1, 28)
plt.xlabel('Time step')
plt.show()
Next, we want to predict the load 24 hours ahead, so we set forecast_horizon = 24 and rerun the code above to do that.
# Define window size and forecast horizon
window_size = 12
forecast_horizon = 24
# Create input-output pairs
X_train, y_train = create_windows(tr, window_size, forecast_horizon)
X_test, y_test = create_windows(te, window_size, forecast_horizon)
# We are only interested in the last time step of the horizon
y_train = y_train[:, -1]
y_test = y_test[:, -1]
To normalize the data we use StandardScaler() from the sklearn library. scaler.fit_transform(X_train) will subtract from X_train its mean and divide by its standard deviation. scaler.transform(X_test) will subtract from X_test the mean of X_train and divide by the standard deviation of X_train.
# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
We define an MLP with two hidden layers of 16 and 8 units and train it for at most 1000 iterations.
# Define and train the neural network
mlp = MLPRegressor(hidden_layer_sizes=(16,8), activation='relu', solver='adam', max_iter=1000)
mlp.fit(X_train_scaled, y_train)
MLPRegressor(hidden_layer_sizes=(16, 8), max_iter=1000)
# Predict on the test set
y_pred = mlp.predict(X_test_scaled)
# Forecast beyond the dataset using the model
last_window = time_series[-window_size:]
last_window_scaled = scaler.transform(last_window.reshape(1, -1))
next_step_pred = mlp.predict(last_window_scaled)
print(f"The next time step prediction is {next_step_pred[0]:.2f}")
The next time step prediction is 34.50
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
plt.figure(figsize=(14, 4))
plt.plot(time_steps[2500:], time_series[2500:], label="Actual")
plt.plot(time_steps[-len(y_test):], y_pred, label="Predicted")
plt.title("Time Series Forecast")
plt.xlabel("Time Step")
plt.ylabel("Value")
plt.grid()
plt.legend()
plt.show()
Mean Squared Error: 20.55
# Compute the MSE between the original time series and the time series shifted by 24 time steps
mse = mean_squared_error(y_test[24:], y_test[:-24])
print(f"Mean Squared Error: {mse:.2f}")
plt.figure(figsize=(7, 3))
plt.plot(y_test[24:])
plt.plot(y_test[:-24])
plt.grid()
plt.show()
Mean Squared Error: 27.14
Back Propagation Through Time (BPTT)
Generate a sequence of reservoir states $H=\{\mathbf{h}(0),\mathbf{h}(1),\dots,\mathbf{h}(T)\}$ by applying the state update $\mathbf{h}(t) = \sigma\left(\mathbf{W}_{\text{in}}\,\mathbf{x}(t) + \mathbf{W}_{\text{h}}\,\mathbf{h}(t-1)\right)$ for each time step $t=1,2,\dots,T$ of the input sequence.
The nonlinearity $\sigma$ is usually a hyperbolic tangent (tanh).
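As a minimal sketch, the state update can be written directly in NumPy; the initialization and scaling below are illustrative assumptions, not the internals of the reservoir_computing library.
# Minimal NumPy sketch of the update h(t) = tanh(W_in x(t) + W_h h(t-1)) (illustration only)
rng = np.random.default_rng(0)
n_inputs, n_units, T = 1, 100, 50
W_in = rng.uniform(-0.1, 0.1, size=(n_units, n_inputs))   # input weights
W_h = rng.uniform(-1.0, 1.0, size=(n_units, n_units))     # recurrent weights
W_h *= 0.99 / np.max(np.abs(np.linalg.eigvals(W_h)))      # rescale to spectral radius 0.99
x = rng.standard_normal((T, n_inputs))                    # input sequence
h = np.zeros(n_units)                                     # initial state h(0)
states = []
for t in range(T):
    h = np.tanh(W_in @ x[t] + W_h @ h)                    # state update
    states.append(h)
states = np.stack(states)                                  # shape (T, n_units)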
Note that fitting an MLP (or any readout) to the reservoir states is different from the window approach we saw before, as the sketch below illustrates.
Window approach:
Reservoir approach:
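The following sketch contrasts what the regressor is fit on in the two approaches. It uses synthetic data, and the reservoir states are only a random placeholder with assumed shapes (window size W, horizon, and N_h units are illustrative).
# Illustrative contrast between the two approaches (synthetic data, assumed shapes)
import numpy as np
from sklearn.linear_model import Ridge
T, W, horizon, N_h = 500, 12, 24, 100
series = np.sin(np.arange(T) / 10.0)
# Window approach: each training example is a fixed-length window of raw values,
# so the regressor must extract the temporal structure from the window itself.
X_win = np.stack([series[i:i + W] for i in range(T - W - horizon + 1)])  # (n_windows, W)
y_win = series[W + horizon - 1:]                                         # value `horizon` steps after each window
Ridge().fit(X_win, y_win)
# Reservoir approach: each training example is the reservoir state h(t), which already
# summarizes the whole input history; a random matrix stands in for the real states here.
states = np.random.randn(T - horizon, N_h)   # placeholder for h(1), ..., h(T - horizon)
y_res = series[horizon:]                     # target is the value `horizon` steps ahead
Ridge().fit(states, y_res)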
# Initial states
initial_state_0 = np.zeros((1, 100), dtype=float)
initial_state_1 = np.ones((1, 100), dtype=float)
x = np.zeros((1, 100, 1)) # Zero input, it does not contribute to the state
rhos = [.3, 0.99, 1.3] # We will use three different spectral radii
def plot_states_evolution(x, rhos, h0, h1):
    plt.figure(figsize=(5, 3))
    for rho in rhos:
        res = Reservoir(spectral_radius=rho)  # initialize the reservoir with a given spectral radius
        states_0 = res.get_states(x, bidir=False, initial_state=h0)  # states using initial state 0
        states_1 = res.get_states(x, bidir=False, initial_state=h1)  # states using initial state 1
        plt.plot(np.linalg.norm(states_0 - states_1, axis=(0, 2)), label=f'rho={rho}')  # L2 norm of the difference
    plt.legend(loc='upper right', ncol=3)
    plt.xlabel('$t$')
    plt.ylabel(r'$\|\mathbf{h}_1(t) - \mathbf{h}_2(t)\|_2$')
    plt.grid()
    plt.tight_layout()
    plt.title('Transient phase of the Reservoir')
    plt.show()
plot_states_evolution(x, rhos, initial_state_0, initial_state_1)
Next, we build a forecasting model with the reservoir-computing library. We reload the same time series of energy load that we used before.
ts_full = PredLoader().get_data('ElecRome')
# Resample the time series to hourly frequency
ts_hourly = np.mean(ts_full.reshape(-1, 6), axis=1)[:, None]
# Use only the first 3000 time steps
time_series = ts_hourly[0:3000, :]
Loaded ElecRome dataset. Data shape: X: (137376, 1)
To train the model we need the training data Xtr and Ytr. We also need Xte and Yte to test the model, and the validation data Xval and Yval if we need to do hyperparameters tuning. The function make_forecasting_dataset, given a time series X, does the following computations:
1. Splits the time series into train, val, and test sets. The sizes of the validation and test sets are controlled by val_percent and test_percent. With val_percent=0 (default) the validation data will not be created.
2. Creates the input data X and the target data Y by shifting the data horizon time steps, where horizon is how far ahead we want to predict, e.g., Xtr = train[:-horizon,:] and Ytr = train[horizon:,:].
3. Normalizes the data with a scaler from sklearn.preprocessing. A StandardScaler is fit on Xtr and then used to transform Ytr, Xval, and Xte. Yval and Yte are not transformed.
X = np.arange(36)[:, None]
Xtr, Ytr, Xte, Yte, Xval, Yval, scaler = make_forecasting_dataset(X, horizon=5,
test_percent=0.2,
val_percent=0.3)
print("Xtr: ", scaler.inverse_transform(Xtr.T)[0])
print("Ytr: ", scaler.inverse_transform(Ytr.T))
print("Xval: ", scaler.inverse_transform(Xval.T)[0])
print("Yval: ", Yval.T)
print("Xte: ", scaler.inverse_transform(Xte.T)[0])
print("Yte: ", Yte.T)
Xtr: [ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11.]
Ytr: [[ 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16.]]
Xval: [17. 18. 19. 20. 21. 22.]
Yval: [[22 23 24 25 26 27]]
Xte: [28. 29. 30.]
Yte: [[33 34 35]]
Here we keep the default val_percent=0 and the validation set is not created.
# Generate training and test datasets
Xtr, Ytr, Xte, Yte, scaler = make_forecasting_dataset(time_series,
horizon=24, # forecast horizon of 24h ahead
test_percent = 0.1)
print(f"Xtr shape: {Xtr.shape}\nYtr shape: {Ytr.shape}\nXte shape: {Xte.shape}\nYte shape: {Yte.shape}")
Xtr shape: (2676, 2)
Ytr shape: (2676, 1)
Xte shape: (276, 2)
Yte shape: (276, 1)
res = Reservoir(n_internal_units=900,
spectral_radius=0.99,
input_scaling=0.1,
connectivity=0.25)
Next, we compute the sequences of reservoir states states_tr and states_te associated with the training and test data, respectively. The parameter n_drop specifies how many of the initial states we drop to discard the initial transient.
n_drop = 10
states_tr = res.get_states(Xtr[None,:,:], n_drop=n_drop, bidir=False)
states_te = res.get_states(Xte[None,:,:], n_drop=n_drop, bidir=False)
print(f"states_tr shape: {states_tr.shape}\nstates_te shape: {states_te.shape}")
states_tr shape: (1, 2666, 900)
states_te shape: (1, 266, 900)
# Fit the ridge regression model
ridge = Ridge(alpha=1.0)
time_start = time.time()
ridge.fit(states_tr[0], Ytr[n_drop:,:])
print(f"Training time: {time.time()-time_start:.4f}s")
# Compute the predictions
time_start = time.time()
Yhat = ridge.predict(states_te[0])
print(f"Test time: {time.time()-time_start:.4f}s")
Training time: 0.0344s
Test time: 0.0003s
fig = plt.figure(figsize=(14,4))
plt.plot(Yte[n_drop:,:], label="True", linewidth=2)
plt.plot(scaler.inverse_transform(Yhat.reshape(-1, 1)), label="Predicted")
plt.grid()
plt.legend()
plt.title("True vs predicted electricity load")
plt.show()
mse = mean_squared_error(scaler.inverse_transform(Yhat.reshape(-1, 1)), Yte[n_drop:,:])
print(f"Mean Squared Error: {mse:.2f}")
Mean Squared Error: 21.44
# Generate 4 clusters of points in 3 dimensions
T = 300 # number of samples
N_h = 3 # number of features
H, clust_id = make_blobs(n_samples=T, n_features=N_h,
centers=4, cluster_std=1.5, random_state=1)
def plot_data(H, clust_id, interactive=False):
    if interactive:
        fig = go.Figure(data=[
            go.Scatter3d(x=H[:, 0], y=H[:, 1], z=H[:, 2], mode='markers',
                         marker=dict(size=5, opacity=0.5, color=clust_id, colorscale='Viridis')),
        ])
        fig.update_layout(scene=dict(xaxis_title='X Axis',
                                     yaxis_title='Y Axis',
                                     zaxis_title='Z Axis',
                                     zaxis=dict(range=[-20, 10])),
                          margin=dict(l=0, r=0, b=0, t=0),
                          width=400, height=400)
        fig.show()
    else:
        fig = plt.figure(figsize=(4, 4))
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter3D(xs=H[:, 0], ys=H[:, 1], zs=H[:, 2], linewidth=0.2, alpha=0.7, c=clust_id)
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_zticks(())
        plt.show()
# Set interactive = True to get an interactive plot that can be rotated
plot_data(H, clust_id, interactive=False)
Reducing the data dimensionality with PCA involves the following steps: center the data, compute the covariance matrix, take its leading eigenvectors (the principal components), and project the data onto them.
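A minimal NumPy sketch of these steps, applied to the blobs data H generated above (an illustration, not sklearn's exact implementation):
# Manual PCA sketch on H (illustration only)
H_centered = H - H.mean(axis=0)               # 1. center the data
cov = np.cov(H_centered, rowvar=False)        # 2. covariance matrix (3 x 3 here)
eigvals, eigvecs = np.linalg.eigh(cov)        # 3. eigendecomposition (eigenvalues in ascending order)
top2 = eigvecs[:, ::-1][:, :2]                # 4. keep the two leading eigenvectors
H_proj = H_centered @ top2                    # 5. project the data onto the principal plane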
# Apply PCA to reduce to 2 components
pca = PCA(n_components=2)
H_pca = pca.fit_transform(H)
v1, v2 = pca.components_
def plot_pca_plane(H, clust_id, v1, v2, interactive=False):
    mean = np.mean(H, axis=0)
    length = np.linspace(-15, 15, 20)
    another_length = np.linspace(-15, 15, 20)
    xx, yy = np.meshgrid(length, another_length)

    def compute_z(x, y, v1, v2, mean):
        z = np.zeros(x.shape)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                point_in_plane = mean + v1 * x[i, j] + v2 * y[i, j]
                z[i, j] = point_in_plane[2]
        return z

    Z_grid = compute_z(xx, yy, v1, v2, mean)
    if interactive:
        fig = go.Figure(data=[
            go.Surface(z=Z_grid, x=xx, y=yy, colorscale='gray', opacity=0.5, showscale=False),
            go.Scatter3d(x=H[:, 0], y=H[:, 1], z=H[:, 2], mode='markers',
                         marker=dict(size=5, opacity=0.5, color=clust_id, colorscale='Viridis'))])
        fig.update_layout(scene=dict(xaxis_title='X Axis',
                                     yaxis_title='Y Axis',
                                     zaxis_title='Z Axis',
                                     zaxis=dict(range=[-20, 20])),
                          margin=dict(l=0, r=0, b=0, t=0),
                          width=400, height=400)
        fig.show()
    else:
        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(111, projection='3d')
        surf = ax.plot_surface(xx, yy, Z_grid, cmap='gray', edgecolor='none', alpha=0.5)
        ax.scatter3D(xs=H[:, 0], ys=H[:, 1], zs=H[:, 2], linewidth=0.2, alpha=0.7, c=clust_id)
        ax.view_init(elev=20, azim=20)
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_zticks(())
        plt.show()
# Plot the hyperplane spanned by the first two principal components
plot_pca_plane(H, clust_id, v1, v2, interactive=False)
# Plot the 2D projection
plt.figure(figsize=(4, 4))
plt.scatter(H_pca[:, 0], H_pca[:, 1], c=clust_id, cmap='viridis', alpha=0.7)
plt.xticks([], []), plt.yticks([], [])
plt.show()
H_pca_1d = PCA(n_components=1).fit_transform(H)
# Plot the 1D projection
plt.figure(figsize=(4, 1.5))
plt.scatter(H_pca_1d, np.zeros_like(H_pca_1d), c=clust_id, cmap='viridis', alpha=0.7)
plt.xticks([], []), plt.yticks([], [])
plt.show()
Since we used n_internal_units=900, we end up with a sequence of length T of vectors with size 900. We apply PCA to reduce the dimensionality of the reservoir states to 75.
pca = PCA(n_components=75)
states_tr_pca = pca.fit_transform(states_tr[0])
states_te_pca = pca.transform(states_te[0])
print(f"states_tr shape: {states_tr_pca.shape}\nstates_te shape: {states_te_pca.shape}")
states_tr shape: (2666, 75)
states_te shape: (266, 75)
# Fit the ridge regression model
ridge = Ridge(alpha=1.0)
time_start = time.time()
ridge.fit(states_tr_pca, Ytr[n_drop:,:])
print(f"Training time: {time.time()-time_start:.4f}s")
# Compute the predictions
time_start = time.time()
Yhat_pca = ridge.predict(states_te_pca)
print(f"Test time: {time.time()-time_start:.4f}s")
# Compute the mean squared error
mse = mean_squared_error(scaler.inverse_transform(Yhat_pca.reshape(-1, 1)), Yte[n_drop:,:])
print(f"Mean Squared Error: {mse:.2f}")
Training time: 0.0028s
Test time: 0.0001s
Mean Squared Error: 22.55
time_start = time.time()
# Quantile 0.5
max_iter = 100
gbrt_median = HistGradientBoostingRegressor(
loss="quantile", quantile=0.5, max_iter=max_iter)
gbrt_median.fit(states_tr[0], Ytr[n_drop:,0])
median_predictions = gbrt_median.predict(states_te[0])
# Quantile 0.05
gbrt_percentile_5 = HistGradientBoostingRegressor(
loss="quantile", quantile=0.05, max_iter=max_iter)
gbrt_percentile_5.fit(states_tr[0], Ytr[n_drop:,0])
percentile_5_predictions = gbrt_percentile_5.predict(states_te[0])
# Quantile 0.95
gbrt_percentile_95 = HistGradientBoostingRegressor(
loss="quantile", quantile=0.95, max_iter=max_iter)
gbrt_percentile_95.fit(states_tr[0], Ytr[n_drop:,0])
percentile_95_predictions = gbrt_percentile_95.predict(states_te[0])
print(f"Training time: {time.time()-time_start:.2f}s")
Training time: 11.51s
# Plot the results
fig = plt.figure(figsize=(14,4))
plt.plot(Yte[n_drop:,:], 'k--', label="True", linewidth=2)
plt.plot(scaler.inverse_transform(median_predictions[:,None]), label="Median prediction", color="tab:blue")
plt.fill_between(np.arange(len(Yte[n_drop:,:])),
                 scaler.inverse_transform(percentile_5_predictions[:,None]).ravel(),
                 scaler.inverse_transform(percentile_95_predictions[:,None]).ravel(),
                 alpha=0.3, label="90% CI", color="tab:blue")
plt.grid()
plt.legend()
plt.title("Predicted electricity load using Gradient Boosting Regression Trees")
plt.show()
time_start = time.time()
# Quantile 0.5
max_iter = 100
gbrt_median = HistGradientBoostingRegressor(
loss="quantile", quantile=0.5, max_iter=max_iter)
gbrt_median.fit(states_tr_pca, Ytr[n_drop:,0])
median_predictions = gbrt_median.predict(states_te_pca)
# Quantile 0.05
gbrt_percentile_5 = HistGradientBoostingRegressor(
loss="quantile", quantile=0.05, max_iter=max_iter)
gbrt_percentile_5.fit(states_tr_pca, Ytr[n_drop:,0])
percentile_5_predictions = gbrt_percentile_5.predict(states_te_pca)
# Quantile 0.95
gbrt_percentile_95 = HistGradientBoostingRegressor(
loss="quantile", quantile=0.95, max_iter=max_iter)
gbrt_percentile_95.fit(states_tr_pca, Ytr[n_drop:,0])
percentile_95_predictions = gbrt_percentile_95.predict(states_te_pca)
print(f"Training time: {time.time()-time_start:.2f}s")
Training time: 5.80s
In this lecture we saw:
Try changing the forecast_horizon from 24 to 36.
As a simple baseline we computed mse = mean_squared_error(y_test[24:], y_test[:-24]). Is this a good baseline also when the forecast horizon changes? What could be a better baseline?
Unlike model-based approaches such as ARIMA, the MLP and the ESN do not strictly require the data to be stationary. However, by removing trend and seasonality the MLP and the ESN can focus only on predicting the residuals.
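One way to do this, sketched below under the assumption of an additive decomposition with a daily period of 24 hours, is a classical seasonal decomposition with statsmodels (already imported as sm).
# Sketch: remove trend and seasonality, then fit the forecaster on the residuals
decomp = sm.tsa.seasonal_decompose(time_series.ravel(), model='additive', period=24)
residuals = decomp.resid      # train the MLP/ESN on this component
trend = decomp.trend          # add trend and seasonality back to the residual forecast
seasonal = decomp.seasonal
# Note: trend and resid contain NaNs at the boundaries (centered moving average); drop them before training.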
For this exercise, we will consider a new dataset consisting of hourly water temperatures from the Gulf of Mexico near Key West, Florida. The hourly temperatures are provided from October 3, 2016 to October 3, 2017. The dataset is a data frame with 6572 observations on the following 3 variables.
DateTime: Date and time of reading (format mm/dd/yyyy h:00).
WaterTemp: Water temperature (in degrees Fahrenheit).
t: Time index (1 to 673).
temps = sm.datasets.get_rdataset("KeyWestWater", "Stat2Data").data
print("Raw data:\n------------------\n", temps.head())
plt.figure(figsize=(14,4))
plt.plot(temps["WaterTemp"])
plt.grid()
plt.show()
Raw data:
------------------
         DateTime  WaterTemp  t
0  10/3/2016 0:00       86.2  1
1  10/3/2016 1:00       86.2  2
2  10/3/2016 2:00       86.2  3
3  10/3/2016 3:00       86.2  4
4  10/3/2016 4:00       86.0  5
For the MLP, try different values of hidden_layer_sizes (e.g., try more layers [16, 16, 8], layers with more units [32, 16], etc.), learning_rate_init, and activation (try at least tanh and relu). For the ESN, try different values of spectral_radius, input_scaling, n_internal_units, and connectivity. Report the configuration of MLP and ESN that achieves the best performance on the test data.