!wget --no-check-certificate https://home.isr.uc.pt/~rui/publications/debutanizer_fortuna_dataset.zip
!unzip debutanizer_fortuna_dataset.zip
--2020-08-30 14:26:30-- https://home.isr.uc.pt/~rui/publications/debutanizer_fortuna_dataset.zip Resolving home.isr.uc.pt (home.isr.uc.pt)... 193.136.230.49 Connecting to home.isr.uc.pt (home.isr.uc.pt)|193.136.230.49|:443... connected. WARNING: cannot verify home.isr.uc.pt's certificate, issued by ‘CN=TERENA SSL CA 3,O=TERENA,L=Amsterdam,ST=Noord-Holland,C=NL’: Unable to locally verify the issuer's authority. HTTP request sent, awaiting response... 200 OK Length: 140406 (137K) [application/zip] Saving to: ‘debutanizer_fortuna_dataset.zip.1’ debutanizer_fortuna 100%[===================>] 137.12K 707KB/s in 0.2s 2020-08-30 14:26:31 (707 KB/s) - ‘debutanizer_fortuna_dataset.zip.1’ saved [140406/140406] Archive: debutanizer_fortuna_dataset.zip replace debutanizer.mat? [y]es, [n]o, [A]ll, [N]one, [r]ename: y inflating: debutanizer.mat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import TimeSeriesSplit
from scipy.spatial.distance import cdist
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from warnings import filterwarnings
filterwarnings('ignore')
from scipy.io import loadmat
# Load the debutanizer-column dataset (MATLAB .mat file downloaded/unzipped above).
debutanizer = loadmat('debutanizer.mat')
0.22.2.post1
# Variable names stored in the .mat file; MATLAB bookkeeping keys start with '_'.
INOUT = [name for name in debutanizer if not name.startswith('_')]
INPUT = INOUT[:-1]   # u1..u7: process input variables
OUTPUT = INOUT[-1]   # y: the quality variable to be estimated
# One column per variable, flattening each stored array to 1-D.
df = pd.DataFrame({name: debutanizer[name].reshape(-1,) for name in INOUT})
df.head(3)
u1 | u2 | u3 | u4 | u5 | u6 | u7 | y | |
---|---|---|---|---|---|---|---|---|
0 | 0.268900 | 0.650894 | 0.832742 | 0.58342 | 0.784759 | 0.843079 | 0.822079 | 0.180295 |
1 | 0.268483 | 0.650140 | 0.852153 | 0.57751 | 0.776487 | 0.838605 | 0.822079 | 0.177124 |
2 | 0.267967 | 0.659657 | 0.823618 | 0.57160 | 0.764546 | 0.807879 | 0.786246 | 0.173618 |
# Standardize the inputs, project them onto the first three principal
# components, and scatter the scores to visualize the operating region.
decomposer = make_pipeline(StandardScaler(), PCA(3))
X_all = df[INPUT]
T_all = decomposer.fit_transform(X_all)
y_all = df[OUTPUT]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot(T_all[:, 0], T_all[:, 1], T_all[:, 2], '.')
plt.show()
The just-in-time (JIT) model is implemented with a scikit-learn-like interface.
class _Sampler(BaseEstimator, TransformerMixin):
    """Base class for sample-selecting transformers.

    NOTE(review): ``transform`` reads ``self.idx``, which is set neither here
    nor in the visible subclass (``JustInTime`` overrides ``transform``) —
    presumably other subclasses are expected to define it. Confirm before
    using this class directly.
    """
    def __init__(self):
        pass

    def fit(self, x, y=None):
        # scikit-learn contract: fit must return the estimator itself so
        # calls can be chained (the original returned None).
        return self

    def transform(self, X, y):
        # Select the rows indexed by self.idx from both X and y.
        return X[self.idx, :], y[self.idx, :]
class JustInTime(_Sampler):
    """Just-in-time (lazy, locally weighted) regression model.

    For each query point, the ``n_samples`` nearest training samples (under
    ``func``, Euclidean distance by default) are selected and ``estimator_``
    is re-fit on that local subset before predicting.

    Parameters
    ----------
    estimator_ : object
        Any regressor with ``fit(X, y)`` / ``predict(X)`` methods.
    n_samples : int, default 50
        Size of the local neighborhood used to fit the estimator.
    func : callable(u, v) -> float
        Distance between two 1-D sample vectors.
    """
    def __init__(self, estimator_, n_samples=50, func=lambda u, v: np.sqrt(((u-v)**2).sum())):
        self.estimator_ = estimator_
        self.func = func
        self.n_samples = n_samples

    def fit(self, X, y=None):
        """Memorize the training pool; no model is fit until predict time."""
        if y is None:
            # Original code crashed with an opaque AttributeError here.
            raise ValueError("JustInTime.fit requires a target array y")
        self.x_pool = X.copy()
        self.y_pool = y.copy()
        self.n_dims = self.x_pool.shape[1]
        if len(self.y_pool.shape) == 1:
            # Keep y 2-D so downstream indexing is uniform.
            self.y_pool = self.y_pool.reshape(-1, 1)
        self.n_ydims = self.y_pool.shape[1]
        return self

    def transform(self, X):
        """Pick, for each row of X, its nearest pool samples.

        Returns
        -------
        X_sampled : ndarray, shape (n_queries, k, n_dims)
        y_sampled : ndarray, shape (n_queries, k, n_ydims)
            where k = min(n_samples, pool size).
        """
        n_queries = X.shape[0]
        # Clip to the pool size: with a small pool the original allocation
        # (n_samples rows) no longer matched the selected rows and raised
        # a broadcast error.
        k = min(self.n_samples, self.x_pool.shape[0])
        X_sampled = np.zeros((n_queries, k, self.n_dims))
        y_sampled = np.zeros((n_queries, k, self.n_ydims))
        for index, x_q in enumerate(X):
            # func already has the (u, v) signature cdist expects; the
            # original wrapped it in a redundant lambda.
            dist = cdist(x_q.reshape(-1, self.n_dims), self.x_pool, self.func)
            ix = np.argsort(dist)[:, :k]
            X_sampled[index, :, :] = self.x_pool[ix[0], :]
            y_sampled[index, :, :] = self.y_pool[ix[0], :]
        return X_sampled, y_sampled

    def predict(self, X):
        """Fit the estimator on each query's local neighborhood and predict."""
        Xsub, ysub = self.transform(X)
        y_est = np.zeros((ysub.shape[0], ysub.shape[-1]))
        for jx in range(X.shape[0]):
            local = self.estimator_.fit(Xsub[jx, :, :], ysub[jx, :, :])
            y_est[jx, :] = local.predict(X[jx, :].reshape(1, -1))
        return y_est
%%time
# Evaluate the JIT model with two local regressors (cross-validated PLS and
# a random forest) under forward-chaining time-series cross-validation.
pls = GridSearchCV(PLSRegression(), {'n_components': np.arange(1, 8)})
tscv = TimeSeriesSplit(n_splits=5)
X = df[INPUT]
y = df[OUTPUT]
for mtype, regressor in zip(['pls', 'rf'],
                            [pls, RandomForestRegressor(n_estimators=300)]):
    model = JustInTime(regressor)
    scores = np.zeros((tscv.n_splits,))
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        print("TRAIN:", train_index.shape[0],
              "TEST:", test_index.shape[0])
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train.values, y_train.values)
        y_cv = model.predict(X_test.values)
        scores[fold] = metrics.r2_score(y_test.values, y_cv)
    print(mtype, 'R_{CV}^2', '%.4f' % scores.mean())
TRAIN: 399 TEST: 399 TRAIN: 798 TEST: 399 TRAIN: 1197 TEST: 399 TRAIN: 1596 TEST: 399 TRAIN: 1995 TEST: 399 pls R_{CV}^2 -2.2894 TRAIN: 399 TEST: 399 TRAIN: 798 TEST: 399 TRAIN: 1197 TEST: 399 TRAIN: 1596 TEST: 399 TRAIN: 1995 TEST: 399 rf R_{CV}^2 -1.3338 CPU times: user 14min 47s, sys: 3.37 s, total: 14min 50s Wall time: 14min 51s