This Jupyter notebook and the other files in this directory contain the code for the Medium blog post titled "What is Giotto and Why James Bond Should Use It to Extract Secret Messages". The idea is to use topological data analysis (TDA) to detect the change from a chaotic to a non-chaotic regime in time series with different levels of noise. For further details, please refer to the blog post.
As feature creation takes a long time (around 45 minutes on a MacBook Pro with 16 cores), a precomputed dataset is loaded instead. In the 'Plot Features' section the features are computed directly for a smaller time series for presentation purposes. If you want to create the features for all the time series yourself, run the bash script 'create_features.sh'.
# Imports from scikit-learn and XGBoost
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
# For bootstrap confidence intervals
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
# Others
import pandas as pd
import numpy as np
from datetime import datetime
from itertools import product
import os
from pandarallel import pandarallel
from joblib import Parallel, delayed
from functools import reduce
from scipy.fftpack import rfft
import openml
from openml.datasets.functions import get_dataset
# Giotto
import giotto as gt
import giotto.diagrams as diag
import giotto.homology as hl
# Plotting functions
from plotting import plot_diagram, plot_landscapes
from plotting import plot_betti_surfaces, plot_betti_curves
from plotting import plot_point_cloud
import matplotlib.pyplot as plt
import plotly.express as px
# Our own feature creation and plotting functions
from chaos_detection import *
# Make balanced accuracy scorer
bal_acc_score = make_scorer(balanced_accuracy_score)
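As a quick reminder (and sanity check of the scorer): balanced accuracy is the unweighted mean of per-class recall, so it is robust to the class imbalance between the chaotic and non-chaotic regimes.
# Balanced accuracy = mean of per-class recall; a classifier that always
# predicts the majority class scores only 0.5 on a binary problem
balanced_accuracy_score([0, 0, 0, 1], [0, 0, 0, 0])  # returns 0.5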
Here we create the features for a small time series in order to illustrate the TDA features.
df_res, X_betti_surfaces = create_all_features(42203, noise_level=0.0, return_betti_surface=True)
df_res.head()
New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 16 workers
Optimal embedding time delay based on mutual information: 5
Optimal embedding dimension based on false nearest neighbors: 14
  | time | y | x | x_dot | max_10 | max_20 | max_50 | mean_10 | mean_20 | mean_50 | ... | fourier_w_1 | fourier_w_2 | num_holes | avg_lifetime | betti_0 | betti_1 | betti_2 | betti_argmax_1 | betti_argmax_2 | amplitude |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
133 | 13.30133 | 0 | 0.919056 | 1.520114 | 0.919056 | 0.919056 | 0.919056 | 0.895947 | 0.871955 | 0.801546 | ... | -0.086702 | 1.198089 | 100 | 0.011424 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.080962 |
134 | 13.40134 | 0 | 0.924506 | 1.521954 | 0.924506 | 0.924506 | 0.924506 | 0.900792 | 0.877009 | 0.806352 | ... | -0.085955 | 1.199410 | 100 | 0.011424 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.080962 |
135 | 13.50135 | 0 | 0.922732 | 1.517234 | 0.924506 | 0.924506 | 0.924506 | 0.904971 | 0.881477 | 0.810933 | ... | -0.091852 | 1.199812 | 100 | 0.011424 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.080962 |
136 | 13.60136 | 0 | 0.933383 | 1.522403 | 0.933383 | 0.933383 | 0.933383 | 0.910297 | 0.886202 | 0.815624 | ... | -0.086803 | 1.201020 | 100 | 0.011424 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.080962 |
137 | 13.70137 | 0 | 0.938219 | 1.523874 | 0.938219 | 0.938219 | 0.938219 | 0.915080 | 0.891244 | 0.820432 | ... | -0.086749 | 1.202251 | 100 | 0.011424 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.080962 |
5 rows × 23 columns
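The log above shows the Takens embedding parameters selected automatically (time delay 5 via mutual information, embedding dimension 14 via false nearest neighbors). To illustrate what a time-delay embedding does, here is a minimal NumPy sketch; it is not the implementation used in chaos_detection.
def takens_embedding(x, time_delay, dimension):
    # Map a 1-D series to the point cloud of delay vectors
    # (x[i], x[i + tau], ..., x[i + (dimension - 1) * tau])
    n_points = len(x) - (dimension - 1) * time_delay
    return np.array([x[i:i + dimension * time_delay:time_delay]
                     for i in range(n_points)])

# Example: embed the observable with the parameters found above
# point_cloud = takens_embedding(df_res['x'].values, time_delay=5, dimension=14)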
# Betti surface
plot_betti_surfaces(X_betti_surfaces)
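Each slice of the Betti surface is a Betti curve: for every filtration value it counts the number of persistence intervals that are alive. A minimal NumPy sketch of that computation (independent of giotto's internals):
def betti_curve(intervals, n_values=100):
    # intervals: array of shape (n, 2) with (birth, death) pairs
    # for a single homology dimension
    births, deaths = intervals[:, 0], intervals[:, 1]
    grid = np.linspace(births.min(), deaths.max(), n_values)
    # Betti number at t = number of intervals with birth <= t < death
    return grid, np.array([np.sum((births <= t) & (t < deaths)) for t in grid])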
fig, ax = plt.subplots(8, figsize=(15, 15))
fig.tight_layout()
window_stride = 50
# Each TDA feature is plotted next to the 0/1 regime label ('y'), rescaled
# so that the reference curve is visible on the same axis as the feature
features_to_plot = [('num_holes', 'Number of relevant holes', 50),
                    ('avg_lifetime', 'Average lifetime', 1),
                    ('betti_0', 'Betti feature dim 0', 50),
                    ('betti_1', 'Betti feature dim 1', 50),
                    ('betti_2', 'Betti feature dim 2', 50),
                    ('betti_argmax_1', 'Betti feature argmax dim 1', 50),
                    ('betti_argmax_2', 'Betti feature argmax dim 2', 50),
                    ('amplitude', 'Amplitude feature', 5)]
for axis, (column, title, scale) in zip(ax, features_to_plot):
    axis.plot(df_res[column], label='Feature')
    axis.plot(df_res['y'] * scale, label='Reference')  # aligned on the same index as the feature
    axis.title.set_text(title)
    axis.legend(loc='lower right')
# Note: the first step might take a while the first time you run it
# because the data has to be downloaded from OpenML
dataset = get_dataset(42202)
dataset = dataset.get_data()[0]
train = dataset[dataset['test_set']==0]
test = dataset[dataset['test_set']==1]
test.head()
  | index | time | y | x | coord_1 | max_10 | max_20 | max_50 | mean_10 | mean_20 | ... | num_holes | avg_lifetime | betti_0 | betti_1 | betti_2 | betti_argmax_1 | betti_argmax_2 | amplitude | noise_level | test_set |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1246600 | 150 | 15.00150 | 0 | 0.994617 | 1.524023 | 0.995729 | 0.995729 | 0.995729 | 0.976636 | 0.953035 | ... | 100 | 0.012137 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.085996 | 0.0 | 1 |
1246601 | 151 | 15.10151 | 0 | 1.005455 | 1.529092 | 1.005455 | 1.005455 | 1.005455 | 0.982045 | 0.957831 | ... | 100 | 0.012137 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.085996 | 0.0 | 1 |
1246602 | 152 | 15.20152 | 0 | 1.010750 | 1.528463 | 1.010750 | 1.010750 | 1.010750 | 0.986904 | 0.962936 | ... | 100 | 0.012137 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.085996 | 0.0 | 1 |
1246603 | 153 | 15.30153 | 0 | 1.008770 | 1.523554 | 1.010750 | 1.010750 | 1.010750 | 0.991085 | 0.967422 | ... | 100 | 0.012137 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.085996 | 0.0 | 1 |
1246604 | 154 | 15.40154 | 0 | 1.019839 | 1.529516 | 1.019839 | 1.019839 | 1.019839 | 0.996478 | 0.972188 | ... | 100 | 0.012137 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0.085996 | 0.0 | 1 |
5 rows × 26 columns
# Noise levels 0.0, 0.1, ..., 0.8; for each level keep a [train, test] pair
noise_level = [round(0.1*x, 2) for x in range(9)]
time_series = [[train[train['noise_level']==n].drop(['test_set', 'noise_level', 'index'], axis='columns'),
                test[test['noise_level']==n].drop(['test_set', 'noise_level', 'index'], axis='columns')]
               for n in noise_level]
# Different feature choices
# All features (TDA + non-TDA + intrinsic)
col_to_keep_all = list(time_series[0][0].columns.difference(['time', 'y']))
# TDA and intrinsic features only (rolling statistics and Fourier features dropped)
col_to_keep_tda = list(time_series[0][0].columns.difference(['time', 'y', 'fourier_w_1', 'fourier_w_2',
                                                             'max_10', 'max_20', 'max_50', 'mean_10', 'mean_20',
                                                             'mean_50', 'min_10', 'min_20', 'min_50']))
# Non-TDA and intrinsic features only (TDA features dropped)
col_to_keep_nor = list(time_series[0][0].columns.difference(['time', 'y', 'num_holes', 'avg_lifetime', 'betti_0',
                                                             'betti_1', 'betti_2', 'betti_argmax_1', 'betti_argmax_2',
                                                             'amplitude']))
# Intrinsic coordinates of the dynamical system only
col_to_keep_no_features = list(time_series[0][0].columns.difference(['time', 'y', 'max_10', 'max_20', 'max_50', 'mean_10',
                                                                     'mean_20', 'mean_50', 'min_10', 'min_20', 'min_50',
                                                                     'fourier_w_1', 'fourier_w_2', 'num_holes',
                                                                     'avg_lifetime', 'betti_0', 'betti_1', 'betti_2',
                                                                     'betti_argmax_1', 'betti_argmax_2', 'amplitude']))
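For reference, the non-TDA baseline columns are plain rolling statistics plus Fourier coefficients. A sketch of how the rolling part can be reproduced with pandas (not necessarily how chaos_detection builds them):
# Rolling max/mean/min over windows of 10, 20 and 50 samples
x = time_series[0][0]['x']
baseline = pd.DataFrame({f'{stat}_{w}': getattr(x.rolling(w), stat)()
                         for stat in ('max', 'mean', 'min')
                         for w in (10, 20, 50)})
baseline.tail()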
We can now use these features to predict when the system changes from the chaotic regime to the non-chaotic regime and vice versa.
A particular focus is the comparison of TDA and non-TDA features. The plot below shows that TDA features help improve the score, especially in the high-noise region: the blue curve (the model with TDA features) stays on top and outperforms the model without them.
def fit_and_predict(f):
    """Train a gradient boosting classifier on each feature set and
    return the balanced accuracies on the test set."""
    result_temp = []
    for features in [col_to_keep_all, col_to_keep_tda, col_to_keep_nor, col_to_keep_no_features]:
        x_train = f[0][features].values
        y_train = f[0][['y']].values.flatten()
        x_test = f[1][features].values
        y_test = f[1][['y']].values.flatten()
        clf = GradientBoostingClassifier()
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        pred_df = create_pred_df(pred, y_test)
        result_temp.append(balanced_accuracy_score(pred_df['ref'], pred_df['pred']))
    return result_temp
# Train and evaluate the datasets in parallel
results = Parallel(n_jobs=-1)(delayed(fit_and_predict)(f) for f in time_series)
results = np.array(results)
Feature overview:
- TDA features: num_holes, avg_lifetime, betti_0, betti_1, betti_2, betti_argmax_1, betti_argmax_2, amplitude
- Non-TDA features: rolling max, mean and min over windows of 10, 20 and 50 points, and the Fourier coefficients fourier_w_1, fourier_w_2
- Intrinsic features: $x$ and $\dot{x}$ from the dynamical system
- All features: TDA, non-TDA and intrinsic features combined
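The results array has one row per noise level and one column per feature set; labelling it makes the comparison easier to read (the column names below are our own, matching the loop order in fit_and_predict):
results_df = pd.DataFrame(results, index=noise_level,
                          columns=['all', 'tda', 'non_tda', 'intrinsic'])
results_df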
# Plot the results for different levels of noise and different feature sets
np.seterr(divide='ignore')  # the signal-to-noise ratio of the noise-free signal is inf, so no warning is needed here
SNR = convert_to_SNR(time_series, np.array(noise_level))
np.seterr(divide='warn')  # reactivate the warning
figure = plot_results(results, noise_level, x_tick_labels=SNR, return_figure=True)
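convert_to_SNR is defined in chaos_detection. As a rough mental model (our assumption about the definition, not the actual code), the signal-to-noise ratio compares signal power to noise power, which is why the noise-free series maps to infinity and triggers the divide warning suppressed above:
def snr_db_sketch(signal, noise_std):
    # Hypothetical SNR in decibels: signal variance over noise variance;
    # noise_std == 0 yields np.inf (the suppressed divide-by-zero warning)
    return 10 * np.log10(np.var(signal) / noise_std ** 2)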
To make the predictions more robust, a rolling mean is applied to the raw predictions to smooth them; the smoothed values are then converted to 0 or 1 using a threshold.
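create_pred_df from chaos_detection takes care of this below; as a minimal sketch of the idea, with hypothetical window and threshold values:
def smooth_and_threshold(pred, window=50, threshold=0.5):
    # Rolling mean over the raw 0/1 predictions, then binarize again
    rolling_mean = pd.Series(pred).rolling(window, min_periods=1).mean()
    return (rolling_mean > threshold).astype(int)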
f = time_series[-3]  # noise level 0.6
x_train = f[0][col_to_keep_all].values
y_train = f[0][['y']].values.flatten()
x_test = f[1][col_to_keep_all].values
y_test = f[1][['y']].values.flatten()
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
pred = gb.predict(x_test)
plot_predictions(pred, y_test)
pred_df = create_pred_df(pred, y_test)
print('Balanced accuracy: ', round(balanced_accuracy_score(pred_df['ref'], pred_df['indicator']), 4) * 100, '%')
Balanced accuracy: 96.08 %
In this section we look at the importance of the different features in an XGBClassifier trained on data with and without noise. It turns out that the TDA features tend to become increasingly important as the data gets noisier.
(Note: the feature importance reported here is XGBoost's 'weight' score, which counts how many times a feature is used to split a node across all trees.)
# Feature importance on the noise-free dataset
f = time_series[0]
x_train = f[0][col_to_keep_all]
y_train = f[0][['y']].values.flatten()
x_test = f[1][col_to_keep_all]
y_test = f[1][['y']].values.flatten()
xgb_classifier = XGBClassifier()
xgb_classifier.fit(x_train, y_train)
pred = xgb_classifier.predict(x_test)
pred_df = create_pred_df(pred, y_test)
importance = xgb_classifier.get_booster().get_score(importance_type='weight')
df_without_noise = pd.DataFrame(importance.items()).sort_values(by=1, ascending=False)
# Feature importance on the dataset with the highest noise level
f = time_series[-1]
x_train = f[0][col_to_keep_all]
y_train = f[0][['y']].values.flatten()
x_test = f[1][col_to_keep_all]
y_test = f[1][['y']].values.flatten()
xgb_classifier = XGBClassifier()
xgb_classifier.fit(x_train, y_train)
pred = xgb_classifier.predict(x_test)
pred_df = create_pred_df(pred, y_test)
importance = xgb_classifier.get_booster().get_score(importance_type='weight')
df_with_noise = pd.DataFrame(importance.items()).sort_values(by=1, ascending=False)
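get_score returns a dict mapping feature names to split counts, so the DataFrames above have raw columns 0 and 1. For a quick, readable view (non-mutating, so the plotting code below still works on the original columns):
df_with_noise.rename(columns={0: 'feature', 1: 'f_score'}).head(8)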
# Top features for the model trained on the noisiest data
fig, ax = plt.subplots()
num = 8
y = list(range(num))
labels = df_with_noise.head(num)[0]
ax.barh(y=y,
width=df_with_noise.head(num)[1][::-1],
tick_label=labels[::-1])
ax.set_xlabel('F-score')
fig.tight_layout()
# Top features for the model trained on the noise-free data
fig, ax = plt.subplots()
num = 8
y = list(range(num))
labels = df_without_noise.head(num)[0]
ax.barh(y=y,
width=df_without_noise.head(num)[1][::-1],
tick_label=labels[::-1])
ax.set_xlabel('F-score')
fig.tight_layout()