#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go


# # Data preparation

# ## Reading data

# In[2]:


df_raw = pd.read_csv("train_2.csv")


# In[3]:


df_raw.head()


# In[4]:


df_raw.shape


# ## Cleaning data

# We will use the data of a single page only.

# In[5]:


df_no_na = df_raw.dropna()
df_no_na.head()


# In[6]:


# Pick a random page with consistently non-trivial traffic
means = df_no_na.drop("Page", axis=1).mean(axis=1)
mins = df_no_na.drop("Page", axis=1).min(axis=1)
one_page = df_no_na[(means > 100) & (mins > 10)]["Page"].sample(1).values[0]
df_one_page = df_no_na[df_no_na["Page"] == one_page].drop("Page", axis=1)
df_one_page.head()


# ## Windows

# In[7]:


# Build overlapping sliding windows of 15 consecutive days
window_size = 15
cols = df_one_page.columns
window_data = []
for start_index in range(len(cols) - window_size + 1):
    window_data.append(df_one_page[cols[start_index:start_index + window_size]].values[0])


# In[8]:


len(window_data)


# ## Split into train and test sets

# In[9]:


test_start_index = int(0.85 * len(window_data))
train_data = np.array(window_data[:test_start_index])
test_data = np.array(window_data[test_start_index:])


# In[10]:


# The last value of each window is the target; the rest is the input sequence
x_train = train_data[:, :-1].reshape(len(train_data), window_size - 1, 1)
y_train = train_data[:, -1]
x_test = test_data[:, :-1].reshape(len(test_data), window_size - 1, 1)
y_test = test_data[:, -1]


# In[11]:


x_train.shape, y_train.shape, x_test.shape, y_test.shape


# # Apply LSTM

# ## LSTM without global scaling

# In[12]:


def get_model(x_train):
    input_layer = tf.keras.layers.Input((x_train.shape[1], x_train.shape[2]))
    lstm_layer = tf.keras.layers.LSTM(128, return_sequences=True)(input_layer)
    second_lstm_layer = tf.keras.layers.LSTM(32)(lstm_layer)
    dense_layer = tf.keras.layers.Dense(8)(second_lstm_layer)
    output_layer = tf.keras.layers.Dense(1)(dense_layer)
    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss="mae")
    return model


# In[13]:


model = get_model(x_train)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=100, restore_best_weights=True)
model_history = model.fit(x_train,
                          y_train,
                          validation_split=0.1,
                          batch_size=128,
                          epochs=1000,
                          callbacks=[early_stopping],
                          verbose=0)


# In[14]:


def visualize_loss(history, title):
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = list(range(len(loss)))
    fig = go.Figure(data=[go.Scatter(x=epochs, y=loss, name="Training loss"),
                          go.Scatter(x=epochs, y=val_loss, name="Validation loss")])
    fig.update_layout(title=title, xaxis_title="Epoch", yaxis_title="Loss")
    fig.show()


# In[15]:


visualize_loss(model_history, "Training of an LSTM model without scaling")


# The training seems quite successful, but we will see at the end of the notebook that this is not really the case...
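# As a quick sanity check (a sketch added here, not part of the original pipeline),
# we can inspect the raw input range: click counts this large sit far outside the
# zone where an LSTM's tanh/sigmoid gates are sensitive, which is what motivates
# the scaled variant below.

# In[ ]:


# Inspect the unscaled input range fed to the first model
print("x_train min/max:", x_train.min(), x_train.max())
print("x_train mean/std:", x_train.mean(), x_train.std())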
# In[16]:


preds_test = model.predict(x_test)
mean_absolute_error(y_test, preds_test[:, 0])


# ## LSTM with scaling

# ### Scale features

# In[17]:


# Global scaling statistics, computed on the training set only
train_mean = np.mean(x_train)
train_std = np.std(x_train)
train_mean, train_std


# In[18]:


x_train_scaled = (x_train - train_mean) / train_std
y_train_scaled = (y_train - train_mean) / train_std
x_test_scaled = (x_test - train_mean) / train_std
y_test_scaled = (y_test - train_mean) / train_std


# ### Train the model

# In[19]:


model_scaled = get_model(x_train_scaled)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=100, restore_best_weights=True)
scaled_model_history = model_scaled.fit(x_train_scaled,
                                        y_train_scaled,
                                        validation_split=0.1,
                                        batch_size=128,
                                        epochs=1000,
                                        callbacks=[early_stopping],
                                        verbose=0)


# In[20]:


visualize_loss(scaled_model_history, "Training of an LSTM model with scaling")


# In[21]:


# Predictions are made in scaled space, then mapped back to raw click counts
preds_scaled_test = model_scaled.predict(x_test_scaled)
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * train_std) + train_mean
mean_absolute_error(y_test, unscaled_preds_scaled_test)


# # Visualize results

# ## Visualize global predictions vs reality

# In[22]:


preds_without_scaling_vs_real = pd.DataFrame({"predictions": preds_test[:, 0], "reality": y_test})
preds_with_scaling_vs_real = pd.DataFrame({"predictions": unscaled_preds_scaled_test, "reality": y_test})
preds_vs_real = pd.concat([preds_without_scaling_vs_real.assign(model="model_without_scaling"),
                           preds_with_scaling_vs_real.assign(model="model_with_scaling")])
preds_vs_real.head()


# In[23]:


px.scatter(preds_vs_real,
           x="predictions",
           y="reality",
           color="model",
           trendline="ols",
           title="Comparison of the predictions of the two models")


# The model without scaling fell into a local minimum, producing near-constant predictions.

# ## Visualize predictions vs reality as time series

# In[24]:


days = df_one_page.columns[-len(preds_test):]
fig = go.Figure(data=[go.Scatter(x=days, y=y_test, name="Number of clicks"),
                      go.Scatter(x=days, y=preds_test[:, 0], name="Predictions with model not using scaling"),
                      go.Scatter(x=days, y=unscaled_preds_scaled_test, name="Predictions with model using global scaling")])
fig.update_layout(title="Predictions vs reality on test set",
                  xaxis_title="Day",
                  yaxis_title="Clicks")
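# To back the local-minimum observation with numbers (a hedged check, assuming the
# cells above have been run), we can compare the spread of each model's predictions:
# a collapsed model emits an almost constant value regardless of its input.

# In[ ]:


# A model stuck predicting a constant has near-zero spread in its outputs
print("Std of predictions without scaling:", preds_test[:, 0].std())
print("Std of predictions with scaling:", unscaled_preds_scaled_test.std())
print("Std of the actual test values:", y_test.std())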
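# As a closing aside, the manual z-scoring above could also be expressed with
# scikit-learn's StandardScaler. This is only a sketch of an equivalent formulation,
# assuming the same window arrays as above; the notebook's own code uses plain numpy.

# In[ ]:


from sklearn.preprocessing import StandardScaler

# Fit a single global scaler on the flattened training inputs so that train and
# test share the same statistics, exactly like train_mean / train_std above
scaler = StandardScaler()
scaler.fit(x_train.reshape(-1, 1))
x_train_scaled_alt = scaler.transform(x_train.reshape(-1, 1)).reshape(x_train.shape)
x_test_scaled_alt = scaler.transform(x_test.reshape(-1, 1)).reshape(x_test.shape)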