#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go


# Seed used for the train/test split so that the split is reproducible.
RANDOM_SEED = 42


# # Data preparation
# ## Reading data

# In[2]:


df_raw = pd.read_csv("train_1.csv")


# In[3]:


df_raw.head()


# In[4]:


df_raw.shape


# ## Cleaning data
# We will only keep instantly usable data, as we do not need much for the experiment

# In[5]:


train_cleaned = df_raw.dropna().drop("Page", axis=1)
train_cleaned.head()


# In[6]:


train_cleaned.shape


# ## Split between train and test set

# In[7]:


df_train, df_test = train_test_split(
    train_cleaned, test_size=0.2, random_state=RANDOM_SEED
)


# We will only try to predict what happened on the 29th of July 2015,
# given data about the previous four weeks

# In[8]:


features = list(
    pd.date_range(start="2015-07-01", end="2015-07-28").strftime("%Y-%m-%d")
)
target = "2015-07-29"


# In[9]:


x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]


# In[10]:


# Keras LSTM layers expect (samples, timesteps, channels); each day is one
# timestep with a single channel.
x_train_reshaped = x_train.values.reshape((len(x_train), len(features), 1))
x_test_reshaped = x_test.values.reshape((len(x_test), len(features), 1))


# # Compare LSTM models
# ## LSTM without scaling

# In[11]:


def get_model(x_train_reshaped, output_activation="relu"):
    """Build and compile a small LSTM regressor.

    Args:
        x_train_reshaped: 3-D array of training inputs; only ``shape[1]``
            and ``shape[2]`` are read, to size the input layer.
        output_activation: Activation of the final ``Dense(1)`` layer.
            The default ``"relu"`` matches raw click counts (never
            negative). Use ``"linear"`` when the target has been
            standardized, because a ReLU output cannot produce the
            negative values a standardized target takes.

    Returns:
        A ``tf.keras.Model`` compiled with RMSprop and MAE loss.
    """
    input_layer = tf.keras.layers.Input(
        shape=(x_train_reshaped.shape[1], x_train_reshaped.shape[2])
    )
    lstm_layer = tf.keras.layers.LSTM(32)(input_layer)
    dense_layer = tf.keras.layers.Dense(1, activation=output_activation)(lstm_layer)
    model = tf.keras.Model(inputs=input_layer, outputs=dense_layer)
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss="mae")
    return model


def fit_model(model, x, y, sample_weight=None):
    """Fit ``model`` with the training settings shared by every experiment.

    Uses a 10% validation split, batches of 128, at most 500 epochs and
    early stopping (patience 20, best weights restored).

    Args:
        model: Compiled Keras model to train.
        x: Training inputs.
        y: Training targets (converted to a NumPy array).
        sample_weight: Optional per-sample weights (converted to a NumPy
            array when given).

    Returns:
        The Keras ``History`` object returned by ``model.fit``.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(
        patience=20, restore_best_weights=True
    )
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
    return model.fit(
        x,
        np.asarray(y),
        validation_split=0.1,
        batch_size=128,
        epochs=500,
        sample_weight=sample_weight,
        callbacks=[early_stopping],
        verbose=0,
    )


# In[12]:


model = get_model(x_train_reshaped)
no_scaling_model_history = fit_model(model, x_train_reshaped, y_train)


# In[13]:


def visualize_loss(history, title):
    """Plot training and validation loss per epoch from a Keras history.

    Args:
        history: Object whose ``history`` dict has ``"loss"`` and
            ``"val_loss"`` entries.
        title: Title of the figure.
    """
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = list(range(len(loss)))
    fig = go.Figure(
        data=[
            go.Scatter(x=epochs, y=loss, name="Training loss"),
            go.Scatter(x=epochs, y=val_loss, name="Validation loss"),
        ]
    )
    fig.update_layout(title=title, xaxis_title="Epoch", yaxis_title="Loss")
    fig.show()


# In[14]:


visualize_loss(no_scaling_model_history, "Training of a LSTM model without scaling")


# In[15]:


preds_test = model.predict(x_test_reshaped)[:, 0]
mae_without_scaling = mean_absolute_error(y_test, preds_test)
print("MAE without scaling:", mae_without_scaling)


# ## LSTM with global (features) scaling
# ### Scale data

# In[16]:


# A single mean/std computed over every training feature value.
train_values = x_train.values.reshape(-1)
train_mean = np.mean(train_values)
train_std = np.std(train_values)
train_mean, train_std


# In[17]:


x_train_global_scaled = (x_train_reshaped - train_mean) / train_std
y_train_global_scaled = (y_train - train_mean) / train_std
x_test_global_scaled = (x_test_reshaped - train_mean) / train_std
y_test_global_scaled = (y_test - train_mean) / train_std


# ### Train model

# In[18]:


# The scaled target can be negative, hence the linear output.
model_global_scaling = get_model(x_train_global_scaled, output_activation="linear")
global_scaling_model_history = fit_model(
    model_global_scaling, x_train_global_scaled, y_train_global_scaled
)


# In[19]:


visualize_loss(
    global_scaling_model_history, "Training of a LSTM model with global scaling"
)


# In[20]:


preds_global_scaling_test = model_global_scaling.predict(x_test_global_scaled)
unscaled_preds_global_scaling_test = (
    preds_global_scaling_test[:, 0] * train_std + train_mean
)
mae_global_scaling = mean_absolute_error(y_test, unscaled_preds_global_scaling_test)
print("MAE with global scaling:", mae_global_scaling)


# # Visualize results
# ## Global visualization

# In[21]:


preds_without_scaling_vs_real = pd.DataFrame(
    {"predictions": preds_test, "reality": y_test}
)
preds_with_global_scaling_vs_real = pd.DataFrame(
    {"predictions": unscaled_preds_global_scaling_test, "reality": y_test}
)
preds_vs_real = pd.concat(
    [
        preds_without_scaling_vs_real.assign(model="model_without_scaling"),
        preds_with_global_scaling_vs_real.assign(model="model_with_global_scaling"),
    ]
)
preds_vs_real.head()


# In[22]:


fig = px.scatter(
    preds_vs_real,
    x="predictions",
    y="reality",
    color="model",
    log_x=True,
    log_y=True,
    title="Comparison of predictions and real values",
)
fig.show()


# ## Visualize as time series

# In[23]:


df_test_with_predictions = x_test.assign(
    model_without_scaling=preds_test,
    model_with_global_scaling=unscaled_preds_global_scaling_test,
)


# In[24]:


sample = df_test_with_predictions.sample(1)
sample


# In[25]:


# Predictions are drawn at the target day, i.e. the day after the last feature.
fig = go.Figure(
    data=[
        go.Scatter(
            x=x_test.columns,
            y=sample[x_test.columns].values[0],
            name="Number of clicks",
        ),
        go.Scatter(
            x=[target],
            y=[sample["model_without_scaling"].values[0]],
            name="Predictions with model not using scaling",
        ),
        go.Scatter(
            x=[target],
            y=[sample["model_with_global_scaling"].values[0]],
            name="Predictions with model with global scaling",
        ),
    ]
)
fig.update_layout(
    title="Visualization of models predictions on one time series",
    xaxis_title="Day",
    yaxis_title="Clicks",
)
fig.show()


# # LSTM with time series scaling
# ## Scale data

# In[26]:


df_train_features_target = df_train[features + [target]]
df_test_features_target = df_test[features + [target]]


# In[27]:


# Per-series statistics come from the feature days only: the target day is
# what we predict, so it must not influence the scaling of its own inputs.
# A zero std (constant series) is replaced by 1 to avoid dividing by zero.
df_train_means = df_train[features].mean(axis=1)
df_train_stds = df_train[features].std(axis=1).replace(0, 1)
df_test_means = df_test[features].mean(axis=1)
df_test_stds = df_test[features].std(axis=1).replace(0, 1)


# In[28]:


# Row-wise standardization: every column is shifted and scaled by the
# statistics of its own row (aligned on the row index).
df_train_scaled = df_train_features_target.sub(df_train_means, axis=0).div(
    df_train_stds, axis=0
)
df_test_scaled = df_test_features_target.sub(df_test_means, axis=0).div(
    df_test_stds, axis=0
)


# In[29]:


df_train_scaled.head()


# In[30]:


x_train_scaled = df_train_scaled[features]
y_train_scaled = df_train_scaled[target]
x_test_scaled = df_test_scaled[features]
y_test_scaled = df_test_scaled[target]


# In[31]:


x_train_scaled_reshaped = x_train_scaled.values.reshape(
    (len(x_train_scaled), len(features), 1)
)
x_test_scaled_reshaped = x_test_scaled.values.reshape(
    (len(x_test_scaled), len(features), 1)
)


# ## Train model with scaled data

# In[32]:


model_scaled = get_model(x_train_scaled_reshaped, output_activation="linear")
scaled_model_history = fit_model(
    model_scaled, x_train_scaled_reshaped, y_train_scaled
)


# In[33]:


visualize_loss(scaled_model_history, "Training of a LSTM model with scaling")


# In[34]:


preds_scaled_test = model_scaled.predict(x_test_scaled_reshaped)
# Multiplying by the Series keeps the test index, so results align with y_test.
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * df_test_stds) + df_test_means
mae_time_series_scaling = mean_absolute_error(y_test, unscaled_preds_scaled_test)
print("MAE with time series scaling:", mae_time_series_scaling)


# ### Train model with scaled data and sample weights

# In[35]:


# Each series is weighted by its std, so an error on a scaled target counts
# in proportion to the error it represents in unscaled clicks.
model_scaling_weights = get_model(x_train_scaled_reshaped, output_activation="linear")
scaled_model_with_weights_history = fit_model(
    model_scaling_weights,
    x_train_scaled_reshaped,
    y_train_scaled,
    sample_weight=df_train_stds.values,
)


# In[36]:


visualize_loss(
    scaled_model_with_weights_history,
    "Training of a LSTM model with scaling and sample weights",
)


# In[37]:


preds_scaled_weighted_test = model_scaling_weights.predict(x_test_scaled_reshaped)
unscaled_preds_scaled_weighted_test = (
    preds_scaled_weighted_test[:, 0] * df_test_stds
) + df_test_means
mae_time_series_scaling_weights = mean_absolute_error(
    y_test, unscaled_preds_scaled_weighted_test
)
print(
    "MAE with time series scaling and sample weights:",
    mae_time_series_scaling_weights,
)


# # Visualize results with time series scaling
# ## Global visualization

# In[38]:


preds_with_window_scaling_vs_real = pd.DataFrame(
    {"predictions": unscaled_preds_scaled_test, "reality": y_test}
)
preds_with_window_scaling_weights_vs_real = pd.DataFrame(
    {"predictions": unscaled_preds_scaled_weighted_test, "reality": y_test}
)
preds_vs_real = pd.concat(
    [
        preds_with_window_scaling_vs_real.assign(
            model="model_with_time_series_scaling"
        ),
        preds_with_window_scaling_weights_vs_real.assign(
            model="model_with_time_series_scaling_and_weights"
        ),
    ]
)
preds_vs_real.head()


# In[39]:


fig = px.scatter(
    preds_vs_real,
    x="predictions",
    y="reality",
    color="model",
    log_x=True,
    log_y=True,
    title="Comparison of predictions and real values",
)
fig.show()


# ## Visualize predictions vs reality as time series

# In[40]:


df_test_with_predictions = x_test.assign(
    model_without_scaling=preds_test,
    model_with_global_scaling=unscaled_preds_global_scaling_test,
    model_with_window_scaling=unscaled_preds_scaled_test,
    model_with_window_scaling_and_weights=unscaled_preds_scaled_weighted_test,
)
df_test_with_predictions.head()


# In[41]:


sample = df_test_with_predictions.sample(1)
sample[["model_with_window_scaling", "model_with_window_scaling_and_weights"]]


# In[42]:


# Predictions are drawn at the target day, i.e. the day after the last feature.
fig = go.Figure(
    data=[
        go.Scatter(
            x=x_test.columns,
            y=sample[x_test.columns].values[0],
            name="Number of clicks",
        ),
        go.Scatter(
            x=[target],
            y=[sample["model_with_window_scaling"].values[0]],
            name="Predictions with model with window scaling",
        ),
        go.Scatter(
            x=[target],
            y=[sample["model_with_window_scaling_and_weights"].values[0]],
            name="Predictions with model with window scaling and weights",
        ),
    ]
)
fig.update_layout(
    title="Visualization of models predictions on one time series",
    xaxis_title="Day",
    yaxis_title="Clicks",
)
fig.show()


# In[ ]: