#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go


# # Data preparation

# ## Reading data

# In[2]:


# Read the raw dataset; every later frame in this script is derived from it.
df_raw = pd.read_csv(filepath_or_buffer="train_1.csv")


# In[3]:


# Preview of the first rows (only rendered when run as a notebook cell).
df_raw.head()


# In[4]:


# (number of rows, number of columns) of the raw table.
df_raw.shape


# ## Cleaning data

# Only rows without any missing value are kept: the experiment does not need
# the whole dataset, so incomplete series are simply discarded.

# In[5]:


# Drop incomplete rows first, then remove the non-numeric "Page" column.
train_cleaned = df_raw.dropna(axis=0, how="any").drop(columns="Page")
train_cleaned.head()


# In[6]:


# Shape after cleaning, to compare with the raw shape above.
train_cleaned.shape


# ## Split between train and test set 

# In[7]:


# Seed the shuffle so the train/test partition -- and therefore every metric
# compared further down -- is identical from one run to the next.
# (This only fixes the split; model weight initialisation is still random.)
df_train, df_test = train_test_split(train_cleaned, test_size=0.2,
                                     random_state=42)


# The task: predict the value of 2015-07-29 from the 28 preceding days
# (2015-07-01 to 2015-07-28), i.e. four full weeks of history.

# In[8]:


# Column names of the CSV are ISO dates, so the feature list is built as strings.
features = list(pd.date_range(start="2015-07-01",
                              end="2015-07-28").strftime("%Y-%m-%d"))
target = "2015-07-29"


# In[9]:


x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]


# In[10]:


# Add a trailing axis of size 1: the network input declared in get_model is
# (shape[1], shape[2]) per sample, i.e. (number of days, 1).
x_train_reshaped = x_train.values.reshape((len(x_train), len(x_train.columns), 1))
x_test_reshaped = x_test.values.reshape((len(x_test), len(x_test.columns), 1))


# # Compare LSTM modeling approaches

# ## LSTM without scaling

# In[11]:


def get_model(x_train_reshaped, units=32, output_activation="relu"):
    """Build and compile a one-layer LSTM regressor.

    Parameters
    ----------
    x_train_reshaped : object with a ``shape`` attribute
        Only ``shape[1]`` and ``shape[2]`` are read, to declare the
        per-sample input shape. The data itself is not used.
    units : int, default 32
        Number of units of the LSTM layer.
    output_activation : str or None, default "relu"
        Activation of the final one-unit dense layer. The default ReLU can
        only output values >= 0, which suits raw counts; pass ``None``
        (linear) when the targets may be negative, for instance after
        standardisation.

    Returns
    -------
    tf.keras.Model
        Model compiled with the RMSprop optimizer and the "mae" loss.
    """
    sample_shape = (x_train_reshaped.shape[1], x_train_reshaped.shape[2])

    input_layer = tf.keras.layers.Input(sample_shape)
    lstm_layer = tf.keras.layers.LSTM(units)(input_layer)
    dense_layer = tf.keras.layers.Dense(1, activation=output_activation)(lstm_layer)

    model = tf.keras.Model(inputs=input_layer, outputs=dense_layer)
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss="mae")
    return model


# In[12]:


# Baseline: the network is trained directly on the raw, unscaled values.
model = get_model(x_train_reshaped)
# Stop when the monitored validation loss has not improved for 20 epochs and
# roll the model back to its best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
                                                  restore_best_weights=True)
# `callbacks` is documented as a list of Callback instances, so wrap it.
no_scaling_model_history = model.fit(x_train_reshaped, y_train, validation_split=0.1,
                                     batch_size=128, epochs=500,
                                     callbacks=[early_stopping], verbose=0)


# In[13]:


def visualize_loss(history, title):
    """Display training and validation loss curves, one point per epoch.

    Parameters
    ----------
    history : object whose ``history`` attribute is a mapping containing
        the "loss" and "val_loss" sequences.
    title : str
        Title of the figure.

    The figure is shown with ``fig.show()``; nothing is returned.
    """
    curves = history.history
    train_curve = curves["loss"]
    valid_curve = curves["val_loss"]
    # Epochs are numbered from 0, one per recorded training-loss value.
    epoch_axis = list(range(len(train_curve)))

    traces = [
        go.Scatter(x=epoch_axis, y=train_curve, name="Training loss"),
        go.Scatter(x=epoch_axis, y=valid_curve, name="Validation loss"),
    ]
    figure = go.Figure(data=traces)
    figure.update_layout(title=title, xaxis_title="Epoch", yaxis_title="Loss")
    figure.show()


# In[14]:


# Learning curves of the baseline model.
visualize_loss(history=no_scaling_model_history,
               title="Training of a LSTM model without scaling")


# In[15]:


# The model outputs one column per sample; keep it as a flat vector.
preds_test = model.predict(x_test_reshaped)[:, 0]
# Test error in the original unit of the data.
mean_absolute_error(y_true=preds_test, y_pred=y_test)


# ## LSTM with global (features) scaling

# ### Scale data

# In[16]:


# Pool every feature value of the training set into a single flat vector:
# one mean / standard deviation pair is shared by all series and all days.
train_values = x_train.to_numpy().ravel()
train_mean = train_values.mean()
train_std = train_values.std()
train_mean, train_std


# In[17]:


# Standardise inputs and targets of both sets with the TRAINING statistics,
# so no information from the test set is used for scaling.
x_train_scaled, x_test_scaled = (
    (windows - train_mean) / train_std
    for windows in (x_train_reshaped, x_test_reshaped)
)
y_train_scaled, y_test_scaled = (
    (labels - train_mean) / train_std
    for labels in (y_train, y_test)
)


# ### Train model

# In[18]:


# NOTE(review): get_model ends with a ReLU unit, which cannot output negative
# values, while standardised targets below the training mean are negative.
# A linear output would be better suited to this section.
model_global_scaling = get_model(x_train_scaled)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
                                                  restore_best_weights=True)
# Fit the freshly built model. Fitting the baseline `model` here would keep
# training the unscaled network and leave `model_global_scaling` untrained.
global_scaling_model_history = model_global_scaling.fit(
    x_train_scaled, y_train_scaled, validation_split=0.1,
    batch_size=128, epochs=500,
    callbacks=[early_stopping], verbose=0)


# In[19]:


visualize_loss(global_scaling_model_history, 
               "Training of a LSTM model with global scaling");


# In[20]:


# Predict with the model trained on globally scaled data, then map the
# predictions back to the original unit before computing the error.
preds_global_scaling_test = model_global_scaling.predict(x_test_scaled)
unscaled_preds_global_scaling_test = preds_global_scaling_test[:, 0] * train_std + train_mean
mean_absolute_error(y_test, unscaled_preds_global_scaling_test)


# # Visualize results

# ## Global visualization

# In[21]:


# One frame per model, pairing each prediction with the observed value.
preds_without_scaling_vs_real = pd.DataFrame(
    data={"predictions": preds_test, "reality": y_test}
)
preds_with_global_scaling_vs_real = pd.DataFrame(
    data={"predictions": unscaled_preds_global_scaling_test, "reality": y_test}
)

# Stack both frames, tagging each row with the model it comes from.
preds_vs_real = pd.concat(
    [
        frame.assign(model=label)
        for label, frame in (
            ("model_without_scaling", preds_without_scaling_vs_real),
            ("model_with_global_scaling", preds_with_global_scaling_vs_real),
        )
    ]
)
preds_vs_real.head()


# In[22]:


# Log-log scatter of predictions against observed values, coloured by model.
px.scatter(data_frame=preds_vs_real,
           x="predictions",
           y="reality",
           color="model",
           log_x=True,
           log_y=True,
           title="Comparison of predictions and real values")


# ## Visualize as time series

# In[23]:


# Test features with one extra column per model holding its prediction.
df_test_with_predictions = x_test.assign(**{
    "model_without_scaling": preds_test,
    "model_with_global_scaling": unscaled_preds_global_scaling_test,
})


# In[24]:


# Draw one random test series to inspect.
sample = df_test_with_predictions.sample(n=1)
sample


# In[25]:


# Observed history of the sampled series plus one point per model.
# The models forecast `target`, so their points are placed on that day rather
# than on the last feature day.
fig = go.Figure(data=[go.Scatter(x=x_test.columns,
                                 y=sample[x_test.columns].values[0],
                                 name="Number of clicks"),
                      go.Scatter(x=[target],
                                 y=[sample["model_without_scaling"].values[0]],
                                 name="Predictions with model not using scaling"),
                      go.Scatter(x=[target],
                                 y=[sample["model_with_global_scaling"].values[0]],
                                 name="Predictions with model with global scaling")])
fig.update_layout(title="Visualization of models predictions on one time series",
                   xaxis_title="Day",
                   yaxis_title="Clicks")


# # LSTM with time series scaling

# ## Scale data

# In[26]:


df_train_features_target = df_train[features + [target]]
df_test_features_target = df_test[features + [target]]


# In[27]:


# Per-series statistics are computed from the feature days ONLY.
# Including the target column would leak the value being predicted into the
# scaling of the inputs and into the un-scaling of the test predictions,
# which is not possible at real prediction time.
# A zero standard deviation (constant series) is replaced by 1 to avoid
# dividing by zero.
df_train_means = df_train_features_target[features].mean(axis=1)
df_train_stds = df_train_features_target[features].std(axis=1).replace(0, 1)

df_test_means = df_test_features_target[features].mean(axis=1)
df_test_stds = df_test_features_target[features].std(axis=1).replace(0, 1)


# In[28]:


# Standardise every row with its own mean and standard deviation:
# subtract / divide along the index so each series uses its own statistics.
df_train_scaled = (
    df_train_features_target
    .sub(df_train_means, axis=0)
    .div(df_train_stds, axis=0)
)
df_test_scaled = (
    df_test_features_target
    .sub(df_test_means, axis=0)
    .div(df_test_stds, axis=0)
)


# In[29]:


df_train_scaled.head()


# In[30]:


# Separate inputs and targets of the per-series scaled frames.
# These names replace the globally scaled arrays defined earlier.
x_train_scaled, y_train_scaled = df_train_scaled[features], df_train_scaled[target]
x_test_scaled, y_test_scaled = df_test_scaled[features], df_test_scaled[target]


# In[31]:


# Append a trailing axis of size 1 to obtain (rows, days, 1) arrays.
x_train_scaled_reshaped = x_train_scaled.to_numpy().reshape(x_train_scaled.shape + (1,))
x_test_scaled_reshaped = x_test_scaled.to_numpy().reshape(x_test_scaled.shape + (1,))


# ## Train model with scaled data

# In[32]:


# NOTE(review): get_model ends with a ReLU unit, which cannot output negative
# values, while standardised targets are negative whenever the target lies
# below the series mean. A linear output would be better suited here.
# Only the shape of the argument is read, so build from the array the model
# is actually fitted on.
model_scaled = get_model(x_train_scaled_reshaped)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
                                                  restore_best_weights=True)

# `callbacks` is documented as a list of Callback instances, so wrap it.
scaled_model_history = model_scaled.fit(x_train_scaled_reshaped, y_train_scaled, 
                                        validation_split=0.1,
                                        batch_size=128, epochs=500,
                                        callbacks=[early_stopping], verbose=0)


# In[33]:


visualize_loss(scaled_model_history, 
               "Training of a LSTM model with scaling");


# In[34]:


preds_scaled_test = model_scaled.predict(x_test_scaled_reshaped)
# Undo the per-series standardisation: each prediction is mapped back with
# the statistics of its own series.
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * df_test_stds) + df_test_means
mean_absolute_error(y_test, unscaled_preds_scaled_test)


# ### Train model with scaled data and sample weights

# In[35]:


# NOTE(review): get_model ends with a ReLU unit, which cannot output negative
# values, while standardised targets are negative whenever the target lies
# below the series mean. A linear output would be better suited here.
# Only the shape of the argument is read, so build from the array the model
# is actually fitted on.
model_scaling_weights = get_model(x_train_scaled_reshaped)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
                                                  restore_best_weights=True)

# Each sample is weighted by the standard deviation of its series: an
# absolute error on standardised values multiplied by that deviation equals
# the absolute error in the original unit.
# `callbacks` is documented as a list of Callback instances, so wrap it.
scaled_model_with_weights_history = model_scaling_weights.fit(x_train_scaled_reshaped,
                                                              y_train_scaled, 
                                                              validation_split=0.1,
                                                              batch_size=128, epochs=500, 
                                                              sample_weight=df_train_stds,
                                                              callbacks=[early_stopping], verbose=0)


# In[36]:


visualize_loss(scaled_model_with_weights_history, 
               "Training of a LSTM model with scaling and sample weights");


# In[37]:


preds_scaled_weighted_test = model_scaling_weights.predict(x_test_scaled_reshaped)
# Map predictions back to the original unit with each series' own statistics.
unscaled_preds_scaled_weighted_test = (preds_scaled_weighted_test[:, 0] * df_test_stds) + df_test_means

mean_absolute_error(y_test, unscaled_preds_scaled_weighted_test)


# # Visualize results with time series scaling

# ## Global visualization

# In[38]:


# One frame per model, pairing each prediction with the observed value.
preds_with_window_scaling_vs_real = pd.DataFrame(
    data={"predictions": unscaled_preds_scaled_test, "reality": y_test}
)
preds_with_window_scaling_weights_vs_real = pd.DataFrame(
    data={"predictions": unscaled_preds_scaled_weighted_test, "reality": y_test}
)

# Stack both frames, tagging each row with the model it comes from.
preds_vs_real = pd.concat(
    [
        frame.assign(model=label)
        for label, frame in (
            ("model_with_time_series_scaling",
             preds_with_window_scaling_vs_real),
            ("model_with_time_series_scaling_and_weights",
             preds_with_window_scaling_weights_vs_real),
        )
    ]
)
preds_vs_real.head()


# In[39]:


# Log-log scatter of predictions against observed values, coloured by model.
px.scatter(data_frame=preds_vs_real,
           x="predictions",
           y="reality",
           color="model",
           log_x=True,
           log_y=True,
           title="Comparison of predictions and real values")


# ## Visualize predictions vs reality as time series

# In[40]:


# Test features with one extra column per model holding its prediction.
df_test_with_predictions = x_test.assign(**{
    "model_without_scaling": preds_test,
    "model_with_global_scaling": unscaled_preds_global_scaling_test,
    "model_with_window_scaling": unscaled_preds_scaled_test,
    "model_with_window_scaling_and_weights": unscaled_preds_scaled_weighted_test,
})
df_test_with_predictions.head()


# In[41]:


# Draw one random test series and show the two window-scaling predictions.
sample = df_test_with_predictions.sample(n=1)
sample.loc[:, ["model_with_window_scaling", "model_with_window_scaling_and_weights"]]


# In[42]:


# Observed history of the sampled series plus one point per model.
# The models forecast `target`, so their points are placed on that day rather
# than on the last feature day.
fig = go.Figure(data=[go.Scatter(x=x_test.columns,
                                 y=sample[x_test.columns].values[0],
                                 name="Number of clicks"),
                      go.Scatter(x=[target],
                                 y=[sample["model_with_window_scaling"].values[0]],
                                 name="Predictions with model with window scaling"),
                      go.Scatter(x=[target],
                                 y=[sample["model_with_window_scaling_and_weights"].values[0]],
                                 name="Predictions with model with window scaling and weights")])
fig.update_layout(title="Visualization of models predictions on one time series",
                   xaxis_title="Day",
                   yaxis_title="Clicks")


# In[ ]: