import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go
df_raw = pd.read_csv("train_2.csv")
df_raw.head()
|  | Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | 2017-09-07 | 2017-09-08 | 2017-09-09 | 2017-09-10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 19.0 | 33.0 | 33.0 | 18.0 | 16.0 | 27.0 | 29.0 | 23.0 | 54.0 | 38.0 |
| 1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 32.0 | 30.0 | 11.0 | 19.0 | 54.0 | 25.0 | 26.0 | 23.0 | 13.0 | 81.0 |
| 2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 6.0 | 6.0 | 7.0 | 2.0 | 4.0 | 7.0 | 3.0 | 4.0 | 7.0 | 6.0 |
| 3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 7.0 | 19.0 | 19.0 | 9.0 | 6.0 | 16.0 | 19.0 | 30.0 | 38.0 | 4.0 |
| 4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 16.0 | 16.0 | 19.0 | 9.0 | 20.0 | 23.0 | 28.0 | 14.0 | 8.0 | 7.0 |
5 rows × 804 columns
df_raw.shape
(145063, 804)
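Each row holds the daily traffic of one Wikipedia page: 145,063 pages over 803 days, plus the Page column. The file train_2.csv is presumably the one from Kaggle's Web Traffic Time Series Forecasting competition.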
We will use the data of only one page. First, we drop every row that contains missing values, then we pick a page with reasonably high and sustained traffic.
df_no_na = df_raw.dropna()
df_no_na.head()
|  | Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | 2017-09-07 | 2017-09-08 | 2017-09-09 | 2017-09-10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 19.0 | 33.0 | 33.0 | 18.0 | 16.0 | 27.0 | 29.0 | 23.0 | 54.0 | 38.0 |
| 1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 32.0 | 30.0 | 11.0 | 19.0 | 54.0 | 25.0 | 26.0 | 23.0 | 13.0 | 81.0 |
| 2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 6.0 | 6.0 | 7.0 | 2.0 | 4.0 | 7.0 | 3.0 | 4.0 | 7.0 | 6.0 |
| 3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 7.0 | 19.0 | 19.0 | 9.0 | 6.0 | 16.0 | 19.0 | 30.0 | 38.0 | 4.0 |
| 5 | 5566_zh.wikipedia.org_all-access_spider | 12.0 | 7.0 | 4.0 | 5.0 | 20.0 | 8.0 | 5.0 | 17.0 | 24.0 | ... | 13.0 | 13.0 | 45.0 | 4.0 | 13.0 | 20.0 | 18.0 | 17.0 | 14.0 | 11.0 |
5 rows × 804 columns
# Keep pages with a reasonably high and sustained level of traffic
means = df_no_na.drop("Page", axis=1).mean(axis=1)
mins = df_no_na.drop("Page", axis=1).min(axis=1)
# Pick one such page at random (the selected page changes from run to run)
one_page = df_no_na[(means > 100) & (mins > 10)]["Page"].sample(1).values[0]
df_one_page = df_no_na[df_no_na["Page"] == one_page].drop("Page", axis=1)
df_one_page.head()
|  | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | 2015-07-10 | ... | 2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | 2017-09-07 | 2017-09-08 | 2017-09-09 | 2017-09-10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 71123 | 1348.0 | 1448.0 | 881.0 | 547.0 | 658.0 | 951.0 | 1228.0 | 1157.0 | 1162.0 | 866.0 | ... | 2155.0 | 1274.0 | 1715.0 | 2758.0 | 3151.0 | 2991.0 | 2637.0 | 1527.0 | 931.0 | 1146.0 |
1 rows × 803 columns
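We now turn this single series into supervised samples with a sliding window of 15 days: the first 14 values of each window are the model inputs and the 15th is the target.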
window_size = 15
cols = df_one_page.columns
window_data = []
for start_index in range(len(cols) - window_size + 1):
    window_data.append(df_one_page[cols[start_index:start_index + window_size]].values[0])
len(window_data)
789
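As a side note, the same windows can be built without a Python loop. A minimal sketch using NumPy's sliding_window_view (available from NumPy 1.20):

```python
# Hypothetical alternative: vectorized windowing with NumPy (>= 1.20).
from numpy.lib.stride_tricks import sliding_window_view

series = df_one_page.values[0]                      # the full series, shape (803,)
windows = sliding_window_view(series, window_size)  # shape (789, 15)
assert windows.shape == (len(window_data), window_size)
```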
test_start_index = int(0.85 * len(window_data))
train_data = np.array(window_data[:test_start_index])
test_data = np.array(window_data[test_start_index:])
x_train = train_data[:, :-1].reshape(len(train_data), window_size - 1, 1)
y_train = train_data[:, -1]
x_test = test_data[:, :-1].reshape(len(test_data), window_size - 1, 1)
y_test = test_data[:, -1]
x_train.shape, y_train.shape, x_test.shape, y_test.shape
((670, 14, 1), (670,), (119, 14, 1), (119,))
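The same chronological split could be written with the train_test_split imported at the top; with these 789 windows, test_size=0.15 happens to give the same 670/119 split. A sketch, not used below:

```python
# Equivalent split; shuffle=False preserves chronological order,
# which is essential for time series.
train_alt, test_alt = train_test_split(np.array(window_data),
                                       test_size=0.15, shuffle=False)
assert train_alt.shape[0] == 670 and test_alt.shape[0] == 119
```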
def get_model(x_train):
    # Two stacked LSTM layers followed by a small dense head
    input_layer = tf.keras.layers.Input((x_train.shape[1], x_train.shape[2]))
    lstm_layer = tf.keras.layers.LSTM(128, return_sequences=True)(input_layer)
    second_lstm_layer = tf.keras.layers.LSTM(32)(lstm_layer)
    dense_layer = tf.keras.layers.Dense(8)(second_lstm_layer)
    # Single linear output unit: we predict one value, the next day's clicks
    output_layer = tf.keras.layers.Dense(1)(dense_layer)
    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss="mae")
    return model
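To check the architecture, we can print a summary of a freshly built model:

```python
# Optional sanity check of layer shapes and parameter counts.
get_model(x_train).summary()
```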
model = get_model(x_train)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=100,
                                                  restore_best_weights=True)
model_history = model.fit(x_train, y_train, validation_split=0.1,
                          batch_size=128, epochs=1000,
                          callbacks=[early_stopping], verbose=0)
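Note that EarlyStopping monitors val_loss by default, so with restore_best_weights=True the weights from the epoch with the lowest validation loss are restored when training stops early.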
def visualize_loss(history, title):
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = list(range(len(loss)))
    fig = go.Figure(data=[go.Scatter(x=epochs, y=loss, name="Training loss"),
                          go.Scatter(x=epochs, y=val_loss, name="Validation loss")])
    fig.update_layout(title=title,
                      xaxis_title="Epoch",
                      yaxis_title="Loss")
    fig.show()
visualize_loss(model_history,
               "Training of an LSTM model without scaling")
The training seems quite successful, but we will see at the end of the notebook that this is not really the case...
preds_test = model.predict(x_test)
mean_absolute_error(y_test, preds_test[:, 0])
614.4952618254333
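A mean absolute error above 600, on a page whose daily clicks mostly lie between a few hundred and a few thousand, is poor. Let's standardize the data, using statistics computed on the training set only so as not to leak test information.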
train_mean = np.mean(x_train)
train_std = np.std(x_train)
train_mean, train_std
(1625.2317697228145, 1184.967031656248)
x_train_scaled = (x_train - train_mean) / train_std
y_train_scaled = (y_train - train_mean) / train_std
x_test_scaled = (x_test - train_mean) / train_std
y_test_scaled = (y_test - train_mean) / train_std
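For reference, the same global standardization can be done with scikit-learn. A sketch: the scaler is fit on the flattened training inputs only, mirroring train_mean and train_std:

```python
from sklearn.preprocessing import StandardScaler

# Fit on training values only; mean_ and scale_ match train_mean and train_std.
scaler = StandardScaler().fit(x_train.reshape(-1, 1))
x_train_alt = scaler.transform(x_train.reshape(-1, 1)).reshape(x_train.shape)
```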
model_scaled = get_model(x_train_scaled)  # same shapes as x_train
early_stopping = tf.keras.callbacks.EarlyStopping(patience=100,
                                                  restore_best_weights=True)
scaled_model_history = model_scaled.fit(x_train_scaled, y_train_scaled,
                                        validation_split=0.1,
                                        batch_size=128, epochs=1000,
                                        callbacks=[early_stopping], verbose=0)
visualize_loss(scaled_model_history,
               "Training of an LSTM model with scaling")
preds_scaled_test = model_scaled.predict(x_test_scaled)
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * train_std) + train_mean
mean_absolute_error(y_test, unscaled_preds_scaled_test)
143.87561548056723
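Scaling the data brings the test MAE from about 614 down to about 144, a more than fourfold improvement.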
preds_without_scaling_vs_real = pd.DataFrame({"predictions": preds_test[:, 0],
                                              "reality": y_test})
preds_with_scaling_vs_real = pd.DataFrame({"predictions": unscaled_preds_scaled_test,
                                           "reality": y_test})
preds_vs_real = pd.concat([preds_without_scaling_vs_real.assign(model="model_without_scaling"),
                           preds_with_scaling_vs_real.assign(model="model_with_scaling")])
preds_vs_real.head()
|  | predictions | reality | model |
| --- | --- | --- | --- |
| 0 | 1307.753052 | 1592.0 | model_without_scaling |
| 1 | 1307.753052 | 1570.0 | model_without_scaling |
| 2 | 1307.753052 | 1732.0 | model_without_scaling |
| 3 | 1307.753052 | 1445.0 | model_without_scaling |
| 4 | 1307.753052 | 1180.0 | model_without_scaling |
px.scatter(preds_vs_real, x="predictions", y="reality", color="model",
           trendline="ols", title="Comparison of the two models' predictions")
The model without scaling fell into a local minimum that corresponds to constant predictions.
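We can check this directly: the unscaled model predicts essentially a single value for all test windows.

```python
# All (or nearly all) predictions collapse to one constant value.
print(np.unique(preds_test[:, 0].round(2)))
```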
days = df_one_page.columns[-len(preds_test):]
fig = go.Figure(data=[go.Scatter(x=days,
                                 y=y_test,
                                 name="Number of clicks"),
                      go.Scatter(x=days,
                                 y=preds_test[:, 0],
                                 name="Predictions with model not using scaling"),
                      go.Scatter(x=days,
                                 y=unscaled_preds_scaled_test,
                                 name="Predictions with model using global scaling")])
fig.update_layout(title="Predictions vs reality on test set",
                  xaxis_title="Day",
                  yaxis_title="Clicks")
fig.show()