import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go
df_raw = pd.read_csv("train_2.csv")
df_raw.head()
|  | Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | 2017-09-07 | 2017-09-08 | 2017-09-09 | 2017-09-10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 19.0 | 33.0 | 33.0 | 18.0 | 16.0 | 27.0 | 29.0 | 23.0 | 54.0 | 38.0 |
| 1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 32.0 | 30.0 | 11.0 | 19.0 | 54.0 | 25.0 | 26.0 | 23.0 | 13.0 | 81.0 |
| 2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 6.0 | 6.0 | 7.0 | 2.0 | 4.0 | 7.0 | 3.0 | 4.0 | 7.0 | 6.0 |
| 3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 7.0 | 19.0 | 19.0 | 9.0 | 6.0 | 16.0 | 19.0 | 30.0 | 38.0 | 4.0 |
| 4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 16.0 | 16.0 | 19.0 | 9.0 | 20.0 | 23.0 | 28.0 | 14.0 | 8.0 | 7.0 |
5 rows × 804 columns
df_raw.shape
(145063, 804)
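Each row holds the daily traffic of one Wikipedia page: 145,063 pages over 803 days, plus the Page column. The file train_2.csv is presumably the one from Kaggle's Web Traffic Time Series Forecasting competition.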
We will use the data of only one page. First, we drop every row that contains missing values, then we pick a page with reasonably high and sustained traffic.
df_no_na = df_raw.dropna()
df_no_na.head()
|  | Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | 2017-09-07 | 2017-09-08 | 2017-09-09 | 2017-09-10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 19.0 | 33.0 | 33.0 | 18.0 | 16.0 | 27.0 | 29.0 | 23.0 | 54.0 | 38.0 |
| 1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 32.0 | 30.0 | 11.0 | 19.0 | 54.0 | 25.0 | 26.0 | 23.0 | 13.0 | 81.0 |
| 2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 6.0 | 6.0 | 7.0 | 2.0 | 4.0 | 7.0 | 3.0 | 4.0 | 7.0 | 6.0 |
| 3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 7.0 | 19.0 | 19.0 | 9.0 | 6.0 | 16.0 | 19.0 | 30.0 | 38.0 | 4.0 |
| 5 | 5566_zh.wikipedia.org_all-access_spider | 12.0 | 7.0 | 4.0 | 5.0 | 20.0 | 8.0 | 5.0 | 17.0 | 24.0 | ... | 13.0 | 13.0 | 45.0 | 4.0 | 13.0 | 20.0 | 18.0 | 17.0 | 14.0 | 11.0 |
5 rows × 804 columns
# Keep pages with a reasonably high and sustained level of traffic
means = df_no_na.drop("Page", axis=1).mean(axis=1)
mins = df_no_na.drop("Page", axis=1).min(axis=1)
# Pick one such page at random (the selected page changes from run to run)
one_page = df_no_na[(means > 100) & (mins > 10)]["Page"].sample(1).values[0]
df_one_page = df_no_na[df_no_na["Page"] == one_page].drop("Page", axis=1)
df_one_page.head()
|  | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | 2015-07-10 | ... | 2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | 2017-09-07 | 2017-09-08 | 2017-09-09 | 2017-09-10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 71123 | 1348.0 | 1448.0 | 881.0 | 547.0 | 658.0 | 951.0 | 1228.0 | 1157.0 | 1162.0 | 866.0 | ... | 2155.0 | 1274.0 | 1715.0 | 2758.0 | 3151.0 | 2991.0 | 2637.0 | 1527.0 | 931.0 | 1146.0 |
1 rows × 803 columns
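We now turn this single series into supervised samples with a sliding window of 15 days: the first 14 values of each window are the model inputs and the 15th is the target.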
window_size = 15
cols = df_one_page.columns
window_data = []
for start_index in range(len(cols) - window_size + 1):
    window_data.append(df_one_page[cols[start_index:start_index + window_size]].values[0])
len(window_data)
789
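As a side note, the same windows can be built without a Python loop. A minimal sketch using NumPy's sliding_window_view (available from NumPy 1.20):

```python
# Hypothetical alternative: vectorized windowing with NumPy (>= 1.20).
from numpy.lib.stride_tricks import sliding_window_view

series = df_one_page.values[0]                      # the full series, shape (803,)
windows = sliding_window_view(series, window_size)  # shape (789, 15)
assert windows.shape == (len(window_data), window_size)
```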
test_start_index = int(0.85 * len(window_data))
train_data = np.array(window_data[:test_start_index])
test_data = np.array(window_data[test_start_index:])
x_train = train_data[:, :-1].reshape(len(train_data), window_size - 1, 1)
y_train = train_data[:, -1]
x_test = test_data[:, :-1].reshape(len(test_data), window_size - 1, 1)
y_test = test_data[:, -1]
x_train.shape, y_train.shape, x_test.shape, y_test.shape
((670, 14, 1), (670,), (119, 14, 1), (119,))
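The same chronological split could be written with the train_test_split imported at the top; with these 789 windows, test_size=0.15 happens to give the same 670/119 split. A sketch, not used below:

```python
# Equivalent split; shuffle=False preserves chronological order,
# which is essential for time series.
train_alt, test_alt = train_test_split(np.array(window_data),
                                       test_size=0.15, shuffle=False)
assert train_alt.shape[0] == 670 and test_alt.shape[0] == 119
```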
def get_model(x_train):
    # Two stacked LSTM layers followed by a small dense head
    input_layer = tf.keras.layers.Input((x_train.shape[1], x_train.shape[2]))
    lstm_layer = tf.keras.layers.LSTM(128, return_sequences=True)(input_layer)
    second_lstm_layer = tf.keras.layers.LSTM(32)(lstm_layer)
    dense_layer = tf.keras.layers.Dense(8)(second_lstm_layer)
    # Single linear output unit: we predict one value, the next day's clicks
    output_layer = tf.keras.layers.Dense(1)(dense_layer)
    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss="mae")
    return model
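To check the architecture, we can print a summary of a freshly built model:

```python
# Optional sanity check of layer shapes and parameter counts.
get_model(x_train).summary()
```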
model = get_model(x_train)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=100,
                                                  restore_best_weights=True)
model_history = model.fit(x_train, y_train, validation_split=0.1,
                          batch_size=128, epochs=1000,
                          callbacks=[early_stopping], verbose=0)
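Note that EarlyStopping monitors val_loss by default, so with restore_best_weights=True the weights from the epoch with the lowest validation loss are restored when training stops early.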
def visualize_loss(history, title):
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = list(range(len(loss)))
    fig = go.Figure(data=[go.Scatter(x=epochs, y=loss, name="Training loss"),
                          go.Scatter(x=epochs, y=val_loss, name="Validation loss")])
    fig.update_layout(title=title,
                      xaxis_title="Epoch",
                      yaxis_title="Loss")
    fig.show()
visualize_loss(model_history,
               "Training of an LSTM model without scaling")
The training seems quite successful, but we will see at the end of the notebook that this is not really the case...
preds_test = model.predict(x_test)
mean_absolute_error(y_test, preds_test[:, 0])
614.4952618254333
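A mean absolute error above 600, on a page whose daily clicks mostly lie between a few hundred and a few thousand, is poor. Let's standardize the data, using statistics computed on the training set only so as not to leak test information.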
train_mean = np.mean(x_train)
train_std = np.std(x_train)
train_mean, train_std
(1625.2317697228145, 1184.967031656248)
x_train_scaled = (x_train - train_mean) / train_std
y_train_scaled = (y_train - train_mean) / train_std
x_test_scaled = (x_test - train_mean) / train_std
y_test_scaled = (y_test - train_mean) / train_std
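For reference, the same global standardization can be done with scikit-learn. A sketch: the scaler is fit on the flattened training inputs only, mirroring train_mean and train_std:

```python
from sklearn.preprocessing import StandardScaler

# Fit on training values only; mean_ and scale_ match train_mean and train_std.
scaler = StandardScaler().fit(x_train.reshape(-1, 1))
x_train_alt = scaler.transform(x_train.reshape(-1, 1)).reshape(x_train.shape)
```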
model_scaled = get_model(x_train_scaled)  # same shapes as x_train
early_stopping = tf.keras.callbacks.EarlyStopping(patience=100,
                                                  restore_best_weights=True)
scaled_model_history = model_scaled.fit(x_train_scaled, y_train_scaled,
                                        validation_split=0.1,
                                        batch_size=128, epochs=1000,
                                        callbacks=[early_stopping], verbose=0)
visualize_loss(scaled_model_history,
               "Training of an LSTM model with scaling")
preds_scaled_test = model_scaled.predict(x_test_scaled)
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * train_std) + train_mean
mean_absolute_error(y_test, unscaled_preds_scaled_test)
143.87561548056723
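Scaling the data brings the test MAE from about 614 down to about 144, a more than fourfold improvement.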
preds_without_scaling_vs_real = pd.DataFrame({"predictions": preds_test[:, 0],
                                              "reality": y_test})
preds_with_scaling_vs_real = pd.DataFrame({"predictions": unscaled_preds_scaled_test,
                                           "reality": y_test})
preds_vs_real = pd.concat([preds_without_scaling_vs_real.assign(model="model_without_scaling"),
                           preds_with_scaling_vs_real.assign(model="model_with_scaling")])
preds_vs_real.head()
|  | predictions | reality | model |
| --- | --- | --- | --- |
| 0 | 1307.753052 | 1592.0 | model_without_scaling |
| 1 | 1307.753052 | 1570.0 | model_without_scaling |
| 2 | 1307.753052 | 1732.0 | model_without_scaling |
| 3 | 1307.753052 | 1445.0 | model_without_scaling |
| 4 | 1307.753052 | 1180.0 | model_without_scaling |
px.scatter(preds_vs_real, x="predictions", y="reality", color="model",
           trendline="ols", title="Comparison of the two models' predictions")
The model without scaling fell into a local minimum that corresponds to constant predictions.
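We can check this directly: the unscaled model predicts essentially a single value for all test windows.

```python
# All (or nearly all) predictions collapse to one constant value.
print(np.unique(preds_test[:, 0].round(2)))
```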
days = df_one_page.columns[-len(preds_test):]
fig = go.Figure(data=[go.Scatter(x=days,
                                 y=y_test,
                                 name="Number of clicks"),
                      go.Scatter(x=days,
                                 y=preds_test[:, 0],
                                 name="Predictions with model not using scaling"),
                      go.Scatter(x=days,
                                 y=unscaled_preds_scaled_test,
                                 name="Predictions with model using global scaling")])
fig.update_layout(title="Predictions vs reality on test set",
                  xaxis_title="Day",
                  yaxis_title="Clicks")
fig.show()