import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go
df_raw = pd.read_csv("train_1.csv")
df_raw.head()
| | Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2016-12-22 | 2016-12-23 | 2016-12-24 | 2016-12-25 | 2016-12-26 | 2016-12-27 | 2016-12-28 | 2016-12-29 | 2016-12-30 | 2016-12-31 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 32.0 | 63.0 | 15.0 | 26.0 | 14.0 | 20.0 | 22.0 | 19.0 | 18.0 | 20.0 |
1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 17.0 | 42.0 | 28.0 | 15.0 | 9.0 | 30.0 | 52.0 | 45.0 | 26.0 | 20.0 |
2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 3.0 | 1.0 | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 3.0 | 4.0 | 17.0 |
3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 32.0 | 10.0 | 26.0 | 27.0 | 16.0 | 11.0 | 17.0 | 19.0 | 10.0 | 11.0 |
4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 48.0 | 9.0 | 25.0 | 13.0 | 3.0 | 11.0 | 27.0 | 13.0 | 36.0 | 10.0 |
5 rows × 551 columns
df_raw.shape
(145063, 551)
We will only keep the rows that are immediately usable, i.e. pages without any missing values, since we do not need the full dataset for this experiment.
train_cleaned = df_raw.dropna().drop("Page", axis=1)
train_cleaned.head()
| | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | 2015-07-10 | ... | 2016-12-22 | 2016-12-23 | 2016-12-24 | 2016-12-25 | 2016-12-26 | 2016-12-27 | 2016-12-28 | 2016-12-29 | 2016-12-30 | 2016-12-31 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | 24.0 | ... | 32.0 | 63.0 | 15.0 | 26.0 | 14.0 | 20.0 | 22.0 | 19.0 | 18.0 | 20.0 |
1 | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | 4.0 | ... | 17.0 | 42.0 | 28.0 | 15.0 | 9.0 | 30.0 | 52.0 | 45.0 | 26.0 | 20.0 |
2 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | 4.0 | ... | 3.0 | 1.0 | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 3.0 | 4.0 | 17.0 |
3 | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | 16.0 | ... | 32.0 | 10.0 | 26.0 | 27.0 | 16.0 | 11.0 | 17.0 | 19.0 | 10.0 | 11.0 |
5 | 12.0 | 7.0 | 4.0 | 5.0 | 20.0 | 8.0 | 5.0 | 17.0 | 24.0 | 7.0 | ... | 16.0 | 27.0 | 8.0 | 17.0 | 32.0 | 19.0 | 23.0 | 17.0 | 17.0 | 50.0 |
5 rows × 550 columns
train_cleaned.shape
(117277, 550)
df_train, df_test = train_test_split(train_cleaned, test_size=0.2)
We will only try to predict the traffic on 29 July 2015, given the previous four weeks of data.
features = list(pd.date_range(start="2015-07-01",
end="2015-07-28").strftime("%Y-%m-%d"))
target = "2015-07-29"
x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]
x_train_reshaped = x_train.values.reshape((len(x_train), len(x_train.columns), 1))
x_test_reshaped = x_test.values.reshape((len(x_test), len(x_train.columns), 1))
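The reshape gives the three-dimensional layout that Keras recurrent layers expect, namely (samples, timesteps, features). A quick sanity check (just a sketch printing the shapes) makes this explicit:

# Each series becomes 28 timesteps of a single feature.
print(x_train_reshaped.shape)  # (number of training series, 28, 1)
print(x_test_reshaped.shape)   # (number of test series, 28, 1)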
def get_model(x_train_reshaped):
    # A single 32-unit LSTM layer followed by a one-unit dense head, trained with an MAE loss.
    input_layer = tf.keras.layers.Input((x_train_reshaped.shape[1], x_train_reshaped.shape[2]))
    lstm_layer = tf.keras.layers.LSTM(32)(input_layer)
    dense_layer = tf.keras.layers.Dense(1, activation="relu")(lstm_layer)
    model = tf.keras.Model(inputs=input_layer, outputs=dense_layer)
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss="mae")
    return model
model = get_model(x_train_reshaped)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
restore_best_weights=True)
no_scaling_model_history = model.fit(x_train_reshaped, y_train, validation_split=0.1,
batch_size=128, epochs=500,
callbacks=early_stopping, verbose=0)
def visualize_loss(history, title):
    # Plot the training and validation loss curves stored in a Keras History object.
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    epochs = list(range(len(loss)))
    fig = go.Figure(data=[go.Scatter(x=epochs, y=loss, name="Training loss"),
                          go.Scatter(x=epochs, y=val_loss, name="Validation loss")])
    fig.update_layout(title=title,
                      xaxis_title="Epoch",
                      yaxis_title="Loss")
    fig.show()
visualize_loss(no_scaling_model_history,
               "Training of an LSTM model without scaling");
preds_test = model.predict(x_test_reshaped)[:, 0]
mean_absolute_error(preds_test, y_test)
505.49933319755337
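To put this error in perspective, a naive baseline that simply repeats the last observed day can be computed as follows (a quick sketch, not part of the original experiment):

# Hypothetical naive baseline: predict that 2015-07-29 equals 2015-07-28.
naive_preds = x_test["2015-07-28"]
mean_absolute_error(y_test, naive_preds)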
train_values = x_train.values.reshape(-1)
train_mean = np.mean(train_values)
train_std = np.std(train_values)
train_mean, train_std
(1357.7947531545633, 81776.07685299727)
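The standard deviation is huge compared to the mean because page views are heavily skewed: a few very popular pages dominate the distribution. A quick look at the percentiles (a sketch; the exact values depend on the split) illustrates why global standardization squashes most series into a tiny range around the mean:

# Most values are tiny compared to the global standard deviation.
np.percentile(train_values, [50, 90, 99, 100])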
x_train_scaled = (x_train_reshaped - train_mean) / train_std
y_train_scaled = (y_train - train_mean) / train_std
x_test_scaled = (x_test_reshaped - train_mean) / train_std
y_test_scaled = (y_test - train_mean) / train_std
model_global_scaling = get_model(x_train_scaled)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
restore_best_weights=True)
global_scaling_model_history = model_global_scaling.fit(x_train_scaled, y_train_scaled, validation_split=0.1,
                                                         batch_size=128, epochs=500,
                                                         callbacks=early_stopping, verbose=0)
visualize_loss(global_scaling_model_history,
               "Training of an LSTM model with global scaling");
preds_global_scaling_test = model_global_scaling.predict(x_test_scaled)
unscaled_preds_global_scaling_test = preds_global_scaling_test[:, 0] * train_std + train_mean
mean_absolute_error(unscaled_preds_global_scaling_test, y_test)
1359.9819542701377
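This MAE sits almost exactly at the training mean of 1357.79. With global scaling, the vast majority of targets end up slightly negative (most pages get far fewer views than the global mean), so the ReLU output likely collapses to zero and every unscaled prediction comes back as roughly the training mean. A quick check of the spread of the predictions (a sketch):

# If the model has collapsed, the unscaled predictions barely vary.
unscaled_preds_global_scaling_test.min(), unscaled_preds_global_scaling_test.max()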
preds_without_scaling_vs_real = pd.DataFrame({"predictions": preds_test,
"reality": y_test})
preds_with_global_scaling_vs_real = pd.DataFrame({"predictions": unscaled_preds_global_scaling_test,
"reality": y_test})
preds_vs_real = pd.concat([preds_without_scaling_vs_real.assign(model="model_without_scaling"),
preds_with_global_scaling_vs_real.assign(model="model_with_global_scaling")])
preds_vs_real.head()
| | predictions | reality | model |
---|---|---|---|
32325 | 13.045105 | 21.0 | model_without_scaling |
24056 | 226.384186 | 264.0 | model_without_scaling |
63499 | 37.388153 | 43.0 | model_without_scaling |
31182 | 10.135635 | 6.0 | model_without_scaling |
15368 | 4.617081 | 3.0 | model_without_scaling |
px.scatter(preds_vs_real, x="predictions", y="reality", color="model", log_x=True, log_y=True,
title="Comparison of predictions and real values")
df_test_with_predictions = x_test.assign(model_without_scaling=preds_test,
model_with_global_scaling=unscaled_preds_global_scaling_test)
sample = df_test_with_predictions.sample(1)
sample
| | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | 2015-07-10 | ... | 2015-07-21 | 2015-07-22 | 2015-07-23 | 2015-07-24 | 2015-07-25 | 2015-07-26 | 2015-07-27 | 2015-07-28 | model_without_scaling | model_with_global_scaling |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
30353 | 287.0 | 535.0 | 385.0 | 517.0 | 400.0 | 586.0 | 430.0 | 566.0 | 658.0 | 668.0 | ... | 1641.0 | 21004.0 | 3315.0 | 1259.0 | 889.0 | 1214.0 | 1745.0 | 535.0 | 473.044586 | 1357.7948 |
1 rows × 30 columns
fig = go.Figure(data=[go.Scatter(x=x_test.columns,
y=sample[x_test.columns].values[0],
name="Number of clicks"),
go.Scatter(x=[x_test.columns[-1]],
y=[sample["model_without_scaling"].values[0]],
name="Predictions with model not using scaling"),
go.Scatter(x=[x_test.columns[-1]],
y=[sample["model_with_global_scaling"].values[0]],
name="Predictions with model with global scaling")])
fig.update_layout(title="Visualization of models predictions on one time series",
xaxis_title="Day",
yaxis_title="Clicks")
df_train_features_target = df_train[features + [target]]
df_test_features_target = df_test[features + [target]]
df_train_means = df_train_features_target.mean(axis=1)
df_train_stds = df_train_features_target.std(axis=1).replace(0, 1)
df_test_means = df_test_features_target.mean(axis=1)
df_test_stds = df_test_features_target.std(axis=1).replace(0, 1)
df_train_scaled = df_train_features_target.assign(**{
f: (df_train_features_target[f] - df_train_means) / df_train_stds
for f in df_train_features_target.columns
})
df_test_scaled = df_test_features_target.assign(**{
f: (df_test_features_target[f] - df_test_means) / df_test_stds
for f in df_test_features_target.columns
})
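After this per-series standardization, every row is roughly zero-mean and unit-variance with respect to its own window, whatever the page's overall popularity. A quick sanity check (just a sketch):

# The per-row means of the scaled frame should be numerically close to zero.
df_train_scaled.mean(axis=1).abs().max()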
df_train_scaled.head()
| | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | 2015-07-10 | ... | 2015-07-20 | 2015-07-21 | 2015-07-22 | 2015-07-23 | 2015-07-24 | 2015-07-25 | 2015-07-26 | 2015-07-27 | 2015-07-28 | 2015-07-29 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
38067 | 0.187183 | -0.803384 | -1.556214 | -0.750553 | 0.306051 | -0.076968 | 1.626806 | 0.028692 | -0.288289 | -1.107157 | ... | 1.190957 | -0.803384 | -0.076968 | 0.490957 | -0.843006 | -0.024138 | -0.697723 | -0.433572 | 0.649447 | 0.266428 |
27418 | 3.898156 | 2.849669 | 0.832404 | -0.117979 | 0.121150 | 0.458383 | -0.062795 | -0.338713 | -0.105716 | -0.246740 | ... | -0.124110 | 0.065967 | -0.289661 | -0.259003 | -0.657551 | -0.761786 | -0.504263 | -0.553315 | -0.596236 | -0.350976 |
133770 | 1.120961 | -0.811939 | -0.636221 | -0.284785 | 0.418088 | 0.066652 | -0.811939 | -0.284785 | 0.066652 | 0.945243 | ... | 0.593806 | -0.284785 | 0.769524 | -0.636221 | -0.460503 | -0.109066 | -0.636221 | -0.284785 | -0.109066 | -0.987657 |
57046 | -0.535820 | -0.570976 | -0.496270 | 0.127742 | 4.922092 | 0.496876 | 0.053037 | -0.030458 | -0.065614 | -0.272153 | ... | -0.188658 | -0.324887 | -0.386409 | 0.145320 | -0.469904 | -0.500665 | -0.346859 | -0.430353 | -0.447931 | -0.513848 |
25480 | -1.071227 | -0.993867 | -1.358565 | -1.413822 | 0.343357 | 0.431768 | 1.006443 | 0.884877 | 0.100225 | -1.712211 | ... | 0.265997 | 1.216420 | 0.166534 | 0.221791 | -0.894404 | -0.872301 | 0.730157 | 0.376511 | 2.387873 | 0.122328 |
5 rows × 29 columns
x_train_scaled = df_train_scaled[features]
y_train_scaled = df_train_scaled[target]
x_test_scaled = df_test_scaled[features]
y_test_scaled = df_test_scaled[target]
x_train_scaled_reshaped = x_train_scaled.values.reshape((len(x_train), len(x_train.columns), 1))
x_test_scaled_reshaped = x_test_scaled.values.reshape((len(x_test), len(x_train.columns), 1))
model_scaled = get_model(x_train_scaled_reshaped)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
restore_best_weights=True)
scaled_model_history = model_scaled.fit(x_train_scaled_reshaped, y_train_scaled,
validation_split=0.1,
batch_size=128, epochs=500,
callbacks=early_stopping, verbose=0)
visualize_loss(scaled_model_history,
               "Training of an LSTM model with per-time-series scaling");
preds_scaled_test = model_scaled.predict(x_test_scaled_reshaped)
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * df_test_stds) + df_test_means
mean_absolute_error(unscaled_preds_scaled_test, y_test)
168.87094454272494
model_scaling_weights = get_model(x_train_scaled_reshaped)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20,
restore_best_weights=True)
scaled_model_with_weights_history = model_scaling_weights.fit(x_train_scaled_reshaped,
y_train_scaled,
validation_split=0.1,
batch_size=128, epochs=500,
sample_weight=df_train_stds,
callbacks=early_stopping, verbose=0)
visualize_loss(scaled_model_with_weights_history,
               "Training of an LSTM model with per-time-series scaling and sample weights");
preds_scaled_weighted_test = model_scaling_weights.predict(x_test_scaled_reshaped)
unscaled_preds_scaled_weighted_test = (preds_scaled_weighted_test[:, 0] * df_test_stds) + df_test_means
mean_absolute_error(unscaled_preds_scaled_weighted_test, y_test)
175.5151280040324
preds_with_window_scaling_vs_real = pd.DataFrame({"predictions": unscaled_preds_scaled_test,
"reality": y_test})
preds_with_window_scaling_weights_vs_real = pd.DataFrame({"predictions": unscaled_preds_scaled_weighted_test,
"reality": y_test})
preds_vs_real = pd.concat([preds_with_window_scaling_vs_real.assign(model="model_with_time_series_scaling"),
preds_with_window_scaling_weights_vs_real.assign(model="model_with_time_series_scaling_and_weights")])
preds_vs_real.head()
| | predictions | reality | model |
---|---|---|---|
32325 | 21.042154 | 21.0 | model_with_time_series_scaling |
24056 | 257.100567 | 264.0 | model_with_time_series_scaling |
63499 | 44.259588 | 43.0 | model_with_time_series_scaling |
31182 | 9.689655 | 6.0 | model_with_time_series_scaling |
15368 | 4.827586 | 3.0 | model_with_time_series_scaling |
px.scatter(preds_vs_real, x="predictions", y="reality", color="model", log_x=True, log_y=True,
title="Comparison of predictions and real values")
df_test_with_predictions = x_test.assign(model_without_scaling=preds_test,
model_with_global_scaling=unscaled_preds_global_scaling_test,
model_with_window_scaling=unscaled_preds_scaled_test,
model_with_window_scaling_and_weights=unscaled_preds_scaled_weighted_test)
df_test_with_predictions.head()
| | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | 2015-07-10 | ... | 2015-07-23 | 2015-07-24 | 2015-07-25 | 2015-07-26 | 2015-07-27 | 2015-07-28 | model_without_scaling | model_with_global_scaling | model_with_window_scaling | model_with_window_scaling_and_weights |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
32325 | 5.0 | 17.0 | 28.0 | 13.0 | 8.0 | 13.0 | 31.0 | 16.0 | 21.0 | 11.0 | ... | 50.0 | 29.0 | 27.0 | 31.0 | 14.0 | 9.0 | 13.045105 | 1357.7948 | 21.042154 | 20.809841 |
24056 | 204.0 | 208.0 | 186.0 | 270.0 | 264.0 | 223.0 | 198.0 | 193.0 | 206.0 | 164.0 | ... | 258.0 | 249.0 | 183.0 | 540.0 | 318.0 | 251.0 | 226.384186 | 1357.7948 | 257.100567 | 262.121880 |
63499 | 27.0 | 34.0 | 29.0 | 40.0 | 42.0 | 41.0 | 62.0 | 39.0 | 47.0 | 47.0 | ... | 34.0 | 52.0 | 42.0 | 61.0 | 48.0 | 28.0 | 37.388153 | 1357.7948 | 44.259588 | 42.400217 |
31182 | 9.0 | 9.0 | 5.0 | 12.0 | 8.0 | 7.0 | 8.0 | 12.0 | 12.0 | 13.0 | ... | 6.0 | 8.0 | 9.0 | 7.0 | 9.0 | 13.0 | 10.135635 | 1357.7948 | 9.689655 | 9.689655 |
15368 | 7.0 | 5.0 | 8.0 | 7.0 | 4.0 | 4.0 | 3.0 | 7.0 | 8.0 | 5.0 | ... | 3.0 | 7.0 | 2.0 | 2.0 | 3.0 | 2.0 | 4.617081 | 1357.7948 | 4.827586 | 4.827586 |
5 rows × 32 columns
sample = df_test_with_predictions.sample(1)
sample[["model_with_window_scaling", "model_with_window_scaling_and_weights"]]
| | model_with_window_scaling | model_with_window_scaling_and_weights |
---|---|---|
138836 | 698.461308 | 700.278395 |
fig = go.Figure(data=[go.Scatter(x=x_test.columns,
y=sample[x_test.columns].values[0],
name="Number of clicks"),
go.Scatter(x=[x_test.columns[-1]],
y=[sample["model_with_window_scaling"].values[0]],
name="Predictions with model with window scaling"),
go.Scatter(x=[x_test.columns[-1]],
y=[sample["model_with_window_scaling_and_weights"].values[0]],
name="Predictions with model with window scaling and weights")])
fig.update_layout(title="Visualization of models predictions on one time series",
xaxis_title="Day",
yaxis_title="Clicks")