Note: every network here is trained for only one epoch to save compute time. Increase the number of epochs for more accurate results.
import pandas as pd
import numpy as np
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the Wikipedia page-view training data; missing counts become 0.
# Wide format: one 'Page' column plus one column per date.
train = pd.read_csv('../input/train_1.csv').fillna(0)
# Peek at the first rows.
train.head()
Page | 2015-07-01 | 2015-07-02 | 2015-07-03 | 2015-07-04 | 2015-07-05 | 2015-07-06 | 2015-07-07 | 2015-07-08 | 2015-07-09 | ... | 2016-12-22 | 2016-12-23 | 2016-12-24 | 2016-12-25 | 2016-12-26 | 2016-12-27 | 2016-12-28 | 2016-12-29 | 2016-12-30 | 2016-12-31 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2NE1_zh.wikipedia.org_all-access_spider | 18.0 | 11.0 | 5.0 | 13.0 | 14.0 | 9.0 | 9.0 | 22.0 | 26.0 | ... | 32.0 | 63.0 | 15.0 | 26.0 | 14.0 | 20.0 | 22.0 | 19.0 | 18.0 | 20.0 |
1 | 2PM_zh.wikipedia.org_all-access_spider | 11.0 | 14.0 | 15.0 | 18.0 | 11.0 | 13.0 | 22.0 | 11.0 | 10.0 | ... | 17.0 | 42.0 | 28.0 | 15.0 | 9.0 | 30.0 | 52.0 | 45.0 | 26.0 | 20.0 |
2 | 3C_zh.wikipedia.org_all-access_spider | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 | 4.0 | ... | 3.0 | 1.0 | 1.0 | 7.0 | 4.0 | 4.0 | 6.0 | 3.0 | 4.0 | 17.0 |
3 | 4minute_zh.wikipedia.org_all-access_spider | 35.0 | 13.0 | 10.0 | 94.0 | 4.0 | 26.0 | 14.0 | 9.0 | 11.0 | ... | 32.0 | 10.0 | 26.0 | 27.0 | 16.0 | 11.0 | 17.0 | 19.0 | 10.0 | 11.0 |
4 | 52_Hz_I_Love_You_zh.wikipedia.org_all-access_s... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 48.0 | 9.0 | 25.0 | 13.0 | 3.0 | 11.0 | 27.0 | 13.0 | 36.0 | 10.0 |
5 rows × 551 columns
def parse_page(page):
    """Split a raw Wikipedia page identifier into its components.

    The raw string has the form ``<title>_<subdomain>_<access>_<agent>``,
    where the title itself may contain underscores.

    :param page: raw ``Page`` string from the dataset
    :return: tuple of (title with spaces, sub-page/domain, access, agent)
    """
    *title_parts, sub_page, access, agent = page.split('_')
    return ' '.join(title_parts), sub_page, access, agent
# Parse every raw Page identifier into its four components.
l = list(train.Page.apply(parse_page))
df = pd.DataFrame(l)
# Free the intermediate list — the dataset is large.
del l
df.columns = ['Subject','Sub_Page','Access','Agent']
df.head()
Subject | Sub_Page | Access | Agent | |
---|---|---|---|---|
0 | 2NE1 | zh.wikipedia.org | all-access | spider |
1 | 2PM | zh.wikipedia.org | all-access | spider |
2 | 3C | zh.wikipedia.org | all-access | spider |
3 | 4minute | zh.wikipedia.org | all-access | spider |
4 | 52 Hz I Love You | zh.wikipedia.org | all-access | spider |
# Attach the parsed metadata as 4 new columns and drop the raw Page column.
train = pd.concat([train,df],axis=1)
del train['Page']
del df
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
def lag_arr(arr, lag, fill):
    """Shift a batch of series right by ``lag`` steps along the time axis.

    The first ``lag`` positions are padded with ``fill`` and the result is
    truncated to the original length, so the output at time ``t`` holds the
    input value from time ``t - lag``.

    :param arr: array of shape (batch, time, 1)
    :param lag: number of time steps to shift by
    :param fill: value used to pad the first ``lag`` positions
    :return: array with the same shape as ``arr``
    """
    # Bug fix: the pad value was hard-coded to -1, silently ignoring ``fill``.
    filler = np.full((arr.shape[0], lag, 1), fill)
    comb = np.concatenate((filler, arr), axis=1)
    # Drop the overhanging tail so the time dimension stays unchanged.
    return comb[:, :arr.shape[1]]
def single_autocorr(series, lag):
    """
    Autocorrelation for single data series

    :param series: traffic series (1-D array)
    :param lag: lag, days
    :return: Pearson autocorrelation at ``lag`` (0 if either slice is constant)
    """
    s1 = series[lag:]
    s2 = series[:-lag]
    ds1 = s1 - np.mean(s1)
    ds2 = s2 - np.mean(s2)
    divider = np.sqrt(np.sum(ds1 * ds1)) * np.sqrt(np.sum(ds2 * ds2))
    # Guard against constant slices, which would divide by zero.
    return np.sum(ds1 * ds2) / divider if divider != 0 else 0


def batc_autocorr(data, lag, series_length):
    """Per-row autocorrelation, tiled along the time axis.

    :param data: array of shape (batch, time)
    :param lag: lag in days, forwarded to ``single_autocorr``
    :param series_length: length of the time axis in the output
    :return: array of shape (batch, series_length, 1) where each row holds
             that series' autocorrelation repeated ``series_length`` times
    """
    # Bug fix: each row's own series must be passed to single_autocorr;
    # previously the whole 2-D batch was passed for every i, so every row
    # received the same meaningless value.
    corrs = [single_autocorr(data[i], lag) for i in range(data.shape[0])]
    corr = np.array(corrs).reshape(-1, 1)
    corr = np.expand_dims(corr, -1)        # (batch, 1, 1)
    corr = np.repeat(corr, series_length, axis=1)  # (batch, series_length, 1)
    return corr
# Sanity check: the first column header parses as a date (prints 'Wed' etc.).
datetime.datetime.strptime(train.columns.values[0], '%Y-%m-%d').strftime('%a')
# Weekday name for every date column; the last 4 columns are the parsed
# Subject/Sub_Page/Access/Agent metadata and are excluded.
weekdays = [datetime.datetime.strptime(date, '%Y-%m-%d').strftime('%a')
for date in train.columns.values[:-4]]
# Integer-encode then one-hot-encode the weekdays; final shape (1, n_days, 7)
# so the leading axis can be broadcast/repeated over batch rows later.
day_one_hot = LabelEncoder().fit_transform(weekdays)
day_one_hot = day_one_hot.reshape(-1, 1)
# NOTE(review): `sparse=False` was renamed `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — confirm the pinned sklearn version.
day_one_hot = OneHotEncoder(sparse=False).fit_transform(day_one_hot)
day_one_hot = np.expand_dims(day_one_hot,0)
# Fit a LabelEncoder + OneHotEncoder pair for each categorical page
# attribute. The fitted encoders are module-level and reused by get_batch;
# the intermediate integer arrays are deleted to save memory.
agent_int = LabelEncoder().fit(train['Agent'])
agent_enc = agent_int.transform(train['Agent'])
agent_enc = agent_enc.reshape(-1, 1)
agent_one_hot = OneHotEncoder(sparse=False).fit(agent_enc)
del agent_enc
page_int = LabelEncoder().fit(train['Sub_Page'])
page_enc = page_int.transform(train['Sub_Page'])
page_enc = page_enc.reshape(-1, 1)
page_one_hot = OneHotEncoder(sparse=False).fit(page_enc)
del page_enc
acc_int = LabelEncoder().fit(train['Access'])
acc_enc = acc_int.transform(train['Access'])
acc_enc = acc_enc.reshape(-1, 1)
acc_one_hot = OneHotEncoder(sparse=False).fit(acc_enc)
del acc_enc
def get_batch(train,start=0,lookback = 100):
    """Build one (features, target) batch from the wide dataframe.

    Relies on the module-level fitted encoders (``day_one_hot``,
    ``agent_int``/``agent_one_hot``, ``page_int``/``page_one_hot``,
    ``acc_int``/``acc_one_hot``) and on ``lag_arr``/``batc_autocorr``.

    :param train: dataframe slice; date columns first, then the 4
        metadata columns Subject/Sub_Page/Access/Agent
    :param start: index of the first date column of the lookback window
    :param lookback: number of days in the input window
    :return: (batch, target); batch has shape (rows, lookback, n_features),
        target is log1p of the day immediately after the window
    """
    # The last 5 columns = 4 metadata columns + 1 day reserved as target.
    assert((start + lookback) <= (train.shape[1] - 5)) , 'End of lookback would be out of bounds'
    data = train.iloc[:,start:start + lookback].values
    target = train.iloc[:,start + lookback].values
    target = np.log1p(target)
    # Log-scale the raw views and append a trailing channel axis.
    log_view = np.log1p(data)
    log_view = np.expand_dims(log_view,axis=-1)
    # Weekday one-hots for this window, repeated for every batch row.
    days = day_one_hot[:,start:start + lookback]
    days = np.repeat(days,repeats=train.shape[0],axis=0)
    # Lagged copies of the log series at roughly 1 year / 6 months / 3 months.
    # NOTE(review): with lookback < lag the lagged feature is entirely the
    # fill value (-1) — confirm this is intended.
    year_lag = lag_arr(log_view,365,-1)
    halfyear_lag = lag_arr(log_view,182,-1)
    quarter_lag = lag_arr(log_view,91,-1)
    # One-hot agent metadata, tiled across the time axis.
    agent_enc = agent_int.transform(train['Agent'])
    agent_enc = agent_enc.reshape(-1, 1)
    agent_enc = agent_one_hot.transform(agent_enc)
    agent_enc = np.expand_dims(agent_enc,1)
    agent_enc = np.repeat(agent_enc,lookback,axis=1)
    # One-hot sub-page (domain) metadata, tiled across the time axis.
    page_enc = page_int.transform(train['Sub_Page'])
    page_enc = page_enc.reshape(-1, 1)
    page_enc = page_one_hot.transform(page_enc)
    page_enc = np.expand_dims(page_enc, 1)
    page_enc = np.repeat(page_enc,lookback,axis=1)
    # One-hot access-type metadata, tiled across the time axis.
    acc_enc = acc_int.transform(train['Access'])
    acc_enc = acc_enc.reshape(-1, 1)
    acc_enc = acc_one_hot.transform(acc_enc)
    acc_enc = np.expand_dims(acc_enc,1)
    acc_enc = np.repeat(acc_enc,lookback,axis=1)
    # Seasonal autocorrelation features: one scalar per row, tiled over time.
    year_autocorr = batc_autocorr(data,lag=365,series_length=lookback)
    halfyr_autocorr = batc_autocorr(data,lag=182,series_length=lookback)
    quarter_autocorr = batc_autocorr(data,lag=91,series_length=lookback)
    # Per-row median of the raw window, tiled over time.
    medians = np.median(data,axis=1)
    medians = np.expand_dims(medians,-1)
    medians = np.expand_dims(medians,-1)
    medians = np.repeat(medians,lookback,axis=1)
    # Debug shape printout, kept disabled (string literal is a no-op):
    '''
    print(log_view.shape)
    print(days.shape)
    print(year_lag.shape)
    print(halfyear_lag.shape)
    print(page_enc.shape)
    print(agent_enc.shape)
    print(acc_enc.shape)'''
    # Stack every feature along the channel (last) axis.
    batch = np.concatenate((log_view,
                            days,
                            year_lag,
                            halfyear_lag,
                            quarter_lag,
                            page_enc,
                            agent_enc,
                            acc_enc,
                            year_autocorr,
                            halfyr_autocorr,
                            quarter_autocorr,
                            medians),axis=2)
    return batch, target
def generate_batches(train, batch_size=32, lookback=100):
    """Endlessly yield (X, y) batches for Keras' ``*_generator`` methods.

    Rows are taken in fixed consecutive chunks of ``batch_size``; the start
    of the lookback window is drawn at random for every batch.

    :param train: training dataframe (rows = pages; date columns first,
        then 4 metadata columns)
    :param batch_size: number of rows per batch
    :param lookback: number of days in each input window
    """
    num_samples = train.shape[0]
    # Usable date columns: total minus 4 metadata columns and 1 target day.
    num_steps = train.shape[1] - 5
    while True:
        for i in range(num_samples // batch_size):
            batch_start = i * batch_size
            batch_end = batch_start + batch_size
            # Random window start so each pass sees different time slices.
            seq_start = np.random.randint(num_steps - lookback)
            # Bug fix: forward ``lookback`` — it was silently ignored and
            # get_batch always fell back to its own default of 100.
            X, y = get_batch(train.iloc[batch_start:batch_end],
                             start=seq_start, lookback=lookback)
            yield X, y
from keras.models import Sequential
from keras.layers import Conv1D, MaxPool1D, Dense, Activation, GlobalMaxPool1D, Flatten
/opt/conda/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. from ._conv import register_converters as _register_converters Using TensorFlow backend.
# Input window length and per-day feature count. n_features must equal the
# channel count get_batch concatenates (29 here — TODO confirm against
# next(train_gen)[0].shape before changing any feature).
max_len = 100
n_features = 29
# Baseline 1-D convnet: two conv/ReLU/pool stages, then a linear head
# predicting the (log1p-scaled) next-day traffic.
model = Sequential()
model.add(Conv1D(16,5, input_shape=(max_len,n_features)))
model.add(Activation('relu'))
model.add(MaxPool1D(5))
model.add(Conv1D(16,5))
model.add(Activation('relu'))
model.add(MaxPool1D(5))
model.add(Flatten())
model.add(Dense(1))
model.compile(optimizer='adam',loss='mean_absolute_percentage_error')
from sklearn.model_selection import train_test_split
batch_size = 128
# 90/10 split over pages (rows), not over time.
train_df, val_df = train_test_split(train, test_size=0.1)
train_gen = generate_batches(train_df,batch_size=batch_size)
val_gen = generate_batches(val_df, batch_size=batch_size)
n_train_samples = train_df.shape[0]
n_val_samples = val_df.shape[0]
# Smoke-test the generator once before training.
a,b = next(train_gen)
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice. out=out, **kwargs) /opt/conda/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
# Train the convnet for a single epoch (increase epochs for real results).
# NOTE(review): fit_generator is deprecated in TF2 Keras — model.fit accepts
# generators directly there. Confirm the Keras version before upgrading.
model.fit_generator(train_gen,
                    epochs=1,
                    steps_per_epoch=n_train_samples // batch_size,
                    validation_data= val_gen,
                    validation_steps=n_val_samples // batch_size)
Epoch 1/1
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice. out=out, **kwargs) /opt/conda/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
1019/1019 [==============================] - 52s 51ms/step - loss: 431502.3788 - val_loss: 107665.1747
<keras.callbacks.History at 0x7f0803fe7630>
from keras.layers import SimpleRNN
# Single-layer vanilla RNN baseline with the same linear head and loss.
model = Sequential()
model.add(SimpleRNN(16,input_shape=(max_len,n_features)))
model.add(Dense(1))
model.compile(optimizer='adam',loss='mean_absolute_percentage_error')
model.fit_generator(train_gen,
                    epochs=1,
                    steps_per_epoch=n_train_samples // batch_size,
                    validation_data= val_gen,
                    validation_steps=n_val_samples // batch_size)
Epoch 1/1
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice. out=out, **kwargs) /opt/conda/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
1019/1019 [==============================] - 82s 81ms/step - loss: 4612880.6155 - val_loss: 973000.9942
<keras.callbacks.History at 0x7f0802940278>
from keras.layers import SimpleRNN
# Three stacked vanilla RNN layers. The intermediate layers must set
# return_sequences=True so the next RNN receives one vector per time step;
# only the final RNN collapses the sequence to a single vector.
model = Sequential()
model.add(SimpleRNN(32,return_sequences=True,input_shape=(max_len,n_features)))
model.add(SimpleRNN(16, return_sequences = True))
model.add(SimpleRNN(16))
model.add(Dense(1))
model.compile(optimizer='adam',loss='mean_absolute_percentage_error')
model.fit_generator(train_gen,
                    epochs=1,
                    steps_per_epoch=n_train_samples // batch_size,
                    validation_data= val_gen,
                    validation_steps=n_val_samples // batch_size)
Epoch 1/1
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice. out=out, **kwargs) /opt/conda/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
1019/1019 [==============================] - 171s 168ms/step - loss: 3398789.7302 - val_loss: 1034468.5364
<keras.callbacks.History at 0x7f0802940198>
from keras.layers import CuDNNLSTM
# GPU-only fused LSTM implementation — much faster than the plain LSTM
# layer, but it does not support recurrent dropout.
model = Sequential()
model.add(CuDNNLSTM(16,input_shape=(max_len,n_features)))
model.add(Dense(1))
WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version. Instructions for updating: Use the retry module or similar alternatives.
# Compile and train the CuDNN LSTM for one epoch, same setup as above.
model.compile(optimizer='adam',loss='mean_absolute_percentage_error')
model.fit_generator(train_gen,
                    epochs=1,
                    steps_per_epoch=n_train_samples // batch_size,
                    validation_data= val_gen,
                    validation_steps=n_val_samples // batch_size)
Epoch 1/1
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice. out=out, **kwargs) /opt/conda/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
1019/1019 [==============================] - 52s 51ms/step - loss: 918992.5830 - val_loss: 283646.8184
<keras.callbacks.History at 0x7f0728f436d8>
from keras.layers import LSTM
# Two stacked LSTMs with recurrent dropout (dropout on the recurrent
# connections, applied consistently across time steps). The first layer
# returns full sequences to feed the second.
model = Sequential()
model.add(LSTM(16,
               recurrent_dropout=0.1,
               return_sequences=True,
               input_shape=(max_len,n_features)))
model.add(LSTM(16,recurrent_dropout=0.1))
model.add(Dense(1))
model.compile(optimizer='adam',loss='mean_absolute_percentage_error')
model.fit_generator(train_gen,
                    epochs=1,
                    steps_per_epoch=n_train_samples // batch_size,
                    validation_data= val_gen,
                    validation_steps=n_val_samples // batch_size)
Epoch 1/1
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2957: RuntimeWarning: Mean of empty slice. out=out, **kwargs) /opt/conda/lib/python3.6/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
1019/1019 [==============================] - 443s 435ms/step - loss: 516063.6124 - val_loss: 87626.7314
<keras.callbacks.History at 0x7f0726750898>