import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
# Build a 30-day toy dataset: one row per day of Jan 2019, a constant
# string dummy feature, and a constant binary label to predict.
date_index = pd.date_range(start='2019-01-01', end='2019-01-30', freq='1D')
X = pd.DataFrame(date_index, columns=['date'])
X['dummy'] = 'a'
X['label'] = 1
# pop() extracts the target series AND drops the column from the feature
# frame in one step (replaces the separate `y = X['label']` / `del` pair).
y = X.pop('label')
X.head()
date | dummy | |
---|---|---|
0 | 2019-01-01 | a |
1 | 2019-01-02 | a |
2 | 2019-01-03 | a |
3 | 2019-01-04 | a |
4 | 2019-01-05 | a |
# Proceed on the assumption that the data is already sorted by date
# (TimeSeriesSplit relies purely on row order, not on the date values).
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index with the fold arrays directly via .iloc: the original
    # positional slicing (X[first:last+1]) only works because the fold
    # indices happen to be contiguous and the frame has a default
    # RangeIndex — .iloc[indices] is correct unconditionally.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print("X_train")
    print(X_train.head(3))
    print()
    print("X_test")
    print(X_test.head(3))
TRAIN: [0 1 2 3 4] TEST: [5 6 7 8 9] X_train date dummy date_encoding 0 0 0 0 1 1 0 1 2 2 0 2 X_test date dummy date_encoding 5 5 0 5 6 6 0 6 7 7 0 7 TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11 12 13 14] X_train date dummy date_encoding 0 0 0 0 1 1 0 1 2 2 0 2 X_test date dummy date_encoding 10 10 0 10 11 11 0 11 12 12 0 12 TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] TEST: [15 16 17 18 19] X_train date dummy date_encoding 0 0 0 0 1 1 0 1 2 2 0 2 X_test date dummy date_encoding 15 15 0 15 16 16 0 16 17 17 0 17 TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21 22 23 24] X_train date dummy date_encoding 0 0 0 0 1 1 0 1 2 2 0 2 X_test date dummy date_encoding 20 20 0 20 21 21 0 21 22 22 0 22 TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] TEST: [25 26 27 28 29] X_train date dummy date_encoding 0 0 0 0 1 1 0 1 2 2 0 2 X_test date dummy date_encoding 25 25 0 25 26 26 0 26 27 27 0 27
# Same walk-forward split with fewer, larger folds (3 splits over 30 rows).
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # .iloc with the index arrays replaces the fragile positional
    # slicing — robust to non-default or non-contiguous indexes.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [ 9 10 11 12 13 14 15] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] TEST: [16 17 18 19 20 21 22] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23 24 25 26 27 28 29]
# Same walk-forward split with many small folds (10 splits -> 2-row tests).
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # .iloc with the index arrays replaces the fragile positional
    # slicing — robust to non-default or non-contiguous indexes.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11] TEST: [12 13] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13] TEST: [14 15] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] TEST: [16 17] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18 19] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22 23] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] TEST: [26 27] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27] TEST: [28 29]
# Encoder that will map each unique date to an integer code (0..29 here).
# NOTE(review): LabelEncoder is documented for *target* labels; for a
# feature column sklearn recommends OrdinalEncoder — result is the same
# for this single column, but consider switching.
le = LabelEncoder()
X.head(2)
date | dummy | |
---|---|---|
0 | 2019-01-01 | a |
1 | 2019-01-02 | a |
le.fit(X['date'])
LabelEncoder()
# Replace the raw dates with their learned integer codes, and zero out
# the constant dummy column so the whole frame is numeric (XGBoost
# cannot consume datetime or string columns directly).
X['date'] = le.transform(X['date'])
X['dummy'] = 0
X.head(2)
date | dummy | date_encoding | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 1 | 0 | 1 |
# Walk-forward CV training: fit one XGBoost model per fold and collect
# its test-fold predictions for later averaging.
xgb_preds = []
tscv = TimeSeriesSplit(n_splits=4)
# NOTE(review): this parameter dict is never passed to the model below —
# XGBRegressor runs with its defaults. Hoisted out of the loop (it is
# loop-invariant); pass it as XGBRegressor(**xgb_params, ...) if it was
# actually intended to take effect. Also note 'silent' is deprecated in
# recent xgboost releases ('verbosity' replaces it).
xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9,
              'colsample_bytree': 0.9, 'objective': 'binary:logistic',
              'eval_metric': 'auc', 'seed': 99, 'silent': True}
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # .iloc with the fold index arrays is correct regardless of the
    # frame's index, unlike the original positional slicing.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(len(X_train), len(y_train), len(X_test))
    xgb_regressor = xgb.XGBRegressor(n_estimators=1000)
    xgb_model = xgb_regressor.fit(X_train, y_train, verbose=False)
    xgb_pred = xgb_model.predict(X_test)
    xgb_preds.append(list(xgb_pred))
    # print('cv', cross_val_score(xgb_model, X_train, y_train, cv=tscv, scoring='accuracy'))
TRAIN: [0 1 2 3 4 5] TEST: [ 6 7 8 9 10 11] 6 6 6 TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11] TEST: [12 13 14 15 16 17] 12 12 6 TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18 19 20 21 22 23] 18 18 6 TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25 26 27 28 29] 24 24 6
# Element-wise average of the per-fold predictions: zip(*xgb_preds)
# pairs up the i-th prediction from every fold. This replaces the
# original nested index loop, which shadowed the builtin `sum` and
# hard-coded the fold count at 4.
n_folds = len(xgb_preds)
preds = [sum(fold_vals) / n_folds for fold_vals in zip(*xgb_preds)]
output = pd.DataFrame({'id': 'unknown', 'target': preds})
output
id | target | |
---|---|---|
0 | unknown | 1.0 |
1 | unknown | 1.0 |
2 | unknown | 1.0 |
3 | unknown | 1.0 |
4 | unknown | 1.0 |
5 | unknown | 1.0 |