#!/usr/bin/env python
# coding: utf-8

# ## Sliding-window holdout
# - Unlike plain K-fold, the training window accumulates over time
#   (each fold trains on everything before its test window).
# - Mainly used for time-series data.
# - Reference:
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Build a toy daily time series: one row per day, constant dummy feature
# and a constant label column that becomes the target `y`.
date_index = pd.date_range(start='2019-01-01', end='2019-01-30', freq='1D')
X = pd.DataFrame(date_index, columns=['date'])
X['dummy'] = 'a'
X['label'] = 1

y = X['label']
del X['label']

X.head()


def _show_time_series_splits(n_splits, show_heads=False):
    """Print the expanding-window train/test positional indices for each fold.

    Assumes the module-level X/y are already sorted by date.
    `show_heads=True` additionally prints the first rows of each split,
    matching the original notebook's first demo cell.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        # TimeSeriesSplit yields contiguous positional index arrays, so
        # .iloc[indices] is equivalent to the original
        # X[train_index[0]:test_index[0]] / X[test_index[0]:test_index[-1]+1]
        # start/stop slicing, without the off-by-one bookkeeping.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        if show_heads:
            print("X_train")
            print(X_train.head(3))
            print()
            print("X_test")
            print(X_test.head(3))


# Demo: the train window grows while the test window slides forward.
_show_time_series_splits(5, show_heads=True)
_show_time_series_splits(3)
_show_time_series_splits(10)

# ## Simple Modeling

# Encode the datetime column to integers so the model can consume it,
# and zero out the constant string dummy feature.
le = LabelEncoder()
X.head(2)
le.fit(X['date'])

X['date'] = le.transform(X['date'])
X['dummy'] = 0

X.head(2)

# Fit one model per expanding-window fold; collect its test predictions.
xgb_preds = []
tscv = TimeSeriesSplit(n_splits=4)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(len(X_train), len(y_train), len(X_test))
    # NOTE(review): the original cell built an xgb_params dict
    # ({'eta': 0.02, 'max_depth': 4, ..., 'objective': 'binary:logistic',
    #   'silent': True}) but never passed it anywhere — the regressor below
    # trains with library defaults. If those params were intended, wire them
    # in via xgb.XGBRegressor(n_estimators=1000, **params) (and replace the
    # deprecated 'silent' with 'verbosity'). Kept unused-dict removed here so
    # the code matches what actually runs.
    xgb_regressor = xgb.XGBRegressor(n_estimators=1000)
    xgb_model = xgb_regressor.fit(X_train, y_train, verbose=False)
    xgb_pred = xgb_model.predict(X_test)
    xgb_preds.append(list(xgb_pred))
    # print('cv', cross_val_score(xgb_model, X_train, y_train, cv=tscv, scoring='accuracy'))

# Average the per-fold predictions position-wise. With 30 rows and
# n_splits=4 every fold's test window has the same length, so zip pairs
# them up exactly; dividing by len(xgb_preds) (instead of a hard-coded 4)
# stays correct if n_splits changes. The original shadowed the builtin
# `sum` with a local accumulator — avoided here.
n_folds = len(xgb_preds)
preds = [sum(fold_values) / n_folds for fold_values in zip(*xgb_preds)]

output = pd.DataFrame({'id': 'unknown', 'target': preds})

output