import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold as skKFold
class KFold():
    """K-fold cross-validation splitter yielding train/test index arrays.

    The samples are cut into ``n_splits`` consecutive folds (after an
    optional shuffle).  Each fold is used once as the test set while the
    remaining samples form the training set.  The first
    ``n_samples % n_splits`` folds contain one extra sample so that every
    sample lands in exactly one test fold.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds.  Must be an integer >= 2.
    shuffle : bool, default=False
        Whether to shuffle the sample indices before cutting them into folds.
    random_state : int or None, default=0
        Seed handed to ``np.random.RandomState``; only used when ``shuffle``
        is true.

    Raises
    ------
    ValueError
        If ``n_splits`` is not an integer >= 2.
    """

    def __init__(self, n_splits=5, shuffle=False, random_state=0):
        # A single fold would leave the training set empty, and zero folds
        # would later fail with an unhelpful ZeroDivisionError.
        if not isinstance(n_splits, (int, np.integer)) or n_splits < 2:
            raise ValueError(
                "n_splits must be an integer >= 2, got n_splits={0!r}."
                .format(n_splits)
            )
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    @staticmethod
    def _num_samples(X):
        """Return the number of samples (length of the first axis) of ``X``."""
        shape = getattr(X, "shape", None)
        if shape is not None and len(shape) > 0:
            return shape[0]
        # Plain sequences (lists, tuples) have no ``shape`` attribute.
        return len(X)

    def _iter_test_indices(self, X, y=None):
        """Yield the test indices of each fold as an integer array.

        ``y`` is accepted for signature compatibility and is not used.

        Raises
        ------
        ValueError
            If there are fewer samples than folds.
        """
        n_samples = self._num_samples(X)
        if self.n_splits > n_samples:
            raise ValueError(
                "Cannot have number of splits n_splits={0} greater than the "
                "number of samples: n_samples={1}."
                .format(self.n_splits, n_samples)
            )
        indices = np.arange(n_samples)
        if self.shuffle:
            rng = np.random.RandomState(self.random_state)
            rng.shuffle(indices)
        # Spread the remainder over the leading folds, one extra sample each.
        fold_sizes = np.full(self.n_splits, n_samples // self.n_splits,
                             dtype=int)
        fold_sizes[:n_samples % self.n_splits] += 1
        current = 0
        for fold_size in fold_sizes:
            yield indices[current:current + fold_size]
            current += fold_size

    def _iter_test_masks(self, X, y=None):
        """Yield one boolean mask per fold, True at the test positions."""
        n_samples = self._num_samples(X)
        for test_index in self._iter_test_indices(X, y):
            test_mask = np.zeros(n_samples, dtype=bool)
            test_mask[test_index] = True
            yield test_mask

    def split(self, X, y=None):
        """Yield ``(train_indices, test_indices)`` for every fold.

        Parameters
        ----------
        X : array-like
            Data to split; only its number of samples is used.
        y : object, optional
            Ignored; accepted for signature compatibility.

        Yields
        ------
        train : ndarray of int
            Indices of the training samples, in ascending order.
        test : ndarray of int
            Indices of the test samples, in ascending order (the boolean
            mask discards the shuffle order inside a fold).
        """
        indices = np.arange(self._num_samples(X))
        for test_mask in self._iter_test_masks(X, y):
            yield indices[~test_mask], indices[test_mask]
# Self-check: the local KFold must produce exactly the same folds as
# scikit-learn's KFold, both without and with shuffling.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this script
# needs an older scikit-learn release to run.

# Check 1: consecutive (unshuffled) folds.
X, y = load_boston(return_X_y=True)
cv1 = KFold(n_splits=5)
cv2 = skKFold(n_splits=5)
splits1 = list(cv1.split(X, y))
splits2 = list(cv2.split(X, y))
# zip() stops at the shorter input, so compare the fold counts first;
# otherwise a missing fold would go unnoticed.
assert len(splits1) == len(splits2), "fold count differs (no shuffle)"
for (train1, test1), (train2, test2) in zip(splits1, splits2):
    assert np.array_equal(train1, train2), "train indices differ (no shuffle)"
    assert np.array_equal(test1, test2), "test indices differ (no shuffle)"

# Check 2: shuffled folds with a fixed seed on both sides.
X, y = load_boston(return_X_y=True)
cv1 = KFold(n_splits=5, shuffle=True, random_state=0)
cv2 = skKFold(n_splits=5, shuffle=True, random_state=0)
splits1 = list(cv1.split(X, y))
splits2 = list(cv2.split(X, y))
assert len(splits1) == len(splits2), "fold count differs (shuffle)"
for (train1, test1), (train2, test2) in zip(splits1, splits2):
    assert np.array_equal(train1, train2), "train indices differ (shuffle)"
    assert np.array_equal(test1, test2), "test indices differ (shuffle)"