Notebook

In [1]:

import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold as skKFold

In [2]:

class KFold():
    """Minimal K-fold cross-validation splitter.

    The samples are partitioned into ``n_splits`` consecutive folds. The
    first ``n_samples % n_splits`` folds get one extra sample so that every
    sample lands in exactly one test fold.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be an integer >= 2.
    shuffle : bool, default=False
        If true, sample indices are shuffled before being cut into folds.
    random_state : int or None, default=0
        Seed passed to ``np.random.RandomState`` when ``shuffle`` is true.

    Raises
    ------
    ValueError
        If ``n_splits`` is not an integer or is smaller than 2.
    """

    def __init__(self, n_splits=5, shuffle=False, random_state=0):
        if not isinstance(n_splits, (int, np.integer)):
            raise ValueError(
                "n_splits must be an integer, got %r (type %s)"
                % (n_splits, type(n_splits).__name__)
            )
        if n_splits < 2:
            raise ValueError(
                "k-fold cross-validation requires n_splits >= 2, got %r"
                % (n_splits,)
            )
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    @staticmethod
    def _num_samples(X):
        """Return the number of samples in ``X`` (array-like or sequence)."""
        return X.shape[0] if hasattr(X, "shape") else len(X)

    def _iter_test_indices(self, X, y=None):
        """Yield the test indices of each fold, in fold order.

        ``y`` is accepted for signature compatibility and is not used.
        """
        n_samples = self._num_samples(X)
        indices = np.arange(n_samples)
        if self.shuffle:
            rng = np.random.RandomState(self.random_state)
            rng.shuffle(indices)
        fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=int)
        # Spread the remainder over the leading folds, one sample each.
        fold_sizes[:n_samples % self.n_splits] += 1
        current = 0
        for fold_size in fold_sizes:
            yield indices[current:current + fold_size]
            current += fold_size

    def _iter_test_masks(self, X, y=None):
        """Yield one boolean mask per fold; True marks a test sample."""
        n_samples = self._num_samples(X)
        for test_index in self._iter_test_indices(X, y):
            test_mask = np.zeros(n_samples, dtype=bool)
            test_mask[test_index] = True
            yield test_mask

    def split(self, X, y=None):
        """Generate ``(train_indices, test_indices)`` pairs, one per fold.

        Both index arrays are in ascending order (even when shuffling),
        because they are selected from ``arange(n_samples)`` with a mask.

        Parameters
        ----------
        X : array-like or sequence
            Data to split; only its number of samples is used.
        y : ignored
            Present for signature compatibility only.

        Raises
        ------
        ValueError
            On first iteration, if ``n_splits`` exceeds the number of samples.
        """
        n_samples = self._num_samples(X)
        if self.n_splits > n_samples:
            raise ValueError(
                "Cannot have number of splits n_splits=%d greater than the "
                "number of samples: n_samples=%d." % (self.n_splits, n_samples)
            )
        indices = np.arange(n_samples)
        for test_mask in self._iter_test_masks(X, y):
            yield indices[~test_mask], indices[test_mask]

In [3]:

# Check: without shuffling, the custom KFold must reproduce scikit-learn's folds.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn or a different dataset.
X, y = load_boston(return_X_y=True)
cv1 = KFold(n_splits=5)
cv2 = skKFold(n_splits=5)
# Materialise both generators: zip() alone would stop at the shorter one and
# let the comparison pass vacuously if either splitter yielded too few folds.
folds1 = list(cv1.split(X, y))
folds2 = list(cv2.split(X, y))
assert len(folds1) == len(folds2) == 5
for (train1, test1), (train2, test2) in zip(folds1, folds2):
    assert np.array_equal(train1, train2)
    assert np.array_equal(test1, test2)

In [4]:

# Check: with shuffling and the same seed, the custom KFold must reproduce
# scikit-learn's folds.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn or a different dataset.
X, y = load_boston(return_X_y=True)
cv1 = KFold(n_splits=5, shuffle=True, random_state=0)
cv2 = skKFold(n_splits=5, shuffle=True, random_state=0)
# Materialise both generators: zip() alone would stop at the shorter one and
# let the comparison pass vacuously if either splitter yielded too few folds.
folds1 = list(cv1.split(X, y))
folds2 = list(cv2.split(X, y))
assert len(folds1) == len(folds2) == 5
for (train1, test1), (train2, test2) in zip(folds1, folds2):
    assert np.array_equal(train1, train2)
    assert np.array_equal(test1, test2)