import numpy as np
from sklearn.base import clone
from sklearn.datasets import load_diabetes, load_iris
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import learning_curve as sklearning_curve

def learning_curve(estimator, X, y, train_sizes, random_state=0):
    """Re-implementation of sklearn.model_selection.learning_curve with shuffle=True."""
    # Match scikit-learn's default CV choice: StratifiedKFold for classifiers,
    # plain KFold otherwise (here: regressors). Both default to 5 splits.
    if estimator._estimator_type == "regressor":
        cv = KFold()
    else:  # estimator._estimator_type == "classifier"
        cv = StratifiedKFold()
    train_scores = np.zeros((len(train_sizes), cv.n_splits))
    test_scores = np.zeros((len(train_sizes), cv.n_splits))
    cv_iter = list(cv.split(X, y))
    # Translate fractional train_sizes into absolute sample counts based on the
    # size of the first training fold, truncating as scikit-learn does.
    train_sizes_abs = (len(cv_iter[0][0]) * np.array(train_sizes)).astype(int)
    # shuffle=True permutes each training fold once; every train_size then uses
    # the first train_size indices of that permutation.
    rng = np.random.RandomState(random_state)
    cv_iter = [(rng.permutation(train), test) for train, test in cv_iter]
    for i, train_size in enumerate(train_sizes_abs):
        for j, (train, test) in enumerate(cv_iter):
            est = clone(estimator)
            train_idx = train[:train_size]
            est.fit(X[train_idx], y[train_idx])
            train_scores[i, j] = est.score(X[train_idx], y[train_idx])
            test_scores[i, j] = est.score(X[test], y[test])
    return train_sizes_abs, train_scores, test_scores
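
# For reference: with 5-fold CV on the 442-sample diabetes data used below, the
# first training fold holds 353 samples, so train_sizes=[0.2, 0.4, 0.6, 0.8, 1]
# translates to train_sizes_abs = [70, 141, 211, 282, 353] (truncated, not rounded).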

# Regression: validate against scikit-learn's learning_curve.
# load_boston was removed in scikit-learn 1.2, so the diabetes dataset is used instead.
X, y = load_diabetes(return_X_y=True)
reg = RandomForestRegressor(random_state=0)
ans1 = learning_curve(reg, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], random_state=0)
ans2 = sklearning_curve(reg, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], shuffle=True, random_state=0)
assert np.array_equal(ans1[0], ans2[0])
assert np.allclose(ans1[1], ans2[1])
assert np.allclose(ans1[2], ans2[2])

# Classification: same check with a classifier, which exercises the StratifiedKFold branch.
X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(random_state=0)
ans1 = learning_curve(clf, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], random_state=0)
ans2 = sklearning_curve(clf, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], shuffle=True, random_state=0)
assert np.array_equal(ans1[0], ans2[0])
assert np.allclose(ans1[1], ans2[1])
assert np.allclose(ans1[2], ans2[2])
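
# Optional visualization sketch (assumes matplotlib is installed; not needed for the
# equality checks above): plot mean train/test scores against training-set size,
# using the classification results in ans1 from the block above.
import matplotlib.pyplot as plt

sizes, tr_scores, te_scores = ans1
plt.plot(sizes, tr_scores.mean(axis=1), "o-", label="training score")
plt.plot(sizes, te_scores.mean(axis=1), "o-", label="cross-validation score")
plt.fill_between(sizes,
                 te_scores.mean(axis=1) - te_scores.std(axis=1),
                 te_scores.mean(axis=1) + te_scores.std(axis=1),
                 alpha=0.2)
plt.xlabel("number of training samples")
plt.ylabel("accuracy")
plt.legend()
plt.show()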