import numpy as np
from sklearn.base import clone
from sklearn.datasets import load_diabetes, load_iris
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import learning_curve as sklearning_curve

def learning_curve(estimator, X, y, train_sizes, random_state=0):
    """Re-implementation of sklearn.model_selection.learning_curve with shuffle=True."""
    # Match scikit-learn's default CV choice: StratifiedKFold for classifiers,
    # plain KFold otherwise (here: regressors). Both default to 5 splits.
    if estimator._estimator_type == "regressor":
        cv = KFold()
    else:  # estimator._estimator_type == "classifier"
        cv = StratifiedKFold()
    train_scores = np.zeros((len(train_sizes), cv.n_splits))
    test_scores = np.zeros((len(train_sizes), cv.n_splits))
    cv_iter = list(cv.split(X, y))
    # Translate fractional train_sizes into absolute sample counts based on the
    # size of the first training fold, truncating as scikit-learn does.
    train_sizes_abs = (len(cv_iter[0][0]) * np.array(train_sizes)).astype(int)
    # shuffle=True permutes each training fold once; every train_size then uses
    # the first train_size indices of that permutation.
    rng = np.random.RandomState(random_state)
    cv_iter = [(rng.permutation(train), test) for train, test in cv_iter]
    for i, train_size in enumerate(train_sizes_abs):
        for j, (train, test) in enumerate(cv_iter):
            est = clone(estimator)
            train_idx = train[:train_size]
            est.fit(X[train_idx], y[train_idx])
            train_scores[i, j] = est.score(X[train_idx], y[train_idx])
            test_scores[i, j] = est.score(X[test], y[test])
    return train_sizes_abs, train_scores, test_scores
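
# For reference: with 5-fold CV on the 442-sample diabetes data used below, the
# first training fold holds 353 samples, so train_sizes=[0.2, 0.4, 0.6, 0.8, 1]
# translates to train_sizes_abs = [70, 141, 211, 282, 353] (truncated, not rounded).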

# Regression: validate against scikit-learn's learning_curve.
# load_boston was removed in scikit-learn 1.2, so the diabetes dataset is used instead.
X, y = load_diabetes(return_X_y=True)
reg = RandomForestRegressor(random_state=0)
ans1 = learning_curve(reg, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], random_state=0)
ans2 = sklearning_curve(reg, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], shuffle=True, random_state=0)
assert np.array_equal(ans1[0], ans2[0])
assert np.allclose(ans1[1], ans2[1])
assert np.allclose(ans1[2], ans2[2])

# Classification: same check with a classifier, which exercises the StratifiedKFold branch.
X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(random_state=0)
ans1 = learning_curve(clf, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], random_state=0)
ans2 = sklearning_curve(clf, X, y, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], shuffle=True, random_state=0)
assert np.array_equal(ans1[0], ans2[0])
assert np.allclose(ans1[1], ans2[1])
assert np.allclose(ans1[2], ans2[2])
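
# Optional visualization sketch (assumes matplotlib is installed; not needed for the
# equality checks above): plot mean train/test scores against training-set size,
# using the classification results in ans1 from the block above.
import matplotlib.pyplot as plt

sizes, tr_scores, te_scores = ans1
plt.plot(sizes, tr_scores.mean(axis=1), "o-", label="training score")
plt.plot(sizes, te_scores.mean(axis=1), "o-", label="cross-validation score")
plt.fill_between(sizes,
                 te_scores.mean(axis=1) - te_scores.std(axis=1),
                 te_scores.mean(axis=1) + te_scores.std(axis=1),
                 alpha=0.2)
plt.xlabel("number of training samples")
plt.ylabel("accuracy")
plt.legend()
plt.show()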