Notebook

In [1]:

import numpy as np
from sklearn.base import clone
from sklearn.datasets import load_boston, load_iris
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict as skcross_val_predict

In [2]:

def cross_val_predict(estimator, X, y, method="predict"):
    """Return out-of-fold predictions for every sample, in input order.

    A fresh clone of ``estimator`` is fitted on the training part of each
    fold and asked for predictions on the held-out part. The per-fold
    results are then stitched back together so that row ``i`` of the output
    corresponds to sample ``i`` of ``X``.

    Parameters
    ----------
    estimator : object
        Estimator exposing ``_estimator_type``, ``fit`` and the method named
        by ``method``. It is cloned for every fold, so the instance passed
        in is never fitted by this function.
    X, y : array-like
        Samples and targets. Both are indexed with the integer index arrays
        produced by the splitter (``X[train]``, ``y[train]``, ``X[test]``).
    method : str, default="predict"
        Name of the estimator method to call on each held-out fold
        (for example ``"predict"`` or ``"predict_proba"``).

    Returns
    -------
    numpy.ndarray
        Concatenated per-fold outputs of ``method``, reordered to match the
        original sample order.
    """
    # Regressors get plain K-fold; everything else is treated as a
    # classifier and gets stratified folds.
    if estimator._estimator_type == "regressor":
        cv = KFold()
    else:  # estimator._estimator_type == "classifier"
        cv = StratifiedKFold()

    fold_predictions = []
    indices = []
    for train, test in cv.split(X, y):
        est = clone(estimator)
        est.fit(X[train], y[train])
        fold_predictions.append(np.asarray(getattr(est, method)(X[test])))
        indices.extend(test)

    predictions = np.concatenate(fold_predictions)

    # Predictions are stored in the order the test folds were visited, which
    # need not be the original sample order. Build the inverse permutation:
    # inv_indices[i] is the position in `predictions` that holds sample i.
    # np.intp is used because the `np.int` alias no longer exists in NumPy.
    inv_indices = np.empty(len(indices), dtype=np.intp)
    inv_indices[indices] = np.arange(len(indices))
    return predictions[inv_indices]

In [3]:

# Regression check: the hand-written cross_val_predict defined above must
# reproduce scikit-learn's cross_val_predict (imported as
# skcross_val_predict) up to floating-point tolerance.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version, or switch to another regression
# dataset before relying on Restart & Run All.
X, y = load_boston(return_X_y=True)
# Fixed random_state so the two runs are comparable.
clf = RandomForestRegressor(random_state=0)
ans1 = cross_val_predict(clf, X, y)
ans2 = skcross_val_predict(clf, X, y)
assert np.allclose(ans1, ans2)

In [4]:

# Classification check: compare the hand-written cross_val_predict with
# scikit-learn's cross_val_predict (imported as skcross_val_predict) on the
# iris data, for both predicted labels and predicted probabilities.
X, y = load_iris(return_X_y=True)
# Fixed random_state so the two runs are comparable.
clf = RandomForestClassifier(random_state=0)
# Default method ("predict"): outputs are compared for exact equality.
ans1 = cross_val_predict(clf, X, y)
ans2 = skcross_val_predict(clf, X, y)
assert np.array_equal(ans1, ans2)
# method="predict_proba": outputs are floats, so compare with a tolerance.
ans1 = cross_val_predict(clf, X, y, method="predict_proba")
ans2 = skcross_val_predict(clf, X, y, method="predict_proba")
assert np.allclose(ans1, ans2)