class Estimator(object):

    def fit(self, X, y=None):
        """Fit estimator to data."""
        # set state of ``self``
        return self

    def predict(self, X):
        """Predict response of ``X``."""
        # compute predictions ``pred``
        return pred

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

est = GradientBoostingRegressor(n_estimators=100, max_depth=3, loss='ls')
est?  # IPython introspection -- show the docstring

from sklearn.datasets import make_hastie_10_2
from sklearn.cross_validation import train_test_split

# generate synthetic data from ESLII - Example 10.2
X, y = make_hastie_10_2(n_samples=5000)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit estimator
est = GradientBoostingClassifier(n_estimators=200, max_depth=3)
est.fit(X_train, y_train)

# predict class labels
pred = est.predict(X_test)

# score on test data (accuracy)
acc = est.score(X_test, y_test)
print('ACC: %.4f' % acc)

# predict class probabilities
est.predict_proba(X_test)[0]

# access the underlying regression trees: one per (stage, class)
est.estimators_[0, 0]

%pylab inline
import numpy as np
from sklearn.cross_validation import train_test_split

FIGSIZE = (11, 7)

def ground_truth(x):
    """Ground truth -- function to approximate."""
    return x * np.sin(x) + np.sin(2 * x)

def gen_data(n_samples=200):
    """Generate training and testing data."""
    np.random.seed(15)
    X = np.random.uniform(0, 10, size=n_samples)[:, np.newaxis]
    y = ground_truth(X.ravel()) + np.random.normal(scale=2, size=n_samples)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=3)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = gen_data(100)

# plot ground truth
x_plot = np.linspace(0, 10, 500)

def plot_data(alpha=0.4, s=20):
    fig = plt.figure(figsize=FIGSIZE)
    gt = plt.plot(x_plot, ground_truth(x_plot), alpha=alpha,
                  label='ground truth')
    # plot training and testing data
    plt.scatter(X_train, y_train, s=s, alpha=alpha)
    plt.scatter(X_test, y_test, s=s, alpha=alpha, color='red')
    plt.xlim((0, 10))
    plt.ylabel('y')
    plt.xlabel('x')

annotation_kw = {'xycoords': 'data', 'textcoords': 'data',
                 'arrowprops': {'arrowstyle': '->', 'connectionstyle': 'arc'}}

plot_data()

from sklearn.tree import DecisionTreeRegressor

# a shallow tree underfits; a deeper tree starts to track the noise
plot_data()
est = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
         label='RT max_depth=1', color='g', alpha=0.9, linewidth=2)
est = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
         label='RT max_depth=3', color='g', alpha=0.7, linewidth=1)
plt.legend(loc='upper left')
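# To put numbers on the under/overfitting visible in the plot above, we can
# compare train vs. test MSE for the two tree depths. This check is an added
# illustration, not part of the original tutorial flow:
from sklearn.metrics import mean_squared_error

for depth in (1, 3):
    est = DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train)
    print('max_depth=%d  train MSE: %.2f  test MSE: %.2f'
          % (depth,
             mean_squared_error(y_train, est.predict(X_train)),
             mean_squared_error(y_test, est.predict(X_test))))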
from itertools import islice

plot_data()
est = GradientBoostingRegressor(n_estimators=1000, max_depth=1,
                                learning_rate=1.0)
est.fit(X_train, y_train)

ax = plt.gca()
first = True

# step through the staged predictions, plotting every 10th tree
for pred in islice(est.staged_predict(x_plot[:, np.newaxis]),
                   0, est.n_estimators, 10):
    plt.plot(x_plot, pred, color='r', alpha=0.2)
    if first:
        ax.annotate('High bias - low variance',
                    xy=(x_plot[x_plot.shape[0] // 2],
                        pred[x_plot.shape[0] // 2]),
                    xytext=(4, 4), **annotation_kw)
        first = False

pred = est.predict(x_plot[:, np.newaxis])
plt.plot(x_plot, pred, color='r', label='GBRT max_depth=1')
ax.annotate('Low bias - high variance',
            xy=(x_plot[x_plot.shape[0] // 2], pred[x_plot.shape[0] // 2]),
            xytext=(6.25, -6), **annotation_kw)
plt.legend(loc='upper left')

def deviance_plot(est, X_test, y_test, ax=None, label='',
                  train_color='#2c7bb6', test_color='#d7191c',
                  alpha=1.0, ylim=(0, 10)):
    """Deviance plot for ``est``; use ``X_test`` and ``y_test`` for test error."""
    n_estimators = len(est.estimators_)
    # compute test deviance at each boosting stage via the fitted loss object
    test_dev = np.empty(n_estimators)
    for i, pred in enumerate(est.staged_predict(X_test)):
        test_dev[i] = est.loss_(y_test, pred)

    if ax is None:
        fig = plt.figure(figsize=FIGSIZE)
        ax = plt.gca()

    ax.plot(np.arange(n_estimators) + 1, test_dev, color=test_color,
            label='Test %s' % label, linewidth=2, alpha=alpha)
    ax.plot(np.arange(n_estimators) + 1, est.train_score_, color=train_color,
            label='Train %s' % label, linewidth=2, alpha=alpha)
    ax.set_ylabel('Error')
    ax.set_xlabel('n_estimators')
    ax.set_ylim(ylim)
    return test_dev, ax

test_dev, ax = deviance_plot(est, X_test, y_test)
ax.legend(loc='upper right')

# add some annotations
ax.annotate('Lowest test error',
            xy=(test_dev.argmin() + 1, test_dev.min() + 0.02),
            xytext=(150, 3.5), **annotation_kw)
ann = ax.annotate('', xy=(800, test_dev[799]), xycoords='data',
                  xytext=(800, est.train_score_[799]), textcoords='data',
                  arrowprops={'arrowstyle': '<->'})
ax.text(810, 3.5, 'train-test gap')

def fmt_params(params):
    return ", ".join("{0}={1}".format(key, val)
                     for key, val in params.items())

# tree regularization: min_samples_leaf
fig = plt.figure(figsize=FIGSIZE)
ax = plt.gca()
for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')),
                                          ({'min_samples_leaf': 3},
                                           ('#fdae61', '#abd9e9'))]:
    est = GradientBoostingRegressor(n_estimators=1000, max_depth=1,
                                    learning_rate=1.0)
    est.set_params(**params)
    est.fit(X_train, y_train)
    test_dev, ax = deviance_plot(est, X_test, y_test, ax=ax,
                                 label=fmt_params(params),
                                 train_color=train_color,
                                 test_color=test_color)

ax.annotate('Higher bias', xy=(900, est.train_score_[899]),
            xytext=(600, 3), **annotation_kw)
ax.annotate('Lower variance', xy=(900, test_dev[899]),
            xytext=(600, 3.5), **annotation_kw)
plt.legend(loc='upper right')

# shrinkage: lower learning_rate
fig = plt.figure(figsize=FIGSIZE)
ax = plt.gca()
for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')),
                                          ({'learning_rate': 0.1},
                                           ('#fdae61', '#abd9e9'))]:
    est = GradientBoostingRegressor(n_estimators=1000, max_depth=1,
                                    learning_rate=1.0)
    est.set_params(**params)
    est.fit(X_train, y_train)
    test_dev, ax = deviance_plot(est, X_test, y_test, ax=ax,
                                 label=fmt_params(params),
                                 train_color=train_color,
                                 test_color=test_color)

ax.annotate('Requires more trees', xy=(200, est.train_score_[199]),
            xytext=(300, 1.75), **annotation_kw)
ax.annotate('Lower test error', xy=(900, test_dev[899]),
            xytext=(600, 1.75), **annotation_kw)
plt.legend(loc='upper right')
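# The ``staged_predict`` API used above also gives a cheap form of early
# stopping: score every stage on held-out data and keep the iteration with
# the lowest error. A minimal sketch -- the helper name ``best_n_estimators``
# is ours, not part of scikit-learn, and like the tutorial we reuse the test
# split as a stand-in validation set:
from sklearn.metrics import mean_squared_error

def best_n_estimators(est, X_val, y_val):
    """Return the boosting iteration with the lowest validation MSE."""
    val_dev = np.array([mean_squared_error(y_val, pred)
                        for pred in est.staged_predict(X_val)])
    return val_dev.argmin() + 1

est = GradientBoostingRegressor(n_estimators=1000, max_depth=1,
                                learning_rate=0.1)
est.fit(X_train, y_train)
print('Best n_estimators: %d' % best_n_estimators(est, X_test, y_test))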
# stochastic gradient boosting: subsample combined with shrinkage
fig = plt.figure(figsize=FIGSIZE)
ax = plt.gca()
for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')),
                                          ({'learning_rate': 0.1,
                                            'subsample': 0.5},
                                           ('#fdae61', '#abd9e9'))]:
    est = GradientBoostingRegressor(n_estimators=1000, max_depth=1,
                                    learning_rate=1.0, random_state=1)
    est.set_params(**params)
    est.fit(X_train, y_train)
    test_dev, ax = deviance_plot(est, X_test, y_test, ax=ax,
                                 label=fmt_params(params),
                                 train_color=train_color,
                                 test_color=test_color)

ax.annotate('Even lower test error', xy=(400, test_dev[399]),
            xytext=(500, 3.0), **annotation_kw)

est = GradientBoostingRegressor(n_estimators=1000, max_depth=1,
                                learning_rate=1.0, subsample=0.5)
est.fit(X_train, y_train)
test_dev, ax = deviance_plot(est, X_test, y_test, ax=ax,
                             label=fmt_params({'subsample': 0.5}),
                             train_color='#abd9e9', test_color='#fdae61',
                             alpha=0.5)
ax.annotate('Subsample alone does poorly', xy=(300, test_dev[299]),
            xytext=(500, 5.5), **annotation_kw)
plt.legend(loc='upper right', fontsize='small')

from sklearn.grid_search import GridSearchCV

param_grid = {'learning_rate': [0.1, 0.01, 0.001],
              'max_depth': [4, 6],
              'min_samples_leaf': [3, 5],  # depends on the number of training examples
              # 'max_features': [1.0, 0.3, 0.1]  # not possible in our example (only 1 feature)
              }

est = GradientBoostingRegressor(n_estimators=3000)
# this may take a few minutes
gs_cv = GridSearchCV(est, param_grid, scoring='mean_squared_error',
                     n_jobs=4).fit(X_train, y_train)

# best hyperparameter setting
print('Best hyperparameters: %r' % gs_cv.best_params_)

# refit model with the best parameters
est.set_params(**gs_cv.best_params_)
est.fit(X_train, y_train)

# plot the approximation
plot_data()
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), color='r', linewidth=2)

from sklearn.datasets import fetch_california_housing

cal_housing = fetch_california_housing()

# 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                    cal_housing.target,
                                                    test_size=0.2,
                                                    random_state=1)
names = cal_housing.feature_names

import pandas as pd

X_df = pd.DataFrame(data=X_train, columns=names)
X_df['MedHouseVal'] = y_train
_ = X_df.hist(column=['Latitude', 'Longitude', 'MedInc', 'MedHouseVal'],
              figsize=FIGSIZE)

import time
from collections import defaultdict
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

res = defaultdict(dict)

def benchmark(est, name=None):
    """Fit ``est``; record train/test time and test MAE in ``res``."""
    if not name:
        name = est.__class__.__name__
    t0 = time.time()
    est.fit(X_train, y_train)
    res[name]['train_time'] = time.time() - t0
    t0 = time.time()
    pred = est.predict(X_test)
    res[name]['test_time'] = time.time() - t0
    res[name]['MAE'] = mean_absolute_error(y_test, pred)
    return est

benchmark(DummyRegressor())
benchmark(Ridge(alpha=0.0001, normalize=True))
benchmark(Pipeline([('std', StandardScaler()),
                    ('svr', SVR(kernel='rbf', C=10.0, gamma=0.1, tol=0.001))]),
          name='SVR')
benchmark(RandomForestRegressor(n_estimators=100, max_features=5,
                                random_state=0, bootstrap=False, n_jobs=4))
est = benchmark(GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                          learning_rate=0.1, loss='huber',
                                          min_samples_leaf=3, random_state=0))

res_df = pd.DataFrame(data=res).T
res_df[['train_time', 'test_time', 'MAE']].sort_values('MAE', ascending=False)

# diagnose the model
test_dev, ax = deviance_plot(est, X_test, y_test, ylim=(0, 1.0))

# modify the hyperparameters
# tuned_est = benchmark(GradientBoostingRegressor(n_estimators=500, max_depth=4,
#                                                 learning_rate=0.1, loss='huber',
#                                                 random_state=0, verbose=1))
# print results
# res_df = pd.DataFrame(data=res).T
# res_df[['train_time', 'test_time', 'MAE']].sort_values('MAE', ascending=False)
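# If the deviance plot shows test error still falling at 500 trees, more
# trees can be added without refitting from scratch via ``warm_start``
# (assuming the installed scikit-learn supports warm_start for gradient
# boosting). A sketch on a throwaway model so the benchmarked ``est`` above
# stays untouched:
gbrt = GradientBoostingRegressor(n_estimators=500, max_depth=4,
                                 learning_rate=0.1, loss='huber',
                                 min_samples_leaf=3, random_state=0,
                                 warm_start=True)
gbrt.fit(X_train, y_train)          # fits the first 500 trees
gbrt.set_params(n_estimators=1000)
gbrt.fit(X_train, y_train)          # grows 500 more, keeps the existing ones
print('MAE with 1000 trees: %.4f'
      % mean_absolute_error(y_test, gbrt.predict(X_test)))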
# feature importance (impurity-based), normalized to the largest value
fx_imp = pd.Series(est.feature_importances_, index=names)
fx_imp /= fx_imp.max()  # normalize
fx_imp = fx_imp.sort_values()
fx_imp.plot(kind='barh', figsize=FIGSIZE)

from sklearn.ensemble.partial_dependence import plot_partial_dependence

features = ['MedInc', 'AveOccup', 'HouseAge', ('AveOccup', 'HouseAge')]
fig, axs = plot_partial_dependence(est, X_train, features,
                                   feature_names=names, n_cols=2,
                                   figsize=FIGSIZE)

df = pd.DataFrame(data={'icao': ['CRJ2', 'A380', 'B737', 'B737']})

# ordinal encoding: map each category to an integer code
df_enc = pd.DataFrame(data={'icao': np.unique(df.icao,
                                              return_inverse=True)[1]})

# convert to float32 in Fortran order, the layout used internally by the
# tree module (avoids a copy)
X = np.asfortranarray(df_enc.values, dtype=np.float32)
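# One-hot encoding is the usual alternative for models that cannot exploit
# ordinal codes (e.g. linear models or SVMs); tree ensembles such as GBRT
# generally do fine with the integer codes above. A sketch using pandas
# ``get_dummies``, which expands each category into its own 0/1 column:
df_onehot = pd.get_dummies(df.icao, prefix='icao')
print(df_onehot)
X_onehot = np.asfortranarray(df_onehot.values, dtype=np.float32)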