"""Tree-based regression demo (decision tree, random forest, gradient boosting).

Builds a noisy sum of two sinusoids, shows its Fourier power spectrum,
then fits and plots three tree-based regressors, including a grid search
over random-forest hyperparameters.
"""
# NOTE: converted from the notebook magic `%pylab inline` to explicit imports.
import numpy as np
import matplotlib.pyplot as plt
from scipy import fftpack
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# `sklearn.grid_search` was removed in scikit-learn 0.20;
# GridSearchCV now lives in `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

# --- Build a noisy 1-D regression problem -------------------------------
rng = np.random.RandomState(0)  # fixed seed so the whole demo is reproducible
N = 100
X = np.linspace(0, 6, N)[:, np.newaxis]  # column vector: sklearn wants 2-D X
error = 0.4
y_true = np.sin(X).ravel() + np.sin(6 * X).ravel()
y_noisy = y_true + rng.normal(0, error, X.shape[0])

plt.plot(X.ravel(), y_true, color='gray')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Fourier power spectrum of the noisy signal -------------------------
# N // 2 (integer division): N / 2 is a float in Python 3 and would raise
# TypeError when used as a slice index.
plt.plot(fftpack.fftfreq(len(y_noisy))[:N // 2],
         abs(fftpack.fft(y_noisy))[:N // 2])
plt.xlim(0, None)
plt.xlabel('frequency')
plt.ylabel('Fourier power')

# --- Shuffle the data (required before cross-validated grid search) -----
# Use the seeded `rng`, not np.random, so reproducibility isn't silently
# broken by this one call.
i = rng.permutation(X.shape[0])
X = X[i]
y_noisy = y_noisy[i]

# --- Single decision tree ----------------------------------------------
clf = DecisionTreeRegressor(max_depth=5)
clf.fit(X, y_noisy)

X_fit = np.linspace(0, 6, 1000).reshape((-1, 1))
y_fit_1 = clf.predict(X_fit)

plt.plot(X_fit.ravel(), y_fit_1, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Random forest ------------------------------------------------------
clf = RandomForestRegressor(n_estimators=200, max_depth=5)
clf.fit(X, y_noisy)
y_fit_200 = clf.predict(X_fit)

plt.plot(X_fit.ravel(), y_fit_200, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Grid search over forest hyperparameters ---------------------------
rf = RandomForestRegressor()
parameters = {'n_estimators': [200, 300, 400],
              'max_depth': [5, 7, 9]}

# Warning: be sure your data is shuffled before using GridSearch!
clf_grid = GridSearchCV(rf, parameters)
clf_grid.fit(X, y_noisy)

rf_best = clf_grid.best_estimator_
X_fit = np.linspace(0, 6, 1000).reshape((-1, 1))
y_fit_best = rf_best.predict(X_fit)
print(rf_best.n_estimators, rf_best.max_depth)  # Python 3 print()

plt.plot(X_fit.ravel(), y_fit_best, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Gradient boosting --------------------------------------------------
clf = GradientBoostingRegressor(n_estimators=200, max_depth=2)
clf.fit(X, y_noisy)
y_fit_200 = clf.predict(X_fit)

plt.plot(X_fit.ravel(), y_fit_200, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# %load solutions/07B_grid_search.py  -- notebook magic; open the solution
# file manually when running as a plain script.