"""Tree-based regression demo (decision tree, random forest, gradient boosting).

Builds a noisy sum of two sinusoids, shows its Fourier power spectrum,
then fits and plots three tree-based regressors, including a grid search
over random-forest hyperparameters.
"""
# NOTE: converted from the notebook magic `%pylab inline` to explicit imports.
import numpy as np
import matplotlib.pyplot as plt
from scipy import fftpack
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# `sklearn.grid_search` was removed in scikit-learn 0.20;
# GridSearchCV now lives in `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

# --- Build a noisy 1-D regression problem -------------------------------
rng = np.random.RandomState(0)  # fixed seed so the whole demo is reproducible
N = 100
X = np.linspace(0, 6, N)[:, np.newaxis]  # column vector: sklearn wants 2-D X
error = 0.4
y_true = np.sin(X).ravel() + np.sin(6 * X).ravel()
y_noisy = y_true + rng.normal(0, error, X.shape[0])

plt.plot(X.ravel(), y_true, color='gray')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Fourier power spectrum of the noisy signal -------------------------
# N // 2 (integer division): N / 2 is a float in Python 3 and would raise
# TypeError when used as a slice index.
plt.plot(fftpack.fftfreq(len(y_noisy))[:N // 2],
         abs(fftpack.fft(y_noisy))[:N // 2])
plt.xlim(0, None)
plt.xlabel('frequency')
plt.ylabel('Fourier power')

# --- Shuffle the data (required before cross-validated grid search) -----
# Use the seeded `rng`, not np.random, so reproducibility isn't silently
# broken by this one call.
i = rng.permutation(X.shape[0])
X = X[i]
y_noisy = y_noisy[i]

# --- Single decision tree ----------------------------------------------
clf = DecisionTreeRegressor(max_depth=5)
clf.fit(X, y_noisy)

X_fit = np.linspace(0, 6, 1000).reshape((-1, 1))
y_fit_1 = clf.predict(X_fit)

plt.plot(X_fit.ravel(), y_fit_1, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Random forest ------------------------------------------------------
clf = RandomForestRegressor(n_estimators=200, max_depth=5)
clf.fit(X, y_noisy)
y_fit_200 = clf.predict(X_fit)

plt.plot(X_fit.ravel(), y_fit_200, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Grid search over forest hyperparameters ---------------------------
rf = RandomForestRegressor()
parameters = {'n_estimators': [200, 300, 400],
              'max_depth': [5, 7, 9]}

# Warning: be sure your data is shuffled before using GridSearch!
clf_grid = GridSearchCV(rf, parameters)
clf_grid.fit(X, y_noisy)

rf_best = clf_grid.best_estimator_
X_fit = np.linspace(0, 6, 1000).reshape((-1, 1))
y_fit_best = rf_best.predict(X_fit)
print(rf_best.n_estimators, rf_best.max_depth)  # Python 3 print()

plt.plot(X_fit.ravel(), y_fit_best, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# --- Gradient boosting --------------------------------------------------
clf = GradientBoostingRegressor(n_estimators=200, max_depth=2)
clf.fit(X, y_noisy)
y_fit_200 = clf.predict(X_fit)

plt.plot(X_fit.ravel(), y_fit_200, color='blue')
plt.plot(X.ravel(), y_noisy, '.k')

# %load solutions/07B_grid_search.py  -- notebook magic; open the solution
# file manually when running as a plain script.