#!/usr/bin/env python
# coding: utf-8

# In[1]:

from preamble import *
get_ipython().run_line_magic('matplotlib', 'notebook')

# In[2]:

mglearn.plots.plot_logistic_regression_graph()

# In[3]:

mglearn.plots.plot_single_hidden_layer_graph()

# In[4]:

line = np.linspace(-3, 3, 100)
plt.figure()
plt.plot(line, np.tanh(line), label="tanh")
plt.plot(line, np.maximum(line, 0), label="relu")
plt.legend(loc="best")
plt.title("activation functions")

# In[10]:

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)

mlp = MLPClassifier(solver='lbfgs', random_state=0).fit(X_train, y_train)
plt.figure()
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=60, cmap=mglearn.cm2)

# # Exercise
# Compare ``MLPRegressor`` to linear methods on the Boston and bike datasets. Try varying the number of hidden layers and the number of nodes in each hidden layer. Compare the ``adam`` and ``lbfgs`` solvers in terms of outcome and runtime.
#
# How do the results differ with and without scaling the data?
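
# A minimal starter sketch for the exercise, not a reference solution: it uses
# ``fetch_california_housing`` as a stand-in regression dataset, since
# ``load_boston`` has been removed from recent scikit-learn releases, and it
# compares one ``MLPRegressor`` configuration per solver against ``Ridge`` on
# raw and standardized features. Swap in the boston/bike data and vary
# ``hidden_layer_sizes`` as the exercise asks.

# In[ ]:

import time

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# stand-in dataset; replace with the boston or bike data if you have them
X_reg, y_reg = fetch_california_housing(return_X_y=True)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg,
                                                            random_state=0)

# standardized copies of the features to compare against the raw ones
scaler = StandardScaler().fit(X_train_r)
X_train_r_scaled = scaler.transform(X_train_r)
X_test_r_scaled = scaler.transform(X_test_r)

models = {
    "ridge": Ridge(),
    "mlp (adam)": MLPRegressor(hidden_layer_sizes=(100,), solver='adam',
                               max_iter=1000, random_state=0),
    "mlp (lbfgs)": MLPRegressor(hidden_layer_sizes=(100,), solver='lbfgs',
                                max_iter=1000, random_state=0),
}

# fit each model on raw and on scaled features; report test R^2 and fit time
for name, model in models.items():
    for data_name, (Xtr, Xte) in {"raw": (X_train_r, X_test_r),
                                  "scaled": (X_train_r_scaled,
                                             X_test_r_scaled)}.items():
        start = time.time()
        model.fit(Xtr, y_train_r)
        print("{:12s} {:6s} data: R^2 = {:.2f}, fit time = {:.1f}s".format(
            name, data_name, model.score(Xte, y_test_r), time.time() - start))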