#!/usr/bin/env python
# coding: utf-8

# In[1]:

from preamble import *
get_ipython().run_line_magic('matplotlib', 'notebook')

# In[2]:

mglearn.plots.plot_logistic_regression_graph()

# In[3]:

mglearn.plots.plot_single_hidden_layer_graph()

# In[4]:

line = np.linspace(-3, 3, 100)
plt.figure()
plt.plot(line, np.tanh(line), label="tanh")
plt.plot(line, np.maximum(line, 0), label="relu")
plt.legend(loc="best")
plt.title("activation functions")

# In[10]:

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)

mlp = MLPClassifier(solver='lbfgs', random_state=0).fit(X_train, y_train)
plt.figure()
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=60, cmap=mglearn.cm2)

# # Exercise
# Compare ``MLPRegressor`` to linear methods on the Boston and bike datasets. Try varying the number of hidden layers and the number of nodes in each hidden layer. Compare the ``adam`` and ``lbfgs`` solvers in terms of outcome and runtime.
#
# How do the results differ with and without scaling the data?
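
# A minimal starter sketch for the exercise, not a reference solution: it uses
# ``fetch_california_housing`` as a stand-in regression dataset, since
# ``load_boston`` has been removed from recent scikit-learn releases, and it
# compares one ``MLPRegressor`` configuration per solver against ``Ridge`` on
# raw and standardized features. Swap in the boston/bike data and vary
# ``hidden_layer_sizes`` as the exercise asks.

# In[ ]:

import time

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# stand-in dataset; replace with the boston or bike data if you have them
X_reg, y_reg = fetch_california_housing(return_X_y=True)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg,
                                                            random_state=0)

# standardized copies of the features to compare against the raw ones
scaler = StandardScaler().fit(X_train_r)
X_train_r_scaled = scaler.transform(X_train_r)
X_test_r_scaled = scaler.transform(X_test_r)

models = {
    "ridge": Ridge(),
    "mlp (adam)": MLPRegressor(hidden_layer_sizes=(100,), solver='adam',
                               max_iter=1000, random_state=0),
    "mlp (lbfgs)": MLPRegressor(hidden_layer_sizes=(100,), solver='lbfgs',
                                max_iter=1000, random_state=0),
}

# fit each model on raw and on scaled features; report test R^2 and fit time
for name, model in models.items():
    for data_name, (Xtr, Xte) in {"raw": (X_train_r, X_test_r),
                                  "scaled": (X_train_r_scaled,
                                             X_test_r_scaled)}.items():
        start = time.time()
        model.fit(Xtr, y_train_r)
        print("{:12s} {:6s} data: R^2 = {:.2f}, fit time = {:.1f}s".format(
            name, data_name, model.score(Xte, y_test_r), time.time() - start))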