In [36]:

```
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd
from sklearn.datasets import make_moons
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from keras.datasets import imdb
# Display plots inline, change default figure size and change plot resolution to retina
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [10, 8]
%config InlineBackend.figure_format = 'retina'
# Set Seaborn aesthetic parameters to defaults
sns.set()
```

In [37]:

```
def plot_data(X, y):
    """Scatter-plot a 2D binary-labelled dataset on a new figure.

    X is an (n_samples, 2) feature array; y holds binary labels (0 or 1).
    Class 0 is drawn as red circles, class 1 as blue circles.
    """
    plt.figure()
    # One plot call per class so each gets its own legend entry
    for label, marker in ((0, 'or'), (1, 'ob')):
        members = y == label
        plt.plot(X[members, 0], X[members, 1], marker, alpha=0.5, label=label)
    plt.legend()
def plot_decision_boundary(pred_func, X, y, figure=None):
    """Draw a classifier's decision regions over a 2D dataset.

    pred_func: callable mapping an (n, 2) array of points to class predictions.
    X, y: the training points and their labels, overlaid as a scatter plot.
    figure: existing figure/axes to draw into; when None a new figure is made.
    """
    if figure is None:  # If no figure is given, create a new one
        plt.figure()
    # Padded bounding box around the data
    pad = .5
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad
    # Dense grid of evaluation points, spaced 0.01 apart
    step = 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # Evaluate the classifier on every grid point, then restore the grid shape
    Z = pred_func(grid_points).reshape(xx.shape)
    # Filled contour of the predictions, with the training points on top
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright)
def plot_loss_acc(history):
    """Plot training and (optionally) validation loss and accuracy.

    Args:
        history: Keras History object returned by model.fit(). Its
            ``history`` dict must contain 'loss' and an accuracy series.
            Both legacy ('acc'/'val_acc', Keras < 2.3) and modern
            ('accuracy'/'val_accuracy') metric key names are supported.
    """
    metrics = history.history
    loss = metrics['loss']
    epochs = range(1, len(loss) + 1)

    plt.figure(figsize=(10, 10))

    # --- Top subplot: loss curves ---
    plt.subplot(2, 1, 1)
    plt.plot(epochs, loss, '.--', label='Training loss')
    title = 'Training loss: {:.4f}'.format(loss[-1])
    plt.ylabel('Loss')
    if 'val_loss' in metrics:
        val_loss = metrics['val_loss']
        plt.plot(epochs, val_loss, 'o-', label='Validation loss')
        title += ', Validation loss: {:.4f}'.format(val_loss[-1])
    plt.title(title)
    plt.legend()

    # --- Bottom subplot: accuracy curves ---
    # The metric key was renamed from 'acc' to 'accuracy' in Keras 2.3;
    # resolve whichever name this History object actually carries.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    plt.subplot(2, 1, 2)
    plt.plot(epochs, acc, '.--', label='Training acc')
    title = 'Training accuracy: {:.2f}%'.format(acc[-1] * 100)
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    val_acc_key = 'val_' + acc_key
    if val_acc_key in metrics:
        val_acc = metrics[val_acc_key]
        plt.plot(epochs, val_acc, 'o-', label='Validation acc')
        title += ', Validation accuracy: {:.2f}%'.format(val_acc[-1] * 100)
    plt.title(title)
    plt.legend()
def vectorize_sequences(sequences, dimension=10000):
    """One-hot encode a list of index sequences into a binary matrix.

    Returns a float array of shape (len(sequences), dimension) where
    entry [i, j] is 1.0 iff index j occurs in sequences[i].
    Example: [[3, 5]] -> [[0. 0. 0. 1. 0. 1. 0...]]
    """
    results = np.zeros((len(sequences), dimension))
    # Walk the output rows in lockstep with the input sequences,
    # flipping on every listed index (duplicates are harmless).
    for row, word_indices in zip(results, sequences):
        row[word_indices] = 1.
    return results
```

Inspired by this article.

In [38]:

```
# Generate moon-shaped data with some noise.
# A fixed random_state makes the dataset — and therefore every result
# below — reproducible under Restart Kernel & Run All; the original
# unseeded call produced different data on each run.
x_train, y_train = make_moons(200, noise=0.40, random_state=0)
plot_data(x_train, y_train)
```

In [39]:

```
# Varying the hidden layer size to observe underfitting and overfitting
plt.figure(figsize=(14, 28))
hidden_layer_dimensions = [1, 3, 5, 20, 50]
for plot_index, n_hidden in enumerate(hidden_layer_dimensions):
    axes = plt.subplot(4, 2, plot_index + 1)
    plt.title('Hidden Layer size: {:d}'.format(n_hidden))
    # Two-layer network: tanh hidden layer of varying width, sigmoid output
    model = Sequential([
        Dense(n_hidden, activation='tanh', input_shape=(2,)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(SGD(lr=1.0), 'binary_crossentropy', metrics=['accuracy'])
    # Batch size = dataset size => batch gradient descent
    history = model.fit(x_train, y_train, verbose=0, epochs=5000,
                        batch_size=x_train.shape[0])
    # Threshold the sigmoid output at 0.5 to get hard class predictions
    plot_decision_boundary(lambda x: model.predict(x) > 0.5,
                           x_train, y_train, axes)
```

In [40]:

```
model = Sequential()
# Use L1 regularization on hidden layer
model.add(Dense(50, activation='tanh', input_shape=(2,),
kernel_regularizer=regularizers.l1(0.001)))
model.add(Dense(1, activation='sigmoid'))
model.compile(SGD(lr=1.0), 'binary_crossentropy', metrics=['accuracy'])
# Batch size = dataset size => batch gradient descent
history = model.fit(x_train, y_train, verbose=0, epochs=5000, batch_size=x_train.shape[0])
plot_decision_boundary(lambda x: model.predict(x) > 0.5, x_train, y_train)
```

In [41]:

```
model = Sequential()
# Use 25% dropout on hidden layer
model.add(Dense(50, activation='tanh', input_shape=(2,)))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))
model.compile(SGD(lr=1.0), 'binary_crossentropy', metrics=['accuracy'])
# Batch size = dataset size => batch gradient descent
history = model.fit(x_train, y_train, verbose=0, epochs=5000, batch_size=x_train.shape[0])
plot_decision_boundary(lambda x: model.predict(x) > 0.5, x_train, y_train)
```

Inspired by a use case in François Chollet's book Deep Learning With Python.

In [42]:

```
# Load the Keras IMDB dataset (downloaded on first call, then cached locally)
# We only keep the top 10,000 most frequently occurring words in the training data
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
print(f'Training data: {train_data.shape}. Training labels: {train_labels.shape}')
# Reviews are encoded as lists of word indexes
# Words are indexed by overall frequency in the dataset
print(f'First review: {train_data[0]}')
# Labels are binary integers: 0 for a negative review, 1 for a positive one
print(f'First label: {train_labels[0]}')
```

In [43]:

```
# Show the first review as text
# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_review)
```

In [44]:

```
# Preparation of data for training
# Turn reviews into vectors of 0s and 1s (one-hot encoding)
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
# Set apart the first 10,000 reviews as validation data
x_val, x_train = x_train[:10000], x_train[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]
y_test = test_labels
print(f'x_train: {x_train.shape}. x_val: {x_val.shape}')
```

In [45]:

```
# Show a sample of encoded input
df_x_train = pd.DataFrame(x_train)
df_x_train.sample(n=10)
```

Out[45]:

In [46]:

```
# Build a (10000, 16, 16, 1) NN to demonstrate overfitting
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(10000,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Show model info
model.summary()
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['accuracy'])
# We also record validation history during training
history = model.fit(x_train, y_train,
epochs=20, batch_size=512,
validation_data=(x_val, y_val))
plot_loss_acc(history)
```