First, let's import some prerequisites.
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.utils import to_categorical
from keras import backend as K
plt.rcParams['figure.figsize'] = (10,10) # Make the figures a bit bigger
# Number of digit classes (0-9).
nb_classes = 10
# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()
print("X_train original shape", X_train.shape)
print("y_train original shape", y_train.shape)
# input image dimensions
img_rows, img_cols = 28, 28
Let's look at some examples of the training data.
# Plot the first nine raw training images in a 3x3 grid, each titled
# with its ground-truth class label.
for example_idx in range(9):
    plt.subplot(3, 3, example_idx + 1)
    plt.imshow(X_train[example_idx], cmap='gray', interpolation='none')
    plt.title("Class {}".format(y_train[example_idx]))
Since we are building a fully connected network, our neural network is going to take a single vector for each training example, so we need to reshape the input so that each 28x28 image becomes a single 784-dimensional vector. We'll also scale the inputs to be in the range [0, 1] rather than [0, 255].
# Flatten each 28x28 image into a single 784-dimensional vector and
# scale pixel intensities from [0, 255] down to [0, 1].
# Deriving the sizes from the arrays themselves (shape[0] samples,
# -1 = "all remaining elements") instead of hard-coding 60000/10000/784
# keeps this correct for any sample count or image size.
X_train1 = X_train.reshape(X_train.shape[0], -1).astype('float32')
X_test1 = X_test.reshape(X_test.shape[0], -1).astype('float32')
X_train1 /= 255
X_test1 /= 255
print("Training matrix shape", X_train1.shape)
print("Testing matrix shape", X_test1.shape)
Modify the target matrices to be in the one-hot format, i.e.
0 -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1 -> [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
2 -> [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] etc.
# Convert the integer class labels (0-9) into one-hot encoded rows of
# length nb_classes, the format categorical_crossentropy expects.
Y_train = to_categorical(y_train, nb_classes)
Y_test = to_categorical(y_test, nb_classes)
print("Y_train shape", Y_train.shape)
print("Y_test shape", Y_test.shape)
Here we'll do a simple 3 layer fully connected (Dense) network.
# A simple 3-layer fully connected (Dense) network.
# Each "activation" is a non-linear function applied to the layer above:
# ReLU clamps all values below 0 to 0, and the final softmax squashes the
# 10 outputs into a valid probability distribution (non-negative values
# that sum to 1). Dropout layers could be inserted after each ReLU to
# help protect the model from memorizing ("overfitting") the training data.
model1 = Sequential([
    Dense(128, input_shape=(784,)),
    Activation('relu'),
    Dense(128),
    Activation('relu'),
    Dense(10),
    Activation('softmax'),
])
When compiling a model, Keras asks you to specify your loss function and your optimizer. The loss function we'll use here is called categorical cross-entropy, a loss function well-suited to comparing two probability distributions.
# Categorical cross-entropy compares the predicted class distribution
# against the one-hot targets; Adam adapts the step size per weight.
model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
batch_size = 128  # samples per gradient update
epochs = 4  # full passes over the training set
This is the fun part: you can feed the training data loaded in earlier into this model, and it will learn to classify digits.
# Train the network, validating on the held-out test set after each epoch.
model1.fit(
    X_train1,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test1, Y_test),
)
model1.summary()
# evaluate returns [loss, accuracy] in the order configured at compile time.
score1 = model1.evaluate(X_test1, Y_test, verbose=1)
print('Test score:', score1[0])
print('Test accuracy:', score1[1])
It's always a good idea to inspect the output and make sure everything looks sane. Here we'll look at some examples it gets right, and some examples it gets wrong.
# model.predict outputs a probability distribution over the 10 classes
# for each input example; argmax over that axis picks the most likely
# class as the predicted label.
predicted_classes1 = np.argmax(model1.predict(X_test1, verbose=1), axis=1)

# Indices of test examples the classifier got right / wrong.
correct_indices1 = np.nonzero(predicted_classes1 == y_test)[0]
incorrect_indices1 = np.nonzero(predicted_classes1 != y_test)[0]

# Show nine correctly classified digits...
plt.figure()
for plot_idx, test_idx in enumerate(correct_indices1[:9]):
    plt.subplot(3, 3, plot_idx + 1)
    plt.imshow(X_test1[test_idx].reshape(28, 28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes1[test_idx], y_test[test_idx]))

# ...and nine misclassified ones.
plt.figure()
for plot_idx, test_idx in enumerate(incorrect_indices1[:9]):
    plt.subplot(3, 3, plot_idx + 1)
    plt.imshow(X_test1[test_idx].reshape(28, 28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes1[test_idx], y_test[test_idx]))
Now we are building a Convolutional Neural Network (CNN). This type of neural network takes an image for each training example, so we pass the input as a 28x28 image. We'll also scale the inputs to be in the range [0, 1] rather than [0, 255].
# Keras backends place the channel axis differently: 'channels_first'
# expects (N, 1, rows, cols) while 'channels_last' expects
# (N, rows, cols, 1). Reshape the greyscale images accordingly and
# remember the per-sample input shape for the model definition.
if K.image_data_format() == 'channels_first':
    X_train2 = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
    X_test2 = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    X_train2 = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
    X_test2 = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

# Cast to float and rescale pixel intensities from [0, 255] to [0, 1].
X_train2 = X_train2.astype('float32') / 255
X_test2 = X_test2.astype('float32') / 255
print('X_train shape:', X_train2.shape)
print(X_train2.shape[0], 'train samples')
print(X_test2.shape[0], 'test samples')
Output stays the same - encoded as one-hot vectors.
# Targets are unchanged from the FC experiment: one-hot encoded vectors.
print("Y_train shape", Y_train.shape)
print("Y_test shape", Y_test.shape)
Here we'll do a Convolutional Neural Network (CNN) with 2 conv layers followed by MaxPooling.
# CNN: one convolutional layer, max-pooling to downsample the feature
# maps, then a softmax classifier over the flattened features.
model2 = Sequential([
    Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    # A second conv layer, e.g. Conv2D(64, (3, 3), activation='relu'),
    # could be stacked here for more capacity.
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    # An extra hidden Dense(128, activation='relu') layer is optional.
    Dense(nb_classes, activation='softmax'),
])
model2.summary()
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Training hyperparameters (kept identical to the FC experiment for a
# fair comparison).
batch_size = 128
epochs = 4
model2.fit(
    X_train2,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test2, Y_test),
)
# evaluate returns [loss, accuracy] in the order configured at compile time.
score = model2.evaluate(X_test2, Y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# model.predict outputs a probability distribution over the 10 classes
# for each input example; argmax over that axis picks the most likely
# class as the predicted label.
predicted_classes2 = np.argmax(model2.predict(X_test2, verbose=1), axis=1)

# Indices of test examples the CNN got right / wrong.
correct_indices2 = np.nonzero(predicted_classes2 == y_test)[0]
incorrect_indices2 = np.nonzero(predicted_classes2 != y_test)[0]

# Show nine correctly classified digits...
plt.figure()
for plot_idx, test_idx in enumerate(correct_indices2[:9]):
    plt.subplot(3, 3, plot_idx + 1)
    plt.imshow(X_test2[test_idx].reshape(28, 28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes2[test_idx], y_test[test_idx]))

# ...and nine misclassified ones.
plt.figure()
for plot_idx, test_idx in enumerate(incorrect_indices2[:9]):
    plt.subplot(3, 3, plot_idx + 1)
    plt.imshow(X_test2[test_idx].reshape(28, 28), cmap='gray', interpolation='none')
    plt.title("Predicted {}, Class {}".format(predicted_classes2[test_idx], y_test[test_idx]))
# Side-by-side comparison of the two models on the same test set:
# accuracy and total parameter count.
score1 = model1.evaluate(X_test1, Y_test, verbose=0)
#print('FC model Test loss:', score1[0])
print('FC model Test accuracy:', score1[1])
# count_params() tallies every weight in the model.
params1 = model1.count_params()
print('FC model # params:', str(params1))
score2 = model2.evaluate(X_test2, Y_test, verbose=0)
#print('CNN model Test loss:', score2[0])
print('CNN model Test accuracy:', score2[1])
params2 = model2.count_params()
print('CNN model # params:', str(params2))
CNNs achieve better performance with fewer parameters, hence lower memory and compute consumption. CNNs can handle image data that would be infeasible with only FC layers: the number of weights in an FC layer with 1000 neurons applied to a 224x224x3 image is roughly 150M — and that's 150M for just one layer. By contrast, modern CNN architectures have 50-100 layers yet only a couple dozen million parameters overall; for example, ResNet50 has 23M parameters and InceptionV3 has 21M.
CNNs are better suited for image data.