I borrowed most of the techniques from online courses (http://course.fast.ai/, https://hunkim.github.io/ml/) and got help from a Facebook group dedicated to learning TensorFlow in Korea (https://www.facebook.com/groups/TensorFlowKR/).
from theano.sandbox import cuda
cuda.use('gpu2')
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
from keras.datasets import mnist
WARNING (theano.sandbox.cuda): Ignoring call to use(2), GPU number 0 is already in use.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
X_train.shape
(60000, 28, 28)
def prepare_data():
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
# Add channel
X_test = np.expand_dims(X_test,1)
X_train = np.expand_dims(X_train,1)
# One-hot encode the labels
Y_train = onehot(Y_train)
Y_test = onehot(Y_test)
# Split the initial set into train and valid set
X_valid = X_train[55000:]
Y_valid = Y_train[55000:]
X_train = X_train[:55000]
Y_train = Y_train[:55000]
print('X_train', X_train.shape, 'X_valid', X_valid.shape, 'X_test', X_test.shape)
return (X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test)
(X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test) = prepare_data()
X_train (55000, 1, 28, 28) X_valid (5000, 1, 28, 28) X_test (10000, 1, 28, 28)
This is how we will normalize the input image
# Get mean and std to use for normalization
mean_px = X_train.mean().astype(np.float32)
std_px = X_train.std().astype(np.float32)
def norm_input(x): return (x-mean_px)/std_px
The fastest way to come up with an efficient model architecture seems to be "copying" other models that have proven to be efficient (e.g. VGG16). But I wanted to experiment and see the effect of different layers.
I started with a dead simple architecture and started building up. I had already learned that adding Dropouts and BatchNorms would increase accuracy, but I wanted to see it for myself how much difference they actually make.
These are my finds:
While time consuming, the experiments helped me have a better intuition about why certain model architectures work better than the others.
def make_model():
model = Sequential([
Lambda(norm_input, input_shape=(1,28,28)), # Normalize the input
ZeroPadding2D(padding=(1, 1)),
Conv2D(32, 3, 3, activation='relu'),
Conv2D(32, 3, 3, activation='relu'),
BatchNormalization(axis=1),
Dropout(0.3),
MaxPooling2D((3,3), strides=(2,2)),
ZeroPadding2D(padding=(1, 1)),
Conv2D(64, 3, 3, activation='relu'),
Conv2D(64, 3, 3, activation='relu'),
Conv2D(128, 3, 3, activation='relu'),
Conv2D(128, 3, 3, activation='relu'),
BatchNormalization(axis=1),
Dropout(0.3),
MaxPooling2D((3,3), strides=(2,2)),
Flatten(),
Dense(10, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
return model
def make_batches(X_train, Y_train, X_valid, Y_valid):
gen = image.ImageDataGenerator(rotation_range=6, width_shift_range=0.08, shear_range=0.2, height_shift_range=0.05, zoom_range=0.05)
batches = gen.flow(X_train, Y_train, batch_size=64)
valid_batches = gen.flow(X_valid, Y_valid, batch_size=64)
return (batches, valid_batches)
This is known as learning rate annealing technique (Jeremy Howard explains it in his lecture)
def fit_model(model, batches, valid_batches, model_name='my_model'):
model.fit_generator(batches, batches.N, nb_epoch=1, verbose=1,
validation_data=valid_batches, nb_val_samples=valid_batches.N)
model.optimizer.lr=0.1
model.fit_generator(batches, batches.N, nb_epoch=3, verbose=1,
validation_data=valid_batches, nb_val_samples=valid_batches.N)
model.optimizer.lr=0.01
model.fit_generator(batches, batches.N, nb_epoch=12, verbose=1,
validation_data=valid_batches, nb_val_samples=valid_batches.N)
model.optimizer.lr=0.001
model.fit_generator(batches, batches.N, nb_epoch=4, verbose=1,
validation_data=valid_batches, nb_val_samples=valid_batches.N)
model.save_weights('../models/weights_v2_' + model_name)
return model
def show_imgs(imgs, labels=[]):
fig = plt.figure()
num_imgs = len(imgs)
is_labeled = len(labels) > 0
for i in range(num_imgs):
img = imgs[i]
subplot = fig.add_subplot(1, num_imgs, i+1)
if img.ndim > 2 and img.shape[0] == 1:
img = img[-1,:,:]
plt.imshow(img)
subplot.axis('off')
if is_labeled:
subplot.set_title(labels[i])
plt.show()
batches, valid_batches = make_batches(X_train, Y_train, X_valid, Y_valid)
import numpy
from scipy.ndimage.interpolation import map_coordinates
from scipy.ndimage.filters import gaussian_filter
#https://gist.github.com/fmder/e28813c1e8721830ff9c
#visualization: http://augmentorjl.readthedocs.io/en/latest/tutorials/mnist.html
def elastic_transform(image, alpha, sigma, random_state=None):
"""Elastic deformation of images as described in [Simard2003]_.
.. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
Convolutional Neural Networks applied to Visual Document Analysis", in
Proc. of the International Conference on Document Analysis and
Recognition, 2003.
"""
if image.ndim > 2 and image.shape[0] == 1:
image = image[-1,:,:]
if random_state is None:
random_state = numpy.random.RandomState(None)
shape = image.shape
dx = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha
dy = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha
x, y = numpy.meshgrid(numpy.arange(shape[0]), numpy.arange(shape[1]))
indices = numpy.reshape(y+dy, (-1, 1)), numpy.reshape(x+dx, (-1, 1))
transformed = map_coordinates(image, indices, order=1).reshape(shape)
if transformed.ndim < 3:
transformed = np.expand_dims(transformed,0)
return transformed
It's important to see what elastic distortion does, and how it will augment the initial train set. If you choose the wrong parameters (i.e. alpha and sigma), then you will end up with too-distorted images that will confuse the model.
img = X_train[0]
# These parameters worked the best for me
alpha = 36
sigma = 8
transformed_arr = []
for i in range(10):
transformed = elastic_transform(img, alpha, sigma)
alpha += 4
transformed_arr.append(transformed)
transformed_arr = np.array(transformed_arr)
arr = np.concatenate((transformed_arr))
show_imgs(arr)
def distort(imgs, labels):
alpha = 36
sigma = 8
num_augmentation = 10
output = []
output_labels = []
for idx, img in enumerate(imgs):
output.append(img)
for i in range(num_augmentation):
transformed = elastic_transform(img, alpha, sigma)
alpha += 4
output.append(transformed)
alpha = 36
sigma = 8
for label in labels:
output_labels.append(label)
for i in range(num_augmentation):
output_labels.append(label)
return np.array(output), np.array(output_labels)
imgs, labels = distort(X_train[:2], Y_train[:2])
show_imgs(imgs, labels=[np.argmax(l) for l in labels])
Also note that Keras will dynamically augment the train set, so you will end up training the model even with more data
augmented_X_train, augmented_Y_train = distort(X_train, Y_train)
print("Before augmentation:", X_train.shape, "After:", augmented_X_train.shape)
np.save('../data/mnist/augmented_x_train_v2', augmented_X_train)
np.save('../data/mnist/augmented_y_train_v2', augmented_Y_train)
Before augmentation: (55000, 1, 28, 28) After: (605000, 1, 28, 28)
augmented_X_train = np.load('../data/mnist/augmented_x_train_v2.npy')
augmented_Y_train = np.load('../data/mnist/augmented_y_train_v2.npy')
augmented_batches, valid_batches = make_batches(augmented_X_train, augmented_Y_train, X_valid, Y_valid)
augmented_X_train.shape
(605000, 1, 28, 28)
augmented_Y_train.shape
(605000, 10)
batches, valid_batches = make_batches(X_train, Y_train, X_valid, Y_valid)
model = make_model()
model.fit_generator(batches, batches.N, nb_epoch=1, verbose=1, validation_data=valid_batches, nb_val_samples=valid_batches.N)
Epoch 1/1 31680/55000 [================>.............] - ETA: 8s - loss: 0.2922 - acc: 0.9110
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-41-005047635129> in <module>() 1 batches, valid_batches = make_batches(X_train, Y_train, X_valid, Y_valid) 2 model = make_model() ----> 3 model.fit_generator(batches, batches.N, nb_epoch=1, verbose=1, validation_data=valid_batches, nb_val_samples=valid_batches.N) /home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/models.pyc in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size, nb_worker, pickle_safe, **kwargs) 872 max_q_size=max_q_size, 873 nb_worker=nb_worker, --> 874 pickle_safe=pickle_safe) 875 876 def evaluate_generator(self, generator, val_samples, max_q_size=10, nb_worker=1, pickle_safe=False, **kwargs): /home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/engine/training.pyc in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size, nb_worker, pickle_safe) 1441 outs = self.train_on_batch(x, y, 1442 sample_weight=sample_weight, -> 1443 class_weight=class_weight) 1444 except: 1445 _stop.set() /home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/engine/training.pyc in train_on_batch(self, x, y, sample_weight, class_weight) 1219 ins = x + y + sample_weights 1220 self._make_train_function() -> 1221 outputs = self.train_function(ins) 1222 if len(outputs) == 1: 1223 return outputs[0] /home/ubuntu/anaconda2/lib/python2.7/site-packages/keras/backend/theano_backend.pyc in __call__(self, inputs) 715 def __call__(self, inputs): 716 assert type(inputs) in {list, tuple} --> 717 return self.function(*inputs) 718 719 /home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs) 857 t0_fn = time.time() 858 try: --> 859 outputs = self.fn() 860 except Exception: 861 if hasattr(self.fn, 'position_of_error'): KeyboardInterrupt:
We train multiple models with the same architecture and the same train set and "combine" them.
models = [fit_model(make_model(), augmented_batches, test_batches, model_name='howon_%d' % i) for i in range(5)]
Epoch 1/1 660000/660000 [==============================] - 248s - loss: 0.0766 - acc: 0.9767 - val_loss: 0.0506 - val_acc: 0.9840 Epoch 1/3 660000/660000 [==============================] - 249s - loss: 0.0350 - acc: 0.9896 - val_loss: 0.0312 - val_acc: 0.9905 Epoch 2/3 660000/660000 [==============================] - 249s - loss: 0.0266 - acc: 0.9919 - val_loss: 0.0341 - val_acc: 0.9908 Epoch 3/3 660000/660000 [==============================] - 248s - loss: 0.0220 - acc: 0.9933 - val_loss: 0.0330 - val_acc: 0.9902 Epoch 1/2 660000/660000 [==============================] - 249s - loss: 0.0190 - acc: 0.9941 - val_loss: 0.0287 - val_acc: 0.9922 Epoch 2/2 660000/660000 [==============================] - 249s - loss: 0.0169 - acc: 0.9947 - val_loss: 0.0408 - val_acc: 0.9884 Epoch 1/2 660000/660000 [==============================] - 248s - loss: 0.0154 - acc: 0.9953 - val_loss: 0.0275 - val_acc: 0.9918 Epoch 2/2 660000/660000 [==============================] - 249s - loss: 0.0139 - acc: 0.9955 - val_loss: 0.0286 - val_acc: 0.9922 Epoch 1/1 660000/660000 [==============================] - 249s - loss: 0.0753 - acc: 0.9774 - val_loss: 0.0375 - val_acc: 0.9894 Epoch 1/3 660000/660000 [==============================] - 248s - loss: 0.0345 - acc: 0.9896 - val_loss: 0.0431 - val_acc: 0.9884 Epoch 2/3 660000/660000 [==============================] - 248s - loss: 0.0260 - acc: 0.9922 - val_loss: 0.0448 - val_acc: 0.9871 Epoch 3/3 660000/660000 [==============================] - 248s - loss: 0.0210 - acc: 0.9937 - val_loss: 0.0318 - val_acc: 0.9913 Epoch 1/2 660000/660000 [==============================] - 249s - loss: 0.0182 - acc: 0.9943 - val_loss: 0.0269 - val_acc: 0.9928 Epoch 2/2 660000/660000 [==============================] - 249s - loss: 0.0163 - acc: 0.9950 - val_loss: 0.0313 - val_acc: 0.9910 Epoch 1/2 660000/660000 [==============================] - 249s - loss: 0.0149 - acc: 0.9953 - val_loss: 0.0234 - val_acc: 0.9936 Epoch 2/2 660000/660000 [==============================] - 249s - loss: 0.0137 - acc: 0.9957 - val_loss: 0.0226 - val_acc: 0.9940 Epoch 1/1 660000/660000 [==============================] - 249s - loss: 0.0754 - acc: 0.9771 - val_loss: 0.0354 - val_acc: 0.9894 Epoch 1/3 660000/660000 [==============================] - 248s - loss: 0.0342 - acc: 0.9899 - val_loss: 0.0261 - val_acc: 0.9922 Epoch 2/3 660000/660000 [==============================] - 248s - loss: 0.0254 - acc: 0.9923 - val_loss: 0.0302 - val_acc: 0.9904 Epoch 3/3 660000/660000 [==============================] - 248s - loss: 0.0210 - acc: 0.9935 - val_loss: 0.0274 - val_acc: 0.9919 Epoch 1/2 660000/660000 [==============================] - 248s - loss: 0.0180 - acc: 0.9945 - val_loss: 0.0277 - val_acc: 0.9922 Epoch 2/2 660000/660000 [==============================] - 248s - loss: 0.0159 - acc: 0.9950 - val_loss: 0.0257 - val_acc: 0.9924 Epoch 1/2 660000/660000 [==============================] - 248s - loss: 0.0145 - acc: 0.9954 - val_loss: 0.0165 - val_acc: 0.9952 Epoch 2/2 660000/660000 [==============================] - 248s - loss: 0.0128 - acc: 0.9958 - val_loss: 0.0259 - val_acc: 0.9931
all_preds = np.stack([m.predict(X_test, batch_size=256) for m in all_models])
avg_preds = all_preds.mean(axis=0)
preds = avg_preds
preds.shape
(10000, 10)
keras.metrics.categorical_accuracy(Y_test, preds).eval()
array(0.9966999888420105, dtype=float32)
Adjusting the hyperparameters and coming up with a better model architecture are very costly. Instead, having more labelled data seems to be more effective. We can try the following:
Let's try the second option and see if that improves the accuracy.
models = [make_model() for i in range(5)]
for i in range(5):
models[i].load_weights('../models/weights_v2_howon_v2_%d' % i)
X_valid.shape
(5000, 1, 28, 28)
X_train.shape
(55000, 1, 28, 28)
Once again, synthesize many slightly distorted images using the valid set as seed. Then, instead of having only 5000 extra images, we can have 55000 images.
augmented_X_valid, augmented_Y_valid = distort(X_valid, Y_valid)
print("Before augmentation:", X_valid.shape, "After:", augmented_X_valid.shape)
Before augmentation: (5000, 1, 28, 28) After: (55000, 1, 28, 28)
np.save('../data/mnist/augmented_x_valid_v2', augmented_X_valid)
np.save('../data/mnist/augmented_y_valid_v2', augmented_Y_valid)
augmented_combined_X_train = np.concatenate((augmented_X_train, augmented_X_valid))
augmented_combined_Y_train = np.concatenate((augmented_Y_train, augmented_Y_valid))
augmented_combined_Y_train.shape
(660000, 10)
gen = image.ImageDataGenerator(rotation_range=6, width_shift_range=0.08, shear_range=0.2, height_shift_range=0.05, zoom_range=0.05)
augmented_combined_batches = gen.flow(augmented_combined_X_train, augmented_combined_Y_train, batch_size=64)
# NOTE: valid_batches don't really matter here, because we assume that the model is final at this point
def short_fit_model(model, batches, valid_batches, model_name=''):
model.optimizer.lr=0.001
model.fit_generator(batches, batches.N, nb_epoch=3, verbose=1,
validation_data=valid_batches, nb_val_samples=valid_batches.N)
model.save_weights('../models/final_weights_v3_' + model_name)
return model
final_models = [short_fit_model(models[i], augmented_combined_batches, valid_batches, model_name='%d' % i) for i in range(5)]
Epoch 1/3 660000/660000 [==============================] - 240s - loss: 0.0082 - acc: 0.9974 - val_loss: 0.0098 - val_acc: 0.9974 Epoch 2/3 660000/660000 [==============================] - 240s - loss: 0.0079 - acc: 0.9976 - val_loss: 0.0086 - val_acc: 0.9982 Epoch 3/3 660000/660000 [==============================] - 240s - loss: 0.0076 - acc: 0.9976 - val_loss: 0.0115 - val_acc: 0.9974 Epoch 1/3 660000/660000 [==============================] - 240s - loss: 0.0083 - acc: 0.9974 - val_loss: 0.0099 - val_acc: 0.9978 Epoch 2/3 660000/660000 [==============================] - 240s - loss: 0.0077 - acc: 0.9975 - val_loss: 0.0073 - val_acc: 0.9980 Epoch 3/3 660000/660000 [==============================] - 240s - loss: 0.0073 - acc: 0.9977 - val_loss: 0.0104 - val_acc: 0.9976 Epoch 1/3 660000/660000 [==============================] - 240s - loss: 0.0083 - acc: 0.9974 - val_loss: 0.0133 - val_acc: 0.9970 Epoch 2/3 660000/660000 [==============================] - 239s - loss: 0.0076 - acc: 0.9976 - val_loss: 0.0124 - val_acc: 0.9970 Epoch 3/3 660000/660000 [==============================] - 239s - loss: 0.0073 - acc: 0.9977 - val_loss: 0.0105 - val_acc: 0.9980 Epoch 1/3 660000/660000 [==============================] - 239s - loss: 0.0082 - acc: 0.9974 - val_loss: 0.0067 - val_acc: 0.9990 Epoch 2/3 660000/660000 [==============================] - 239s - loss: 0.0078 - acc: 0.9976 - val_loss: 0.0088 - val_acc: 0.9986 Epoch 3/3 660000/660000 [==============================] - 239s - loss: 0.0075 - acc: 0.9976 - val_loss: 0.0107 - val_acc: 0.9984 Epoch 1/3 660000/660000 [==============================] - 239s - loss: 0.0085 - acc: 0.9973 - val_loss: 0.0063 - val_acc: 0.9988 Epoch 2/3 660000/660000 [==============================] - 239s - loss: 0.0079 - acc: 0.9975 - val_loss: 0.0080 - val_acc: 0.9980 Epoch 3/3 660000/660000 [==============================] - 239s - loss: 0.0076 - acc: 0.9976 - val_loss: 0.0080 - val_acc: 0.9988
all_preds = np.stack([m.predict(X_test, batch_size=256) for m in final_models])
avg_preds = all_preds.mean(axis=0)
(10000, 10)
keras.metrics.categorical_accuracy(Y_test, avg_preds).eval()
array(0.9973000288009644, dtype=float32)