Classify images from MNIST using LeNet

Dataset

Download the dataset to your workspace (i.e. the notebook folder).

In [1]:
from __future__ import division
from builtins import zip
from builtins import str
from builtins import range
from past.utils import old_div
from future import standard_library
from __future__ import print_function
from tqdm import tnrange, tqdm_notebook

standard_library.install_aliases()
import pickle, gzip

# Load the dataset
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, _ = pickle.load(f, encoding='latin1')
f.close()
In [2]:
# Report the (features, labels) shapes of the training split, then the validation split.
for split in (train_set, valid_set):
    features, labels = split[0], split[1]
    print(features.shape, labels.shape)
(50000, 784) (50000,)
(10000, 784) (10000,)
In [3]:
import numpy as np

# Turn each flat 784-pixel row into a (1, 28, 28) image, the sample shape the
# first convolution layer is configured with. -1 lets NumPy infer the number
# of samples instead of hard-coding 50000 / 10000.
train_x = np.reshape(train_set[0], (-1, 1, 28, 28)).astype(np.float32, copy=False)
train_y = np.array(train_set[1]).astype(np.int32, copy=False)
# Give the validation split the same dtypes as the training split so both can
# be copied into the same float32 / int32 tensors.
valid_x = np.reshape(valid_set[0], (-1, 1, 28, 28)).astype(np.float32, copy=False)
valid_y = np.array(valid_set[1]).astype(np.int32, copy=False)
In [4]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Display the single channel of the first training image as a 28x28 picture.
plt.imshow(train_x[0][0])
Out[4]:
<matplotlib.image.AxesImage at 0x7fdde5663438>

Create the CNN model

TODO: plot the net structure

In [5]:
from singa import net as ffnet
from singa.layer import Conv2D, MaxPooling2D, Dropout, Activation, Flatten, Dense
from singa import optimizer, loss, metric
from singa import layer
# Select the layer backend by name -- presumably SINGA's native C++ kernels; TODO confirm.
layer.engine = 'singacpp'
# Feed-forward net built with a softmax cross-entropy loss and an accuracy metric.
net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
# The cell output lists one (layer name, output shape) pair per added layer.
# conv1: 32 output channels on a (1, 28, 28) input. The positional 3 and 2 look
# like kernel size and stride, which matches the (32, 9) weight shape and the
# 28x28 -> 14x14 output reported below -- TODO confirm against the Conv2D signature.
net.add(Conv2D('conv1', 32, 3, 2, input_sample_shape=(1,28,28)))
net.add(Activation('relu1'))
# conv2: same arguments; its input shape comes from the previous layer (14x14 -> 7x7).
net.add(Conv2D('conv2', 32, 3, 2))
net.add(Activation('relu2'))
# Pooling with arguments (3, 2) reduces 7x7 to 4x4 according to the printed shapes.
net.add(MaxPooling2D('pool', 3, 2))
# Flatten (32, 4, 4) into 512 features, then map them to the 10 digit classes.
net.add(Flatten('flat'))
net.add(Dense('dense', 10))
('conv1', (32, 14, 14))
('relu1', (32, 14, 14))
('conv2', (32, 7, 7))
('relu2', (32, 7, 7))
('pool', (32, 4, 4))
('flat', (512,))
('dense', (10,))
Out[5]:
<singa.layer.Dense at 0x7fddc5ca3b00>

Initialize the parameters

  • weight matrix - Gaussian distribution
  • bias - 0
In [6]:
# Initialise every parameter in place and report its name, shape and l1() value.
for name, param in zip(net.param_names(), net.param_values()):
    is_vector = len(param.shape) <= 1
    if is_vector:
        # 1-D parameters (the biases) start at zero.
        param.set_value(0)
    else:
        # Weight matrices are filled by gaussian(0, 0.1).
        param.gaussian(0, 0.1)
    print(name, param.shape, param.l1())
conv1/weight (32, 9) 0.07648436725139618
conv1/bias (32,) 0.0
conv2/weight (32, 288) 0.08030246943235397
conv2/bias (32,) 0.0
dense/weight (512, 10) 0.07954108715057373
dense/bias (10,) 0.0

Set up the optimizer and tensors

In [7]:
from singa import tensor
#from singa.proto import core_pb2
from singa import device
from singa import utils
# Default device returned by SINGA; the variable name suggests this is the host CPU -- TODO confirm.
cpu = device.get_default_device()

# SGD with momentum 0.9 and weight decay 1e-4; the learning rate is passed per update call.
opt = optimizer.SGD(momentum=0.9, weight_decay=1e-4)
batch_size = 32
# Number of full mini-batches per epoch; integer division drops any trailing partial batch.
num_train_batch = old_div(train_x.shape[0], batch_size)

# Reusable input/label tensors, refilled from NumPy for every mini-batch.
# tx is created without an explicit device, ty is placed on `cpu` with an int32 dtype.
tx = tensor.Tensor((batch_size, 1, 28, 28))
ty = tensor.Tensor((batch_size,), cpu , tensor.int32)

# for progress bar
from tqdm import tnrange
# Sample indices 0..N-1; shuffled in place before each pass to randomise batch order.
idx = np.arange(train_x.shape[0], dtype=np.int32)

Conduct SGD

  1. process the training data multiple times; each pass is called an epoch;
  2. for each epoch, read the data as mini-batches in random order
  3. for each mini-batch, do BP and update the parameters
In [8]:
# Train for two epochs of mini-batch SGD with a fixed learning rate of 0.01.
for epoch in range(2):
    # New random sample order for this epoch.
    np.random.shuffle(idx)
    # Running sums of the per-batch loss / accuracy. They are deliberately not
    # called `loss`: that name is the singa.loss module imported when the net
    # was built, and rebinding it to a float would shadow the module.
    epoch_loss, epoch_acc = 0.0, 0.0

    bar = tnrange(num_train_batch, desc='Epoch %d' % epoch)
    for b in bar:
        # Indices of the b-th mini-batch, computed once and reused for x and y.
        batch_idx = idx[b * batch_size: (b + 1) * batch_size]
        x = train_x[batch_idx]
        y = train_y[batch_idx]
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        # net.train returns the parameter gradients and a (loss, accuracy) pair.
        grads, (l, a) = net.train(tx, ty)
        epoch_loss += l
        epoch_acc += a
        # Update every parameter with its gradient, pairing them by position.
        for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
            opt.apply_with_lr(epoch, 0.01, g, p, str(s), b)
        # update progress bar
        bar.set_postfix(train_loss=l, train_accuracy=a)
    # Report the averages over all mini-batches of the epoch.
    print('Epoch = %d, training loss = %f, training accuracy = %f' % (
        epoch,
        old_div(epoch_loss, num_train_batch),
        old_div(epoch_acc, num_train_batch)))
Epoch = 0, training loss = 0.291366, training accuracy = 0.907370
Epoch = 1, training loss = 0.111163, training accuracy = 0.965089

Save model to disk

In [9]:
# Write the current model to disk under the name 'checkpoint' (reloaded by later cells).
net.save('checkpoint')

Load model from disk

In [10]:
# Overwrite every parameter with zeros first -- presumably so that the values
# present after load() can only have come from the checkpoint.
for pval in net.param_values():
    pval.set_value(0)
# Restore the model saved under the name 'checkpoint'.
net.load('checkpoint')
NOTE: If your model was saved using pickle, then set use_pickle=True for loading it

Do prediction

In [11]:
from PIL import Image
# Read the test picture as 8-bit grayscale ('L') and resize it to the 28x28 input size.
img = Image.open('static/digit.jpg').convert('L')
img = img.resize((28,28))
# Scale pixel values from [0, 255] to [0, 1] as float32.
# NOTE(review): assumes the training images use the same [0, 1] range -- confirm.
img = np.array(img, dtype=np.float32) / 255.0
# Add the batch and channel axes in NumPy (ndarray.reshape returns the reshaped
# array) before wrapping it as a tensor. Calling Tensor.reshape and discarding
# the result only works if that method mutates in place, which is not the case
# in every SINGA release.
img = tensor.from_numpy(img.reshape((1, 1, 28, 28)))
# Forward pass on the single-image batch.
y = net.predict(img)
In [12]:
# Prediction for the first (only) image in the batch -- presumably one score per digit class.
prob=tensor.to_numpy(y)[0]
# Plot the ten values against the digit labels 0-9.
plt.plot(list(range(10)), prob)
Out[12]:
[<matplotlib.lines.Line2D at 0x7fddc4b29240>]

Debug

Print the l1 norm of the parameters and layer features

  1. parameter initialization
  2. learning rate
  3. weight decay
In [13]:
# Reshuffle the sample order and switch on the net's verbose mode; the cell
# output then shows a value for every layer-to-layer edge of each step.
np.random.shuffle(idx)
ffnet.verbose = True

# Re-initialise the parameters, this time with gaussian(0, 10) for the weight
# matrices (the training run used 0.1); 1-D parameters are zeroed.
for name, param in zip(net.param_names(), net.param_values()):
    is_vector = len(param.shape) <= 1
    if is_vector:
        param.set_value(0)
    else:
        param.gaussian(0, 10)
    print(name, param.shape, param.l1())

# Run ten training steps. The banner says "Epoch", but b counts mini-batches.
for b in range(10):
    print("\n\nEpoch %d" % b)
    sel = idx[b * batch_size: (b + 1) * batch_size]
    x = train_x[sel]
    y = train_y[sel]
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)
    grads, (l, a) = net.train(tx, ty)
    print('\n loss = %f, params' % l)
    # NOTE(review): `epoch` is not defined in this cell; it is whatever value the
    # training loop left behind, so that cell must have been run first.
    for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
        opt.apply_with_lr(epoch, 0.01, g, p, str(s), b)
        print(s, p.l1())
conv1/weight (32, 9) 7.971656799316406
conv1/bias (32,) 0.0
conv2/weight (32, 288) 8.005664825439453
conv2/bias (32,) 0.0
dense/weight (512, 10) 7.921195983886719
dense/bias (10,) 0.0


Epoch 0
-->conv1: 4.059905
conv1-->relu1: 2.407956
relu1-->conv2: 620.319519
conv2-->relu2: 302.810760
relu2-->pool: 965.994873
pool-->flat: 965.994873
flat-->dense: 276680.062500
-->dense: 0.270273
dense-->flat: 0.270273
flat-->pool: 0.074453
pool-->relu2: 0.046011
relu2-->conv2: 8.166893
conv2-->relu1: 2.553801
relu1-->conv1: 299.891296

 loss = 68.231674, params
conv1/weight 9.855006217956543
conv1/bias 9.832422256469727
conv2/weight 8.0507173538208
conv2/bias 0.1285100281238556
dense/weight 8.218809127807617
dense/bias 0.0013595324708148837


Epoch 1
-->conv1: 17.634811
conv1-->relu1: 3.629616
relu1-->conv2: 1683.136475
conv2-->relu2: 389.035248
relu2-->pool: 934.496582
pool-->flat: 934.496582
flat-->dense: 1198527.500000
-->dense: 0.322575
dense-->flat: 0.322575
flat-->pool: 0.094647
pool-->relu2: 0.039437
relu2-->conv2: 8.781067
conv2-->relu1: 2.430810
relu1-->conv1: 502.457764

 loss = 79.148743, params
conv1/weight 14.95351791381836
conv1/bias 28.66775131225586
conv2/weight 8.543733596801758
conv2/bias 0.320060670375824
dense/weight 9.1849365234375
dense/bias 0.0028705699369311333


Epoch 2
-->conv1: 44.400776
conv1-->relu1: 8.777474
relu1-->conv2: 12810.666016
conv2-->relu2: 1705.242798
relu2-->pool: 2268.597656
pool-->flat: 2268.597656
flat-->dense: 4815859.500000
-->dense: 0.347976
dense-->flat: 0.347976
flat-->pool: 0.109863
pool-->relu2: 0.024067
relu2-->conv2: 7.023767
conv2-->relu1: 1.075155
relu1-->conv1: 418.399963

 loss = 76.419479, params
conv1/weight 29.47024154663086
conv1/bias 52.10609436035156
conv2/weight 10.518251419067383
conv2/bias 0.5825839042663574
dense/weight 12.961698532104492
dense/bias 0.004623417742550373


Epoch 3
-->conv1: 82.561668
conv1-->relu1: 2.029232
relu1-->conv2: 19918.304688
conv2-->relu2: 2221.255859
relu2-->pool: 3186.028076
pool-->flat: 3186.028076
flat-->dense: 8799418.000000
-->dense: 0.585852
dense-->flat: 0.585852
flat-->pool: 0.186150
pool-->relu2: 0.025595
relu2-->conv2: 10.546330
conv2-->relu1: 1.008846
relu1-->conv1: 2478.874512

 loss = 87.336548, params
conv1/weight 45.3760871887207
conv1/bias 122.97279357910156
conv2/weight 12.587639808654785
conv2/bias 0.9543725252151489
dense/weight 21.757226943969727
dense/bias 0.007790021598339081


Epoch 4
-->conv1: 181.790573
conv1-->relu1: 0.000030
relu1-->conv2: 1.017071
conv2-->relu2: 0.120676
relu2-->pool: 0.305297
pool-->flat: 0.305297
flat-->dense: 2401.528809
-->dense: 0.703576
dense-->flat: 0.703576
flat-->pool: 0.229087
pool-->relu2: 0.029813
relu2-->conv2: 13.549859
conv2-->relu1: 0.006924
relu1-->conv1: 4.986567

 loss = 68.686081, params
conv1/weight 59.74170684814453
conv1/bias 191.91458129882812
conv2/weight 14.60444450378418
conv2/bias 1.4008562564849854
dense/weight 30.463171005249023
dense/bias 0.010233680717647076


Epoch 5
-->conv1: 260.486359
conv1-->relu1: 0.000000
relu1-->conv2: 1.400860
conv2-->relu2: 0.062520
relu2-->pool: 0.062520
pool-->flat: 0.062520
flat-->dense: 296.831329
-->dense: 2.126958
dense-->flat: 2.126958
flat-->pool: 0.694517
pool-->relu2: 0.014730
relu2-->conv2: 6.709585
conv2-->relu1: 0.000000
relu1-->conv1: 0.000000

 loss = 65.246483, params
conv1/weight 72.80561828613281
conv1/bias 253.96200561523438
conv2/weight 16.481393814086914
conv2/bias 1.8957630395889282
dense/weight 38.40126419067383
dense/bias 0.014084763824939728


Epoch 6
-->conv1: 347.085114
conv1-->relu1: 0.000000
relu1-->conv2: 1.895745
conv2-->relu2: 0.013967
relu2-->pool: 0.013967
pool-->flat: 0.013967
flat-->dense: 33.670372
-->dense: 1.395184
dense-->flat: 1.395184
flat-->pool: 0.455570
pool-->relu2: 0.009508
relu2-->conv2: 3.835509
conv2-->relu1: 0.000000
relu1-->conv1: 0.000000

 loss = 38.581238, params
conv1/weight 84.56834411621094
conv1/bias 309.804443359375
conv2/weight 18.20405387878418
conv2/bias 2.4867684841156006
dense/weight 45.590518951416016
dense/bias 0.016868162900209427


Epoch 7
-->conv1: 416.296112
conv1-->relu1: 0.000000
relu1-->conv2: 2.486759
conv2-->relu2: 0.000000
relu2-->pool: 0.000000
pool-->flat: 0.000000
flat-->dense: 0.016868
-->dense: 1.538023
dense-->flat: 1.538023
flat-->pool: 0.502211
pool-->relu2: 0.000000
relu2-->conv2: 0.000000
conv2-->relu1: 0.000000
relu1-->conv1: 0.000000

 loss = 2.308411, params
conv1/weight 95.16199493408203
conv1/bias 360.06231689453125
conv2/weight 19.77170753479004
conv2/bias 3.043811798095703
dense/weight 52.087501525878906
dense/bias 0.01910635642707348


Epoch 8
-->conv1: 469.519379
conv1-->relu1: 0.000000
relu1-->conv2: 3.043824
conv2-->relu2: 0.000000
relu2-->pool: 0.000000
pool-->flat: 0.000000
flat-->dense: 0.019106
-->dense: 1.641877
dense-->flat: 1.641877
flat-->pool: 0.536123
pool-->relu2: 0.000000
relu2-->conv2: 0.000000
conv2-->relu1: 0.000000
relu1-->conv1: 0.000000

 loss = 2.301430, params
conv1/weight 104.70797729492188
conv1/bias 405.2940368652344
conv2/weight 21.199617385864258
conv2/bias 3.545147657394409
dense/weight 57.95866775512695
dense/bias 0.02116413414478302


Epoch 9
-->conv1: 525.609802
conv1-->relu1: 0.000000
relu1-->conv2: 3.545132
conv2-->relu2: 0.000000
relu2-->pool: 0.000000
pool-->flat: 0.000000
flat-->dense: 0.021164
-->dense: 2.182356
dense-->flat: 2.182356
flat-->pool: 0.712606
pool-->relu2: 0.000000
relu2-->conv2: 0.000000
conv2-->relu1: 0.000000
relu1-->conv1: 0.000000

 loss = 2.316046, params
conv1/weight 113.30311584472656
conv1/bias 446.00213623046875
conv2/weight 22.49642562866211
conv2/bias 3.9963467121124268
dense/weight 63.25651931762695
dense/bias 0.02255747839808464
In [14]:
def vis_square(data):
    """Tile a stack of 2-D maps into one roughly square grid image and show it.

    Args:
        data: array-like of shape (n, height, width) or (n, height, width, 3).
            Each (height, width) slice becomes one tile of a grid that is
            ceil(sqrt(n)) tiles wide and high.

    Returns:
        None. The grid is drawn with ``plt.imshow`` and the axes are hidden.

    The values are min-max normalised to [0, 1] for display. A constant input
    (for example feature maps that are all zero) is shown as all zeros instead
    of being divided by a zero range, which would turn every pixel into NaN.
    """
    data = np.asarray(data)

    # normalize data for display
    lowest = data.min()
    span = data.max() - lowest
    if span == 0:
        # No contrast to stretch; avoid the 0/0 division.
        data = np.zeros(data.shape, dtype=np.float32)
    else:
        data = (data - lowest) / span

    # force the number of filters to be square
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n ** 2 - data.shape[0]),   # blank tiles to fill the last grid cells
                (0, 1), (0, 1))                # one-pixel gap below / right of each tile
               + ((0, 0),) * (data.ndim - 3))  # don't pad the last dimension (if there is one)
    data = np.pad(data, padding, mode='constant', constant_values=1)  # pad with ones (white)

    # tile the filters into an image:
    # (n*n, h, w, ...) -> (row, col, h, w, ...) -> (row, h, col, w, ...) -> (n*h, n*w, ...)
    trailing_axes = tuple(range(4, data.ndim + 1))
    data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + trailing_axes)
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])

    plt.imshow(data)
    plt.axis('off')
In [15]:
# Reshuffle the sample order and turn the net's verbose mode back off.
np.random.shuffle(idx)
ffnet.verbose=False
# Restore the parameters saved under 'checkpoint' (the debug cell above overwrote them).
net.load('checkpoint')
# Take the second mini-batch of the shuffled order as input.
b=1
x = train_x[idx[b * batch_size: (b + 1) * batch_size]]    
tx.copy_from_numpy(x)

# Forward pass requesting the outputs of the two named ReLU layers; the result is
# indexed by layer name below. The first argument (False) is presumably the
# training/evaluation flag -- TODO confirm against FeedForwardNet.forward.
r = net.forward(False, tx, ['relu1', 'relu2'])    
NOTE: If your model was saved using pickle, then set use_pickle=True for loading it
In [16]:
# relu1 output for the first sample of the batch, drawn as a grid of maps.
r1 = tensor.to_numpy(r['relu1'])[0]
vis_square(r1)
In [17]:
# relu2 output for the first sample of the batch, drawn as a grid of maps.
r2 = tensor.to_numpy(r['relu2'])[0]
vis_square(r2)
In [18]:
# Third parameter in the net's list; the initialisation printout shows this is conv2/weight.
p=net.param_values()[2]
print(p.shape)
(32, 288)
In [19]:
# First row of the (32, 288) weight matrix, viewed as 32 slices of 3x3 and tiled
# into a grid -- presumably one slice per input channel of the first conv2 filter.
vis_square(tensor.to_numpy(p)[0].reshape(32, 3,3))