This notebook reproduces the experiment in Section 5.7. of the paper

efficient planar affine canonicalization¶

submitted to Pattern Recognition

convolutional network with canonicalized inputs¶

In [1]:

%matplotlib inline

import matplotlib.pyplot as plt
import numpy             as np
import numpy.linalg      as la

import os
import urllib

def save(name):
    plt.savefig('results/'+name+'.pdf',bbox_inches='tight')

def figsize(width=6,height=4):
    '''Change figure size in the notebook'''
    plt.rc('figure', figsize = (width,height))

In [2]:

import umucv.canonic as can
import cv2 as cv

MNIST dataset¶

In [3]:

# wget http://robot.inf.um.es/material/mnist.npz
if not os.path.isfile('mnist.npz'):
    filepath, _ = urllib.request.urlretrieve('http://robot.inf.um.es/material/mnist.npz', 'mnist.npz')
    statinfo = os.stat(filepath)
    print('Succesfully downloaded mnist.npz ({} bytes).'.format(statinfo.st_size))

mnist = np.load("mnist.npz")
xl,yl,xt,yt = [mnist[d] for d in ['xl', 'yl', 'xt', 'yt']]
print(xl.shape, yl.shape)
print(xt.shape, yt.shape)

cl = np.argmax(yl,axis=1)
ct = np.argmax(yt,axis=1)

(60000, 784) (60000, 10)
(10000, 784) (10000, 10)

In [4]:

def shdig(v):
    '''Show mnist image'''
    x = np.reshape(v,[28,28])
    plt.imshow(1-x, 'gray', vmin=0, vmax=1, interpolation="nearest");

In [5]:

shdig(xl[5])

Show a few samples:

In [6]:

figsize(6,6)
plt.imshow(-np.bmat([[ x.reshape(28,28) for x in xl[10*k:10*(k+1)] ] for k in range(10)]),'gray'); plt.axis('off');

In [7]:

# optionally merge 6 and 9
if False:
    yl[cl==9] = np.array([0,0,0,0,0,0,1,0,0,0])
    yt[ct==9] = np.array([0,0,0,0,0,0,1,0,0,0])
    cl[cl==9] = 6
    ct[ct==9] = 6

canonicalization¶

Generate always only one canonic version.

In [8]:

def only1Canonic(x,  sz=(100,100), eps_r=0.2, eps_t=0.75, eps_s=0.04, eps_w=10):
    _, H, W = can.whitenedMoments(x, th=eps_w)
    SBar, S, K   = can.KSsignature(W)
    Ts    = [ t @ H for t in can.canonicTransforms((SBar,K),eps_r=eps_r, eps_t=eps_t, eps_s=eps_s, debug=False) ]
    return can.warpW(x, Ts[0], sz=sz, sigma=3)

In [9]:

# from vectorized to vectorized, same size
def cano(v):
    return np.ravel(only1Canonic(np.reshape(v,[28,28]),(28,28)))

In [10]:

figsize()
shdig(xl[37])

In [11]:

shdig(cano(xl[37]))

We canonicalize the whole data set and save it in compressed numpy format.

In [12]:

import os

if not os.path.isfile('mnist_canonic.npz'):
    xlc = xl.copy()
    for k in range(len(xl)):
        if k%5000 == 0: print(k)
        xlc[k] = cano(xl[k])

    xtc = xt.copy()
    for k in range(len(xt)):
        if k%1000 == 0: print(k)
        xtc[k] = cano(xt[k])

    np.savez_compressed('mnist_canonic',xlc=xlc,yl=yl,xtc=xtc,yt=yt)
else:
    mnist_canonic = np.load("mnist_canonic.npz")
    xlc,yl,xtc,yt = [mnist_canonic[d] for d in ['xlc', 'yl', 'xtc', 'yt']]

xlc=xlc.reshape(-1,28,28,1)
xtc=xtc.reshape(-1,28,28,1)

In [13]:

print(xlc.shape)
print(xtc.shape)

(60000, 28, 28, 1)
(10000, 28, 28, 1)

We can take a look at the canonic versions:

In [14]:

def samp(k):
    sel = xlc
    return np.pad(np.bmat([[ x.reshape(28,28) for x in sel[10*k:10*(k+1)] ] for k in range(10)]),10,'constant')

In [15]:

figsize(6,6)
plt.imshow(-samp(0),'gray'); plt.axis('off');

convolutional network¶

In [16]:

import tensorflow as tf

from IPython.display import clear_output

We first define some utilities:

In [17]:

def mkSaver(path):
    saver = tf.train.Saver(tf.all_variables())
    def save(n):
          saver.save(sess, path+'/model', global_step=n)
    def restore():
        saver.restore(sess, tf.train.latest_checkpoint(path))
    return save,restore

In [18]:

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1], padding='SAME')

We create the network described in the paper.

In [19]:

g = tf.Graph()

with g.as_default():
    
    x = tf.placeholder("float", shape=[None, 784])
    y_ = tf.placeholder("float", shape=[None, 10])
    
    x_image = tf.reshape(x, [-1,28,28,1])
    
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])

    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])

    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    keep_prob = tf.placeholder("float")
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])

    y = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
    
    cost = -tf.reduce_sum(y_*tf.log(y))
    
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cost)
    
    correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    
    init = tf.initialize_all_variables()
    save,restore = mkSaver('models/convo')
    
sess = tf.Session(graph=g)
sess.run(init)

Convenience functions for evaluation of accuracy and training:

In [20]:

def accu(X,Y):
    xb = X.reshape(-1,1000,784)
    yb = Y.reshape(-1,1000,10)
    acc = np.mean([sess.run(accuracy,feed_dict={x: xb[k], y_: yb[k], keep_prob: 1.0})
                   for k in range(len(xb))])
    return acc

In [21]:

def epoch(X,Y, batch=100):
    n = len(X)
    perm = list(range(n))
    np.random.shuffle(perm)
    xb = X[perm]
    xb = xb.reshape(-1,batch,784)
    yb = Y[perm]
    yb = yb.reshape(-1,batch,10)
    for k in range(n//batch):
        sess.run(train_step,feed_dict={x: xb[k], y_: yb[k], keep_prob: 0.5})

After initialization:

In [22]:

accu(xtc,yt)

Out[22]:

0.0656

After $\sim 50$ epochs:

In [28]:

for p in range(10):
    epoch(xlc,yl)
    print(accu(xtc,yt))

In [23]:

#save(6000)
restore()

In [24]:

accu(xtc,yt), accu(xlc,yl)

Out[24]:

(0.97180003, 0.99703336)

We get $97\%$ accuray in the test set. (There is slight overtraining as the training set is almost perfectly memorized.)

We can now easily use the machine for prediction:

In [25]:

dig = xtc[1235]

figsize()
shdig(dig)

pred = sess.run(y, feed_dict={x: dig.reshape(1,-1), keep_prob: 1.0})

print(pred)

np.argmax(pred,axis=1)

[[  4.16875388e-12   8.19726286e-13   9.36033138e-14   5.43449119e-07
    7.14271697e-10   9.99995708e-01   1.57225681e-08   1.19578758e-09
    1.10670202e-08   3.63725940e-06]]

Out[25]:

array([5])

confusion matrix¶

In [26]:

from sklearn.metrics import confusion_matrix

In [27]:

def maqpred(xs):
    return np.argmax(sess.run(y, feed_dict={x: xs, keep_prob: 1.0}),axis=1)

In [28]:

def maqprobs(xs):
    return sess.run(y, feed_dict={x: xs, keep_prob: 1.0})

In [29]:

ypred = maqpred(xtc.reshape(-1,784))
ypred

Out[29]:

array([7, 2, 1, ..., 4, 5, 6])

In [30]:

ytrue = ct
ytrue

Out[30]:

array([7, 2, 1, ..., 4, 5, 6])

In [31]:

confusion_matrix(ytrue, ypred)

Out[31]:

array([[ 973,    0,    1,    0,    0,    1,    1,    1,    0,    3],
       [   0, 1128,    0,    0,    1,    0,    0,    6,    0,    0],
       [   3,    1, 1012,    1,    0,    0,    0,    9,    3,    3],
       [   3,    0,    3,  996,    3,    1,    1,    1,    2,    0],
       [   0,    1,    2,    0,  968,    1,    2,    1,    4,    3],
       [   2,    0,    0,    2,    6,  868,    2,    3,    0,    9],
       [   2,    0,    0,    0,    2,    3,  896,   17,    0,   38],
       [   0,    1,    9,    1,   10,    2,    0, 1004,    0,    1],
       [   2,    0,    3,    4,    3,    0,    1,    1,  958,    2],
       [   0,    2,    1,    1,    4,   15,   66,    2,    3,  915]])

In [32]:

if False:
    from tabulate import tabulate

    print(tabulate(confusion_matrix(np.argmax(yt,axis=1), maqpred(xtc)), tablefmt='latex'))

In [33]:

mistakes = ypred != ytrue
mistakes.sum()

Out[33]:

In [34]:

figsize(8,8)
plt.imshow(- np.pad(np.bmat([[ x.reshape(28,28) for x in xtc[mistakes][10*k:10*(k+1)] ] 
                             for k in range(18)]),10,'constant'),'gray', interpolation= 'nearest');
plt.axis('off');
#save('errorscan')

In [35]:

figsize(8,8)
plt.imshow(- np.pad(np.bmat([[ x.reshape(28,28) for x in xt[mistakes][10*k:10*(k+1)] ] 
                             for k in range(18)]),10,'constant'),'gray', interpolation= 'nearest');
plt.axis('off');
#save('errorsorig')

In [36]:

figsize(8,8)
plt.imshow(- np.pad(np.bmat([[ x.reshape(28,28) for x in xtc[np.logical_not(mistakes)][20*k:20*(k+1)] ] 
                             for k in range(20)]),10,'constant'),'gray', interpolation= 'nearest');
plt.axis('off');
#save('noerrorscan')

real image¶

In [37]:

from skimage import io

path = "http://robot.inf.um.es/material/va/images/"

view = io.imread(path+"handdigits.png")

In [38]:

figsize()
plt.imshow(view);

In [54]:

def findShapes(x, th=128, area=(100,1000)):
    ret, gt = cv.threshold(cv.cvtColor(view,cv.COLOR_RGB2GRAY),th,255,cv.THRESH_BINARY_INV)
    n,cc = cv.connectedComponents(gt.copy())
    ok = [ cc==k for k in range(1,n) ]
    def extract(mask):
        thing = np.argwhere(mask)
        (x1, y1), (x2, y2) = thing.min(0), thing.max(0)
        box = mask[x1:x2+1,y1:y2+1].astype(np.float32)
        return ((x1+x2)/2,(y1+y2)/2), box
    detected = [ extract(s) for s in ok ]
    return [ p for p in detected if area[0] < p[1].sum() < area[1] ]

In [55]:

detected = findShapes(view,125,(50,500))
print(len(detected))

In [56]:

figsize(12,4)
plt.imshow(-np.hstack([can.normalize(p,(100,100)) for _,p in detected]),'gray'); plt.axis('off');
#save('hand1')

In [42]:

plt.imshow(-np.hstack([only1Canonic(p,sz=(28,28),eps_w=11) for _,p in detected]),'gray'); plt.axis('off');
#save('hand2')

In [43]:

maqpred([only1Canonic(p,sz=(28,28),eps_w=11).ravel() for _,p in detected])

Out[43]:

array([0, 1, 2, 9, 3, 8, 4, 7, 5, 6, 6, 5, 7, 8, 4, 9, 3, 2, 1, 0])

In [44]:

figsize()
plt.imshow(view)

for (x1,y1), box in detected:    
    [l] = maqpred([only1Canonic(box,sz=(28,28)).ravel()])
    plt.annotate(l,(y1,x1),color='red',fontsize=14)

#save('hand3b')

In [45]:

for _, box in detected:
    [l] = maqprobs([only1Canonic(box,sz=(28,28)).ravel()])
    print(np.argmax(l), max(l/l.sum()))

discrimination 6 vs 9¶

In [46]:

sum(maqpred(xtc[ct==6].reshape(-1,28*28)) == 9)

Out[46]:

In [47]:

sum(maqpred(xtc[ct==9].reshape(-1,28*28)) == 6)

Out[47]:

synthetic weak perspective views¶

In [48]:

from umucv.htrans import tilt
from numpy.random import randint

def can1(x):
    return only1Canonic(x,(28,28))

views = [(0,0)]+[(a,b) for a in range(0,360,30) for b in range(0,360,45)]

# f:  fov  (degrees, e.g. 30)
# t:  tilt (degrees, e.g. 40)
# sz: desired size (default = same as input)
def mkVariations(f,t,x,sz=None):
    if sz:
        xn =  can.normalize(x,sz)
    else:
        xn = x
    return [can1(xn)]+[can1(tilt(f, a, t, b, xn)) for a in range(0,360,30) for b in range(0,360,45)]

def randomView(f,t,x,sz=None):
    if sz:
        xn =  can.normalize(x,sz)
    else:
        xn = x
    a,b = views[randint(len(views))]
    return can1(tilt(f, a, t, b, xn)) 

In [50]:

# full expansion of test set
if False:
    bigx = np.array([ mkVariations(30,40,x.reshape(28,28),(100,100)) for x in xtc[:5000]]).reshape(-1,28,28,1)
    bigy = np.array([ [y for k in range(97)] for y in yt[:5000]]).reshape(-1,10)
    bigc = np.argmax(bigy,axis=1)

    print(bigx.shape)
    print(bigy.shape)
    print(bigc.shape)

    accu(bigx,bigy)

Canonicalization of randomly chosen synthetic perspective views:

In [51]:

ranx = np.array([ randomView(30,40,x.reshape(28,28),(100,100)) for x in xtc]).reshape(-1,28,28,1)

In [52]:

accu(ranx,yt)

Out[52]:

0.95679998

end¶

In [53]:

# custom notebook style
from IPython.display import HTML
HTML(open('nb1.css').read())

Out[53]: