We will treat images and hidden layers as two-dimensional arrays; that is, $x[i,j]$ and $h[i,j]$ denote the values at position $(i,j)$ in the image and the hidden layer, respectively.
In this case a dense layer can be written as follows: $$h[i,j] = \sum_{k,l} W[i,j,k,l] \cdot x[k,l] = \sum_{a, b} V[i,j,a,b] \cdot x[i+a,j+b],$$ where the second equality simply re-indexes the weights via $V[i,j,a,b] = W[i,j,i+a,j+b]$.
*Translation invariance*: a shift in the input $x$ should simply lead to a shift in the hidden layer $h$. This is only possible if $V$ does not actually depend on the position $(i,j)$, i.e. $V[i,j,a,b] = V[a,b]$, so that $$h[i,j] = \sum_{a, b} V[a,b] \cdot x[i+a,j+b].$$
*Locality*: to compute $h[i,j]$ we should not have to look far away from position $(i,j)$, i.e. $V[a,b] = 0$ whenever $|a| > \Delta$ or $|b| > \Delta$, so that $$h[i,j] = \sum_{a=-\Delta}^{\Delta} \sum_{b=-\Delta}^{\Delta} V[a,b] \cdot x[i+a,j+b].$$ This is, in a nutshell, a convolutional layer.
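The corr2d function below implements exactly this windowed weighted sum (strictly speaking a *cross-correlation* rather than a convolution, but the distinction is immaterial for learned kernels) for a two-dimensional input X and kernel K.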
from mxnet import autograd, nd
from mxnet.gluon import nn

# This function has been saved in the gluonbook package for future use.
def corr2d(X, K):
    h, w = K.shape
    Y = nd.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i: i + h, j: j + w] * K).sum()
    return Y
X = nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
K = nd.array([[0, 1], [2, 3]])
corr2d(X, K)
[[19. 25.]
 [37. 43.]]
<NDArray 2x2 @cpu(0)>
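To see where these values come from: the top-left output element is the sum of the elementwise product of the kernel with the top-left $2\times 2$ window of the input, $0\times0 + 1\times1 + 3\times2 + 4\times3 = 19$; sliding the window one column to the right gives $1\times0 + 2\times1 + 4\times2 + 5\times3 = 25$, and so on.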
class Conv2D(nn.Block):
    def __init__(self, kernel_size, **kwargs):
        super(Conv2D, self).__init__(**kwargs)
        self.weight = self.params.get('weight', shape=kernel_size)
        self.bias = self.params.get('bias', shape=(1,))

    def forward(self, x):
        return corr2d(x, self.weight.data()) + self.bias.data()
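As a quick sanity check (our usage example, not from the original text; the variable name conv is ours), we can instantiate this block, initialize its parameters, and apply it to an input:

conv = Conv2D(kernel_size=(1, 2))
conv.initialize()
# A (3, 3) input yields a (3, 2) cross-correlation output, to which the
# scalar bias is broadcast.
conv(nd.ones((3, 3))).shape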
X = nd.ones((6, 8))
X[:, 2:6] = 0
X
[[1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]]
<NDArray 6x8 @cpu(0)>
K = nd.array([[1, -1]])
Y = corr2d(X, K)
Y
[[ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]]
<NDArray 6x7 @cpu(0)>
corr2d(X.T, K)
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
<NDArray 8x5 @cpu(0)>
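The kernel K computes horizontal differences, so it only detects vertical edges and finds nothing in the transposed input. As a quick check (our addition, not in the original), transposing the kernel as well detects the edges of the transposed image, which are now horizontal:

corr2d(X.T, K.T)

This produces a row of 1s at the upper transition and a row of -1s at the lower transition, in an output of shape (7, 6).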
conv2d = nn.Conv2D(channels=1, kernel_size=(1, 2))
conv2d.initialize()
X = X.reshape((1, 1, 6, 8))
Y = Y.reshape((1, 1, 6, 7))
(X, Y)
(
[[[[1. 1. 0. 0. 0. 0. 1. 1.]
   [1. 1. 0. 0. 0. 0. 1. 1.]
   [1. 1. 0. 0. 0. 0. 1. 1.]
   [1. 1. 0. 0. 0. 0. 1. 1.]
   [1. 1. 0. 0. 0. 0. 1. 1.]
   [1. 1. 0. 0. 0. 0. 1. 1.]]]]
<NDArray 1x1x6x8 @cpu(0)>,
[[[[ 0.  1.  0.  0.  0. -1.  0.]
   [ 0.  1.  0.  0.  0. -1.  0.]
   [ 0.  1.  0.  0.  0. -1.  0.]
   [ 0.  1.  0.  0.  0. -1.  0.]
   [ 0.  1.  0.  0.  0. -1.  0.]
   [ 0.  1.  0.  0.  0. -1.  0.]]]]
<NDArray 1x1x6x7 @cpu(0)>)
for i in range(10):
    with autograd.record():
        Y_hat = conv2d(X)
        l = (Y_hat - Y) ** 2
    l.backward()
    # For the sake of simplicity, we ignore the bias here.
    conv2d.weight.data()[:] -= 3e-2 * conv2d.weight.grad()
    print('batch %d, loss %.3f' % (i + 1, l.sum().asscalar()))
batch 1, loss 12.495
batch 2, loss 5.132
batch 3, loss 2.111
batch 4, loss 0.871
batch 5, loss 0.360
batch 6, loss 0.150
batch 7, loss 0.063
batch 8, loss 0.027
batch 9, loss 0.012
batch 10, loss 0.005
conv2d.weight.data().reshape((1, 2))
[[ 0.9917276 -0.9848021]]
<NDArray 1x2 @cpu(0)>
If a total of $p_h$ rows are padded on both sides of the height and a total of $p_w$ columns are padded on both sides of the width, the output shape will be $$(n_h-k_h+p_h+1)\times(n_w-k_w+p_w+1).$$
This means that the height and width of the output will increase by $p_h$ and $p_w$, respectively, compared with the unpadded case.
In many cases, we will want to set $p_h=k_h-1$ and $p_w=k_w-1$ to give the input and output the same height and width.
Convolutional neural networks often use convolution *kernels with odd height and width values*, such as 1, 3, 5, or 7, so that with $p_h = k_h - 1$ and $p_w = k_w - 1$ we can pad the same number of rows on the top and bottom, and the same number of columns on the left and right.
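For example, with $n_h = n_w = 8$, $k_h = k_w = 3$, and $p_h = p_w = k_h - 1 = 2$, the output shape is $(8-3+2+1)\times(8-3+2+1) = 8\times 8$, exactly as in the first code example below.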
from mxnet import nd
from mxnet.gluon import nn
# We define a convenience function to calculate the convolutional layer. This
# function initializes the convolutional layer weights and performs
# corresponding dimensionality elevations and reductions on the input and
# output.
def comp_conv2d(conv2d, X):
    conv2d.initialize()
    # (1, 1) indicates that the batch size and the number of channels
    # (described in later chapters) are both 1.
    X = X.reshape((1, 1) + X.shape)
    Y = conv2d(X)
    # Exclude the first two dimensions that do not interest us: batch and
    # channel.
    return Y.reshape(Y.shape[2:])
# Note that here 1 row or column is padded on either side, so a total of 2 rows or columns are added.
conv2d = nn.Conv2D(1, kernel_size=3, padding=1)
X = nd.random.uniform(shape=(8, 8))
comp_conv2d(conv2d, X).shape
(8, 8)
# Here, we use a convolution kernel with a height of 5 and a width of 3. The padding numbers on
# both sides of the height and width are 2 and 1, respectively.
conv2d = nn.Conv2D(1, kernel_size=(5, 3), padding=(2, 1))
comp_conv2d(conv2d, X).shape
(8, 8)
conv2d = nn.Conv2D(1, kernel_size=3, padding=1, strides=2)
comp_conv2d(conv2d, X).shape
(4, 4)
conv2d = nn.Conv2D(1, kernel_size=(3, 5), padding=(0, 1), strides=(3, 4))
comp_conv2d(conv2d, X).shape
(2, 2)
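More generally, when the stride is $(s_h, s_w)$, the output shape becomes $$\left\lfloor (n_h-k_h+p_h+s_h)/s_h \right\rfloor \times \left\lfloor (n_w-k_w+p_w+s_w)/s_w \right\rfloor.$$ This accounts for both examples above: $\lfloor (8-3+2+2)/2 \rfloor = 4$ for the first, and $\lfloor (8-3+0+3)/3 \rfloor = 2$, $\lfloor (8-5+2+4)/4 \rfloor = 2$ for the second.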
When the padding number on both sides of the input height and width are $p_h$ and $p_w$ respectively, we call the padding $(p_h, p_w)$.
Specifically, when $p_h = p_w = p$, the padding is $p$.
When the strides on the height and width are $s_h$ and $s_w$, respectively, we call the stride $(s_h, s_w)$.
Specifically, when $s_h = s_w = s$, the stride is $s$.
By default, the padding is 0 and the stride is 1.
In practice we rarely use inhomogeneous strides or padding, i.e. we usually have $p_h = p_w$ and $s_h = s_w$.
import gluonbook as gb
from mxnet import nd

def corr2d_multi_in(X, K):
    # First, traverse along the 0th (channel) dimension of X and K. Then, use
    # * to turn the list of per-channel results into positional arguments of
    # add_n, which sums them elementwise.
    return nd.add_n(*[gb.corr2d(x, k) for x, k in zip(X, K)])
X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
K = nd.array([[[0, 1], [2, 3]],
              [[1, 2], [3, 4]]])
corr2d_multi_in(X, K)
[[ 56.  72.]
 [104. 120.]]
<NDArray 2x2 @cpu(0)>
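The first output element is the sum of the per-channel cross-correlations at the top-left window: $(0\times0 + 1\times1 + 3\times2 + 4\times3) + (1\times1 + 2\times2 + 4\times3 + 5\times4) = 19 + 37 = 56$.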
def corr2d_multi_in_out(X, K):
    # Traverse along the 0th dimension of K and, each time, perform
    # cross-correlation operations with the entire input X. All of the
    # results are merged together using the stack function.
    return nd.stack(*[corr2d_multi_in(X, k) for k in K])
K = nd.stack(K, K + 1, K + 2)
K.shape
(3, 2, 2, 2)
corr2d_multi_in_out(X, K)
[[[ 56.  72.]
  [104. 120.]]

 [[ 76. 100.]
  [148. 172.]]

 [[ 96. 128.]
  [192. 224.]]]
<NDArray 3x2x2 @cpu(0)>
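Note that the first of the three output channels reproduces the single-output result computed above, since the first kernel in the stacked K is the original K.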
Finally, let us consider the $1 \times 1$ convolution, i.e. $k_h = k_w = 1$. Because the minimum window is used, a $1\times 1$ convolution loses the ability of the convolutional layer to recognize patterns composed of adjacent elements in the height and width dimensions; its main computation instead occurs on the channel dimension.
def corr2d_multi_in_out_1x1(X, K):
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))    # (c_i, h * w) = (3, 9) in the example below
    K = K.reshape((c_o, c_i))      # (c_o, c_i) = (2, 3)
    Y = nd.dot(K, X)               # Matrix multiplication, as in a fully connected layer
    return Y.reshape((c_o, h, w))  # (c_o, h, w) = (2, 3, 3)
X = nd.random.uniform(shape=(3, 3, 3))
K = nd.random.uniform(shape=(2, 3, 1, 1))
Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)
(Y1 - Y2).norm().asscalar() < 1e-6
True
The $1\times 1$ convolutional layer is equivalent to the fully connected layer, when applied on a per pixel basis.
The $1\times 1$ convolutional layer is typically used to adjust the number of channels between network layers and to control model complexity.
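As a small illustration of this channel-adjustment role (our sketch; the channel counts 192 and 32 are arbitrary assumptions, not values from the text), a $1\times 1$ convolution maps a 192-channel feature map to 32 channels while leaving the spatial dimensions untouched:

from mxnet import nd
from mxnet.gluon import nn

conv1x1 = nn.Conv2D(channels=32, kernel_size=1)  # hypothetical channel counts
conv1x1.initialize()
conv1x1(nd.random.uniform(shape=(1, 192, 28, 28))).shape  # -> (1, 32, 28, 28)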
Pooling computes one output value for each fixed-shape window (also known as a pooling window) of the input data. For example, $2\times 2$ max pooling over the $3\times 3$ input $[[0, 1, 2], [3, 4, 5], [6, 7, 8]]$ produces four outputs, each the maximum over its pooling window: $$\max(0,1,3,4)=4,\quad \max(1,2,4,5)=5,\quad \max(3,4,6,7)=7,\quad \max(4,5,7,8)=8.$$
A pooling layer with a pooling window of shape $p \times q$ is called a $p \times q$ pooling layer, and the pooling operation is called $p \times q$ pooling.
That is to say, with a $2\times 2$ max pooling layer, the pattern recognized by the convolutional layer is still detected even if it shifts by no more than one element in height or width.
from mxnet import nd
from mxnet.gluon import nn
def pool2d(X, pool_size, mode='max'):
    p_h, p_w = pool_size
    Y = nd.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            if mode == 'max':
                Y[i, j] = X[i: i + p_h, j: j + p_w].max()
            elif mode == 'avg':
                Y[i, j] = X[i: i + p_h, j: j + p_w].mean()
    return Y
X = nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
pool2d(X, (2, 2))
[[4. 5.]
 [7. 8.]]
<NDArray 2x2 @cpu(0)>
pool2d(X, (2, 2), 'avg')
[[2. 3.]
 [5. 6.]]
<NDArray 2x2 @cpu(0)>
Below, we demonstrate max pooling using the MaxPool2D class in the nn module. First, we construct the input:

X = nd.arange(16).reshape((1, 1, 4, 4))
X

[[[[ 0.  1.  2.  3.]
   [ 4.  5.  6.  7.]
   [ 8.  9. 10. 11.]
   [12. 13. 14. 15.]]]]
<NDArray 1x1x4x4 @cpu(0)>

By default, the stride of a MaxPool2D instance has the same shape as the pooling window, so a pooling window of shape (3, 3) also uses a stride of (3, 3):

pool2d = nn.MaxPool2D(3)
pool2d(X)
[[[[10.]]]]
<NDArray 1x1x1x1 @cpu(0)>
pool2d = nn.MaxPool2D(3, padding=1, strides=2)
pool2d(X)
[[[[ 5.  7.]
   [13. 15.]]]]
<NDArray 1x1x2x2 @cpu(0)>
pool2d = nn.MaxPool2D((2, 3), padding=(1, 2), strides=(2, 3))
pool2d(X)
[[[[ 0.  3.]
   [ 8. 11.]
   [12. 15.]]]]
<NDArray 1x1x3x2 @cpu(0)>
X = nd.arange(16).reshape((1, 1, 4, 4))
X = nd.concat(X, X + 1, dim=1)
X
[[[[ 0.  1.  2.  3.]
   [ 4.  5.  6.  7.]
   [ 8.  9. 10. 11.]
   [12. 13. 14. 15.]]

  [[ 1.  2.  3.  4.]
   [ 5.  6.  7.  8.]
   [ 9. 10. 11. 12.]
   [13. 14. 15. 16.]]]]
<NDArray 1x2x4x4 @cpu(0)>
pool2d = nn.MaxPool2D(3, padding=1, strides=2)
pool2d(X)
[[[[ 5.  7.]
   [13. 15.]]

  [[ 6.  8.]
   [14. 16.]]]]
<NDArray 1x2x2x2 @cpu(0)>
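Gluon also provides an AvgPool2D class with the same window, padding, and stride semantics. As a final sketch (our addition, not from the original text), swapping in average pooling on the same two-channel input yields an output of the same shape as the max pooling result above:

pool2d = nn.AvgPool2D(3, padding=1, strides=2)
pool2d(X).shape  # (1, 2, 2, 2)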