#!/usr/bin/env python
# coding: utf-8

# # 5.9 Network in Network (NiN)
# - LeNet, AlexNet, and VGG all share a common design pattern:
#   - extract spatial features through a sequence of convolution and pooling layers,
#   - then post-process the representations via fully connected layers.
# - A careless use of a dense layer would destroy the spatial structure of the data entirely, since fully connected layers mangle all inputs together.
# - NiN
#   - Lin, Chen, and Yan, "Network In Network," 2013 - https://arxiv.org/pdf/1312.4400.pdf
#   - Idea: use an MLP on the channels of each pixel separately.

# ## 5.9.1 NiN Blocks
# - The inputs and outputs of convolutional layers are usually four-dimensional arrays:
#   - (example, channel, height, width).
# - The inputs and outputs of fully connected layers are usually two-dimensional arrays:
#   - (example, feature).
# - Once data has passed through a fully connected layer, it is virtually impossible to recover the spatial structure of the representation.
# - We could instead apply a fully connected layer at the pixel level:
#   - recall the $1\times 1$ convolutional layer;
#   - it can be viewed as a fully connected layer processing the channel activations of each pixel separately (a numerical check follows the code below).
#   - Another way to view this is to think of...
#     - each element in the spatial dimensions (height and width) as equivalent to an example,
#     - and each channel as equivalent to a feature.
# - NiN block
#   - uses $1\times 1$ convolutional layers instead of fully connected layers, so the spatial information is naturally passed to the subsequent layers;
#   - consists of one ordinary convolutional layer followed by two $1\times 1$ convolutional layers, and can be used within the convolutional stack to allow for more per-pixel nonlinearity.
# ![](https://github.com/d2l-ai/d2l-en/raw/master/img/nin-compare.svg?sanitize=true)

# In[33]:

import gluonbook as gb
from mxnet import gluon, init, nd
from mxnet.gluon import nn

def nin_block(num_channels, kernel_size, strides, padding):
    # One ordinary convolution followed by two 1 x 1 convolutions,
    # all with ReLU activations.
    blk = nn.Sequential()
    blk.add(
        nn.Conv2D(num_channels, kernel_size, strides, padding, activation='relu'),
        nn.Conv2D(num_channels, kernel_size=1, activation='relu'),
        nn.Conv2D(num_channels, kernel_size=1, activation='relu')
    )
    return blk
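
# - A minimal numerical check of the claim above (an illustrative addition, not part of the original text): a $1\times 1$ convolution without activation computes the same linear map as a dense layer applied to the channels of each pixel independently. Below we reuse the convolution's own weights in an explicit per-pixel dense computation and compare the outputs.

# In[ ]:

conv = nn.Conv2D(4, kernel_size=1)                  # no activation, so the map is linear
conv.initialize()
X = nd.random.uniform(shape=(2, 3, 5, 5))           # (example, channel, height, width)
Y = conv(X)
w = conv.weight.data().reshape((4, 3))              # (out_channels, in_channels)
b = conv.bias.data()
# Treat every pixel as a separate example with 3 features, apply the dense
# layer X_flat w^T + b, and restore the (example, channel, height, width) layout.
X_flat = nd.transpose(X, axes=(0, 2, 3, 1)).reshape((-1, 3))
Y_flat = nd.dot(X_flat, w.T) + b
Y_ref = nd.transpose(Y_flat.reshape((2, 5, 5, 4)), axes=(0, 3, 1, 2))
print((Y - Y_ref).abs().max())                      # difference should be ~0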

# ## 5.9.2 NiN Model
# - NiN uses convolutional layers with window shapes of 11 × 11, 5 × 5, and 3 × 3, and the corresponding numbers of output channels (96, 256, and 384) are the same as in AlexNet.
# - Each NiN block is followed by a maximum pooling layer with a stride of 2 and a window shape of 3 × 3.
# - The last NiN block has a number of output channels equal to the number of label classes and is followed by a global average pooling layer, which averages all elements of each channel for direct use in classification.
#   - In the global average pooling layer, the window shape equals the spatial dimensions (height and width) of its input.
#   - This significantly reduces the number of model parameters, thus mitigating overfitting.
#   - In other words, all operations up to the output are convolutions.
#   - However, this design sometimes results in an increase in model training time.
# - Note that the model below is a scaled-down variant with smaller windows and channel counts, sized for the 28 × 28 Fashion-MNIST inputs used later.

# In[48]:

net = nn.Sequential()
net.add(
    nin_block(24, kernel_size=3, strides=1, padding=0),
    nn.MaxPool2D(pool_size=2, strides=1),
    nin_block(64, kernel_size=3, strides=2, padding=2),
    nn.MaxPool2D(pool_size=2, strides=1),
    nin_block(96, kernel_size=4, strides=2, padding=1),
    nn.MaxPool2D(pool_size=3, strides=2),
    nn.Dropout(0.5),
    # There are 10 label classes.
    nin_block(10, kernel_size=3, strides=1, padding=1),
    # The global average pooling layer automatically sets the window shape
    # to the height and width of its input.
    nn.GlobalAvgPool2D(),
    # Transform the four-dimensional output into a two-dimensional output
    # with a shape of (batch size, 10).
    nn.Flatten()
)

# In[49]:

X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize(force_reinit=True)
for layer in net:
    X = layer(X)
    print("Layer.name {0:12s} - Output Shape - {1}".format(layer.name, X.shape))

# ## 5.9.3 Data Acquisition and Training
# - NiN removes the fully connected layers and replaces them with global average pooling (i.e., averaging over all locations) after reducing the number of channels to the desired number of outputs.
# - Removing the dense layers reduces overfitting; NiN has dramatically fewer parameters.
# - The NiN design influenced many subsequent convolutional neural network designs.

# In[50]:

lr = 0.05
num_epochs = 5
batch_size = 256
ctx = gb.try_gpu()

net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)
gb.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

# # 5.10 Networks with Parallel Concatenations (GoogLeNet)
# - Szegedy et al., "Going Deeper with Convolutions," 2014 - https://arxiv.org/abs/1409.4842
# - A pragmatic answer to the question of which convolution size is ideal for processing:
#   - 1 × 1, 3 × 3, 5 × 5, or even larger?
#   - It isn't always clear which one is best.
#   - As it turns out, a combination of all of the above works best.
# - Over the next few years, researchers made several improvements to GoogLeNet.

# ## 5.10.1 Inception Blocks
# ![](https://github.com/d2l-ai/d2l-en/raw/master/img/inception.svg?sanitize=true)
# - There are four parallel paths in the Inception block.
# - The first three paths
#   - use convolutional layers with window sizes of $1\times 1$, $3\times 3$, and $5\times 5$
#   - to extract information at different spatial scales.
# - The middle two paths
#   - first perform a $1\times 1$ convolution on the input to reduce the number of input channels,
#   - so as to reduce the model's complexity.
# - The fourth path
#   - uses a $3\times 3$ maximum pooling layer,
#   - followed by a $1\times 1$ convolutional layer to change the number of channels.
# - The four paths all use appropriate padding to give the input and output the same height and width.
# - Finally, we concatenate the outputs of the paths along the channel dimension and feed the result to the next layer (a shape check follows the code below).
# - The customizable parameters of the Inception block are the numbers of output channels per layer, which can be used to control the model complexity.

# In[53]:

import gluonbook as gb
from mxnet import gluon, init, nd
from mxnet.gluon import nn

class Inception(nn.Block):
    # c1 - c4 are the numbers of output channels for each layer in the path.
    def __init__(self, c1, c2, c3, c4, **kwargs):
        super(Inception, self).__init__(**kwargs)
        # Path 1 is a single 1 x 1 convolutional layer.
        self.p1_1 = nn.Conv2D(c1, kernel_size=1, activation='relu')
        # Path 2 is a 1 x 1 convolutional layer followed by a 3 x 3 convolutional layer.
        self.p2_1 = nn.Conv2D(c2[0], kernel_size=1, activation='relu')
        self.p2_2 = nn.Conv2D(c2[1], kernel_size=3, padding=1, activation='relu')
        # Path 3 is a 1 x 1 convolutional layer followed by a 5 x 5 convolutional layer.
        self.p3_1 = nn.Conv2D(c3[0], kernel_size=1, activation='relu')
        self.p3_2 = nn.Conv2D(c3[1], kernel_size=5, padding=2, activation='relu')
        # Path 4 is a 3 x 3 maximum pooling layer followed by a 1 x 1 convolutional layer.
        self.p4_1 = nn.MaxPool2D(pool_size=3, strides=1, padding=1)
        self.p4_2 = nn.Conv2D(c4, kernel_size=1, activation='relu')

    def forward(self, x):
        p1 = self.p1_1(x)
        p2 = self.p2_2(self.p2_1(x))
        p3 = self.p3_2(self.p3_1(x))
        p4 = self.p4_2(self.p4_1(x))
        # Concatenate the outputs on the channel dimension.
        return nd.concat(p1, p2, p3, p4, dim=1)

# - To understand why this works as well as it does, consider the combination of filters:
#   - details can be recognized efficiently by filters of different sizes.
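
# - A quick shape check (an illustrative addition, not part of the original text): for Inception(64, (96, 128), (16, 32), 32), the four paths emit 64, 128, 32, and 32 channels, so the concatenated output has $64 + 128 + 32 + 32 = 256$ channels while height and width are unchanged.

# In[ ]:

blk = Inception(64, (96, 128), (16, 32), 32)
blk.initialize()
X = nd.random.uniform(shape=(1, 192, 28, 28))  # 192 input channels, as block b3 below receives
print(blk(X).shape)                            # expect (1, 256, 28, 28)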

# ## 5.10.2 GoogLeNet Model
# - GoogLeNet uses a stack of 9 Inception blocks in total, plus global average pooling, to generate its estimates.
# - Maximum pooling between Inception blocks reduces the dimensionality.
# - The first part is identical to AlexNet and LeNet.
# - The stack of blocks is inherited from VGG.
# - The global average pooling avoids a stack of fully connected layers at the end.
# ![](https://github.com/d2l-ai/d2l-en/raw/master/img/inception-full.svg?sanitize=true)

# In[54]:

b1 = nn.Sequential()
b1.add(
    nn.Conv2D(64, kernel_size=7, strides=2, padding=3, activation='relu'),
    nn.MaxPool2D(pool_size=3, strides=2, padding=1)
)

# In[55]:

b2 = nn.Sequential()
b2.add(
    nn.Conv2D(64, kernel_size=1, activation='relu'),
    nn.Conv2D(192, kernel_size=3, padding=1, activation='relu'),
    nn.MaxPool2D(pool_size=3, strides=2, padding=1)
)

# In[56]:

b3 = nn.Sequential()
b3.add(
    Inception(64, (96, 128), (16, 32), 32),
    Inception(128, (128, 192), (32, 96), 64),
    nn.MaxPool2D(pool_size=3, strides=2, padding=1)
)

# In[57]:

b4 = nn.Sequential()
b4.add(
    Inception(192, (96, 208), (16, 48), 64),
    Inception(160, (112, 224), (24, 64), 64),
    Inception(128, (128, 256), (24, 64), 64),
    Inception(112, (144, 288), (32, 64), 64),
    Inception(256, (160, 320), (32, 128), 128),
    nn.MaxPool2D(pool_size=3, strides=2, padding=1)
)

# In[58]:

b5 = nn.Sequential()
b5.add(
    Inception(256, (160, 320), (32, 128), 128),
    Inception(384, (192, 384), (48, 128), 128),
    nn.GlobalAvgPool2D()
)

net = nn.Sequential()
net.add(b1, b2, b3, b4, b5, nn.Dense(10))

# In[59]:

X = nd.random.uniform(shape=(1, 1, 96, 96))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# ## 5.10.3 Data Acquisition and Training

# In[60]:

lr = 0.1
num_epochs = 5
batch_size = 256
ctx = gb.try_gpu()

net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)
gb.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

# - The Inception block is equivalent to a subnetwork with four paths.
# - It extracts information in parallel through convolutional layers of different window shapes and a maximum pooling layer.
# - $1 \times 1$ convolutions reduce channel dimensionality on a per-pixel level.
# - Maximum pooling reduces the resolution.
# - GoogLeNet connects multiple well-designed Inception blocks with other layers in series.
# - The ratios of the numbers of channels assigned in the Inception blocks were obtained through a large number of experiments on the ImageNet data set.
# - GoogLeNet, as well as its succeeding versions, was one of the most efficient models on ImageNet, providing similar test accuracy with lower computational complexity.
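
# - A rough, illustrative comparison (an addition, not part of the original text): thanks to global average pooling, GoogLeNet's classifier is a single small dense layer. A hypothetical AlexNet-style head that flattened the final 1024 × 3 × 3 feature map into two 4096-unit dense layers would alone need far more parameters than the whole network uses.

# In[ ]:

import numpy as np

# Assumes `net` above has been initialized and run, so all parameter shapes are known.
total = sum(np.prod(p.shape) for p in net.collect_params().values())
# Hypothetical AlexNet-style dense head on the flattened 1024 x 3 x 3 map.
dense_head = 1024 * 3 * 3 * 4096 + 4096 * 4096 + 4096 * 10
print('GoogLeNet parameters:          {:,}'.format(int(total)))
print('Hypothetical dense head alone: {:,}'.format(dense_head))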