#!/usr/bin/env python
# coding: utf-8

# # 3.5 Image Classification Data (Fashion-MNIST)

# In[102]:

get_ipython().run_line_magic('matplotlib', 'inline')
import gluonbook as gb
from mxnet.gluon import data as gdata
import sys
import time

# In[103]:

mnist_train = gdata.vision.FashionMNIST(train=True)
mnist_test = gdata.vision.FashionMNIST(train=False)

# In[104]:

len(mnist_train), len(mnist_test)

# In[105]:

feature, label = mnist_train[0]
print(feature.shape, feature.dtype)
print(label, label.shape, label.dtype)

# - There are 10 categories in Fashion-MNIST:
#   - t-shirt, trouser (pants), pullover (a sweater pulled on over the head), dress, coat, sandal, shirt, sneaker, bag, and ankle boot.

# In[106]:

def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

# In[107]:

def show_fashion_mnist(images, labels):
    gb.use_svg_display()
    _, figs = gb.plt.subplots(1, len(images), figsize=(12, 12))
    for f, img, lbl in zip(figs, images, labels):
        f.imshow(img.reshape((28, 28)).asnumpy())
        f.set_title(lbl)
        f.axes.get_xaxis().set_visible(False)
        f.axes.get_yaxis().set_visible(False)

# In[108]:

X, y = mnist_train[0:9]
show_fashion_mnist(X, get_fashion_mnist_labels(y))

# ## 3.5.2 Reading a Minibatch

# - A handy feature of Gluon's `DataLoader` is the ability to use multiple processes to speed up data reading.
#   - We set aside 4 worker processes to read the data (via `num_workers`).
#   - Windows does not support multi-process data loading here, so `num_workers` is set to 0 on that platform.
# - Using the `ToTensor` class
#   - https://mxnet.incubator.apache.org/_modules/mxnet/gluon/data/vision/transforms.html
#   - The image data is converted from uint8 to 32-bit floating-point numbers.
#   - The image channel is moved from the last dimension to the first dimension, which facilitates the convolution calculations introduced later.
#   - That is, it converts an image NDArray of shape (H x W x C) in the range [0, 255] to a float32 NDArray of shape (C x H x W) in the range [0, 1).
# - Through the `transform_first` function of the dataset, we apply the `ToTensor` transformation to the first element of each data example (the image, leaving the label untouched); see the quick shape check below.

# In[109]:

batch_size = 256
transformer = gdata.vision.transforms.ToTensor()

if sys.platform.startswith('win'):
    num_workers = 0  # 0 means no additional processes are needed to speed up the reading of data.
else:
    num_workers = 4

train_iter = gdata.DataLoader(
    dataset=mnist_train.transform_first(transformer),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)

test_iter = gdata.DataLoader(
    dataset=mnist_test.transform_first(transformer),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)
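# - To make the `ToTensor` conversion above concrete, here is a minimal shape check (a sketch added for
#   illustration, not part of the original notebook): applying `transformer` to a single raw image turns a
#   uint8 NDArray of shape (28, 28, 1) into a float32 NDArray of shape (1, 28, 28) with values in [0, 1).

# In[ ]:

raw_feature, _ = mnist_train[0]
transformed_feature = transformer(raw_feature)
print(raw_feature.shape, raw_feature.dtype)                  # (28, 28, 1), uint8, values in [0, 255]
print(transformed_feature.shape, transformed_feature.dtype)  # (1, 28, 28), float32, values in [0, 1)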
# - The logic that we will use to obtain and read the Fashion-MNIST data set is encapsulated in the `gluonbook.load_data_fashion_mnist` function.
# - This function returns two variables, `train_iter` and `test_iter`.
# - Let's look at the time it takes to read the training data.

# In[110]:

start = time.time()
idx = 0
for X, y in train_iter:
    if idx < 5:
        print(type(X), X.shape, type(y), y.shape)
    idx += 1
'%.2f sec' % (time.time() - start)

# ## 3.6 Softmax Regression from Scratch

# In[111]:

get_ipython().run_line_magic('matplotlib', 'inline')
import gluonbook as gb
from mxnet import autograd, nd

# In[112]:

batch_size = 256
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)

# ## 3.6.1 Initialize Model Parameters

# - The weight and bias parameters of the softmax regression are matrices of size $784 \times 10$ and $1 \times 10$ respectively.
# - We initialize $W$ with Gaussian noise and $b$ with zeros.

# In[113]:

num_inputs = 784
num_outputs = 10

W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)

# - As before, we have to attach a gradient to the model parameters.

# In[114]:

W.attach_grad()
b.attach_grad()

# ## 3.6.2 The Softmax

# - Given a matrix X, we can `sum`
#   - 1) over all elements (the default), or
#   - 2) only over elements in the same column (axis=0) or the same row (axis=1).
# - We can retain the same dimensionality rather than collapsing out the dimension that we summed over, if required (`keepdims=True`).
#   - By default, `keepdims=False`.
#   - https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.sum

# In[116]:

X = nd.array([[1, 2, 3], [4, 5, 6]])
print(X.sum(axis=0, keepdims=True))
print(X.sum(axis=0, keepdims=False))
print(X.sum(axis=1, keepdims=True))
print(X.sum(axis=1, keepdims=False))

# $$ \mathrm{softmax}(\mathbf{X})_{ij} = \frac{\exp(X_{ij})}{\sum_k \exp(X_{ik})} $$

# - We exponentiate each term using `exp` and then sum each row to get the normalization constant.

# In[119]:

def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition  # The broadcast mechanism is applied here.

# In[120]:

X = nd.random.normal(shape=(2, 5))
print(X)
X_prob = softmax(X)
print(X_prob)
print(X_prob.sum(axis=1))

# ## 3.6.3 The Model

# In[121]:

def net(X):
    return softmax(nd.dot(X.reshape((-1, num_inputs)), W) + b)

# ## 3.6.4 The Loss Function

# - The cross-entropy loss function
#   - It picks the label's predicted probability and takes its negative logarithm: $-\log p(y|x) = -\log \hat y$.
# - The `nd.pick` function
#   - It selects the appropriate terms from the matrix of softmax entries easily.

# In[122]:

y_hat = nd.array([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = nd.array([0, 2])
nd.pick(y_hat, y)

# In[129]:

def cross_entropy(y_hat, y):
    return - nd.pick(y_hat, y).log()

# ## 3.6.5 Classification Accuracy

# In[130]:

def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1) == y.astype('float32')).mean().asscalar()

# In[131]:

accuracy(y_hat, y)

# In[132]:

def evaluate_accuracy(data_iter, net):
    acc = 0
    for X, y in data_iter:
        acc += accuracy(net(X), y)
    return acc / len(data_iter)

# In[133]:

evaluate_accuracy(test_iter, net)
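# - As a quick worked check (a minimal sketch added for illustration, not part of the original notebook):
#   for the toy `y_hat` and `y` defined above, `cross_entropy` picks the probabilities 0.1 and 0.5, giving
#   losses $-\log 0.1 \approx 2.30$ and $-\log 0.5 \approx 0.69$; only the second row's argmax (index 2)
#   matches its label, so the accuracy is 0.5. The untrained `net`, by contrast, should be close to random
#   guessing over 10 classes, i.e. an accuracy of roughly 0.1.

# In[ ]:

print(cross_entropy(y_hat, y))  # roughly [2.30, 0.69]
print(accuracy(y_hat, y))       # 0.5: predictions are [2, 2], labels are [0, 2]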
# ## 3.6.6 Model Training

# In[134]:

num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
        train_l_sum = 0
        train_acc_sum = 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            if trainer is None:
                gb.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # This will be illustrated in the next section.
            train_l_sum += l.mean().asscalar()
            train_acc_sum += accuracy(y_hat, y)
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch {0}, loss {1:.4f}, train acc {2:.3f}, test acc {3:.3f}'.format(
            epoch + 1,
            train_l_sum / len(train_iter),
            train_acc_sum / len(train_iter),
            test_acc)
        )

# In[135]:

train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size,
          [W, b], lr, None)

# ## 3.6.7 Prediction

# In[136]:

for X, y in test_iter:
    break

true_labels = gb.get_fashion_mnist_labels(y.asnumpy())
pred_labels = gb.get_fashion_mnist_labels(net(X).argmax(axis=1).asnumpy())
titles = [truelabel + '\n' + predlabel
          for truelabel, predlabel in zip(true_labels, pred_labels)]

gb.show_fashion_mnist(X[0:9], titles[0:9])

# # 3.7 Softmax Regression in Gluon

# In[138]:

get_ipython().run_line_magic('matplotlib', 'inline')
import gluonbook as gb
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

# In[139]:

batch_size = 256
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)

# ## 3.7.1 Initialize Model Parameters

# In[140]:

net = nn.Sequential()
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
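# - A brief aside (a sketch added for illustration, not part of the original notebook): `nn.Dense(10)` never
#   mentions the 784 input features. Gluon defers the actual parameter allocation until the first batch flows
#   through the network, at which point the input dimension is inferred from the data, which is why
#   `net.initialize(...)` can be called before any data has been seen.

# In[ ]:

for X, y in test_iter:
    print(net(X).shape)                # (256, 10): one linear score per class
    print(net[0].weight.data().shape)  # (10, 784): inferred from the flattened 28 * 28 input
    break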
# ## 3.7.2 The Softmax

# - Recall that the softmax function calculates $\hat y_j = \frac{e^{z_j}}{\sum_{i=1}^{n} e^{z_i}}$,
#   where $\hat y_j$ is the $j$-th element of $\hat y$ and $z_j$ is the $j$-th element of the input `y_linear` variable.
# - For very large $z_j$, $e^{z_j}$ can overflow the floating-point range, so it must be handled with caution
#   (see the numerical sketch at the end of this section).
# - Because $\log(\exp(\cdot))$ cancels, we can avoid computing $e^{z_j}$ explicitly and work with $z_j$ directly:
#
# $$ \begin{aligned} \log{(\hat y_j)} & = \log\left( \frac{e^{z_j}}{\sum_{i=1}^{n} e^{z_i}} \right) \\
# & = \log{(e^{z_j})} - \log{\left( \sum_{i=1}^{n} e^{z_i} \right)} \\
# & = z_j - \log{\left( \sum_{i=1}^{n} e^{z_i} \right)} \end{aligned} $$
#
# - By the log-sum-exp trick for log-domain calculations
#   - https://en.wikipedia.org/wiki/LogSumExp
#
# $$ \begin{aligned} l_j & = -\log{(\hat y_j)} \\
# & = \log{\left( \sum_{i=1}^{n} e^{z_i} \right)} - z_j \\
# & = z^* + \log{\left( \sum_{i=1}^{n} e^{z_i - z^*} \right)} - z_j \end{aligned} $$
#
# where
#
# $$ z^* = \max \left\{ z_1, z_2, \ldots, z_n \right\} $$
#
# - The `SoftmaxCrossEntropyLoss` below combines the softmax and the cross-entropy in this numerically stable way.

# In[141]:

loss = gloss.SoftmaxCrossEntropyLoss()

# ## 3.7.3 Optimization Algorithm

# In[142]:

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

# ## 3.7.4 Training

# In[143]:

num_epochs = 5
gb.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
             None, None, trainer)
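# - To make the overflow discussion in 3.7.2 concrete, here is a minimal sketch (added for illustration, not
#   part of the original notebook): naively exponentiating a large score overflows float32, while the
#   log-sum-exp form with the row maximum subtracted stays finite.

# In[ ]:

from mxnet import nd  # already imported in the scratch implementation above

z = nd.array([[120.0, 0.0, -10.0]])

# Naive softmax: exp(120) overflows float32, producing inf and then nan after normalization.
z_exp = z.exp()
print(z_exp, z_exp / z_exp.sum(axis=1, keepdims=True))

# Log-sum-exp with the row maximum subtracted:
# log(sum_i exp(z_i)) = z* + log(sum_i exp(z_i - z*))
z_star = z.max(axis=1, keepdims=True)
log_sum_exp = z_star + (z - z_star).exp().sum(axis=1, keepdims=True).log()
print(z - log_sum_exp)  # finite log-probabilities (stable log-softmax)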