#!/usr/bin/env python
# coding: utf-8

# # 3.5 Image Classification Data (Fashion-MNIST)

# In[102]:

get_ipython().run_line_magic('matplotlib', 'inline')
import gluonbook as gb
from mxnet.gluon import data as gdata
import sys
import time

# In[103]:

mnist_train = gdata.vision.FashionMNIST(train=True)
mnist_test = gdata.vision.FashionMNIST(train=False)

# In[104]:

len(mnist_train), len(mnist_test)

# In[105]:

feature, label = mnist_train[0]
print(feature.shape, feature.dtype)
print(label, label.shape, label.dtype)

# - There are 10 categories in Fashion-MNIST:
#   - t-shirt, trouser (pants), pullover (a sweater pulled on over the head), dress, coat, sandal, shirt, sneaker, bag, and ankle boot.

# In[106]:

def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

# In[107]:

def show_fashion_mnist(images, labels):
    gb.use_svg_display()
    _, figs = gb.plt.subplots(1, len(images), figsize=(12, 12))
    for f, img, lbl in zip(figs, images, labels):
        f.imshow(img.reshape((28, 28)).asnumpy())
        f.set_title(lbl)
        f.axes.get_xaxis().set_visible(False)
        f.axes.get_yaxis().set_visible(False)

# In[108]:

X, y = mnist_train[0:9]
show_fashion_mnist(X, get_fashion_mnist_labels(y))

# ## 3.5.2 Reading a Minibatch

# - A handy feature of Gluon's `DataLoader` is the ability to use multiple processes to speed up data reading.
#   - We set aside 4 worker processes to read the data (via `num_workers`).
#   - Windows does not support multi-process data loading here, so `num_workers` is set to 0 on that platform.
# - Using the `ToTensor` class
#   - https://mxnet.incubator.apache.org/_modules/mxnet/gluon/data/vision/transforms.html
#   - The image data is converted from uint8 to 32-bit floating-point numbers.
#   - The image channel is moved from the last dimension to the first dimension, which facilitates the convolution calculations introduced later.
#   - That is, it converts an image NDArray of shape (H x W x C) in the range [0, 255] to a float32 NDArray of shape (C x H x W) in the range [0, 1).
# - Through the `transform_first` function of the dataset, we apply the `ToTensor` transformation to the first element of each data example (the image, leaving the label untouched); see the quick shape check below.

# In[109]:

batch_size = 256
transformer = gdata.vision.transforms.ToTensor()

if sys.platform.startswith('win'):
    num_workers = 0  # 0 means no additional processes are needed to speed up the reading of data.
else:
    num_workers = 4

train_iter = gdata.DataLoader(
    dataset=mnist_train.transform_first(transformer),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)

test_iter = gdata.DataLoader(
    dataset=mnist_test.transform_first(transformer),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)
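# - To make the `ToTensor` conversion above concrete, here is a minimal shape check (a sketch added for
#   illustration, not part of the original notebook): applying `transformer` to a single raw image turns a
#   uint8 NDArray of shape (28, 28, 1) into a float32 NDArray of shape (1, 28, 28) with values in [0, 1).

# In[ ]:

raw_feature, _ = mnist_train[0]
transformed_feature = transformer(raw_feature)
print(raw_feature.shape, raw_feature.dtype)                  # (28, 28, 1), uint8, values in [0, 255]
print(transformed_feature.shape, transformed_feature.dtype)  # (1, 28, 28), float32, values in [0, 1)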
# - The logic that we will use to obtain and read the Fashion-MNIST data set is encapsulated in the `gluonbook.load_data_fashion_mnist` function.
# - This function returns two variables, `train_iter` and `test_iter`.
# - Let's look at the time it takes to read the training data.

# In[110]:

start = time.time()
idx = 0
for X, y in train_iter:
    if idx < 5:
        print(type(X), X.shape, type(y), y.shape)
    idx += 1
'%.2f sec' % (time.time() - start)

# ## 3.6 Softmax Regression from Scratch

# In[111]:

get_ipython().run_line_magic('matplotlib', 'inline')
import gluonbook as gb
from mxnet import autograd, nd

# In[112]:

batch_size = 256
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)

# ## 3.6.1 Initialize Model Parameters

# - The weight and bias parameters of the softmax regression are matrices of size $784 \times 10$ and $1 \times 10$ respectively.
# - We initialize $W$ with Gaussian noise and $b$ with zeros.

# In[113]:

num_inputs = 784
num_outputs = 10

W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)

# - As before, we have to attach a gradient to the model parameters.

# In[114]:

W.attach_grad()
b.attach_grad()

# ## 3.6.2 The Softmax

# - Given a matrix X, we can `sum`
#   - 1) over all elements (the default), or
#   - 2) only over elements in the same column (axis=0) or the same row (axis=1).
# - We can retain the same dimensionality rather than collapsing out the dimension that we summed over, if required (`keepdims=True`).
#   - By default, `keepdims=False`.
#   - https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.sum

# In[116]:

X = nd.array([[1, 2, 3], [4, 5, 6]])
print(X.sum(axis=0, keepdims=True))
print(X.sum(axis=0, keepdims=False))
print(X.sum(axis=1, keepdims=True))
print(X.sum(axis=1, keepdims=False))

# $$ \mathrm{softmax}(\mathbf{X})_{ij} = \frac{\exp(X_{ij})}{\sum_k \exp(X_{ik})} $$

# - We exponentiate each term using `exp` and then sum each row to get the normalization constant.

# In[119]:

def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition  # The broadcast mechanism is applied here.

# In[120]:

X = nd.random.normal(shape=(2, 5))
print(X)
X_prob = softmax(X)
print(X_prob)
print(X_prob.sum(axis=1))

# ## 3.6.3 The Model

# In[121]:

def net(X):
    return softmax(nd.dot(X.reshape((-1, num_inputs)), W) + b)

# ## 3.6.4 The Loss Function

# - The cross-entropy loss function
#   - It picks the label's predicted probability and takes its negative logarithm: $-\log p(y|x) = -\log \hat y$.
# - The `nd.pick` function
#   - It selects the appropriate terms from the matrix of softmax entries easily.

# In[122]:

y_hat = nd.array([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = nd.array([0, 2])
nd.pick(y_hat, y)

# In[129]:

def cross_entropy(y_hat, y):
    return - nd.pick(y_hat, y).log()

# ## 3.6.5 Classification Accuracy

# In[130]:

def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1) == y.astype('float32')).mean().asscalar()

# In[131]:

accuracy(y_hat, y)

# In[132]:

def evaluate_accuracy(data_iter, net):
    acc = 0
    for X, y in data_iter:
        acc += accuracy(net(X), y)
    return acc / len(data_iter)

# In[133]:

evaluate_accuracy(test_iter, net)
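# - As a quick worked check (a minimal sketch added for illustration, not part of the original notebook):
#   for the toy `y_hat` and `y` defined above, `cross_entropy` picks the probabilities 0.1 and 0.5, giving
#   losses $-\log 0.1 \approx 2.30$ and $-\log 0.5 \approx 0.69$; only the second row's argmax (index 2)
#   matches its label, so the accuracy is 0.5. The untrained `net`, by contrast, should be close to random
#   guessing over 10 classes, i.e. an accuracy of roughly 0.1.

# In[ ]:

print(cross_entropy(y_hat, y))  # roughly [2.30, 0.69]
print(accuracy(y_hat, y))       # 0.5: predictions are [2, 2], labels are [0, 2]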
# ## 3.6.6 Model Training

# In[134]:

num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
        train_l_sum = 0
        train_acc_sum = 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            if trainer is None:
                gb.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # This will be illustrated in the next section.
            train_l_sum += l.mean().asscalar()
            train_acc_sum += accuracy(y_hat, y)
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch {0}, loss {1:.4f}, train acc {2:.3f}, test acc {3:.3f}'.format(
            epoch + 1,
            train_l_sum / len(train_iter),
            train_acc_sum / len(train_iter),
            test_acc)
        )

# In[135]:

train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size,
          [W, b], lr, None)

# ## 3.6.7 Prediction

# In[136]:

for X, y in test_iter:
    break

true_labels = gb.get_fashion_mnist_labels(y.asnumpy())
pred_labels = gb.get_fashion_mnist_labels(net(X).argmax(axis=1).asnumpy())
titles = [truelabel + '\n' + predlabel
          for truelabel, predlabel in zip(true_labels, pred_labels)]

gb.show_fashion_mnist(X[0:9], titles[0:9])

# # 3.7 Softmax Regression in Gluon

# In[138]:

get_ipython().run_line_magic('matplotlib', 'inline')
import gluonbook as gb
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

# In[139]:

batch_size = 256
train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)

# ## 3.7.1 Initialize Model Parameters

# In[140]:

net = nn.Sequential()
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
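# - A brief aside (a sketch added for illustration, not part of the original notebook): `nn.Dense(10)` never
#   mentions the 784 input features. Gluon defers the actual parameter allocation until the first batch flows
#   through the network, at which point the input dimension is inferred from the data, which is why
#   `net.initialize(...)` can be called before any data has been seen.

# In[ ]:

for X, y in test_iter:
    print(net(X).shape)                # (256, 10): one linear score per class
    print(net[0].weight.data().shape)  # (10, 784): inferred from the flattened 28 * 28 input
    break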
# ## 3.7.2 The Softmax

# - Recall that the softmax function calculates $\hat y_j = \frac{e^{z_j}}{\sum_{i=1}^{n} e^{z_i}}$,
#   where $\hat y_j$ is the $j$-th element of $\hat y$ and $z_j$ is the $j$-th element of the input `y_linear` variable.
# - For very large $z_j$, $e^{z_j}$ can overflow the floating-point range, so it must be handled with caution
#   (see the numerical sketch at the end of this section).
# - Because $\log(\exp(\cdot))$ cancels, we can avoid computing $e^{z_j}$ explicitly and work with $z_j$ directly:
#
# $$ \begin{aligned} \log{(\hat y_j)} & = \log\left( \frac{e^{z_j}}{\sum_{i=1}^{n} e^{z_i}} \right) \\
# & = \log{(e^{z_j})} - \log{\left( \sum_{i=1}^{n} e^{z_i} \right)} \\
# & = z_j - \log{\left( \sum_{i=1}^{n} e^{z_i} \right)} \end{aligned} $$
#
# - By the log-sum-exp trick for log-domain calculations
#   - https://en.wikipedia.org/wiki/LogSumExp
#
# $$ \begin{aligned} l_j & = -\log{(\hat y_j)} \\
# & = \log{\left( \sum_{i=1}^{n} e^{z_i} \right)} - z_j \\
# & = z^* + \log{\left( \sum_{i=1}^{n} e^{z_i - z^*} \right)} - z_j \end{aligned} $$
#
# where
#
# $$ z^* = \max \left\{ z_1, z_2, \ldots, z_n \right\} $$
#
# - The `SoftmaxCrossEntropyLoss` below combines the softmax and the cross-entropy in this numerically stable way.

# In[141]:

loss = gloss.SoftmaxCrossEntropyLoss()

# ## 3.7.3 Optimization Algorithm

# In[142]:

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

# ## 3.7.4 Training

# In[143]:

num_epochs = 5
gb.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
             None, None, trainer)
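# - To make the overflow discussion in 3.7.2 concrete, here is a minimal sketch (added for illustration, not
#   part of the original notebook): naively exponentiating a large score overflows float32, while the
#   log-sum-exp form with the row maximum subtracted stays finite.

# In[ ]:

from mxnet import nd  # already imported in the scratch implementation above

z = nd.array([[120.0, 0.0, -10.0]])

# Naive softmax: exp(120) overflows float32, producing inf and then nan after normalization.
z_exp = z.exp()
print(z_exp, z_exp / z_exp.sum(axis=1, keepdims=True))

# Log-sum-exp with the row maximum subtracted:
# log(sum_i exp(z_i)) = z* + log(sum_i exp(z_i - z*))
z_star = z.max(axis=1, keepdims=True)
log_sum_exp = z_star + (z - z_star).exp().sum(axis=1, keepdims=True).log()
print(z - log_sum_exp)  # finite log-probabilities (stable log-softmax)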