#!/usr/bin/env python
# coding: utf-8

# # 4.4 Custom Layers

# ## 4.4.1 Layers without Parameters

# In[1]:

from mxnet import gluon, nd
from mxnet.gluon import nn


class CenteredLayer(nn.Block):
    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)

    def forward(self, x):
        return x - x.mean()


# In[4]:

layer = CenteredLayer()
X = nd.array([1, 2, 3, 4, 5])
layer(X)


# In[3]:

net = nn.Sequential()
net.add(
    nn.Dense(128),
    CenteredLayer()
)
net.initialize()


# In[8]:

X = nd.random.uniform(shape=(4, 8))
y = net(X)
print(y.shape)
print(y.mean())


# ## 4.4.2 Layers with Parameters

# - The `Parameter` class and the `ParameterDict` dictionary provide some basic housekeeping functionality.
# - They govern access, initialization, sharing, saving, and loading of model parameters.
# - For instance, we can use the member variable `params` of the `ParameterDict` type that comes with the `Block` class.
# - It is a dictionary that maps string-typed parameter names to model parameters of the `Parameter` type.
# - We can create a `Parameter` instance from a `ParameterDict` via the `get` function.

# In[15]:

param_dict = gluon.ParameterDict()
param = param_dict.get('param2', shape=(2, 3))


# In[16]:

param


# In[17]:

param_dict


# - Let's use this to implement our own version of the dense layer.

# In[21]:

class MyDense(nn.Block):
    # units: the number of outputs in this layer; in_units: the number of inputs in this layer.
    def __init__(self, units, in_units, **kwargs):
        super(MyDense, self).__init__(**kwargs)
        self.weight = self.params.get('weight', shape=(in_units, units))
        self.bias = self.params.get('bias', shape=(units,))

    def forward(self, x):
        linear = nd.dot(x, self.weight.data()) + self.bias.data()
        return nd.relu(linear)


# In[22]:

dense = MyDense(units=3, in_units=5)
dense.params


# In[23]:

dense.initialize()
X = nd.random.uniform(shape=(2, 5))
dense(X)


# In[25]:

net = nn.Sequential()
net.add(
    MyDense(8, in_units=64),
    MyDense(1, in_units=8)
)
net.initialize()
X = nd.random.uniform(shape=(2, 64))
net(X)


# # 4.5 File I/O

# - At some point, we want to save the results we obtained for later use and distribution.
# - Likewise, when running a long training process it is best practice to save intermediate results (checkpointing), so that we do not lose several days' worth of computation.
# - At the same time, we might want to load a pretrained model.
# - For all of these cases we need to load and store both individual weight vectors and entire models.

# ## 4.5.1 NDArray

# In[27]:

from mxnet import nd
from mxnet.gluon import nn

x = nd.arange(4)
nd.save('x-file.dat', x)


# In[29]:

x2 = nd.load('x-file.dat')
x2


# In[31]:

y = nd.zeros(4)
nd.save('x-files.dat', [x, y])
x2, y2 = nd.load('x-files.dat')
(x2, y2)


# In[33]:

mydict = {'x': x, 'y': y}
nd.save('mydict.dat', mydict)
mydict2 = nd.load('mydict.dat')
mydict2
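# - Since `nd.save` accepts dictionaries, the same mechanism can serve as a very simple checkpoint for the intermediate results mentioned at the start of this section.
# - The cell below is an illustrative sketch only: the file name `checkpoint.dat` and the stored keys are assumptions, not part of the original example.

# In[ ]:

# Hedged sketch: persist a dictionary of NDArrays as a checkpoint and read it back.
state = {'epoch': nd.array([10]), 'weights': nd.random.uniform(shape=(2, 3))}
nd.save('checkpoint.dat', state)
restored = nd.load('checkpoint.dat')
restored['epoch'], restored['weights']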
# ## 4.5.2 Gluon Model Parameters

# - Saving individual weight vectors (or other NDArray tensors) is useful, but it gets very tedious if we want to save (and later load) an entire model.
# - For this reason, Gluon provides built-in functionality to load and save entire networks rather than just single weight vectors.
# - This saves model parameters, not the entire model.
# - That is, if we have a 3-layer MLP, we need to specify the architecture separately.
# - The result is that, in order to reinstate a model, we need to generate the architecture in code and then load the parameters from disk.
# - Deferred initialization is quite advantageous here, since we can simply define a model without needing to put actual values in place.

# In[34]:

class MLP(nn.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Dense(256, activation='relu')
        self.output = nn.Dense(10)

    def forward(self, x):
        return self.output(self.hidden(x))


net = MLP()
net.initialize()
x = nd.random.uniform(shape=(2, 20))
y = net(x)


# In[35]:

net.save_parameters('mlp.params')


# In[36]:

clone = MLP()
clone.load_parameters('mlp.params')


# In[38]:

yclone = clone(x)
print(yclone == y)


# # 4.6 GPUs

# - If a CPU version of MXNet is already installed, we need to uninstall it first:
#     - `pip uninstall mxnet`
# - Then install the MXNet build that matches the installed CUDA version.
#     - Assuming you have CUDA 9.0 installed: `pip install mxnet-cu90`

# ## 4.6.1 Computing Devices

# - `mx.cpu()` (or any integer in the parentheses) means ***all physical CPUs and memory***.
#     - MXNet's calculations will try to use all CPU cores.
#
# - `mx.gpu()` only represents one graphics card and its GPU memory.
#     - If there are multiple GPUs, we use `mx.gpu(i)` to represent the $i$-th GPU ($i$ starts from 0).
#     - Also, `mx.gpu(0)` and `mx.gpu()` are equivalent.

# In[46]:

import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn

mx.cpu(), mx.gpu(), mx.gpu(1)


# - By default, `NDArray` objects are created on the CPU.
# - Therefore, we will see the `@cpu(0)` identifier each time we print an `NDArray`.

# In[47]:

x = nd.array([1, 2, 3])
x


# - We can use the `context` property of an `NDArray` to view the device where the `NDArray` is located.
# - Whenever we want to operate on multiple terms, they need to be ***in the same context***.
# - For instance, if we sum two variables, we need to make sure that both arguments are on the same device; otherwise MXNet would not know where to store the result, or even where to perform the computation.

# In[48]:

x.context


# ## 4.6.2 NDArray and GPUs

# - Storage on the GPU
#     - We can specify a storage device with the `ctx` parameter when creating an `NDArray`.

# In[ ]:

x = nd.ones((2, 3), ctx=mx.gpu())
x


# In[ ]:

y = nd.random.uniform(shape=(2, 3), ctx=mx.gpu(1))
y


# - Copying
#     - If we want to compute $\mathbf{x} + \mathbf{y}$, we need to decide where to perform this operation.
#     - For instance, we can transfer $\mathbf{x}$ to gpu(1) and perform the operation there.
#     - Do not simply add $\mathbf{x} + \mathbf{y}$ directly, since this will result in an exception: if the runtime engine cannot find the data on the same device, it fails.

# ![](https://github.com/d2l-ai/d2l-en/raw/master/img/copyto.svg?sanitize=true)

# In[ ]:

z = x.copyto(mx.gpu(1))
print(x)
print(z)
y + z


# - Imagine that your variable $z$ already lives on your second GPU (gpu(1)).
#     - Calling `copyto` again would still make a copy and allocate new memory, even though $z$ is already on the desired device.
# - We want to make a copy only if the variables currently live in different contexts.
#     - In these cases, we can call `as_in_context()`.
#     - If the variable already lives in the specified context, this is a no-op.
# - In fact, unless you specifically want to make a copy, `as_in_context()` is the method of choice.

# In[ ]:

z = x.as_in_context(mx.gpu(1))
z


# - If the context of the source variable and the target variable are the same, the `as_in_context` function does nothing (it returns the source variable itself).

# In[ ]:

y.as_in_context(mx.gpu(1)) is y


# - The `copyto` function, in contrast, always creates new memory for the target variable.

# In[ ]:

y.copyto(mx.gpu(1)) is y
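# - As a quick check (a sketch that, like the cells above, assumes a second GPU is available): the result of an operation is stored on the same device as its operands.

# In[ ]:

# Both y and z live on gpu(1), so the sum is computed and stored there as well.
(y + z).context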
# - Watch out:
#     - Transferring data between devices (CPU, GPUs, other machines) is much slower than computation.
#     - It also makes parallelization a lot more difficult, since we have to wait for data to be sent (or rather received) before we can proceed with more operations.
# - As a rule of thumb:
#     - 1) Many small operations are much worse than one big operation.
#     - 2) Several operations at a time are much better than many single operations interspersed in the code.
#     - Such operations can block if one device has to wait for the other before it can do something else.
# - Lastly, when we print `NDArray` data or convert `NDArray`s to NumPy format, MXNet will first copy the data to main memory if it is not already there, resulting in additional transmission overhead.
#     - Even worse, it is then subject to the dreaded ***Global Interpreter Lock***, which makes everything wait for Python to complete.
#     - Computing the loss for ***every*** minibatch on the GPU and reporting it back to the user on the command line (or logging it in a NumPy array) will ***trigger the global interpreter lock, which stalls all GPUs***.
#     - It is much better to allocate memory for logging inside the GPU and only move larger logs.

# ## 4.6.3 Gluon and GPUs

# - A Gluon model can be placed on a particular device by passing the `ctx` parameter to `initialize`.

# In[ ]:

net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize(ctx=mx.gpu())


# - Because initialization is deferred, the parameters only receive concrete values after the first forward pass, so we run the network on the GPU-resident `x` from above before inspecting its weight.

# In[ ]:

# x still lives on mx.gpu() from the earlier cells, so the input and the
# model parameters share the same device.
net(x)
net[0].weight.data()
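# - As a closing sketch (an illustrative example, not from the original notebook), file I/O and device placement can be combined: the parameters saved in Section 4.5.2 can be loaded straight onto a device by passing `ctx` to `load_parameters`. The name `clone_gpu` is an assumption for illustration.

# In[ ]:

# Hedged sketch: load the previously saved MLP parameters directly onto the GPU.
# The `ctx` argument of `load_parameters` places the loaded weights on that device,
# so the forward pass below runs entirely on mx.gpu().
clone_gpu = MLP()
clone_gpu.load_parameters('mlp.params', ctx=mx.gpu())
clone_gpu(nd.random.uniform(shape=(2, 20), ctx=mx.gpu()))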