#!/usr/bin/env python
# coding: utf-8
# # 4.4 Custom Layers
# ## 4.4.1 Layers without Parameters
# In[1]:
from mxnet import gluon, nd
from mxnet.gluon import nn
class CenteredLayer(nn.Block):
    """Custom layer without parameters: subtracts the mean from its input.

    The output always has zero mean. ``x.mean()`` reduces over all
    elements, so this works for an input of any shape.
    """

    def __init__(self, **kwargs):
        # Forward any Gluon Block options (e.g. prefix/params) to the base class.
        super(CenteredLayer, self).__init__(**kwargs)

    def forward(self, x):
        # Center the input by removing its (scalar) global mean.
        return x - x.mean()
# In[4]:
# Instantiate the layer and verify it removes the mean from a 1-D input
# (expected output: [-2, -1, 0, 1, 2]).
layer = CenteredLayer()
X = nd.array([1, 2, 3, 4, 5])
layer(X)
# In[3]:
# Custom layers compose with built-in layers inside a Sequential container.
net = nn.Sequential()
net.add(
nn.Dense(128),
CenteredLayer()
)
net.initialize()
# In[8]:
# Forward a random batch; the output mean should be (numerically) zero
# because the final layer centers its input.
X = nd.random.uniform(shape=(4, 8))
y = net(X)
print(y.shape)
print(y.mean())
# ## 4.4.2 Layers with Parameters
# - The `Parameter` class and the `ParameterDict` dictionary provide some basic housekeeping functionality.
# - They govern access, initialization, sharing, saving and loading model parameters.
# - For instance, we can use the member variable params of the `ParameterDict` type that comes with the `Block` class.
# - It is a dictionary that maps string type parameter names to model parameters in the `Parameter` type.
# - We can create a `Parameter` instance from `ParameterDict` via the `get` function.
# In[15]:
# Create a standalone ParameterDict and register a parameter named 'param2'
# with shape (2, 3); `get` creates the Parameter if it does not exist yet.
param_dict = gluon.ParameterDict()
param = param_dict.get('param2', shape=(2, 3))
# In[16]:
# Display the Parameter (it has a shape but is not initialized with data yet).
param
# In[17]:
# Display the dictionary mapping the parameter name to the Parameter.
param_dict
# - Let's use this to implement our own version of the dense layer.
# In[21]:
class MyDense(nn.Block):
    """Custom dense (fully connected) layer with its own parameters.

    Parameters
    ----------
    units : int
        Number of outputs of this layer.
    in_units : int
        Number of inputs of this layer.
    """

    def __init__(self, units, in_units, **kwargs):
        super(MyDense, self).__init__(**kwargs)
        # Register weight and bias in this block's ParameterDict so Gluon
        # handles initialization, device placement and (de)serialization.
        self.weight = self.params.get('weight', shape=(in_units, units))
        self.bias = self.params.get('bias', shape=(units,))

    def forward(self, x):
        # Affine transform followed by a ReLU activation.
        linear = nd.dot(x, self.weight.data()) + self.bias.data()
        return nd.relu(linear)
# In[22]:
# Instantiate the custom layer; its parameters appear in `params`
# (still uninitialized at this point).
dense = MyDense(units=3, in_units=5)
dense.params
# In[23]:
# Initialize the parameters, then run a forward pass on a (2, 5) batch.
dense.initialize()
X = nd.random.uniform(shape=(2, 5))
dense(X)
# In[25]:
# Custom layers with parameters also compose inside Sequential.
net = nn.Sequential()
net.add(
MyDense(8, in_units=64),
MyDense(1, in_units=8)
)
net.initialize()
X = nd.random.uniform(shape=(2, 64))
net(X)
# # 4.5 File I/O
# - At some point, we want to save the results (what we obtained) for later use and distribution.
# - Likewise, when running a long training process it is best practice to save intermediate results (checkpointing) to ensure that we don’t lose several days worth of computation.
# - At the same time, we might want to load a pretrained model.
# - For all of these cases we need to load and store both individual weight vectors and entire models.
# ## 4.5.1 NDArray
# In[27]:
from mxnet import nd
from mxnet.gluon import nn
# Save a single NDArray to disk and load it back.
x = nd.arange(4)
nd.save('x-file.dat', x)
# In[29]:
x2 = nd.load('x-file.dat')
x2
# In[31]:
# A list of NDArrays can be saved and loaded in a single call.
y = nd.zeros(4)
nd.save('x-files.dat', [x, y])
x2, y2 = nd.load('x-files.dat')
(x2, y2)
# In[33]:
# A dict that maps strings to NDArrays round-trips as well.
mydict = {'x': x, 'y': y}
nd.save('mydict.dat', mydict)
mydict2 = nd.load('mydict.dat')
mydict2
# ## 4.5.2 Gluon Model Parameters
# - Saving individual weight vectors (or other NDArray tensors) is useful but it gets very tedious if we want to save (and later load) an entire model.
# - For this reason Gluon provides built-in functionality to load and save entire networks rather than just single weight vectors.
# - This saves model parameters and not the entire model.
# - I.e. if we have a 3-layer MLP, we need to specify the architecture separately.
# - The result is that in order to reinstate a model we need to generate the architecture in code and then load the parameters from disk.
# - The deferred initialization is quite advantageous here since we can simply define a model without the need to put actual values in place.
# In[34]:
class MLP(nn.Block):
    """A small multilayer perceptron: one 256-unit ReLU hidden layer, 10 outputs."""

    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Dense(256, activation='relu')
        self.output = nn.Dense(10)

    def forward(self, x):
        # hidden -> output; input size is inferred on first forward pass
        # (Gluon deferred initialization).
        return self.output(self.hidden(x))
net = MLP()
net.initialize()
x = nd.random.uniform(shape=(2, 20))
y = net(x)
# In[35]:
# Persist only the parameters (not the architecture) to disk.
net.save_parameters('mlp.params')
# In[36]:
# To restore, rebuild the architecture in code, then load the parameters.
clone = MLP()
clone.load_parameters('mlp.params')
# In[38]:
# Same parameters + same input => identical output (element-wise all True).
yclone = clone(x)
print(yclone == y)
# ## 4.6 GPUs
# - If a CPU version of MXNet is already installed, we need to uninstall it first.
# - `pip uninstall mxnet`
# - then install the corresponding MXNet version according to the CUDA version.
# - Assuming you have CUDA 9.0 installed, `pip install mxnet-cu90`
# ## 4.6.1 Computing Devices
# - `mx.cpu()` (or any integer in the parentheses) means ***all physical CPUs and memory***.
# - MXNet's calculations will try to use all CPU cores.
#
# - `mx.gpu()` only represents one graphic card and the corresponding graphic memory.
# - If there are multiple GPUs, we use mx.gpu(i) to represent the $i$-th GPU ($i$ starts from 0).
# - Also, mx.gpu(0) and mx.gpu() are equivalent.
# In[46]:
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
# Enumerate computing devices: the CPU context and the first two GPU contexts.
mx.cpu(), mx.gpu(), mx.gpu(1)
# - By default, `NDArray` objects are created on the CPU.
# - Therefore, we will see the `@cpu(0)` identifier each time we print an `NDArray`.
# In[47]:
x = nd.array([1, 2, 3])
x
# - We can use the `context` property of `NDArray` to view the device where the `NDArray` is located.
# - Whenever we want to operate on multiple terms, they need to be ***in the same context***.
# - For instance, if we sum two variables, we need to make sure that both arguments are on the same device - otherwise MXNet would not know where to store the result or even how to decide where to perform the computation.
# In[48]:
# The context attribute reports the device that holds this array's data.
x.context
# - Storage on the GPU
# - We can specify a storage device with the `ctx` parameter when creating an `NDArray`
# In[ ]:
# Allocate directly on the first GPU via the `ctx` argument.
x = nd.ones((2, 3), ctx=mx.gpu())
x
# In[ ]:
# mx.gpu(0) is equivalent to mx.gpu(); y also lives on the first GPU.
y = nd.random.uniform(shape=(2, 3), ctx=mx.gpu(0))
y
# - Copying
# - If we want to compute $\mathbf{x} + \mathbf{y}$, we need to decide where to perform this operation.
# - For instance, we can transfer $\mathbf{x}$ to gpu(1) and perform the operation there.
# - Do not simply add $\mathbf{x} + \mathbf{y}$, since this will result in an exception.
# - If the runtime engine cannot find data on the same device, it fails.
# ![](https://github.com/d2l-ai/d2l-en/raw/master/img/copyto.svg?sanitize=true)
# In[ ]:
# Copy x from gpu(0) to gpu(1); z is a new array on the second GPU.
z = x.copyto(mx.gpu(1))
print(x)
print(z)
# NOTE(review): in this transcription y was created on gpu(0) (cell above),
# while z is on gpu(1) — this sum mixes devices; verify against the book,
# where y lives on gpu(1).
y + z
# - Imagine that your variable z already lives on your second GPU (gpu(1)).
# - We want to make a copy only if the variables currently live on different contexts.
# - In these cases, we can call `as_in_context()`.
# - If the variable is already in the specified context then this is a no-op.
# - In fact, unless you specifically want to make a copy, as_in_context() is the method of choice.
# In[ ]:
z = x.as_in_context(mx.gpu(1))
z
# - If the context of the source and target variables is the same, the `as_in_context` function does nothing and returns the source array itself.
# NOTE(review): y is on gpu(0) here, so the expression below returns False
# (a copy is made); in the book y is on gpu(1) and it returns True — confirm.
# In[ ]:
y.as_in_context(mx.gpu(1)) is y
# - The `copyto` function always creates new memory for the target variable.
# In[ ]:
y.copyto(mx.gpu()) is y
# - Watch Out
# - Transferring data between devices (CPU, GPUs, other machines) is something that is much slower than computation.
# - It also makes parallelization a lot more difficult, since we have to wait for data to be sent (or rather to be received) before we can proceed with more operations.
# - As a rule of thumb
# - 1) Many small operations are much worse than one big operation.
# - 2) Several operations at a time are much better than many single operations interspersed in the code.
# - Such operations can block if one device has to wait for the other before it can do something else.
# - Lastly, when we print `NDArray` data or convert `NDArrays` to `NumPy` format, if the data is not in main memory, MXNet will copy it to the main memory first, resulting in additional transmission overhead.
# - Even worse, it is now subject to the dreaded ***Global Interpreter Lock*** which makes everything wait for Python to complete.
# - Computing the loss for ***every*** minibatch on the GPU and reporting it back to the user on the commandline (or logging it in a NumPy array) will ***trigger a global interpreter lock which stalls all GPUs***.
# - It is much better to allocate memory for logging inside the GPU and only move larger logs.
#
# ## 4.6.3 Gluon and GPUs
# - Gluon’s model can specify devices through the `ctx` parameter during initialization.
# In[ ]:
# Gluon models accept a `ctx` argument at initialization so their
# parameters are allocated on the chosen device.
net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize(ctx=mx.gpu())
# In[ ]:
# The weight data lives on the GPU context passed to initialize().
net[0].weight.data()