#!/usr/bin/env python
# coding: utf-8

# # 4.4 Custom Layers

# ## 4.4.1 Layers without Parameters

# In[1]:

from mxnet import gluon, nd
from mxnet.gluon import nn


class CenteredLayer(nn.Block):
    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)

    def forward(self, x):
        return x - x.mean()


# In[4]:

layer = CenteredLayer()
X = nd.array([1, 2, 3, 4, 5])
layer(X)


# In[3]:

net = nn.Sequential()
net.add(
    nn.Dense(128),
    CenteredLayer()
)
net.initialize()


# In[8]:

X = nd.random.uniform(shape=(4, 8))
y = net(X)
print(y.shape)
print(y.mean())


# ## 4.4.2 Layers with Parameters

# - The `Parameter` class and the `ParameterDict` dictionary provide some basic housekeeping functionality.
# - They govern access, initialization, sharing, saving, and loading of model parameters.
# - For instance, we can use the member variable `params` of the `ParameterDict` type that comes with the `Block` class.
# - It is a dictionary that maps string-typed parameter names to model parameters of the `Parameter` type.
# - We can create a `Parameter` instance from a `ParameterDict` via the `get` function.

# In[15]:

param_dict = gluon.ParameterDict()
param = param_dict.get('param2', shape=(2, 3))


# In[16]:

param


# In[17]:

param_dict


# - Let's use this to implement our own version of the dense layer.

# In[21]:

class MyDense(nn.Block):
    # units: the number of outputs in this layer; in_units: the number of inputs in this layer.
    def __init__(self, units, in_units, **kwargs):
        super(MyDense, self).__init__(**kwargs)
        self.weight = self.params.get('weight', shape=(in_units, units))
        self.bias = self.params.get('bias', shape=(units,))

    def forward(self, x):
        linear = nd.dot(x, self.weight.data()) + self.bias.data()
        return nd.relu(linear)


# In[22]:

dense = MyDense(units=3, in_units=5)
dense.params


# In[23]:

dense.initialize()
X = nd.random.uniform(shape=(2, 5))
dense(X)


# In[25]:

net = nn.Sequential()
net.add(
    MyDense(8, in_units=64),
    MyDense(1, in_units=8)
)
net.initialize()
X = nd.random.uniform(shape=(2, 64))
net(X)


# # 4.5 File I/O

# - At some point, we want to save the results we obtained for later use and distribution.
# - Likewise, when running a long training process it is best practice to save intermediate results (checkpointing), so that we do not lose several days' worth of computation.
# - At the same time, we might want to load a pretrained model.
# - For all of these cases we need to load and store both individual weight vectors and entire models.

# ## 4.5.1 NDArray

# In[27]:

from mxnet import nd
from mxnet.gluon import nn

x = nd.arange(4)
nd.save('x-file.dat', x)


# In[29]:

x2 = nd.load('x-file.dat')
x2


# In[31]:

y = nd.zeros(4)
nd.save('x-files.dat', [x, y])
x2, y2 = nd.load('x-files.dat')
(x2, y2)


# In[33]:

mydict = {'x': x, 'y': y}
nd.save('mydict.dat', mydict)
mydict2 = nd.load('mydict.dat')
mydict2
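# - Since `nd.save` accepts dictionaries, the same mechanism can serve as a very simple checkpoint for the intermediate results mentioned at the start of this section.
# - The cell below is an illustrative sketch only: the file name `checkpoint.dat` and the stored keys are assumptions, not part of the original example.

# In[ ]:

# Hedged sketch: persist a dictionary of NDArrays as a checkpoint and read it back.
state = {'epoch': nd.array([10]), 'weights': nd.random.uniform(shape=(2, 3))}
nd.save('checkpoint.dat', state)
restored = nd.load('checkpoint.dat')
restored['epoch'], restored['weights']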
# ## 4.5.2 Gluon Model Parameters

# - Saving individual weight vectors (or other NDArray tensors) is useful, but it gets very tedious if we want to save (and later load) an entire model.
# - For this reason, Gluon provides built-in functionality to load and save entire networks rather than just single weight vectors.
# - This saves model parameters, not the entire model.
# - That is, if we have a 3-layer MLP, we need to specify the architecture separately.
# - The result is that, in order to reinstate a model, we need to generate the architecture in code and then load the parameters from disk.
# - Deferred initialization is quite advantageous here, since we can simply define a model without needing to put actual values in place.

# In[34]:

class MLP(nn.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Dense(256, activation='relu')
        self.output = nn.Dense(10)

    def forward(self, x):
        return self.output(self.hidden(x))


net = MLP()
net.initialize()
x = nd.random.uniform(shape=(2, 20))
y = net(x)


# In[35]:

net.save_parameters('mlp.params')


# In[36]:

clone = MLP()
clone.load_parameters('mlp.params')


# In[38]:

yclone = clone(x)
print(yclone == y)


# # 4.6 GPUs

# - If a CPU version of MXNet is already installed, we need to uninstall it first:
#     - `pip uninstall mxnet`
# - Then install the MXNet build that matches the installed CUDA version.
#     - Assuming you have CUDA 9.0 installed: `pip install mxnet-cu90`

# ## 4.6.1 Computing Devices

# - `mx.cpu()` (or any integer in the parentheses) means ***all physical CPUs and memory***.
#     - MXNet's calculations will try to use all CPU cores.
#
# - `mx.gpu()` only represents one graphics card and its GPU memory.
#     - If there are multiple GPUs, we use `mx.gpu(i)` to represent the $i$-th GPU ($i$ starts from 0).
#     - Also, `mx.gpu(0)` and `mx.gpu()` are equivalent.

# In[46]:

import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn

mx.cpu(), mx.gpu(), mx.gpu(1)


# - By default, `NDArray` objects are created on the CPU.
# - Therefore, we will see the `@cpu(0)` identifier each time we print an `NDArray`.

# In[47]:

x = nd.array([1, 2, 3])
x


# - We can use the `context` property of an `NDArray` to view the device where the `NDArray` is located.
# - Whenever we want to operate on multiple terms, they need to be ***in the same context***.
# - For instance, if we sum two variables, we need to make sure that both arguments are on the same device; otherwise MXNet would not know where to store the result, or even where to perform the computation.

# In[48]:

x.context


# ## 4.6.2 NDArray and GPUs

# - Storage on the GPU
#     - We can specify a storage device with the `ctx` parameter when creating an `NDArray`.

# In[ ]:

x = nd.ones((2, 3), ctx=mx.gpu())
x


# In[ ]:

y = nd.random.uniform(shape=(2, 3), ctx=mx.gpu(1))
y


# - Copying
#     - If we want to compute $\mathbf{x} + \mathbf{y}$, we need to decide where to perform this operation.
#     - For instance, we can transfer $\mathbf{x}$ to gpu(1) and perform the operation there.
#     - Do not simply add $\mathbf{x} + \mathbf{y}$ directly, since this will result in an exception: if the runtime engine cannot find the data on the same device, it fails.

# ![](https://github.com/d2l-ai/d2l-en/raw/master/img/copyto.svg?sanitize=true)

# In[ ]:

z = x.copyto(mx.gpu(1))
print(x)
print(z)
y + z


# - Imagine that your variable $z$ already lives on your second GPU (gpu(1)).
#     - Calling `copyto` again would still make a copy and allocate new memory, even though $z$ is already on the desired device.
# - We want to make a copy only if the variables currently live in different contexts.
#     - In these cases, we can call `as_in_context()`.
#     - If the variable already lives in the specified context, this is a no-op.
# - In fact, unless you specifically want to make a copy, `as_in_context()` is the method of choice.

# In[ ]:

z = x.as_in_context(mx.gpu(1))
z


# - If the context of the source variable and the target variable are the same, the `as_in_context` function does nothing (it returns the source variable itself).

# In[ ]:

y.as_in_context(mx.gpu(1)) is y


# - The `copyto` function, in contrast, always creates new memory for the target variable.

# In[ ]:

y.copyto(mx.gpu(1)) is y
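# - As a quick check (a sketch that, like the cells above, assumes a second GPU is available): the result of an operation is stored on the same device as its operands.

# In[ ]:

# Both y and z live on gpu(1), so the sum is computed and stored there as well.
(y + z).context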
# - Watch out:
#     - Transferring data between devices (CPU, GPUs, other machines) is much slower than computation.
#     - It also makes parallelization a lot more difficult, since we have to wait for data to be sent (or rather received) before we can proceed with more operations.
# - As a rule of thumb:
#     - 1) Many small operations are much worse than one big operation.
#     - 2) Several operations at a time are much better than many single operations interspersed in the code.
#     - Such operations can block if one device has to wait for the other before it can do something else.
# - Lastly, when we print `NDArray` data or convert `NDArray`s to NumPy format, MXNet will first copy the data to main memory if it is not already there, resulting in additional transmission overhead.
#     - Even worse, it is then subject to the dreaded ***Global Interpreter Lock***, which makes everything wait for Python to complete.
#     - Computing the loss for ***every*** minibatch on the GPU and reporting it back to the user on the command line (or logging it in a NumPy array) will ***trigger the global interpreter lock, which stalls all GPUs***.
#     - It is much better to allocate memory for logging inside the GPU and only move larger logs.

# ## 4.6.3 Gluon and GPUs

# - A Gluon model can be placed on a particular device by passing the `ctx` parameter to `initialize`.

# In[ ]:

net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize(ctx=mx.gpu())


# - Because initialization is deferred, the parameters only receive concrete values after the first forward pass, so we run the network on the GPU-resident `x` from above before inspecting its weight.

# In[ ]:

# x still lives on mx.gpu() from the earlier cells, so the input and the
# model parameters share the same device.
net(x)
net[0].weight.data()
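# - As a closing sketch (an illustrative example, not from the original notebook), file I/O and device placement can be combined: the parameters saved in Section 4.5.2 can be loaded straight onto a device by passing `ctx` to `load_parameters`. The name `clone_gpu` is an assumption for illustration.

# In[ ]:

# Hedged sketch: load the previously saved MLP parameters directly onto the GPU.
# The `ctx` argument of `load_parameters` places the loaded weights on that device,
# so the forward pass below runs entirely on mx.gpu().
clone_gpu = MLP()
clone_gpu.load_parameters('mlp.params', ctx=mx.gpu())
clone_gpu(nd.random.uniform(shape=(2, 20), ctx=mx.gpu()))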