MXNet's Blocks
Blocks are combinations of one or more layers (or of other blocks), and network design is aided by code that generates such blocks on demand. The simplest way to build a block is the Sequential container, to which we add layers one by one:
from mxnet import nd
from mxnet.gluon import nn
x = nd.random.uniform(shape=(2, 20))
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize()
net(x)
[[ 0.09543003 0.04614332 -0.00286653 -0.07790346 -0.05130243 0.02942039 0.08696645 -0.0190793 -0.04122177 0.05088576] [ 0.0769287 0.03099705 0.00856576 -0.04467198 -0.0692684 0.09132432 0.06786594 -0.06187843 -0.03436674 0.04234695]] <NDArray 2x10 @cpu(0)>
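As a quick sanity check (a minimal sketch using the net defined above), printing the container summarizes the layers it holds:
# Print a structural summary of the Sequential container and its two Dense layers.
print(net)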
Above we used the nn.Sequential constructor to generate an empty network into which we then inserted two layers. The nn.Block class is a model constructor provided in the nn module, and we can inherit from it to define the model we want. The following MLP class inherits the Block class to construct the same multilayer perceptron; it overrides the __init__ and forward methods of the Block class.
from mxnet import nd
from mxnet.gluon import nn
class MLP(nn.Block):
    # Declare layers with model parameters.
    # Here, we declare two fully connected layers.
    def __init__(self, **kwargs):
        # Call the constructor of the MLP parent class Block to perform the
        # necessary initialization. In this way, other function arguments can
        # also be specified when constructing an instance, such as the model
        # parameter, params, described in the following sections.
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Dense(256, activation='relu')  # Hidden layer
        self.output = nn.Dense(10)                      # Output layer

    # Define the forward computation of the model, that is, how to return the
    # required model output based on the input x.
    def forward(self, x):
        return self.output(self.hidden(x))
The forward method invokes the network simply by evaluating the hidden layer self.hidden(x) and subsequently the output layer self.output(...); the layers themselves are declared in the __init__ method. We can instantiate the MLP class, initialize it, and invoke it on the data x:
net = MLP()
net.initialize()
net(x)
[[ 0.0036223 0.00633331 0.03201144 -0.01369375 0.10336448 -0.03508019 -0.00032164 -0.01676024 0.06978628 0.01303308] [ 0.03871716 0.02608212 0.03544959 -0.02521311 0.11005434 -0.01430662 -0.03052465 -0.03852826 0.06321152 0.0038594 ]] <NDArray 2x10 @cpu(0)>
A block can be a single layer (such as the Dense class provided by Gluon), an entire model (such as the MLP class we just derived), or any component built from other Block subclasses. The main convenience that the Sequential class adds on top of Block is bookkeeping: its add method appends Block subclass instances one by one, and the forward computation of the model evaluates these instances in the order in which they were added. The MySequential class below reproduces this behavior.
class MySequential(nn.Block):
    def __init__(self, **kwargs):
        super(MySequential, self).__init__(**kwargs)

    def add(self, block):
        # Here, block is an instance of a Block subclass, and we assume it has
        # a unique name. We save it in the member variable _children of the
        # Block class; its type is OrderedDict.
        self._children[block.name] = block

    def forward(self, x):
        # OrderedDict guarantees that members will be traversed in the order
        # they were added.
        for block in self._children.values():
            x = block(x)
        return x
When a MySequential instance calls the initialize function, the system automatically initializes all members of _children. We can use MySequential just like the Sequential class:
net = MySequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize()
net(x)
[[ 0.07787763 0.00216402 0.016822 0.0305988 -0.00702019 0.01668715 0.04822846 0.0039432 -0.09300035 -0.04494302] [ 0.08891079 -0.00625484 -0.01619132 0.03807179 -0.01451489 0.02006173 0.0303478 0.02463485 -0.07605447 -0.04389168]] <NDArray 2x10 @cpu(0)>
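As a quick check (a sketch relying on the net built above), the blocks registered in _children are exactly what collect_params traverses, so the parameters of both Dense layers should appear when we collect them:
# Collect and print all parameters registered through the custom add method.
print(net.collect_params())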
The Sequential class makes model construction easy, but sometimes we need more flexibility in the forward computation. The FancyMLP class below introduces a *constant* model parameter: a weight that is created when the block is constructed but is never updated during training.
class FancyMLP(nn.Block):
    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # Random weight parameters created with get_constant are not iterated
        # during training (i.e., they are constant parameters).
        self.rand_weight = self.params.get_constant(
            'rand_weight', nd.random.uniform(shape=(20, 20)))
        self.dense = nn.Dense(20, activation='relu')

    def forward(self, x):
        x = self.dense(x)
        # Use the constant parameters created, as well as the relu and dot
        # functions of NDArray.
        x = nd.relu(nd.dot(x, self.rand_weight.data()) + 1)
        # Reuse the fully connected layer. This is equivalent to sharing
        # parameters between two fully connected layers.
        x = self.dense(x)
        # In the control flow below, we need to call asscalar to return the
        # scalar for comparison.
        while x.norm().asscalar() > 1:
            x /= 2
        if x.norm().asscalar() < 0.8:
            x *= 10
        return x.sum()
In this FancyMLP model, we used the constant weight rand_weight (note that it is not a model parameter), performed a matrix multiplication (nd.dot), and reused the same Dense layer twice, which is equivalent to two fully connected layers sharing one set of parameters.
net = FancyMLP()
net.initialize()
net(x)
[25.522684] <NDArray 1 @cpu(0)>
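Since the same Dense layer was applied twice, only one set of dense weights should be registered alongside the constant rand_weight. A minimal sketch to confirm this with the net created above:
# Only one Dense layer's weight and bias appear, plus the constant parameter.
print(net.collect_params())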
Since Sequential containers and Block subclasses can be nested arbitrarily, we can compose them freely, as in the NestMLP class below.
class NestMLP(nn.Block):
    def __init__(self, **kwargs):
        super(NestMLP, self).__init__(**kwargs)
        self.net = nn.Sequential()
        self.net.add(nn.Dense(64, activation='relu'),
                     nn.Dense(32, activation='relu'))
        self.dense = nn.Dense(16, activation='relu')

    def forward(self, x):
        return self.dense(self.net(x))
chimera = nn.Sequential()
chimera.add(
NestMLP(),
nn.Dense(20),
FancyMLP()
)
chimera.initialize()
chimera(x)
[3.853818] <NDArray 1 @cpu(0)>
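Printing the outer container shows how the blocks nest inside one another; this is a small sketch using the chimera model defined above:
# Display the nested structure: NestMLP, a Dense layer, and FancyMLP.
print(chimera)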
We now turn to accessing and initializing the parameters of a network. As before, we build a small MLP and run a forward pass so that all parameters take concrete values.
from mxnet import init, nd
from mxnet.gluon import nn
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize() # Use the default initialization method.
x = nd.random.uniform(shape=(2, 20))
net(x) # Forward computation.
[[ 0.00407254 0.1019081 0.02062148 0.0552136 0.07915469 -0.05606864 -0.1041737 0.00337543 -0.06740113 -0.06313396] [ 0.01474816 0.0497599 0.00468814 0.0468959 0.06075 -0.07501648 -0.07173473 0.06645283 -0.08554209 -0.16031 ]] <NDArray 2x10 @cpu(0)>
Each layer's parameters can be inspected through its params attribute. Parameter names such as dense17_weight are very useful since they allow us to identify parameters uniquely, even in a network of hundreds of layers with nontrivial structure.
print(net[0].params)
print(net[1].params)
dense17_ ( Parameter dense17_weight (shape=(256, 20), dtype=float32) Parameter dense17_bias (shape=(256,), dtype=float32) ) dense18_ ( Parameter dense18_weight (shape=(10, 256), dtype=float32) Parameter dense18_bias (shape=(10,), dtype=float32) )
print(net[0].weight)
print(net[0].weight.data())
print(net[1].weight.data())
Parameter dense17_weight (shape=(256, 20), dtype=float32) [[-0.05357582 -0.00228109 -0.03202471 ... -0.06692369 -0.00955358 -0.01753462] [ 0.01603388 0.02262501 -0.06019409 ... -0.03063859 -0.02505398 0.02994981] [-0.06580696 0.00862081 0.0332156 ... 0.05478401 -0.06591336 -0.06983094] ... [ 0.02946895 0.05579274 0.01646009 ... 0.04695714 0.0208929 -0.06849758] [ 0.01405259 -0.02814856 0.02697545 ... -0.03466139 -0.00090686 0.02379511] [-0.05085108 -0.0290781 0.04582401 ... 0.00601977 -0.00817193 0.06228926]] <NDArray 256x20 @cpu(0)> [[ 0.00338574 0.04148472 -0.01888602 ... -0.06870207 -0.06303862 -0.04540806] [ 0.02585206 0.05058105 0.00044364 ... -0.00163042 -0.04103333 0.06294077] [ 0.04751863 0.06542363 -0.03117647 ... 0.00775644 0.01028717 0.02544965] ... [-0.02485485 0.01089642 0.0489713 ... 0.02502301 0.03442856 -0.03999568] [ 0.02737013 -0.04429683 0.03048034 ... 0.00809494 0.00763652 0.05087072] [ 0.01182987 -0.06716982 0.01266196 ... 0.01583868 -0.00265694 -0.00011061]] <NDArray 10x256 @cpu(0)>
print(net[0].bias)
print(net[0].bias.data())
print(net[1].bias.data())
Parameter dense17_bias (shape=(256,), dtype=float32) [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] <NDArray 256 @cpu(0)> [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] <NDArray 10 @cpu(0)>
print(net[0].params['dense17_weight'])
print(net[0].params['dense17_weight'].data())
Parameter dense17_weight (shape=(256, 20), dtype=float32) [[-0.05357582 -0.00228109 -0.03202471 ... -0.06692369 -0.00955358 -0.01753462] [ 0.01603388 0.02262501 -0.06019409 ... -0.03063859 -0.02505398 0.02994981] [-0.06580696 0.00862081 0.0332156 ... 0.05478401 -0.06591336 -0.06983094] ... [ 0.02946895 0.05579274 0.01646009 ... 0.04695714 0.0208929 -0.06849758] [ 0.01405259 -0.02814856 0.02697545 ... -0.03466139 -0.00090686 0.02379511] [-0.05085108 -0.0290781 0.04582401 ... 0.00601977 -0.00817193 0.06228926]] <NDArray 256x20 @cpu(0)>
net[0].weight.grad()
[[0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] ... [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.]] <NDArray 256x20 @cpu(0)>
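The gradients are all zeros because no backward pass has been run yet. A minimal sketch (reusing the net and x defined above) that records a forward pass and backpropagates, after which the gradient buffer is populated:
from mxnet import autograd
# Record the computation so that gradients can be computed.
with autograd.record():
    loss = net(x).sum()
loss.backward()
# The first few entries of the weight gradient are now nonzero in general.
print(net[0].weight.grad()[0, :5])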
The collect_params method grabs all parameters of a network in one dictionary such that we can traverse it with ease. We can also invoke collect_params on subblocks as needed.
# parameters only for the first layer
print(net[0].collect_params())
# parameters of the entire network
print(net.collect_params())
dense17_ ( Parameter dense17_weight (shape=(256, 20), dtype=float32) Parameter dense17_bias (shape=(256,), dtype=float32) ) sequential5_ ( Parameter dense17_weight (shape=(256, 20), dtype=float32) Parameter dense17_bias (shape=(256,), dtype=float32) Parameter dense18_weight (shape=(10, 256), dtype=float32) Parameter dense18_bias (shape=(10,), dtype=float32) )
net.collect_params()['dense18_bias'].data()
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] <NDArray 10 @cpu(0)>
collect_params also accepts a regular expression, which lets us select subsets of parameters by name:
print(net.collect_params('.*weight'))
print(net.collect_params('.*bias'))
sequential5_ ( Parameter dense17_weight (shape=(256, 20), dtype=float32) Parameter dense18_weight (shape=(10, 256), dtype=float32) ) sequential5_ ( Parameter dense17_bias (shape=(256,), dtype=float32) Parameter dense18_bias (shape=(10,), dtype=float32) )
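A filtered ParameterDict can also be passed on to other Gluon utilities. For instance, here is a sketch (a hypothetical training setup) in which only the weights, but not the biases, are handed to a Trainer for optimization:
from mxnet import gluon
# Only the parameters matching '.*weight' would be updated by this trainer.
trainer = gluon.Trainer(net.collect_params('.*weight'), 'sgd', {'learning_rate': 0.1})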
To see how parameter naming works when blocks are nested inside blocks, we generate blocks from factory functions and compose them.
def block1():
    net = nn.Sequential()
    net.add(nn.Dense(32, activation='relu'))
    net.add(nn.Dense(16, activation='relu'))
    return net

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add(block1())
    return net
rgnet = nn.Sequential()
rgnet.add(block2())
rgnet.add(nn.Dense(10))
rgnet.initialize()
rgnet(x)
[[ 6.6884764e-09 -1.9991958e-08 -4.7974535e-09 -8.7700771e-09 -1.6186359e-08 1.0396601e-08 1.0741704e-08 6.3689147e-09 -1.9723858e-09 3.0433571e-09] [ 8.6247640e-09 -1.8395822e-08 -2.2687403e-09 -1.6464673e-08 -2.4844146e-08 1.4356444e-08 1.6593912e-08 6.3606223e-09 -9.6643706e-09 8.3527123e-09]] <NDArray 2x10 @cpu(0)>
print(rgnet.collect_params)
print(rgnet.collect_params())
<bound method Block.collect_params of Sequential( (0): Sequential( (0): Sequential( (0): Dense(20 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (1): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (2): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (3): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) ) (1): Dense(16 -> 10, linear) )> sequential6_ ( Parameter dense19_weight (shape=(32, 20), dtype=float32) Parameter dense19_bias (shape=(32,), dtype=float32) Parameter dense20_weight (shape=(16, 32), dtype=float32) Parameter dense20_bias (shape=(16,), dtype=float32) Parameter dense21_weight (shape=(32, 16), dtype=float32) Parameter dense21_bias (shape=(32,), dtype=float32) Parameter dense22_weight (shape=(16, 32), dtype=float32) Parameter dense22_bias (shape=(16,), dtype=float32) Parameter dense23_weight (shape=(32, 16), dtype=float32) Parameter dense23_bias (shape=(32,), dtype=float32) Parameter dense24_weight (shape=(16, 32), dtype=float32) Parameter dense24_bias (shape=(16,), dtype=float32) Parameter dense25_weight (shape=(32, 16), dtype=float32) Parameter dense25_bias (shape=(32,), dtype=float32) Parameter dense26_weight (shape=(16, 32), dtype=float32) Parameter dense26_bias (shape=(16,), dtype=float32) Parameter dense27_weight (shape=(10, 16), dtype=float32) Parameter dense27_bias (shape=(10,), dtype=float32) )
print(rgnet[0][1][0].bias.name)
print(rgnet[0][1][0].bias.data())
dense21_bias [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] <NDArray 32 @cpu(0)>
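We can also traverse the collected parameters programmatically, for instance to count the total number of scalar parameters in rgnet (a small sketch using the network above):
# Sum the sizes of all parameter arrays in the nested network.
total = sum(p.data().size for p in rgnet.collect_params().values())
print(total)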
The init module provides a variety of preset initialization methods. Below we initialize the weights as Gaussian random variables with standard deviation 0.01; the force_reinit flag ensures that the variables are initialized again, regardless of whether they were already initialized previously.
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
net[0].weight.data()[0]
[ 2.3467798e-02 -6.5989629e-03 -4.6144146e-04 -1.0800398e-03 -2.5858415e-05 -6.9288602e-03 4.7301534e-03 1.6473899e-02 -8.4304502e-03 3.8224545e-03 6.4377831e-03 9.0460032e-03 -2.7124031e-04 -6.6581573e-03 -8.7738056e-03 -1.9149805e-03 4.9869940e-03 1.7430604e-02 -9.3654627e-03 -1.5981171e-03] <NDArray 20 @cpu(0)>
If we wanted to initialize all parameters to a constant value (say, 1), we could do so simply by changing the initializer to Constant(1).
net.initialize(init=init.Constant(1), force_reinit=True)
net[0].weight.data()[0]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] <NDArray 20 @cpu(0)>
We can also apply different initializers to different parts of the network. Below, we initialize the second layer to a constant value of 42 and use the Xavier initializer for the weights of the first layer.
net[1].initialize(init=init.Constant(42), force_reinit=True)
net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
print(net[1].weight.data()[0])
print(net[0].weight.data()[0])
[42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42.] <NDArray 256 @cpu(0)> [ 0.08490172 0.13223866 0.01630534 -0.00707628 -0.03077595 0.14420772 0.13430956 0.07363294 0.02899179 -0.13734338 -0.11237526 0.08715159 -0.02431636 0.12052891 0.0830339 0.06951596 0.05713288 -0.06902333 0.12277207 -0.10455534] <NDArray 20 @cpu(0)>
If the initialization method we need is not provided in the init module, we can implement a subclass of init.Initializer. Usually, we only need to implement the _init_weight function and modify the incoming NDArray in place. In the example below, about half of the weights are set to zero and the rest are drawn uniformly from [-10, -5] or [5, 10].
class MyInit(init.Initializer):
    def _init_weight(self, name, data):
        print('Init', name, data.shape)
        data[:] = nd.random.uniform(low=-10, high=10, shape=data.shape)
        data *= data.abs() >= 5
net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()[0]
Init dense17_weight (256, 20) Init dense18_weight (10, 256)
[-9.572826 7.9411488 -7.953664 0. -0. -7.483777 9.6598015 -5.8997717 -7.205085 8.736895 -0. -0. -8.978939 -0. -0. -0. -0. -0. 8.936142 -0. ] <NDArray 20 @cpu(0)>
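Since the custom initializer zeroes every entry whose magnitude is below 5, roughly half of the weights should be exactly zero. A quick sketch to verify this on the first layer:
# Fraction of zero entries in the first layer's weight matrix; expected to be about 0.5.
w = net[0].weight.data()
print((w == 0).mean().asscalar())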
Since data() returns an NDArray, we can access and modify the parameter values directly, just like any other matrix. If you want to do this within an autograd scope, however, you need to use set_data to avoid confusing the automatic differentiation mechanics.
net[0].weight.data()[:] += 1
net[0].weight.data()[0,0] = 42
net[0].weight.data()[0]
[42. 8.941149 -6.953664 1. 1. -6.483777 10.6598015 -4.8997717 -6.205085 9.736895 1. 1. -7.978939 1. 1. 1. 1. 1. 9.936142 1. ] <NDArray 20 @cpu(0)>
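Within an autograd scope, the same kind of update should go through set_data instead. A minimal sketch of that pattern, reusing the first layer's weight:
# set_data replaces the parameter value wholesale without disturbing autograd bookkeeping.
w = net[0].weight
w.set_data(w.data() + 1)
print(w.data()[0])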
In some cases, we want to share model parameters across multiple layers. We can do this by allocating a Dense layer and then passing its params when constructing another layer:
net = nn.Sequential()
# We need to give the shared layer a name so that we can reference its parameters
shared = nn.Dense(8, activation='relu')
net.add(
nn.Dense(8, activation='relu'),
shared,
nn.Dense(8, activation='relu', params=shared.params),
nn.Dense(10)
)
net.initialize()
x = nd.random.uniform(shape=(2, 20))
net(x)
# Check whether the parameters are the same
print(net[1].weight.data()[0] == net[2].weight.data()[0])
print(net[1].weight.data()[0])
print(net[2].weight.data()[0])
# And make sure that they're actually the same object rather than just having the same value.
net[1].weight.data()[0,0] = 100
print(net[1].weight.data()[0] == net[2].weight.data()[0])
[1. 1. 1. 1. 1. 1. 1. 1.] <NDArray 8 @cpu(0)> [-0.03439966 -0.05555296 0.0232332 -0.02662065 0.04434159 -0.05426525 0.01500529 -0.06945959] <NDArray 8 @cpu(0)> [-0.03439966 -0.05555296 0.0232332 -0.02662065 0.04434159 -0.05426525 0.01500529 -0.06945959] <NDArray 8 @cpu(0)> [1. 1. 1. 1. 1. 1. 1. 1.] <NDArray 8 @cpu(0)>
So far we have defined networks without specifying the input dimension of each layer. Gluon makes this possible through deferred initialization: parameter shapes are only determined when data first flows through the network. Let us look at this mechanism more closely.
from mxnet import init, nd
from mxnet.gluon import nn
def getnet():
    net = nn.Sequential()
    net.add(nn.Dense(256, activation='relu'))
    net.add(nn.Dense(10))
    return net
net = getnet()
print(net.collect_params)
print(net.collect_params())
<bound method Block.collect_params of Sequential( (0): Dense(None -> 256, Activation(relu)) (1): Dense(None -> 10, linear) )> sequential18_ ( Parameter dense52_weight (shape=(256, 0), dtype=float32) Parameter dense52_bias (shape=(256,), dtype=float32) Parameter dense53_weight (shape=(10, 0), dtype=float32) Parameter dense53_bias (shape=(10,), dtype=float32) )
Note that the weight shapes are still unknown; shape=(256, 0) indicates that the input dimension has not yet been determined. Trying to access net[0].weight.data() at this point triggers a runtime error stating that the network needs initializing before it can do anything.
net[0].weight.data()
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) <ipython-input-46-59ea5453a5fc> in <module> ----> 1 net[0].weight.data() ~/anaconda3/envs/gluon/lib/python3.6/site-packages/mxnet/gluon/parameter.py in data(self, ctx) 391 NDArray on ctx 392 """ --> 393 return self._check_and_get(self._data, ctx) 394 395 def list_data(self): ~/anaconda3/envs/gluon/lib/python3.6/site-packages/mxnet/gluon/parameter.py in _check_and_get(self, arr_list, ctx) 187 "with Block.collect_params() instead of Block.params " \ 188 "because the later does not include Parameters of " \ --> 189 "nested child Blocks"%(self.name)) 190 191 def _load_init(self, data, ctx): RuntimeError: Parameter 'dense52_weight' has not been initialized. Note that you should initialize parameters and create Trainer with Block.collect_params() instead of Block.params because the later does not include Parameters of nested child Blocks
Next we initialize the network. Note that nothing has really changed: the weight shapes are still unknown, because the input dimension has not been determined yet.
net.initialize()
net.collect_params()
sequential18_ ( Parameter dense52_weight (shape=(256, 0), dtype=float32) Parameter dense52_bias (shape=(256,), dtype=float32) Parameter dense53_weight (shape=(10, 0), dtype=float32) Parameter dense53_bias (shape=(10,), dtype=float32) )
Accessing the weights now raises a DeferredInitializationError: the actual initialization will only happen during the first forward pass.
net[0].weight.data()
--------------------------------------------------------------------------- DeferredInitializationError Traceback (most recent call last) <ipython-input-49-59ea5453a5fc> in <module> ----> 1 net[0].weight.data() ~/anaconda3/envs/gluon/lib/python3.6/site-packages/mxnet/gluon/parameter.py in data(self, ctx) 391 NDArray on ctx 392 """ --> 393 return self._check_and_get(self._data, ctx) 394 395 def list_data(self): ~/anaconda3/envs/gluon/lib/python3.6/site-packages/mxnet/gluon/parameter.py in _check_and_get(self, arr_list, ctx) 181 "Please pass one batch of data through the network before accessing Parameters. " \ 182 "You can also avoid deferred initialization by specifying in_units, " \ --> 183 "num_features, etc., for network layers."%(self.name)) 184 raise RuntimeError( 185 "Parameter '%s' has not been initialized. Note that " \ DeferredInitializationError: Parameter 'dense52_weight' has not been initialized yet because initialization was deferred. Actual initialization happens during the first forward pass. Please pass one batch of data through the network before accessing Parameters. You can also avoid deferred initialization by specifying in_units, num_features, etc., for network layers.
Once we pass a batch of data through the network, the input shapes are inferred and the parameters are initialized with concrete shapes.
x = nd.random.uniform(shape=(2, 20))
net(x) # Forward computation.
net.collect_params()
sequential18_ ( Parameter dense52_weight (shape=(256, 20), dtype=float32) Parameter dense52_bias (shape=(256,), dtype=float32) Parameter dense53_weight (shape=(10, 256), dtype=float32) Parameter dense53_bias (shape=(10,), dtype=float32) )
net[0].weight.data()
[[-0.05247737 -0.01900016 0.06498937 ... 0.02672191 -0.02730501 0.03611466] [ 0.0618015 0.03916474 -0.05941451 ... 0.04577643 -0.0453134 -0.04038748] [ 0.06184389 0.04633274 0.03094608 ... 0.00510379 0.05605743 -0.05085221] ... [-0.06550431 0.04614966 0.04391201 ... -0.01563684 0.04479967 0.06039421] [-0.06207634 0.00493836 -0.0689486 ... 0.02575751 -0.05235828 0.05903549] [-0.01011717 0.01382479 0.02665275 ... -0.05540304 -0.02307985 0.00403536]] <NDArray 256x20 @cpu(0)>
To see exactly when initialization happens, we define an initializer that does nothing except announce that it was called.
class MyInit(init.Initializer):
    def _init_weight(self, name, data):
        print('Init', name, data.shape)
        # The actual initialization logic is omitted here.
net = getnet()
net.initialize(init=MyInit())
x = nd.random.uniform(shape=(2, 20))
y = net(x)
Init dense54_weight (256, 20) Init dense55_weight (10, 256)
No initialization message appeared when initialize was called. It is only when we perform the forward computation net(x) that the system can automatically infer the shape of the weight parameters of all layers from the shape of the input; at that point it calls the MyInit instance to initialize them before proceeding with the forward computation. This happens exactly once: running the forward computation again produces no further output from the initializer.
y = net(x)
Re-initialization with force_reinit triggers MyInit again, and this time it happens immediately, because the parameter shapes are already known.
net.initialize(init=MyInit(), force_reinit=True)
Init dense54_weight (256, 20) Init dense55_weight (10, 256)
Deferred initialization can be avoided altogether by specifying the input dimension in_units when defining each layer, so that initialization can occur immediately once initialize is called.
net = nn.Sequential()
net.add(nn.Dense(256, in_units=20, activation='relu'))
net.add(nn.Dense(10, in_units=256))
net.initialize(init=MyInit())
Init dense56_weight (256, 20) Init dense57_weight (10, 256)
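Because in_units is specified, the parameter shapes are fully known right after initialize, without any forward pass; a quick sketch to confirm:
# All weight shapes are concrete (e.g., (256, 20)) even though no data has been passed yet.
print(net.collect_params())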