#export
from local.test import *
from local.basics import *
from local.notebook.showdoc import *
#default_exp callback.hook
Callback and helper functions to add hooks in models
from local.test_utils import *
Hooks are functions you can attach to a particular layer in your model and that will be executed in the forward pass (for forward hooks) or backward pass (for backward hooks). Here we begin with an introduction around hooks, but you should jump to `HookCallback` if you quickly want to implement one (and read the following example, `ActivationStats`).
Forward hooks are functions that take three arguments: the layer it's applied to, the input of that layer and the output of that layer.
# Minimal demonstration of a raw PyTorch forward hook on a single linear layer.
tst_model = nn.Linear(5,3)
# A forward hook receives the module, its inputs and its output; this one just prints them.
def example_forward_hook(m,i,o): print(m,i,o)

x = torch.randn(4,5)
hook = tst_model.register_forward_hook(example_forward_hook)
y = tst_model(x)  # the hook fires during this forward pass
hook.remove()  # unregister so that later forward passes no longer print
Linear(in_features=5, out_features=3, bias=True) (tensor([[-1.8820, 0.7021, -0.6919, -0.8470, 0.4694], [ 1.6047, 1.1505, 1.9210, -0.4393, 1.6700], [ 1.1959, 0.5682, -1.0785, -0.5261, -0.2628], [-0.9082, 1.5110, 0.3545, 0.3456, 0.7868]]),) tensor([[-0.4622, 0.4920, 0.0128], [ 0.5634, -1.1016, 1.5529], [-0.1620, 0.0452, 0.6342], [ 0.1072, 0.1193, 0.0884]], grad_fn=<AddmmBackward>)
Backward hooks are functions that take three arguments: the layer it's applied to, the gradients of the loss with respect to the input, and the gradients with respect to the output.
# Minimal demonstration of a raw PyTorch backward hook: `gi`/`go` are the gradients
# with respect to the layer's input and output respectively.
def example_backward_hook(m,gi,go): print(m,gi,go)
hook = tst_model.register_backward_hook(example_backward_hook)

x = torch.randn(4,5)
y = tst_model(x)
loss = y.pow(2).mean()
loss.backward()  # the hook fires during this backward pass
hook.remove()  # unregister so that later backward passes no longer print
Linear(in_features=5, out_features=3, bias=True) (tensor([ 0.2188, -0.4699, 0.4320]), None, tensor([[-0.1662, -0.1490, 0.0825], [ 0.5101, 0.0921, 0.0596], [ 0.5675, -0.5542, 0.4907], [-0.2050, -0.0900, -0.0019], [-0.0335, 0.0810, -0.0717]])) (tensor([[ 0.0437, -0.1476, 0.1374], [-0.1166, -0.0519, 0.0458], [ 0.1882, -0.0790, 0.1273], [ 0.1035, -0.1914, 0.1215]]),)
Hooks can change the input/output of a layer, or the gradients, print values or shapes. If you want to store something related to these inputs/outputs, it's best to have your hook associated with a class so that it can put it in the state of an instance of that class.
#export
@docs
class Hook():
    "Create a hook on `m` with `hook_func`."
    def __init__(self, m, hook_func, is_forward=True, detach=True, cpu=False):
        self.hook_func = hook_func
        self.detach = detach
        self.cpu = cpu
        # Holds whatever `hook_func` returned the last time the hook fired.
        self.stored = None
        # Pick the registration method matching the requested pass.
        if is_forward: register = m.register_forward_hook
        else:          register = m.register_backward_hook
        self.hook = register(self.hook_fn)
        self.removed = False

    def hook_fn(self, module, input, output):
        "Applies `hook_func` to `module`, `input`, `output`."
        if self.detach:
            input,output = to_detach(input, cpu=self.cpu),to_detach(output, cpu=self.cpu)
        self.stored = self.hook_func(module, input, output)

    def remove(self):
        "Remove the hook from the model."
        # Guard so that calling `remove` several times is harmless.
        if self.removed: return
        self.hook.remove()
        self.removed = True

    def __enter__(self, *args): return self
    def __exit__(self, *args): self.remove()

    _docs = dict(__enter__="Register the hook",
                 __exit__="Remove the hook")
This will be called during the forward pass if `is_forward=True`, the backward pass otherwise, and will optionally detach and put on the CPU the (gradient of the) input/output of the model before passing them to `hook_func`. The result of `hook_func` will be stored in the `stored` attribute of the `Hook`.
# `Hook.stored` holds the return value of `hook_func`; here that is the layer output.
tst_model = nn.Linear(5,3)
hook = Hook(tst_model, lambda m,i,o: o)
y = tst_model(x)
test_eq(hook.stored, y)
show_doc(Hook.hook_fn)
Hook.hook_fn
[source]
Hook.hook_fn
(module
,input
,output
)
Applies hook_func
to module
, input
, output
.
show_doc(Hook.remove)
Note: It's important to properly remove your hooks from your model when you're done, to avoid them being called again the next time your model is applied to some inputs, and to free the memory that goes with their state.
tst_model = nn.Linear(5,10)
x = torch.randn(4,5)
y = tst_model(x)
hook = Hook(tst_model, example_forward_hook)
# While the hook is attached, a forward pass prints the module, the inputs and the detached output.
test_stdout(lambda: tst_model(x), f"{tst_model} ({x},) {y.detach()}")
hook.remove()
# Once the hook is removed, a forward pass prints nothing.
test_stdout(lambda: tst_model(x), "")
Since it's very important to remove your `Hook` even if your code is interrupted by some bug, `Hook` can be used as a context manager.
show_doc(Hook.__enter__)
show_doc(Hook.__exit__)
tst_model = nn.Linear(5,10)
x = torch.randn(4,5)
y = tst_model(x)
# Inside the `with` block the hook is active and prints on every forward pass.
with Hook(tst_model, example_forward_hook) as h:
    test_stdout(lambda: tst_model(x), f"{tst_model} ({x},) {y.detach()}")
# Leaving the block removes the hook, so nothing is printed any more.
test_stdout(lambda: tst_model(x), "")
#export
def _hook_inner(m,i,o): return o if isinstance(o,Tensor) or is_listy(o) else list(o)
def hook_output(module, detach=True, cpu=False, grad=False):
    "Return a `Hook` that stores activations of `module` in `self.stored`"
    # `grad=True` asks for gradients, which means hooking the backward pass.
    is_forward = not grad
    return Hook(module, _hook_inner, is_forward=is_forward, detach=detach, cpu=cpu)
The activations stored are the gradients if `grad=True`, otherwise the output of `module`. If `detach=True` they are detached from their history, and if `cpu=True`, they're put on the CPU.
tst_model = nn.Linear(5,10)
x = torch.randn(4,5)
# Forward case: the stored value equals the layer output and is detached by default.
with hook_output(tst_model) as h:
    y = tst_model(x)
    test_eq(y, h.stored)
    assert not h.stored.requires_grad

# Backward case: the first stored element is compared with the analytic gradient
# of mean(y**2) with respect to y, i.e. 2*y/numel.
with hook_output(tst_model, grad=True) as h:
    y = tst_model(x)
    loss = y.pow(2).mean()
    loss.backward()
    test_close(2*y / y.numel(), h.stored[0])

#cuda
# With `cpu=True` the stored activations are on the CPU even though the model ran on the GPU.
with hook_output(tst_model, cpu=True) as h:
    y = tst_model.cuda()(x.cuda())
    test_eq(h.stored.device, torch.device('cpu'))
#export
@docs
class Hooks():
    "Create several hooks on the modules in `ms` with `hook_func`."
    def __init__(self, ms, hook_func, is_forward=True, detach=True, cpu=False):
        # One `Hook` per module, all sharing the same function and options.
        self.hooks = [Hook(m, hook_func, is_forward=is_forward, detach=detach, cpu=cpu) for m in ms]

    def __getitem__(self,i): return self.hooks[i]
    def __len__(self): return len(self.hooks)
    def __iter__(self): return iter(self.hooks)

    @property
    def stored(self): return L(h.stored for h in self.hooks)

    def remove(self):
        "Remove the hooks from the model."
        for hook in self.hooks: hook.remove()

    def __enter__(self, *args): return self
    def __exit__ (self, *args): self.remove()

    _docs = dict(stored = "The states saved in each hook.",
                 __enter__="Register the hooks",
                 __exit__="Remove the hooks")
layers = [nn.Linear(5,10), nn.ReLU(), nn.Linear(10,3)]
tst_model = nn.Sequential(*layers)
# Iterating over the `nn.Sequential` yields its three layers, so one hook is created per layer.
hooks = Hooks(tst_model, lambda m,i,o: o)
y = tst_model(x)
# Each hook stored the output of its own layer.
test_eq(hooks.stored[0], layers[0](x))
test_eq(hooks.stored[1], F.relu(layers[0](x)))
test_eq(hooks.stored[2], y)
hooks.remove()
show_doc(Hooks.stored, name='Hooks.stored')
Hooks.stored
[source]The states saved in each hook.
show_doc(Hooks.remove)
Like `Hook`, you can use `Hooks` as a context manager.
show_doc(Hooks.__enter__)
show_doc(Hooks.__exit__)
layers = [nn.Linear(5,10), nn.ReLU(), nn.Linear(10,3)]
tst_model = nn.Sequential(*layers)
# Same checks as with explicit `remove`, but the hooks are removed when the block exits.
with Hooks(layers, lambda m,i,o: o) as h:
    y = tst_model(x)
    test_eq(h.stored[0], layers[0](x))
    test_eq(h.stored[1], F.relu(layers[0](x)))
    test_eq(h.stored[2], y)
#export
def hook_outputs(modules, detach=True, cpu=False, grad=False):
    "Return `Hooks` that store activations of all `modules` in `self.stored`"
    # `grad=True` asks for gradients, which means hooking the backward pass.
    is_forward = not grad
    return Hooks(modules, _hook_inner, is_forward=is_forward, detach=detach, cpu=cpu)
The activations stored are the gradients if `grad=True`, otherwise the output of `modules`. If `detach=True` they are detached from their history, and if `cpu=True`, they're put on the CPU.
layers = [nn.Linear(5,10), nn.ReLU(), nn.Linear(10,3)]
tst_model = nn.Sequential(*layers)
x = torch.randn(4,5)
# Forward case: each hook stores the detached output of its layer.
with hook_outputs(layers) as h:
    y = tst_model(x)
    test_eq(h.stored[0], layers[0](x))
    test_eq(h.stored[1], F.relu(layers[0](x)))
    test_eq(h.stored[2], y)
    for s in h.stored: assert not s.requires_grad

# Backward case: propagate the analytic gradient of mean(y**2) by hand, from the
# last layer back to the first, and compare it with what each hook stored.
with hook_outputs(layers, grad=True) as h:
    y = tst_model(x)
    loss = y.pow(2).mean()
    loss.backward()
    g = 2*y / y.numel()
    test_close(g, h.stored[2][0])
    g = g @ layers[2].weight.data  # back through the last linear layer
    test_close(g, h.stored[1][0])
    g = g * (layers[0](x) > 0).float()  # back through the ReLU
    test_close(g, h.stored[0][0])

#cuda
# With `cpu=True` every stored activation is on the CPU even though the model ran on the GPU.
with hook_outputs(tst_model, cpu=True) as h:
    y = tst_model.cuda()(x.cuda())
    for s in h.stored: test_eq(s.device, torch.device('cpu'))
#export
def dummy_eval(m, size=(64,64)):
    "Evaluate `m` on a dummy input of a certain `size`"
    n_in = in_channels(m)
    # Build a batch of one input from an existing parameter of `m`, filled with uniform noise in [-1, 1].
    dummy = one_param(m).new(1, n_in, *size).requires_grad_(False)
    dummy = dummy.uniform_(-1.,1.)
    # Note that `m` is switched to eval mode and left that way.
    with torch.no_grad(): return m.eval()(dummy)
#export
def model_sizes(m, size=(64,64)):
    "Pass a dummy input through the model `m` to get the various sizes of activations."
    with hook_outputs(m) as hooks:
        # Only the hooks' side effects matter; the model output itself is discarded.
        dummy_eval(m, size=size)
        return [hook.stored.shape for hook in hooks]
# Only the middle layer (stride 2) halves the spatial size of the default 64x64 dummy input.
m = nn.Sequential(ConvLayer(3, 16), ConvLayer(16, 32, stride=2), ConvLayer(32, 32))
test_eq(model_sizes(m), [[1, 16, 64, 64], [1, 32, 32, 32], [1, 32, 32, 32]])
#export
def num_features_model(m):
    "Return the number of output features for `m`."
    # NOTE(review): the result of this call is not used; it is kept in case callers rely on it
    # raising early for unsupported models -- confirm before removing.
    in_channels(m)
    sz = 32
    while True:
        #Trying for a few sizes in case the model requires a big input size.
        try: return model_sizes(m, (sz,sz))[-1][1]
        except Exception:
            # 2048 is the largest size attempted; past that, surface the last failure.
            if sz >= 2048: raise
            sz *= 2
# The number of features is the channel dimension of the last layer's output.
m = nn.Sequential(nn.Conv2d(5,4,3), nn.Conv2d(4,3,3))
test_eq(num_features_model(m), 3)
m = nn.Sequential(ConvLayer(3, 16), ConvLayer(16, 32, stride=2), ConvLayer(32, 32))
test_eq(num_features_model(m), 32)
To make hooks easy to use, we wrapped a version in a `Callback` where you just have to implement a `hook` function (plus any element you might need).
#export
def has_params(m):
    "Check if `m` has at least one parameter"
    # Stop at the first parameter instead of materializing the whole parameter list.
    return next(iter(m.parameters()), None) is not None
# Layers with weights have parameters; a plain activation has none.
assert has_params(nn.Linear(3,4))
assert has_params(nn.LSTM(4,5,2))
assert not has_params(nn.ReLU())
#export
@funcs_kwargs
class HookCallback(Callback):
    "`Callback` that can be used to register hooks on `modules`"
    _methods = ["hook"]
    hook = noops
    def __init__(self, modules=None, every=None, remove_end=True, is_forward=True, detach=True, cpu=True, **kwargs):
        store_attr(self, 'modules,every,remove_end,is_forward,detach,cpu')
        # Any keyword argument still present here is unexpected.
        assert not kwargs

    def begin_fit(self):
        "Register the `Hooks` on `self.modules`."
        if self.modules is None:
            # Default to every flattened layer of the model that owns at least one parameter.
            self.modules = [m for m in flatten_model(self.model) if has_params(m)]
        # With `every` set, registration happens per batch in `begin_batch` instead.
        if self.every is None: self._register()

    def begin_batch(self):
        "Register the hooks for this batch when `every` is set and the training iteration is a multiple of it."
        if self.every is not None and self.training and self.train_iter%self.every==0: self._register()

    def after_batch(self):
        "Remove the hooks when `every` is set and the training iteration is a multiple of it."
        if self.every is not None and self.training and self.train_iter%self.every==0: self._remove()

    def after_fit(self):
        "Remove the `Hooks`."
        if self.remove_end: self._remove()

    def _register(self):
        self.hooks = Hooks(self.modules, self.hook, self.is_forward, self.detach, self.cpu)

    def _remove(self):
        # `hooks` does not exist until `_register` has run at least once.
        hooks = getattr(self, 'hooks', None)
        if hooks: hooks.remove()

    def __del__(self): self._remove()
You can either subclass and implement a `hook` function (along with any event you want) or pass a `hook` function when initializing. Such a function needs to take three arguments: a layer, input and output (for a backward hook, input means gradient with respect to the inputs, and output means gradient with respect to the output) and can either modify them or update the state according to them.
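If not provided, `modules` will default to the layers of `self.model` that have at least one parameter. If `every` is set, the hooks are only registered for the training iterations that are a multiple of `every`, and are removed again after that batch. Depending on `remove_end`, the hooks will be properly removed at the end of training (or in case of error). `is_forward`, `detach` and `cpu` are passed to `Hooks`.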
The function called at each forward (or backward) pass is `self.hook` and must be implemented when subclassing this callback.
# Forward-hook check: the value stored by the first hook is compared with the learner's prediction.
class TstCallback(HookCallback):
    def hook(self, m, i, o): return o
    def after_batch(self): test_eq(self.hooks.stored[0], self.pred)

learn = synth_learner(n_trn=5, cbs = TstCallback())
learn.fit(1)
(#4) [0,11.155611038208008,11.110326766967773,00:00]
# Backward-hook check: `is_forward` is passed as False, so the hooks store gradients.
class TstCallback(HookCallback):
    def __init__(self, modules=None, remove_end=True, detach=True, cpu=False):
        super().__init__(modules, None, remove_end, False, detach, cpu)
    def hook(self, m, i, o): return o
    def after_batch(self):
        if self.training:
            # Expected value is the gradient of mean((pred-y)**2) with respect to pred
            # (presumably the learner's loss is mean squared error -- verify against `synth_learner`).
            test_eq(self.hooks.stored[0][0], 2*(self.pred-self.y)/self.pred.shape[0])

learn = synth_learner(n_trn=5, cbs = TstCallback())
learn.fit(1)
(#4) [0,14.338729858398438,9.407218933105469,00:00]
show_doc(HookCallback.begin_fit)
show_doc(HookCallback.after_fit)
#export
def total_params(m):
    "Give the number of parameters of a module and if it's trainable or not"
    ps = list(m.parameters())
    n_params = sum(p.numel() for p in ps)
    # The trainable flag reflects the first parameter only; a module without parameters is not trainable.
    trainable = ps[0].requires_grad if ps else False
    return n_params, trainable
# Parameter counts for common layers, with and without bias/affine terms.
test_eq(total_params(nn.Linear(10,32)), (32*10+32,True))
test_eq(total_params(nn.Linear(10,32, bias=False)), (32*10,True))
test_eq(total_params(nn.BatchNorm2d(20)), (20*2, True))
# A module without any parameter is reported as not trainable.
test_eq(total_params(nn.BatchNorm2d(20, affine=False)), (0,False))
test_eq(total_params(nn.Conv2d(16, 32, 3)), (16*32*3*3 + 32, True))
test_eq(total_params(nn.Conv2d(16, 32, 3, bias=False)), (16*32*3*3, True))
#First ih layer 20--10, all else 10--10. *4 for the four gates
test_eq(total_params(nn.LSTM(20, 10, 2)), (4 * (20*10 + 10) + 3 * 4 * (10*10 + 10), True))
#export
def layer_info(learn):
    "Return a training batch of `learn` and, for each layer of its model, the class name, parameter count, trainable flag and output shape(s)."
    def _track(m, i, o):
        return (m.__class__.__name__, *total_params(m), apply(lambda x:x.shape, o))
    layers = list(flatten_model(learn.model))
    xb,_ = learn.dbunch.train_dl.one_batch()
    with Hooks(layers, _track) as h:
        # Only the first item of the batch is run through the model (left in eval mode);
        # the output is discarded because the hooks collect what is needed.
        model = learn.model.eval()
        model(apply(lambda o:o[:1], xb))
    return xb,h.stored
m = nn.Sequential(nn.Linear(1,50), nn.ReLU(), nn.BatchNorm1d(50), nn.Linear(50, 1))
learn = synth_learner()
learn.model=m
# One tuple per layer: (class name, number of parameters, trainable, output shape for a single item).
test_eq(layer_info(learn)[1], [
    ('Linear', 100, True, [1, 50]),
    ('ReLU', 0, False, [1, 50]),
    ('BatchNorm1d', 100, True, [1, 50]),
    ('Linear', 51, True, [1, 1])
])
#export
def _print_shapes(o, bs):
if isinstance(o, torch.Size): return ' x '.join([str(bs)] + [str(t) for t in o[1:]])
else: return [_print_shapes(x, bs) for x in o]
#export
@patch
def summary(self:Learner):
    "Return a printable summary of the model, optimizer and loss function."
    xb,infos = layer_info(self)
    n,bs = 64,find_bs(xb)
    inp_sz = _print_shapes(apply(lambda x:x.shape, xb), bs)
    res = f"{self.model.__class__.__name__} (Input shape: {inp_sz})\n"
    res += "=" * n + "\n"
    res += f"{'Layer (type)':<20} {'Output Shape':<20} {'Param #':<10} {'Trainable':<10}\n"
    res += "=" * n + "\n"
    ps,trn_ps = 0,0
    for typ,n_params,trn,sz in infos:
        # Layers for which no output shape was recorded are left out of the table and the totals.
        if sz is None: continue
        ps += n_params
        if trn: trn_ps += n_params
        # `_print_shapes` returns a list for layers with several outputs; lists reject
        # format specs such as `<20`, so convert to `str` before aligning.
        shape_str = str(_print_shapes(sz, bs))
        res += f"{typ:<20} {shape_str:<20} {n_params:<10,} {str(trn):<10}\n"
        res += "_" * n + "\n"
    res += f"\nTotal params: {ps:,}\n"
    res += f"Total trainable params: {trn_ps:,}\n"
    res += f"Total non-trainable params: {ps - trn_ps:,}\n\n"
    res += f"Optimizer used: {self.opt_func}\nLoss function: {self.loss_func}\n\n"
    # The frozen state is only known once an optimizer has been created.
    if self.opt is not None:
        res += "Model " + ("unfrozen\n\n" if self.opt.frozen_idx==0 else f"frozen up to parameter group number {self.opt.frozen_idx}\n\n")
    res += "Callbacks:\n" + '\n'.join(f" - {cb}" for cb in sort_by_run(self.cbs))
    return PrettyString(res)
m = nn.Sequential(nn.Linear(1,50), nn.ReLU(), nn.BatchNorm1d(50), nn.Linear(50, 1))
# Freeze the first layer so the summary shows both trainable and non-trainable parameters.
for p in m[0].parameters(): p.requires_grad_(False)
learn = synth_learner()
learn.create_opt()
learn.model=m
learn.summary()
Sequential (Input shape: 16 x 1) ================================================================ Layer (type) Output Shape Param # Trainable ================================================================ Linear 16 x 50 100 False ________________________________________________________________ ReLU 16 x 50 0 False ________________________________________________________________ BatchNorm1d 16 x 50 100 True ________________________________________________________________ Linear 16 x 1 51 True ________________________________________________________________ Total params: 251 Total trainable params: 151 Total non-trainable params: 100 Optimizer used: functools.partial(<function SGD at 0x7f8216dfbdd0>, mom=0.9) Loss function: FlattenedLoss of MSELoss() Model unfrozen Callbacks: - TrainEvalCallback - Recorder
This is an example of a `HookCallback` that stores the mean, standard deviation and (optionally) histogram of the activations that go through the network.
#exports
@delegates()
class ActivationStats(HookCallback):
    "Callback that record the mean and std of activations."
    # NOTE(review): presumably needed so `after_batch` sees the same `train_iter` as `begin_batch` -- confirm.
    run_before=TrainEvalCallback
    def __init__(self, with_hist=False, **kwargs):
        super().__init__(**kwargs)
        self.with_hist = with_hist

    def begin_fit(self):
        "Initialize stats."
        super().begin_fit()
        self.stats = L()

    def hook(self, m, i, o):
        "Compute the statistics of the output `o` of a hooked module."
        o = o.float()
        res = {'mean': o.mean().item(), 'std': o.std().item(),
               # Fraction (between 0 and 1) of activations that are at most 0.05.
               'percent_null': (o<=0.05).long().sum().item()/o.numel()}
        if self.with_hist: res['hist'] = o.histc(40,0,10)
        return res

    def after_batch(self):
        "Take the stored results and puts it in `self.stats`"
        # When `every` is set, hooks are only registered on iterations that are a multiple of it,
        # so those are the only iterations with fresh results. Recording on the others (as the
        # previous `!= 0` test did) re-appends stale values from removed hooks.
        if self.training and (self.every is None or self.train_iter%self.every == 0):
            self.stats.append(self.hooks.stored)
        super().after_batch()
# With `every=4`, hooks are only registered on training iterations that are a multiple of 4.
learn = synth_learner(n_trn=5, cbs = ActivationStats(every=4))
learn.fit(1)
(#4) [0,15.506263732910156,14.684226989746094,00:00]
learn.activation_stats.stats
(#3) [(#1) [{'mean': -0.9653729200363159, 'std': 1.5585802793502808, 'percent_null': 0.8125}],(#1) [{'mean': -0.9653729200363159, 'std': 1.5585802793502808, 'percent_null': 0.8125}],(#1) [{'mean': -0.9653729200363159, 'std': 1.5585802793502808, 'percent_null': 0.8125}]]
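Each element of `stats` corresponds to one recorded training batch and contains, for every hooked module, a dictionary with the mean (`mean`) and standard deviation (`std`) of the outputs of that module, along with the fraction of those outputs that are at most 0.05 (`percent_null`).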
len(learn.activation_stats.stats[0][0]['hist'])
40
#hide
# Reference callback that computes the mean/std of the first hooked module's output by hand,
# to check them against what `ActivationStats` records.
class TstCallback(HookCallback):
    def hook(self, m, i, o): return o
    def begin_fit(self):
        super().begin_fit()
        self.means,self.stds = [],[]
    def after_batch(self):
        if self.training:
            self.means.append(self.hooks.stored[0].mean().item())
            self.stds.append (self.hooks.stored[0].std() .item())

learn = synth_learner(n_trn=5, cbs = [TstCallback(), ActivationStats()])
learn.fit(1)
# Both callbacks must agree on every training batch.
test_eq(learn.activation_stats.stats.itemgot(0).itemgot("mean"), learn.tst.means)
test_eq(learn.activation_stats.stats.itemgot(0).itemgot("std"), learn.tst.stds)
(#4) [0,4.010986804962158,3.2662320137023926,00:00]
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)
Converted 00_test.ipynb. Converted 01_core_foundation.ipynb. Converted 01a_core_utils.ipynb. Converted 01b_core_dispatch.ipynb. Converted 01c_core_transform.ipynb. Converted 02_core_script.ipynb. Converted 03_torchcore.ipynb. Converted 03a_layers.ipynb. Converted 04_data_load.ipynb. Converted 05_data_core.ipynb. Converted 06_data_transforms.ipynb. Converted 07_data_block.ipynb. Converted 08_vision_core.ipynb. Converted 09_vision_augment.ipynb. Converted 09a_vision_data.ipynb. Converted 10_pets_tutorial.ipynb. Converted 11_vision_models_xresnet.ipynb. Converted 12_optimizer.ipynb. Converted 13_learner.ipynb. Converted 13a_metrics.ipynb. Converted 14_callback_schedule.ipynb. Converted 14a_callback_data.ipynb. Converted 15_callback_hook.ipynb. Converted 15a_vision_models_unet.ipynb. Converted 16_callback_progress.ipynb. Converted 17_callback_tracker.ipynb. Converted 18_callback_fp16.ipynb. Converted 19_callback_mixup.ipynb. Converted 20_interpret.ipynb. Converted 20a_distributed.ipynb. Converted 21_vision_learner.ipynb. Converted 22_tutorial_imagenette.ipynb. Converted 23_tutorial_transfer_learning.ipynb. Converted 30_text_core.ipynb. Converted 31_text_data.ipynb. Converted 32_text_models_awdlstm.ipynb. Converted 33_text_models_core.ipynb. Converted 34_callback_rnn.ipynb. Converted 35_tutorial_wikitext.ipynb. Converted 36_text_models_qrnn.ipynb. Converted 37_text_learner.ipynb. Converted 38_tutorial_ulmfit.ipynb. Converted 40_tabular_core.ipynb. Converted 41_tabular_model.ipynb. Converted 42_tabular_rapids.ipynb. Converted 50_data_block_examples.ipynb. Converted 60_medical_imaging.ipynb. Converted 65_medical_text.ipynb. Converted 70_callback_wandb.ipynb. Converted 71_callback_tensorboard.ipynb. Converted 90_notebook_core.ipynb. Converted 91_notebook_export.ipynb. Converted 92_notebook_showdoc.ipynb. Converted 93_notebook_export2html.ipynb. Converted 94_notebook_test.ipynb. Converted 95_index.ipynb. Converted 96_data_external.ipynb. Converted 97_utils_test.ipynb. 
Converted notebook2jekyll.ipynb.