# Minimal fork of https://github.com/rwightman/gen-efficientnet-pytorch
# Adds setup and lets you set the activation function
# Note changes on setup branch
# !pip install git+https://github.com/thomasbrandon/gen-efficientnet-pytorch@setup
from fastai.vision import *
from gen_efficientnet.gen_efficientnet import efficientnet_b0, model_urls
import swish_torch  # CUDA Swish extension (https://github.com/thomasbrandon/swish-torch)
SIZE = 256  # Resize crop to 256x256
BS = 48  # Could probably be a little higher for CUDA/Function but will use same for all
LR = 1e-3
PATH = untar_data(URLs.IMAGEWOOF_320)
data = (ImageList
        .from_folder(PATH)
        .split_by_folder(valid='val')
        .label_from_folder()
        .transform(([flip_lr(p=0.5)], []), size=SIZE)
        .databunch(bs=BS, num_workers=6)
        .presize(SIZE, scale=(0.35,1))
        .normalize(imagenet_stats))
class PeakMemMetric(LearnerCallback):
    "Callback that measures used and peak GPU memory."
    _order = -20  # Needs to run before the recorder

    def __init__(self, learn:Learner, device=None):
        super().__init__(learn)
        assert torch.cuda.is_available(), "pytorch CUDA is required"
        self._dev = ifnone(device, torch.cuda.current_device())

    def on_train_begin(self, **kwargs):
        self.learn.recorder.add_metric_names(['cache MB', 'alloc MB'])

    def on_epoch_begin(self, **kwargs):
        torch.cuda.reset_max_memory_cached(self._dev)
        torch.cuda.reset_max_memory_allocated(self._dev)

    def on_epoch_end(self, last_metrics, **kwargs):
        b2mb = lambda num: int(num/2**20)  # bytes -> megabytes
        cache = torch.cuda.max_memory_cached(self._dev)
        alloc = torch.cuda.max_memory_allocated(self._dev)
        return add_metrics(last_metrics, [b2mb(cache), b2mb(alloc)])
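Note: on newer PyTorch releases (1.4+) the `*_memory_cached` functions used above are deprecated in favor of the `*_memory_reserved` names, with a single call to reset the peak stats. A sketch of the equivalent calls, assuming such a version:

# Equivalents on PyTorch >= 1.4 (assumption: the renamed memory APIs are available)
dev = torch.cuda.current_device()
torch.cuda.reset_peak_memory_stats(dev)       # replaces both reset_max_memory_* calls
cache = torch.cuda.max_memory_reserved(dev)   # renamed from max_memory_cached
alloc = torch.cuda.max_memory_allocated(dev)  # unchanged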
def load_pretrained(mdl):
    # Load pretrained weights, except for the differently sized classifier layer
    state_dict = torch.utils.model_zoo.load_url(model_urls['efficientnet_b0'])
    for attr in ['weight','bias']: state_dict[f'classifier.{attr}'] = getattr(mdl.classifier, attr)
    mdl.load_state_dict(state_dict)
# Imagewoof (https://github.com/fastai/imagenette):
# a subset of 10 dog breeds from ImageNet, 320px shortest side
data
ImageDataBunch;

Train: LabelList (12454 items)
x: ImageList
Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256)
y: CategoryList
n02111889,n02111889,n02111889,n02111889,n02111889
Path: /home/user/.fastai/data/imagewoof-320;

Valid: LabelList (500 items)
x: ImageList
Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256)
y: CategoryList
n02111889,n02111889,n02111889,n02111889,n02111889
Path: /home/user/.fastai/data/imagewoof-320;

Test: None
mdl = efficientnet_b0(num_classes=data.c)
load_pretrained(mdl)
mdl.act_fn??
Signature: mdl.act_fn(x, inplace=False)
Docstring: <no docstring>
Source:
def swish(x, inplace=False):
    if inplace:
        return x.mul_(x.sigmoid())
    else:
        return x * x.sigmoid()
File: ~/.conda/envs/fastai/lib/python3.7/site-packages/gen_efficientnet/efficientnet_builder.py
Type: function
lrn = Learner(data, mdl, callback_fns=[PeakMemMetric], metrics=[accuracy])
lrn.fit_one_cycle(5, LR)
epoch | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
0 | 0.400987 | 0.370652 | 0.890000 | 7204 | 6890 | 01:12 |
1 | 0.439666 | 0.385724 | 0.890000 | 7106 | 6879 | 01:11 |
2 | 0.298581 | 0.274652 | 0.910000 | 7106 | 6879 | 01:12 |
3 | 0.136597 | 0.231383 | 0.918000 | 7106 | 6879 | 01:11 |
4 | 0.075961 | 0.211751 | 0.932000 | 7106 | 6879 | 01:11 |
lrn.destroy()
del lrn, mdl
class SwishFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)  # Only save the input; sigmoid is recomputed in backward
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i, = ctx.saved_tensors
        if not ctx.needs_input_grad[0]: return (None,)
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
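The expression in `backward` is just the product rule applied to swish, using $\sigma'(x)=\sigma(x)(1-\sigma(x))$:

$$\frac{d}{dx}\bigl[x\,\sigma(x)\bigr] = \sigma(x) + x\,\sigma(x)\bigl(1-\sigma(x)\bigr) = \sigma(x)\bigl(1 + x\,(1-\sigma(x))\bigr)$$

Because only the input is saved for backward, this avoids the extra sigmoid output that autograd keeps around for the naive `x * x.sigmoid()`, which is where the memory saving comes from.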
# The gen_efficientnet activation functions take an inplace keyword;
# the autograd Function can't operate in place, so just ignore it
def swish_function(x, inplace=False): return SwishFunction.apply(x)
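A quick sanity check (a hypothetical snippet, not part of the original runs) that the hand-written backward agrees with numerical gradients:

# Hypothetical check: gradcheck wants double precision inputs
x = torch.randn(8, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(SwishFunction.apply, (x,))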
mdl = efficientnet_b0(num_classes=data.c, act_fn=swish_function)
load_pretrained(mdl)
lrn = Learner(data, mdl, callback_fns=[PeakMemMetric], metrics=[accuracy])
lrn.fit_one_cycle(5, LR)
epoch | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
0 | 0.450081 | 0.593470 | 0.882000 | 6432 | 5421 | 01:14 |
1 | 0.436954 | 0.368458 | 0.880000 | 6432 | 5421 | 01:13 |
2 | 0.262158 | 0.368661 | 0.890000 | 6432 | 5421 | 01:14 |
3 | 0.142793 | 0.246673 | 0.928000 | 6432 | 5421 | 01:14 |
4 | 0.075377 | 0.240533 | 0.924000 | 6432 | 5421 | 01:14 |
lrn.destroy()
del lrn, mdl
# The gen_efficientnet activation functions take an inplace keyword;
# the CUDA implementation can't operate in place either, so just ignore it
def swish_cuda_fn(x, inplace=False): return swish_torch.swish(x)
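If the model might also be run on CPU, a minimal device-agnostic wrapper (a sketch, assuming the swish_torch extension only accepts CUDA tensors) could fall back to the autograd version above:

def swish_any_device(x, inplace=False):
    # Assumption: swish_torch.swish is CUDA-only, so CPU tensors use SwishFunction
    return swish_torch.swish(x) if x.is_cuda else SwishFunction.apply(x)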
mdl = efficientnet_b0(num_classes=data.c, act_fn=swish_cuda_fn)
load_pretrained(mdl)
lrn = Learner(data, mdl, callback_fns=[PeakMemMetric], metrics=[accuracy])
lrn.fit_one_cycle(5, LR)
epoch | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
0 | 0.444761 | 0.394772 | 0.874000 | 5934 | 5400 | 01:02 |
1 | 0.441538 | 0.434501 | 0.866000 | 5934 | 5400 | 01:01 |
2 | 0.293320 | 0.276060 | 0.906000 | 5934 | 5400 | 01:02 |
3 | 0.149419 | 0.245342 | 0.918000 | 5934 | 5400 | 01:02 |
4 | 0.061624 | 0.258465 | 0.918000 | 5934 | 5400 | 01:02 |
lrn.destroy()
del lrn, mdl
Final-epoch (epoch 4) results for each implementation:

Implementation | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
Original | 0.075961 | 0.211751 | 0.932000 | 7106 | 6879 | 01:11 |
Autograd | 0.075377 | 0.240533 | 0.924000 | 6432 | 5421 | 01:14 |
CUDA | 0.061624 | 0.258465 | 0.918000 | 5934 | 5400 | 01:02 |
So the CUDA version is (slightly) faster than the original while matching the (much lower) memory usage of the Autograd version.