# Minimal fork of https://github.com/rwightman/gen-efficientnet-pytorch
# Adds setup and lets you set the activation function
# Note changes on setup branch
# !pip install git+https://github.com/thomasbrandon/gen-efficientnet-pytorch@setup
from fastai.vision import *
from gen_efficientnet.gen_efficientnet import efficientnet_b0, model_urls
import swish_torch  # CUDA Swish extension (https://github.com/thomasbrandon/swish-torch)
SIZE = 256  # Resize crop to 256x256
BS = 48  # Could probably be a little higher for CUDA/Function but will use same for all
LR = 1e-3
PATH = untar_data(URLs.IMAGEWOOF_320)
data = (ImageList
        .from_folder(PATH)
        .split_by_folder(valid='val')
        .label_from_folder()
        .transform(([flip_lr(p=0.5)], []), size=SIZE)
        .databunch(bs=BS, num_workers=6)
        .presize(SIZE, scale=(0.35,1))
        .normalize(imagenet_stats))
class PeakMemMetric(LearnerCallback):
    "Callback that measures used and peak GPU memory."
    _order = -20  # Needs to run before the recorder

    def __init__(self, learn:Learner, device=None):
        super().__init__(learn)
        assert torch.cuda.is_available(), "pytorch CUDA is required"
        self._dev = ifnone(device, torch.cuda.current_device())

    def on_train_begin(self, **kwargs):
        self.learn.recorder.add_metric_names(['cache MB', 'alloc MB'])

    def on_epoch_begin(self, **kwargs):
        torch.cuda.reset_max_memory_cached(self._dev)
        torch.cuda.reset_max_memory_allocated(self._dev)

    def on_epoch_end(self, last_metrics, **kwargs):
        b2mb = lambda num: int(num/2**20)  # bytes -> megabytes
        cache = torch.cuda.max_memory_cached(self._dev)
        alloc = torch.cuda.max_memory_allocated(self._dev)
        return add_metrics(last_metrics, [b2mb(cache), b2mb(alloc)])
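Note: on newer PyTorch releases (1.4+) the `*_memory_cached` functions used above are deprecated in favor of the `*_memory_reserved` names, with a single call to reset the peak stats. A sketch of the equivalent calls, assuming such a version:

# Equivalents on PyTorch >= 1.4 (assumption: the renamed memory APIs are available)
dev = torch.cuda.current_device()
torch.cuda.reset_peak_memory_stats(dev)       # replaces both reset_max_memory_* calls
cache = torch.cuda.max_memory_reserved(dev)   # renamed from max_memory_cached
alloc = torch.cuda.max_memory_allocated(dev)  # unchanged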
def load_pretrained(mdl):
    # Load pretrained weights, except for the differently sized classifier layer
    state_dict = torch.utils.model_zoo.load_url(model_urls['efficientnet_b0'])
    for attr in ['weight','bias']: state_dict[f'classifier.{attr}'] = getattr(mdl.classifier, attr)
    mdl.load_state_dict(state_dict)
# Imagewoof (https://github.com/fastai/imagenette):
# a subset of 10 dog breeds from ImageNet, 320px shortest side
data
ImageDataBunch;

Train: LabelList (12454 items)
x: ImageList
Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256)
y: CategoryList
n02111889,n02111889,n02111889,n02111889,n02111889
Path: /home/user/.fastai/data/imagewoof-320;

Valid: LabelList (500 items)
x: ImageList
Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256)
y: CategoryList
n02111889,n02111889,n02111889,n02111889,n02111889
Path: /home/user/.fastai/data/imagewoof-320;

Test: None
mdl = efficientnet_b0(num_classes=data.c)
load_pretrained(mdl)
mdl.act_fn??
Signature: mdl.act_fn(x, inplace=False)
Docstring: <no docstring>
Source:
def swish(x, inplace=False):
    if inplace:
        return x.mul_(x.sigmoid())
    else:
        return x * x.sigmoid()
File: ~/.conda/envs/fastai/lib/python3.7/site-packages/gen_efficientnet/efficientnet_builder.py
Type: function
lrn = Learner(data, mdl, callback_fns=[PeakMemMetric], metrics=[accuracy])
lrn.fit_one_cycle(5, LR)
epoch | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
0 | 0.400987 | 0.370652 | 0.890000 | 7204 | 6890 | 01:12 |
1 | 0.439666 | 0.385724 | 0.890000 | 7106 | 6879 | 01:11 |
2 | 0.298581 | 0.274652 | 0.910000 | 7106 | 6879 | 01:12 |
3 | 0.136597 | 0.231383 | 0.918000 | 7106 | 6879 | 01:11 |
4 | 0.075961 | 0.211751 | 0.932000 | 7106 | 6879 | 01:11 |
lrn.destroy()
del lrn, mdl
class SwishFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)  # Only save the input; sigmoid is recomputed in backward
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i, = ctx.saved_tensors
        if not ctx.needs_input_grad[0]: return (None,)
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
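The expression in `backward` is just the product rule applied to swish, using $\sigma'(x)=\sigma(x)(1-\sigma(x))$:

$$\frac{d}{dx}\bigl[x\,\sigma(x)\bigr] = \sigma(x) + x\,\sigma(x)\bigl(1-\sigma(x)\bigr) = \sigma(x)\bigl(1 + x\,(1-\sigma(x))\bigr)$$

Because only the input is saved for backward, this avoids the extra sigmoid output that autograd keeps around for the naive `x * x.sigmoid()`, which is where the memory saving comes from.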
# The gen_efficientnet activation functions take an inplace keyword;
# the autograd Function can't operate in place, so just ignore it
def swish_function(x, inplace=False): return SwishFunction.apply(x)
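A quick sanity check (a hypothetical snippet, not part of the original runs) that the hand-written backward agrees with numerical gradients:

# Hypothetical check: gradcheck wants double precision inputs
x = torch.randn(8, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(SwishFunction.apply, (x,))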
mdl = efficientnet_b0(num_classes=data.c, act_fn=swish_function)
load_pretrained(mdl)
lrn = Learner(data, mdl, callback_fns=[PeakMemMetric], metrics=[accuracy])
lrn.fit_one_cycle(5, LR)
epoch | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
0 | 0.450081 | 0.593470 | 0.882000 | 6432 | 5421 | 01:14 |
1 | 0.436954 | 0.368458 | 0.880000 | 6432 | 5421 | 01:13 |
2 | 0.262158 | 0.368661 | 0.890000 | 6432 | 5421 | 01:14 |
3 | 0.142793 | 0.246673 | 0.928000 | 6432 | 5421 | 01:14 |
4 | 0.075377 | 0.240533 | 0.924000 | 6432 | 5421 | 01:14 |
lrn.destroy()
del lrn, mdl
# The gen_efficientnet activation functions take an inplace keyword;
# the CUDA implementation can't operate in place either, so just ignore it
def swish_cuda_fn(x, inplace=False): return swish_torch.swish(x)
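If the model might also be run on CPU, a minimal device-agnostic wrapper (a sketch, assuming the swish_torch extension only accepts CUDA tensors) could fall back to the autograd version above:

def swish_any_device(x, inplace=False):
    # Assumption: swish_torch.swish is CUDA-only, so CPU tensors use SwishFunction
    return swish_torch.swish(x) if x.is_cuda else SwishFunction.apply(x)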
mdl = efficientnet_b0(num_classes=data.c, act_fn=swish_cuda_fn)
load_pretrained(mdl)
lrn = Learner(data, mdl, callback_fns=[PeakMemMetric], metrics=[accuracy])
lrn.fit_one_cycle(5, LR)
epoch | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
0 | 0.444761 | 0.394772 | 0.874000 | 5934 | 5400 | 01:02 |
1 | 0.441538 | 0.434501 | 0.866000 | 5934 | 5400 | 01:01 |
2 | 0.293320 | 0.276060 | 0.906000 | 5934 | 5400 | 01:02 |
3 | 0.149419 | 0.245342 | 0.918000 | 5934 | 5400 | 01:02 |
4 | 0.061624 | 0.258465 | 0.918000 | 5934 | 5400 | 01:02 |
lrn.destroy()
del lrn, mdl
Final-epoch (epoch 4) results for each implementation:

Implementation | train_loss | valid_loss | accuracy | cache MB | alloc MB | time |
---|---|---|---|---|---|---|
Original | 0.075961 | 0.211751 | 0.932000 | 7106 | 6879 | 01:11 |
Autograd | 0.075377 | 0.240533 | 0.924000 | 6432 | 5421 | 01:14 |
CUDA | 0.061624 | 0.258465 | 0.918000 | 5934 | 5400 | 01:02 |
So the CUDA version is (slightly) faster than the original while matching the (much lower) memory usage of the Autograd version.