import sys
import torch
sys.path.append("../../fastai/")
%matplotlib inline
%reload_ext autoreload
%autoreload 2
from fastai.conv_learner import *
from fastai.dataset import *
from pathlib import Path
import json
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects
#torch.cuda.set_device(0)
import matplotlib.cm as cmx
import matplotlib.colors as mcolors
from cycler import cycler
def get_cmap(N):
color_norm = mcolors.Normalize(vmin=0, vmax=N-1)
return cmx.ScalarMappable(norm=color_norm, cmap='Set3').to_rgba
num_colr = 12
cmap = get_cmap(num_colr)
colr_list = [cmap(float(x)) for x in range(num_colr)]
def show_ground_truth(ax, im, bbox, clas=None, prs=None, thresh=0.3):
bb = [bb_hw(o) for o in bbox.reshape(-1,4)]
if prs is None: prs = [None]*len(bb)
if clas is None: clas = [None]*len(bb)
ax = show_img(im, ax=ax)
for i,(b,c,pr) in enumerate(zip(bb, clas, prs)):
if((b[2]>0) and (pr is None or pr > thresh)):
draw_rect(ax, b, color=colr_list[i%num_colr])
txt = f'{i}: '
if c is not None: txt += ('bg' if c==len(id2cat) else id2cat[c])
if pr is not None: txt += f' {pr:.2f}'
draw_text(ax, b[:2], txt, color=colr_list[i%num_colr])
def show_img(im, figsize=None, ax=None):
if not ax: fig,ax = plt.subplots(figsize=figsize)
ax.imshow(im)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
return ax
def draw_outline(o, lw):
o.set_path_effects([patheffects.Stroke(
linewidth=lw, foreground='black'), patheffects.Normal()])
def draw_rect(ax, b, color='white'):
patch = ax.add_patch(patches.Rectangle(b[:2], *b[-2:], fill=False, edgecolor=color, lw=3))
draw_outline(patch, 4)
def draw_text(ax, xy, txt, sz=14, color='white'):
text = ax.text(*xy, txt,
verticalalignment='top', color=color, fontsize=sz, weight='bold')
draw_outline(text, 1)
def draw_im(im, ann):
ax = show_img(im, figsize=(16,8))
for b,c in ann:
b = bb_hw(b)
draw_rect(ax, b)
draw_text(ax, b[:2], id2cat[c], sz=16)
def draw_idx(i):
im_a = trn_anno[i]
im = open_image(IMG_PATH/trn_fns[i])
print(im.shape)
draw_im(im, im_a)
The most awful part: loading and munging the data
PATH = Path('../../data/pascal')
list(PATH.iterdir())
[PosixPath('../../data/pascal/pascal_train2007.json'), PosixPath('../../data/pascal/PASCAL_VOC'), PosixPath('../../data/pascal/pascal_test2007.json'), PosixPath('../../data/pascal/models'), PosixPath('../../data/pascal/tmp'), PosixPath('../../data/pascal/pascal_val2012.json'), PosixPath('../../data/pascal/csv'), PosixPath('../../data/pascal/VOCdevkit'), PosixPath('../../data/pascal/pascal_train2012.json'), PosixPath('../../data/pascal/pascal_val2007.json')]
trn_j = json.load((PATH/'pascal_train2007.json').open())
trn_j.keys()
dict_keys(['images', 'type', 'annotations', 'categories'])
IMAGES,ANNOTATIONS,CATEGORIES = ['images', 'annotations', 'categories']
trn_j[IMAGES][:5]
[{'file_name': '000012.jpg', 'height': 333, 'width': 500, 'id': 12}, {'file_name': '000017.jpg', 'height': 364, 'width': 480, 'id': 17}, {'file_name': '000023.jpg', 'height': 500, 'width': 334, 'id': 23}, {'file_name': '000026.jpg', 'height': 333, 'width': 500, 'id': 26}, {'file_name': '000032.jpg', 'height': 281, 'width': 500, 'id': 32}]
FILE_NAME,ID,IMG_ID,CAT_ID,BBOX = 'file_name','id','image_id','category_id','bbox'
cats = {o[ID]:o['name'] for o in trn_j[CATEGORIES]}
trn_fns = {o[ID]:o[FILE_NAME] for o in trn_j[IMAGES]}
trn_ids = [o[ID] for o in trn_j[IMAGES]]
JPEGS = 'VOCdevkit/VOC2007/JPEGImages'
IMG_PATH = PATH/JPEGS
list(IMG_PATH.iterdir())[:5]
[PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/000671.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/005710.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/003042.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/004429.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/003866.jpg')]
def hw_bb(bb): return np.array([bb[1], bb[0], bb[3]+bb[1]-1, bb[2]+bb[0]-1])
def bb_hw(a): return np.array([a[1],a[0],a[3]-a[1]+1,a[2]-a[0]+1])
trn_anno = collections.defaultdict(lambda:[])
for o in trn_j[ANNOTATIONS]:
if not o['ignore']:
bb = o[BBOX]
bb = hw_bb(bb)
trn_anno[o[IMG_ID]].append((bb,o[CAT_ID]))
len(trn_anno)
2501
i = 0
# get im filename
im_dict = trn_j[IMAGES][i]
im_dict[FILE_NAME],im_dict[ID]
# open image
im = open_image(IMG_PATH/im_dict[FILE_NAME])
# get annotations
im_anno = trn_anno[im_dict[ID]]
# im_anno: a list of (bbox, category_id) tuples for this image
im_anno[0]
(array([ 96, 155, 269, 350]), 7)
im_anno
[(array([ 96, 155, 269, 350]), 7)]
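As a quick sanity check of the two box conventions, we can round-trip the annotation above (values from this dataset):
# bb_hw converts fastai-style [top, left, bottom, right] corners back to the
# COCO-style [x, y, width, height] used in the JSON
bb_hw(im_anno[0][0])          # -> array([155,  96, 196, 174]), the raw COCO bbox
hw_bb(bb_hw(im_anno[0][0]))   # -> array([ 96, 155, 269, 350]), round-trips exactly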
# create multiclass classification csv
MC_CSV = PATH/'csv/mc.csv'
mc = [set([cats[p[1]] for p in trn_anno[o]]) for o in trn_ids]
mcs = [' '.join(str(p) for p in o) for o in mc]
df = pd.DataFrame({'fn': [trn_fns[o] for o in trn_ids], 'clas': mcs}, columns=['fn','clas'])
df.to_csv(MC_CSV, index=False)
df.head()
| | fn | clas |
|---|---|---|
| 0 | 000012.jpg | car |
| 1 | 000017.jpg | person horse |
| 2 | 000023.jpg | person bicycle |
| 3 | 000026.jpg | car |
| 4 | 000032.jpg | person aeroplane |
# create class and bbox CSVs
CLAS_CSV = PATH/'csv/clas.csv'
MBB_CSV = PATH/'csv/mbb.csv'
mc = [[cats[p[1]] for p in trn_anno[o]] for o in trn_ids]
id2cat = list(cats.values())
cat2id = {v:k for k,v in enumerate(id2cat)}
mcs = np.array([np.array([cat2id[p] for p in o]) for o in mc]); mcs
val_idxs = get_cv_idxs(len(trn_fns))
((val_mcs,trn_mcs),) = split_by_idx(val_idxs, mcs)
mbb = [np.concatenate([p[0] for p in trn_anno[o]]) for o in trn_ids]
mbbs = [' '.join(str(p) for p in o) for o in mbb]
df = pd.DataFrame({'fn': [trn_fns[o] for o in trn_ids], 'bbox': mbbs}, columns=['fn','bbox'])
df.to_csv(MBB_CSV, index=False)
df.head(2)
| | fn | bbox |
|---|---|---|
| 0 | 000012.jpg | 96 155 269 350 |
| 1 | 000017.jpg | 61 184 198 278 77 89 335 402 |
f_model=resnet34
sz=256
bs=32
aug_tfms = [RandomFlip(tfm_y=TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=TfmType.COORD, aug_tfms=aug_tfms)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4, bs=16)
aug_tfms = [RandomRotate(10, tfm_y = TfmType.COORD),
RandomLighting(0.05,0.05, tfm_y = TfmType.COORD),
RandomFlip(tfm_y = TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, aug_tfms=aug_tfms, crop_type=CropType.NO, tfm_y = TfmType.COORD)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4, bs=bs)
class ConcatLblDataset(Dataset):
def __init__(self, ds, y2):
self.ds,self.y2 = ds,y2
self.sz = ds.sz
def __len__(self): return len(self.ds)
def __getitem__(self, i):
x,y = self.ds[i]
return (x, (y,self.y2[i]))
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2
len(md.trn_ds), len(md.val_ds)
(2001, 500)
x,y=to_np(next(iter(md.val_dl)))
x=md.val_ds.ds.denorm(x)
sample_x, sample_y_bb, sample_y_cls = x[0], y[0][0], y[1][0]
sample_y_bb, sample_y_cls
(array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 103., 53., 255., 193., 0., 56., 235., 206., 10., 193., 248., 254.], dtype=float32), array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 14, 14]))
def show_gt(ax, x, y_bb, y_cls):
"""
Show ground truth image and labels given a sample
x, y_bb and y_cls : image, bbox array, cls idxs array
Modified version of 'show_ground_truth'
"""
# get non zero idxs for indexing labels
nonzero_idxs = np.nonzero(y_cls)[0]
# get cls indexes
cls_idxs = y_cls[nonzero_idxs]
# get bboxes - bb format
bboxes = []
for idx in nonzero_idxs:
bboxes.append(y_bb[idx*4:(idx+1)*4])
# show img, draw bboxes and text
ax = show_img(x, ax=ax)
for bb, cls_idx in zip(bboxes, cls_idxs):
b = bb_hw(bb)
draw_rect(ax, b, color=colr_list[cls_idx%len(colr_list)])
draw_text(ax, b[:2], id2cat[cls_idx])
fig, ax = plt.subplots(figsize=(20, 10))
show_gt(ax, sample_x, sample_y_bb, sample_y_cls)
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
for i,ax in enumerate(axes.flat):
show_gt(ax, x[i], y[0][i], y[1][i])
plt.tight_layout()
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
# check data
x, y = next(iter(md.trn_dl))
x.size(), y[0].size(), y[1].size()
(torch.Size([32, 3, 256, 256]), torch.Size([32, 36]), torch.Size([32, 9]))
"Taking an input feature map with C channels from a given pyramid level, the subnet applies four 3x3 conv layers, each with C filters and each followed by ReLU activations, followed by a 3x3 conv layer with KA filters. Finally sigmoid activations are attached to output the KA binary predictions per spatial location, see Figure 3 (c). We use C = 256 and A = 9 in most experiments.
We will use same notations from the paper
K: number of classes
A: number of anchors
All new conv layers except the final one in the RetinaNet subnets are initialized with bias b = 0 and a Gaussian weight fill with $\sigma = 0.01$. For the final conv layer of the classification subnet, we set the bias initialization to $b = − log((1 − \pi)/\pi)$.
"""
K: number of classes
A: number of anchors - 9 for original paper
C: number of channels coming from the pyramid levels
"""
K, A, C = len(id2cat), 1, 256
def conv_bn_relu(kernel_size=3, stride=1, pad=1, in_c=256, out_c=256, use_bn=True):
    """Conv + ReLU block, with optional batchnorm"""
    block = nn.Sequential(
        nn.Conv2d(in_c, out_c, kernel_size, stride, pad),
        nn.ReLU())
    if use_bn: block.add_module('2', nn.BatchNorm2d(out_c))
    # N(0, 0.01) weight, bias = 0 initialization
    block[0].weight.data.normal_(0, 0.01)
    block[0].bias.data.zero_()
    return block
def flatten_conv(x, A):
"""
IMPORTANT: Receptive fields should match target
A: number of anchors
Flatten output as:
grid row 0 col 0 anchor 0
grid row 0 col 0 anchor 1
...
grid row 0 col 1 anchor 0
grid row 0 col 1 anchor 1
...
grid row 0 col 2 anchor 0
grid row 0 col 2 anchor 1
...
    grid row n col n anchor A-2
    grid row n col n anchor A-1
"""
bs,nf,gx,gy = x.size()
x = x.permute(0,2,3,1).contiguous()
return x.view(bs,-1,nf//A)
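A quick shape check for flatten_conv (illustrative values; A=9 here just to make the reshape visible):
t = V(torch.randn(2, 9*4, 8, 8))   # bs=2, A=9 anchors x 4 box coords, 8x8 grid
flatten_conv(t, 9).size()          # -> torch.Size([2, 576, 4]): 8*8*9 anchors, 4 values each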
class Subnet1(nn.Module):
    """For classification: outputs K*A per spatial location"""
    def __init__(self, K, A, in_c, use_bn=False, depth=4, pi=0.01):
        super().__init__()
        # Number of anchors
        self.A = A
        # depth (4 in the paper) distinct conv blocks; note that repeating a
        # single block with `*depth` would share weights between the layers
        self.conv = nn.Sequential(*[m for _ in range(depth)
                                    for m in children(conv_bn_relu(use_bn=use_bn))])
        # Final convolution for prediction
        self.out_conv = nn.Conv2d(in_c, K*A, kernel_size=3, stride=1, padding=1)
        # N(0, 0.01) bias = -np.log((1-pi)/pi) initialization
        self.out_conv.weight.data.normal_(0, 0.01)
        self.out_conv.bias.data = self.out_conv.bias.data.zero_() - np.log((1-pi)/pi)
    def forward(self, x):
        return flatten_conv(self.out_conv(self.conv(x)), self.A)
subnet1 = Subnet1(K, A, 256)
subnet1
Subnet1(
  (conv): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
  )
  (out_conv): Conv2d(256, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
"The design of the box regression subnet is identical to the classification subnet except that it terminates in 4A linear outputs per spatial location."
class Subnet2(nn.Module):
    """For box regression: outputs 4*A per spatial location"""
    def __init__(self, A, in_c, use_bn=False, depth=4):
        super().__init__()
        # Number of anchors
        self.A = A
        # depth (4 in the paper) distinct conv blocks
        self.conv = nn.Sequential(*[m for _ in range(depth)
                                    for m in children(conv_bn_relu(use_bn=use_bn))])
        # Final convolution for prediction
        self.out_conv = nn.Conv2d(in_c, 4*A, kernel_size=3, stride=1, padding=1)
        # N(0, 0.01) bias = 0
        self.out_conv.weight.data.normal_(0, 0.01)
        self.out_conv.bias.data = self.out_conv.bias.data.zero_()
    def forward(self, x):
        return flatten_conv(self.out_conv(self.conv(x)), self.A)
subnet2 = Subnet2(A, C)
subnet2
Subnet2(
  (conv): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
  )
  (out_conv): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
"We experiment with ResNet-50-FPN and ResNet-101-FPN backbones [20]. The base ResNet-50 and ResNet-101 models are pre-trained on ImageNet1k; we use the models released by [16]. New layers added for FPN are initialized as in [20]."
# load defined model
def get_encoder(f, cut):
base_model = (cut_model(f(True), cut))
return nn.Sequential(*base_model)
class SaveFeatures():
""" Extract pretrained activations"""
features=None
def __init__(self, m): self.hook = m.register_forward_hook(self.hook_fn)
def hook_fn(self, module, input, output): self.features = output
def remove(self): self.hook.remove()
f_model=resnet50
sz=256
bs=64
cut, cut_lr = model_meta[f_model]
inp = V(torch.ones(1,3,sz,sz))
encoder = get_encoder(f_model, cut)
sfs = [SaveFeatures(encoder[i]) for i in range(len(children(encoder)))]
inp = V(torch.ones(1,3,sz,sz))
encoder = encoder.cuda()
encoder(inp).size()
torch.Size([1, 2048, 8, 8])
Paper Notes
Following [20], we build FPN on top of the ResNet architecture [16]. We construct a pyramid with levels P3 through P7, where l indicates pyramid level (Pl has resolution $2^l$ lower than the input). As in [20] all pyramid levels have C = 256 channels. Details of the pyramid generally follow [20] with a few modest differences. While many design choices are not crucial, we emphasize the use of the FPN backbone is; preliminary experiments using features from only the final ResNet layer yielded low AP.
RetinaNet uses feature pyramid levels P3 to P7, where P3 to P5 are computed from the output of the corresponding ResNet residual stage (C3 through C5) using top-down and lateral connections just as in [20], P6 is obtained via a 3x3 stride-2 conv on C5, and P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6. This differs slightly from [20]: (1) we don't use the high-resolution pyramid level P2 for computational reasons, (2) P6 is computed by strided convolution instead of downsampling, and (3) we include P7 to improve large object detection. These minor modifications improve speed while maintaining accuracy.
# activations saved by the forward hooks; indices 4-7 hold C2-C5 (256/512/1024/2048 channels for resnet50)
[stage_act.features.size() for stage_act in sfs]
[torch.Size([1, 64, 128, 128]), torch.Size([1, 64, 128, 128]), torch.Size([1, 64, 128, 128]), torch.Size([1, 64, 64, 64]), torch.Size([1, 256, 64, 64]), torch.Size([1, 512, 32, 32]), torch.Size([1, 1024, 16, 16]), torch.Size([1, 2048, 8, 8])]
class FPN(nn.Module):
def __init__(self, encoder, out_c=256):
super().__init__()
self.encoder = encoder
self.sfs = [SaveFeatures(self.encoder[i]) for i in range(len(children(self.encoder)))]
self.out_c = out_c
def forward(self, x):
#pdb.set_trace()
# encode image with ResNet backbone
x = self.encoder(x)
# get c1, c2, c3, c4, c5 activations
c1 = self.sfs[2].features #64 (sz/2)
c2 = self.sfs[4].features #256 (sz/4)
c3 = self.sfs[5].features #512 (sz/8)
c4 = self.sfs[6].features #1024 (sz/16)
c5 = self.sfs[7].features #2048 (sz/32) : sz should be divisible by 32
C_sz = c5.size()[1] # 2048
# construct convs
if not hasattr(self, 'P6_conv1'):
# get channel size of each intermediate activation
self.sfs_c_sz = [stage_act.features.size()[1] for stage_act in self.sfs]
self.P6_conv1 = nn.Conv2d(C_sz, self.out_c, kernel_size=3, stride=2, padding=1).cuda()
self.P6_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
self.P7_conv = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=2, padding=1).cuda()
self.P5_conv1 = nn.Conv2d(C_sz, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
self.P5_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
self.P4_conv1 = nn.Conv2d(C_sz//2, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
self.P4_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
self.P3_conv1 = nn.Conv2d(C_sz//4, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
self.P3_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
#self.P2_conv1 = nn.Conv2d(C_sz//8, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
#self.P2_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
        # build P4, P5, P6 and P7 (the P2/P3 branches are left commented out)
p6 = self.P6_conv1(c5)
p6_out = self.P6_conv2(p6)
p7_out = self.P7_conv(F.relu(p6_out))
        # note: in the paper P5 comes from a lateral conv on C5 only; adding
        # upsampled P6 here is a deviation of this implementation
        p5 = self.P5_conv1(c5) + F.upsample(p6, scale_factor=2, mode='nearest')
        p5_out = self.P5_conv2(p5)
p4 = self.P4_conv1(c4) + F.upsample(p5, scale_factor=2, mode='nearest')
p4_out = self.P4_conv2(p4)
#p3 = self.P3_conv1(c3) + F.upsample(p4, scale_factor=2, mode='nearest')
#p3_out = self.P3_conv2(p3)
#p2 = self.P2_conv1(c2) + F.upsample(p3, scale_factor=2, mode='nearest')
#p2_out = self.P2_conv2(p2)
#return [p2_out, p3_out, p4_out, p5_out, p6_out, p7_out]
#return [p3_out, p4_out, p5_out, p6_out, p7_out]
return [p4_out, p5_out, p6_out, p7_out]
for l in range(2,8):
print(f'Resolution of level P{l}:' ,(256 / 2**l))
Resolution of level P2: 64.0
Resolution of level P3: 32.0
Resolution of level P4: 16.0
Resolution of level P5: 8.0
Resolution of level P6: 4.0
Resolution of level P7: 2.0
f_model=resnet34
sz=256
bs=64
cut, cut_lr = model_meta[f_model]
inp = V(torch.ones(1,3,sz,sz))
encoder = get_encoder(f_model, cut)
out = encoder.cuda()(inp)
fpn = FPN(encoder=encoder)
outs = fpn(inp)
#[p2_out, p3_out, p4_out, p5_out, p6_out]
[out.size() for out in outs]
[torch.Size([1, 256, 16, 16]), torch.Size([1, 256, 8, 8]), torch.Size([1, 256, 4, 4]), torch.Size([1, 256, 2, 2])]
class RetinaNet(nn.Module):
def __init__(self, fpn, subnet1, subnet2):
super().__init__()
self.fpn = fpn # initialize FPN
self.subnet1 = subnet1 # initialize classifier
self.subnet2 = subnet2 # initialize regressor
def forward(self, x):
#p3_out, p4_out, p5_out, p6_out, p7_out = self.fpn(x)
p4_out, p5_out, p6_out, p7_out = self.fpn(x)
#cls_out5 = self.subnet1(p3_out) # 32x32
cls_out4 = self.subnet1(p4_out) # 16x16
cls_out3 = self.subnet1(p5_out) # 8x8
cls_out2 = self.subnet1(p6_out) # 4x4
cls_out1 = self.subnet1(p7_out) # 2x2
#reg_out5 = self.subnet2(p3_out) # 32x32
reg_out4 = self.subnet2(p4_out) # 16x16
reg_out3 = self.subnet2(p5_out) # 8x8
reg_out2 = self.subnet2(p6_out) # 4x4
reg_out1 = self.subnet2(p7_out) # 2x2
        # Concat outputs from the different pyramid levels.
        # Each output comes from flatten_conv, so the concatenated order is:
        # grid row 0 col 0 anchor 0   level: 2x2
        # grid row 0 col 1 anchor 0   level: 2x2
        # ...
        # grid row 0 col 0 anchor 0   level: 4x4
        # grid row 0 col 1 anchor 0   level: 4x4
        # ...
        # grid row 0 col 0 anchor 0   level: 8x8
        # grid row 0 col 1 anchor 0   level: 8x8
#return [torch.cat([cls_out1, cls_out2, cls_out3, cls_out4, cls_out5], 1),
# torch.cat([reg_out1, reg_out2, reg_out3, reg_out4, reg_out5], 1)]
# return [torch.cat([cls_out1, cls_out2, cls_out3, cls_out4], 1),
# torch.cat([reg_out1, reg_out2, reg_out3, reg_out4], 1)]
return [torch.cat([cls_out1, cls_out2, cls_out3], 1),
torch.cat([reg_out1, reg_out2, reg_out3], 1)]
# 20 classes, A=1 anchor per grid cell (the paper uses 9) and 256 channels
K, A, in_c = len(id2cat), 1, 256
# initialize retinanet model
fpn, subnet1, subnet2 = FPN(encoder), Subnet1(K, A, in_c), Subnet2(A, in_c)
retina = RetinaNet(fpn, subnet1, subnet2)
out = retina.cuda()(inp)
out[0].size(), out[1].size()
(torch.Size([1, 84, 20]), torch.Size([1, 84, 4]))
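The 84 rows in each output are just the grid cells of the three concatenated pyramid levels; a quick check:
sum(g*g*A for g in [2, 4, 8])   # -> 84 = 4 + 16 + 64, matching the sizes above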
# number of grid cells a single pyramid level would contribute (here P2: 64x64)
total_out = 0
for l in range(2,3):
total_out += (256 / 2**l)**2
total_out
4096.0
We constructed the model to output predictions from three FPN levels; after concatenation they correspond to 2x2, 4x4 and 8x8 feature maps respectively.
# Concat outputs from the different pyramid levels.
# Each output comes from flatten_conv, so the concatenated order is:
# grid row 0 col 0 anchor 0   level: 2x2
# grid row 0 col 1 anchor 0   level: 2x2
# ...
# grid row 0 col 0 anchor 0   level: 4x4
# grid row 0 col 1 anchor 0   level: 4x4
# ...
# grid row 0 col 0 anchor 0   level: 8x8
# grid row 0 col 1 anchor 0   level: 8x8
def hw2corners(ctr, hw): return torch.cat([ctr-hw/2, ctr+hw/2], dim=1)
#anc_grids = [2, 4, 8, 16, 32]
anc_grids = [2, 4, 8]
#anc_zooms = [1, 2**(1/3), 2**(2/3)]
anc_zooms = [1]
#anc_ratios = [(1.,2.), (1.,1), (2.,1.)]
anc_ratios = [(1., 1.)]
anchor_scales = [(anz*i,anz*j) for anz in anc_zooms for (i,j) in anc_ratios] # n_anc_zooms * n_anc_ratios
k = len(anchor_scales)
anc_offsets = [1/(o*2) for o in anc_grids]
k
1
anc_x = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag)
for ao,ag in zip(anc_offsets,anc_grids)])
anc_y = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag)
for ao,ag in zip(anc_offsets,anc_grids)])
# #daveluo
# anc_x = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag)
# for ao,ag in zip(anc_offsets,anc_grids)])
# anc_y = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag)
# for ao,ag in zip(anc_offsets,anc_grids)])
anc_ctrs = np.repeat(np.stack([anc_x,anc_y], axis=1), k, axis=0)
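As a sanity check, the first four centers belong to the 2x2 grid; with an offset of 1/4 they sit at 0.25 and 0.75:
anc_ctrs[:4]
# -> array([[0.25, 0.25],
#           [0.25, 0.75],
#           [0.75, 0.25],
#           [0.75, 0.75]])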
def get_anchor_groups(anc):
"""anc_ctrs or anchor_crn"""
groups = []
for i, grid in enumerate(anc_grids):
n = sum(np.array(anc_grids[:i])**2)
m = sum(np.array(anc_grids[:i+1])**2)
groups.append(anc[n:m])
return groups
anc_sizes = np.concatenate([np.array([[o/ag,p/ag] for i in range(ag*ag) for o,p in anchor_scales])
for ag in anc_grids])
grid_sizes = V(np.concatenate([np.array([ 1/ag for i in range(ag*ag) for o,p in anchor_scales])
for ag in anc_grids]), requires_grad=False).unsqueeze(1)
anchors = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()
anchor_cnr = hw2corners(anchors[:,:2], anchors[:,2:])
anc_ctrs_groups = get_anchor_groups(anc_ctrs)
anc_cnr_groups = get_anchor_groups(anchor_cnr)
[len(i) for i in anc_cnr_groups]
[4, 16, 64]
# anchor_cnr for 2x2
anchor_crn2 = anc_cnr_groups[0]
# anchor_cnr for 4x4
anchor_crn4 = anc_cnr_groups[1]
# anchor_cnr for 8x8
anchor_crn8 = anc_cnr_groups[2]
# anchor_cnr for 16x16
#anchor_crn16 = anc_cnr_groups[3]
# anchor_cnr for 32x32
#anchor_crn32 = anc_cnr_groups[4]
[len(anchor_crn2), len(anchor_crn4), len(anchor_crn8)]#, len(anchor_crn16), len(anchor_crn32)
[4, 16, 64]
anchor_cnr.size()
torch.Size([84, 4])
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(sample_x)
# one anchor box (index 2) from the 2x2 grid
for anc in (anchor_crn2)[2:3]:
draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # grid row 0 col 1 for 4x4
# for anc in (anchor_crn4)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # # grid row 0 col 1 for 8x8
# for anc in (anchor_crn8)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # # grid row 0 col 1 for 16x16
# for anc in (anchor_crn16)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # grid row 0 col 1 for 32x32
# for anc in (anchor_crn32)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
def one_hot_embedding(labels, num_classes):
return torch.eye(num_classes)[labels.data.cpu()]
class BCE_Loss(nn.Module):
def __init__(self, num_classes):
super().__init__()
self.num_classes = num_classes
def forward(self, preds, targets):
t = one_hot_embedding(targets, self.num_classes+1)
        t = V(t[:,:-1].contiguous()) # drop the bg column: background is implicit (predicted when no other class fires)
        x = preds[:,:]
        w = self.get_weight(x,t) # weighting term; None for plain BCE, overridden by FocalLoss
return F.binary_cross_entropy_with_logits(x, t, w, size_average=False) / self.num_classes
def get_weight(self,x,t):
return None
class FocalLoss(BCE_Loss):
def get_weight(self,x,t):
alpha,gamma = 0.25,2.
p = x.sigmoid()
pt = p*t + (1-p)*(1-t)
w = alpha*t + (1-alpha)*(1-t)
return w * (1-pt).pow(gamma)
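The weight term is what makes the loss "focal": $(1-p_t)^\gamma$ shrinks the contribution of well-classified anchors. A back-of-the-envelope check for a positive target (t=1, so pt=p), in plain Python:
alpha, gamma = 0.25, 2.
for p in (0.9, 0.5, 0.1):             # predicted probability for a positive
    print(p, alpha * (1 - p)**gamma)  # easy p=0.9 -> 0.0025, hard p=0.1 -> 0.2025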
loss_f = FocalLoss(len(id2cat))
def intersect(box_a, box_b):
max_xy = torch.min(box_a[:, None, 2:], box_b[None, :, 2:])
min_xy = torch.max(box_a[:, None, :2], box_b[None, :, :2])
inter = torch.clamp((max_xy - min_xy), min=0)
return inter[:, :, 0] * inter[:, :, 1]
def box_sz(b): return ((b[:, 2]-b[:, 0]) * (b[:, 3]-b[:, 1]))
def jaccard(box_a, box_b):
inter = intersect(box_a, box_b)
union = box_sz(box_a).unsqueeze(1) + box_sz(box_b).unsqueeze(0) - inter
return inter / union
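A tiny IoU check with two hypothetical boxes in corner format:
a = T(np.array([[0., 0., 2., 2.]]))   # 2x2 box at the origin
b = T(np.array([[1., 1., 3., 3.]]))   # 2x2 box shifted by (1, 1)
jaccard(a, b)   # intersection 1, union 4 + 4 - 1 = 7 -> ~0.143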
#Removes the zero padding in the target bbox/class
size = 256
def get_y(bbox,clas):
bbox = bbox.view(-1,4)/size
bb_keep = ((bbox[:,2] - bbox[:,0])>0.).nonzero()[:,0]
return bbox[bb_keep], clas[bb_keep]
def actn_to_bb(actn, anchors):
actn_bbs = torch.tanh(actn)
actn_ctrs = (actn_bbs[:,:2] * grid_sizes/2) + anchors[:,:2]
actn_hw = (1 + actn_bbs[:,2:]/2) * anchors[:,2:]
return hw2corners(actn_ctrs,actn_hw)
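With zero activations the predicted box is exactly its anchor: tanh(0) = 0 leaves the center at the anchor center and the height/width at the anchor size. A quick check:
actn_to_bb(V(torch.zeros(anchors.size(0), 4)), anchors)[:4]   # equals anchor_cnr[:4]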
def map_to_ground_truth(overlaps, print_it=False):
prior_overlap, prior_idx = overlaps.max(1)
if print_it: print(prior_overlap)
# pdb.set_trace()
gt_overlap, gt_idx = overlaps.max(0)
gt_overlap[prior_idx] = 1.99
for i,o in enumerate(prior_idx): gt_idx[o] = i
return gt_overlap,gt_idx
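A toy example (illustrative values) makes the matching clearer: every anchor gets its best-overlapping GT object, and every GT object's best anchor is forced to a 1.99 overlap so each object is matched at least once:
ov = T(np.array([[0.1, 0.6, 0.2],     # GT 0 vs 3 anchors
                 [0.3, 0.2, 0.7]]))   # GT 1 vs 3 anchors
map_to_ground_truth(ov)
# -> gt_overlap = [0.30, 1.99, 1.99], gt_idx = [1, 0, 1]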
def ssd_1_loss(b_c,b_bb,bbox,clas,print_it=False, use_ab=True):
bbox,clas = get_y(bbox,clas)
a_ic = actn_to_bb(b_bb, anchors)
overlaps = jaccard(bbox.data, (anchor_cnr if use_ab else a_ic).data)
gt_overlap,gt_idx = map_to_ground_truth(overlaps,print_it)
gt_clas = clas[gt_idx]
pos = gt_overlap > 0.5
pos_idx = torch.nonzero(pos)[:,0]
gt_clas[1-pos] = len(id2cat)
gt_bbox = bbox[gt_idx]
loc_loss = ((a_ic[pos_idx] - gt_bbox[pos_idx]).abs()).mean()
clas_loss = loss_f(b_c, gt_clas) / len(pos_idx) #Normalized by the number of anchors matched to a GT object
return loc_loss, clas_loss
def ssd_loss(pred,targ,print_it=False):
lcs,lls = 0.,0.
for b_c,b_bb,bbox,clas in zip(*pred,*targ):
loc_loss,clas_loss = ssd_1_loss(b_c,b_bb,bbox,clas,print_it)
lls += loc_loss
lcs += clas_loss
if print_it: print(f'loc: {lls.data[0]}, clas: {lcs.data[0]}')
return lls+lcs
def ssd_loss2(pred,targ):
lcs,lls = 0.,0.
for b_c,b_bb,bbox,clas in zip(*pred,*targ):
loc_loss,clas_loss = ssd_1_loss(b_c,b_bb,bbox,clas,use_ab=False)
lls += loc_loss
lcs += clas_loss
return lls+lcs
# wrap for fastai
class RetinaNetModel():
def __init__(self, model, cut_lr, name='retinanet'):
self.model,self.name, self.cut_lr = model, name, cut_lr
def get_layer_groups(self, precompute):
lgs = list(split_by_idxs(children(self.model.fpn.encoder), [self.cut_lr]))
return [lgs[0]] + [children(self.model.fpn)[1:]] + [children(self.model.subnet1)] + [children(self.model.subnet2)]
aug_tfms = [RandomRotate(10, tfm_y = TfmType.COORD),
RandomLighting(0.05,0.05, tfm_y = TfmType.COORD),
RandomFlip(tfm_y = TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, aug_tfms=aug_tfms, crop_type=CropType.NO, tfm_y = TfmType.COORD)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4, bs=bs)
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2
# INIT MODEL
f_model=resnet34
sz=256
bs=32
cut, cut_lr = model_meta[f_model]
encoder = get_encoder(f_model, cut).cuda()
# input channel for subnets, C = 256 in the paper
A, K, in_c = 1, 20, 256
fpn, subnet1, subnet2 = FPN(encoder), Subnet1(K, A, in_c), Subnet2(A, in_c)
inp = V(torch.ones(1,3,256,256))
out = fpn(inp.cuda())
retina = RetinaNet(fpn, subnet1, subnet2).cuda()
model = RetinaNetModel(retina, 8)
# init learner and define optimizer
learn = ConvLearner(md, model)
learn.opt_fn=partial(optim.SGD,momentum=0.9)
learn.crit = ssd_loss
# freeze resnet
learn.freeze_to(1)
learn.unfreeze()
fpn_outs = learn.model.fpn(inp)
[out.size() for out in fpn_outs]
[torch.Size([1, 256, 16, 16]), torch.Size([1, 256, 8, 8]), torch.Size([1, 256, 4, 4]), torch.Size([1, 256, 2, 2])]
outs = learn.model(inp)
# classes, bbox
outs[0].size(), outs[1].size()
(torch.Size([1, 84, 20]), torch.Size([1, 84, 4]))
layers = learn.models.get_layer_groups(False)
for m in layers[0]:
for p in m.parameters():
print(p.requires_grad)
True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True
learn.lr_find()
learn.sched.plot()
lr = 1e-3
#lrs = [lr/100, lr, lr, lr]
learn.fit(lr, n_cycle=20, cycle_len=1, use_clr_beta=(10, 10, 0.85, 0.90))
epoch      trn_loss   val_loss
    0      2.215486   2.504212
def nms(boxes, scores, overlap=0.5, top_k=100):
keep = scores.new(scores.size(0)).zero_().long()
if boxes.numel() == 0: return keep
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
area = torch.mul(x2 - x1, y2 - y1)
v, idx = scores.sort(0) # sort in ascending order
idx = idx[-top_k:] # indices of the top-k largest vals
xx1 = boxes.new()
yy1 = boxes.new()
xx2 = boxes.new()
yy2 = boxes.new()
w = boxes.new()
h = boxes.new()
count = 0
while idx.numel() > 0:
i = idx[-1] # index of current largest val
keep[count] = i
count += 1
if idx.size(0) == 1: break
idx = idx[:-1] # remove kept element from view
# load bboxes of next highest vals
torch.index_select(x1, 0, idx, out=xx1)
torch.index_select(y1, 0, idx, out=yy1)
torch.index_select(x2, 0, idx, out=xx2)
torch.index_select(y2, 0, idx, out=yy2)
# store element-wise max with next highest score
xx1 = torch.clamp(xx1, min=x1[i])
yy1 = torch.clamp(yy1, min=y1[i])
xx2 = torch.clamp(xx2, max=x2[i])
yy2 = torch.clamp(yy2, max=y2[i])
w.resize_as_(xx2)
h.resize_as_(yy2)
w = xx2 - xx1
h = yy2 - yy1
# check sizes of xx1 and xx2.. after each iteration
w = torch.clamp(w, min=0.0)
h = torch.clamp(h, min=0.0)
inter = w*h
# IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx) # load remaining areas
union = (rem_areas - inter) + area[i]
IoU = inter/union # store result in iou
# keep only elements with an IoU <= overlap
idx = idx[IoU.le(overlap)]
return keep, count
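A small usage check with three hypothetical boxes; the second overlaps the first heavily (IoU 0.81) and gets suppressed:
boxes  = torch.FloatTensor([[0, 0, 1, 1], [0, 0, .9, .9], [2, 2, 3, 3]])
scores = torch.FloatTensor([0.9, 0.8, 0.7])
keep, count = nms(boxes, scores, overlap=0.5)
keep[:count]   # -> [0, 2]: the highest-scoring box wins, box 1 is dropped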
def bb_to_hw(bb):
return [bb[1],bb[0],bb[3]-bb[1], bb[2]-bb[0]]
def show_img(im, figsize=None, axis=None):
if not axis:
fig,axis = plt.subplots(figsize=figsize)
axis.imshow(im)
axis.get_xaxis().set_visible(False)
axis.get_yaxis().set_visible(False)
return axis
def draw_outline(obj,lw):
obj.set_path_effects([patheffects.Stroke(linewidth=lw, foreground='black'), patheffects.Normal()])
def draw_rect(axis, box, color='white'):
patch = axis.add_patch(patches.Rectangle(box[:2],box[-2],box[-1],fill=False,edgecolor=color,lw=2))
draw_outline(patch,4)
def draw_text(axis,xy,text,text_size=14, color='white'):
patch = axis.text(*xy, text, verticalalignment='top', color=color, fontsize=text_size, weight='bold')
draw_outline(patch,1)
def show_img_all(id_img):
    img = open_image(IMG_PATH/trn_fns[id_img])  # trn_fns maps image id -> filename
axis = show_img(img, figsize=(16,8))
for bbox, id_cat in trn_anno[id_img]:
new_box = bb_to_hw(bbox)
draw_rect(axis, new_box)
draw_text(axis, new_box[:2], cats[id_cat])
def show_ground_truth(ax, im, bbox, clas=None, prs=None, thresh=0.3):
    bb = [bb_to_hw(o) for o in bbox.reshape(-1,4)]
    if clas is None: clas = [None] * len(bb)
    if prs is None: prs = [None] * len(bb)
    ax = show_img(im, axis=ax)
    for i, (b,c,pr) in enumerate(zip(bb,clas,prs)):
        if b[2] > 0 and (pr is None or pr > thresh): # show the box only if there is something to show
            draw_rect(ax, b, colr_list[i%num_colr])
            txt = f'{i}: '
            if c is not None: txt += ('bg' if c == len(id2cat) else id2cat[c])
            if pr is not None: txt += f'{pr:.2f}'
            draw_text(ax, b[:2], txt, color=colr_list[i%num_colr])
def torch_gt(ax, ima, bbox, clas, prs=None, thresh=0.4):
return show_ground_truth(ax, ima, to_np((bbox*256).long()),
to_np(clas), to_np(prs) if prs is not None else None, thresh)
def np_gt(ax, ima, bbox, clas, prs=None, thresh=0.4):
return show_ground_truth(ax, ima, (bbox*256).astype(np.uint8),
clas, prs if prs is not None else None, thresh)
learn.load('model1')
learn.model.eval()
samples = iter(md.val_dl)
x,y = next(samples)
x,y = V(x),V(y)
pred = learn.model(x)
def show_results(idx, thresh=0.25, ax=None):
if ax is None: fig, ax = plt.subplots(figsize=(6,6))
ima=md.val_ds.ds.denorm(x)[idx]
out1,out2,cc = get1preds(pred[0][idx],pred[1][idx],y[0][idx],y[1][idx],thresh)
torch_gt(ax, ima, out2, cc, out1, 0.1)
def show_gt(idx, ax=None):
if ax is None: fig, ax = plt.subplots(figsize=(6,6))
ima = md.val_ds.ds.denorm(x)[idx]
show_ground_truth(ax,ima,to_np(y[0][idx]),to_np(y[1][idx]))
def compare(idx,thresh=0.25):
fig, axs = plt.subplots(1,2,figsize=(12,6))
show_results(idx,thresh,ax=axs[0])
show_gt(idx,ax=axs[1])
plt.tight_layout()
def get1preds(b_clas,b_bb,bbox,clas,thresh=0.25):
bbox,clas = get_y(bbox, clas)
a_ic = actn_to_bb(b_bb, anchors)
clas_pr, clas_ids = b_clas.max(1)
conf_scores = b_clas.sigmoid().t().data
out1,out2,cc = [],[],[]
for cl in range(conf_scores.size(0)-1):
cl_mask = conf_scores[cl] > thresh
if cl_mask.sum() == 0: continue
scores = conf_scores[cl][cl_mask]
l_mask = cl_mask.unsqueeze(1).expand_as(a_ic)
boxes = a_ic[l_mask].view(-1, 4)
ids, count = nms(boxes.data, scores, 0.4, 50)
ids = ids[:count]
out1.append(scores[ids])
out2.append(boxes.data[ids])
cc.append([cl]*count)
cc = T(np.concatenate(cc)) if cc != [] else None
out1 = torch.cat(out1) if out1 != [] else None
out2 = torch.cat(out2) if out2 != [] else None
return out1,out2,cc
def show_hits(idx, gt_obj, thresh=0.25):
    fig, axs = plt.subplots(1,2,figsize=(12,6))
    ima = md.val_ds.ds.denorm(x)[idx]
    show_img(ima, axis=axs[1])
    boxes = to_np(y[0][idx].view(-1,4))
    idx_obj = boxes.shape[0] - 1 - gt_obj
    box = bb_to_hw(boxes[idx_obj])
    draw_rect(axs[1], box)
    draw_text(axs[1], box[0:2], id2cat[to_np(y[1][idx])[idx_obj]])
    pr_scr,pr_bb,pr_cls = get1preds(pred[0][idx],pred[1][idx],y[0][idx],y[1][idx],thresh)
    # scale the GT box to 0-1 to match the predictions (sz=256 here)
    overlaps = jaccard(pr_bb, y[0][idx].view(-1,4)[idx_obj].unsqueeze(0).data/256)
    hits = (overlaps > 0.5) * (pr_cls == y[1].data[idx,idx_obj]).unsqueeze(1)
    bb_mask = hits.expand_as(pr_bb)
    torch_gt(axs[0], ima, pr_bb[bb_mask].view(-1,4), pr_cls[hits], pr_scr[hits], thresh)
    plt.tight_layout()
compare(15)
def count(L):
result = collections.defaultdict(int)
if L is not None:
for x in L:
result[x] += 1
return result
from ipywidgets import FloatProgress
from IPython.display import display
def multiTPFPFN():
n = 40
threshes = np.linspace(.05, 0.95, n, endpoint=True)
tps,fps,fns = np.zeros((n,len(id2cat))),np.zeros((n,len(id2cat))),np.zeros((n,len(id2cat)))
prog = FloatProgress(min=0,max=len(md.val_dl))
display(prog)
for data in md.val_dl:
x,y = data
x,y = V(x),V(y)
pred = learn.model(x)
for idx in range(x.size(0)):
bbox,clas = get_y(y[0][idx],y[1][idx])#unpad the target
p_scrs,p_box,p_cls = get1preds(pred[0][idx],pred[1][idx],y[0][idx],y[1][idx],threshes[0])
overlaps = to_np(jaccard(p_box,bbox.data))
overlaps = np.where(overlaps > 0.5, overlaps, 0)
clas, np_scrs, np_cls = to_np(clas.data),to_np(p_scrs), to_np(p_cls)
for k in range(threshes.shape[0]):
new_tp = collections.defaultdict(int)
for cls in list(set(clas)):
msk_clas = np.bitwise_and((clas == cls)[None,:],(np_cls == cls)[:,None])
ov_clas = np.where(msk_clas,overlaps,0.)
mx_idx = np.argmax(ov_clas,axis=1)
for i in range(0,len(clas)):
if (clas[i] == cls):
keep = np.bitwise_and(np.max(ov_clas,axis=1) > 0.,mx_idx==i)
keep = np.bitwise_and(keep,np_scrs > threshes[k])
if keep.sum() > 0:
new_tp[cls] += 1
count_pred = count(np_cls[np_scrs > threshes[k]])
count_gt = count(clas)
for c in range(len(id2cat)):
tps[k,c] += new_tp[c]
fps[k,c] += count_pred[c] - new_tp[c]
fns[k,c] += count_gt[c] - new_tp[c]
prog.value += 1
return tps, fps, fns
tps, fps, fns = multiTPFPFN()
def avg_prec(clas):
eps = 1e-15
precisions = tps[:,clas]/(tps[:,clas] + fps[:,clas] + eps)
recalls = tps[:,clas]/(tps[:,clas] + fns[:,clas] + eps)
prec_at_rec = []
for recall_level in np.linspace(0.0, 1.0, 11):
try:
args = np.argwhere(recalls >= recall_level).flatten()
prec = max(precisions[args])
except ValueError:
prec = 0.0
prec_at_rec.append(prec)
return np.array(prec_at_rec).mean()
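avg_prec is the classic VOC 11-point interpolated AP: for each recall level r in {0, 0.1, ..., 1.0} take the best precision achieved at recall >= r, then average. A standalone illustration with made-up precision/recall pairs:
precs, recs = np.array([1.0, 0.5]), np.array([0.5, 1.0])   # hypothetical PR points
np.mean([max(precs[recs >= r]) if (recs >= r).any() else 0.
         for r in np.linspace(0, 1, 11)])
# -> (6*1.0 + 5*0.5)/11 ≈ 0.77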
def mAP():
S = 0
for i in range(len(id2cat)):
S += avg_prec(i)
return S/len(id2cat)
mAP()
0.12747938049885787