import sys
import torch
sys.path.append("../../fastai/")
%matplotlib inline
%reload_ext autoreload
%autoreload 2
from fastai.conv_learner import *
from fastai.dataset import *
from pathlib import Path
import json
from PIL import ImageDraw, ImageFont
from matplotlib import patches, patheffects
#torch.cuda.set_device(0)
import matplotlib.cm as cmx
import matplotlib.colors as mcolors
from cycler import cycler
def get_cmap(N):
color_norm = mcolors.Normalize(vmin=0, vmax=N-1)
return cmx.ScalarMappable(norm=color_norm, cmap='Set3').to_rgba
num_colr = 12
cmap = get_cmap(num_colr)
colr_list = [cmap(float(x)) for x in range(num_colr)]
def show_ground_truth(ax, im, bbox, clas=None, prs=None, thresh=0.3):
bb = [bb_hw(o) for o in bbox.reshape(-1,4)]
if prs is None: prs = [None]*len(bb)
if clas is None: clas = [None]*len(bb)
ax = show_img(im, ax=ax)
for i,(b,c,pr) in enumerate(zip(bb, clas, prs)):
if((b[2]>0) and (pr is None or pr > thresh)):
draw_rect(ax, b, color=colr_list[i%num_colr])
txt = f'{i}: '
if c is not None: txt += ('bg' if c==len(id2cat) else id2cat[c])
if pr is not None: txt += f' {pr:.2f}'
draw_text(ax, b[:2], txt, color=colr_list[i%num_colr])
def show_img(im, figsize=None, ax=None):
if not ax: fig,ax = plt.subplots(figsize=figsize)
ax.imshow(im)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
return ax
def draw_outline(o, lw):
o.set_path_effects([patheffects.Stroke(
linewidth=lw, foreground='black'), patheffects.Normal()])
def draw_rect(ax, b, color='white'):
patch = ax.add_patch(patches.Rectangle(b[:2], *b[-2:], fill=False, edgecolor=color, lw=3))
draw_outline(patch, 4)
def draw_text(ax, xy, txt, sz=14, color='white'):
text = ax.text(*xy, txt,
verticalalignment='top', color=color, fontsize=sz, weight='bold')
draw_outline(text, 1)
def draw_im(im, ann):
ax = show_img(im, figsize=(16,8))
for b,c in ann:
b = bb_hw(b)
draw_rect(ax, b)
draw_text(ax, b[:2], id2cat[c], sz=16)
def draw_idx(i):
im_a = trn_anno[i]
im = open_image(IMG_PATH/trn_fns[i])
print(im.shape)
draw_im(im, im_a)
The most awful part: loading and munging the data
PATH = Path('../../data/pascal')
list(PATH.iterdir())
[PosixPath('../../data/pascal/pascal_train2007.json'), PosixPath('../../data/pascal/PASCAL_VOC'), PosixPath('../../data/pascal/pascal_test2007.json'), PosixPath('../../data/pascal/models'), PosixPath('../../data/pascal/tmp'), PosixPath('../../data/pascal/pascal_val2012.json'), PosixPath('../../data/pascal/csv'), PosixPath('../../data/pascal/VOCdevkit'), PosixPath('../../data/pascal/pascal_train2012.json'), PosixPath('../../data/pascal/pascal_val2007.json')]
trn_j = json.load((PATH/'pascal_train2007.json').open())
trn_j.keys()
dict_keys(['images', 'type', 'annotations', 'categories'])
IMAGES,ANNOTATIONS,CATEGORIES = ['images', 'annotations', 'categories']
trn_j[IMAGES][:5]
[{'file_name': '000012.jpg', 'height': 333, 'width': 500, 'id': 12}, {'file_name': '000017.jpg', 'height': 364, 'width': 480, 'id': 17}, {'file_name': '000023.jpg', 'height': 500, 'width': 334, 'id': 23}, {'file_name': '000026.jpg', 'height': 333, 'width': 500, 'id': 26}, {'file_name': '000032.jpg', 'height': 281, 'width': 500, 'id': 32}]
FILE_NAME,ID,IMG_ID,CAT_ID,BBOX = 'file_name','id','image_id','category_id','bbox'
cats = {o[ID]:o['name'] for o in trn_j[CATEGORIES]}
trn_fns = {o[ID]:o[FILE_NAME] for o in trn_j[IMAGES]}
trn_ids = [o[ID] for o in trn_j[IMAGES]]
JPEGS = 'VOCdevkit/VOC2007/JPEGImages'
IMG_PATH = PATH/JPEGS
list(IMG_PATH.iterdir())[:5]
[PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/000671.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/005710.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/003042.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/004429.jpg'), PosixPath('../../data/pascal/VOCdevkit/VOC2007/JPEGImages/003866.jpg')]
def hw_bb(bb): return np.array([bb[1], bb[0], bb[3]+bb[1]-1, bb[2]+bb[0]-1])
def bb_hw(a): return np.array([a[1],a[0],a[3]-a[1]+1,a[2]-a[0]+1])
trn_anno = collections.defaultdict(lambda:[])
for o in trn_j[ANNOTATIONS]:
if not o['ignore']:
bb = o[BBOX]
bb = hw_bb(bb)
trn_anno[o[IMG_ID]].append((bb,o[CAT_ID]))
len(trn_anno)
2501
i = 0
# get im filename
im_dict = trn_j[IMAGES][i]
im_dict[FILE_NAME],im_dict[ID]
# open image
im = open_image(IMG_PATH/im_dict[FILE_NAME])
# get annotations
im_anno = trn_anno[im_dict[ID]]
# im_anno: a list of (bbox, category_id) tuples for this image
im_anno[0]
(array([ 96, 155, 269, 350]), 7)
im_anno
[(array([ 96, 155, 269, 350]), 7)]
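As a quick sanity check of the two box conventions, we can round-trip the annotation above (values from this dataset):
# bb_hw converts fastai-style [top, left, bottom, right] corners back to the
# COCO-style [x, y, width, height] used in the JSON
bb_hw(im_anno[0][0])          # -> array([155,  96, 196, 174]), the raw COCO bbox
hw_bb(bb_hw(im_anno[0][0]))   # -> array([ 96, 155, 269, 350]), round-trips exactly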
# create multiclass classification csv
MC_CSV = PATH/'csv/mc.csv'
mc = [set([cats[p[1]] for p in trn_anno[o]]) for o in trn_ids]
mcs = [' '.join(str(p) for p in o) for o in mc]
df = pd.DataFrame({'fn': [trn_fns[o] for o in trn_ids], 'clas': mcs}, columns=['fn','clas'])
df.to_csv(MC_CSV, index=False)
df.head()
| | fn | clas |
|---|---|---|
| 0 | 000012.jpg | car |
| 1 | 000017.jpg | person horse |
| 2 | 000023.jpg | person bicycle |
| 3 | 000026.jpg | car |
| 4 | 000032.jpg | person aeroplane |
# create class and bbox CSVs
CLAS_CSV = PATH/'csv/clas.csv'
MBB_CSV = PATH/'csv/mbb.csv'
mc = [[cats[p[1]] for p in trn_anno[o]] for o in trn_ids]
id2cat = list(cats.values())
cat2id = {v:k for k,v in enumerate(id2cat)}
mcs = np.array([np.array([cat2id[p] for p in o]) for o in mc]); mcs
val_idxs = get_cv_idxs(len(trn_fns))
((val_mcs,trn_mcs),) = split_by_idx(val_idxs, mcs)
mbb = [np.concatenate([p[0] for p in trn_anno[o]]) for o in trn_ids]
mbbs = [' '.join(str(p) for p in o) for o in mbb]
df = pd.DataFrame({'fn': [trn_fns[o] for o in trn_ids], 'bbox': mbbs}, columns=['fn','bbox'])
df.to_csv(MBB_CSV, index=False)
df.head(2)
| | fn | bbox |
|---|---|---|
| 0 | 000012.jpg | 96 155 269 350 |
| 1 | 000017.jpg | 61 184 198 278 77 89 335 402 |
f_model=resnet34
sz=256
bs=32
aug_tfms = [RandomFlip(tfm_y=TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=TfmType.COORD, aug_tfms=aug_tfms)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4, bs=16)
aug_tfms = [RandomRotate(10, tfm_y = TfmType.COORD),
RandomLighting(0.05,0.05, tfm_y = TfmType.COORD),
RandomFlip(tfm_y = TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, aug_tfms=aug_tfms, crop_type=CropType.NO, tfm_y = TfmType.COORD)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4, bs=bs)
class ConcatLblDataset(Dataset):
def __init__(self, ds, y2):
self.ds,self.y2 = ds,y2
self.sz = ds.sz
def __len__(self): return len(self.ds)
def __getitem__(self, i):
x,y = self.ds[i]
return (x, (y,self.y2[i]))
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2
len(md.trn_ds), len(md.val_ds)
(2001, 500)
x,y=to_np(next(iter(md.val_dl)))
x=md.val_ds.ds.denorm(x)
sample_x, sample_y_bb, sample_y_cls = x[0], y[0][0], y[1][0]
sample_y_bb, sample_y_cls
(array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 103., 53., 255., 193., 0., 56., 235., 206., 10., 193., 248., 254.], dtype=float32), array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 14, 14]))
def show_gt(ax, x, y_bb, y_cls):
"""
Show ground truth image and labels given a sample
x, y_bb and y_cls : image, bbox array, cls idxs array
Modified version of 'show_ground_truth'
"""
# get non zero idxs for indexing labels
nonzero_idxs = np.nonzero(y_cls)[0]
# get cls indexes
cls_idxs = y_cls[nonzero_idxs]
# get bboxes - bb format
bboxes = []
for idx in nonzero_idxs:
bboxes.append(y_bb[idx*4:(idx+1)*4])
# show img, draw bboxes and text
ax = show_img(x, ax=ax)
for bb, cls_idx in zip(bboxes, cls_idxs):
b = bb_hw(bb)
draw_rect(ax, b, color=colr_list[cls_idx%len(colr_list)])
draw_text(ax, b[:2], id2cat[cls_idx])
fig, ax = plt.subplots(figsize=(20, 10))
show_gt(ax, sample_x, sample_y_bb, sample_y_cls)
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
for i,ax in enumerate(axes.flat):
show_gt(ax, x[i], y[0][i], y[1][i])
plt.tight_layout()
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
# check data
x, y = next(iter(md.trn_dl))
x.size(), y[0].size(), y[1].size()
(torch.Size([32, 3, 256, 256]), torch.Size([32, 36]), torch.Size([32, 9]))
"Taking an input feature map with C channels from a given pyramid level, the subnet applies four 3x3 conv layers, each with C filters and each followed by ReLU activations, followed by a 3x3 conv layer with KA filters. Finally sigmoid activations are attached to output the KA binary predictions per spatial location, see Figure 3 (c). We use C = 256 and A = 9 in most experiments.
We will use same notations from the paper
K: number of classes
A: number of anchors
All new conv layers except the final one in the RetinaNet subnets are initialized with bias b = 0 and a Gaussian weight fill with $\sigma = 0.01$. For the final conv layer of the classification subnet, we set the bias initialization to $b = − log((1 − \pi)/\pi)$.
"""
K: number of classes
A: number of anchors - 9 for original paper
C: number of channels coming from the pyramid levels
"""
K, A, C = len(id2cat), 1, 256
def conv_bn_relu(kernel_size=3, stride=1, pad=1, in_c=256, out_c=256, use_bn=True):
    """Conv + ReLU block, with optional batchnorm"""
    block = nn.Sequential(
        nn.Conv2d(in_c, out_c, kernel_size, stride, pad),
        nn.ReLU())
    if use_bn: block.add_module('2', nn.BatchNorm2d(out_c))
    # N(0, 0.01) weight, bias = 0 initialization
    block[0].weight.data.normal_(0, 0.01)
    block[0].bias.data.zero_()
    return block
def flatten_conv(x, A):
"""
IMPORTANT: Receptive fields should match target
A: number of anchors
Flatten output as:
grid row 0 col 0 anchor 0
grid row 0 col 0 anchor 1
...
grid row 0 col 1 anchor 0
grid row 0 col 1 anchor 1
...
grid row 0 col 2 anchor 0
grid row 0 col 2 anchor 1
...
    grid row n col n anchor A-2
    grid row n col n anchor A-1
"""
bs,nf,gx,gy = x.size()
x = x.permute(0,2,3,1).contiguous()
return x.view(bs,-1,nf//A)
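A quick shape check for flatten_conv (illustrative values; A=9 here just to make the reshape visible):
t = V(torch.randn(2, 9*4, 8, 8))   # bs=2, A=9 anchors x 4 box coords, 8x8 grid
flatten_conv(t, 9).size()          # -> torch.Size([2, 576, 4]): 8*8*9 anchors, 4 values each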
class Subnet1(nn.Module):
    """For classification: outputs K*A per spatial location"""
    def __init__(self, K, A, in_c, use_bn=False, depth=4, pi=0.01):
        super().__init__()
        # Number of anchors
        self.A = A
        # depth (4 in the paper) distinct conv blocks; note that repeating a
        # single block with `*depth` would share weights between the layers
        self.conv = nn.Sequential(*[m for _ in range(depth)
                                    for m in children(conv_bn_relu(use_bn=use_bn))])
        # Final convolution for prediction
        self.out_conv = nn.Conv2d(in_c, K*A, kernel_size=3, stride=1, padding=1)
        # N(0, 0.01) bias = -np.log((1-pi)/pi) initialization
        self.out_conv.weight.data.normal_(0, 0.01)
        self.out_conv.bias.data = self.out_conv.bias.data.zero_() - np.log((1-pi)/pi)
    def forward(self, x):
        return flatten_conv(self.out_conv(self.conv(x)), self.A)
subnet1 = Subnet1(K, A, 256)
subnet1
Subnet1(
  (conv): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
  )
  (out_conv): Conv2d(256, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
"The design of the box regression subnet is identical to the classification subnet except that it terminates in 4A linear outputs per spatial location."
class Subnet2(nn.Module):
    """For box regression: outputs 4*A per spatial location"""
    def __init__(self, A, in_c, use_bn=False, depth=4):
        super().__init__()
        # Number of anchors
        self.A = A
        # depth (4 in the paper) distinct conv blocks
        self.conv = nn.Sequential(*[m for _ in range(depth)
                                    for m in children(conv_bn_relu(use_bn=use_bn))])
        # Final convolution for prediction
        self.out_conv = nn.Conv2d(in_c, 4*A, kernel_size=3, stride=1, padding=1)
        # N(0, 0.01) bias = 0
        self.out_conv.weight.data.normal_(0, 0.01)
        self.out_conv.bias.data = self.out_conv.bias.data.zero_()
    def forward(self, x):
        return flatten_conv(self.out_conv(self.conv(x)), self.A)
subnet2 = Subnet2(A, C)
subnet2
Subnet2(
  (conv): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
  )
  (out_conv): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
"We experiment with ResNet-50-FPN and ResNet-101-FPN backbones [20]. The base ResNet-50 and ResNet-101 models are pre-trained on ImageNet1k; we use the models released by [16]. New layers added for FPN are initialized as in [20]."
# load defined model
def get_encoder(f, cut):
base_model = (cut_model(f(True), cut))
return nn.Sequential(*base_model)
class SaveFeatures():
""" Extract pretrained activations"""
features=None
def __init__(self, m): self.hook = m.register_forward_hook(self.hook_fn)
def hook_fn(self, module, input, output): self.features = output
def remove(self): self.hook.remove()
f_model=resnet50
sz=256
bs=64
cut, cut_lr = model_meta[f_model]
inp = V(torch.ones(1,3,sz,sz))
encoder = get_encoder(f_model, cut)
sfs = [SaveFeatures(encoder[i]) for i in range(len(children(encoder)))]
inp = V(torch.ones(1,3,sz,sz))
encoder = encoder.cuda()
encoder(inp).size()
torch.Size([1, 2048, 8, 8])
Paper Notes
Following [20], we build FPN on top of the ResNet architecture [16]. We construct a pyramid with levels P3 through P7, where l indicates pyramid level (Pl has resolution $2^l$ lower than the input). As in [20] all pyramid levels have C = 256 channels. Details of the pyramid generally follow [20] with a few modest differences. While many design choices are not crucial, we emphasize the use of the FPN backbone is; preliminary experiments using features from only the final ResNet layer yielded low AP.
RetinaNet uses feature pyramid levels P3 to P7, where P3 to P5 are computed from the output of the corresponding ResNet residual stage (C3 through C5) using top-down and lateral connections just as in [20], P6 is obtained via a 3x3 stride-2 conv on C5, and P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6. This differs slightly from [20]: (1) we don't use the high-resolution pyramid level P2 for computational reasons, (2) P6 is computed by strided convolution instead of downsampling, and (3) we include P7 to improve large object detection. These minor modifications improve speed while maintaining accuracy.
# activations saved by the forward hooks; indices 4-7 hold C2-C5 (256/512/1024/2048 channels for resnet50)
[stage_act.features.size() for stage_act in sfs]
[torch.Size([1, 64, 128, 128]), torch.Size([1, 64, 128, 128]), torch.Size([1, 64, 128, 128]), torch.Size([1, 64, 64, 64]), torch.Size([1, 256, 64, 64]), torch.Size([1, 512, 32, 32]), torch.Size([1, 1024, 16, 16]), torch.Size([1, 2048, 8, 8])]
class FPN(nn.Module):
def __init__(self, encoder, out_c=256):
super().__init__()
self.encoder = encoder
self.sfs = [SaveFeatures(self.encoder[i]) for i in range(len(children(self.encoder)))]
self.out_c = out_c
def forward(self, x):
#pdb.set_trace()
# encode image with ResNet backbone
x = self.encoder(x)
# get c1, c2, c3, c4, c5 activations
c1 = self.sfs[2].features #64 (sz/2)
c2 = self.sfs[4].features #256 (sz/4)
c3 = self.sfs[5].features #512 (sz/8)
c4 = self.sfs[6].features #1024 (sz/16)
c5 = self.sfs[7].features #2048 (sz/32) : sz should be divisible by 32
C_sz = c5.size()[1] # 2048
# construct convs
if not hasattr(self, 'P6_conv1'):
# get channel size of each intermediate activation
self.sfs_c_sz = [stage_act.features.size()[1] for stage_act in self.sfs]
self.P6_conv1 = nn.Conv2d(C_sz, self.out_c, kernel_size=3, stride=2, padding=1).cuda()
self.P6_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
self.P7_conv = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=2, padding=1).cuda()
self.P5_conv1 = nn.Conv2d(C_sz, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
self.P5_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
self.P4_conv1 = nn.Conv2d(C_sz//2, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
self.P4_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
self.P3_conv1 = nn.Conv2d(C_sz//4, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
self.P3_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
#self.P2_conv1 = nn.Conv2d(C_sz//8, self.out_c, kernel_size=1, stride=1, padding=0).cuda()
#self.P2_conv2 = nn.Conv2d(self.out_c, self.out_c, kernel_size=3, stride=1, padding=1).cuda()
        # build P4, P5, P6 and P7 (the P2/P3 branches are left commented out)
p6 = self.P6_conv1(c5)
p6_out = self.P6_conv2(p6)
p7_out = self.P7_conv(F.relu(p6_out))
        # note: in the paper P5 comes from a lateral conv on C5 only; adding
        # upsampled P6 here is a deviation of this implementation
        p5 = self.P5_conv1(c5) + F.upsample(p6, scale_factor=2, mode='nearest')
        p5_out = self.P5_conv2(p5)
p4 = self.P4_conv1(c4) + F.upsample(p5, scale_factor=2, mode='nearest')
p4_out = self.P4_conv2(p4)
#p3 = self.P3_conv1(c3) + F.upsample(p4, scale_factor=2, mode='nearest')
#p3_out = self.P3_conv2(p3)
#p2 = self.P2_conv1(c2) + F.upsample(p3, scale_factor=2, mode='nearest')
#p2_out = self.P2_conv2(p2)
#return [p2_out, p3_out, p4_out, p5_out, p6_out, p7_out]
#return [p3_out, p4_out, p5_out, p6_out, p7_out]
return [p4_out, p5_out, p6_out, p7_out]
for l in range(2,8):
print(f'Resolution of level P{l}:' ,(256 / 2**l))
Resolution of level P2: 64.0
Resolution of level P3: 32.0
Resolution of level P4: 16.0
Resolution of level P5: 8.0
Resolution of level P6: 4.0
Resolution of level P7: 2.0
f_model=resnet34
sz=256
bs=64
cut, cut_lr = model_meta[f_model]
inp = V(torch.ones(1,3,sz,sz))
encoder = get_encoder(f_model, cut)
out = encoder.cuda()(inp)
fpn = FPN(encoder=encoder)
outs = fpn(inp)
#[p2_out, p3_out, p4_out, p5_out, p6_out]
[out.size() for out in outs]
[torch.Size([1, 256, 16, 16]), torch.Size([1, 256, 8, 8]), torch.Size([1, 256, 4, 4]), torch.Size([1, 256, 2, 2])]
class RetinaNet(nn.Module):
def __init__(self, fpn, subnet1, subnet2):
super().__init__()
self.fpn = fpn # initialize FPN
self.subnet1 = subnet1 # initialize classifier
self.subnet2 = subnet2 # initialize regressor
def forward(self, x):
#p3_out, p4_out, p5_out, p6_out, p7_out = self.fpn(x)
p4_out, p5_out, p6_out, p7_out = self.fpn(x)
#cls_out5 = self.subnet1(p3_out) # 32x32
cls_out4 = self.subnet1(p4_out) # 16x16
cls_out3 = self.subnet1(p5_out) # 8x8
cls_out2 = self.subnet1(p6_out) # 4x4
cls_out1 = self.subnet1(p7_out) # 2x2
#reg_out5 = self.subnet2(p3_out) # 32x32
reg_out4 = self.subnet2(p4_out) # 16x16
reg_out3 = self.subnet2(p5_out) # 8x8
reg_out2 = self.subnet2(p6_out) # 4x4
reg_out1 = self.subnet2(p7_out) # 2x2
        # Concat outputs from the different pyramid levels.
        # Each output comes from flatten_conv, so the concatenated order is:
        # grid row 0 col 0 anchor 0   level: 2x2
        # grid row 0 col 1 anchor 0   level: 2x2
        # ...
        # grid row 0 col 0 anchor 0   level: 4x4
        # grid row 0 col 1 anchor 0   level: 4x4
        # ...
        # grid row 0 col 0 anchor 0   level: 8x8
        # grid row 0 col 1 anchor 0   level: 8x8
#return [torch.cat([cls_out1, cls_out2, cls_out3, cls_out4, cls_out5], 1),
# torch.cat([reg_out1, reg_out2, reg_out3, reg_out4, reg_out5], 1)]
# return [torch.cat([cls_out1, cls_out2, cls_out3, cls_out4], 1),
# torch.cat([reg_out1, reg_out2, reg_out3, reg_out4], 1)]
return [torch.cat([cls_out1, cls_out2, cls_out3], 1),
torch.cat([reg_out1, reg_out2, reg_out3], 1)]
# 20 classes, A=1 anchor per grid cell (the paper uses 9) and 256 channels
K, A, in_c = len(id2cat), 1, 256
# initialize retinanet model
fpn, subnet1, subnet2 = FPN(encoder), Subnet1(K, A, in_c), Subnet2(A, in_c)
retina = RetinaNet(fpn, subnet1, subnet2)
out = retina.cuda()(inp)
out[0].size(), out[1].size()
(torch.Size([1, 84, 20]), torch.Size([1, 84, 4]))
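The 84 rows in each output are just the grid cells of the three concatenated pyramid levels; a quick check:
sum(g*g*A for g in [2, 4, 8])   # -> 84 = 4 + 16 + 64, matching the sizes above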
# number of grid cells a single pyramid level would contribute (here P2: 64x64)
total_out = 0
for l in range(2,3):
total_out += (256 / 2**l)**2
total_out
4096.0
We constructed the model to output predictions from three FPN levels; after concatenation they correspond to 2x2, 4x4 and 8x8 feature maps respectively.
# Concat outputs from the different pyramid levels.
# Each output comes from flatten_conv, so the concatenated order is:
# grid row 0 col 0 anchor 0   level: 2x2
# grid row 0 col 1 anchor 0   level: 2x2
# ...
# grid row 0 col 0 anchor 0   level: 4x4
# grid row 0 col 1 anchor 0   level: 4x4
# ...
# grid row 0 col 0 anchor 0   level: 8x8
# grid row 0 col 1 anchor 0   level: 8x8
def hw2corners(ctr, hw): return torch.cat([ctr-hw/2, ctr+hw/2], dim=1)
#anc_grids = [2, 4, 8, 16, 32]
anc_grids = [2, 4, 8]
#anc_zooms = [1, 2**(1/3), 2**(2/3)]
anc_zooms = [1]
#anc_ratios = [(1.,2.), (1.,1), (2.,1.)]
anc_ratios = [(1., 1.)]
anchor_scales = [(anz*i,anz*j) for anz in anc_zooms for (i,j) in anc_ratios] # n_anc_zooms * n_anc_ratios
k = len(anchor_scales)
anc_offsets = [1/(o*2) for o in anc_grids]
k
1
anc_x = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag)
for ao,ag in zip(anc_offsets,anc_grids)])
anc_y = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag)
for ao,ag in zip(anc_offsets,anc_grids)])
# #daveluo
# anc_x = np.concatenate([np.tile(np.linspace(ao, 1-ao, ag), ag)
# for ao,ag in zip(anc_offsets,anc_grids)])
# anc_y = np.concatenate([np.repeat(np.linspace(ao, 1-ao, ag), ag)
# for ao,ag in zip(anc_offsets,anc_grids)])
anc_ctrs = np.repeat(np.stack([anc_x,anc_y], axis=1), k, axis=0)
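As a sanity check, the first four centers belong to the 2x2 grid; with an offset of 1/4 they sit at 0.25 and 0.75:
anc_ctrs[:4]
# -> array([[0.25, 0.25],
#           [0.25, 0.75],
#           [0.75, 0.25],
#           [0.75, 0.75]])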
def get_anchor_groups(anc):
"""anc_ctrs or anchor_crn"""
groups = []
for i, grid in enumerate(anc_grids):
n = sum(np.array(anc_grids[:i])**2)
m = sum(np.array(anc_grids[:i+1])**2)
groups.append(anc[n:m])
return groups
anc_sizes = np.concatenate([np.array([[o/ag,p/ag] for i in range(ag*ag) for o,p in anchor_scales])
for ag in anc_grids])
grid_sizes = V(np.concatenate([np.array([ 1/ag for i in range(ag*ag) for o,p in anchor_scales])
for ag in anc_grids]), requires_grad=False).unsqueeze(1)
anchors = V(np.concatenate([anc_ctrs, anc_sizes], axis=1), requires_grad=False).float()
anchor_cnr = hw2corners(anchors[:,:2], anchors[:,2:])
anc_ctrs_groups = get_anchor_groups(anc_ctrs)
anc_cnr_groups = get_anchor_groups(anchor_cnr)
[len(i) for i in anc_cnr_groups]
[4, 16, 64]
# anchor_cnr for 2x2
anchor_crn2 = anc_cnr_groups[0]
# anchor_cnr for 4x4
anchor_crn4 = anc_cnr_groups[1]
# anchor_cnr for 8x8
anchor_crn8 = anc_cnr_groups[2]
# anchor_cnr for 16x16
#anchor_crn16 = anc_cnr_groups[3]
# anchor_cnr for 32x32
#anchor_crn32 = anc_cnr_groups[4]
[len(anchor_crn2), len(anchor_crn4), len(anchor_crn8)]#, len(anchor_crn16), len(anchor_crn32)
[4, 16, 64]
anchor_cnr.size()
torch.Size([84, 4])
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(sample_x)
# one anchor box (index 2) from the 2x2 grid
for anc in (anchor_crn2)[2:3]:
draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # grid row 0 col 1 for 4x4
# for anc in (anchor_crn4)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # # grid row 0 col 1 for 8x8
# for anc in (anchor_crn8)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # # grid row 0 col 1 for 16x16
# for anc in (anchor_crn16)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
# # grid row 0 col 1 for 32x32
# for anc in (anchor_crn32)[:10]:
# draw_rect(ax, bb_hw(to_np(anc)*256), color="red")
def one_hot_embedding(labels, num_classes):
return torch.eye(num_classes)[labels.data.cpu()]
class BCE_Loss(nn.Module):
def __init__(self, num_classes):
super().__init__()
self.num_classes = num_classes
def forward(self, preds, targets):
t = one_hot_embedding(targets, self.num_classes+1)
        t = V(t[:,:-1].contiguous()) # drop the bg column: background is implicit (predicted when no other class fires)
        x = preds[:,:]
        w = self.get_weight(x,t) # weighting term; None for plain BCE, overridden by FocalLoss
return F.binary_cross_entropy_with_logits(x, t, w, size_average=False) / self.num_classes
def get_weight(self,x,t):
return None
class FocalLoss(BCE_Loss):
def get_weight(self,x,t):
alpha,gamma = 0.25,2.
p = x.sigmoid()
pt = p*t + (1-p)*(1-t)
w = alpha*t + (1-alpha)*(1-t)
return w * (1-pt).pow(gamma)
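The weight term is what makes the loss "focal": $(1-p_t)^\gamma$ shrinks the contribution of well-classified anchors. A back-of-the-envelope check for a positive target (t=1, so pt=p), in plain Python:
alpha, gamma = 0.25, 2.
for p in (0.9, 0.5, 0.1):             # predicted probability for a positive
    print(p, alpha * (1 - p)**gamma)  # easy p=0.9 -> 0.0025, hard p=0.1 -> 0.2025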
loss_f = FocalLoss(len(id2cat))
def intersect(box_a, box_b):
max_xy = torch.min(box_a[:, None, 2:], box_b[None, :, 2:])
min_xy = torch.max(box_a[:, None, :2], box_b[None, :, :2])
inter = torch.clamp((max_xy - min_xy), min=0)
return inter[:, :, 0] * inter[:, :, 1]
def box_sz(b): return ((b[:, 2]-b[:, 0]) * (b[:, 3]-b[:, 1]))
def jaccard(box_a, box_b):
inter = intersect(box_a, box_b)
union = box_sz(box_a).unsqueeze(1) + box_sz(box_b).unsqueeze(0) - inter
return inter / union
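A tiny IoU check with two hypothetical boxes in corner format:
a = T(np.array([[0., 0., 2., 2.]]))   # 2x2 box at the origin
b = T(np.array([[1., 1., 3., 3.]]))   # 2x2 box shifted by (1, 1)
jaccard(a, b)   # intersection 1, union 4 + 4 - 1 = 7 -> ~0.143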
#Removes the zero padding in the target bbox/class
size = 256
def get_y(bbox,clas):
bbox = bbox.view(-1,4)/size
bb_keep = ((bbox[:,2] - bbox[:,0])>0.).nonzero()[:,0]
return bbox[bb_keep], clas[bb_keep]
def actn_to_bb(actn, anchors):
actn_bbs = torch.tanh(actn)
actn_ctrs = (actn_bbs[:,:2] * grid_sizes/2) + anchors[:,:2]
actn_hw = (1 + actn_bbs[:,2:]/2) * anchors[:,2:]
return hw2corners(actn_ctrs,actn_hw)
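With zero activations the predicted box is exactly its anchor: tanh(0) = 0 leaves the center at the anchor center and the height/width at the anchor size. A quick check:
actn_to_bb(V(torch.zeros(anchors.size(0), 4)), anchors)[:4]   # equals anchor_cnr[:4]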
def map_to_ground_truth(overlaps, print_it=False):
prior_overlap, prior_idx = overlaps.max(1)
if print_it: print(prior_overlap)
# pdb.set_trace()
gt_overlap, gt_idx = overlaps.max(0)
gt_overlap[prior_idx] = 1.99
for i,o in enumerate(prior_idx): gt_idx[o] = i
return gt_overlap,gt_idx
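A toy example (illustrative values) makes the matching clearer: every anchor gets its best-overlapping GT object, and every GT object's best anchor is forced to a 1.99 overlap so each object is matched at least once:
ov = T(np.array([[0.1, 0.6, 0.2],     # GT 0 vs 3 anchors
                 [0.3, 0.2, 0.7]]))   # GT 1 vs 3 anchors
map_to_ground_truth(ov)
# -> gt_overlap = [0.30, 1.99, 1.99], gt_idx = [1, 0, 1]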
def ssd_1_loss(b_c,b_bb,bbox,clas,print_it=False, use_ab=True):
bbox,clas = get_y(bbox,clas)
a_ic = actn_to_bb(b_bb, anchors)
overlaps = jaccard(bbox.data, (anchor_cnr if use_ab else a_ic).data)
gt_overlap,gt_idx = map_to_ground_truth(overlaps,print_it)
gt_clas = clas[gt_idx]
pos = gt_overlap > 0.5
pos_idx = torch.nonzero(pos)[:,0]
gt_clas[1-pos] = len(id2cat)
gt_bbox = bbox[gt_idx]
loc_loss = ((a_ic[pos_idx] - gt_bbox[pos_idx]).abs()).mean()
clas_loss = loss_f(b_c, gt_clas) / len(pos_idx) #Normalized by the number of anchors matched to a GT object
return loc_loss, clas_loss
def ssd_loss(pred,targ,print_it=False):
lcs,lls = 0.,0.
for b_c,b_bb,bbox,clas in zip(*pred,*targ):
loc_loss,clas_loss = ssd_1_loss(b_c,b_bb,bbox,clas,print_it)
lls += loc_loss
lcs += clas_loss
if print_it: print(f'loc: {lls.data[0]}, clas: {lcs.data[0]}')
return lls+lcs
def ssd_loss2(pred,targ):
lcs,lls = 0.,0.
for b_c,b_bb,bbox,clas in zip(*pred,*targ):
loc_loss,clas_loss = ssd_1_loss(b_c,b_bb,bbox,clas,use_ab=False)
lls += loc_loss
lcs += clas_loss
return lls+lcs
# wrap for fastai
class RetinaNetModel():
def __init__(self, model, cut_lr, name='retinanet'):
self.model,self.name, self.cut_lr = model, name, cut_lr
def get_layer_groups(self, precompute):
lgs = list(split_by_idxs(children(self.model.fpn.encoder), [self.cut_lr]))
return [lgs[0]] + [children(self.model.fpn)[1:]] + [children(self.model.subnet1)] + [children(self.model.subnet2)]
aug_tfms = [RandomRotate(10, tfm_y = TfmType.COORD),
RandomLighting(0.05,0.05, tfm_y = TfmType.COORD),
RandomFlip(tfm_y = TfmType.COORD)]
tfms = tfms_from_model(f_model, sz, aug_tfms=aug_tfms, crop_type=CropType.NO, tfm_y = TfmType.COORD)
md = ImageClassifierData.from_csv(PATH, JPEGS, MBB_CSV, tfms=tfms, continuous=True, num_workers=4, bs=bs)
trn_ds2 = ConcatLblDataset(md.trn_ds, trn_mcs)
val_ds2 = ConcatLblDataset(md.val_ds, val_mcs)
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2
# INIT MODEL
f_model=resnet34
sz=256
bs=32
cut, cut_lr = model_meta[f_model]
encoder = get_encoder(f_model, cut).cuda()
# input channel for subnets, C = 256 in the paper
A, K, in_c = 1, 20, 256
fpn, subnet1, subnet2 = FPN(encoder), Subnet1(K, A, in_c), Subnet2(A, in_c)
inp = V(torch.ones(1,3,256,256))
out = fpn(inp.cuda())
retina = RetinaNet(fpn, subnet1, subnet2).cuda()
model = RetinaNetModel(retina, 8)
# init learner and define optimizer
learn = ConvLearner(md, model)
learn.opt_fn=partial(optim.SGD,momentum=0.9)
learn.crit = ssd_loss
# freeze resnet
learn.freeze_to(1)
learn.unfreeze()
fpn_outs = learn.model.fpn(inp)
[out.size() for out in fpn_outs]
[torch.Size([1, 256, 16, 16]), torch.Size([1, 256, 8, 8]), torch.Size([1, 256, 4, 4]), torch.Size([1, 256, 2, 2])]
outs = learn.model(inp)
# classes, bbox
outs[0].size(), outs[1].size()
(torch.Size([1, 84, 20]), torch.Size([1, 84, 4]))
layers = learn.models.get_layer_groups(False)
for m in layers[0]:
for p in m.parameters():
print(p.requires_grad)
True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True
learn.lr_find()
learn.sched.plot()
lr = 1e-3
#lrs = [lr/100, lr, lr, lr]
learn.fit(lr, n_cycle=20, cycle_len=1, use_clr_beta=(10, 10, 0.85, 0.90))
epoch      trn_loss   val_loss
    0      2.215486   2.504212
def nms(boxes, scores, overlap=0.5, top_k=100):
keep = scores.new(scores.size(0)).zero_().long()
if boxes.numel() == 0: return keep
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
area = torch.mul(x2 - x1, y2 - y1)
v, idx = scores.sort(0) # sort in ascending order
idx = idx[-top_k:] # indices of the top-k largest vals
xx1 = boxes.new()
yy1 = boxes.new()
xx2 = boxes.new()
yy2 = boxes.new()
w = boxes.new()
h = boxes.new()
count = 0
while idx.numel() > 0:
i = idx[-1] # index of current largest val
keep[count] = i
count += 1
if idx.size(0) == 1: break
idx = idx[:-1] # remove kept element from view
# load bboxes of next highest vals
torch.index_select(x1, 0, idx, out=xx1)
torch.index_select(y1, 0, idx, out=yy1)
torch.index_select(x2, 0, idx, out=xx2)
torch.index_select(y2, 0, idx, out=yy2)
# store element-wise max with next highest score
xx1 = torch.clamp(xx1, min=x1[i])
yy1 = torch.clamp(yy1, min=y1[i])
xx2 = torch.clamp(xx2, max=x2[i])
yy2 = torch.clamp(yy2, max=y2[i])
w.resize_as_(xx2)
h.resize_as_(yy2)
w = xx2 - xx1
h = yy2 - yy1
# check sizes of xx1 and xx2.. after each iteration
w = torch.clamp(w, min=0.0)
h = torch.clamp(h, min=0.0)
inter = w*h
# IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx) # load remaining areas
union = (rem_areas - inter) + area[i]
IoU = inter/union # store result in iou
# keep only elements with an IoU <= overlap
idx = idx[IoU.le(overlap)]
return keep, count
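A small usage check with three hypothetical boxes; the second overlaps the first heavily (IoU 0.81) and gets suppressed:
boxes  = torch.FloatTensor([[0, 0, 1, 1], [0, 0, .9, .9], [2, 2, 3, 3]])
scores = torch.FloatTensor([0.9, 0.8, 0.7])
keep, count = nms(boxes, scores, overlap=0.5)
keep[:count]   # -> [0, 2]: the highest-scoring box wins, box 1 is dropped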
def bb_to_hw(bb):
return [bb[1],bb[0],bb[3]-bb[1], bb[2]-bb[0]]
def show_img(im, figsize=None, axis=None):
if not axis:
fig,axis = plt.subplots(figsize=figsize)
axis.imshow(im)
axis.get_xaxis().set_visible(False)
axis.get_yaxis().set_visible(False)
return axis
def draw_outline(obj,lw):
obj.set_path_effects([patheffects.Stroke(linewidth=lw, foreground='black'), patheffects.Normal()])
def draw_rect(axis, box, color='white'):
patch = axis.add_patch(patches.Rectangle(box[:2],box[-2],box[-1],fill=False,edgecolor=color,lw=2))
draw_outline(patch,4)
def draw_text(axis,xy,text,text_size=14, color='white'):
patch = axis.text(*xy, text, verticalalignment='top', color=color, fontsize=text_size, weight='bold')
draw_outline(patch,1)
def show_img_all(id_img):
    img = open_image(IMG_PATH/trn_fns[id_img])  # trn_fns maps image id -> filename
axis = show_img(img, figsize=(16,8))
for bbox, id_cat in trn_anno[id_img]:
new_box = bb_to_hw(bbox)
draw_rect(axis, new_box)
draw_text(axis, new_box[:2], cats[id_cat])
def show_ground_truth(ax, im, bbox, clas=None, prs=None, thresh=0.3):
    bb = [bb_to_hw(o) for o in bbox.reshape(-1,4)]
    if clas is None: clas = [None] * len(bb)
    if prs is None: prs = [None] * len(bb)
    ax = show_img(im, axis=ax)
    for i, (b,c,pr) in enumerate(zip(bb,clas,prs)):
        if b[2] > 0 and (pr is None or pr > thresh): # show the box only if there is something to show
            draw_rect(ax, b, colr_list[i%num_colr])
            txt = f'{i}: '
            if c is not None: txt += ('bg' if c == len(id2cat) else id2cat[c])
            if pr is not None: txt += f'{pr:.2f}'
            draw_text(ax, b[:2], txt, color=colr_list[i%num_colr])
def torch_gt(ax, ima, bbox, clas, prs=None, thresh=0.4):
return show_ground_truth(ax, ima, to_np((bbox*256).long()),
to_np(clas), to_np(prs) if prs is not None else None, thresh)
def np_gt(ax, ima, bbox, clas, prs=None, thresh=0.4):
return show_ground_truth(ax, ima, (bbox*256).astype(np.uint8),
clas, prs if prs is not None else None, thresh)
learn.load('model1')
learn.model.eval()
samples = iter(md.val_dl)
x,y = next(samples)
x,y = V(x),V(y)
pred = learn.model(x)
def show_results(idx, thresh=0.25, ax=None):
if ax is None: fig, ax = plt.subplots(figsize=(6,6))
ima=md.val_ds.ds.denorm(x)[idx]
out1,out2,cc = get1preds(pred[0][idx],pred[1][idx],y[0][idx],y[1][idx],thresh)
torch_gt(ax, ima, out2, cc, out1, 0.1)
def show_gt(idx, ax=None):
if ax is None: fig, ax = plt.subplots(figsize=(6,6))
ima = md.val_ds.ds.denorm(x)[idx]
show_ground_truth(ax,ima,to_np(y[0][idx]),to_np(y[1][idx]))
def compare(idx,thresh=0.25):
fig, axs = plt.subplots(1,2,figsize=(12,6))
show_results(idx,thresh,ax=axs[0])
show_gt(idx,ax=axs[1])
plt.tight_layout()
def get1preds(b_clas,b_bb,bbox,clas,thresh=0.25):
bbox,clas = get_y(bbox, clas)
a_ic = actn_to_bb(b_bb, anchors)
clas_pr, clas_ids = b_clas.max(1)
conf_scores = b_clas.sigmoid().t().data
out1,out2,cc = [],[],[]
for cl in range(conf_scores.size(0)-1):
cl_mask = conf_scores[cl] > thresh
if cl_mask.sum() == 0: continue
scores = conf_scores[cl][cl_mask]
l_mask = cl_mask.unsqueeze(1).expand_as(a_ic)
boxes = a_ic[l_mask].view(-1, 4)
ids, count = nms(boxes.data, scores, 0.4, 50)
ids = ids[:count]
out1.append(scores[ids])
out2.append(boxes.data[ids])
cc.append([cl]*count)
cc = T(np.concatenate(cc)) if cc != [] else None
out1 = torch.cat(out1) if out1 != [] else None
out2 = torch.cat(out2) if out2 != [] else None
return out1,out2,cc
def show_hits(idx, gt_obj, thresh=0.25):
    fig, axs = plt.subplots(1,2,figsize=(12,6))
    ima = md.val_ds.ds.denorm(x)[idx]
    show_img(ima, axis=axs[1])
    boxes = to_np(y[0][idx].view(-1,4))
    idx_obj = boxes.shape[0] - 1 - gt_obj
    box = bb_to_hw(boxes[idx_obj])
    draw_rect(axs[1], box)
    draw_text(axs[1], box[0:2], id2cat[to_np(y[1][idx])[idx_obj]])
    pr_scr,pr_bb,pr_cls = get1preds(pred[0][idx],pred[1][idx],y[0][idx],y[1][idx],thresh)
    # scale the GT box to 0-1 to match the predictions (sz=256 here)
    overlaps = jaccard(pr_bb, y[0][idx].view(-1,4)[idx_obj].unsqueeze(0).data/256)
    hits = (overlaps > 0.5) * (pr_cls == y[1].data[idx,idx_obj]).unsqueeze(1)
    bb_mask = hits.expand_as(pr_bb)
    torch_gt(axs[0], ima, pr_bb[bb_mask].view(-1,4), pr_cls[hits], pr_scr[hits], thresh)
    plt.tight_layout()
compare(15)
def count(L):
result = collections.defaultdict(int)
if L is not None:
for x in L:
result[x] += 1
return result
from ipywidgets import FloatProgress
from IPython.display import display
def multiTPFPFN():
n = 40
threshes = np.linspace(.05, 0.95, n, endpoint=True)
tps,fps,fns = np.zeros((n,len(id2cat))),np.zeros((n,len(id2cat))),np.zeros((n,len(id2cat)))
prog = FloatProgress(min=0,max=len(md.val_dl))
display(prog)
for data in md.val_dl:
x,y = data
x,y = V(x),V(y)
pred = learn.model(x)
for idx in range(x.size(0)):
bbox,clas = get_y(y[0][idx],y[1][idx])#unpad the target
p_scrs,p_box,p_cls = get1preds(pred[0][idx],pred[1][idx],y[0][idx],y[1][idx],threshes[0])
overlaps = to_np(jaccard(p_box,bbox.data))
overlaps = np.where(overlaps > 0.5, overlaps, 0)
clas, np_scrs, np_cls = to_np(clas.data),to_np(p_scrs), to_np(p_cls)
for k in range(threshes.shape[0]):
new_tp = collections.defaultdict(int)
for cls in list(set(clas)):
msk_clas = np.bitwise_and((clas == cls)[None,:],(np_cls == cls)[:,None])
ov_clas = np.where(msk_clas,overlaps,0.)
mx_idx = np.argmax(ov_clas,axis=1)
for i in range(0,len(clas)):
if (clas[i] == cls):
keep = np.bitwise_and(np.max(ov_clas,axis=1) > 0.,mx_idx==i)
keep = np.bitwise_and(keep,np_scrs > threshes[k])
if keep.sum() > 0:
new_tp[cls] += 1
count_pred = count(np_cls[np_scrs > threshes[k]])
count_gt = count(clas)
for c in range(len(id2cat)):
tps[k,c] += new_tp[c]
fps[k,c] += count_pred[c] - new_tp[c]
fns[k,c] += count_gt[c] - new_tp[c]
prog.value += 1
return tps, fps, fns
tps, fps, fns = multiTPFPFN()
def avg_prec(clas):
eps = 1e-15
precisions = tps[:,clas]/(tps[:,clas] + fps[:,clas] + eps)
recalls = tps[:,clas]/(tps[:,clas] + fns[:,clas] + eps)
prec_at_rec = []
for recall_level in np.linspace(0.0, 1.0, 11):
try:
args = np.argwhere(recalls >= recall_level).flatten()
prec = max(precisions[args])
except ValueError:
prec = 0.0
prec_at_rec.append(prec)
return np.array(prec_at_rec).mean()
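avg_prec is the classic VOC 11-point interpolated AP: for each recall level r in {0, 0.1, ..., 1.0} take the best precision achieved at recall >= r, then average. A standalone illustration with made-up precision/recall pairs:
precs, recs = np.array([1.0, 0.5]), np.array([0.5, 1.0])   # hypothetical PR points
np.mean([max(precs[recs >= r]) if (recs >= r).any() else 0.
         for r in np.linspace(0, 1, 11)])
# -> (6*1.0 + 5*0.5)/11 ≈ 0.77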
def mAP():
S = 0
for i in range(len(id2cat)):
S += avg_prec(i)
return S/len(id2cat)
mAP()
0.12747938049885787