0.背景
想學習檢測,yolov3很多基於darknet,不是很方便,轉向同樣輕量級的SSD
1.介紹
https://zhuanlan.zhihu.com/p/79854543
https://zhuanlan.zhihu.com/p/79933177
https://zhuanlan.zhihu.com/p/66332452
1.1 數據集
本次採用VOC07+12,20類,加上背景21類
1.2網絡結構
原始SSD300,共6個尺度,從38*38逐步/2,最後到1*1,mobile版本從19*19開始
def forward(self, x):
    """Run the MobileNet-backed SSD on ``x`` and gather the multibox outputs.

    Args:
        x: input image or batch of images. The inline shape notes below were
            recorded by the original author for a batch of 32 images of
            size 300x300.

    Return:
        Depending on ``self.phase``:
        test:
            whatever ``self.detect`` returns when given the location
            predictions, the softmaxed class scores and the prior boxes.
        train:
            a tuple of
            1: location predictions viewed as [batch, n, 4]
            2: confidence predictions viewed as [batch, n, num_classes]
            3: ``self.priors`` (the default boxes)
    """
    feature_maps = []
    loc_preds = []
    conf_preds = []

    backbone = self.mobilenet

    # Stem: conv -> batch norm -> activation (noted shape: [32, 16, 150, 150]).
    x = backbone.activation(backbone.bn1(backbone.conv1(x)))

    # First five bottlenecks produce the first detection source.
    # Shapes noted by the original author along the bottleneck stack:
    #   [32, 16, 150, 150]  (B_0)
    #   [32, 24, 75, 75]
    #   [32, 32, 38, 38]
    #   [32, 64, 19, 19]
    #   [32, 96, 19, 19]
    #   [32, 160, 10, 10]
    #   [32, 320, 10, 10]
    for block in backbone.bottlenecks[:5]:
        x = block(x)
    feature_maps.append(x)  # noted shape: [32, 96, 19, 19]

    # Remaining bottlenecks plus the final conv/bn/activation give the
    # second detection source.
    for block in backbone.bottlenecks[5:]:
        x = block(x)
    x = backbone.activation(backbone.bn_last(backbone.conv_last(x)))
    feature_maps.append(x)  # noted shape: [32, 1280, 10, 10]

    # Extra layers: the output of every second layer (odd index) is kept as
    # a detection source. Author's note on (channels, size) of those
    # outputs: [512, 5], [256, 3], [256, 2], [128, 1].
    for position, layer in enumerate(self.extras):
        x = layer(x)
        if position % 2:
            feature_maps.append(x)

    # Multibox heads: move channels last so that flattening keeps the values
    # of one spatial location together.
    # [B, C, H, W] -> [B, H, W, C]
    for x, loc_head, conf_head in zip(feature_maps, self.loc, self.conf):
        loc_preds.append(loc_head(x).permute(0, 2, 3, 1).contiguous())
        conf_preds.append(conf_head(x).permute(0, 2, 3, 1).contiguous())

    # Flatten every scale to [B, -1] and concatenate all scales.
    # Noted loc head outputs before the permute:
    #   [32, 16, 19, 19], [32, 24, 10, 10], [32, 24, 5, 5],
    #   [32, 24, 3, 3],   [32, 24, 2, 2],   [32, 24, 1, 1]
    loc = torch.cat([pred.view(pred.size(0), -1) for pred in loc_preds], 1)
    conf = torch.cat([pred.view(pred.size(0), -1) for pred in conf_preds], 1)

    box_offsets = loc.view(loc.size(0), -1, 4)                      # [B, n, 4]
    class_scores = conf.view(conf.size(0), -1, self.num_classes)    # [B, n, num_classes]

    if self.phase != "test":
        # Training: raw predictions plus the priors
        # (author's note: priors are [cx, cy, w, h], n x 4).
        return (box_offsets, class_scores, self.priors)

    return self.detect(
        box_offsets,                        # loc preds
        self.softmax(class_scores),         # conf preds
        self.priors.type(type(x.data)),     # default boxes, cast to the feature tensor type
    )
簡單來說,自己總結
從網絡中抽取 96*19*19,1280*10*10 512*5*5 256*3*3 256*2*2 128*1*1 共 6 個特徵圖進行後續操作
原始 SSD是 38*38 19*19 10*10 5*5 3*3 1*1,有不同
通過 loc 和 conf 提取特徵信息,分別再形成6個feature,用於計算loss,設定6層每個點的anchor數量依次爲 4、6、6、6、6、6
所以 loc 和 conf 不同的只是通道數, 4*num_anchor, 21*num_anchor
例如 loc的6個輸出
torch.Size([32, 16, 19, 19])
torch.Size([32, 24, 10, 10])
torch.Size([32, 24, 5, 5])
torch.Size([32, 24, 3, 3])
torch.Size([32, 24, 2, 2])
torch.Size([32, 24, 1, 1])
再把 loc 和 conf 壓扁 成 Batch_size*x
train:返回 output = (
loc.view(loc.size(0), -1, 4), # b*n*4,n爲一張圖所有的框,6個尺度
conf.view(conf.size(0), -1, self.num_classes),
self.priors # #順序anchor 1_min,1_max,2_max_,2_min,(3_max,3_min
後面5個尺度纔有) [cx,cy,w,h] 按照column,row順序來的,feature,大小 n*4
cx,cy是網格點中心,cx,cy,w,h 均爲0-1的數
)
test: output = self.detect(
loc.view(loc.size(0), -1, 4), # loc preds
self.softmax(conf.view(conf.size(0), -1,
self.num_classes)), # conf preds,NLL 要自己做 softmax
self.priors.type(type(x.data)) # default boxes
)
1.3 VOC 數據類型
讀取XML文件得到
def __init__(self, root,
             image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
             transform=None, target_transform=VOCAnnotationTransform(),
             dataset_name='VOC0712'):
    """Collect the image ids of the requested VOC splits.

    Args:
        root: directory that contains the ``VOC<year>`` folders.
        image_sets: list of ``(year, split_name)`` pairs; for each pair the
            file ``<root>/VOC<year>/ImageSets/Main/<split_name>.txt`` is read.
            The default list is only read, never mutated.
        transform: optional callable applied to image, boxes and labels
            when a sample is loaded.
        target_transform: optional callable applied to the parsed XML
            annotation when a sample is loaded.
        dataset_name: name stored on the instance as ``self.name``.
    """
    self.root = root
    self.image_set = image_sets
    self.transform = transform
    self.target_transform = target_transform
    self.name = dataset_name
    # Both templates are filled with a (rootpath, image_id) tuple.
    self._annopath = osp.join('%s', 'Annotations', '%s.xml')
    self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
    self.ids = list()
    for (year, name) in image_sets:
        rootpath = osp.join(self.root, 'VOC' + year)
        split_file = osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')
        # Use a context manager so the split file is closed deterministically.
        with open(split_file) as handle:
            for line in handle:
                image_id = line.strip()
                # A blank line would otherwise become an id that points at
                # the non-existent files ".xml" / ".jpg".
                if image_id:
                    # e.g. ('.../VOC2007', '000001')
                    self.ids.append((rootpath, image_id))
def __getitem__(self, index):
    """Return the ``(image, target)`` pair of sample ``index``.

    Delegates to ``pull_item`` and drops the height and width that it
    returns in addition to the image and the target.
    """
    image, target, _height, _width = self.pull_item(index)
    return image, target
def __len__(self):
    """Return how many image ids are stored in ``self.ids``."""
    id_count = len(self.ids)
    return id_count
def pull_item(self, index):
    """Load one sample together with the original image size.

    Args:
        index: position of the sample in ``self.ids``.

    Return:
        A tuple ``(image, target, height, width)`` where ``image`` is the
        loaded picture as a tensor permuted from H*W*C to C*H*W, ``target``
        is the (possibly transformed) annotation and ``height``/``width``
        come from the shape of the image as read from disk.
        Author's note: target rows are [xmin, ymin, xmax, ymax, label_ind]
        with coordinates scaled to 0-1 relative to the image size -- this
        depends on ``target_transform``, which is not shown here.
    """
    sample_id = self.ids[index]

    annotation = ET.parse(self._annopath % sample_id).getroot()
    image = cv2.imread(self._imgpath % sample_id)
    height, width, _channels = image.shape

    if self.target_transform is not None:
        annotation = self.target_transform(annotation, width, height)

    if self.transform is not None:
        annotation = np.array(annotation)
        box_columns = annotation[:, :4]
        label_column = annotation[:, 4]
        image, boxes, labels = self.transform(image, box_columns, label_column)
        # Reverse the channel order (the original comment calls this "to rgb").
        image = image[:, :, (2, 1, 0)]
        # Re-attach the labels as the fifth column of the boxes.
        annotation = np.hstack((boxes, np.expand_dims(labels, axis=1)))

    image_chw = torch.from_numpy(image).permute(2, 0, 1)
    return image_chw, annotation, height, width
2. loss 計算
自定義 MultiBoxLoss(nn.Module),在forward中實現loss計算
class MultiBoxLoss(nn.Module):
    """SSD Weighted Loss Function
    Compute Targets:
        1) Produce Confidence Target Indices by matching ground truth boxes
           with (default) 'priorboxes' that have jaccard index > threshold parameter
           (default threshold: 0.5).
        2) Produce localization target by 'encoding' variance into offsets of ground
           truth boxes and their matched 'priorboxes'.
        3) Hard negative mining to filter the excessive number of negative examples
           that comes with using a large number of default bounding boxes.
           (default negative:positive ratio 3:1)
    Objective Loss:
        L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
        weighted by α which is set to 1 by cross val.
        Args:
            c: class confidences,
            l: predicted boxes,
            g: ground truth boxes
            N: number of matched default boxes
        See: https://arxiv.org/pdf/1512.02325.pdf for more details.

    Example:
        criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                                 False, args.cuda)
    """

    def __init__(self, num_classes, overlap_thresh, prior_for_matching,
                 bkg_label, neg_mining, neg_pos, neg_overlap, encode_target,
                 use_gpu=True):
        """Store the loss configuration.

        The trailing comments give the values used in the example call of
        the class docstring.
        """
        super(MultiBoxLoss, self).__init__()
        self.use_gpu = use_gpu
        self.num_classes = num_classes                    # 21
        self.threshold = overlap_thresh                   # 0.5
        self.background_label = bkg_label                 # 0
        self.encode_target = encode_target                # False
        self.use_prior_for_matching = prior_for_matching  # True
        self.do_neg_mining = neg_mining                   # True
        self.negpos_ratio = neg_pos                       # 3, i.e. negatives:positives = 3:1
        self.neg_overlap = neg_overlap                    # 0.5
        self.variance = [0.1, 0.2]

    def forward(self, predictions, targets):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
                conf shape: torch.size(batch_size,num_priors,num_classes)
                loc shape: torch.size(batch_size,num_priors,4)
                priors shape: torch.size(num_priors,4), [cx, cy, w, h] in 0-1

            targets (tensor): Ground truth boxes and labels for a batch,
                shape: [batch_size,num_objs,5] (last idx is the label).

        Return:
            (loss_l, loss_c): summed localization loss and summed confidence
            loss, each divided by the number of positive priors in the batch
            (by 1 when the batch has no positive prior).
        """
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)                   # batch size
        priors = priors[:loc_data.size(1), :]    # [num_priors, 4]
        num_priors = priors.size(0)
        num_classes = self.num_classes

        # Match priors (default boxes) with ground truth boxes, one image at
        # a time. `match` fills loc_t[idx] / conf_t[idx] in place.
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        for idx in range(num):
            target = targets[idx]              # rows: [xmin, ymin, xmax, ymax, label]
            truths = target[:, :-1].data       # [num_objs, 4]
            labels = target[:, -1].data        # [num_objs]
            defaults = priors.data
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)
        if self.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()
        # wrap targets
        loc_t = Variable(loc_t, requires_grad=False)
        conf_t = Variable(conf_t, requires_grad=False)

        pos = conf_t > 0    # [B, num_priors] mask of priors matched to an object

        # Localization Loss (Smooth L1), computed on positive priors only.
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)   # [B, num_priors, 4]
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

        # Per-prior confidence loss, used only to rank the negatives.
        batch_conf = conf_data.view(-1, self.num_classes)        # [B*num_priors, num_classes]
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

        # Hard Negative Mining
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0                                   # ignore positives while ranking
        _, loss_idx = loss_c.sort(1, descending=True)     # hardest priors first, per image
        _, idx_rank = loss_idx.sort(1)                    # rank of every prior in that order
        num_pos = pos.long().sum(1, keepdim=True)         # positives per image, [B, 1]
        # Keep negpos_ratio negatives per positive, capped at num_priors - 1.
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)      # [B, num_priors] mask of kept negatives

        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)   # [n, num_classes]
        targets_weighted = conf_t[(pos+neg).gt(0)]                                # [n]
        if cfg.USE_FL:
            # One weight per class: background (index 0) is down-weighted.
            # Built from num_classes so it always has one row per class.
            alpha = torch.Tensor([[0.25]] + [[0.75]] * (num_classes - 1))
            compute_c_loss = focal_loss.FocalLoss(alpha=alpha, gamma=2, class_num=num_classes, size_average=False)
            loss_c = compute_c_loss(conf_p, targets_weighted)
        else:
            loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        # A batch without any positive prior gives N == 0; clamp to 1 so the
        # (then zero) sums are not turned into NaN by a division by zero.
        N = num_pos.data.sum().clamp(min=1).to(loss_l.dtype)
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
匹配的過程很精彩,包括 IoU 計算
# -*- coding: utf-8 -*-
import torch
def point_form(boxes):
    """Turn center-size boxes into corner (point) form.

    Args:
        boxes: (tensor) boxes given as (cx, cy, w, h), one per row.
    Return:
        (tensor) the same boxes as (xmin, ymin, xmax, ymax), so they can be
        compared with point-form ground truth data.
    """
    centers = boxes[:, :2]
    half_sizes = boxes[:, 2:] / 2
    top_left = centers - half_sizes        # xmin, ymin
    bottom_right = centers + half_sizes    # xmax, ymax
    return torch.cat((top_left, bottom_right), 1)
def center_size(boxes):
    """Convert point-form boxes to center-size (cx, cy, w, h) form.

    Args:
        boxes: (tensor) boxes given as (xmin, ymin, xmax, ymax), one per row.
    Return:
        boxes: (tensor) the same boxes as (cx, cy, w, h).
    """
    # torch.cat takes ONE sequence of tensors followed by the dimension; the
    # two pieces therefore have to be wrapped in a tuple.
    return torch.cat(((boxes[:, 2:] + boxes[:, :2]) / 2,   # cx, cy
                      boxes[:, 2:] - boxes[:, :2]), 1)     # w, h
def intersect(box_a, box_b):
    """Pairwise intersection area between two sets of point-form boxes.

    Both corner tensors are expanded to a common [A,B,2] shape:
        [A,2] -> [A,1,2] -> [A,B,2]
        [B,2] -> [1,B,2] -> [A,B,2]
    Args:
        box_a: (tensor) bounding boxes, Shape: [A,4].
        box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
        (tensor) intersection area, Shape: [A,B].
    """
    pair_shape = (box_a.size(0), box_b.size(0), 2)

    # Bottom-right corner of the overlap: the smaller of the two max corners.
    lower_right = torch.min(box_a[:, 2:].unsqueeze(1).expand(*pair_shape),
                            box_b[:, 2:].unsqueeze(0).expand(*pair_shape))
    # Top-left corner of the overlap: the larger of the two min corners.
    upper_left = torch.max(box_a[:, :2].unsqueeze(1).expand(*pair_shape),
                           box_b[:, :2].unsqueeze(0).expand(*pair_shape))

    # Negative extents mean "no overlap" and are clipped to zero.
    extent = torch.clamp(lower_right - upper_left, min=0)
    widths = extent[:, :, 0]
    heights = extent[:, :, 1]
    return widths * heights
def jaccard(box_a, box_b):
    """Pairwise jaccard overlap (intersection over union) of two box sets.

    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    overlap = intersect(box_a, box_b)                                   # [A,B]

    width_a = box_a[:, 2] - box_a[:, 0]
    height_a = box_a[:, 3] - box_a[:, 1]
    area_a = (width_a * height_a).unsqueeze(1).expand_as(overlap)       # [A,B]

    width_b = box_b[:, 2] - box_b[:, 0]
    height_b = box_b[:, 3] - box_b[:, 1]
    area_b = (width_b * height_b).unsqueeze(0).expand_as(overlap)       # [A,B]

    return overlap / (area_a + area_b - overlap)                        # [A,B]
def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
    """Match each prior box with the ground truth box of the highest jaccard
    overlap, encode the bounding boxes, and write the results into row
    ``idx`` of ``loc_t`` and ``conf_t``.

    Args:
        threshold: (float) The overlap threshold used when matching boxes,
            e.g. 0.5.
        truths: (tensor) Ground truth boxes in point form, Shape: [num_obj, 4].
        priors: (tensor) Prior boxes from priorbox layers in center-size
            form, Shape: [num_priors, 4].
        variances: (list[float]) Variances used by ``encode``, e.g. [0.1, 0.2].
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
        loc_t: (tensor) Tensor to be filled w/ encoded location targets;
            ``loc_t[idx]`` has Shape: [num_priors, 4].
        conf_t: (tensor) Tensor to be filled w/ matched class indices for the
            conf preds; ``conf_t[idx]`` has Shape: [num_priors].
        idx: (int) current batch index
    Return:
        None. ``loc_t[idx]`` and ``conf_t[idx]`` are modified in place.
    """
    # An image without any ground truth box cannot be matched (the max over
    # an empty dimension raises): mark every prior as background instead.
    if truths.numel() == 0:
        loc_t[idx] = 0
        conf_t[idx] = 0
        return

    # IoU between every ground truth and every prior: [num_obj, num_priors]
    overlaps = jaccard(
        truths,
        point_form(priors)
    )
    # (Bipartite Matching)
    # best prior for each ground truth: [num_obj]
    _, best_prior_idx = overlaps.max(1)
    # best ground truth for each prior: [num_priors]
    best_truth_overlap, best_truth_idx = overlaps.max(0)
    # Give the best prior of every ground truth an overlap of 2 so that it
    # always passes the threshold test below.
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)
    # Ensure every gt matches with its prior of max overlap. Kept as a loop:
    # when two ground truths share a best prior, the later one must win.
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j
    matches = truths[best_truth_idx]          # Shape: [num_priors,4], the gt assigned to each prior
    conf = labels[best_truth_idx] + 1         # Shape: [num_priors], labels shifted so that 0 = background
    conf[best_truth_overlap < threshold] = 0  # label as background
    loc = encode(matches, priors, variances)  # Shape: [num_priors,4]
    loc_t[idx] = loc    # encoded offsets the network should learn
    conf_t[idx] = conf  # class label for each prior
def encode(matched, priors, variances):
    """Encode matched ground truth boxes as regression targets relative to
    their prior boxes, scaled by the variances.

    Args:
        matched: (tensor) Coords of ground truth for each prior in point-form
            Shape: [num_priors, 4].
        priors: (tensor) Prior boxes in center-offset form
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes, e.g. [0.1, 0.2]
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]

    # Distance between the matched box center and the prior center, scaled
    # by the prior size and the first variance.
    center_offset = (matched[:, :2] + matched[:, 2:]) / 2 - prior_centers
    center_offset = center_offset / (variances[0] * prior_sizes)

    # Log of (matched size / prior size), scaled by the second variance.
    size_ratio = (matched[:, 2:] - matched[:, :2]) / prior_sizes
    log_size = torch.log(size_ratio) / variances[1]

    # Target for smooth_l1_loss: [num_priors,4]
    return torch.cat([center_offset, log_size], 1)
# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
    """Turn location predictions back into point-form boxes, undoing the
    encoding applied to the regression targets at train time.

    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions as (xmin, ymin, xmax, ymax)
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]

    # Undo the center and size encodings.
    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, 2:] * variances[1])

    # Center-size -> corner form.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
def log_sum_exp(x):
    """Row-wise log-sum-exp, used for the unaveraged confidence loss across
    all examples in a batch.

    The global maximum of ``x`` is subtracted before exponentiating and added
    back afterwards to keep the exponentials in range.

    Args:
        x (Variable(tensor)): conf_preds from conf layers,
            author's note: (batch*num_priors, 21)
    Return:
        (tensor) one value per row, Shape: [x.size(0), 1]
    """
    shift = x.data.max()
    row_sums = torch.exp(x - shift).sum(1, keepdim=True)
    return torch.log(row_sums) + shift
NMS的實現
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    Args:
        boxes: (tensor) The location preds for the img in point form,
            Shape: [num_priors,4].
        scores: (tensor) The class pred scores for the img, Shape: [num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The maximum number of box preds to consider.
    Return:
        (keep, count): ``keep`` is a long tensor of Shape [num_priors] whose
        first ``count`` entries are the indices of the kept boxes, ordered by
        decreasing score; the remaining entries are zero padding.
    """
    keep = scores.new_zeros(scores.size(0), dtype=torch.long)
    if boxes.numel() == 0:
        # Same (keep, count) shape as the normal path so callers can always
        # unpack two values.
        return keep, 0
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    _, idx = scores.sort(0)   # sort in ascending order
    idx = idx[-top_k:]        # indices of the top-k largest vals

    count = 0
    while idx.numel() > 0:
        i = idx[-1]           # index of current largest val
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]        # remove kept element from view
        # Corners of the overlap between box i and every remaining box.
        # Fresh tensors are created here instead of writing into
        # preallocated `out=` buffers that would have to be resized.
        xx1 = torch.clamp(torch.index_select(x1, 0, idx), min=x1[i])
        yy1 = torch.clamp(torch.index_select(y1, 0, idx), min=y1[i])
        xx2 = torch.clamp(torch.index_select(x2, 0, idx), max=x2[i])
        yy2 = torch.clamp(torch.index_select(y2, 0, idx), max=y2[i])
        # Negative extents mean "no overlap".
        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w * h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)   # areas of the remaining boxes
        union = (rem_areas - inter) + area[i]
        IoU = inter / union
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
AP計算
https://zhuanlan.zhihu.com/p/70667071
https://zhuanlan.zhihu.com/p/70667071 這個寫的太好了,以這個爲主
import numpy as np
def voc_ap(rec, prec, use_07_metric=False):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:False).
    """
    if use_07_metric:
        # 11 point metric of VOC 2007: average the best precision reachable
        # at recall >= t for t = 0.0, 0.1, ..., 1.0.
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            reached = rec >= t
            p = np.max(prec[reached]) if np.sum(reached) != 0 else 0
            ap = ap + p / 11.
        return ap

    # Area under the precision envelope.
    # Sentinel values are added at both ends of the two curves,
    # e.g. mrec = [0. 0.0666 0.1333 0.4 0.4666 1.]
    #      mpre = [0. 1. 0.6666 0.4285 0.3043 0.]
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # Precision envelope: every entry becomes the maximum of itself and all
    # entries to its right, e.g. [1. 1. 0.6666 0.4285 0.3043 0.]
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # Positions where the X axis (recall) changes value.
    i = np.where(mrec[1:] != mrec[:-1])[0]
    print(mrec[1:], mrec[:-1])
    print(i)

    # Sum of the rectangles (delta recall) * precision.
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap
# Worked example: five recall/precision points passed to voc_ap.
rec = np.array([0.0666, 0.1333,0.1333, 0.4, 0.4666])
prec = np.array([1., 0.6666, 0.6666, 0.4285, 0.3043])
ap = voc_ap(rec, prec)
print(ap) # output: 0.2456