首先,python接口darknet.py中detect函數如下:
def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    """Run the darknet network on a single image file.

    Returns a list of (class_name, probability, (x, y, w, h)) tuples,
    sorted by descending probability.  All heavy lifting happens in
    libdarknet.so via the ctypes bindings.
    """
    # Width/height of 0 -> load at the image's native size (no resize here;
    # the network-side code handles mapping between sizes).
    im = load_image(image, 0, 0)
    count = c_int(0)
    pcount = pointer(count)
    predict_image(net, im)
    # Pass the ORIGINAL image dimensions so predicted boxes can be mapped
    # back into original-image coordinates.
    detections = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pcount)
    total = pcount[0]
    if nms:
        do_nms_obj(detections, total, meta.classes, nms)
    results = []
    for j in range(total):
        for i in range(meta.classes):
            if detections[j].prob[i] > 0:
                bb = detections[j].bbox
                results.append((meta.names[i], detections[j].prob[i], (bb.x, bb.y, bb.w, bb.h)))
    results.sort(key=lambda item: -item[1])
    # Release C-side allocations before returning.
    free_image(im)
    free_detections(detections, total)
    return results
首先image傳入後通過load_image()函數讀取,load_image()函數導入見darknet.py中的lib,如下:
# ctypes binding for load_image_color() exported by libdarknet.so.
load_image = lib.load_image_color
# Arguments: (filename, target width, target height).  Passing 0/0 for
# width and height keeps the image at its original size (see the resize
# guard in load_image() in src/image.c).
load_image.argtypes = [c_char_p, c_int, c_int]
# Returns a ctypes IMAGE structure; detect() reads its .w and .h fields.
load_image.restype = IMAGE
具體通過編譯好的libdarknet.so傳入,首先是調用load_image_color()函數,該函數位於src/image.c中,具體如下:
/* Load an image from disk as a 3-channel (color) image.
 * w/h are forwarded to load_image(); 0,0 means "keep original size"
 * (load_image() only resizes when both are non-zero). */
image load_image_color(char *filename, int w, int h)
{
return load_image(filename, w, h, 3);
}
可以發現load_image_color()傳入三個參數,除了image name外還有int w和int h,而在detect()中可以發現只傳入了load_image(image, 0, 0),那麼這個w=0和h=0該怎麼算呢?繼續往下看,發現調用了image.c中的load_image()函數,而該函數如下:
/* Load an image with c channels, optionally resizing it to w x h.
 * If either w or h is 0, the resize is skipped and the image is
 * returned at its native dimensions. */
image load_image(char *filename, int w, int h, int c)
{
#ifdef OPENCV
/* Decode via OpenCV when the build enables it... */
image out = load_image_cv(filename, c);
#else
/* ...otherwise fall back to the bundled stb_image loader. */
image out = load_image_stb(filename, c);
#endif
/* Resize only when both target dimensions were given AND differ from
 * the loaded size; free the original to avoid leaking it. */
if((h && w) && (h != out.h || w != out.w)){
image resized = resize_image(out, w, h);
free_image(out);
out = resized;
}
return out;
}
發現使用了opencv中的load_image_cv讀取圖像,而傳入進來的h和w其實是在下面使用resize_image()操作對圖片做了resize操作。至此發現darknet.py中的detect()使用的load_image(image, 0, 0)意思是隻讀入圖片而不對其進行resize操作。那麼load_image_cv讀取到的圖像返回一個image結構體,其中包含圖像數據以及寬image.w和高image.h,這裏即爲out,然後返回至detect()中的im。緊接着,im傳遞給predict_image(net, im)函數進行預測,那麼predict_image()我們下次再進行解讀。那麼真實的im.w和im.h(即圖像的原始寬和高)被傳遞給dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)函數,那麼我們看一下此處的w和h到底起什麼作用。get_network_boxes()函數在src/network.c中,函數如下:
/* Allocate the detection array for this network, then populate it from
 * every output layer.  w/h are the ORIGINAL image dimensions; they are
 * used only to map predicted boxes back to original-image coordinates.
 * The number of detections is written through num. */
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)
{
    detection *boxes = make_network_boxes(net, thresh, num);
    fill_network_boxes(net, w, h, thresh, hier, map, relative, boxes);
    return boxes;
}
那麼我們看到該函數調用了fill_network_boxes(),在get_network_boxes()本身中並沒有直接用到此處的w和h,那麼這個原始圖像的w和h幹嘛用了呢?繼續看fill_network_boxes()函數,如下:
/* Walk all layers of the network and append detections from every
 * output layer into dets, advancing the write cursor as we go.
 * A layer has exactly one type, so the branches are exclusive. */
void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets)
{
    int i;
    for (i = 0; i < net->n; ++i) {
        layer cur = net->layers[i];
        if (cur.type == YOLO) {
            /* YOLOv3: returns how many detections it actually wrote. */
            dets += get_yolo_detections(cur, w, h, net->w, net->h, thresh, map, relative, dets);
        } else if (cur.type == REGION) {
            /* YOLOv2: fills one slot per anchor per grid cell. */
            get_region_detections(cur, w, h, net->w, net->h, thresh, map, hier, relative, dets);
            dets += cur.w * cur.h * cur.n;
        } else if (cur.type == DETECTION) {
            /* YOLOv1 detection layer. */
            get_detection_detections(cur, w, h, thresh, dets);
            dets += cur.w * cur.h * cur.n;
        }
    }
}
在yolov3中使用了get_yolo_detections()函數,並傳入w和h,另外可以發現同時傳入的還有net的w和h,net即是我們的yolov3網絡,可以發現cfg中定義的w和h被一併傳入至get_yolo_detections()中,繼續看該函數(位於src/yolo_layer.c中):
/* Extract detections from one YOLO (v3) output layer.
 * w,h:        original image size (used only for coordinate correction)
 * netw,neth:  network input size (from the cfg)
 * Returns the number of detections written into dets. */
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
{
int i,j,n;
float *predictions = l.output;
/* NOTE(review): batch == 2 appears to be the flipped-image test-time
 * augmentation path (name suggests averaging the two copies) -- confirm
 * against avg_flipped_yolo(). */
if (l.batch == 2) avg_flipped_yolo(l);
int count = 0;
/* Scan every cell of the l.w x l.h prediction grid. */
for (i = 0; i < l.w*l.h; ++i){
int row = i / l.w;
int col = i % l.w;
/* l.n anchor boxes are predicted per grid cell. */
for(n = 0; n < l.n; ++n){
/* Channel offset 4 holds the objectness score for this anchor. */
int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4);
float objectness = predictions[obj_index];
/* Discard anchors below the objectness threshold early. */
if(objectness <= thresh) continue;
/* Channel offset 0 is the start of the 4 box coordinates. */
int box_index = entry_index(l, 0, n*l.w*l.h + i, 0);
dets[count].bbox = get_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h);
dets[count].objectness = objectness;
dets[count].classes = l.classes;
for(j = 0; j < l.classes; ++j){
/* Class scores start after box (4) + objectness (1). */
int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j);
/* Final per-class confidence = objectness * class probability;
 * zeroed when below threshold so callers can test prob > 0. */
float prob = objectness*predictions[class_index];
dets[count].prob[j] = (prob > thresh) ? prob : 0;
}
++count;
}
}
/* Map box coordinates from network-input space back onto the original
 * w x h image. */
correct_yolo_boxes(dets, count, w, h, netw, neth, relative);
return count;
}
可以發現我們的原始圖像傳入的w和h只被correct_yolo_boxes()函數使用,correct是準確的意思,顧名思義,該函數得到準確的yolo檢測框,詳細看該函數:
/* Map predicted boxes from network-input coordinates back to the
 * original image.  new_w x new_h is the area the image occupied inside
 * the netw x neth network input after an aspect-ratio-preserving
 * resize; the remainder is assumed to be padding.
 * NOTE(review): this assumes letterbox-style preprocessing (resize +
 * centered padding) was used at inference time -- confirm that the
 * prediction path actually letterboxes, since some forks resize
 * without padding. */
void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
{
int i;
int new_w=0;
int new_h=0;
/* Pick the dimension that limits the aspect-preserving fit; the other
 * is scaled proportionally (integer division, matching the resize). */
if (((float)netw/w) < ((float)neth/h)) {
new_w = netw;
new_h = (h * netw)/w;
} else {
new_h = neth;
new_w = (w * neth)/h;
}
for (i = 0; i < n; ++i){
box b = dets[i].bbox;
/* Remove the centered padding offset, then rescale from the padded
 * input space to the (relative) original-image space. */
b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw);
b.y = (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth);
b.w *= (float)netw/new_w;
b.h *= (float)neth/new_h;
/* relative == 0: convert from [0,1]-relative to absolute pixels. */
if(!relative){
b.x *= w;
b.w *= w;
b.y *= h;
b.h *= h;
}
dets[i].bbox = b;
}
}
可以看出,最終原始圖像的w和h的作用是和cfg傳入的net-w和h一起經過計算將預測出的原始boxes映射回對應於原圖的boxes座標。即比如net中測試圖像的resize大小爲608x608,則預測的初始boxes的座標對應的是608x608的,而原圖比如爲1920x1080的,那麼就需要原圖的w和h傳入進行映射計算得到最終的boxes座標。所以,檢測時進行的image resize操作應當位於未講解的predict_image()函數中。