darknet-yolov3中python接口中image传输w和h的过程

首先,python接口darknet.py中detect函数如下:

def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    """Run the darknet detector on an image file.

    Args:
        net: network handle returned by load_net().
        meta: metadata handle returned by load_meta() (provides classes/names).
        image: path to the image file (bytes, as required by the C API).
        thresh: objectness/class probability threshold.
        hier_thresh: hierarchical threshold (used by YOLO9000-style trees).
        nms: non-max-suppression IoU threshold; falsy value disables NMS.

    Returns:
        List of (class_name, probability, (x, y, w, h)) tuples sorted by
        descending probability. Box coordinates are relative to the
        original image (center x/y plus width/height).
    """
    # w=h=0 means "keep original size"; the letterbox resize to the network
    # input happens later inside the C prediction path.
    im = load_image(image, 0, 0)
    num = c_int(0)
    pnum = pointer(num)
    try:
        predict_image(net, im)
        # Pass the ORIGINAL image w/h so the C side can map boxes from
        # network coordinates back onto the source image.
        dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)
        num = pnum[0]
        if nms:
            do_nms_obj(dets, num, meta.classes, nms)

        res = [
            (meta.names[i], dets[j].prob[i],
             (dets[j].bbox.x, dets[j].bbox.y, dets[j].bbox.w, dets[j].bbox.h))
            for j in range(num)
            for i in range(meta.classes)
            if dets[j].prob[i] > 0
        ]
        res.sort(key=lambda det: -det[1])
        free_detections(dets, num)
    finally:
        # Always release the C-side image buffer, even if prediction fails.
        free_image(im)
    return res

首先image传入后通过load_image()函数读取,load_image()函数导入见darknet.py中的lib,如下:

# Bind the C function load_image_color() from libdarknet.so.
# Signature on the C side: image load_image_color(char *filename, int w, int h)
load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]  # filename, target width, target height
load_image.restype = IMAGE  # returns the darknet `image` struct (data, w, h, c)

具体通过编译好的libdarknet.so传入,首先是调用load_image_color()函数,该函数位于src/image.c中,具体如下:

/* Convenience wrapper: load an image forcing 3 channels (color).
 * w/h of 0 keep the file's native size; nonzero values trigger a resize. */
image load_image_color(char *filename, int w, int h)
{
    const int channels = 3;
    return load_image(filename, w, h, channels);
}

可以发现load_image_color()传入三个参数,除了image name外还有int w和int h,而在detect()中可以发现只传入了load_image(image, 0, 0),那么这个w=0和h=0该怎么算呢?继续往下看,发现调用了image.c中的load_image()函数,而该函数如下:

/* Load an image with c channels; optionally resize it to w x h.
 * Passing w=0 or h=0 skips the resize and returns the native size. */
image load_image(char *filename, int w, int h, int c)
{
#ifdef OPENCV
    image loaded = load_image_cv(filename, c);
#else
    image loaded = load_image_stb(filename, c);
#endif

    /* No resize requested, or the file already matches the target size. */
    if(!w || !h || (w == loaded.w && h == loaded.h)){
        return loaded;
    }
    image scaled = resize_image(loaded, w, h);
    free_image(loaded);
    return scaled;
}

发现在编译了OpenCV时使用load_image_cv读取图像(否则使用load_image_stb),而传入进来的h和w其实是在下面使用resize_image()操作对图片做了resize操作。至此发现darknet.py中的detect()使用的load_image(image, 0, 0)意思是只读入图片而不对其进行resize操作。load_image_cv读取到的图像以image结构体返回,其中包含图像数据以及image.w和image.h,这里即为out,然后返回至detect()中的im。紧接着,im传递给predict_image(net, im)函数进行预测,predict_image()我们下次再进行解读。那么真实的im.w和im.h(即图像的原始宽和高)被传递给dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)函数,那么我们看一下此处的w和h到底起什么作用。get_network_boxes()函数在src/network.c中,函数如下:

/* Allocate a detection array sized for this network, then populate it
 * from every detection-type layer. w/h are the ORIGINAL image dimensions,
 * used downstream to map boxes back onto the source image. */
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)
{
    detection *boxes = make_network_boxes(net, thresh, num);
    fill_network_boxes(net, w, h, thresh, hier, map, relative, boxes);
    return boxes;
}

我们看到get_network_boxes()中调用了fill_network_boxes()函数,也就是说get_network_boxes()本身并没有直接使用此处的w和h,只是把它们传了下去,那么这个原始图像的w和h到底干嘛用了呢?继续看fill_network_boxes()函数,如下:

/* Walk every layer of the network and collect detections from each
 * detection-head layer, advancing the output cursor as we go. */
void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets)
{
    int idx;
    for(idx = 0; idx < net->n; ++idx){
        layer l = net->layers[idx];
        switch(l.type){
            case YOLO: /* YOLOv3 head: returns how many boxes it wrote */
                dets += get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets);
                break;
            case REGION: /* YOLOv2 head: writes one box per anchor per cell */
                get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);
                dets += l.w*l.h*l.n;
                break;
            case DETECTION: /* YOLOv1 head */
                get_detection_detections(l, w, h, thresh, dets);
                dets += l.w*l.h*l.n;
                break;
            default:
                break;
        }
    }
}

在yolov3中使用了get_yolo_detections()函数,并传入w和h,另外可以发现同时传入的还有net的w和h,net即是我们的yolov3网络,可以发现cfg中定义的w和h被一并传入至get_yolo_detections()中,继续看该函数(位于src/yolo_layer.c中):

/* Extract boxes above the objectness threshold from one YOLO layer.
 * w/h: original image size; netw/neth: network input size (both are only
 * forwarded to correct_yolo_boxes() for the final coordinate mapping).
 * Returns the number of detections written into dets. */
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
{
    float *preds = l.output;
    /* batch==2 means test-time flip augmentation: average the flipped copy. */
    if (l.batch == 2) avg_flipped_yolo(l);
    int cells = l.w*l.h;
    int found = 0;
    int cell, a, cls;
    for (cell = 0; cell < cells; ++cell){
        int row = cell / l.w;
        int col = cell % l.w;
        for (a = 0; a < l.n; ++a){
            int loc = a*cells + cell;
            float objectness = preds[entry_index(l, 0, loc, 4)];
            if (objectness <= thresh) continue;
            int box_index = entry_index(l, 0, loc, 0);
            dets[found].bbox = get_yolo_box(preds, l.biases, l.mask[a], box_index, col, row, l.w, l.h, netw, neth, cells);
            dets[found].objectness = objectness;
            dets[found].classes = l.classes;
            for (cls = 0; cls < l.classes; ++cls){
                float prob = objectness*preds[entry_index(l, 0, loc, 4 + 1 + cls)];
                dets[found].prob[cls] = (prob > thresh) ? prob : 0;
            }
            ++found;
        }
    }
    /* Map the boxes from network coordinates back onto the original image. */
    correct_yolo_boxes(dets, found, w, h, netw, neth, relative);
    return found;
}

可以发现我们的原始图像传入的w和h只被correct_yolo_boxes()函数使用,correct是准确的意思,顾名思义,该函数得到准确的yolo检测框,详细看该函数:

/* Undo the letterbox transform: boxes were predicted on a netw x neth
 * canvas where the w x h original image was scaled (aspect-preserving) and
 * centered with padding. Map each box back to original-image coordinates;
 * when relative==0 also scale to absolute pixels. */
void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
{
    /* Size the image occupies inside the network canvas after the
     * aspect-preserving resize (the rest of the canvas is padding). */
    int fit_w, fit_h;
    if (((float)netw/w) < ((float)neth/h)) {
        fit_w = netw;
        fit_h = (h * netw)/w;
    } else {
        fit_h = neth;
        fit_w = (w * neth)/h;
    }
    int i;
    for (i = 0; i < n; ++i){
        box b = dets[i].bbox;
        /* Subtract the centered padding offset, then rescale so the
         * coordinates are relative to the image content, not the canvas. */
        b.x = (b.x - (netw - fit_w)/2./netw) / ((float)fit_w/netw);
        b.y = (b.y - (neth - fit_h)/2./neth) / ((float)fit_h/neth);
        b.w *= (float)netw/fit_w;
        b.h *= (float)neth/fit_h;
        if(!relative){
            /* Convert from [0,1] fractions to absolute pixel coordinates. */
            b.x *= w;
            b.w *= w;
            b.y *= h;
            b.h *= h;
        }
        dets[i].bbox = b;
    }
}

可以看出,最终原始图像的w和h的作用是和cfg传入的net的w和h一起经过计算将预测出的原始boxes映射回对应于原图的boxes坐标。比如net中测试图像的resize大小为608x608,则预测的初始boxes的坐标对应的是608x608的,而原图比如为1920x1080的,那么就需要传入原图的w和h进行映射计算得到最终的boxes坐标。所以,检测时进行的image resize操作应当位于未讲解的predict_image()函数中。

 

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章