首先,我參考https://blog.csdn.net/Chen_yingpeng/article/details/80692018提供的yolov3_darknet2caffe.py腳本實現了darknet-yolov3到caffemodel的轉化,得到了prototxt和caffemodel。
但是,我在編譯Chen提供的caffe-yolov3時,由於server上並沒有sudo權限,也無法安裝opencv3,故無法編譯成功,爲此根據之前使用caffe下的MobileNet-YOLOv3,我使用https://github.com/eric612/MobileNet-YOLO中提供的ssd_detect.cpp/yolo_detect.cpp(博主之前已經對MobileNet-YOLO使用cmake完全編譯通了)進行測試(當然,其中根據博主的需求已經更改了一些相關code)
需要注意的是使用MobileNet-YOLO提供的ssd_detect.cpp需要將Chen提供轉化過來的yolov3.prototxt做一定的修改,即要添加相應的yolo層和最後的detection_out層,具體可參考該框架中給出的prototxt文件修改。
最終可以正常測試轉化過來的caffemodel,經過demo測試,結果也還正常,但是我們在專用的數據集中測試具體的指標時,發現對traffic sign的recall相對於darknet下差了很多,經過show圖片發現caffemodel會把藍色的traffic sign檢出而忽略紅色的,增加了訓練數據之後情況稍微好轉,但仍然嚴重。
爲此,我debug了Chen提供的convert python代碼,其中weights值轉化的部分如下(整個code最主要的部分):
def darknet2caffe(cfgfile, weightfile, protofile, caffemodel):
    """Convert a darknet .cfg/.weights pair into a caffe prototxt/caffemodel.

    Parameters
    ----------
    cfgfile : str      darknet network definition (.cfg)
    weightfile : str   darknet binary weights (.weights)
    protofile : str    output path for the generated caffe prototxt
    caffemodel : str   output path for the generated caffemodel
    """
    net_info = cfg2prototxt(cfgfile)
    save_prototxt(net_info, protofile, region=False)

    net = caffe.Net(protofile, caffe.TEST)
    params = net.params

    blocks = parse_cfg(cfgfile)

    # Darknet weight-file header: major, minor, revision (int32 each) plus an
    # "images seen" counter.  Reading 5 int32 values consumes 20 bytes, which
    # matches format versions where "seen" is stored as 64 bits —
    # NOTE(review): older weight files use a 16-byte header; confirm against
    # the darknet version that produced the weights.
    with open(weightfile, 'rb') as fp:
        header = np.fromfile(fp, dtype=np.int32, count=5)
        # Everything after the header is one flat float32 vector holding the
        # weights of every layer back to back.
        buf = np.fromfile(fp, dtype=np.float32)

    layer_id = 1
    start = 0  # read offset into buf; each loader advances and returns it
    for block in blocks:
        if start >= buf.size:
            break
        btype = block['type']
        if btype == 'net':
            continue
        elif btype in ('convolutional', 'depthwise_convolutional'):
            # Both conv flavors share the same weight layout; only the
            # auto-generated layer-name suffix differs.
            batch_normalize = int(block['batch_normalize'])
            suffix = 'conv' if btype == 'convolutional' else 'dwconv'
            if 'name' in block:  # Python 3: dict.has_key() was removed
                conv_layer_name = block['name']
                bn_layer_name = '%s-bn' % block['name']
                scale_layer_name = '%s-scale' % block['name']
            else:
                conv_layer_name = 'layer%d-%s' % (layer_id, suffix)
                bn_layer_name = 'layer%d-bn' % layer_id
                scale_layer_name = 'layer%d-scale' % layer_id
            if batch_normalize:
                start = load_conv_bn2caffe(buf, start, params[conv_layer_name], params[bn_layer_name], params[scale_layer_name])
            else:
                start = load_conv2caffe(buf, start, params[conv_layer_name])
            layer_id = layer_id + 1
        elif btype == 'connected':
            if 'name' in block:
                fc_layer_name = block['name']
            else:
                fc_layer_name = 'layer%d-fc' % layer_id
            start = load_fc2caffe(buf, start, params[fc_layer_name])
            layer_id = layer_id + 1
        elif btype in ('maxpool', 'avgpool', 'region', 'route',
                       'shortcut', 'softmax', 'cost', 'upsample'):
            # These layer types carry no learnable weights in the file.
            layer_id = layer_id + 1
        else:
            print('unknow layer type %s ' % btype)
            layer_id = layer_id + 1

    print('save prototxt to %s' % protofile)
    save_prototxt(net_info, protofile, region=True)
    print('save caffemodel to %s' % caffemodel)
    net.save(caffemodel)
......
def load_conv_bn2caffe(buf, start, conv_param, bn_param, scale_param):
    """Copy one batch-normalized convolution's weights from the flat darknet
    buffer into the corresponding caffe blobs.

    Darknet stores, for each BN convolution, in this exact order:
    biases, scales, rolling mean, rolling variance, conv weights.
    Returns the new read offset into ``buf``.
    """
    offset = start
    # Destination blobs in darknet's on-disk order.
    destinations = [
        scale_param[1],  # beta  (bias of the caffe Scale layer)
        scale_param[0],  # gamma (weight of the caffe Scale layer)
        bn_param[0],     # running mean
        bn_param[1],     # running variance
        conv_param[0],   # convolution kernel weights
    ]
    for blob in destinations:
        count = blob.data.size
        blob.data[...] = np.reshape(buf[offset:offset + count], blob.data.shape)
        offset += count
    # Caffe's BatchNorm keeps a moving-average scale factor in blob 2;
    # darknet statistics are already unscaled, so force it to 1.
    bn_param[2].data[...] = np.array([1.0])
    return offset
其中,buf爲讀取的yolov3.weights的權值,根據darknet的權重存儲方式,buf爲一個一維的vector,維度爲61592497 x 1,這是由yolov3中所有層累加得到,而start記錄每一層權值開始的位置。然而轉換到caffemodel後(code中load_conv_bn2caffe()即將buf中獲取的權值寫入到caffemodel中)變爲四維vector,例如64x32x3x3(分別表示該卷積層輸入channel爲64,輸出channel爲32,kernel大小爲3x3),這一步的完成主要是由load_conv_bn2caffe中的numpy包中reshape()函數完成。所以導致最後測試結果下降的原因是否是一維reshape爲四維時順序與darknet中不對應導致的RGB權值錯位產生的影響。
然後我檢查了使用的MobileNet-YOLO提供的ssd_detection.cpp,從image input網絡部分入手,code示例如下:
// NOTE(review): cv::imread returns pixels in BGR channel order; the image is
// passed to the detector unchanged, which is the channel-swap bug discussed
// in this post.
cv::Mat img = cv::imread(fn[k]);
if (img.empty()) continue; // only proceed if successful
// you probably want to do some preprocessing
CHECK(!img.empty()) << "Unable to decode image " << file;
Timer batch_timer;
batch_timer.Start();
// Run the caffe network on the (still BGR) image and collect detections.
std::vector<vector<float> > detections = detector.Detect(img);
LOG(INFO) << "Computing time: " << batch_timer.MilliSeconds() << " ms.";
從上述code中發現,讀取image使用的是opencv中的imread()函數,而將讀取到的img傳入Detector中測試,而Detector被定義爲class類,包含構造函數、Detect接口以及若干私有子函數,如下:
// Caffe detector wrapper (from MobileNet-YOLO's ssd_detect.cpp).
// NOTE(review): nothing in this class converts BGR to RGB — the input image
// is used in whatever channel order the caller provides.
class Detector {
public:
    // model_file: deploy prototxt; weights_file: caffemodel;
    // mean_file / mean_value: per-channel mean-subtraction configuration;
    // confidence_threshold: detections scoring below this are dropped;
    // normalize_value: scale factor applied to input pixels.
    Detector(const string& model_file,
             const string& weights_file,
             const string& mean_file,
             const string& mean_value,
             const float confidence_threshold,
             const float normalize_value);
    // Returns one vector<float> per detection (label, score, box coords).
    std::vector<vector<float> > Detect(const cv::Mat& img);

private:
    // Build the mean image used for input normalization.
    void SetMean(const string& mean_file, const string& mean_value);
    // Wrap the net's input blob memory as per-channel cv::Mat views.
    void WrapInputLayer(std::vector<cv::Mat>* input_channels);
    // Resize/convert img and write it into the wrapped input channels.
    void Preprocess(const cv::Mat& img,
                    std::vector<cv::Mat>* input_channels);
    // Overload that additionally multiplies pixels by normalize_value.
    void Preprocess(const cv::Mat& img,
                    std::vector<cv::Mat>* input_channels, double normalize_value);

private:
    shared_ptr<Net<float> > net_;   // the loaded caffe network
    cv::Size input_geometry_;       // network input width/height
    int num_channels_;              // network input channel count
    cv::Mat mean_;                  // mean image subtracted during preprocessing
    float nor_val = 1.0;            // pixel normalization factor
};
這4個子函數對輸入的img並未做RGB channel轉換,只進行了resize操作,然後測試。
從此處我似乎得到一些啓發,opencv中的imread讀取的RGB圖像是按照BGR順序讀取的,這是否剛好對應了traffic sign中藍色檢出而紅色漏檢的規律呢?然後我將讀取imread讀取的img channel(2)和channel(0)互換,然後input到Detector中測試,測試的結果(座標和分數信息)使用rectangle()函數畫在未RGB channel轉換的原img上,發現,檢測結果正常了,與darknet下結果基本無異,這證明了Chen提供的convert code中reshape並無錯誤,而是我使用了cv中的imread()直接輸入檢測器中導致。
之後我又考慮爲什麼原始darknet中和caffe中其它model(如ssd、RefineDet等)並不會出現此情況呢?我又對其進行了探究。閱讀了darknet中圖像輸入部分的code,具體在src/image.c中函數load_image_color(),如下code:
/* Convenience wrapper: load `filename` as a 3-channel (color) image,
 * resized to (w, h) when both are nonzero. */
image load_image_color(char *filename, int w, int h)
{
    return load_image(filename, w, h, 3);
}
/* Load an image with c channels; when both w and h are nonzero and differ
 * from the decoded size, resize to the network input size (w, h). */
image load_image(char *filename, int w, int h, int c)
{
#ifdef OPENCV
    image out = load_image_cv(filename, c);   /* OpenCV path: converts BGR to RGB internally */
#else
    image out = load_image_stb(filename, c);  /* stb_image fallback */
#endif
    if((h && w) && (h != out.h || w != out.w)){
        /* Resize to the requested (w, h), provided neither is 0. */
        image resized = resize_image(out, w, h);
        free_image(out);   /* release the original before replacing it */
        out = resized;
    }
    return out;
}
其中的load_image()函數調用了load_image_cv()函數,顯然darknet中仍然使用cv讀取image,因爲我們知道cv讀取image的兩種方式imread(C++接口)和cvLoadImage(OpenCV的C接口)都是按照BGR格式讀取,這更加使我疑惑,查看load_image_cv()函數,code如下:
/* Load an image via the OpenCV C API (cvLoadImage) and return it as a
 * darknet image in RGB channel order. */
image load_image_cv(char *filename, int channels)
{
    IplImage* src = 0;
    /* cvLoadImage flag: -1 = load unchanged, 0 = grayscale, 1 = 3-channel color */
    int flag = -1;
    if (channels == 0) flag = -1;
    else if (channels == 1) flag = 0; //grayscale image
    else if (channels == 3) flag = 1; //3-channel color image
    else {
        /* Unsupported channel count: warn, then fall through with flag = -1. */
        fprintf(stderr, "OpenCV can't force load with %d channels\n", channels);
    }
    //opencv api load image
    if( (src = cvLoadImage(filename, flag)) == 0 )
    {
        fprintf(stderr, "Cannot load image \"%s\"\n", filename);
        char buff[256];
        /* NOTE(review): the filename is interpolated into a shell command —
         * command-injection risk if filenames are untrusted, and sprintf can
         * overflow buff for long paths. */
        sprintf(buff, "echo %s >> bad.list", filename);
        system(buff);
        /* Return a dummy 10x10x3 image instead of aborting. */
        return make_image(10,10,3);
        //exit(0);
    }
    /* Copy the pixels out of the IplImage container into darknet's image struct. */
    image out = ipl_to_image(src);
    cvReleaseImage(&src);
    rgbgr_image(out); //convert BGR to RGB
    return out;
}
該code中調用OpenCV C接口的cvLoadImage()函數讀取圖像,然後使用rgbgr_image()函數將cvLoadImage讀入圖像的BGR格式轉換爲RGB。轉換的代碼如下(仍然位於image.c文件中):
/* Swap the first and third channel planes of a planar (CHW) image in
 * place, converting BGR data to RGB (or vice versa). */
void rgbgr_image(image im)
{
    int plane = im.w*im.h;           /* pixels per channel plane */
    int px;
    for(px = 0; px < plane; ++px){
        float tmp = im.data[px];
        im.data[px] = im.data[px + 2*plane];
        im.data[px + 2*plane] = tmp;
    }
}
然後博主又順帶查看了caffe下測試時讀取圖像的方式,發現python腳本中使用的是pycaffe接口提供的caffe.io.load_image()函數,具體code位於caffe/python/caffe/io.py文件中的load_image函數。具體代碼如下:
def load_image(filename, color=True):
    """
    Load an image converting from grayscale or alpha as needed.
    Parameters
    ----------
    filename : string
    color : boolean
        flag for color format. True (default) loads as RGB while False
        loads as intensity (if image is already grayscale).
    Returns
    -------
    image : an image with type np.float32 in range [0, 1]
        of size (H x W x 3) in RGB or
        of size (H x W x 1) in grayscale.
    """
    raw = skimage.io.imread(filename, as_grey=not color)
    img = skimage.img_as_float(raw).astype(np.float32)
    if img.ndim == 2:
        # Promote (H, W) grayscale to (H, W, 1); replicate across three
        # channels when an RGB result was requested.
        img = img[:, :, np.newaxis]
        if color:
            img = np.tile(img, (1, 1, 3))
    elif img.shape[2] == 4:
        # Drop the alpha channel of RGBA input.
        img = img[:, :, :3]
    return img
其中使用了python包skimage讀取圖像,那麼顯然skimage讀入的圖像爲RGB格式。
然而skimage讀入的圖像會歸一化爲(0~1),所以在caffe中python接口使用cv2.imread()讀入接口會出錯(cv2讀入爲0~255),所以在caffe中python接口想使用cv2.imread()代替caffe.io.load_image()需要做歸一化處理,如博主另一篇博客中code展示了cv2取代load_image()的方式。鏈接爲:https://blog.csdn.net/xunan003/article/details/94740569
至此,博主解決了YOLOv3轉換爲caffemodel後精度下降的原因。開始對於Chen的誤解也被證明是錯誤的。也要感謝這次的debug,讓我瞭解到更多關於兩種框架的區別。