一、opencv的示例模型文件
opencv4.0.0中暂未提供cpp代码,使用python代码改编,参考https://github.com/opencv/opencv/blob/master/samples/dnn/mask_rcnn.py,我们使用的模型为
mask_rcnn_inception_v2_coco_2018_01_28.pb,选择InceptionV2是因为其速度更快,其他更好效果的如ResneXt-101相关模型可在tensorflow model zoo下载。
相关知识
Mask R-CNN(He et al., ICCV 2017)是对Faster RCNN的改进,它包括一个掩码预测与类标签和边界框预测分支平行的分支,如下图所示。它只为较快的R-CNN网络增加了一小部分开销,因此仍然可以在GPU上以5 fps运行。在本教程中,在Intel Core i7 CPU上运行来显示结果,并且在CPU上每帧大约需要2秒,即使对于具有超过30个对象的帧也是如此。
第一个部分RPN:每个图像生成大约300个区域提案(ROI),每个提案都会经过两部分,即对象检测网络和掩模预测网络,如上所示。注意,由于掩模预测分支与标签和框预测分支并行运行,因此对于每个给定的ROI,网络会预测属于所有类别的掩模。
第二个部分MASK:掩模预测分支仅处理最高得分100检测框,因此对于100个ROI和90个对象类(这里使用的是coco数据集),网络的掩模预测部分输出尺寸为100x90x15x15的4D张量,其中每个掩模的大小为15×15。
demo流程
例如,对于coco数据,我们预测1张图时,目标检测有N个目标框,描述为Boxes [1*1*N*7],'7'代表目标检测的数据结构 [batchId, classId, confidence, left, top, right, bottom]。同时,掩码生成的结果为固定的Masks [100*90*15*15],对于得分最高的100个区域,都生成类别90个15*15大小的掩码。
1、对于检测的第n个目标框的数据Box => Boxes[1*1*n*7] ,同时可取得其类别classId,置信度 confidence。
2、通过目标的序号n和类别,取得这个目标对应的掩码Mask => Masks[n*classId*15*15],将Mask缩放到该目标Box大小[left, top, right, bottom]
3、绘制Box,通过Mask叠加蒙版。(实例分割时,需要为每个对象生成不同的颜色蒙版)。
二、示例代码
#include <fstream>
#include <sstream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>
using namespace cv;
using namespace dnn;
// Detection confidence threshold: boxes scoring below this are discarded.
float confThreshold;
// Per-pixel mask threshold: mask values above this are painted as foreground.
float maskThreshold;
// IoU threshold used by NMSBoxes to suppress overlapping detections.
float nmsThreshold;
// Class names; index 0 is a prepended "background" entry, then the COCO labels.
std::vector<std::string> classes;
// One BGR color per class (index 0 reserved for background), used to tint masks.
std::vector<Vec3b> colors;
// Parses the network outputs and draws boxes, labels and masks onto `frame`.
void postprocess(cv::Mat& frame, const std::vector<Mat>& outs);
// Mask R-CNN demo entry point: loads the TensorFlow model, reads frames from
// an image/video source, runs forward inference and displays the annotated
// result. Adapted from OpenCV's samples/dnn/mask_rcnn.py.
int main(int argc, char** argv) try
{
// Mask R-CNN configuration for the chosen model files
// (mask_rcnn_inception_v2_coco_2018_01_28).
confThreshold = 0.5;
maskThreshold = 0.3;
nmsThreshold = 0.5;
float scale = 1.; // not needed by this model; kept for the blobFromImage call
Scalar mean = { 0,0,0 }; // not needed by this model (no mean subtraction)
bool swapRB = true;
int inpWidth = 800;
int inpHeight = 800;
String modelPath = "../../data/testdata/dnn/mask_rcnn_inception_v2_coco_2018_01_28.pb";
String configPath = "../../data/testdata/dnn/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt";
String framework = "";
int backendId = cv::dnn::DNN_BACKEND_OPENCV;
int targetId = cv::dnn::DNN_TARGET_CPU;
String classesFile = "../../data/dnn/object_detection_classes_coco.txt";
String colorFile = "";
// Open file with classes names.
if (!classesFile.empty()) {
const std::string file = classesFile;
std::ifstream ifs(file.c_str());
if (!ifs.is_open())
CV_Error(Error::StsError, "File " + file + " not found");
std::string line;
classes.emplace_back("background"); // the class file lists only real objects, so prepend a background entry at index 0
while (std::getline(ifs, line)) {
classes.push_back(line);
}
}
// Fixed seed so the generated class colors are reproducible across runs.
std::srand(324);
if (!colorFile.empty()) {
// Read one "R G B" triple per line from the user-supplied color file.
const std::string& file = colorFile;
std::ifstream ifs(file.c_str());
if (!ifs.is_open())
CV_Error(Error::StsError, "File " + file + " not found");
std::string line;
while (std::getline(ifs, line)) {
std::istringstream colorStr(line.c_str());
Vec3b color;
for (int i = 0; i < 3 && !colorStr.eof(); ++i)
colorStr >> color[i];
colors.push_back(color);
}
}
else {
// No color file: generate one pseudo-random color per class, each
// averaged with the previous one to smooth the palette.
colors.emplace_back(Vec3b());
for (int i = 1; i < classes.size(); ++i) {
Vec3b color;
for (int j = 0; j < 3; ++j)
color[j] = (colors[i - 1][j] + rand() % 256) / 2;
colors.emplace_back(color);
}
}
CV_Assert(!modelPath.empty());
//! [Read and initialize network]
Net net = readNet(modelPath, configPath, framework);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
//! [Read and initialize network]
// Create a window
static const std::string kWinName = "MASK-RCNN in OpenCV";
namedWindow(kWinName, WINDOW_AUTOSIZE);
//! [Open a video file or an image file or a camera stream]
VideoCapture cap;
cap.open("../../data/image/dog.jpg"); // pascal voc sample image
if (!cap.isOpened()) {
std::cout << "VideoCapture open failed." << std::endl;
return 0;
}
//! [Open a video file or an image file or a camera stream]
// Process frames until a key is pressed.
Mat frame, blob;
while (waitKey(1) < 0) {
cap >> frame;
if (frame.empty()) {
// Source exhausted: keep the last result on screen until a key press.
waitKey();
break;
}
//! [Create a 4D blob from a frame]
blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, false);
//! [Create a 4D blob from a frame]
//! [Set input blob]
net.setInput(blob);
//! [Set input blob]
//! [Make forward pass]
std::vector<Mat> outputs; // [0] detections (scores/boxes), [1] masks
static std::vector<std::string> outputNames = { "detection_out_final" , "detection_masks" };
net.forward(outputs, outputNames);
//! [Make forward pass]
// Draw result
postprocess(frame, outputs);
// Put efficiency information.
std::vector<double> layersTimes;
double freq = getTickFrequency() / 1000;
double t = net.getPerfProfile(layersTimes) / freq;
std::string label = format("Inference time: %.2f ms", t);
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
imshow(kWinName, frame);
}
return 0;
}
catch (std::exception & e) {
std::cerr << e.what() << std::endl;
}
// Parses the two network outputs and draws boxes, class labels and instance
// masks onto `frame` (modified in place).
//
// outs[0] ("detection_out_final"): 1x1xNx7 float tensor; each row is
//   [batchId, classId, confidence, left, top, right, bottom], box
//   coordinates relative to the frame size (range [0,1]).
// outs[1] ("detection_masks"): 100 x numClasses x 15 x 15 float tensor;
//   row i holds the per-class 15x15 masks for detection row i of outs[0].
void postprocess(cv::Mat& frame, const std::vector<Mat>& outs)
{
if (frame.empty()) return;
Mat boxes = outs[0]; // 1x1xNx7 detections
Mat masks = outs[1]; // 100 x numClasses x 15 x 15 masks for the top-100 detections
int frameW = frame.cols;
int frameH = frame.rows;
std::vector<int> classIds;
std::vector<float> confidences;
std::vector<Rect> predBoxes;
// Original detection-row index of each kept box. Needed to look up the
// matching mask: rows below confThreshold are skipped here, so the index
// into the kept vectors does NOT equal the row index into `masks`.
std::vector<int> detRows;
// [batchId, classId, confidence, left, top, right, bottom] per row
for (int i = 0; i < boxes.size[2]; ++i) {
const float* box = boxes.ptr<float>(0, 0, i);
float score = box[2];
if (score > confThreshold) {
int classId = box[1];
// Scale relative coordinates to pixels, then clip to the frame.
int boxLeft = frameW * box[3];
int boxTop = frameH * box[4];
int boxRight = frameW * box[5];
int boxBottom = frameH * box[6];
cv::Rect rect{ cv::Point{ boxLeft,boxTop },cv::Point{ boxRight,boxBottom } };
rect &= cv::Rect({ 0,0 }, frame.size());
classIds.emplace_back(classId);
predBoxes.emplace_back(rect);
confidences.emplace_back(score);
detRows.emplace_back(i);
}
}
// NMS (added relative to the official sample) to avoid several boxes being
// generated for the same region.
std::vector<int> indices;
cv::dnn::NMSBoxes(predBoxes, confidences, confThreshold, nmsThreshold, indices);
// Draw the surviving detections.
for (size_t i = 0; i < indices.size(); ++i) {
int idx = indices[i];
Rect box = predBoxes[idx];
if (box.area() <= 0) continue; // box was clipped away entirely; nothing to draw
int classId = classIds[idx];
float conf = confidences[idx];
// Color keyed by class; +1 skips the background color at index 0.
Scalar color = colors[(classId + 1) % colors.size()];
//int colorInd = rand() % colors.size(); //generate different instance colors
//Scalar color = colors[colorInd];
// FIX: index `masks` by the ORIGINAL detection row (detRows[idx]), not by
// the loop counter over the NMS-filtered list — the old code drew the
// wrong instance's mask whenever detections were filtered or reordered.
Mat mask(masks.size[2], masks.size[3], CV_32F, masks.ptr<float>(detRows[idx], classId));
resize(mask, mask, box.size(), 0, 0, cv::INTER_LINEAR_EXACT);
mask = mask > maskThreshold; // binarize to an 8-bit on/off mask
// Blend the class color over the masked pixels only.
Mat coloredRoi;
addWeighted(frame(box), 0.3, color, 0.7, 0, coloredRoi);
coloredRoi.copyTo(frame(box), mask);
// Draw box and a "label: confidence" caption on a filled background.
rectangle(frame, box, Scalar(0, 255, 0));
std::string label = format("%.2f", conf);
if (!classes.empty()) {
// +1 because classes[0] is the prepended "background" entry; the assert
// must bound-check the index actually used below.
CV_Assert(classId + 1 < (int)classes.size());
label = classes[classId + 1] + ": " + label;
}
int baseLine;
cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
Rect labelRect{ box.tl() - cv::Point2i(0,baseLine + labelSize.height), labelSize + cv::Size{ 0,baseLine } };
rectangle(frame, labelRect, cv::Scalar::all(255), cv::FILLED);
putText(frame, label, box.tl() - cv::Point2i(0, baseLine), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar());
}
}
三、演示
修改代码,显示实例分割的结果
//Scalar color = colors[(classId +1) % colors.size()];
int colorInd = rand() % colors.size(); //generate different instance colors
Scalar color = colors[colorInd];
附带一张使用int backendId = cv::dnn::DNN_BACKEND_INFERENCE_ENGINE;
时,使用intel OpenVINO的opencv库测试结果,性能提高30%。