描述
最近遇到一個opencv中去除最小空洞以及最小連通域的算法。因爲要優化並集成到項目中,所以使用cuda對其進行加速。不過中間遇到一些問題亟待解決,而且這只是初版,裏面關於最佳線程數量的分配都還沒有優化。先貼上來,等有更好的方式再做修改。
opencv原算法.(這裏其實也是百度到的一個)
/**
 * Removes small connected regions or fills small holes in a single-channel
 * 8-bit binary image, using a sequential region-growing (flood fill) pass.
 *
 * @param Src          Input image, read through at<uchar>(); pixels are split
 *                     into the two classes by comparing against 10.
 * @param Dst          Output image. It is (re)allocated as CV_8UC1 with the size
 *                     of Src when its current size or type does not match.
 * @param AreaLimit    Regions with at most this many pixels are flipped.
 * @param CheckMode    1: remove small bright regions (they are set to 0);
 *                     any other value: fill small dark holes (set to 255).
 * @param NeihborMode  1: 8-neighbourhood; any other value: 4-neighbourhood.
 *
 * Label values used in the auxiliary map:
 *   0 = not visited, 1 = being visited, 2 = too small (will be flipped),
 *   3 = accepted or never a candidate.
 */
void Qimage2MatInteroperateGpu::removeSmallRegion(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode)
{
    int RemoveCount = 0;

    // Dst is written through at<uchar>() below, so it must be a CV_8UC1 image of
    // the same size as Src. create() is a no-op when it already matches.
    Dst.create(Src.size(), CV_8UC1);

    // Label map, same size as the input, initialised to 0 (= not visited).
    Mat PointLabel = Mat::zeros(Src.size(), CV_8UC1);
    if (CheckMode == 1)
    {
        // Removing small bright regions: dark background pixels are accepted up front.
        cout << "去除小連通域.";
        for (int row = 0; row < Src.rows; row++)
        {
            for (int col = 0; col < Src.cols; col++)
            {
                if (Src.at<uchar>(row, col) < 10)
                {
                    PointLabel.at<uchar>(row, col) = 3;
                }
            }
        }
    }
    else
    {
        // Filling holes: bright pixels are accepted up front.
        cout << "去除孔洞";
        for (int row = 0; row < Src.rows; row++)
        {
            for (int col = 0; col < Src.cols; col++)
            {
                if (Src.at<uchar>(row, col) > 10)
                {
                    PointLabel.at<uchar>(row, col) = 3;
                }
            }
        }
    }
    showMat(PointLabel,"remove dong");

    // Neighbour offsets: the 4-neighbourhood first, optionally extended to 8.
    vector<Point2i> NeihborPos;
    NeihborPos.push_back(Point2i(-1, 0));
    NeihborPos.push_back(Point2i(1, 0));
    NeihborPos.push_back(Point2i(0, -1));
    NeihborPos.push_back(Point2i(0, 1));
    if (NeihborMode == 1)
    {
        cout << "Neighbor mode: 8鄰域." << endl;
        NeihborPos.push_back(Point2i(-1, -1));
        NeihborPos.push_back(Point2i(-1, 1));
        NeihborPos.push_back(Point2i(1, -1));
        NeihborPos.push_back(Point2i(1, 1));
    }
    else cout << "Neighbor mode: 4鄰域." << endl;

    // Derive the count from the table itself so that an unexpected NeihborMode
    // value can never index past the offsets that were actually pushed.
    const int NeihborCount = static_cast<int>(NeihborPos.size());

    // Region growing from every pixel that is still unvisited.
    for (int i = 0; i < Src.rows; i++)
    {
        for (int j = 0; j < Src.cols; j++)
        {
            if (PointLabel.at<uchar>(i, j) != 0)
            {
                continue;
            }
            std::cout << "開始記錄不合格的點: " << i<<" "<<j << std::endl;

            // GrowBuffer doubles as the work queue and as the list of region pixels.
            vector<Point2i> GrowBuffer;
            GrowBuffer.push_back(Point2i(j, i)); // Point2i is (x, y) = (column, row)
            PointLabel.at<uchar>(i, j) = 1;

            // The buffer grows while it is being scanned; index-based iteration
            // (and copying the element) stays valid across reallocations.
            for (size_t z = 0; z < GrowBuffer.size(); z++)
            {
                const Point2i current = GrowBuffer[z];
                for (int q = 0; q < NeihborCount; q++)
                {
                    const int CurrX = current.x + NeihborPos[q].x;
                    const int CurrY = current.y + NeihborPos[q].y;
                    const bool inside = CurrX >= 0 && CurrX < Src.cols && CurrY >= 0 && CurrY < Src.rows;
                    if (inside && PointLabel.at<uchar>(CurrY, CurrX) == 0)
                    {
                        GrowBuffer.push_back(Point2i(CurrX, CurrY));
                        PointLabel.at<uchar>(CurrY, CurrX) = 1; // mark now to avoid queuing it twice
                    }
                }
            }
            std::cout << "計算獲取到的區域對象的像素點: " << GrowBuffer.size() << " -- (j i) " << i << " " << j << std::endl;

            // Signed comparison: a negative AreaLimit must not be converted to a
            // huge unsigned value (which would classify every region as small).
            int CheckResult = 0;
            if (static_cast<long long>(GrowBuffer.size()) > static_cast<long long>(AreaLimit))
            {
                CheckResult = 2; // large enough: label becomes 1 + 2 = 3 (accepted)
            }
            else
            {
                CheckResult = 1; // too small: label becomes 1 + 1 = 2 (to be flipped)
                RemoveCount++;
            }
            for (size_t z = 0; z < GrowBuffer.size(); z++)
            {
                PointLabel.at<uchar>(GrowBuffer[z].y, GrowBuffer[z].x) += CheckResult;
            }
        }
    }

    // Value written into flipped regions: 0 when removing bright regions, 255 when
    // filling holes. Written explicitly so that CheckMode values other than 0/1
    // (which take the hole-filling branch above) still get 255.
    const uchar FlipValue = (CheckMode == 1) ? 0 : 255;
    for (int i = 0; i < Src.rows; ++i)
    {
        for (int j = 0; j < Src.cols; ++j)
        {
            const uchar label = PointLabel.at<uchar>(i, j);
            if (label == 2)
            {
                Dst.at<uchar>(i, j) = FlipValue;
            }
            else if (label == 3)
            {
                Dst.at<uchar>(i, j) = Src.at<uchar>(i, j);
            }
        }
    }
    cout << RemoveCount << " objects removed." << endl;
}
cuda加速之後的
定義
// Host-side entry point of the CUDA implementation (defined later in this listing).
// Takes the same arguments as the CPU removeSmallRegion: input image, output image,
// area threshold, check mode (1 = remove small regions, otherwise fill holes) and
// neighbourhood mode (1 = 8-neighbourhood). Returns false when the input is rejected
// by the validation at the top of the definition, true otherwise.
__host__ bool removeSmallRegionGpu(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode);
/******************************************************/
// Function    : removeSmallRegionKernelGpu
// Description : kernel for the small-connected-region / small-hole removal.
// Parameters  : source              source image data (note: the source image must be a
//                                   binarized image obtained after grayscale conversion)
//             : Auxiliary           per-pixel check-state labels
//             : outImg              output image pixels
//             : outCalculateNumber  per-pixel record of the grown region size
//                                   (described by the author as a self-test aid)
//             : neiBorModeBuffer    neighbour offsets, neightborSize entries
//             : GrowBuffer          scratch buffer for region growing, GrowBufferSize entries
//             : width, height       image dimensions in pixels
//             : AreaLimit           regions larger than this are kept
//             : checkMode           1 = remove small regions, otherwise fill holes
//             : neiborMode          1 = use 8 neighbours, otherwise 4
// Returns     : none
/******************************************************/
__global__ void removeSmallRegionKernelGpu
(uchar* source, uchar* Auxiliary, uchar* outImg,
int *outCalculateNumber,
int2* neiBorModeBuffer, int neightborSize, int2 *GrowBuffer, int GrowBufferSize,
int width, int height,
int AreaLimit, int checkMode, int neiborMode
);
//實現的部分。這裏描述一下:因爲在核函數裏實在做不到 vector::push_back() 那樣的自增、自加,所以我就使用了一個全局的一維結構體 + 兩個變量來模擬 ...(其實大家對比上面opencv的寫法,就明白cuda這部分爲什麼這麼寫了,包括上面的參數列表部分。當然,裏面也有一些自測的部分,比如 int *outCalculateNumber 這個參數,希望不會造成誤解)
// Kernel body fragment: it uses the parameters of removeSmallRegionKernelGpu
// (source, Auxiliary, outImg, outCalculateNumber, neiBorModeBuffer, neightborSize,
// GrowBuffer, GrowBufferSize, width, height, AreaLimit, checkMode, neiborMode).
// One thread handles one pixel of a 2D launch.
//
// NOTE(review): GrowBuffer and Auxiliary are shared by every thread of the grid
// (each thread starts its own fill at GrowBuffer[0]), so concurrent flood fills
// race with each other. __syncthreads() only synchronises one block. The fixes
// below remove the out-of-bounds accesses and the skipped pixels, but a correct
// parallel result needs a different algorithm (e.g. label propagation) or
// per-thread scratch space — that cannot be done inside this interface.
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

// Valid pixels are [0, width) x [0, height). Threads outside the image do no
// work but must still reach the barriers, so they are masked instead of
// returning early.
const bool insideImage = (tidx < width) && (tidy < height);
const int offsetIndex = tidx + tidy * width; // only dereferenced when insideImage

if (insideImage)
{
    // Pre-label pixels that are never candidates.
    // ckNumbser is defined outside this listing; the final stage below treats
    // label 3 as "keep" — presumably ckNumbser == 3, TODO confirm.
    if (checkMode == 1)
    {
        // Removing small bright regions: dark background is accepted.
        if (source[offsetIndex] < 10)
        {
            Auxiliary[offsetIndex] = ckNumbser;
        }
    }
    else
    {
        // Filling holes: bright pixels are accepted.
        if (source[offsetIndex] > 10)
        {
            Auxiliary[offsetIndex] = ckNumbser;
        }
    }
}
// Block-level barrier only; it does not order writes made by other blocks.
__syncthreads();

if (insideImage)
{
    if (neightborSize < 4)
    {
        printf("neightborSize less 4 \n");
    }
    // 8 neighbours when neiborMode == 1, otherwise 4; never read more entries
    // than the neighbour table actually holds.
    int neihborCount = (neiborMode == 1) ? 8 : 4;
    if (neihborCount > neightborSize)
    {
        neihborCount = neightborSize;
    }

    // Region growing from this pixel if it is still unvisited.
    if (Auxiliary[offsetIndex] == 0 && GrowBufferSize > 0)
    {
        GrowBuffer[0] = int2{ tidx ,tidy };
        Auxiliary[offsetIndex] = 1; // being visited
        const int offSetLength = width * height;
        int GrowBUfferValied = 1;   // number of valid entries in GrowBuffer

        for (int z = 0; z < GrowBUfferValied; z++)
        {
            const int2 temp = GrowBuffer[z];
            for (int q = 0; q < neihborCount; q++)
            {
                const int2 neigborTemp = neiBorModeBuffer[q];
                const int currX = temp.x + neigborTemp.x;
                const int currY = temp.y + neigborTemp.y;
                if (currX >= 0 && currX < width && currY >= 0 && currY < height)
                {
                    // In-range coordinates always give an offset in
                    // [0, width*height), so pixel 0 is a valid neighbour too.
                    const int currxyOffset = currX + currY * width;
                    // Check the capacity BEFORE writing the new entry.
                    if (Auxiliary[currxyOffset] == 0 && GrowBUfferValied < GrowBufferSize)
                    {
                        GrowBuffer[GrowBUfferValied] = int2{ currX,currY };
                        Auxiliary[currxyOffset] = 1; // mark now to avoid queuing it twice
                        GrowBUfferValied++;
                    }
                }
            }
            printf("GrowBUfferValied++ %d \n", GrowBUfferValied);
            if (GrowBUfferValied > GrowBufferSize - 1)
            {
                printf("GrowBUfferValied size number is over \n");
                break;
            }
        }
        if (GrowBUfferValied > 20)
        {
            printf("get recRange is %d %d -> %d imgsize: %d\n", tidx, tidy, GrowBUfferValied, offSetLength);
        }
        // Record the region size found from this seed.
        outCalculateNumber[offsetIndex] = GrowBUfferValied;

        // 2 = region exceeds the limit (label becomes 3, kept);
        // 1 = region is small (label becomes 2, flipped).
        const int checkResult = (GrowBUfferValied > AreaLimit) ? 2 : 1;
        for (int z = 0; z < GrowBUfferValied; z++)
        {
            const int2 temp = GrowBuffer[z];
            if (temp.x >= 0 && temp.x < width && temp.y >= 0 && temp.y < height)
            {
                // Every in-range pixel is relabelled, including the first and
                // the last pixel of the image.
                Auxiliary[temp.x + temp.y * width] += checkResult;
            }
        }
    }

    // Flip regions that were judged too small; copy accepted pixels through.
    // 0 when removing bright regions, 255 otherwise (also for modes other than 0).
    const unsigned char flipValue = (checkMode == 1) ? 0 : 255;
    if (Auxiliary[offsetIndex] == 2)
    {
        outImg[offsetIndex] = flipValue;
    }
    else if (Auxiliary[offsetIndex] == 3)
    {
        outImg[offsetIndex] = source[offsetIndex];
    }
}
//printf("%d %d calculate end \n",tidx,tidy);
__syncthreads();
/// 本地函數實現部分.
/**
 * Host wrapper of the CUDA small-region / small-hole removal.
 *
 * Uploads Src, runs prereatmentAuxiliary and removeSmallRegionKernelGpu with
 * 32x32 thread blocks covering the image, and downloads the result into Dst.
 *
 * @param Src          Input image; must be single-channel, 8-bit.
 * @param Dst          Output image; reallocated as CV_8UC1 of Src's size when it
 *                     is empty or does not match.
 * @param AreaLimit    Forwarded to the kernel (region size threshold).
 * @param CheckMode    Forwarded to both kernels (1 = remove small regions).
 * @param NeihborMode  Forwarded to the kernel (1 = 8-neighbourhood).
 * @return false when Src has no data or is not a single-channel 8-bit image,
 *         true otherwise. CUDA failures are handled by HANDLE_ERROR.
 */
__host__ bool removeSmallRegionGpu(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode)
{
    if (Src.data == nullptr)
    {
        std::cout << "src is nullptr" << std::endl;
        return false;
    }

    const int imgWidth = Src.cols;
    const int imgHeight = Src.rows;
    const int channels = Src.channels();
    // The kernels index one byte per pixel, so anything but one channel is rejected
    // (the message already says "3 or more"; 2- and 4-channel inputs are invalid too).
    if (channels != 1)
    {
        std::cout << "src` channels is 3 or more,please convert 1 channel" << std::endl;
        return false;
    }
    if (Src.depth() != CV_8U)
    {
        std::cout << "src depth must be 8-bit unsigned" << std::endl;
        return false;
    }
    std::cout << "Src` channel is " << Src.channels() << std::endl;

    if (Dst.data == nullptr)
    {
        // An empty output image is created automatically as single-channel.
        std::cout << "dst data is empty, the process will creat it default" << std::endl;
    }
    if (Dst.data == nullptr || Dst.size() != Src.size() || Dst.type() != CV_8UC1)
    {
        Dst = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    }

    // A flat cudaMemcpy needs the rows to be packed; ROI views are not.
    const cv::Mat srcHost = Src.isContinuous() ? Src : Src.clone();

    const int imgSize = imgWidth * imgHeight * channels;
    const size_t imgBytes = sizeof(uchar) * static_cast<size_t>(imgSize);
    const size_t recordBytes = sizeof(int) * static_cast<size_t>(imgSize);

    // Device copy of the input.
    uchar* srcGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&srcGpu, imgBytes));
    HANDLE_ERROR(cudaMemcpy(srcGpu, srcHost.data, imgBytes, cudaMemcpyKind::cudaMemcpyHostToDevice));

    // Device output image, zero-initialised.
    uchar* DstGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&DstGpu, imgBytes));
    HANDLE_ERROR(cudaMemset(DstGpu, 0, imgBytes));

    // Device label map, zero-initialised (0 = not visited).
    uchar* AuxiliaryGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&AuxiliaryGpu, imgBytes));
    HANDLE_ERROR(cudaMemset(AuxiliaryGpu, 0, imgBytes));

    // Launch configuration: 32x32 threads per block, enough blocks to cover the image.
    //int Maxblocks = getMaxThreadNums();
    const int Maxblocks = 32; //1024 threads per block
    dim3 threadsPerBlock(Maxblocks, Maxblocks);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Neighbour offsets on the device: 4-neighbourhood first, then the diagonals.
    thrust::device_vector<int2> neightborBuffer;
    {
        neightborBuffer.push_back(int2{ -1,0 });
        neightborBuffer.push_back(int2{ 1, 0 });
        neightborBuffer.push_back(int2{ 0, -1 });
        neightborBuffer.push_back(int2{ 0, 1});
        neightborBuffer.push_back(int2{ -1,-1 });
        neightborBuffer.push_back(int2{ -1,1 });
        neightborBuffer.push_back(int2{ 1,-1 });
        neightborBuffer.push_back(int2{ 1,1 });
    }
    int2* neightBorBufferPtr = thrust::raw_pointer_cast(&neightborBuffer[0]);
    const int neightborSize = static_cast<int>(neightborBuffer.size());

    // Region-growing scratch buffer, one entry per pixel.
    thrust::device_vector<int2> grawBuffer;
    grawBuffer.resize(imgWidth * imgHeight);
    int2* grawBufferPtr = thrust::raw_pointer_cast(&grawBuffer[0]);
    const int grawBufferSize = static_cast<int>(grawBuffer.size());

    std::cout << "blocksPerGrid size: " << blocksPerGrid.x << " " << blocksPerGrid.y << std::endl;
    std::cout << "threadsPerBlock size: " << threadsPerBlock.x << " " << threadsPerBlock.y << std::endl;

    // Pre-label the pixels that do not need checking.
    prereatmentAuxiliary<<<blocksPerGrid, threadsPerBlock>>>(srcGpu, AuxiliaryGpu, imgWidth, imgHeight, CheckMode);
    HANDLE_ERROR(cudaGetLastError()); // launch errors are not reported by the launch itself

    cv::Mat auxiliaryTemp = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(auxiliaryTemp.data, AuxiliaryGpu, imgBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    cv::imshow("auxiliaryTemp", auxiliaryTemp);

    // Per-pixel debug record written by the kernel.
    int* recordBuffer = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&recordBuffer, recordBytes));
    HANDLE_ERROR(cudaMemset(recordBuffer, 0, recordBytes));

    // Region growing on the device.
    removeSmallRegionKernelGpu<<<blocksPerGrid, threadsPerBlock>>>(
        srcGpu, AuxiliaryGpu, DstGpu, recordBuffer,
        neightBorBufferPtr, neightborSize,
        grawBufferPtr, grawBufferSize,
        imgWidth, imgHeight,
        AreaLimit, CheckMode, NeihborMode);
    HANDLE_ERROR(cudaGetLastError());

    // Download and sum the debug record. The blocking cudaMemcpy also surfaces
    // any error raised while the kernel was executing.
    std::vector<int> cpuBuffer(static_cast<size_t>(imgSize), 0);
    HANDLE_ERROR(cudaMemcpy(cpuBuffer.data(), recordBuffer, recordBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    long long controlNumber = 0; // 64-bit: the sum can exceed 32 bits on large images
    for (size_t i = 0; i < cpuBuffer.size(); i++)
    {
        controlNumber += cpuBuffer[i];
    }
    std::cout << "最後總數: " << controlNumber << std::endl;

    // Download the result into a packed temporary, then copy into Dst so that a
    // non-continuous Dst (e.g. an ROI view) is filled row by row.
    cv::Mat dstHost(imgHeight, imgWidth, CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(dstHost.data, DstGpu, imgBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    dstHost.copyTo(Dst);

    cv::Mat OUTauxiliary = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(OUTauxiliary.data, AuxiliaryGpu, imgBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    cv::imshow("OUTauxiliary", OUTauxiliary);

    // Release the device buffers (the thrust vectors free themselves).
    HANDLE_ERROR(cudaFree(srcGpu));
    HANDLE_ERROR(cudaFree(DstGpu));
    HANDLE_ERROR(cudaFree(AuxiliaryGpu));
    HANDLE_ERROR(cudaFree(recordBuffer));
    return true;
}
////使用方式
// Usage example: loads the sample image, reduces it to a single channel and runs
// both the CPU and the CUDA implementation with AreaLimit = 100, CheckMode = 1
// (remove small regions) and NeihborMode = 1 (8-neighbourhood).
void test()
{
    cv::Mat Source = cv::imread(R"(..\\MatLabCuda\\img\\source\\remove_B.bmp)");
    if (Source.empty())
    {
        // imread returns an empty Mat when the file is missing or unreadable.
        std::cout << "failed to load the test image" << std::endl;
        return;
    }

    // Both implementations expect a single-channel image.
    cv::Mat SourceSignel;
    if (Source.channels() == 1)
    {
        SourceSignel = Source;
    }
    else
    {
        // Take channel 0, as the original per-pixel copy of temp[0] did.
        cv::extractChannel(Source, SourceSignel, 0);
    }

    // Both outputs are written one byte per pixel, so both must be CV_8UC1.
    cv::Mat outImg = cv::Mat::zeros(cv::Size(Source.cols, Source.rows), CV_8UC1);
    cv::Mat outImgGpu = cv::Mat::zeros(cv::Size(Source.cols, Source.rows), CV_8UC1);
    removeSmallRegion(SourceSignel, outImg, 100, 1, 1);
    removeSmallRegionGpu(SourceSignel, outImgGpu, 100, 1, 1);
}
以上就是整個過程,其實不需要我再介紹什麼了,上面的註釋以及一些過程的,都寫了,只要順着順序看,就基本明白我要做的事情了.