因爲不想什麼函數都自己寫設備核函數,看到opencv有對應的cuda版本的函數比如濾波,然而CUDA的NPP庫也提供了對應的濾波函數,我不知道哪個性能更高(當然肯定要比純CPU版本快,但我沒測試過)
一、cv::cuda
#include <stdio.h>
#include <opencv2\core\core.hpp>
#include <opencv2\core\cuda.hpp>
#include <opencv2\imgproc.hpp>
#include <opencv2\opencv.hpp>
#include <chrono>
#include <fstream>
#define SIZE 25
int main()
{
	// Load the test image as 8-bit grayscale; bail out early if it is missing
	// (the original crashed inside cv::resize on an empty Mat).
	cv::Mat ImageHost = cv::imread("E:\\CUDA\\imgs\\cv_cuda_testimg.png", cv::IMREAD_GRAYSCALE);
	if (ImageHost.empty())
	{
		std::cout << "Could not read the input image" << std::endl;
		return -1;
	}
	cv::Mat ImageHostArr[SIZE];
	cv::cuda::GpuMat ImageDev;
	cv::cuda::GpuMat ImageDevArr[SIZE];
	// One upload only; all GPU work below reuses the device copy.
	ImageDev.upload(ImageHost);
	// Build SIZE-1 scaled copies on the host and on the device (index 0 unused).
	for (int n = 1; n < SIZE; n++)
		cv::resize(ImageHost, ImageHostArr[n], cv::Size(), 0.5*n, 0.5*n, cv::INTER_LINEAR);
	for (int n = 1; n < SIZE; n++)
		cv::cuda::resize(ImageDev, ImageDevArr[n], cv::Size(), 0.5*n, 0.5*n, cv::INTER_LINEAR);
	cv::Mat Detected_EdgesHost[SIZE];
	cv::cuda::GpuMat Detected_EdgesDev[SIZE];
	std::ofstream File1, File2;
	File1.open("E:\\CUDA\\imgs\\canny_cpu.txt");
	File2.open("E:\\CUDA\\imgs\\canny_gpu.txt");
	std::cout << "Process started... \n" << std::endl;
	// CPU Canny, timed per image size.
	for (int n = 1; n < SIZE; n++) {
		auto start = std::chrono::high_resolution_clock::now();
		cv::Canny(ImageHostArr[n], Detected_EdgesHost[n], 2.0, 100.0, 3, false);
		auto finish = std::chrono::high_resolution_clock::now();
		std::chrono::duration<double> elapsed_time = finish - start;
		File1 << "Image Size: " << ImageHostArr[n].rows* ImageHostArr[n].cols << " " << "Elapsed Time: " << elapsed_time.count() * 1000 << " msecs" << "\n" << std::endl;
	}
	// GPU Canny through a reusable detector object; upload/download is
	// deliberately excluded from the timed region.
	cv::Ptr<cv::cuda::CannyEdgeDetector> canny_edg = cv::cuda::createCannyEdgeDetector(2.0, 100.0, 3, false);
	for (int n = 1; n < SIZE; n++) {
		auto start = std::chrono::high_resolution_clock::now();
		canny_edg->detect(ImageDevArr[n], Detected_EdgesDev[n]);
		auto finish = std::chrono::high_resolution_clock::now();
		std::chrono::duration<double> elapsed_time = finish - start;
		File2 << "Image Size: " << ImageDevArr[n].rows* ImageDevArr[n].cols << " " << "Elapsed Time: " << elapsed_time.count() * 1000 << " msecs" << "\n" << std::endl;
	}
	std::cout << "Process ended... \n" << std::endl;
	return 0;
}
我的電腦測出來:
Image Size: 49476 CPU Elapsed Time: 13.9905 msecs
Image Size: 198170 CPU Elapsed Time: 38.4235 msecs
Image Size: 446082 CPU Elapsed Time: 71.059 msecs
Image Size: 792680 CPU Elapsed Time: 103.162 msecs
Image Size: 1238230 CPU Elapsed Time: 141.263 msecs
Image Size: 1783530 CPU Elapsed Time: 165.636 msecs
Image Size: 2428048 CPU Elapsed Time: 195.356 msecs
Image Size: 3170720 CPU Elapsed Time: 246.407 msecs
Image Size: 4012344 CPU Elapsed Time: 300.643 msecs
Image Size: 4954250 CPU Elapsed Time: 334.725 msecs
Image Size: 5995374 CPU Elapsed Time: 367.368 msecs
Image Size: 7134120 CPU Elapsed Time: 422.822 msecs
Image Size: 8371818 CPU Elapsed Time: 468.351 msecs
Image Size: 9710330 CPU Elapsed Time: 546.653 msecs
Image Size: 11148060 CPU Elapsed Time: 589.476 msecs
Image Size: 12682880 CPU Elapsed Time: 617.778 msecs
Image Size: 14316652 CPU Elapsed Time: 682.61 msecs
Image Size: 16051770 CPU Elapsed Time: 784.524 msecs
Image Size: 17886106 CPU Elapsed Time: 802.988 msecs
Image Size: 19817000 CPU Elapsed Time: 829.102 msecs
Image Size: 21846846 CPU Elapsed Time: 912.721 msecs
Image Size: 23978570 CPU Elapsed Time: 954.053 msecs
Image Size: 26209512 CPU Elapsed Time: 978.438 msecs
Image Size: 28536480 CPU Elapsed Time: 1045.46 msecs
Image Size: 49476 GPU Elapsed Time: 1.8581 msecs
Image Size: 198170 GPU Elapsed Time: 2.1446 msecs
Image Size: 446082 GPU Elapsed Time: 3.8053 msecs
Image Size: 792680 GPU Elapsed Time: 4.8882 msecs
Image Size: 1238230 GPU Elapsed Time: 5.9607 msecs
Image Size: 1783530 GPU Elapsed Time: 6.7705 msecs
Image Size: 2428048 GPU Elapsed Time: 7.3428 msecs
Image Size: 3170720 GPU Elapsed Time: 8.3768 msecs
Image Size: 4012344 GPU Elapsed Time: 9.8166 msecs
Image Size: 4954250 GPU Elapsed Time: 12.5099 msecs
Image Size: 5995374 GPU Elapsed Time: 14.9313 msecs
Image Size: 7134120 GPU Elapsed Time: 17.6367 msecs
Image Size: 8371818 GPU Elapsed Time: 20.3713 msecs
Image Size: 9710330 GPU Elapsed Time: 23.8835 msecs
Image Size: 11148060 GPU Elapsed Time: 25.3751 msecs
Image Size: 12682880 GPU Elapsed Time: 28.7937 msecs
Image Size: 14316652 GPU Elapsed Time: 31.7389 msecs
Image Size: 16051770 GPU Elapsed Time: 35.7431 msecs
Image Size: 17886106 GPU Elapsed Time: 38.3026 msecs
Image Size: 19817000 GPU Elapsed Time: 39.8344 msecs
Image Size: 21846846 GPU Elapsed Time: 43.0583 msecs
Image Size: 23978570 GPU Elapsed Time: 45.6539 msecs
Image Size: 26209512 GPU Elapsed Time: 54.4576 msecs
Image Size: 28536480 GPU Elapsed Time: 49.9312 msecs
cv::cuda比cv::竟然快這麼多?!把兩個函數放一起:
// Second experiment: resize + Canny timed together in each iteration.
std::cout << "Process started... \n" << std::endl;
for (int n = 1; n < SIZE; n++) {
auto start = std::chrono::high_resolution_clock::now();
// CPU path: both resize and Canny are inside the timed region.
cv::resize(ImageHost, ImageHostArr[n], cv::Size(), 0.5*n, 0.5*n, cv::INTER_LINEAR);
cv::Canny(ImageHostArr[n], Detected_EdgesHost[n], 2.0, 100.0, 3, false);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = finish - start;
File1 << "Image Size: " << ImageHostArr[n].rows* ImageHostArr[n].cols << " " << "CPU Elapsed Time: " << elapsed_time.count() * 1000 << " msecs" << "\n" << std::endl;
}
// GPU path: the detector object is created once and reused across sizes.
cv::Ptr<cv::cuda::CannyEdgeDetector> canny_edg = cv::cuda::createCannyEdgeDetector(2.0, 100.0, 3, false);
for (int n = 1; n < SIZE; n++) {
auto start = std::chrono::high_resolution_clock::now();
// cuda::resize + detect on device data; no upload/download in the timed region.
cv::cuda::resize(ImageDev, ImageDevArr[n], cv::Size(), 0.5*n, 0.5*n, cv::INTER_LINEAR);
canny_edg->detect(ImageDevArr[n], Detected_EdgesDev[n]);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = finish - start;
File2 << "Image Size: " << ImageDevArr[n].rows* ImageDevArr[n].cols << " " << "GPU Elapsed Time: " << elapsed_time.count() * 1000 << " msecs" << "\n" << std::endl;
}
std::cout << "Process ended... \n" << std::endl;
Image Size: 49476 GPU Elapsed Time: 1.5971 msecs
Image Size: 198170 GPU Elapsed Time: 2.1869 msecs
Image Size: 446082 GPU Elapsed Time: 3.9316 msecs
Image Size: 792680 GPU Elapsed Time: 5.8947 msecs
Image Size: 1238230 GPU Elapsed Time: 6.8415 msecs
Image Size: 1783530 GPU Elapsed Time: 7.8679 msecs
Image Size: 2428048 GPU Elapsed Time: 8.4011 msecs
Image Size: 3170720 GPU Elapsed Time: 9.5377 msecs
Image Size: 4012344 GPU Elapsed Time: 11.3635 msecs
Image Size: 4954250 GPU Elapsed Time: 13.3181 msecs
Image Size: 5995374 GPU Elapsed Time: 16.5964 msecs
Image Size: 7134120 GPU Elapsed Time: 19.9122 msecs
Image Size: 8371818 GPU Elapsed Time: 22.7916 msecs
Image Size: 9710330 GPU Elapsed Time: 25.1661 msecs
Image Size: 11148060 GPU Elapsed Time: 28.3689 msecs
Image Size: 12682880 GPU Elapsed Time: 31.6261 msecs
Image Size: 14316652 GPU Elapsed Time: 34.7694 msecs
Image Size: 16051770 GPU Elapsed Time: 37.7313 msecs
Image Size: 17886106 GPU Elapsed Time: 39.5111 msecs
Image Size: 19817000 GPU Elapsed Time: 43.407 msecs
Image Size: 21846846 GPU Elapsed Time: 46.8648 msecs
Image Size: 23978570 GPU Elapsed Time: 47.9306 msecs
Image Size: 26209512 GPU Elapsed Time: 50.2719 msecs
Image Size: 28536480 GPU Elapsed Time: 53.922 msecs
Image Size: 49476 CPU Elapsed Time: 16.4558 msecs
Image Size: 198170 CPU Elapsed Time: 40.3942 msecs
Image Size: 446082 CPU Elapsed Time: 77.8448 msecs
Image Size: 792680 CPU Elapsed Time: 110.313 msecs
Image Size: 1238230 CPU Elapsed Time: 143.571 msecs
Image Size: 1783530 CPU Elapsed Time: 183.128 msecs
Image Size: 2428048 CPU Elapsed Time: 218.107 msecs
Image Size: 3170720 CPU Elapsed Time: 256.128 msecs
Image Size: 4012344 CPU Elapsed Time: 305.7 msecs
Image Size: 4954250 CPU Elapsed Time: 370.511 msecs
Image Size: 5995374 CPU Elapsed Time: 410.728 msecs
Image Size: 7134120 CPU Elapsed Time: 458.635 msecs
Image Size: 8371818 CPU Elapsed Time: 511.283 msecs
Image Size: 9710330 CPU Elapsed Time: 619.209 msecs
Image Size: 11148060 CPU Elapsed Time: 652.386 msecs
Image Size: 12682880 CPU Elapsed Time: 691.799 msecs
Image Size: 14316652 CPU Elapsed Time: 768.322 msecs
Image Size: 16051770 CPU Elapsed Time: 880.751 msecs
Image Size: 17886106 CPU Elapsed Time: 900.914 msecs
Image Size: 19817000 CPU Elapsed Time: 980.022 msecs
Image Size: 21846846 CPU Elapsed Time: 1037.32 msecs
Image Size: 23978570 CPU Elapsed Time: 1115.81 msecs
Image Size: 26209512 CPU Elapsed Time: 1123.15 msecs
Image Size: 28536480 CPU Elapsed Time: 1226.08 msecs
依舊是快很多的。但是不好意思發現算上cv::Mat與cv::cuda::gpuMat之間的上傳下載,如果只處理幾張圖片,cv::cuda總體是慢的:
// Third experiment: include the host<->device transfer cost in the GPU measurement.
int main()
{
cv::Mat ImageHost = cv::imread("E:\\CUDA\\imgs\\cv_cuda_testimg.png", cv::IMREAD_GRAYSCALE);
cv::Mat ImageHostArr[SIZE];
cv::Mat Detected_EdgesHost[SIZE];
std::ofstream File1, File2;
File1.open("E:\\CUDA\\imgs\\canny_cpu.txt");
File2.open("E:\\CUDA\\imgs\\canny_gpu.txt");
std::cout << "Process started... \n" << std::endl;
// CPU baseline: resize + Canny timed per image size.
for (int n = 1; n <SIZE; n++) {
auto start = std::chrono::high_resolution_clock::now();
cv::resize(ImageHost, ImageHostArr[n], cv::Size(), 0.5*n, 0.5*n, cv::INTER_LINEAR);
cv::Canny(ImageHostArr[n], Detected_EdgesHost[n], 2.0, 100.0, 3, false);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = finish - start;
File1 << "Image Size: " << ImageHostArr[n].rows* ImageHostArr[n].cols << " " << "CPU Elapsed Time: " << elapsed_time.count() * 1000 << " msecs" << "\n" << std::endl;
}
// Time the one-off GPU setup: GpuMat construction + the single host->device upload.
// This is the cost the post argues you should pay only once.
auto start2 = std::chrono::high_resolution_clock::now();
cv::cuda::GpuMat ImageDev;
cv::cuda::GpuMat ImageDevArr[SIZE];
ImageDev.upload(ImageHost);
cv::cuda::GpuMat Detected_EdgesDev[SIZE];
cv::Mat gpuresult[SIZE];
auto finish2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time2 = finish2 - start2;
File2 << "GPU Elapsed Time: " << elapsed_time2.count() * 1000 << " msecs" << "\n" << std::endl;
cv::Ptr<cv::cuda::CannyEdgeDetector> canny_edg = cv::cuda::createCannyEdgeDetector(2.0, 100.0, 3, false);
// GPU loop: resize + detect + device->host download all inside the timed region.
for (int n = 1; n <SIZE; n++) {
auto start = std::chrono::high_resolution_clock::now();
cv::cuda::resize(ImageDev, ImageDevArr[n], cv::Size(), 0.5*n, 0.5*n, cv::INTER_LINEAR);
canny_edg->detect(ImageDevArr[n], Detected_EdgesDev[n]);
(Detected_EdgesDev[n]).download(gpuresult[n]);
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = finish - start;
File2 << "Image Size: " << ImageDevArr[n].rows* ImageDevArr[n].cols << " " << "GPU Elapsed Time: " << elapsed_time.count() * 1000 << " msecs" << "\n" << std::endl;
}
std::cout << "Process ended... \n" << std::endl;
return 0;
}
看結果上傳花了很多時間:
CPU Elapsed Time: 16.2129 msecs
GPU Elapsed Time: 827.039 msecs
Image Size: 49476 GPU Elapsed Time: 1.3695 msecs
所以如果後續只處理一兩張圖,那就得不償失了。所以網上有人建議要使用cv::cuda,最好cv::Mat與cv::cuda::gpuMat之間只轉換一次,而不要頻繁的轉來轉去,否則cv::cuda函數節約下來的時間趕不上轉換消耗的時間,那總體就慢了。
Image Size: 49476 CPU Elapsed Time: 16.5117 msecs
Image Size: 198170 CPU Elapsed Time: 40.3025 msecs
Image Size: 446082 CPU Elapsed Time: 81.2121 msecs
Image Size: 792680 CPU Elapsed Time: 110.101 msecs
Image Size: 1238230 CPU Elapsed Time: 148.415 msecs
Image Size: 1783530 CPU Elapsed Time: 186.113 msecs
Image Size: 2428048 CPU Elapsed Time: 228.306 msecs
Image Size: 3170720 CPU Elapsed Time: 261.014 msecs
Image Size: 4012344 CPU Elapsed Time: 316.615 msecs
Image Size: 4954250 CPU Elapsed Time: 363.326 msecs
Image Size: 5995374 CPU Elapsed Time: 410.894 msecs
Image Size: 7134120 CPU Elapsed Time: 479.375 msecs
Image Size: 8371818 CPU Elapsed Time: 509.868 msecs
Image Size: 9710330 CPU Elapsed Time: 596.871 msecs
GPU Elapsed Time: 811.702 msecs
Image Size: 49476 GPU Elapsed Time: 1.5237 msecs
Image Size: 198170 GPU Elapsed Time: 2.2596 msecs
Image Size: 446082 GPU Elapsed Time: 3.7014 msecs
Image Size: 792680 GPU Elapsed Time: 5.3606 msecs
Image Size: 1238230 GPU Elapsed Time: 6.7137 msecs
Image Size: 1783530 GPU Elapsed Time: 7.9725 msecs
Image Size: 2428048 GPU Elapsed Time: 9.5008 msecs
Image Size: 3170720 GPU Elapsed Time: 11.3495 msecs
Image Size: 4012344 GPU Elapsed Time: 13.556 msecs
Image Size: 4954250 GPU Elapsed Time: 16.0509 msecs
Image Size: 5995374 GPU Elapsed Time: 19.5233 msecs
Image Size: 7134120 GPU Elapsed Time: 22.7719 msecs
Image Size: 8371818 GPU Elapsed Time: 26.4892 msecs
Image Size: 9710330 GPU Elapsed Time: 28.1691 msecs
我覺得cv::cuda真的只適合一次上傳下載,然後很多很多函數處理或者是很多張圖片的很多操作,這種情況。
的確是這樣。
二、cuda提供的NPP庫
暫時放棄
三、距離變換distanceTransform的CUDA版本
1、先直接用2020外文中提供的最優的JFA的距離變換方法,話說jump flood algorithm這個算法爲什麼在中國網站裏沒介紹,搜都搜不到原理,因爲看外文的還有點小地方不怎麼理解,就是我沒註釋的第一個函數中。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "helper_math.h"
#include "book.h"
#include <stdlib.h>
#include <stdio.h>
#include <opencv2\core\core.hpp>
#include <opencv2\core\cuda.hpp>
#include <opencv2\imgproc.hpp>
#include <opencv2\opencv.hpp>
#include <chrono>
#include <fstream>
#include <vector>
struct ipoint { int x, y; };
struct fpoint { float x, y; };
struct uipoint { unsigned int x, y; };
struct stack { unsigned int top, bot; };
/*這個部分涉及到jump flood算法原理(JFA),沒懂
input:g---當前鄰域點位置的水平、垂直方向的梯度幅值
或鄰域點與中心點的橫座標、縱座標位置差
a---原圖在當前鄰域點位置><=0.0的判斷
*/
/* Anti-aliased edge distance term (edtaa-style, part of the JFA pipeline).
 * input : g  gradient at the neighbour pixel, or the (dx,dy) offset from the
 *            neighbour to the centre pixel
 *         a  coverage value of the source image at the neighbour, in [0,1]
 * return: sub-pixel signed distance from the pixel centre to the edge
 */
__device__ float edgedf(fpoint g, float a)
{
	float df, glength, temp, al;
	if ((g.x == 0) || (g.y == 0))
	{
		// Degenerate or axis-aligned gradient: the edge is a straight
		// horizontal/vertical line and distance follows directly from coverage.
		df = 0.5f - a;
	}
	else
	{
		// Normalize the gradient.
		glength = sqrtf(g.x*g.x + g.y*g.y);
		if (glength > 0)
		{
			g.x = g.x / glength;
			g.y = g.y / glength;
		}
		// Mirror into the first octant so that 0 <= g.y <= g.x.
		g.x = fabs(g.x);
		g.y = fabs(g.y);
		if (g.x < g.y)
		{
			temp = g.x;
			g.x = g.y;
			g.y = temp;
		}
		// Coverage value at which the edge crosses a pixel corner.
		al = 0.5f*g.y / g.x;
		if (a < al)
		{
			// Use sqrtf (not sqrt) to keep the whole path in single precision
			// on the device; the original mixed float and double sqrt.
			df = 0.5f*(g.x + g.y) - sqrtf(2.0f*g.x*g.y*a);
		}
		else if (a < (1.0f - al))
		{
			df = (0.5f - a)*g.x;
		}
		else
		{
			df = -0.5f*(g.x + g.y) + sqrtf(2.0f*g.x*g.y*(1.0f - a));
		}
	}
	return df;
}
/*
input:from-----當前中心像素點位置
to-------當前鄰域點位置
grad-----當前鄰域點位置的梯度幅值
img------設備原圖
imgSize--設備原圖尺寸
return :計算當前鄰域點到中心點的XX距離(位置距離+另一種距離(涉及JFA原理))
*/
/*
input : from     centre pixel position
        to       neighbour (candidate site) position
        grad     gradient at the neighbour position
        img      device source image
        imgSize  device image dimensions
return: distance from the centre pixel to the edge through this site:
        spatial distance plus the sub-pixel edge term from edgedf()
*/
__device__ float dist(ipoint from, ipoint to, fpoint grad, uchar* img, uipoint imgSize)
{
// Invalid (background-marked) site: return a distance no in-image pixel can reach.
if (to.x < 0 || to.y < 0)
return imgSize.x*3.0f;
/* Read the source image value at the site position.
*/
int id = to.y*imgSize.x + to.x;
float a = img[id];
if (a > 1) a = 1.0f;// clamp coverage to [0,1]
if (a < 0) a = 0.0f;// NOTE(review): values come from a uchar image, so this clamp looks unreachable
if (a == 0) return imgSize.x*3.0f;// the site is background in the source image
// Spatial (Euclidean) distance between the site and the centre pixel.
float dx = to.x - from.x;
float dy = to.y - from.y;
float di = sqrtf(dx*dx + dy*dy);
float df;
if (di == 0.0f)
{
// The site is the centre pixel itself: only the sub-pixel edge term remains.
df = fmaxf(edgedf(grad, a), 0.0f);
}
else
{
// A distinct site: use the offset vector as the edge direction estimate.
df = edgedf({ dx,dy }, a);
}
return di + df;
}
/*計算梯度圖像grad和標記背景位置圖closest
input:img---------需要計算的圖像
imgSize-----圖像尺寸
output:closest----把背景區域置-1
grad-------梯度圖像,背景區域的梯度爲0,目標區域梯度幅值計算並歸一化
*/
/* Build the initial closest-site map and the normalized gradient image.
 * input : img      device source image (8-bit)
 *         imgSize  image dimensions
 * output: closest  (-1,-1) at background pixels, the pixel's own (x,y) at
 *                  foreground pixels
 *         grad     normalized gradient at interior foreground pixels, 0 elsewhere
 */
__global__ void setup(ipoint *closest, fpoint *grad, uchar *img, uipoint imgSize)
{
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	int idy = blockIdx.y*blockDim.y + threadIdx.y;
	// Bounds guard: makes the kernel safe for launch grids rounded up past the
	// image size (the original assumed the grid matched exactly).
	if (idx >= imgSize.x || idy >= imgSize.y) return;
	int id = idy*imgSize.x + idx;
#define SQRT2 1.4142136f
	if (img[id] > 0.0f)
	{
		// Foreground pixel: it is its own closest site.
		closest[id].x = idx;
		closest[id].y = idy;
		if (idx > 0 && idx < imgSize.x - 1 && idy>0 && idy < imgSize.y - 1)
		{
			// 3x3 gradient stencil (diagonal taps weight 1, axis taps weight
			// sqrt(2)), then normalize to unit length when non-zero.
			grad[id].x = -img[id - imgSize.x - 1] - SQRT2*img[id - 1] - img[id + imgSize.x - 1] + img[id - imgSize.x + 1]
				+ SQRT2*img[id + 1] + img[id + imgSize.x + 1];
			grad[id].y = -img[id - imgSize.x - 1] - SQRT2*img[id - imgSize.x] - img[id - imgSize.x + 1] + img[id + imgSize.x - 1]
				+ SQRT2*img[id + imgSize.x] + img[id + imgSize.x + 1];
			float g = grad[id].x*grad[id].x + grad[id].y*grad[id].y;
			if (g > 0.0f)
			{
				g = sqrtf(g);
				grad[id].x /= g;
				grad[id].y /= g;
			}
		}
		else
		{
			// Image border: no full stencil available.
			grad[id].x = 0;
			grad[id].y = 0;
		}
	}
	else
	{
		// Background pixel: mark "no site" and zero gradient.
		closest[id].x = -1;
		closest[id].y = -1;
		grad[id].x = 0;
		grad[id].y = 0;
	}
}
/*
input:closest----背景區域位置標記圖
grad-------梯度幅值圖
img--------設備原圖
imgSize----設備原圖尺寸
stepSize---設備原圖列位置
output:voronoi---XX距離圖像對應的位置圖像
out-------XX距離圖像
*/
/* One jump-flood propagation pass.
 * input : closest   current site map ((-1,-1) marks "no site yet")
 *         grad      gradient image from setup()
 *         img       device source image
 *         imgSize   image dimensions
 *         stepSize  jump distance for this pass (halved each iteration by JFA)
 * output: voronoi   updated site map (best site found for each pixel)
 *         out       best distance found for each pixel
 */
__global__ void propagateSites(ipoint *closest, ipoint *voronoi, fpoint *grad, uchar *img,
	float *out, uipoint imgSize, int stepSize)
{
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	int idy = blockIdx.y*blockDim.y + threadIdx.y;
	// Bounds guard: makes the kernel safe for launch grids rounded up past the
	// image size (the original assumed the grid matched exactly).
	if (idx >= imgSize.x || idy >= imgSize.y) return;
	int id = idy*imgSize.x + idx;
	// The pixel itself plus its 8 neighbours at distance stepSize.
	ipoint neighbors[9] = { { idx - stepSize,idy + stepSize },{ idx,idy + stepSize },
	{ idx + stepSize,idy + stepSize },{ idx - stepSize,idy },{ idx,idy },{ idx + stepSize,idy },
	{ idx - stepSize,idy - stepSize },{ idx,idy - stepSize },{ idx + stepSize,idy - stepSize }
	};
	// No in-image distance can reach 3x the image width; use it as "infinity".
	float bestDist = imgSize.x*3.0f;
	ipoint bestSite = { -1,-1 };
	// Keep the neighbour whose recorded site yields the smallest distance to
	// this pixel, and remember that site.
	for (int i = 0; i < 9; ++i)
	{
		ipoint n = neighbors[i];
		// Skip neighbours that fall outside the image.
		if (n.x >= imgSize.x || n.x < 0 || n.y >= imgSize.y || n.y < 0) continue;
		// Site currently recorded at the neighbour; (-1,-1) means none yet.
		ipoint nSite = closest[n.x + n.y*imgSize.x];
		// Background-marked neighbours contribute "infinity"; otherwise compute
		// the full distance through that site.
		float newDist = (nSite.x < 0) ? imgSize.x*3.0f : dist({ idx,idy },
			nSite, grad[nSite.x + nSite.y*imgSize.x], img, imgSize);
		if (newDist < bestDist)
		{
			bestDist = newDist;
			bestSite = nSite;
		}
	}
	voronoi[id] = bestSite;
	out[id] = bestDist;
}
/*
input:img----輸入圖像
size---輸入圖像的長寬
output:edt---距離變換後的圖像
*/
void JFA(float *edt, uchar *img, uipoint size)
{
uchar *dev_img = 0;
fpoint *dev_grad = 0;
float *dev_edt = 0;
ipoint *dev_closest = 0;
ipoint *dev_voronoi = 0;
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
cudaSetDevice(0);
cudaMalloc((void**)&dev_grad, size.x*size.y * sizeof(fpoint));
cudaMalloc((void**)&dev_edt, size.x*size.y * sizeof(float));
cudaMalloc((void**)&dev_closest, size.x*size.y * sizeof(ipoint));
cudaMalloc((void**)&dev_voronoi, size.x*size.y * sizeof(ipoint));
//爲原圖在device上分配空間,將數據拷貝到設備全局 1~2ms
cudaMalloc((void**)&dev_img, size.x*size.y * sizeof(uchar));
cudaMemcpy(dev_img, img, size.x*size.y * sizeof(uchar), cudaMemcpyHostToDevice);
dim3 block = { 8,8 };
dim3 grid = { size.x / 8,size.y / 8 };
//計算得到歸一化的梯度幅值圖像dev_grad和標記背景位置圖dev_closest
setup << <grid, block >> >(dev_closest, dev_grad, dev_img, size);
cudaDeviceSynchronize();
for (int i = size.x / 2; i > 0; i /= 2)
{
//計算以i爲間隔,每個像素點的XX距離圖以及最新的位置標記圖
propagateSites << <grid, block >> >(dev_closest, dev_voronoi, dev_grad, dev_img,
dev_edt, size, i);
/*將上次的背景區域位置標記圖dev_closest與剛剛核函數得到的
XX距離圖像對應的位置圖像dev_voronoi交換,即更新背景區域位置標記圖
*/
ipoint *tmp = dev_closest;
dev_closest = dev_voronoi;
dev_voronoi = tmp;
cudaDeviceSynchronize();
}
cudaMemcpy(edt, dev_edt, size.x*size.y * sizeof(float), cudaMemcpyDeviceToHost);
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("Time on GPU: %3.1f ms\n", elapsedTime);
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
cudaFree(dev_closest);
cudaFree(dev_grad);
cudaFree(dev_img);
cudaFree(dev_edt);
cudaFree(dev_voronoi);
return;
}
int main()
{
	char srclowfolder[400] = { 0 };
	for (int index = 0; index <= 56; index++)
	{
		sprintf(srclowfolder, "E:\\CUDA\\imgs\\jinxinginput\\101\\%d_lowbw.jpg", index);
		cv::Mat low_binary_A = cv::imread(srclowfolder, cv::IMREAD_UNCHANGED);
		// Skip missing/unreadable files instead of feeding an empty Mat to JFA.
		if (low_binary_A.empty())
		{
			std::cout << "Failed to load " << srclowfolder << std::endl;
			continue;
		}
		/*
		auto cpustart = std::chrono::high_resolution_clock::now();
		cv::Mat dist_image;
		distanceTransform(low_binary_A, dist_image, cv::DIST_L2, 3);
		auto cpufinish = std::chrono::high_resolution_clock::now();
		std::chrono::duration<double> elapsed_time = cpufinish - cpustart;
		std::cout << "CPU distanceTransform Elapsed Time: " << elapsed_time.count() * 1000 << " ms" << "\n" << std::endl;
		*/
		uipoint imgrealsize;
		imgrealsize.x = low_binary_A.cols;
		imgrealsize.y = low_binary_A.rows;
		// BUG FIX: JFA cudaMemcpy's the result into its first argument, so it
		// must point at a real host buffer; the original passed a null pointer.
		std::vector<float> dist_image_gpu(low_binary_A.total());
		JFA(dist_image_gpu.data(), low_binary_A.data, imgrealsize);
	}
	return 0;
}
我對比了一下,純CPU版本的distanceTransform耗時是2~3ms,而CUDA版本竟然要100+ms!!!我看了下,主要是for裏面對整幅圖每次以像素點stepsize附近9個鄰域點求距離最小值,stepsize越來越小,所有點的計算量越來越大
。很好理解,以圖像中央像素點爲例,第一次時是由這9個鄰域點計算出最小距離,並相當於將最小距離賦值給中央像素點;第二次時又計算新的9個鄰域點的最小距離....可以看到當stepsize很大時,有的點也許實際只用算2、3、4...個點,並不用算滿9個點。所以for後面的時間會比前一次for更長。
我以爲論文上的就是最優的,可是比cv::distanceTransform長太多了。我其實很想查到cv::distanceTransform詳細的原理,然後自己寫核函數。
2、按照https://www.cnblogs.com/yrm1160029237/p/11937744.html 中介紹的原理自己實現
找了幾個理論:https://blog.csdn.net/weixin_44128918/article/details/103754674?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.nonecase&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.nonecase 其實就是第一遍是走正Z字形,從左上角掃描到右下角,比較每個像素左、上點中的最小值點,加上自身像素值;第二遍走倒Z字形,從右下角掃描到左上,比較每個像素右、下點中的最小值,加上自身像素值;比較這兩次掃描的最小值作爲最終的distanceTranceform的結果(此種不是float)
開始時我想這樣設計,分析發現依賴很嚴重。後來想換一種依賴不這麼嚴重的方式:這種方式與上一種嚴格按原理來的那種方式結果一致。但是我測試了一下,這個版本距離變換CUDA總時間基本是3.3ms,也就是比CPU版本還是慢了近1ms。然後我測試了只計算核函數的時間是2.5ms,比CPU版本快。因爲這是我設計的第一版,然後圖像是500x1216,圖像大一些按這個思路應該會比CPU快得更明顯,而且我還沒有使用紋理內存以及CUDA調度方式邊複製變執行kernel,我覺得優化一下更快。剛剛又嘗試了下這個優化方向,但是我之前配置時以爲不用opengl相關的,就沒編譯,所以使用不了紋理內存...........
因爲還有一個膨脹函數也要放到GPU上,我測試了一下,CPU版本distanceTransform+dilate耗時是:
然後GPU版本distanceTransform+dilate的耗時是如下
這麼看這個未優化版本已經開始呈現優勢了。
不好意思,上面的距離變換的原理有誤,所以出來數據不對。然後我又重新按照這個C++版本:
/// Euclidean distance between the pixel coordinates (x1, y1) and (x2, y2).
float Distance(int x1, int y1, int x2, int y2)
{
	const float dx = float(x2 - x1);
	const float dy = float(y2 - y1);
	return sqrt(dx * dx + dy * dy);
}
/* Two-pass 3x3 chamfer distance transform, computed in place.
 * NOTE: `cv::Mat BinaryImage = input;` shares the pixel buffer with `input`,
 * so the caller's matrix is modified and `output` aliases the same data.
 * Forward pass scans top-left -> bottom-right over the causal neighbours
 * q1..q4; backward pass scans bottom-right -> top-left over the remaining four.
 * BUG FIX: in the forward pass, q3 computes the distance to the LEFT
 * neighbour (i, j-1) but the original added pRowNext[j] (the pixel itself);
 * it must add pRowNext[j - 1].
 */
void cpu_dt(cv::Mat input, cv::Mat &output)
{
	cv::Mat BinaryImage = input;
	uchar *pRowOne;
	uchar *pRowNext;
	float distance;
	float Mindis;
	for (int i = 1; i <= BinaryImage.rows - 2; i++)
	{
		pRowOne = BinaryImage.ptr<uchar>(i);
		for (int j = 1; j <= BinaryImage.cols - 1; j++)
		{
			pRowNext = BinaryImage.ptr<uchar>(i - 1);
			distance = Distance(i, j, i - 1, j - 1);//q1: up-left
			Mindis = min((float)pRowOne[j], distance + pRowNext[j - 1]);
			distance = Distance(i, j, i - 1, j);//q2: up
			Mindis = min(Mindis, distance + pRowNext[j]);
			pRowNext = BinaryImage.ptr<uchar>(i);
			distance = Distance(i, j, i, j - 1);//q3: left (fixed index)
			Mindis = min(Mindis, distance + pRowNext[j - 1]);
			pRowNext = BinaryImage.ptr<uchar>(i + 1);//q4: down-left
			distance = Distance(i, j, i + 1, j - 1);
			Mindis = min(Mindis, distance + pRowNext[j - 1]);
			pRowOne[j] = (uchar)round(Mindis);
		}
	}
	for (int i = BinaryImage.rows - 2; i > 0; i--)
	{
		pRowOne = BinaryImage.ptr<uchar>(i);
		for (int j = BinaryImage.cols - 2; j > 0; j--)
		{
			pRowNext = BinaryImage.ptr<uchar>(i + 1);
			distance = Distance(i, j, i + 1, j);//q1: down
			Mindis = min((float)pRowOne[j], distance + pRowNext[j]);
			distance = Distance(i, j, i + 1, j + 1);//q2: down-right
			Mindis = min(Mindis, distance + pRowNext[j + 1]);
			pRowNext = BinaryImage.ptr<uchar>(i);//q3: right
			distance = Distance(i, j, i, j + 1);
			Mindis = min(Mindis, distance + pRowNext[j + 1]);
			pRowNext = BinaryImage.ptr<uchar>(i - 1);//q4: up-right
			distance = Distance(i, j, i - 1, j + 1);
			Mindis = min(Mindis, distance + pRowNext[j + 1]);
			pRowOne[j] = (uchar)round(Mindis);
		}
	}
	output = BinaryImage;
}
// source: https://blog.csdn.net/qinzihangxj/article/details/105601108
實際這個版本也不完全對,因爲與opencv的distanceTransform還是有差別然後我又按另一個原理寫了一個CPU版本:
可以看到原理哪怕稍微一點點差別,結果出來也不一樣。但是我現在只要求基本和opencv結果一樣。
我用了第二個模板寫對應CUDA版本,但是說實話這個算法原理步驟依賴性太強,所以並行性很小。
寫了對應的一個CUDA版本:
#define GEN2 1.41421356
/*
500x1216的圖像,實際線程1214個:1個block,每個block裏607x2個線程
*/
/* In-place two-pass chamfer distance transform for a fixed 500x1216 image,
 * intended to run as a single thread block.
 * NOTE(review): the 500x1216 uchar shared array needs 0x94700 bytes and
 * exceeds the per-block shared-memory limit, so this kernel fails to build
 * (see the ptxas error quoted right after this listing).
 * NOTE(review): minvalue/upleftvalue/uprightvalue are uchar, so the
 * fractional part of GEN2 (1.414...) is truncated before the comparison -
 * confirm this integer chamfer weighting is intended.
 */
__global__ void distanceTransform(uchar *bwinput, uisize sizes, uchar *dtimg)
{
__shared__ uchar tmpmin[500][1216];
int idx = threadIdx.y*blockDim.x + threadIdx.x;
//first/last rows and columns need no processing - fill the buffer with 0 first
int tid = idx;
int r = tid / sizes.cols;
int c = tid % sizes.cols;
while (r < sizes.rows && c<sizes.cols)
{
tmpmin[r][c] = 0;
tid += (blockDim.x*blockDim.y);
r = tid / sizes.cols;
c = tid % sizes.cols;
}
//forward scan over every pixel except the border rows/columns
for (int r = 1; r != sizes.rows - 1; r++)
{
int therow = r*sizes.cols;
int centerid = therow + 1 + idx;//centre pixel index handled by this thread
int lastrow = therow - sizes.cols;
int upleftid = lastrow +idx;
int upid = lastrow + idx + 1;
int uprightid = lastrow + idx + 2;
uchar minvalue = bwinput[centerid];
uchar upleftvalue = GEN2 + bwinput[upleftid];
if (minvalue > upleftvalue)
{
minvalue = upleftvalue;
}
uchar upvalue = 1 + bwinput[upid];
if (minvalue > upvalue)
{
minvalue = upvalue;
}
uchar uprightvalue = GEN2 + bwinput[uprightid];
if (minvalue > uprightvalue)
{
minvalue = uprightvalue;
}
tmpmin[r][idx+1] = minvalue;//bwinput[centerid]=minvalue;
__syncthreads();
// the left-neighbour dependency is inherently sequential, so one thread
// sweeps the whole row while the others wait
if (idx == 0)
{
for (int c = 1; c != sizes.cols - 1; c++)
{
int globalid = r*sizes.cols + c;
uchar leftvalue = 1 + bwinput[globalid - 1];
if (tmpmin[r][c] > leftvalue)
{
tmpmin[r][c] = leftvalue;
}
}
}
__syncthreads();
}
//backward scan over every pixel except the border rows/columns
for (int r = sizes.rows - 2; r != 0; r--)
{
uchar minvalue = tmpmin[r][idx + 1];
uchar upleftvalue = GEN2 + tmpmin[r+1][idx];
if (minvalue > upleftvalue)
{
minvalue = upleftvalue;
}
uchar upvalue = 1 + tmpmin[r + 1][idx+1];
if (minvalue > upvalue)
{
minvalue = upvalue;
}
uchar uprightvalue = GEN2 + tmpmin[r + 1][idx + 2];
if (minvalue > uprightvalue)
{
minvalue = uprightvalue;
}
tmpmin[r][idx + 1] = minvalue;//bwinput[centerid]=minvalue;
__syncthreads();
// right-neighbour sweep, again serialized on a single thread
if (idx == 0)
{
for (int c = sizes.cols - 2; c != 0; c--)
{
float leftvalue = 1 + tmpmin[r][c+1];
float tmpminvalue = tmpmin[r][c];
if (tmpminvalue > leftvalue)
{
tmpmin[r][c] = leftvalue;
}
}
}
__syncthreads();
}
//all threads cooperate to copy the shared buffer to the global output
tid = idx;
r = tid / sizes.cols;
c = tid % sizes.cols;
while (r < sizes.rows && c<sizes.cols)
{
dtimg[tid] = tmpmin[r][c];
tid += (blockDim.x*blockDim.y);
r = tid / sizes.cols;
c = tid % sizes.cols;
}
}
CUDACOMPILE : ptxas error : Entry function '_Z17distanceTransformPh6uisizeS_' uses too much shared data (0x94700 bytes, 0xc000 max)
但是我的硬件資源限制了,跑不起來,共享內存超出了限制。然後我又重寫了一個,
#define GEN2 1.41421356
/// Euclidean distance between the pixel coordinates (x1, y1) and (x2, y2).
float Distance(int x1, int y1, int x2, int y2)
{
	const float dx = float(x2 - x1);
	const float dy = float(y2 - y1);
	return sqrt(dx * dx + dy * dy);
}
/* Author's second CPU reference: two-pass 3x3 chamfer distance transform.
 * Forward pass reads the uchar `input`, writes float partial minima into
 * `output`, and also writes the truncated int back into `input` so later
 * pixels of the same pass see updated values; the backward pass then refines
 * `output` in place using float values only.
 * NOTE(review): assumes `output` is a preallocated CV_32F matrix of the same
 * size as `input` (it is written via ptr<float> before ever being read) -
 * confirm at the call site.
 * Border rows/columns are left untouched.
 */
void cpu_dt_mine(cv::Mat input, cv::Mat &output)
{
for (int i = 1; i < input.rows - 1; i++)
{
for (int j = 1; j < input.cols - 1; j++)
{
float tmpmin = input.ptr<uchar>(i)[j];
// left neighbour, chamfer weight 1
float left = 1 + input.ptr<uchar>(i)[j - 1];
if (tmpmin > left)
{
tmpmin = left;
}
// up neighbour, weight 1
float up = 1 + input.ptr<uchar>(i - 1)[j];
if (tmpmin > up)
{
tmpmin = up;
}
// diagonal neighbours, weight sqrt(2)
float upleft = GEN2 + input.ptr<uchar>(i - 1)[j - 1];
if (tmpmin > upleft)
{
tmpmin = upleft;
}
float upright = GEN2 + input.ptr<uchar>(i - 1)[j + 1];
if (tmpmin > upright)
{
tmpmin = upright;
}
output.ptr<float>(i)[j] = tmpmin;
// feed the (truncated) minimum back so the scan propagates within the pass
input.ptr<uchar>(i)[j] = int(tmpmin);
}
}
for (int i = input.rows - 2; i >0; i--)
{
for (int j = input.cols - 2; j >0; j--)
{
float tmpmin = output.ptr<float>(i)[j];
// right neighbour, weight 1
float left = 1 + output.ptr<float>(i)[j + 1];
if (tmpmin > left)
{
tmpmin = left;
}
// down neighbour, weight 1
float up = 1 + output.ptr<float>(i + 1)[j];
if (tmpmin > up)
{
tmpmin = up;
}
// diagonal neighbours, weight sqrt(2)
float upleft = GEN2 + output.ptr<float>(i + 1)[j - 1];
if (tmpmin > upleft)
{
tmpmin = upleft;
}
float upright = GEN2 + output.ptr<float>(i + 1)[j + 1];
if (tmpmin > upright)
{
tmpmin = upright;
}
output.ptr<float>(i)[j] = tmpmin;
}
}
}
/* Two-pass 3x3 chamfer distance transform, computed in place (second copy of
 * the reference implementation quoted earlier in the post).
 * NOTE: `cv::Mat BinaryImage = input;` shares the pixel buffer with `input`,
 * so the caller's matrix is modified and `output` aliases the same data.
 * BUG FIX: in the forward pass, q3 computes the distance to the LEFT
 * neighbour (i, j-1) but the original added pRowNext[j] (the pixel itself);
 * it must add pRowNext[j - 1].
 */
void cpu_dt(cv::Mat input, cv::Mat &output)
{
	cv::Mat BinaryImage = input;
	uchar *pRowOne;
	uchar *pRowNext;
	float distance;
	float Mindis;
	for (int i = 1; i <= BinaryImage.rows - 2; i++)
	{
		pRowOne = BinaryImage.ptr<uchar>(i);
		for (int j = 1; j <= BinaryImage.cols - 1; j++)
		{
			pRowNext = BinaryImage.ptr<uchar>(i - 1);
			distance = Distance(i, j, i - 1, j - 1);//q1: up-left
			Mindis = min((float)pRowOne[j], distance + pRowNext[j - 1]);
			distance = Distance(i, j, i - 1, j);//q2: up
			Mindis = min(Mindis, distance + pRowNext[j]);
			pRowNext = BinaryImage.ptr<uchar>(i);
			distance = Distance(i, j, i, j - 1);//q3: left (fixed index)
			Mindis = min(Mindis, distance + pRowNext[j - 1]);
			pRowNext = BinaryImage.ptr<uchar>(i + 1);//q4: down-left
			distance = Distance(i, j, i + 1, j - 1);
			Mindis = min(Mindis, distance + pRowNext[j - 1]);
			pRowOne[j] = (uchar)round(Mindis);
		}
	}
	for (int i = BinaryImage.rows - 2; i > 0; i--)
	{
		pRowOne = BinaryImage.ptr<uchar>(i);
		for (int j = BinaryImage.cols - 2; j > 0; j--)
		{
			pRowNext = BinaryImage.ptr<uchar>(i + 1);
			distance = Distance(i, j, i + 1, j);//q1: down
			Mindis = min((float)pRowOne[j], distance + pRowNext[j]);
			distance = Distance(i, j, i + 1, j + 1);//q2: down-right
			Mindis = min(Mindis, distance + pRowNext[j + 1]);
			pRowNext = BinaryImage.ptr<uchar>(i);//q3: right
			distance = Distance(i, j, i, j + 1);
			Mindis = min(Mindis, distance + pRowNext[j + 1]);
			pRowNext = BinaryImage.ptr<uchar>(i - 1);//q4: up-right
			distance = Distance(i, j, i - 1, j + 1);
			Mindis = min(Mindis, distance + pRowNext[j + 1]);
			pRowOne[j] = (uchar)round(Mindis);
		}
	}
	output = BinaryImage;
}
// source: https://blog.csdn.net/qinzihangxj/article/details/105601108
////////////////////////////////////////////////////////////////////
/*硬件限制,共享內存不夠
dim3 threads(2, 607);
distanceTransform << <1, threads >> >(dev_img, imgsizes, dev_closest);
500x1216的圖像,實際線程1214個:1個block,每個block裏607x2個線程
*/
/*
__global__ void distanceTransform(uchar *bwinput, uisize sizes, uchar *dtimg)
{
__shared__ float tmpmin[500][1216];
int idx = threadIdx.y*blockDim.x + threadIdx.x;
//首尾兩行、首尾兩列 不用處理 直接填0
int tid = idx;
int r = tid / sizes.cols;
int c = tid % sizes.cols;
while (r < sizes.rows && c<sizes.cols)
{
tmpmin[r][c] = 0;
tid += (blockDim.x*blockDim.y);
r = tid / sizes.cols;
c = tid % sizes.cols;
}
//除了首尾兩行、首尾兩列的都要處理,先前向掃描
for (int r = 1; r != sizes.rows - 1; r++)
{
int therow = r*sizes.cols;
int centerid = therow + 1 + idx;//得到中心點的位置,每個thread
int lastrow = therow - sizes.cols;
int upleftid = lastrow +idx;
int upid = lastrow + idx + 1;
int uprightid = lastrow + idx + 2;
float minvalue = bwinput[centerid];
float upleftvalue = GEN2 + bwinput[upleftid];
if (minvalue > upleftvalue)
{
minvalue = upleftvalue;
}
float upvalue = 1 + bwinput[upid];
if (minvalue > upvalue)
{
minvalue = upvalue;
}
float uprightvalue = GEN2 + bwinput[uprightid];
if (minvalue > uprightvalue)
{
minvalue = uprightvalue;
}
tmpmin[r][idx+1] = minvalue;//bwinput[centerid]=minvalue;
__syncthreads();
if (idx == 0)
{
for (int c = 1; c != sizes.cols - 1; c++)
{
int globalid = r*sizes.cols + c;
float leftvalue = 1 + bwinput[globalid - 1];
if (tmpmin[r][c] > leftvalue)
{
tmpmin[r][c] = leftvalue;
}
}
}
__syncthreads();
}
//除了首尾兩行、首尾兩列的都要處理,反向掃描
for (int r = sizes.rows - 2; r != 0; r--)
{
float minvalue = tmpmin[r][idx + 1];
float upleftvalue = GEN2 + tmpmin[r+1][idx];
if (minvalue > upleftvalue)
{
minvalue = upleftvalue;
}
float upvalue = 1 + tmpmin[r + 1][idx+1];
if (minvalue > upvalue)
{
minvalue = upvalue;
}
float uprightvalue = GEN2 + tmpmin[r + 1][idx + 2];
if (minvalue > uprightvalue)
{
minvalue = uprightvalue;
}
tmpmin[r][idx + 1] = minvalue;//bwinput[centerid]=minvalue;
__syncthreads();
if (idx == 0)
{
for (int c = sizes.cols - 2; c != 0; c--)
{
float leftvalue = 1 + tmpmin[r][c+1];
float tmpminvalue = tmpmin[r][c];
if (tmpminvalue > leftvalue)
{
tmpmin[r][c] = leftvalue;
}
}
}
__syncthreads();
}
//所有線程合作將共享變量的值填充到全局變量
tid = idx;
r = tid / sizes.cols;
c = tid % sizes.cols;
while (r < sizes.rows && c<sizes.cols)
{
dtimg[tid] = tmpmin[r][c];
tid += (blockDim.x*blockDim.y);
r = tid / sizes.cols;
c = tid % sizes.cols;
}
}
*/
//500x1216的圖像,實際線程1214個:1個block,每個block裏607x2個線程
/* Two-pass chamfer distance transform writing float distances into dtimg,
 * designed to run as a single thread block (launch comment above mentions a
 * 500x1216 image and 1214 worker threads).
 * NOTE(review): idx uses only threadIdx.x; if the block really is 2-D
 * (607x2 as in the comment), threads sharing threadIdx.x would repeat and
 * race on the same work - confirm the actual launch configuration.
 * NOTE(review): tmpmin has 1214 entries but the scan loops run threadid up
 * to sizes.cols - 1 (1215 for a 1216-wide image), writing one past the
 * array and reading past the row end; the bound likely should be
 * sizes.cols - 2. Confirm against the intended geometry.
 */
__global__ void distanceTransformgpu(uchar *bwinput, uisize sizes, float *dtimg)
{
__shared__ float tmpmin[1214];
int idx = threadIdx.x;
//first/last rows and columns need no processing - write 0 to them directly
int tid = idx;
float sidevalue = 0.0;
while (tid<sizes.cols)
{
// top and bottom border rows
dtimg[tid] = sidevalue;
dtimg[(sizes.rows - 1)*sizes.cols + tid] = sidevalue;
tid += (blockDim.x*blockDim.y);
}
tid = idx;
if (tid < sizes.rows)
{
// left border column
dtimg[tid*sizes.cols] = sidevalue;
}
if ((tid >= sizes.rows) && (tid < 2 * sizes.rows))
{
// right border column
dtimg[(tid - sizes.rows + 2)*sizes.cols - 1] = sidevalue;
}
////////////////////////////////////forward scan, one row at a time
for (int r = 1; r != sizes.rows - 1; r++)
{
int therow = r*sizes.cols;
int threadid = idx;
while (threadid < sizes.cols)
{
int centerid = therow + 1 + threadid;//centre pixel index for this thread
int lastrow = therow - sizes.cols;
int upleftid = lastrow + threadid;
int upid = lastrow + threadid + 1;
int uprightid = lastrow + threadid + 2;
// minimum of centre and the three causal neighbours in the row above
float minvalue = bwinput[centerid];
float upleftvalue = GEN2 + bwinput[upleftid];
if (minvalue > upleftvalue)
{
minvalue = upleftvalue;
}
float upvalue = 1 + bwinput[upid];
if (minvalue > upvalue)
{
minvalue = upvalue;
}
float uprightvalue = GEN2 + bwinput[uprightid];
if (minvalue > uprightvalue)
{
minvalue = uprightvalue;
}
tmpmin[threadid] = minvalue;
threadid += (blockDim.x);
}
__syncthreads();
// the left-neighbour dependency is sequential: thread 0 sweeps the row,
// combining the shared partial minima with the already-final dtimg values
if (idx == 0)
{
for (int c = 1; c != sizes.cols - 1; c++)
{
int globalid = r*sizes.cols + c;
float leftvalue = 1 + dtimg[globalid - 1];
if (tmpmin[c-1] > leftvalue)
{
dtimg[globalid] = leftvalue;
}
else
{
dtimg[globalid] = tmpmin[c - 1];
}
}
}
__syncthreads();
}
///////////////////////////////////////////backward scan
for (int r = sizes.rows - 2; r != 0; r--)
{
int therow = r*sizes.cols;
int threadid = idx;
while (threadid < sizes.cols)
{
int centerid = therow + 1 + threadid;//centre pixel index for this thread
int lastrow = therow + sizes.cols;
int upleftid = lastrow + threadid;
int upid = lastrow + threadid + 1;
int uprightid = lastrow + threadid + 2;
// minimum of centre and the three neighbours in the row below
float minvalue = dtimg[centerid];
float upleftvalue = GEN2 + dtimg[upleftid];
if (minvalue > upleftvalue)
{
minvalue = upleftvalue;
}
float upvalue = 1 + dtimg[upid];
if (minvalue > upvalue)
{
minvalue = upvalue;
}
float uprightvalue = GEN2 + dtimg[uprightid];
if (minvalue > uprightvalue)
{
minvalue = uprightvalue;
}
tmpmin[threadid] = minvalue;
threadid += (blockDim.x);
}
__syncthreads();
// right-neighbour sweep, again serialized on thread 0
if (idx == 0)
{
for (int c = sizes.cols - 2; c != 0; c--)
{
int globalid = r*sizes.cols + c;
float leftvalue = 1 + dtimg[globalid + 1];
float tmpminvalue = tmpmin[c-1];
if (tmpminvalue > leftvalue)
{
dtimg[globalid] = leftvalue;
}
else
{
dtimg[globalid] = tmpminvalue;
}
}
}
__syncthreads();
}
//////////////////////////////////////////////////////
}
// Host driver: load a binary image, run the single-block GPU distance
// transform and a GPU dilation, and time the whole GPU section with CUDA events.
int main()
{
// Path buffer for the input image file name.
char srclowfolder[400] = { 0 };
// 5x5 cross structuring element (prepared for the dilation comparison).
cv::Mat element_cross55 = cv::getStructuringElement(cv::MORPH_CROSS, cv::Size(5, 5));
// Single test image (index 3 only).
for (int index = 3; index <= 3; index++)
{
sprintf(srclowfolder, "E:\\CUDA\\imgs\\jinxinginput\\101\\%d_lowbw.jpg", index);
// NOTE(review): IMREAD_UNCHANGED can return a 3-channel Mat for a JPEG;
// the single-channel uchar indexing below assumes 1 channel -- confirm.
cv::Mat low_binary_A = cv::imread(srclowfolder, cv::IMREAD_UNCHANGED);
// Re-binarize to a strict 0/255 image (JPEG compression blurs the levels).
for (int r = 0; r != low_binary_A.rows; r++)
{
for (int c = 0; c != low_binary_A.cols; c++)
{
if (low_binary_A.ptr<uchar>(r)[c] > 200)
{
low_binary_A.ptr<uchar>(r)[c] = 255;
}
else
{
low_binary_A.ptr<uchar>(r)[c] = 0;
}
}
}
// Disabled debug block: prints a patch of the input and compares
// cv::distanceTransform against two hand-written CPU variants.
/*
cv::Mat realimg(low_binary_A.rows, low_binary_A.cols, CV_8UC1, cv::Scalar(0));
for (int r = 0; r != low_binary_A.rows; r++)
{
for (int c = 0; c != low_binary_A.cols; c++)
{
if (low_binary_A.ptr<uchar>(r)[c] > 0)
{
realimg.ptr<uchar>(r)[c] = 1;
}
}
}
std::cout << "原圖" << std::endl;
for (int r = 152; r != 180; r++)
{
for (int c = 430; c != 458; c++)
{
std::cout << int(realimg.ptr<uchar>(r)[c]) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl << std::endl;
std::cout << "cv...." << std::endl;
cv::Mat dist_image;
distanceTransform(low_binary_A, dist_image, cv::DIST_L2, 3);
for (int r = 152; r != 180; r++)
{
for (int c = 430; c != 458; c++)
{
std::cout << int(dist_image.ptr<float>(r)[c]) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl << std::endl;
cv::Mat dtcpu(low_binary_A.rows, low_binary_A.cols, CV_8UC1, cv::Scalar(0));
cpu_dt(low_binary_A, dtcpu);
std::cout << "cpu1...." << std::endl;
for (int r = 152; r != 180; r++)
{
for (int c = 430; c != 458; c++)
{
std::cout << int(dtcpu.ptr<uchar>(r)[c]) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl << std::endl;
dtcpu.release();
cv::Mat dt2cpu(low_binary_A.rows, low_binary_A.cols, CV_32FC1, cv::Scalar(0));
cpu_dt_mine(low_binary_A, dt2cpu);
std::cout << "cpu2...." << std::endl;
for (int r = 152; r != 180; r++)
{
for (int c = 430; c != 458; c++)
{
std::cout << int(dt2cpu.ptr<float>(r)[c]) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl << std::endl;
*/
uisize imgsizes;
imgsizes.rows = low_binary_A.rows;
imgsizes.cols = low_binary_A.cols;
uchar *dev_img = 0;
float *dev_closest = 0;
// CUDA events bracket the WHOLE GPU section, including cudaMalloc and the
// host<->device copies -- so the measured time overstates pure kernel time.
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
cudaSetDevice(0);
cudaMalloc((void**)&dev_img, imgsizes.rows*imgsizes.cols * sizeof(uchar));
cudaMalloc((void**)&dev_closest, imgsizes.rows*imgsizes.cols * sizeof(float));
// NOTE(review): low_binary_A.data is assumed continuous here (true for a
// freshly-loaded Mat) and 0/255-valued after the threshold above.
cudaMemcpy(dev_img, low_binary_A.data, imgsizes.rows*imgsizes.cols * sizeof(uchar), cudaMemcpyHostToDevice);
// Single-block launch: the kernel synchronizes with __syncthreads() only,
// so it must NOT be spread over multiple blocks.
distanceTransformgpu << <1, 1024 >> >(dev_img, imgsizes, dev_closest);
cv::Mat deviceDTresult(low_binary_A.rows, low_binary_A.cols, CV_32FC1);
// cudaMemcpy is synchronous, so the kernel has completed before the copy
// returns; the cudaDeviceSynchronize below is redundant but harmless.
cudaMemcpy(deviceDTresult.data, dev_closest, imgsizes.rows*imgsizes.cols * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
std::cout << "gpu result...." << std::endl;
// Print the same patch the CPU variants print, for visual comparison.
for (int r = 152; r != 180; r++)
{
for (int c = 430; c != 458; c++)
{
std::cout << int(deviceDTresult.ptr<float>(r)[c]) << " ";
}
std::cout << std::endl;
}
// GPU dilation with the 5x5 cross element (kernel defined elsewhere).
uchar *dev_dilate = 0;
cudaMalloc((void**)&dev_dilate, imgsizes.rows*imgsizes.cols * sizeof(uchar));
dim3 grids(2, 500);
dilatecross5 << <grids, 608 >> >(dev_img, dev_dilate);
cudaDeviceSynchronize();
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
//printf("Time on GPU: %3.1f ms\n", elapsedTime);
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
cudaFree(dev_img);
cudaFree(dev_closest);
cudaFree(dev_dilate);
}
return 0;
}
結果如下:
基本和cpu版本差不多(和第二個cpu原理出來的結果不完全一樣,先懶得追究)。但是如果我把註釋的部分放開,打印出的結果就不對,是VS的打印流之間會干擾是嗎?有大神遇到過這個奇怪現象嗎?
看這個打印結果怎麼變成這樣,和註釋掉時的結果不一樣。明明註釋掉的地方應該不影響CUDA的結果啊??
然後我測試了一下時間,opencv版本的距離變換只要3~4ms,按兩種原理自己手寫的cpu版本的距離變換時間是100ms左右,但是gpu版本的竟然要1500ms!!!太不科學了,我以爲應該要比100ms少的?!!!!!!!真的想知道opencv版的距離變換的源碼內部怎麼處理的這麼快!!!!!!
我剛剛查到,原來opencv中的距離變換用的原理是《Distance Transforms of Sampled Functions》中的類似min-convolution,並不是中文網上說的這些,也就是我上面找的那樣其實並不是。這纔是cv::distanceTransform的來源。這是2012年的論文,我現在還找到另一個作者說已經實現並行的論文2019年出來的,但有的地方還看不怎麼懂,據說實現了全並行,應該比cv版本快很多,但我還沒完全懂。
我準備先把cv原理改成CUDA版本,測試性能,如果不夠再繼續研究2019這篇並行度很高的原理。
今天終於寫完了cv原理的CUDA版本,與對應的CPU版本一致,但耗時很多:
/*
 Initialize the distance image from the binary input:
 0 where the input is background (==0), INF (large sentinel) elsewhere.
 Written for a 500x1216 image; launched as a (2,500) grid of 608-thread
 blocks, i.e. exactly one thread per pixel.
*/
__global__ void preparefordt(uchar *bwimg, float *gpudtimg)
{
	const int rows = 500;
	const int cols = 1216;
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	int idy = blockIdx.y*blockDim.y + threadIdx.y;
	// Guard so an over-provisioned launch cannot write out of bounds
	// (no-op for the documented exact-fit configuration).
	if (idx >= cols || idy >= rows)
	{
		return;
	}
	int imgid = idy*cols + idx;
	if (bwimg[imgid] == 0)
	{
		gpudtimg[imgid] = 0;
	}
	else
	{
		gpudtimg[imgid] = INF;
	}
}
/*
 1-D squared Euclidean distance transform of a 500-sample column, following
 the lower-envelope-of-parabolas algorithm of Felzenszwalb & Huttenlocher,
 "Distance Transforms of Sampled Functions".
 f -- input 1-D data (per-sample cost; 0 or INF for a binary image)
 d -- output: d[q] = min_p ((q - p)^2 + f[p])
 Runs entirely inside one thread; v[]/z[] are thread-local scratch.
*/
__device__ void dt1dimgpu(float *f, float *d)
{
	const int n = 500;
	//float d[n];
	int v[n];        // v[k]: sample index of the k-th parabola in the envelope
	float z[n + 1];  // z[k]: left boundary of the range where parabola k is lowest
	int k = 0;       // index of the rightmost parabola currently in the envelope
	v[0] = 0;
	z[0] = -INF;
	z[1] = +INF;
	// Pass 1: build the lower envelope of the n parabolas.
	for (int q = 1; q <= n - 1; q++)
	{
	// Horizontal position where parabola q intersects the rightmost one.
	float s = ((f[q] + q*q) - (f[v[k]] + (v[k])*(v[k]))) / (2 * q - 2 * v[k]);
	while (s <= z[k])
	{
	// Parabola q dominates v[k] over its whole range: pop it and retry.
	k--;
	s = ((f[q] + q*q) - (f[v[k]] + (v[k])*(v[k]))) / (2 * q - 2 * v[k]);
	}
	k++;
	v[k] = q;
	z[k] = s;
	z[k + 1] = +INF;
	}
	k = 0;
	// Pass 2: evaluate the envelope at every sample position.
	for (int q = 0; q <= n - 1; q++)
	{
	while (z[k + 1] < q)
	{
	k++;
	}
	d[q] = (q - v[k])*(q - v[k]) + f[v[k]];
	}
}
/*
 1-D squared Euclidean distance transform of a 1216-sample row -- identical
 to dt1dimgpu except for the fixed length (Felzenszwalb & Huttenlocher
 lower-envelope algorithm).
 f -- input 1-D data (per-sample cost)
 d -- output: d[q] = min_p ((q - p)^2 + f[p])
*/
__device__ void dt1dimrowgpu(float *f, float *d)
{
	const int n = 1216;
	int v[n];        // v[k]: sample index of the k-th envelope parabola
	float z[n + 1];  // z[k]: left boundary of parabola k's dominance range
	int k = 0;       // rightmost parabola currently in the envelope
	v[0] = 0;
	z[0] = -INF;
	z[1] = +INF;
	// Pass 1: build the lower envelope.
	for (int q = 1; q <= n - 1; q++)
	{
	float s = ((f[q] + q*q) - (f[v[k]] + (v[k])*(v[k]))) / (2 * q - 2 * v[k]);
	while (s <= z[k])
	{
	// New parabola hides the current rightmost one: pop and retry.
	k--;
	s = ((f[q] + q*q) - (f[v[k]] + (v[k])*(v[k]))) / (2 * q - 2 * v[k]);
	}
	k++;
	v[k] = q;
	z[k] = s;
	z[k + 1] = +INF;
	}
	k = 0;
	// Pass 2: evaluate the envelope at every sample.
	for (int q = 0; q <= n - 1; q++)
	{
	while (z[k + 1] < q)
	{
	k++;
	}
	d[q] = (q - v[k])*(q - v[k]) + f[v[k]];
	}
}
/*
 Column pass: run the sequential 1-D squared-distance transform down every
 column of gpudtimg (500x1216). Launch: a (2,608) grid -- one block per
 column -- with 500 threads, one per row element of that column.
 Indexing stays in bounds for this configuration: the flattened block id
 spans 0..1215 and globalid <= 1215 + 1216*499.
*/
__global__ void dtcolpass(float *gpudtimg, float *colpassimg)
{
    const int rows = 500;
    const int cols = 1216;
    // Flatten the 2-D block index into the column this block owns.
    int col = blockIdx.y*gridDim.x + blockIdx.x;
    int row = threadIdx.x;
    int pixel = col + cols*row;
    // All threads cooperate to stage the column in shared memory.
    __shared__ float coldata[rows];
    coldata[row] = gpudtimg[pixel];
    __syncthreads();
    // A single thread performs the sequential 1-D distance transform.
    __shared__ float coldataresult[rows];
    if (row == 0)
    {
        dt1dimgpu(coldata, coldataresult);
    }
    __syncthreads();
    // Cooperatively write the transformed column back to global memory.
    colpassimg[pixel] = coldataresult[row];
}
/*
 Row pass: run the sequential 1-D squared-distance transform across every
 row of colpassimg (500x1216). Launch: a (1,500) grid -- one block per
 row -- with 1024 threads that stride together over the 1216 columns.
*/
__global__ void dtrowpass(float *colpassimg, float *rowpassimg)
{
    const int rows = 500;
    const int cols = 1216;
    // Cooperative strided copy of this block's row into shared memory.
    __shared__ float rowdata[cols];
    for (int col = threadIdx.x; col < cols; col += blockDim.x)
    {
        rowdata[col] = colpassimg[col + cols*blockIdx.y];
    }
    __syncthreads();
    // One thread runs the sequential 1-D transform over the staged row.
    __shared__ float rowdataresult[cols];
    if (threadIdx.x == 0)
    {
        dt1dimrowgpu(rowdata, rowdataresult);
    }
    __syncthreads();
    // Cooperative strided copy of the result back to global memory.
    for (int col = threadIdx.x; col < cols; col += blockDim.x)
    {
        rowpassimg[col + cols*blockIdx.y] = rowdataresult[col];
    }
}
/*
 Final step: element-wise square root turning squared distances into
 Euclidean distances. 500x1216 image, one thread per pixel:
 (2,500) grid of 608-thread blocks.
*/
__global__ void dtsqrt(float *rowpassimg, float *dtresult)
{
	const int rows = 500;
	const int cols = 1216;
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	int idy = blockIdx.y*blockDim.y + threadIdx.y;
	// Guard so an over-provisioned launch cannot read/write out of bounds.
	if (idx >= cols || idy >= rows)
	{
		return;
	}
	int imgid = idy*cols + idx;
	float tmpsquare = rowpassimg[imgid];
	// sqrtf keeps the computation in single precision (plain sqrt can
	// promote through double on some toolchains).
	dtresult[imgid] = sqrtf(tmpsquare);
}
// --- Pipeline driver (fragment of the host code):
// prepare -> column pass -> row pass -> sqrt, freeing each stage's
// intermediate buffer once consumed.
cudaMalloc((void**)&dev_preparedt, imgsizes.rows*imgsizes.cols * sizeof(float));
// One thread per pixel: 2x500 blocks x 608 threads over the 500x1216 image.
dim3 preparegrids(2, 500);
preparefordt << <preparegrids, 608 >> >(dev_img, dev_preparedt);
cudaDeviceSynchronize();
float *dev_colpassimg = 0;
cudaMalloc((void**)&dev_colpassimg, imgsizes.rows*imgsizes.cols * sizeof(float));
// One block per column (2x608 = 1216 blocks), 500 threads = one per row.
dim3 colgrids(2, 608);
dtcolpass << <colgrids, 500 >> > (dev_preparedt, dev_colpassimg);
cudaDeviceSynchronize();
cudaFree(dev_preparedt);
float *dev_rowpassimg = 0;
cudaMalloc((void**)&dev_rowpassimg, imgsizes.rows*imgsizes.cols * sizeof(float));
// One block per row (500 blocks); 1024 threads stride over 1216 columns.
dim3 rowgrids(1, 500);
dtrowpass << <rowgrids, 1024 >> > (dev_colpassimg, dev_rowpassimg);
cudaDeviceSynchronize();
cudaFree(dev_colpassimg);
float *dev_dtresult = 0;
cudaMalloc((void**)&dev_dtresult, imgsizes.rows*imgsizes.cols * sizeof(float));
// Element-wise sqrt of the squared distances (same launch as prepare).
dtsqrt << <preparegrids, 608 >> > (dev_rowpassimg, dev_dtresult);
cudaDeviceSynchronize();
cudaFree(dev_rowpassimg);
這個耗時太長,然後我又改寫了一個版本:
//up to down, down to up
/*
 Sequential run-length scan over one 500-element column: foreground runs
 accumulate, background resets to 0.
 dim1data   -- input column (assumed 0/1 binary -- TODO confirm; with 0/255
               the uchar accumulation would overflow immediately)
 dim1result -- output column of accumulated distances
 Fix: the original never wrote dim1result[0] nor any background position,
 so callers read uninitialized __shared__ memory for those entries.
*/
__device__ void likedt1dimvec(uchar *dim1data, uchar *dim1result)
{
	int length = 500;
	// First element has no predecessor: its run starts from itself.
	dim1result[0] = dim1data[0];
	for (int i = 1; i != length; i++)
	{
		if (dim1data[i] > 0)
		{
			// Foreground: extend the run from the previous element.
			dim1result[i] = dim1data[i] + dim1result[i - 1];
		}
		else
		{
			// Background: distance is zero (previously left unwritten).
			dim1result[i] = 0;
		}
	}
}
//left to right , right to left
/*
 Sequential run-length scan over one 1216-element row: foreground runs
 accumulate, background resets to 0. Same algorithm as likedt1dimvec,
 differing only in the fixed length.
 Fix: the original never wrote dim1result[0] nor any background position,
 so callers read uninitialized __shared__ memory for those entries.
*/
__device__ void likedt1dimhor(uchar *dim1data, uchar *dim1result)
{
	int length = 1216;
	// First element has no predecessor: its run starts from itself.
	dim1result[0] = dim1data[0];
	for (int i = 1; i != length; i++)
	{
		if (dim1data[i] > 0)
		{
			// Foreground: extend the run from the previous element.
			dim1result[i] = dim1data[i] + dim1result[i - 1];
		}
		else
		{
			// Background: distance is zero (previously left unwritten).
			dim1result[i] = 0;
		}
	}
}
/*
 Left-to-right scan of every row of the 500x1216 image.
 Launch: one block per row (500 blocks), 1024 threads striding over the
 1216 columns together.
*/
__global__ void likedtleftrightpass0(uchar *colpassimg, uchar *leftright)
{
    const int rows = 500;
    const int cols = 1216;
    // Cooperative strided copy of this block's row into shared memory.
    __shared__ uchar rowdata[cols];
    for (int col = threadIdx.x; col < cols; col += blockDim.x)
    {
        rowdata[col] = colpassimg[col + cols*blockIdx.y];
    }
    __syncthreads();
    // One thread runs the sequential 1-D scan over the staged row.
    __shared__ uchar rowdataresult[cols];
    if (threadIdx.x == 0)
    {
        likedt1dimhor(rowdata, rowdataresult);
    }
    __syncthreads();
    // Cooperative strided copy of the result back to global memory.
    for (int col = threadIdx.x; col < cols; col += blockDim.x)
    {
        leftright[col + cols*blockIdx.y] = rowdataresult[col];
    }
}
/*
 Right-to-left scan of every row of the 500x1216 image: the row is staged
 REVERSED in shared memory so the same left-to-right 1-D scan applies,
 then the result is written back in reverse.
 Launch: one block per row (500 blocks), 1024 threads striding over the
 1216 columns together.
*/
__global__ void likedtrightleftpass0(uchar *colpassimg, uchar *rightleft)
{
    const int rows = 500;
    const int cols = 1216;
    // Stage this block's row mirrored left<->right in shared memory.
    __shared__ uchar rowdata[cols];
    for (int col = threadIdx.x; col < cols; col += blockDim.x)
    {
        rowdata[cols-1-col] = colpassimg[col + cols*blockIdx.y];
    }
    __syncthreads();
    // One thread scans the mirrored row left-to-right.
    __shared__ uchar rowdataresult[cols];
    if (threadIdx.x == 0)
    {
        likedt1dimhor(rowdata, rowdataresult);
    }
    __syncthreads();
    // Un-mirror while copying the result back to global memory.
    for (int col = threadIdx.x; col < cols; col += blockDim.x)
    {
        rightleft[col + cols*blockIdx.y] = rowdataresult[cols-1-col];
    }
}
/*
 Top-to-bottom scan of every column of the 500x1216 image.
 Launch: a (2,608) grid -- one block per column -- with 500 threads, one
 per row element of that column.
*/
__global__ void likedtupdownscan0(uchar *gpudtimg, uchar *updownpassimg)
{
    const int rows = 500;
    const int cols = 1216;
    // Flatten the 2-D block index into the column this block owns.
    int col = blockIdx.y*gridDim.x + blockIdx.x;
    int row = threadIdx.x;
    int pixel = col + cols*row;
    // All threads cooperate to stage the column in shared memory.
    __shared__ uchar coldata[rows];
    coldata[row] = gpudtimg[pixel];
    __syncthreads();
    // One thread runs the sequential top-to-bottom scan.
    __shared__ uchar coldataresult[rows];
    if (row == 0)
    {
        likedt1dimvec(coldata, coldataresult);
    }
    __syncthreads();
    // Cooperatively write the scanned column back to global memory.
    updownpassimg[pixel] = coldataresult[row];
}
/*
 Bottom-to-top scan of every column of the 500x1216 image: the column is
 staged REVERSED in shared memory so the same top-to-bottom 1-D scan
 applies, then the result is written back in reverse.
 Launch: a (2,608) grid -- one block per column -- with 500 threads.
*/
__global__ void likedtdownupscan0(uchar *gpudtimg, uchar *downuppassimg)
{
    const int rows = 500;
    const int cols = 1216;
    // Flatten the 2-D block index into the column this block owns.
    int col = blockIdx.y*gridDim.x + blockIdx.x;
    int row = threadIdx.x;
    int pixel = col + cols*row;
    // Stage the column mirrored top<->bottom in shared memory.
    __shared__ uchar coldata[rows];
    coldata[rows-1-row] = gpudtimg[pixel];
    __syncthreads();
    // One thread scans the mirrored column top-to-bottom.
    __shared__ uchar coldataresult[rows];
    if (row == 0)
    {
        likedt1dimvec(coldata, coldataresult);
    }
    __syncthreads();
    // Un-mirror while writing the result back to global memory.
    downuppassimg[pixel] = coldataresult[rows-1-row];
}
/*
 Combine the four directional scans into the final result by taking the
 pointwise minimum. 500x1216 image, one thread per pixel:
 (2,500) grid of 608-thread blocks.
*/
__global__ void likedtresult(uchar *updown, uchar *downup, uchar *leftright, uchar *rightleft, uchar *dtresult)
{
    int rows = 500;
    int cols = 1216;
    int px = blockIdx.x*blockDim.x + threadIdx.x;
    int py = blockIdx.y*blockDim.y + threadIdx.y;
    int pixel = py*cols + px;
    // Fold the four candidates into a running minimum.
    uchar best = updown[pixel];
    uchar cand = downup[pixel];
    best = (cand < best) ? cand : best;
    cand = leftright[pixel];
    best = (cand < best) ? cand : best;
    cand = rightleft[pixel];
    best = (cand < best) ? cand : best;
    dtresult[pixel] = best;
}
但是這個版本的時間還是無法接受要34ms左右。最後我又在此基礎上將本來一個線程處理共享內存中的數據,變成多個線程去處理,現在時間已比CPU版本(10~11ms)快。
這是現在這個版本的時間。
另一個核函數即圖像膨脹,我測試了一下CUDA版本和我想的一樣提速很多很多,而且測試結果與opencv版本圖像膨脹結果一致。
三、分水嶺
我要使用基於距離變換標記的分水嶺,看了下opencv的這個版本的源碼,終於看懂了。依賴性很強而且使用了鏈表動態取出--存放--取出,不是那麼好並行啊。