问题描述
我为一个简单的图像处理任务编写了一个 OpenCL 内核:进行开运算(腐蚀 + 膨胀),然后根据开运算的结果执行闭运算(膨胀 + 腐蚀)。
问题是我得到的输出如下图所示:我试过只做开运算,结果看起来还算稳定;但当我依次执行开运算和闭运算时,就出现了那些黑色的横线。即使只做开运算,效果也不算很好,因为已经可以看到一些水平方向的拉伸。
这是 OpenCL 内核:
// OpenCL kernel entry point. Each work-item takes its global id in dimension 0
// as a pixel index, runs the opening pass (erode then dilate) on the source
// image, and then runs the closing pass (dilate then erode) with dilateOpen,
// the buffer the opening pass writes, passed as its input.
// erodeOpen, dilateOpen and dilateClose hold intermediate results; erodeClose
// receives the final output.
// NOTE(review): the argument lists at the two call sites do not match the
// helper parameter lists shown further down — the listing looks truncated;
// confirm against the real source.
// NOTE(review): the helpers read neighbouring pixels from the intermediate
// buffers, i.e. values written by other work-items, and nothing here orders
// those writes before the reads.
kernel void computeSmooth(
const int width,const int height,const int gridStep,global const unsigned char* img,global unsigned char* erodeOpen,global unsigned char* dilateOpen,global unsigned char* dilateClose,global unsigned char* erodeClose)
{
// Index of the pixel handled by this work-item.
size_t id = get_global_id(0);
// Structuring-element size is hard-coded; gridStep is only referenced by the
// commented-out formula below.
const int size = 2;//(gridStep * gridStep) / 16;
// Opening: img -> erodeOpen -> dilateOpen.
areaopening(id,size,width,height,img,erodeOpen,dilateOpen);
// Closing: dilateOpen -> dilateClose -> erodeClose.
areaClosing(id,dilateOpen,dilateClose,erodeClose);
}
// Returns true when `index` is a usable byte offset into an interleaved
// 3-bytes-per-pixel image of `width` x `height` pixels, i.e. when
// 0 <= index < width * height * 3.
// Offset 0 (first channel of the first pixel) is a valid offset and must be
// accepted; rejecting it would silently drop the first pixel from every
// neighbourhood that touches it.
// Only the flat range is checked: an offset that crosses a row boundary and
// lands on the adjacent row is still reported as valid.
bool validindex(int index,int width,int height)
{
    return index >= 0 && index < (width * height * 3);
}
// Opening pass for a single pixel: a per-channel minimum (erode) over the
// source image, stored to erodeOpen, followed by a per-channel maximum
// (dilate) over erodeOpen, stored to dilateOpen.
// `id` is the pixel index; `size` determines the neighbourhood radius.
// NOTE(review): the parameter list shown here does not declare `height`,
// `img` or `erodeOpen`, which the body uses, and validindex is called with two
// arguments although it is defined with three — the listing appears truncated.
// NOTE(review): declared to return unsigned char* but contains no return
// statement.
unsigned char* areaopening(
const int id,const int size,const int width,global unsigned char* dilateOpen)
{
int radius = size / 2;
// Byte offset of this pixel's first channel (3 interleaved bytes per pixel).
int baseIndex = id*3;
int index;
// Running per-channel minimum, seeded with the largest 8-bit value.
int red = 255;
int green = 255;
int blue = 255;
// Erode: take the per-channel minimum over the neighbourhood.
// NOTE(review): the upper bound is exclusive, so offsets run from -radius to
// radius - 1 and the window is not centred on the pixel — confirm intended.
for(int l = -radius; l < radius; ++l)
{
for(int c = -radius; c < radius; ++c)
{
// The pixel itself is excluded from the erode neighbourhood (the dilate loop
// below does include it).
if(l == 0 && c == 0)
continue;
// Flat offset of the neighbour l rows and c columns away. Because the offset
// is flat, a column offset at a row edge addresses a pixel on the adjacent
// row rather than being rejected.
index = baseIndex + (l * width * 3) + (c * 3);
if(validindex(index,height))
{
if(img[index] < red)
red = img[index];
if(img[index+1] < green)
green = img[index+1];
if(img[index+2] < blue)
blue = img[index+2];
}
}
}
// Store the eroded value for this pixel only.
erodeOpen[baseIndex] = red;
erodeOpen[baseIndex+1] = green;
erodeOpen[baseIndex+2] = blue;
// Re-seed as a running per-channel maximum.
red = 0;
green = 0;
blue = 0;
// Dilate: take the per-channel maximum of erodeOpen over the neighbourhood,
// centre included.
// NOTE(review): this reads erodeOpen entries of neighbouring pixels, which
// this call never writes (it only writes baseIndex..baseIndex+2 above); no
// synchronisation with whoever writes them is visible in this function.
for(int l = -radius; l < radius; ++l)
{
for(int c = -radius; c < radius; ++c)
{
index = baseIndex + (l * width * 3) + (c * 3);
if(validindex(index,height))
{
if(erodeOpen[index] > red)
red = erodeOpen[index];
if(erodeOpen[index+1] > green)
green = erodeOpen[index+1];
if(erodeOpen[index+2] > blue)
blue = erodeOpen[index+2];
}
}
}
// Store the opened (eroded then dilated) value for this pixel.
dilateOpen[baseIndex] = red;
dilateOpen[baseIndex+1] = green;
dilateOpen[baseIndex+2] = blue;
}
// Closing pass for a single pixel: a per-channel maximum (dilate) over `img`,
// stored to dilateClose, followed by a per-channel minimum (erode) over
// dilateClose, stored to erodeClose.
// `id` is the pixel index.
// NOTE(review): the parameter list shown here does not declare `size`,
// `width`, `height` or `dilateClose`, which the body uses, and validindex is
// called with two arguments although it is defined with three — the listing
// appears truncated.
// NOTE(review): declared to return unsigned char* but contains no return
// statement.
unsigned char* areaClosing(
const int id,global unsigned char* img,global unsigned char* erodeClose)
{
int radius = size / 2;
// Byte offset of this pixel's first channel (3 interleaved bytes per pixel).
int baseIndex = id*3;
int index;
// Running per-channel maximum, seeded with the smallest value.
int red = 0;
int green = 0;
int blue = 0;
// Dilate: take the per-channel maximum of `img` over the neighbourhood.
// NOTE(review): the upper bound is exclusive, so offsets run from -radius to
// radius - 1 and the window is not centred on the pixel — confirm intended.
for(int l = -radius; l < radius; ++l)
{
for(int c = -radius; c < radius; ++c)
{
// The pixel itself is excluded from the dilate neighbourhood (the erode loop
// below does include it).
if(l == 0 && c == 0)
continue;
// Flat offset of the neighbour l rows and c columns away; a column offset at
// a row edge addresses a pixel on the adjacent row.
index = baseIndex + (l * width * 3) + (c * 3);
if(validindex(index,height))
{
if(img[index] > red)
red = img[index];
if(img[index+1] > green)
green = img[index+1];
if(img[index+2] > blue)
blue = img[index+2];
}
}
}
// Store the dilated value for this pixel only.
dilateClose[baseIndex] = red;
dilateClose[baseIndex+1] = green;
dilateClose[baseIndex+2] = blue;
// Re-seed as a running per-channel minimum.
red = 255;
green = 255;
blue = 255;
// Erode: take the per-channel minimum of dilateClose over the neighbourhood,
// centre included.
// NOTE(review): this reads dilateClose entries of neighbouring pixels, which
// this call never writes (it only writes baseIndex..baseIndex+2 above); no
// synchronisation with whoever writes them is visible in this function.
for(int l = -radius; l < radius; ++l)
{
for(int c = -radius; c < radius; ++c)
{
index = baseIndex + (l * width * 3) + (c * 3);
if(validindex(index,height))
{
if(dilateClose[index] < red)
red = dilateClose[index];
if(dilateClose[index+1] < green)
green = dilateClose[index+1];
if(dilateClose[index+2] < blue)
blue = dilateClose[index+2];
}
}
}
// Store the closed (dilated then eroded) value for this pixel.
erodeClose[baseIndex] = red;
erodeClose[baseIndex+1] = green;
erodeClose[baseIndex+2] = blue;
}
原始图像是一个 unsigned char 数组,包含 RGB 3 个通道,每个通道 8 位。主机代码使用 C++ 编写。
我已经尝试过 OpenCV,它可以正确地完成我想做的事情,但我确实想让自己的实现能够工作,并理解它的工作原理。
这是展示两种方法的主机代码:一种使用我的 GPU 内核,另一种使用 OpenCV(目前已被注释掉):
/// Smooths the original image on the compute device: uploads the raw RGB data,
/// runs the smoothing kernel with one work-item per pixel, and reads the final
/// buffer (erodeClose) back into img.smoothRAW.
/// The three intermediate buffers exist only on the device; they are scratch
/// space for the kernel's erode/dilate stages.
void Window::computeSmooth()
{
    cl::Context context = program.getContext();
    cl::CommandQueue queue = program.getCommandQueue();
    cl::Kernel smoothKernel = program.getSmoothKernel();

    // reset smooth image data
    img.smooth.fill(QColor(0,255));

    // prepare data: three 8-bit channels per pixel
    const int nbElems{img.width * img.height * 3};
    const auto nbBytes = nbElems * sizeof(unsigned char);

    // The source image is copied from host memory at creation time and is only
    // read by the kernel.
    cl::Buffer originalImage(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,nbBytes,img.originalRAW);
    // Every stage buffer is both written and read back by the kernel, so each
    // one needs an explicit read-write flag.
    cl::Buffer erodeOpen(context,CL_MEM_READ_WRITE,nbBytes);
    cl::Buffer dilateOpen(context,CL_MEM_READ_WRITE,nbBytes);
    cl::Buffer dilateClose(context,CL_MEM_READ_WRITE,nbBytes);
    cl::Buffer erodeClose(context,CL_MEM_READ_WRITE,nbBytes);

    // set kernel parameters (order must match the kernel's parameter list)
    smoothKernel.setArg(0,img.width);
    smoothKernel.setArg(1,img.height);
    smoothKernel.setArg(2,grid.step);
    smoothKernel.setArg(3,originalImage);
    smoothKernel.setArg(4,erodeOpen);
    smoothKernel.setArg(5,dilateOpen);
    smoothKernel.setArg(6,dilateClose);
    smoothKernel.setArg(7,erodeClose);

    // launch kernel on the compute device: one work-item per pixel, work-group
    // size left to the runtime
    queue.enqueueNDRangeKernel(smoothKernel,cl::NullRange,img.width * img.height,cl::NullRange);

    // get result back to host: blocking read of the whole buffer from offset 0
    queue.enqueueReadBuffer(erodeClose,CL_TRUE,0,nbBytes,img.smoothRAW.get());

    /*
    // OpenCV
    cv::Mat image = cv::Mat(img.height,img.width,CV_8UC3,img.originalRAW);
    int morph_size = 1;
    cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT,cv::Size(2 * morph_size + 1,2 * morph_size + 1),cv::Point(morph_size,morph_size));
    cv::Mat opening;
    cv::Mat closing;
    // opening
    cv::morphologyEx(image,opening,cv::MORPH_OPEN,element,cv::Point(-1,-1),1);
    // Closing
    cv::morphologyEx(opening,closing,cv::MORPH_CLOSE,element,cv::Point(-1,-1),1);
    unsigned char* data = closing.data;
    for(int i{0}; i < (img.height * img.width * 3); i += 3)
    {
        img.smoothRAW[i] = *(closing.data + i);
        img.smoothRAW[i+1] = *(closing.data + i + 1);
        img.smoothRAW[i+2] = *(closing.data + i + 2);
    }
    */
}
如能提供任何帮助,我将不胜感激。提前致谢。
解决方法
工作组内的线程以 32 个(NVIDIA warp)或 64 个(AMD wavefront)为一组执行。
erodeOpen 和 dilateClose 很可能在填充数据之前就被读取了。为确保它们在使用前已被填充,请添加屏障:
erodeOpen[baseIndex+2] = blue;
barrier(CLK_GLOBAL_MEM_FENCE);
和
dilateClose[baseIndex+2] = blue;
barrier(CLK_GLOBAL_MEM_FENCE);
====== 更新 ========
我之前没有注意到 dilateOpen 也被传递给了 areaClosing(),它在该函数内部被命名为 img。因此还需要在调用 areaOpening() 和 areaClosing() 之间添加屏障:
areaOpening(id,size,width,height,img,erodeOpen,dilateOpen);
barrier(CLK_GLOBAL_MEM_FENCE);
areaClosing(id,dilateOpen,dilateClose,erodeClose);
这应该可以解决问题。需要注意的是,barrier 只同步同一工作组内的工作项,无法保证不同工作组之间的先后顺序;如果把各个阶段拆分成 2 个(或更多)独立的内核依次入队,则一定可以奏效。