问题描述
由于未知原因,我在CUDA全局内核中声明的Ix
和Iy
将导致非法的内存访问。这是代码:
#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
// Cuda
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CHECK_FINAL_RESULT
//#define CHECK_LOADING_DATA
using namespace std;
const int TSIZEX = 32;
const int TSIZEY = 256;
const int ft_size = 1;
// Mathematical algorithms
#define isl_min(x,y) ((x) < (y) ? (x) : (y)) // compare value x is lesser than y,if correct use x,if wrong use y
#define isl_max(x,y) ((x) > (y) ? (x) : (y)) // comapre value x is larger than y,if correct use y,if wrong use x
__device__ float cudafilter2sq(float A[16][34],float B[34][258],int i,int j);
__global__ void cudapipeline_harris(int C,int R,float* img,float* harris);
__device__ float cudafilter2sq(float A[34][258],int j) {
return (A[i - 1][j - 1] * B[i - 1][j - 1] +
A[i - 1][j] * B[i - 1][j] +
A[i - 1][j + 1] * B[i - 1][j + 1] +
A[i][j - 1] * B[i][j - 1] +
A[i][j] * B[i][j] +
A[i][j + 1] * B[i][j + 1] +
A[i + 1][j - 1] * B[i + 1][j - 1] +
A[i + 1][j] * B[i + 1][j] +
A[i + 1][j + 1] * B[i + 1][j + 1]);
}
__global__ void cudapipeline_harris(int C,float* harris) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
int idz = threadIdx.z + blockIdx.z * blockDim.z;
float Ix[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
float Iy[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
for (int Ti = idx; Ti < (float)(R / TSIZEX); Ti += gridDim.x * blockDim.x)
//if (Ti < (R / TSIZEX))
{
//for (int Tj = 0; Tj <= (float)(C / TSIZEY); Tj++)
for (int Tj = idy; Tj < (float)(C/ TSIZEY); Tj += gridDim.y * blockDim.y)
{
int bot0,top0,right0,left0;
int height,width;
bot0 = isl_min(isl_max(Ti * TSIZEX,ft_size),R - ft_size);
top0 = isl_min((Ti + 1) * TSIZEX,R - ft_size);
left0 = isl_min(isl_max(Tj * TSIZEY,C - ft_size);
right0 = isl_min((Tj + 1) * TSIZEY,C - ft_size);
width = right0 - left0;
height = top0 - bot0;
for (int i = bot0; i <= top0; i++)
{
for (int j = left0; j <= right0; j++)
{
//printf("Ix : %d ",i - bot0);
Ix[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
img[(i + 1) * C + j - 1] * 0.0833333333333f +
img[(i + 1) * C + j] * 0.166666666667f +
img[(i - 1) * C + j] * -0.166666666667f +
img[(i - 1) * C + j + 1] * -0.0833333333333f +
img[(i + 1) * C + j + 1] * 0.0833333333333f;
Iy[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
img[(i - 1) * C + j + 1] * 0.0833333333333f +
img[i * C + j - 1] * -0.166666666667f +
img[i * C + j + 1] * 0.166666666667f +
img[(i + 1) * C + j - 1] * -0.0833333333333f +
img[(i + 1) * C + j + 1] * 0.0833333333333f;
}
}
// for (int i = idy + bot0;i < (float)top0; i += gridDim.y * blockDim.y)
for (int i = bot0; i < top0; i++)
{
for (int j = left0; j < right0; j++)
{
int newI = i - bot0;
int newJ = j - left0;
harris[((i)*C + (j))] = cudafilter2sq(Ix,Ix,newI,newJ) * cudafilter2sq(Iy,Iy,newJ) -
cudafilter2sq(Ix,newJ) * cudafilter2sq(Ix,newJ) -
(0.04f * (cudafilter2sq(Ix,newJ) + cudafilter2sq(Iy,newJ))) *
(cudafilter2sq(Ix,newJ));
}
}
}
}
}
int main(int argc,char** argv)
{
int i,j,run; // looping variables
int R,C,nruns; // height,width and number of loops runs
double begin,end; // each loop start time and end time
double init,finish; // total loop start time and end time
double stime,avgt; // time used and total avgt time
cv::Mat image,loaded_data;
cv::Scalar sc;
cv::Size size;
float* t_res;
float* t_data;
// Might be unused depending on preprocessor macro definitions
(void)t_res;
(void)t_data;
(void)loaded_data;
float* data;
float* res;
if (argc != 3)
{
printf("Does not set the NRuns and image needed\n");
return -1;
}
image = cv::imread(argv[1],1); // read image from command line argument [1]
if (!image.data)
{
printf("No image data ! Are you sure %s is an image ?\n",argv[1]);
return -1;
}
// Convert image input to grayscale floating point
cv::cvtColor(image,image,cv::COLOR_BGR2GRAY);
size = image.size();
C = size.width;
R = size.height;
printf("Values settings :\n");
printf("-------------------\n");
printf("Image Used : %s [%i,%i] \n",argv[1],R,C);
res = (float*)calloc(R * C,sizeof(*res));
if (res == NULL)
{
printf("Error while allocating result table of size %ld B\n",(sizeof(*res) * C * R));
return -1;
}
data = (float*)malloc(R * C * sizeof(float));
for (i = 0; i < R; i++) {
for (j = 0; j < C; j++) {
sc = image.at<uchar>(i,j);
data[i * C + j] = (float)sc.val[0] / 255;
}
}
// Parallel Running Test
printf("\n\n-----------------------------------\n");
printf("Cuda\n");
printf("-----------------------------------\n");
res = (float*)calloc(R * C,sizeof(*res)); // reset resources value
dim3 grid(2,2,2);
dim3 block(16,16,1);
// Data required to pass to device
float* img,* harris;
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaMalloc((void**)&img,R * C * sizeof(*img));
cudaMalloc((void**)&harris,R * C * sizeof(*harris));
cudaMemcpy(img,data,C * R * sizeof(*data),cudaMemcpyHostToDevice); // pass image value to the GPU
cudaEventRecord(start);
cudapipeline_harris << < grid,block >> > (C,img,harris);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds,start,stop);
//cudapipeline_harris << < 1,8 >> > (C,harris);
cudaDeviceSynchronize();
cudaMemcpy(res,harris,C * R * sizeof(*harris),cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("CUDA ERROR : %s",cudaGetErrorString(err));
exit(-1);
}
printf("Total time : \t %f ms\n",milliseconds);
#ifdef CHECK_FINAL_RESULT
// Serial Show input
cv::namedWindow("Input",cv::WINDOW_NORMAL);
cv::imshow("Input",image);
image.release();
// Parallel Show output
cv::Mat imres = cv::Mat(R,CV_32F,res);
cv::namedWindow("Parallel Output",cv::WINDOW_NORMAL);
cv::imshow("Parallel Output",imres * 65535.0);
imres.release();
#endif
cudaFree(harris);
cudaFree(img);
free(data);
free(res);
return 0;
}
这是显示的错误:
CUDA ERROR : an illegal memory access was encountered
**CUDA ERROR : unspecified launch failure========= Invalid __global__ read of size 4
========= at 0x000002d0 in C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:383:cudafilter2sq(float[258]*,float[258]*,int,int)
========= by thread (15,1,0) in block (0,1)
========= Address 0x2c6f5fee774 is out of bounds
========= Device Frame:C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:453:cudapipeline_harris(int,float*,float*) (cudapipeline_harris(int,float*) : 0x2130)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x81dcd]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x82167]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x8686e]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ba]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x176ea9]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0xe97c2) [0x307342]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x361bd]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x365e1]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x368c4]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuLaunchKernel + 0x234) [0x20d954]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8dba]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8c66]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll (cudaLaunchKernel + 0x1c4) [0x29024]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x1f) [0x516f]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__device_stub__Z19cudapipeline_harrisiiPfS_ + 0x22e) [0x4fbe]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (cudapipeline_harris + 0x41) [0x44c1]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x577) [0x4a47]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (invoke_main + 0x39) [0xfa79]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main_seh + 0x12e) [0xf95e]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main + 0xe) [0xf81e]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (mainCRTStartup + 0x9) [0xfb09]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========**
解决方法
这里的调试过程非常简单。您的CUDA错误输出指向cudafilter2sq
中超出范围的访问错误,如下所示:
Invalid global read of size 4 ========= at ...cuda/main.cu:383:cudafilter2sq
... Address ... is out of bounds
看着cudafilter2sq
,问自己一个问题:“这些访问之一怎么可能超出范围?”由于该函数非常简单,因此答案是:“如果其中一个索引(从i
或j
计算出来的)超出了A
/ Ix
或{ {1}} / B
。然后您就可以根据已知的可能范围(0-33,0-257)测试那些计算出的索引。
很明显,Iy
的{{1}}值必须大于0,否则cudafilter2sq
的索引将超出范围。但是您不满足此要求。添加:
i
然后添加:
i-1
到 #include <assert.h>
的开始。然后在启用内存检查功能的情况下运行代码(就像您已经做的那样)。您将击中这些设备断言,表明您在索引范围外。您在 assert(i > 0);
上也遇到了同样的问题。
当我将以下代码添加到cudafilter2sq
的开头时:
j
您的代码对我来说没有错误。显然,如果您的cudafilter2sq
内核for循环以:
if (i < 1) i = 1; if (j < 1) j = 1;
然后:
cudapipeline_harris
可以为 ...int i = bot0;...
产生零值(对于 int newI = i - bot0;
同样可以产生零值)。因此,这似乎是索引问题的“来源”。我认为您可以从这里修复它。
另外,请注意,您对newI
的前向声明:
newJ
与定义不符
cudafilter2sq