如何为cuda内核创建一个临时2D变量

问题描述

由于未知原因，我在CUDA全局内核中声明的Ix和Iy将导致非法的内存访问。这是代码：

#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>

// Cuda
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#define CHECK_FINAL_RESULT
//#define CHECK_LOADING_DATA
using namespace std;
const int TSIZEX = 32;
const int TSIZEY = 256;
const int ft_size = 1;

// Mathematical algorithms
#define isl_min(x,y)        ((x) < (y) ? (x) : (y))         // compare value x is lesser than y,if correct use x,if wrong use y
#define isl_max(x,y)        ((x) > (y) ? (x) : (y))         // comapre value x is larger than y,if correct use y,if wrong use x

__device__ float cudafilter2sq(float A[16][34],float B[34][258],int i,int j);
__global__ void cudapipeline_harris(int  C,int  R,float* img,float* harris);

__device__ float cudafilter2sq(float A[34][258],int j) {

    return (A[i - 1][j - 1] * B[i - 1][j - 1] +
        A[i - 1][j] * B[i - 1][j] +
        A[i - 1][j + 1] * B[i - 1][j + 1] +
        A[i][j - 1] * B[i][j - 1] +
        A[i][j] * B[i][j] +
        A[i][j + 1] * B[i][j + 1] +
        A[i + 1][j - 1] * B[i + 1][j - 1] +
        A[i + 1][j] * B[i + 1][j] +
        A[i + 1][j + 1] * B[i + 1][j + 1]);
}



__global__ void cudapipeline_harris(int  C,float* harris) {

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;
    int idz = threadIdx.z + blockIdx.z * blockDim.z;

    float Ix[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
    float Iy[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];

    for (int Ti = idx; Ti < (float)(R / TSIZEX); Ti += gridDim.x * blockDim.x)
    //if (Ti < (R / TSIZEX))
    {
        //for (int Tj = 0; Tj <= (float)(C / TSIZEY); Tj++)
        for (int Tj = idy; Tj < (float)(C/ TSIZEY); Tj += gridDim.y * blockDim.y)
        {
            int bot0,top0,right0,left0;
            int height,width;

            bot0 = isl_min(isl_max(Ti * TSIZEX,ft_size),R - ft_size);
            top0 = isl_min((Ti + 1) * TSIZEX,R - ft_size);
            left0 = isl_min(isl_max(Tj * TSIZEY,C - ft_size);
            right0 = isl_min((Tj + 1) * TSIZEY,C - ft_size);

            width = right0 - left0;
            height = top0 - bot0;


            for (int i = bot0; i <= top0; i++)
            {
                for (int j = left0; j <= right0; j++)
                {
                    //printf("Ix : %d ",i - bot0);
                    Ix[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
                        img[(i + 1) * C + j - 1] * 0.0833333333333f +
                        img[(i + 1) * C + j] * 0.166666666667f +
                        img[(i - 1) * C + j] * -0.166666666667f +
                        img[(i - 1) * C + j + 1] * -0.0833333333333f +
                        img[(i + 1) * C + j + 1] * 0.0833333333333f;

                    Iy[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
                        img[(i - 1) * C + j + 1] * 0.0833333333333f +
                        img[i * C + j - 1] * -0.166666666667f +
                        img[i * C + j + 1] * 0.166666666667f +
                        img[(i + 1) * C + j - 1] * -0.0833333333333f +
                        img[(i + 1) * C + j + 1] * 0.0833333333333f;

                }
            }

           // for (int i = idy + bot0;i < (float)top0; i += gridDim.y * blockDim.y)
            for (int i = bot0; i < top0; i++)
            {
                for (int j = left0; j < right0; j++)
                {
                    int newI = i - bot0;
                    int newJ = j - left0;

                    harris[((i)*C + (j))] = cudafilter2sq(Ix,Ix,newI,newJ) * cudafilter2sq(Iy,Iy,newJ) -
                        cudafilter2sq(Ix,newJ) * cudafilter2sq(Ix,newJ) -
                        (0.04f * (cudafilter2sq(Ix,newJ) + cudafilter2sq(Iy,newJ))) *
                        (cudafilter2sq(Ix,newJ));
                }
            }

        }
    }


}


int main(int argc,char** argv)
{
    int i,j,run;                // looping variables
    int R,C,nruns;              // height,width and number of loops runs
    double begin,end;            // each loop start time and end time
    double init,finish;          // total loop start time and end time
    double stime,avgt;           // time used and total avgt time
    cv::Mat image,loaded_data;
    cv::Scalar sc;
    cv::Size size;

    float* t_res;
    float* t_data;

    // Might be unused depending on preprocessor macro definitions
    (void)t_res;
    (void)t_data;
    (void)loaded_data;

    float* data;
    float* res;

    if (argc != 3)
    {
        printf("Does not set the NRuns and image needed\n");
        return -1;
    }

    image = cv::imread(argv[1],1);   // read image from command line argument [1]

    if (!image.data)
    {
        printf("No image data ! Are you sure %s is an image ?\n",argv[1]);
        return -1;
    }

    // Convert image input to grayscale floating point
    cv::cvtColor(image,image,cv::COLOR_BGR2GRAY);
    size = image.size();
    C = size.width;
    R = size.height;

    printf("Values settings :\n");
    printf("-------------------\n");
    printf("Image Used : %s [%i,%i] \n",argv[1],R,C);

    res = (float*)calloc(R * C,sizeof(*res));

    if (res == NULL)
    {
        printf("Error while allocating result table of size %ld B\n",(sizeof(*res) * C * R));
        return -1;
    }

    data = (float*)malloc(R * C * sizeof(float));
    for (i = 0; i < R; i++) {
        for (j = 0; j < C; j++) {
            sc = image.at<uchar>(i,j);
            data[i * C + j] = (float)sc.val[0] / 255;
        }
    }

    // Parallel Running Test
    printf("\n\n-----------------------------------\n");
    printf("Cuda\n");
    printf("-----------------------------------\n");
    res = (float*)calloc(R * C,sizeof(*res));                // reset resources value

    dim3 grid(2,2,2);
    dim3 block(16,16,1);

    // Data required to pass to device
    float* img,* harris;

    cudaEvent_t start,stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMalloc((void**)&img,R * C * sizeof(*img));
    cudaMalloc((void**)&harris,R * C * sizeof(*harris));


    cudaMemcpy(img,data,C * R * sizeof(*data),cudaMemcpyHostToDevice);   // pass image value to the GPU

    cudaEventRecord(start);
    cudapipeline_harris << < grid,block >> > (C,img,harris);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds,start,stop);
    //cudapipeline_harris << < 1,8 >> > (C,harris);

   cudaDeviceSynchronize();


    cudaMemcpy(res,harris,C * R * sizeof(*harris),cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("CUDA ERROR : %s",cudaGetErrorString(err));
        exit(-1);
    }

    printf("Total time   :  \t %f ms\n",milliseconds);


#ifdef CHECK_FINAL_RESULT
    // Serial Show input
    cv::namedWindow("Input",cv::WINDOW_NORMAL);
    cv::imshow("Input",image);
    image.release();
    // Parallel Show output
    cv::Mat imres = cv::Mat(R,CV_32F,res);
    cv::namedWindow("Parallel Output",cv::WINDOW_NORMAL);
    cv::imshow("Parallel Output",imres * 65535.0);
    imres.release();
#endif

    cudaFree(harris);
    cudaFree(img);

    free(data);
    free(res);
    return 0;
}

这是显示的错误：

CUDA ERROR : an illegal memory access was encountered

**CUDA ERROR : unspecified launch failure========= Invalid __global__ read of size 4
=========     at 0x000002d0 in C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:383:cudafilter2sq(float[258]*,float[258]*,int,int)
=========     by thread (15,1,0) in block (0,1)
=========     Address 0x2c6f5fee774 is out of bounds
=========     Device Frame:C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:453:cudapipeline_harris(int,float*,float*) (cudapipeline_harris(int,float*) : 0x2130)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x81dcd]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x82167]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x8686e]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ba]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x176ea9]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0xe97c2) [0x307342]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x361bd]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x365e1]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x368c4]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuLaunchKernel + 0x234) [0x20d954]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8dba]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8c66]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll (cudaLaunchKernel + 0x1c4) [0x29024]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x1f) [0x516f]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__device_stub__Z19cudapipeline_harrisiiPfS_ + 0x22e) [0x4fbe]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (cudapipeline_harris + 0x41) [0x44c1]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x577) [0x4a47]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (invoke_main + 0x39) [0xfa79]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main_seh + 0x12e) [0xf95e]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main + 0xe) [0xf81e]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (mainCRTStartup + 0x9) [0xfb09]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========**

解决方法

这里的调试过程非常简单。您的CUDA错误输出指向cudafilter2sq中超出范围的访问错误，如下所示：

Invalid global read of size 4 ========= at ...cuda/main.cu:383:cudafilter2sq
... Address ... is out of bounds

看着cudafilter2sq，问自己一个问题：“这些访问之一怎么可能超出范围？”由于该函数非常简单，因此答案是：“如果其中一个索引（从i或j计算出来的）超出了A / Ix或{ {1}} / B。然后您就可以根据已知的可能范围（0-33，0-257）测试那些计算出的索引。

很明显，Iy的{{1}}值必须大于0，否则cudafilter2sq的索引将超出范围。但是您不满足此要求。添加：

然后添加：

i-1

到#include <assert.h>的开始。然后在启用内存检查功能的情况下运行代码（就像您已经做的那样）。您将击中这些设备断言，表明您在索引范围外。您在assert(i > 0);上也遇到了同样的问题。

当我将以下代码添加到cudafilter2sq的开头时：

您的代码对我来说没有错误。显然，如果您的cudafilter2sq内核for循环以：

开始

if (i < 1) i = 1;  if (j < 1) j = 1;

然后：

cudapipeline_harris

可以为...int i = bot0;...产生零值（对于int newI = i - bot0;同样可以产生零值）。因此，这似乎是索引问题的“来源”。我认为您可以从这里修复它。

另外，请注意，您对newI的前向声明：

newJ

与定义不符

cudafilter2sq

cuda global-variables multidimensional-array

如何为cuda内核创建一个临时2D变量

问题描述

解决方法

相关问答