cuFFT - row-wise FFT --> element-wise multiplication --> row-wise iFFT crashes further in the program?

Problem Description

For my research I need to perform a large number of big convolutions as fast as possible. I first implemented this with the cuDNN package, and it works fine, but since I have very large kernels and images I wanted to see whether FFT-based convolution could speed the process up. I need to take a row-wise FFT of both input datasets, multiply them element-wise, and take a row-wise iFFT. This makes the convolution one where the two matrices only "slide" along each other horizontally (since they have the same height).
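For reference, the identity behind this approach is the convolution theorem applied row by row: if each data row $a_r$ and the corresponding template row $b_r$ are zero-padded to a common length $N \geq lW + lM \cdot lL - 1$, then

$$a_r * b_r = \mathrm{IFFT}\big(\mathrm{FFT}(a_r) \odot \mathrm{FFT}(b_r)\big),$$

where $\odot$ denotes element-wise multiplication. The padding condition is what makes the circular convolution computed by the FFT coincide with the desired linear convolution; rounding $N$ up to a power of two (as the code below does) just keeps the FFT fast.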

The function code looks like this:

#include <algorithm>
#include <iostream>
#include <cuda_runtime.h>
#include <cufft.h>
//(plus the project header declaring the Onlinespikes class)

void Onlinespikes::FFTConvolution(float *Data,float *Templates,float *Output,long lW,long lM,long lC,long lL) {


    //Data is sized lC * lW (around 2000)
    //Templates is sized lC * (lM * lL) (around 1000)
    //Output is then sized lC * (lW + (lM * lL) - 1)

    //Initializations
    long Bigger  = std::max(lW,lM * lL);
    long Smaller = std::min(lW,lM * lL);

    //New size after convolution
    long ConvSize = Bigger + Smaller - 1;
    
    long Closest_Power_of_two;
    long i = 1;
    while ( (1 << i) <= ConvSize) {
        i++;
    }
    Closest_Power_of_two = 1 << i;

    std::cout << "lM * lL = " << lM * lL << std::endl;
    std::cout << "lW = " << lW << std::endl;
    std::cout << "Closest_Power_of_two = " << Closest_Power_of_two << std::endl; //this prints 8192


    long newsize = Closest_Power_of_two / 2 + 1; //number of complex coefficients an R2C transform of this length produces

    float *InputMat;
    float *InputMat2;
    float *OutputMat;

    cufftComplex *FFTInputMat;
    cufftComplex *FFTInputMat2;
    cufftComplex *FFTOutputMat;
    
    cudaMalloc((void**)&InputMat,lC * Closest_Power_of_two * sizeof(float));
    cudaMalloc((void**)&InputMat2,lC * Closest_Power_of_two * sizeof(float));
    cudaMalloc((void**)&OutputMat,lC * Closest_Power_of_two * sizeof(float));

    cudaMalloc((void**)&FFTInputMat,lC * newsize * sizeof(cufftComplex));
    cudaMalloc((void**)&FFTInputMat2,lC * newsize * sizeof(cufftComplex));
    cudaMalloc((void**)&FFTOutputMat,lC * newsize * sizeof(cufftComplex));

    //Padding: zero the real buffers so everything beyond the copied data acts as zero-padding
    cudaMemset(InputMat,0,lC * Closest_Power_of_two * sizeof(float));
    cudaMemset(InputMat2,0,lC * Closest_Power_of_two * sizeof(float));
    cudaMemset(OutputMat,0,lC * Closest_Power_of_two * sizeof(float));


    cudaMemcpy(InputMat,Data,lC * lW  * sizeof(float),cudaMemcpyDeviceToDevice);
    cudaMemcpy(InputMat2,Templates,lC * lM * lL * sizeof(float),cudaMemcpyDeviceToDevice);

    cufftHandle plan;
    cufftHandle plan_inverse;

    //Set variables needed
    long nx = Closest_Power_of_two;
    long ny = lC; //ny doubles as the batch count below

    int W[1] = { (int)nx };
    int C[1] = { (int)lC };

    int inembed[1] = { (int)nx };
    int onembed[1] = { (int)nx };

    int istride = (int)nx;
    int idist = 1;
    int ostride = (int)nx;
    int odist = 1;
    
    
     
    //Make plan(s) forward
    cufftPlanMany(&plan,1,W,inembed,istride,idist,onembed,ostride,odist,CUFFT_R2C,ny);
    //cufftPlan1d(&plan,Closest_Power_of_two,2 * lC);



    //Change variables to make plan backward
    nx = newsize;
    ny = lC;

    W[0] = (int)nx;
    C[0] = (int)lC;

    inembed[0] = (int)nx;
    onembed[0] = (int)nx;

    istride = nx;
    idist = 1;
    ostride = nx;
    odist = 1;

    cufftPlanMany(&plan_inverse,1,W,inembed,istride,idist,onembed,ostride,odist,CUFFT_C2R,ny);

    //cufftPlanMany(&plan,lC,lC);
    //cufftPlanMany(&plan_inverse,lC);
    

    //execute plans
    cufftExecR2C(plan,InputMat,FFTInputMat);
    cufftExecR2C(plan,InputMat2,FFTInputMat2);

    //multiply results
    ComplexPointwiseMulAndScale<<<32,256>>>(FFTInputMat,FFTInputMat2,newsize,1.0f / newsize); //This kernel is copied from the CUDA FFT sample (see the sketch after this listing), so I don't expect any error here.

    //go backwards
    cufftExecC2R(plan_inverse,FFTInputMat,InputMat);

    //copy back
    cudaMemcpy(Output,InputMat,ConvSize * lC * sizeof(float),cudaMemcpyDeviceToDevice);
    



    //---------------
    cufftDestroy(plan);
    cufftDestroy(plan_inverse);

    cudaFree(InputMat);
    cudaFree(InputMat2);
    cudaFree(OutputMat);
    cudaFree(FFTInputMat);
    cudaFree(FFTInputMat2);
    cudaFree(FFTOutputMat);

    //For now, assume sizes are as in the original; if it works, look into removing the copies etc.
    
}
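The ComplexPointwiseMulAndScale kernel referenced above comes, per the question, from NVIDIA's simpleCUFFT sample. For readers without the sample at hand, a sketch along those lines (adapted to use cufftComplex directly, so treat it as an approximation rather than a verbatim copy) is:

static __device__ __host__ inline cufftComplex ComplexMul(cufftComplex a, cufftComplex b) {
    //(a.x + i*a.y) * (b.x + i*b.y)
    cufftComplex c;
    c.x = a.x * b.x - a.y * b.y;
    c.y = a.x * b.y + a.y * b.x;
    return c;
}

static __device__ __host__ inline cufftComplex ComplexScale(cufftComplex a, float s) {
    cufftComplex c;
    c.x = s * a.x;
    c.y = s * a.y;
    return c;
}

//Grid-stride loop: multiply a by b element-wise, scale, and store the result back into a.
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, const cufftComplex *b, int size, float scale) {
    const int numThreads = blockDim.x * gridDim.x;
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = threadID; i < size; i += numThreads) {
        a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
    }
}

One thing worth noting about the launch in the question: the kernel multiplies size elements, but with lC batched rows the spectra occupy lC * newsize complex values, so passing only newsize would leave all but the first chunk of the product untouched.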

The input arrays are already on the device and are laid out in column-major order.
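That layout detail matters for the plan geometry. In a column-major lC-by-N matrix, element (row, col) lives at index col * lC + row, so consecutive samples of one row are lC elements apart, while consecutive rows start one element apart. My reading of cuFFT's advanced data layout is that the plan parameters would then look like the following sketch (untested, with names reused from the code above):

//Hypothetical geometry for batched row-wise R2C transforms over column-major data:
//lC rows, each logically Closest_Power_of_two reals long.
int n[1]       = { (int)Closest_Power_of_two }; //logical length of each 1-D transform
int inembed[1] = { (int)Closest_Power_of_two };
int onembed[1] = { (int)newsize };
int istride    = (int)lC; //consecutive samples of a row are lC elements apart
int idist      = 1;       //row r starts at offset r
int ostride    = (int)lC;
int odist      = 1;

cufftPlanMany(&plan, 1, n, inembed, istride, idist,
              onembed, ostride, odist, CUFFT_R2C, (int)lC);

Note also that for the inverse C2R plan, cuFFT expects n[] to hold the logical transform length in real elements (Closest_Power_of_two), not the complex length newsize; the code above swaps in newsize before building plan_inverse, which describes a different, smaller transform.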

I have also tried swapping the dists and strides, but in both cases I get memory-allocation errors further down in the code (which is strange, since I have 16 GB of memory, which should be plenty). Can anyone help me with this? I can't seem to figure it out.
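One debugging step that usually localizes this kind of failure (a general pattern, not something from the original post): check every CUDA and cuFFT return code, since an out-of-bounds write caused by a wrong stride or an undersized buffer often corrupts the device heap and only surfaces at the next, otherwise innocent cudaMalloc. A minimal sketch:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>

//Abort with file/line information on the first failing CUDA runtime call.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

//cuFFT calls return cufftResult rather than cudaError_t, so they need their own check.
#define CUFFT_CHECK(call)                                           \
    do {                                                            \
        cufftResult res_ = (call);                                  \
        if (res_ != CUFFT_SUCCESS) {                                \
            fprintf(stderr, "cuFFT error %d at %s:%d\n",            \
                    (int)res_, __FILE__, __LINE__);                 \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

Wrapping each call, e.g. CUFFT_CHECK(cufftPlanMany(...)) and CUDA_CHECK(cudaMalloc(...)), turns "crashes further in the program" into an immediate failure at the first call that actually went wrong.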

Thanks in advance

PS: I am running CUDA 11.3 with MVS 17
