问题描述
对于我的研究,我需要尽可能快地进行大量的大卷积。我首先用 cudnn 包做了它,它工作正常,但由于我有非常大的内核和图像,我想看看使用 FFT 卷积是否可以加快这个过程。我需要对两个数据集的输入进行逐行 FFT,将它们元素相乘并进行 iFFT。这使得卷积表示两个矩阵仅在水平方向上相互“滑动”(因为它们具有相同的高度)。
void Onlinespikes::FFTConvolution(float *Data,float *Templates,float *Output,long lW,long lM,long lC,long lL) {
//Data is sized lC * lW (around 2000)
//Templates is sized lC * (lM * lL) (around 1000)
//Output is then lC + (lM * lL) - 1
//Initializations
long Bigger = max(lW,lM * lL);
long Smaller = min(lW,lM * lL);
//New size after convolution
long ConvSize = Bigger + Smaller - 1;
long Closest_Power_of_two;
long i = 1;
while ( (1 << i) <= ConvSize) {
i++;
}
Closest_Power_of_two = 1 << i;
std::cout << "lM * lL = " << lM * lL << std::endl;
std::cout << "lW = " << lW << std::endl;
std::cout << "Closest_Power_of_two = " << Closest_Power_of_two << std::endl; //this prints 8192
long newsize = Closest_Power_of_two / 2 + 1;
float *InputMat;
float *InputMat2;
float *OutputMat;
cufftComplex *FFTInputMat;
cufftComplex *FFTInputMat2;
cufftComplex *FFTOutputMat;
cudamalloc((void**)&InputMat,lC * Closest_Power_of_two * sizeof(float));
cudamalloc((void**)&InputMat2,lC * Closest_Power_of_two * sizeof(float));
cudamalloc((void**)&OutputMat,lC * Closest_Power_of_two * sizeof(float));
cudamalloc((void**)&FFTInputMat,lC * newsize * sizeof(cufftComplex));
cudamalloc((void**)&FFTInputMat2,lC * newsize * sizeof(cufftComplex));
cudamalloc((void**)&FFTOutputMat,lC * newsize * sizeof(cufftComplex));
//Padding
cudamemset(InputMat,lC * Closest_Power_of_two * sizeof(float));
cudamemset(InputMat2,lC * Closest_Power_of_two * sizeof(float));
cudamemset(OutputMat,lC * Closest_Power_of_two * sizeof(float));
cudamemcpy(InputMat,Data,lC * lW * sizeof(float),cudamemcpyDevicetoDevice);
cudamemcpy(InputMat2,Templates,lC * lM * lL * sizeof(float),cudamemcpyDevicetoDevice);
cufftHandle plan;
cufftHandle plan_inverse;
//Set variables needed
long nx = Closest_Power_of_two;
long ny = lC;
long batch = lC;
static int W[1] = { (int)nx };
static int C[1] = { (int)lC };
static int inembed[1] = { (int) nx };
static int onembed[1] = { (int) nx };
static int istride = nx;
static int idist = 1;
static int ostride = nx;
static int odist = 1;
//Make plan(s) forward
cufftPlanMany(&plan,1,W,inembed,istride,idist,onembed,ostride,odist,CUFFT_R2C,ny);
//cufftPlan1d(&plan,Closest_Power_of_two,2 * lC);
//Change variables to make plan backward
nx = newsize;
ny = lC;
W[1] = { (int)nx };
C[1] = { (int)lC };
inembed[0] = (int)nx;
onembed[0] = (int)nx;
istride = nx;
idist = 1;
ostride = nx;
odist = 1;
cufftPlanMany(&plan_inverse,CUFFT_C2R,ny);
//cufftPlanMany(&plan,lC,lC);
//cufftPlanMany(&plan_inverse,lC);
//execute plans
cufftExecR2C(plan,InputMat,FFTInputMat);
cufftExecR2C(plan,InputMat2,FFTInputMat2);
//multiply results
ComplexPointwiseMulAndScale<<<32,256>>>(FFTInputMat,FFTInputMat2,newsize,1.0f / newsize); //This is defined in the cuda FFT example and is just copied,so I don't expect any error here.
//go backwards
cufftExecC2R(plan_inverse,FFTInputMat,InputMat);
//copy back
cudamemcpy(Output,ConvSize * lC * sizeof(float),cudamemcpyDevicetoDevice);
//---------------
cudaFree(InputMat);
cudaFree(InputMat2);
cudaFree(OutputMat);
//For Now assume Sizes are as original,If it works look to remove copying etc.
}
输入数组已在设备上并按列主序排列。
我也尝试过交换 dist 和 strides,但是对于这两种情况,我在代码中进一步出现了内存分配错误(这很奇怪,因为我有足够的 16 Gb 内存)。有没有人可以帮我解决这个问题,因为我似乎无法弄清楚。
提前致谢
ps:我正在运行 CUDA 11.3 MVS 17
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)