Parallel batch processing of small matrices in CUDA with for loops is not working

Problem description

I have many (e.g., one million) small 4 x 3 matrices. I want to perform a few simple operations on them, and I would like the CUDA kernel to parallelize only over the matrix index (not over the row/column operations). Let me explain better: I pass an array of matrices A[Matrixnumb][row][col] as input to my GPU kernel, and I would like the parallelization to happen only over Matrixnumb (so I want to force the row/column operations to run serially within a single thread). For simplicity, the example below uses only 3 matrices. It compiles and runs, but gives me wrong results: basically, it returns the same input matrices I fed in. I don't understand why, and if I am making some mistake, how should I rewrite/rethink the code? I also wrote the code with cudaMallocManaged so that memory is shared between host and device, but I get the same results with the classic cudaMalloc and memcpy.

Source.cpp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>

using namespace std;


__global__ void SVD(double*** a,const int m,const int n,const int numMatrices,double** w)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // I would like that each thread runs these loops independently
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14;
    }
  }
  for (int j = 0; j < n; j++) {
    w[idx][j] = 3.14 * a[idx][1][j]* a[idx][1][j];
  }

}


int main()
{
  const int n = 3;
  const int m = 4;
  const int lda = m;
  const int numMatrices = 3;

  random_device device;
  mt19937 generator(device());
  uniform_real_distribution<double> distribution(1.,5.);

  // create pointers
  double*** A = new double** [numMatrices];
  double** w = new double* [numMatrices];

  //ALLOCATE SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    A[nm] = new double* [lda];
    w[nm] = new double[n];

    for (int i = 0; i < lda; i++) {
      A[nm][i] = new double[n];

      for (int j = 0; j < n; j++) {
        cudaMallocManaged((void**)&A[nm][i][j],sizeof(double));
        cudaMallocManaged((void**)&w[nm][j],sizeof(double));
      }
    }
  }

  cout << " memory allocated" << endl;


  //FILL MATRICES INTO SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    A[nm] = new double* [lda];
    w[nm] = new double[n];                                   

    for (int i = 0; i < lda; i++) {
      A[nm][i] = new double[n];

      for (int j = 0; j < n; j++) {
        A[nm][i][j] = distribution(generator);
        w[nm][j] = 0.0;
      }
    }
  }
  cout << " matrix filled " << endl;


  // PRINT MATRICES BEFORE CUDA OPERATION
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  //KERNEL ----------------------------------------------------------------------
  int NThreads = 3;   
  int NBlocks = int(numMatrices / NThreads + 1);
 
  SVD << <NBlocks,NThreads >> > (A,n,m,numMatrices,w);
  cudaDeviceSynchronize();
  cout << " Kernel done " << endl << endl;

  cout << " --- GPU --- " << endl;
  cout << " NEW MATRIX: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  cout << " NEW VECTOR RESULTS: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < n; i++) {
      cout << w[nm][i] << " ";
    }
    cout << endl;
  }

  cout << endl;

  //FREE THE DEVICE'S MEMORY -----------------------------------------------------
  cudaFree(A);
  cudaFree(w);
  cout << " Cuda free " << endl << endl;

  return 0;
}

The (wrong) output I get is the following:

memory allocated
 matrix filled
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249

1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212

1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695

 Kernel done

 --- GPU ---
 NEW MATRIX:
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249

1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212

1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695

 NEW VECTOR RESULTS:
0 0 0
0 0 0
0 0 0

 Cuda free

I expected to get new matrices and a new vector, modified by the operation a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14; but it seems either my code never sees the kernel or the kernel is not working properly.

Solution

You had several issues:

  1. When using managed memory with double- or triple-pointer access, every pointer in the tree must be allocated with the managed allocator (a minimal sketch of this, together with the launch guard from item 4, follows this list).
  2. Your allocation scheme had too many levels, and you allocated the pointers twice (a memory leak).
  3. The order of the arguments you pass to the kernel does not match the order the kernel expects (m and n are reversed).
  4. Since you may launch more blocks/threads than necessary, the kernel needs a thread check (an if-test).
  5. Your code should be in a .cu file, not a .cpp file.
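
As a quick illustration of points 1 and 4, here is a minimal sketch using the variable names from your code (the full listing below applies the same pattern):

// every level of the pointer tree must come from the managed allocator
double*** A;
cudaMallocManaged(&A, numMatrices * sizeof(double**));
for (int nm = 0; nm < numMatrices; nm++) {
  cudaMallocManaged(&A[nm], lda * sizeof(double*));
  for (int i = 0; i < lda; i++)
    cudaMallocManaged(&A[nm][i], n * sizeof(double));
}

// round the grid size up and guard the kernel against the extra threads
int NBlocks = (numMatrices + NThreads - 1) / NThreads;
// inside the kernel:
//   int idx = blockIdx.x * blockDim.x + threadIdx.x;
//   if (idx >= numMatrices) return;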

The following code has those issues addressed and appears to run without runtime errors:

$ cat t61.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>


using namespace std;


__global__ void SVD(double*** a,const int m,const int n,const int numMatrices,double** w)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numMatrices){
  // I would like that each thread runs these loops independently
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14;
    }
  }
  for (int j = 0; j < n; j++) {
    w[idx][j] = 3.14 * a[idx][1][j]* a[idx][1][j];
  }
  }
}


int main()
{
  const int n = 3;
  const int m = 4;
  const int lda = m;
  const int numMatrices = 3;

  random_device device;
  mt19937 generator(device());
  uniform_real_distribution<double> distribution(1.,5.);

  // create pointers
  double*** A;
  cudaMallocManaged(&A,sizeof(double**)*numMatrices);
  double** w;
  cudaMallocManaged(&w,sizeof(double*)* numMatrices);

  //ALLOCATE SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    cudaMallocManaged(&(A[nm]),sizeof(double*)*lda);
    cudaMallocManaged(&(w[nm]),sizeof(double)*n);

    for (int i = 0; i < lda; i++) {
      cudaMallocManaged(&(A[nm][i]),sizeof(double)*n);
      }
    }

  cout << " memory allocated" << endl;


  //FILL MATRICES INTO SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        A[nm][i][j] = distribution(generator);
        w[nm][j] = 0.0;
      }
    }
  }
  cout << " matrix filled " << endl;


  // PRINT MATRICES BEFORE CUDA OPERATION
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  //KERNEL ----------------------------------------------------------------------
  int NThreads = 3;
  int NBlocks = int(numMatrices / NThreads + 1);

  SVD << <NBlocks,NThreads >> > (A,m,n,numMatrices,w);
  cudaDeviceSynchronize();
  cout << " Kernel done " << endl << endl;

  cout << " --- GPU --- " << endl;
  cout << " NEW MATRIX: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  cout << " NEW VECTOR RESULTS: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < n; i++) {
      cout << w[nm][i] << " ";
    }
    cout << endl;
  }

  cout << endl;

  //FREE THE DEVICE'S MEMORY -----------------------------------------------------
  cudaFree(A);
  cudaFree(w);
  cout << " Cuda free " << endl << endl;

  return 0;
}
$ nvcc -o t61 t61.cu
$ cuda-memcheck ./t61
========= CUDA-MEMCHECK
 memory allocated
 matrix filled
3.73406 3.51919 3.249
1.52374 2.678 2.50944
3.67358 1.15831 3.26327
2.58468 1.49937 2.67133

1.72144 2.99183 3.11156
1.06247 3.34983 4.23568
3.49749 3.07641 3.42827
4.09607 2.00557 2.12049

3.65427 3.98966 4.73428
1.68397 4.3746 2.95533
2.1914 4.96086 1.7165
3.10095 2.61781 4.52626

 Kernel done

 --- GPU ---
 NEW MATRIX:
43.7816 38.888 33.1458
7.29041 22.5191 19.7735
42.375 4.2129 33.4376
20.977 7.05908 22.407

9.30494 28.1062 30.4008
3.54453 35.2351 56.3348
38.41 29.7179 36.9045
52.6821 12.6301 14.1189

41.9306 49.9807 70.3782
8.90432 60.0905 27.4247
15.079 77.2757 9.25165
30.1939 21.5182 64.3294

 NEW VECTOR RESULTS:
166.891 1592.32 1227.71
39.4501 3898.35 9965.14
248.961 11338.1 2361.64

 Cuda free

========= ERROR SUMMARY: 0 errors
$
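
A side note beyond the fix above: with a million matrices, a triple-pointer tree means millions of small managed allocations and two pointer dereferences before every element access. If you are free to change the layout, a single flat allocation per array with index arithmetic is simpler and typically faster. Here is a minimal sketch under that assumption (the kernel name SVD_flat and the flat indexing are mine, not part of the code above); the arithmetic is the same as in the kernel above:

__global__ void SVD_flat(double* a, const int m, const int n, const int numMatrices, double* w)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= numMatrices) return;
  double* mat = a + (size_t)idx * m * n;   // this thread's m x n matrix, stored row-major
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      mat[i * n + j] = (mat[i * n + j] * mat[i * n + j]) * 3.14;
  for (int j = 0; j < n; j++)
    w[(size_t)idx * n + j] = 3.14 * mat[1 * n + j] * mat[1 * n + j];
}

// host side: two allocations replace the whole pointer tree
// double *A, *w;
// cudaMallocManaged(&A, (size_t)numMatrices * m * n * sizeof(double));
// cudaMallocManaged(&w, (size_t)numMatrices * n * sizeof(double));
// element (nm, i, j) of A is A[(size_t)nm * m * n + i * n + j]

Everything else (filling the matrices, the launch configuration, and the bounds check) stays the same.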