Basic cuBLAS dot product of a large array returns the wrong value

Problem description

I wrote the following CUDA C code (Visual Studio 2015 on Windows 10, GPU = Titan Xp) to compute the sum of all elements in a 1D array (flattened from 2D). The host version is straightforward: it adds up all elements with a += loop and returns the value. For the cuBLAS implementation I used the dot-product approach: take the dot product of the target array with an array of all ones of the same size, which returns the sum of all elements. The code works for small arrays (e.g. 100 elements), but for large arrays (e.g. 512x512 = 262144 elements) it returns the wrong value (although quite close to the correct one). What am I missing or doing wrong? Thanks in advance. (Disclaimer: new cuBLAS user.)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "book.h"
#include <cublas_v2.h>

void creatematrix(float *out,int nx,int ny)
{
    float ctr = 0;
    for (int i = 0; i < nx; ++i) {
        for (int j = 0; j < ny; ++j) {
            out[j * nx + i] = ctr/1E+5;
            ctr = ctr + 1;
        }
    }
}

float add_arr_val(float *im,int N)
{
    float tmp = 0;
    for (int i = 0; i < N; ++i)
        tmp += im[i];

    float out = tmp;
    return out;
}

__global__ void init_ones(float *d_in,int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        d_in[i] = 1.0;
    }
}

void main()
{
    // Define matrix size (using flattened array for most operations)
    int nx = 512;       // row size
    int ny = 512;       // column size
    int N = nx * ny;    // total size of flattened array
    
    // cpu section ========================================
    float *M; M = (float*)malloc(N * sizeof(float));    // create array pointer and allocate memory
    creatematrix(M,nx,ny);                            // create a test matrix of size nx * ny
    float cpu_out = add_arr_val(M,N);                  // cpu function

    // GPU and cuBLAS section ==============================
    float *d_M;
    HANDLE_ERROR(cudaMalloc(&d_M,N * sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(d_M,M,N * sizeof(float),cudaMemcpyHostToDevice));    // copy original array M to device as d_M
        
    // create array of all ones,size N for dot product
    float *d_ones;
    cudaMalloc(&d_ones,N * sizeof(float));

    // Max potential blocksize
    int minGridSize,blockSize,gridSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize,&blockSize,init_ones,0,N);
    gridSize = (N + blockSize - 1) / blockSize; 
    init_ones<<<gridSize,blockSize>>>(d_ones,N);     // kernel launch to generate array of all 1's
    cudaDeviceSynchronize();
    
    float blas_out;                                                         // output on host variable
    cublasHandle_t handle;  cublasCreate(&handle);                          // initialize CUBLAS context        
    cublasSdot(handle,N,d_M,1,d_ones,1,&blas_out);                    // Perform cublas single-precision dot product of (d_M . d_ones)
    cudaDeviceSynchronize();    
    
    //print output from cpu and gpu sections
    printf("native output = %lf\n",cpu_out);
    printf("cublas output = %lf\n",blas_out);

    cublasDestroy(handle);
    free(M);
    cudaFree(d_M);
    cudaFree(d_ones);
}

Output for an array with 262144 elements (flattened 512x512 matrix)

native output = 343590.437500
cublas output = 343596.062500
Press any key to continue . . .

Output for an array with 144 elements (flattened 12x12 matrix)

native output = 0.102960
cublas output = 0.102960
Press any key to continue . . .

Solution

You are running into the limits of float precision. You really shouldn't expect more than about 5 decimal digits of accuracy from it, and certain computation patterns can leave you with even less. In fact, the cuBLAS result is numerically closer to the correctly rounded result than your CPU implementation is. This is easy to demonstrate: all we need to do is perform the host summation in double as well, and we see a different picture:

$ cat t1784.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas_v2.h>
#define HANDLE_ERROR(x) x
void creatematrix(float *out,int nx,int ny)
{
    float ctr = 0;
    for (int i = 0; i < nx; ++i) {
        for (int j = 0; j < ny; ++j) {
            out[j * nx + i] = ctr/1E+5;
            ctr = ctr + 1;
        }
    }
}

float add_arr_val(float *im,int N)
{
    float tmp = 0;
    for (int i = 0; i < N; ++i)
        tmp += im[i];

    float out = tmp;
    return out;
}

double add_arr_val_dbl(float *im,int N)
{
    double tmp = 0;
    for (int i = 0; i < N; ++i)
        tmp += (double)(im[i]);

    return tmp;
}


__global__ void init_ones(float *d_in,int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        d_in[i] = 1.0;
    }
}

int main()
{
    // Define matrix size (using flattened array for most operations)
    int nx = 512;       // row size
    int ny = 512;       // column size
    int N = nx * ny;    // total size of flattened array

    // CPU section ========================================
    float *M; M = (float*)malloc(N * sizeof(float));    // create array pointer and allocate memory
    creatematrix(M,nx,ny);                            // create a test matrix of size nx * ny
    float cpu_out = add_arr_val(M,N);                  // CPU function
    double cpu_dbl_out = add_arr_val_dbl(M,N);
    // GPU and cuBLAS section ==============================
    float *d_M;
    HANDLE_ERROR(cudaMalloc(&d_M,N * sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(d_M,M,N * sizeof(float),cudaMemcpyHostToDevice));    // copy original array M to device as d_M

    // create array of all ones,size N for dot product
    float *d_ones;
    cudaMalloc(&d_ones,N * sizeof(float));

    // Max potential blocksize
    int minGridSize,blockSize,gridSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize,&blockSize,init_ones,0,N);
    gridSize = (N + blockSize - 1) / blockSize;
    init_ones<<<gridSize,blockSize>>>(d_ones,N);     // kernel launch to generate array of all 1's
    cudaDeviceSynchronize();

    float blas_out;                                                         // output on host variable
    cublasHandle_t handle;  cublasCreate(&handle);                          // initialize CUBLAS context
    cublasSdot(handle,N,d_M,1,d_ones,1,&blas_out);                    // Perform cublas single-precision dot product of (d_M . d_ones)
    cudaDeviceSynchronize();

    //print output from cpu and gpu sections
    printf("native output = %f\n",cpu_out);
    printf("native double output = %f\n",cpu_dbl_out);
    printf("cublas output = %f\n",blas_out);

    cublasDestroy(handle);
    free(M);
    cudaFree(d_M);
    cudaFree(d_ones);
}
$ nvcc -o t1784 t1784.cu -lcublas
$ ./t1784
native output        = 343590.437500
native double output = 343596.072960
cublas output        = 343596.062500
$

The reason the cuBLAS output is actually closer (*) is that it performs the summation in a different order than your host float code: it works in threadblocks and adds partial sums together before producing the final result.
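
To see why the ordering matters, here is a minimal host-only sketch, not the actual algorithm cuBLAS uses (the chunk size is arbitrary): summing the same data as one long running float total versus summing it in fixed-size chunks and then adding the chunk totals gives slightly different float results, and for this data the chunked version lands closer to the double reference.

#include <stdio.h>
#include <stdlib.h>

int main()
{
    const int N = 512 * 512;
    const int CHUNK = 256;                          // arbitrary chunk size, for illustration only
    float *v = (float*)malloc(N * sizeof(float));
    for (int i = 0; i < N; ++i)
        v[i] = (float)(i / 1E+5);                   // same set of values as creatematrix, order aside

    // one long running float sum (same ordering as add_arr_val)
    float running = 0.0f;
    for (int i = 0; i < N; ++i) running += v[i];

    // fixed-size partial sums, then a sum of the partials
    float chunked = 0.0f;
    for (int i = 0; i < N; i += CHUNK) {
        float partial = 0.0f;
        for (int j = i; j < i + CHUNK && j < N; ++j) partial += v[j];
        chunked += partial;
    }

    // double reference for comparison
    double ref = 0.0;
    for (int i = 0; i < N; ++i) ref += (double)v[i];

    printf("running float sum = %f\n", running);
    printf("chunked float sum = %f\n", chunked);
    printf("double reference  = %f\n", ref);
    free(v);
    return 0;
}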

As a side note, there is no need to use the l modifier with the %f printf format specifier; %f is already designed to handle both double and float correctly.
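
A tiny illustration of that point (the values here are arbitrary):

#include <stdio.h>

int main()
{
    float  f = 343596.0625f;
    double d = 343596.072960;
    // In a variadic call such as printf, a float argument is promoted to double,
    // so %f prints both types correctly; the l length modifier adds nothing here.
    printf("%f\n", f);
    printf("%f\n", d);
    return 0;
}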

For a detailed treatment of how error arises in summations, you may want to read this paper, in particular the section "Errors in summation" starting on p. 238.

(*) However, you should not assume this will always be the case, although I believe the partial-sums approach is generally more robust than a pure running sum (that is just an opinion, not something I can prove or wish to argue). In any event, the error is data-dependent: from an accuracy standpoint, one could construct a particular data sequence that makes the running-sum method look quite good. For large summations where accuracy matters most, you should probably use the highest precision available. Beyond that, you may want to read about Kahan summation in the paper linked above.
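
For reference, a minimal host-side sketch of Kahan (compensated) summation applied to the same data; the helper name kahan_sum is just for illustration, and it should be compiled without fast-math options so the compensation step is not optimized away.

#include <stdio.h>
#include <stdlib.h>

// Kahan (compensated) summation: carries a correction term that
// recovers the low-order bits lost in each float addition.
float kahan_sum(const float *v, int n)
{
    float sum = 0.0f;
    float c   = 0.0f;               // running compensation for lost low-order bits
    for (int i = 0; i < n; ++i) {
        float y = v[i] - c;         // apply the correction to the next term
        float t = sum + y;          // sum is large, y is small: low-order bits of y are lost
        c = (t - sum) - y;          // (t - sum) is what was actually added; subtracting y gives the loss
        sum = t;
    }
    return sum;
}

int main()
{
    const int N = 512 * 512;
    float *v = (float*)malloc(N * sizeof(float));
    for (int i = 0; i < N; ++i)
        v[i] = (float)(i / 1E+5);   // same set of values as creatematrix, order aside

    printf("kahan float sum = %f\n", kahan_sum(v, N));
    free(v);
    return 0;
}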