CUDA 共享内存 vs 全局内存，可能会加速

问题描述

我相信我的 CUDA 应用程序可能会受益于共享内存，以便将数据保持在 GPU 内核附近。现在，我有一个内核，我将一个指向先前分配的设备内存块的指针和一些常量传递给它。内核完成后，设备内存包括结果，该结果被复制到主机内存。该方案完美运行，并与在 CPU 上运行的相同算法进行了交叉检查。

文档清楚地表明全局内存比共享内存慢得多，访问延迟也更高，但无论采用哪种方式获得最佳性能，您都应该让您的线程合并并对齐任何访问。我的 GPU 具有计算能力 6.1“Pascal”，每个线程块有 48 kiB 的共享内存和 2 GiB DRAM。如果我重构我的代码以使用共享内存，我如何确保避免银行冲突？

Shared memory 被组织在 32 个 bank 中，因此来自同一块的 32 个线程可以同时访问不同的 bank，而无需等待。假设我从上面获取内核，启动一个内核配置，其中包含一个块和该块中的 32 个线程，并在内核外静态分配 48 kiB 的共享内存。此外，每个线程只会在（共享）内存中读取和写入相同的单个内存位置，这特定于我正在研究的算法。鉴于此，我将访问这 32 个共享内存位置，偏移量为 48 kiB / 32 banks / sizeof(double)，等于 192：

__shared__ double cache[6144];

__global__ void kernel(double *buf_out,double a,double b,double c)
{
    for(...)
    {
       // Perform calculation on shared memory
       cache[threadIdx.x * 192] = ...
    }

    // Write result to global memory
    buf_out[threadIdx.x] = cache[threadIdx.x * 192];
}

我的推理：当 threadIdx.x 从 0 到 31 运行时，偏移量加上 cache 是 double 确保每个线程将访问不同银行的第一个元素，在同一时间。我还没有修改和测试代码，但这是对齐 SM 访问的正确方法吗？

MWE 添加： 这是算法的原始 CPU 到 CUDA 端口，仅使用全局内存。 Visual Profiler 报告内核执行时间为 10.3 秒。环境：Win10、MSVC 2019、x64 Release Build、CUDA v11.2。

#include "cuda_runtime.h"

#include <iostream>
#include <stdio.h>

#define _USE_MATH_DEFINES
#include <math.h>
    

__global__ void kernel(double *buf,double SCREEN_STEP_SIZE,double APERTURE_RADIUS,double APERTURE_STEP_SIZE,double SCREEN_DIST,double WAVE_NUMBER)
{   
    double z,y,y_max;

    unsigned int tid = threadIdx.x/* + blockIdx.x * blockDim.x*/;
    
    double Z = tid * SCREEN_STEP_SIZE,Y = 0;

    double temp = WAVE_NUMBER / SCREEN_DIST;


    // Make sure the per-thread accumulator is zero before we begin
    buf[tid] = 0;

    for (z = -APERTURE_RADIUS; z <= APERTURE_RADIUS; z += APERTURE_STEP_SIZE)
    {
        y_max = sqrt(APERTURE_RADIUS * APERTURE_RADIUS - z * z);

        for (y = -y_max; y <= y_max; y += APERTURE_STEP_SIZE)
        {
            buf[tid] += cos(temp * (Y * y + Z * z));
        }
    }   
}


int main(void)
{
    double *dev_mem;
    double *buf = NULL;
    cudaError_t cudaStatus;

    unsigned int screen_elems = 1000;


    if ((buf = (double*)malloc(screen_elems * sizeof(double))) == NULL)
    {
        printf("Could not allocate memory...");
        return -1;
    }

    memset(buf,screen_elems * sizeof(double));


    if ((cudaStatus = cudaMalloc((void**)&dev_mem,screen_elems * sizeof(double))) != cudaSuccess)
    {
        printf("cudaMalloc failed with code %u",cudaStatus);
        return cudaStatus;
    }


    kernel<<<1,1000>>>(dev_mem,1e-3,5e-5,50e-9,10.0,2 * M_PI / 5e-7);

    cudaDeviceSynchronize();

    if ((cudaStatus = cudaMemcpy(buf,dev_mem,screen_elems * sizeof(double),cudaMemcpyDeviceToHost)) != cudaSuccess)
    {
        printf("cudaMemcpy failed with code %u",cudaStatus);
        return cudaStatus;
    }


    cudaFree(dev_mem);

    cudaDeviceReset();

    free(buf);

    return 0;
}

下面的内核改为使用共享内存，执行时间大约为 10.6 秒，再次在 Visual Profiler 中测量：

__shared__ double cache[1000];


__global__ void kernel(double *buf,y_max;

    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    double Z = tid * SCREEN_STEP_SIZE,Y = 0;

    double temp = WAVE_NUMBER / SCREEN_DIST;


    // Make sure the per-thread accumulator is zero before we begin
    cache[tid] = 0;

    for (z = -APERTURE_RADIUS; z <= APERTURE_RADIUS; z += APERTURE_STEP_SIZE)
    {
        y_max = sqrt(APERTURE_RADIUS * APERTURE_RADIUS - z * z);

        for (y = -y_max; y <= y_max; y += APERTURE_STEP_SIZE)
        {
            cache[tid] += cos(temp * (Y * y + Z * z));
        }
    }   

    buf[tid] = cache[tid];
}

循环中最里面的一行通常会执行几百万次，具体取决于传递给内核的五个常量。因此，我希望片上共享内存版本的速度要快得多，而不是打击片外全局内存，但显然事实并非如此——我错过了什么？

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

cuda gpgpu