CUDA min reduction and index of a double array

Problem description

I am trying to implement a minimum-reduction algorithm taken from this answer. I cannot use the Thrust library or any other library in this project, so I have to stick with plain CUDA. The goal is to scale the code up to very large arrays; in my experience, and on my machine, Thrust is too slow for my purposes.

Below is the code, which generates a 4096-element double array in which each element equals its index (i.e. [0, 1, 2, 3, ..., 4096-1]) and a value of -1 is artificially placed at a given index (4091 in this case). However, the code does not seem to work correctly. I am compiling it on a Windows machine with CUDA 11 and Visual Studio 2017 for my GPU (CC = 5.0), using nvcc -w -arch=sm_50 input.cu -o output.exe
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
//#include <unistd.h>
//#include <sys/time.h>

#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif

#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif

#ifndef MIN_IDX
#define MIN_IDX(x,y,idx_x,idx_y) ((x < y) ? idx_x : idx_y)
#endif

#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y)   __mul24(x,y)
#else
#define int_mult(x,y)   x*y
#endif
#define inf 0x7f800000

bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}

unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory {
    __device__ inline operator T *() {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }

    __device__ inline operator const T *() const {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }
};

// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double> {
    __device__ inline operator double *() {
        extern __shared__ double __smem_d[];
        return (double *) __smem_d;
    }

    __device__ inline operator const double *() const {
        extern __shared__ double __smem_d[];
        return (double *) __smem_d;
    }
};

template<class T,unsigned int blockSize,bool nIsPow2>
__global__ void reduceMin6(T *g_idata,int *g_idxs,T *g_odata,int *g_oIdxs,unsigned int n) {
    T *sdata = SharedMemory<T>();
    int *sdataIdx = ((int *)sdata) + blockSize;

    //int *sdataIdx = SharedMemory<int>();

    // perform first level of reduction, reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    T myMin = 99999;
    int myMinIdx = -1;
    // we reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridDim).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n) {
        myMinIdx  = MIN_IDX(g_idata[i],myMin,g_idxs[i],myMinIdx);
        myMin = MIN(g_idata[i],myMin);

        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n){
            //myMin += g_idata[i + blockSize];
            myMinIdx  = MIN_IDX(g_idata[i + blockSize],myMin,g_idxs[i + blockSize],myMinIdx);
            myMin = MIN(g_idata[i + blockSize],myMin);
        }

        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = myMin;
    sdataIdx[tid] = myMinIdx;
    __syncthreads();
    // do reduction in shared mem
    if ((blockSize >= 2048) && (tid < 1024)) {
        //sdata[tid] = mySum = mySum + sdata[tid + 256];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512],myMin,sdataIdx[tid + 512],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 512],myMin);
    }

    __syncthreads();

    // do reduction in shared mem
    if ((blockSize >= 1024) && (tid < 512)) {
        //sdata[tid] = mySum = mySum + sdata[tid + 256];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512],myMin,sdataIdx[tid + 512],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 512],myMin);
    }

    __syncthreads();

    // do reduction in shared mem
    if ((blockSize >= 512) && (tid < 256)) {
        //sdata[tid] = mySum = mySum + sdata[tid + 256];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256],myMin,sdataIdx[tid + 256],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 256],myMin);
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        //sdata[tid] = myMin = myMin + sdata[tid + 128];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128],myMin,sdataIdx[tid + 128],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 128],myMin);
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        //sdata[tid] = myMin = myMin + sdata[tid + 64];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64],myMin,sdataIdx[tid + 64],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 64],myMin);
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300 )
    if (tid < 32) {
        // Fetch final intermediate sum from 2nd warp
        if (blockSize >= 64){
        //myMin += sdata[tid + 32];
            myMinIdx = MIN_IDX(sdata[tid + 32],myMin,sdataIdx[tid + 32],myMinIdx);
            myMin = MIN(sdata[tid + 32],myMin);
        }
        // Reduce final warp using shuffle
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            //myMin += __shfl_down(myMin,offset);
            int tempMyMinIdx = __shfl_down(myMinIdx,offset);
            double tempMyMin = __shfl_down(myMin,offset);

            myMinIdx = MIN_IDX(tempMyMin,myMin,tempMyMinIdx,myMinIdx);
            myMin = MIN(tempMyMin,myMin);
        }

    }
#else
    // fully unroll reduction within a single warp
    if ((blockSize >= 64) && (tid < 32))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 32];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32],myMin,sdataIdx[tid + 32],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 32],myMin);
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 16];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16],myMin,sdataIdx[tid + 16],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 16],myMin);
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 8];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8],myMin,sdataIdx[tid + 8],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 8],myMin);
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 4];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4],myMin,sdataIdx[tid + 4],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 4],myMin);
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 2];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2],myMin,sdataIdx[tid + 2],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 2],myMin);
    }

    __syncthreads();

    if ((blockSize >= 2) && ( tid < 1))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 1];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1],myMin,sdataIdx[tid + 1],myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 1],myMin);
    }

    __syncthreads();
#endif

    __syncthreads();
    // write result for this block to global mem
    if (tid == 0){
        g_odata[blockIdx.x] = myMin;
        g_oIdxs[blockIdx.x] = myMinIdx;
    }
}

void getNumBlocksAndThreads(int whichKernel,int n,int maxBlocks,int maxThreads,int &blocks,int &threads) {

    //get device capability,to avoid block/grid size exceed the upper bound
    cudaDeviceProp prop;
    int device;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop,device);

    if (whichKernel < 3) {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    } else {
        threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if ((float) threads * blocks
            > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
        printf("n is too large,please choose a smaller number!\n");
    }

    if (blocks > prop.maxGridSize[0]) {
        printf(
                "Grid size <%d> exceeds the device capability <%d>,set block size as %d (original %d)\n",blocks,prop.maxGridSize[0],threads * 2,threads);

        blocks /= 2;
        threads *= 2;
    }

    if (whichKernel == 6) {
        blocks = MIN(maxBlocks,blocks);
    }
}

template<class T>
void reduceMin(int size,int threads,int blocks,int whichKernel,T *d_idata,T *d_odata,int *idxs,int *oIdxs) {
    dim3 dimBlock(threads,1);
    dim3 dimGrid(blocks,1);

    // when there is only one warp per block,we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize =(threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); // shared mem. for the amount of double i have
    smemSize += threads*sizeof(int); // shared memory for the indexes
    if (isPow2(size)) {
        switch (threads) {
        case 2048:
            reduceMin6<T,2048,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 1024:
            reduceMin6<T,1024,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 512:
            reduceMin6<T,512,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 256:
            reduceMin6<T,256,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 128:
            reduceMin6<T,128,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 64:
            reduceMin6<T,64,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 32:
            reduceMin6<T,32,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 16:
            reduceMin6<T,16,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 8:
            reduceMin6<T,8,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 4:
            reduceMin6<T,4,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 2:
            reduceMin6<T,2,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 1:
            reduceMin6<T,1,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        }
    } else {
        switch (threads) {
        case 2048:
            reduceMin6<T,2048,false> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 1024:
            reduceMin6<T,1024,false> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        case 512:
            reduceMin6<T,512,false> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
            break;
        }
    }

}

template<class T>
void reduceMINcpu(T *data,int size,T *min,int *idx)
{
    *min = data[0];
    int min_idx = 0;
    T c = (T)0.0;

    for (int i = 1; i < size; i++)
    {
        T y = data[i];
        T t = MIN(*min,y);
        min_idx = MIN_IDX(*min,y,min_idx,i);
        (*min) = t;
    }

    *idx = min_idx;

    return;
}


// Instantiate the reduction function for 3 types
template void
reduceMin<int>(int size,int threads,int blocks,int whichKernel,int *d_idata,int *d_odata,int *idxs,int *oIdxs);

template void
reduceMin<float>(int size,int threads,int blocks,int whichKernel,float *d_idata,float *d_odata,int *idxs,int *oIdxs);

template void
reduceMin<double>(int size,int threads,int blocks,int whichKernel,double *d_idata,double *d_odata,int *idxs,int *oIdxs);

int main(){
    int num_els=8*8*8*8;

    int maxThreads = 128;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 1000000;

    double* d_in = NULL;
    double* d_out = NULL;
    int *d_idxs = NULL;
    int *d_oIdxs = NULL;

    printf("%d elements\n",num_els);
    printf("%d threads (max)\n",maxThreads);

    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(whichKernel,num_els,maxBlocks,maxThreads,numBlocks,numThreads);
    printf("%d numBlocks \n",numBlocks);
    printf("%d numThreads \n",numThreads);

    cudaMalloc((void **) &d_in,num_els * sizeof(double));
    cudaMalloc((void **) &d_idxs,num_els * sizeof(int));
    cudaMalloc((void **) &d_out,numBlocks * sizeof(double));
    cudaMalloc((void **) &d_oIdxs,numBlocks * sizeof(int));
    printf("Finished cudaMalloc");

    double* in = (double*) malloc(num_els * sizeof(double));
    int *idxs = (int*) malloc(num_els * sizeof(int));
    double* out = (double*) malloc(numBlocks * sizeof(double));
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int));

    for (int i = 0; i < num_els; i++) {
        in[i] = (double)i*1.0;
        idxs[i] = i;
    }
    in[num_els-5]=(double)-1.0;
    printf("Verify that the in array is correctly filled \n");
    for (int i = num_els-10; i < num_els; i++) {
   //
       printf("\n [%d] = %.4f",idxs[i],in[i]);
    }

    // copy data directly to device memory
    cudaMemcpy(d_in,in,num_els * sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(d_idxs,idxs,num_els * sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(d_out,out,numBlocks * sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(d_oIdxs,oIdxs,numBlocks * sizeof(int),cudaMemcpyHostToDevice);
    printf(" \n Finished cudamemcopy \n");

    reduceMin<double>(num_els,numThreads,numBlocks,whichKernel,d_in,d_out,d_idxs,d_oIdxs);

    cudaMemcpy(out,d_out,numBlocks * sizeof(double),cudaMemcpyDeviceToHost);
    cudaMemcpy(oIdxs,d_oIdxs,numBlocks * sizeof(int),cudaMemcpyDeviceToHost);

    for(int i=0; i< numBlocks; i++){
    //for(int i=numBlocks-20; i< numBlocks; i++)
       printf("\n Reduce MIN GPU idx: %d  value: %f",oIdxs[i],out[i]);
    }
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_idxs);

    free(in);
    free(out);
    return 0;

}

However, if I convert the code to find the minimum and its index for a float array, it works correctly. Here is the main of the float version (this is the only part that differs; certain variables and arrays are declared as float instead of double). It is compiled in the same way as before.

int main(){
    int num_els=8*8*8*8;

    int maxThreads = 128;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 1000000;

    float* d_in = NULL;
    float* d_out = NULL;
    int *d_idxs = NULL;
    int *d_oIdxs = NULL;

    printf("%d elements\n",num_els);
    printf("%d threads (max)\n",maxThreads);

    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(whichKernel,num_els,maxBlocks,maxThreads,numBlocks,numThreads);
    printf("%d numBlocks \n",numBlocks);
    printf("%d numThreads \n",numThreads);

    cudaMalloc((void **) &d_in,num_els * sizeof(float));
    cudaMalloc((void **) &d_idxs,num_els * sizeof(int));
    cudaMalloc((void **) &d_out,numBlocks * sizeof(float));
    cudaMalloc((void **) &d_oIdxs,numBlocks * sizeof(int));
    printf("Finished cudaMalloc");

    float* in = (float*) malloc(num_els * sizeof(float));
    int *idxs = (int*) malloc(num_els * sizeof(int));
    float* out = (float*) malloc(numBlocks * sizeof(float));
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int));

    for (int i = 0; i < num_els; i++) {
        in[i] = (float)i*1.0;
        idxs[i] = i;
    }
    in[num_els-5]=(float)-1.0;
    printf("Verify that the in array is correctly filled \n");
    for (int i = num_els-10; i < num_els; i++) {
       printf("\n [%d] = %.4f",idxs[i],in[i]);
    }

    // copy data directly to device memory
    cudaMemcpy(d_in,in,num_els * sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_idxs,idxs,num_els * sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(d_out,out,numBlocks * sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_oIdxs,oIdxs,numBlocks * sizeof(int),cudaMemcpyHostToDevice);
    printf(" \n Finished cudamemcopy \n");

    reduceMin<float>(num_els,numThreads,numBlocks,whichKernel,d_in,d_out,d_idxs,d_oIdxs);

    cudaMemcpy(out,d_out,numBlocks * sizeof(float),cudaMemcpyDeviceToHost);
    cudaMemcpy(oIdxs,d_oIdxs,numBlocks * sizeof(int),cudaMemcpyDeviceToHost);

    for(int i=0; i< numBlocks; i++){
       printf("\n Reduce MIN GPU idx: %d  value: %f",oIdxs[i],out[i]);
    }
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_idxs);

    free(in);
    free(out);
    return 0;

}

Solution

The problem is here:

int *sdataIdx = ((int *)sdata) + blockSize;

This line of code sets up a pointer so that the shared memory can be divided into two logical segments: one segment holds the data being reduced to find the minimum (the first segment), and the other holds the corresponding indices (the second segment, whose start is what this pointer actually points to).

To handle the various data types correctly, it should be:

int *sdataIdx = (int *)(((T *)sdata) + blockSize);

And since sdata is already a pointer of type T, we can simplify this to:

int *sdataIdx = (int *)(sdata + blockSize);

This reserves a chunk of shared memory large enough for the correct data type (double, in this case), followed by the int data. The reason it works for float is that float and int have the same size requirement.
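
For a concrete picture, here is a minimal sketch of the intended layout for T = double, using the variable names from the code above and ignoring the threads <= 32 special case in the host-side smemSize computation. The dynamic shared memory is meant to hold blockSize doubles followed by blockSize ints, so the host-side allocation and the device-side pointer split have to agree on that layout:

// host side (reduceMin): value segment first, then index segment
int smemSize = threads * sizeof(double) + threads * sizeof(int);

// device side (reduceMin6): skip blockSize doubles, not blockSize ints
double *sdata    = SharedMemory<double>();
int    *sdataIdx = (int *)(sdata + blockSize);

With the original ((int *)sdata) + blockSize, the index segment starts only blockSize * sizeof(int) bytes into the buffer, i.e. in the middle of the double values, so the stored indices overwrite part of the values being reduced.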

Also note that this works for float, double, and int reduction types. It won't necessarily work for other types.

This code also spits out quite a few compile-time warnings. These should not be ignored, but they are not the crux of the problem here.

Also note that, if desired, the minimum or maximum plus its index can be found with far fewer lines of code, for example here.
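
For reference, here is a minimal sketch of that kind of shorter approach (this is not the code behind the link above; the kernel name blockMinIdx and its launch parameters are made up for illustration, and it assumes blockDim.x is a power of two). Each block computes its own (min, index) pair with a grid-stride loop and a shared-memory tree reduction, and the handful of per-block results can then be finished on the host, just as main does above with out and oIdxs:

#include <cfloat>

// Per-block (min, index) reduction over a grid-stride loop.
// Dynamic shared memory: blockDim.x doubles followed by blockDim.x ints.
__global__ void blockMinIdx(const double *data, int n, double *oMin, int *oIdx)
{
    extern __shared__ unsigned char smem[];
    double *sVal = (double *)smem;                 // value segment
    int    *sIdx = (int *)(sVal + blockDim.x);     // index segment

    double v = DBL_MAX;
    int    k = -1;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
        if (data[i] < v) { v = data[i]; k = i; }
    }
    sVal[threadIdx.x] = v;
    sIdx[threadIdx.x] = k;
    __syncthreads();

    // tree reduction in shared memory, carrying the index along with the value
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s && sVal[threadIdx.x + s] < sVal[threadIdx.x]) {
            sVal[threadIdx.x] = sVal[threadIdx.x + s];
            sIdx[threadIdx.x] = sIdx[threadIdx.x + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        oMin[blockIdx.x] = sVal[0];
        oIdx[blockIdx.x] = sIdx[0];
    }
}

// example launch, reusing the buffers from main above:
// blockMinIdx<<<numBlocks, numThreads, numThreads * (sizeof(double) + sizeof(int))>>>(d_in, num_els, d_out, d_oIdxs);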