问题描述
我正在尝试实现一种最小化算法,该算法取自this answer。我不能在这个项目中使用推力库或其他库,因此我必须坚持使用纯CUDA。目的是将代码扩展到非常大的数组,根据我的经验以及在我的机器上,推力对于我的目的而言太慢了。
在下面找到代码,该代码生成一个4096个元素的double
数组,其中每个元素都等于其索引(即[0,1,2,3,....,4096-1]),其中人为地将-1的值添加到给定索引(在这种情况下为4091)。但是,该代码似乎无法正常工作。我正在使用cuda 11和带有nvcc -w -arch=sm_50 input.cu -o output.exe
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
//#include <unistd.h>
//#include <sys/time.h>
#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif
#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif
#ifndef MIN_IDX
#define MIN_IDX(x,y,idx_x,idx_y) ((x < y) ? idx_x : idx_y)
#endif
#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y) __mul24(x,y)
#else
#define int_mult(x,y) x*y
#endif
#define inf 0x7f800000
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory {
__device__ inline operator T *() {
extern __shared__ int __smem[];
return (T *) __smem;
}
__device__ inline operator const T *() const {
extern __shared__ int __smem[];
return (T *) __smem;
}
};
// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double> {
__device__ inline operator double *() {
extern __shared__ double __smem_d[];
return (double *) __smem_d;
}
__device__ inline operator const double *() const {
extern __shared__ double __smem_d[];
return (double *) __smem_d;
}
};
template<class T,unsigned int blockSize,bool nIsPow2>
__global__ void reduceMin6(T *g_idata,int *g_idxs,T *g_odata,int *g_oIdxs,unsigned int n) {
T *sdata = SharedMemory<T>();
int *sdataIdx = ((int *)sdata) + blockSize;
//int *sdataIdx = SharedMemory<int>();
// perform first level of reduction,// reading from global memory,writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
T myMin = 99999;
int myMinIdx = -1;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
myMinIdx = MIN_IDX(g_idata[i],myMin,g_idxs[i],myMinIdx);
myMin = MIN(g_idata[i],myMin);
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n){
//myMin += g_idata[i + blockSize];
myMinIdx = MIN_IDX(g_idata[i + blockSize],g_idxs[i + blockSize],myMinIdx);
myMin = MIN(g_idata[i + blockSize],myMin);
}
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = myMin;
sdataIdx[tid] = myMinIdx;
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 2048) && (tid < 1024)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512],sdataIdx[tid + 512],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 512],myMin);
}
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 1024) && (tid < 512)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512],myMin);
}
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256],sdataIdx[tid + 256],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 256],myMin);
}
__syncthreads();
if ((blockSize >= 256) && (tid < 128)) {
//sdata[tid] = myMin = myMin + sdata[tid + 128];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128],sdataIdx[tid + 128],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 128],myMin);
}
__syncthreads();
if ((blockSize >= 128) && (tid < 64)) {
//sdata[tid] = myMin = myMin + sdata[tid + 64];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64],sdataIdx[tid + 64],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 64],myMin);
}
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
if (tid < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64){
//myMin += sdata[tid + 32];
myMinIdx = MIN_IDX(sdata[tid + 32],sdataIdx[tid + 32],myMinIdx);
myMin = MIN(sdata[tid + 32],myMin);
}
// Reduce final warp using shuffle
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//myMin += __shfl_down(myMin,offset);
int tempMyMinIdx = __shfl_down(myMinIdx,offset);
double tempMyMin = __shfl_down(myMin,offset);
myMinIdx = MIN_IDX(tempMyMin,tempMyMinIdx,myMinIdx);
myMin = MIN(tempMyMin,myMin);
}
}
#else
// fully unroll reduction within a single warp
if ((blockSize >= 64) && (tid < 32))
{
//sdata[tid] = myMin = myMin + sdata[tid + 32];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 32],myMin);
}
__syncthreads();
if ((blockSize >= 32) && (tid < 16))
{
//sdata[tid] = myMin = myMin + sdata[tid + 16];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16],sdataIdx[tid + 16],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 16],myMin);
}
__syncthreads();
if ((blockSize >= 16) && (tid < 8))
{
//sdata[tid] = myMin = myMin + sdata[tid + 8];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8],sdataIdx[tid + 8],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 8],myMin);
}
__syncthreads();
if ((blockSize >= 8) && (tid < 4))
{
//sdata[tid] = myMin = myMin + sdata[tid + 4];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4],sdataIdx[tid + 4],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 4],myMin);
}
__syncthreads();
if ((blockSize >= 4) && (tid < 2))
{
//sdata[tid] = myMin = myMin + sdata[tid + 2];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2],sdataIdx[tid + 2],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 2],myMin);
}
__syncthreads();
if ((blockSize >= 2) && ( tid < 1))
{
//sdata[tid] = myMin = myMin + sdata[tid + 1];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1],sdataIdx[tid + 1],myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 1],myMin);
}
__syncthreads();
#endif
__syncthreads();
// write result for this block to global mem
if (tid == 0){
g_odata[blockIdx.x] = myMin;
g_oIdxs[blockIdx.x] = myMinIdx;
}
}
void getNumBlocksAndThreads(int whichKernel,int n,int maxBlocks,int maxThreads,int &blocks,int &threads) {
//get device capability,to avoid block/grid size exceed the upper bound
cudaDeviceProp prop;
int device;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop,device);
if (whichKernel < 3) {
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
} else {
threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
}
if ((float) threads * blocks
> (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
printf("n is too large,please choose a smaller number!\n");
}
if (blocks > prop.maxGridSize[0]) {
printf(
"Grid size <%d> exceeds the device capability <%d>,set block size as %d (original %d)\n",blocks,prop.maxGridSize[0],threads * 2,threads);
blocks /= 2;
threads *= 2;
}
if (whichKernel == 6) {
blocks = MIN(maxBlocks,blocks);
}
}
template<class T>
void reduceMin(int size,int threads,int blocks,int whichKernel,T *d_idata,T *d_odata,int *idxs,int *oIdxs) {
dim3 dimBlock(threads,1);
dim3 dimGrid(blocks,1);
// when there is only one warp per block,we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize =(threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); // shared mem. for the amount of double i have
smemSize += threads*sizeof(int); // shared memory for the indexes
if (isPow2(size)) {
switch (threads) {
case 2048:
reduceMin6<T,2048,true> <<<dimGrid,dimBlock,smemSize>>>(d_idata,idxs,d_odata,oIdxs,size);
break;
case 1024:
reduceMin6<T,1024,size);
break;
case 512:
reduceMin6<T,512,size);
break;
case 256:
reduceMin6<T,256,size);
break;
case 128:
reduceMin6<T,128,size);
break;
case 64:
reduceMin6<T,64,size);
break;
case 32:
reduceMin6<T,32,size);
break;
case 16:
reduceMin6<T,16,size);
break;
case 8:
reduceMin6<T,8,size);
break;
case 4:
reduceMin6<T,4,size);
break;
case 2:
reduceMin6<T,size);
break;
case 1:
reduceMin6<T,size);
break;
}
} else {
switch (threads) {
case 2048:
reduceMin6<T,false> <<<dimGrid,size);
break ;
case 1024:
reduceMin6<T,size);
break ;
case 512:
reduceMin6<T,size);
break;
}
}
}
template<class T>
void reduceMINcpu(T *data,int size,T *min,int *idx)
{
*min = data[0];
int min_idx = 0;
T c = (T)0.0;
for (int i = 1; i < size; i++)
{
T y = data[i];
T t = MIN(*min,y);
min_idx = MIN_IDX(*min,min_idx,i);
(*min) = t;
}
*idx = min_idx;
return;
}
// Instantiate the reduction function for 3 types
template void
reduceMin<int>(int size,int *d_idata,int *d_odata,int *oIdxs);
template void
reduceMin<float>(int size,float *d_idata,float *d_odata,int *oIdxs);
template void
reduceMin<double>(int size,double *d_idata,double *d_odata,int *oIdxs);
int main(){
int num_els=8*8*8*8;
int maxThreads = 128; // number of threads per block
int whichKernel = 6;
int maxBlocks = 1000000;
double* d_in = NULL;
double* d_out = NULL;
int *d_idxs = NULL;
int *d_oIdxs = NULL;
printf("%d elements\n",num_els);
printf("%d threads (max)\n",maxThreads);
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel,num_els,maxBlocks,maxThreads,numBlocks,numThreads);
printf("%d numBlocks \n",numBlocks);
printf("%d numThreads \n",numThreads);
cudamalloc((void **) &d_in,num_els * sizeof(double));
cudamalloc((void **) &d_idxs,num_els * sizeof(int));
cudamalloc((void **) &d_out,numBlocks * sizeof(double));
cudamalloc((void **) &d_oIdxs,numBlocks * sizeof(int));
printf("Finished cudamalloc");
double* in = (double*) malloc(num_els * sizeof(double));
int *idxs = (int*) malloc(num_els * sizeof(int));
double* out = (double*) malloc(numBlocks * sizeof(double));
int* oIdxs = (int*) malloc(numBlocks * sizeof(int));
for (int i = 0; i < num_els; i++) {
in[i] = (double)i*1.0;
idxs[i] = i;
}
in[num_els-5]=(double)-1.0;
printf("Verify that the in array is correctly filled \n");
for (int i = num_els-10; i < num_els; i++) {
//
printf("\n [%d] = %.4f",idxs[i],in[i]);
}
// copy data directly to device memory
cudamemcpy(d_in,in,num_els * sizeof(double),cudamemcpyHostToDevice);
cudamemcpy(d_idxs,num_els * sizeof(int),cudamemcpyHostToDevice);
cudamemcpy(d_out,out,numBlocks * sizeof(double),cudamemcpyHostToDevice);
cudamemcpy(d_oIdxs,numBlocks * sizeof(int),cudamemcpyHostToDevice);
printf(" \n Finished cudamemcopy \n");
reduceMin<double>(num_els,numThreads,whichKernel,d_in,d_out,d_idxs,d_oIdxs);
cudamemcpy(out,cudamemcpyDevicetoHost);
cudamemcpy(oIdxs,d_oIdxs,cudamemcpyDevicetoHost);
for(int i=0; i< numBlocks; i++){
//for(int i=numBlocks-20; i< numBlocks; i++)
printf("\n Reduce MIN GPU idx: %d value: %f",oIdxs[i],out[i]);
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_idxs);
free(in);
free(out);
return 0;
}
但是,如果我转换代码以最小化并找到float
数组的索引,则它可以正常工作。这是浮点版本的main
(这是唯一的不同部分,某些变量和数组被替换为“ float”而不是“ double”)。编译方式与前者相同。
int main(){
int num_els=8*8*8*8;
int maxThreads = 128; // number of threads per block
int whichKernel = 6;
int maxBlocks = 1000000;
float* d_in = NULL;
float* d_out = NULL;
int *d_idxs = NULL;
int *d_oIdxs = NULL;
printf("%d elements\n",num_els * sizeof(float));
cudamalloc((void **) &d_idxs,numBlocks * sizeof(float));
cudamalloc((void **) &d_oIdxs,numBlocks * sizeof(int));
printf("Finished cudamalloc");
float* in = (float*) malloc(num_els * sizeof(float));
int *idxs = (int*) malloc(num_els * sizeof(int));
float* out = (float*) malloc(numBlocks * sizeof(float));
int* oIdxs = (int*) malloc(numBlocks * sizeof(int));
for (int i = 0; i < num_els; i++) {
in[i] = (float)i*1.0;
idxs[i] = i;
}
in[num_els-5]=(float)-1.0;
printf("Verify that the in array is correctly filled \n");
for (int i = num_els-10; i < num_els; i++) {
//
printf("\n [%d] = %.4f",num_els * sizeof(float),numBlocks * sizeof(float),cudamemcpyHostToDevice);
printf(" \n Finished cudamemcopy \n");
reduceMin<float>(num_els,out[i]);
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_idxs);
free(in);
free(out);
return 0;
}
解决方法
问题在这里:
int *sdataIdx = ((int *)sdata) + blockSize;
这行代码设置了一个指针,以便可以将共享内存划分为2个逻辑段,一个逻辑段保存要减少的数据以找到最小值(第一段),另一个逻辑段保存相应的索引(第二个段)。片段-该指针实际指向的开始)。
要正确处理各种数据类型,应为:
int *sdataIdx = (int *)(((T *)sdata) + blockSize);
由于sdata
已经是类型T
的指针,我们可以简化:
int *sdataIdx = (int *)(sdata + blockSize);
这将保留一块共享内存,其大小足以容纳正确的数据类型(在这种情况下为两倍),然后是int
数据类型。 float
起作用的原因是因为float
和int
具有相同的大小要求。
还要注意,这适用于float
,double
和int
归约类型。不一定适用于其他类型。
此代码还会吐出许多编译时警告。这些不应该被忽略。但这不是问题的症结所在。
还请注意,如果需要,可以用更少的代码行来找到min或max plus索引,例如here。