问题描述
我试图更好地了解cuda内核中的动态共享内存。我复制了此example并对其进行了修改,以允许我更改工作项的数量。它适用于
代码在讨论过程中已更新
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#define gpuErrChk(ans) { gpuAssert((ans),__FILE__,__LINE__); }
inline void gpuAssert(cudaError_t code,const char *file,int line,bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n",cudaGetErrorString(code),file,line);
if (abort) exit(code);
}
}
__global__ void dynamicAddNine(int *d,int n) {
extern __shared__ int s[];
int t = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
s[t] = d[t] + 9;
__syncthreads();
d[t] = s[t];
}
static void parseArgs(int argc,char **argv) {
if (argc != 2) {
printf("Usage : %s <N>,where N (+ve int) is the required array size\n",argv[0]);
exit(1);
}
if (atoi(argv[1]) < 0 ) {
printf("Usage : %s <N>,argv[0]);
exit(1);
}
}
int main(int argc,char *argv[]) {
parseArgs(argc,argv);
const int n = atoi(argv[1]);
int T = 1024; // maximum number of threads
if (n < 1024) { T = n;} // if no elements required < 1024
int a[n],r[n],d[n];
for (int i = 0; i < n; i++) {
a[i] = i;
r[i] = i+9;
d[i] = 0;
}
int *d_d;
gpuErrChk(cudaMalloc(&d_d,n*sizeof(int)));
int B = (int)ceil((float)n/(float)T); //calculate no of blocks required
// run version with dynamic shared memory
std::cout << "B: " << B << ",T: " << T << std::endl;
gpuErrChk(cudaMemcpy(d_d,a,n*sizeof(int),cudaMemcpyHostToDevice));
dynamicAddNine<<<B,T,n*sizeof(int)>>>(d_d,n);
gpuErrChk(cudaPeekAtLastError());
gpuErrChk(cudaDeviceSynchronize());
gpuErrChk(cudaMemcpy(d,d_d,cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) {
if (d[i] != r[i]) printf("Error: d[%d] != r[%d] (%d,%d)\n",i,d[i],r[i]);
}
}
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)