问题描述
已为我分配了一个简单的CUDA程序,以计算值在1到256之间的整数输入的直方图。由于某些原因,当我尝试将结果从设备存储器复制到主机存储器时,失败。它给我一个错误11(“ cudaErrorInvalidValue”)。据我所知,这方面的一切都应该起作用。有人可以看看我告诉我什么吗?
#define NUM_THREADS 256
int main(int argc,char* argv[]) {
int histogram[NUM_THREADS + 1] = { 0 };
int values[] = { 15,1,2,3,4,5,6,7,8,9,10,10 };
int *dev_histogram = 0;
int *dev_values = 0;
memory_setup(dev_histogram,values,dev_values,values[0]);
gpu_operations(histogram,dev_histogram,values[0]);
for (int i = 0; i < 25; i++) {
printf("%d : %d\n",i,histogram[i]);
}
return 0;
}
int memory_setup(int* dev_histogram,int* values_arr,int* dev_values_arr,int num_values) {
cudaError_t cudaStatus;
cudaStatus = cudamalloc((void**)&dev_histogram,(NUM_THREADS + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA memory allocation operation Failed\n");
exit(-1);
}
cudaStatus = cudamalloc((void**)&dev_values_arr,num_values * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA memory allocation operation Failed\n");
exit(-1);
}
cudaStatus = cudamemcpy(dev_values_arr,values_arr,num_values * sizeof(int),cudamemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA memory copying operation Failed\n");
exit(-1);
}
return 1;
}
void gpu_operations(int* histogram,int* dev_histogram,int num_values) {
cudaError_t cudaStatus;
int num_blocks = num_values / NUM_THREADS + num_values % NUM_THREADS != 0;
kernel_histogram <<<num_blocks,NUM_THREADS>>>(dev_histogram,dev_values_arr);
////////////////////////////////////////////////////////////////////////////////////
cudaStatus = cudamemcpy(histogram,(NUM_THREADS + 1) * sizeof(int),cudamemcpyDevicetoHost);
////////////////////////////////////////////////////////////////////////////////////
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA memory copying operation Failed4,%d \n",cudaStatus);
return;
}
cudaFree(dev_histogram);
cudaFree(dev_values_arr);
}
__global__ void kernel_histogram(int* dev_histogram,int* dev_values_arr) {
__shared__ int temp[NUM_THREADS + 1];
temp[threadIdx.x] = 0;
__syncthreads();
int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
int thread_value = dev_values_arr[thread_id];
atomicAdd(&temp[thread_value],1);
__syncthreads();
atomicAdd(&(dev_histogram[threadIdx.x]),temp[threadIdx.x]);
}
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)