问题描述
__#include <math.h>
#include <numeric>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
__global__ void kernel_histogram(int* dev_histogram,int* dev_values_arr,unsigned int size) {
__shared__ int temp[BLOCK_SIZE + 1];
int thread_id,thread_value;
thread_id = threadIdx.x + blockIdx.x * blockDim.x;
if (thread_id >= size) {
return;
}
temp[threadIdx.x + 1] = 0;
__syncthreads();
thread_value = dev_values_arr[thread_id];
atomicAdd(&temp[thread_value],1);
__syncthreads();
atomicAdd(&(dev_histogram[threadIdx.x + 1]),temp[threadIdx.x + 1]);
}
int* histogram_cuda(int* values_arr,int size) {
int num_blocks = size / BLOCK_SIZE;
int* dev_histogram = 0;
int* dev_values_arr = 0;
int* histogram = (int*)malloc((BLOCK_SIZE + 1) * sizeof(int));
cudaError_t cudaStatus;
if (size % BLOCK_SIZE != 0) {
num_blocks = num_blocks + 1;
}
// allocate histogram and values_arr device memories
cudaStatus = cudamalloc((void**)&dev_histogram,(BLOCK_SIZE + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudamalloc() operation Failed - %s\n",cudaGetErrorString(cudaStatus));
exit(-1);
}
cudaStatus = cudamemset(dev_histogram,(BLOCK_SIZE + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudamemset() operation Failed - %s\n",cudaGetErrorString(cudaStatus));
exit(-1);
}
cudaStatus = cudamalloc((void**)&dev_values_arr,size * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudamalloc() operation Failed - %s\n",cudaGetErrorString(cudaStatus));
exit(-1);
}
// copy values_arr memory in host to device
cudaStatus = cudamemcpy(dev_values_arr,values_arr,size * sizeof(int),cudamemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudamemcpy() operation Failed - %s\n",cudaGetErrorString(cudaStatus));
exit(-1);
}
printf("the number of blocks is %d\n\n",num_blocks);
// calculate histogram on the gpu
kernel_histogram << <num_blocks,BLOCK_SIZE >> > (dev_histogram,dev_values_arr,size);
// copy histogram memory in device to host
cudaStatus = cudamemcpy(histogram,dev_histogram,(BLOCK_SIZE + 1) * sizeof(int),cudamemcpyDevicetoHost);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudamemcpy() operation Failed - %s\n",cudaGetErrorString(cudaStatus));
exit(-1);
}
// free device memory
cudaFree(dev_histogram);
cudaFree(dev_values_arr);
return histogram;
}
int main(int argc,char* argv[]) {
unsigned int size = 21;
int* histogram;
int values_arr[] = { 2,2,4,5,7,19,20,21,100,256 };
histogram = histogram_cuda(values_arr,size);
for (int i = 1; i < BLOCK_SIZE + 1; i++) {
if (histogram[i] > 0) {
printf("%d : %d\n",i,histogram[i]);
}
}
}
直方图用于记录输入中存在的值的数量,允许的值是1到256。每个块最多具有256个线程。我试图将整个块的总线程数限制为,以便每个线程在直方图中记录一个值的出现。
如果我使用“ values_arr = {2,2,2,2,2,2,2,4,4,5,5,5,5,7,7,7,7,19,20,21,100, 256}“,表示大小为21,我得到:
2:7 4:1 5:4 7:4 19:1 20:1 21:1
我试图做到这一点,以便每个值都由一个线程记录,而所有无用的线程都将被丢弃。另外,您发现的任何其他问题以及以最佳方式实现此目的的任何建议也将不胜感激。谢谢!
解决方法
在问题代码的新版本中,您有两个条件执行的__syncthreads()
调用,它们是illegal in CUDA,并且容易死锁或产生不确定的行为,具体取决于您拥有的硬件和用例。
如果我这样修改内核:
__global__ void kernel_histogram(int* dev_histogram,int* dev_values_arr,unsigned int size) {
__shared__ int temp[BLOCK_SIZE + 1];
int thread_id,thread_value;
thread_id = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x + 1] = 0;
// Synchronization is unconditional
__syncthreads();
// Load is performed conditionally
if (thread_id < size) {
thread_value = dev_values_arr[thread_id];
atomicAdd(&temp[thread_value],1);
}
// Synchronization is unconditional
__syncthreads();
atomicAdd(&(dev_histogram[threadIdx.x + 1]),temp[threadIdx.x + 1]);
}
我得到以下输出:
the number of blocks is 1
2 : 7
4 : 1
5 : 4
7 : 4
19 : 1
20 : 1
21 : 1
100 : 1
256 : 1
这看起来更像是我的预期。