CUDA cudaMemcpy问题

问题描述

已为我分配了一个简单的CUDA程序,以计算值在1到256之间的整数输入的直方图。由于某些原因,当我尝试将结果从设备存储器复制到主机存储器时,失败。它给我一个错误11(“ cudaErrorInvalidValue”)。据我所知,这方面的一切都应该起作用。有人可以看看我告诉我什么吗?

#define NUM_THREADS 256

int main(int argc,char* argv[]) {

    int histogram[NUM_THREADS + 1] = { 0 };
    int values[] = { 15,1,2,3,4,5,6,7,8,9,10,10 };
    int *dev_histogram = 0;
    int *dev_values = 0;

    memory_setup(dev_histogram,values,dev_values,values[0]);
    gpu_operations(histogram,dev_histogram,values[0]);

    for (int i = 0; i < 25; i++) {
        printf("%d : %d\n",i,histogram[i]);
    }

    return 0;
}

int memory_setup(int* dev_histogram,int* values_arr,int* dev_values_arr,int num_values) {

    cudaError_t cudaStatus;

    cudaStatus = cudamalloc((void**)&dev_histogram,(NUM_THREADS + 1) * sizeof(int));

    if (cudaStatus != cudaSuccess) {

        printf("ERROR: CUDA memory allocation operation Failed\n");
        exit(-1);
    }

    cudaStatus = cudamalloc((void**)&dev_values_arr,num_values * sizeof(int));

    if (cudaStatus != cudaSuccess) {

        printf("ERROR: CUDA memory allocation operation Failed\n");
        exit(-1);
    }

    cudaStatus = cudamemcpy(dev_values_arr,values_arr,num_values * sizeof(int),cudamemcpyHostToDevice);

    if (cudaStatus != cudaSuccess) {

        printf("ERROR: CUDA memory copying operation Failed\n");
        exit(-1);
    }

    return 1;
}

void gpu_operations(int* histogram,int* dev_histogram,int num_values) {

    cudaError_t cudaStatus;
    int num_blocks = num_values / NUM_THREADS + num_values % NUM_THREADS != 0;
    
    kernel_histogram <<<num_blocks,NUM_THREADS>>>(dev_histogram,dev_values_arr);

    ////////////////////////////////////////////////////////////////////////////////////
    cudaStatus = cudamemcpy(histogram,(NUM_THREADS + 1) * sizeof(int),cudamemcpyDevicetoHost);
    ////////////////////////////////////////////////////////////////////////////////////

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA memory copying operation Failed4,%d \n",cudaStatus);
        return;
    }

    cudaFree(dev_histogram);
    cudaFree(dev_values_arr);
}

__global__ void kernel_histogram(int* dev_histogram,int* dev_values_arr) {

    __shared__ int temp[NUM_THREADS + 1];

    temp[threadIdx.x] = 0;
    __syncthreads();

    int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    int thread_value = dev_values_arr[thread_id];

    atomicAdd(&temp[thread_value],1);
    __syncthreads();

    atomicAdd(&(dev_histogram[threadIdx.x]),temp[threadIdx.x]);
}

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)