如何在 OpenCL 中将数据从设备内存子缓冲区传输到主机程序?

问题描述

我正在通过 Matthew Scarpino 的《OpenCL in Action》一书学习 OpenCL。第 3 章介绍了缓冲区(第 45-47 页)和子缓冲区(第 47-48 页)的组织方式。第 47 页给出的创建子缓冲区的代码示例并不准确,这一点在另外两个相关问题(原文链接“here”和“here”)中已有说明。我更进一步,决定研究如何将存储在子缓冲区中的值传回主机程序。

我的主机程序将整数数组 iaArray1[5] = { 1,2,3,4,5 } 传入内核,这一步通过缓冲区 memObjArray1 完成。内核把数组的每个元素加上常数 2,由 iaArray1 得到数组 ipaArray2,其值为 { 3,4,5,6,7 }。输出缓冲区 memObjArray2 用于把 ipaArray2 的全部值从设备传回主机程序。接着,基于 memObjArray2 缓冲区创建子缓冲区 memObjSubArray,并尝试把数据从设备内存中的子缓冲区 memObjSubArray 传回主机程序。

我相信将数据从内核传输到宿主程序的机制对于缓冲区和子缓冲区都是相同的。为此,我使用了相同的函数 clEnqueueReadBuffer(),但程序给出了错误消息。我究竟做错了什么? 应该使用什么函数将数据从设备内存子缓冲区传输到主机程序?

内核函数如下:

__kernel void good (global int* iaArray1,global int* iaArray2)
{
    int i=get_global_id(0);
    iaArray2[i]=iaArray1[i]+2;
}

这是我的程序代码。所提供的程序有几个简化。首先,出口分支已被简化以缩短代码。其次,原始程序被设计用于处理多个 cl 文件,因此一些变量是数组。

#include <CL\cl.h>
#include <stdio.h>
#include <stdlib.h>

#define PROGRAM_FILE_1 "good.cl"
//#define PROGRAM_FILE_2 "bad.cl"
//#define PROGRAM_FILE_3 "setminusone.cl"
#define NUM_OF_FILES 1

int main(){
    cl_platform_id *platforms;
    cl_uint numOfPlatforms;
    cl_int status;
    cl_device_id *devices;
    cl_uint numOfDevices;
    char caDeviceName[500];
    cl_context context;

    const char * kcpaFileName[NUM_OF_FILES] = { PROGRAM_FILE_1};
    FILE * pProgramHandler;
    char * cpaProgramBuffer[NUM_OF_FILES];
    size_t saProgramSize[NUM_OF_FILES] = { 0};
    cl_uint numOfEnters[NUM_OF_FILES] = { 0};

    cl_program program;
    const char kcaOptions[] = "-cl-finite-math-only -cl-no-signed-zeros";
    size_t sLogSize = 0;
    char * cpProgramLog;

    cl_uint numOfKernels = 0;
    cl_kernel * kernels;
    char caKernelName[20];

    cl_command_queue cmdQueue0;

    printf("Establishing number of available platforms... ");
    status = clGetPlatformIDs(NULL,NULL,&numOfPlatforms);
    if (status < 0){
        printf("FAIL to establish platform(s)!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nEstablised %u platform(s).\nInitializing platform(s)... ",numOfPlatforms);
    platforms = (cl_platform_id *)malloc(numOfPlatforms*sizeof(cl_platform_id));
    status = clGetPlatformIDs(numOfPlatforms,platforms,NULL); //
    if (status < 0){
        printf("FAIL to initialize platform(s)!> %d\n",status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nEstablishing devices... ");
    status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL,&numOfDevices);
    if (status < 0){
        printf("FAIL to establish device(s)!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nEstablished %u device(s).\nInitializing device(s)... ",numOfDevices);
    devices = (cl_device_id *)malloc(numOfDevices*sizeof(cl_device_id));
    status = clGetDeviceIDs(platforms[0],numOfDevices,devices,NULL);
    if (status < 0){
        printf("FAIL to initialize devices(s)!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.");
    for (int i = 0; i < numOfDevices; i++){
        status = clGetDeviceInfo(devices[i],CL_DEVICE_NAME,sizeof(caDeviceName),caDeviceName,NULL);
        if (status < 0){
            printf("FAIL to read device #%d name!> %d\n",i,status);
            system("PAUSE");
            exit(1);
        }
        printf("\nDevice #%d is \"%s\".",caDeviceName);
    }


    printf("\nCreating context... ");
    context = clCreateContext(NULL,&status);
    if (status < 0){
        printf("FAIL to create context!> %d\n",status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nReading source code from file(s)... ");
    for (int i = 0; i < NUM_OF_FILES; i++){
        pProgramHandler = fopen(kcpaFileName[i],"r");
        if (pProgramHandler == NULL){
            printf("FAIL to open file \"%s\"!> %d\n",kcpaFileName[i],status);
            system("PAUSE");
            exit(1);
        }
        fseek(pProgramHandler,SEEK_END);
        saProgramSize[i] = ftell(pProgramHandler);
        rewind(pProgramHandler);
        cpaProgramBuffer[i] = (char*)malloc(sizeof(char)*saProgramSize[i] + 1);
        fread(cpaProgramBuffer[i],sizeof(char),saProgramSize[i],pProgramHandler);
        cpaProgramBuffer[i][saProgramSize[i]] = '\0';
        fclose(pProgramHandler);
        for (int j = 0; j < saProgramSize[i]; j++){
            if ((char)cpaProgramBuffer[i][j] == (char)10){
                numOfEnters[i]++;
            }
        }
        saProgramSize[i] = saProgramSize[i] - numOfEnters[i];
        cpaProgramBuffer[i][saProgramSize[i]] = '\0';
    }
    printf("OK.\nCreating program from source code... ");
    program = clCreateProgramWithSource(context,NUM_OF_FILES,(const char **)cpaProgramBuffer,(const size_t *)saProgramSize,&status);
    if (status < 0){
        printf("FAIL to create program!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nBuilding program... ");
    status = clBuildProgram(program,1,kcaOptions,NULL);//,if (status < 0){
        printf("FAIL to build program.\n...Genetating log...");
        for (int i = 0; i < NUM_OF_FILES; i++){
            printf("\nCode from file \"%s\":\n%s",cpaProgramBuffer[i]);
        }
        clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,&sLogSize);
        cpProgramLog = (char*)malloc(sizeof(char)*sLogSize + 1);
        cpProgramLog[sLogSize] = '\0';
        clGetProgramBuildInfo(program,sLogSize + 1,cpProgramLog,NULL);
        printf("\nLog length is %d.\nLog:\n%s\n> %d\n",sLogSize,status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nDetermining number of kernels... ");
    status = clCreateKernelsInProgram(program,&numOfKernels);
    if (status < 0){
        printf("FAIL to determine number of kernels!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nDetermined %d kernel(s):",numOfKernels);
    kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*numOfKernels);
    clCreateKernelsInProgram(program,numOfKernels,kernels,NULL);
    for (int i = 0; i < numOfKernels; i++){
        clGetKernelInfo(kernels[i],CL_KERNEL_FUNCTION_NAME,sizeof(caKernelName),caKernelName,NULL);
        printf("\nKernel \"%s\" indexed at %d.",i);
    }

    printf("\nCreating command queue... ");
    cmdQueue0 = clCreateCommandQueue(context,&status);
    if (status < 0){
        printf("FAIL to create command queue!> %d\n",status);
        system("PAUSE");
        exit(1);
    }

    /*Data,buffers and subbuffers*/
    int iaArray1[5] = { 1,5 };
    printf("\nPrinting out the initial array:\n");
    for (int i = 0; i < 5; i++){
        printf("%d ",iaArray1[i]);
    }
    printf("\nCreating buffers for kernels[0]... ");
    int* ipaArray2 = (int*)malloc(5 * sizeof(int));
    cl_mem memObjArray1 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(iaArray1),&iaArray1,&status);
    if (status < 0){
        printf("\nFAIL to create memObjArray1 buffer!> %d \n",status);
        system("PAUSE");
        exit(1);
    }
    cl_mem memObjArray2 = clCreateBuffer(context,CL_MEM_WRITE_ONLY,&status);
    if (status < 0){
        printf("\nFAIL to create memObjArray2 buffer!> %d \n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nSetting arguments for kernels[0]... ");
    status = clSetKernelArg(kernels[0],sizeof(cl_mem),&memObjArray1);
    if (status < 0){
        printf("\nFAIL to set memObjArray1 argument at kernels[0]!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    status = clSetKernelArg(kernels[0],&memObjArray2);
    if (status < 0){
        printf("\nFAIL to set memObjArray2 argument at kernels[0]!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nExecuting kernels[0]... ");
    size_t tGlobal_item_size = 5;   //?
    size_t tLocal_item_size = 1;    //?
    status = clEnqueueNDRangeKernel(cmdQueue0,kernels[0],&tGlobal_item_size,&tLocal_item_size,NULL);
    if (status < 0){
        printf("\nFAIL to enqueue kernels[0] into cmdQueue0!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0,memObjArray2,CL_TRUE,5 * sizeof(int),ipaArray2,NULL);
    if (status < 0){
        printf("FAIL to copy results from device to host!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out the result array:\n");
    for (int i = 0; i < 5; i++){
        printf("%d ",ipaArray2[i]);
    }

    printf("\nCreating subbuffer... ");
    int iQuantity = 2;
    int iShift = 2;
    typedef struct _cl_buffer_region{
        size_t size;
        size_t origin;
    } cl_buffer_region;
    cl_buffer_region stRegion;
    stRegion.size = iQuantity * sizeof(int);
    stRegion.origin = iShift * sizeof(int);
    cl_mem memObjSubArray = clCreateSubBuffer(memObjArray2,CL_BUFFER_CREATE_TYPE_REGION,&stRegion,&status);
    if (status < 0){
        printf("FAIL to create subbuffer!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory subbuffer to host array... ");
    int* ipaSubArray = (int*)malloc(iQuantity*sizeof(int));
    status = clEnqueueReadBuffer(cmdQueue0,memObjSubArray,iQuantity*sizeof(int),ipaSubArray,status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out the result array:\n");
    for (int i = 0; i < iQuantity; i++){
        printf("%d ",ipaSubArray[i]);
    }


    printf("OK.\n...Releasing resources... ");
    clReleaseMemObject(memObjArray1);
    clReleaseMemObject(memObjArray2);
    clReleaseMemObject(memObjSubArray);

    clReleaseCommandQueue(cmdQueue0);
    free(kernels);
    clReleaseProgram(program);
    for (int i = 0; i < NUM_OF_FILES; i++){
        free(cpaProgramBuffer[i]);
    }
    clReleaseContext(context);
    free(devices);
    free(platforms);
    printf("OK.\nEnd of program. Bey!\n");
    system("PAUSE");
}

Program execution LOG-file

解决方法

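似乎没有专门用于把设备上子缓冲区内存对象中的数据传输到主机程序内存的函数。不过,子缓冲区数据无法直接查看的问题,可以借助一个普通缓冲区和数据复制函数 clEnqueueCopyBuffer() 来解决,其规范见 Khronos 的 OpenCL 官方文档(原文链接“here”)。它的第四个输入参数指定数据在源缓冲区中的起始偏移量,第五个输入参数指定数据在目标缓冲区中的偏移量,第六个参数指定要复制的数据量(偏移量和数据量均以字节为单位)。(注:按照 OpenCL 规范,clEnqueueReadBuffer() 本身也接受子缓冲区对象;读取失败的常见原因是子缓冲区的 origin 没有按设备的 CL_DEVICE_MEM_BASE_ADDR_ALIGN 对齐。)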

clEnqueueCopyBuffer() 函数调用示例如下:

/* Offsets and size are in bytes; the last three arguments are the wait-list length, the wait list and the output event. */
cl_int status = clEnqueueCopyBuffer(cmdQueue0,memObjInput,memObjOutput,sizeof(int)*tSrcBufOffset,sizeof(int)*tDestBufOffset,sizeof(int)*tQuantityToCopy,0,NULL,NULL);

举个例子,我写了一个替换缓冲区部分数据的程序。先为原始整数数组 { 1,2,3,4,5 } 在设备内存中创建一个缓冲区,从该缓冲区中取出第 2 个和第 3 个元素并显示在屏幕上:{ 2,3 }。然后在内核中把缓冲区每个元素的值加 2,得到 { 3,4,5,6,7 }。内核的执行结果返回给主机程序并显示出来。最后,用保存在辅助缓冲区中的值替换输出缓冲区的第 3 个和第 4 个元素,得到 { 3,4,2,3,7 }。

程序数据流的示意图如下:

program data flow essence

代码的主要部分如下所示。要执行它,只需将其插入到先前给出的适当代码中即可。

...
    /*Data and buffers*/
    /*kernels[0]*/
    
    // two arrays and buffers creation
    int iaInputArray[] = { 1,5 };
    int iSizeOfArray = 5;
    int* ipaOutputArray = (int*)malloc(iSizeOfArray*sizeof(int));
    cl_mem memObjInput;
    cl_mem memObjOutput;
    cl_mem memObjSubBuffer;
    size_t tGlobal_item_size = iSizeOfArray;    //?
    size_t tLocal_item_size = 1;    //?
    size_t tSrcBufOffset;           //offset in source buffer
    size_t tDstBufOffset;           //offset in destination buffer
    size_t tNumbOfElementsToCopy=2; //number of elements to copy
    int* ipaSubArray = (int*)malloc(tNumbOfElementsToCopy*sizeof(int));

    printf("OK.\nPrinting out initial input array:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ",iaInputArray[i]);
    }
    printf("\nCreating buffer memory objects... ");
    memObjInput = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,sizeof(iaInputArray),&iaInputArray,&status);
    if (status < 0){
        printf("FAIL to create buffer for input data!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    memObjOutput = clCreateBuffer(context,CL_MEM_WRITE_ONLY,&status);
    if (status < 0){
        printf("FAIL to create buffer for output data!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    memObjSubBuffer = clCreateBuffer(context,sizeof(ipaSubArray),status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nCopying 2nd and 3rd elements of the initial array into sub-buffer... ");
    tSrcBufOffset = 1;
    tDstBufOffset = 0;
    status = clEnqueueCopyBuffer(cmdQueue0,memObjSubBuffer,sizeof(int)*tDstBufOffset,sizeof(int)*tNumbOfElementsToCopy,NULL);
    if (status < 0){
        printf("FAIL to copy buffers!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nTransferring copied elements to host-program... ");
    status = clEnqueueReadBuffer(cmdQueue0,CL_TRUE,tNumbOfElementsToCopy*sizeof(int),ipaSubArray,NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out copied elements:\n");
    for (int i = 0; i < tNumbOfElementsToCopy; i++){
        printf("%d ",ipaSubArray[i]);
    }
    printf("\nSetting kernel arguments... ");
    status = clSetKernelArg(kernels[0],sizeof(cl_mem),&memObjInput);
    if (status < 0){
        printf("FAIL to set kernel argument #0!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    status = clSetKernelArg(kernels[0],1,&memObjOutput);
    if (status < 0){
        printf("FAIL to set kernel argument #1!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nExecuting kernel... ");
    status = clEnqueueNDRangeKernel(cmdQueue0,kernels[0],&tGlobal_item_size,&tLocal_item_size,NULL);
    if (status < 0){
        printf("FAIL to enqueue kernels[0] into cmdQueue0!> %d\n",status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0,iSizeOfArray*sizeof(int),ipaOutputArray,status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out data obtained from kernel:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ",ipaOutputArray[i]);
    }
    printf("\nChanging 3rd and 4th elements of data in output buffer... ");
    tSrcBufOffset = 0;
    tDstBufOffset = 2;
    status = clEnqueueCopyBuffer(cmdQueue0,status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nTransferring results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0,status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out host array data:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ",ipaOutputArray[i]);
    }
    printf("\n...Releasing resources... ");
...

程序执行的打印屏幕: program execution log

相关问答

错误1:Request method ‘DELETE‘ not supported 错误还原:...
错误1:启动docker镜像时报错:Error response from daemon:...
错误1:private field ‘xxx‘ is never assigned 按Alt...
报错如下,通过源不能下载,最后警告pip需升级版本 Requirem...