即使已经使用clRelease*释放了所有对象,OpenCL程序仍然出现内存泄漏和错误

问题描述

我编写了一个非常简单的OpenCL程序,它只是把两个数组相加。程序可以正常工作,但是用Valgrind运行时,即使已经使用clRelease*释放了所有对象,仍然会报告大量内存泄漏。另外,当源代码中包含clReleaseProgram(program)这一行时,Valgrind还会报告Mismatched free() / delete / delete[]错误。需要说明的是,无论是否有该行,内存泄漏都会出现;该行只是额外引入了这个错误。

我在下面附上了main.c和kernel.cl两个代码文件,以及包含clReleaseProgram(program)行时得到的Valgrind输出。

我不知道这是否相关,但我使用的是AMD的ROCm OpenCL实现(OpenCL 1.2版),程序运行在Radeon RX 5700 XT显卡上。操作系统是Ubuntu 20.04。

我真的很感谢任何建议!

// Simple test OpenCL program that adds two arrays

// Define OpenCL version
#define CL_TARGET_OPENCL_VERSION 120

#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>


// Report a failed OpenCL call on stderr.
// Returns nonzero when err is anything other than CL_SUCCESS, zero otherwise.
static int cl_failed(cl_int err, const char *what) {
    if (err == CL_SUCCESS) {
        return 0;
    }
    fprintf(stderr, "%s failed with OpenCL error %d\n", what, (int) err);
    return 1;
}

// Read a whole text file into a freshly malloc'd, NUL-terminated buffer.
// Returns NULL on failure; on success the caller owns the buffer and must free() it.
static char *read_text_file(const char *path) {
    FILE *fp = fopen(path, "r");
    if (fp == NULL) {
        perror(path);
        return NULL;
    }

    // Determine the file size by seeking to the end
    char *buf = NULL;
    long src_size = -1;
    if (fseek(fp, 0, SEEK_END) == 0) {
        src_size = ftell(fp);
    }

    if (src_size >= 0) {
        rewind(fp);
        buf = malloc((size_t) src_size + 1);
    }

    if (buf != NULL) {
        // Terminate at the number of bytes actually read, which may be
        // smaller than the size reported by ftell()
        size_t got = fread(buf, sizeof(char), (size_t) src_size, fp);
        buf[got] = '\0';
    } else {
        fprintf(stderr, "could not read %s\n", path);
    }

    fclose(fp);
    return buf;
}

// Print the compiler log of a program build for the given device to stderr
static void print_build_log(cl_program program, cl_device_id device) {
    size_t log_size = 0;
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) != CL_SUCCESS) {
        return;
    }

    char *log = malloc(log_size + 1);
    if (log == NULL) {
        return;
    }
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL) == CL_SUCCESS) {
        log[log_size] = '\0';
        fprintf(stderr, "%s\n", log);
    }
    free(log);
}

// Main program
//
// Adds two 4-element float arrays on the first GPU of the first OpenCL
// platform using the "test_kernel" kernel from kernel.cl, then prints the
// result. Returns 0 on success and EXIT_FAILURE if any step fails; every
// OpenCL object that was created is released on both paths.
int main(int argc, char **argv) {
    (void) argc;
    (void) argv;

    // Create all three arrays needed
    enum { COUNT = 4 };
    const size_t bytes = COUNT * sizeof(float);
    float arr1[COUNT] = {1.0f, 2.0f, 3.0f, 4.0f};
    float arr2[COUNT] = {10.0f, 11.0f, 12.0f, 13.0f};
    float arr3[COUNT];

    // All handles start out NULL so the cleanup path can tell what exists
    int status = EXIT_FAILURE;
    cl_int err;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context = NULL;
    cl_program program = NULL;
    cl_command_queue queue = NULL;
    cl_kernel kernel = NULL;
    cl_mem arr1_mem = NULL;
    cl_mem arr2_mem = NULL;
    cl_mem arr3_mem = NULL;
    char *src = NULL;
    const char *src_ptr = NULL;
    const char options[] = "-cl-std=CL1.2 -D MULT=2";
    const char kernel_name[] = "test_kernel";
    size_t gws[] = {COUNT};

    // Initialize platform id structure
    err = clGetPlatformIDs(1, &platform, NULL);
    if (cl_failed(err, "clGetPlatformIDs")) goto cleanup;

    // Initialize device id structure
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (cl_failed(err, "clGetDeviceIDs")) goto cleanup;

    // Initialize context
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (cl_failed(err, "clCreateContext")) goto cleanup;

    // Read kernel source file into buffer
    src = read_text_file("kernel.cl");
    if (src == NULL) goto cleanup;

    // Create the program; the buffer is only needed for the duration of this
    // call, so it is freed right afterwards (as the original program did)
    src_ptr = src;
    program = clCreateProgramWithSource(context, 1, &src_ptr, NULL, &err);
    free(src);
    src = NULL;
    if (cl_failed(err, "clCreateProgramWithSource")) goto cleanup;

    // Build the program
    err = clBuildProgram(program, 1, &device, options, NULL, NULL);
    if (cl_failed(err, "clBuildProgram")) {
        print_build_log(program, device);
        goto cleanup;
    }

    // Create command queue
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (cl_failed(err, "clCreateCommandQueue")) goto cleanup;

    // Create kernel
    kernel = clCreateKernel(program, kernel_name, &err);
    if (cl_failed(err, "clCreateKernel")) goto cleanup;

    // Create the memory objects
    arr1_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);
    if (cl_failed(err, "clCreateBuffer(arr1)")) goto cleanup;
    arr2_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);
    if (cl_failed(err, "clCreateBuffer(arr2)")) goto cleanup;
    arr3_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, &err);
    if (cl_failed(err, "clCreateBuffer(arr3)")) goto cleanup;

    // Write the input arrays into the input mem objects (blocking writes)
    err = clEnqueueWriteBuffer(queue, arr1_mem, CL_TRUE, 0, bytes, arr1, 0, NULL, NULL);
    if (cl_failed(err, "clEnqueueWriteBuffer(arr1)")) goto cleanup;
    err = clEnqueueWriteBuffer(queue, arr2_mem, CL_TRUE, 0, bytes, arr2, 0, NULL, NULL);
    if (cl_failed(err, "clEnqueueWriteBuffer(arr2)")) goto cleanup;

    // Set the kernel arguments
    err = clSetKernelArg(kernel, 0, sizeof(arr1_mem), &arr1_mem);
    if (cl_failed(err, "clSetKernelArg(0)")) goto cleanup;
    err = clSetKernelArg(kernel, 1, sizeof(arr2_mem), &arr2_mem);
    if (cl_failed(err, "clSetKernelArg(1)")) goto cleanup;
    err = clSetKernelArg(kernel, 2, sizeof(arr3_mem), &arr3_mem);
    if (cl_failed(err, "clSetKernelArg(2)")) goto cleanup;

    // Execute kernel: one work dimension, COUNT work-items
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gws, NULL, 0, NULL, NULL);
    if (cl_failed(err, "clEnqueueNDRangeKernel")) goto cleanup;

    // Read the output mem object back into the output array (blocking read)
    err = clEnqueueReadBuffer(queue, arr3_mem, CL_TRUE, 0, bytes, arr3, 0, NULL, NULL);
    if (cl_failed(err, "clEnqueueReadBuffer")) goto cleanup;

    // Print the output
    for (int i = 0; i < COUNT; ++i) {
        printf("%f\n", arr3[i]);
    }
    status = 0;

cleanup:
    // Release every OpenCL object that was successfully created
    free(src);
    if (arr1_mem != NULL) clReleaseMemObject(arr1_mem);
    if (arr2_mem != NULL) clReleaseMemObject(arr2_mem);
    if (arr3_mem != NULL) clReleaseMemObject(arr3_mem);
    if (kernel != NULL) clReleaseKernel(kernel);
    if (queue != NULL) clReleaseCommandQueue(queue);
    // NOTE: under ROCm 3.7.0 Valgrind reports "Mismatched free() / delete /
    // delete []" at this call; per the stack trace the strdup'd block is
    // released with operator delete inside libhsa-runtime64, not in this file
    if (program != NULL) clReleaseProgram(program);
    if (context != NULL) clReleaseContext(context);

    return status;
}
// OpenCL test kernel: writes the element-wise sum of arr1 and arr2 into arr3
__kernel void test_kernel(__constant float *arr1,__constant float *arr2,__global float *arr3) {
    // Position in the arrays is this invocation's global id in dimension 0
    const int gid = get_global_id(0);

    // Add the two inputs at that position, then store the result
    const float sum = arr1[gid] + arr2[gid];
    arr3[gid] = sum;
}

ivy@ubuntu-main:~/cl_test$ valgrind ./a.out
==10340== Memcheck,a memory error detector
==10340== copyright (C) 2002-2017,and GNU GPL'd,by Julian Seward et al.
==10340== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==10340== Command: ./a.out
==10340== 
==10340== Warning: set address range perms: large range [0x59c9d000,0x159e9e000) (noaccess)
11.000000
13.000000
15.000000
17.000000
==10340== Mismatched free() / delete / delete []
==10340==    at 0x483CFBF: operator delete(void*) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340==    by 0x542B8C0: rocr::amd::hsa::loader::AmdHsaCodeLoader::DestroyExecutable(rocr::amd::hsa::loader::Executable*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5407ECF: rocr::HSA::hsa_executable_destroy(hsa_executable_s) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5159C13: roc::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x515A606: roc::LightningProgram::~LightningProgram() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5106B22: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5106D28: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x51006C5: amd::ReferenceCountedobject::release() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x50DACB3: clReleaseProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x1099A3: main (main.c:92)
==10340==  Address 0xe3634c0 is 0 bytes inside a block of size 42 alloc'd
==10340==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340==    by 0x4B0750E: strdup (strdup.c:42)
==10340==    by 0x54348A3: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s,hsa_code_object_s,unsigned long,char const*,std::__cxx11::basic_string<char,std::char_traits<char>,std::allocator<char> > const&,hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5434F52: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s,hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5408460: rocr::HSA::hsa_executable_load_agent_code_object(hsa_executable_s,hsa_agent_s,hsa_code_object_reader_s,hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340==    by 0x5159FBF: roc::LightningProgram::setKernels(amd::option::Options*,void*,unsigned long) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5153358: device::Program::linkImplLC(amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5153ABC: device::Program::build(std::__cxx11::basic_string<char,amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x5108F31: amd::Program::build(std::vector<amd::Device*,std::allocator<amd::Device*> > const&,void (*)(_cl_program*,void*),bool,bool) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x50DAF42: clBuildProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340==    by 0x1096B5: main (main.c:51)
==10340== 
==10340== 
==10340== HEAP SUMMARY:
==10340==     in use at exit: 991,949 bytes in 4,653 blocks
==10340==   total heap usage: 576,804 allocs,572,151 frees,246,308,729 bytes allocated
==10340== 
==10340== LEAK SUMMARY:
==10340==    definitely lost: 67,584 bytes in 1 blocks
==10340==    indirectly lost: 0 bytes in 0 blocks
==10340==      possibly lost: 7,692 bytes in 66 blocks
==10340==    still reachable: 916,673 bytes in 4,586 blocks
==10340==         suppressed: 0 bytes in 0 blocks
==10340== Rerun with --leak-check=full to see details of leaked memory
==10340== 
==10340== For lists of detected and suppressed errors,rerun with: -s
==10340== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)

编辑:下面是运行 valgrind --leak-check=full ./a.out 时的输出。我省略了开头部分,因为它与上面的输出完全相同;也省略了中间的大部分输出,因为这些错误看起来基本相同,而且数量太多,本文无法全部容纳。

... Same output here as above ...


==4996== HEAP SUMMARY:
==4996==     in use at exit: 991,947 bytes in 4,653 blocks
==4996==   total heap usage: 575,705 allocs,571,052 frees,015,102 bytes allocated
==4996== 
==4996== 4 bytes in 1 blocks are possibly lost in loss record 7 of 2,212
==4996==    at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2950F: fmm_init_process_apertures (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2CD2B: hsaKmtOpenKFD (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5401368: rocr::AMD::Load() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x542440D: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==    by 0x50F832E: amd::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==    by 0x5102525: amd::Runtime::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== 
==4996== 4 bytes in 1 blocks are possibly lost in loss record 8 of 2,212
==4996==    at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2C520: hsaKmtMapMemoryToGPU (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2F29B: allocate_exec_aligned_memory_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B25679: hsaKmtCreateEvent (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5414E62: rocr::core::InterruptSignal::CreateEvent(_HSA_EVENTTYPE,bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x541FD00: rocr::core::Runtime::BindVmFaultHandler() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5424427: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)


... Lots of similar repeating errors here ...


==4996== 720 bytes in 3 blocks are possibly lost in loss record 2,165 of 2,212
==4996==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0x5B262DF: vm_create_and_init_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B265EC: fmm_allocate_memory_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B28219: fmm_allocate_host (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x5B2B9FA: hsaKmtAllocmemory (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996==    by 0x53F0E31: rocr::AMD::MemoryRegion::Allocate(unsigned long&,unsigned int,void**) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x541C4C2: std::_Function_handler<void* (unsigned long,unsigned int),rocr::core::Runtime::Registeragent(rocr::core::Agent*)::{lambda(unsigned long,unsigned int)#1}>::_M_invoke(std::_Any_data const&,unsigned long&&,std::_Any_data const&,unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53DF22E: rocr::AMD::GpuAgent::AssembleShader(char const*,rocr::AMD::GpuAgent::AssembleTarget,void*&,unsigned long&) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53D39B2: rocr::AMD::BlitKernel::Initialize(rocr::core::Agent const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53DD58A: std::_Function_handler<rocr::core::Blit* (),rocr::AMD::GpuAgent::InitDma()::{lambda()#3}>::_M_invoke(std::_Any_data const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53DD2F9: rocr::AMD::GpuAgent::Dmacopy(void*,void const*,unsigned long) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x53ED80A: rocr::(anonymous namespace)::RegionMemory::Freeze() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== 
==4996== 1,200 bytes in 5 blocks are possibly lost in loss record 2,173 of 2,unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x542B8D5: rocr::core::Shared<rocr::core::SharedSignal,rocr::core::SharedSignalPool_t>::Shared(rocr::core::SharedSignalPool_t*,int) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x542921E: rocr::core::LocalSignal::LocalSignal(long,bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5414EE8: rocr::core::InterruptSignal::InterruptSignal(long,_HSAEvent*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x5410E85: rocr::AMD::hsa_amd_signal_create(long,hsa_agent_s const*,hsa_signal_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==    by 0x513B727: roc::VirtualGPU::initPool(unsigned long,unsigned int) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== 
==4996== 67,584 bytes in 1 blocks are definitely lost in loss record 2,209 of 2,212
==4996==    at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996==    by 0xA977E83: RegisterHandlers() (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0xA9787EB: llvm::sys::RemoveFileOnSignal(llvm::StringRef,std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2BC0E: clang::CompilerInstance::createOutputFile(llvm::StringRef,std::error_code&,llvm::StringRef,std::allocator<char> >*,std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2BFD2: clang::CompilerInstance::createOutputFile(llvm::StringRef,bool) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2C39E: clang::CompilerInstance::createDefaultOutputFile(bool,llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x77F3913: GetoutputStream(clang::CompilerInstance&,clang::BackendAction) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x77F959A: clang::CodeGenAction::CreateASTConsumer(clang::CompilerInstance&,llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F6D175: clang::FrontendAction::CreateWrappedASTConsumer(clang::CompilerInstance&,llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F70865: clang::FrontendAction::BeginSourceFile(clang::CompilerInstance&,clang::FrontendInputFile const&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x8F2F079: clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==    by 0x743641A: clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== 
==4996== LEAK SUMMARY:
==4996==    definitely lost: 67,584 bytes in 1 blocks
==4996==    indirectly lost: 0 bytes in 0 blocks
==4996==      possibly lost: 7,692 bytes in 66 blocks
==4996==    still reachable: 916,671 bytes in 4,586 blocks
==4996==         suppressed: 0 bytes in 0 blocks
==4996== Reachable blocks (those to which a pointer was found) are not shown.
==4996== To see them,rerun with: --leak-check=full --show-leak-kinds=all
==4996== 
==4996== For lists of detected and suppressed errors,rerun with: -s
==4996== ERROR SUMMARY: 56 errors from 56 contexts (suppressed: 0 from 0)

解决方法

如果您使用的是“标准”的libstdc++和libc,那么在您的情况下,不匹配的strdup / delete(本应是strdup / free)不会造成问题。这是因为libstdc++的new只是调用malloc,而delete只是调用free。另一方面,如果您使用的替换版delete不是基于free实现的,那么您的应用程序很可能会崩溃。

对于内存泄漏,您需要使用--leak-check=full选项运行memcheck,正如Valgrind输出末尾所提示的那样。