问题描述
我编写了一个非常简单的OpenCL程序,它仅将两个数组加起来。它可以正常工作,但是,如果我使用Valgrind运行它,即使使用clRelease *释放所有内容后,也会出现大量内存泄漏。当我的源代码中有Mismatched free() / delete / delete[]
时,我也收到clReleaseProgram(program)
Valgrind错误。要明确的是,无论是否有该行,我都会发生内存泄漏,但是该行也会给我错误。
我在main.c中包含了main.c
和kernel.cl
源代码文件以及通过clReleaseProgram(program)
行获得的Valgrind输出。
我不知道这是否相关,但是我正在使用AMD的ROCM OpenCL实施并正在使用1.2版,并且它在Radeon RX 5700XT显卡上运行。操作系统是Ubuntu 20.04。
我真的很感谢任何建议!
// Simple test OpenCL program that adds two arrays
// Define OpenCL version
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
// Main program
int main(int argc,char **argv) {
// Create all three arrays needed
float arr1[] = {1.0,2.0,3.0,4.0};
float arr2[] = {10.0,11.0,12.0,13.0};
float arr3[4];
// Initialize platform id structure
cl_int err;
cl_platform_id platform;
err = clGetPlatformIDs(1,&platform,NULL);
// Initialize device id structure
cl_device_id device;
err = clGetdeviceids(platform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
// Initialize context
cl_context context = clCreateContext(NULL,NULL,&err);
// Determine file size of kernel source file
FILE *fp;
fp = fopen("kernel.cl","r");
fseek(fp,SEEK_END);
long int src_size = ftell(fp);
rewind(fp);
// Read kernel source file into buffer
char *buf = malloc(sizeof(char) * (src_size + 1));
fread(buf,sizeof(char),src_size,fp);
buf[src_size] = '\0';
// Create the program
cl_program program = clCreateProgramWithSource(context,(const char **) &buf,&err);
free(buf);
// Build the program
const char options[] = "-cl-std=CL1.2 -D MULT=2";
err = clBuildProgram(program,options,NULL);
// Create command queue
cl_command_queue queue = clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&err);
// Create kernel
const char kernel_name[] = "test_kernel";
cl_kernel kernel = clCreateKernel(program,kernel_name,&err);
// Create the memory objects
cl_mem arr1_mem = clCreateBuffer(context,CL_MEM_READ_ONLY,4 * sizeof(float),&err);
cl_mem arr2_mem = clCreateBuffer(context,&err);
cl_mem arr3_mem = clCreateBuffer(context,CL_MEM_WRITE_ONLY,&err);
// Write the input arrays into the input mem objects
err = clEnqueueWriteBuffer(queue,arr1_mem,CL_TRUE,arr1,NULL);
err = clEnqueueWriteBuffer(queue,arr2_mem,arr2,NULL);
// Set the kernel arguments
err = clSetKernelArg(kernel,sizeof(arr1_mem),&arr1_mem);
err = clSetKernelArg(kernel,sizeof(arr2_mem),&arr2_mem);
err = clSetKernelArg(kernel,2,sizeof(arr3_mem),&arr3_mem);
// Execute kernel
size_t gws[] = {4};
err = clEnqueueNDRangeKernel(queue,kernel,gws,NULL);
// Read the output array into the output mem object
err = clEnqueueReadBuffer(queue,arr3_mem,arr3,NULL);
// Print the output
for (int i = 0; i < 4; ++i) {
printf("%f\n",arr3[i]);
}
// Release all OpenCL objects
clReleaseMemObject(arr1_mem);
clReleaseMemObject(arr2_mem);
clReleaseMemObject(arr3_mem);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program); // This line causes Mismatched free / delete[] / delete valgrind error
clReleaseContext(context);
return 0;
}
// Test OpenCL kernel
__kernel void test_kernel(__constant float *arr1,__constant float *arr2,__global float *arr3) {
// Get the index into the arrays
int index = get_global_id(0);
// Sum arr1 + arr2 and store in arr3
arr3[index] = arr1[index] + arr2[index];
}
ivy@ubuntu-main:~/cl_test$ valgrind ./a.out
==10340== Memcheck,a memory error detector
==10340== copyright (C) 2002-2017,and GNU GPL'd,by Julian Seward et al.
==10340== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==10340== Command: ./a.out
==10340==
==10340== Warning: set address range perms: large range [0x59c9d000,0x159e9e000) (noaccess)
11.000000
13.000000
15.000000
17.000000
==10340== Mismatched free() / delete / delete []
==10340== at 0x483CFBF: operator delete(void*) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340== by 0x542B8C0: rocr::amd::hsa::loader::AmdHsaCodeLoader::DestroyExecutable(rocr::amd::hsa::loader::Executable*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5407ECF: rocr::HSA::hsa_executable_destroy(hsa_executable_s) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5159C13: roc::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x515A606: roc::LightningProgram::~LightningProgram() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5106B22: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5106D28: amd::Program::~Program() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x51006C5: amd::ReferenceCountedobject::release() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x50DACB3: clReleaseProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x1099A3: main (main.c:92)
==10340== Address 0xe3634c0 is 0 bytes inside a block of size 42 alloc'd
==10340== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==10340== by 0x4B0750E: strdup (strdup.c:42)
==10340== by 0x54348A3: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s,hsa_code_object_s,unsigned long,char const*,std::__cxx11::basic_string<char,std::char_traits<char>,std::allocator<char> > const&,hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5434F52: rocr::amd::hsa::loader::ExecutableImpl::LoadCodeObject(hsa_agent_s,hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5408460: rocr::HSA::hsa_executable_load_agent_code_object(hsa_executable_s,hsa_agent_s,hsa_code_object_reader_s,hsa_loaded_code_object_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==10340== by 0x5159FBF: roc::LightningProgram::setKernels(amd::option::Options*,void*,unsigned long) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5153358: device::Program::linkImplLC(amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5153ABC: device::Program::build(std::__cxx11::basic_string<char,amd::option::Options*) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x5108F31: amd::Program::build(std::vector<amd::Device*,std::allocator<amd::Device*> > const&,void (*)(_cl_program*,void*),bool,bool) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x50DAF42: clBuildProgram (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==10340== by 0x1096B5: main (main.c:51)
==10340==
==10340==
==10340== HEAP SUMMARY:
==10340== in use at exit: 991,949 bytes in 4,653 blocks
==10340== total heap usage: 576,804 allocs,572,151 frees,246,308,729 bytes allocated
==10340==
==10340== LEAK SUMMARY:
==10340== definitely lost: 67,584 bytes in 1 blocks
==10340== indirectly lost: 0 bytes in 0 blocks
==10340== possibly lost: 7,692 bytes in 66 blocks
==10340== still reachable: 916,673 bytes in 4,586 blocks
==10340== suppressed: 0 bytes in 0 blocks
==10340== Rerun with --leak-check=full to see details of leaked memory
==10340==
==10340== For lists of detected and suppressed errors,rerun with: -s
==10340== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
编辑:运行时输出:Valgrind --leak-check=full ./a.out
我省略了Valgrind
输出的开头,因为它与上面的输出完全相同。我也省略了大部分中间输出,因为所有错误看起来基本上都是相同的,并且本文中有太多错误无法容纳。
... Same output here as above ...
==4996== HEAP SUMMARY:
==4996== in use at exit: 991,947 bytes in 4,653 blocks
==4996== total heap usage: 575,705 allocs,571,052 frees,015,102 bytes allocated
==4996==
==4996== 4 bytes in 1 blocks are possibly lost in loss record 7 of 2,212
==4996== at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2950F: fmm_init_process_apertures (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2CD2B: hsaKmtOpenKFD (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5401368: rocr::AMD::Load() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x542440D: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== by 0x50F832E: amd::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996== by 0x5102525: amd::Runtime::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==
==4996== 4 bytes in 1 blocks are possibly lost in loss record 8 of 2,212
==4996== at 0x483B723: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x483E017: realloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B27783: _fmm_map_to_gpu.isra.8 (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28B57: fmm_map_to_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2C520: hsaKmtMapMemoryToGPU (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2F29B: allocate_exec_aligned_memory_gpu (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B25679: hsaKmtCreateEvent (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5414E62: rocr::core::InterruptSignal::CreateEvent(_HSA_EVENTTYPE,bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x541FD00: rocr::core::Runtime::BindVmFaultHandler() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5424427: rocr::core::Runtime::Acquire() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x54027C9: rocr::HSA::hsa_init() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5135C84: roc::Device::init() (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
... Lots of similar repeating errors here ...
==4996== 720 bytes in 3 blocks are possibly lost in loss record 2,165 of 2,212
==4996== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0x5B262DF: vm_create_and_init_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B265EC: fmm_allocate_memory_object (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B28219: fmm_allocate_host (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x5B2B9FA: hsaKmtAllocmemory (in /opt/rocm-3.7.0/lib/libhsakmt.so.1.0.30700)
==4996== by 0x53F0E31: rocr::AMD::MemoryRegion::Allocate(unsigned long&,unsigned int,void**) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x541C4C2: std::_Function_handler<void* (unsigned long,unsigned int),rocr::core::Runtime::Registeragent(rocr::core::Agent*)::{lambda(unsigned long,unsigned int)#1}>::_M_invoke(std::_Any_data const&,unsigned long&&,std::_Any_data const&,unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53DF22E: rocr::AMD::GpuAgent::AssembleShader(char const*,rocr::AMD::GpuAgent::AssembleTarget,void*&,unsigned long&) const (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53D39B2: rocr::AMD::BlitKernel::Initialize(rocr::core::Agent const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53DD58A: std::_Function_handler<rocr::core::Blit* (),rocr::AMD::GpuAgent::InitDma()::{lambda()#3}>::_M_invoke(std::_Any_data const&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53DD2F9: rocr::AMD::GpuAgent::Dmacopy(void*,void const*,unsigned long) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x53ED80A: rocr::(anonymous namespace)::RegionMemory::Freeze() (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996==
==4996== 1,200 bytes in 5 blocks are possibly lost in loss record 2,173 of 2,unsigned int&&) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x542B8D5: rocr::core::Shared<rocr::core::SharedSignal,rocr::core::SharedSignalPool_t>::Shared(rocr::core::SharedSignalPool_t*,int) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x542921E: rocr::core::LocalSignal::LocalSignal(long,bool) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5414EE8: rocr::core::InterruptSignal::InterruptSignal(long,_HSAEvent*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x5410E85: rocr::AMD::hsa_amd_signal_create(long,hsa_agent_s const*,hsa_signal_s*) (in /opt/rocm-3.7.0/lib/libhsa-runtime64.so.1.2.30700)
==4996== by 0x513B727: roc::VirtualGPU::initPool(unsigned long,unsigned int) (in /opt/rocm-3.7.0/opencl/lib/libamdocl64.so)
==4996==
==4996== 67,584 bytes in 1 blocks are definitely lost in loss record 2,209 of 2,212
==4996== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==4996== by 0xA977E83: RegisterHandlers() (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0xA9787EB: llvm::sys::RemoveFileOnSignal(llvm::StringRef,std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2BC0E: clang::CompilerInstance::createOutputFile(llvm::StringRef,std::error_code&,llvm::StringRef,std::allocator<char> >*,std::allocator<char> >*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2BFD2: clang::CompilerInstance::createOutputFile(llvm::StringRef,bool) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2C39E: clang::CompilerInstance::createDefaultOutputFile(bool,llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x77F3913: GetoutputStream(clang::CompilerInstance&,clang::BackendAction) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x77F959A: clang::CodeGenAction::CreateASTConsumer(clang::CompilerInstance&,llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F6D175: clang::FrontendAction::CreateWrappedASTConsumer(clang::CompilerInstance&,llvm::StringRef) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F70865: clang::FrontendAction::BeginSourceFile(clang::CompilerInstance&,clang::FrontendInputFile const&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x8F2F079: clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996== by 0x743641A: clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (in /opt/rocm-3.7.0/lib/libamd_comgr.so.1.6.30700)
==4996==
==4996== LEAK SUMMARY:
==4996== definitely lost: 67,584 bytes in 1 blocks
==4996== indirectly lost: 0 bytes in 0 blocks
==4996== possibly lost: 7,692 bytes in 66 blocks
==4996== still reachable: 916,671 bytes in 4,586 blocks
==4996== suppressed: 0 bytes in 0 blocks
==4996== Reachable blocks (those to which a pointer was found) are not shown.
==4996== To see them,rerun with: --leak-check=full --show-leak-kinds=all
==4996==
==4996== For lists of detected and suppressed errors,rerun with: -s
==4996== ERROR SUMMARY: 56 errors from 56 contexts (suppressed: 0 from 0)
解决方法
如果您正在使用“标准” libstdc ++和libc,则在您的情况下,不匹配的strdup / delete(而不是strdup / free)不会造成问题。这是因为libstdc ++ new
仅调用malloc
,而delete
调用free
。另一方面,如果您使用基于delete
不是的替换free
,则您的应用程序很可能崩溃。
对于内存泄漏,您需要使用memcheck
运行--leak-check=full
,因为它在Valgrind输出的末尾显示。