如何将 cudaArray 提供给 Windows 机器学习推理引擎？

问题描述

我正在尝试为一款实时图像处理软件开发一个由 ML 驱动的插件。该软件在 GPU 上以 cudaArray_t 的形式提供图像数据，但由于它把我锁定在较旧的 CUDA 版本上，我希望改用 DirectML 来实现（反正该软件也仅适用于 Windows）。

出于延迟方面的考虑，我不想进行任何不必要的 GPU-CPU-GPU 往返。为此，我认为需要将 CUDA 数据映射到 D3D12 资源，然后用这些资源创建输入和输出张量并绑定到模型。我找到了一个使用 CUDA External Resource Interoperability API 将 cudaArray_t 映射到 ID3D12Resource 的示例（原文此处为链接），并试图以它作为我的代码的基础。由于我不需要渲染任何东西，我认为可以只创建堆和资源，然后像下面这样把传入的 cudaArray_t 复制到互操作 cudaArray_t 中，而无需创建任何类型的命令队列。请注意，缺少的代码与上面链接的 GitHub 存储库中的代码相同，为简洁起见我将其省略了。

这种方法不起作用，但我不知道如何调试，因为我对 Direct3D 编程乃至整个 GPU 编程都不熟悉。我一直以 Direct3D 12 官方文档作为参考，但内容之多让我有些不知所措，因此非常感谢大家能就此处应该修复的内容给出一些指导 :) 我猜需要使用信号量进行某种同步，但不确定在不创建任何命令队列的情况下这是否可行。

// Creates the D3D12 device on the adapter selected by GetHardwareAdapter and
// stores that adapter's LUID in m_dx12deviceluid (read later by initCuda to
// pick the matching CUDA device).
//
// Returns true on success, false when no hardware adapter was selected.
// Any failing DXGI/D3D12 call throws through winrt::check_hresult.
bool initD3d12() {
  UINT dxgiFactoryFlags = 0;
  winrt::com_ptr<IDXGIFactory4> factory;
  winrt::check_hresult(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(factory.put())));

  winrt::com_ptr<IDXGIAdapter1> hardwareAdapter;
  GetHardwareAdapter(factory.get(), hardwareAdapter.put());
  // GetHardwareAdapter is defined elsewhere and reports no status; guard
  // against it leaving the adapter null before we dereference it below.
  if (!hardwareAdapter) {
    return false;
  }

  winrt::check_hresult(D3D12CreateDevice(
      hardwareAdapter.get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(m_d3d12Device.put())));

  // The LUID identifies the physical adapter; do not use it if the query failed.
  DXGI_ADAPTER_DESC1 desc{};
  winrt::check_hresult(hardwareAdapter->GetDesc1(&desc));
  m_dx12deviceluid = desc.AdapterLuid;

  return true;
}
  
// Selects the CUDA device whose LUID equals m_dx12deviceluid, makes it the
// current device, records its id and node mask, and creates m_streamToRun.
//
// Throws std::exception when there is no CUDA device at all, or when none of
// the CUDA devices corresponds to the D3D12 adapter (previously this second
// case fell through silently and left the members above unset).
void initCuda() {
  int num_cuda_devices = 0;
  checkCudaErrors(cudaGetDeviceCount(&num_cuda_devices));

  if (!num_cuda_devices) {
      throw std::exception("No CUDA Devices found");
  }

  bool deviceFound = false;
  for (int devId = 0; devId < num_cuda_devices; devId++) {
      cudaDeviceProp devProp;
      checkCudaErrors(cudaGetDeviceProperties(&devProp, devId));

      // devProp.luid is compared as raw bytes: LowPart first, HighPart
      // immediately after it.
      const bool lowMatches =
          memcmp(&m_dx12deviceluid.LowPart, devProp.luid,
                 sizeof(m_dx12deviceluid.LowPart)) == 0;
      const bool highMatches =
          memcmp(&m_dx12deviceluid.HighPart,
                 devProp.luid + sizeof(m_dx12deviceluid.LowPart),
                 sizeof(m_dx12deviceluid.HighPart)) == 0;
      if (!(lowMatches && highMatches)) {
          continue;
      }

      checkCudaErrors(cudaSetDevice(devId));
      m_cudaDeviceID = devId;
      m_nodeMask = devProp.luidDeviceNodeMask;
      checkCudaErrors(cudaStreamCreate(&m_streamToRun));
      printf("CUDA Device Used [%d] %s\n", devId, devProp.name);
      deviceFound = true;
      break;
  }

  if (!deviceFound) {
      throw std::exception("No CUDA device matches the D3D12 adapter LUID");
  }
}
  
// Copies the caller's cudaArray into m_cudaArray (the CUDA view of the shared
// D3D12 texture set up by createResource), device to device.
//
// cudaArray: source array; the copy covers m_width x m_height elements of
//            4 floats each, starting at the origin of both arrays.
//
// NOTE(review): nothing in this function orders the copy against D3D12 work
// that later reads the resource. If the consumer can run before the copy has
// finished, add a synchronization point (e.g. a stream/device synchronize, or
// a fence shared between CUDA and D3D12).
void copyArrayToResource(cudaArray_t cudaArray) {
  // The width of a 2D array copy is expressed in bytes, not elements.
  const auto rowBytes = m_width * 4 * sizeof(float);

  checkCudaErrors(cudaMemcpy2DArrayToArray(
      m_cudaArray,                // dst array
      0, 0,                       // dst offset (x in bytes, y in rows)
      cudaArray,                  // src array
      0, 0,                       // src offset (x in bytes, y in rows)
      rowBytes, m_height,         // extent
      cudaMemcpyDeviceToDevice)); // kind
}
  
void createResource(size_t width,size_t height,ID3D12Resource** d3d12Resource) {
  // Create a d3d12 resource in the desired size and map it to a cudaArray
  m_width = width;
  m_height = height;
  // Create D3D12 2DTexture
  // Assume 32-Bit float RGBA image
  const auto channels = 4;
  const auto textureSurface = width * height;
  const auto texturePixels = textureSurface * channels;
  const auto textureSizeBytes = sizeof(float)* texturePixels;
  
  const auto texFormat = channels == 4 ? DXGI_FORMAT_R32G32B32A32_FLOAT : DXGI_FORMAT_R32G32B32_FLOAT;
  const auto texDesc = CD3DX12_RESOURCE_DESC::Tex2D(texFormat,width,height,1,D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS);
  D3D12_HEAP_PROPERTIES heapProperties = {
      D3D12_HEAP_TYPE_DEFAULT,D3D12_CPU_PAGE_PROPERTY_UNKNOWN,D3D12_MEMORY_POOL_UNKNOWN,0};
  
  winrt::check_hresult(m_d3d12Device->CreateCommittedResource(
      &heapProperties,D3D12_HEAP_FLAG_SHARED,&texDesc,D3D12_RESOURCE_STATE_COMMON,nullptr,IID_PPV_ARGS(d3d12Resource)));
  
  
  // Create CUDA external resource
  HANDLE sharedHandle;
  WindowsSecurityAttributes windowsSecurityAttributes{};
  LPCWSTR name = NULL;
  winrt::check_hresult(m_d3d12Device->CreateSharedHandle(
      *d3d12Resource,&windowsSecurityAttributes,GENERIC_ALL,&sharedHandle));
  
  D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo;
  d3d12ResourceAllocationInfo = m_d3d12Device->GetResourceAllocationInfo(
      m_nodeMask,&texDesc);
  size_t actualSize = d3d12ResourceAllocationInfo.SizeInBytes;
  size_t alignment = d3d12ResourceAllocationInfo.Alignment;
  
  cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
  memset(&externalMemoryHandleDesc,sizeof(externalMemoryHandleDesc));
  
  externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
  externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
  externalMemoryHandleDesc.size = actualSize;
  externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
  
  checkCudaErrors(
      cudaImportExternalMemory(&m_externalMemory,&externalMemoryHandleDesc));
  
  cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
  cuExtmemMipDesc.extent = make_cudaExtent(width,0);
  cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
  cuExtmemMipDesc.numLevels = 1;
  cuExtmemMipDesc.flags = cudaArrayDefault;
  
  cudaMipmappedArray_t cuMipArray{};
  checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray,m_externalMemory,&cuExtmemMipDesc));
  
  checkCudaErrors(cudaGetMipmappedArrayLevel(&m_cudaArray,cuMipArray,0));
}

最后，如果映射到 ID3D12Resource 可行，我假设可以使用 ITensorStaticsNative 接口创建张量，并将其绑定到 LearningModel 的输入或输出。

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

c++ cuda direct3d12 windows-machine-learning