为 DMA 查找 NVIDIA GPU 内存的物理地址

问题描述

我试图找到 GPU 内存的物理 PCIe 地址空间内存位置,以支持由外部 PCIe 资源(例如 FPGA)发起的入站 DMA(类似于 How to get physical address of GPU memory for DMA? (OpenCL))。我们特别努力避免 GPU 控制的操作或通过主机内存间接移动数据。

我有一个 Linux 虚拟内存地址到物理内存地址的转换服务(见下面的代码),它可以很好地解码 mmap 映射,包括其他 PCIe 设备的映射;当然,我也知道 lspci -vvv 输出中“Region(区域)”字段所报告的 GPU 物理地址。

我尝试插桩 CUDA 示例 memmapIPCDrv,来检查 cuMemMap 和 cuMemCreate 输出的虚拟和物理地址;我也尝试查看 python numba 中 id(obj) 以及 obj.gpu_data.device_ctypes_pointer.value 对应的虚拟和物理内存地址。在所有情况下,得到的物理地址都位于主机内存中,而不是 GPU 地址空间。我将在下面附上 python 测试用例。

关于如何找到实际的 GPU 上物理 PCIe 地址的任何想法?

以下示例代码使用 GPU 内存,然后检查关联的物理内存映射到何处:


import numpy as np
import sys
import numba
from numba import cuda
import socket
import struct
from numba.cuda.cudadrv import driver as _driver
import re
import os

# Prepare for virtual to physical address translation: connect to the
# pagemapd helper service (the C program below) over its UNIX socket.
# Fix: original said socket.soCK_STREAM, which raises AttributeError.
mySocket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
mySocket.connect("/run/pagemapd")

def do_translate(a):
    """Ask the pagemapd service to translate virtual address ``a``.

    Sends the address as one little-endian unsigned 64-bit word over the
    module-level UNIX socket and returns the 8-byte physical address reply.
    """
    request = struct.pack('Q', a)
    mySocket.sendall(request)
    reply = mySocket.recv(8)
    (phys,) = struct.unpack("Q", reply)
    return phys

def hexsep(n):
    """Format integer ``n`` as hex with its digits grouped by underscores."""
    as_hex = hex(n)
    return _hexsep(as_hex)

def _hexsep(H):
    """Seperate hex digits into groups (of 4,_ separated)"""
    h = ''
    while h != H:
        h = H
        H = re.sub(r'([0-9a-f]+)([0-9a-f]{4})(?:\b|_)',r'\1_\2',h)
    return(h)

def p_translate(a):
    """Translate virtual address ``a`` and render both as 'v...->p...'."""
    phys = do_translate(a)
    return f"v{hexsep(a)}->p{hexsep(phys)}"

# Bail out early when no CUDA-capable GPU is present -- everything below
# exercises GPU memory mappings. (Idiomatic truth test instead of `is False`,
# which also tolerates any falsy return value.)
if not numba.cuda.is_available():
    print("no CUDA GPU found")
    sys.exit()

# Create array,push it to GPU
# NOTE(review): cuda.mapped_array_like presumably allocates *host-pinned*
# memory that the GPU accesses over PCIe -- which would explain why the
# translated physical addresses land in host RAM rather than GPU BAR space;
# confirm against the numba CUDA documentation.
n = np.ones(8192,dtype=np.uint8)*7
ngpu = cuda.mapped_array_like(n)
narr = numba.cuda.as_cuda_array(ngpu)
narr.copy_to_device(n)

# Print relevant physical and virtual addresses,and where memory lives
# (id() is the CPython object's address, distinct from the underlying buffer;
#  both are translated alongside numba's reported device pointer.)
# NOTE(review): the narr line passes ngpu (not narr) to is_device_memory --
# looks like copy-paste; narr is a view of ngpu so the answer is the same,
# but confirm the intent.
print(f"n    addresses are: id(n)={p_translate(id(n))},id(n.data)={p_translate(id(n.data))},Memory is {'on' if _driver.is_device_memory(n) else 'off'} GPU")
print(f"ngpu addresses are: id(ngpu)={p_translate(id(ngpu))},id(ngpu.data)={p_translate(id(ngpu.data))},ngpu.gpu_data={p_translate(ngpu.gpu_data.device_ctypes_pointer.value)},Memory is {'on' if _driver.is_device_memory(ngpu) else 'off'} GPU")
print(f"narr addresses are: id(narr)={p_translate(id(narr))},narr.gpu_data={p_translate(narr.gpu_data.device_ctypes_pointer.value)},Memory is {'on' if _driver.is_device_memory(ngpu) else 'off'} GPU")

# Print contents of array
# Round-trip: show the mapped host view, then a fresh copy pulled from device.
print("ngpu",ngpu)
nn = narr.copy_to_host()
print("copy",nn)

# Set environmental variable DEBUG to anything to print out the mapping table,
# to see where addresses came from.
if "DEBUG" in os.environ:
    with open("/proc/self/maps") as R:
        for l in R:
            m = re.search(r'^([0-9a-f]{4,})-([0-9a-f]{4,})(.*)',l)
            if m:
                # Reprint the range with digit grouping; group(3) excludes the
                # trailing newline, so plain print() adds exactly one back.
                print(_hexsep(m.group(1))+"-"+_hexsep(m.group(2))+m.group(3))
            else:
                # l keeps its trailing newline -- use end='' so lines are not
                # double-spaced (the original's sep='' was a no-op for a
                # single argument and left blank lines between entries).
                print(l,end='')
// Service (run as root) translating virtual address to physical

#define _GNU_SOURCE
#include <sys/socket.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <popt.h>
#include <arpa/inet.h>
#include <ctype.h>
#include <err.h>
#include <linux/un.h>
#include <pthread.h>
#include <sys/stat.h>

#define SPATH "/run/pagemapd"  /* well-known UNIX socket path clients connect to */
long pagesize;                 /* system page size, set once in main() */

// Handle a particular client connection, performing translation.
// Protocol: the client sends 8-byte virtual addresses; we reply with the
// corresponding physical address resolved through the *client's*
// /proc/<pid>/pagemap.
static void *thread_start(void *arg)
{
  char buf[sizeof(uintptr_t)*6];      /* scratch: proc path, then request buffer */
  struct ucred ucred;
  socklen_t ucredlen = sizeof(ucred); /* getsockopt takes socklen_t, not int */
  int cfd = *(int *)arg;              /* connected socket fd */
  int pfd = -1;                       /* client's pagemap fd */

  /* (Removed: the original read targetpid/targetaddr out of the still
     uninitialized buf here -- dead code with indeterminate values.) */

  /* SO_PEERCRED identifies the peer so we read *its* page tables. */
  if (getsockopt(cfd,SOL_SOCKET,SO_PEERCRED,&ucred,&ucredlen) < 0)
  {
    warn("Cannot getsockopt");
    goto endf;
  }

  snprintf(buf,sizeof(buf),"/proc/%d/pagemap",ucred.pid);
  if ((pfd = open(buf,O_RDONLY)) < 0)
  {
    warn("Cannot open pagemap for %d from %s",ucred.pid,buf);
    goto endf;
  }

  while (read(cfd,buf,sizeof(buf)) > 0)
  {
    uintptr_t targetaddr = *(uintptr_t *)buf;
    uint64_t entry = 0;               /* pagemap entries are 64-bit regardless
                                         of the pointer width */

    /* One 8-byte entry per page, at offset (vaddr / pagesize) * 8. */
    if (lseek(pfd,targetaddr / pagesize * sizeof(entry),SEEK_SET) < 0)
    {
      warn("Cannot seek for %d",ucred.pid);
      goto endf;
    }

    if (read(pfd,&entry,sizeof(entry)) != sizeof(entry))
    {
      warn("Cannot read for %d",ucred.pid);
      goto endf;
    }

    /* Bits 0-54 hold the page frame number; add the in-page offset.
       NOTE(review): bit 63 ("page present") is not checked -- for a swapped
       or unmapped page the PFN field is not a physical address; consider
       validating it before replying. */
    targetaddr = (uintptr_t)((entry & 0x7fffffffffffffULL) * pagesize)
               + (targetaddr % pagesize);

    if (write(cfd,&targetaddr,sizeof(targetaddr)) != sizeof(targetaddr))
    {
      warn("Cannot write for %d",ucred.pid);
      goto endf;
    }
  }
 endf:
  close(cfd);
  if (pfd >= 0)
    close(pfd);
  return NULL;  /* original fell off the end of a pointer-returning function */
}

// Set up the listening UNIX socket and create one thread per connection.
int main()
{
  int fd = socket(PF_UNIX,SOCK_STREAM,0);
  if (fd < 0)
    err(1,"socket");

  pagesize = sysconf(_SC_PAGESIZE);

  struct sockaddr_un address,client;
  socklen_t clientlen = sizeof(client);  /* accept() takes socklen_t */
  memset(&address,0,sizeof(struct sockaddr_un));  /* original omitted the
                                                     fill-value argument */

  unlink(SPATH);                    /* remove a stale socket from a prior run */
  address.sun_family = AF_UNIX;
  snprintf(address.sun_path,UNIX_PATH_MAX,SPATH);
  if (bind(fd,(struct sockaddr *)&address,sizeof(address)) < 0)
    err(1,"bind");

  chmod(SPATH,0666);                /* let unprivileged clients connect */
  if (listen(fd,5) < 0)
    err(1,"listen");

  int clientfd;
  while ((clientfd = accept(fd,(struct sockaddr *)&client,&clientlen)) >= 0)
  {
    /* Give each thread its own copy of the fd: passing &clientfd races with
       the next accept() overwriting it. thread_start never frees its arg, so
       these 4 bytes leak per connection -- negligible for this debug tool. */
    int *fdp = malloc(sizeof *fdp);
    if (fdp == NULL)
      err(2,"malloc");
    *fdp = clientfd;

    pthread_t child;
    /* pthread_create returns an error number (>0) on failure, never -1. */
    if (pthread_create(&child,NULL,thread_start,fdp) != 0)
      err(2,"pthread_create");
    pthread_detach(child);          /* no join: let finished threads reclaim */
  }
  return 0;
}

解决方法

为 PCIe 访问公开 CUDA 内存需要内核驱动程序调用。您可以在 GPUDirect RDMA documentation 中找到有关如何执行此操作的详细说明。