用于统计软件预取是否命中 L1 缓存的 PMC

问题描述

我正在尝试找到一个 PMC（性能监控计数器）来显示 prefetcht0 指令命中 L1 dcache（或未命中）的次数。

icelake-client：Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz

我希望能做到细粒度的测量，例如（注意 prefetcht0 的前后应该各加一条 lfence）：

    xorl %ecx,%ecx
    rdpmc
    movl %eax,%edi
    prefetcht0 (%rsi)
    rdpmc
    testl %eax,%edi
    // jump depending on if it was a miss or not

目标是检查预取是否命中 L1：如果未命中，就执行一些事先准备好的代码；否则继续执行。

从现有的事件来看，似乎只能借助某个未命中（miss）事件来实现。

我尝试了 libpfm4 和英特尔手册中的一些事件，但都没有成功：

L1-DCACHE-LOAD-MISSES,emask=0x00,umask=0x10000
L1D.REPLACEMENT,emask=0x51,umask=0x1 
L2_RQSTS.SWPF_HIT,emask=0x24,umask=0xc8
L2_RQSTS.SWPF_MISS,emask=0x24,umask=0x28
LOAD_HIT_PREFETCH.SWPF,emask=0x01,umask=0x4c  (this very misleadingly is non-sw prefetch hits)

L1D.REPLACEMENT 和 L1-DCACHE-LOAD-MISSES 勉强可用：如果我推迟执行第二条 rdpmc，它们能正常工作；但如果两条 rdpmc 紧挨着执行，结果充其量只能说不可靠。其他事件则完全不起作用。

问题：

这些事件是否本来就应该能检测预取是否命中 L1 dcache？（也就是说，是不是我的测试方法有问题）
如果不是，那么什么事件可用于检测预取是否命中 L1 dcache？

编辑：MEM_LOAD_RETIRED.L1_HIT 似乎不适用于软件预取。

这是我用来做测试的代码：

#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>


// Scenario selectors compared against TODO by the preprocessor.
#define HIT  0
#define MISS 1

// Scenario built into this binary; when it equals HIT, get_addr() also writes
// the first byte of every page.
#define TODO MISS


// Byte stride between the locations get_addr() touches.
// NOTE(review): presumably meant to equal the system page size — confirm.
#define PAGE_SIZE 4096

// to force hit make TSIZE low
// Number of PAGE_SIZE-byte pages mapped by get_addr().
#define TSIZE     10000

/*
 * err_assert(cond): if `cond` is false, print "<line>:<errno>: <strerror>" to
 * stderr and terminate the process with exit(-1).
 *
 * Wrapped in do { } while (0) so that `err_assert(x);` is exactly one
 * statement and can be used safely in an unbraced if/else.
 */
#define err_assert(cond)                                                       \
    do {                                                                       \
        if (__builtin_expect(!(cond), 0)) {                                    \
            fprintf(stderr, "%d:%d: %s\n", __LINE__, errno, strerror(errno));  \
            exit(-1);                                                          \
        }                                                                      \
    } while (0)


/*
 * Map TSIZE anonymous, private, read/write pages and write to each of them.
 *
 * The last byte of every page is always written; when TODO == HIT the first
 * byte of every page is written as well.
 *
 * Returns the base address of the mapping as an integer. The mapping is never
 * unmapped. Terminates the process via err_assert() if mmap fails.
 */
uint64_t
get_addr() {
    void * mapping = mmap(NULL, (size_t)TSIZE * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    // mmap signals failure with MAP_FAILED ((void *)-1), never with NULL.
    err_assert(mapping != MAP_FAILED);

    uint8_t * addr = (uint8_t *)mapping;
    for (size_t i = 0; i < TSIZE; ++i) {
        addr[i * PAGE_SIZE + (PAGE_SIZE - 1)] = 0;
#if TODO == HIT
        addr[i * PAGE_SIZE] = 0;
#endif
    }

    return (uint64_t)addr;
}

/*
 * Invoke the perf_event_open system call through syscall(), forwarding all
 * five arguments unchanged.
 *
 * Returns the syscall's result converted to int.
 */
int
perf_event_open(struct perf_event_attr * hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
    return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

/*
 * Fill `*pe` for a user-space-only counting event.
 *
 * pe        - attribute structure to initialise; every byte is overwritten.
 * type      - value stored in pe->type.
 * ev_config - value stored in pe->config.
 * lead      - non-zero sets pe->disabled to 1, zero sets it to 0.
 *
 * exclude_kernel and exclude_hv are always set to 1.
 */
void
init_perf_event_struct(struct perf_event_attr * pe, const uint32_t type, const uint64_t ev_config, int lead) {
    // Zero the whole structure first so every field not assigned below is 0.
    __builtin_memset(pe, 0, sizeof(struct perf_event_attr));

    pe->type           = type;
    pe->size           = sizeof(struct perf_event_attr);
    pe->config         = ev_config;
    pe->disabled       = !!lead;
    pe->exclude_kernel = 1;
    pe->exclude_hv     = 1;
}


/* Fixed Counters */
// *_ev values are raw perf configs encoded as (umask << 8) | event_select.
// *_idx values are the ECX selectors for rdpmc; bit 30 selects the
// fixed-function counter bank.

// Instructions retired: event 0xc0, umask 0x00 (fixed counter 0).
static constexpr uint32_t core_instruction_ev  = 0x00c0;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;

// Unhalted core cycles: event 0x3c, umask 0x00 (fixed counter 1).
static constexpr uint32_t core_cycles_ev  = 0x003c;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;

// Reference cycles: perf pseudo-encoding event 0x00, umask 0x03 (fixed counter 2).
static constexpr uint32_t ref_cycles_ev  = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;

/* programmable counters */
// MEM_LOAD_RETIRED.L1_HIT / .L1_MISS: event 0xd1, umask 0x01 / 0x08.
static constexpr uint32_t mem_load_retired_l1_hit  = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;


/*
 * Open one perf event group on the calling thread (pid 0, any CPU).
 *
 * The leader is created disabled; the three remaining events are attached to
 * it through group_fd. All are PERF_TYPE_RAW events: the three fixed-counter
 * events followed by mem_load_retired_l1_hit.
 *
 * Returns the group leader's file descriptor. Terminates the process via
 * err_assert() if any perf_event_open call fails. The member descriptors are
 * intentionally kept open (and not returned) for the life of the process.
 */
int
init_perf_tracking() {
    struct perf_event_attr pe;

    init_perf_event_struct(&pe, PERF_TYPE_RAW, core_instruction_ev, 1);
    int leadfd = perf_event_open(&pe, 0, -1, -1, 0);
    err_assert(leadfd >= 0);

    init_perf_event_struct(&pe, PERF_TYPE_RAW, core_cycles_ev, 0);
    err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);

    init_perf_event_struct(&pe, PERF_TYPE_RAW, ref_cycles_ev, 0);
    err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);

    init_perf_event_struct(&pe, PERF_TYPE_RAW, mem_load_retired_l1_hit, 0);
    err_assert(perf_event_open(&pe, 0, -1, leadfd, 0) >= 0);

    return leadfd;
}

void
start_perf_tracking(int leadfd) {
    ioctl(leadfd,PERF_EVENT_IOC_RESET,0);
    ioctl(leadfd,PERF_EVENT_IOC_ENABLE,0);
}

// Two-level stringification: _V_TO_STR applies '#' directly, V_TO_STR expands
// its argument first and then stringifies the result.
#define _V_TO_STR(X) #X
#define V_TO_STR(X)  _V_TO_STR(X)

// Build switch: define DO_PREFETCH (here or with -DDO_PREFETCH) to measure a
// prefetcht0 instead of a load.
//#define DO_PREFETCH
// DO_MEMORY_OP(addr) expands to one line of inline-asm text that dereferences
// the named asm operand `addr`: prefetcht0 when DO_PREFETCH is defined,
// otherwise a 32-bit load into EAX.
#ifdef DO_PREFETCH
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#else
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]),%%eax\n\t"
#endif


/*
 * Measure a single DO_MEMORY_OP on the first byte of the buffer returned by
 * get_addr().
 *
 * Two counters are sampled with rdpmc before and after the operation and the
 * differences are printed to stderr:
 *   - "Hit"    : delta of the counter selected by ECX = 0
 *   - "Cycles" : delta of the counter selected by core_cycles_idx
 *
 * NOTE(review): ECX = 0 is presumably expected to be the programmable counter
 * holding mem_load_retired_l1_hit; the kernel chooses the actual counter, so
 * confirm the assignment (e.g. via the perf mmap page) before trusting it.
 */
int
main() {
    int fd = init_perf_tracking();
    start_perf_tracking(fd);

    uint64_t addr = get_addr();

    uint32_t prefetch_miss, cycles_to_detect;
    asm volatile(
        "lfence\n\t"
        // "Before" sample of the cycle counter.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        // "Before" sample of counter 0.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[prefetch_miss]\n\t"
        "lfence\n\t"
        DO_MEMORY_OP(prefetch_addr)
        "lfence\n\t"
        // "After" sample of counter 0; keep after - before.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[prefetch_miss],%%eax\n\t"
        "movl %%eax,%[prefetch_miss]\n\t"
        // "After" sample of the cycle counter; keep after - before.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[cycles_to_detect],%%eax\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        "lfence\n\t"
        : [ prefetch_miss ] "=&r"(prefetch_miss), [ cycles_to_detect ] "=&r"(cycles_to_detect)
        : [ prefetch_addr ] "r"(addr), [ core_cycles_idx ] "i"(core_cycles_idx)
        : "eax", "edx", "ecx");

    fprintf(stderr,"Hit    : %d\n",prefetch_miss);
    fprintf(stderr,"Cycles : %d\n",cycles_to_detect);
}

如果我定义 DO_PREFETCH，MEM_LOAD_RETIRED.L1_HIT 的结果总是 1（似乎总是命中）。如果我注释掉 DO_PREFETCH，结果与我预期的相符（当地址明显不在缓存中时报告未命中，当地址明显在缓存中时报告命中）。

使用DO_PREFETCH：

g++ -DDO_PREFETCH -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit    : 1
Cycles : 554

并且没有DO_PREFETCH

g++ -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit    : 0
Cycles : 888

使用 L2_RQSTS.SWPF_HIT 和 L2_RQSTS.SWPF_MISS 能够让它工作。非常感谢 Hadi Brais。值得注意的是，L1D_PEND_MISS.PENDING 不起作用的原因可能与 Icelake 有关。 Hadi Brais 报告说，它可以用于预测 Haswell 上的 L1D 缓存未命中。

为了弄清楚为什么 L1D_PEND_MISS.PENDING 和 MEM_LOAD_RETIRED.L1_HIT 不起作用，下面贴出我用来测试它们的完整代码：

#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>


// Scenario selectors compared against TODO by the preprocessor.
#define HIT  0
#define MISS 1

// Scenario built into this binary; when it equals HIT, main() writes to the
// probed address before measuring it.
#define TODO MISS


// Offset unit used by main() and get_addr().
// NOTE(review): presumably meant to equal the system page size — confirm.
#define PAGE_SIZE 4096

// Number of PAGE_SIZE-byte pages in the test buffer.
#define TSIZE 1000

#define err_assert(cond)                                                       \
    if (__builtin_expect(!(cond),0);
    err_assert(addr != NULL);
    __builtin_memset(addr,TSIZE * PAGE_SIZE);
    return uint64_t(addr);
}

/*
 * Invoke the perf_event_open system call through syscall(), forwarding all
 * five arguments unchanged.
 *
 * Returns the syscall's result converted to int.
 */
int
perf_event_open(struct perf_event_attr * hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
    int ret;

    ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
    return ret;
}

/*
 * Fill `*pe` for a user-space-only counting event.
 *
 * pe        - attribute structure to initialise; every byte is overwritten.
 * type      - value stored in pe->type.
 * ev_config - value stored in pe->config.
 * lead      - non-zero sets pe->disabled to 1, zero sets it to 0.
 *
 * exclude_kernel and exclude_hv are always set to 1.
 */
void
init_perf_event_struct(struct perf_event_attr * pe, const uint32_t type, const uint64_t ev_config, int lead) {
    // Zero the whole structure first so every field not assigned below is 0.
    __builtin_memset(pe, 0, sizeof(struct perf_event_attr));

    pe->type           = type;
    pe->size           = sizeof(struct perf_event_attr);
    pe->config         = ev_config;
    pe->disabled       = !!lead;
    pe->exclude_kernel = 1;
    pe->exclude_hv     = 1;
}


/* Fixed Counters */
// *_ev values are raw perf configs encoded as (umask << 8) | event_select.
// *_idx values are the ECX selectors for rdpmc; bit 30 selects the
// fixed-function counter bank.

// Instructions retired: event 0xc0, umask 0x00 (fixed counter 0).
static constexpr uint32_t core_instruction_ev  = 0x00c0;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;

// Unhalted core cycles: event 0x3c, umask 0x00 (fixed counter 1).
static constexpr uint32_t core_cycles_ev  = 0x003c;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;

// Reference cycles: perf pseudo-encoding event 0x00, umask 0x03 (fixed counter 2).
static constexpr uint32_t ref_cycles_ev  = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;

/* programmable counters */
// MEM_LOAD_RETIRED.L1_HIT / .L1_MISS: event 0xd1, umask 0x01 / 0x08.
static constexpr uint32_t mem_load_retired_l1_hit  = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
// L1D_PEND_MISS.PENDING: event 0x48, umask 0x01.
static constexpr uint32_t l1d_pending              = 0x0148;
// L2_RQSTS.SWPF_HIT / .SWPF_MISS: event 0x24, umask 0xc8 / 0x28.
static constexpr uint32_t swpf_hit                 = 0xc824;
static constexpr uint32_t swpf_miss                = 0x2824;
// First programmable event under test.
static constexpr uint32_t ev0                      = l1d_pending;

// Number of programmable events to measure; ev1 exists only when this is > 1.
#define NEVENTS 1
#if NEVENTS > 1
static constexpr uint32_t ev1 = swpf_miss;
#endif

int
init_perf_tracking() {
    struct perf_event_attr pe;

    init_perf_event_struct(&pe,ev0,0) >= 0);

#if NEVENTS > 1
    init_perf_event_struct(&pe,ev1,0) >= 0);
#endif

    return leadfd;
}

void
start_perf_tracking(int leadfd) {
    ioctl(leadfd,0);
}

// Two-level stringification: _V_TO_STR applies '#' directly, V_TO_STR expands
// its argument first and then stringifies the result.
#define _V_TO_STR(X) #X
#define V_TO_STR(X)  _V_TO_STR(X)

// Build switch: define LFENCE to fence with lfence instead of cpuid.
//#define LFENCE
// SERIALIZER() expands to inline-asm text placed around the measured
// operation: a single lfence when LFENCE is defined, otherwise cpuid with
// EAX and ECX zeroed first.
// cpuid overwrites EAX, EBX, ECX and EDX, so any asm statement using this
// variant must list all four registers as clobbers.
#ifdef LFENCE
#define SERIALIZER() "lfence\n\t"
#else
#define SERIALIZER()                                                           \
    "xorl %%ecx,%%ecx\n\t"                                                    \
    "xorl %%eax,%%eax\n\t"                                                    \
    "cpuid\n\t"

#endif

// Build switch: when defined, the measured operation is a prefetcht0;
// otherwise it is a 32-bit load.
#define DO_PREFETCH

// DO_MEMORY_OP(addr) expands to one line of inline-asm text that dereferences
// the named asm operand `addr`.
#ifdef DO_PREFETCH
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#else
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]),%%eax\n\t"
#endif


/*
 * Measure a single DO_MEMORY_OP on the first byte of the buffer returned by
 * get_addr().
 *
 * Counters are sampled with rdpmc before and after the operation, with
 * SERIALIZER() around the operation, and the differences are printed to
 * stderr:
 *   - "E0"     : delta of the counter selected by ECX = 0
 *   - "E1"     : delta of the counter selected by ECX = 1 (only measured when
 *                NEVENTS > 1, otherwise printed as 0)
 *   - "Cycles" : delta of the counter selected by core_cycles_idx
 *
 * NOTE(review): ECX = 0 / 1 are presumably expected to be the programmable
 * counters holding ev0 / ev1; the kernel chooses the actual counters, so
 * confirm the assignment before trusting the numbers. Much of the asm body
 * was lost from the original listing and is reconstructed here.
 */
int
main() {
    int fd = init_perf_tracking();
    start_perf_tracking(fd);

    uint64_t addr = get_addr();

    // to ensure page in TLB
    *((volatile uint64_t *)(addr + (PAGE_SIZE - 8))) = 0;

#if TODO == HIT
    // loading from 0 offset to check cache miss / hit
    *((volatile uint64_t *)addr) = 0;
#endif

    uint32_t ecount0 = 0, ecount1 = 0, cycles_to_detect = 0;
    asm volatile(
        SERIALIZER()
        // "Before" sample of the cycle counter.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        // "Before" sample of counter 0.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[ecount0]\n\t"
#if NEVENTS > 1
        // "Before" sample of counter 1.
        "movl $1,%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[ecount1]\n\t"
#endif
        SERIALIZER()
        DO_MEMORY_OP(prefetch_addr)
        SERIALIZER()
        // "After" sample of counter 0; keep after - before.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[ecount0],%%eax\n\t"
        "movl %%eax,%[ecount0]\n\t"
#if NEVENTS > 1
        // "After" sample of counter 1; keep after - before.
        "movl $1,%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[ecount1],%%eax\n\t"
        "movl %%eax,%[ecount1]\n\t"
#endif
        // "After" sample of the cycle counter; keep after - before.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[cycles_to_detect],%%eax\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        SERIALIZER()
        : [ ecount0 ] "=&r"(ecount0),
#if NEVENTS > 1
          [ ecount1 ] "=&r"(ecount1),
#endif
          [ cycles_to_detect ] "=&r"(cycles_to_detect)
        : [ prefetch_addr ] "r"(addr), [ core_cycles_idx ] "i"(core_cycles_idx)
        // ebx is listed because the cpuid form of SERIALIZER() overwrites it.
        : "eax", "ebx", "ecx", "edx");

    fprintf(stderr,"E0     : %d\n",ecount0);
    fprintf(stderr,"E1     : %d\n",ecount1);
    fprintf(stderr,"Cycles : %d\n",cycles_to_detect);
}

解决方法

暂未找到可以解决该问题的有效方法，小编正在努力寻找和整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱：dio#foxmail.com（将 # 修改为 @）

intel intel-pmu memory-barriers performancecounter x86-64