问题描述
我正在尝试找到一个 PMC(性能监控计数器)来显示 prefetcht0
指令命中 L1 dcache(或未命中)的次数。
icelake-client:Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
我希望做到细粒度的测量,如下所示(注意:实际使用时应该在 prefetcht0
前后各加一条 lfence
):
// Select counter index 0 and take a baseline reading (low 32 bits land in %eax).
xorl %ecx,%ecx
rdpmc
movl %eax,%edi
prefetcht0 (%rsi)
// %ecx is still 0, so this re-reads the same counter after the prefetch.
rdpmc
// cmpl sets ZF when both readings are equal, i.e. no event was counted.
// (testl would AND the two values, which does not tell whether they differ.)
cmpl %eax,%edi
// jump depending on if it was a miss or not
目标是检查预取是否命中 L1。如果没有命中,就执行一些事先准备好的代码;否则继续执行。
仅从现有可用的事件来看,似乎只能使用"未命中"类的事件。
我尝试了 libpfm4 和英特尔手册中的一些事件,但都没有成功:
L1-DCACHE-LOAD-MISSES,emask=0x00,umask=0x10000
L1D.REPLACEMENT,emask=0x51,umask=0x1
L2_RQSTS.SWPF_HIT,emask=0x24,umask=0xc8
L2_RQSTS.SWPF_MISS,emask=0x24,umask=0x28
LOAD_HIT_PREFETCH.SWPF,emask=0x01,umask=0x4c (this very misleadingly is non-sw prefetch hits)
L1D.REPLACEMENT
和 L1-DCACHE-LOAD-MISSES
勉强可用:如果我延迟执行 rdpmc
就能得到结果,但如果两条 rdpmc 紧挨着执行,结果充其量只能说是不可靠。其他事件则完全不起作用。
问题:
- 这些事件中是否有本应能够检测预取是否命中 L1 dcache 的?(也就是说,问题出在我的测试上)
- 如果没有,那么可以用什么事件来检测预取是否命中 L1 dcache?
编辑:MEM_LOAD_RETIRED.L1_HIT
似乎不适用于软件预取。
这是我用来做测试的代码:
#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
// The two values TODO can take.
#define HIT 0
#define MISS 1
// Scenario under test: when TODO == HIT, get_addr() also writes byte 0 of
// every page in addition to the last byte.
#define TODO MISS
// Stride between the bytes get_addr() touches, and the unit of the mmap length.
#define PAGE_SIZE 4096
// to force hit make TSIZE low
// Number of PAGE_SIZE-sized pages that get_addr() maps and touches.
#define TSIZE 10000
/*
 * err_assert(cond): terminate the program when `cond` is false.
 *
 * On failure prints "<line>:<errno>: <strerror(errno)>" to stderr and calls
 * exit(-1). The body is wrapped in do { } while (0) so that the macro behaves
 * as one statement: `if (x) err_assert(y); else ...` parses as written instead
 * of the `else` being orphaned by the trailing semicolon.
 */
#define err_assert(cond) \
    do { \
        if (__builtin_expect(!(cond),0)) { \
            fprintf(stderr,"%d:%d: %s\n",__LINE__,errno,strerror(errno)); \
            exit(-1); \
        } \
    } while (0)
/*
 * Map TSIZE anonymous, private, read/write pages and write to each of them.
 *
 * The last byte of every page is always written; when TODO == HIT the first
 * byte of every page is written as well.
 *
 * Returns the base address of the mapping as an integer. The mapping is never
 * unmapped. Exits via err_assert if mmap fails.
 */
uint64_t
get_addr() {
    uint8_t * addr =
        (uint8_t *)mmap(NULL,TSIZE * PAGE_SIZE,PROT_READ | PROT_WRITE,MAP_PRIVATE | MAP_ANONYMOUS,-1,0);
    // mmap signals failure with MAP_FAILED, not NULL, so a NULL comparison
    // would never catch a failed mapping.
    err_assert(addr != (uint8_t *)MAP_FAILED);
    for (uint32_t i = 0; i < TSIZE; ++i) {
        addr[i * PAGE_SIZE + (PAGE_SIZE - 1)] = 0;
#if TODO == HIT
        addr[i * PAGE_SIZE] = 0;
#endif
    }
    return (uint64_t)(uintptr_t)addr;
}
/*
 * Thin wrapper that issues the perf_event_open system call through syscall().
 * All five arguments are forwarded unchanged; the syscall result is returned
 * narrowed to int.
 */
int
perf_event_open(struct perf_event_attr * hw_event,pid_t pid,int cpu,int group_fd,unsigned long flags) {
    return (int)syscall(__NR_perf_event_open,hw_event,pid,cpu,group_fd,flags);
}
/*
 * Fill *pe for a user-space-only event.
 *
 * pe        - attribute struct to initialise; every field not set here is 0.
 * type      - value stored in pe->type (callers in this file pass PERF_TYPE_RAW).
 * ev_config - value stored in pe->config.
 * lead      - non-zero marks the event as initially disabled (pe->disabled = 1).
 *
 * exclude_kernel and exclude_hv are always set.
 */
void
init_perf_event_struct(struct perf_event_attr * pe,const uint32_t type,const uint64_t ev_config,int lead) {
    // memset takes (dest, value, length); start from an all-zero struct.
    __builtin_memset(pe,0,sizeof(struct perf_event_attr));
    pe->type = type;
    pe->size = sizeof(struct perf_event_attr);
    pe->config = ev_config;
    pe->disabled = !!lead;
    pe->exclude_kernel = 1;
    pe->exclude_hv = 1;
}
/* Fixed Counters */
// *_ev values are passed as perf_event_attr.config by init_perf_tracking();
// *_idx values are what gets loaded into %ecx before rdpmc (main() does this
// with core_cycles_idx).
// NOTE(review): presumably bit 30 of the rdpmc index selects the fixed-function
// counter bank — confirm against the RDPMC instruction reference.
// NOTE(review): on Intel, event 0x3c is usually unhalted core cycles and 0xc0
// instructions retired, so the two names below look swapped — confirm.
static constexpr uint32_t core_instruction_ev = 0x003c;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;
static constexpr uint32_t core_cycles_ev = 0x00c0;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;
static constexpr uint32_t ref_cycles_ev = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;
/* programmable counters */
// Raw configs for MEM_LOAD_RETIRED.L1_HIT / L1_MISS; the layout appears to be
// (umask << 8) | event-select.
static constexpr uint32_t mem_load_retired_l1_hit = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
/*
 * Open one perf event group for the calling thread and return the leader fd.
 *
 * The leader (core_instruction_ev) is created disabled; core_cycles_ev,
 * ref_cycles_ev and mem_load_retired_l1_hit are attached to it as group
 * members. Every event is PERF_TYPE_RAW and counts user space only (see
 * init_perf_event_struct). Any failure to open an event terminates the
 * program through err_assert.
 */
int
init_perf_tracking() {
    struct perf_event_attr pe;

    // pid = 0, cpu = -1: this thread on whichever CPU it runs; group_fd = -1
    // makes this event the group leader.
    init_perf_event_struct(&pe,PERF_TYPE_RAW,core_instruction_ev,1);
    int leadfd = perf_event_open(&pe,0,-1,-1,0);
    err_assert(leadfd >= 0);

    // Remaining events join the leader's group.
    init_perf_event_struct(&pe,PERF_TYPE_RAW,core_cycles_ev,0);
    err_assert(perf_event_open(&pe,0,-1,leadfd,0) >= 0);

    init_perf_event_struct(&pe,PERF_TYPE_RAW,ref_cycles_ev,0);
    err_assert(perf_event_open(&pe,0,-1,leadfd,0) >= 0);

    init_perf_event_struct(&pe,PERF_TYPE_RAW,mem_load_retired_l1_hit,0);
    err_assert(perf_event_open(&pe,0,-1,leadfd,0) >= 0);

    return leadfd;
}
void
start_perf_tracking(int leadfd) {
ioctl(leadfd,PERF_EVENT_IOC_RESET,0);
ioctl(leadfd,PERF_EVENT_IOC_ENABLE,0);
}
// Two-level stringification: V_TO_STR(X) expands X first, then turns the
// result into a string literal.
#define _V_TO_STR(X) #X
#define V_TO_STR(X) _V_TO_STR(X)
// Uncomment (or pass -DDO_PREFETCH) to probe with prefetcht0 instead of a load.
//#define DO_PREFETCH
// DO_MEMORY_OP(addr) yields the inline-asm line for the memory access under
// test; `addr` is the name of the asm operand holding the address.
#ifdef DO_PREFETCH
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#else
// Plain 32-bit load into %eax.
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]),%%eax\n\t"
#endif
/*
 * Measure one memory operation (see DO_MEMORY_OP) on the first byte of the
 * buffer from get_addr().
 *
 * Both the core-cycles counter and counter index 0 are read with rdpmc before
 * and after the operation; the deltas are printed to stderr. Only the low
 * 32 bits (%eax) of each reading are used.
 */
int
main() {
    int fd = init_perf_tracking();
    start_perf_tracking(fd);
    uint64_t addr = get_addr();
    uint32_t prefetch_miss,cycles_to_detect;
    asm volatile(
        "lfence\n\t"
        // Baseline: core cycles.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        // Baseline: counter index 0.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[prefetch_miss]\n\t"
        "lfence\n\t"
        DO_MEMORY_OP(prefetch_addr)
        "lfence\n\t"
        // Delta for counter index 0: after - before.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[prefetch_miss],%%eax\n\t"
        "movl %%eax,%[prefetch_miss]\n\t"
        // Delta for core cycles: after - before.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[cycles_to_detect],%%eax\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        "lfence\n\t"
        : [ prefetch_miss ] "=&r"(prefetch_miss),[ cycles_to_detect ] "=&r"(cycles_to_detect)
        : [ prefetch_addr ] "r"(addr),[ core_cycles_idx ] "i"(core_cycles_idx)
        : "eax","edx","ecx");
    fprintf(stderr,"Hit : %d\n",prefetch_miss);
    fprintf(stderr,"Cycles : %d\n",cycles_to_detect);
}
如果我定义 DO_PREFETCH
,MEM_LOAD_RETIRED.L1_HIT
的结果总是 1(看起来总是命中)。如果我注释掉 DO_PREFETCH
,结果与我预期的相符(当地址明显不在缓存中时报告未命中,当地址明显在缓存中时报告命中)。
使用DO_PREFETCH
:
g++ -DDO_PREFETCH -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit : 1
Cycles : 554
不使用 DO_PREFETCH 时:
g++ -O3 -march=native -mtune=native prefetch_hits.cc -o prefetch_hits
$> ./prefetch_hits
Hit : 0
Cycles : 888
使用 L2_RQSTS.SWPF_HIT
和 L2_RQSTS.SWPF_MISS
可以得到正确的结果。非常感谢 Hadi Brais。值得注意的是,L1D_PEND_MISS.PENDING
不起作用的原因可能与 Icelake 有关:Hadi Brais 报告说,在 Haswell 上它可以用来判断 L1D 缓存未命中。
为了便于查明 L1D_PEND_MISS.PENDING
和 MEM_LOAD_RETIRED.L1_HIT
为什么不起作用,下面贴出我用来测试它们的确切代码:
#include <asm/unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
// The two values TODO can take.
#define HIT 0
#define MISS 1
// Scenario under test: when TODO == HIT, main() stores to offset 0 of the
// buffer before the measured memory operation.
#define TODO MISS
// Page size used for the buffer length and for the offsets main() touches.
#define PAGE_SIZE 4096
// Number of PAGE_SIZE-sized pages in the buffer.
#define TSIZE 1000
#define err_assert(cond) \
if (__builtin_expect(!(cond),0);
err_assert(addr != NULL);
__builtin_memset(addr,TSIZE * PAGE_SIZE);
return uint64_t(addr);
}
/*
 * Thin wrapper that issues the perf_event_open system call through syscall().
 * All five arguments are forwarded unchanged; returns the syscall result.
 */
int
perf_event_open(struct perf_event_attr * hw_event,pid_t pid,int cpu,int group_fd,unsigned long flags) {
    int ret;
    ret = syscall(__NR_perf_event_open,hw_event,pid,cpu,group_fd,flags);
    return ret;
}

/*
 * Fill *pe for a user-space-only event.
 *
 * pe        - attribute struct to initialise; every field not set here is 0.
 * type      - value stored in pe->type.
 * ev_config - value stored in pe->config.
 * lead      - non-zero marks the event as initially disabled (pe->disabled = 1).
 *
 * exclude_kernel and exclude_hv are always set.
 */
void
init_perf_event_struct(struct perf_event_attr * pe,const uint32_t type,const uint64_t ev_config,int lead) {
    __builtin_memset(pe,0,sizeof(struct perf_event_attr));
    pe->type = type;
    pe->size = sizeof(struct perf_event_attr);
    pe->config = ev_config;
    pe->disabled = !!lead;
    pe->exclude_kernel = 1;
    pe->exclude_hv = 1;
}
/* Fixed Counters */
// *_ev values are raw perf event configs; *_idx values are what gets loaded
// into %ecx before rdpmc.
// NOTE(review): presumably bit 30 of the rdpmc index selects the fixed-function
// counter bank — confirm against the RDPMC instruction reference.
// NOTE(review): on Intel, event 0x3c is usually unhalted core cycles and 0xc0
// instructions retired, so the two names below look swapped — confirm.
static constexpr uint32_t core_instruction_ev = 0x003c;
static constexpr uint32_t core_instruction_idx = (1 << 30) + 0;
static constexpr uint32_t core_cycles_ev = 0x00c0;
static constexpr uint32_t core_cycles_idx = (1 << 30) + 1;
static constexpr uint32_t ref_cycles_ev = 0x0300;
static constexpr uint32_t ref_cycles_idx = (1 << 30) + 2;
/* programmable counters */
// Raw configs; the layout appears to be (umask << 8) | event-select.
static constexpr uint32_t mem_load_retired_l1_hit = 0x01d1;
static constexpr uint32_t mem_load_retired_l1_miss = 0x08d1;
static constexpr uint32_t l1d_pending = 0x0148;
static constexpr uint32_t swpf_hit = 0xc824;
static constexpr uint32_t swpf_miss = 0x2824;
// Event(s) actually measured: ev0 always, ev1 only when NEVENTS > 1.
static constexpr uint32_t ev0 = l1d_pending;
#define NEVENTS 1
#if NEVENTS > 1
static constexpr uint32_t ev1 = swpf_miss;
#endif
int
init_perf_tracking() {
struct perf_event_attr pe;
init_perf_event_struct(&pe,ev0,0) >= 0);
#if NEVENTS > 1
init_perf_event_struct(&pe,ev1,0) >= 0);
#endif
return leadfd;
}
void
start_perf_tracking(int leadfd) {
ioctl(leadfd,0);
}
// Two-level stringification: V_TO_STR(X) expands X first, then turns the
// result into a string literal.
#define _V_TO_STR(X) #X
#define V_TO_STR(X) _V_TO_STR(X)
// SERIALIZER() yields the inline-asm text placed around the measured
// operation: a single lfence when LFENCE is defined, otherwise cpuid with
// %eax and %ecx zeroed first.
//#define LFENCE
#ifdef LFENCE
#define SERIALIZER() "lfence\n\t"
#else
#define SERIALIZER() \
"xorl %%ecx,%%ecx\n\t" \
"xorl %%eax,%%eax\n\t" \
"cpuid\n\t"
#endif
// DO_MEMORY_OP(addr) yields the inline-asm line for the memory access under
// test; `addr` is the name of the asm operand holding the address.
#define DO_PREFETCH
#ifdef DO_PREFETCH
#define DO_MEMORY_OP(addr) "prefetcht0 (%[" V_TO_STR(addr) "])\n\t"
#else
// Plain 32-bit load into %eax.
#define DO_MEMORY_OP(addr) "movl (%[" V_TO_STR(addr) "]),%%eax\n\t"
#endif
/*
 * Measure one memory operation (see DO_MEMORY_OP) on offset 0 of the buffer
 * from get_addr().
 *
 * Core cycles, counter index 0 and (when NEVENTS > 1) counter index 1 are read
 * with rdpmc before and after the operation; the deltas are printed to stderr.
 * Only the low 32 bits (%eax) of each reading are used. With NEVENTS == 1,
 * ecount1 is never measured and prints as 0.
 */
int
main() {
    int fd = init_perf_tracking();
    start_perf_tracking(fd);
    uint64_t addr = get_addr();
    // to ensure page in TLB
    *((volatile uint64_t *)(addr + (PAGE_SIZE - 8))) = 0;
#if TODO == HIT
    // loading from 0 offset to check cache miss / hit
    *((volatile uint64_t *)addr) = 0;
#endif
    uint32_t ecount0 = 0,ecount1 = 0,cycles_to_detect = 0;
    asm volatile(
        SERIALIZER()
        // Baseline: core cycles.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        // Baseline: counter index 0.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[ecount0]\n\t"
#if NEVENTS > 1
        // Baseline: counter index 1.
        "movl $1,%%ecx\n\t"
        "rdpmc\n\t"
        "movl %%eax,%[ecount1]\n\t"
#endif
        SERIALIZER()
        DO_MEMORY_OP(prefetch_addr)
        SERIALIZER()
        // Delta for counter index 0: after - before.
        "xorl %%ecx,%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[ecount0],%%eax\n\t"
        "movl %%eax,%[ecount0]\n\t"
#if NEVENTS > 1
        // Delta for counter index 1.
        "movl $1,%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[ecount1],%%eax\n\t"
        "movl %%eax,%[ecount1]\n\t"
#endif
        // Delta for core cycles.
        "movl %[core_cycles_idx],%%ecx\n\t"
        "rdpmc\n\t"
        "subl %[cycles_to_detect],%%eax\n\t"
        "movl %%eax,%[cycles_to_detect]\n\t"
        SERIALIZER()
        : [ ecount0 ] "=&r"(ecount0),
#if NEVENTS > 1
          [ ecount1 ] "=&r"(ecount1),
#endif
          [ cycles_to_detect ] "=&r"(cycles_to_detect)
        : [ prefetch_addr ] "r"(addr),[ core_cycles_idx ] "i"(core_cycles_idx)
        // ebx is listed because the cpuid form of SERIALIZER() overwrites it;
        // without it the compiler could keep an operand in %ebx/%rbx.
        : "eax","ebx","ecx","edx");
    fprintf(stderr,"E0 : %d\n",ecount0);
    fprintf(stderr,"E1 : %d\n",ecount1);
    fprintf(stderr,"Cycles : %d\n",cycles_to_detect);
}
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)