Problem description
Hardware: Intel Skylake
I tried to put together some benchmarks to see how best to use it, but the results are strange.
Here is the benchmark code:
#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <x86intrin.h>
#define NLOOPS 5
#ifndef PARTITION_PER_CORE
#define PARTITION_PER_CORE 1
#endif
#ifndef NTHREADS
#define NTHREADS 8
#endif
#ifndef PARTITION
#define PARTITION 64
#endif
#ifndef TEST_SIZE
#define TEST_SIZE (1000000u)
#endif
#define XOR_ATOMIC(X,Y) __atomic_fetch_xor(X,Y,__ATOMIC_RELAXED)
// the volatile sink keeps the fetched (old) value live instead of discarding it
#define XOR_CAS_LOOP(X,Y) \
do { \
volatile uint64_t _sink = __atomic_fetch_xor(X,Y,__ATOMIC_RELAXED); \
} while (0)
#define ADD_ATOMIC(X,Y) __atomic_fetch_add(X,Y,__ATOMIC_RELAXED)
// same trick: keep the old value live so a plain "lock add" cannot be used
#define XADD_ATOMIC(X,Y) \
do { \
volatile uint64_t _sink = __atomic_fetch_add(X,Y,__ATOMIC_RELAXED); \
} while (0)
#define XOR(X,Y) *(X) ^= Y
#define ADD(X,Y) *(X) += (Y)
#ifndef OP1
#define OP1(X,Y) ADD(X,Y)
#endif
#ifndef OP2
#define OP2(X,Y)
#endif
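// Compiler-only fences: they compile to no instructions, but keep the compiler
// from reordering, hoisting, or eliminating the accesses inside the timed loop.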
#define bench_flush_all_pending() asm volatile("" : : : "memory");
#define bench_do_not_optimize_out(X) asm volatile("" : : "r,m"(X) : "memory")
uint64_t * r;
pthread_barrier_t b;
uint64_t total_cycles;
void
init() {
#if PARTITION_PER_CORE
assert(sysconf(_SC_NPROCESSORS_ONLN) * PARTITION <= sysconf(_SC_PAGE_SIZE));
#else
assert(NTHREADS * PARTITION <= sysconf(_SC_PAGE_SIZE));
#endif
r = (uint64_t *)mmap(NULL,sysconf(_SC_PAGE_SIZE),(PROT_READ | PROT_WRITE),(MAP_ANONYMOUS | MAP_PRIVATE),(-1),0);
assert(r != MAP_FAILED); // mmap reports failure with MAP_FAILED, not NULL
assert(!pthread_barrier_init(&b,NULL,NTHREADS));
total_cycles = 0;
}
void *
run1(void * targ) {
uint64_t * t_region = (uint64_t *)targ;
const uint64_t y = rand();
// page in memory / get in cache
*t_region = 0;
pthread_barrier_wait(&b);
uint64_t start_cycles = __rdtsc();
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
OP1(t_region,y);
bench_do_not_optimize_out(t_region);
}
bench_flush_all_pending();
uint64_t end_cycles = __rdtsc();
__atomic_fetch_add(&total_cycles,(end_cycles - start_cycles),__ATOMIC_RELAXED);
return NULL;
}
void *
run2(void * targ) {
uint64_t * t_region = (uint64_t *)targ;
const uint64_t y = rand();
// page in memory / get in cache
*t_region = 0;
pthread_barrier_wait(&b);
uint64_t start_cycles = __rdtsc();
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
OP2(t_region,y);
bench_do_not_optimize_out(t_region);
}
bench_flush_all_pending();
uint64_t end_cycles = __rdtsc();
__atomic_fetch_add(&total_cycles,(end_cycles - start_cycles),__ATOMIC_RELAXED);
return NULL;
}
void
test() {
init();
pthread_t * tids = (pthread_t *)calloc(NTHREADS,sizeof(pthread_t));
assert(tids != NULL);
cpu_set_t cset;
CPU_ZERO(&cset);
const uint32_t stack_size = (1 << 18);
uint32_t ncores = sysconf(_SC_NPROCESSORS_ONLN);
for (uint32_t i = 0; i < NTHREADS; ++i) {
CPU_SET(i % ncores,&cset);
pthread_attr_t attr;
assert(!pthread_attr_init(&attr));
assert(!pthread_attr_setaffinity_np(&attr,sizeof(cpu_set_t),&cset));
assert(!pthread_attr_setstacksize(&attr,stack_size));
#if PARTITION_PER_CORE
uint64_t * t_region = r + (i % ncores) * (PARTITION / sizeof(uint64_t));
#else
uint64_t * t_region = r + (i) * (PARTITION / sizeof(uint64_t));
#endif
if (i % 2) {
assert(!pthread_create(tids + i,&attr,run1,(void *)t_region));
}
else {
assert(!pthread_create(tids + i,&attr,run2,(void *)t_region));
}
CPU_ZERO(&cset);
assert(!pthread_attr_destroy(&attr));
}
for (uint32_t i = 0; i < NTHREADS; ++i) {
pthread_join(tids[i],NULL);
}
free(tids);
}
int
main(int argc,char ** argv) {
double results[NLOOPS];
for(uint32_t i = 0; i < NLOOPS; ++i) {
test();
double cycles_per_op = total_cycles;
cycles_per_op /= NTHREADS * TEST_SIZE;
results[i] = cycles_per_op;
}
char buf[64] = "";
strcpy(buf,argv[0]);
uint32_t len = strlen(buf);
uint32_t start_op1 = 0,start_op2 = 0;
for (uint32_t i = 0; i < len; ++i) {
if (start_op1 == 0 && buf[i] == '-') {
start_op1 = i + 1;
}
else if (buf[i] == '-') {
start_op2 = i + 1;
buf[i] = 0;
}
}
fprintf(stderr,"Results: %s\n\t"
"nthreads : %d\n\t"
"partition size : %d\n\t"
"partion_per_core : %s\n\t"
"Op1 : %s\n\t"
"Op2 : %s\n\t"
"Cycles Per Op : %.3lf",argv[0],NTHREADS,PARTITION,PARTITION_PER_CORE ? "true" : "false",buf + start_op1,buf + start_op2,results[0]);
for(uint32_t i = 1; i < NLOOPS; ++i) {
fprintf(stderr,",%.3lf",results[i]);
}
fprintf(stderr,"\n\n");
assert(!munmap(r,sysconf(_SC_PAGE_SIZE)));
assert(!pthread_barrier_destroy(&b));
}
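(For reference on the macro names: as far as I understand gcc on x86-64, whether the fetched value is actually consumed decides which instruction gets emitted. The snippet below only illustrates that assumption and is not part of the benchmark; checking the asm from g++ -O3 -S is the way to confirm it on a given machine.)
#include <stdint.h>
// Sketch of the assumed gcc/x86-64 lowering of the four atomic flavours used above.
uint64_t lowering_sketch(uint64_t * p, uint64_t y) {
    __atomic_fetch_add(p, y, __ATOMIC_RELAXED);               // result unused -> lock add           (ADD_ATOMIC)
    uint64_t a = __atomic_fetch_add(p, y, __ATOMIC_RELAXED);  // result used   -> lock xadd          (XADD_ATOMIC)
    __atomic_fetch_xor(p, y, __ATOMIC_RELAXED);               // result unused -> lock xor           (XOR_ATOMIC)
    uint64_t b = __atomic_fetch_xor(p, y, __ATOMIC_RELAXED);  // result used   -> lock cmpxchg loop  (XOR_CAS_LOOP)
    return a + b;                                             // keep both fetched values live
}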
Then, to build and run it, I have been using:
#! /bin/bash
CFLAGS="-O3 -march=native -mtune=native"
LDFLAGS="-lpthread"
# NOTE: nthreads * partition must be <= PAGE_SIZE (the test intentionally
# fails an assertion if the memory would span multiple pages)
# nthreads : number of threads performing operations
# op1 : even tids will perform op1 (tid is the order they were created
# in, starting from 0)
# partition_per_core: boolean, if true multiple threads pinned to the
# same core will share the same destination, if
# false each thread will have a unique memory
# destination (this can be ignored if nthreads <= ncores)
# op2 : odd tids will perform op2 (tid is the order they were created
# in, starting from 0)
# partition : space between destinations (i.e. partition
# destinations by cache line size or 2 * cache
# line size)
for nthreads in 8; do
for partition_per_core in 1; do
for op1 in ADD XOR ADD_ATOMIC XADD_ATOMIC XOR_ATOMIC XOR_CAS_LOOP; do
for op2 in ADD XOR ADD_ATOMIC XADD_ATOMIC XOR_ATOMIC XOR_CAS_LOOP; do
for partition in 64 128; do
g++ ${CFLAGS} test_atomics.cc -o test-${op1}-${op2} -DPARTITION=${partition} -DNTHREADS=${nthreads} -DPARTITION_PER_CORE=${partition_per_core} -DOP1=${op1} -DOP2=${op2} ${LDFLAGS};
./test-${op1}-${op2};
rm -f test-${op1}-${op2};
done
echo "--------------------------------------------------------------------------------"
done
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
done
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
done
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
done
Some of the results make sense, e.g. with lock xaddq I get:
Results: ./test-XADD_ATOMIC-XADD_ATOMIC
nthreads : 8
partition size : 64
partion_per_core : true
Op1 : XADD_ATOMIC
Op2 : XADD_ATOMIC
Cycles Per Op : 21.547,20.089,38.852,26.723,25.934
Results: ./test-XADD_ATOMIC-XADD_ATOMIC
nthreads : 8
partition size : 128
partion_per_core : true
Op1 : XADD_ATOMIC
Op2 : XADD_ATOMIC
Cycles Per Op : 19.607,19.187,19.483,18.857,18.721
which basically shows an across-the-board improvement with the 128-byte partition. Overall, though, for the CAS loops the 128-byte partition (for example:
Results: ./test-XOR_CAS_LOOP-XOR_CAS_LOOP
nthreads : 8
partition size : 64
partion_per_core : true
Op1 : XOR_CAS_LOOP
Op2 : XOR_CAS_LOOP
Cycles Per Op : 20.273,20.061,20.737,21.240,21.747
Results: ./test-XOR_CAS_LOOP-XOR_CAS_LOOP
nthreads : 8
partition size : 128
partion_per_core : true
Op1 : XOR_CAS_LOOP
Op2 : XOR_CAS_LOOP
Cycles Per Op : 20.632,20.432,21.710,22.627,23.070
basically gives the opposite of the expected result. Likewise, when mixing atomic and non-atomic operations, I cannot see any difference between the 64-byte and 128-byte partition sizes, i.e.:
Results: ./test-XADD_ATOMIC-ADD
nthreads : 8
partition size : 64
partion_per_core : true
Op1 : XADD_ATOMIC
Op2 : ADD
Cycles Per Op : 11.117,11.186,11.223,11.169,11.138
Results: ./test-XADD_ATOMIC-ADD
nthreads : 8
partition size : 128
partion_per_core : true
Op1 : XADD_ATOMIC
Op2 : ADD
Cycles Per Op : 11.126,11.157,11.072,11.227,11.080
What I am curious about is:
- Why don't the CAS loops behave like the rest of the atomics when it comes to the optimal partition size?
- If my understanding is right that what makes the 128-byte partition more effective is sharing at the L2 cache level, why does this only seem to affect the atomic operations? The non-atomic ones should still be constantly invalidating the cache lines.
- Is there a general rule for knowing when 128-byte partitioning is needed and when 64-byte partitioning is enough to save memory? (a sketch of the two layouts follows this list)
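To make the last question concrete, here is a purely illustrative sketch of the two layouts as padded per-thread slots (padded_slot is a made-up name; the benchmark above spaces raw pointers instead):
#include <stdint.h>
// Illustrative only: each thread owns one slot, and slots are sized and aligned to
// PARTITION bytes. With 64 a slot owns one cache line; with 128 it owns an aligned
// pair of lines, so neighbouring slots never share a 128-byte-aligned line pair.
template <unsigned PARTITION>
struct alignas(PARTITION) padded_slot {
    uint64_t value;
    char pad[PARTITION - sizeof(uint64_t)];
};
static padded_slot<64>  slots_64[8];   // one cache line per thread (8 threads as above)
static padded_slot<128> slots_128[8];  // one aligned 128-byte line pair per thread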
Note: I am not asking, and am not expecting, anyone to read through my code. I am including it only as context for the question / where my numbers come from. If you have questions/concerns/comments about the benchmarking methodology, I will respond.