问题描述
我制作了一个程序来测试不同功能的性能,这些功能以不同的方式执行相同的操作。但是,它们中的一些会生成几乎完全相同的机器代码,只是地址不同,而且当我测量执行时间时,其中一个比另一个要快。这几乎在每次运行中都始终如一地发生,我不知道为什么。奇怪的是,当我删除一些其他测试功能时,差异会分散。我在StackOverflow上为another答案编写了代码,想了解原因。
我在Debian 10 Buster和AMD64上运行,并使用GCC。我用-O3
编译它。
下面是我的代码。尽管有相同的代码,但功能lowerA()
几乎每次都比lower9()
快得多。如果我删除未使用的功能lower1()
-lower8()
,区别就会消失。
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
//#define DEBUG
#ifdef DEBUG
#define N 10
#else
#define N 1000UL*100
#endif
#define M 20
#define STR_(x) #x
#define STR(x) STR_(x)
void lower1(char *s)
{
size_t i;
for (i = 0; i < strlen(s); i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] -= ('A' - 'a');
}
}
}
void lower2(char *s)
{
size_t i;
size_t len = strlen(s);
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] -= ('A' - 'a');
}
}
}
void lower3(char *s)
{
size_t i;
size_t len = strlen(s);
int d='A'-'a';
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] -= d;
}
}
}
void lower4(char *s)
{
size_t i;
size_t len = strlen(s);
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] += 0x20;
}
}
}
void lower5(char *s)
{
size_t i;
for (i = 0; i < strlen(s); i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] += ('a' - 'A');
}
}
}
void lower6(char *s)
{
size_t i;
for (i = 0; i < strlen(s); i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] |= 0x20;
}
}
}
void lower7(char *s)
{
size_t i;
size_t len = strlen(s);
for (i = 0; i < len; i++)
{
if (s[i] >= 'A' && s[i] <= 'Z')
{
s[i] |= 0x20;
}
}
}
void lower8(char *s)
{
size_t len = strlen(s);
while(len--)
{
if (*s >= 'A' && *s <= 'Z')
{
*s |= 0x20;
}
s++;
}
}
void lower9(char *s)
{
while(1)
{
if (!*s)
{
break;
}
if (*s >= 'A' && *s <= 'Z')
{
*s |= 0x20;
}
s++;
}
}
void lowerA(char *s)
{
while(*s)
{
if (*s >= 'A' && *s <= 'Z')
{
*s |= 0x20;
}
s++;
}
}
uint64_t die(const char *msg)
{
fprintf(stderr,"die: %s : %s\n",msg,strerror(errno));
exit(1);
}
uint64_t getTime(void)
{
uint64_t time;
struct timespec t_v;
if(clock_gettime(CLOCK_BOottIME,&t_v)<0)
{
die("cant get time");
}
time=t_v.tv_sec*1000000000ULL;
time+=t_v.tv_nsec;
return time;
}
void test(void (*fp)(char *),char (*s)[M],const char *name)
{
static char (*copy)[M];
copy=malloc(N*M);
if(!copy)
{
die("can't alloc memory");
}
memcpy(copy,s,N*M);
uint64_t start=getTime();
for(size_t u=0;u<N;u++)
{
fp(copy[u]);
}
uint64_t end=getTime();
printf("time %13"PRIu64" %s\n",end-start,name);
#ifdef DEBUG
for(size_t u=0;u<N;u++)
{
printf("%3zu %"STR(M)"s %"STR(M)"s\n",u,s[u],copy[u]);
}
#endif
free(copy);
}
void runTest(void)
{
//create a random string
srand(getTime());
static char string[N][M];
for(size_t u=0;u<N;u++)
{
size_t l=rand()%M;
for(size_t i=0;i<l;i++)
{
string[u][i]=rand()%('z'-'/')+'/';
}
string[u][l]='\0';
}
#define TEST(s) test(s,string,STR(s))
TEST(lower9);
TEST(lowerA);
}
int main(void)
{
for(unsigned i=0;i<8;i++)
{
runtest();
}
return 1;
}
这是对函数lower9()
和lowerA()
的反汇编:
Dump of assembler code for function lower9:
0x00000000000017b0 <+0>: movzbl (%rdi),%eax
0x00000000000017b3 <+3>: test %al,%al
0x00000000000017b5 <+5>: je 0x17eb <lower9+59>
0x00000000000017b7 <+7>: nopw 0x0(%rax,%rax,1)
0x00000000000017c0 <+16>: lea -0x41(%rax),%edx
0x00000000000017c3 <+19>: cmp $0x19,%dl
0x00000000000017c6 <+22>: ja 0x17e0 <lower9+48>
0x00000000000017c8 <+24>: or $0x20,%eax
0x00000000000017cb <+27>: add $0x1,%rdi
0x00000000000017cf <+31>: mov %al,-0x1(%rdi)
0x00000000000017d2 <+34>: movzbl (%rdi),%eax
0x00000000000017d5 <+37>: test %al,%al
0x00000000000017d7 <+39>: jne 0x17c0 <lower9+16>
0x00000000000017d9 <+41>: retq
0x00000000000017da <+42>: nopw 0x0(%rax,1)
0x00000000000017e0 <+48>: add $0x1,%rdi
0x00000000000017e4 <+52>: movzbl (%rdi),%eax
0x00000000000017e7 <+55>: test %al,%al
0x00000000000017e9 <+57>: jne 0x17c0 <lower9+16>
0x00000000000017eb <+59>: retq
End of assembler dump.
Dump of assembler code for function lowerA:
0x00000000000017f0 <+0>: movzbl (%rdi),%eax
0x00000000000017f3 <+3>: test %al,%al
0x00000000000017f5 <+5>: je 0x182b <lowerA+59>
0x00000000000017f7 <+7>: nopw 0x0(%rax,1)
0x0000000000001800 <+16>: lea -0x41(%rax),%edx
0x0000000000001803 <+19>: cmp $0x19,%dl
0x0000000000001806 <+22>: ja 0x1820 <lowerA+48>
0x0000000000001808 <+24>: or $0x20,%eax
0x000000000000180b <+27>: add $0x1,%rdi
0x000000000000180f <+31>: mov %al,-0x1(%rdi)
0x0000000000001812 <+34>: movzbl (%rdi),%eax
0x0000000000001815 <+37>: test %al,%al
0x0000000000001817 <+39>: jne 0x1800 <lowerA+16>
0x0000000000001819 <+41>: retq
0x000000000000181a <+42>: nopw 0x0(%rax,1)
0x0000000000001820 <+48>: add $0x1,%rdi
0x0000000000001824 <+52>: movzbl (%rdi),%eax
0x0000000000001827 <+55>: test %al,%al
0x0000000000001829 <+57>: jne 0x1800 <lowerA+16>
0x000000000000182b <+59>: retq
End of assembler dump.
一个示例结果,排序:
time 4145299 lowerA
time 4153573 lowerA
time 4155081 lowerA
time 4158537 lowerA
time 4173954 lowerA
time 4190982 lowerA
time 4196201 lowerA
time 4202252 lower9
time 4209932 lower9
time 4214722 lowerA
time 4215861 lower9
time 4216471 lower9
time 4243532 lower9
time 4315601 lower9
time 4323003 lower9
time 4331462 lower9
为什么lowerA()
快一点?
修改 用c我没有得到相同的行为。
Edit2
交换lower9()
和lowerA()
的呼叫可以使lower9()
更快。
(编者注:如Idiomatic way of performance evaluation?中所述,这可能意味着预热不足,或者在这种情况下,内联后,代码对齐的差异可能会有所不同。尤其是当clang显示没有差异时。)
Edit3 我使用的是Intel Xeon CPU W3570 @ 3.20GHz,Bloomfield microarchitecture,又名Nehalem服务器。
编辑4
gcc --version
给出gcc (Debian 8.3.0-6) 8.3.0
Peter Cordes说该代码可以内联,我认为他是正确的,这是runtest()
的废话:
Dump of assembler code for function runTest:
0x00000000000019d0 <+0>: push %r15
0x00000000000019d2 <+2>: mov $0x7,%edi
0x00000000000019d7 <+7>: push %r14
0x00000000000019d9 <+9>: push %r13
0x00000000000019db <+11>: push %r12
0x00000000000019dd <+13>: push %rbp
0x00000000000019de <+14>: push %rbx
0x00000000000019df <+15>: sub $0x58,%rsp
0x00000000000019e3 <+19>: mov %rsp,%rsi
0x00000000000019e6 <+22>: callq 0x1050 <clock_gettime@plt>
0x00000000000019eb <+27>: test %eax,%eax
0x00000000000019ed <+29>: js 0x1cd1 <runTest+769>
0x00000000000019f3 <+35>: imul $0x3b9aca00,(%rsp),%rdi
0x00000000000019fb <+43>: add 0x8(%rsp),%rdi
0x0000000000001a00 <+48>: xor %r14d,%r14d
0x0000000000001a03 <+51>: xor %r13d,%r13d
0x0000000000001a06 <+54>: callq 0x1080 <srand@plt>
0x0000000000001a0b <+59>: nopl 0x0(%rax,1)
0x0000000000001a10 <+64>: callq 0x10e0 <rand@plt>
0x0000000000001a15 <+69>: mov %eax,%ebx
0x0000000000001a17 <+71>: mov $0x66666667,%eax
0x0000000000001a1c <+76>: imul %ebx
0x0000000000001a1e <+78>: mov %ebx,%eax
0x0000000000001a20 <+80>: sar $0x1f,%eax
0x0000000000001a23 <+83>: sar $0x3,%edx
0x0000000000001a26 <+86>: sub %eax,%edx
0x0000000000001a28 <+88>: lea (%rdx,%rdx,4),%eax
0x0000000000001a2b <+91>: shl $0x2,%eax
0x0000000000001a2e <+94>: sub %eax,%ebx
0x0000000000001a30 <+96>: movslq %ebx,%rbx
0x0000000000001a33 <+99>: test %rbx,%rbx
0x0000000000001a36 <+102>: je 0x1a78 <runTest+168>
0x0000000000001a38 <+104>: lea 0x2681(%rip),%rax # 0x40c0 <string.3410>
0x0000000000001a3f <+111>: mov $0x1b4e81b5,%ebp
0x0000000000001a44 <+116>: lea (%rax,%r14,1),%r15
0x0000000000001a48 <+120>: lea (%rbx,%r15,%r12
0x0000000000001a4c <+124>: nopl 0x0(%rax)
0x0000000000001a50 <+128>: callq 0x10e0 <rand@plt>
0x0000000000001a55 <+133>: add $0x1,%r15
0x0000000000001a59 <+137>: mov %eax,%ecx
0x0000000000001a5b <+139>: imul %ebp
0x0000000000001a5d <+141>: mov %ecx,%eax
0x0000000000001a5f <+143>: sar $0x1f,%eax
0x0000000000001a62 <+146>: sar $0x3,%edx
0x0000000000001a65 <+149>: sub %eax,%edx
0x0000000000001a67 <+151>: imul $0x4b,%edx,%edx
0x0000000000001a6a <+154>: sub %edx,%ecx
0x0000000000001a6c <+156>: add $0x2f,%ecx
0x0000000000001a6f <+159>: mov %cl,-0x1(%r15)
0x0000000000001a73 <+163>: cmp %r12,%r15
0x0000000000001a76 <+166>: jne 0x1a50 <runTest+128>
0x0000000000001a78 <+168>: lea 0x0(%r13,%r13,%rax
0x0000000000001a7d <+173>: lea 0x263c(%rip),%rdi # 0x40c0 <string.3410>
0x0000000000001a84 <+180>: add $0x1,%r13
0x0000000000001a88 <+184>: add $0x14,%r14
0x0000000000001a8c <+188>: lea (%rdi,%rax
0x0000000000001a90 <+192>: movb $0x0,(%rax,%rbx,1)
0x0000000000001a94 <+196>: cmp $0x186a0,%r13
0x0000000000001a9b <+203>: jne 0x1a10 <runTest+64>
0x0000000000001aa1 <+209>: mov $0x1e8480,%edi
0x0000000000001aa6 <+214>: callq 0x10b0 <malloc@plt>
0x0000000000001aab <+219>: mov %rax,%r12
0x0000000000001aae <+222>: mov %rax,0x1eaa8b(%rip) # 0x1ec540 <copy.3400>
0x0000000000001ab5 <+229>: test %rax,%rax
0x0000000000001ab8 <+232>: je 0x1cdd <runTest+781>
0x0000000000001abe <+238>: lea 0x25fb(%rip),%rsi # 0x40c0 <string.3410>
0x0000000000001ac5 <+245>: mov %rax,%rdi
0x0000000000001ac8 <+248>: mov $0x1e8480,%edx
0x0000000000001acd <+253>: callq 0x10a0 <memcpy@plt>
0x0000000000001ad2 <+258>: lea 0x10(%rsp),%rsi
0x0000000000001ad7 <+263>: mov $0x7,%edi
0x0000000000001adc <+268>: callq 0x1050 <clock_gettime@plt>
0x0000000000001ae1 <+273>: test %eax,%eax
0x0000000000001ae3 <+275>: js 0x1cd1 <runTest+769>
0x0000000000001ae9 <+281>: mov 0x18(%rsp),%rbp
0x0000000000001aee <+286>: mov %r12,%rsi
0x0000000000001af1 <+289>: imul $0x3b9aca00,0x10(%rsp),%rbx
0x0000000000001afa <+298>: lea 0x1e8480(%r12),%rdi
0x0000000000001b02 <+306>: nopw 0x0(%rax,1)
0x0000000000001b08 <+312>: movzbl (%rsi),%eax
0x0000000000001b0b <+315>: mov %rsi,%rdx
0x0000000000001b0e <+318>: test %al,%al
0x0000000000001b10 <+320>: je 0x1b35 <runTest+357>
0x0000000000001b12 <+322>: nopw 0x0(%rax,1)
0x0000000000001b18 <+328>: lea -0x41(%rax),%ecx
0x0000000000001b1b <+331>: cmp $0x19,%cl
0x0000000000001b1e <+334>: ja 0x1c88 <runTest+696>
0x0000000000001b24 <+340>: or $0x20,%eax
0x0000000000001b27 <+343>: add $0x1,%rdx
0x0000000000001b2b <+347>: mov %al,-0x1(%rdx)
0x0000000000001b2e <+350>: movzbl (%rdx),%eax
0x0000000000001b31 <+353>: test %al,%al
0x0000000000001b33 <+355>: jne 0x1b18 <runTest+328>
0x0000000000001b35 <+357>: add $0x14,%rsi
0x0000000000001b39 <+361>: cmp %rdi,%rsi
0x0000000000001b3c <+364>: jne 0x1b08 <runTest+312>
0x0000000000001b3e <+366>: lea 0x20(%rsp),%rsi
0x0000000000001b43 <+371>: mov $0x7,%edi
0x0000000000001b48 <+376>: callq 0x1050 <clock_gettime@plt>
0x0000000000001b4d <+381>: test %eax,%eax
0x0000000000001b4f <+383>: js 0x1cd1 <runTest+769>
0x0000000000001b55 <+389>: lea 0x4e6(%rip),%rdx # 0x2042
0x0000000000001b5c <+396>: lea 0x4d0(%rip),%rdi # 0x2033
0x0000000000001b63 <+403>: xor %eax,%eax
0x0000000000001b65 <+405>: imul $0x3b9aca00,0x20(%rsp),%rsi
0x0000000000001b6e <+414>: sub %rbp,%rsi
0x0000000000001b71 <+417>: add 0x28(%rsp),%rsi
0x0000000000001b76 <+422>: sub %rbx,%rsi
0x0000000000001b79 <+425>: callq 0x1070 <printf@plt>
0x0000000000001b7e <+430>: mov 0x1ea9bb(%rip),%rdi # 0x1ec540 <copy.3400>
0x0000000000001b85 <+437>: callq 0x1030 <free@plt>
0x0000000000001b8a <+442>: mov $0x1e8480,%edi
0x0000000000001b8f <+447>: callq 0x10b0 <malloc@plt>
0x0000000000001b94 <+452>: mov %rax,%r12
0x0000000000001b97 <+455>: mov %rax,0x1ea9a2(%rip) # 0x1ec540 <copy.3400>
0x0000000000001b9e <+462>: test %rax,%rax
0x0000000000001ba1 <+465>: je 0x1cdd <runTest+781>
0x0000000000001ba7 <+471>: lea 0x2512(%rip),%rsi # 0x40c0 <string.3410>
0x0000000000001bae <+478>: mov %rax,%rdi
0x0000000000001bb1 <+481>: mov $0x1e8480,%edx
0x0000000000001bb6 <+486>: callq 0x10a0 <memcpy@plt>
0x0000000000001bbb <+491>: lea 0x30(%rsp),%rsi
0x0000000000001bc0 <+496>: mov $0x7,%edi
0x0000000000001bc5 <+501>: callq 0x1050 <clock_gettime@plt>
0x0000000000001bca <+506>: test %eax,%eax
0x0000000000001bcc <+508>: js 0x1cd1 <runTest+769>
0x0000000000001bd2 <+514>: mov 0x38(%rsp),%rbp
0x0000000000001bd7 <+519>: mov %r12,%rsi
0x0000000000001bda <+522>: imul $0x3b9aca00,0x30(%rsp),%rbx
0x0000000000001be3 <+531>: lea 0x1e8480(%r12),%rdi
0x0000000000001beb <+539>: nopl 0x0(%rax,1)
0x0000000000001bf0 <+544>: movzbl (%rsi),%eax
0x0000000000001bf3 <+547>: mov %rsi,%rdx
0x0000000000001bf6 <+550>: test %al,%al
0x0000000000001bf8 <+552>: je 0x1c1d <runTest+589>
0x0000000000001bfa <+554>: nopw 0x0(%rax,1)
0x0000000000001c00 <+560>: lea -0x41(%rax),%ecx
0x0000000000001c03 <+563>: cmp $0x19,%cl
0x0000000000001c06 <+566>: ja 0x1cb0 <runTest+736>
0x0000000000001c0c <+572>: or $0x20,%eax
0x0000000000001c0f <+575>: add $0x1,%rdx
0x0000000000001c13 <+579>: mov %al,-0x1(%rdx)
0x0000000000001c16 <+582>: movzbl (%rdx),%eax
0x0000000000001c19 <+585>: test %al,%al
0x0000000000001c1b <+587>: jne 0x1c00 <runTest+560>
0x0000000000001c1d <+589>: add $0x14,%rsi
0x0000000000001c21 <+593>: cmp %rsi,%rdi
0x0000000000001c24 <+596>: jne 0x1bf0 <runTest+544>
0x0000000000001c26 <+598>: lea 0x40(%rsp),%rsi
0x0000000000001c2b <+603>: mov $0x7,%edi
0x0000000000001c30 <+608>: callq 0x1050 <clock_gettime@plt>
0x0000000000001c35 <+613>: test %eax,%eax
0x0000000000001c37 <+615>: js 0x1cd1 <runTest+769>
0x0000000000001c3d <+621>: lea 0x405(%rip),%rdx # 0x2049
0x0000000000001c44 <+628>: lea 0x3e8(%rip),%rdi # 0x2033
0x0000000000001c4b <+635>: xor %eax,%eax
0x0000000000001c4d <+637>: imul $0x3b9aca00,0x40(%rsp),%rsi
0x0000000000001c56 <+646>: sub %rbp,%rsi
0x0000000000001c59 <+649>: add 0x48(%rsp),%rsi
0x0000000000001c5e <+654>: sub %rbx,%rsi
0x0000000000001c61 <+657>: callq 0x1070 <printf@plt>
0x0000000000001c66 <+662>: mov 0x1ea8d3(%rip),%rdi # 0x1ec540 <copy.3400>
0x0000000000001c6d <+669>: callq 0x1030 <free@plt>
0x0000000000001c72 <+674>: add $0x58,%rsp
0x0000000000001c76 <+678>: pop %rbx
0x0000000000001c77 <+679>: pop %rbp
0x0000000000001c78 <+680>: pop %r12
0x0000000000001c7a <+682>: pop %r13
0x0000000000001c7c <+684>: pop %r14
0x0000000000001c7e <+686>: pop %r15
0x0000000000001c80 <+688>: retq
0x0000000000001c81 <+689>: nopl 0x0(%rax)
0x0000000000001c88 <+696>: add $0x1,%rdx
0x0000000000001c8c <+700>: movzbl (%rdx),%eax
0x0000000000001c8f <+703>: test %al,%al
0x0000000000001c91 <+705>: jne 0x1b18 <runTest+328>
0x0000000000001c97 <+711>: add $0x14,%rsi
0x0000000000001c9b <+715>: cmp %rdi,%rsi
0x0000000000001c9e <+718>: jne 0x1b08 <runTest+312>
0x0000000000001ca4 <+724>: jmpq 0x1b3e <runTest+366>
0x0000000000001ca9 <+729>: nopl 0x0(%rax)
0x0000000000001cb0 <+736>: add $0x1,%rdx
0x0000000000001cb4 <+740>: movzbl (%rdx),%eax
0x0000000000001cb7 <+743>: test %al,%al
0x0000000000001cb9 <+745>: jne 0x1c00 <runTest+560>
0x0000000000001cbf <+751>: add $0x14,%rsi
0x0000000000001cc3 <+755>: cmp %rsi,%rdi
0x0000000000001cc6 <+758>: jne 0x1bf0 <runTest+544>
0x0000000000001ccc <+764>: jmpq 0x1c26 <runTest+598>
0x0000000000001cd1 <+769>: lea 0x33a(%rip),%rdi # 0x2012
0x0000000000001cd8 <+776>: callq 0x1850 <die>
0x0000000000001cdd <+781>: lea 0x33c(%rip),%rdi # 0x2020
0x0000000000001ce4 <+788>: callq 0x1850 <die>
End of assembler dump.
Edit5 据我所知,内联部分也相同。
解决方法
您给了我们一个关键的观察
如果我删除了未使用的功能lower1()-lower8(),区别就变成了 离开。
这表明它与不同的lowerX()
实现无关,但与内存中的位置无关,可能是通过影响缓存的命中率/未命中率。而且,不仅lowerX()
函数在内存中的位置,而且与其调用函数的位置的结合更多地影响了缓存,从而影响了结果。
这意味着效果取决于最终可执行文件的总体内存布局。
- 内存布局反过来取决于编译器工具链及其设置,在源文件中找到函数的顺序以及可能还有很多其他影响。
- 即使对于给定的内存布局,对于不同的CPU /缓存版本,其性能效果也会有很大的不同。因此,在另一台计算机上,您可能只会得到相反的结果。
无论如何,只要性能差异如此之小(2%或3%),我就很难想象有什么应用很重要,尤其是在像大写字母这样的功能中,它很少能在现实世界中占主导地位。