C++11+ 编译器是否足够智能以优化内部“if”语句？

问题描述

最好用代码来描述：

// Sketch from the question; the loop bounds are elided ("...").
// For each (i, j) it computes a[i][j] + b[i][j] when `flag` is true and
// a[i][j] - b[i][j] otherwise. `flag` is a by-value parameter that is never
// modified in the body, so the condition is the same on every iteration.
void foo(vector<vector<int>>& a,vector<vector<int>>& b,bool flag) { 

    // Declared but never used in the code shown.
    vector<vector<int>> c; 
    for (int i ...) { 
        for (int j ...) { 
            // Assigned below but never read in the code shown.
            int value; 
            // Loop-invariant branch: this is the `if` the question asks about.
            if (flag) 
                value = a[i][j] + b[i][j]; 
            else 
                value = a[i][j] - b[i][j];
        } 

    } 
}

从表面上看，flag 在内层循环的每一次迭代中都会被求值并产生分支，尽管它的值在进入任何一个循环之前就已确定。C++11+ 编译器会自动生成两条独立的代码路径、只在开始时判断一次分支吗？还是应该手动这样做？

在教训我不要过早优化之前，请理解：我提出这个问题，只是想在这些细节方面成为一个更有见识的程序员。

解决方法

这可能取决于示例的复杂性，但编译器能够进行这种优化。让我们看一个简单而完整的例子：

// External functions used by the example below. They are only declared here,
// not defined, so the compiler cannot see their bodies in this snippet.
// All four are declared noexcept.
extern bool get_bool() noexcept;
extern int get_int() noexcept;
extern void foo1() noexcept;
extern void foo0() noexcept;

// Minimal example of a loop-invariant branch inside a nested loop.
// Reads a flag and two loop bounds once, then calls foo1() or foo0()
// i_mx * j_mx times depending on the flag.
void foo() noexcept {
  // All three values are read once, before the loops, and never reassigned.
  bool b = get_bool();
  int i_mx = get_int();
  int j_mx = get_int();

  for (int i = 0; i < i_mx; ++i) {
    for (int j = 0; j < j_mx; ++j) {
      // `b` does not change inside the loops, so this test gives the same
      // result on every iteration.
      if (b)
        foo1();
      else
        foo0();
    }
  }
}

如果我们用 clang 编译它，下面是生成的代码：

# Compiler output for foo(). The test of the flag appears once, before the
# loops, and the loop nest is emitted twice: one copy calls foo1(), the other
# calls foo0().
foo():                                # @foo()
        push    rbp
        push    r15
        push    r14
        push    r12
        push    rbx
        call    get_bool()
        # r14d keeps the result of get_bool() (the flag b).
        mov     r14d,eax
        call    get_int()
        # r15d keeps the first get_int() result (i_mx).
        mov     r15d,eax
        call    get_int()
        # Skip everything when i_mx <= 0.
        test    r15d,r15d
        jle     .LBB0_9
        # r12d keeps the second get_int() result (j_mx); skip when j_mx <= 0.
        mov     r12d,eax
        test    eax,eax
        jle     .LBB0_9
        xor     ebx,ebx
        # The flag is tested exactly once here, outside both loops.
        # Zero (false) jumps to the foo0() copy; otherwise fall through to
        # the foo1() copy.
        test    r14b,r14b
        je      .LBB0_3
# Loop nest copy for b == true: ebx counts outer iterations up to r15d,
# ebp counts inner iterations down from r12d.
.LBB0_6:                                # =>This Loop Header: Depth=1
        mov     ebp,r12d
.LBB0_7:                                #   Parent Loop BB0_6 Depth=1
        call    foo1()
        dec     ebp
        jne     .LBB0_7
        inc     ebx
        cmp     ebx,r15d
        jne     .LBB0_6
        jmp     .LBB0_9
# Loop nest copy for b == false: same structure, calling foo0() instead.
.LBB0_3:                                # =>This Loop Header: Depth=1
        mov     ebp,r12d
.LBB0_4:                                #   Parent Loop BB0_3 Depth=1
        call    foo0()
        dec     ebp
        jne     .LBB0_4
        inc     ebx
        cmp     ebx,r15d
        jne     .LBB0_3
# Common exit: restore the saved registers and return.
.LBB0_9:
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        pop     rbp
        ret

很明显，test r14b,r14b 这一行被移到了循环之外。同样，实际结果可能会因代码的复杂程度而异。最好检查生成的汇编代码来确认。

即使是当今最新的 C++ 标准（2021 年），标准本身明确规定的优化也很少，例如复制省略和返回值优化。

这使得“编译器”可以应用任何特定于平台的优化。

问题中的函数没有产生任何效果，因此很可能被完全优化掉了。

不过，针对（我猜想的）这个问题背后真正想问的内容：典型的编译器能够推断出，即使在嵌套循环中，该条件也始终保持不变，例如：

// Variant of the question's function whose result is observable: it returns
// the sum over all (i, j) of a[i][j] + b[i][j] when `flag` is true, or of
// a[i][j] - b[i][j] otherwise.
// `a` and `b` are taken by value, so each call copies both nested vectors.
// NOTE(review): `b` is indexed using the bounds of `a`; this assumes `b` has
// at least the same shape as `a`. Confirm with callers.
int foo(vector<vector<int>> a,vector<vector<int>> b,bool flag) { 
    int value = 0;
    // NOTE(review): `i` and `j` are int while size() is unsigned, so these
    // comparisons mix signed and unsigned operands.
    for (int i = 0; i < a.size(); i++) { 
        for (int j = 0; j < a[i].size(); j++) { 
            // `flag` is never modified here, so the branch is loop-invariant.
            if (flag) 
                value += a[i][j] + b[i][j]; 
            else 
                value += a[i][j] - b[i][j];
        } 

    } 
    return value;
}

下面的汇编代码显示“是的，它已优化”（由 Clang Intel x86 64 位编译器生成）：

注意：foo 的第三个参数 flag 位于寄存器 dl（64 位寄存器 RDX 的低 8 位）中，它在两个循环开始之前就被测试了（testb %dl,%dl）。
根据 flag 条件，循环被复制成了两份：LBB1_2 和 LBB1_6。

这段（经过删节的）汇编代码是通过运行以下命令生成的：g++ -std=c++17 -O3 -c -S code.cpp

__Z3fooNSt3__16vectorINS0_IiNS_9allocatorIiEEEENS1_IS3_EEEES5_b: ## @_Z3fooNSt3__16vectorINS0_IiNS_9allocatorIiEEEENS1_IS3_EEEES5_b
        .cfi_startproc
## %bb.0:
        pushq   %rbp
...
        xorl    %eax,%eax    # <======== int value = 0;
        testb   %dl,%dl      # <======== if (flag)
        jne     LBB1_6
        jmp     LBB1_2
        .p2align        4,0x90
LBB1_7:                                 ##   in Loop: Header=BB1_6 Depth=1
        incq    %r10
        cmpq    %r10,%r8
        jbe     LBB1_26
LBB1_6:                                 ## =>This Loop Header: Depth=1
                                        ##     Child Loop BB1_13 Depth 2
                                        ##     Child Loop BB1_17 Depth 2
        leaq    (%r10,%r10,2),%rcx
        movq    (%r9,%rcx,8),%rdi
        movq    8(%r9,%r11
        subq    %rdi,%r11
...
LBB1_2:                                 ## =>This Loop Header: Depth=1
                                        ##     Child Loop BB1_21 Depth 2
                                        ##     Child Loop BB1_5 Depth 2
        leaq    (%r10,%r11

这段代码优化之后大致会是这样：

#include <algorithm>
#include <execution>
#include <iostream>
#include <vector>


/// Copies every element of `aq` into the same position of `b`, processing the
/// rows of `b` with the parallel execution policy, then prints b[0][1].
///
/// With the data below, the program prints "and then 8".
int main()
{
    std::vector<std::vector<int>> b{{2,3},{4,7}};
    // Only read inside the algorithm, so it is const.
    const std::vector<std::vector<int>> aq{{7,8},{2,17}};

    // Each invocation writes only to its own row of `b` and only reads `aq`.
    std::for_each(std::execution::par,b.begin(),b.end(),[&](auto& row){
                    // Vector storage is contiguous, so the distance between
                    // an element's address and data() is its index.
                    const auto row_index = &row - b.data();
                    for (auto& cell : row) {
                        const auto col_index = &cell - row.data();
                        cell = aq[row_index][col_index];
                    }
                });
    // '\n' instead of std::endl: no explicit flush is needed right before exit.
    std::cout << "and then " << b[0][1] << '\n';
}

-ltbb -std=c++20

branch c++ compilation optimization