C++ operator[] 访问SIMD（例如AVX）变量中的元素

问题描述

我正在寻找一种重载operator []（在更广泛的SIMD类中）的方法，以方便读取和写入SIMD字（例如__m512i）中的各个元素。几个限制：

符合C ++ 11（或更高版本）
与其他基于内在函数的代码兼容
不是OpenCL / SYCL（我可以，但是我不能*叹气*）
大多数可移植到g ++，icpc，clang ++
最好也适用于Intel以外的其他SIMD（ARM，IBM等）
（编辑）性能并不是真正的问题（在性能至关重要的地方通常不使用）

（这排除了通过指针转换进行类型双关（type punning）以及GCC向量类型之类的做法。）

在很大程度上基于Scott Meyers的《More Effective C++》（第30项）和其他代码，我提出了以下看起来“正确”的MVC（最小可行）代码，该代码似乎可行，但也过于复杂。（“代理”方法用于处理operator[]分别作为左值和右值的两种用法，“memcpy”用于处理类型双关（type punning）/ C++标准的问题。）

我想知道是否有人有更好的解决方案（并且可以解释它，所以我学到了一些东西； ^））

#include <iostream>
#include <cstring>
#include "immintrin.h"

using T = __m256i;           // SIMD register type wrapped by class SIMD
using Te = unsigned int;     // type of one element (lane) stored inside T

/**
 * Wrapper around a single SIMD register (T) that exposes its lanes (Te)
 * through operator[].
 *
 * operator[] returns a proxy object so that `x[i]` can appear on either side
 * of an assignment.  Lane access copies the register into a small array with
 * std::memcpy, which avoids type punning through pointer casts.  Most
 * accessors print a trace line to std::cout so the call sequence is visible.
 *
 * Precondition for every `index` parameter: 0 <= index < lane count.
 * No range checking is performed.
 */
class SIMD {

    class SIMDProxy;

    static_assert(sizeof(T) % sizeof(Te) == 0,
                  "T must hold a whole number of Te elements");

    // Number of Te lanes in one T; derived so the scratch buffers always match T.
    static constexpr int kLanes = static_cast<int>(sizeof(T) / sizeof(Te));

  public :

    // Read-only element access.  The proxy is returned const, so it converts
    // to Te but a direct assignment through it does not compile.
    const SIMDProxy operator[](int index) const {
      std::cout << "SIMD::operator[] const" << std::endl;
      // The proxy stores a non-const reference, hence the cast.
      return SIMDProxy(const_cast<SIMD&>(*this),index);
    }

    // Read/write element access through a proxy.
    SIMDProxy operator[](int index){
      std::cout << "SIMD::operator[]" << std::endl;
      return SIMDProxy(*this,index);
    }

    // Returns the value of lane `index`.
    Te get(int index) const {
      std::cout << "SIMD::get" << std::endl;
      alignas(T) Te tmp[kLanes];
      unpack(tmp);
      return tmp[index];
    }

    // Replaces lane `index` with x; all other lanes keep their value.
    void set(int index,Te x) {
      std::cout << "SIMD::set" << std::endl;
      alignas(T) Te tmp[kLanes];
      unpack(tmp);
      tmp[index] = x;
      pack(tmp);
    }

    // Sets every lane to x.
    void splat(Te x) {
      alignas(T) Te tmp[kLanes];
      for (int i=0; i<kLanes; i++) tmp[i] = x;
      pack(tmp);
    }

    // Prints all lanes, each followed by a space, then a newline.
    void print() const {
      alignas(T) Te tmp[kLanes];
      unpack(tmp);
      for (int i=0; i<kLanes; i++) std::cout << tmp[i] << " ";
      std::cout << std::endl;
    }

  protected :

  private :

    T value{};   // zero-initialised so a fresh object can be read safely

    // Copies the register into `lanes`.  For T = __m256i this is equivalent to
    // _mm256_store_si256(reinterpret_cast<__m256i *>(lanes),value).
    void unpack(Te (&lanes)[kLanes]) const {
      std::memcpy(lanes,&value,sizeof(T));
    }

    // Copies `lanes` back into the register.  For T = __m256i this is equivalent to
    // value = _mm256_load_si256(reinterpret_cast<__m256i const *>(lanes)).
    void pack(const Te (&lanes)[kLanes]) {
      std::memcpy(&value,lanes,sizeof(T));
    }

    // Stand-in for "lane `index` of SIMD object `c`"; distinguishes reads
    // (conversion to Te) from writes (assignment).
    class SIMDProxy {
      public :
        SIMDProxy(SIMD & c_,int index_) : c(c_),index(index_) {}
        // lvalue access: copy the lane rhs refers to into the lane this refers to
        SIMDProxy& operator=(const SIMDProxy& rhs) {
          std::cout << "SIMDProxy::=SIMDProxy" << std::endl;
          // The destination is this proxy's own index, not rhs.index.
          c.set(index,rhs.c.get(rhs.index));
          return *this;
        }
        // lvalue access: store a plain element value
        SIMDProxy& operator=(Te x) {
          std::cout << "SIMDProxy::=T" << std::endl;
          c.set(index,x);
          return *this;
        }
        // rvalue access
        operator Te() const {
          std::cout << "SIMDProxy::()" << std::endl;
          return c.get(index);
        }
      private:
        SIMD& c;       // SIMD this proxy refers to
        int index;      // index of element we want
    };
    friend class SIMDProxy;   // give SIMDProxy access into SIMD


};

/** a little main to exercise things **/
int
main(int argc,char *argv[])
{

  SIMD x,y;
  Te a = 3;

  x.splat(1);
  x.print();

  y.splat(2);
  y.print();

  x[0] = a;
  x.print();

  y[1] = a;
  y.print();

  x[1] = y[1]; 
  x.print();
}

解决方法

您的代码效率很低。通常，这些SIMD类型并不存在于内存中的任何位置，它们是硬件寄存器，没有地址，您无法将其传递给memcpy()。编译器会尽力假装它们是普通变量，这就是为什么您的代码可以编译并可以正常工作的原因，但是它很慢，因为您一直在寄存器和内存之间来回搬运数据。

假设AVX2和整数通道，这就是我要怎么做。

/**
 * Wrapper around one __m256i treated as eight 32-bit integer lanes (AVX2),
 * with lane access by run-time index (operator[], setLane( lane, value ))
 * and by compile-time index (getLane<lane>(), setLane<lane>( value )).
 *
 * Run-time lane indices must be < 8; this is checked with assert only.
 */
class SimdVector
{
    __m256i val;

    // Source of the blend masks for setLane( lane, value ), which loads eight
    // consecutive ints starting at index 7 - lane.  For that load to select
    // exactly lane `lane`, element 7 must be -1 and all other elements 0.
    // Defined outside the class.
    alignas( 64 ) static const std::array<int,8 + 7> s_blendMaskSource;

public:

    // Returns the lane at a run-time index without going through memory.
    int operator[]( size_t lane ) const
    {
        assert( lane < 8 );
        // Move lane index into lowest lane of vector register
        const __m128i shuff = _mm_cvtsi32_si128( static_cast<int>( lane ) );
        // Permute the vector so the lane we need is moved to the lowest lane
        // _mm256_castsi128_si256 says "the upper 128 bits of the result are undefined",
        // and we don't care indeed.
        const __m256i tmp = _mm256_permutevar8x32_epi32( val,_mm256_castsi128_si256( shuff ) );
        // Return the lowest lane of the result
        return _mm_cvtsi128_si32( _mm256_castsi256_si128( tmp ) );
    }

    // Replaces the lane at a run-time index with `value`.
    void setLane( size_t lane,int value )
    {
        assert( lane < 8 );
        // Load the blending mask
        const int* const maskLoadPointer = s_blendMaskSource.data() + 7 - lane;
        const __m256i mask = _mm256_loadu_si256( reinterpret_cast<const __m256i*>( maskLoadPointer ) );
        // Broadcast the source value into all lanes.
        // The compiler will do equivalent of _mm_cvtsi32_si128 + _mm256_broadcastd_epi32
        const __m256i broadcasted = _mm256_set1_epi32( value );
        // Use vector blending instruction to set the desired lane
        val = _mm256_blendv_epi8( val,broadcasted,mask );
    }

    // Returns the lane at a compile-time index.
    template<size_t lane>
    int getLane() const
    {
        // The message keeps this valid before C++17.
        static_assert( lane < 8,"lane index must be less than 8" );
        // That thing is not an instruction;
        // compilers emit different ones based on the index
        return _mm256_extract_epi32( val,static_cast<int>( lane ) );
    }

    // Replaces the lane at a compile-time index with `value`.
    template<size_t lane>
    void setLane( int value )
    {
        // The message keeps this valid before C++17.
        static_assert( lane < 8,"lane index must be less than 8" );
        val = _mm256_insert_epi32( val,value,static_cast<int>( lane ) );
    }
};

// Align by 64 bytes to guarantee it's contained within a cache line
// (15 ints = 60 bytes).
// Layout: seven zeros, a single -1 (all bits set) at index 7, seven zeros.
// setLane( lane, value ) loads eight ints starting at index 7 - lane, so the
// -1 lands in position `lane` of the loaded mask and every other position is 0.
alignas( 64 ) const std::array<int,8 + 7> SimdVector::s_blendMaskSource
{
    0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0
};

对于ARM，情况有所不同。如果在编译时知道通道索引，请参见vgetq_lane_s32和vsetq_lane_s32内部函数。

要在ARM上设置通道，可以使用相同的广播+混合技巧。广播为vdupq_n_s32。向量混合的近似值是vbslq_s32，它可以独立处理每一位，但是在这种情况下，它同样适用，因为-1设置了全部32位。

要提取数据，请写一个switch，或将完整的向量存储到内存中，不确定这两个中的哪一个效率更高。

在原始方法（memcpy、intrinsic加载/存储）和其他建议（基于用户自定义联合体的类型双关、用户自定义的向量类型）之中，intrinsic方法似乎略有优势。这是基于我在Godbolt（https://godbolt.org/z/5zdbKe）中编写的一些快速示例得出的。

写入元素的“最佳”看起来像这样。

// Returns a copy of x in which the 32-bit element at position `index` is
// replaced by `a`.  The vector is spilled to an aligned scratch array with an
// aligned store, patched there, and reloaded with an aligned load.
// `index` is used unchecked as a subscript into the 8-element scratch array.
__m256i foo2(__m256i x,unsigned int a,int index)
{
    alignas(__m256i) unsigned int lanes[8];
    __m256i* const spill = reinterpret_cast<__m256i *>(lanes);

    _mm256_store_si256(spill,x);
    lanes[index] = a;
    return _mm256_load_si256(spill);
}

如果仅关心g ++ / clang ++ / icc兼容性，则可以使用__attribute__，这些编译器在内部使用它们来定义其内部指令：

// Vector of sixteen int32_t elements (16*sizeof(int32_t) bytes), declared
// with the vector_size attribute and aligned to its own full size.
typedef int32_t int32x16_t __attribute__((vector_size(16*sizeof(int32_t)))) __attribute__((aligned(16*sizeof(int32_t))));

有意义时（在给定的体系结构上可能），变量将存储在向量寄存器中。另外，编译器为此typedef提供可读写的operator[]（如果在编译时知道索引，则应该对其进行优化）。

avx c++intrinsics simd