问题描述
我正在尝试转换此标量代码:
struct CICDecimator {
static constexpr int64_t scale = ((int64_t)1) << 32;
int mStages;
int mFactor;
float mGainCorrection;
int64_t *pIntegrators;
int64_t *pCombs;
CICDecimator(int stages = 4,int factor = 8) {
assert(stages > 0);
assert(factor > 0);
mStages = stages;
pIntegrators = new int64_t[mStages + 1]{};
pCombs = new int64_t[mStages]{};
if (mFactor != factor) {
mFactor = factor;
mGainCorrection = 1.0f / (float)(pow(mFactor,mStages));
}
}
~CICDecimator() {
delete[] pIntegrators;
delete[] pCombs;
}
float process(const float *buffer) {
for (int i = 0; i < mFactor; i++) {
pIntegrators[0] = buffer[i] * scale;
for (int j = 1; j <= mStages; j++) {
pIntegrators[j] += pIntegrators[j - 1];
}
}
int64_t s = pIntegrators[mStages];
for (int i = 0; i < mStages; i++) {
int64_t t = s;
s -= pCombs[i];
pCombs[i] = t;
}
return mGainCorrection * (s / (float)scale);
}
};
到 SSE/SIMD
版本(使用 -march=nocona
构建),尝试在 process()
函数(在音频应用程序中称为重载)上获得性能。
主要问题是缓冲区的输入将是 m128
(4 x float) 值。并且似乎 CICDecimator
需要 64 位整数才能使精度正常工作(因此,只有两个插槽)。
这是我的实际代码:
struct CICDecimatorV4 {
const __m128 scale4 = _mm_set1_ps((float)(((int64_t)1) << 32));
int mStages;
int mFactor;
__m128 mGainCorrection4;
__m128d *pIntegrators4_1;
__m128d *pIntegrators4_2;
__m128d *pCombs4_1;
__m128d *pCombs4_2;
CICDecimatorV4(int stages = 4,int factor = 8) {
assert(stages > 0);
assert(factor > 0);
mStages = stages;
pIntegrators4_1 = new __m128d[mStages + 1]{};
pIntegrators4_2 = new __m128d[mStages + 1]{};
pCombs4_1 = new __m128d[mStages]{};
pCombs4_2 = new __m128d[mStages]{};
if (mFactor != factor) {
mFactor = factor;
mGainCorrection4 = _mm_div_ps(_mm_set1_ps(1.0f),_mm_set1_ps((float)pow(mFactor,mStages)));
}
}
~CICDecimatorV4() {
delete[] pIntegrators4_1;
delete[] pIntegrators4_2;
delete[] pCombs4_1;
delete[] pCombs4_2;
}
__m128 process(const __m128 *buffer) {
for (int i = 0; i < mFactor; i++) {
// float to double
__m128i bufferInt = _mm_cvttps_epi32(_mm_mul_ps(buffer[i],scale4));
pIntegrators4_1[0] = _mm_cvtepi32_pd(bufferInt);
pIntegrators4_2[0] = _mm_cvtepi32_pd(_mm_srli_si128(bufferInt,8));
for (int j = 1; j <= mStages; j++) {
pIntegrators4_1[j] += pIntegrators4_1[j - 1];
pIntegrators4_2[j] += pIntegrators4_2[j - 1];
}
}
__m128d sD1 = pIntegrators4_1[mStages];
__m128d sD2 = pIntegrators4_2[mStages];
for (int i = 0; i < mStages; i++) {
__m128d t1 = sD1;
__m128d t2 = sD2;
sD1 -= pCombs4_1[i];
sD2 -= pCombs4_2[i];
pCombs4_1[i] = t1;
pCombs4_2[i] = t2;
}
// double to float
__m128 sF1 = _mm_cvtpd_ps(sD1);
__m128 sF2 = _mm_cvtpd_ps(sD2);
__m128 sF; // <<<< TODO: i need to merge the two float on a single one
return _mm_mul_ps(mGainCorrection4,_mm_div_ps(sF,scale4));
}
};
我将 int64_t
转换为 m128d
(两个插槽),将输入 m128
拆分为两个 m128d
并复制每个计算以进行保持向量和双精度计算。
问题:
- 这个方法正确吗?
- 如何将两个
m128d
转换为单个m128
? (首先在下部,其次在上部)(即<<<< TODO part
) - 可以整体改进吗? (:P)
谢谢
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)