我最近尝试优化一个 RGB 到 YUV420 的转换器。使用查找表带来了速度提升,改用定点运算也是如此。不过我原本期待使用 SSE 指令能获得真正的收益。我的第一次尝试反而让代码变慢了,而在把所有运算串联起来之后,速度也只是与原始代码大致相当。是我的实现有问题,还是 SSE 指令本来就不适合这项任务?
原始代码的一部分如下:
#define RRGB24YUVCI2_00 0.299 #define RRGB24YUVCI2_01 0.587 #define RRGB24YUVCI2_02 0.114 #define RRGB24YUVCI2_10 -0.147 #define RRGB24YUVCI2_11 -0.289 #define RRGB24YUVCI2_12 0.436 #define RRGB24YUVCI2_20 0.615 #define RRGB24YUVCI2_21 -0.515 #define RRGB24YUVCI2_22 -0.100 void RealRGB24toYUV420Converter::Convert(void* pRgb,void* pY,void* pU,void* pV) { yuvType* py = (yuvType *)pY; yuvType* pu = (yuvType *)pU; yuvType* pv = (yuvType *)pV; unsigned char* src = (unsigned char *)pRgb; /// Y have range 0..255,U & V have range -128..127. double u,v; double r,g,b; /// Step in 2x2 pel blocks. (4 pels per block). int xBlks = _width >> 1; int yBlks = _height >> 1; for(int yb = 0; yb < yBlks; yb++) for(int xb = 0; xb < xBlks; xb++) { int chrOff = yb*xBlks + xb; int lumOff = (yb*_width + xb) << 1; unsigned char* t = src + lumOff*3; /// Top left pel. b = (double)(*t++); g = (double)(*t++); r = (double)(*t++); py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b)); u = RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b; v = RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b; /// Top right pel. b = (double)(*t++); g = (double)(*t++); r = (double)(*t++); py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b)); u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b; v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b; lumOff += _width; t = t + _width*3 - 6; /// Bottom left pel. b = (double)(*t++); g = (double)(*t++); r = (double)(*t++); py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b)); u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b; v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b; /// Bottom right pel. 
b = (double)(*t++); g = (double)(*t++); r = (double)(*t++); py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b)); u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b; v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b; /// Average the 4 chr values. int iu = (int)u; int iv = (int)v; if(iu < 0) ///< Rounding. iu -= 2; else iu += 2; if(iv < 0) ///< Rounding. iv -= 2; else iv += 2; pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu/4) ); pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv/4) ); }//end for xb & yb... }//end Convert.
这是使用SSE的版本
const float fRRGB24YUVCI2_00 = 0.299; const float fRRGB24YUVCI2_01 = 0.587; const float fRRGB24YUVCI2_02 = 0.114; const float fRRGB24YUVCI2_10 = -0.147; const float fRRGB24YUVCI2_11 = -0.289; const float fRRGB24YUVCI2_12 = 0.436; const float fRRGB24YUVCI2_20 = 0.615; const float fRRGB24YUVCI2_21 = -0.515; const float fRRGB24YUVCI2_22 = -0.100; void RealRGB24toYUV420Converter::Convert(void* pRgb,void* pV) { __m128 xmm_y = _mm_loadu_ps(fCOEFF_0); __m128 xmm_u = _mm_loadu_ps(fCOEFF_1); __m128 xmm_v = _mm_loadu_ps(fCOEFF_2); yuvType* py = (yuvType *)pY; yuvType* pu = (yuvType *)pU; yuvType* pv = (yuvType *)pV; unsigned char* src = (unsigned char *)pRgb; /// Y have range 0..255,U & V have range -128..127. float bgr1[4]; bgr1[3] = 0.0; float bgr2[4]; bgr2[3] = 0.0; float bgr3[4]; bgr3[3] = 0.0; float bgr4[4]; bgr4[3] = 0.0; /// Step in 2x2 pel blocks. (4 pels per block). int xBlks = _width >> 1; int yBlks = _height >> 1; for(int yb = 0; yb < yBlks; yb++) for(int xb = 0; xb < xBlks; xb++) { int chrOff = yb*xBlks + xb; int lumOff = (yb*_width + xb) << 1; unsigned char* t = src + lumOff*3; bgr1[2] = (float)*t++; bgr1[1] = (float)*t++; bgr1[0] = (float)*t++; bgr2[2] = (float)*t++; bgr2[1] = (float)*t++; bgr2[0] = (float)*t++; t = t + _width*3 - 6; bgr3[2] = (float)*t++; bgr3[1] = (float)*t++; bgr3[0] = (float)*t++; bgr4[2] = (float)*t++; bgr4[1] = (float)*t++; bgr4[0] = (float)*t++; __m128 xmm1 = _mm_loadu_ps(bgr1); __m128 xmm2 = _mm_loadu_ps(bgr2); __m128 xmm3 = _mm_loadu_ps(bgr3); __m128 xmm4 = _mm_loadu_ps(bgr4); // Y __m128 xmm_res_y = _mm_mul_ps(xmm1,xmm_y); py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] )); // Y xmm_res_y = _mm_mul_ps(xmm2,xmm_y); py[lumOff + 1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] )); lumOff += _width; // Y xmm_res_y = _mm_mul_ps(xmm3,xmm_y); py[lumOff] = 
(yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] )); // Y xmm_res_y = _mm_mul_ps(xmm4,xmm_y); py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((xmm_res_y.m128_f32[0] + xmm_res_y.m128_f32[1] + xmm_res_y.m128_f32[2] )); // U __m128 xmm_res = _mm_add_ps( _mm_add_ps(_mm_mul_ps(xmm1,xmm_u),_mm_mul_ps(xmm2,xmm_u)),_mm_add_ps(_mm_mul_ps(xmm3,_mm_mul_ps(xmm4,xmm_u)) ); float fU = xmm_res.m128_f32[0] + xmm_res.m128_f32[1] + xmm_res.m128_f32[2]; // V xmm_res = _mm_add_ps( _mm_add_ps(_mm_mul_ps(xmm1,xmm_v),xmm_v)),xmm_v)) ); float fV = xmm_res.m128_f32[0] + xmm_res.m128_f32[1] + xmm_res.m128_f32[2]; /// Average the 4 chr values. int iu = (int)fU; int iv = (int)fV; if(iu < 0) ///< Rounding. iu -= 2; else iu += 2; if(iv < 0) ///< Rounding. iv -= 2; else iv += 2; pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu >> 2) ); pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv >> 2) ); }//end for xb & yb... }
这是我最初几次使用 SSE2 的尝试之一,所以也许我遗漏了什么?仅供参考:我在 Windows 平台上使用 Visual Studio 2008 进行开发。
解决方法
一些问题:
- 你使用了未对齐的加载(unaligned load)——这类加载的开销相当大(Nehalem,即 Core i5 / Core i7 除外),至少是对齐加载的 2 倍。如果加载之后有足够多的计算,这一开销可以被摊销,但在本例中计算量相对较少。对于来自 bgr1、bgr2 等数组的加载,可以让这些数组按 16 字节对齐并改用对齐加载来解决这个问题。[更好的做法是根本不使用这些中间数组——直接把数据从内存加载到 SSE 寄存器,并用 SIMD 指令完成所有的重排(shuffle)等操作——见下文]
- 你在标量代码和 SIMD 代码之间来回切换——就性能而言,标量代码很可能占主导地位,因此 SIMD 带来的任何收益都会被它淹没——你确实需要在内层循环中全部使用 SIMD 指令来完成所有工作(即去掉标量代码)