YUV視頻格式到RGB32格式轉換的速度優化上篇

                YUV視頻格式到RGB32格式轉換的速度優化上篇
                    [email protected]    2007.10.30

tag: YUV,YCbCr,YUV到RGB顏色轉換,YUV解碼,VFW,視頻,MMX,SSE,多核優化

摘要: 我們得到的很多視頻數據(一些解碼器的輸出或者攝像頭的輸出等)都使用了一種
叫YUV的顏色格式；本文介紹了常見的YUV視頻格式(YUY2/YVYU/UYVY/I420/YV12等)到
RGB顏色格式的轉換,並嘗試對轉化的速度進行優化；
全文分爲:
    《上篇》文章首先介紹了YUV顏色格式，並介紹了YUV顏色格式和RGB顏色格式之
間的相互轉換；然後重點介紹了YUYV視頻格式到RGB32格式的轉化，並嘗試進行了一
些速度優化；
    《中篇》嘗試使用MMX/SSE指令對前面實現的解碼器核心進行速度優化；然
後簡要介紹了一個使用這類CPU特殊指令時的代碼框架，使得解碼程序能夠根據運行時
的CPU指令支持情況動態調用最佳的實現代碼；並最終提供一個多核並行的優化版本；
    《下篇》介紹YUV類型的其他種類繁多的視頻數據編碼格式；並將前面實現的解碼
器核心(在不損失代碼速度的前提下)進行必要的修改，使之適用於這些YUV視頻格式
的解碼；
(2010.11.23 color_table查詢表擴大範圍,以避免 color_table[ ( Ye + csU_blue_16 * Ue ) >> 16 ] 超界; 謝謝bug提交者少浦 .)
  (2007.11.13 修正了一下顏色轉換公式中的係數)
  (2007.11.04 增加一個更深優化的全查表的實現DECODE_YUYV_TableEx;
        對DECODE_YUYV_Common做了一點小的調整和改進)

正文:
代碼使用C++,編譯器:VC2005
涉及到彙編的時候假定爲x86平臺；
現在的高清視頻幀尺寸越來越大，所以本文測試的圖片大小將使用1024x576和
1920x1080兩種常見的幀尺寸來測試解碼器速度；
測試平臺:(CPU:AMD64x2 4200+(2.37G);   內存:DDR2 667(雙通道); 編譯器:VC2005)
測試平臺:(CPU:Intel Core2 4400(2.00G);內存:DDR2 667(雙通道); 編譯器:VC2005)

A:YUV顏色空間介紹,YUV顏色空間和RGB顏色空間的轉換公式
   YUV(或稱爲YCbCr)顏色空間中Y代表亮度,“U”和“V”表示的則是色度。
   (這裏假設YUV和RGB的顏色分量值都是無符號的8bit整數)

   RGB顏色空間到YUV顏色空間的轉換公式:

    Y= 0.256788*R + 0.504129*G + 0.097906*B + 16;
    U=-0.148223*R - 0.290993*G + 0.439216*B + 128;
    V= 0.439216*R - 0.367788*G - 0.071427*B + 128;

   YUV顏色空間到RGB顏色空間的轉換公式:
    B= 1.164383 * (Y - 16) + 2.017232*(U - 128);
    G= 1.164383 * (Y - 16) - 0.391762*(U - 128) - 0.812968*(V - 128);
    R= 1.164383 * (Y - 16) + 1.596027*(V - 128);

( 補充:
在視頻格式中基本上都用的上面的轉換公式；但在其他一些
地方可能會使用下面的轉換公式(不同的使用場合可能有不同的轉換係數):

    Y = 0.299*R + 0.587*G + 0.114*B;
    U = -0.147*R - 0.289*G + 0.436*B;
    V = 0.615*R - 0.515*G - 0.100*B;

    R = Y + 1.14*V;
    G = Y - 0.39*U - 0.58*V;
    B = Y + 2.03*U;
)

B.RGB32顏色和圖片的數據定義:

#define asm __asm

typedef unsigned char TUInt8; // [0..255]
// TUInt32 has to be exactly 32 bits wide: the two-pixel converters store one
// packed pixel through a TUInt32 pointer and address the next pixel as
// element [1]. `unsigned long` is 64 bits wide on LP64 targets (for example
// 64-bit Linux and macOS), so `unsigned int` is used there; every other
// target keeps the original `unsigned long`.
#if defined(__LP64__) || defined(_LP64)
typedef unsigned int  TUInt32;
#else
typedef unsigned long TUInt32;
#endif
// One 32-bit color value. The members are declared in the order b, g, r, a,
// so blue is the first byte in memory and alpha the last.
struct TARGB32
{
    TUInt8  b;   // blue
    TUInt8  g;   // green
    TUInt8  r;   // red
    TUInt8  a;   // alpha
};

// Describes one block of pixel data so that it can be passed as a single parameter.
struct TPicRegion
{
    TARGB32 * pdata;        // address of the first pixel
    long      byte_width;   // physical size of one row, in bytes;
                            // abs(byte_width) may be greater than or equal to width*sizeof(TARGB32)
    long      width;        // width in pixels
    long      height;       // height in pixels
};

// With the region type in place, a single-pixel accessor can be written as follows.
// Returns a reference to the pixel in column x of row y. The row is located
// by stepping pic.byte_width bytes per row from pic.pdata; no bounds check
// is performed.
__forceinline TARGB32 & Pixels( const TPicRegion & pic, const long x, const long y)
{
    TUInt8 * const row_start = (TUInt8 * )pic.pdata + pic.byte_width * y;
    return ((TARGB32 * )row_start)[x];
}

(注意:__forceinline表示總是內聯代碼，如果你的編譯器不支持，請改寫爲inline關鍵詞)

C.YUYV(也可以叫做YUY2)視頻格式到RGB32的轉化
（本文先集中優化YUYV視頻格式到RGB32的轉化,然後再擴展到其他視頻格式）

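YUYV視頻格式的內存數據佈局圖示(每4個字節描述2個相鄰的像素):

    字節:  | Y0 | U0 | Y1 | V0 | Y2 | U1 | Y3 | V1 | ...
    像素:  |<--  像素0和像素1 -->|<--  像素2和像素3 -->|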

圖中可以看出Y的數據量是U或者V的兩倍，這是因爲人的眼睛一般對亮度比對顏
色更敏感一些，所以將連續的兩個像素的U(或V)值只保存一個U(或V)值,那麼每個
像素平均佔用16bit儲存空間;
解碼YUYV視頻格式的一個簡單浮點實現:

     // 顏色飽和函數
    // Saturates a color value to the range [0, 255]: values below 0 become 0,
    // values above 255 become 255, everything else is returned unchanged.
    __forceinline long border_color( long color)
    {
        if (color < 0 )
            return 0 ;
        return (color > 255 ) ? 255 : color;
    }

    // Converts one YUV sample to a TARGB32 pixel with floating-point
    // arithmetic, using the video-range coefficients from section A.
    //   Y, U, V : the three 8-bit components of the sample
    //   returns : the saturated b/g/r values with alpha set to 255
    __forceinline TARGB32 YUVToRGB32_float( const TUInt8 Y, const TUInt8 U, const TUInt8 V)
    {
        const double Ye = 1.164383 * (Y - 16);
        const double Ue = U - 128;
        const double Ve = V - 128;

        // Add 0.5 before the conversion to long so the result is rounded to
        // nearest rather than truncated. Without it Y=235, U=V=128 evaluates
        // to 254.99988 and decodes to 254 instead of 255. Negative sums still
        // end up at 0 because border_color saturates them.
        TARGB32 result;
        result.b = (TUInt8)border_color( (long)(Ye + 2.017232 * Ue + 0.5) );
        result.g = (TUInt8)border_color( (long)(Ye - 0.391762 * Ue - 0.812968 * Ve + 0.5) );
        result.r = (TUInt8)border_color( (long)(Ye + 1.596027 * Ve + 0.5) );
        result.a = 255 ;
        return result;
    }

void DECODE_YUYV_Float( const TUInt8 * pYUYV, const TPicRegion & DstPic)
{
    assert((DstPic.width & 1 ) == 0 );

    TARGB32 * pDstLine = DstPic.pdata;
     for ( long y = 0 ;y < DstPic.height; ++ y)
    {
         for ( long x = 0 ;x < DstPic.width;x += 2 )
        {
            pDstLine[x + 0 ] = YUVToRGB32_float(pYUYV[ 0 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pDstLine[x + 1 ] = YUVToRGB32_float(pYUYV[ 2 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pYUYV += 4 ;
        }
        ((TUInt8 *& )pDstLine) += DstPic.byte_width;
    }
}

D.使用整數運算(定點數運算)來代替浮點運算
默認的浮點數到整數的轉換是比較慢的運算；這裏用整數運算來代替浮點運算；
使用16位定點數，原理是將浮點係數擴大2^16倍，並保存爲整數(引入很小的誤差)，那麼計算出來的值
再除以2^16就得到正確的結果了,而除以2^16可以優化爲帶符號的右移; 代碼如下:

         // YUV -> RGB coefficients as 16.16 fixed-point integers: each one is the
         // floating-point coefficient multiplied by 2^16 (65536), with the
         // fractional part dropped by the conversion to int.
         const int csY_coeff_16 = (int)(  1.164383 * 65536.0 );   // luma gain
         const int csU_blue_16  = (int)(  2.017232 * 65536.0 );   // U contribution to blue
         const int csU_green_16 = (int)( -0.391762 * 65536.0 );   // U contribution to green
         const int csV_green_16 = (int)( -0.812968 * 65536.0 );   // V contribution to green
         const int csV_red_16   = (int)(  1.596027 * 65536.0 );   // V contribution to red

    // Converts one YUV sample to a TARGB32 pixel with 16.16 fixed-point
    // integer arithmetic.
    //   Y, U, V : the three 8-bit components of the sample
    //   returns : the saturated b/g/r values with alpha set to 255
    __forceinline TARGB32 YUVToRGB32_Int( const TUInt8 Y, const TUInt8 U, const TUInt8 V)
    {
        // The luma product appears once in every channel sum, so folding the
        // rounding term (half of 2^16) into it rounds all three channels to
        // nearest when they are shifted right by 16. Without it the shift
        // floors: Y=235, U=V=128 gives (76309 * 219) >> 16 == 254, not 255.
        const int Ye = csY_coeff_16 * (Y - 16 ) + (1 << 15 );
        const int Ue = U - 128 ;
        const int Ve = V - 128 ;

        TARGB32 result;
        result.b = (TUInt8)border_color( ( Ye + csU_blue_16 * Ue ) >> 16 );
        result.g = (TUInt8)border_color( ( Ye + csU_green_16 * Ue + csV_green_16 * Ve ) >> 16 );
        result.r = (TUInt8)border_color( ( Ye + csV_red_16 * Ve ) >> 16 );
        result.a = 255 ;
        return result;
    }

void DECODE_YUYV_Int( const TUInt8 * pYUYV, const TPicRegion & DstPic)
{
    assert((DstPic.width & 1 ) == 0 );

    TARGB32 * pDstLine = DstPic.pdata;
     for ( long y = 0 ;y < DstPic.height; ++ y)
    {
         for ( long x = 0 ;x < DstPic.width;x += 2 )
        {
            pDstLine[x + 0 ] = YUVToRGB32_Int(pYUYV[ 0 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pDstLine[x + 1 ] = YUVToRGB32_Int(pYUYV[ 2 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pYUYV += 4 ;
        }
        ((TUInt8 *& )pDstLine) += DstPic.byte_width;
    }
}

速度測試:
////////////////////////////////////////////////////////////////////////////////
//==============================================================================
//                       |        1024x576       |       1920x1080       |
//------------------------------------------------------------------------------
//                       | AMD64x2 |   Core2   | AMD64x2 |   Core2   |
//------------------------------------------------------------------------------
//DECODE_YUYV_Int          137.1 FPS 131.9 FPS     39.0 FPS   37.1 FPS
////////////////////////////////////////////////////////////////////////////////

E.優化border_color顏色飽和函數
因爲border_color的實現使用了分支代碼，在現代CPU上分支預測錯的代價很大，這裏使用一個
查找表來代替它；

// 顏色查表
static TUInt8 _color_table[ 256 * 5 ];
static const TUInt8 * color_table =& _color_table[ 256 *2];
class _CAuto_inti_color_table
{
public :
    _CAuto_inti_color_table() {
         for ( int i = 0 ;i < 256 *5 ; ++ i)
            _color_table[i] = border_color(i - 256*2 );
    }
};
static _CAuto_inti_color_table _Auto_inti_color_table;

    // Converts one YUV sample to a TARGB32 pixel with 16.16 fixed-point
    // arithmetic; saturation is done by indexing color_table instead of
    // branching.
    //   Y, U, V : the three 8-bit components of the sample
    //   returns : the saturated b/g/r values with alpha set to 255
    __forceinline TARGB32 YUVToRGB32_RGBTable( const TUInt8 Y, const TUInt8 U, const TUInt8 V)
    {
        // The rounding term (half of 2^16) is folded into the luma product,
        // which is part of every channel sum, so each channel is rounded to
        // nearest rather than floored by the shift. Without it Y=235, U=V=128
        // gives (76309 * 219) >> 16 == 254 instead of 255. The indices stay
        // well inside the [-512, 767] range that color_table covers.
        const int Ye = csY_coeff_16 * (Y - 16 ) + (1 << 15 );
        const int Ue = U - 128 ;
        const int Ve = V - 128 ;

        TARGB32 result;
        result.b = color_table[ ( Ye + csU_blue_16 * Ue ) >> 16 ];
        result.g = color_table[ ( Ye + csU_green_16 * Ue + csV_green_16 * Ve ) >> 16 ];
        result.r = color_table[ ( Ye + csV_red_16 * Ve ) >> 16 ];
        result.a = 255 ;
        return result;
    }

void DECODE_YUYV_RGBTable( const TUInt8 * pYUYV, const TPicRegion & DstPic)
{
    assert((DstPic.width & 1 ) == 0 );

    TARGB32 * pDstLine = DstPic.pdata;
     for ( long y = 0 ;y < DstPic.height; ++ y)
    {
         for ( long x = 0 ;x < DstPic.width;x += 2 )
        {
            pDstLine[x + 0 ] = YUVToRGB32_RGBTable(pYUYV[ 0 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pDstLine[x + 1 ] = YUVToRGB32_RGBTable(pYUYV[ 2 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pYUYV += 4 ;
        }
        ((TUInt8 *& )pDstLine) += DstPic.byte_width;
    }
}

速度測試:
////////////////////////////////////////////////////////////////////////////////
//==============================================================================
//                       |        1024x576       |       1920x1080       |
//------------------------------------------------------------------------------
//                       | AMD64x2 |   Core2   | AMD64x2 |   Core2   |
//------------------------------------------------------------------------------
//DECODE_YUYV_RGBTable     164.8 FPS 152.9 FPS     47.1 FPS   43.7 FPS
////////////////////////////////////////////////////////////////////////////////

F.使用查找表來代替乘法運算
其實，現在的x86 CPU做乘法是很快的，用查找表的內存訪問來代替乘法不見得會更快；
本文章討論它的意義在於，該實現版本在其他平臺的CPU上可能有很好的優化效果；在奔騰4上
該版本DECODE_YUYV_Table也很可能比DECODE_YUYV_RGBTable快，我沒有測試過；

// Multiplication lookup tables in 16.16 fixed point, indexed by the raw
// 8-bit Y, U or V value. They are filled by the constructor of a file-static
// object.
static int Ym_table[ 256 ];         // csY_coeff_16 * (Y - 16), plus the rounding term
static int Um_blue_table[ 256 ];    // csU_blue_16  * (U - 128)
static int Um_green_table[ 256 ];   // csU_green_16 * (U - 128)
static int Vm_green_table[ 256 ];   // csV_green_16 * (V - 128)
static int Vm_red_table[ 256 ];     // csV_red_16   * (V - 128)

class _CAuto_inti_yuv_table
{
public :
    _CAuto_inti_yuv_table() {
        for ( int i = 0 ;i < 256 ; ++ i)
        {
            // YUVToRGB32_Table adds exactly one Ym_table entry to every
            // channel sum before shifting right by 16. Storing the rounding
            // term (half of 2^16) here therefore rounds every channel to
            // nearest at no extra cost per pixel. Without it Y=235, U=V=128
            // gives (76309 * 219) >> 16 == 254 instead of 255.
            Ym_table[i] = csY_coeff_16 * (i - 16 ) + (1 << 15 );
            Um_blue_table[i] = csU_blue_16 * (i - 128 );
            Um_green_table[i] = csU_green_16 * (i - 128 );
            Vm_green_table[i] = csV_green_16 * (i - 128 );
            Vm_red_table[i] = csV_red_16 * (i - 128 );
        }
    }
};
static _CAuto_inti_yuv_table _Auto_inti_yuv_table;

    // Converts one YUV sample to a TARGB32 pixel without any multiplication:
    // the per-component products come from the Ym/Um/Vm lookup tables and the
    // saturation comes from color_table.
    //   Y, U, V : the three 8-bit components of the sample
    //   returns : the looked-up b/g/r values with alpha set to 255
    __forceinline TARGB32 YUVToRGB32_Table( const TUInt8 Y, const TUInt8 U, const TUInt8 V)
    {
        const int luma = Ym_table[Y];
        const int blue_sum  = luma + Um_blue_table[U];
        const int green_sum = luma + Um_green_table[U] + Vm_green_table[V];
        const int red_sum   = luma + Vm_red_table[V];

        TARGB32 pixel;
        pixel.b = color_table[ blue_sum >> 16 ];
        pixel.g = color_table[ green_sum >> 16 ];
        pixel.r = color_table[ red_sum >> 16 ];
        pixel.a = 255 ;
        return pixel;
    }

void DECODE_YUYV_Table( const TUInt8 * pYUYV, const TPicRegion & DstPic)
{
    assert((DstPic.width & 1 ) == 0 );

    TARGB32 * pDstLine = DstPic.pdata;
     for ( long y = 0 ;y < DstPic.height; ++ y)
    {
         for ( long x = 0 ;x < DstPic.width;x += 2 )
        {
            pDstLine[x + 0 ] = YUVToRGB32_Table(pYUYV[ 0 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pDstLine[x + 1 ] = YUVToRGB32_Table(pYUYV[ 2 ],pYUYV[ 1 ],pYUYV[ 3 ]);
            pYUYV += 4 ;
        }
        ((TUInt8 *& )pDstLine) += DstPic.byte_width;
    }
}

(提示：在沒有“帶符號右移”的CPU體系下或者能夠忍受一點點小的誤差，可以在生成YUV的查找表的時候不擴大2^16倍，從而在計算出結果的時候也就不需要右移16位的修正了，這樣改進後函數速度還會提高一些)

2007.11.04 補充一個更深優化的全查表的實現DECODE_YUYV_TableEx；

// Full lookup tables: the same products as the 16.16 tables, but already
// shifted right by 16, so the converter only has to add entries and index
// color_table. They are filled by the constructor of a file-static object.
static int Ym_tableEx[ 256 ];
static int Um_blue_tableEx[ 256 ];
static int Um_green_tableEx[ 256 ];
static int Vm_green_tableEx[ 256 ];
static int Vm_red_tableEx[ 256 ];

class _CAuto_inti_yuv_tableEx
{
public :
    _CAuto_inti_yuv_tableEx() {
        // Half of 2^16. Adding it before the shift rounds each entry to
        // nearest instead of flooring it (assuming >> on a negative int is an
        // arithmetic shift, as the rest of this code already does). Flooring
        // every entry biases each summed channel low by up to 2 levels, and
        // Y=235 would yield 254 rather than 255.
        const int round_16 = 1 << 15 ;
        for ( int i = 0 ;i < 256 ; ++ i)
        {
            Ym_tableEx[i] = (csY_coeff_16 * (i - 16 ) + round_16 ) >> 16 ;
            Um_blue_tableEx[i] = (csU_blue_16 * (i - 128 ) + round_16 ) >> 16 ;
            Um_green_tableEx[i] = (csU_green_16 * (i - 128 ) + round_16 ) >> 16 ;
            Vm_green_tableEx[i] = (csV_green_16 * (i - 128 ) + round_16 ) >> 16 ;
            Vm_red_tableEx[i] = (csV_red_16 * (i - 128 ) + round_16 ) >> 16 ;
        }
    }
};
static _CAuto_inti_yuv_tableEx _Auto_inti_yuv_tableEx;

    // Converts two horizontally adjacent pixels that share one U and one V
    // sample, using only the pre-shifted *_tableEx lookups, and writes each
    // pixel with a single TUInt32 store.
    //   pDst   : receives two pixels, pDst[0] from Y0 and pDst[1] from Y1
    //   Y0, Y1 : luma of the first and the second pixel
    //   U, V   : chroma shared by both pixels
    // NOTE: the packed store relies on TUInt32 being exactly 32 bits wide and
    // on little-endian byte order, so that the low byte lands in TARGB32::b.
    __forceinline void YUVToRGB32_Two_TableEx(TARGB32 * pDst, const TUInt8 Y0, const TUInt8 Y1, const TUInt8 U, const TUInt8 V)
    {
        const int Ye0 = Ym_tableEx[Y0];
        const int Ye1 = Ym_tableEx[Y1];
        const int Ue_blue = Um_blue_tableEx[U];
        const int UeVe_green = Um_green_tableEx[U] + Vm_green_tableEx[V];
        const int Ve_red = Vm_red_tableEx[V];

        // Unsigned constant instead of (255 << 24): shifting the signed int
        // 255 left by 24 moves a 1 bit into the sign bit, which is not a
        // well-defined operation on a signed int.
        const TUInt32 alpha = (TUInt32)0xFF000000u;

        TUInt32 * const pPacked = (TUInt32 * )pDst;
        pPacked[ 0 ] = (TUInt32)color_table[ Ye0 + Ue_blue ]
                     | ( (TUInt32)color_table[ Ye0 + UeVe_green ] << 8 )
                     | ( (TUInt32)color_table[ Ye0 + Ve_red ] << 16 )
                     | alpha;
        pPacked[ 1 ] = (TUInt32)color_table[ Ye1 + Ue_blue ]
                     | ( (TUInt32)color_table[ Ye1 + UeVe_green ] << 8 )
                     | ( (TUInt32)color_table[ Ye1 + Ve_red ] << 16 )
                     | alpha;
    }

     // Decodes one row of packed YUYV data with YUVToRGB32_Two_TableEx.
     //   pDstLine : first destination pixel of the row
     //   pYUYV    : first source byte of the row; 4 bytes are consumed per pixel pair
     //   width    : number of pixels in the row, processed two at a time
     void DECODE_YUYV_TableEx_line(TARGB32 * pDstLine, const TUInt8 * pYUYV, long width)
     {
         long x = 0 ;
         while (x < width)
         {
             YUVToRGB32_Two_TableEx(pDstLine + x,pYUYV[ 0 ],pYUYV[ 2 ],pYUYV[ 1 ],pYUYV[ 3 ]);
             pYUYV += 4 ;
             x += 2 ;
         }
     }

void DECODE_YUYV_TableEx( const TUInt8 * pYUYV, const TPicRegion & DstPic)
{
    assert((DstPic.width & 1 ) == 0 );

     long YUV_byte_width = (DstPic.width >> 1 ) << 2 ;
    TARGB32 * pDstLine = DstPic.pdata;
     for ( long y = 0 ;y < DstPic.height; ++ y)
    {
        DECODE_YUYV_TableEx_line(pDstLine,pYUYV,DstPic.width);
        pYUYV += YUV_byte_width;
        ((TUInt8 *& )pDstLine) += DstPic.byte_width;
    }
}

速度測試:
////////////////////////////////////////////////////////////////////////////////
//==============================================================================
//                       |        1024x576       |       1920x1080       |
//------------------------------------------------------------------------------
//                       | AMD64x2 |   Core2   | AMD64x2 |   Core2   |
//------------------------------------------------------------------------------
//DECODE_YUYV_TableEx      236.5 FPS 300.5 FPS     68.1 FPS   85.0 FPS
////////////////////////////////////////////////////////////////////////////////

G.優化U和V的計算、合併寫內存
由於兩個像素共享U和V值，關於它們的兩次計算，有部分代碼可以共享；
所以實現一個一次轉換兩個像素的版本；
寫內存的時候，合併成4字節來寫，這樣在現在的CPU上更加有效率(注意:在intel的
Xeon CPU上這個改動反而會慢一些):

    // Converts two horizontally adjacent pixels that share one U and one V
    // sample. The chroma products are computed once for both pixels, and each
    // pixel is written with a single TUInt32 store.
    //   pDst   : receives two pixels, pDst[0] from Y0 and pDst[1] from Y1
    //   Y0, Y1 : luma of the first and the second pixel
    //   U, V   : chroma shared by both pixels
    // NOTE: the packed store relies on TUInt32 being exactly 32 bits wide and
    // on little-endian byte order, so that the low byte lands in TARGB32::b.
    __forceinline void YUVToRGB32_Two(TARGB32 * pDst, const TUInt8 Y0, const TUInt8 Y1, const TUInt8 U, const TUInt8 V)
    {
        // Half of 2^16, folded into the luma products so that every channel
        // is rounded to nearest by the >> 16 below. Without it Y=235,
        // U=V=128 gives (76309 * 219) >> 16 == 254 instead of 255.
        const int round_16 = 1 << 15 ;
        const int Ye0 = csY_coeff_16 * (Y0 - 16 ) + round_16;
        const int Ye1 = csY_coeff_16 * (Y1 - 16 ) + round_16;
        const int Ue = U - 128 ;
        const int Ve = V - 128 ;
        const int Ue_blue = csU_blue_16 * Ue;
        const int UeVe_green = csU_green_16 * Ue + csV_green_16 * Ve;
        const int Ve_red = csV_red_16 * Ve;

        // Unsigned constant instead of (255 << 24): shifting the signed int
        // 255 left by 24 moves a 1 bit into the sign bit, which is not a
        // well-defined operation on a signed int.
        const TUInt32 alpha = (TUInt32)0xFF000000u;

        TUInt32 * const pPacked = (TUInt32 * )pDst;
        pPacked[ 0 ] = (TUInt32)color_table[ ( Ye0 + Ue_blue ) >> 16 ]
                     | ( (TUInt32)color_table[ ( Ye0 + UeVe_green ) >> 16 ] << 8 )
                     | ( (TUInt32)color_table[ ( Ye0 + Ve_red ) >> 16 ] << 16 )
                     | alpha;
        pPacked[ 1 ] = (TUInt32)color_table[ ( Ye1 + Ue_blue ) >> 16 ]
                     | ( (TUInt32)color_table[ ( Ye1 + UeVe_green ) >> 16 ] << 8 )
                     | ( (TUInt32)color_table[ ( Ye1 + Ve_red ) >> 16 ] << 16 )
                     | alpha;
    }

     // Decodes one row of packed YUYV data with YUVToRGB32_Two.
     //   pDstLine : first destination pixel of the row
     //   pYUYV    : first source byte of the row; 4 bytes are consumed per pixel pair
     //   width    : number of pixels in the row, processed two at a time
     void DECODE_YUYV_Common_line(TARGB32 * pDstLine, const TUInt8 * pYUYV, long width)
     {
         long x = 0 ;
         while (x < width)
         {
             YUVToRGB32_Two(pDstLine + x,pYUYV[ 0 ],pYUYV[ 2 ],pYUYV[ 1 ],pYUYV[ 3 ]);
             pYUYV += 4 ;
             x += 2 ;
         }
     }

void DECODE_YUYV_Common( const TUInt8 * pYUYV, const TPicRegion & DstPic)
{
    assert((DstPic.width & 1 ) == 0 );

     long YUV_byte_width = (DstPic.width >> 1 ) << 2 ;
    TARGB32 * pDstLine = DstPic.pdata;
     for ( long y = 0 ;y < DstPic.height; ++ y)
    {
        DECODE_YUYV_Common_line(pDstLine,pYUYV,DstPic.width);
        pYUYV += YUV_byte_width;
        ((TUInt8 *& )pDstLine) += DstPic.byte_width;
    }
}

////////////////////////////////////////////////////////////////////////////////
//測試平臺:(CPU:AMD64x2 4200+(2.37G);   內存:DDR2 667(雙通道); 編譯器:VC2005)
//測試平臺:(CPU:Intel Core2 4400(2.00G);內存:DDR2 667(雙通道); 編譯器:VC2005)
////////////////////////////////////////////////////////////////////////////////
//==============================================================================
//                       |        1024x576       |       1920x1080       |
//------------------------------------------------------------------------------
//                       | AMD64x2 |   Core2   | AMD64x2 |   Core2   |
//------------------------------------------------------------------------------
//DECODE_YUYV_Float         55.0 FPS   63.7 FPS     15.6 FPS   18.0 FPS
//DECODE_YUYV_Int          137.1 FPS 131.9 FPS     39.0 FPS   37.1 FPS
//DECODE_YUYV_RGBTable     164.8 FPS 152.9 FPS     47.1 FPS   43.7 FPS
//DECODE_YUYV_Table        146.1 FPS 151.3 FPS     41.8 FPS   43.5 FPS
//DECODE_YUYV_TableEx      236.5 FPS  300.5 FPS     68.1 FPS   85.0 FPS
//DECODE_YUYV_Common       250.7 FPS 287.1 FPS     71.9 FPS   80.7 FPS
////////////////////////////////////////////////////////////////////////////////

( 歡迎提出不足和改進意見; 下一篇文章將繼續成倍的提高解碼速度)

YUV視頻格式到RGB32格式轉換的速度優化上篇

模擬手機設備：使用 Playwright 實現移動端自動化測試

Mellanox網卡開啓SR-IOV

全面系統的AI學習路徑，幫助普通人也能玩轉AI

uni-app實現上拉加載

vue3編譯優化之“靜態提升”

又是一個月-20240513

flask 如何保證返回json有序

linux服務器設置ssh免密

HTML 00 Tutorial

cmakelist的一個例子

“數學函數動態編譯器TCompile類”的bug跟蹤、新版源代碼下載

圖形圖像處理－之－任意角度的高質量的快速的圖像旋轉下篇補充話題

HDiffPatch和BsDiff4.3&xdelta3.1的對比測試

我的分形畫廊

YUV視頻格式到RGB32格式轉換的速度優化中篇

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

YUV視頻格式到RGB32格式轉換的速度優化 上篇

YUV視頻格式到RGB32格式轉換的速度優化上篇