圖形圖像處理-之-任意角度的高質量的快速的圖像旋轉 下篇 補充話題

          圖形圖像處理-之-任意角度的高質量的快速的圖像旋轉 下篇 補充話題
                        
[email protected]   2007.06.29

 

(2009.03.09  可以到這裏下載旋轉算法的完整的可以編譯的項目源代碼:  http://blog.csdn.net/housisong/archive/2009/03/09/3970925.aspx  )

(2007.07.15 添加對大圖片旋轉的預讀緩衝區優化版本)

 

tag:圖像旋轉,任意角度,圖像縮放,速度優化,定點數優化,近鄰取樣插值,二次線性插值,
   三次卷積插值,MipMap鏈,三次線性插值,MMX/SSE優化,CPU緩存優化,AlphaBlend,顏色混合,並行

摘要: 該文章是《任意角度的高質量的快速的圖像旋轉》的一些高級補充話題;
     給出了一個完整的Alpha混合的插值旋轉實現;並嘗試將旋轉函數並行化,從而在多核電腦上獲得更快的速度;添加優化預讀緩衝區的函數實現版本,提高超大圖片的旋轉的速度;

任意角度的高質量的快速的圖像旋轉 全文 分爲:
     上篇 純軟件的任意角度的快速旋轉
     中篇 高質量的旋轉
     下篇 補充話題


正文:
  爲了便於討論,這裏只處理32bit的ARGB顏色;
  代碼使用C++;涉及到彙編優化的時候假定爲x86平臺;使用的編譯器爲vc2005;
  爲了代碼的可讀性,沒有加入異常處理代碼;
   測試使用的CPU爲AMD64x2 4200+(2.37G) 和 Intel Core2 4400(2.00G);
  (基礎代碼參考《圖形圖像處理-之-任意角度的高質量的快速的圖像旋轉》系列前面的文章)


A:完整的Alpha混合的雙線性插值旋轉實現
  《高質量的旋轉》中已經涉及到了邊界的AlphaBlend的問題,這裏順水推舟的實現一個支持全圖片Alpha通道Blend混合的雙線性插值旋轉函數;
   首先給出帶完整Alpha通道的源圖片:

              

   這張圖片是帶有8比特Alpha的32比特RGB真彩bmp圖片;
   帶的Alpha通道在工具裏可能顯示不出來,單獨提取出來的圖示:

               

   函數實現:  


    
void BilInear_BlendBorder_MMX(const TPicRegion& pic,const long x_16,const long y_16,TARGB32* result)
    {
        unsigned 
long x0=(x_16>>16
);
        unsigned 
long y0=(y_16>>16
);

        TARGB32 pixel[
4
];
        
bool
 IsInPic;
        pixel[
0]=
Pixels_Bound(pic,x0,y0,IsInPic);
        
if (!IsInPic) pixel[0].a=0
;
        pixel[
2]=Pixels_Bound(pic,x0,y0+1
,IsInPic);
        
if (!IsInPic) pixel[2].a=0

        pixel[
1]=Pixels_Bound(pic,x0+1
,y0,IsInPic);
        
if (!IsInPic) pixel[1].a=0
;
        pixel[
3]=Pixels_Bound(pic,x0+1,y0+1
,IsInPic);
        
if (!IsInPic) pixel[3].a=0
;
        
        TPicRegion npic;
        npic.pdata     
=&pixel[0
];
        npic.byte_width
=2*sizeof
(TARGB32);
        
//
npic.width     =2;
        
//npic.height    =2;

        BilInear_Fast_MMX(npic,(unsigned short)x_16,(unsigned short)y_16,result);
    }

void PicRotary_BilInear_BlendLine_MMX(TARGB32* pDstLine,long dst_border_x0,long dst_in_x0,long dst_in_x1,long
 dst_border_x1,
                        
const TPicRegion& SrcPic,long srcx0_16,long srcy0_16,long Ax_16,long
 Ay_16)
{
    
long
 x;
    
for (x=dst_border_x0;x<dst_in_x0;++
x)
    {
        TARGB32 src_color;
        BilInear_BlendBorder_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
        
if (src_color.a>0
)
            pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);        
        srcx0_16
+=
Ax_16;
        srcy0_16
+=
Ay_16;
    }
    
for (x=dst_in_x0;x<dst_in_x1;++
x)
    {
        TARGB32 src_color;
        BilInear_Fast_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
        
if (src_color.a==255
)
            pDstLine[x]
=
src_color;  
        
else if (src_color.a>0
)
            pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);
        srcx0_16
+=
Ax_16;
        srcy0_16
+=
Ay_16;
    }
    
for (x=dst_in_x1;x<dst_border_x1;++
x)
    {
        TARGB32 src_color;
        BilInear_BlendBorder_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
        
if (src_color.a>0
)
            pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);        
        srcx0_16
+=
Ax_16;
        srcy0_16
+=
Ay_16;
    }
    asm  emms
}

void PicRotaryBlendBilInear_MMX(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double
 move_y)
{
    
if ( (fabs(ZoomX*Src.width)<1.0e-4|| (fabs(ZoomY*Src.height)<1.0e-4) ) return//太小的縮放比例認爲已經不可見

    double tmprZoomXY=1.0/(ZoomX*ZoomY);  
    
double rZoomX=tmprZoomXY*
ZoomY;
    
double rZoomY=tmprZoomXY*
ZoomX;
    
double
 sinA,cosA;
    SinCos(RotaryAngle,sinA,cosA);
    
long Ax_16=(long)(rZoomX*cosA*(1<<16
)); 
    
long Ay_16=(long)(rZoomX*sinA*(1<<16
)); 
    
long Bx_16=(long)(-rZoomY*sinA*(1<<16
)); 
    
long By_16=(long)(rZoomY*cosA*(1<<16
)); 
    
double rx0=Src.width*0.5;  //(rx0,ry0)爲旋轉中心 

    double ry0=Src.height*0.5
    
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16
));
    
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16
)); 

    TRotaryClipData rcData;
    rcData.Ax_16
=
Ax_16;
    rcData.Bx_16
=
Bx_16;
    rcData.Cx_16
=
Cx_16;
    rcData.Ay_16
=
Ay_16;
    rcData.By_16
=
By_16;
    rcData.Cy_16
=
Cy_16;
    rcData.dst_width
=
Dst.width;
    rcData.dst_height
=
Dst.height;
    rcData.src_width
=
Src.width;
    rcData.src_height
=
Src.height;
    
if (!rcData.inti_clip(move_x,move_y,1)) return
;

    TARGB32
* pDstLine=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_down_y);
    
while (true//to down

    {
        
long y=
rcData.out_dst_down_y;
        
if (y>=Dst.height) break
;
        
if (y>=0
)
        {
            PicRotary_BilInear_BlendLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
                rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
        }
        
if (!rcData.next_clip_line_down()) break
;
        ((TUInt8
*&)pDstLine)+=
Dst.byte_width;
    }
   
    pDstLine
=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_up_y);
    
while (rcData.next_clip_line_up()) //to up 

    {
        
long y=
rcData.out_dst_up_y;
        
if (y<0break
;
        ((TUInt8
*&)pDstLine)-=
Dst.byte_width;
        
if (y<
Dst.height)
        {
            PicRotary_BilInear_BlendLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
                rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
        }
    }
}

  效果圖:


B:在雙核上並行三次卷積插值旋轉的一個簡單實現
  (假設圖片旋轉繪製到目的圖片的中間)
  這裏利用CWorkThreadPool來並行執行任務;
  (參見我的文章《並行計算簡介和多核CPU編程Demo》,裏面有CWorkThreadPool類的完整源代碼)
  最容易想到的方案就是分成上下兩部分分別調用PicRotaryThreeOrder_MMX,從而並行執行;

 

struct TRotaryThreeOrder_WorkData
{
    
const TPicRegion*
 Dst;
    
const TPicRegion*
 Src;
    
double
 RotaryAngle;
    
double
 ZoomX;
    
double
 ZoomY;
    
double
 move_x;
    
double
 move_y;
};

void RotaryThreeOrder_callback(void*
 wd)
{
    TRotaryThreeOrder_WorkData
* WorkData=(TRotaryThreeOrder_WorkData*
)wd;
    PicRotaryThreeOrder_MMX(
*WorkData->Dst,*WorkData->Src,WorkData->RotaryAngle,WorkData->ZoomX,WorkData->ZoomY,WorkData->move_x,WorkData->
move_y);
}


void PicRotaryThreeOrder_MMX_parallel2(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double
 move_y)
{
    TRotaryThreeOrder_WorkData work_list[
2
];
    TRotaryThreeOrder_WorkData
* pwork_list[2
];
    
for (long i=0;i<2;++
i)
    {
        work_list[i].Src
=&
Src;
        work_list[i].RotaryAngle
=
RotaryAngle;
        work_list[i].ZoomX
=
ZoomX;
        work_list[i].ZoomY
=
ZoomY;
        work_list[i].move_x
=
move_x;
        work_list[i].move_y
=
move_y;
        pwork_list[i]
=&
work_list[i];
    }
    TPicRegion dst_up
=
Dst;
    dst_up.height
=Dst.height/2
;
    work_list[
0].Dst=&
dst_up;
    TPicRegion dst_down
=
Dst;
    dst_down.pdata
=&Pixels(Dst,0
,dst_up.height);
    dst_down.height
=Dst.height-
dst_up.height;
    work_list[
1].Dst=&
dst_down;
    work_list[
1].move_y=move_y-Dst.height/2
;
    CWorkThreadPool::work_execute(RotaryThreeOrder_callback,(
void**)&pwork_list,2
);
}


//注:測試圖片都是800*600的圖片旋轉到1004*1004的圖片中心 測試成績取各個旋轉角度的平均速度值
//////////////////////////////////////////////////////////////////////////////////
//速度測試:    CPU: AMD64x2 4200+             
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel2  87.6 fps
//==============================================================================

//////////////////////////////////////////////////////////////////////////////////
//速度測試:    CPU: Intel Core2 4400(2.00G)             
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel2  89.3 fps
////////////////////////////////////////////////////////////////////////////////  

並行化的實現PicRotaryThreeOrder_MMX_parallel2比PicRotaryThreeOrder_MMX的44.2fps快了98.2%!  (Intel Core2 4400上快了94.6%)
在雙核CPU上執行速度幾乎是單核上的2倍!

B':一個通用的針對任意多核並行的一個簡單實現
  有了上面的並行基礎,我們來實現一個更加通用一些的版本;根據CPU核心數來動態分配任務;
  實現方式爲直接按照掃描行來分配(但這樣處理可能不利於內存的高效訪問),就懶得去估算任務量了:)

void PicRotaryThreeOrder_MMX_part(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y,long part_i,long part_count)
{
    
if ( (fabs(ZoomX*Src.width)<1.0e-4|| (fabs(ZoomY*Src.height)<1.0e-4) ) return//太小的縮放比例認爲已經不可見

    double tmprZoomXY=1.0/(ZoomX*ZoomY);  
    
double rZoomX=tmprZoomXY*
ZoomY;
    
double rZoomY=tmprZoomXY*
ZoomX;
    
double
 sinA,cosA;
    SinCos(RotaryAngle,sinA,cosA);
    
long Ax_16=(long)(rZoomX*cosA*(1<<16
)); 
    
long Ay_16=(long)(rZoomX*sinA*(1<<16
)); 
    
long Bx_16=(long)(-rZoomY*sinA*(1<<16
)); 
    
long By_16=(long)(rZoomY*cosA*(1<<16
)); 
    
double rx0=Src.width*0.5;  //(rx0,ry0)爲旋轉中心 

    double ry0=Src.height*0.5
    
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16
));
    
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16
)); 

    TRotaryClipData rcData;
    rcData.Ax_16
=
Ax_16;
    rcData.Bx_16
=
Bx_16;
    rcData.Cx_16
=
Cx_16;
    rcData.Ay_16
=
Ay_16;
    rcData.By_16
=
By_16;
    rcData.Cy_16
=
Cy_16;
    rcData.dst_width
=
Dst.width;
    rcData.dst_height
=
Dst.height;
    rcData.src_width
=
Src.width;
    rcData.src_height
=
Src.height;
    
if (!rcData.inti_clip(move_x,move_y,2)) return
;

    TARGB32
* pDstLine=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_down_y);
    
long run_part_i=0
;
    
while (true//to down

    {
        
long y=
rcData.out_dst_down_y;
        
if (y>=Dst.height) break
;
        
if (y>=0
)
        {
            
if (run_part_i%part_count==
part_i)
                PicRotary_ThreeOrder_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
                        rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
            
++
run_part_i;
        }
        
if (!rcData.next_clip_line_down()) break
;
        ((TUInt8
*&)pDstLine)+=
Dst.byte_width;
    }
   
    pDstLine
=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_up_y);
    
while (rcData.next_clip_line_up()) //to up 

    {
        
long y=
rcData.out_dst_up_y;
        
if (y<0break
;
        ((TUInt8
*&)pDstLine)-=
Dst.byte_width;
        
if (y<
Dst.height)
        {
            
if (run_part_i%part_count==
part_i)
                PicRotary_ThreeOrder_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
                        rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
            
++
run_part_i;
        }
    }
}

struct
 TRotaryThreeOrder_part_WorkData
{
    
const TPicRegion*
 Dst;
    
const TPicRegion*
 Src;
    
double
 RotaryAngle;
    
double
 ZoomX;
    
double
 ZoomY;
    
double
 move_x;
    
double
 move_y;
    
long
   part_i;
    
long
   part_count;
};

void RotaryThreeOrder_part_callback(void*
 wd)
{
    TRotaryThreeOrder_part_WorkData
* WorkData=(TRotaryThreeOrder_part_WorkData*
)wd;
    PicRotaryThreeOrder_MMX_part(
*WorkData->Dst,*WorkData->Src,WorkData->RotaryAngle,WorkData->ZoomX,WorkData->
ZoomY,
        WorkData
->move_x,WorkData->move_y,WorkData->part_i,WorkData->
part_count);
}

void PicRotaryThreeOrder_MMX_parallel(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double
 move_y)
{
    
long work_count=
CWorkThreadPool::best_work_count();
    std::vector
<TRotaryThreeOrder_part_WorkData>
   work_list(work_count);
    std::vector
<TRotaryThreeOrder_part_WorkData*>
  pwork_list(work_count);
    
long
 i;
    
for (i=0;i<work_count;++
i)
    {
        work_list[i].Dst
=&
Dst;
        work_list[i].Src
=&
Src;
        work_list[i].RotaryAngle
=
RotaryAngle;
        work_list[i].ZoomX
=
ZoomX;
        work_list[i].ZoomY
=
ZoomY;
        work_list[i].move_x
=
move_x;
        work_list[i].move_y
=
move_y;
        work_list[i].part_i
=
i;
        work_list[i].part_count
=
work_count;
        pwork_list[i]
=&
work_list[i];
    }
    CWorkThreadPool::work_execute(RotaryThreeOrder_part_callback,(
void**)&pwork_list[0
],work_count);
}

//注:測試圖片都是800*600的圖片旋轉到1004*1004的圖片中心 測試成績取各個旋轉角度的平均速度值
//////////////////////////////////////////////////////////////////////////////////
//速度測試: CPU: AMD64x2 4200+                
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel   81.0 fps
////////////////////////////////////////////////////////////////////////////////  

//////////////////////////////////////////////////////////////////////////////////
//速度測試: CPU: Intel Core2 4400(2.00G)                
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel   89.5 fps
////////////////////////////////////////////////////////////////////////////////   

 這個實現能應付大多數時候的並行需求了,包括以後的4核8核...


(注意:這裏的並行任務分割方案僅僅是簡單的舉例(用了代碼改動最小的方案),你應該根據你的需求來更好的並行化你的任務; 如果分割後的單個任務太小,並行的優勢可能就體現不出來,甚至於更慢;)

C:超大圖片旋轉優化

  1.使用PicRotary*、PicRotaryBilInear*、PicRotaryThreeOrder*等函數在旋轉大圖片的時候,會出現一個速度變慢問題:就是旋轉不同的角度,速度差異巨大(甚至達到8倍以上!)

速度測試:
//注:CPU: AMD64x2 4200+(2.37G)
////////////////////////////////////////////////////////////////////////////////
//   800x600的源圖片             各角度平均幀數     角度中最小幀數     角度中最大幀數
//==============================================================================
// PicRotarySSE2                  304.2 fps       250.7 fps       565.3 fps
// PicRotaryBilInear_MMX          100.2 fps        87.8 fps       130.3 fps
// PicRotaryThreeOrder_MMX         44.2 fps        41.4 fps        49.7 fps
////////////////////////////////////////////////////////////////////////////////  
//  3200x2400的源圖片            各角度平均幀數     角度中最小幀數     角度中最大幀數
//==============================================================================
// PicRotarySSE2                   12.2 fps         4.6 fps        36.3 fps
// PicRotaryBilInear_MMX            5.0 fps         1.1 fps         8.7 fps
// PicRotaryThreeOrder_MMX          2.6 fps         0.9 fps         3.5 fps
////////////////////////////////////////////////////////////////////////////////  

//注:CPU: Intel Core2 4400(2.00G)
////////////////////////////////////////////////////////////////////////////////
//   800x600的源圖片             各角度平均幀數     角度中最小幀數     角度中最大幀數
//==============================================================================
// PicRotarySSE2                  449.3 fps       250.5 fps       753.4 fps
// PicRotaryBilInear_MMX          109.5 fps        95.3 fps       132.4 fps
// PicRotaryThreeOrder_MMX         45.9 fps        41.5 fps        50.3 fps
////////////////////////////////////////////////////////////////////////////////  
//  3200x2400的源圖片            各角度平均幀數     角度中最小幀數     角度中最大幀數
//==============================================================================
// PicRotarySSE2                   18.3 fps        12.0 fps        44.8 fps
// PicRotaryBilInear_MMX            6.7 fps         3.5 fps         8.7 fps
// PicRotaryThreeOrder_MMX          2.9 fps         2.2 fps         3.4 fps
////////////////////////////////////////////////////////////////////////////////  

  在我的AMD64x2 4200+ CPU上,800x600的源圖片對旋轉的角度不是很敏感;但當源圖片爲3200x2400的時候,最小速度和平均速度差異巨大;(Intel Core2 4400上稍好)

  2.先來分析一下問題出現的原因,對於某些角度(比如90度和270度),按以前的函數實現,訪問源圖片內存的方式將是列方向的,當圖片比較大的時候,內存的讀取訪問將變得非常低效;一般CPU訪問內存的時候都會一次性讀取連續相鄰的64字節放到緩存,但很明顯對於某些角度,預讀的大部分數據都沒有用(甚至只使用了其中的4個字節,浪費了60字節完全沒有用就被新的數據擠出了緩存);對於小緩存的CPU和較大的源圖片,這種情況會更嚴重;
  能想到的一些解決方案:a.使用CPU的預讀指令來手工預讀數據,但是沒有辦法指定預讀的內存塊大小從而避免帶寬浪費;而且以後的硬件趨勢也只會朝一次讀取更大的塊發展,所以該方案不可行;b.針對不同的角度方向分別編碼,使讀取的內存方向儘量按行方向(從而使預讀生效),比如靠近90度旋轉的時候寫內存的方向將變爲列方向,因爲寫內存指令中可以禁止寫緩存,應該可以降低列寫入帶來的性能損失;該方案還有一個缺點是代碼編寫稍嫌麻煩:)  c.利用內存訪問的局部性來使緩存的數據有效,就是分成小塊來處理旋轉算法,使內存訪問在任何角度時都有相關性;

  3.分塊局部性旋轉算法的實現方案;
      
             以前的掃描算法圖示                        新的分塊掃描算法圖示

     爲了實現新的掃描路徑,有一個簡單的改進辦法:把以前的掃描行(起始和結束位置)先保存起來,然後再處理這些掃描行;代碼就很簡單了,如下:
(沒有給出的基礎代碼參見該系列的其他文章)

  4.近鄰取樣插值的分塊掃描函數實現PicRotarySSE2_Block

    struct TBlockLineWork //用來保存一個掃描行
    {
    
public
:
        TARGB32
*
    pdst;
        
long
        width_border0;
        
long
        width_in;
        
long
        width_border1;
        
long
        src_x0_16;
        
long
        src_y0_16;
        TBlockLineWork(TARGB32
* _pdst,long _width_in,long _src_x0_16,long
 _src_y0_16)
            :pdst(_pdst),width_in(_width_in),src_x0_16(_src_x0_16),src_y0_16(_src_y0_16),width_border0(
0),width_border1(0
) {}
        TBlockLineWork(TARGB32
* _pdst,long _width_border0,long _width_in,long _width_border1,long _src_x0_16,long
 _src_y0_16)
            :pdst(_pdst),width_in(_width_in),src_x0_16(_src_x0_16),src_y0_16(_src_y0_16),width_border0(_width_border0),width_border1(_width_border1) {}
    };
    typedef std::vector
<TBlockLineWork>
 TBlockWork;

    
//分小塊遍歷

    void do_PicRotarySSE2_Block(TBlockWork& BlockWork,const TPicRegion& Src,long Ax_16,long Ay_16)
    {
        
//我測試的分成64x64的小塊比較合適,也可以嘗試一下其它塊大小

        const long rotary_block_width=64;  
        
const long rotary_block_height=
rotary_block_width;
        
long height=
BlockWork.size();
        
for (long y=0;y<height;y+=
rotary_block_height)
        { 
            
long
 cur_block_height;
            
if (rotary_block_height<=(height-
y))
                cur_block_height
=
rotary_block_height;
            
else
 
                cur_block_height
=(height-
y);
            
bool is_line_filish=false
;
            
while (!
is_line_filish)
            {
                is_line_filish
=true
;
                
for (long yi=y;yi<y+cur_block_height;++
yi)
                {
                    TBlockLineWork
* BlockLine=&
BlockWork[yi];
                    
long cur_block_width=BlockLine->
width_in;
                    
if (cur_block_width>0
)
                    {
                        is_line_filish
=false
;
                        
if (cur_block_width>
rotary_block_width)
                           cur_block_width
=
rotary_block_width;
                        PicRotarySSE2_CopyLine(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                            BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                        BlockLine
->pdst=&BlockLine->
pdst[cur_block_width];
                        BlockLine
->width_in-=
cur_block_width;
                        BlockLine
->src_x0_16+=(Ax_16*
cur_block_width);
                        BlockLine
->src_y0_16+=(Ay_16*
cur_block_width);
                    }
                }
            }
        }
    }

void PicRotarySSE2_Block(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double
 move_y)
{
    
if ( (fabs(ZoomX*Src.width)<1.0e-4|| (fabs(ZoomY*Src.height)<1.0e-4) ) return//太小的縮放比例認爲已經不可見

    double tmprZoomXY=1.0/(ZoomX*ZoomY);  
    
double rZoomX=tmprZoomXY*
ZoomY;
    
double rZoomY=tmprZoomXY*
ZoomX;
    
double
 sinA,cosA;
    SinCos(RotaryAngle,sinA,cosA);
    
long Ax_16=(long)(rZoomX*cosA*(1<<16
)); 
    
long Ay_16=(long)(rZoomX*sinA*(1<<16
)); 
    
long Bx_16=(long)(-rZoomY*sinA*(1<<16
)); 
    
long By_16=(long)(rZoomY*cosA*(1<<16
)); 
    
double rx0=Src.width*0.5;  //(rx0,ry0)爲旋轉中心 

    double ry0=Src.height*0.5
    
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16
));
    
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16
)); 

    TRotaryClipData rcData;
    rcData.Ax_16
=
Ax_16;
    rcData.Bx_16
=
Bx_16;
    rcData.Cx_16
=
Cx_16;
    rcData.Ay_16
=
Ay_16;
    rcData.By_16
=
By_16;
    rcData.Cy_16
=
Cy_16;
    rcData.dst_width
=
Dst.width;
    rcData.dst_height
=
Dst.height;
    rcData.src_width
=
Src.width;
    rcData.src_height
=
Src.height;
    
if (!rcData.inti_clip(move_x,move_y,0)) return
;


    TBlockWork BlockWork;

    TARGB32
* pDstLine=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_down_y);
    
while (true//to down

    {
        
long y=
rcData.out_dst_down_y;
        
if (y>=Dst.height) break
;
        
if (y>=0
)
        {
            
long x0=
rcData.out_dst_x0_in;
            BlockWork.push_back(TBlockLineWork(
&pDstLine[x0],rcData.out_dst_x1_in-
x0,rcData.out_src_x0_16,rcData.out_src_y0_16));
        }
        
if (!rcData.next_clip_line_down()) break
;
        ((TUInt8
*&)pDstLine)+=
Dst.byte_width;
    }
    
for (long sleft=0,sright=BlockWork.size()-1;sleft<sright;++sleft,--
sright)
        std::swap(BlockWork[sleft],BlockWork[sright]);
   
    pDstLine
=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_up_y);
    
while (rcData.next_clip_line_up()) //to up 

    {
        
long y=
rcData.out_dst_up_y;
        
if (y<0break
;
        ((TUInt8
*&)pDstLine)-=
Dst.byte_width;
        
if (y<
Dst.height)
        {
            
long x0=
rcData.out_dst_x0_in;
            BlockWork.push_back(TBlockLineWork(
&pDstLine[x0],rcData.out_dst_x1_in-
x0,rcData.out_src_x0_16,rcData.out_src_y0_16));
        }
    }
    do_PicRotarySSE2_Block(BlockWork,Src,Ax_16,Ay_16);

    asm  sfence 
//刷新寫入

}

  5.二次線性插值的分塊掃描函數實現PicRotaryBilInear_MMX_Block
    這裏比近鄰取樣的相應函數多出一些邊界處理代碼

    inline void PicRotary_BilInear_CopyLine_Fast_MMX(TARGB32* pDstLine,long width,
                            
long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion&
 SrcPic)
    {
        
for (long x=0;x<width;++
x)
        {
            BilInear_Fast_MMX(SrcPic,srcx0_16,srcy0_16,
&
pDstLine[x]);
            srcx0_16
+=
Ax_16;
            srcy0_16
+=
Ay_16;
        }
    }
    inline 
void PicRotary_BilInear_CopyLine_Border_MMX(TARGB32* pDstLine,long
 width,
                            
long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion&
 SrcPic)
    {
        
for (long x=0;x<width;++
x)
        {
            TARGB32 src_color;
            BilInear_Border_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
            pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);        
            srcx0_16
+=
Ax_16;
            srcy0_16
+=
Ay_16;
        }
    }

    
void do_PicRotary_BilInear_MMX_Block(TBlockWork& BlockWork,const TPicRegion& Src,long Ax_16,long
 Ay_16)
    {
        
const long rotary_block_width=64;  //128 

        const long rotary_block_height=rotary_block_width;
        
long height=
BlockWork.size();
        
for (long y=0;y<height;y+=
rotary_block_height)
        { 
            
long
 cur_block_height;
            
if (rotary_block_height<=(height-
y))
                cur_block_height
=
rotary_block_height;
            
else
 
                cur_block_height
=(height-
y);

            
for (long yi=y;yi<y+cur_block_height;++
yi)
            {
                TBlockLineWork
* BlockLine=&
BlockWork[yi];
                
long cur_block_width=BlockLine->
width_border0;
                
if (cur_block_width>0
)
                {
                    PicRotary_BilInear_CopyLine_Border_MMX(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                        BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                    BlockLine
->pdst=&BlockLine->
pdst[cur_block_width];
                    BlockLine
->src_x0_16+=(Ax_16*
cur_block_width);
                    BlockLine
->src_y0_16+=(Ay_16*
cur_block_width);
                }
            }

            
bool is_line_filish=false
;
            
while (!
is_line_filish)
            {
                is_line_filish
=true
;
                
for (long yi=y;yi<y+cur_block_height;++
yi)
                {
                    TBlockLineWork
* BlockLine=&
BlockWork[yi];
                    
long cur_block_width=BlockLine->
width_in;
                    
if (cur_block_width>0
)
                    {
                        is_line_filish
=false
;
                        
if (cur_block_width>
rotary_block_width)
                           cur_block_width
=
rotary_block_width;
                        PicRotary_BilInear_CopyLine_Fast_MMX(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                            BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                        BlockLine
->pdst=&BlockLine->
pdst[cur_block_width];
                        BlockLine
->width_in-=
cur_block_width;
                        BlockLine
->src_x0_16+=(Ax_16*
cur_block_width);
                        BlockLine
->src_y0_16+=(Ay_16*
cur_block_width);
                    }
                }
            }

            
for (long yi=y;yi<y+cur_block_height;++
yi)
            {
                TBlockLineWork
* BlockLine=&
BlockWork[yi];
                
long cur_block_width=BlockLine->
width_border1;
                
if (cur_block_width>0
)
                {
                    PicRotary_BilInear_CopyLine_Border_MMX(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                        BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                    
//
BlockLine->pdst=&BlockLine->pdst[cur_block_width];
                    
//
BlockLine->src_x0_16+=(Ax_16*cur_block_width);
                    
//BlockLine->src_y0_16+=(Ay_16*cur_block_width);

                }
            }
        }
    
    
        asm  emms
    }

void PicRotaryBilInear_MMX_Block(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double
 move_y)
{
    
if ( (fabs(ZoomX*Src.width)<1.0e-4|| (fabs(ZoomY*Src.height)<1.0e-4) ) return//太小的縮放比例認爲已經不可見

    double tmprZoomXY=1.0/(ZoomX*ZoomY);  
    
double rZoomX=tmprZoomXY*
ZoomY;
    
double rZoomY=tmprZoomXY*
ZoomX;
    
double
 sinA,cosA;
    SinCos(RotaryAngle,sinA,cosA);
    
long Ax_16=(long)(rZoomX*cosA*(1<<16
)); 
    
long Ay_16=(long)(rZoomX*sinA*(1<<16
)); 
    
long Bx_16=(long)(-rZoomY*sinA*(1<<16
)); 
    
long By_16=(long)(rZoomY*cosA*(1<<16
)); 
    
double rx0=Src.width*0.5;  //(rx0,ry0)爲旋轉中心 

    double ry0=Src.height*0.5
    
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16
));
    
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16
)); 

    TRotaryClipData rcData;
    rcData.Ax_16
=
Ax_16;
    rcData.Bx_16
=
Bx_16;
    rcData.Cx_16
=
Cx_16;
    rcData.Ay_16
=
Ay_16;
    rcData.By_16
=
By_16;
    rcData.Cy_16
=
Cy_16;
    rcData.dst_width
=
Dst.width;
    rcData.dst_height
=
Dst.height;
    rcData.src_width
=
Src.width;
    rcData.src_height
=
Src.height;
    
if (!rcData.inti_clip(move_x,move_y,1)) return
;

    TBlockWork BlockWork;

    TARGB32
* pDstLine=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_down_y);
    
while (true//to down

    {
        
long y=
rcData.out_dst_down_y;
        
if (y>=Dst.height) break
;
        
if (y>=0
)
        {
            BlockWork.push_back(TBlockLineWork(
&
pDstLine[rcData.out_dst_x0_boder],
                rcData.out_dst_x0_in
-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-
rcData.out_dst_x0_in,
                rcData.out_dst_x1_boder
-
rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
        }
        
if (!rcData.next_clip_line_down()) break
;
        ((TUInt8
*&)pDstLine)+=
Dst.byte_width;
    }
   
    
for (long sleft=0,sright=BlockWork.size()-1;sleft<sright;++sleft,--
sright)
        std::swap(BlockWork[sleft],BlockWork[sright]);

    pDstLine
=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_up_y);
    
while (rcData.next_clip_line_up()) //to up 

    {
        
long y=
rcData.out_dst_up_y;
        
if (y<0break
;
        ((TUInt8
*&)pDstLine)-=
Dst.byte_width;
        
if (y<
Dst.height)
        {
            BlockWork.push_back(TBlockLineWork(
&
pDstLine[rcData.out_dst_x0_boder],
                rcData.out_dst_x0_in
-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-
rcData.out_dst_x0_in,
                rcData.out_dst_x1_boder
-
rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
        }
    }

    do_PicRotary_BilInear_MMX_Block(BlockWork,Src,Ax_16,Ay_16);
}

  6.三次卷積插值的分塊掃描函數實現PicRotaryThreeOrder_MMX_Block
   幾乎就是拷貝PicRotaryBilInear_MMX_Block,然後稍做改動;
   (實際項目中的代碼和文章中的代碼還是有很多不同的,要是實際代碼中也有這麼多長篇長篇的拷貝然後稍作修改的代碼,那就要瘋了:)

    inline void PicRotary_ThreeOrder_CopyLine_Fast_MMX(TARGB32* pDstLine,long width,
                            
long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion&
 SrcPic)
    {
        
for (long x=0;x<width;++
x)
        {
            ThreeOrder_Fast_MMX(SrcPic,srcx0_16,srcy0_16,
&
pDstLine[x]);
            srcx0_16
+=
Ax_16;
            srcy0_16
+=
Ay_16;
        }
    }
    inline 
void PicRotary_ThreeOrder_CopyLine_Border_MMX(TARGB32* pDstLine,long
 width,
                            
long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion&
 SrcPic)
    {
        
for (long x=0;x<width;++
x)
        {
            TARGB32 src_color;
            ThreeOrder_Border_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
            pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);        
            srcx0_16
+=
Ax_16;
            srcy0_16
+=
Ay_16;
        }
    }

    
void do_PicRotary_ThreeOrder_MMX_Block(TBlockWork& BlockWork,const TPicRegion& Src,long Ax_16,long
 Ay_16)
    {
        
const long rotary_block_width=64;  //128 

        const long rotary_block_height=rotary_block_width;
        
long height=
BlockWork.size();
        
for (long y=0;y<height;y+=
rotary_block_height)
        { 
            
long
 cur_block_height;
            
if (rotary_block_height<=(height-
y))
                cur_block_height
=
rotary_block_height;
            
else
 
                cur_block_height
=(height-
y);

            
for (long yi=y;yi<y+cur_block_height;++
yi)
            {
                TBlockLineWork
* BlockLine=&
BlockWork[yi];
                
long cur_block_width=BlockLine->
width_border0;
                
if (cur_block_width>0
)
                {
                    PicRotary_ThreeOrder_CopyLine_Border_MMX(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                        BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                    BlockLine
->pdst=&BlockLine->
pdst[cur_block_width];
                    BlockLine
->src_x0_16+=(Ax_16*
cur_block_width);
                    BlockLine
->src_y0_16+=(Ay_16*
cur_block_width);
                }
            }

            
bool is_line_filish=false
;
            
while (!
is_line_filish)
            {
                is_line_filish
=true
;
                
for (long yi=y;yi<y+cur_block_height;++
yi)
                {
                    TBlockLineWork
* BlockLine=&
BlockWork[yi];
                    
long cur_block_width=BlockLine->
width_in;
                    
if (cur_block_width>0
)
                    {
                        is_line_filish
=false
;
                        
if (cur_block_width>
rotary_block_width)
                           cur_block_width
=
rotary_block_width;
                        PicRotary_ThreeOrder_CopyLine_Fast_MMX(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                            BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                        BlockLine
->pdst=&BlockLine->
pdst[cur_block_width];
                        BlockLine
->width_in-=
cur_block_width;
                        BlockLine
->src_x0_16+=(Ax_16*
cur_block_width);
                        BlockLine
->src_y0_16+=(Ay_16*
cur_block_width);
                    }
                }
            }

            
for (long yi=y;yi<y+cur_block_height;++
yi)
            {
                TBlockLineWork
* BlockLine=&
BlockWork[yi];
                
long cur_block_width=BlockLine->
width_border1;
                
if (cur_block_width>0
)
                {
                    PicRotary_ThreeOrder_CopyLine_Border_MMX(BlockLine
->
pdst,cur_block_width,Ax_16,Ay_16,
                        BlockLine
->src_x0_16,BlockLine->
src_y0_16,Src);
                    
//
BlockLine->pdst=&BlockLine->pdst[cur_block_width];
                    
//
BlockLine->src_x0_16+=(Ax_16*cur_block_width);
                    
//BlockLine->src_y0_16+=(Ay_16*cur_block_width);

                }
            }
        }
    
    
        asm  emms
    }

void PicRotaryThreeOrder_MMX_Block(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double
 move_y)
{
    
if ( (fabs(ZoomX*Src.width)<1.0e-4|| (fabs(ZoomY*Src.height)<1.0e-4) ) return//太小的縮放比例認爲已經不可見

    double tmprZoomXY=1.0/(ZoomX*ZoomY);  
    
double rZoomX=tmprZoomXY*
ZoomY;
    
double rZoomY=tmprZoomXY*
ZoomX;
    
double
 sinA,cosA;
    SinCos(RotaryAngle,sinA,cosA);
    
long Ax_16=(long)(rZoomX*cosA*(1<<16
)); 
    
long Ay_16=(long)(rZoomX*sinA*(1<<16
)); 
    
long Bx_16=(long)(-rZoomY*sinA*(1<<16
)); 
    
long By_16=(long)(rZoomY*cosA*(1<<16
)); 
    
double rx0=Src.width*0.5;  //(rx0,ry0)爲旋轉中心 

    double ry0=Src.height*0.5
    
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16
));
    
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16
)); 

    TRotaryClipData rcData;
    rcData.Ax_16
=
Ax_16;
    rcData.Bx_16
=
Bx_16;
    rcData.Cx_16
=
Cx_16;
    rcData.Ay_16
=
Ay_16;
    rcData.By_16
=
By_16;
    rcData.Cy_16
=
Cy_16;
    rcData.dst_width
=
Dst.width;
    rcData.dst_height
=
Dst.height;
    rcData.src_width
=
Src.width;
    rcData.src_height
=
Src.height;
    
if (!rcData.inti_clip(move_x,move_y,2)) return
;

    TBlockWork BlockWork;

    TARGB32
* pDstLine=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_down_y);
    
while (true//to down

    {
        
long y=
rcData.out_dst_down_y;
        
if (y>=Dst.height) break
;
        
if (y>=0
)
        {
            BlockWork.push_back(TBlockLineWork(
&
pDstLine[rcData.out_dst_x0_boder],
                rcData.out_dst_x0_in
-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-
rcData.out_dst_x0_in,
                rcData.out_dst_x1_boder
-
rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
        }
        
if (!rcData.next_clip_line_down()) break
;
        ((TUInt8
*&)pDstLine)+=
Dst.byte_width;
    }
   
    
for (long sleft=0,sright=BlockWork.size()-1;sleft<sright;++sleft,--
sright)
        std::swap(BlockWork[sleft],BlockWork[sright]);

    pDstLine
=
Dst.pdata;
    ((TUInt8
*&)pDstLine)+=(Dst.byte_width*
rcData.out_dst_up_y);
    
while (rcData.next_clip_line_up()) //to up 

    {
        
long y=
rcData.out_dst_up_y;
        
if (y<0break
;
        ((TUInt8
*&)pDstLine)-=
Dst.byte_width;
        
if (y<
Dst.height)
        {
            BlockWork.push_back(TBlockLineWork(
&
pDstLine[rcData.out_dst_x0_boder],
                rcData.out_dst_x0_in
-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-
rcData.out_dst_x0_in,
                rcData.out_dst_x1_boder
-
rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
        }
    }

    do_PicRotary_ThreeOrder_MMX_Block(BlockWork,Src,Ax_16,Ay_16);
}

 

  7.分塊處理的旋轉函數的速度測試:
//注:CPU: AMD64x2 4200+(2.37G)
////////////////////////////////////////////////////////////////////////////////
//   800x600的源圖片             各角度平均幀數    角度中最小幀數    角度中最大幀數
//==============================================================================
// PicRotarySSE2                  304.2 fps       250.7 fps       565.3 fps
// PicRotarySSE2_Block            316.6  fps      255.5 fps       495.6 fps
// PicRotaryBilInear_MMX          100.2 fps        87.8 fps       130.3 fps
// PicRotaryBilInear_MMX_Block     99.5  fps       92.7 fps       122.3 fps
// PicRotaryThreeOrder_MMX         44.2 fps        41.4 fps        49.7 fps

// PicRotaryThreeOrder_MMX_Block   43.2 fps        41.3 fps        47.8 fps
////////////////////////////////////////////////////////////////////////////////  
//  3200x2400的源圖片            各角度平均幀數    角度中最小幀數    角度中最大幀數
//==============================================================================
// PicRotarySSE2                   12.2 fps         4.6 fps        36.3 fps
// PicRotarySSE2_Block             19.2 fps        15.9 fps        33.3 fps
// PicRotaryBilInear_MMX            5.0 fps         1.1 fps         8.7 fps
// PicRotaryBilInear_MMX_Block      6.3 fps         5.8 fps         8.3 fps
// PicRotaryThreeOrder_MMX          2.6 fps         0.9 fps         3.5 fps
// PicRotaryThreeOrder_MMX_Block    2.9 fps         2.7 fps         3.4 fps
////////////////////////////////////////////////////////////////////////////////
 

//注:CPU: Intel Core2 4400(2.00G)
////////////////////////////////////////////////////////////////////////////////
//   800x600的源圖片             各角度平均幀數    角度中最小幀數    角度中最大幀數
//==============================================================================
// PicRotarySSE2                  449.3 fps       250.5 fps       753.4 fps
// PicRotarySSE2_Block            351.3 fps       219.0 fps       436.3 fps
// PicRotaryBilInear_MMX          109.5 fps        95.3 fps       132.4 fps
// PicRotaryBilInear_MMX_Block    112.2 fps        98.1 fps       124.9 fps
// PicRotaryThreeOrder_MMX         45.9 fps        41.5 fps        50.3 fps
// PicRotaryThreeOrder_MMX_Block   47.7 fps        44.8 fps        50.1 fps
////////////////////////////////////////////////////////////////////////////////  
//  3200x2400的源圖片            各角度平均幀數     角度中最小幀數     角度中最大幀數
//==============================================================================
// PicRotarySSE2                   18.3 fps        12.0 fps        44.8 fps
// PicRotarySSE2_Block             18.4 fps        15.5 fps        23.8 fps
// PicRotaryBilInear_MMX            6.7 fps         3.5 fps         8.7 fps
// PicRotaryBilInear_MMX_Block      7.3 fps         6.8 fps         8.3 fps
// PicRotaryThreeOrder_MMX          2.9 fps         2.2 fps         3.4 fps
// PicRotaryThreeOrder_MMX_Block    3.2 fps         2.9 fps         3.4 fps
////////////////////////////////////////////////////////////////////////////////
 

源圖片越大(比如6400x4800)或者CPU的緩存越小,*_Block優化版本的優勢就越明顯!


 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章