圖形圖像處理-之-任意角度的高質量的快速的圖像旋轉 下篇 補充話題
[email protected] 2007.06.29
(2009.03.09 可以到這裏下載旋轉算法的完整的可以編譯的項目源代碼: http://blog.csdn.net/housisong/archive/2009/03/09/3970925.aspx )
(2007.07.15 添加對大圖片旋轉的預讀緩衝區優化版本)
tag:圖像旋轉,任意角度,圖像縮放,速度優化,定點數優化,近鄰取樣插值,二次線性插值,
三次卷積插值,MipMap鏈,三次線性插值,MMX/SSE優化,CPU緩存優化,AlphaBlend,顏色混合,並行
摘要: 該文章是《任意角度的高質量的快速的圖像旋轉》的一些高級補充話題;
給出了一個完整的Alpha混合的插值旋轉實現;並嘗試將旋轉函數並行化,從而在多核電腦上獲得更快的速度;添加優化預讀緩衝區的函數實現版本,提高超大圖片的旋轉的速度;
任意角度的高質量的快速的圖像旋轉 全文 分爲:
上篇 純軟件的任意角度的快速旋轉
中篇 高質量的旋轉
下篇 補充話題
正文:
爲了便於討論,這裏只處理32bit的ARGB顏色;
代碼使用C++;涉及到彙編優化的時候假定爲x86平臺;使用的編譯器爲vc2005;
爲了代碼的可讀性,沒有加入異常處理代碼;
測試使用的CPU爲AMD64x2 4200+(2.37G) 和 Intel Core2 4400(2.00G);
(基礎代碼參考《圖形圖像處理-之-任意角度的高質量的快速的圖像旋轉》系列前面的文章)
A:完整的Alpha混合的雙線性插值旋轉實現
《高質量的旋轉》中已經涉及到了邊界的AlphaBlend的問題,這裏順水推舟的實現一個支持全圖片Alpha通道Blend混合的雙線性插值旋轉函數;
首先給出帶完整Alpha通道的源圖片:
這張圖片是帶有8比特Alpha的32比特RGB真彩bmp圖片;
帶的Alpha通道在工具裏可能顯示不出來,單獨提取出來的圖示:
函數實現:
void BilInear_BlendBorder_MMX(const TPicRegion& pic,const long x_16,const long y_16,TARGB32* result)
{
unsigned long x0=(x_16>>16);
unsigned long y0=(y_16>>16);
TARGB32 pixel[4];
bool IsInPic;
pixel[0]=Pixels_Bound(pic,x0,y0,IsInPic);
if (!IsInPic) pixel[0].a=0;
pixel[2]=Pixels_Bound(pic,x0,y0+1,IsInPic);
if (!IsInPic) pixel[2].a=0;
pixel[1]=Pixels_Bound(pic,x0+1,y0,IsInPic);
if (!IsInPic) pixel[1].a=0;
pixel[3]=Pixels_Bound(pic,x0+1,y0+1,IsInPic);
if (!IsInPic) pixel[3].a=0;
TPicRegion npic;
npic.pdata =&pixel[0];
npic.byte_width=2*sizeof(TARGB32);
//npic.width =2;
//npic.height =2;
BilInear_Fast_MMX(npic,(unsigned short)x_16,(unsigned short)y_16,result);
}
// Renders one destination scan line with alpha blending.
//   [dst_border_x0,dst_in_x0)  leading border  : BilInear_BlendBorder_MMX + blend
//   [dst_in_x0,dst_in_x1)      interior        : BilInear_Fast_MMX, copy or blend
//   [dst_in_x1,dst_border_x1)  trailing border : BilInear_BlendBorder_MMX + blend
// (srcx0_16,srcy0_16) is the 16.16 source position of the first pixel and is
// advanced by (Ax_16,Ay_16) for every destination pixel.
void PicRotary_BilInear_BlendLine_MMX(TARGB32* pDstLine,long dst_border_x0,long dst_in_x0,long dst_in_x1,long dst_border_x1,
                        const TPicRegion& SrcPic,long srcx0_16,long srcy0_16,long Ax_16,long Ay_16)
{
    long col;

    // Leading border: fully transparent samples leave the destination untouched.
    for (col=dst_border_x0;col<dst_in_x0;++col,srcx0_16+=Ax_16,srcy0_16+=Ay_16)
    {
        TARGB32 sample;
        BilInear_BlendBorder_MMX(SrcPic,srcx0_16,srcy0_16,&sample);
        if (sample.a>0)
            pDstLine[col]=AlphaBlend_MMX(pDstLine[col],sample);
    }

    // Interior: opaque samples overwrite, translucent ones are blended.
    for (col=dst_in_x0;col<dst_in_x1;++col,srcx0_16+=Ax_16,srcy0_16+=Ay_16)
    {
        TARGB32 sample;
        BilInear_Fast_MMX(SrcPic,srcx0_16,srcy0_16,&sample);
        if (sample.a==255)
        {
            pDstLine[col]=sample;
            continue;
        }
        if (sample.a>0)
            pDstLine[col]=AlphaBlend_MMX(pDstLine[col],sample);
    }

    // Trailing border: same handling as the leading border.
    for (col=dst_in_x1;col<dst_border_x1;++col,srcx0_16+=Ax_16,srcy0_16+=Ay_16)
    {
        TARGB32 sample;
        BilInear_BlendBorder_MMX(SrcPic,srcx0_16,srcy0_16,&sample);
        if (sample.a>0)
            pDstLine[col]=AlphaBlend_MMX(pDstLine[col],sample);
    }

    asm emms
}
void PicRotaryBlendBilInear_MMX(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的縮放比例認爲已經不可見
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)爲旋轉中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y,1)) return;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
PicRotary_BilInear_BlendLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
PicRotary_BilInear_BlendLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
}
}
效果圖:
B:在雙核上並行三次卷積插值旋轉的一個簡單實現
(假設圖片旋轉繪製到目的圖片的中間)
這裏利用CWorkThreadPool來並行執行任務;
(參見我的文章《並行計算簡介和多核CPU編程Demo》,裏面有CWorkThreadPool類的完整源代碼)
最容易想到的方案就是分成上下兩部分分別調用PicRotaryThreeOrder_MMX,從而並行執行;
{
const TPicRegion* Dst;
const TPicRegion* Src;
double RotaryAngle;
double ZoomX;
double ZoomY;
double move_x;
double move_y;
};
void RotaryThreeOrder_callback(void* wd)
{
TRotaryThreeOrder_WorkData* WorkData=(TRotaryThreeOrder_WorkData*)wd;
PicRotaryThreeOrder_MMX(*WorkData->Dst,*WorkData->Src,WorkData->RotaryAngle,WorkData->ZoomX,WorkData->ZoomY,WorkData->move_x,WorkData->move_y);
}
void PicRotaryThreeOrder_MMX_parallel2(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
TRotaryThreeOrder_WorkData work_list[2];
TRotaryThreeOrder_WorkData* pwork_list[2];
for (long i=0;i<2;++i)
{
work_list[i].Src=&Src;
work_list[i].RotaryAngle=RotaryAngle;
work_list[i].ZoomX=ZoomX;
work_list[i].ZoomY=ZoomY;
work_list[i].move_x=move_x;
work_list[i].move_y=move_y;
pwork_list[i]=&work_list[i];
}
TPicRegion dst_up=Dst;
dst_up.height=Dst.height/2;
work_list[0].Dst=&dst_up;
TPicRegion dst_down=Dst;
dst_down.pdata=&Pixels(Dst,0,dst_up.height);
dst_down.height=Dst.height-dst_up.height;
work_list[1].Dst=&dst_down;
work_list[1].move_y=move_y-Dst.height/2;
CWorkThreadPool::work_execute(RotaryThreeOrder_callback,(void**)&pwork_list,2);
}
//注:測試圖片都是800*600的圖片旋轉到1004*1004的圖片中心 測試成績取各個旋轉角度的平均速度值
//////////////////////////////////////////////////////////////////////////////////
//速度測試: CPU: AMD64x2 4200+
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel2 87.6 fps
//==============================================================================
//////////////////////////////////////////////////////////////////////////////////
//速度測試: CPU: Intel Core2 4400(2.00G)
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel2 89.3 fps
////////////////////////////////////////////////////////////////////////////////
並行化的實現PicRotaryThreeOrder_MMX_parallel2比PicRotaryThreeOrder_MMX的44.2fps快了98.2%! (Intel Core2 4400上快了94.6%)
在雙核CPU上執行速度幾乎是單核上的2倍!
B':一個通用的針對任意多核並行的一個簡單實現
有了上面的並行基礎,我們來實現一個更加通用一些的版本;根據CPU核心數來動態分配任務;
實現方式爲直接按照掃描行來分配(但這樣處理可能不利於內存的高效訪問),就懶得去估算任務量了:)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的縮放比例認爲已經不可見
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)爲旋轉中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y,2)) return;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
long run_part_i=0;
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
if (run_part_i%part_count==part_i)
PicRotary_ThreeOrder_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
++run_part_i;
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
if (run_part_i%part_count==part_i)
PicRotary_ThreeOrder_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
++run_part_i;
}
}
}
// Argument bundle for one PicRotaryThreeOrder_MMX_part task; every field is
// forwarded as-is by RotaryThreeOrder_part_callback.
struct TRotaryThreeOrder_part_WorkData
{
const TPicRegion* Dst; // destination picture (the same one for every task)
const TPicRegion* Src; // source picture
double RotaryAngle; // rotation angle
double ZoomX; // scale along x
double ZoomY; // scale along y
double move_x; // translation along x
double move_y; // translation along y
long part_i; // index of this task
long part_count; // total number of tasks
};
void RotaryThreeOrder_part_callback(void* wd)
{
TRotaryThreeOrder_part_WorkData* WorkData=(TRotaryThreeOrder_part_WorkData*)wd;
PicRotaryThreeOrder_MMX_part(*WorkData->Dst,*WorkData->Src,WorkData->RotaryAngle,WorkData->ZoomX,WorkData->ZoomY,
WorkData->move_x,WorkData->move_y,WorkData->part_i,WorkData->part_count);
}
void PicRotaryThreeOrder_MMX_parallel(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
long work_count=CWorkThreadPool::best_work_count();
std::vector<TRotaryThreeOrder_part_WorkData> work_list(work_count);
std::vector<TRotaryThreeOrder_part_WorkData*> pwork_list(work_count);
long i;
for (i=0;i<work_count;++i)
{
work_list[i].Dst=&Dst;
work_list[i].Src=&Src;
work_list[i].RotaryAngle=RotaryAngle;
work_list[i].ZoomX=ZoomX;
work_list[i].ZoomY=ZoomY;
work_list[i].move_x=move_x;
work_list[i].move_y=move_y;
work_list[i].part_i=i;
work_list[i].part_count=work_count;
pwork_list[i]=&work_list[i];
}
CWorkThreadPool::work_execute(RotaryThreeOrder_part_callback,(void**)&pwork_list[0],work_count);
}
//注:測試圖片都是800*600的圖片旋轉到1004*1004的圖片中心 測試成績取各個旋轉角度的平均速度值
//////////////////////////////////////////////////////////////////////////////////
//速度測試: CPU: AMD64x2 4200+
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel 81.0 fps
////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
//速度測試: CPU: Intel Core2 4400(2.00G)
//==============================================================================
// PicRotaryThreeOrder_MMX_parallel 89.5 fps
////////////////////////////////////////////////////////////////////////////////
這個實現能應付大多數時候的並行需求了,包括以後的4核8核...
(注意:這裏的並行任務分割方案僅僅是簡單的舉例(用了代碼改動最小的方案),你應該根據你的需求來更好的並行化你的任務; 如果分割後的單個任務太小,並行的優勢可能就體現不出來,甚至於更慢;)
C:超大圖片旋轉優化
1.使用PicRotary*、PicRotaryBilInear*、PicRotaryThreeOrder*等函數在旋轉大圖片的時候,會出現一個速度變慢問題:就是旋轉不同的角度,速度差異巨大(甚至達到8倍以上!)
速度測試:
//注:CPU: AMD64x2 4200+(2.37G)
////////////////////////////////////////////////////////////////////////////////
// 800x600的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 304.2 fps 250.7 fps 565.3 fps
// PicRotaryBilInear_MMX 100.2 fps 87.8 fps 130.3 fps
// PicRotaryThreeOrder_MMX 44.2 fps 41.4 fps 49.7 fps
////////////////////////////////////////////////////////////////////////////////
// 3200x2400的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 12.2 fps 4.6 fps 36.3 fps
// PicRotaryBilInear_MMX 5.0 fps 1.1 fps 8.7 fps
// PicRotaryThreeOrder_MMX 2.6 fps 0.9 fps 3.5 fps
////////////////////////////////////////////////////////////////////////////////
//注:CPU: Intel Core2 4400(2.00G)
////////////////////////////////////////////////////////////////////////////////
// 800x600的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 449.3 fps 250.5 fps 753.4 fps
// PicRotaryBilInear_MMX 109.5 fps 95.3 fps 132.4 fps
// PicRotaryThreeOrder_MMX 45.9 fps 41.5 fps 50.3 fps
////////////////////////////////////////////////////////////////////////////////
// 3200x2400的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 18.3 fps 12.0 fps 44.8 fps
// PicRotaryBilInear_MMX 6.7 fps 3.5 fps 8.7 fps
// PicRotaryThreeOrder_MMX 2.9 fps 2.2 fps 3.4 fps
////////////////////////////////////////////////////////////////////////////////
在我的AMD64x2 4200+ CPU上,800x600的源圖片對旋轉的角度不是很敏感,但當源圖片爲3200x2400的時候,最小速度和平均速度差異巨大;(Intel Core2 4400上稍好)
2.先來分析一下問題出現的原因,對於某些角度(比如90度和270度),按以前的函數實現,訪問源圖片內存的方式將是列方向的,當圖片比較大的時候,內存的讀取訪問將變得非常低效;一般CPU訪問內存的時候都會一次性讀取連續相鄰的64字節放到緩存,但很明顯對於某些角度,預讀的大部分數據都沒有用(甚至只使用了其中的4個字節,浪費了60字節完全沒有用就被新的數據擠出了緩存);對於小緩存的CPU和較大的源圖片,這種情況會更嚴重;
能想到的一些解決方案:a.使用CPU的預讀指令來手工預讀數據,但是沒有辦法指定預讀的內存塊大小從而避免帶寬浪費;而且以後的硬件趨勢也只會朝一次讀取更大的塊發展,所以該方案不可行;b.針對不同的角度方向分別編碼,使讀取的內存方向儘量按行方向(從而使預讀生效),比如靠近90度旋轉的時候寫內存的方向將變爲列方向,因爲寫內存指令中可以禁止寫緩存,應該可以降低列寫入帶來的性能損失;該方案還有一個缺點是代碼編寫稍嫌麻煩:) c.利用內存訪問的局部性來使緩存的數據有效,就是分成小塊來處理旋轉算法,使內存訪問在任何角度時都有相關性;
3.分塊局部性旋轉算法的實現方案;
以前的掃描算法圖示 新的分塊掃描算法圖示
爲了實現新的掃描路徑,有一個簡單的改進辦法:把以前的掃描行(起始和結束位置)先保存起來,然後再處理這些掃描行;代碼就很簡單了,如下:
(沒有給出的基礎代碼參見該系列的其他文章)
4.近鄰取樣插值的分塊掃描函數實現PicRotarySSE2_Block
{
public:
TARGB32* pdst;
long width_border0;
long width_in;
long width_border1;
long src_x0_16;
long src_y0_16;
TBlockLineWork(TARGB32* _pdst,long _width_in,long _src_x0_16,long _src_y0_16)
:pdst(_pdst),width_in(_width_in),src_x0_16(_src_x0_16),src_y0_16(_src_y0_16),width_border0(0),width_border1(0) {}
TBlockLineWork(TARGB32* _pdst,long _width_border0,long _width_in,long _width_border1,long _src_x0_16,long _src_y0_16)
:pdst(_pdst),width_in(_width_in),src_x0_16(_src_x0_16),src_y0_16(_src_y0_16),width_border0(_width_border0),width_border1(_width_border1) {}
};
typedef std::vector<TBlockLineWork> TBlockWork;
//分小塊遍歷
// Block-wise traversal for nearest-neighbour rotation.
// The recorded scan lines are processed in bands of up to 64 lines; inside a
// band each pass draws at most 64 pixels of every unfinished line, and passes
// repeat until no line in the band has interior pixels left.
// BlockWork is consumed: pdst, width_in and the source position of each line
// are advanced as segments are drawn.
void do_PicRotarySSE2_Block(TBlockWork& BlockWork,const TPicRegion& Src,long Ax_16,long Ay_16)
{
    // 64x64 blocks worked well in the author's tests; other sizes may be tried.
    const long kBlockW=64;
    const long kBlockH=kBlockW;

    const long lineCount=BlockWork.size();
    for (long bandTop=0;bandTop<lineCount;bandTop+=kBlockH)
    {
        const long linesLeft=lineCount-bandTop;
        const long bandEnd=bandTop+((kBlockH<=linesLeft)?kBlockH:linesLeft);

        bool didWork;
        do
        {
            didWork=false;
            for (long row=bandTop;row<bandEnd;++row)
            {
                TBlockLineWork& line=BlockWork[row];
                if (line.width_in<=0) continue;
                didWork=true;

                long run=line.width_in;
                if (run>kBlockW) run=kBlockW;
                PicRotarySSE2_CopyLine(line.pdst,run,Ax_16,Ay_16,
                                       line.src_x0_16,line.src_y0_16,Src);

                // Advance the line past the segment just drawn.
                line.pdst+=run;
                line.width_in-=run;
                line.src_x0_16+=(Ax_16*run);
                line.src_y0_16+=(Ay_16*run);
            }
        } while (didWork);
    }
}
void PicRotarySSE2_Block(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的縮放比例認爲已經不可見
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)爲旋轉中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y,0)) return;
TBlockWork BlockWork;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
long x0=rcData.out_dst_x0_in;
BlockWork.push_back(TBlockLineWork(&pDstLine[x0],rcData.out_dst_x1_in-x0,rcData.out_src_x0_16,rcData.out_src_y0_16));
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
for (long sleft=0,sright=BlockWork.size()-1;sleft<sright;++sleft,--sright)
std::swap(BlockWork[sleft],BlockWork[sright]);
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
long x0=rcData.out_dst_x0_in;
BlockWork.push_back(TBlockLineWork(&pDstLine[x0],rcData.out_dst_x1_in-x0,rcData.out_src_x0_16,rcData.out_src_y0_16));
}
}
do_PicRotarySSE2_Block(BlockWork,Src,Ax_16,Ay_16);
asm sfence //刷新寫入
}
5.二次線性插值的分塊掃描函數實現PicRotaryBilInear_MMX_Block
這裏比近鄰取樣的相應函數多出一些邊界處理代碼
long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion& SrcPic)
{
for (long x=0;x<width;++x)
{
BilInear_Fast_MMX(SrcPic,srcx0_16,srcy0_16,&pDstLine[x]);
srcx0_16+=Ax_16;
srcy0_16+=Ay_16;
}
}
// Draws `width` border pixels of one scan line: each pixel is sampled with
// BilInear_Border_MMX and alpha-blended onto the existing destination pixel.
// The 16.16 source position starts at (srcx0_16,srcy0_16) and advances by
// (Ax_16,Ay_16) per pixel.
inline void PicRotary_BilInear_CopyLine_Border_MMX(TARGB32* pDstLine,long width,
                        long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion& SrcPic)
{
    TARGB32* const lineEnd=pDstLine+width;
    for (TARGB32* dst=pDstLine;dst<lineEnd;++dst,srcx0_16+=Ax_16,srcy0_16+=Ay_16)
    {
        TARGB32 sample;
        BilInear_Border_MMX(SrcPic,srcx0_16,srcy0_16,&sample);
        *dst=AlphaBlend_MMX(*dst,sample);
    }
}
// Block-wise traversal for bilinear rotation.
// The recorded scan lines are processed in bands of up to 64 lines. Per band:
//   1. the leading border segment of every line is drawn,
//   2. the interiors are drawn in passes of at most 64 pixels per line until
//      no line in the band has interior pixels left,
//   3. the trailing border segment of every line is drawn.
// BlockWork is consumed: each line's pdst, width_in and source position are
// advanced as segments are drawn.
void do_PicRotary_BilInear_MMX_Block(TBlockWork& BlockWork,const TPicRegion& Src,long Ax_16,long Ay_16)
{
    const long kBlockW=64; //128
    const long kBlockH=kBlockW;

    const long lineCount=BlockWork.size();
    for (long bandTop=0;bandTop<lineCount;bandTop+=kBlockH)
    {
        const long linesLeft=lineCount-bandTop;
        const long bandEnd=bandTop+((kBlockH<=linesLeft)?kBlockH:linesLeft);
        long row;

        // 1. leading borders
        for (row=bandTop;row<bandEnd;++row)
        {
            TBlockLineWork& line=BlockWork[row];
            long run=line.width_border0;
            if (run<=0) continue;
            PicRotary_BilInear_CopyLine_Border_MMX(line.pdst,run,Ax_16,Ay_16,
                                                   line.src_x0_16,line.src_y0_16,Src);
            line.pdst+=run;
            line.src_x0_16+=(Ax_16*run);
            line.src_y0_16+=(Ay_16*run);
        }

        // 2. interiors, one block-wide segment per line per pass
        bool didWork;
        do
        {
            didWork=false;
            for (row=bandTop;row<bandEnd;++row)
            {
                TBlockLineWork& line=BlockWork[row];
                if (line.width_in<=0) continue;
                didWork=true;

                long run=line.width_in;
                if (run>kBlockW) run=kBlockW;
                PicRotary_BilInear_CopyLine_Fast_MMX(line.pdst,run,Ax_16,Ay_16,
                                                     line.src_x0_16,line.src_y0_16,Src);
                line.pdst+=run;
                line.width_in-=run;
                line.src_x0_16+=(Ax_16*run);
                line.src_y0_16+=(Ay_16*run);
            }
        } while (didWork);

        // 3. trailing borders; the line is finished afterwards, so its state
        //    is not advanced any further.
        for (row=bandTop;row<bandEnd;++row)
        {
            TBlockLineWork& line=BlockWork[row];
            long run=line.width_border1;
            if (run<=0) continue;
            PicRotary_BilInear_CopyLine_Border_MMX(line.pdst,run,Ax_16,Ay_16,
                                                   line.src_x0_16,line.src_y0_16,Src);
        }
    }
    asm emms
}
void PicRotaryBilInear_MMX_Block(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的縮放比例認爲已經不可見
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)爲旋轉中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y,1)) return;
TBlockWork BlockWork;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
BlockWork.push_back(TBlockLineWork(&pDstLine[rcData.out_dst_x0_boder],
rcData.out_dst_x0_in-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-rcData.out_dst_x0_in,
rcData.out_dst_x1_boder-rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
for (long sleft=0,sright=BlockWork.size()-1;sleft<sright;++sleft,--sright)
std::swap(BlockWork[sleft],BlockWork[sright]);
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
BlockWork.push_back(TBlockLineWork(&pDstLine[rcData.out_dst_x0_boder],
rcData.out_dst_x0_in-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-rcData.out_dst_x0_in,
rcData.out_dst_x1_boder-rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
}
}
do_PicRotary_BilInear_MMX_Block(BlockWork,Src,Ax_16,Ay_16);
}
6.三次卷積插值的分塊掃描函數實現PicRotaryThreeOrder_MMX_Block
幾乎就是拷貝PicRotaryBilInear_MMX_Block,然後稍做改動;
(實際項目中的代碼和文章中的代碼還是有很多不同的,要是實際代碼中也有這麼多長篇長篇的拷貝然後稍作修改的代碼,那就要瘋了:)
long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion& SrcPic)
{
for (long x=0;x<width;++x)
{
ThreeOrder_Fast_MMX(SrcPic,srcx0_16,srcy0_16,&pDstLine[x]);
srcx0_16+=Ax_16;
srcy0_16+=Ay_16;
}
}
// Draws `width` border pixels of one scan line: each pixel is sampled with
// ThreeOrder_Border_MMX and alpha-blended onto the existing destination
// pixel. The 16.16 source position starts at (srcx0_16,srcy0_16) and
// advances by (Ax_16,Ay_16) per pixel.
inline void PicRotary_ThreeOrder_CopyLine_Border_MMX(TARGB32* pDstLine,long width,
                        long Ax_16,long Ay_16,long srcx0_16,long srcy0_16,const TPicRegion& SrcPic)
{
    TARGB32* const lineEnd=pDstLine+width;
    for (TARGB32* dst=pDstLine;dst<lineEnd;++dst,srcx0_16+=Ax_16,srcy0_16+=Ay_16)
    {
        TARGB32 sample;
        ThreeOrder_Border_MMX(SrcPic,srcx0_16,srcy0_16,&sample);
        *dst=AlphaBlend_MMX(*dst,sample);
    }
}
// Block-wise traversal for rotation using the ThreeOrder_* samplers.
// The recorded scan lines are processed in bands of up to 64 lines. Per band:
//   1. the leading border segment of every line is drawn,
//   2. the interiors are drawn in passes of at most 64 pixels per line until
//      no line in the band has interior pixels left,
//   3. the trailing border segment of every line is drawn.
// BlockWork is consumed: each line's pdst, width_in and source position are
// advanced as segments are drawn.
void do_PicRotary_ThreeOrder_MMX_Block(TBlockWork& BlockWork,const TPicRegion& Src,long Ax_16,long Ay_16)
{
    const long kBlockW=64; //128
    const long kBlockH=kBlockW;

    const long lineCount=BlockWork.size();
    for (long bandTop=0;bandTop<lineCount;bandTop+=kBlockH)
    {
        const long linesLeft=lineCount-bandTop;
        const long bandEnd=bandTop+((kBlockH<=linesLeft)?kBlockH:linesLeft);
        long row;

        // 1. leading borders
        for (row=bandTop;row<bandEnd;++row)
        {
            TBlockLineWork& line=BlockWork[row];
            long run=line.width_border0;
            if (run<=0) continue;
            PicRotary_ThreeOrder_CopyLine_Border_MMX(line.pdst,run,Ax_16,Ay_16,
                                                     line.src_x0_16,line.src_y0_16,Src);
            line.pdst+=run;
            line.src_x0_16+=(Ax_16*run);
            line.src_y0_16+=(Ay_16*run);
        }

        // 2. interiors, one block-wide segment per line per pass
        bool didWork;
        do
        {
            didWork=false;
            for (row=bandTop;row<bandEnd;++row)
            {
                TBlockLineWork& line=BlockWork[row];
                if (line.width_in<=0) continue;
                didWork=true;

                long run=line.width_in;
                if (run>kBlockW) run=kBlockW;
                PicRotary_ThreeOrder_CopyLine_Fast_MMX(line.pdst,run,Ax_16,Ay_16,
                                                       line.src_x0_16,line.src_y0_16,Src);
                line.pdst+=run;
                line.width_in-=run;
                line.src_x0_16+=(Ax_16*run);
                line.src_y0_16+=(Ay_16*run);
            }
        } while (didWork);

        // 3. trailing borders; the line is finished afterwards, so its state
        //    is not advanced any further.
        for (row=bandTop;row<bandEnd;++row)
        {
            TBlockLineWork& line=BlockWork[row];
            long run=line.width_border1;
            if (run<=0) continue;
            PicRotary_ThreeOrder_CopyLine_Border_MMX(line.pdst,run,Ax_16,Ay_16,
                                                     line.src_x0_16,line.src_y0_16,Src);
        }
    }
    asm emms
}
void PicRotaryThreeOrder_MMX_Block(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的縮放比例認爲已經不可見
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)爲旋轉中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y,2)) return;
TBlockWork BlockWork;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
BlockWork.push_back(TBlockLineWork(&pDstLine[rcData.out_dst_x0_boder],
rcData.out_dst_x0_in-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-rcData.out_dst_x0_in,
rcData.out_dst_x1_boder-rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
for (long sleft=0,sright=BlockWork.size()-1;sleft<sright;++sleft,--sright)
std::swap(BlockWork[sleft],BlockWork[sright]);
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
BlockWork.push_back(TBlockLineWork(&pDstLine[rcData.out_dst_x0_boder],
rcData.out_dst_x0_in-rcData.out_dst_x0_boder,rcData.out_dst_x1_in-rcData.out_dst_x0_in,
rcData.out_dst_x1_boder-rcData.out_dst_x1_in,rcData.out_src_x0_16,rcData.out_src_y0_16));
}
}
do_PicRotary_ThreeOrder_MMX_Block(BlockWork,Src,Ax_16,Ay_16);
}
7.分塊處理的旋轉函數的速度測試:
//注:CPU: AMD64x2 4200+(2.37G)
////////////////////////////////////////////////////////////////////////////////
// 800x600的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 304.2 fps 250.7 fps 565.3 fps
// PicRotarySSE2_Block 316.6 fps 255.5 fps 495.6 fps
// PicRotaryBilInear_MMX 100.2 fps 87.8 fps 130.3 fps
// PicRotaryBilInear_MMX_Block 99.5 fps 92.7 fps 122.3 fps
// PicRotaryThreeOrder_MMX 44.2 fps 41.4 fps 49.7 fps
// PicRotaryThreeOrder_MMX_Block 43.2 fps 41.3 fps 47.8 fps
////////////////////////////////////////////////////////////////////////////////
// 3200x2400的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 12.2 fps 4.6 fps 36.3 fps
// PicRotarySSE2_Block 19.2 fps 15.9 fps 33.3 fps
// PicRotaryBilInear_MMX 5.0 fps 1.1 fps 8.7 fps
// PicRotaryBilInear_MMX_Block 6.3 fps 5.8 fps 8.3 fps
// PicRotaryThreeOrder_MMX 2.6 fps 0.9 fps 3.5 fps
// PicRotaryThreeOrder_MMX_Block 2.9 fps 2.7 fps 3.4 fps
////////////////////////////////////////////////////////////////////////////////
//注:CPU: Intel Core2 4400(2.00G)
////////////////////////////////////////////////////////////////////////////////
// 800x600的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 449.3 fps 250.5 fps 753.4 fps
// PicRotarySSE2_Block 351.3 fps 219.0 fps 436.3 fps
// PicRotaryBilInear_MMX 109.5 fps 95.3 fps 132.4 fps
// PicRotaryBilInear_MMX_Block 112.2 fps 98.1 fps 124.9 fps
// PicRotaryThreeOrder_MMX 45.9 fps 41.5 fps 50.3 fps
// PicRotaryThreeOrder_MMX_Block 47.7 fps 44.8 fps 50.1 fps
////////////////////////////////////////////////////////////////////////////////
// 3200x2400的源圖片 各角度平均幀數 角度中最小幀數 角度中最大幀數
//==============================================================================
// PicRotarySSE2 18.3 fps 12.0 fps 44.8 fps
// PicRotarySSE2_Block 18.4 fps 15.5 fps 23.8 fps
// PicRotaryBilInear_MMX 6.7 fps 3.5 fps 8.7 fps
// PicRotaryBilInear_MMX_Block 7.3 fps 6.8 fps 8.3 fps
// PicRotaryThreeOrder_MMX 2.9 fps 2.2 fps 3.4 fps
// PicRotaryThreeOrder_MMX_Block 3.2 fps 2.9 fps 3.4 fps
////////////////////////////////////////////////////////////////////////////////
測試更大的源圖片(6400x4800)或者CPU的緩存越小的時候*_Block優化版本優勢越明顯!