圖像轉置的Neon優化代碼
原理
圖像轉置
圖像轉置和矩陣轉置是一樣的,其公式爲:
dst.getPixels(y, x) = src.getPixels(x, y)
dst.w = src.h
dst.h = src.w
效果如下:
原圖:
結果圖:
先做圖像轉置後,再實現90度/270度的旋轉相對容易,
如圖像旋轉90度,就只需要再水平翻轉一下:
旋轉結果圖:
分而治之
圖像轉置的優化思路是:
1、將圖像分割成一系列小矩陣。
分成的小矩陣當然是越大越好,但在矩陣變大時,彙編代碼的複雜度變高,且寄存器如果使用完了也非常難處理,這裏選的是4X4的矩陣。
2、每個小矩陣的宏觀位置轉置。
3、實現每個小矩陣的內部轉置。
必須把矩陣圖和寄存器向量的關係圖畫清,然後推演一番。
Neon指令vtrn是解決轉置問題的核心。
4、邊角處理
寫慣了基於一行的Neon優化,到這步很容易犯錯,一定要記得這裏是二維的Neon優化,邊角料是兩條邊。
代碼
該代碼僅適用於32位(RGBA)圖像的轉置。
/**
 * Transpose a 32-bit (RGBA) image: dst(x, y) = src(y, x).
 *
 * @param dest_s    destination pixel buffer, 4 bytes per pixel
 * @param source_s  source pixel buffer, 4 bytes per pixel
 * @param dstw      destination row pitch in pixels (>= dw)
 * @param srcw      source row pitch in pixels
 * @param dw        destination width  (== source height)
 * @param dh        destination height (== source width)
 *
 * Strategy (NEON path): split the destination into 4x4-pixel tiles, swap each
 * tile's macroscopic position across the diagonal, and transpose each tile
 * internally with vtrn/vswp. The rows/columns that do not fill a whole tile
 * are finished by scalar per-pixel copies.
 */
static void _transpose(unsigned char* dest_s, unsigned char* source_s, int dstw, int srcw, int dw, int dh)
{
    // First destination row (ista) / column (jsta) NOT covered by the NEON
    // fast path; the scalar tail loops below start from these indices.
    int ista = 0, jsta = 0;
    const int bpp = 4; // RGBA: 4 bytes per pixel
#ifdef HAS_NEON
    /* In-tile transpose: a 4x4 pixel tile held in d1..d8 (one 32-bit pixel
       per lane, two pixels per d register):
       d1 d2   x00 x01 x02 x03
       d3 d4   x10 x11 x12 x13
       d5 d6   x20 x21 x22 x23
       d7 d8   x30 x31 x32 x33
                _||_
                \  /
                 \/
       d1 d2   x00 x10 x20 x30
       d3 d4   x01 x11 x21 x31
       d5 d6   x02 x12 x22 x32
       d7 d8   x03 x13 x23 x33
    */
    const int unit = 4; // tile edge in pixels; must be 4 to match the register layout above
    //GPCLOCK;
    int nw = dw / unit;          // whole tiles per destination row
    int nh = dh / unit;          // whole tile rows in the destination
    int srcstride = srcw * bpp;  // source row pitch in bytes
    int dststride = dstw * bpp;  // destination row pitch in bytes
    // The assembly counts its loops down with subs/bne, so it needs
    // nw - 1 >= 1 and nh - 1 >= 1. Smaller images (nw <= 1 or nh <= 1) skip
    // the fast path entirely and are handled by the scalar loops below.
    if (nw > 1 && nh > 1)
    {
        // Register usage: r5 = i (tile-row index), r4 = j (tile-column index),
        // r8 = srcstride*unit (4 source rows in bytes), r9 = dststride*unit,
        // r10 = unit*bpp = 16 (one tile row in bytes),
        // r6 / r7 = current source / destination tile address.
        // Because subs/bne exits when the counter reaches 0, the j == 0 tile
        // of each row and the whole i == 0 tile row are each handled by a
        // duplicated copy of the loop body after the corresponding branch.
        asm (
        "mov r5, #4\t\n"
        "mul r8, %[srcstride], r5\t\n"
        "mul r9, %[dststride], r5\t\n"
        "mul r10, r5, r5\t\n"// r10 = 16 = unit*bpp (r5 is 4, so r5*r5 == 4*r5)
        "movs r5, %[nh]\t\n"// i
        "sub r5, r5, #1\t\n"
        "1:\t\n"
        "sub r4, %[nw], #1\t\n"// j
        "2:\t\n"
        // Source tile: source_s + (j*unit)*srcstride + (i*unit)*bpp
        "mla r6, r4, r8, %[source_s]\t\n"
        "mla r6, r5, r10, r6\t\n"
        "vld1.32 {d1, d2}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d3, d4}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d5, d6}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d7, d8}, [r6]\t\n"
        /* Transpose the tile internally (see the diagram above). */
        "vtrn.32 d1, d3\t\n"
        "vtrn.32 d2, d4\t\n"
        "vtrn.32 d5, d7\t\n"
        "vtrn.32 d6, d8\t\n"
        "vswp d2, d5\t\n"
        "vswp d4, d7\t\n"
        // Destination tile: dest_s + (i*unit)*dststride + (j*unit)*bpp
        "mla r7, r5, r9, %[dest_s]\t\n"
        "mla r7, r4, r10, r7\t\n"
        "vst1.32 {d1, d2}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d3, d4}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d5, d6}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d7, d8}, [r7]\t\n"
        "subs r4, r4, #1\t\n"
        "bne 2b\t\n"// inner loop over tile columns (j = nw-1 .. 1)
        // Duplicated body for the j == 0 tile of this row.
        "mla r6, r4, r8, %[source_s]\t\n"
        "mla r6, r5, r10, r6\t\n"
        "vld1.32 {d1, d2}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d3, d4}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d5, d6}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d7, d8}, [r6]\t\n"
        /* Transpose the tile internally. */
        "vtrn.32 d1, d3\t\n"
        "vtrn.32 d2, d4\t\n"
        "vtrn.32 d5, d7\t\n"
        "vtrn.32 d6, d8\t\n"
        "vswp d2, d5\t\n"
        "vswp d4, d7\t\n"
        "mla r7, r5, r9, %[dest_s]\t\n"
        "mla r7, r4, r10, r7\t\n"
        "vst1.32 {d1, d2}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d3, d4}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d5, d6}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d7, d8}, [r7]\t\n"
        "subs r5, r5, #1\t\n"
        "bne 1b\t\n"// outer loop over tile rows (i = nh-1 .. 1)
        /* Last tile row (i == 0). */
        "sub r4, %[nw], #1\t\n"// j
        "4:\t\n"
        "mla r6, r4, r8, %[source_s]\t\n"
        "mla r6, r5, r10, r6\t\n"
        "vld1.32 {d1, d2}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d3, d4}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d5, d6}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d7, d8}, [r6]\t\n"
        /* Transpose the tile internally. */
        "vtrn.32 d1, d3\t\n"
        "vtrn.32 d2, d4\t\n"
        "vtrn.32 d5, d7\t\n"
        "vtrn.32 d6, d8\t\n"
        "vswp d2, d5\t\n"
        "vswp d4, d7\t\n"
        "mla r7, r5, r9, %[dest_s]\t\n"
        "mla r7, r4, r10, r7\t\n"
        "vst1.32 {d1, d2}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d3, d4}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d5, d6}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d7, d8}, [r7]\t\n"
        "subs r4, r4, #1\t\n"
        "bne 4b\t\n"// inner loop over tile columns for the last row
        // Duplicated body for the final (i == 0, j == 0) tile.
        "mla r6, r4, r8, %[source_s]\t\n"
        "mla r6, r5, r10, r6\t\n"
        "vld1.32 {d1, d2}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d3, d4}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d5, d6}, [r6]\t\n"
        "add r6, r6, %[srcstride]\t\n"
        "vld1.32 {d7, d8}, [r6]\t\n"
        /* Transpose the tile internally. */
        "vtrn.32 d1, d3\t\n"
        "vtrn.32 d2, d4\t\n"
        "vtrn.32 d5, d7\t\n"
        "vtrn.32 d6, d8\t\n"
        "vswp d2, d5\t\n"
        "vswp d4, d7\t\n"
        "mla r7, r5, r9, %[dest_s]\t\n"
        "mla r7, r4, r10, r7\t\n"
        "vst1.32 {d1, d2}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d3, d4}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d5, d6}, [r7]\t\n"
        "add r7, r7, %[dststride]\t\n"
        "vst1.32 {d7, d8}, [r7]\t\n"
        "5:\t\n"
        : [srcstride] "+r" (srcstride), [dststride] "+r" (dststride), [source_s] "+r" (source_s), [dest_s] "+r" (dest_s), [nw] "+r" (nw), [nh] "+r" (nh)
        :
        : "r4", "r5", "r6", "r7", "r8", "r9","r10", "cc","memory", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
        );
        // BUG FIX: only mark the tiled region as done when the NEON path
        // actually ran. Previously these were set unconditionally, so for
        // small images (nw <= 1 or nh <= 1) the scalar loops below skipped
        // the region [0, nh*unit) x [0, nw*unit) and left it untransposed.
        ista = nh * unit;
        jsta = nw * unit;
    }
#endif
    // Edge handling: first the destination rows below the tiled region
    // (full width), then the columns to its right (within the tiled rows).
    // Remember this is a 2-D problem: the leftover is two strips, not one.
    for (int i = ista; i < dh; ++i)
    {
        for (int j = 0; j < dw; ++j)
        {
            unsigned char* dest = dest_s + (i * dstw + j) * bpp;
            unsigned char* source = source_s + (j * srcw + i) * bpp;
            ::memcpy(dest, source, bpp * sizeof(unsigned char));
        }
    }
    for (int i = 0; i < ista; ++i)
    {
        for (int j = jsta; j < dw; ++j)
        {
            unsigned char* dest = dest_s + (i * dstw + j) * bpp;
            unsigned char* source = source_s + (j * srcw + i) * bpp;
            ::memcpy(dest, source, bpp * sizeof(unsigned char));
        }
    }
}
加速比率
大約10倍左右性能提升,數據遺失,不補。