轉自: http://blog.csdn.net/feixiang_john/article/details/8438658
平時我們做圖像處理或者視頻處理, 很多地方會用到矩陣轉置:
比如: DCT變換, 圖像旋轉, 圖像濾波, 以及一些數據的內存行和列的交換等, 會大量使用轉置這個動作.
然而由於數據量很大,處理速度很慢!如何來提高處理速度呢?
下面看看分析:
HEVC中有個地方是如下這樣實現(直接行和列對應的位置交換):
- Pel tmp;
- for (k=0;k<blkSize-1;k++)
- {
- for (l=k+1;l<blkSize;l++)
- {
- tmp = pDst[k*dstStride+l];
- pDst[k*dstStride+l] = pDst[l*dstStride+k];
- pDst[l*dstStride+k] = tmp;
- }
- }
如何用彙編來實現呢?
我們先用SSE彙編來實現一個8X8的矩陣轉置吧: 這裏輸入地址pSrc_128[i] 和輸出地址pDst_128[i]可以相同也可以不同:
相同的話就是原地轉置, 不同的話就是非原地轉置.
- __m128i* m_pSrc_tmp = pSrc_128[i];
- __m128i* m_pDst_tmp = pDst_128[i];
- __m128i Org_8_0,Org_8_1, Org_8_2, Org_8_3;
- __m128i tttt1,tttt2,tttt3,tttt4,tttt33,tttt44;
- __m128i tttt5,tttt6, tttt7, tttt8;
- int stride_ii = dstStride>>3;
- //one
- Org_8_0 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;
- Org_8_1 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;
- Org_8_2 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;
- Org_8_3 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;
- tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1);
- tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3);
- tttt3 = _mm_unpackhi_epi16(Org_8_0, Org_8_1);
- tttt4 = _mm_unpackhi_epi16(Org_8_2, Org_8_3);
- tttt5 = _mm_unpacklo_epi32(tttt1, tttt2);
- tttt6 = _mm_unpackhi_epi32(tttt1, tttt2);
- Org_8_0 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;;
- Org_8_1 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;
- Org_8_2 = _mm_load_si128(m_pSrc_tmp);
- m_pSrc_tmp+=8;
- Org_8_3 = _mm_load_si128(m_pSrc_tmp);
- //m_pSrc_tmp+=8;
- tttt1 = _mm_unpacklo_epi16(Org_8_0, Org_8_1);
- tttt2 = _mm_unpacklo_epi16(Org_8_2, Org_8_3);
- tttt33 = _mm_unpackhi_epi16(Org_8_0, Org_8_1);
- tttt44 = _mm_unpackhi_epi16(Org_8_2, Org_8_3);
- tttt7 = _mm_unpacklo_epi32(tttt1, tttt2);
- tttt8 = _mm_unpackhi_epi32(tttt1, tttt2);
- tttt1 = _mm_unpacklo_epi64(tttt5, tttt7);
- tttt2 = _mm_unpackhi_epi64(tttt5, tttt7);
- _mm_storeu_si128(m_pDst_tmp, tttt1);
- m_pDst_tmp+=stride_ii;
- _mm_storeu_si128(m_pDst_tmp, tttt2);
- m_pDst_tmp+=stride_ii;
- tttt5 = _mm_unpacklo_epi64(tttt6, tttt8);
- tttt7 = _mm_unpackhi_epi64(tttt6, tttt8);
- _mm_storeu_si128(m_pDst_tmp, tttt5);
- m_pDst_tmp+=stride_ii;
- _mm_storeu_si128(m_pDst_tmp, tttt7);
- m_pDst_tmp+=stride_ii;
- //two
- tttt5 = _mm_unpacklo_epi32(tttt3, tttt4);
- tttt6 = _mm_unpackhi_epi32(tttt3, tttt4);
- tttt7 = _mm_unpacklo_epi32(tttt33, tttt44);
- tttt8 = _mm_unpackhi_epi32(tttt33, tttt44);
- tttt1 = _mm_unpacklo_epi64(tttt5, tttt7);
- tttt2 = _mm_unpackhi_epi64(tttt5, tttt7);
- _mm_storeu_si128(m_pDst_tmp, tttt1);
- m_pDst_tmp+=stride_ii;
- _mm_storeu_si128(m_pDst_tmp, tttt2);
- m_pDst_tmp+=stride_ii;
- tttt5 = _mm_unpacklo_epi64(tttt6, tttt8);
- tttt7 = _mm_unpackhi_epi64(tttt6, tttt8);
- _mm_storeu_si128(m_pDst_tmp, tttt5);
- m_pDst_tmp+=stride_ii;
- _mm_storeu_si128(m_pDst_tmp, tttt7);
要實現的是NXN的轉置,如何實現呢:
基於8X8來實現NXN的塊或者圖像的轉置:
這裏先把NXN劃分爲 size_case×size_case 個8X8的塊 (其中 size_case = N/8), 然後循環調用8X8的轉置!
- __m128i* pDst_128[64];
- __m128i* pSrc_128[64];
- int size_case = (blkSize>>3);
- dstStride = dstStride_tmp;
- for(int y = 0; y<size_case; y++)//對所有8x8的塊進行地址映射
- for(int x = 0; x<size_case; x++)
- {
- pSrc_128[y*size_case + x] = (__m128i*)(pDst + 8*x + y*8*64);
- pDst_128[y*size_case + x] = (__m128i*)(rpDst + 8*y + x*8*dstStride);
- }
- size_case = size_case*size_case;
- for(int i = 0;i <size_case; i++)//開始轉置
- {
- // 此處放入上面的8x8轉置代碼 (以 pSrc_128[i] 爲輸入地址, pDst_128[i] 爲輸出地址)
- }
通過比較, 用SSE彙編優化實現轉置比用純 C代碼實現的轉置速度快5倍左右!
同樣在ARM Cortex上的彙編優化也是基於這個原理:
主要循環體代碼如下:
- VTRN.16 q8, q9
- VTRN.16 q10, q11
- VTRN.16 q4, q5
- VTRN.16 q6, q7
- VTRN.32 q8, q10
- VTRN.32 q9, q11
- VTRN.32 q4, q6
- VTRN.32 q5, q7
- VSWP d17, d8
- VSWP d19, d10
- VSWP d21, d12
- VSWP d23, d14
感興趣的可以自己調試下!
當然DSP上也是同樣的方法, 只是涉及到的指令不同而已!