neon 如何快速實現4x4的整數倍的resize area

/*
 * Downscale four adjacent source rows into one destination row by averaging
 * every 4x4 block of bytes.
 *
 * src1..src4  the four consecutive source rows, each with at least src_width
 *             readable bytes
 * dest        destination row; src_width / 4 bytes are written
 * src_width   number of source columns to consume; trailing columns that do
 *             not form a complete group of 4 are ignored
 *
 * Each output byte is (sum of the 16 block bytes) >> 4, i.e. the average is
 * truncated, not rounded.
 *
 * The NEON loop only handles complete 32-column chunks. Remaining complete
 * 4-column blocks go through a scalar tail, so the function never reads or
 * writes past the row ends when src_width is not a multiple of 32.
 */
static void resizeline4_32(uchar *src1, uchar *src2,uchar *src3, uchar *src4, uchar *dest,int src_width)
{
    int w = 0;

    /* Vector body: 32 source columns -> 8 destination bytes per iteration.
     * "src_width - w >= 32" cannot overflow, unlike "w + 32 <= src_width". */
    for (; src_width - w >= 32; w += 32) {
        /* Pairwise widening add: each u16 lane = sum of 2 adjacent columns. */
        uint16x8_t lo0 = vpaddlq_u8(vld1q_u8(src1));
        uint16x8_t lo1 = vpaddlq_u8(vld1q_u8(src2));
        uint16x8_t lo2 = vpaddlq_u8(vld1q_u8(src3));
        uint16x8_t lo3 = vpaddlq_u8(vld1q_u8(src4));

        uint16x8_t hi0 = vpaddlq_u8(vld1q_u8(src1 + 16));
        uint16x8_t hi1 = vpaddlq_u8(vld1q_u8(src2 + 16));
        uint16x8_t hi2 = vpaddlq_u8(vld1q_u8(src3 + 16));
        uint16x8_t hi3 = vpaddlq_u8(vld1q_u8(src4 + 16));

        /* Vertical sum of the four rows: each lane = 2 columns x 4 rows
         * (at most 8 * 255, so u16 cannot overflow). */
        uint16x8_t lo_sum = vaddq_u16(vaddq_u16(lo0, lo1), vaddq_u16(lo2, lo3));
        uint16x8_t hi_sum = vaddq_u16(vaddq_u16(hi0, hi1), vaddq_u16(hi2, hi3));

        /* Second pairwise add: each u32 lane = full 4x4 block sum. */
        uint32x4_t lo_block = vpaddlq_u16(lo_sum);
        uint32x4_t hi_block = vpaddlq_u16(hi_sum);

        /* Divide by 16 (truncating) and narrow u32 -> u16 -> u8. */
        uint16x8_t avg = vcombine_u16(vshrn_n_u32(lo_block, 4),
                                      vshrn_n_u32(hi_block, 4));
        vst1_u8(dest, vmovn_u16(avg));

        src1 += 32;
        src2 += 32;
        src3 += 32;
        src4 += 32;
        dest += 8;
    }

    /* Scalar tail: remaining complete 4-column blocks, same arithmetic. */
    for (; src_width - w >= 4; w += 4) {
        unsigned int sum = 0;
        for (int i = 0; i < 4; i++) {
            sum += (unsigned int)src1[i] + src2[i] + src3[i] + src4[i];
        }
        *dest++ = (uchar)(sum >> 4);

        src1 += 4;
        src2 += 4;
        src3 += 4;
        src4 += 4;
    }
}

/*
 * Downscale an 8-bit single-channel image by a factor of 4 in both
 * directions, averaging each 4x4 block (area resize).
 *
 * src         source image, rows stored contiguously with a stride of
 *             src_width bytes
 * dest        destination image, rows stored with a stride of dst_width bytes
 * src_width   source width in bytes
 * src_height  source height in rows
 * dst_width   destination width in bytes (expected: src_width / 4)
 * dst_height  destination height in rows (expected: src_height / 4)
 *
 * The processed area is clamped to what both buffers can hold: at most
 * src_height / 4 rows are produced, and at most dst_width * 4 source columns
 * are consumed per row. With matching dimensions the clamps have no effect.
 */
void resize_44(uchar * src, uchar * dest,int src_width,int src_height,int dst_width,int dst_height)
{
    if (src_width <= 0 || dst_width <= 0) {
        return;
    }

    /* Never produce more columns than one destination row can hold.
     * dst_width * 4 cannot overflow here because it is < src_width. */
    int width = src_width;
    if (dst_width < src_width / 4) {
        width = dst_width * 4;
    }

    /* Never read source rows that do not exist. */
    int rows = dst_height;
    if (rows > src_height / 4) {
        rows = src_height / 4;
    }

    /* Step the row pointers incrementally instead of computing
     * src_width * (h * 4 + k) in int, which can overflow on large images. */
    for (int h = 0; h < rows; h++) {
        uchar *row0 = src;
        uchar *row1 = row0 + src_width;
        uchar *row2 = row1 + src_width;
        uchar *row3 = row2 + src_width;

        resizeline4_32(row0, row1, row2, row3, dest, width);

        src = row3 + src_width;
        dest += dst_width;
    }
}

大致的思路是每次從4行中各讀取32個像素,先將相鄰像素兩兩相加,再將4行數據相加,得到每個4x4區塊的16個像素總和,然後右移4位(除以16)求取平均值,一次輸出8個像素。
 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章