/*
 * 4x4 area-average downscale of an 8-bit, single-channel image with NEON.
 *
 * General idea: process 32 values of a row at a time, add the four rows of
 * data together, and then take the average.
 *
 * NOTE(review): assumes `uchar` is an 8-bit unsigned type compatible with
 * uint8_t (the NEON load/store intrinsics below require that) -- confirm
 * against its typedef.
 */

/*
 * Sum 16 horizontally adjacent pixels from each of four rows into four 4x4
 * block sums and return the four block means (sum >> 4) as 16-bit lanes.
 *
 * The shift is a plain narrowing shift, so the mean is truncated.
 * NOTE(review): if round-to-nearest is wanted, vrshrn_n_u32 would be the
 * rounding counterpart -- confirm the intended behaviour before changing.
 */
static inline uint16x4_t resize44_mean_16px(const uchar *r0, const uchar *r1,
                                            const uchar *r2, const uchar *r3)
{
    /* Pairwise widening add: 16 x u8 -> 8 x u16 (each lane = 2 pixels). */
    uint16x8_t pairs0 = vpaddlq_u8(vld1q_u8(r0));
    uint16x8_t pairs1 = vpaddlq_u8(vld1q_u8(r1));
    uint16x8_t pairs2 = vpaddlq_u8(vld1q_u8(r2));
    uint16x8_t pairs3 = vpaddlq_u8(vld1q_u8(r3));

    /* Vertical add: each lane = 2x4 pixels, at most 8 * 255, fits in u16. */
    uint16x8_t cols = vaddq_u16(vaddq_u16(pairs0, pairs1),
                                vaddq_u16(pairs2, pairs3));

    /* Second pairwise widening add: each u32 lane = one full 4x4 block. */
    uint32x4_t blocks = vpaddlq_u16(cols);

    /* Divide by 16 and narrow back to 16 bits. */
    return vshrn_n_u32(blocks, 4);
}

/*
 * Reduce four vertically adjacent source rows to one destination row.
 *
 * src1..src4  the four source rows
 * dest        destination row; receives src_width / 4 pixels
 * src_width   number of source pixels to consume from each row
 *
 * Complete 32-pixel chunks go through the NEON path (8 outputs each); any
 * remaining complete 4-pixel groups are handled by a scalar loop so that no
 * byte beyond src_width is read and no output beyond src_width / 4 is
 * written.  A trailing remainder of fewer than 4 pixels is ignored.
 */
static void resizeline4_32(uchar *src1, uchar *src2,uchar *src3, uchar *src4, uchar *dest,int src_width)
{
    int w = 0;

    /* Vector path: 32 source pixels -> 8 destination pixels. */
    for (; src_width - w >= 32; w += 32) {
        uint16x4_t lo = resize44_mean_16px(src1, src2, src3, src4);
        uint16x4_t hi = resize44_mean_16px(src1 + 16, src2 + 16,
                                           src3 + 16, src4 + 16);

        /* Every mean is <= 255, so narrowing to u8 loses nothing. */
        vst1_u8(dest, vmovn_u16(vcombine_u16(lo, hi)));

        src1 += 32;
        src2 += 32;
        src3 += 32;
        src4 += 32;
        dest += 8;
    }

    /* Scalar tail: same truncated mean, one 4x4 block at a time. */
    for (; src_width - w >= 4; w += 4) {
        unsigned sum = 0;

        for (int i = 0; i < 4; i++) {
            sum += (unsigned)src1[i] + src2[i] + src3[i] + src4[i];
        }
        *dest++ = (uchar)(sum >> 4);

        src1 += 4;
        src2 += 4;
        src3 += 4;
        src4 += 4;
    }
}

/*
 * Downscale an image by a factor of 4 in both directions by averaging each
 * 4x4 block of source pixels.
 *
 * src         source pixels, rows laid out src_width bytes apart
 * dest        destination pixels, rows laid out dst_width bytes apart
 * src_width   source row length in pixels
 * src_height  number of source rows
 * dst_width   destination row length in pixels
 * dst_height  number of destination rows requested
 *
 * Only the region that both images can hold is processed: at most
 * src_height / 4 rows and at most src_width / 4 pixels per row.  Destination
 * pixels outside that region are left untouched.  Nothing is done for NULL
 * pointers or non-positive dimensions.
 */
void resize_44(uchar * src, uchar * dest,int src_width,int src_height,int dst_width,int dst_height)
{
    if (!src || !dest || src_width <= 0 || src_height <= 0 ||
        dst_width <= 0 || dst_height <= 0) {
        return;
    }

    /* Clamp to what the source can actually supply. */
    int rows = src_height / 4;
    if (rows > dst_height) {
        rows = dst_height;
    }
    int out_cols = src_width / 4;
    if (out_cols > dst_width) {
        out_cols = dst_width;
    }

    /* Step the row pointers instead of multiplying, so no int offset can
     * overflow on large images. */
    uchar *row0 = src;
    uchar *out = dest;

    for (int h = 0; h < rows; h++) {
        uchar *row1 = row0 + src_width;
        uchar *row2 = row1 + src_width;
        uchar *row3 = row2 + src_width;

        resizeline4_32(row0, row1, row2, row3, out, out_cols * 4);

        row0 = row3 + src_width;
        out += dst_width;
    }
}
neon 如何快速實現4x4的整數倍的resize area
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.