搭建實驗環境
首先新建一個包含native代碼的項目:
然後在gradle中添加對neon的支持:
// Native-build options that Gradle hands to CMake for this module.
externalNativeBuild {
cmake {
// Compile the C++ sources as C++14.
cppFlags "-std=c++14"
// CMake argument that switches on NEON support for the ARM build.
arguments "-DANDROID_ARM_NEON=TRUE"
}
}
這樣,項目就可以支持neon加速了。
小試牛刀
一個最簡單的neon編程的流程大致是這樣的:
1、裝載數據到neon寄存器
2、執行運算
3、從neon寄存器中把結果寫回內存。
沒有例子不知從何說起,先上一個超級簡單的例子吧:
#include <jni.h>
#include <string>
#include <arm_neon.h>
#include <android/log.h>
#define LOG_TAG "TEST_NEON"
#define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__)
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
extern "C"{
void test()
{
int16_t result[8];
int8x8_t a = vdup_n_s8(121);
int8x8_t b = vdup_n_s8(2);
int16x8_t c;
c = vmull_s8(a,b);
vst1q_s16(result,c);
for(int i=0;i<8;i++){
LOGD("data[%d] is %d ",i,result[i]);
}
}
/**
 * JNI entry point: runs the NEON smoke test, then returns a greeting string
 * to the Java caller.
 */
JNIEXPORT jstring
JNICALL
Java_com_example_javer_myapplication_MainActivity_stringFromJNI(
        JNIEnv *env,
        jobject /* this */) {
    const std::string greeting("Hello from C++");
    test();
    const char *utf8 = greeting.c_str();
    return env->NewStringUTF(utf8);
}
}
執行結果:
09-07 12:03:08.335 11709-11709/? D/TEST_NEON:
data[0] is 242
data[1] is 242
data[2] is 242
data[3] is 242
data[4] is 242
data[5] is 242
data[6] is 242
data[7] is 242
代碼中,test函數中實現了兩個64位neon寄存器的乘法。
vdup是數據複製指令,這裏把121這個8位的數複製到一個64位的寄存器中,64位能存放8個8位的數,因此,此時a指向的neon寄存器存放了8個121。兩個8位的數相乘,結果可能是16位的,因此,結果需要用一個128位的寄存器來保存。int16x8_t就表示的是一個128位的寄存器。
vmull_s8把a,b相乘,並將結果保存在c中。c指向的是neon的128位寄存器,因此,我們需要把結果寫回內存。
vst1q_s16把c中的數據寫回result指向的內存中。
這是一個簡單的測試neon指令的代碼,通過這個代碼我們能清晰的認識到neon加速的原理:一次裝載8個8位的數到64位寄存器,一條指令就能完成兩組各8個8位數據的逐項相乘。這樣效率不就接近提升8倍麼?當然沒有這麼理想,畢竟裝載數據和寫回數據也是需要時間的。
實戰嘗試
接下來,嘗試一個比較簡單的rgb轉灰度圖的code:
/**
 * Scalar RGB -> grayscale conversion.
 *
 * Reads n interleaved R,G,B byte triples from src and writes one gray byte
 * per pixel to dest, computed as (77*R + 151*G + 28*B) >> 8. The three
 * weights add up to 256, so the shift by 8 rescales the sum to 0..255.
 */
void normal_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    uint8_t *out = dest;
    const uint8_t *in = src;
    for (int remaining = n; remaining > 0; --remaining, in += 3)
    {
        // in[0] = red, in[1] = green, in[2] = blue
        const int weighted = (in[0] * 77) + (in[1] * 151) + (in[2] * 28);
        // drop the 256x scale and store the gray value
        *out++ = static_cast<uint8_t>(weighted >> 8);
    }
}
void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
int i;
uint8x8_t rfac = vdup_n_u8 (77);
uint8x8_t gfac = vdup_n_u8 (151);
uint8x8_t bfac = vdup_n_u8 (28);
n/=8;
for (i=0; i<n; i++)
{
uint16x8_t temp;
uint8x8x3_t rgb = vld3_u8 (src);
uint8x8_t result;
temp = vmull_u8 (rgb.val[0], rfac);
temp = vmlal_u8 (temp,rgb.val[1], gfac);
temp = vmlal_u8 (temp,rgb.val[2], bfac);
result = vshrn_n_u16 (temp, 8);
vst1_u8 (dest, result);
src += 8*3;
dest += 8;
}
}
void test1()
{
//準備一張圖片,使用軟件模擬生成,格式爲rgb rgb ..
uint32_t const array_size = 2048*2048;
uint8_t * rgb = new uint8_t[array_size*3];
for(int i=0;i<array_size;i++){
rgb[i*3]=234;
rgb[i*3+1]=94;
rgb[i*3+2]=23;
}
//灰度圖大小爲rgb的1/3
uint8_t * gray = new uint8_t[array_size];
struct timeval tv1,tv2;
gettimeofday(&tv1,NULL);
normal_convert(gray,rgb,array_size);
gettimeofday(&tv2,NULL);
LOGD("pure cpu cost time:%ld",(tv2.tv_sec-tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec));
gettimeofday(&tv1,NULL);
neon_convert(gray,rgb,array_size);
gettimeofday(&tv2,NULL);
LOGD("neon cost time:%ld",(tv2.tv_sec-tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec));
delete[] rgb;
delete[] gray;
}
/**
 * JNI entry point: runs the grayscale benchmark, then returns a greeting
 * string to the Java caller.
 */
JNIEXPORT jstring
JNICALL
Java_com_example_javer_myapplication_MainActivity_stringFromJNI(
        JNIEnv *env,
        jobject /* this */) {
    const std::string greeting("Hello from C++");
    test1();
    const char *utf8 = greeting.c_str();
    return env->NewStringUTF(utf8);
}
具體的指令就不一一說明了,大家參考neon彙編指令集,對照着看就好。
純cpu耗時53ms,neon優化後耗時43ms,提升非常有限,跟提升近8倍的預期相差甚遠。這主要是因爲c轉換爲彙編後,生成的彙編指令不夠簡潔,使得效率大大降低。因此,接下來,使用彙編對代碼進行優化。
CMake添加彙編支持
爲了在CMake中編譯彙編文件,我們需要在CMakeLists.txt文件中聲明對彙編語言的支持,添加ENABLE_LANGUAGE(ASM)即可實現對彙編的支持,接着將彙編文件添加進來,此處貼出完整的CMakeLists.txt文件供大家參考:
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html
# Sets the minimum version of CMake required to build the native library.
cmake_minimum_required(VERSION 3.4.1)
# Enables assembly-language support so that the .S source listed in
# add_library() below is accepted as part of the build.
ENABLE_LANGUAGE(ASM)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
add_library( # Sets the name of the library.
native-lib
# Sets the library as a shared library.
SHARED
# Provides a relative path to your source file(s).
# Neon.S is the assembly source; native-lib.cpp is the C++ source.
src/main/cpp/Neon.S
src/main/cpp/native-lib.cpp
)
# Searches for a specified prebuilt library and stores the path as a
# variable. Because CMake includes system libraries in the search path by
# default, you only need to specify the name of the public NDK library
# you want to add. CMake verifies that the library exists before
# completing its build.
find_library( # Sets the name of the path variable.
log-lib
# Specifies the name of the NDK library that
# you want CMake to locate.
log )
# Specifies libraries CMake should link to your target library. You
# can link multiple libraries, such as libraries you define in this
# build script, prebuilt third-party libraries, or system libraries.
target_link_libraries( # Specifies the target library.
native-lib
# Links the target library to the log library
# included in the NDK.
${log-lib} )
實現彙編Neon優化
然後在cpp文件中聲明:
// Grayscale conversion implemented in Neon.S (dest = output, src = input, n = count).
// This declaration has to sit inside the extern "C" block.
void neon_asm_convert(uint8_t * dest, uint8_t * src,int n);
注意,這個聲明是包含在extern "C"中的。
然後在Neon.S中實現neon_asm_convert函數:
/*
 * void neon_asm_convert(uint8_t *dest, uint8_t *src, int n)
 *
 * RGB -> grayscale with NEON, eight pixels per loop iteration:
 *     gray = (77*R + 151*G + 28*B) >> 8
 *
 *   r0: pointer to destination data (one byte per pixel)
 *   r1: pointer to source data (interleaved R,G,B bytes)
 *   r2: pixel count n
 *
 * Only whole groups of eight pixels are converted. The trailing n % 8
 * pixels are left untouched, and the routine returns immediately when
 * n < 8 (including negative n).
 */
    .text
    .globl neon_asm_convert
    .type neon_asm_convert, %function
neon_asm_convert:
    push {r4-r5,lr}
    @ r2 = number of 8-pixel groups. Leave early when there are none:
    @ the subs/bne loop below would otherwise wrap past zero and keep
    @ reading and writing far beyond both buffers.
    asr r2, r2, #3
    cmp r2, #0
    ble .Lneon_asm_convert_done
    @ build the three weight constants, one per channel, in every lane:
    mov r3, #77
    mov r4, #151
    mov r5, #28
    vdup.8 d3, r3
    vdup.8 d4, r4
    vdup.8 d5, r5
.Lneon_asm_convert_loop:
    @ load 8 pixels, de-interleaved: d0 = reds, d1 = greens, d2 = blues
    vld3.8 {d0-d2}, [r1]!
    @ weighted sum, widened to 16 bits per lane:
    vmull.u8 q3, d0, d3
    vmlal.u8 q3, d1, d4
    vmlal.u8 q3, d2, d5
    @ shift right by 8, narrow back to bytes and store 8 gray pixels:
    vshrn.u16 d6, q3, #8
    vst1.8 {d6}, [r0]!
    subs r2, r2, #1
    bne .Lneon_asm_convert_loop
.Lneon_asm_convert_done:
    pop { r4-r5, pc }
爲了對比結果的正確性,專門寫了個比對函數:
/**
 * Compares the first n bytes of two buffers.
 *
 * @return 0 when all n bytes are equal (or n <= 0), -1 at the first mismatch.
 */
int compare(uint8_t *a,uint8_t* b,int n)
{
    int idx = 0;
    // Advance while the bytes agree; stop at the first difference.
    while (idx < n && a[idx] == b[idx]) {
        ++idx;
    }
    return (idx < n) ? -1 : 0;
}
並將結果打印在時間後面:
// Log the elapsed microseconds followed by the comparison result (0 = outputs match).
LOGD("neon c cost time:%ld,result is %d",(tv2.tv_sec-tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec),result);
三者對比:
09-07 17:12:19.946 25861-25861/com.example.javer.myapplication D/TEST_NEON: pure cpu cost time:57073
09-07 17:12:20.012 25861-25861/com.example.javer.myapplication D/TEST_NEON: neon c cost time:45460,result is 0
09-07 17:12:20.034 25861-25861/com.example.javer.myapplication D/TEST_NEON: neon asm cost time:3397,result is 0
09-07 17:12:25.271 25861-25861/com.example.javer.myapplication D/TEST_NEON: pure cpu cost time:57404
09-07 17:12:25.336 25861-25861/com.example.javer.myapplication D/TEST_NEON: neon c cost time:45166,result is 0
09-07 17:12:25.359 25861-25861/com.example.javer.myapplication D/TEST_NEON: neon asm cost time:3493,result is 0
最終發現,彙編執行的結果完全正確,時間提升超過了16倍!!!!!!!!!!!
我甚至不敢相信能提升這麼多。。。可對比的結果是完全一樣啊!!這…….
如果程序有問題,感謝大神指出。
最後附完整代碼:
native-lib.cpp:
#include <jni.h>
#include <sys/time.h>

#include <chrono>
#include <cstdint>
#include <string>
#include <vector>

#include <android/log.h>
#include <arm_neon.h>
#define LOG_TAG "TEST_NEON"
#define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__)
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
extern "C"{
// Grayscale conversion implemented in Neon.S (dest = output, src = input, n = count).
void neon_asm_convert(uint8_t * dest, uint8_t * src,int n);
void test()
{
int16_t result[8];
int8x8_t a = vdup_n_s8(121);
int8x8_t b = vdup_n_s8(2);
int16x8_t c;
c = vmull_s8(a,b);
vst1q_s16(result,c);
for(int i=0;i<8;i++){
LOGD("data[%d] is %d ",i,result[i]);
}
}
/**
 * Scalar RGB -> grayscale conversion.
 *
 * Reads n interleaved R,G,B byte triples from src and writes one gray byte
 * per pixel to dest, computed as (77*R + 151*G + 28*B) >> 8. The three
 * weights add up to 256, so the shift by 8 rescales the sum to 0..255.
 */
void normal_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    uint8_t *out = dest;
    const uint8_t *in = src;
    for (int remaining = n; remaining > 0; --remaining, in += 3)
    {
        // in[0] = red, in[1] = green, in[2] = blue
        const int weighted = (in[0] * 77) + (in[1] * 151) + (in[2] * 28);
        // drop the 256x scale and store the gray value
        *out++ = static_cast<uint8_t>(weighted >> 8);
    }
}
void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
int i;
uint8x8_t rfac = vdup_n_u8 (77);
uint8x8_t gfac = vdup_n_u8 (151);
uint8x8_t bfac = vdup_n_u8 (28);
n/=8;
for (i=0; i<n; i++)
{
uint16x8_t temp;
uint8x8x3_t rgb = vld3_u8 (src);
uint8x8_t result;
temp = vmull_u8 (rgb.val[0], rfac);
temp = vmlal_u8 (temp,rgb.val[1], gfac);
temp = vmlal_u8 (temp,rgb.val[2], bfac);
result = vshrn_n_u16 (temp, 8);
vst1_u8 (dest, result);
src += 8*3;
dest += 8;
}
}
/**
 * Compares the first n bytes of two buffers.
 *
 * @return 0 when all n bytes are equal (or n <= 0), -1 at the first mismatch.
 */
int compare(uint8_t *a,uint8_t* b,int n)
{
    int idx = 0;
    // Advance while the bytes agree; stop at the first difference.
    while (idx < n && a[idx] == b[idx]) {
        ++idx;
    }
    return (idx < n) ? -1 : 0;
}
/**
 * Benchmark driver for the three grayscale converters.
 *
 * Fills a synthetic 2048x2048 RGB image, runs the scalar, NEON-intrinsics
 * and NEON-assembly converters on it, and logs the elapsed microseconds of
 * each run. For the two NEON variants it also logs the compare() result
 * against the scalar output (0 = identical, -1 = mismatch).
 */
void test1()
{
    // Build the test image in software; layout is rgb rgb ...
    constexpr int array_size = 2048*2048;
    std::vector<uint8_t> rgb(array_size*3);
    for (int i = 0; i < array_size; i++) {
        rgb[i*3]   = 234;
        rgb[i*3+1] = 94;
        rgb[i*3+2] = 23;
    }

    // Each grayscale image is one third the size of the rgb image. The
    // vectors own the memory and are zero-filled on construction.
    std::vector<uint8_t> gray_cpu(array_size);
    std::vector<uint8_t> gray_neon(array_size);
    std::vector<uint8_t> gray_neon_asm(array_size);

    // Monotonic clock, so wall-clock adjustments cannot distort the timings.
    using steady = std::chrono::steady_clock;
    const auto elapsed_us = [](steady::time_point from, steady::time_point to) {
        return static_cast<long>(
            std::chrono::duration_cast<std::chrono::microseconds>(to - from).count());
    };

    steady::time_point start = steady::now();
    normal_convert(gray_cpu.data(), rgb.data(), array_size);
    steady::time_point stop = steady::now();
    LOGD("pure cpu cost time:%ld", elapsed_us(start, stop));

    start = steady::now();
    neon_convert(gray_neon.data(), rgb.data(), array_size);
    stop = steady::now();
    // Keep compare()'s int result as-is so a mismatch is logged as -1.
    int result = compare(gray_cpu.data(), gray_neon.data(), array_size);
    LOGD("neon c cost time:%ld,result is %d", elapsed_us(start, stop), result);

    start = steady::now();
    neon_asm_convert(gray_neon_asm.data(), rgb.data(), array_size);
    stop = steady::now();
    result = compare(gray_cpu.data(), gray_neon_asm.data(), array_size);
    LOGD("neon asm cost time:%ld,result is %d", elapsed_us(start, stop), result);
}
/**
 * JNI entry point: runs the grayscale benchmark, then returns a greeting
 * string to the Java caller.
 */
JNIEXPORT jstring
JNICALL
Java_com_example_javer_myapplication_MainActivity_stringFromJNI(
        JNIEnv *env,
        jobject /* this */) {
    const std::string greeting("Hello from C++");
    test1();
    const char *utf8 = greeting.c_str();
    return env->NewStringUTF(utf8);
}
}
Neon.S
/*
 * void neon_asm_convert(uint8_t *dest, uint8_t *src, int n)
 *
 * RGB -> grayscale with NEON, eight pixels per loop iteration:
 *     gray = (77*R + 151*G + 28*B) >> 8
 *
 *   r0: pointer to destination data (one byte per pixel)
 *   r1: pointer to source data (interleaved R,G,B bytes)
 *   r2: pixel count n
 *
 * Only whole groups of eight pixels are converted. The trailing n % 8
 * pixels are left untouched, and the routine returns immediately when
 * n < 8 (including negative n).
 */
    .text
    .globl neon_asm_convert
    .type neon_asm_convert, %function
neon_asm_convert:
    push {r4-r5,lr}
    @ r2 = number of 8-pixel groups. Leave early when there are none:
    @ the subs/bne loop below would otherwise wrap past zero and keep
    @ reading and writing far beyond both buffers.
    asr r2, r2, #3
    cmp r2, #0
    ble .Lneon_asm_convert_done
    @ build the three weight constants, one per channel, in every lane:
    mov r3, #77
    mov r4, #151
    mov r5, #28
    vdup.8 d3, r3
    vdup.8 d4, r4
    vdup.8 d5, r5
.Lneon_asm_convert_loop:
    @ load 8 pixels, de-interleaved: d0 = reds, d1 = greens, d2 = blues
    vld3.8 {d0-d2}, [r1]!
    @ weighted sum, widened to 16 bits per lane:
    vmull.u8 q3, d0, d3
    vmlal.u8 q3, d1, d4
    vmlal.u8 q3, d2, d5
    @ shift right by 8, narrow back to bytes and store 8 gray pixels:
    vshrn.u16 d6, q3, #8
    vst1.8 {d6}, [r0]!
    subs r2, r2, #1
    bne .Lneon_asm_convert_loop
.Lneon_asm_convert_done:
    pop { r4-r5, pc }