Ne10編譯和介紹

1.介紹

        ARM® NEON™ 技術是適用於 ARM Cortex™-A 系列處理器的 SIMD(單指令多數據)架構擴展。 它可以使多媒體和信號處理算法提速,例如視頻編碼/解碼、2D/3D 圖形、遊戲、音頻和語音處理以及圖像處理等。 隨着 NEON 的問世,出現了許多使用 NEON 並顯著改善用戶體驗的多媒體應用程序。 有些應用程序開發人員可能不熟悉 NEON 彙編代碼,因此 Ne10 庫的創建可使開發人員從 ARMv7/NEON 中獲得最大效益,而不必使用繁瑣的彙編代碼。

       Ne10 庫提供一組最爲常用並且極爲優化的函數。 這組函數最初於 2012 年 3 月發佈。 庫中的初始功能集着重於矩陣/矢量代數以及信號處理。 Ne10 將持續改進,以包含圖像處理等多領域內的更多高計算量任務。

2.源碼獲取

     Ne10的源碼公開在github上面,其網站地址:https://github.com/projectNe10/Ne10 。

3.環境

  3.1硬件環境

        您需要準備ARM Cortex-A/R系列開發平臺。如果沒有硬件開發平臺,也可使用仿真環境,如Google的Android Emulator。我現在使用的硬件開發板是ARM Cortex-A53平臺,交叉編譯主機系統爲Ubuntu 16.04。

 3.2軟件環境

  • 工具鏈:aarch64-linux-gnu-

4.編譯和使用Ne10庫

  4.1編譯Ne10

    通過第2部分,獲取源碼後,進入源碼目錄,進行如下操作:

  1. 修改CMakeLists.txt.有二處修改,修改如下:
option(NE10_BUILD_UNIT_TEST "Build NE10 unit test" ON)   # originally OFF: build the bundled test programs
option(NE10_PERFORMANCE_TEST "Run performance test" ON)  # originally OFF: select the performance-test variant

       此處打開源碼中的測試程序,並選擇performance test。關於smoke testing、regression testing、performance testing的區別如下:

  • Conformance testing (also called smoke testing), to check if the library works correctly.
  • Regression testing, which is similar to conformance testing but is aimed more specifically at testing whether the library still operates correctly after a change.
  • Performance testing, which gives an indication of how quickly the library performs certain tasks.

     2.修改GNUlinux_config.cmake

if(NOT DEFINED ENV{NE10_LINUX_TARGET_ARCH})
   set(NE10_LINUX_TARGET_ARCH "aarch64")  # set the target architecture directly to aarch64 here
else()
# (snippet ends here; the else() branch is not part of this modification)

     3.編譯

# Out-of-tree build driven by the GNU/Linux cross-compilation toolchain file.
mkdir -p build   # -p: do not fail when the directory is left over from an earlier build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ../
make -j8

   此處是靜態的編譯方式,可以看到在build/modules/下面生成了libNE10.a。

ccion@ubuntu:~/Ne10/build/modules$ ls
CMakeFiles  cmake_install.cmake  libNE10.a  Makefile

4.2使用和結果分析

       通過上面的步驟可以看到,在build目錄下面生成了test文件,有二個應用程序,這裏在我的開發板平臺上面執行FFT的測試程序NE10_dsp_unit_test_static_performance。其重要部分源碼如下:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_dsp.h"
#include "NE10_macros.h"
#include "seatest.h"
#include "unit_test_common.h"
/**
 * Performance test for the 1-D complex-to-complex float32 FFT.
 *
 * For each FFT length from MIN_LENGTH_SAMPLES_CPX up to TEST_LENGTH_SAMPLES
 * (doubling every step) the forward and then the inverse transform are run
 * TEST_COUNT / length times, once with the C implementation and once with
 * the NEON implementation, each wrapped in GET_TIME. One ne10_log line per
 * direction reports both timings, the percentage saved and the C:NEON ratio.
 *
 * Works on the file-level buffers (in_c, in_neon, out_c, out_neon), the
 * configurations cfg_c / cfg_neon and the timing variables, all of which are
 * defined outside this function.
 */
void test_fft_c2c_1d_float32_performance()
{
    ne10_int32_t fftSize = 0;
    ne10_int32_t run = 0;
    ne10_int32_t runs_per_size = 0;
    ne10_int32_t alloc_status = NE10_OK;
    size_t cpx_bytes = 0;

    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");

    for (fftSize = MIN_LENGTH_SAMPLES_CPX; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
    {
        fprintf (stdout, "FFT size %d\n", fftSize);

        /* Each complex sample is two floats (hence the factor 2). */
        cpx_bytes = 2 * fftSize * sizeof (ne10_float32_t);

        /* ---- forward FFT (last argument 0) ---- */
        memcpy (in_c, testInput_f32, cpx_bytes);
        memcpy (in_neon, testInput_f32, cpx_bytes);

        alloc_status = test_c2c_alloc (fftSize);
        if (alloc_status == NE10_ERR)
        {
            return;
        }

        /* Scale the repeat count down as the transform grows. */
        runs_per_size = TEST_COUNT / fftSize;

        GET_TIME (time_c,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0);
            }
        });
        GET_TIME (time_neon,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0);
            }
        });

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup);

        /* ---- inverse FFT (last argument 1) ---- */
        /* Both implementations are fed the C forward result as input. */
        memcpy (in_c, out_c, cpx_bytes);
        memcpy (in_neon, out_c, cpx_bytes);

        GET_TIME (time_c,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1);
            }
        });
        GET_TIME (time_neon,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1);
            }
        });

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup);

        /* Release the per-length configurations before the next size. */
        NE10_FREE (cfg_c);
        NE10_FREE (cfg_neon);
    }
}

執行結果:

      可以看到,在FFT長度大於8之後,採用Ne10(NEON)版本比純C版本效率高很多;但是在FFT長度爲2、4、8時,Ne10的效率居然還沒有C版本高。

再來看看處理圖像的效率問題:執行NE10_imgproc_unit_test_static_performance。其重要源碼如下:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_imgproc.h"
#include "seatest.h"
#include "unit_test_common.h"

void test_resize_performance_case()
{
    ne10_int32_t srcw;
    ne10_int32_t srch;
    ne10_int32_t dstw;
    ne10_int32_t dsth;
    ne10_int32_t i;
    ne10_int32_t w, h;
    ne10_int32_t channels = 4;
    ne10_int32_t pic_size = MEM_SIZE * MEM_SIZE * channels * sizeof (ne10_uint8_t);
    ne10_int64_t time_c = 0;
    ne10_int64_t time_neon = 0;

    /* init input memory */
    in_c = NE10_MALLOC (pic_size);
    in_neon = NE10_MALLOC (pic_size);

    /* init dst memory */
    out_c = NE10_MALLOC (pic_size);
    out_neon = NE10_MALLOC (pic_size);

    for (i = 0; i < pic_size; i++)
    {
        in_c[i] = in_neon[i] = (rand() & 0xff);
    }

    for (h = 16; h < MEM_SIZE; h += 4)
    {
        for (w = 16; w < MEM_SIZE; w += 4)
        {
            srcw = h;
            srch = h;
            dstw = w;
            dsth = w;

            printf ("srcw X srch = %d X %d \n", srcw, srch);
            printf ("dstw X dsth = %d X %d \n", dstw, dsth);

            GET_TIME
            (
                time_c,
            {
                for (i = 0; i < TEST_COUNT; i++)
                    ne10_img_resize_bilinear_rgba_c (out_c, dstw, dsth, in_c, srcw, srch, srcw);
            }
            );

            GET_TIME
            (
                time_neon,
            {
                for (i = 0; i < TEST_COUNT; i++)
                    ne10_img_resize_bilinear_rgba_neon (out_neon, dstw, dsth, in_neon, srcw, srch, srcw);
            }
            );
            printf ("time c %lldus \n", time_c);
            printf ("time neon %lldus \n", time_neon);
            ne10_log (__FUNCTION__, "IMAGERESIZE%20d%20lld%20lld%19.2f%%%18.2f:1\n", (h * MEM_SIZE + w), time_c, time_neon, 0, 0);

        }
    }
    NE10_FREE (in_c);
    NE10_FREE (in_neon);
    NE10_FREE (out_c);
    NE10_FREE (out_neon);
}

執行結果:

很明顯,做圖像resize時,neon版本的要比c版本的效率高很多。

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章