Ne10編譯和介紹

1.介紹

ARM® NEON™ 技術是適用於 ARM Cortex™-A 系列處理器的 SIMD（單指令多數據）架構擴展。它可以使多媒體和信號處理算法提速，例如視頻編碼/解碼、2D/3D 圖形、遊戲、音頻和語音處理以及圖像處理等。隨着 NEON 的問世，出現了許多使用 NEON 並顯著改善用戶體驗的多媒體應用程序。有些應用程序開發人員可能不熟悉 NEON 彙編代碼，因此 Ne10 庫的創建可使開發人員從 ARMv7/NEON 中獲得最大效益，而不必編寫繁瑣的彙編代碼。

Ne10 庫提供一組最爲常用並且極爲優化的函數。這組函數最初於 2012 年 3 月發佈。庫中的初始功能集着重於矩陣/矢量代數以及信號處理。 Ne10 將持續改進，以包含圖像處理等多領域內的更多高計算量任務。

2.源碼獲取

Ne10的源碼公開在github上面，其網站地址：https://github.com/projectNe10/Ne10 。

3.環境

3.1硬件環境

您需要準備ARM Cortex-A/R系列開發平臺。如果沒有硬件開發平臺，也可使用仿真環境，如Google的Android Emulator。我現在使用的硬件開發板是 ARM Cortex-A53 平臺，交叉編譯主機是 Ubuntu 16.04。

3.2軟件環境

工具鏈：aarch64-linux-gnu-

CMake (http://www.cmake.org/)：跨平臺的開源構建系統

4.編譯和使用Ne10庫

4.1編譯Ne10

通過第2部分，獲取源碼後，進入源碼目錄，進行如下操作：

修改CMakeLists.txt，共有兩處修改，如下：

1. option(NE10_BUILD_UNIT_TEST "Build NE10 unit test" ON)  //原先爲OFF
2. option(NE10_PERFORMANCE_TEST "Run performance test" ON)//原先爲OFF

此處打開源碼中的單元測試程序，並選擇 performance test。關於 smoke testing、regression testing、performance testing 的區別如下：

Conformance testing (also called smoke testing), to check if the library works correctly.
Regression testing, which is similar to conformance testing but is aimed more specifically at testing whether the library still operates correctly after a change.
Performance testing, which gives an indication of how quickly the library performs certain tasks.

2.修改GNUlinux_config.cmake

if(NOT DEFINED ENV{NE10_LINUX_TARGET_ARCH})
   set(NE10_LINUX_TARGET_ARCH "aarch64")
else()
//直接將此處設置爲，aarch64

3.編譯

mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ../
make -j8

此處是靜態的編譯方式，可以看到在build/modules/下面生成了libNE10.a。

ccion@ubuntu:~/Ne10/build/modules$ ls
CMakeFiles  cmake_install.cmake  libNE10.a  Makefile

4.2使用和結果分析

通過上面的步驟可以看到，在build目錄下面生成了測試文件，共有兩個應用程序。這裏在我的開發板平臺上面執行FFT的測試程序 NE10_dsp_unit_test_static_performance，其重要部分源碼如下：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_dsp.h"
#include "NE10_macros.h"
#include "seatest.h"
#include "unit_test_common.h"
/*
 * Benchmark the 1-D complex-to-complex float32 FFT, plain C against NEON.
 *
 * Transform lengths start at MIN_LENGTH_SAMPLES_CPX and double until they
 * exceed TEST_LENGTH_SAMPLES.  For every length the FFT pass and then the
 * IFFT pass are each executed TEST_COUNT / length times inside GET_TIME, and
 * one result row per pass is written through ne10_log.
 *
 * The buffers (in_c, in_neon, out_c, out_neon), the configurations (cfg_c,
 * cfg_neon) and the timing variables (time_c, time_neon, time_speedup,
 * time_savings) are not declared in this function; they come from the
 * enclosing file.
 */
void test_fft_c2c_1d_float32_performance()
{
    ne10_int32_t n_points = MIN_LENGTH_SAMPLES_CPX;
    ne10_int32_t run = 0;
    ne10_int32_t runs_per_size = 0;
    ne10_int32_t alloc_status = NE10_OK;
    size_t cpx_bytes = 0;

    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");

    while (n_points <= TEST_LENGTH_SAMPLES)
    {
        fprintf (stdout, "FFT size %d\n", n_points);

        /* Each complex sample occupies two float32 values. */
        cpx_bytes = 2 * n_points * sizeof (ne10_float32_t);

        /* ---- FFT pass: both implementations start from the same input ---- */
        memcpy (in_c, testInput_f32, cpx_bytes);
        memcpy (in_neon, testInput_f32, cpx_bytes);

        alloc_status = test_c2c_alloc (n_points);
        if (alloc_status == NE10_ERR)
            return;

        runs_per_size = TEST_COUNT / n_points;

        GET_TIME (time_c,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0);
            }
        });
        GET_TIME (time_neon,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0);
            }
        });

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", n_points, time_c, time_neon, time_savings, time_speedup);

        /* ---- IFFT pass: both implementations are fed the C output above ---- */
        memcpy (in_c, out_c, cpx_bytes);
        memcpy (in_neon, out_c, cpx_bytes);

        GET_TIME (time_c,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1);
            }
        });
        GET_TIME (time_neon,
        {
            for (run = 0; run < runs_per_size; run++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1);
            }
        });

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", n_points, time_c, time_neon, time_savings, time_speedup);

        /* Release the per-length configurations before moving to the next size. */
        NE10_FREE (cfg_c);
        NE10_FREE (cfg_neon);

        n_points *= 2;
    }
}

執行結果：

可以看到，在FFT長度大於8之後，採用Ne10（NEON）版本比純C版本效率高很多；但是在處理長度爲2、4、8的FFT時，Ne10的效率居然還沒有純C版本高。

再來看看處理圖像的效率問題：執行 NE10_imgproc_unit_test_static_performance。其重要源碼如下：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_imgproc.h"
#include "seatest.h"
#include "unit_test_common.h"

void test_resize_performance_case()
{
    ne10_int32_t srcw;
    ne10_int32_t srch;
    ne10_int32_t dstw;
    ne10_int32_t dsth;
    ne10_int32_t i;
    ne10_int32_t w, h;
    ne10_int32_t channels = 4;
    ne10_int32_t pic_size = MEM_SIZE * MEM_SIZE * channels * sizeof (ne10_uint8_t);
    ne10_int64_t time_c = 0;
    ne10_int64_t time_neon = 0;

    /* init input memory */
    in_c = NE10_MALLOC (pic_size);
    in_neon = NE10_MALLOC (pic_size);

    /* init dst memory */
    out_c = NE10_MALLOC (pic_size);
    out_neon = NE10_MALLOC (pic_size);

    for (i = 0; i < pic_size; i++)
    {
        in_c[i] = in_neon[i] = (rand() & 0xff);
    }

    for (h = 16; h < MEM_SIZE; h += 4)
    {
        for (w = 16; w < MEM_SIZE; w += 4)
        {
            srcw = h;
            srch = h;
            dstw = w;
            dsth = w;

            printf ("srcw X srch = %d X %d \n", srcw, srch);
            printf ("dstw X dsth = %d X %d \n", dstw, dsth);

            GET_TIME
            (
                time_c,
            {
                for (i = 0; i < TEST_COUNT; i++)
                    ne10_img_resize_bilinear_rgba_c (out_c, dstw, dsth, in_c, srcw, srch, srcw);
            }
            );

            GET_TIME
            (
                time_neon,
            {
                for (i = 0; i < TEST_COUNT; i++)
                    ne10_img_resize_bilinear_rgba_neon (out_neon, dstw, dsth, in_neon, srcw, srch, srcw);
            }
            );
            printf ("time c %lldus \n", time_c);
            printf ("time neon %lldus \n", time_neon);
            ne10_log (__FUNCTION__, "IMAGERESIZE%20d%20lld%20lld%19.2f%%%18.2f:1\n", (h * MEM_SIZE + w), time_c, time_neon, 0, 0);

        }
    }
    NE10_FREE (in_c);
    NE10_FREE (in_neon);
    NE10_FREE (out_c);
    NE10_FREE (out_neon);
}

執行結果：

很明顯，做圖像resize時，NEON版本要比C版本的效率高很多。

Ne10編譯和介紹

1.介紹

2.源碼獲取

3.環境

4.編譯和使用Ne10庫

linux內存工具查看歸納

/usr/bin/ld: cannot find -l

軟件架構之路 1

從4行代碼看右值引用

C++編程學習52個經典網站

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結