ARM inline assembly acceleration algorithm

An earlier post, "Mobile arm cpu optimization study notes Part 4 - Introduction to inline assembly", only gives the code, and many people don't know how to run it on a phone. In fact, a single CMakeLists.txt is enough:

cmake_minimum_required(VERSION 2.8)
set(PROJECT_NAME benchmark)
project(${PROJECT_NAME})
set(CMAKE_BUILD_TYPE Release)
aux_source_directory(. SRC_FILES)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_executable(${PROJECT_NAME} ${SRC_FILES})

1. Use htop to monitor CPU usage on an Android phone

git clone https://github.com/LinGeLin/Compiled-Android-htop

Create a new run.sh file in the cloned directory with the following content:

#!/bin/bash

install() {
    adb push 2-2installht/xbin/htop /data/local/tmp
    adb push 6-1ncinstall /data/local/tmp/
}

run() {
    adb shell "cd /data/local/tmp && export TERM=xterm && export TERMINFO=6-1ncinstall/share/terminfo && export LD_LIBRARY_PATH=6-1ncinstall/lib && ./htop"
}

# push htop and its ncurses files only if they are not on the device yet
FILES=`adb shell ls /data/local/tmp/`
if [[ ! "${FILES[@]}" =~ "6-1ncinstall" ]]; then
    install
fi
run

Open a terminal, switch to the folder containing run.sh, and execute ./run.sh.

After the files have been copied the first time, you can comment out the install call; the interface is very similar to htop on the Linux desktop.

2. Benchmark code for accelerated summation:

#include "iostream"

#if ANDROID
    #include <arm_neon.h>
#endif

#define ANDROID_LOG 1
#include "mrlog.h"

int rounds = 1000;

float sum_float(const float* data, const long num=1000){
    float sum = 0;
    for(long i=0;i<num;i++){
        sum += data[i];
    }
    return sum;
}

float sum_intrinsics(const float* data, const long size=1000)
{
    long nn = size>>2;                      // number of 4-float blocks
    float *ptr = (float*)data;
    float32x4_t sum_vec = vdupq_n_f32(0);   // 4 running partial sums
    for(; nn>0; nn--,ptr+=4){
        float32x4_t tmp_vec = vld1q_f32(ptr);   // load 4 floats
        sum_vec = vaddq_f32(sum_vec, tmp_vec);  // lane-wise accumulate
    }
    // horizontal reduction of the 4 lanes
    float sum = vgetq_lane_f32(sum_vec, 0)+vgetq_lane_f32(sum_vec, 1)+vgetq_lane_f32(sum_vec, 2)+vgetq_lane_f32(sum_vec, 3);
    long remain = size&3;                   // 0-3 leftover elements
    for(;remain>0;remain--,ptr++){
        sum+=(*ptr);
    }
    return sum;
}

float sum_neon(const float* data, const long size=1000){
    float sum = 0;
    long kc = size >> 2;        // number of 4-float blocks (assumes size >= 4)
    float *ptr = (float*) data;
    #if __aarch64__
        float *sum_vector = new float[4];   // receives the 4 partial sums
        asm volatile(
            "mov            x0, #0                  \n"
            "dup            v0.4s, w0               \n" // zero the accumulator
            "0:                                     \n"
            "ld1            {v1.4s},[%[ptr]],#16    \n" // load 4 floats, advance ptr
            "fadd           v0.4s,v0.4s,v1.4s       \n" // accumulate
            "subs           %[kc], %[kc],#1         \n"
            "bne            0b                      \n"
            "st1            {v0.4s},[%[sum_vector]] \n" // spill the partial sums
            : [kc]  "+r" (kc),   // "+r" marks read-write operands; symbolic
              [ptr] "+r" (ptr)   // names must not be duplicated across lists
            : [sum_vector] "r" (sum_vector)
            : "cc","memory","x0","v0","v1" // v0/v1 must be declared clobbered
        );
        for(int i = 0; i < 4; i++){
            sum += sum_vector[i];           // horizontal reduction of the 4 lanes
        }
        delete []sum_vector;
    #else
        asm volatile(
            "veor               q0,q0,q0            \n" // zero the accumulator
            "0:                                     \n"
            "vld1.f32           {q1},[%[ptr]]!      \n" // load 4 floats, advance ptr
            "vadd.f32           q0,q0,q1            \n" // accumulate
            "subs               %[kc], #1           \n"
            "bne                0b                  \n"
            "vpadd.f32          d0,d0,d1            \n" // d0 = {s0+s1, s2+s3}
            "vadd.f32           s0,s0,s1            \n" // s0 = total
            "vmov.32            %[sum],s0           \n" // move the result out
            : [kc]  "+r" (kc),
              [ptr] "+r" (ptr),
              [sum] "=r" (sum)
            :
            : "cc","memory","q0","q1"
        );
    #endif
    // kc has been counted down to 0 by the asm loop, so the tail is the last
    // size & 3 elements (the original size - (kc<<2) always equaled size here)
    for(long i = size - (size & 3); i < size; ++i) {
        sum += data[i];
    }
    return sum;
}

int test_float(float *a, long num){
    float sum =0;
    MRTIMER_START(sum_time);
    for(int i=0;i<rounds;i++)
        sum = sum_float(a,num);
    std::cout<<sum<<std::endl;
    std::cout<<"float: "<<MRTIMER_END(sum_time)/rounds<<"ms"<<std::endl;
    return 0;
}

int test_intrinsics(float *a, long num){
    float sum = 0;
    MRTIMER_START(intrinsics_time);
    for(int i=0;i<rounds;i++)
        sum = sum_intrinsics(a,num);
    std::cout<<sum<<std::endl;
    std::cout<<"intrinsics: "<<MRTIMER_END(intrinsics_time)/rounds<<"ms"<<std::endl;
    return 0;
}

int test_neon(float *a, long num){
    float sum =0;
    MRTIMER_START(neon_time);
    for(int i=0;i<rounds;i++)
        sum = sum_neon(a,num);
    std::cout<<sum<<std::endl;
    std::cout<<"neon: "<<MRTIMER_END(neon_time)/rounds<<"ms"<<std::endl;
    return 0;
}

int main()
{
    long num = 1000000;
    float *a = new float[num];
    for(int i=0;i<num;i++){
        a[i]=1;
    }
    test_float(a,num);
    test_intrinsics(a,num);
    test_neon(a,num);
    delete []a;
    return 0;
}

Refer to the summary of commonly used ARM NEON functions:

float32x4_t vdupq_n_f32 (float32_t value) duplicates value into all 4 lanes of the returned register.

float32x4_t vld1q_f32 (float32_t const * ptr) loads 4 consecutive floats from memory into a register; vst1q_f32 is the corresponding store.

float32x4_t vaddq_f32 (float32x4_t a, float32x4_t b) returns the lane-wise sum r = a + b.

float32_t vgetq_lane_f32 (float32x4_t v, const int lane) extracts the value of one lane from the register.
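
A minimal standalone sketch (not part of the benchmark above) tying these intrinsics together, including the vst1q_f32 store; the array values are arbitrary illustration data:

#include <arm_neon.h>
#include <cstdio>

int main()
{
    float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out[4];

    float32x4_t ones = vdupq_n_f32(1.0f);   // {1, 1, 1, 1}
    float32x4_t v    = vld1q_f32(in);       // load 4 floats from memory
    float32x4_t r    = vaddq_f32(v, ones);  // lane-wise add -> {2, 3, 4, 5}
    vst1q_f32(out, r);                      // store 4 floats back to memory

    printf("%f %f\n", out[0], vgetq_lane_f32(r, 3)); // prints 2.000000 5.000000
    return 0;
}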

Automatically compile and run:

#rm -rf build
mkdir -p build
cd build
# for a 64-bit build, use -DANDROID_ABI="arm64-v8a" instead
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
make
adb push benchmark /data/local/tmp
adb shell "cd /data/local/tmp/ && ./benchmark"

Running time on a OnePlus 6 (Snapdragon 845):

1e+06
float: 1.11039ms
1e+06
intrinsics: 0.351055ms
1e+06
neon: 0.351813ms

That is a speedup of roughly 3.16x (1.11039 ms / 0.351055 ms). PS: one million float additions take about 1 millisecond in plain C++ on this core.

Convolution in deep learning is commonly implemented via matrix multiplication, so gemm performance directly determines the final speed. How should it be written to squeeze the most out of the processor? Some references, with a small loop-ordering sketch after the list:

  1. Getting started with OpenBLAS gemm from scratch
  2. Accelerated implementation of neural network arm neon
  3. [Image processing] NEON programming
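
As a taste of what those references cover, here is a minimal sketch (illustrative, not code from the posts above) of one classic gemm trick: switching the loop order from i-j-k to i-k-j makes the innermost accesses to B and C sequential in memory, which is far friendlier to the cache:

#include <cstddef>

// C (MxN) += A (MxK) * B (KxN), all row-major, i-k-j loop order
void gemm_ikj(const float *A, const float *B, float *C,
              std::size_t M, std::size_t K, std::size_t N)
{
    for (std::size_t i = 0; i < M; i++) {
        for (std::size_t k = 0; k < K; k++) {
            const float a = A[i * K + k];         // reused across the whole j loop
            for (std::size_t j = 0; j < N; j++) {
                C[i * N + j] += a * B[k * N + j]; // sequential B and C accesses
            }
        }
    }
}

Real gemm kernels (e.g. OpenBLAS) add blocking, packing, and NEON on top of this, but the loop-order change alone is already a large win over the naive version.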

The principle behind Winograd convolution is to factor out the common subexpressions in the convolution computation and eliminate redundant multiplications; related reading below, followed by a concrete F(2,3) sketch:

  1. Winograd Fast Convolution Algorithm in Convolutional Neural Networks
  2. Detailed explanation of Winograd transformation matrix generation principle
  3. Mobile arm cpu optimization study notes Part 3 - binding cpu (cpu affinity)
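
To make "eliminate redundant operations" concrete, here is the classic 1-D Winograd F(2,3) identity as a small sketch (illustrative, not code from the posts above): two outputs of a 3-tap filter computed with 4 multiplications instead of the naive 6:

#include <cstdio>

// F(2,3): y0 = d0*g0 + d1*g1 + d2*g2, y1 = d1*g0 + d2*g1 + d3*g2,
// computed with 4 multiplications instead of 6
void winograd_f23(const float d[4], const float g[3], float y[2])
{
    // filter transform (can be precomputed once per filter)
    const float G0 = g[0];
    const float G1 = 0.5f * (g[0] + g[1] + g[2]);
    const float G2 = 0.5f * (g[0] - g[1] + g[2]);
    const float G3 = g[2];

    // the only 4 multiplications
    const float m1 = (d[0] - d[2]) * G0;
    const float m2 = (d[1] + d[2]) * G1;
    const float m3 = (d[2] - d[1]) * G2;
    const float m4 = (d[1] - d[3]) * G3;

    // output transform
    y[0] = m1 + m2 + m3;
    y[1] = m2 - m3 - m4;
}

int main()
{
    const float d[4] = {1, 2, 3, 4}, g[3] = {1, 1, 1};
    float y[2];
    winograd_f23(d, g, y);
    printf("%f %f\n", y[0], y[1]); // prints 6.000000 9.000000
    return 0;
}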


Origin: blog.csdn.net/minstyrain/article/details/105348828