The earlier post Mobile ARM CPU Optimization Study Notes, Part 4: Introduction to Inline Assembly only gave the code, and many readers don't know how to run it on a phone. In fact, a single CMakeLists.txt is enough:
cmake_minimum_required(VERSION 3.6)
set(PROJECT_NAME benchmark)
project(${PROJECT_NAME})
set(CMAKE_BUILD_TYPE Release)
aux_source_directory(. SRC_FILES)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_executable(${PROJECT_NAME} ${SRC_FILES})
1. Use htop to monitor CPU usage on an Android phone
git clone https://github.com/LinGeLin/Compiled-Android-htop
Create a new run.sh file in the cloned directory with the following content:
#!/bin/bash
install(){
    adb push 2-2installht/xbin/htop /data/local/tmp
    adb push 6-1ncinstall /data/local/tmp/
}
run(){
    adb shell "cd /data/local/tmp && export TERM=xterm && export TERMINFO=6-1ncinstall/share/terminfo && export LD_LIBRARY_PATH=6-1ncinstall/lib && ./htop"
}
CMDS="adb shell ls /data/local/tmp/"
FILES=`$CMDS`
# Push htop and its ncurses/terminfo files only if they are not on the device yet
if [[ ! "${FILES[@]}" =~ "6-1ncinstall" ]]; then
    install
fi
run
Open a terminal, cd into the cloned folder, and execute ./run.sh. Once the files have been pushed the first time, the if check above skips the install step on later runs, and you get an htop interface very similar to the one on desktop Linux.
2. Benchmark code for the accelerated summation:
#include "iostream"
#if ANDROID
#include <arm_neon.h>
#endif
#define ANDROID_LOG 1
#include "mrlog.h"
int rounds = 1000;
float sum_float(const float* data, const long num=1000){
float sum = 0;
for(long i=0;i<num;i++){
sum += data[i];
}
return sum;
}
float sum_intrinsics(const float* data, const long size=1000)
{
    long nn = size >> 2;                     // number of full 4-float groups
    const float *ptr = data;
    float32x4_t sum_vec = vdupq_n_f32(0);    // 4 partial sums, zero-initialized
    for(; nn > 0; nn--, ptr += 4){
        float32x4_t tmp_vec = vld1q_f32(ptr);    // load 4 floats
        sum_vec = vaddq_f32(sum_vec, tmp_vec);   // accumulate lane-wise
    }
    // Horizontal sum of the 4 lanes
    float sum = vgetq_lane_f32(sum_vec, 0) + vgetq_lane_f32(sum_vec, 1)
              + vgetq_lane_f32(sum_vec, 2) + vgetq_lane_f32(sum_vec, 3);
    long remain = size & 3;                  // tail: remaining size % 4 elements
    for(; remain > 0; remain--, ptr++){
        sum += (*ptr);
    }
    return sum;
}
float sum_neon(const float* data, const long size=1000){
    float sum = 0;
    long kc = size >> 2;        // number of full 4-float groups (assumed > 0)
    const float *ptr = data;
#if __aarch64__
    float *sum_vector = new float[4];
    asm volatile(
        "movi    v0.4s, #0                \n"  // zero the 4 accumulator lanes
        "0:                               \n"
        "ld1     {v1.4s}, [%[ptr]], #16   \n"  // load 4 floats, advance ptr
        "fadd    v0.4s, v0.4s, v1.4s      \n"  // accumulate lane-wise
        "subs    %[kc], %[kc], #1         \n"
        "bne     0b                       \n"
        "st1     {v0.4s}, [%[sum_vector]] \n"  // spill the 4 partial sums
        : [kc] "+r" (kc),
          [ptr] "+r" (ptr)
        : [sum_vector] "r" (sum_vector)
        : "cc", "memory", "v0", "v1"
    );
    for(int i = 0; i < 4; i++){
        sum += sum_vector[i];
    }
    delete []sum_vector;
#else
    asm volatile(
        "veor      q0, q0, q0        \n"  // zero the 4 accumulator lanes
        "0:                          \n"
        "vld1.f32  {q1}, [%[ptr]]!   \n"  // load 4 floats, advance ptr
        "vadd.f32  q0, q0, q1        \n"  // accumulate lane-wise
        "subs      %[kc], #1         \n"
        "bne       0b                \n"
        "vpadd.f32 d0, d0, d1        \n"  // d0 = {s0+s1, s2+s3}
        "vadd.f32  s0, s0, s1        \n"  // final horizontal sum in s0
        "vmov      %[sum], s0        \n"  // move the result to a core register
        : [kc] "+r" (kc),
          [ptr] "+r" (ptr),
          [sum] "=r" (sum)
        :
        : "cc", "memory", "q0", "q1"
    );
#endif
    // Tail: the asm left ptr just past the vectorized part (and kc at 0),
    // so walk ptr over the remaining size % 4 elements.
    for(long remain = size & 3; remain > 0; remain--, ptr++){
        sum += (*ptr);
    }
return sum;
}
int test_float(float *a, long num){
float sum =0;
MRTIMER_START(sum_time);
for(int i=0;i<rounds;i++)
sum = sum_float(a,num);
std::cout<<sum<<std::endl;
std::cout<<"float: "<<MRTIMER_END(sum_time)/rounds<<"ms"<<std::endl;
return 0;
}
int test_intrinsics(float *a, long num){
float sum = 0;
MRTIMER_START(intrinsics_time);
for(int i=0;i<rounds;i++)
sum = sum_intrinsics(a,num);
std::cout<<sum<<std::endl;
std::cout<<"intrinsics: "<<MRTIMER_END(intrinsics_time)/rounds<<"ms"<<std::endl;
return 0;
}
int test_neon(float *a, long num){
float sum =0;
MRTIMER_START(neon_time);
for(int i=0;i<rounds;i++)
sum = sum_neon(a,num);
std::cout<<sum<<std::endl;
std::cout<<"neon: "<<MRTIMER_END(neon_time)/rounds<<"ms"<<std::endl;
return 0;
}
int main()
{
long num = 1000000;
float *a = new float[num];
for(int i=0;i<num;i++){
a[i]=1;
}
test_float(a,num);
test_intrinsics(a,num);
test_neon(a,num);
delete []a;
return 0;
}
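One caveat: the listing relies on mrlog.h from the original series for the MRTIMER_START / MRTIMER_END timing macros, which is not reproduced here. If you don't have that header, a minimal stand-in (assuming MRTIMER_END(tag) returns the elapsed milliseconds as a double, which is what the divisions above expect) could look like:
// mrlog.h -- minimal stand-in for the timing macros used above.
// Assumption: MRTIMER_END(tag) yields elapsed ms since MRTIMER_START(tag).
#pragma once
#include <chrono>
#define MRTIMER_START(tag) \
    auto tag##_start = std::chrono::high_resolution_clock::now()
#define MRTIMER_END(tag) \
    (std::chrono::duration<double, std::milli>( \
        std::chrono::high_resolution_clock::now() - tag##_start).count())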
Refer to the summary of commonly used ARM NEON functions:
- float32x4_t vdupq_n_f32 (float32_t value): broadcasts value into all 4 lanes of the returned register
- float32x4_t vld1q_f32 (float32_t const * ptr): loads 4 consecutive floats from memory into a register; vst1q_f32 is the matching store
- float32x4_t vaddq_f32 (float32x4_t a, float32x4_t b): lane-wise sum of the two registers, r = a + b
- float32_t vgetq_lane_f32 (float32x4_t v, const int lane): extracts the value of one lane of the register
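To make these concrete, here is a tiny stand-alone snippet (my own illustration, not from the original post) that exercises all four intrinsics plus the vst1q_f32 store:
#include <arm_neon.h>
#include <cstdio>

int main(){
    float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out[4];
    float32x4_t v = vld1q_f32(in);        // load 4 floats from memory
    float32x4_t k = vdupq_n_f32(10.0f);   // broadcast 10.0 into all 4 lanes
    float32x4_t r = vaddq_f32(v, k);      // lane-wise add
    vst1q_f32(out, r);                    // store 4 floats back to memory
    printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // 11 12 13 14
    printf("lane 2 = %f\n", vgetq_lane_f32(r, 2));            // 13
    return 0;
}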
Build and run:
#rm -rf build
mkdir build
cd build
# For 64-bit, pass -DANDROID_ABI="arm64-v8a" instead
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
make
adb push benchmark /data/local/tmp
adb shell "cd /data/local/tmp/ && ./benchmark"
Running time on a OnePlus 6 (Qualcomm Snapdragon 845):
1e+06
float: 1.11039ms
1e+06
intrinsics: 0.351055ms
1e+06
neon: 0.351813ms
That is a speedup of roughly 3.16x (1.11039 / 0.351055). PS: one million float additions take about 1 millisecond.
Matrix multiplication is the core of deep-learning convolution implementations, so how it is written directly determines the final speed. How do you squeeze the most performance out of the processor? See the posts below, and the naive baseline sketched after the list:
- Getting started with OpenBLAS gemm from scratch
- Accelerated implementation of neural network arm neon
- [Image processing] NEON programming
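As a reference point for those posts, this is the textbook triple-loop sgemm that they all start from and then optimize (a sketch of my own, assuming row-major storage):
// Naive row-major sgemm: C[M][N] = A[M][K] * B[K][N].
// No tiling, no NEON -- the baseline the linked posts improve on.
void naive_sgemm(const float* A, const float* B, float* C,
                 int M, int N, int K){
    for(int m = 0; m < M; m++){
        for(int n = 0; n < N; n++){
            float acc = 0.0f;
            for(int k = 0; k < K; k++){
                acc += A[m * K + k] * B[k * N + n];
            }
            C[m * N + n] = acc;
        }
    }
}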
The idea behind Winograd convolution is to identify the computation shared across the sliding windows and eliminate the redundant operations, trading extra additions for fewer multiplications.
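A minimal sketch of the classic 1-D case F(2,3) (2 outputs of a 3-tap filter), which needs 4 multiplications where the direct method needs 6; this is standard Winograd, not code from the original post:
// Winograd F(2,3): r[0..1] = 3-tap convolution of d[0..3] with g[0..2],
// using 4 multiplications instead of the direct method's 6.
void winograd_f23(const float d[4], const float g[3], float r[2]){
    // Filter transform (can be precomputed once per filter)
    float G0 = g[0];
    float G1 = (g[0] + g[1] + g[2]) * 0.5f;
    float G2 = (g[0] - g[1] + g[2]) * 0.5f;
    float G3 = g[2];
    // The 4 multiplications
    float m1 = (d[0] - d[2]) * G0;
    float m2 = (d[1] + d[2]) * G1;
    float m3 = (d[2] - d[1]) * G2;
    float m4 = (d[1] - d[3]) * G3;
    // Output transform; equals d0*g0+d1*g1+d2*g2 and d1*g0+d2*g1+d3*g2
    r[0] = m1 + m2 + m3;
    r[1] = m2 - m3 - m4;
}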