NEON 加速矩阵乘法优化 (任意阶)

NEON加速系列文章



前言


一、NEON矩阵乘法优化 (任意阶)

Matrix multiplication accelerated with ARM NEON intrinsics.

二、使用步骤

1.Code display

代码如下(示例):

#include <stdio.h>
#include <sys/time.h>
#include <stdint.h>
#include <string.h>
#include <arm_neon.h>

/*
 * Elapsed time from t0 to t1, expressed in milliseconds.
 *
 * The seconds and microseconds fields are differenced independently and then
 * scaled to milliseconds, so a negative microsecond difference (t1.tv_usec
 * smaller than t0.tv_usec) is simply subtracted from the seconds part.
 *
 * t1: later timestamp
 * t0: earlier timestamp
 * returns: (t1 - t0) in milliseconds, including the fractional part
 */
double sub_time(struct timeval t1, struct timeval t0)
{
    const double usec_delta = t1.tv_usec - t0.tv_usec;
    const double sec_delta = t1.tv_sec - t0.tv_sec;

    return sec_delta * 1000 + usec_delta / 1000;
}

/* Matrix order used by the demo; the kernels below accept any order. */
#define N 16

/* Number of times each kernel is repeated so the elapsed time is measurable. */
enum { BENCH_ITERATIONS = 10000 };

/*
 * Reference kernel: out = lhs * rhs for row-major n x n float matrices.
 *
 * Every element of out is overwritten, so calling this repeatedly always
 * leaves the plain product in out (nothing accumulates between calls).
 */
static void matmul_scalar(size_t n, const float *lhs, const float *rhs, float *out)
{
    for (size_t row = 0; row < n; row++) {
        for (size_t col = 0; col < n; col++) {
            float sum = 0.0f;

            for (size_t k = 0; k < n; k++) {
                sum += lhs[row * n + k] * rhs[k * n + col];
            }
            out[row * n + col] = sum;
        }
    }
}

/*
 * NEON kernel: out = lhs * rhs for row-major n x n float matrices.
 *
 * For each output row, four adjacent output columns are produced at once:
 * the scalar lhs[row][k] is broadcast to all lanes and multiplied with the
 * four contiguous values rhs[k][col..col+3], accumulating over k.
 * Columns left over when n is not a multiple of 4 are computed with plain
 * scalar code, so any n is handled without reading or writing out of bounds.
 */
static void matmul_neon(size_t n, const float *lhs, const float *rhs, float *out)
{
    for (size_t row = 0; row < n; row++) {
        size_t col = 0;

        /* Full 4-wide column tiles. */
        for (; col + 4 <= n; col += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);

            for (size_t k = 0; k < n; k++) {
                acc = vmlaq_f32(acc,
                                vdupq_n_f32(lhs[row * n + k]),
                                vld1q_f32(&rhs[k * n + col]));
            }
            vst1q_f32(&out[row * n + col], acc);
        }

        /* Scalar tail for the remaining (n % 4) columns. */
        for (; col < n; col++) {
            float sum = 0.0f;

            for (size_t k = 0; k < n; k++) {
                sum += lhs[row * n + k] * rhs[k * n + col];
            }
            out[row * n + col] = sum;
        }
    }
}

/* Print a row-major n x n matrix, tab-separated, one row per line. */
static void print_matrix(size_t n, const float *m)
{
    for (size_t row = 0; row < n; row++) {
        for (size_t col = 0; col < n; col++) {
            printf("%f\t", m[row * n + col]);
        }
        printf("\n");
    }
}

/*
 * Benchmark driver: multiplies two constant-filled N x N matrices
 * BENCH_ITERATIONS times with the scalar kernel and then with the NEON
 * kernel, printing the elapsed milliseconds and the resulting matrix for
 * each. Both kernels receive the same inputs, so the two printed matrices
 * are directly comparable.
 */
int main(void)
{
    float a[N][N], b[N][N];
    float d[N][N] = {{0}};  /* scalar result */
    float e[N][N] = {{0}};  /* NEON result */
    struct timeval t0, t1;

    for (size_t i = 0; i < N; i++) {
        for (size_t j = 0; j < N; j++) {
            a[i][j] = 1;
            b[i][j] = 2;
        }
    }

    gettimeofday(&t0, NULL);
    for (int rep = 0; rep < BENCH_ITERATIONS; rep++) {
        matmul_scalar(N, &a[0][0], &b[0][0], &d[0][0]);
    }
    gettimeofday(&t1, NULL);
    printf("basic time used: %0.3f.\n", sub_time(t1, t0));
    print_matrix(N, &d[0][0]);

    gettimeofday(&t0, NULL);
    for (int rep = 0; rep < BENCH_ITERATIONS; rep++) {
        matmul_neon(N, &a[0][0], &b[0][0], &e[0][0]);
    }
    gettimeofday(&t1, NULL);
    printf("neon time used: %0.3f.\n", sub_time(t1, t0));
    print_matrix(N, &e[0][0]);

    return 0;
}

三、其它NEON accelerate实现后续更新

总结

Corrections and feedback are welcome!

猜你喜欢

转载自blog.csdn.net/weixin_45206081/article/details/128253348