neon加速图像转置

一般的矩阵转置操作都是对矩阵中的元素逐个操作,假设矩阵大小为m×n,那么时间复杂度就是O(mn)。如果使用ARM公司提供的NEON加速技术,则可以并行地读取多个元素,并对多个元素同时进行操作。虽然时间复杂度仍然是O(mn),但常数因子会变小,而且在寄存器中完成的操作比逐次访问内存更快,所以会带来一定的性能提升。本文针对灰度图进行图像转置,图像数据类型为uint8_t。
c语言实现:

/* Scalar reference: the source element at row y, column x (row length w)
 * is written to row x, column y of the destination (row length h). */
for (y = 0; y < h; y++) {
    for (x = 0; x < w; x++) {
        dst_gray[x * h + y] = src_gray[y * w + x];
    }
}

neon加速思路:
考虑将一个矩阵划分成若干子矩阵,例如:一个128×256大小的矩阵可以划分为16×32个8×8大小的子矩阵。分别对每个8×8的子矩阵进行转置,再将其复制到输出矩阵中正确的坐标上即可。可以总结为2步:
循环执行以下步骤,直到所有子矩阵均被处理过:
1.转置当前子矩阵
2.将转置后的子矩阵复制到输出矩阵中
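NEON指令vtrn是解决转置问题的核心,其作用相当于对两个向量中的每个2×2小块进行转置。对于8×8矩阵,可以分三步完成:先对相邻两行(第0/1、2/3、4/5、6/7行)做uint8x8_t的vtrn;然后把结果重新解释为uint16x4_t,对相隔一行的两个向量(行距为2)做vtrn;再重新解释为uint32x2_t,对相隔三行的两个向量(行距为4)做vtrn;最后重新解释为uint8x8_t,输出到目标矩阵的对应位置。写回时要注意每个结果向量对应的是原矩阵的哪一列,必须按列号0~7的顺序存储。同时要注意剩余数据的处理:因为算法基于8×8子矩阵,当宽或高不是8的整数倍时,剩下不足以构成8×8子矩阵的数据需要单独处理,直接逐个元素转置即可。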

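vtrn函数功能:以vtrn_u8(a,b)为例,它返回两个向量,val[0]=[a0,b0,a2,b2,a4,b4,a6,b6],val[1]=[a1,b1,a3,b3,a5,b5,a7,b7],即把a、b中相邻的每个2×2小块做了转置。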

NEON优化代码(需要包含头文件arm_neon.h):

/*
 * Transpose one 8x8 tile of bytes.
 *
 * src        - top-left byte of the source tile
 * src_stride - distance in bytes between consecutive source rows
 * dst        - top-left byte of the destination tile
 * dst_stride - distance in bytes between consecutive destination rows
 *
 * On return, dst[c * dst_stride + r] == src[r * src_stride + c]
 * for every r, c in 0..7.
 */
static void transposition_neon_tile8x8(const uint8_t *src, int src_stride,
                                       uint8_t *dst, int dst_stride)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Load the eight source rows, one 8-byte vector per row. */
    const uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
    const uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
    const uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
    const uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
    const uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
    const uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
    const uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
    const uint8x8_t r7 = vld1_u8(src + 7 * src_stride);

    /* Stage 1: byte-wise vtrn on adjacent row pairs.
     * val[0] gathers the even columns, val[1] the odd columns. */
    const uint8x8x2_t b01 = vtrn_u8(r0, r1);
    const uint8x8x2_t b23 = vtrn_u8(r2, r3);
    const uint8x8x2_t b45 = vtrn_u8(r4, r5);
    const uint8x8x2_t b67 = vtrn_u8(r6, r7);

    /* Stage 2: 16-bit vtrn on vectors two rows apart.
     * hXe.val[0] = columns 0|4, hXe.val[1] = columns 2|6,
     * hXo.val[0] = columns 1|5, hXo.val[1] = columns 3|7
     * (X = "lo" for source rows 0-3, "hi" for source rows 4-7). */
    const uint16x4x2_t hloe = vtrn_u16(vreinterpret_u16_u8(b01.val[0]),
                                       vreinterpret_u16_u8(b23.val[0]));
    const uint16x4x2_t hloo = vtrn_u16(vreinterpret_u16_u8(b01.val[1]),
                                       vreinterpret_u16_u8(b23.val[1]));
    const uint16x4x2_t hhie = vtrn_u16(vreinterpret_u16_u8(b45.val[0]),
                                       vreinterpret_u16_u8(b67.val[0]));
    const uint16x4x2_t hhio = vtrn_u16(vreinterpret_u16_u8(b45.val[1]),
                                       vreinterpret_u16_u8(b67.val[1]));

    /* Stage 3: 32-bit vtrn joins rows 0-3 with rows 4-7.
     * cNM.val[0] is the complete column N, cNM.val[1] the complete column M. */
    const uint32x2x2_t c04 = vtrn_u32(vreinterpret_u32_u16(hloe.val[0]),
                                      vreinterpret_u32_u16(hhie.val[0]));
    const uint32x2x2_t c15 = vtrn_u32(vreinterpret_u32_u16(hloo.val[0]),
                                      vreinterpret_u32_u16(hhio.val[0]));
    const uint32x2x2_t c26 = vtrn_u32(vreinterpret_u32_u16(hloe.val[1]),
                                      vreinterpret_u32_u16(hhie.val[1]));
    const uint32x2x2_t c37 = vtrn_u32(vreinterpret_u32_u16(hloo.val[1]),
                                      vreinterpret_u32_u16(hhio.val[1]));

    /* Store source column k as destination row k, in column order 0..7. */
    vst1_u8(dst + 0 * dst_stride, vreinterpret_u8_u32(c04.val[0]));
    vst1_u8(dst + 1 * dst_stride, vreinterpret_u8_u32(c15.val[0]));
    vst1_u8(dst + 2 * dst_stride, vreinterpret_u8_u32(c26.val[0]));
    vst1_u8(dst + 3 * dst_stride, vreinterpret_u8_u32(c37.val[0]));
    vst1_u8(dst + 4 * dst_stride, vreinterpret_u8_u32(c04.val[1]));
    vst1_u8(dst + 5 * dst_stride, vreinterpret_u8_u32(c15.val[1]));
    vst1_u8(dst + 6 * dst_stride, vreinterpret_u8_u32(c26.val[1]));
    vst1_u8(dst + 7 * dst_stride, vreinterpret_u8_u32(c37.val[1]));
#else
    /* Scalar fallback so the function still works when NEON is unavailable. */
    int r, c;
    for (r = 0; r < 8; r++) {
        for (c = 0; c < 8; c++) {
            dst[c * dst_stride + r] = src[r * src_stride + c];
        }
    }
#endif
}

/*
 * Transpose a w x h 8-bit image.
 *
 * src - source image, h rows of w bytes each (row-major, no padding)
 * dst - destination image, w rows of h bytes each; must not overlap src
 * w   - source width in pixels
 * h   - source height in pixels
 *
 * For every 0 <= x < w and 0 <= y < h: dst[x * h + y] = src[y * w + x].
 *
 * Returns 0 on success (including when w or h is not positive, in which case
 * nothing is written) and -1 when src or dst is a null pointer.
 */
int transposition_neon(uint8_t* src,uint8_t* dst,int w,int h)
{
    int sw, sh;
    int x, y;

    if (!src || !dst)
        return -1;
    if (w <= 0 || h <= 0)
        return 0;

    /* Largest multiples of 8 not exceeding w and h: the area covered by tiles. */
    sw = w - (w & 7);
    sh = h - (h & 7);

    for (y = 0; y < sh; y += 8)
    {
        for (x = 0; x < sw; x += 8)
        {
            transposition_neon_tile8x8(src + y * w + x, w, dst + x * h + y, h);
        }
    }

    /* Bottom strip: rows sh..h-1 across the full width. */
    for (y = sh; y < h; y++)
    {
        for (x = 0; x < w; x++)
            dst[x * h + y] = src[y * w + x];
    }

    /* Right strip: columns sw..w-1 for the rows already covered by tiles. */
    for (x = sw; x < w; x++)
    {
        for (y = 0; y < sh; y++)
            dst[x * h + y] = src[y * w + x];
    }

    return 0;
}

测试图像分辨率:1680×1050
测试平台:海思3559
测试结果:-O3优化编译下,约提速2.5倍(-O3的优化效果似乎与平台有关,尚未深入研究);默认编译选项下,约提速1.5倍。网上有资料称能提速10倍,暂时不知道如何实现。

参考资料:
http://blog.csdn.net/jxt1234and2010/article/details/50437884
http://book.51cto.com/art/201506/481001.htm
http://www.cnblogs.com/hrlnw/p/3723072.html
http://www.cnblogs.com/hrlnw/p/3767853.html

猜你喜欢

转载自blog.csdn.net/q547569552/article/details/78406285
今日推荐