NEON-optimized 2-D convolution algorithms

Convolution is used heavily in image processing. Because the data volume is large and the arithmetic is heavy, an unoptimized convolution routine is slow. NEON's SIMD parallelism can be used to speed it up.

Unoptimized C implementation:

#include <math.h>       /* fabs */
#include <stdbool.h>    /* bool (when compiled as C; built in under C++) */

bool convolve2DSlow(unsigned char* in, unsigned char* out, int dataSizeX, int dataSizeY,
                    float* kernel, int kernelSizeX, int kernelSizeY)
{
    int i, j, m, n, mm, nn;
    int kCenterX, kCenterY;                         // center index of kernel
    float sum;                                      // temp accumulation buffer
    int rowIndex, colIndex;
    // check validity of params
    if(!in || !out || !kernel) return false;
    if(dataSizeX <= 0 || kernelSizeX <= 0) return false;
    // find center position of kernel (half of kernel size)
    kCenterX = kernelSizeX / 2;
    kCenterY = kernelSizeY / 2;
    for(i=0; i < dataSizeY; ++i)                // rows
    {
        for(j=0; j < dataSizeX; ++j)            // columns
        {
            sum = 0;                            // init to 0 before sum
            for(m=0; m < kernelSizeY; ++m)      // kernel rows
            {
                mm = kernelSizeY - 1 - m;       // row index of flipped kernel
                for(n=0; n < kernelSizeX; ++n)  // kernel columns
                {
                    nn = kernelSizeX - 1 - n;   // column index of flipped kernel
                    // index of input signal, used for checking boundary
                    rowIndex = i + m - kCenterY;
                    colIndex = j + n - kCenterX;
                    // ignore input samples which are out of bound
                    if(rowIndex >= 0 && rowIndex < dataSizeY && colIndex >= 0 && colIndex < dataSizeX)
                        sum += in[dataSizeX * rowIndex + colIndex] * kernel[kernelSizeX * mm + nn];
                }
            }
            out[dataSizeX * i + j] = (unsigned char)((float)fabs(sum) + 0.5f);
        }
    }
    return true;
}
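For reference, a minimal call might look like the following sketch (the buffer sizes and the 3*3 box kernel are illustrative placeholders, not part of the original code):

/* Hypothetical usage: blur a 640*480 grayscale image with a 3*3 box kernel */
float box3[9] = {
    1.0f/9, 1.0f/9, 1.0f/9,
    1.0f/9, 1.0f/9, 1.0f/9,
    1.0f/9, 1.0f/9, 1.0f/9
};
static unsigned char src[640 * 480];   /* assume this is filled with pixel data */
static unsigned char dst[640 * 480];
convolve2DSlow(src, dst, 640, 480, box3, 3, 3);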

NEON optimization

Approach: NEON can process four 32-bit values at a time. Since the kernel coefficients are float and the grayscale pixels are uint8_t, NEON can load 8 pixels at once, widen them to float, and then do the multiply-accumulates. Because NEON loads a fixed number (8) of consecutive bytes from memory, the pixel matrix is zero-padded to sidestep both leftover tail data and boundary handling in the convolution loop: each row gets kernelSizeX/2 zeros at its head and tail, and the row length is then rounded up to a multiple of 8 with additional zeros. This trades extra memory for simpler code and lower run time, which is the main optimization target here. A worked instance of the sizing arithmetic is sketched below.
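As a concrete example (using the 1920-wide test image and 5*5 kernel from the test section at the end; variable names mirror convolve2D_neon below):

int kCenterX = 5 / 2;                   /* = 2 zeros at the head and tail of each row */
int k = 1920 % 8;                       /* = 0: the row is already a multiple of 8 */
if (k) k = 8 - k;                       /* otherwise, zeros needed to pad the tail up to 8 */
int stride = 1920 + 2 * kCenterX + k;   /* = 1924 bytes per padded row */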
NEON-optimized implementation:

#include <arm_neon.h>
#include <stdlib.h>     /* malloc/free */
#include <string.h>     /* memset */

/* convolve2D_neon: NEON-optimized 2-D convolution */
int convolve2D_neon(unsigned char* src, unsigned char* dst, int datasizeX, int datasizeY,
                    float* kernel, int kernelSizeX, int kernelSizeY)
{

    if(!dst || !src || !kernel) return -1;
    if(datasizeX <= 0 || kernelSizeX <= 0) return -1;
    int x,y,kx,ky,k,mm,nn,yIndex,xIndex;
    float32x4_t k_neon;    
    uint8x8_t    neon_x_u8;
    uint16x8_t    neon_x_u16_8;
    uint16x4_t    neon_x_u16_4;
    uint16x4_t    neon_x_u16_4_2;
    uint32x4_t    neon_x_u32;
    float32x4_t    neon_x_f32;
    float32x4_t neon_sum_f32_high;
    float32x4_t neon_sum_f32_low;
    int kCenterX, kCenterY;

    kCenterX = kernelSizeX >> 1;
    kCenterY = kernelSizeY >> 1;
    k = datasizeX % 8;      /* tail bytes needed to round each row up to a multiple of 8 */
    if(k)
        k = 8 - k;
    int stride = datasizeX + 2 * kCenterX + k;      /* row stride of the padded buffer */
    unsigned char* in = (unsigned char*)malloc(stride * (datasizeY + 2 * kCenterY)); /* zero-padded copy: avoids boundary checks and leftover tail data */
    if(!in) return -1;
    memset(in, 0, stride * (datasizeY + 2 * kCenterY)); /* bug fix: the original passed sizeof(in), which is only the pointer size */
    for(y=0;y<datasizeY;y++)
    {
        for(x=0;x<datasizeX;x++)
            in[(y + kCenterY) * stride + x + kCenterX] = src[y * datasizeX + x];
    }
    for(y=0;y<datasizeY;y++)
    {
        for(x=0;x<datasizeX;x=x+8)
        {
            neon_sum_f32_high=vdupq_n_f32(0);
            neon_sum_f32_low =vdupq_n_f32(0);
            for(ky=0;ky<kernelSizeY;ky++)        // kernel rows
            {
                mm = kernelSizeY - 1 - ky;        // row index of flipped kernel
                for(kx=0; kx < kernelSizeX; ++kx)  // kernel columns
                {
                    nn = kernelSizeX - 1 - kx;   // column index of flipped kernel                    
                    yIndex = y + ky;
                    xIndex = x + kx;

                    neon_x_u8 = vld1_u8(in + yIndex * stride + xIndex); /* load 8 8-bit pixels */
                    neon_x_u16_8 = vmovl_u8(neon_x_u8);                 /* widen u8 -> u16 */
                    k_neon = vld1q_dup_f32(kernel + kernelSizeX * mm + nn); /* broadcast kernel[kernelSizeX*mm + nn] to all 4 lanes */
                    /* process the low 4 pixels */
                    neon_x_u16_4 = vget_low_u16(neon_x_u16_8);
                    neon_x_u32 = vmovl_u16(neon_x_u16_4);               /* widen u16 -> u32 */
                    neon_x_f32 = vcvtq_f32_u32(neon_x_u32);             /* convert to float32 */
                    neon_sum_f32_low = vmlaq_f32(neon_sum_f32_low, neon_x_f32, k_neon); /* sum += pixels * k */
                    /* process the high 4 pixels */
                    neon_x_u16_4 = vget_high_u16(neon_x_u16_8);
                    neon_x_u32 = vmovl_u16(neon_x_u16_4);               /* widen u16 -> u32 */
                    neon_x_f32 = vcvtq_f32_u32(neon_x_u32);             /* convert to float32 */
                    neon_sum_f32_high = vmlaq_f32(neon_sum_f32_high, neon_x_f32, k_neon); /* sum += pixels * k */
                }
            }
            /* match the scalar reference: fabs(sum) + 0.5f before the unsigned conversion */
            neon_sum_f32_low = vaddq_f32(vabsq_f32(neon_sum_f32_low), vdupq_n_f32(0.5f));
            neon_sum_f32_high = vaddq_f32(vabsq_f32(neon_sum_f32_high), vdupq_n_f32(0.5f));
            /* narrow the two float32x4 sums back down to uint8x8, saturating to [0,255] */
            neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_low);
            neon_x_u16_4 = vqmovn_u32(neon_x_u32);
            neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_high);
            neon_x_u16_4_2 = vqmovn_u32(neon_x_u32);
            neon_x_u16_8 = vcombine_u16(neon_x_u16_4, neon_x_u16_4_2);
            neon_x_u8 = vqmovn_u16(neon_x_u16_8);
            vst1_u8(dst + y * datasizeX + x, neon_x_u8);    /* store 8 result pixels; dst is not padded, so datasizeX should be a multiple of 8 */
        }
    }
    free(in);
    in=NULL;
    return 0;
}
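A minimal call sketch mirroring the test setup described at the end (buffer names are placeholders; a real caller would fill frame with actual image data):

/* Hypothetical usage: 1920*1080 grayscale frame, 5*5 kernel as in the test below */
float kernel5[25];
for (int i = 0; i < 25; i++) kernel5[i] = 0.25f;
unsigned char* frame   = (unsigned char*)malloc(1920 * 1080);
unsigned char* blurred = (unsigned char*)malloc(1920 * 1080);
/* ... fill frame with pixel data ... */
convolve2D_neon(frame, blurred, 1920, 1080, kernel5, 5, 5);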

Separable convolution:

If the 2-D kernel matrix can be factored into the product of two 1-D vectors, the 2-D convolution can be computed as a 1-D convolution along x (rows) followed by a 1-D convolution along y (columns). This cuts the per-pixel work from kSizeX*kSizeY multiply-accumulates down to kSizeX + kSizeY.
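For example, the 3*3 Gaussian blur kernel factors into the outer product of two 1-D kernels (a standard identity, shown here for illustration):

/* A 3*3 Gaussian kernel is the outer product of two 1-D kernels:
   (1/16) * {1,2,1; 2,4,2; 1,2,1} = (1/4){1,2,1}^T * (1/4){1,2,1},
   i.e. kernel2D[i][j] = kernelY[i] * kernelX[j]. */
float kernelX[3] = {0.25f, 0.5f, 0.25f};
float kernelY[3] = {0.25f, 0.5f, 0.25f};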
C/C++ implementation:

#include <cmath>        /* fabs */

bool convolve2DSeparable(unsigned char* in, unsigned char* out, int dataSizeX, int dataSizeY,
                         float* kernelX, int kSizeX, float* kernelY, int kSizeY)
{
    int i, j, k, m, n;
    float *tmp, *sum;                               // intermediate data buffer
    unsigned char *inPtr, *outPtr;                  // working pointers
    float *tmpPtr, *tmpPtr2;                        // working pointers
    int kCenter, kOffset, endIndex;                 // kernel indices
    // check validity of params
    if(!in || !out || !kernelX || !kernelY) return false;
    if(dataSizeX <= 0 || kSizeX <= 0) return false;
    // allocate temp storage to keep intermediate result
    tmp = new float[dataSizeX * dataSizeY];
    if(!tmp) return false;  // memory allocation error
    // store accumulated sum
    sum = new float[dataSizeX];
    if(!sum) { delete [] tmp; return false; }  // free tmp on failure to avoid a leak
    // convolve horizontal direction ///////////////////////
    // find center position of kernel (half of kernel size)
    kCenter = kSizeX >> 1;                          // center index of kernel array
    endIndex = dataSizeX - kCenter;                 // index for full kernel convolution
    // init working pointers
    inPtr = in;
    tmpPtr = tmp;                                   // store intermediate results from 1D horizontal convolution
    // start horizontal convolution (x-direction)
    for(i=0; i < dataSizeY; ++i)                    // number of rows
    {
        kOffset = 0;                                // starting index of partial kernel varies for each sample
        // COLUMN FROM index=0 TO index=kCenter-1
        for(j=0; j < kCenter; ++j)
        {
            *tmpPtr = 0;                            // init to 0 before accumulation
            for(k = kCenter + kOffset, m = 0; k >= 0; --k, ++m) // convolve with partial of kernel
            {
                *tmpPtr += *(inPtr + m) * kernelX[k];
            }
            ++tmpPtr;                               // next output
            ++kOffset;                              // increase starting index of kernel
        }
        // COLUMN FROM index=kCenter TO index=(dataSizeX-kCenter-1)
        for(j = kCenter; j < endIndex; ++j)
        {
            *tmpPtr = 0;                            // init to 0 before accumulate
            for(k = kSizeX-1, m = 0; k >= 0; --k, ++m)  // full kernel
            {
                *tmpPtr += *(inPtr + m) * kernelX[k];
            }
            ++inPtr;                                // next input
            ++tmpPtr;                               // next output
        }
        kOffset = 1;                                // ending index of partial kernel varies for each sample
        // COLUMN FROM index=(dataSizeX-kCenter) TO index=(dataSizeX-1)
        for(j = endIndex; j < dataSizeX; ++j)
        {
            *tmpPtr = 0;                            // init to 0 before accumulation
            for(k = kSizeX-1, m=0; k >= kOffset; --k, ++m)   // convolve with partial of kernel
            {
                *tmpPtr += *(inPtr + m) * kernelX[k];
            }
            ++inPtr;                                // next input
            ++tmpPtr;                               // next output
            ++kOffset;                              // increase ending index of partial kernel
        }
        inPtr += kCenter;                           // next row
    }
    // END OF HORIZONTAL CONVOLUTION //////////////////////
    // start vertical direction ///////////////////////////
    // find center position of kernel (half of kernel size)
    kCenter = kSizeY >> 1;                          // center index of vertical kernel
    endIndex = dataSizeY - kCenter;                 // index where full kernel convolution should stop
    // set working pointers
    tmpPtr = tmpPtr2 = tmp;
    outPtr = out;
    // clear out array before accumulation
    for(i = 0; i < dataSizeX; ++i)
        sum[i] = 0;
    // start to convolve vertical direction (y-direction)
    // ROW FROM index=0 TO index=(kCenter-1)
    kOffset = 0;                                    // starting index of partial kernel varies for each sample
    for(i=0; i < kCenter; ++i)
    {
        for(k = kCenter + kOffset; k >= 0; --k)     // convolve with partial kernel
        {
            for(j=0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for(n = 0; n < dataSizeX; ++n)              // convert and copy from sum to out
        {
            // convert negative to positive
            *outPtr = (unsigned char)((float)fabs(sum[n]) + 0.5f);
            sum[n] = 0;                             // reset to zero for next summing
            ++outPtr;                               // next element of output
        }
        tmpPtr = tmpPtr2;                           // reset input pointer
        ++kOffset;                                  // increase starting index of kernel
    }
    // ROW FROM index=kCenter TO index=(dataSizeY-kCenter-1)
    for(i = kCenter; i < endIndex; ++i)
    {
        for(k = kSizeY -1; k >= 0; --k)             // convolve with full kernel
        {
            for(j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for(n = 0; n < dataSizeX; ++n)              // convert and copy from sum to out
        {
            // convert negative to positive
            *outPtr = (unsigned char)((float)fabs(sum[n]) + 0.5f);
            sum[n] = 0;                             // reset for next summing
            ++outPtr;                               // next output
        }
        // move to next row
        tmpPtr2 += dataSizeX;
        tmpPtr = tmpPtr2;
    }
    // ROW FROM index=(dataSizeY-kCenter) TO index=(dataSizeY-1)
    kOffset = 1;                                    // ending index of partial kernel varies for each sample
    for(i=endIndex; i < dataSizeY; ++i)
    {
        for(k = kSizeY-1; k >= kOffset; --k)        // convolve with partial kernel
        {
            for(j=0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for(n = 0; n < dataSizeX; ++n)              // convert and copy from sum to out
        {
            // convert negative to positive
            *outPtr = (unsigned char)((float)fabs(sum[n]) + 0.5f);
            sum[n] = 0;                             // reset for next summing
            ++outPtr;                               // next output
        }
        // move to next row
        tmpPtr2 += dataSizeX;
        tmpPtr = tmpPtr2;                           // next input
        ++kOffset;                                  // increase ending index of kernel
    }
    // END OF VERTICAL CONVOLUTION ////////////////////////
    // deallocate temp buffers
    delete [] tmp;
    delete [] sum;
    return true;
}

NEON optimization of the separable 2-D convolution: first optimize the 1-D convolution with the same approach, zero-padding to avoid leftover tail data and boundary handling, then convolve 8 pixels in parallel. Since NEON can only load 8 consecutive uint8_t values from memory, only the x-direction (row) convolution vectorizes directly, so the image is transposed between the two passes. The transpose itself can also be NEON-optimized (reportedly by up to 10x, according to some online sources); a sketch of one such 8*8 tile transpose follows.
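In the implementation below the transpose is a plain scalar loop. A NEON transpose would typically process the image in 8*8 byte tiles with the vtrn intrinsic family; the following minimal sketch of one tile transpose is illustrative and not part of the original code:

/* Sketch: transpose one 8x8 tile of bytes using the vtrn network (hypothetical helper) */
static void transpose_8x8_u8(const uint8_t* src, int srcStride, uint8_t* dst, int dstStride)
{
    uint8x8_t r0 = vld1_u8(src + 0 * srcStride), r1 = vld1_u8(src + 1 * srcStride);
    uint8x8_t r2 = vld1_u8(src + 2 * srcStride), r3 = vld1_u8(src + 3 * srcStride);
    uint8x8_t r4 = vld1_u8(src + 4 * srcStride), r5 = vld1_u8(src + 5 * srcStride);
    uint8x8_t r6 = vld1_u8(src + 6 * srcStride), r7 = vld1_u8(src + 7 * srcStride);
    /* stage 1: interleave 8-bit lanes between adjacent row pairs */
    uint8x8x2_t t01 = vtrn_u8(r0, r1), t23 = vtrn_u8(r2, r3);
    uint8x8x2_t t45 = vtrn_u8(r4, r5), t67 = vtrn_u8(r6, r7);
    /* stage 2: interleave 16-bit lanes between the pair groups */
    uint16x4x2_t s02 = vtrn_u16(vreinterpret_u16_u8(t01.val[0]), vreinterpret_u16_u8(t23.val[0]));
    uint16x4x2_t s13 = vtrn_u16(vreinterpret_u16_u8(t01.val[1]), vreinterpret_u16_u8(t23.val[1]));
    uint16x4x2_t s46 = vtrn_u16(vreinterpret_u16_u8(t45.val[0]), vreinterpret_u16_u8(t67.val[0]));
    uint16x4x2_t s57 = vtrn_u16(vreinterpret_u16_u8(t45.val[1]), vreinterpret_u16_u8(t67.val[1]));
    /* stage 3: interleave 32-bit lanes; each result is one column of the tile */
    uint32x2x2_t q04 = vtrn_u32(vreinterpret_u32_u16(s02.val[0]), vreinterpret_u32_u16(s46.val[0]));
    uint32x2x2_t q15 = vtrn_u32(vreinterpret_u32_u16(s13.val[0]), vreinterpret_u32_u16(s57.val[0]));
    uint32x2x2_t q26 = vtrn_u32(vreinterpret_u32_u16(s02.val[1]), vreinterpret_u32_u16(s46.val[1]));
    uint32x2x2_t q37 = vtrn_u32(vreinterpret_u32_u16(s13.val[1]), vreinterpret_u32_u16(s57.val[1]));
    vst1_u8(dst + 0 * dstStride, vreinterpret_u8_u32(q04.val[0]));
    vst1_u8(dst + 1 * dstStride, vreinterpret_u8_u32(q15.val[0]));
    vst1_u8(dst + 2 * dstStride, vreinterpret_u8_u32(q26.val[0]));
    vst1_u8(dst + 3 * dstStride, vreinterpret_u8_u32(q37.val[0]));
    vst1_u8(dst + 4 * dstStride, vreinterpret_u8_u32(q04.val[1]));
    vst1_u8(dst + 5 * dstStride, vreinterpret_u8_u32(q15.val[1]));
    vst1_u8(dst + 6 * dstStride, vreinterpret_u8_u32(q26.val[1]));
    vst1_u8(dst + 7 * dstStride, vreinterpret_u8_u32(q37.val[1]));
}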
NEON-optimized implementation:

#include <arm_neon.h>
#include <stdlib.h>     /* malloc/free */
#include <string.h>     /* memset/memcpy */

/* convolve1D: NEON-optimized 1-D convolution */
int convolve1D(unsigned char* src, unsigned char* dst, int dataSize, float* kernel, int kernelSize)
{
    int kCenter = kernelSize >> 1;
    int k = dataSize % 8;
    int x, kx;
    if(k)
        k = 8 - k;
    /* zero-padded copy of the input: kCenter zeros at the head, plus tail padding and slack for the last 8-wide load */
    unsigned char* in = (unsigned char*)malloc(dataSize + 2 * kCenter + k + 8);
    if(!in) return -1;
    memset(in, 0, dataSize + 2 * kCenter + k + 8);  /* clear the whole allocation, including the slack */
    memcpy(in + kCenter, src, dataSize);
    float32x4_t k_neon;    
    uint8x8_t    neon_x_u8;
    uint16x8_t    neon_x_u16_8;
    uint16x4_t    neon_x_u16_4;
    uint16x4_t    neon_x_u16_4_2;
    uint32x4_t    neon_x_u32;
    float32x4_t    neon_x_f32;
    float32x4_t neon_sum_f32_high;
    float32x4_t neon_sum_f32_low;

    for(x=0;x<dataSize;x=x+8)
    {
        neon_sum_f32_high=vdupq_n_f32(0);
        neon_sum_f32_low =vdupq_n_f32(0);
        for(kx=0;kx<kernelSize;kx++)
        {
            neon_x_u8 = vld1_u8(in + x + kx);           /* load 8 8-bit pixels */
            neon_x_u16_8 = vmovl_u8(neon_x_u8);         /* widen u8 -> u16 */
            k_neon = vld1q_dup_f32(&kernel[kernelSize - kx - 1]); /* broadcast flipped kernel tap to all 4 lanes */
            /* process the low 4 pixels */
            neon_x_u16_4 = vget_low_u16(neon_x_u16_8);
            neon_x_u32 = vmovl_u16(neon_x_u16_4);       /* widen u16 -> u32 */
            neon_x_f32 = vcvtq_f32_u32(neon_x_u32);     /* convert to float32 */
            neon_sum_f32_low = vmlaq_f32(neon_sum_f32_low, neon_x_f32, k_neon); /* sum += pixels * k */
            /* process the high 4 pixels */
            neon_x_u16_4 = vget_high_u16(neon_x_u16_8);
            neon_x_u32 = vmovl_u16(neon_x_u16_4);       /* widen u16 -> u32 */
            neon_x_f32 = vcvtq_f32_u32(neon_x_u32);     /* convert to float32 */
            neon_sum_f32_high = vmlaq_f32(neon_sum_f32_high, neon_x_f32, k_neon); /* sum += pixels * k */
        }
        /* match the scalar reference: fabs(sum) + 0.5f before the unsigned conversion */
        neon_sum_f32_low = vaddq_f32(vabsq_f32(neon_sum_f32_low), vdupq_n_f32(0.5f));
        neon_sum_f32_high = vaddq_f32(vabsq_f32(neon_sum_f32_high), vdupq_n_f32(0.5f));
        /* narrow the two float32x4 sums back down to uint8x8, saturating to [0,255] */
        neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_low);
        neon_x_u16_4 = vqmovn_u32(neon_x_u32);
        neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_high);
        neon_x_u16_4_2 = vqmovn_u32(neon_x_u32);
        neon_x_u16_8 = vcombine_u16(neon_x_u16_4, neon_x_u16_4_2);
        neon_x_u8 = vqmovn_u16(neon_x_u16_8);
        vst1_u8(dst + x, neon_x_u8);    /* store 8 result pixels; assumes dst has room up to the next multiple of 8 */
    }
    free(in);
    in=NULL;
    return 0;
}
/* convolve2DSeparable: NEON-optimized separable 2-D convolution.
   Convolve each row, transpose, convolve each former column, transpose back. */
int convolve2DSeparable(unsigned char* src, unsigned char* dst, int dataSizeX, int dataSizeY,
                        float* kernelX, int kSizeX, float* kernelY, int kSizeY)
{
    int x, y;
    unsigned char* in_tmp = (unsigned char*)malloc(dataSizeX * dataSizeY);
    if(!in_tmp) return -1;
    for(y = 0; y < dataSizeY; y++)      /* 1-D convolution along each row */
        convolve1D(src + y * dataSizeX, in_tmp + y * dataSizeX, dataSizeX, kernelX, kSizeX);
    for(y = 0; y < dataSizeY; y++)      /* transpose (scalar here; see the NEON tile-transpose sketch above) */
    {
        for(x = 0; x < dataSizeX; x++)
        {
            dst[x * dataSizeY + y] = in_tmp[y * dataSizeX + x];
        }
    }
    for(x = 0; x < dataSizeX; x++)      /* 1-D convolution along each former column */
    {
        convolve1D(dst + x * dataSizeY, in_tmp + x * dataSizeY, dataSizeY, kernelY, kSizeY);
    }
    for(y = 0; y < dataSizeY; y++)      /* transpose back into the output */
    {
        for(x = 0; x < dataSizeX; x++)
        {
            dst[y * dataSizeX + x] = in_tmp[x * dataSizeY + y];
        }
    }
    free(in_tmp);
    return 0;
}
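Tying it together, a call sketch using separable 5-tap kernels (the tap values are illustrative, and frame/blurred are the same placeholder buffers as in the earlier example):

/* Hypothetical usage: separable blur on the same 1920*1080 frame */
float kx[5] = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
float ky[5] = {0.2f, 0.2f, 0.2f, 0.2f, 0.2f};
convolve2DSeparable(frame, blurred, 1920, 1080, kx, 5, ky, 5);

Note that both dimensions here (1920 and 1080) are multiples of 8, which the 8-wide stores in convolve1D implicitly require.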

Testing:

The test image is a 1920*1080 grayscale image and the test platform is a HiSilicon 3559 board. The kernel is 5*5 with all values set to 0.25 (an averaging blur kernel). The NEON-optimized code and the original C code were each compiled both normally and with -O3, and their run times compared.
Compile commands:

arm-hisiv600-linux-g++ -mfloat-abi=softfp -mfpu=neon -o neon_test1080 neon_test.c
arm-hisiv600-linux-g++ -mfloat-abi=softfp -mfpu=neon -O3 neon_test.c
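The original post does not show the timing harness itself; a sketch of the kind of gettimeofday-based measurement used for such a comparison might look like this (function and buffer names follow the earlier examples):

#include <stdio.h>
#include <sys/time.h>

/* inside main(), after frame, blurred and kernel5 are set up as above */
struct timeval t0, t1;
gettimeofday(&t0, NULL);
convolve2DSlow(frame, blurred, 1920, 1080, kernel5, 5, 5);
gettimeofday(&t1, NULL);
printf("C version:    %ld us\n",
       (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec));
gettimeofday(&t0, NULL);
convolve2D_neon(frame, blurred, 1920, 1080, kernel5, 5, 5);
gettimeofday(&t1, NULL);
printf("NEON version: %ld us\n",
       (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec));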

Test results:

(Figure: run-time comparison of the NEON-optimized convolution; summarized below.)

The results depend on the platform and compiler. Across the platforms tested, the -O3 speedup varied by platform and compiler but was generally in the 2-10x range. The speedup of the NEON version over plain C appeared, in these tests, to depend on the size of the image being processed; this was not investigated further.

Reprinted from blog.csdn.net/q547569552/article/details/78366853