ARM Neon指令集的一些简单的应用

Arm Neon Usages

Details of each Neon intrinsic can be looked up at https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?search=

1. Load data from a pointer into a Neon register / store a Neon register back through a pointer.

  input_s16x8 = vld1q_s16(pIn);

       vst1_s8(pOut, output_s8x8);

 

2. find sign/abs/max

       sign_s16x8 = vshrq_n_s16(input_s16x8, 16);

       abs_input_s16x8 = vqabsq_s16(input_s16x8); // with saturation: |min(int16)| = |-32768| saturates to 32767

       abs_input_s16x8 = vabsq_s16(input_s16x8); // without saturation

  c_s16x8 = vmaxq_s16(a_s16x8, b_s16x8); // find max{a[i], b[i]}

       c_s16x8 = vpmaxq_s16(a_s16x8, b_s16x8); // pairwise max: low half = max{a[2i], a[2i+1]}, high half = max{b[2i], b[2i+1]}

 

3. find zero padding bits (count leading zeros)

          clz_s16x8 = vclzq_s16(abs_max_s16x8);

 

4.  split/combine vectors

  abs_temp_s16x4_h = vget_high_s16(abs_temp_s16x8);

       abs_temp_s16x4_l = vget_low_s16(abs_temp_s16x8);

  abs_max_s16x8 = vcombine_s16(abs_temp_s16x4, abs_temp_s16x4);

 

5. converting vector (s16x8 <-> s8x8)

       idx_s8x8 = vmovn_s16(idx_s16x8); //narrow move

  bias_s16x8 = vmovl_s8(bias_s8x8); // long move

 

6. reinterpretation (int<->uint)

       idx_s16x8 = vreinterpretq_s16_u16(idx_u16x8);

7. Lookup table (frequently used for data shuffling)

  int8_t lut[8]= {1, 2, 3, 4, 5, 6, 7, 0};

  data_shuffle_idx_s8x8 = vld1_s8(lut);

       // rotate left by one element: the first element moves to the tail, the others move forward

  data_shuffle_s8x8 = vtbl1_s8(data_s8x8, data_shuffle_idx_s8x8);

8. comparison   

If(a>4)

              b = 49;

       else if (a>1)

              b = 90;

       else

              b = 340;

       greater_than_4_u8x8 = vcgt_s8(a_s8x8, 4_s8x8);

       greater_than_1_u8x8 = vcgt_s8(a_s8x8, 1_s8x8);

       idx_u8x8 = vadd_u8(greater_than_4_u8x8, greater_than_1_u8x8);

       int8_t lut[8]= {49, 90, 340, 0, 0, 0, 0, 0};

  lut_s8x8 = vld1_s8(lut);

  b_s8x8 = vtbl1_s8(lut_s8x8, idx_u8x8);

9. bit selection

       result_s16x8 = vbslq_s16(dest_s16x8, src1_s16x8, src2_s16x8);

0050(dest):   0000 0000 0101 0000 (bit 0: src2, bit 1: src1)

ffaa(src1):   1111 1111 1010 1010

0x7(src2):    0000 0000 0000 0111

result:       0000 0000 0000 0111

10. shift and insert (has a similar effect to bit selection)

       dbg_s16x8 = vsriq_n_s16(input_s16x8[0], input_s16x8[1], 5);

input_s16x8[0]: 0050 ffffff81 ffffff86 0022 ffffffc5 008c 0078 0047

input_s16x8[1]: ffffffaa 001a ffffff98 0068 0091 0067 ffffff75 ffffff9c

dbg_s16x8: 07fd fffff800 fffffffc 0003 fffff804 0003 07fb 07fc

input0: 0000 0000 0101 0000 (0x0050)

input1: 1111 1111 1010 1010 (0xffaa)

input1 after right shift 5:  0000 0111 1111 1101 (unsigned shift)

output after combine input0: 0000 0111 1111 1101(0x07fd)

input0: 1111 1111 1100 0101 (0xffc5)

input1: 0000 0000 1001 0001 (0x0091)

input1 after right shift 5:  0000 0000 0000 0100

output after combine input0: 1111 1000 0000 0100(0xf804) keeps the top 5 bits of input0 and takes the remaining bits from the right-shifted input1

       dbg_s16x8 = vsliq_n_s16(input_s16x8[0], input_s16x8[1], 5);

input_s16x8[0]: 0050 ffffff81 ffffff86 0022 ffffffc5 008c 0078 0047

input_s16x8[1]: ffffffaa 001a ffffff98 0068 0091 0067 ffffff75 ffffff9c

dbg_s16x8: fffff550 0341 fffff306 0d02 1225 0cec ffffeeb8 fffff387

input0: 0000 0000 0101 0000 (0x50)

input1: 1111 1111 1010 1010 (0xffaa)

input1 after left shift 5:  1111 0101 0100 0000

output: 1111 0101 0101 0000(0xf550)

11. element transposition/interleave

       C_s8x8x2[0] = vzip_s8(B0_s8x8[0], B1_s8x8[0]); // interleave elements

       C_s8x8x2[0] = vtrn_s8(B0_s8x8[0], B1_s8x8[0]); // transpose elements

       C_s8x8x2[0] = vuzp_s8(B0_s8x8[0], B1_s8x8[0]); // de-interleave elements

B0_s8x8[0]: ffffff80 ffffffa6 00 ffffffe0 ffffffe0 0f 30 fffffffb

B1_s8x8[0]: 68 fffffffa ffffff98 ffffffff fffffffe 00 ffffffb2 00

C0_s8x8[0].val[0] after zip: ffffff80 68 ffffffa6 fffffffa 00 ffffff98 ffffffe0 ffffffff

C0_s8x8[0].val[1] after zip: ffffffe0 fffffffe 0f 00 30 ffffffb2 fffffffb 00

C0_s8x8[0].val[0] after transposition: ffffff80 68 00 ffffff98 ffffffe0 fffffffe 30 ffffffb2

C0_s8x8[0].val[1] after transposition: ffffffa6 fffffffa ffffffe0 ffffffff 0f 00 fffffffb 00

// uzp is the inverse of zip (interleave): if you use the two vectors below as input to zip(), the original data is recovered.

C0_s8x8[0].val[0] after uzp: ffffff80 00 ffffffe0 30 68 ffffff98 fffffffe ffffffb2

C0_s8x8[0].val[1] after uzp: ffffffa6 ffffffe0 0f fffffffb fffffffa ffffffff 00 00

Examples:

We want to take the lowest 9 bits of each element in input_s16x8 and pack them into 9 bytes.

input_s16x8:

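7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9

(bit widths, one pair per 16-bit element: 7 high bits ignored, low 9 bits kept)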

Extract the 9-bit field from each of the 8 elements:

9bits | 9bits | 9bits | 9bits | 9bits | 9bits | 9bits | 9bits

Pack the eight 9-bit fields into 9 bytes:

byte | byte | byte | byte | byte | byte | byte | byte | byte

       uint16_t bitSel_head[8] = {0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff};

       uint16_t bitSel_tail[8] = {0x1fe, 0x1fc, 0x1f8, 0x1f0, 0x1e0, 0x1c0, 0x180, 0x100};

       int16_t bitshift_tail[8] = {-1, -2, -3, -4, -5, -6, -7, -8};

       int16_t bitshift_head[8] = {7, 6, 5, 4, 3, 2, 1, 0};

       int8_t data_shuffle[8] = {1, 2, 3, 4, 5, 6, 7, 0};

       uint8_t bitSel_combine[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};

       bitSel_head_u16x8 = vld1q_u16(bitSel_head);

       bitSel_tail_u16x8 = vld1q_u16(bitSel_tail);

       bitShift_tail_s16x8 = vld1q_s16(bitshift_tail);

       bitShift_head_s16x8 = vld1q_s16(bitshift_head);

       data_shuffle_s8x8 = vld1_s8(data_shuffle);

       bitSel_combine_u8x8 = vld1_u8(bitSel_combine);

       head_s16x8[0] = vbslq_s16(bitSel_head_u16x8, input_s16x8[0], zero_s16x8);

       tail_s16x8[0] = vbslq_s16(bitSel_tail_u16x8, input_s16x8[0], zero_s16x8);

head_s16x8:

7 | 8 | 1

7 | 7 | 2

7 | 6 | 3

7 | 5 | 4

7 | 4 | 5

7 | 3 | 6

7 | 2 | 7

7 | 1 | 8

tail_s16x8:

7 | 8 | 1

7 | 7 | 2

7 | 6 | 3

7 | 5 | 4

7 | 4 | 5

7 | 3 | 6

7 | 2 | 7

7 | 1 | 8

       head_s16x8[0] = vshlq_s16(head_s16x8[0], bitShift_head_s16x8);

       tail_s16x8[0] = vshlq_s16(tail_s16x8[0], bitShift_tail_s16x8);

      

       head_s8x8[0] = vmovn_s16(head_s16x8[0]);

  tail_s8x8[0] = vmovn_s16(tail_s16x8[0]);

       vst1_lane_s8(pOut+1, tail_s8x8[0], 0);

       tail_shuffle_s8x8[0] = vtbl1_s8(tail_s8x8[0], data_shuffle_s8x8);

       output_s8x8[0] = vbsl_s8(bitSel_combine_u8x8, head_s8x8[0], tail_shuffle_s8x8[0]);

       vst1_s8(pOut+2, output_s8x8[0]);

      

猜你喜欢

转载自www.cnblogs.com/biggerjun2015/p/11754435.html