ARM Neon指令集的一些简单的应用

Arm Neon Usages

Details of each Neon intrinsic can be looked up at https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?search=

1. Load data from a pointer into a Neon register / store a Neon register back to a pointer.

input_s16x8 = vld1q_s16(pIn);

vst1_s8(pOut, output_s8x8);

2. find sign/abs/max

sign_s16x8 = vshrq_n_s16(input_s16x8, 16);

abs_input_s16x8 = vqabsq_s16(input_s16x8); // saturating: |INT16_MIN| = |-32768| saturates to 32767

abs_input_s16x8 = vabsq_s16(input_s16x8); // without saturation

c_s16x8 = vmaxq_s16(a_s16x8, b_s16x8); // find max{a[i], b[i]}

c_s16x8 = vpmaxq_s16(a_s16x8, b_s16x8); // pairwise max: low half = max{a[2i], a[2i+1]}, high half = max{b[2i], b[2i+1]}

3. count leading zero bits

clz_s16x8 = vclzq_s16(abs_max_s16x8);

4. split/combine vectors

abs_temp_s16x4_h = vget_high_s16(abs_temp_s16x8);

abs_temp_s16x4_l = vget_low_s16(abs_temp_s16x8);

abs_max_s16x8 = vcombine_s16(abs_temp_s16x4, abs_temp_s16x4);

5. converting vector (s16x8 <-> s8x8)

idx_s8x8 = vmovn_s16(idx_s16x8); //narrow move

bias_s16x8 = vmovl_s8(bias_s8x8); // long move

6. reinterpretation (int<->uint)

idx_s16x8 = vreinterpretq_s16_u16(idx_u16x8);

7. Lookup table (frequently used for data shuffling)

int8_t lut[8]= {1, 2, 3, 4, 5, 6, 7, 0};

data_shuffle_idx_s8x8 = vld1_s8(lut);

// put the first element in the tail, others move forward

data_shuffle_s8x8 = vtbl1_s8(data_s8x8, data_shuffle_idx_s8x8);

8. comparison

if (a > 4)

b = 49;

else if (a>1)

b = 90;

else

b = 340;

greater_than_4_u8x8 = vcgt_s8(a_s8x8, 4_s8x8);

greater_than_1_u8x8 = vcgt_s8(a_s8x8, 1_s8x8);

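idx_u8x8 = vadd_u8(greater_than_4_u8x8, greater_than_1_u8x8); // NOTE: vcgt_s8 sets true lanes to 0xFF (not 1), so this sum is 0x00/0xFF/0xFE; negate the masks (or shift each right by 7) first to get usable table indices 0/1/2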

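int8_t lut[8]= {49, 90, 340, 0, 0, 0, 0, 0}; // NOTE: 340 does not fit in int8_t (range -128..127), and the entries must be ordered by index (0: else-branch, 1: a>1 only, 2: a>4)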

lut_s8x8 = vld1_s8(lut);

b_s8x8 = vtbl1_s8(lut_s8x8, idx_u8x8);

9. bit selection

result_s16x8 = vbslq_s16(dest_s16x8, src1_s16x8, src2_s16x8);

0050(dest, the selection mask): 0000 0000 0101 0000 (mask bit 0: take src2, mask bit 1: take src1)

ffaa(src1): 1111 1111 1010 1010

0x7(src2): 0000 0000 0000 0111

result: 0000 0000 0000 0111

10. shift and insert (similar in effect to bit selection)

dbg_s16x8 = vsriq_n_s16(input_s16x8[0], input_s16x8[1], 5);

input_s16x8[0]: 0050 ffffff81 ffffff86 0022 ffffffc5 008c 0078 0047

input_s16x8[1]: ffffffaa 001a ffffff98 0068 0091 0067 ffffff75 ffffff9c

dbg_s16x8: 07fd fffff800 fffffffc 0003 fffff804 0003 07fb 07fc

input0: 0000 0000 0101 0000 (0x0050)

input1: 1111 1111 1010 1010 (0xffaa)

input1 after right shift 5: 0000 0111 1111 1101 (logical shift: zeros are shifted in)

output after combine input0: 0000 0111 1111 1101(0x07fd)

input0: 1111 1111 1100 0101 (0xffc5)

input1: 0000 0000 1001 0001 (0x0091)

input1 after right shift 5: 0000 0000 0000 0100

output after combine input0: 1111 1000 0000 0100(0xf804) keeps the top 5 bits of input0 and takes the remaining 11 bits from the shifted input1

dbg_s16x8 = vsliq_n_s16(input_s16x8[0], input_s16x8[1], 5);

input_s16x8[0]: 0050 ffffff81 ffffff86 0022 ffffffc5 008c 0078 0047

input_s16x8[1]: ffffffaa 001a ffffff98 0068 0091 0067 ffffff75 ffffff9c

dbg_s16x8: fffff550 0341 fffff306 0d02 1225 0cec ffffeeb8 fffff387

input0: 0000 0000 0101 0000 (0x50)

input1: 1111 1111 1010 1010 (0xffaa)

input1 after left shift 5: 1111 0101 0100 0000

output: 1111 0101 0101 0000(0xf550)

11. element transposition/interleave

C_s8x8x2[0] = vzip_s8(B0_s8x8[0], B1_s8x8[0]); // interleave elements

C_s8x8x2[0] = vtrn_s8(B0_s8x8[0], B1_s8x8[0]); // transpose elements

C_s8x8x2[0] = vuzp_s8(B0_s8x8[0], B1_s8x8[0]); // de-interleave elements

B0_s8x8[0]: ffffff80 ffffffa6 00 ffffffe0 ffffffe0 0f 30 fffffffb

B1_s8x8[0]: 68 fffffffa ffffff98 ffffffff fffffffe 00 ffffffb2 00

C0_s8x8[0].val[0] after zip: ffffff80 68 ffffffa6 fffffffa 00 ffffff98 ffffffe0 ffffffff

C0_s8x8[0].val[1] after zip: ffffffe0 fffffffe 0f 00 30 ffffffb2 fffffffb 00

C0_s8x8[0].val[0] after transposition: ffffff80 68 00 ffffff98 ffffffe0 fffffffe 30 ffffffb2

C0_s8x8[0].val[1] after transposition: ffffffa6 fffffffa ffffffe0 ffffffff 0f 00 fffffffb 00

// uzp is the inverse of zip (interleave): if the two vectors below are used as the inputs to zip, the original data is recovered.

C0_s8x8[0].val[0] after uzp: ffffff80 00 ffffffe0 30 68 ffffff98 fffffffe ffffffb2

C0_s8x8[0].val[1] after uzp: ffffffa6 ffffffe0 0f fffffffb fffffffa ffffffff 00 00

Examples:

We want to take the low 9 bits of each element of input_s16x8 and pack the eight 9-bit values into 9 bytes.

input_s16x8:

Extract the low 9 bits of each of the 8 elements:

9bits

Pack the eight 9-bit values into 9 bytes:

byte

uint16_t bitSel_head[8] = {0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff};

uint16_t bitSel_tail[8] = {0x1fe, 0x1fc, 0x1f8, 0x1f0, 0x1e0, 0x1c0, 0x180, 0x100};

int16_t bitshift_tail[8] = {-1, -2, -3, -4, -5, -6, -7, -8};

int16_t bitshift_head[8] = {7, 6, 5, 4, 3, 2, 1, 0};

int8_t data_shuffle[8] = {1, 2, 3, 4, 5, 6, 7, 0};

uint8_t bitSel_combine[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};

bitSel_head_u16x8 = vld1q_u16(bitSel_head);

bitSel_tail_u16x8 = vld1q_u16(bitSel_tail);

bitShift_tail_s16x8 = vld1q_s16(bitshift_tail);

bitShift_head_s16x8 = vld1q_s16(bitshift_head);

data_shuffle_s8x8 = vld1_s8(data_shuffle);

bitSel_combine_u8x8 = vld1_u8(bitSel_combine);

head_s16x8[0] = vbslq_s16(bitSel_head_u16x8, input_s16x8[0], zero_s16x8);

tail_s16x8[0] = vbslq_s16(bitSel_tail_u16x8, input_s16x8[0], zero_s16x8);

head_s16x8:

tail_s16x8:

head_s16x8[0] = vshlq_s16(head_s16x8[0], bitShift_head_s16x8);

tail_s16x8[0] = vshlq_s16(tail_s16x8[0], bitShift_tail_s16x8);

head_s8x8[0] = vmovn_s16(head_s16x8[0]);

tail_s8x8[0] = vmovn_s16(tail_s16x8[0]);

vst1_lane_s8(pOut+1, tail_s8x8[0], 0);

tail_shuffle_s8x8[0] = vtbl1_s8(tail_s8x8[0], data_shuffle_s8x8);

output_s8x8[0] = vbsl_s8(bitSel_combine_u8x8, head_s8x8[0], tail_shuffle_s8x8[0]);

vst1_s8(pOut+2, output_s8x8[0]);

ARM Neon指令集的一些简单的应用

猜你喜欢