SSE 指令

数据类型

__m64 任意整型
__m128 4 个 32 bit 浮点型
__m128d 2 个 64 bit 浮点型
__m128i 任意整型

数学运算

__m128 _mm_add_ss(__m128 a, __m128 b);
单精度浮点低位加法
result = [ a0+b0 , a1 , a2 , a3 ]

__m128 _mm_add_ps(__m128 a, __m128 b);
单精度浮点加法
result = [ a0+b0 , a1+b1 , a2+b2 , a3+b3 ]

__m128 _mm_sub_ss(__m128 a, __m128 b);
单精度浮点低位减法
result = [ a0-b0 , a1 , a2 , a3 ]

__m128 _mm_sub_ps(__m128 a, __m128 b);
单精度浮点减法
result = [ a0-b0 , a1-b1 , a2-b2 , a3-b3 ]

__m128 _mm_mul_ss(__m128 a, __m128 b);
单精度浮点低位乘法
result = [ a0*b0 , a1 , a2 , a3 ]

__m128 _mm_mul_ps(__m128 a, __m128 b);
单精度浮点乘法
result = [ a0*b0 , a1*b1 , a2*b2 , a3*b3 ]

__m128 _mm_div_ss(__m128 a, __m128 b);
单精度浮点低位除法
result = [ a0/b0 , a1 , a2 , a3 ]

__m128 _mm_div_ps(__m128 a, __m128 b);
单精度浮点除法
result = [ a0/b0 , a1/b1 , a2/b2 , a3/b3 ]

__m128 _mm_sqrt_ss(__m128 a);
单精度浮点低位平方根
result = [ sqrt(a0) , a1 , a2 , a3 ]

__m128 _mm_sqrt_ps(__m128 a);
单精度浮点平方根
result = [ sqrt( a0) , sqrt(a1) , sqrt( a2) , sqrt(a3) ]

__m128 _mm_rcp_ss(__m128 a);
单精度浮点低位倒数近似值
result = [ recip(a0) , a1 , a2 , a3 ]

__m128 _mm_rcp_ps(__m128 a);
单精度浮点倒数近似值
result = [ recip(a0) , recip(a1) , recip(a2) , recip(a3) ]

__m128 _mm_rsqrt_ss(__m128 a);
单精度浮点低位平方根倒数近似值
result = [ recip(sqrt(a0)) , a1 , a2 , a3 ]

__m128 _mm_rsqrt_ps(__m128 a);
单精度浮点平方根倒数近似值
result = [ recip(sqrt(a0)) , recip(sqrt(a1)) , recip(sqrt(a2)) , recip(sqrt(a3)) ]

__m128 _mm_min_ss(__m128 a, __m128 b);
单精度浮点低位最小值
result = [ min(a0,b0) , a1 , a2 , a3 ]

__m128 _mm_min_ps(__m128 a, __m128 b);
单精度浮点最小值
result = [ min(a0,b0) , min(a1,b1) , min(a2,b2) , min(a3,b3) ]

__m128 _mm_max_ss(__m128 a, __m128 b);
单精度浮点低位最大值
result = [ max(a0,b0) , a1 , a2 , a3 ]

__m128 _mm_max_ps(__m128 a, __m128 b);
单精度浮点最大值
result = [ max(a0,b0) , max(a1,b1) , max(a2,b2) , max(a3,b3) ]

逻辑指令

__m128 _mm_and_ps(__m128 a, __m128 b);
位与
result = [ a&b ]

__m128 _mm_andnot_ps(__m128 a, __m128 b);
a非与 b
result = [ (~a0)&b0 , (~a1)&b1 , (~a2)&b2 , (~a3)&b3 ]

__m128 _mm_or_ps(__m128 a, __m128 b);
位或
result = [ a|b ]

__m128 _mm_xor_ps(__m128 a, __m128 b);
位异或
result = [ a^b ]

比较指令

指令	作用	类型	true
_mm_cmpeq_ss	Equal	float 低位	0xffffffff
_mm_cmpeq_ps	Equal	float	0xffffffff
_mm_cmplt_ss	Less Than	float 低位	0xffffffff
_mm_cmplt_ps	Less Than	float	0xffffffff
_mm_cmple_ss	Less Than or Equal	float 低位	0xffffffff
_mm_cmple_ps	Less Than or Equal	float	0xffffffff
_mm_cmpgt_ss	Greater Than	float 低位	0xffffffff
_mm_cmpgt_ps	Greater Than	float	0xffffffff
_mm_cmpge_ss	Greater Than or Equal	float 低位	0xffffffff
_mm_cmpge_ps	Greater Than or Equal	float	0xffffffff
_mm_cmpneq_ss	Not Equal	float 低位	0xffffffff
_mm_cmpneq_ps	Not Equal	float	0xffffffff
_mm_cmpnlt_ss	Not Less Than	float 低位	0xffffffff
_mm_cmpnlt_ps	Not Less Than	float	0xffffffff
_mm_cmpnle_ss	Not Less Than or Equal	float 低位	0xffffffff
_mm_cmpnle_ps	Not Less Than or Equal	float	0xffffffff
_mm_cmpngt_ss	Not Greater Than	float 低位	0xffffffff
_mm_cmpngt_ps	Not Greater Than	float	0xffffffff
_mm_cmpnge_ss	Not Greater Than or Equal	float 低位	0xffffffff
_mm_cmpnge_ps	Not Greater Than or Equal	float	0xffffffff
_mm_cmpord_ss	Ordered	float 低位	0xffffffff
_mm_cmpord_ps	Ordered	float	0xffffffff
_mm_cmpunord_ss	Unordered	float 低位	0xffffffff
_mm_cmpunord_ps	Unordered	float	0xffffffff
_mm_comieq_ss	Equal	float 低位	0x1
_mm_comilt_ss	Less Than	float 低位	0x1
_mm_comile_ss	Less Than or Equal	float 低位	0x1
_mm_comigt_ss	Greater Than	float 低位	0x1
_mm_comige_ss	Greater Than or Equal	float 低位	0x1

转换指令

int _mm_cvtss_si32(__m128 a);
返回低位有符号32bit,近似
result = [ (int)a0 ]

__int64 _mm_cvtss_si64(__m128 a);
返回低位有符号64bit,近似
result = [ (__int64)a0 ]

__m64 _mm_cvtps_pi32(__m128 a);
低两位单精度浮点转为两个有符号32bit整型,近似(按当前舍入模式舍入)
result = [ (int)a0 , (int)a1 ]

int _mm_cvttss_si32(__m128 a);
返回低位有符号32bit,截断
result = [ (int)a0 ]

__int64 _mm_cvttss_si64(__m128 a);
返回低位有符号64bit,截断
result = [ (__int64)a0 ]

__m64 _mm_cvttps_pi32(__m128 a);
低两位单精度浮点转为两个有符号32bit整型,截断
result = [ (int)a0 , (int) a1]

__m128 _mm_cvtsi32_ss(__m128 a, int b);
32bit 整型转化为低位单精度浮点
result = [ (float)b , a1 , a2 , a3 ]

__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
64bit 整型转化为低位单精度浮点
result = [ (float)b , a1 , a2 , a3 ]

__m128 _mm_cvtpi32_ps(__m128 a, __m64 b);
两个32bit整型转化为单精度浮点
result = [(float)b0 , (float)b1 , a2 , a3 ]

__m128 _mm_cvtpi16_ps(__m64 a);
四个有符号16bit整型值转为单精度浮点
result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpu16_ps(__m64 a);
四个无符号16bit整型值转为单精度浮点
result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpi8_ps(__m64 a);
四个有符号8bit整型值转为单精度浮点
result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpu8_ps(__m64 a);
四个无符号8bit整型值转为单精度浮点
result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b);
四位 32bit整型转化为单精度浮点
result = [ (float)a0 , (float)a1 , (float)b0 , (float)b1 ]

__m64 _mm_cvtps_pi16(__m128 a);
将四个单精度浮点转为四个有符号16bit整型
result = [ (short)a0 , (short)a1 , (short)a2 , (short)a3 ]

__m64 _mm_cvtps_pi8(__m128 a);
四个单精度浮点转为四个有符号8bit整型
result = [ (char)a0 , (char)a1 , (char)a2 , (char)a3 ]

float _mm_cvtss_f32(__m128 a);
返回第一个值

加载指令

__m128 _mm_loadh_pi(__m128 a, __m64 const *p);
从 p 加载两个单精度浮点到高两位,低两位取自 a
result = [ a0 , a1 , p0 , p1 ]

__m128 _mm_loadl_pi(__m128 a, __m64 const *p);
从 p 加载两个单精度浮点到低两位,高两位取自 a
result = [ p0 , p1 , a2 , a3 ]

__m128 _mm_load_ss(float * p);
加载单精度浮点到低位
result = [ p0 , 0.0 , 0.0 , 0.0 ]

__m128 _mm_load1_ps(float * p);
加载并复制
result = [ p0 , p0 , p0 , p0 ]

__m128 _mm_load_ps(float * p);
加载4个单精度浮点,必须16字节对齐
result = [ p0 , p1 , p2 , p3 ]

__m128 _mm_loadu_ps(float * p);
加载4个单精度浮点,不需要16字节对齐
result = [ p0 , p1 , p2 , p3 ]

__m128 _mm_loadr_ps(float * p);
加载4个单精度浮点翻转,必须16字节对齐
result = [ p3 , p2 , p1 , p0 ]

设置指令

__m128 _mm_set_ss(float w);
设置低位单精度浮点
result = [ w , 0.0 , 0.0 , 0.0 ]

__m128 _mm_set1_ps(float w);
设置并复制
result = [ w , w , w , w ]

__m128 _mm_set_ps(float z, float y, float x, float w);
设置4个单精度浮点
result = [ w , x , y , z ]

__m128 _mm_setr_ps(float z, float y, float x, float w);
设置4个单精度浮点翻转
result = [ z , y , x , w ]

__m128 _mm_setzero_ps(void);
清理4个单精度浮点
result = [ 0.0 , 0.0 , 0.0 , 0.0 ]

存储指令

void _mm_storeh_pi(__m64 *p, __m128 a);
存储高位单精度浮点
result = [ a2 , a3 ]

void _mm_storel_pi(__m64 *p, __m128 a);
低两位存入p
result = [ a0 , a1 ]

void _mm_store_ss(float * p, __m128 a);
存低位
result = [ a0 ]

void _mm_store1_ps(float * p, __m128 a);
存单精度浮点
result = [ a0 , a0 , a0 , a0 ]

void _mm_store_ps(float *p, __m128 a);
存4位单精度浮点,必须16字节对齐
result = [ a0 , a1 , a2 , a3 ]

void _mm_storeu_ps(float *p, __m128 a);
存4位单精度浮点,不需要16字节对齐
result = [ a0 , a1 , a2 , a3 ]

void _mm_storer_ps(float * p, __m128 a);
存4位单精度浮点翻转,必须16字节对齐
result = [ a3 , a2 , a1 , a0 ]

缓存支持

void _mm_prefetch(char const*a, int sel);
Loads one cache line of data from address a to a location “closer” to the processor. The value sel specifies the type of prefetch operation: the constants _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2, _MM_HINT_NTA, and _MM_HINT_ET0 should be used for systems based on IA-32 architecture, and correspond to the type of prefetch instruction.

void _mm_stream_pi(__m64 *p, __m64 a);
Stores the data in a to the address p without polluting the caches. This intrinsic requires you to empty the multimedia state for the MMX™ register. See the topic The EMMS Instruction: Why You Need It.

void _mm_stream_ps(float *p, __m128 a);
Stores the data in a to the address p without polluting the caches. The address must be 16-byte-aligned.

void _mm256_stream_ps(float *p, __m256 a);
Stores the data in a to the address p without polluting the caches. The address must be 32-byte (VEX.256 encoded version) aligned.

void _mm_sfence(void);
Guarantees that every preceding store is globally visible before any subsequent store.

整型指令

int _mm_extract_pi16(__m64 a, int imm);
提取 a 中由 imm 选择的 16bit 字,零扩展为 32bit 整型返回
result = [ (imm==0) ? a0 : ( (imm==1) ? a1 : ( (imm==2) ? a2 : a3 ) ) ]

__m64 _mm_insert_pi16(__m64 a, int d, int n);
将16bit d 插入a
result = [ (n==0) ? d : a0 , (n==1) ? d : a1 , (n==2) ? d : a2 , (n==3) ? d : a3 ]

__m64 _mm_max_pi16(__m64 a, __m64 b);
计算最大值
result = [ max(a0,b0) , max(a1,b1) , max(a2,b2) , max(a3,b3) ]

__m64 _mm_max_pu8(__m64 a, __m64 b);
计算无符号最大值
result = [ max(a0,b0) , max(a1,b1) , … , max(a7,b7) ]

__m64 _mm_min_pi16(__m64 a, __m64 b);
计算最小值
result = [ min(a0,b0) , min(a1,b1) , min(a2,b2) , min(a3,b3) ]

__m64 _mm_min_pu8(__m64 a, __m64 b);
计算无符号最小值
result = [ min(a0,b0) , min(a1,b1) , … , min(a7,b7) ]

int _mm_movemask_pi8(__m64 a);
由 a 的 8 个字节的符号位创建 8 bit 掩码
result = [ sign(a7)<<7 | sign(a6)<<6 | … | sign(a0) ]

__m64 _mm_mulhi_pu16(__m64 a, __m64 b);
无符号16bit相乘,返回32bit结果的高16bit
result = [ (a0 * b0)[31:16] , (a1 * b1)[31:16] , (a2 * b2)[31:16] , (a3 * b3)[31:16] ]

__m64 _mm_shuffle_pi16(__m64 a, int n);
按照 n 指示从 a 选择四个 16bit 字
result = [ word (n&0x3) of a , word ((n>>2)&0x3) of a , word ((n>>4)&0x3) of a , word ((n>>6)&0x3) of a ]

void _mm_maskmove_si64(__m64 d, __m64 n, char *p);
条件存储字节到p
result = [ if(sign(n0)) p[0] = d0 , if(sign(n1)) p[1] = d1 , … , if(sign(n7)) p[7] = d7 ]

__m64 _mm_avg_pu8(__m64 a, __m64 b);
计算无符号8bit均值(向上舍入)
result = [ (t + 1) >> 1 , where t =(unsigned char)a0 +(unsigned char)b0 , (t + 1) >> 1 , where t =(unsigned char)a1 +(unsigned char)b1 , … , (t + 1) >> 1 , where t =(unsigned char)a7 +(unsigned char)b7 ]

__m64 _mm_avg_pu16(__m64 a, __m64 b);
计算无符号16bit均值(向上舍入)
result = [ (t + 1) >> 1 , where t =(unsigned short)a0 +(unsigned short)b0 , (t + 1) >> 1 , where t =(unsigned short)a1 +(unsigned short)b1 , … , (t + 1) >> 1 , where t =(unsigned short)a3 +(unsigned short)b3 ]

__m64 _mm_sad_pu8(__m64 a, __m64 b);
绝对差累加
result = [ abs(a0-b0) +… +abs(a7-b7) , 0 , 0 , 0 ]

杂项

__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
按照 imm8 指示选择浮点:低两位从 a 选择,高两位从 b 选择
result = [ a[imm8&0x3] , a[(imm8>>2)&0x3] , b[(imm8>>4)&0x3] , b[(imm8>>6)&0x3] ]

__m128 _mm_unpackhi_ps(__m128 a, __m128 b);
交替高位
result = [ a2 , b2 , a3 , b3 ]

__m128 _mm_unpacklo_ps(__m128 a, __m128 b);
交替低位
result = [ a0 , b0 , a1 , b1 ]

__m128 _mm_move_ss( __m128 a, __m128 b);
用 b 的低位单精度浮点替换 a 的低位
result = [ b0 , a1 , a2 , a3 ]

__m128 _mm_movehl_ps(__m128 a, __m128 b);
将 b 的高两位移到结果的低两位,高两位取自 a
result = [ b2 , b3 , a2 , a3 ]

__m128 _mm_movelh_ps(__m128 a, __m128 b);
将 b 的低两位移到结果的高两位,低两位取自 a
result = [ a0 , a1 , b0 , b1 ]

int _mm_movemask_ps(__m128 a);
创建四位掩码
result = [ sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0) ]