阅读代码的时候遇到了__m128i 、_mm_set1_epi8 、_mm_loadu_si128 、_mm_max_epu8 、_mm_min_epu8 、_mm_store_si128 、_mm_unpackhi_epi8 、_mm_adds_epi16 、_mm_srli_si128 等SIMD指令集,所以想着作一个总结。
0. SIMD基础知识
SIMD是单指令多数据技术,目前Intel处理器支持的SIMD技术包括MMX、SSE以及AVX。
MMX是MultiMedia eXtensions(多媒体扩展)的缩写,是第六代CPU芯片的重要特点。它是继Intel386™处理器(将体系结构扩展至32位)之后对Intel体系结构最重要的加强,这些指令集能够加速有关图形、影像、声音等的应用。其中,MMX提供了8个64bit的寄存器进行SIMD操作。
SSE是"流式单指令多数据扩展(Streaming SIMD Extensions)"的缩写(Intel早期宣传时也曾称其为Internet SSE)。SSE除保持原有的MMX指令外,又新添加了70条指令,在加快浮点运算的同时,改善了内存的使用效率。其中,SSE系列提供了8个128bit的寄存器(XMM0~XMM7,64位模式下扩展为16个)进行SIMD操作。
AVX指令集是Sandy Bridge和Larrabee架构下的新指令集,在单指令多数据流计算性能增加的同时也沿用了MMX/SSE指令集,是在之前的128位扩展到256位的单指令多数据流,不过和MMX/SSE的不同点在于增强的AVX指令,从指令的格式上就发生了很大的变化。
1. 如何使用SIMD指令以及其相关头文件
使用SIMD指令有两种方式:一是直接在C/C++中嵌入(汇编)指令;二是使用Intel C++ Compiler或是Microsoft Visual C++提供的支持SIMD指令集的intrinsics内联函数。从代码可读和维护角度讲,推荐使用intrinsics内联函数的形式。intrinsics是对MMX、SSE等指令集的一种封装,以函数的形式提供,使得程序员更容易编写和使用这些高级指令,在编译的时候,这些函数会被内联为汇编,不会产生函数调用的开销。要想使用SIMD指令,则需要包含对应的头文件。
接下来介绍一下头文件之间的关系:
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <nmmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include <intrin.h>
mmintrin.h ∈ xmmintrin.h ∈ emmintrin.h ∈ pmmintrin.h ∈ tmmintrin.h ∈ smmintrin.h ∈ nmmintrin.h ∈ wmmintrin.h ∈ immintrin.h ∈ intrin.h
2. 变量类型
>>>__m64
mmintrin.h为MMX头文件,__m64的定义就来自这个头文件:
/*
 * 64-bit MMX operand type.
 *
 * Every member overlays the same storage, so one value can be viewed as a
 * single 64-bit integer, two floats or 32-bit integers, four 16-bit
 * integers, or eight 8-bit integers (signed or unsigned views).
 *
 * NOTE(review): _CRT_ALIGN(8) presumably expands to __declspec(align(8)),
 * i.e. 8-byte alignment -- confirm against the CRT headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
unsigned __int64 m64_u64;
float m64_f32[2];
__int8 m64_i8[8];
__int16 m64_i16[4];
__int32 m64_i32[2];
__int64 m64_i64;
unsigned __int8 m64_u8[8];
unsigned __int16 m64_u16[4];
unsigned __int32 m64_u32[2];
} __m64;
可以看到,__m64为一个共用体(union)类型,union的特点为:所有成员占用同一段内存,在不同的时间保存不同的数据类型和不同长度的变量。在union中,所有的共用体成员共用同一块内存空间,并且同一时间只能存储其中一个成员变量的值。__m64的大小为64位(由其成员的大小决定);其中,_CRT_ALIGN(8)即__declspec(align(8)),用于设置内存对齐方式,使__m64类型的变量按8字节边界对齐。__m64这种类型的变量可用作MMX指令的操作数,不建议直接访问其成员,编译器会自动将其按8字节边界对齐。
>>>__m128
xmmintrin.h为SSE头文件,__m128的定义就来自于这个头文件:
/*
 * 128-bit SSE operand type, primarily four packed floats (m128_f32).
 *
 * The remaining members overlay the same storage and expose it as 8-, 16-,
 * 32- or 64-bit integer lanes in signed and unsigned flavours.
 *
 * NOTE(review): _CRT_ALIGN(16) presumably requests 16-byte alignment --
 * confirm against the CRT headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
float m128_f32[4];
unsigned __int64 m128_u64[2];
__int8 m128_i8[16];
__int16 m128_i16[8];
__int32 m128_i32[4];
__int64 m128_i64[2];
unsigned __int8 m128_u8[16];
unsigned __int16 m128_u16[8];
unsigned __int32 m128_u32[4];
} __m128;
>>>__m128i 和 __m128d
emmintrin.h为SSE2头文件,其中__m128i和__m128d的定义就来自于这个头文件:
/*
 * 128-bit SSE2 integer operand type.
 *
 * All members overlay the same storage: sixteen 8-bit, eight 16-bit, four
 * 32-bit or two 64-bit integer lanes, each available as a signed and an
 * unsigned view.
 *
 * NOTE(review): _CRT_ALIGN(16) presumably requests 16-byte alignment --
 * confirm against the CRT headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
__int8 m128i_i8[16];
__int16 m128i_i16[8];
__int32 m128i_i32[4];
__int64 m128i_i64[2];
unsigned __int8 m128i_u8[16];
unsigned __int16 m128i_u16[8];
unsigned __int32 m128i_u32[4];
unsigned __int64 m128i_u64[2];
} __m128i;
/*
 * 128-bit SSE2 operand type holding two packed doubles.
 *
 * NOTE(review): _CRT_ALIGN(16) presumably requests 16-byte alignment --
 * confirm against the CRT headers.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
double m128d_f64[2];
} __m128d;
>>>__m256、__m256d 和 __m256i
immintrin.h为AVX头文件,其中__m256、__m256d和__m256i的定义就来自于这个头文件:
/*
 * 256-bit AVX operand type holding eight packed floats.
 *
 * NOTE(review): _CRT_ALIGN(32) presumably requests 32-byte alignment --
 * confirm against the CRT headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256 {
float m256_f32[8];
} __m256;
/*
 * 256-bit AVX operand type holding four packed doubles.
 *
 * The struct carries the tag __m256d, matching the other vector typedefs in
 * this listing, so the type can also be named as `struct __m256d`.
 *
 * NOTE(review): _CRT_ALIGN(32) presumably requests 32-byte alignment --
 * confirm against the CRT headers.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(32) __m256d {
    double m256d_f64[4];
} __m256d;
/*
 * 256-bit AVX integer operand type.
 *
 * All members overlay the same storage: thirty-two 8-bit, sixteen 16-bit,
 * eight 32-bit or four 64-bit integer lanes, each available as a signed and
 * an unsigned view.
 *
 * NOTE(review): _CRT_ALIGN(32) presumably requests 32-byte alignment --
 * confirm against the CRT headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256i {
__int8 m256i_i8[32];
__int16 m256i_i16[16];
__int32 m256i_i32[8];
__int64 m256i_i64[4];
unsigned __int8 m256i_u8[32];
unsigned __int16 m256i_u16[16];
unsigned __int32 m256i_u32[8];
unsigned __int64 m256i_u64[4];
} __m256i;
3. 函数接口
>>>mmintrin.h头文件中的函数接口
void _m_empty(void);
__m64 _m_from_int(int _I);
int _m_to_int(__m64 _M);
__m64 _m_packsswb(__m64 _MM1, __m64 _MM2);
__m64 _m_packssdw(__m64 _MM1, __m64 _MM2);
__m64 _m_packuswb(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2);
__m64 _m_punpcklbw(__m64 _MM1, __m64 _MM2);
__m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckldq(__m64 _MM1, __m64 _MM2);
__m64 _m_paddb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddw(__m64 _MM1, __m64 _MM2);
__m64 _m_paddd(__m64 _MM1, __m64 _MM2);
__m64 _m_paddsb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddsw(__m64 _MM1, __m64 _MM2);
__m64 _m_paddusb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddusw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubd(__m64 _MM1, __m64 _MM2);
__m64 _m_psubsb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubsw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubusb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubusw(__m64 _MM1, __m64 _MM2);
__m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2);
__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2);
__m64 _m_pmullw(__m64 _MM1, __m64 _MM2);
__m64 _m_psllw(__m64 _M, __m64 _Count);
__m64 _m_psllwi(__m64 _M, int _Count);
__m64 _m_pslld(__m64 _M, __m64 _Count);
__m64 _m_pslldi(__m64 _M, int _Count);
__m64 _m_psllq(__m64 _M, __m64 _Count);
__m64 _m_psllqi(__m64 _M, int _Count);
__m64 _m_psraw(__m64 _M, __m64 _Count);
__m64 _m_psrawi(__m64 _M, int _Count);
__m64 _m_psrad(__m64 _M, __m64 _Count);
__m64 _m_psradi(__m64 _M, int _Count);
__m64 _m_psrlw(__m64 _M, __m64 _Count);
__m64 _m_psrlwi(__m64 _M, int _Count);
__m64 _m_psrld(__m64 _M, __m64 _Count);
__m64 _m_psrldi(__m64 _M, int _Count);
__m64 _m_psrlq(__m64 _M, __m64 _Count);
__m64 _m_psrlqi(__m64 _M, int _Count);
__m64 _m_pand(__m64 _MM1, __m64 _MM2);
__m64 _m_pandn(__m64 _MM1, __m64 _MM2);
__m64 _m_por(__m64 _MM1, __m64 _MM2);
__m64 _m_pxor(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2);
__m64 _mm_setzero_si64(void);
__m64 _mm_set_pi32(int _I1, int _I0);
__m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0);
__m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4,
char _B3, char _B2, char _B1, char _B0);
__m64 _mm_set1_pi32(int _I);
__m64 _mm_set1_pi16(short _S);
__m64 _mm_set1_pi8(char _B);
__m64 _mm_setr_pi32(int _I1, int _I0);
__m64 _mm_setr_pi16(short _S3, short _S2, short _S1, short _S0);
__m64 _mm_setr_pi8(char _B7, char _B6, char _B5, char _B4,
char _B3, char _B2, char _B1, char _B0);
>>>xmmintrin.h头文件中的函数接口
extern __m128 _mm_add_ss(__m128 _A, __m128 _B);
extern __m128 _mm_add_ps(__m128 _A, __m128 _B);
extern __m128 _mm_sub_ss(__m128 _A, __m128 _B);
extern __m128 _mm_sub_ps(__m128 _A, __m128 _B);
extern __m128 _mm_mul_ss(__m128 _A, __m128 _B);
extern __m128 _mm_mul_ps(__m128 _A, __m128 _B);
extern __m128 _mm_div_ss(__m128 _A, __m128 _B);
extern __m128 _mm_div_ps(__m128 _A, __m128 _B);
extern __m128 _mm_sqrt_ss(__m128 _A);
extern __m128 _mm_sqrt_ps(__m128 _A);
extern __m128 _mm_rcp_ss(__m128 _A);
extern __m128 _mm_rcp_ps(__m128 _A);
extern __m128 _mm_rsqrt_ss(__m128 _A);
extern __m128 _mm_rsqrt_ps(__m128 _A);
extern __m128 _mm_min_ss(__m128 _A, __m128 _B);
extern __m128 _mm_min_ps(__m128 _A, __m128 _B);
extern __m128 _mm_max_ss(__m128 _A, __m128 _B);
extern __m128 _mm_max_ps(__m128 _A, __m128 _B);
extern __m128 _mm_and_ps(__m128 _A, __m128 _B);
extern __m128 _mm_andnot_ps(__m128 _A, __m128 _B);
extern __m128 _mm_or_ps(__m128 _A, __m128 _B);
extern __m128 _mm_xor_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpeq_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpeq_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmplt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmplt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmple_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmple_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpgt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpgt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpge_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpge_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpneq_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpneq_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnlt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnlt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnle_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnle_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpngt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpngt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnge_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnge_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpord_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpord_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpunord_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpunord_ps(__m128 _A, __m128 _B);
extern int _mm_comieq_ss(__m128 _A, __m128 _B);
extern int _mm_comilt_ss(__m128 _A, __m128 _B);
extern int _mm_comile_ss(__m128 _A, __m128 _B);
extern int _mm_comigt_ss(__m128 _A, __m128 _B);
extern int _mm_comige_ss(__m128 _A, __m128 _B);
extern int _mm_comineq_ss(__m128 _A, __m128 _B);
extern int _mm_ucomieq_ss(__m128 _A, __m128 _B);
extern int _mm_ucomilt_ss(__m128 _A, __m128 _B);
extern int _mm_ucomile_ss(__m128 _A, __m128 _B);
extern int _mm_ucomigt_ss(__m128 _A, __m128 _B);
extern int _mm_ucomige_ss(__m128 _A, __m128 _B);
extern int _mm_ucomineq_ss(__m128 _A, __m128 _B);
extern int _mm_cvt_ss2si(__m128 _A);
extern __m64 _mm_cvt_ps2pi(__m128 _A);
extern int _mm_cvtt_ss2si(__m128 _A);
extern __m64 _mm_cvtt_ps2pi(__m128 _A);
extern __m128 _mm_cvt_si2ss(__m128, int);
extern __m128 _mm_cvt_pi2ps(__m128, __m64);
extern float _mm_cvtss_f32(__m128 _A);
#if defined (_M_X64)
extern __int64 _mm_cvtss_si64(__m128 _A);
extern __int64 _mm_cvttss_si64(__m128 _A);
extern __m128 _mm_cvtsi64_ss(__m128 _A, __int64 _B);
#endif
extern __m128 _mm_shuffle_ps(__m128 _A, __m128 _B, unsigned int _Imm8);
extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B);
extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B);
extern __m128 _mm_loadh_pi(__m128, __m64 const*);
extern __m128 _mm_movehl_ps(__m128, __m128);
extern __m128 _mm_movelh_ps(__m128, __m128);
extern void _mm_storeh_pi(__m64 *, __m128);
extern __m128 _mm_loadl_pi(__m128, __m64 const*);
extern void _mm_storel_pi(__m64 *, __m128);
extern int _mm_movemask_ps(__m128 _A);
extern int _m_pextrw(__m64, int);
extern __m64 _m_pinsrw(__m64, int, int);
extern __m64 _m_pmaxsw(__m64, __m64);
extern __m64 _m_pmaxub(__m64, __m64);
extern __m64 _m_pminsw(__m64, __m64);
extern __m64 _m_pminub(__m64, __m64);
extern int _m_pmovmskb(__m64);
extern __m64 _m_pmulhuw(__m64, __m64);
extern __m64 _m_pshufw(__m64, int);
extern void _m_maskmovq(__m64, __m64, char *);
extern __m64 _m_pavgb(__m64, __m64);
extern __m64 _m_pavgw(__m64, __m64);
extern __m64 _m_psadbw(__m64, __m64);
extern __m128 _mm_set_ss(float _A);
extern __m128 _mm_set_ps1(float _A);
extern __m128 _mm_set_ps(float _A, float _B, float _C, float _D);
extern __m128 _mm_setr_ps(float _A, float _B, float _C, float _D);
extern __m128 _mm_setzero_ps(void);
extern __m128 _mm_load_ss(float const*_A);
extern __m128 _mm_load_ps1(float const*_A);
extern __m128 _mm_load_ps(float const*_A);
extern __m128 _mm_loadr_ps(float const*_A);
extern __m128 _mm_loadu_ps(float const*_A);
extern void _mm_store_ss(float *_V, __m128 _A);
extern void _mm_store_ps1(float *_V, __m128 _A);
extern void _mm_store_ps(float *_V, __m128 _A);
extern void _mm_storer_ps(float *_V, __m128 _A);
extern void _mm_storeu_ps(float *_V, __m128 _A);
extern void _mm_prefetch(char const*_A, int _Sel);
extern void _mm_stream_pi(__m64 *, __m64);
extern void _mm_stream_ps(float *, __m128);
extern __m128 _mm_move_ss(__m128 _A, __m128 _B);
extern void _mm_sfence(void);
extern unsigned int _mm_getcsr(void);
extern void _mm_setcsr(unsigned int);
/*
 * Converts the four signed 16-bit integers packed in _A into four
 * single-precision floats.
 */
__inline __m128 _mm_cvtpi16_ps(__m64 _A)
{
    /* "0 > lane" yields all-ones for negative lanes: exactly the upper
     * 16 bits needed to sign-extend each lane to 32 bits. */
    __m64 _Sign = _mm_cmpgt_pi16(_mm_setzero_si64(), _A);
    __m64 _Hi32 = _mm_unpackhi_pi16(_A, _Sign);
    __m64 _Lo32 = _mm_unpacklo_pi16(_A, _Sign);

    /* Convert the upper pair, copy it into the high half of the result,
     * then convert the lower pair into the low half. */
    __m128 _Upper = _mm_cvtpi32_ps(_mm_setzero_ps(), _Hi32);
    __m128 _Shifted = _mm_movelh_ps(_Upper, _Upper);
    return _mm_cvtpi32_ps(_Shifted, _Lo32);
}
/*
 * Converts the four unsigned 16-bit integers packed in _A into four
 * single-precision floats.
 */
__inline __m128 _mm_cvtpu16_ps(__m64 _A)
{
    /* Interleaving with zero zero-extends each 16-bit lane to 32 bits. */
    __m64 _Zero = _mm_setzero_si64();
    __m64 _Hi32 = _mm_unpackhi_pi16(_A, _Zero);
    __m64 _Lo32 = _mm_unpacklo_pi16(_A, _Zero);

    /* Convert the upper pair, copy it into the high half of the result,
     * then convert the lower pair into the low half. */
    __m128 _Upper = _mm_cvtpi32_ps(_mm_setzero_ps(), _Hi32);
    __m128 _Shifted = _mm_movelh_ps(_Upper, _Upper);
    return _mm_cvtpi32_ps(_Shifted, _Lo32);
}
/*
 * Converts the four floats in _A to 32-bit integers and packs them, with
 * signed saturation, into four 16-bit integers.
 */
__inline __m64 _mm_cvtps_pi16(__m128 _A)
{
    /* The low two floats are converted directly; the high two are first
     * moved down into the low half. */
    __m64 _LoInts = _mm_cvtps_pi32(_A);
    __m64 _HiInts = _mm_cvtps_pi32(_mm_movehl_ps(_A, _A));
    return _mm_packs_pi32(_LoInts, _HiInts);
}
/*
 * Converts the lower four signed 8-bit integers of _A into four
 * single-precision floats.
 */
__inline __m128 _mm_cvtpi8_ps(__m64 _A)
{
    /* All-ones for negative bytes; interleaving with it sign-extends the
     * low four bytes to 16 bits. */
    __m64 _Sign = _mm_cmpgt_pi8(_mm_setzero_si64(), _A);
    __m64 _Wide = _mm_unpacklo_pi8(_A, _Sign);
    return _mm_cvtpi16_ps(_Wide);
}
/*
 * Converts the lower four unsigned 8-bit integers of _A into four
 * single-precision floats.
 */
__inline __m128 _mm_cvtpu8_ps(__m64 _A)
{
    /* Interleaving with zero zero-extends the low four bytes to 16 bits. */
    __m64 _Zero = _mm_setzero_si64();
    __m64 _Wide = _mm_unpacklo_pi8(_A, _Zero);
    return _mm_cvtpu16_ps(_Wide);
}
/*
 * Converts the four floats in _A to 16-bit integers and packs them, with
 * signed saturation, into the lower four bytes of the result; the upper
 * four bytes come from packing a zero vector.
 */
__inline __m64 _mm_cvtps_pi8(__m128 _A)
{
    __m64 _Words = _mm_cvtps_pi16(_A);
    __m64 _Zero = _mm_setzero_si64();
    return _mm_packs_pi16(_Words, _Zero);
}
/*
 * Converts the two 32-bit integers in _A and the two in _B into four
 * single-precision floats: _A supplies the low pair, _B the high pair.
 */
__inline __m128 _mm_cvtpi32x2_ps(__m64 _A, __m64 _B)
{
    __m128 _LoPair = _mm_cvt_pi2ps(_mm_setzero_ps(), _A);
    __m128 _HiPair = _mm_cvt_pi2ps(_mm_setzero_ps(), _B);

    /* Copy the low half of _HiPair into the high half of _LoPair. */
    return _mm_movelh_ps(_LoPair, _HiPair);
}
>>>emmintrin.h头文件中的函数接口
extern __m128d _mm_add_sd(__m128d _A, __m128d _B);
extern __m128d _mm_add_pd(__m128d _A, __m128d _B);
extern __m128d _mm_sub_sd(__m128d _A, __m128d _B);
extern __m128d _mm_sub_pd(__m128d _A, __m128d _B);
extern __m128d _mm_mul_sd(__m128d _A, __m128d _B);
extern __m128d _mm_mul_pd(__m128d _A, __m128d _B);
extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_sqrt_pd(__m128d _A);
extern __m128d _mm_div_sd(__m128d _A, __m128d _B);
extern __m128d _mm_div_pd(__m128d _A, __m128d _B);
extern __m128d _mm_min_sd(__m128d _A, __m128d _B);
extern __m128d _mm_min_pd(__m128d _A, __m128d _B);
extern __m128d _mm_max_sd(__m128d _A, __m128d _B);
extern __m128d _mm_max_pd(__m128d _A, __m128d _B);
extern __m128d _mm_and_pd(__m128d _A, __m128d _B);
extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B);
extern __m128d _mm_or_pd(__m128d _A, __m128d _B);
extern __m128d _mm_xor_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B);
extern int _mm_comieq_sd(__m128d _A, __m128d _B);
extern int _mm_comilt_sd(__m128d _A, __m128d _B);
extern int _mm_comile_sd(__m128d _A, __m128d _B);
extern int _mm_comigt_sd(__m128d _A, __m128d _B);
extern int _mm_comige_sd(__m128d _A, __m128d _B);
extern int _mm_comineq_sd(__m128d _A, __m128d _B);
extern int _mm_ucomieq_sd(__m128d _A, __m128d _B);
extern int _mm_ucomilt_sd(__m128d _A, __m128d _B);
extern int _mm_ucomile_sd(__m128d _A, __m128d _B);
extern int _mm_ucomigt_sd(__m128d _A, __m128d _B);
extern int _mm_ucomige_sd(__m128d _A, __m128d _B);
extern int _mm_ucomineq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cvtepi32_pd(__m128i _A);
extern __m128i _mm_cvtpd_epi32(__m128d _A);
extern __m128i _mm_cvttpd_epi32(__m128d _A);
extern __m128 _mm_cvtepi32_ps(__m128i _A);
extern __m128i _mm_cvtps_epi32(__m128 _A);
extern __m128i _mm_cvttps_epi32(__m128 _A);
extern __m128 _mm_cvtpd_ps(__m128d _A);
extern __m128d _mm_cvtps_pd(__m128 _A);
extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B);
extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B);
extern int _mm_cvtsd_si32(__m128d _A);
extern int _mm_cvttsd_si32(__m128d _A);
extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B);
extern __m64 _mm_cvtpd_pi32(__m128d _A);
extern __m64 _mm_cvttpd_pi32(__m128d _A);
extern __m128d _mm_cvtpi32_pd(__m64 _A);
extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B);
extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B);
extern int _mm_movemask_pd(__m128d _A);
extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I);
extern __m128d _mm_load_pd(double const*_Dp);
extern __m128d _mm_load1_pd(double const*_Dp);
extern __m128d _mm_loadr_pd(double const*_Dp);
extern __m128d _mm_loadu_pd(double const*_Dp);
extern __m128d _mm_load_sd(double const*_Dp);
extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp);
extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp);
extern __m128d _mm_set_sd(double _W);
extern __m128d _mm_set1_pd(double _A);
extern __m128d _mm_set_pd(double _Z, double _Y);
extern __m128d _mm_setr_pd(double _Y, double _Z);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d _A, __m128d _B);
extern void _mm_store_sd(double *_Dp, __m128d _A);
extern void _mm_store1_pd(double *_Dp, __m128d _A);
extern void _mm_store_pd(double *_Dp, __m128d _A);
extern void _mm_storeu_pd(double *_Dp, __m128d _A);
extern void _mm_storer_pd(double *_Dp, __m128d _A);
extern void _mm_storeh_pd(double *_Dp, __m128d _A);
extern void _mm_storel_pd(double *_Dp, __m128d _A);
extern __m128i _mm_add_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_add_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_add_epi32(__m128i _A, __m128i _B);
extern __m64 _mm_add_si64(__m64 _A, __m64 _B);
extern __m128i _mm_add_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_max_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_max_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_min_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_min_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B);
extern __m64 _mm_mul_su32(__m64 _A, __m64 _B);
extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B);
extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B);
extern __m64 _mm_sub_si64(__m64 _A, __m64 _B);
extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_and_si128(__m128i _A, __m128i _B);
extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B);
extern __m128i _mm_or_si128(__m128i _A, __m128i _B);
extern __m128i _mm_xor_si128(__m128i _A, __m128i _B);
extern __m128i _mm_slli_si128(__m128i _A, int _Imm);
extern __m128i _mm_slli_epi16(__m128i _A, int _Count);
extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_slli_epi32(__m128i _A, int _Count);
extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_slli_epi64(__m128i _A, int _Count);
extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count);
extern __m128i _mm_srai_epi16(__m128i _A, int _Count);
extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_srai_epi32(__m128i _A, int _Count);
extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_si128(__m128i _A, int _Imm);
extern __m128i _mm_srli_epi16(__m128i _A, int _Count);
extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_epi32(__m128i _A, int _Count);
extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_epi64(__m128i _A, int _Count);
extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count);
extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cvtsi32_si128(int _A);
extern int _mm_cvtsi128_si32(__m128i _A);
extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B);
extern int _mm_extract_epi16(__m128i _A, int _Imm);
extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm);
extern int _mm_movemask_epi8(__m128i _A);
extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm);
extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm);
extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm);
extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_load_si128(__m128i const*_P);
extern __m128i _mm_loadu_si128(__m128i const*_P);
extern __m128i _mm_loadl_epi64(__m128i const*_P);
extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0);
extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0);
extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4,
short _W3, short _W2, short _W1, short _W0);
extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12,
char _B11, char _B10, char _B9, char _B8,
char _B7, char _B6, char _B5, char _B4,
char _B3, char _B2, char _B1, char _B0);
extern __m128i _mm_set1_epi64(__m64 _Q);
extern __m128i _mm_set1_epi32(int _I);
extern __m128i _mm_set1_epi16(short _W);
extern __m128i _mm_set1_epi8(char _B);
extern __m128i _mm_setl_epi64(__m128i _Q);
extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1);
extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3);
extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3,
short _W4, short _W5, short _W6, short _W7);
extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12,
char _B11, char _B10, char _B9, char _B8,
char _B7, char _B6, char _B5, char _B4,
char _B3, char _B2, char _B1, char _B0);
extern __m128i _mm_setzero_si128(void);
extern void _mm_store_si128(__m128i *_P, __m128i _B);
extern void _mm_storeu_si128(__m128i *_P, __m128i _B);
extern void _mm_storel_epi64(__m128i *_P, __m128i _Q);
extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P);
extern __m128i _mm_move_epi64(__m128i _Q);
extern __m128i _mm_movpi64_epi64(__m64 _Q);
extern __m64 _mm_movepi64_pi64(__m128i _Q);
extern void _mm_stream_pd(double *_Dp, __m128d _A);
extern void _mm_stream_si128(__m128i *_P, __m128i _A);
extern void _mm_clflush(void const*_P);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern void _mm_stream_si32(int *_P, int _I);
extern void _mm_pause(void);
extern double _mm_cvtsd_f64(__m128d _A);
extern __m128 _mm_castpd_ps(__m128d);
extern __m128i _mm_castpd_si128(__m128d);
extern __m128d _mm_castps_pd(__m128);
extern __m128i _mm_castps_si128(__m128);
extern __m128 _mm_castsi128_ps(__m128i);
extern __m128d _mm_castsi128_pd(__m128i);
/* 64-bit integer conversions; declared only when _M_X64 is defined. */
#if defined (_M_X64)
extern __int64 _mm_cvtsd_si64(__m128d);
extern __int64 _mm_cvttsd_si64(__m128d);
extern __m128d _mm_cvtsi64_sd(__m128d, __int64);
extern __m128i _mm_cvtsi64_si128(__int64);
extern __int64 _mm_cvtsi128_si64(__m128i);
#endif /* _M_X64 */
>>>pmmintrin.h头文件中的函数接口
extern __m128 _mm_addsub_ps(__m128 , __m128 );
extern __m128 _mm_hadd_ps(__m128 , __m128 );
extern __m128 _mm_hsub_ps(__m128 , __m128 );
extern __m128 _mm_movehdup_ps(__m128 );
extern __m128 _mm_moveldup_ps(__m128 );
extern __m128d _mm_addsub_pd(__m128d , __m128d );
extern __m128d _mm_hadd_pd(__m128d , __m128d );
extern __m128d _mm_hsub_pd(__m128d , __m128d );
extern __m128d _mm_loaddup_pd(double const * );
extern __m128d _mm_movedup_pd(__m128d );
extern __m128i _mm_lddqu_si128(__m128i const * );
extern void _mm_monitor(void const * , unsigned , unsigned );
extern void _mm_mwait(unsigned , unsigned );
>>>tmmintrin.h头文件中的函数接口
extern __m128i _mm_hadd_epi16 (__m128i, __m128i);
extern __m128i _mm_hadd_epi32 (__m128i, __m128i);
extern __m128i _mm_hadds_epi16 (__m128i, __m128i);
extern __m64 _mm_hadd_pi16 (__m64, __m64);
extern __m64 _mm_hadd_pi32 (__m64, __m64);
extern __m64 _mm_hadds_pi16 (__m64, __m64);
extern __m128i _mm_hsub_epi16 (__m128i, __m128i);
extern __m128i _mm_hsub_epi32 (__m128i, __m128i);
extern __m128i _mm_hsubs_epi16 (__m128i, __m128i);
extern __m64 _mm_hsub_pi16 (__m64, __m64);
extern __m64 _mm_hsub_pi32 (__m64, __m64);
extern __m64 _mm_hsubs_pi16 (__m64, __m64);
extern __m128i _mm_maddubs_epi16 (__m128i, __m128i);
extern __m64 _mm_maddubs_pi16 (__m64, __m64);
extern __m128i _mm_mulhrs_epi16 (__m128i, __m128i);
extern __m64 _mm_mulhrs_pi16 (__m64, __m64);
extern __m128i _mm_shuffle_epi8 (__m128i, __m128i);
extern __m64 _mm_shuffle_pi8 (__m64, __m64);
extern __m128i _mm_sign_epi8 (__m128i, __m128i);
extern __m128i _mm_sign_epi16 (__m128i, __m128i);
extern __m128i _mm_sign_epi32 (__m128i, __m128i);
extern __m64 _mm_sign_pi8 (__m64, __m64);
extern __m64 _mm_sign_pi16 (__m64, __m64);
extern __m64 _mm_sign_pi32 (__m64, __m64);
extern __m128i _mm_alignr_epi8 (__m128i, __m128i, int);
extern __m64 _mm_alignr_pi8 (__m64, __m64, int);
extern __m128i _mm_abs_epi8 (__m128i);
extern __m128i _mm_abs_epi16 (__m128i);
extern __m128i _mm_abs_epi32 (__m128i);
extern __m64 _mm_abs_pi8 (__m64);
extern __m64 _mm_abs_pi16 (__m64);
extern __m64 _mm_abs_pi32 (__m64);
>>>smmintrin.h头文件中的函数接口
extern __m128i _mm_blend_epi16 (__m128i, __m128i, const int );
extern __m128i _mm_blendv_epi8 (__m128i, __m128i, __m128i mask);
extern __m128 _mm_blend_ps (__m128, __m128, const int );
extern __m128 _mm_blendv_ps(__m128, __m128, __m128 );
extern __m128d _mm_blend_pd (__m128d, __m128d, const int );
extern __m128d _mm_blendv_pd(__m128d, __m128d, __m128d );
extern __m128 _mm_dp_ps(__m128, __m128, const int );
extern __m128d _mm_dp_pd(__m128d, __m128d, const int );
extern __m128i _mm_cmpeq_epi64(__m128i, __m128i);
extern __m128i _mm_min_epi8 (__m128i, __m128i);
extern __m128i _mm_max_epi8 (__m128i, __m128i);
extern __m128i _mm_min_epu16(__m128i, __m128i);
extern __m128i _mm_max_epu16(__m128i, __m128i);
extern __m128i _mm_min_epi32(__m128i, __m128i);
extern __m128i _mm_max_epi32(__m128i, __m128i);
extern __m128i _mm_min_epu32(__m128i, __m128i);
extern __m128i _mm_max_epu32(__m128i, __m128i);
extern __m128i _mm_mullo_epi32(__m128i, __m128i);
extern __m128i _mm_mul_epi32(__m128i, __m128i);
extern int _mm_testz_si128(__m128i , __m128i );
extern int _mm_testc_si128(__m128i , __m128i );
extern int _mm_testnzc_si128(__m128i , __m128i );
extern __m128 _mm_insert_ps(__m128 , __m128 , const int );
/*
 * Builds an integer control value from three fields: srcField shifted left
 * by 6, dstField shifted left by 4, and zeroMask in the low bits.
 * _MM_PICK_OUT_PS passes the result as the last argument of _mm_insert_ps.
 */
#define _MM_MK_INSERTPS_NDX(srcField, dstField, zeroMask) \
(((srcField)<<6) | ((dstField)<<4) | (zeroMask))
extern int _mm_extract_ps(__m128 , const int );
/*
 * Stores the int returned by _mm_extract_ps(src, ndx) into dest by writing
 * through an int* view of dest, so the bit pattern is copied unchanged
 * rather than converted numerically.
 * NOTE(review): dest must be an lvalue; the pointer cast is a type pun, so
 * confirm the compiler in use tolerates this aliasing.
 */
#define _MM_EXTRACT_FLOAT(dest, src, ndx) \
*((int*)&(dest)) = _mm_extract_ps((src), (ndx))
/*
 * Expands to _mm_insert_ps on a zero vector, taking field `num` of src as
 * the source, field 0 as the destination, and 0x0e as the zero mask.
 */
#define _MM_PICK_OUT_PS(src, num) \
_mm_insert_ps(_mm_setzero_ps(), (src), \
_MM_MK_INSERTPS_NDX((num), 0, 0x0e))
extern __m128i _mm_insert_epi8 (__m128i , int , const int );
extern __m128i _mm_insert_epi32(__m128i , int , const int );
#if defined (_M_X64)
extern __m128i _mm_insert_epi64(__m128i , __int64 , const int );
#endif
extern int _mm_extract_epi8 (__m128i , const int );
extern int _mm_extract_epi32(__m128i , const int );
#if defined (_M_X64)
extern __int64 _mm_extract_epi64(__m128i , const int );
#endif
extern __m128i _mm_minpos_epu16(__m128i);
extern __m128d _mm_round_pd(__m128d , int );
extern __m128d _mm_round_sd(__m128d , __m128d , int );
extern __m128 _mm_round_ps(__m128 , int );
extern __m128 _mm_round_ss(__m128 , __m128 , int );
extern __m128i _mm_cvtepi8_epi32 (__m128i);
extern __m128i _mm_cvtepi16_epi32(__m128i);
extern __m128i _mm_cvtepi8_epi64 (__m128i);
extern __m128i _mm_cvtepi32_epi64(__m128i);
extern __m128i _mm_cvtepi16_epi64(__m128i);
extern __m128i _mm_cvtepi8_epi16 (__m128i);
extern __m128i _mm_cvtepu8_epi32 (__m128i);
extern __m128i _mm_cvtepu16_epi32(__m128i);
extern __m128i _mm_cvtepu8_epi64 (__m128i);
extern __m128i _mm_cvtepu32_epi64(__m128i);
extern __m128i _mm_cvtepu16_epi64(__m128i);
extern __m128i _mm_cvtepu8_epi16 (__m128i);
extern __m128i _mm_packus_epi32(__m128i, __m128i);
extern __m128i _mm_mpsadbw_epu8(__m128i , __m128i , const int );
extern __m128i _mm_stream_load_si128(__m128i*);
>>>nmmintrin.h头文件中的函数接口
extern __m128i _mm_cmpistrm (__m128i , __m128i , const int );
extern int _mm_cmpistri (__m128i , __m128i , const int );
extern __m128i _mm_cmpestrm (__m128i , int , __m128i , int , const int );
extern int _mm_cmpestri (__m128i , int , __m128i , int , const int );
extern int _mm_cmpistrz (__m128i , __m128i , const int );
extern int _mm_cmpistrc (__m128i , __m128i , const int );
extern int _mm_cmpistrs (__m128i , __m128i , const int );
extern int _mm_cmpistro (__m128i , __m128i , const int );
extern int _mm_cmpistra (__m128i , __m128i , const int );
extern int _mm_cmpestrz (__m128i , int , __m128i , int , const int );
extern int _mm_cmpestrc (__m128i , int , __m128i , int , const int );
extern int _mm_cmpestrs (__m128i , int , __m128i , int , const int );
extern int _mm_cmpestro (__m128i , int , __m128i , int , const int );
extern int _mm_cmpestra (__m128i , int , __m128i , int , const int );
extern __m128i _mm_cmpgt_epi64(__m128i , __m128i );
extern int _mm_popcnt_u32(unsigned int );
#if defined (_M_X64)
extern __int64 _mm_popcnt_u64(unsigned __int64 );
#endif
extern unsigned int _mm_crc32_u8 (unsigned int , unsigned char );
extern unsigned int _mm_crc32_u16(unsigned int , unsigned short );
extern unsigned int _mm_crc32_u32(unsigned int , unsigned int );
/* 64-bit CRC32 accumulate; declared only when _M_X64 is defined. */
#if defined (_M_X64)
extern unsigned __int64 _mm_crc32_u64(unsigned __int64 , unsigned __int64 );
#endif /* _M_X64 */
4.实际使用
由于上面头文件中的函数接口比较多,所以仅需要大致了解一下,在用到的时候则需要知道具体函数怎么调用以及功能。这里把最上面说的自己阅读代码时遇到的函数接口做一个简单介绍:
__m128i _mm_set1_epi8(char b);
__m128i _mm_loadu_si128(__m128i const *p);
void _mm_store_si128(__m128i *p, __m128i a);
__m128i _mm_unpacklo_epi8(__m128i S0, __m128i S1);
__m128i _mm_add_epi16(__m128i a, __m128i b);
__m128i _mm_srli_si128(__m128i a, int imm8);
|