IT数码 购物 网址 头条 软件 日历 阅读 图书馆
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
图片批量下载器
↓批量下载图片,美女图库↓
图片自动播放器
↓图片自动播放器↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁
 
   -> C++知识库 -> SIMD指令集分析(C/C++) -> 正文阅读

[C++知识库]SIMD指令集分析(C/C++)

阅读代码的时候遇到了__m128i_mm_set1_epi8_mm_loadu_si128_mm_max_epu8_mm_min_epu8_mm_store_si128_mm_unpackhi_epi8_mm_adds_epi16_mm_srli_si128等SIMD指令集,所以想着作一个总结。


0. SIMD基础知识

SIMD是单指令多数据技术,目前Intel处理器支持的SIMD技术包括MMX、SSE以及AVX。

MMX是MultiMedia eXtensions(多媒体扩展)的缩写,是第六代CPU芯片的重要特点。它是继Intel386?处理器(将体系结构扩展至32位)之后对Intel体系结构最重要的加强,这些指令集能够加速有关图形、影像、声音等的应用。其中,MMX提供了8个64bit的寄存器进行SIMD操作。

SSE是"因特尔数据流单指令序列扩展(Internet Streaming SIMD Extensions)"的缩写。SSE除保持原有的MMX指令外,又新添加了70条指令,在加快浮点运算的同时,改善了内存的使用效率,使内存速度更快。其中,SSE系列提供了8个128bit的寄存器进行SIMD操作。

AVX指令集是Sandy Bridge和Larrabee架构下的新指令集,在单指令多数据流计算性能增加的同时也沿用了MMX/SSE指令集,是在之前的128位扩展到256位的单指令多数据流,不过和MMX/SSE的不同点在于增强的AVX指令,从指令的格式上就发生了很大的变化。

1. 如何使用SIMD指令以及其相关头文件

使用SIMD指令有两种方式:一是直接在C/C++中嵌入(汇编)指令;而是使用Intel C++ Compiler或是Microsoft Visual C++提供的支持SIMD指令集的intrinsics内联函数。从代码可读和维护角度讲,推荐使用intrinsics内联函数的形式。intrinsics是对MMX、SSE等指令集的一种封装,以函数的形式提供,使得程序员更容易编写和使用这些高级指令,在编译的时候,这些函数会被内联为汇编,不会产生函数调用的开销。要想使用SIMD指令,则需要包含对应的头文件。

接下来介绍一下头文件之间的关系:

#include <mmintrin.h>  //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h>    //(include immintrin.h)

mmintrin.h ∈ xmmintrin.h ∈ emmintrin.h ∈ pmmintrin.h ∈ tmmintrin.h ∈ smmintrin.h ∈ nmmintrin.h ∈ wmmintrin.h ∈ immintrin.h ∈ intrin.h

2. 变量类型

>>>__m64

mmintrin.h为MMX头文件,__m64的定义就来自这个头文件:

typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;
    float               m64_f32[2];
    __int8              m64_i8[8];
    __int16             m64_i16[4];
    __int32             m64_i32[2];    
    __int64             m64_i64;
    unsigned __int8     m64_u8[8];
    unsigned __int16    m64_u16[4];
    unsigned __int32    m64_u32[2];
} __m64;

可以看到,__m64为一个共用体(union)类型,union的特点为:所有成员占用同一段内存,在不同的时间保存不同的数据类型和不同长度的变量。在union中,所有的共用体成员公用一个空间,并且同一时间只能存储其中一个成员变量的值。__m64的大小为64位,其中,__declspec(align(8))是设置内存对齐方式(8字节对齐),来保证__m64的大小为64位。__m64这种类型的变量可用作MMX指令的操作数,它不能直接被访问,被自动分配为8个字节的字长。

>>>__m128

xmmintrin.h为SSE头文件,__m128的定义就来自于这个头文件:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;

>>>__m128i 和 __m128d

emmintrin.h为SSE2头文件,其中__m128i和__m128d的定义就来自于这个头文件:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];    
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;
 
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
    double              m128d_f64[2];
} __m128d;

>>>__m256、__m256d 和 __m256i

immintrin.h为AVX头文件,其中__m256、__m256d和__m256i的定义就来自于这个头文件:

typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256 { 
    float m256_f32[8];
} __m256;
 
typedef struct __declspec(intrin_type) _CRT_ALIGN(32) {
    double m256d_f64[4]; 
} __m256d; 
 
typedef union  __declspec(intrin_type) _CRT_ALIGN(32) __m256i {
    __int8              m256i_i8[32];
    __int16             m256i_i16[16];
    __int32             m256i_i32[8];
    __int64             m256i_i64[4];
    unsigned __int8     m256i_u8[32];
    unsigned __int16    m256i_u16[16];
    unsigned __int32    m256i_u32[8];
    unsigned __int64    m256i_u64[4];
} __m256i;

3. 函数接口

>>>mmintrin.h头文件中的函数接口

/* General support intrinsics */
void  _m_empty(void);
__m64 _m_from_int(int _I);
int   _m_to_int(__m64 _M);
__m64 _m_packsswb(__m64 _MM1, __m64 _MM2);
__m64 _m_packssdw(__m64 _MM1, __m64 _MM2);
__m64 _m_packuswb(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2);
__m64 _m_punpcklbw(__m64 _MM1, __m64 _MM2);
__m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckldq(__m64 _MM1, __m64 _MM2);

/* Packed arithmetic intrinsics */
__m64 _m_paddb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddw(__m64 _MM1, __m64 _MM2);
__m64 _m_paddd(__m64 _MM1, __m64 _MM2);
__m64 _m_paddsb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddsw(__m64 _MM1, __m64 _MM2);
__m64 _m_paddusb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddusw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubd(__m64 _MM1, __m64 _MM2);
__m64 _m_psubsb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubsw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubusb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubusw(__m64 _MM1, __m64 _MM2);
__m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2);
__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2);
__m64 _m_pmullw(__m64 _MM1, __m64 _MM2);

/* Shift intrinsics */
__m64 _m_psllw(__m64 _M, __m64 _Count);
__m64 _m_psllwi(__m64 _M, int _Count);
__m64 _m_pslld(__m64 _M, __m64 _Count);
__m64 _m_pslldi(__m64 _M, int _Count);
__m64 _m_psllq(__m64 _M, __m64 _Count);
__m64 _m_psllqi(__m64 _M, int _Count);
__m64 _m_psraw(__m64 _M, __m64 _Count);
__m64 _m_psrawi(__m64 _M, int _Count);
__m64 _m_psrad(__m64 _M, __m64 _Count);
__m64 _m_psradi(__m64 _M, int _Count);
__m64 _m_psrlw(__m64 _M, __m64 _Count);
__m64 _m_psrlwi(__m64 _M, int _Count);
__m64 _m_psrld(__m64 _M, __m64 _Count);
__m64 _m_psrldi(__m64 _M, int _Count);
__m64 _m_psrlq(__m64 _M, __m64 _Count);
__m64 _m_psrlqi(__m64 _M, int _Count);

/* Logical intrinsics */
__m64 _m_pand(__m64 _MM1, __m64 _MM2);
__m64 _m_pandn(__m64 _MM1, __m64 _MM2);
__m64 _m_por(__m64 _MM1, __m64 _MM2);
__m64 _m_pxor(__m64 _MM1, __m64 _MM2);

/* Comparison intrinsics */
__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2);

/* Utility intrinsics */
__m64 _mm_setzero_si64(void);
__m64 _mm_set_pi32(int _I1, int _I0);
__m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0);
__m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4,
                  char _B3, char _B2, char _B1, char _B0);
__m64 _mm_set1_pi32(int _I);
__m64 _mm_set1_pi16(short _S);
__m64 _mm_set1_pi8(char _B);
__m64 _mm_setr_pi32(int _I1, int _I0);
__m64 _mm_setr_pi16(short _S3, short _S2, short _S1, short _S0);
__m64 _mm_setr_pi8(char _B7, char _B6, char _B5, char _B4,
                   char _B3, char _B2, char _B1, char _B0);

>>>xmmintrin.h头文件中的函数接口

/*
 * FP, arithmetic
 */

extern __m128 _mm_add_ss(__m128 _A, __m128 _B);
extern __m128 _mm_add_ps(__m128 _A, __m128 _B);
extern __m128 _mm_sub_ss(__m128 _A, __m128 _B);
extern __m128 _mm_sub_ps(__m128 _A, __m128 _B);
extern __m128 _mm_mul_ss(__m128 _A, __m128 _B);
extern __m128 _mm_mul_ps(__m128 _A, __m128 _B);
extern __m128 _mm_div_ss(__m128 _A, __m128 _B);
extern __m128 _mm_div_ps(__m128 _A, __m128 _B);
extern __m128 _mm_sqrt_ss(__m128 _A);
extern __m128 _mm_sqrt_ps(__m128 _A);
extern __m128 _mm_rcp_ss(__m128 _A);
extern __m128 _mm_rcp_ps(__m128 _A);
extern __m128 _mm_rsqrt_ss(__m128 _A);
extern __m128 _mm_rsqrt_ps(__m128 _A);
extern __m128 _mm_min_ss(__m128 _A, __m128 _B);
extern __m128 _mm_min_ps(__m128 _A, __m128 _B);
extern __m128 _mm_max_ss(__m128 _A, __m128 _B);
extern __m128 _mm_max_ps(__m128 _A, __m128 _B);

/*
 * FP, logical
 */

extern __m128 _mm_and_ps(__m128 _A, __m128 _B);
extern __m128 _mm_andnot_ps(__m128 _A, __m128 _B);
extern __m128 _mm_or_ps(__m128 _A, __m128 _B);
extern __m128 _mm_xor_ps(__m128 _A, __m128 _B);

/*
 * FP, comparison
 */

extern __m128 _mm_cmpeq_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpeq_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmplt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmplt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmple_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmple_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpgt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpgt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpge_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpge_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpneq_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpneq_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnlt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnlt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnle_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnle_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpngt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpngt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnge_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnge_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpord_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpord_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpunord_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpunord_ps(__m128 _A, __m128 _B);
extern int _mm_comieq_ss(__m128 _A, __m128 _B);
extern int _mm_comilt_ss(__m128 _A, __m128 _B);
extern int _mm_comile_ss(__m128 _A, __m128 _B);
extern int _mm_comigt_ss(__m128 _A, __m128 _B);
extern int _mm_comige_ss(__m128 _A, __m128 _B);
extern int _mm_comineq_ss(__m128 _A, __m128 _B);
extern int _mm_ucomieq_ss(__m128 _A, __m128 _B);
extern int _mm_ucomilt_ss(__m128 _A, __m128 _B);
extern int _mm_ucomile_ss(__m128 _A, __m128 _B);
extern int _mm_ucomigt_ss(__m128 _A, __m128 _B);
extern int _mm_ucomige_ss(__m128 _A, __m128 _B);
extern int _mm_ucomineq_ss(__m128 _A, __m128 _B);

/*
 * FP, conversions
 */

extern int _mm_cvt_ss2si(__m128 _A);
extern __m64 _mm_cvt_ps2pi(__m128 _A);
extern int _mm_cvtt_ss2si(__m128 _A);
extern __m64 _mm_cvtt_ps2pi(__m128 _A);
extern __m128 _mm_cvt_si2ss(__m128, int);
extern __m128 _mm_cvt_pi2ps(__m128, __m64);
extern float _mm_cvtss_f32(__m128 _A);

/*
 * Support for 64-bit extension intrinsics
 */
#if defined (_M_X64)
extern __int64 _mm_cvtss_si64(__m128 _A);
extern __int64 _mm_cvttss_si64(__m128 _A);
extern __m128  _mm_cvtsi64_ss(__m128 _A, __int64 _B);
#endif  /* defined (_M_X64) */

/*
 * FP, misc
 */

extern __m128 _mm_shuffle_ps(__m128 _A, __m128 _B, unsigned int _Imm8);
extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B);
extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B);
extern __m128 _mm_loadh_pi(__m128, __m64 const*);
extern __m128 _mm_movehl_ps(__m128, __m128);
extern __m128 _mm_movelh_ps(__m128, __m128);
extern void _mm_storeh_pi(__m64 *, __m128);
extern __m128 _mm_loadl_pi(__m128, __m64 const*);
extern void _mm_storel_pi(__m64 *, __m128);
extern int _mm_movemask_ps(__m128 _A);


/*
 * Integer extensions
 */
extern int _m_pextrw(__m64, int);
extern __m64 _m_pinsrw(__m64, int, int);
extern __m64 _m_pmaxsw(__m64, __m64);
extern __m64 _m_pmaxub(__m64, __m64);
extern __m64 _m_pminsw(__m64, __m64);
extern __m64 _m_pminub(__m64, __m64);
extern int _m_pmovmskb(__m64);
extern __m64 _m_pmulhuw(__m64, __m64);
extern __m64 _m_pshufw(__m64, int);
extern void _m_maskmovq(__m64, __m64, char *);
extern __m64 _m_pavgb(__m64, __m64);
extern __m64 _m_pavgw(__m64, __m64);
extern __m64 _m_psadbw(__m64, __m64);

/*
 * memory & initialization
 */

extern __m128 _mm_set_ss(float _A);
extern __m128 _mm_set_ps1(float _A);
extern __m128 _mm_set_ps(float _A, float _B, float _C, float _D);
extern __m128 _mm_setr_ps(float _A, float _B, float _C, float _D);
extern __m128 _mm_setzero_ps(void);
extern __m128 _mm_load_ss(float const*_A);
extern __m128 _mm_load_ps1(float const*_A);
extern __m128 _mm_load_ps(float const*_A);
extern __m128 _mm_loadr_ps(float const*_A);
extern __m128 _mm_loadu_ps(float const*_A);
extern void _mm_store_ss(float *_V, __m128 _A);
extern void _mm_store_ps1(float *_V, __m128 _A);
extern void _mm_store_ps(float *_V, __m128 _A);
extern void _mm_storer_ps(float *_V, __m128 _A);
extern void _mm_storeu_ps(float *_V, __m128 _A);
extern void _mm_prefetch(char const*_A, int _Sel);
extern void _mm_stream_pi(__m64 *, __m64);
extern void _mm_stream_ps(float *, __m128);
extern __m128 _mm_move_ss(__m128 _A, __m128 _B);

extern void _mm_sfence(void);
extern unsigned int _mm_getcsr(void);
extern void _mm_setcsr(unsigned int);


 /******************************************************/
 /* UTILITY INTRINSICS FUNCTION DEFINITIONS START HERE */
 /******************************************************/

 /*********************************************************/
 /*  NAME : _mm_cvtpi16_ps                                */
 /*  DESCRIPTION : Convert 4 16-bit signed integer values */
 /*                to 4 single-precision float values     */
 /*  IN : __m64 _A                                         */
 /*  OUT : none                                           */
 /*  RETURN : __m128 : (float)_A                           */
 /*********************************************************/
__inline __m128 _mm_cvtpi16_ps(__m64 _A)
{
  __m128 _Tmp;
  __m64  _Ext_val = _mm_cmpgt_pi16(_mm_setzero_si64(), _A);

  _Tmp = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(_A, _Ext_val));
  return(_mm_cvtpi32_ps(_mm_movelh_ps(_Tmp, _Tmp),
                        _mm_unpacklo_pi16(_A, _Ext_val)));
}


 /***********************************************************/
 /*  NAME : _mm_cvtpu16_ps                                  */
 /*  DESCRIPTION : Convert 4 16-bit unsigned integer values */
 /*                to 4 single-precision float values       */
 /*  IN : __m64 _A                                           */
 /*  OUT : none                                             */
 /*  RETURN : __m128 : (float)_A                             */
 /***********************************************************/
__inline __m128 _mm_cvtpu16_ps(__m64 _A)
{
  __m128 _Tmp;
  __m64  _Ext_val = _mm_setzero_si64();

  _Tmp = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(_A, _Ext_val));
  return(_mm_cvtpi32_ps(_mm_movelh_ps(_Tmp, _Tmp),
                        _mm_unpacklo_pi16(_A, _Ext_val)));
}


 /******************************************************/
 /*  NAME : _mm_cvtps_pi16                             */
 /*  DESCRIPTION : Convert 4 single-precision float    */
 /*                values to 4 16-bit integer values   */
 /*  IN : __m128 a                                     */
 /*  OUT : none                                        */
 /*  RETURN : __m64 : (short)a                         */
 /******************************************************/
__inline __m64 _mm_cvtps_pi16(__m128 _A)
{
  return _mm_packs_pi32(_mm_cvtps_pi32(_A),
                        _mm_cvtps_pi32(_mm_movehl_ps(_A, _A)));
}


 /******************************************************/
 /*  NAME : _mm_cvtpi8_ps                              */
 /*  DESCRIPTION : Convert 4 8-bit integer values to 4 */
 /*                single-precision float values       */
 /*  IN : __m64 _A                                     */
 /*  OUT : none                                        */
 /*  RETURN : __m128 : (float)_A                        */
 /******************************************************/
__inline __m128 _mm_cvtpi8_ps(__m64 _A)
{
  __m64  _Ext_val = _mm_cmpgt_pi8(_mm_setzero_si64(), _A);

  return _mm_cvtpi16_ps(_mm_unpacklo_pi8(_A, _Ext_val));
}


 /******************************************************/
 /*  NAME : _mm_cvtpu8_ps                              */
 /*  DESCRIPTION : Convert 4 8-bit unsigned integer    */
 /*                values to 4 single-precision float  */
 /*                values                              */
 /*  IN : __m64 _A                                      */
 /*  OUT : none                                        */
 /*  RETURN : __m128 : (float)_A                        */
 /******************************************************/
__inline __m128 _mm_cvtpu8_ps(__m64 _A)
{
  return _mm_cvtpu16_ps(_mm_unpacklo_pi8(_A, _mm_setzero_si64()));
}


 /******************************************************/
 /*  NAME : _mm_cvtps_pi8                              */
 /*  DESCRIPTION : Convert 4 single-precision float    */
 /*                values to 4 8-bit integer values    */
 /*  IN : __m128 _A                                     */
 /*  OUT : none                                        */
 /*  RETURN : __m64 : (char)_A                          */
 /******************************************************/
__inline __m64 _mm_cvtps_pi8(__m128 _A)
{
  return _mm_packs_pi16(_mm_cvtps_pi16(_A), _mm_setzero_si64());
}


 /******************************************************/
 /*  NAME : _mm_cvtpi32x2_ps                           */
 /*  DESCRIPTION : Convert 4 32-bit integer values     */
 /*                to 4 single-precision float values  */
 /*  IN : __m64 _A : operand 1                          */
 /*       __m64 _B : operand 2                          */
 /*  OUT : none                                        */
 /*  RETURN : __m128 : (float)_A,(float)_B               */
 /******************************************************/
__inline __m128 _mm_cvtpi32x2_ps(__m64 _A, __m64 _B)
{
  return _mm_movelh_ps(_mm_cvt_pi2ps(_mm_setzero_ps(), _A),
                       _mm_cvt_pi2ps(_mm_setzero_ps(), _B));
}

>>>emmintrin.h头文件中的函数接口

/*
 * DP, arithmetic
 */

extern __m128d _mm_add_sd(__m128d _A, __m128d _B);
extern __m128d _mm_add_pd(__m128d _A, __m128d _B);
extern __m128d _mm_sub_sd(__m128d _A, __m128d _B);
extern __m128d _mm_sub_pd(__m128d _A, __m128d _B);
extern __m128d _mm_mul_sd(__m128d _A, __m128d _B);
extern __m128d _mm_mul_pd(__m128d _A, __m128d _B);
extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_sqrt_pd(__m128d _A);
extern __m128d _mm_div_sd(__m128d _A, __m128d _B);
extern __m128d _mm_div_pd(__m128d _A, __m128d _B);
extern __m128d _mm_min_sd(__m128d _A, __m128d _B);
extern __m128d _mm_min_pd(__m128d _A, __m128d _B);
extern __m128d _mm_max_sd(__m128d _A, __m128d _B);
extern __m128d _mm_max_pd(__m128d _A, __m128d _B);

/*
 * DP, logicals
 */

extern __m128d _mm_and_pd(__m128d _A, __m128d _B);
extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B);
extern __m128d _mm_or_pd(__m128d _A, __m128d _B);
extern __m128d _mm_xor_pd(__m128d _A, __m128d _B);

/*
 * DP, comparisons
 */

extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B);
extern int _mm_comieq_sd(__m128d _A, __m128d _B);
extern int _mm_comilt_sd(__m128d _A, __m128d _B);
extern int _mm_comile_sd(__m128d _A, __m128d _B);
extern int _mm_comigt_sd(__m128d _A, __m128d _B);
extern int _mm_comige_sd(__m128d _A, __m128d _B);
extern int _mm_comineq_sd(__m128d _A, __m128d _B);
extern int _mm_ucomieq_sd(__m128d _A, __m128d _B);
extern int _mm_ucomilt_sd(__m128d _A, __m128d _B);
extern int _mm_ucomile_sd(__m128d _A, __m128d _B);
extern int _mm_ucomigt_sd(__m128d _A, __m128d _B);
extern int _mm_ucomige_sd(__m128d _A, __m128d _B);
extern int _mm_ucomineq_sd(__m128d _A, __m128d _B);

/*
 * DP, converts
 */

extern __m128d _mm_cvtepi32_pd(__m128i _A);
extern __m128i _mm_cvtpd_epi32(__m128d _A);
extern __m128i _mm_cvttpd_epi32(__m128d _A);
extern __m128 _mm_cvtepi32_ps(__m128i _A);
extern __m128i _mm_cvtps_epi32(__m128 _A);
extern __m128i _mm_cvttps_epi32(__m128 _A);
extern __m128 _mm_cvtpd_ps(__m128d _A);
extern __m128d _mm_cvtps_pd(__m128 _A);
extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B);
extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B);

extern int _mm_cvtsd_si32(__m128d _A);
extern int _mm_cvttsd_si32(__m128d _A);
extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B);

extern __m64 _mm_cvtpd_pi32(__m128d _A);
extern __m64 _mm_cvttpd_pi32(__m128d _A);
extern __m128d _mm_cvtpi32_pd(__m64 _A);

/*
 * DP, misc
 */

extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B);
extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B);
extern int _mm_movemask_pd(__m128d _A);
extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I);

/*
 * DP, loads
 */

extern __m128d _mm_load_pd(double const*_Dp);
extern __m128d _mm_load1_pd(double const*_Dp);
extern __m128d _mm_loadr_pd(double const*_Dp);
extern __m128d _mm_loadu_pd(double const*_Dp);
extern __m128d _mm_load_sd(double const*_Dp);
extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp);
extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp);

/*
 * DP, sets
 */

extern __m128d _mm_set_sd(double _W);
extern __m128d _mm_set1_pd(double _A);
extern __m128d _mm_set_pd(double _Z, double _Y);
extern __m128d _mm_setr_pd(double _Y, double _Z);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d _A, __m128d _B);

/*
 * DP, stores
 */

extern void _mm_store_sd(double *_Dp, __m128d _A);
extern void _mm_store1_pd(double *_Dp, __m128d _A);
extern void _mm_store_pd(double *_Dp, __m128d _A);
extern void _mm_storeu_pd(double *_Dp, __m128d _A);
extern void _mm_storer_pd(double *_Dp, __m128d _A);
extern void _mm_storeh_pd(double *_Dp, __m128d _A);
extern void _mm_storel_pd(double *_Dp, __m128d _A);

/*
 * Integer, arithmetic
 */

extern __m128i _mm_add_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_add_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_add_epi32(__m128i _A, __m128i _B);
extern __m64 _mm_add_si64(__m64 _A, __m64 _B);
extern __m128i _mm_add_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_max_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_max_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_min_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_min_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B);
extern __m64 _mm_mul_su32(__m64 _A, __m64 _B);
extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B);
extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B);
extern __m64 _mm_sub_si64(__m64 _A, __m64 _B);
extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B);

/*
 * Integer, logicals
 */

extern __m128i _mm_and_si128(__m128i _A, __m128i _B);
extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B);
extern __m128i _mm_or_si128(__m128i _A, __m128i _B);
extern __m128i _mm_xor_si128(__m128i _A, __m128i _B);

/*
 * Integer, shifts
 */

extern __m128i _mm_slli_si128(__m128i _A, int _Imm);
extern __m128i _mm_slli_epi16(__m128i _A, int _Count);
extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_slli_epi32(__m128i _A, int _Count);
extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_slli_epi64(__m128i _A, int _Count);
extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count);
extern __m128i _mm_srai_epi16(__m128i _A, int _Count);
extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_srai_epi32(__m128i _A, int _Count);
extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_si128(__m128i _A, int _Imm);
extern __m128i _mm_srli_epi16(__m128i _A, int _Count);
extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_epi32(__m128i _A, int _Count);
extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_epi64(__m128i _A, int _Count);
extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count);

/*
 * Integer, comparisons
 */

extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B);

/*
 * Integer, converts
 */

extern __m128i _mm_cvtsi32_si128(int _A);
extern int _mm_cvtsi128_si32(__m128i _A);

/*
 * Integer, misc
 */

extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B);
extern int _mm_extract_epi16(__m128i _A, int _Imm);
extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm);
extern int _mm_movemask_epi8(__m128i _A);
extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm);
extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm);
extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm);
extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B);

/*
 * Integer, loads
 */

extern __m128i _mm_load_si128(__m128i const*_P);
extern __m128i _mm_loadu_si128(__m128i const*_P);
extern __m128i _mm_loadl_epi64(__m128i const*_P);

/*
 * Integer, sets
 */

extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0);
extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0);
extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4,
                             short _W3, short _W2, short _W1, short _W0);
extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12,
                            char _B11, char _B10, char _B9, char _B8,
                            char _B7, char _B6, char _B5, char _B4,
                            char _B3, char _B2, char _B1, char _B0);
extern __m128i _mm_set1_epi64(__m64 _Q);
extern __m128i _mm_set1_epi32(int _I);
extern __m128i _mm_set1_epi16(short _W);
extern __m128i _mm_set1_epi8(char _B);
extern __m128i _mm_setl_epi64(__m128i _Q);
extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1);
extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3);
extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3,
                              short _W4, short _W5, short _W6, short _W7);
extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12,
                             char _B11, char _B10, char _B9, char _B8,
                             char _B7, char _B6, char _B5, char _B4,
                             char _B3, char _B2, char _B1, char _B0);
extern __m128i _mm_setzero_si128(void);

/*
 * Integer, stores
 */

extern void _mm_store_si128(__m128i *_P, __m128i _B);
extern void _mm_storeu_si128(__m128i *_P, __m128i _B);
extern void _mm_storel_epi64(__m128i *_P, __m128i _Q);
extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P);

/*
 * Integer, moves
 */

extern __m128i _mm_move_epi64(__m128i _Q);
extern __m128i _mm_movpi64_epi64(__m64 _Q);
extern __m64 _mm_movepi64_pi64(__m128i _Q);

/*
 * Cacheability support
 */

extern void _mm_stream_pd(double *_Dp, __m128d _A);
extern void _mm_stream_si128(__m128i *_P, __m128i _A);
extern void _mm_clflush(void const*_P);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern void _mm_stream_si32(int *_P, int _I);
extern void _mm_pause(void);

/*
 * New convert to float
 */

extern double _mm_cvtsd_f64(__m128d _A);

/*
 * Support for casting between various SP, DP, INT vector types.
 * Note that these do no conversion of values, they just change
 * the type.
 */

extern __m128  _mm_castpd_ps(__m128d);
extern __m128i _mm_castpd_si128(__m128d);
extern __m128d _mm_castps_pd(__m128);
extern __m128i _mm_castps_si128(__m128);
extern __m128  _mm_castsi128_ps(__m128i);
extern __m128d _mm_castsi128_pd(__m128i);

/*
 * Support for 64-bit extension intrinsics
 */

#if defined (_M_X64)
extern __int64 _mm_cvtsd_si64(__m128d);
extern __int64 _mm_cvttsd_si64(__m128d);
extern __m128d _mm_cvtsi64_sd(__m128d, __int64);
extern __m128i _mm_cvtsi64_si128(__int64);
extern __int64 _mm_cvtsi128_si64(__m128i);

>>>pmmintrin.h头文件中的函数接口

/*
 * New Single precision vector instructions.
 */

extern __m128 _mm_addsub_ps(__m128 /* a */, __m128 /* b */);
extern __m128 _mm_hadd_ps(__m128 /* a */, __m128 /* b */);
extern __m128 _mm_hsub_ps(__m128 /* a */, __m128 /* b */);
extern __m128 _mm_movehdup_ps(__m128 /* a */);
extern __m128 _mm_moveldup_ps(__m128 /* a */);

/*
 * New double precision vector instructions.
 */

extern __m128d _mm_addsub_pd(__m128d /* a */, __m128d /* b */);
extern __m128d _mm_hadd_pd(__m128d /* a */, __m128d /* b */);
extern __m128d _mm_hsub_pd(__m128d /* a */, __m128d /* b */);
extern __m128d _mm_loaddup_pd(double const * /* dp */);
extern __m128d _mm_movedup_pd(__m128d /* a */);

/*
 * New unaligned integer vector load instruction.
 */
extern __m128i _mm_lddqu_si128(__m128i const * /* p */);

/*
 * Miscellaneous new instructions.
 */
/*
 * For _mm_monitor p goes in eax, extensions goes in ecx, hints goes in edx.
 */
extern void _mm_monitor(void const * /* p */, unsigned /* extensions */, unsigned /* hints */);

/*
 * For _mm_mwait, extensions goes in ecx, hints goes in eax.
 */
extern void _mm_mwait(unsigned /* extensions */, unsigned /* hints */);

>>>tmmintrin.h头文件中的函数接口

    // Horizontal Add: add pairs of adjacent words or double words.
    // Each field in the result is the sum of two adjacent fields
    // from the arguments, with the lower result fields coming from
    // the first argument and the upper result fields coming from
    // the second argument. The "hadds" forms saturate the signed
    // addition rather than wrapping.

    extern __m128i _mm_hadd_epi16 (__m128i, __m128i);
    extern __m128i _mm_hadd_epi32 (__m128i, __m128i);
    extern __m128i _mm_hadds_epi16 (__m128i, __m128i);

    extern __m64 _mm_hadd_pi16 (__m64, __m64);
    extern __m64 _mm_hadd_pi32 (__m64, __m64);
    extern __m64 _mm_hadds_pi16 (__m64, __m64);

    // Horizontal Subtract: subtract pairs of adjacent words or double
    // words. Each field in the result is the difference of two adjacent
    // fields from the arguments, where the upper field is subtracted
    // from the lower field. The lower result fields come from
    // the first argument and the upper result fields come from
    // the second argument. The "hsubs" forms saturate the signed
    // subtraction rather than wrapping.

    extern __m128i _mm_hsub_epi16 (__m128i, __m128i);
    extern __m128i _mm_hsub_epi32 (__m128i, __m128i);
    extern __m128i _mm_hsubs_epi16 (__m128i, __m128i);

    extern __m64 _mm_hsub_pi16 (__m64, __m64);
    extern __m64 _mm_hsub_pi32 (__m64, __m64);
    extern __m64 _mm_hsubs_pi16 (__m64, __m64);

    // Multiply unsigned bytes by signed bytes and sum the word
    // results in pairs with saturation. Each byte of the first
    // argument is zero-extended to a word field and each byte
    // of the second argument is sign-extended to a word field,
    // then each pair of words is multiplied together to give
    // signed word intermediate results. Pairs of words from
    // that result are added horizontally with saturation
    // to give the final result.

    extern __m128i _mm_maddubs_epi16 (__m128i, __m128i);

    extern __m64 _mm_maddubs_pi16 (__m64, __m64);

    // Packed multiply high integers with round and scaling,
    // {X,}MM2/m{128,64} (b) to {X,}MM1 (a).

    extern __m128i _mm_mulhrs_epi16 (__m128i, __m128i);

    extern __m64 _mm_mulhrs_pi16 (__m64, __m64);

    // Packed shuffle bytes
    // {X,}MM2/m{128,64} (b) by {X,}MM1 (a).

    extern __m128i _mm_shuffle_epi8 (__m128i, __m128i);

    extern __m64 _mm_shuffle_pi8 (__m64, __m64);

    // Packed byte, word, double word sign, {X,}MM2/m{128,64} (b) to
    // {X,}MM1 (a).

    extern __m128i _mm_sign_epi8 (__m128i, __m128i);
    extern __m128i _mm_sign_epi16 (__m128i, __m128i);
    extern __m128i _mm_sign_epi32 (__m128i, __m128i);

    extern __m64 _mm_sign_pi8 (__m64, __m64);
    extern __m64 _mm_sign_pi16 (__m64, __m64);
    extern __m64 _mm_sign_pi32 (__m64, __m64);

    // Packed align and shift right by n*8 bits,
    // {X,}MM2/m{128,64} (b) to {X,}MM1 (a).

    extern __m128i _mm_alignr_epi8 (__m128i, __m128i, int);

    extern __m64 _mm_alignr_pi8 (__m64, __m64, int);

    // Packed byte, word, double word absolute value,
    // {X,}MM2/m{128,64} (b) to {X,}MM1 (a).

    extern __m128i _mm_abs_epi8 (__m128i);
    extern __m128i _mm_abs_epi16 (__m128i);
    extern __m128i _mm_abs_epi32 (__m128i);

    extern __m64 _mm_abs_pi8 (__m64);
    extern __m64 _mm_abs_pi16 (__m64);
    extern __m64 _mm_abs_pi32 (__m64);

>>>smmintrin.h头文件中的函数接口

        // Integer blend instructions - select data from 2 sources
        // using constant or variable mask

        extern __m128i _mm_blend_epi16 (__m128i, __m128i, const int /* mask */);
        extern __m128i _mm_blendv_epi8 (__m128i, __m128i, __m128i mask);

        // Float single precision blend instructions - select data
        // from 2 sources using constant/variable mask

        extern __m128  _mm_blend_ps (__m128, __m128, const int /* mask */);
        extern __m128  _mm_blendv_ps(__m128, __m128, __m128 /* mask */);

        // Float double precision blend instructions - select data
        // from 2 sources using constant/variable mask

        extern __m128d _mm_blend_pd (__m128d, __m128d, const int /* mask */);
        extern __m128d _mm_blendv_pd(__m128d, __m128d, __m128d /* mask */);

        // Dot product instructions with mask-defined summing and zeroing
        // of result's parts

        extern __m128  _mm_dp_ps(__m128, __m128, const int /* mask */);
        extern __m128d _mm_dp_pd(__m128d, __m128d, const int /* mask */);

        // Packed integer 64-bit comparison, zeroing or filling with ones
        // corresponding parts of result

        extern __m128i _mm_cmpeq_epi64(__m128i, __m128i);

        // Min/max packed integer instructions

        extern __m128i _mm_min_epi8 (__m128i, __m128i);
        extern __m128i _mm_max_epi8 (__m128i, __m128i);

        extern __m128i _mm_min_epu16(__m128i, __m128i);
        extern __m128i _mm_max_epu16(__m128i, __m128i);

        extern __m128i _mm_min_epi32(__m128i, __m128i);
        extern __m128i _mm_max_epi32(__m128i, __m128i);
        extern __m128i _mm_min_epu32(__m128i, __m128i);
        extern __m128i _mm_max_epu32(__m128i, __m128i);

        // Packed integer 32-bit multiplication with truncation
        // of upper halves of results

        extern __m128i _mm_mullo_epi32(__m128i, __m128i);

        // Packed integer 32-bit multiplication of 2 pairs of operands
        // producing two 64-bit results

        extern __m128i _mm_mul_epi32(__m128i, __m128i);

        // Packed integer 128-bit bitwise comparison.
        // return 1 if (val 'and' mask) == 0

        extern int _mm_testz_si128(__m128i /* mask */, __m128i /* val */);

        // Packed integer 128-bit bitwise comparison.
        // return 1 if (val 'and_not' mask) == 0

        extern int _mm_testc_si128(__m128i /* mask */, __m128i /* val */);

        // Packed integer 128-bit bitwise comparison
        // ZF = ((val 'and' mask) == 0)  CF = ((val 'and_not' mask) == 0)
        // return 1 if both ZF and CF are 0

        extern int _mm_testnzc_si128(__m128i /* mask */, __m128i /* val */);

        // Insert single precision float into packed single precision
        // array element selected by index.
        // The bits [7-6] of the 3d parameter define src index,
        // the bits [5-4] define dst index, and bits [3-0] define zeroing
        // mask for dst

        extern __m128 _mm_insert_ps(__m128 /* dst */, __m128 /* src */, const int /* index */);

        // Helper macro to create index-parameter value for _mm_insert_ps

#define _MM_MK_INSERTPS_NDX(srcField, dstField, zeroMask) \
        (((srcField)<<6) | ((dstField)<<4) | (zeroMask))

        // Extract binary representation of single precision float from
        // packed single precision array element selected by index

        extern int _mm_extract_ps(__m128 /* src */, const int /* index */);

        // Extract single precision float from packed single precision
        // array element selected by index into dest

#define _MM_EXTRACT_FLOAT(dest, src, ndx) \
        *((int*)&(dest)) = _mm_extract_ps((src), (ndx))

        // Extract specified single precision float element
        // into the lower part of __m128

#define _MM_PICK_OUT_PS(src, num) \
        _mm_insert_ps(_mm_setzero_ps(), (src), \
                      _MM_MK_INSERTPS_NDX((num), 0, 0x0e))

        // Insert integer into packed integer array element
        // selected by index

        extern __m128i _mm_insert_epi8 (__m128i /* dst */, int /* src */, const int /* index */);
        extern __m128i _mm_insert_epi32(__m128i /* dst */, int /* src */, const int /* index */);

#if defined (_M_X64)
        extern __m128i _mm_insert_epi64(__m128i /* dst */, __int64 /* src */, const int /* index */);
#endif  /* defined (_M_X64) */
        // Extract integer from packed integer array element
        // selected by index

        extern int   _mm_extract_epi8 (__m128i /* src */, const int /* index */);
        extern int   _mm_extract_epi32(__m128i /* src */, const int /* index */);

#if defined (_M_X64)
        extern __int64 _mm_extract_epi64(__m128i /* src */, const int /* index */);
#endif  /* defined (_M_X64) */

        // Horizontal packed word minimum and its index in
        // result[15:0] and result[18:16] respectively

        extern __m128i _mm_minpos_epu16(__m128i);

        // Packed/single float double precision rounding

        extern __m128d _mm_round_pd(__m128d /* val */, int /* iRoundMode */);
        extern __m128d _mm_round_sd(__m128d /* dst */, __m128d /* val */, int /* iRoundMode */);

        // Packed/single float single precision rounding

        extern __m128  _mm_round_ps(__m128  /* val */, int /* iRoundMode */);
        extern __m128  _mm_round_ss(__m128 /* dst */, __m128  /* val */, int /* iRoundMode */);

        // Packed integer sign-extension

        extern __m128i _mm_cvtepi8_epi32 (__m128i);
        extern __m128i _mm_cvtepi16_epi32(__m128i);
        extern __m128i _mm_cvtepi8_epi64 (__m128i);
        extern __m128i _mm_cvtepi32_epi64(__m128i);
        extern __m128i _mm_cvtepi16_epi64(__m128i);
        extern __m128i _mm_cvtepi8_epi16 (__m128i);

        // Packed integer zero-extension

        extern __m128i _mm_cvtepu8_epi32 (__m128i);
        extern __m128i _mm_cvtepu16_epi32(__m128i);
        extern __m128i _mm_cvtepu8_epi64 (__m128i);
        extern __m128i _mm_cvtepu32_epi64(__m128i);
        extern __m128i _mm_cvtepu16_epi64(__m128i);
        extern __m128i _mm_cvtepu8_epi16 (__m128i);


        // Pack 8 double words from 2 operands into 8 words of result
        // with unsigned saturation

        extern __m128i _mm_packus_epi32(__m128i, __m128i);

        // Sum absolute 8-bit integer difference of adjacent groups of 4 byte
        // integers in operands. Starting offsets within operands are
        // determined by mask

        extern __m128i _mm_mpsadbw_epu8(__m128i /* s1 */, __m128i /* s2 */, const int /* mask */);

        /*
         * Load double quadword using non-temporal aligned hint
         */

        extern __m128i _mm_stream_load_si128(__m128i*);

>>>nmmintrin.h头文件中的函数接口

/*
 * Intrinsics for text/string processing.
 */

    extern __m128i _mm_cmpistrm (__m128i /* a */, __m128i /* b */, const int /* mode */);
    extern int     _mm_cmpistri (__m128i /* a */, __m128i /* b */, const int /* mode */);

    extern __m128i _mm_cmpestrm (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);
    extern int     _mm_cmpestri (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);

/*
 * Intrinsics for text/string processing and reading values of EFlags.
 */

    extern int     _mm_cmpistrz (__m128i /* a */, __m128i /* b */, const int /* mode */);
    extern int     _mm_cmpistrc (__m128i /* a */, __m128i /* b */, const int /* mode */);
    extern int     _mm_cmpistrs (__m128i /* a */, __m128i /* b */, const int /* mode */);
    extern int     _mm_cmpistro (__m128i /* a */, __m128i /* b */, const int /* mode */);
    extern int     _mm_cmpistra (__m128i /* a */, __m128i /* b */, const int /* mode */);

    extern int     _mm_cmpestrz (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);
    extern int     _mm_cmpestrc (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);
    extern int     _mm_cmpestrs (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);
    extern int     _mm_cmpestro (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);
    extern int     _mm_cmpestra (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);

/*
 * Packed integer 64-bit comparison, zeroing or filling with ones
 * corresponding parts of result
 */

    extern __m128i _mm_cmpgt_epi64(__m128i /* val1 */, __m128i /* val2 */);

/*
 * Calculate a number of bits set to 1
 */

    extern int _mm_popcnt_u32(unsigned int /* v */);

#if defined (_M_X64)
    extern __int64 _mm_popcnt_u64(unsigned __int64 /* v */);
#endif  /* defined (_M_X64) */

/*
 * Accumulate CRC32 (polynomial 0x11EDC6F41) value
 */

    extern unsigned int _mm_crc32_u8 (unsigned int /* crc */, unsigned char /* v */);
    extern unsigned int _mm_crc32_u16(unsigned int /* crc */, unsigned short /* v */);
    extern unsigned int _mm_crc32_u32(unsigned int /* crc */, unsigned int /* v */);

#if defined (_M_X64)
    extern unsigned __int64 _mm_crc32_u64(unsigned __int64 /* crc */, unsigned __int64 /* v */);
#endif  /

4.实际使用

由于上面头文件中的函数接口比较多,所以仅需要大致了解一下,在用到的时候则需要知道具体函数怎么调用以及功能。这里把最上面说的自己阅读代码时遇到的函数接口做一个简单介绍:

// Sets the 16 signed 8-bit integer values to b.
//将16个有符号8位整数值设置为b。
__m128i _mm_set1_epi8(char b)
// 加载128bits值,返回可以存放在代表寄存器的变量中的值;
// 注意:p不用是一个16-bit对齐的一个变量的地址;
__m128i _mm_loadu_si128 (__m128i *p);
//可存储128位数据;将__m128i 变量a的值存储到p所指定的变量中去;
//p必须是一个16-bit对齐的一个变量的地址。
_mm_store_si128 ( __m128i *p, __m128i a)
//将S0和S1的低64位数以8位为单位进行交错;
//S0:A15 A14 A13 A12 A11 A10 A9 A8 A7 A6 A5 A4 A3 A2 A1 A0
//S1:B15 B14 B13 B12 B11 B10 B9 B8 B7 B6 B5 B4 B3 B2 B1 B0
//_mm_unpacklo_epi8(S0,S1):B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
_mm_unpacklo_epi8(_m128i S0,_m128i S1)
//将两个有符号8位或者无符号16位整数相加
//r0 := a0 + b0
//r1 := a1 + b1
//...
//r7 := a7 + b7
__m128i _mm_add_epi16 (__m128i a, __m128i b);
//右移8*imm8位
_m128i _mm_srli_si128 (__m128i a, int imm8)
  C++知识库 最新文章
【C++】友元、嵌套类、异常、RTTI、类型转换
通讯录的思路与实现(C语言)
C++PrimerPlus 第七章 函数-C++的编程模块(
Problem C: 算法9-9~9-12:平衡二叉树的基本
MSVC C++ UTF-8编程
C++进阶 多态原理
简单string类c++实现
我的年度总结
【C语言】以深厚地基筑伟岸高楼-基础篇(六
c语言常见错误合集
上一篇文章      下一篇文章      查看所有文章
加:2022-04-09 18:05:39  更:2022-04-09 18:09:31 
 
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

360图书馆 购物 三丰科技 阅读网 日历 万年历 2025年1日历 -2025/1/11 0:18:29-

图片自动播放器
↓图片自动播放器↓
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
图片批量下载器
↓批量下载图片,美女图库↓
  网站联系: qq:121756557 email:121756557@qq.com  IT数码