[X86] Remove __extension__ from macro intrinsics when it's not needed.

I think this is a holdover from when we used to declare variables inside the macros, and it has then been copied and pasted forward for years every time a new macro intrinsic gets added.

Interestingly, this caused some IRGen tests to produce slightly more optimized output: we now return a zeroinitializer directly instead of going through a store+load.

It also removed a bogus error message in another test.

llvm-svn: 333613
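For readers unfamiliar with the pattern, here is a minimal sketch (simplified stand-in macros, not the real header code) of what is being removed: the macros were wrapped in GNU statement expressions `({ ... })`, with `__extension__` in front only to silence the pedantic extension warning, and since the bodies no longer declare local variables a plain parenthesized expression is enough, as the diff below shows for every affected intrinsic.

```c
/* Illustrative sketch only -- simplified stand-ins for the intrinsic macros
 * touched by this commit, not the real avx2/avx512 header definitions. */

/* Old style: a GNU statement expression. __extension__ only suppresses the
 * -pedantic warning about the ({ ... }) extension; the braces were needed
 * back when the macro body declared local variables. */
#define OLD_STYLE_ADD(a, b) __extension__ ({ (int)(a) + (int)(b); })

/* New style: a plain parenthesized expression. No extension is involved,
 * and the result can be used in more contexts (for example in constant
 * expressions, where a statement expression is not allowed). */
#define NEW_STYLE_ADD(a, b) ((int)(a) + (int)(b))

int main(void) {
  /* Both forms evaluate to the same value here. */
  return OLD_STYLE_ADD(1, 2) == NEW_STYLE_ADD(1, 2) ? 0 : 1;
}
```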
Craig Topper 2018-05-31 00:51:20 +00:00
parent 7744c7f137
commit c633867944
23 changed files with 2939 additions and 2941 deletions


@@ -126,9 +126,9 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
 return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
 }
-#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
+#define _mm256_alignr_epi8(a, b, n) \
 (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
-(__v32qi)(__m256i)(b), (n)); })
+(__v32qi)(__m256i)(b), (n))
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_and_si256(__m256i __a, __m256i __b)
@@ -169,7 +169,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 (__v32qi)__M);
 }
-#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
+#define _mm256_blend_epi16(V1, V2, M) \
 (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1), \
 (__v16hi)(__m256i)(V2), \
 (((M) & 0x01) ? 16 : 0), \
@@ -187,7 +187,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 (((M) & 0x10) ? 28 : 12), \
 (((M) & 0x20) ? 29 : 13), \
 (((M) & 0x40) ? 30 : 14), \
-(((M) & 0x80) ? 31 : 15)); })
+(((M) & 0x80) ? 31 : 15))
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@@ -503,7 +503,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
 }
-#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+#define _mm256_shuffle_epi32(a, imm) \
 (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
 (__v8si)_mm256_undefined_si256(), \
 0 + (((imm) >> 0) & 0x3), \
@@ -513,9 +513,9 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 4 + (((imm) >> 0) & 0x3), \
 4 + (((imm) >> 2) & 0x3), \
 4 + (((imm) >> 4) & 0x3), \
-4 + (((imm) >> 6) & 0x3)); })
+4 + (((imm) >> 6) & 0x3))
-#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+#define _mm256_shufflehi_epi16(a, imm) \
 (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
 (__v16hi)_mm256_undefined_si256(), \
 0, 1, 2, 3, \
@@ -527,9 +527,9 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 12 + (((imm) >> 0) & 0x3), \
 12 + (((imm) >> 2) & 0x3), \
 12 + (((imm) >> 4) & 0x3), \
-12 + (((imm) >> 6) & 0x3)); })
+12 + (((imm) >> 6) & 0x3))
-#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+#define _mm256_shufflelo_epi16(a, imm) \
 (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
 (__v16hi)_mm256_undefined_si256(), \
 0 + (((imm) >> 0) & 0x3), \
@@ -541,7 +541,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 8 + (((imm) >> 2) & 0x3), \
 8 + (((imm) >> 4) & 0x3), \
 8 + (((imm) >> 6) & 0x3), \
-12, 13, 14, 15); })
+12, 13, 14, 15)
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_sign_epi8(__m256i __a, __m256i __b)
@@ -561,7 +561,7 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
 }
-#define _mm256_slli_si256(a, imm) __extension__ ({ \
+#define _mm256_slli_si256(a, imm) \
 (__m256i)__builtin_shufflevector( \
 (__v32qi)_mm256_setzero_si256(), \
 (__v32qi)(__m256i)(a), \
@@ -596,7 +596,7 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \
 ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \
 ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \
-((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); })
+((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm))
 #define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
@@ -660,7 +660,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
 }
-#define _mm256_srli_si256(a, imm) __extension__ ({ \
+#define _mm256_srli_si256(a, imm) \
 (__m256i)__builtin_shufflevector( \
 (__v32qi)(__m256i)(a), \
 (__v32qi)_mm256_setzero_si256(), \
@@ -695,7 +695,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 ((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \
 ((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \
 ((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \
-((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); })
+((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31))
 #define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
@@ -874,15 +874,15 @@ _mm256_broadcastsi128_si256(__m128i __X)
 return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
 }
-#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+#define _mm_blend_epi32(V1, V2, M) \
 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1), \
 (__v4si)(__m128i)(V2), \
 (((M) & 0x01) ? 4 : 0), \
 (((M) & 0x02) ? 5 : 1), \
 (((M) & 0x04) ? 6 : 2), \
-(((M) & 0x08) ? 7 : 3)); })
+(((M) & 0x08) ? 7 : 3))
-#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+#define _mm256_blend_epi32(V1, V2, M) \
 (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1), \
 (__v8si)(__m256i)(V2), \
 (((M) & 0x01) ? 8 : 0), \
@@ -892,7 +892,7 @@ _mm256_broadcastsi128_si256(__m128i __X)
 (((M) & 0x10) ? 12 : 4), \
 (((M) & 0x20) ? 13 : 5), \
 (((M) & 0x40) ? 14 : 6), \
-(((M) & 0x80) ? 15 : 7)); })
+(((M) & 0x80) ? 15 : 7))
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastb_epi8(__m128i __X)
@@ -949,13 +949,13 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
 }
-#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+#define _mm256_permute4x64_pd(V, M) \
 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
 (__v4df)_mm256_undefined_pd(), \
 ((M) >> 0) & 0x3, \
 ((M) >> 2) & 0x3, \
 ((M) >> 4) & 0x3, \
-((M) >> 6) & 0x3); })
+((M) >> 6) & 0x3)
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@@ -963,30 +963,30 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
 }
-#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+#define _mm256_permute4x64_epi64(V, M) \
 (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
 (__v4di)_mm256_undefined_si256(), \
 ((M) >> 0) & 0x3, \
 ((M) >> 2) & 0x3, \
 ((M) >> 4) & 0x3, \
-((M) >> 6) & 0x3); })
+((M) >> 6) & 0x3)
-#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+#define _mm256_permute2x128_si256(V1, V2, M) \
-(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
+(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M))
-#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+#define _mm256_extracti128_si256(V, M) \
 (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
 (__v4di)_mm256_undefined_si256(), \
 (((M) & 1) ? 2 : 0), \
-(((M) & 1) ? 3 : 1) ); })
+(((M) & 1) ? 3 : 1) )
-#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+#define _mm256_inserti128_si256(V1, V2, M) \
 (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
 (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
 (((M) & 1) ? 0 : 4), \
 (((M) & 1) ? 1 : 5), \
 (((M) & 1) ? 4 : 2), \
-(((M) & 1) ? 5 : 3) ); })
+(((M) & 1) ? 5 : 3) )
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskload_epi32(int const *__X, __m256i __M)
@@ -1096,212 +1096,212 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
-#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
 (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
 (double const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v2df)(__m128d)(mask), (s)); })
+(__v2df)(__m128d)(mask), (s))
-#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
 (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
 (double const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v4df)(__m256d)(mask), (s)); })
+(__v4df)(__m256d)(mask), (s))
-#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
 (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
 (double const *)(m), \
 (__v2di)(__m128i)(i), \
-(__v2df)(__m128d)(mask), (s)); })
+(__v2df)(__m128d)(mask), (s))
-#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
 (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
 (double const *)(m), \
 (__v4di)(__m256i)(i), \
-(__v4df)(__m256d)(mask), (s)); })
+(__v4df)(__m256d)(mask), (s))
-#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
 (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
 (float const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v4sf)(__m128)(mask), (s)); })
+(__v4sf)(__m128)(mask), (s))
-#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
 (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
 (float const *)(m), \
 (__v8si)(__m256i)(i), \
-(__v8sf)(__m256)(mask), (s)); })
+(__v8sf)(__m256)(mask), (s))
-#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
 (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
 (float const *)(m), \
 (__v2di)(__m128i)(i), \
-(__v4sf)(__m128)(mask), (s)); })
+(__v4sf)(__m128)(mask), (s))
-#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
 (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
 (float const *)(m), \
 (__v4di)(__m256i)(i), \
-(__v4sf)(__m128)(mask), (s)); })
+(__v4sf)(__m128)(mask), (s))
-#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
 (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
 (int const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v4si)(__m128i)(mask), (s)); })
+(__v4si)(__m128i)(mask), (s))
-#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
 (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
 (int const *)(m), \
 (__v8si)(__m256i)(i), \
-(__v8si)(__m256i)(mask), (s)); })
+(__v8si)(__m256i)(mask), (s))
-#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
 (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
 (int const *)(m), \
 (__v2di)(__m128i)(i), \
-(__v4si)(__m128i)(mask), (s)); })
+(__v4si)(__m128i)(mask), (s))
-#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
 (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
 (int const *)(m), \
 (__v4di)(__m256i)(i), \
-(__v4si)(__m128i)(mask), (s)); })
+(__v4si)(__m128i)(mask), (s))
-#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
 (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
 (long long const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v2di)(__m128i)(mask), (s)); })
+(__v2di)(__m128i)(mask), (s))
-#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
 (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
 (long long const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v4di)(__m256i)(mask), (s)); })
+(__v4di)(__m256i)(mask), (s))
-#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
 (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
 (long long const *)(m), \
 (__v2di)(__m128i)(i), \
-(__v2di)(__m128i)(mask), (s)); })
+(__v2di)(__m128i)(mask), (s))
-#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
 (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
 (long long const *)(m), \
 (__v4di)(__m256i)(i), \
-(__v4di)(__m256i)(mask), (s)); })
+(__v4di)(__m256i)(mask), (s))
-#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+#define _mm_i32gather_pd(m, i, s) \
 (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
 (double const *)(m), \
 (__v4si)(__m128i)(i), \
 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
 _mm_setzero_pd()), \
-(s)); })
+(s))
-#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_pd(m, i, s) \
 (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
 (double const *)(m), \
 (__v4si)(__m128i)(i), \
 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
 _mm256_setzero_pd(), \
 _CMP_EQ_OQ), \
-(s)); })
+(s))
-#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+#define _mm_i64gather_pd(m, i, s) \
 (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
 (double const *)(m), \
 (__v2di)(__m128i)(i), \
 (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
 _mm_setzero_pd()), \
-(s)); })
+(s))
-#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_pd(m, i, s) \
 (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
 (double const *)(m), \
 (__v4di)(__m256i)(i), \
 (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
 _mm256_setzero_pd(), \
 _CMP_EQ_OQ), \
-(s)); })
+(s))
-#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+#define _mm_i32gather_ps(m, i, s) \
 (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
 (float const *)(m), \
 (__v4si)(__m128i)(i), \
 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
 _mm_setzero_ps()), \
-(s)); })
+(s))
-#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_ps(m, i, s) \
 (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
 (float const *)(m), \
 (__v8si)(__m256i)(i), \
 (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
 _mm256_setzero_ps(), \
 _CMP_EQ_OQ), \
-(s)); })
+(s))
-#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+#define _mm_i64gather_ps(m, i, s) \
 (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
 (float const *)(m), \
 (__v2di)(__m128i)(i), \
 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
 _mm_setzero_ps()), \
-(s)); })
+(s))
-#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_ps(m, i, s) \
 (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
 (float const *)(m), \
 (__v4di)(__m256i)(i), \
 (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
 _mm_setzero_ps()), \
-(s)); })
+(s))
-#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+#define _mm_i32gather_epi32(m, i, s) \
 (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
 (int const *)(m), (__v4si)(__m128i)(i), \
-(__v4si)_mm_set1_epi32(-1), (s)); })
+(__v4si)_mm_set1_epi32(-1), (s))
-#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_epi32(m, i, s) \
 (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
 (int const *)(m), (__v8si)(__m256i)(i), \
-(__v8si)_mm256_set1_epi32(-1), (s)); })
+(__v8si)_mm256_set1_epi32(-1), (s))
-#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+#define _mm_i64gather_epi32(m, i, s) \
 (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
 (int const *)(m), (__v2di)(__m128i)(i), \
-(__v4si)_mm_set1_epi32(-1), (s)); })
+(__v4si)_mm_set1_epi32(-1), (s))
-#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_epi32(m, i, s) \
 (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
 (int const *)(m), (__v4di)(__m256i)(i), \
-(__v4si)_mm_set1_epi32(-1), (s)); })
+(__v4si)_mm_set1_epi32(-1), (s))
-#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+#define _mm_i32gather_epi64(m, i, s) \
 (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
 (long long const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v2di)_mm_set1_epi64x(-1), (s)); })
+(__v2di)_mm_set1_epi64x(-1), (s))
-#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+#define _mm256_i32gather_epi64(m, i, s) \
 (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
 (long long const *)(m), \
 (__v4si)(__m128i)(i), \
-(__v4di)_mm256_set1_epi64x(-1), (s)); })
+(__v4di)_mm256_set1_epi64x(-1), (s))
-#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+#define _mm_i64gather_epi64(m, i, s) \
 (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
 (long long const *)(m), \
 (__v2di)(__m128i)(i), \
-(__v2di)_mm_set1_epi64x(-1), (s)); })
+(__v2di)_mm_set1_epi64x(-1), (s))
-#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+#define _mm256_i64gather_epi64(m, i, s) \
 (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
 (long long const *)(m), \
 (__v4di)(__m256i)(i), \
-(__v4di)_mm256_set1_epi64x(-1), (s)); })
+(__v4di)_mm256_set1_epi64x(-1), (s))
 #undef __DEFAULT_FN_ATTRS


@@ -36,45 +36,45 @@ typedef unsigned long long __mmask64;
 /* Integer compare */
-#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epi8_mask(a, b, p) \
 (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)-1); })
+(__mmask64)-1)
-#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
 (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)(m)); })
+(__mmask64)(m))
-#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epu8_mask(a, b, p) \
 (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)-1); })
+(__mmask64)-1)
-#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
 (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)(m)); })
+(__mmask64)(m))
-#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epi16_mask(a, b, p) \
 (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)-1); })
+(__mmask32)-1)
-#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
 (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)(m)); })
+(__mmask32)(m))
-#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+#define _mm512_cmp_epu16_mask(a, b, p) \
 (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)-1); })
+(__mmask32)-1)
-#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
 (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)(m)); })
+(__mmask32)(m))
 #define _mm512_cmpeq_epi8_mask(A, B) \
 _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
@@ -1286,7 +1286,7 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
 }
-#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
+#define _mm512_shufflehi_epi16(A, imm) \
 (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
 (__v32hi)_mm512_undefined_epi32(), \
 0, 1, 2, 3, \
@@ -1308,21 +1308,21 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
 28 + (((imm) >> 0) & 0x3), \
 28 + (((imm) >> 2) & 0x3), \
 28 + (((imm) >> 4) & 0x3), \
-28 + (((imm) >> 6) & 0x3)); })
+28 + (((imm) >> 6) & 0x3))
-#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
 (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflehi_epi16((A), \
 (imm)), \
-(__v32hi)(__m512i)(W)); })
+(__v32hi)(__m512i)(W))
-#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
 (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflehi_epi16((A), \
 (imm)), \
-(__v32hi)_mm512_setzero_si512()); })
+(__v32hi)_mm512_setzero_si512())
-#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
+#define _mm512_shufflelo_epi16(A, imm) \
 (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
 (__v32hi)_mm512_undefined_epi32(), \
 0 + (((imm) >> 0) & 0x3), \
@@ -1344,21 +1344,21 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
 24 + (((imm) >> 2) & 0x3), \
 24 + (((imm) >> 4) & 0x3), \
 24 + (((imm) >> 6) & 0x3), \
-28, 29, 30, 31); })
+28, 29, 30, 31)
-#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
 (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflelo_epi16((A), \
 (imm)), \
-(__v32hi)(__m512i)(W)); })
+(__v32hi)(__m512i)(W))
-#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
 (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflelo_epi16((A), \
 (imm)), \
-(__v32hi)_mm512_setzero_si512()); })
+(__v32hi)_mm512_setzero_si512())
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sllv_epi16(__m512i __A, __m512i __B)
@@ -1426,7 +1426,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
 (__v32hi)_mm512_setzero_si512());
 }
-#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
+#define _mm512_bslli_epi128(a, imm) \
 (__m512i)__builtin_shufflevector( \
 (__v64qi)_mm512_setzero_si512(), \
 (__v64qi)(__m512i)(a), \
@@ -1493,7 +1493,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
 ((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \
 ((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \
 ((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \
-((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
+((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_srlv_epi16(__m512i __A, __m512i __B)
@@ -1627,7 +1627,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
 (__v32hi)_mm512_setzero_si512());
 }
-#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
+#define _mm512_bsrli_epi128(a, imm) \
 (__m512i)__builtin_shufflevector( \
 (__v64qi)(__m512i)(a), \
 (__v64qi)_mm512_setzero_si512(), \
@@ -1694,7 +1694,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
 ((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \
 ((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \
 ((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \
-((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63)); })
+((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
@@ -1978,37 +1978,37 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
 (__v32hi)__W);
 }
-#define _mm512_alignr_epi8(A, B, N) __extension__ ({\
+#define _mm512_alignr_epi8(A, B, N) \
 (__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
-(__v64qi)(__m512i)(B), (int)(N)); })
+(__v64qi)(__m512i)(B), (int)(N))
-#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\
+#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
 (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
 (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
-(__v64qi)(__m512i)(W)); })
+(__v64qi)(__m512i)(W))
-#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\
+#define _mm512_maskz_alignr_epi8(U, A, B, N) \
 (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
 (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
-(__v64qi)(__m512i)_mm512_setzero_si512()); })
+(__v64qi)(__m512i)_mm512_setzero_si512())
-#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\
+#define _mm512_dbsad_epu8(A, B, imm) \
 (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
 (__v64qi)(__m512i)(B), (int)(imm), \
 (__v32hi)_mm512_undefined_epi32(), \
-(__mmask32)-1); })
+(__mmask32)-1)
-#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) ({\
+#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
 (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
 (__v64qi)(__m512i)(B), (int)(imm), \
 (__v32hi)(__m512i)(W), \
-(__mmask32)(U)); })
+(__mmask32)(U))
-#define _mm512_maskz_dbsad_epu8(U, A, B, imm) ({\
+#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
 (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
 (__v64qi)(__m512i)(B), (int)(imm), \
 (__v32hi)_mm512_setzero_si512(), \
-(__mmask32)(U)); })
+(__mmask32)(U))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_sad_epu8 (__m512i __A, __m512i __B)


@@ -226,20 +226,20 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvt_roundpd_epi64(A, R) __extension__ ({ \
+#define _mm512_cvt_roundpd_epi64(A, R) \
 (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
 (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtpd_epu64 (__m512d __A) {
@@ -265,20 +265,20 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvt_roundpd_epu64(A, R) __extension__ ({ \
+#define _mm512_cvt_roundpd_epu64(A, R) \
 (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
 (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtps_epi64 (__m256 __A) {
@@ -304,20 +304,20 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvt_roundps_epi64(A, R) __extension__ ({ \
+#define _mm512_cvt_roundps_epi64(A, R) \
 (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvt_roundps_epi64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
 (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvtps_epu64 (__m256 __A) {
@@ -343,20 +343,20 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvt_roundps_epu64(A, R) __extension__ ({ \
+#define _mm512_cvt_roundps_epu64(A, R) \
 (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvt_roundps_epu64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
 (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -378,20 +378,20 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
 (__v8df)_mm512_setzero_pd());
 }
-#define _mm512_cvt_roundepi64_pd(A, R) __extension__ ({ \
+#define _mm512_cvt_roundepi64_pd(A, R) \
 (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
 (__v8df)_mm512_setzero_pd(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
 (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
 (__v8df)(__m512d)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
 (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
 (__v8df)_mm512_setzero_pd(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm512_cvtepi64_ps (__m512i __A) {
@@ -417,20 +417,20 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvt_roundepi64_ps(A, R) __extension__ ({ \
+#define _mm512_cvt_roundepi64_ps(A, R) \
 (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
 (__v8sf)_mm256_setzero_ps(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
 (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
 (__v8sf)(__m256)(W), (__mmask8)(U), \
-(int)(R)); })
+(int)(R))
-#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
 (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
 (__v8sf)_mm256_setzero_ps(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -457,20 +457,20 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvtt_roundpd_epi64(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundpd_epi64(A, R) \
 (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
 (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttpd_epu64 (__m512d __A) {
@@ -496,20 +496,20 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvtt_roundpd_epu64(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundpd_epu64(A, R) \
 (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
 (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epi64 (__m256 __A) {
@@ -535,20 +535,20 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvtt_roundps_epi64(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundps_epi64(A, R) \
 (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
 (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_cvttps_epu64 (__m256 __A) {
@@ -574,20 +574,20 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvtt_roundps_epu64(A, R) __extension__ ({ \
+#define _mm512_cvtt_roundps_epu64(A, R) \
 (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
 (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
 (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_cvtepu64_pd (__m512i __A) {
@@ -608,21 +608,21 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
 (__v8df)_mm512_setzero_pd());
 }
-#define _mm512_cvt_roundepu64_pd(A, R) __extension__ ({ \
+#define _mm512_cvt_roundepu64_pd(A, R) \
 (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
 (__v8df)_mm512_setzero_pd(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
 (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
 (__v8df)(__m512d)(W), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
 (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
 (__v8df)_mm512_setzero_pd(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
 static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -649,291 +649,291 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
 _MM_FROUND_CUR_DIRECTION);
 }
-#define _mm512_cvt_roundepu64_ps(A, R) __extension__ ({ \
+#define _mm512_cvt_roundepu64_ps(A, R) \
 (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
 (__v8sf)_mm256_setzero_ps(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) __extension__ ({ \
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
 (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
 (__v8sf)(__m256)(W), (__mmask8)(U), \
-(int)(R)); })
+(int)(R))
-#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) __extension__ ({ \
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
 (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
 (__v8sf)_mm256_setzero_ps(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_range_pd(A, B, C) __extension__ ({ \
+#define _mm512_range_pd(A, B, C) \
 (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
 (__v8df)(__m512d)(B), (int)(C), \
 (__v8df)_mm512_setzero_pd(), \
 (__mmask8)-1, \
-_MM_FROUND_CUR_DIRECTION); })
+_MM_FROUND_CUR_DIRECTION)
-#define _mm512_mask_range_pd(W, U, A, B, C) __extension__ ({ \
+#define _mm512_mask_range_pd(W, U, A, B, C) \
 (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
 (__v8df)(__m512d)(B), (int)(C), \
 (__v8df)(__m512d)(W), (__mmask8)(U), \
-_MM_FROUND_CUR_DIRECTION); })
+_MM_FROUND_CUR_DIRECTION)
-#define _mm512_maskz_range_pd(U, A, B, C) __extension__ ({ \
+#define _mm512_maskz_range_pd(U, A, B, C) \
 (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
 (__v8df)(__m512d)(B), (int)(C), \
 (__v8df)_mm512_setzero_pd(), \
 (__mmask8)(U), \
-_MM_FROUND_CUR_DIRECTION); })
+_MM_FROUND_CUR_DIRECTION)
-#define _mm512_range_round_pd(A, B, C, R) __extension__ ({ \
+#define _mm512_range_round_pd(A, B, C, R) \
 (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
 (__v8df)(__m512d)(B), (int)(C), \
 (__v8df)_mm512_setzero_pd(), \
-(__mmask8)-1, (int)(R)); })
+(__mmask8)-1, (int)(R))
-#define _mm512_mask_range_round_pd(W, U, A, B, C, R) __extension__ ({ \
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
 (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
 (__v8df)(__m512d)(B), (int)(C), \
 (__v8df)(__m512d)(W), (__mmask8)(U), \
-(int)(R)); })
+(int)(R))
-#define _mm512_maskz_range_round_pd(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
 (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
 (__v8df)(__m512d)(B), (int)(C), \
 (__v8df)_mm512_setzero_pd(), \
-(__mmask8)(U), (int)(R)); })
+(__mmask8)(U), (int)(R))
-#define _mm512_range_ps(A, B, C) __extension__ ({ \
+#define _mm512_range_ps(A, B, C) \
 (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
 (__v16sf)(__m512)(B), (int)(C), \
 (__v16sf)_mm512_setzero_ps(), \
 (__mmask16)-1, \
-_MM_FROUND_CUR_DIRECTION); })
+_MM_FROUND_CUR_DIRECTION)
-#define _mm512_mask_range_ps(W, U, A, B, C) __extension__ ({ \
+#define _mm512_mask_range_ps(W, U, A, B, C) \
 (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
 (__v16sf)(__m512)(B), (int)(C), \
 (__v16sf)(__m512)(W), (__mmask16)(U), \
-_MM_FROUND_CUR_DIRECTION); })
+_MM_FROUND_CUR_DIRECTION)
-#define _mm512_maskz_range_ps(U, A, B, C) __extension__ ({ \
+#define _mm512_maskz_range_ps(U, A, B, C) \
 (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
 (__v16sf)(__m512)(B), (int)(C), \
 (__v16sf)_mm512_setzero_ps(), \
 (__mmask16)(U), \
-_MM_FROUND_CUR_DIRECTION); })
+_MM_FROUND_CUR_DIRECTION)
-#define _mm512_range_round_ps(A, B, C, R) __extension__ ({ \
+#define _mm512_range_round_ps(A, B, C, R) \
 (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
 (__v16sf)(__m512)(B), (int)(C), \
 (__v16sf)_mm512_setzero_ps(), \
-(__mmask16)-1, (int)(R)); })
+(__mmask16)-1, (int)(R))
-#define _mm512_mask_range_round_ps(W, U, A, B, C, R) __extension__ ({ \
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
 (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
 (__v16sf)(__m512)(B), (int)(C), \
 (__v16sf)(__m512)(W), (__mmask16)(U), \
-(int)(R)); })
+(int)(R))
-#define _mm512_maskz_range_round_ps(U, A, B, C, R) __extension__ ({ \
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
 (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
 (__v16sf)(__m512)(B), (int)(C), \
 (__v16sf)_mm512_setzero_ps(), \
-(__mmask16)(U), (int)(R)); })
+(__mmask16)(U), (int)(R))
-#define _mm_range_round_ss(A, B, C, R) __extension__ ({ \
+#define _mm_range_round_ss(A, B, C, R) \
 (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
 (__v4sf)(__m128)(B), \
 (__v4sf)_mm_setzero_ps(), \
 (__mmask8) -1, (int)(C),\
-(int)(R)); })
+(int)(R))
 #define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
-#define _mm_mask_range_round_ss(W, U, A, B, C, R) __extension__ ({ \
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
 (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
 (__v4sf)(__m128)(B), \
 (__v4sf)(__m128)(W),\
 (__mmask8)(U), (int)(C),\
-(int)(R)); })
+(int)(R))
 #define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
-#define _mm_maskz_range_round_ss(U, A, B, C, R) __extension__ ({ \
+#define _mm_maskz_range_round_ss(U, A, B, C, R) \
 (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
 (__v4sf)(__m128)(B), \
 (__v4sf)_mm_setzero_ps(), \
 (__mmask8)(U), (int)(C),\
-(int)(R)); })
+(int)(R))
 #define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
-#define _mm_range_round_sd(A, B, C, R) __extension__ ({ \
+#define _mm_range_round_sd(A, B, C, R) \
 (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
 (__v2df)(__m128d)(B), \
 (__v2df)_mm_setzero_pd(), \
 (__mmask8) -1, (int)(C),\
-(int)(R)); })
+(int)(R))
 #define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
-#define _mm_mask_range_round_sd(W, U, A, B, C, R) __extension__ ({ \
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
 (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
 (__v2df)(__m128d)(B), \
 (__v2df)(__m128d)(W),\
 (__mmask8)(U), (int)(C),\
-(int)(R)); })
+(int)(R))
 #define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
-#define _mm_maskz_range_round_sd(U, A, B, C, R) __extension__ ({ \
+#define _mm_maskz_range_round_sd(U, A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C),\ (__mmask8)(U), (int)(C),\
(int)(R)); }) (int)(R))
#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
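For readers skimming the diff: the change to each of these macros is purely syntactic. A minimal sketch (not taken from the headers) of the two macro styles, assuming a GNU-compatible compiler for the old form:

#include <stdio.h>

#define OLD_STYLE(x) __extension__ ({ (x) * (x); })   /* GNU statement expression */
#define NEW_STYLE(x) ((x) * (x))                      /* plain ISO C expression   */

int main(void) {
  printf("%d %d\n", OLD_STYLE(3), NEW_STYLE(3));      /* prints: 9 9 */
  return 0;
}

Both forms yield the same value; the plain form is ordinary ISO C, so it needs neither the __extension__ marker nor the ({ ... }) wrapper.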
#define _mm512_reduce_pd(A, B) __extension__ ({ \ #define _mm512_reduce_pd(A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, \ (__mmask8)-1, \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_reduce_pd(W, U, A, B) __extension__ ({ \ #define _mm512_mask_reduce_pd(W, U, A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)(__m512d)(W), \ (__v8df)(__m512d)(W), \
(__mmask8)(U), \ (__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_reduce_pd(U, A, B) __extension__ ({ \ #define _mm512_maskz_reduce_pd(U, A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), \ (__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm512_reduce_ps(A, B) __extension__ ({ \ #define _mm512_reduce_ps(A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, \ (__mmask16)-1, \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_reduce_ps(W, U, A, B) __extension__ ({ \ #define _mm512_mask_reduce_ps(W, U, A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)(__m512)(W), \ (__v16sf)(__m512)(W), \
(__mmask16)(U), \ (__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_reduce_ps(U, A, B) __extension__ ({ \ #define _mm512_maskz_reduce_ps(U, A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), \ (__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm512_reduce_round_pd(A, B, R) __extension__ ({\ #define _mm512_reduce_round_pd(A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm512_mask_reduce_round_pd(W, U, A, B, R) __extension__ ({\ #define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)(__m512d)(W), \ (__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R)); }) (__mmask8)(U), (int)(R))
#define _mm512_maskz_reduce_round_pd(U, A, B, R) __extension__ ({\ #define _mm512_maskz_reduce_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R)); }) (__mmask8)(U), (int)(R))
#define _mm512_reduce_round_ps(A, B, R) __extension__ ({\ #define _mm512_reduce_round_ps(A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); }) (__mmask16)-1, (int)(R))
#define _mm512_mask_reduce_round_ps(W, U, A, B, R) __extension__ ({\ #define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)(__m512)(W), \ (__v16sf)(__m512)(W), \
(__mmask16)(U), (int)(R)); }) (__mmask16)(U), (int)(R))
#define _mm512_maskz_reduce_round_ps(U, A, B, R) __extension__ ({\ #define _mm512_maskz_reduce_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), (int)(R)); }) (__mmask16)(U), (int)(R))
#define _mm_reduce_ss(A, B, C) __extension__ ({ \ #define _mm_reduce_ss(A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
(int)(C), _MM_FROUND_CUR_DIRECTION); }) (int)(C), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({ \ #define _mm_mask_reduce_ss(W, U, A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W), (__mmask8)(U), \ (__v4sf)(__m128)(W), (__mmask8)(U), \
(int)(C), _MM_FROUND_CUR_DIRECTION); }) (int)(C), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({ \ #define _mm_maskz_reduce_ss(U, A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C), \ (__mmask8)(U), (int)(C), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({ \ #define _mm_reduce_round_ss(A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
(int)(C), (int)(R)); }) (int)(C), (int)(R))
#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) __extension__ ({ \ #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W), (__mmask8)(U), \ (__v4sf)(__m128)(W), (__mmask8)(U), \
(int)(C), (int)(R)); }) (int)(C), (int)(R))
#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({ \ #define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C), (int)(R)); }) (__mmask8)(U), (int)(C), (int)(R))
#define _mm_reduce_sd(A, B, C) __extension__ ({ \ #define _mm_reduce_sd(A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(C), \ (__mmask8)-1, (int)(C), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({ \ #define _mm_mask_reduce_sd(W, U, A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W), (__mmask8)(U), \ (__v2df)(__m128d)(W), (__mmask8)(U), \
(int)(C), _MM_FROUND_CUR_DIRECTION); }) (int)(C), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({ \ #define _mm_maskz_reduce_sd(U, A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C), \ (__mmask8)(U), (int)(C), \
_MM_FROUND_CUR_DIRECTION); }) _MM_FROUND_CUR_DIRECTION)
#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({ \ #define _mm_reduce_round_sd(A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(C), (int)(R)); }) (__mmask8)-1, (int)(C), (int)(R))
#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({ \ #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W), (__mmask8)(U), \ (__v2df)(__m128d)(W), (__mmask8)(U), \
(int)(C), (int)(R)); }) (int)(C), (int)(R))
#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({ \ #define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C), (int)(R)); }) (__mmask8)(U), (int)(C), (int)(R))
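A hedged usage sketch of the masking convention the reduce macros above follow (plain, mask_, maskz_), relying only on names that appear in this diff; the immediate 0 and the compile flags are illustrative assumptions:

#include <immintrin.h>

/* compile with e.g. -mavx512f -mavx512dq */
void reduce_forms(__m512d a, __m512d w, __mmask8 u,
                  __m512d *r0, __m512d *r1, __m512d *r2) {
  *r0 = _mm512_reduce_pd(a, 0);            /* all lanes computed              */
  *r1 = _mm512_mask_reduce_pd(w, u, a, 0); /* lanes with a clear bit keep w   */
  *r2 = _mm512_maskz_reduce_pd(u, a, 0);   /* lanes with a clear bit become 0 */
}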
static __inline__ __mmask16 __DEFAULT_FN_ATTRS static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_movepi32_mask (__m512i __A) _mm512_movepi32_mask (__m512i __A)
@ -1104,7 +1104,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v8di)_mm512_setzero_si512()); (__v8di)_mm512_setzero_si512());
} }
#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \ #define _mm512_extractf32x8_ps(A, imm) \
(__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \ (__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \
(__v16sf)_mm512_undefined_ps(), \ (__v16sf)_mm512_undefined_ps(), \
((imm) & 1) ? 8 : 0, \ ((imm) & 1) ? 8 : 0, \
@ -1114,35 +1114,35 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
((imm) & 1) ? 12 : 4, \ ((imm) & 1) ? 12 : 4, \
((imm) & 1) ? 13 : 5, \ ((imm) & 1) ? 13 : 5, \
((imm) & 1) ? 14 : 6, \ ((imm) & 1) ? 14 : 6, \
((imm) & 1) ? 15 : 7); }) ((imm) & 1) ? 15 : 7)
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \ #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
(__v8sf)_mm512_extractf32x8_ps((A), (imm)), \ (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
(__v8sf)(W)); }) (__v8sf)(W))
#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \ #define _mm512_maskz_extractf32x8_ps(U, A, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
(__v8sf)_mm512_extractf32x8_ps((A), (imm)), \ (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
(__v8sf)_mm256_setzero_ps()); }) (__v8sf)_mm256_setzero_ps())
#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \ #define _mm512_extractf64x2_pd(A, imm) \
(__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \ (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \
(__v8df)_mm512_undefined_pd(), \ (__v8df)_mm512_undefined_pd(), \
0 + ((imm) & 0x3) * 2, \ 0 + ((imm) & 0x3) * 2, \
1 + ((imm) & 0x3) * 2); }) 1 + ((imm) & 0x3) * 2)
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \ #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
(__v2df)_mm512_extractf64x2_pd((A), (imm)), \ (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
(__v2df)(W)); }) (__v2df)(W))
#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \ #define _mm512_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
(__v2df)_mm512_extractf64x2_pd((A), (imm)), \ (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
(__v2df)_mm_setzero_pd()); }) (__v2df)_mm_setzero_pd())
#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \ #define _mm512_extracti32x8_epi32(A, imm) \
(__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \ (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \
(__v16si)_mm512_undefined_epi32(), \ (__v16si)_mm512_undefined_epi32(), \
((imm) & 1) ? 8 : 0, \ ((imm) & 1) ? 8 : 0, \
@ -1152,35 +1152,35 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
((imm) & 1) ? 12 : 4, \ ((imm) & 1) ? 12 : 4, \
((imm) & 1) ? 13 : 5, \ ((imm) & 1) ? 13 : 5, \
((imm) & 1) ? 14 : 6, \ ((imm) & 1) ? 14 : 6, \
((imm) & 1) ? 15 : 7); }) ((imm) & 1) ? 15 : 7)
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \ #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm512_extracti32x8_epi32((A), (imm)), \ (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
(__v8si)(W)); }) (__v8si)(W))
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \ #define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm512_extracti32x8_epi32((A), (imm)), \ (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
(__v8si)_mm256_setzero_si256()); }) (__v8si)_mm256_setzero_si256())
#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \ #define _mm512_extracti64x2_epi64(A, imm) \
(__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \ (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \
(__v8di)_mm512_undefined_epi32(), \ (__v8di)_mm512_undefined_epi32(), \
0 + ((imm) & 0x3) * 2, \ 0 + ((imm) & 0x3) * 2, \
1 + ((imm) & 0x3) * 2); }) 1 + ((imm) & 0x3) * 2)
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \ #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \ (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm512_extracti64x2_epi64((A), (imm)), \ (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
(__v2di)(W)); }) (__v2di)(W))
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \ #define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
(__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \ (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm512_extracti64x2_epi64((A), (imm)), \ (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
(__v2di)_mm_setzero_si128()); }) (__v2di)_mm_setzero_si128())
#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \ #define _mm512_insertf32x8(A, B, imm) \
(__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
(__v16sf)_mm512_castps256_ps512((__m256)(B)),\ (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
((imm) & 0x1) ? 0 : 16, \ ((imm) & 0x1) ? 0 : 16, \
@ -1198,19 +1198,19 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
((imm) & 0x1) ? 20 : 12, \ ((imm) & 0x1) ? 20 : 12, \
((imm) & 0x1) ? 21 : 13, \ ((imm) & 0x1) ? 21 : 13, \
((imm) & 0x1) ? 22 : 14, \ ((imm) & 0x1) ? 22 : 14, \
((imm) & 0x1) ? 23 : 15); }) ((imm) & 0x1) ? 23 : 15)
#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \ #define _mm512_mask_insertf32x8(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
(__v16sf)(W)); }) (__v16sf)(W))
#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \ #define _mm512_maskz_insertf32x8(U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
(__v16sf)_mm512_setzero_ps()); }) (__v16sf)_mm512_setzero_ps())
#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \ #define _mm512_insertf64x2(A, B, imm) \
(__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
(__v8df)_mm512_castpd128_pd512((__m128d)(B)),\ (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
(((imm) & 0x3) == 0) ? 8 : 0, \ (((imm) & 0x3) == 0) ? 8 : 0, \
@ -1220,19 +1220,19 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(((imm) & 0x3) == 2) ? 8 : 4, \ (((imm) & 0x3) == 2) ? 8 : 4, \
(((imm) & 0x3) == 2) ? 9 : 5, \ (((imm) & 0x3) == 2) ? 9 : 5, \
(((imm) & 0x3) == 3) ? 8 : 6, \ (((imm) & 0x3) == 3) ? 8 : 6, \
(((imm) & 0x3) == 3) ? 9 : 7); }) (((imm) & 0x3) == 3) ? 9 : 7)
#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \ #define _mm512_mask_insertf64x2(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
(__v8df)(W)); }) (__v8df)(W))
#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \ #define _mm512_maskz_insertf64x2(U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
(__v8df)_mm512_setzero_pd()); }) (__v8df)_mm512_setzero_pd())
#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \ #define _mm512_inserti32x8(A, B, imm) \
(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
(__v16si)_mm512_castsi256_si512((__m256i)(B)),\ (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
((imm) & 0x1) ? 0 : 16, \ ((imm) & 0x1) ? 0 : 16, \
@ -1250,19 +1250,19 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
((imm) & 0x1) ? 20 : 12, \ ((imm) & 0x1) ? 20 : 12, \
((imm) & 0x1) ? 21 : 13, \ ((imm) & 0x1) ? 21 : 13, \
((imm) & 0x1) ? 22 : 14, \ ((imm) & 0x1) ? 22 : 14, \
((imm) & 0x1) ? 23 : 15); }) ((imm) & 0x1) ? 23 : 15)
#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \ #define _mm512_mask_inserti32x8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
(__v16si)(W)); }) (__v16si)(W))
#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \ #define _mm512_maskz_inserti32x8(U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
(__v16si)_mm512_setzero_si512()); }) (__v16si)_mm512_setzero_si512())
#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \ #define _mm512_inserti64x2(A, B, imm) \
(__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
(__v8di)_mm512_castsi128_si512((__m128i)(B)),\ (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
(((imm) & 0x3) == 0) ? 8 : 0, \ (((imm) & 0x3) == 0) ? 8 : 0, \
@ -1272,49 +1272,49 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(((imm) & 0x3) == 2) ? 8 : 4, \ (((imm) & 0x3) == 2) ? 8 : 4, \
(((imm) & 0x3) == 2) ? 9 : 5, \ (((imm) & 0x3) == 2) ? 9 : 5, \
(((imm) & 0x3) == 3) ? 8 : 6, \ (((imm) & 0x3) == 3) ? 8 : 6, \
(((imm) & 0x3) == 3) ? 9 : 7); }) (((imm) & 0x3) == 3) ? 9 : 7)
#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \ #define _mm512_mask_inserti64x2(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
(__v8di)(W)); }) (__v8di)(W))
#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \ #define _mm512_maskz_inserti64x2(U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
(__v8di)_mm512_setzero_si512()); }) (__v8di)_mm512_setzero_si512())
#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ #define _mm512_mask_fpclass_ps_mask(U, A, imm) \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)(U)); }) (int)(imm), (__mmask16)(U))
#define _mm512_fpclass_ps_mask(A, imm) __extension__ ({ \ #define _mm512_fpclass_ps_mask(A, imm) \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)-1); }) (int)(imm), (__mmask16)-1)
#define _mm512_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ #define _mm512_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm512_fpclass_pd_mask(A, imm) __extension__ ({ \ #define _mm512_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_fpclass_sd_mask(A, imm) __extension__ ({ \ #define _mm_fpclass_sd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_fpclass_sd_mask(U, A, imm) __extension__ ({ \ #define _mm_mask_fpclass_sd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_fpclass_ss_mask(A, imm) __extension__ ({ \ #define _mm_fpclass_ss_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_fpclass_ss_mask(U, A, imm) __extension__ ({ \ #define _mm_mask_fpclass_ss_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS
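As with the rest of this file, the fpclass macros now expand to plain expressions. A small compile sketch, assuming AVX-512DQ support and treating the immediate 0x18 as an arbitrary illustrative constant:

#include <immintrin.h>

/* compile with e.g. -mavx512f -mavx512dq */
__mmask8 classify(__m512d v) {
  /* the immediate must be a compile-time constant; 0x18 is illustrative only */
  return _mm512_fpclass_pd_mask(v, 0x18);
}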
@ -28,20 +28,20 @@
#define __AVX512ERINTRIN_H #define __AVX512ERINTRIN_H
/* exp2a23 */ /* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \ #define _mm512_exp2a23_round_pd(A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \ #define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \ (__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); }) (int)(R))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \ #define _mm512_maskz_exp2a23_round_pd(M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm512_exp2a23_pd(A) \ #define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -52,20 +52,20 @@
#define _mm512_maskz_exp2a23_pd(M, A) \ #define _mm512_maskz_exp2a23_pd(M, A) \
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \ #define _mm512_exp2a23_round_ps(A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); }) (__mmask16)-1, (int)(R))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \ #define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \ (__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); }) (int)(R))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \ #define _mm512_maskz_exp2a23_round_ps(M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)); }) (__mmask16)(M), (int)(R))
#define _mm512_exp2a23_ps(A) \ #define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -77,20 +77,20 @@
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
/* rsqrt28 */ /* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \ #define _mm512_rsqrt28_round_pd(A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \ #define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \ (__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); }) (int)(R))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \ #define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm512_rsqrt28_pd(A) \ #define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -101,20 +101,20 @@
#define _mm512_maskz_rsqrt28_pd(M, A) \ #define _mm512_maskz_rsqrt28_pd(M, A) \
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \ #define _mm512_rsqrt28_round_ps(A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); }) (__mmask16)-1, (int)(R))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \ #define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \ (__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); }) (int)(R))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \ #define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)); }) (__mmask16)(M), (int)(R))
#define _mm512_rsqrt28_ps(A) \ #define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -125,23 +125,23 @@
#define _mm512_maskz_rsqrt28_ps(M, A) \ #define _mm512_maskz_rsqrt28_ps(M, A) \
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \ #define _mm_rsqrt28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \ #define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \ (__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \ #define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_rsqrt28_ss(A, B) \ #define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -152,23 +152,23 @@
#define _mm_maskz_rsqrt28_ss(M, A, B) \ #define _mm_maskz_rsqrt28_ss(M, A, B) \
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \ #define _mm_rsqrt28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \ #define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \ (__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \ #define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_rsqrt28_sd(A, B) \ #define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -180,20 +180,20 @@
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
/* rcp28 */ /* rcp28 */
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \ #define _mm512_rcp28_round_pd(A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \ #define _mm512_mask_rcp28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \ (__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); }) (int)(R))
#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \ #define _mm512_maskz_rcp28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \ (__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm512_rcp28_pd(A) \ #define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -204,20 +204,20 @@
#define _mm512_maskz_rcp28_pd(M, A) \ #define _mm512_maskz_rcp28_pd(M, A) \
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \ #define _mm512_rcp28_round_ps(A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); }) (__mmask16)-1, (int)(R))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \ #define _mm512_mask_rcp28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \ (__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); }) (int)(R))
#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \ #define _mm512_maskz_rcp28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \ (__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)); }) (__mmask16)(M), (int)(R))
#define _mm512_rcp28_ps(A) \ #define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -228,23 +228,23 @@
#define _mm512_maskz_rcp28_ps(M, A) \ #define _mm512_maskz_rcp28_ps(M, A) \
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \ #define _mm_rcp28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \ #define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \ (__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \ #define _mm_maskz_rcp28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_rcp28_ss(A, B) \ #define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -255,23 +255,23 @@
#define _mm_maskz_rcp28_ss(M, A, B) \ #define _mm_maskz_rcp28_ss(M, A, B) \
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \ #define _mm_rcp28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)); }) (__mmask8)-1, (int)(R))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \ #define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \ (__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \ #define _mm_maskz_rcp28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)); }) (__mmask8)(M), (int)(R))
#define _mm_rcp28_sd(A, B) \ #define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
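A brief compile sketch for the exp2a23/rsqrt28/rcp28 family, assuming AVX-512ER hardware and flags; only the macro name from the diff above is relied on, and the non-round form supplies _MM_FROUND_CUR_DIRECTION itself:

#include <immintrin.h>

/* compile with e.g. -mavx512er */
__m512 approx_rsqrt(__m512 x) {
  return _mm512_rsqrt28_ps(x);   /* approximate reciprocal square root, all 16 lanes */
}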
File diff suppressed because it is too large.
@ -31,80 +31,80 @@
/* Define the default attributes for the functions in this file. */ /* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf"))) #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\ #define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(long long const *)(addr), (int)(scale), \ (long long const *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\ #define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \ __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
(long long const *)(addr), (int)(scale), \ (long long const *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\ #define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16)(mask), \ __builtin_ia32_gatherpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (int const *)(addr), \ (__v16si)(__m512i)(index), (int const *)(addr), \
(int)(scale), (int)(hint)); }) (int)(scale), (int)(hint))
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\ #define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16) -1, \ __builtin_ia32_gatherpfdps((__mmask16) -1, \
(__v16si)(__m512i)(index), (int const *)(addr), \ (__v16si)(__m512i)(index), (int const *)(addr), \
(int)(scale), (int)(hint)); }) (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\ #define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(long long const *)(addr), (int)(scale), \ (long long const *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\ #define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \ __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
(long long const *)(addr), (int)(scale), \ (long long const *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\ #define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(int const *)(addr), (int)(scale), (int)(hint)); }) (int const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\ #define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \ __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
(int const *)(addr), (int)(scale), (int)(hint)); }) (int const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\ #define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \ __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
(long long *)(addr), (int)(scale), \ (long long *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\ #define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(long long *)(addr), (int)(scale), \ (long long *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\ #define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \ __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); }) (int *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\ #define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)(mask), \ __builtin_ia32_scatterpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (int *)(addr), \ (__v16si)(__m512i)(index), (int *)(addr), \
(int)(scale), (int)(hint)); }) (int)(scale), (int)(hint))
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\ #define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \ __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
(long long *)(addr), (int)(scale), \ (long long *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\ #define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(long long *)(addr), (int)(scale), \ (long long *)(addr), (int)(scale), \
(int)(hint)); }) (int)(hint))
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\ #define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \ __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); }) (int *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\ #define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); }) (int *)(addr), (int)(scale), (int)(hint))
#undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS
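A hedged sketch of one prefetch macro in use; the table layout, the scale of 8, and the _MM_HINT_T0 hint are illustrative assumptions, and AVX-512PF support is required:

#include <immintrin.h>

/* compile with e.g. -mavx512pf */
void prefetch_rows(const double *table, __m256i idx) {
  /* prefetch the eight doubles addressed by the 32-bit indices (scale 8) */
  _mm512_prefetch_i32gather_pd(idx, table, 8, _MM_HINT_T0);
}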
@ -142,12 +142,12 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
__U); __U);
} }
#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_shldi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \ (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
(__v8di)(B), \ (__v8di)(B), \
(int)(I), \ (int)(I), \
(__v8di)(S), \ (__v8di)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \ #define _mm512_maskz_shldi_epi64(U, A, B, I) \
_mm512_mask_shldi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I)) _mm512_mask_shldi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I))
@ -155,12 +155,12 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
#define _mm512_shldi_epi64(A, B, I) \ #define _mm512_shldi_epi64(A, B, I) \
_mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_shldi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \ (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
(__v16si)(B), \ (__v16si)(B), \
(int)(I), \ (int)(I), \
(__v16si)(S), \ (__v16si)(S), \
(__mmask16)(U)); }) (__mmask16)(U))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \ #define _mm512_maskz_shldi_epi32(U, A, B, I) \
_mm512_mask_shldi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I)) _mm512_mask_shldi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I))
@ -168,12 +168,12 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
#define _mm512_shldi_epi32(A, B, I) \ #define _mm512_shldi_epi32(A, B, I) \
_mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_shldi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \ (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
(__v32hi)(B), \ (__v32hi)(B), \
(int)(I), \ (int)(I), \
(__v32hi)(S), \ (__v32hi)(S), \
(__mmask32)(U)); }) (__mmask32)(U))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \ #define _mm512_maskz_shldi_epi16(U, A, B, I) \
_mm512_mask_shldi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I)) _mm512_mask_shldi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I))
@ -181,12 +181,12 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
#define _mm512_shldi_epi16(A, B, I) \ #define _mm512_shldi_epi16(A, B, I) \
_mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \ (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
(__v8di)(B), \ (__v8di)(B), \
(int)(I), \ (int)(I), \
(__v8di)(S), \ (__v8di)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ #define _mm512_maskz_shrdi_epi64(U, A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I)) _mm512_mask_shrdi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I))
@ -194,12 +194,12 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
#define _mm512_shrdi_epi64(A, B, I) \ #define _mm512_shrdi_epi64(A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \ (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
(__v16si)(B), \ (__v16si)(B), \
(int)(I), \ (int)(I), \
(__v16si)(S), \ (__v16si)(S), \
(__mmask16)(U)); }) (__mmask16)(U))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ #define _mm512_maskz_shrdi_epi32(U, A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I)) _mm512_mask_shrdi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I))
@ -207,12 +207,12 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
#define _mm512_shrdi_epi32(A, B, I) \ #define _mm512_shrdi_epi32(A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \ (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
(__v32hi)(B), \ (__v32hi)(B), \
(int)(I), \ (int)(I), \
(__v32hi)(S), \ (__v32hi)(S), \
(__mmask32)(U)); }) (__mmask32)(U))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ #define _mm512_maskz_shrdi_epi16(U, A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I)) _mm512_mask_shrdi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I))
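A small sketch contrasting the plain and zero-masked shift forms defined above, assuming AVX-512VBMI2 support; the shift count 13 and the final xor are illustrative only:

#include <immintrin.h>

/* compile with e.g. -mavx512f -mavx512vbmi2 */
__m512i shldi_demo(__m512i a, __m512i b, __mmask8 k) {
  __m512i full   = _mm512_shldi_epi64(a, b, 13);          /* every lane shifted          */
  __m512i zeroed = _mm512_maskz_shldi_epi64(k, a, b, 13); /* unselected lanes become 0   */
  return _mm512_xor_si512(full, zeroed);                  /* zero wherever a mask bit is set */
}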
@ -33,85 +33,85 @@
/* Integer compare */ /* Integer compare */
#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ #define _mm_cmp_epi8_mask(a, b, p) \
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \ (__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)-1); }) (__mmask16)-1)
#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ #define _mm_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \ (__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)(m)); }) (__mmask16)(m))
#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ #define _mm_cmp_epu8_mask(a, b, p) \
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \ (__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)-1); }) (__mmask16)-1)
#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ #define _mm_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \ (__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)(m)); }) (__mmask16)(m))
#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ #define _mm256_cmp_epi8_mask(a, b, p) \
(__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \ (__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)-1); }) (__mmask32)-1)
#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ #define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \ (__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)(m)); }) (__mmask32)(m))
#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ #define _mm256_cmp_epu8_mask(a, b, p) \
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \ (__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)-1); }) (__mmask32)-1)
#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ #define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \ (__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)(m)); }) (__mmask32)(m))
#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ #define _mm_cmp_epi16_mask(a, b, p) \
(__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \ (__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ #define _mm_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \ (__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)(m)); }) (__mmask8)(m))
#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ #define _mm_cmp_epu16_mask(a, b, p) \
(__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \ (__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ #define _mm_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \ (__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)(m)); }) (__mmask8)(m))
#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ #define _mm256_cmp_epi16_mask(a, b, p) \
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \ (__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)-1); }) (__mmask16)-1)
#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ #define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \ (__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)(m)); }) (__mmask16)(m))
#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ #define _mm256_cmp_epu16_mask(a, b, p) \
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \ (__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)-1); }) (__mmask16)-1)
#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ #define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \ (__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)(m)); }) (__mmask16)(m))
#define _mm_cmpeq_epi8_mask(A, B) \ #define _mm_cmpeq_epi8_mask(A, B) \
_mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
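
A usage sketch (mine, not from this change): these comparison macros return a bitmask with one bit per lane, which composes directly with ordinary integer code. Assuming AVX512BW/AVX512VL are enabled (e.g. -mavx512bw -mavx512vl) and using the compiler's __builtin_popcount:

#include <immintrin.h>

/* Hypothetical helper: count how many of the 16 byte lanes compare equal. */
static inline int equal_byte_count(__m128i a, __m128i b)
{
  __mmask16 eq = _mm_cmpeq_epi8_mask(a, b);   /* one bit per byte lane */
  return __builtin_popcount((unsigned)eq);
}
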
@ -1833,47 +1833,47 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
} }
#define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ #define _mm_mask_shufflehi_epi16(W, U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflehi_epi16((A), (imm)), \ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
(__v8hi)(__m128i)(W)); }) (__v8hi)(__m128i)(W))
#define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ #define _mm_maskz_shufflehi_epi16(U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflehi_epi16((A), (imm)), \ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
(__v8hi)_mm_setzero_si128()); }) (__v8hi)_mm_setzero_si128())
#define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ #define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
(__v16hi)(__m256i)(W)); }) (__v16hi)(__m256i)(W))
#define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ #define _mm256_maskz_shufflehi_epi16(U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
(__v16hi)_mm256_setzero_si256()); }) (__v16hi)_mm256_setzero_si256())
#define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ #define _mm_mask_shufflelo_epi16(W, U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflelo_epi16((A), (imm)), \ (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
(__v8hi)(__m128i)(W)); }) (__v8hi)(__m128i)(W))
#define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ #define _mm_maskz_shufflelo_epi16(U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflelo_epi16((A), (imm)), \ (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
(__v8hi)_mm_setzero_si128()); }) (__v8hi)_mm_setzero_si128())
#define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ #define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflelo_epi16((A), \ (__v16hi)_mm256_shufflelo_epi16((A), \
(imm)), \ (imm)), \
(__v16hi)(__m256i)(W)); }) (__v16hi)(__m256i)(W))
#define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ #define _mm256_maskz_shufflelo_epi16(U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflelo_epi16((A), \ (__v16hi)_mm256_shufflelo_epi16((A), \
(imm)), \ (imm)), \
(__v16hi)_mm256_setzero_si256()); }) (__v16hi)_mm256_setzero_si256())
static __inline__ __m256i __DEFAULT_FN_ATTRS static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_sllv_epi16(__m256i __A, __m256i __B) _mm256_sllv_epi16(__m256i __A, __m256i __B)
@ -2693,61 +2693,61 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
(__v16hi)__W); (__v16hi)__W);
} }
#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \ #define _mm_mask_alignr_epi8(W, U, A, B, N) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
(__v16qi)(__m128i)(W)); }) (__v16qi)(__m128i)(W))
#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \ #define _mm_maskz_alignr_epi8(U, A, B, N) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
(__v16qi)_mm_setzero_si128()); }) (__v16qi)_mm_setzero_si128())
#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \ #define _mm256_mask_alignr_epi8(W, U, A, B, N) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
(__v32qi)(__m256i)(W)); }) (__v32qi)(__m256i)(W))
#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \ #define _mm256_maskz_alignr_epi8(U, A, B, N) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
(__v32qi)_mm256_setzero_si256()); }) (__v32qi)_mm256_setzero_si256())
#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \ #define _mm_dbsad_epu8(A, B, imm) \
(__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm), \ (__v16qi)(__m128i)(B), (int)(imm), \
(__v8hi)_mm_setzero_si128(), \ (__v8hi)_mm_setzero_si128(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \ #define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
(__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm), \ (__v16qi)(__m128i)(B), (int)(imm), \
(__v8hi)(__m128i)(W), \ (__v8hi)(__m128i)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \ #define _mm_maskz_dbsad_epu8(U, A, B, imm) \
(__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm), \ (__v16qi)(__m128i)(B), (int)(imm), \
(__v8hi)_mm_setzero_si128(), \ (__v8hi)_mm_setzero_si128(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_dbsad_epu8(A, B, imm) __extension__ ({ \ #define _mm256_dbsad_epu8(A, B, imm) \
(__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \ (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm), \ (__v32qi)(__m256i)(B), (int)(imm), \
(__v16hi)_mm256_setzero_si256(), \ (__v16hi)_mm256_setzero_si256(), \
(__mmask16)-1); }) (__mmask16)-1)
#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \ #define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
(__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \ (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm), \ (__v32qi)(__m256i)(B), (int)(imm), \
(__v16hi)(__m256i)(W), \ (__v16hi)(__m256i)(W), \
(__mmask16)(U)); }) (__mmask16)(U))
#define _mm256_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \ #define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
(__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \ (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm), \ (__v32qi)(__m256i)(B), (int)(imm), \
(__v16hi)_mm256_setzero_si256(), \ (__v16hi)_mm256_setzero_si256(), \
(__mmask16)(U)); }) (__mmask16)(U))
#undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS
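
As a quick sketch of the masked alignr forms defined above (helper name and flags are mine; build with -mavx512bw -mavx512vl):

#include <immintrin.h>

/* Hypothetical helper: concatenate a (high) and b (low), shift the pair
   right by 4 bytes, and zero every byte lane whose bit in `keep` is clear. */
static inline __m128i masked_bytewise_align(__mmask16 keep, __m128i a, __m128i b)
{
  return _mm_maskz_alignr_epi8(keep, a, b, 4);   /* byte count must be a constant */
}
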


@ -789,135 +789,135 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
(__mmask8) __U); (__mmask8) __U);
} }
#define _mm_range_pd(A, B, C) __extension__ ({ \ #define _mm_range_pd(A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \ (__v2df)(__m128d)(B), (int)(C), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({ \ #define _mm_mask_range_pd(W, U, A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \ (__v2df)(__m128d)(B), (int)(C), \
(__v2df)(__m128d)(W), \ (__v2df)(__m128d)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({ \ #define _mm_maskz_range_pd(U, A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \ (__v2df)(__m128d)(B), (int)(C), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_range_pd(A, B, C) __extension__ ({ \ #define _mm256_range_pd(A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \ (__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), \ (__v4df)_mm256_setzero_pd(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({ \ #define _mm256_mask_range_pd(W, U, A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \ (__v4df)(__m256d)(B), (int)(C), \
(__v4df)(__m256d)(W), \ (__v4df)(__m256d)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({ \ #define _mm256_maskz_range_pd(U, A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \ (__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), \ (__v4df)_mm256_setzero_pd(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_range_ps(A, B, C) __extension__ ({ \ #define _mm_range_ps(A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \ (__v4sf)(__m128)(B), (int)(C), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({ \ #define _mm_mask_range_ps(W, U, A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \ (__v4sf)(__m128)(B), (int)(C), \
(__v4sf)(__m128)(W), (__mmask8)(U)); }) (__v4sf)(__m128)(W), (__mmask8)(U))
#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({ \ #define _mm_maskz_range_ps(U, A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \ (__v4sf)(__m128)(B), (int)(C), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_range_ps(A, B, C) __extension__ ({ \ #define _mm256_range_ps(A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \ (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), \ (__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({ \ #define _mm256_mask_range_ps(W, U, A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \ (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)(__m256)(W), (__mmask8)(U)); }) (__v8sf)(__m256)(W), (__mmask8)(U))
#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({ \ #define _mm256_maskz_range_ps(U, A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \ (__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), \ (__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_reduce_pd(A, B) __extension__ ({ \ #define _mm_reduce_pd(A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \ #define _mm_mask_reduce_pd(W, U, A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)(__m128d)(W), \ (__v2df)(__m128d)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({ \ #define _mm_maskz_reduce_pd(U, A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)_mm_setzero_pd(), \ (__v2df)_mm_setzero_pd(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_reduce_pd(A, B) __extension__ ({ \ #define _mm256_reduce_pd(A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)_mm256_setzero_pd(), \ (__v4df)_mm256_setzero_pd(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \ #define _mm256_mask_reduce_pd(W, U, A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)(__m256d)(W), \ (__v4df)(__m256d)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({ \ #define _mm256_maskz_reduce_pd(U, A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)_mm256_setzero_pd(), \ (__v4df)_mm256_setzero_pd(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_reduce_ps(A, B) __extension__ ({ \ #define _mm_reduce_ps(A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({ \ #define _mm_mask_reduce_ps(W, U, A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)(__m128)(W), \ (__v4sf)(__m128)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({ \ #define _mm_maskz_reduce_ps(U, A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)_mm_setzero_ps(), \ (__v4sf)_mm_setzero_ps(), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_reduce_ps(A, B) __extension__ ({ \ #define _mm256_reduce_ps(A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)_mm256_setzero_ps(), \ (__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \ #define _mm256_mask_reduce_ps(W, U, A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)(__m256)(W), \ (__v8sf)(__m256)(W), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({ \ #define _mm256_maskz_reduce_ps(U, A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)_mm256_setzero_ps(), \ (__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U)); }) (__mmask8)(U))
static __inline__ __mmask8 __DEFAULT_FN_ATTRS static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm_movepi32_mask (__m128i __A) _mm_movepi32_mask (__m128i __A)
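
An illustrative call of the range macros above (my own example, not part of this change; assumes -mavx512dq -mavx512vl). The immediate selects the operation variant; 0 is used purely as an example constant:

#include <immintrin.h>

/* Hypothetical helper: apply VRANGEPD to two vectors.  The immediate picks
   the min/max/absolute variant and sign handling -- see the ISA reference
   for the exact encoding. */
static inline __m128d range_example(__m128d a, __m128d b)
{
  return _mm_range_pd(a, b, 0);
}
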
@ -1085,105 +1085,105 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
(__v4di)_mm256_setzero_si256()); (__v4di)_mm256_setzero_si256());
} }
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \ #define _mm256_extractf64x2_pd(A, imm) \
(__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \ (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \
(__v4df)_mm256_undefined_pd(), \ (__v4df)_mm256_undefined_pd(), \
((imm) & 1) ? 2 : 0, \ ((imm) & 1) ? 2 : 0, \
((imm) & 1) ? 3 : 1); }) ((imm) & 1) ? 3 : 1)
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \ #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
(__v2df)_mm256_extractf64x2_pd((A), (imm)), \ (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
(__v2df)(W)); }) (__v2df)(W))
#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \ #define _mm256_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
(__v2df)_mm256_extractf64x2_pd((A), (imm)), \ (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
(__v2df)_mm_setzero_pd()); }) (__v2df)_mm_setzero_pd())
#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \ #define _mm256_extracti64x2_epi64(A, imm) \
(__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \ (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \
(__v4di)_mm256_undefined_si256(), \ (__v4di)_mm256_undefined_si256(), \
((imm) & 1) ? 2 : 0, \ ((imm) & 1) ? 2 : 0, \
((imm) & 1) ? 3 : 1); }) ((imm) & 1) ? 3 : 1)
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \ #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm256_extracti64x2_epi64((A), (imm)), \ (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
(__v2di)(W)); }) (__v2di)(W))
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \ #define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm256_extracti64x2_epi64((A), (imm)), \ (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
(__v2di)_mm_setzero_si128()); }) (__v2di)_mm_setzero_si128())
#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \ #define _mm256_insertf64x2(A, B, imm) \
(__m256d)__builtin_shufflevector((__v4df)(A), \ (__m256d)__builtin_shufflevector((__v4df)(A), \
(__v4df)_mm256_castpd128_pd256((__m128d)(B)), \ (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
((imm) & 0x1) ? 0 : 4, \ ((imm) & 0x1) ? 0 : 4, \
((imm) & 0x1) ? 1 : 5, \ ((imm) & 0x1) ? 1 : 5, \
((imm) & 0x1) ? 4 : 2, \ ((imm) & 0x1) ? 4 : 2, \
((imm) & 0x1) ? 5 : 3); }) ((imm) & 0x1) ? 5 : 3)
#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \ #define _mm256_mask_insertf64x2(W, U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
(__v4df)(W)); }) (__v4df)(W))
#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \ #define _mm256_maskz_insertf64x2(U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
(__v4df)_mm256_setzero_pd()); }) (__v4df)_mm256_setzero_pd())
#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \ #define _mm256_inserti64x2(A, B, imm) \
(__m256i)__builtin_shufflevector((__v4di)(A), \ (__m256i)__builtin_shufflevector((__v4di)(A), \
(__v4di)_mm256_castsi128_si256((__m128i)(B)), \ (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
((imm) & 0x1) ? 0 : 4, \ ((imm) & 0x1) ? 0 : 4, \
((imm) & 0x1) ? 1 : 5, \ ((imm) & 0x1) ? 1 : 5, \
((imm) & 0x1) ? 4 : 2, \ ((imm) & 0x1) ? 4 : 2, \
((imm) & 0x1) ? 5 : 3); }) ((imm) & 0x1) ? 5 : 3)
#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \ #define _mm256_mask_inserti64x2(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
(__v4di)(W)); }) (__v4di)(W))
#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \ #define _mm256_maskz_inserti64x2(U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
(__v4di)_mm256_setzero_si256()); }) (__v4di)_mm256_setzero_si256())
#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ #define _mm_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \ #define _mm_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ #define _mm256_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \ #define _mm256_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ #define _mm_mask_fpclass_ps_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \ #define _mm_fpclass_ps_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ #define _mm256_mask_fpclass_ps_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \ #define _mm256_fpclass_ps_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)-1); }) (__mmask8)-1)
#undef __DEFAULT_FN_ATTRS #undef __DEFAULT_FN_ATTRS
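
The extract/insert shuffle patterns above make the immediate easy to read off: bit 0 selects the 128-bit half. A small sketch (helper name and flags mine; -mavx512dq -mavx512vl assumed):

#include <immintrin.h>

/* Hypothetical helper: copy the upper 128 bits of v into the lower half of
   the result, leaving the original upper half in place. */
static inline __m256i copy_high_to_low(__m256i v)
{
  __m128i hi = _mm256_extracti64x2_epi64(v, 1);  /* imm bit 0 = 1 -> take upper half */
  return _mm256_inserti64x2(v, hi, 0);           /* imm bit 0 = 0 -> replace lower half */
}
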

File diff suppressed because it is too large.


@ -251,12 +251,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
__U); __U);
} }
#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_shldi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \ (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
(__v4di)(B), \ (__v4di)(B), \
(int)(I), \ (int)(I), \
(__v4di)(S), \ (__v4di)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \ #define _mm256_maskz_shldi_epi64(U, A, B, I) \
_mm256_mask_shldi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I)) _mm256_mask_shldi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I))
@ -264,12 +264,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm256_shldi_epi64(A, B, I) \ #define _mm256_shldi_epi64(A, B, I) \
_mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_shldi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \ (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
(__v2di)(B), \ (__v2di)(B), \
(int)(I), \ (int)(I), \
(__v2di)(S), \ (__v2di)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_shldi_epi64(U, A, B, I) \ #define _mm_maskz_shldi_epi64(U, A, B, I) \
_mm_mask_shldi_epi64(_mm_setzero_si128(), (U), (A), (B), (I)) _mm_mask_shldi_epi64(_mm_setzero_si128(), (U), (A), (B), (I))
@ -277,12 +277,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm_shldi_epi64(A, B, I) \ #define _mm_shldi_epi64(A, B, I) \
_mm_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) _mm_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_shldi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \ (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
(__v8si)(B), \ (__v8si)(B), \
(int)(I), \ (int)(I), \
(__v8si)(S), \ (__v8si)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \ #define _mm256_maskz_shldi_epi32(U, A, B, I) \
_mm256_mask_shldi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I)) _mm256_mask_shldi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I))
@ -290,12 +290,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm256_shldi_epi32(A, B, I) \ #define _mm256_shldi_epi32(A, B, I) \
_mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_shldi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \ (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
(__v4si)(B), \ (__v4si)(B), \
(int)(I), \ (int)(I), \
(__v4si)(S), \ (__v4si)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_shldi_epi32(U, A, B, I) \ #define _mm_maskz_shldi_epi32(U, A, B, I) \
_mm_mask_shldi_epi32(_mm_setzero_si128(), (U), (A), (B), (I)) _mm_mask_shldi_epi32(_mm_setzero_si128(), (U), (A), (B), (I))
@ -303,12 +303,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm_shldi_epi32(A, B, I) \ #define _mm_shldi_epi32(A, B, I) \
_mm_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) _mm_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_shldi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \ (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
(__v16hi)(B), \ (__v16hi)(B), \
(int)(I), \ (int)(I), \
(__v16hi)(S), \ (__v16hi)(S), \
(__mmask16)(U)); }) (__mmask16)(U))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \ #define _mm256_maskz_shldi_epi16(U, A, B, I) \
_mm256_mask_shldi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I)) _mm256_mask_shldi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I))
@ -316,12 +316,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm256_shldi_epi16(A, B, I) \ #define _mm256_shldi_epi16(A, B, I) \
_mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_shldi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \ (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
(__v8hi)(B), \ (__v8hi)(B), \
(int)(I), \ (int)(I), \
(__v8hi)(S), \ (__v8hi)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_shldi_epi16(U, A, B, I) \ #define _mm_maskz_shldi_epi16(U, A, B, I) \
_mm_mask_shldi_epi16(_mm_setzero_si128(), (U), (A), (B), (I)) _mm_mask_shldi_epi16(_mm_setzero_si128(), (U), (A), (B), (I))
@ -329,12 +329,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm_shldi_epi16(A, B, I) \ #define _mm_shldi_epi16(A, B, I) \
_mm_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) _mm_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \ (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
(__v4di)(B), \ (__v4di)(B), \
(int)(I), \ (int)(I), \
(__v4di)(S), \ (__v4di)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ #define _mm256_maskz_shrdi_epi64(U, A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I)) _mm256_mask_shrdi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I))
@ -342,12 +342,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm256_shrdi_epi64(A, B, I) \ #define _mm256_shrdi_epi64(A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_shrdi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \ (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
(__v2di)(B), \ (__v2di)(B), \
(int)(I), \ (int)(I), \
(__v2di)(S), \ (__v2di)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \ #define _mm_maskz_shrdi_epi64(U, A, B, I) \
_mm_mask_shrdi_epi64(_mm_setzero_si128(), (U), (A), (B), (I)) _mm_mask_shrdi_epi64(_mm_setzero_si128(), (U), (A), (B), (I))
@ -355,12 +355,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm_shrdi_epi64(A, B, I) \ #define _mm_shrdi_epi64(A, B, I) \
_mm_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) _mm_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \ (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
(__v8si)(B), \ (__v8si)(B), \
(int)(I), \ (int)(I), \
(__v8si)(S), \ (__v8si)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ #define _mm256_maskz_shrdi_epi32(U, A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I)) _mm256_mask_shrdi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I))
@ -368,12 +368,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm256_shrdi_epi32(A, B, I) \ #define _mm256_shrdi_epi32(A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_shrdi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \ (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
(__v4si)(B), \ (__v4si)(B), \
(int)(I), \ (int)(I), \
(__v4si)(S), \ (__v4si)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \ #define _mm_maskz_shrdi_epi32(U, A, B, I) \
_mm_mask_shrdi_epi32(_mm_setzero_si128(), (U), (A), (B), (I)) _mm_mask_shrdi_epi32(_mm_setzero_si128(), (U), (A), (B), (I))
@ -381,12 +381,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm_shrdi_epi32(A, B, I) \ #define _mm_shrdi_epi32(A, B, I) \
_mm_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) _mm_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \ (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
(__v16hi)(B), \ (__v16hi)(B), \
(int)(I), \ (int)(I), \
(__v16hi)(S), \ (__v16hi)(S), \
(__mmask16)(U)); }) (__mmask16)(U))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ #define _mm256_maskz_shrdi_epi16(U, A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I)) _mm256_mask_shrdi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I))
@ -394,12 +394,12 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
#define _mm256_shrdi_epi16(A, B, I) \ #define _mm256_shrdi_epi16(A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_shrdi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \ (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
(__v8hi)(B), \ (__v8hi)(B), \
(int)(I), \ (int)(I), \
(__v8hi)(S), \ (__v8hi)(S), \
(__mmask8)(U)); }) (__mmask8)(U))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \ #define _mm_maskz_shrdi_epi16(U, A, B, I) \
_mm_mask_shrdi_epi16(_mm_setzero_si128(), (U), (A), (B), (I)) _mm_mask_shrdi_epi16(_mm_setzero_si128(), (U), (A), (B), (I))
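
For reference, a minimal call of the unmasked 128-bit form defined above (helper name and flags are mine; assumes -mavx512vbmi2 -mavx512vl). Which operand supplies the high half of each concatenated pair is as documented in the ISA reference:

#include <immintrin.h>

/* Hypothetical helper: per 32-bit lane, funnel-shift the concatenated pair
   of source lanes left by 8 bits and keep the upper 32 bits of each pair. */
static inline __m128i funnel_shl32_by_8(__m128i a, __m128i b)
{
  return _mm_shldi_epi32(a, b, 8);   /* shift count must be a constant */
}
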


@ -408,8 +408,8 @@ _mm256_rcp_ps(__m256 __a)
/// 10: Upward (toward positive infinity). \n /// 10: Upward (toward positive infinity). \n
/// 11: Truncated. /// 11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values. /// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) __extension__ ({ \ #define _mm256_round_pd(V, M) \
(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
/// Rounds the values stored in a 256-bit vector of [8 x float] as /// Rounds the values stored in a 256-bit vector of [8 x float] as
/// specified by the byte operand. The source values are rounded to integer /// specified by the byte operand. The source values are rounded to integer
@ -440,8 +440,8 @@ _mm256_rcp_ps(__m256 __a)
/// 10: Upward (toward positive infinity). \n /// 10: Upward (toward positive infinity). \n
/// 11: Truncated. /// 11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values. /// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) __extension__ ({ \ #define _mm256_round_ps(V, M) \
(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); }) (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
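
Since the rounding-control bits are described above, a short usage sketch may help (my own example; built with -mavx and using the _MM_FROUND_* constants defined elsewhere in the intrinsic headers):

#include <immintrin.h>

/* Round each double to the nearest integer value without raising
   precision exceptions. */
static inline __m256d round_nearest(__m256d v)
{
  return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
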
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The /// Rounds up the values stored in a 256-bit vector of [4 x double]. The
/// source values are rounded up to integer values and returned as 64-bit /// source values are rounded up to integer values and returned as 64-bit
@ -997,10 +997,10 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 1: Bits [127:64] of the source are copied to bits [127:64] of the /// 1: Bits [127:64] of the source are copied to bits [127:64] of the
/// returned vector. /// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values. /// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \ #define _mm_permute_pd(A, C) \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
(__v2df)_mm_undefined_pd(), \ (__v2df)_mm_undefined_pd(), \
((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) ((C) >> 0) & 0x1, ((C) >> 1) & 0x1)
/// Copies the values in a 256-bit vector of [4 x double] as specified by /// Copies the values in a 256-bit vector of [4 x double] as specified by
/// the immediate integer operand. /// the immediate integer operand.
@ -1039,13 +1039,13 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 1: Bits [255:192] of the source are copied to bits [255:192] of the /// 1: Bits [255:192] of the source are copied to bits [255:192] of the
/// returned vector. /// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values. /// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \ #define _mm256_permute_pd(A, C) \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
(__v4df)_mm256_undefined_pd(), \ (__v4df)_mm256_undefined_pd(), \
0 + (((C) >> 0) & 0x1), \ 0 + (((C) >> 0) & 0x1), \
0 + (((C) >> 1) & 0x1), \ 0 + (((C) >> 1) & 0x1), \
2 + (((C) >> 2) & 0x1), \ 2 + (((C) >> 2) & 0x1), \
2 + (((C) >> 3) & 0x1)); }) 2 + (((C) >> 3) & 0x1))
/// Copies the values in a 128-bit vector of [4 x float] as specified by /// Copies the values in a 128-bit vector of [4 x float] as specified by
/// the immediate integer operand. /// the immediate integer operand.
@ -1100,11 +1100,11 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 11: Bits [127:96] of the source are copied to bits [127:96] of the /// 11: Bits [127:96] of the source are copied to bits [127:96] of the
/// returned vector. /// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values. /// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \ #define _mm_permute_ps(A, C) \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
(__v4sf)_mm_undefined_ps(), \ (__v4sf)_mm_undefined_ps(), \
((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) ((C) >> 4) & 0x3, ((C) >> 6) & 0x3)
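
The control byte maps directly onto the shufflevector indices above, so reversing the four lanes, for example, looks like this (illustrative helper, -mavx assumed; _MM_SHUFFLE comes from <xmmintrin.h>):

#include <immintrin.h>

/* Reverse the four single-precision lanes: result lanes 0..3 are v[3..0]. */
static inline __m128 reverse_ps(__m128 v)
{
  return _mm_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));   /* control byte 0x1B */
}
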
/// Copies the values in a 256-bit vector of [8 x float] as specified by /// Copies the values in a 256-bit vector of [8 x float] as specified by
/// the immediate integer operand. /// the immediate integer operand.
@ -1195,7 +1195,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 11: Bits [255:224] of the source are copied to bits [255:224] of the /// 11: Bits [255:224] of the source are copied to bits [255:224] of the
/// returned vector. /// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values. /// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \ #define _mm256_permute_ps(A, C) \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
(__v8sf)_mm256_undefined_ps(), \ (__v8sf)_mm256_undefined_ps(), \
0 + (((C) >> 0) & 0x3), \ 0 + (((C) >> 0) & 0x3), \
@ -1205,7 +1205,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
4 + (((C) >> 0) & 0x3), \ 4 + (((C) >> 0) & 0x3), \
4 + (((C) >> 2) & 0x3), \ 4 + (((C) >> 2) & 0x3), \
4 + (((C) >> 4) & 0x3), \ 4 + (((C) >> 4) & 0x3), \
4 + (((C) >> 6) & 0x3)); }) 4 + (((C) >> 6) & 0x3))
/// Permutes 128-bit data values stored in two 256-bit vectors of /// Permutes 128-bit data values stored in two 256-bit vectors of
/// [4 x double], as specified by the immediate integer operand. /// [4 x double], as specified by the immediate integer operand.
@ -1244,9 +1244,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination. /// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values. /// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ #define _mm256_permute2f128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (M)); }) (__v4df)(__m256d)(V2), (M))
/// Permutes 128-bit data values stored in two 256-bit vectors of /// Permutes 128-bit data values stored in two 256-bit vectors of
/// [8 x float], as specified by the immediate integer operand. /// [8 x float], as specified by the immediate integer operand.
@ -1285,9 +1285,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination. /// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values. /// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ #define _mm256_permute2f128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M)); }) (__v8sf)(__m256)(V2), (M))
/// Permutes 128-bit data values stored in two 256-bit integer vectors, /// Permutes 128-bit data values stored in two 256-bit integer vectors,
/// as specified by the immediate integer operand. /// as specified by the immediate integer operand.
@ -1325,9 +1325,9 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
/// destination. /// destination.
/// \returns A 256-bit integer vector containing the copied values. /// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ #define _mm256_permute2f128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (M)); }) (__v8si)(__m256i)(V2), (M))
/* Vector Blend */ /* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the /// Merges 64-bit double-precision data values stored in either of the
@ -1354,13 +1354,13 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination. When a mask bit is 1, the corresponding 64-bit element in /// destination. When a mask bit is 1, the corresponding 64-bit element in
/// operand \a V2 is copied to the same position in the destination. /// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values. /// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ #define _mm256_blend_pd(V1, V2, M) \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), \ (__v4df)(__m256d)(V2), \
(((M) & 0x01) ? 4 : 0), \ (((M) & 0x01) ? 4 : 0), \
(((M) & 0x02) ? 5 : 1), \ (((M) & 0x02) ? 5 : 1), \
(((M) & 0x04) ? 6 : 2), \ (((M) & 0x04) ? 6 : 2), \
(((M) & 0x08) ? 7 : 3)); }) (((M) & 0x08) ? 7 : 3))
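
A worked value of the blend mask (illustrative, -mavx assumed): each set bit pulls the corresponding 64-bit lane from the second operand.

#include <immintrin.h>

/* Lanes 0 and 2 come from b (mask bits 0 and 2 set), lanes 1 and 3 from a. */
static inline __m256d blend_even_lanes(__m256d a, __m256d b)
{
  return _mm256_blend_pd(a, b, 0x5);   /* 0b0101 */
}
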
/// Merges 32-bit single-precision data values stored in either of the /// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate /// two 256-bit vectors of [8 x float], as specified by the immediate
@ -1386,7 +1386,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination. When a mask bit is 1, the corresponding 32-bit element in /// destination. When a mask bit is 1, the corresponding 32-bit element in
/// operand \a V2 is copied to the same position in the destination. /// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values. /// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ #define _mm256_blend_ps(V1, V2, M) \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), \ (__v8sf)(__m256)(V2), \
(((M) & 0x01) ? 8 : 0), \ (((M) & 0x01) ? 8 : 0), \
@ -1396,7 +1396,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
(((M) & 0x10) ? 12 : 4), \ (((M) & 0x10) ? 12 : 4), \
(((M) & 0x20) ? 13 : 5), \ (((M) & 0x20) ? 13 : 5), \
(((M) & 0x40) ? 14 : 6), \ (((M) & 0x40) ? 14 : 6), \
(((M) & 0x80) ? 15 : 7)); }) (((M) & 0x80) ? 15 : 7))
/// Merges 64-bit double-precision data values stored in either of the /// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector /// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@ -1492,9 +1492,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// is set to zero. The bitmask is applied in the same way to each of the /// is set to zero. The bitmask is applied in the same way to each of the
/// two parallel dot product computations. /// two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products. /// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ #define _mm256_dp_ps(V1, V2, M) \
(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M)); }) (__v8sf)(__m256)(V2), (M))
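
A usage sketch of the dot-product mask (mine, not from this change; -mavx assumed): 0xF in the high nibble multiplies all four lanes of each 128-bit half, and 0xF in the low nibble broadcasts the resulting sum to every output lane of that half.

#include <immintrin.h>

/* Per 128-bit half: dot product of the four float lanes, broadcast to the
   four lanes of that half. */
static inline __m256 dot_per_half(__m256 a, __m256 b)
{
  return _mm256_dp_ps(a, b, 0xFF);
}
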
/* Vector shuffle */ /* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as /// Selects 8 float values from the 256-bit operands of [8 x float], as
@ -1546,7 +1546,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n /// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
/// 11: Bits [127:96] and [255:224] are copied from the selected operand. /// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values. /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ #define _mm256_shuffle_ps(a, b, mask) \
(__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), \ (__v8sf)(__m256)(b), \
0 + (((mask) >> 0) & 0x3), \ 0 + (((mask) >> 0) & 0x3), \
@ -1556,7 +1556,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
4 + (((mask) >> 0) & 0x3), \ 4 + (((mask) >> 0) & 0x3), \
4 + (((mask) >> 2) & 0x3), \ 4 + (((mask) >> 2) & 0x3), \
12 + (((mask) >> 4) & 0x3), \ 12 + (((mask) >> 4) & 0x3), \
12 + (((mask) >> 6) & 0x3)); }) 12 + (((mask) >> 6) & 0x3))
/// Selects four double-precision values from the 256-bit operands of /// Selects four double-precision values from the 256-bit operands of
/// [4 x double], as specified by the immediate value operand. /// [4 x double], as specified by the immediate value operand.
@ -1600,13 +1600,13 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the /// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
/// destination. /// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values. /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ #define _mm256_shuffle_pd(a, b, mask) \
(__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), \ (__v4df)(__m256d)(b), \
0 + (((mask) >> 0) & 0x1), \ 0 + (((mask) >> 0) & 0x1), \
4 + (((mask) >> 1) & 0x1), \ 4 + (((mask) >> 1) & 0x1), \
2 + (((mask) >> 2) & 0x1), \ 2 + (((mask) >> 2) & 0x1), \
6 + (((mask) >> 3) & 0x1)); }) 6 + (((mask) >> 3) & 0x1))
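
Reading the mask bits straight off the shufflevector above, a mask of 0 picks the even 64-bit lane of each operand within each 128-bit half (illustrative helper, -mavx assumed):

#include <immintrin.h>

/* Result lanes are {a[0], b[0], a[2], b[2]}. */
static inline __m256d take_even_lanes(__m256d a, __m256d b)
{
  return _mm256_shuffle_pd(a, b, 0x0);
}
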
/* Compare */ /* Compare */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
@ -1698,9 +1698,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1E: Greater-than (ordered, non-signaling) \n /// 0x1E: Greater-than (ordered, non-signaling) \n
/// 0x1F: True (unordered, signaling) /// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results. /// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \ #define _mm_cmp_pd(a, b, c) \
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)); }) (__v2df)(__m128d)(b), (c))
/// Compares each of the corresponding values of two 128-bit vectors of /// Compares each of the corresponding values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer /// [4 x float], using the operation specified by the immediate integer
@ -1758,9 +1758,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1E: Greater-than (ordered, non-signaling) \n /// 0x1E: Greater-than (ordered, non-signaling) \n
/// 0x1F: True (unordered, signaling) /// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \ #define _mm_cmp_ps(a, b, c) \
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)); }) (__v4sf)(__m128)(b), (c))
/// Compares each of the corresponding double-precision values of two /// Compares each of the corresponding double-precision values of two
/// 256-bit vectors of [4 x double], using the operation specified by the /// 256-bit vectors of [4 x double], using the operation specified by the
@ -1818,9 +1818,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1E: Greater-than (ordered, non-signaling) \n /// 0x1E: Greater-than (ordered, non-signaling) \n
/// 0x1F: True (unordered, signaling) /// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results. /// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \ #define _mm256_cmp_pd(a, b, c) \
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c)); }) (__v4df)(__m256d)(b), (c))
/// Compares each of the corresponding values of two 256-bit vectors of /// Compares each of the corresponding values of two 256-bit vectors of
/// [8 x float], using the operation specified by the immediate integer /// [8 x float], using the operation specified by the immediate integer
@ -1878,9 +1878,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1E: Greater-than (ordered, non-signaling) \n /// 0x1E: Greater-than (ordered, non-signaling) \n
/// 0x1F: True (unordered, signaling) /// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results. /// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \ #define _mm256_cmp_ps(a, b, c) \
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c)); }) (__v8sf)(__m256)(b), (c))
/// Compares each of the corresponding scalar double-precision values of /// Compares each of the corresponding scalar double-precision values of
/// two 128-bit vectors of [2 x double], using the operation specified by the /// two 128-bit vectors of [2 x double], using the operation specified by the
@ -1937,9 +1937,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1E: Greater-than (ordered, non-signaling) \n /// 0x1E: Greater-than (ordered, non-signaling) \n
/// 0x1F: True (unordered, signaling) /// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results. /// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \ #define _mm_cmp_sd(a, b, c) \
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)); }) (__v2df)(__m128d)(b), (c))
/// Compares each of the corresponding scalar values of two 128-bit /// Compares each of the corresponding scalar values of two 128-bit
/// vectors of [4 x float], using the operation specified by the immediate /// vectors of [4 x float], using the operation specified by the immediate
@ -1996,9 +1996,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1E: Greater-than (ordered, non-signaling) \n /// 0x1E: Greater-than (ordered, non-signaling) \n
/// 0x1F: True (unordered, signaling) /// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \ #define _mm_cmp_ss(a, b, c) \
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)); }) (__v4sf)(__m128)(b), (c))
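
These comparison macros return a vector of all-ones/all-zeros lanes rather than a bitmask; a typical pattern (my own sketch, -mavx assumed) converts the result with _mm256_movemask_pd, declared elsewhere in this header:

#include <immintrin.h>

/* Returns a 4-bit mask with bit i set when a[i] == b[i]
   (equal, ordered, non-signaling). */
static inline int equal_lane_mask(__m256d a, __m256d b)
{
  __m256d eq = _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
  return _mm256_movemask_pd(eq);
}
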
/// Takes a [8 x i32] vector and returns the vector element value /// Takes a [8 x i32] vector and returns the vector element value
/// indexed by the immediate constant operand. /// indexed by the immediate constant operand.
@ -4669,7 +4669,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
/// result. /// result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values. /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ #define _mm256_insertf128_ps(V1, V2, M) \
(__m256)__builtin_shufflevector( \ (__m256)__builtin_shufflevector( \
(__v8sf)(__m256)(V1), \ (__v8sf)(__m256)(V1), \
(__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
@ -4680,7 +4680,7 @@ _mm256_zextsi128_si256(__m128i __a)
(((M) & 1) ? 8 : 4), \ (((M) & 1) ? 8 : 4), \
(((M) & 1) ? 9 : 5), \ (((M) & 1) ? 9 : 5), \
(((M) & 1) ? 10 : 6), \ (((M) & 1) ? 10 : 6), \
(((M) & 1) ? 11 : 7) );}) (((M) & 1) ? 11 : 7) )
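A usage sketch for the insert direction (not from the commit; assumes -mavx, helper name illustrative). The immediate picks which 128-bit half of the destination receives V2, and the same pattern applies to the _pd and _si256 variants below:

#include <immintrin.h>

/* Replace the upper four floats of 'v' with 'hi'; the lower half of 'v'
 * passes through unchanged. */
static __m256 set_upper_half(__m256 v, __m128 hi) {
  return _mm256_insertf128_ps(v, hi, 1);
}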
/// Constructs a new 256-bit vector of [4 x double] by first duplicating /// Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then /// a 256-bit vector of [4 x double] given in the first parameter, and then
@ -4716,14 +4716,14 @@ _mm256_zextsi128_si256(__m128i __a)
/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
/// result. /// result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values. /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ #define _mm256_insertf128_pd(V1, V2, M) \
(__m256d)__builtin_shufflevector( \ (__m256d)__builtin_shufflevector( \
(__v4df)(__m256d)(V1), \ (__v4df)(__m256d)(V1), \
(__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
(((M) & 1) ? 0 : 4), \ (((M) & 1) ? 0 : 4), \
(((M) & 1) ? 1 : 5), \ (((M) & 1) ? 1 : 5), \
(((M) & 1) ? 4 : 2), \ (((M) & 1) ? 4 : 2), \
(((M) & 1) ? 5 : 3) );}) (((M) & 1) ? 5 : 3) )
/// Constructs a new 256-bit integer vector by first duplicating a /// Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing /// 256-bit integer vector given in the first parameter, and then replacing
@ -4759,14 +4759,14 @@ _mm256_zextsi128_si256(__m128i __a)
/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
/// result. /// result.
/// \returns A 256-bit integer vector containing the interleaved values. /// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ #define _mm256_insertf128_si256(V1, V2, M) \
(__m256i)__builtin_shufflevector( \ (__m256i)__builtin_shufflevector( \
(__v4di)(__m256i)(V1), \ (__v4di)(__m256i)(V1), \
(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
(((M) & 1) ? 0 : 4), \ (((M) & 1) ? 0 : 4), \
(((M) & 1) ? 1 : 5), \ (((M) & 1) ? 1 : 5), \
(((M) & 1) ? 4 : 2), \ (((M) & 1) ? 4 : 2), \
(((M) & 1) ? 5 : 3) );}) (((M) & 1) ? 5 : 3) )
/* /*
Vector extract. Vector extract.
@ -4794,14 +4794,14 @@ _mm256_zextsi128_si256(__m128i __a)
/// result. \n /// result. \n
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits. /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \ #define _mm256_extractf128_ps(V, M) \
(__m128)__builtin_shufflevector( \ (__m128)__builtin_shufflevector( \
(__v8sf)(__m256)(V), \ (__v8sf)(__m256)(V), \
(__v8sf)(_mm256_undefined_ps()), \ (__v8sf)(_mm256_undefined_ps()), \
(((M) & 1) ? 4 : 0), \ (((M) & 1) ? 4 : 0), \
(((M) & 1) ? 5 : 1), \ (((M) & 1) ? 5 : 1), \
(((M) & 1) ? 6 : 2), \ (((M) & 1) ? 6 : 2), \
(((M) & 1) ? 7 : 3) );}) (((M) & 1) ? 7 : 3) )
/// Extracts either the upper or the lower 128 bits from a 256-bit vector /// Extracts either the upper or the lower 128 bits from a 256-bit vector
/// of [4 x double], as determined by the immediate integer parameter, and /// of [4 x double], as determined by the immediate integer parameter, and
@ -4824,12 +4824,12 @@ _mm256_zextsi128_si256(__m128i __a)
/// result. \n /// result. \n
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits. /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \ #define _mm256_extractf128_pd(V, M) \
(__m128d)__builtin_shufflevector( \ (__m128d)__builtin_shufflevector( \
(__v4df)(__m256d)(V), \ (__v4df)(__m256d)(V), \
(__v4df)(_mm256_undefined_pd()), \ (__v4df)(_mm256_undefined_pd()), \
(((M) & 1) ? 2 : 0), \ (((M) & 1) ? 2 : 0), \
(((M) & 1) ? 3 : 1) );}) (((M) & 1) ? 3 : 1) )
/// Extracts either the upper or the lower 128 bits from a 256-bit /// Extracts either the upper or the lower 128 bits from a 256-bit
/// integer vector, as determined by the immediate integer parameter, and /// integer vector, as determined by the immediate integer parameter, and
@ -4852,12 +4852,12 @@ _mm256_zextsi128_si256(__m128i __a)
/// result. \n /// result. \n
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits. /// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \ #define _mm256_extractf128_si256(V, M) \
(__m128i)__builtin_shufflevector( \ (__m128i)__builtin_shufflevector( \
(__v4di)(__m256i)(V), \ (__v4di)(__m256i)(V), \
(__v4di)(_mm256_undefined_si256()), \ (__v4di)(_mm256_undefined_si256()), \
(((M) & 1) ? 2 : 0), \ (((M) & 1) ? 2 : 0), \
(((M) & 1) ? 3 : 1) );}) (((M) & 1) ? 3 : 1) )
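And the matching extract direction, sketched under the same -mavx assumption (helper name illustrative):

#include <immintrin.h>

/* Pull out the upper 128 bits of a 256-bit float vector; an immediate of
 * 0 would select the lower half instead. */
static __m128 upper_half(__m256 v) {
  return _mm256_extractf128_ps(v, 1);
}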
/* SIMD load ops (unaligned) */ /* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from /// Loads two 128-bit floating-point vectors of [4 x float] from

View File

@ -2792,7 +2792,7 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// An immediate value specifying the number of bytes to left-shift operand /// An immediate value specifying the number of bytes to left-shift operand
/// \a a. /// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value. /// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({ \ #define _mm_slli_si128(a, imm) \
(__m128i)__builtin_shufflevector( \ (__m128i)__builtin_shufflevector( \
(__v16qi)_mm_setzero_si128(), \ (__v16qi)_mm_setzero_si128(), \
(__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(a), \
@ -2811,7 +2811,7 @@ _mm_xor_si128(__m128i __a, __m128i __b)
((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm))
#define _mm_bslli_si128(a, imm) \ #define _mm_bslli_si128(a, imm) \
_mm_slli_si128((a), (imm)) _mm_slli_si128((a), (imm))
@ -3027,7 +3027,7 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// An immediate value specifying the number of bytes to right-shift operand /// An immediate value specifying the number of bytes to right-shift operand
/// \a a. /// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value. /// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({ \ #define _mm_srli_si128(a, imm) \
(__m128i)__builtin_shufflevector( \ (__m128i)__builtin_shufflevector( \
(__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(a), \
(__v16qi)_mm_setzero_si128(), \ (__v16qi)_mm_setzero_si128(), \
@ -3046,7 +3046,7 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); }) ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15)
#define _mm_bsrli_si128(a, imm) \ #define _mm_bsrli_si128(a, imm) \
_mm_srli_si128((a), (imm)) _mm_srli_si128((a), (imm))
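A quick sketch of the byte shifts above, assuming an SSE2 target (helper name illustrative); the count must be an integer constant expression because it feeds __builtin_shufflevector:

#include <emmintrin.h>

/* Byte-wise rotate left by 4: bytes [11:0] move up and bytes [15:12]
 * wrap around to the bottom. */
static __m128i rotl_4_bytes(__m128i v) {
  return _mm_or_si128(_mm_slli_si128(v, 4), _mm_srli_si128(v, 12));
}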
@ -4384,11 +4384,11 @@ _mm_movemask_epi8(__m128i __a)
/// 10: assign values from bits [95:64] of \a a. \n /// 10: assign values from bits [95:64] of \a a. \n
/// 11: assign values from bits [127:96] of \a a. /// 11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values. /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ #define _mm_shuffle_epi32(a, imm) \
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
(__v4si)_mm_undefined_si128(), \ (__v4si)_mm_undefined_si128(), \
((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); }) ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3)
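A sketch of the dword shuffle (assumes SSE2, helper name illustrative); _MM_SHUFFLE packs the four 2-bit selectors into the immediate:

#include <emmintrin.h>

/* Reverse the four 32-bit lanes: result = {v[3], v[2], v[1], v[0]}. */
static __m128i reverse_lanes(__m128i v) {
  return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
}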
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate /// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4417,12 +4417,12 @@ _mm_movemask_epi8(__m128i __a)
/// 10: assign values from bits [47:32] of \a a. \n /// 10: assign values from bits [47:32] of \a a. \n
/// 11: assign values from bits [63:48] of \a a. \n /// 11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values. /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ #define _mm_shufflelo_epi16(a, imm) \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
(__v8hi)_mm_undefined_si128(), \ (__v8hi)_mm_undefined_si128(), \
((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
4, 5, 6, 7); }) 4, 5, 6, 7)
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate /// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4451,14 +4451,14 @@ _mm_movemask_epi8(__m128i __a)
/// 10: assign values from bits [111:96] of \a a. \n /// 10: assign values from bits [111:96] of \a a. \n
/// 11: assign values from bits [127:112] of \a a. \n /// 11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values. /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ #define _mm_shufflehi_epi16(a, imm) \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
(__v8hi)_mm_undefined_si128(), \ (__v8hi)_mm_undefined_si128(), \
0, 1, 2, 3, \ 0, 1, 2, 3, \
4 + (((imm) >> 0) & 0x3), \ 4 + (((imm) >> 0) & 0x3), \
4 + (((imm) >> 2) & 0x3), \ 4 + (((imm) >> 2) & 0x3), \
4 + (((imm) >> 4) & 0x3), \ 4 + (((imm) >> 4) & 0x3), \
4 + (((imm) >> 6) & 0x3)); }) 4 + (((imm) >> 6) & 0x3))
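The 16-bit half shuffles only rearrange their own 64-bit half; a sketch assuming SSE2 (helper name illustrative):

#include <emmintrin.h>

/* Reverse the low four and the high four 16-bit elements independently. */
static __m128i reverse_within_halves(__m128i v) {
  v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
  return _mm_shufflehi_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
}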
/// Unpacks the high-order (index 8-15) values from two 128-bit vectors /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@ -4811,10 +4811,10 @@ _mm_movemask_pd(__m128d __a)
/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values. /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ #define _mm_shuffle_pd(a, b, i) \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
0 + (((i) >> 0) & 0x1), \ 0 + (((i) >> 0) & 0x1), \
2 + (((i) >> 1) & 0x1)); }) 2 + (((i) >> 1) & 0x1))
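A sketch of the [2 x double] shuffle (assumes SSE2, helper name illustrative); bit 0 of the immediate selects the low result element from 'a' and bit 1 selects the high result element from 'b':

#include <emmintrin.h>

/* Immediate 0b01: the low lane takes a[1], the high lane takes b[0]. */
static __m128d cross_pick(__m128d a, __m128d b) {
  return _mm_shuffle_pd(a, b, 1);
}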
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
/// floating-point vector of [4 x float]. /// floating-point vector of [4 x float].

View File

@ -77,9 +77,9 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n /// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding /// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value. /// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) __extension__ ({ \ #define _cvtss_sh(a, imm) \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); }) (imm)))[0])
/// Converts a 128-bit vector containing 32-bit float values into a /// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values. /// 128-bit vector containing 16-bit half-precision float values.
@ -104,8 +104,8 @@ _cvtsh_ss(unsigned short __a)
/// \returns A 128-bit vector containing converted 16-bit half-precision float /// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit /// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values. /// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) __extension__ ({ \ #define _mm_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); }) (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
/// Converts a 128-bit vector containing 16-bit half-precision float /// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values. /// values into a 128-bit vector containing 32-bit float values.
@ -147,8 +147,8 @@ _mm_cvtph_ps(__m128i __a)
/// 1XX: Use MXCSR.RC for rounding /// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision /// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values. /// float values.
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \ #define _mm256_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); }) (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
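A sketch of the half-precision conversions (assumes an F16C build, e.g. -mf16c; helper name illustrative). An immediate of 0 requests round-to-nearest:

#include <immintrin.h>

/* Round-trip eight floats through 16-bit storage; precision is lost, but
 * the packed layout is exactly what _mm256_cvtph_ps expects back. */
static __m256 roundtrip_f16(__m256 v) {
  __m128i packed = _mm256_cvtps_ph(v, 0);  /* 8 half-precision values in 128 bits */
  return _mm256_cvtph_ps(packed);
}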
/// Converts a 128-bit vector containing 16-bit half-precision float /// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float]. /// values into a 256-bit vector of [8 x float].

View File

@ -29,95 +29,95 @@
#define __GFNIINTRIN_H #define __GFNIINTRIN_H
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \ (__v16qi)(__m128i)(B), \
(char)(I)); }) (char)(I))
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); }) (__v16qi)(__m128i)(S))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ #define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); }) U, A, B, I)
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ #define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \ (__v32qi)(__m256i)(B), \
(char)(I)); }) (char)(I))
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); }) (__v32qi)(__m256i)(S))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ #define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); }) U, A, B, I)
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \ (__v64qi)(__m512i)(B), \
(char)(I)); }) (char)(I))
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); }) (__v64qi)(__m512i)(S))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ #define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I); }) U, A, B, I)
#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ #define _mm_gf2p8affine_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \ (__v16qi)(__m128i)(B), \
(char)(I)); }) (char)(I))
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ #define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); }) (__v16qi)(__m128i)(S))
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ #define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); }) U, A, B, I)
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ #define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \ (__v32qi)(__m256i)(B), \
(char)(I)); }) (char)(I))
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ #define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); }) (__v32qi)(__m256i)(S))
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ #define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); }) U, A, B, I)
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \ (__v64qi)(__m512i)(B), \
(char)(I)); }) (char)(I))
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ #define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \ (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); }) (__v64qi)(__m512i)(S))
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ #define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I); }) U, A, B, I)
/* Default attributes for simple form (no masking). */ /* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"))) #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))

View File

@ -31,8 +31,8 @@
/* Define the default attributes for the functions in this file. */ /* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"))) #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \ #define _mm_sha1rnds4_epu32(V1, V2, M) \
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); }) __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
static __inline__ __m128i __DEFAULT_FN_ATTRS static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1nexte_epu32(__m128i __X, __m128i __Y) _mm_sha1nexte_epu32(__m128i __X, __m128i __Y)

View File

@ -244,8 +244,8 @@
/// 10: Upward (toward positive infinity) \n /// 10: Upward (toward positive infinity) \n
/// 11: Truncated /// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) __extension__ ({ \ #define _mm_round_ps(X, M) \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); }) (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
/// Copies three upper elements of the first 128-bit vector operand to /// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of /// the corresponding three upper elements of the 128-bit result vector of
@ -285,9 +285,9 @@
/// 11: Truncated /// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values. /// values.
#define _mm_round_ss(X, Y, M) __extension__ ({ \ #define _mm_round_ss(X, Y, M) \
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); }) (__v4sf)(__m128)(Y), (M))
/// Rounds each element of the 128-bit vector of [2 x double] to an /// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second /// integer value according to the rounding control specified by the second
@ -319,8 +319,8 @@
/// 10: Upward (toward positive infinity) \n /// 10: Upward (toward positive infinity) \n
/// 11: Truncated /// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) __extension__ ({ \ #define _mm_round_pd(X, M) \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); }) (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
/// Copies the upper element of the first 128-bit vector operand to the /// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double]. /// corresponding upper element of the 128-bit result vector of [2 x double].
@ -360,9 +360,9 @@
/// 11: Truncated /// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values. /// values.
#define _mm_round_sd(X, Y, M) __extension__ ({ \ #define _mm_round_sd(X, Y, M) \
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); }) (__v2df)(__m128d)(Y), (M))
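A rounding sketch (assumes -msse4.1, helper name illustrative) using the _MM_FROUND_* helpers from this header rather than raw immediates:

#include <smmintrin.h>

/* Round both doubles toward negative infinity (a packed floor) without
 * raising precision exceptions. */
static __m128d packed_floor(__m128d x) {
  return _mm_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}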
/* SSE4 Packed Blending Intrinsics. */ /* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are /// Returns a 128-bit vector of [2 x double] where the values are
@ -389,11 +389,11 @@
/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 /// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
/// is copied to the same position in the result. /// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values. /// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \ #define _mm_blend_pd(V1, V2, M) \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), \ (__v2df)(__m128d)(V2), \
(((M) & 0x01) ? 2 : 0), \ (((M) & 0x01) ? 2 : 0), \
(((M) & 0x02) ? 3 : 1)); }) (((M) & 0x02) ? 3 : 1))
/// Returns a 128-bit vector of [4 x float] where the values are selected /// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third /// from either the first or second operand as specified by the third
@ -419,12 +419,12 @@
/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 /// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
/// is copied to the same position in the result. /// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values. /// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \ #define _mm_blend_ps(V1, V2, M) \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
(((M) & 0x01) ? 4 : 0), \ (((M) & 0x01) ? 4 : 0), \
(((M) & 0x02) ? 5 : 1), \ (((M) & 0x02) ? 5 : 1), \
(((M) & 0x04) ? 6 : 2), \ (((M) & 0x04) ? 6 : 2), \
(((M) & 0x08) ? 7 : 3)); }) (((M) & 0x08) ? 7 : 3))
/// Returns a 128-bit vector of [2 x double] where the values are /// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the /// selected from either the first or second operand as specified by the
@ -531,7 +531,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
/// is copied to the same position in the result. /// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values. /// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ #define _mm_blend_epi16(V1, V2, M) \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), \ (__v8hi)(__m128i)(V2), \
(((M) & 0x01) ? 8 : 0), \ (((M) & 0x01) ? 8 : 0), \
@ -541,7 +541,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
(((M) & 0x10) ? 12 : 4), \ (((M) & 0x10) ? 12 : 4), \
(((M) & 0x20) ? 13 : 5), \ (((M) & 0x20) ? 13 : 5), \
(((M) & 0x40) ? 14 : 6), \ (((M) & 0x40) ? 14 : 6), \
(((M) & 0x80) ? 15 : 7)); }) (((M) & 0x80) ? 15 : 7))
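A blend sketch (assumes -msse4.1, helper name illustrative); each mask bit decides, element by element, whether the result comes from the first or the second operand:

#include <smmintrin.h>

/* Mask 0b1010: elements 0 and 2 come from 'a', elements 1 and 3 from 'b'. */
static __m128 even_odd_blend(__m128 a, __m128 b) {
  return _mm_blend_ps(a, b, 0xA);
}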
/* SSE4 Dword Multiply Instructions. */ /* SSE4 Dword Multiply Instructions. */
/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32] /// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
@ -616,9 +616,9 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// each [4 x float] subvector. If a bit is set, the dot product is returned /// each [4 x float] subvector. If a bit is set, the dot product is returned
/// in the corresponding element; otherwise that element is set to zero. /// in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product. /// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) __extension__ ({ \ #define _mm_dp_ps(X, Y, M) \
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); }) (__v4sf)(__m128)(Y), (M))
/// Computes the dot product of the two 128-bit vectors of [2 x double] /// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of /// and returns it in the elements of the 128-bit result vector of
@ -651,9 +651,9 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// to the lowest element and bit [1] corresponding to the highest element of /// to the lowest element and bit [1] corresponding to the highest element of
/// each [2 x double] vector. If a bit is set, the dot product is returned in /// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero. /// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) __extension__ ({\ #define _mm_dp_pd(X, Y, M) \
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); }) (__v2df)(__m128d)(Y), (M))
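A dot-product sketch (assumes -msse4.1, helper name illustrative); the high nibble of the mask selects which element pairs are multiplied and the low nibble selects which result lanes receive the sum:

#include <smmintrin.h>

/* Four-element dot product: 0xF1 multiplies all four pairs and writes
 * the sum to lane 0 only, which _mm_cvtss_f32 then reads out. */
static float dot4(__m128 a, __m128 b) {
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}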
/* SSE4 Streaming Load Hint Instruction. */ /* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a /// Loads integer values from a 128-bit aligned memory location to a
@ -1546,9 +1546,9 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \endcode /// \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of /// \returns A 128-bit integer vector containing the sums of the sets of
/// absolute differences between both operands. /// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ #define _mm_mpsadbw_epu8(X, Y, M) \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)); }) (__v16qi)(__m128i)(Y), (M))
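A minimal sketch for the sum-of-absolute-differences macro (assumes -msse4.1, helper name illustrative):

#include <smmintrin.h>

/* Compare the 4-byte block of 'needle' selected by imm bits [1:0] against
 * eight overlapping 4-byte windows of 'haystack' starting at byte 0
 * (imm bit [2] = 0); each result lane holds one 16-bit SAD. */
static __m128i sad_windows(__m128i haystack, __m128i needle) {
  return _mm_mpsadbw_epu8(haystack, needle, 0);
}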
/// Finds the minimum unsigned 16-bit element in the input 128-bit /// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it along with its index. /// vector of [8 x u16] and returns it along with its index.

View File

@ -157,9 +157,9 @@ _mm_abs_epi32(__m128i __a)
/// An immediate operand specifying how many bytes to right-shift the result. /// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted /// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value. /// value.
#define _mm_alignr_epi8(a, b, n) __extension__ ({ \ #define _mm_alignr_epi8(a, b, n) \
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)); }) (__v16qi)(__m128i)(b), (n))
/// Concatenates the two 64-bit integer vector operands, and right-shifts /// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand. /// the result by the number of bytes specified in the immediate operand.
@ -180,8 +180,8 @@ _mm_abs_epi32(__m128i __a)
/// An immediate operand specifying how many bytes to right-shift the result. /// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted /// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value. /// value.
#define _mm_alignr_pi8(a, b, n) __extension__ ({ \ #define _mm_alignr_pi8(a, b, n) \
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); }) (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
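An alignr sketch (assumes -mssse3, helper name illustrative); because the byte count must be an immediate, it is spelled as a literal at the call site:

#include <tmmintrin.h>

/* Concatenate hi:lo and take 16 bytes starting at byte 4, a common way
 * to advance a sliding window by one dword. */
static __m128i window_shift_4(__m128i hi, __m128i lo) {
  return _mm_alignr_epi8(hi, lo, 4);
}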
/// Horizontally adds the adjacent pairs of values contained in 2 packed /// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16]. /// 128-bit vectors of [8 x i16].

View File

@ -28,15 +28,15 @@
#ifndef __VPCLMULQDQINTRIN_H #ifndef __VPCLMULQDQINTRIN_H
#define __VPCLMULQDQINTRIN_H #define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \ #define _mm256_clmulepi64_epi128(A, B, I) \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \ (__v4di)(__m256i)(B), \
(char)(I)); }) (char)(I))
#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \ #define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \ (__v8di)(__m512i)(B), \
(char)(I)); }) (char)(I))
#endif /* __VPCLMULQDQINTRIN_H */ #endif /* __VPCLMULQDQINTRIN_H */

View File

@ -2183,8 +2183,8 @@ void _mm_sfence(void);
/// 2: Bits [47:32] are copied to the destination. \n /// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination. /// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data. /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) __extension__ ({ \ #define _mm_extract_pi16(a, n) \
(int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); }) (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n)
/// Copies data from the 64-bit vector of [4 x i16] to the destination, /// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16 bits of an integer operand at the 16-bit offset /// and inserts the lower 16 bits of an integer operand at the 16-bit offset
@ -2214,8 +2214,8 @@ void _mm_sfence(void);
/// bits in operand \a a. /// bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the /// \returns A 64-bit integer vector containing the copied packed data from the
/// operands. /// operands.
#define _mm_insert_pi16(a, d, n) __extension__ ({ \ #define _mm_insert_pi16(a, d, n) \
(__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); }) (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n)
/// Compares each of the corresponding packed 16-bit integer values of /// Compares each of the corresponding packed 16-bit integer values of
/// the 64-bit integer vectors, and writes the greater value to the /// the 64-bit integer vectors, and writes the greater value to the
@ -2361,8 +2361,8 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// 10: assigned from bits [47:32] of \a a. \n /// 10: assigned from bits [47:32] of \a a. \n
/// 11: assigned from bits [63:48] of \a a. /// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values. /// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) __extension__ ({ \ #define _mm_shuffle_pi16(a, n) \
(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
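A sketch exercising the three MMX helpers above (assumes an SSE/MMX-enabled build, e.g. -msse; helper name illustrative):

#include <xmmintrin.h>

/* Broadcast lane 2 of 'v' to all four 16-bit lanes, overwrite lane 0,
 * then read lane 3 back as an int. */
static int mmx_demo(__m64 v, int replacement) {
  __m64 b = _mm_shuffle_pi16(v, _MM_SHUFFLE(2, 2, 2, 2));
  __m64 w = _mm_insert_pi16(b, replacement, 0);
  int hi = _mm_extract_pi16(w, 3);
  _mm_empty();  /* leave the MMX state clean before returning to FP code */
  return hi;
}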
/// Conditionally copies the values from each 8-bit element in the first /// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as /// 64-bit integer vector operand to the specified memory location, as
@ -2603,12 +2603,12 @@ void _mm_setcsr(unsigned int __i);
/// 10: Bits [95:64] copied from the specified operand. \n /// 10: Bits [95:64] copied from the specified operand. \n
/// 11: Bits [127:96] copied from the specified operand. /// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values. /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ #define _mm_shuffle_ps(a, b, mask) \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
0 + (((mask) >> 0) & 0x3), \ 0 + (((mask) >> 0) & 0x3), \
0 + (((mask) >> 2) & 0x3), \ 0 + (((mask) >> 2) & 0x3), \
4 + (((mask) >> 4) & 0x3), \ 4 + (((mask) >> 4) & 0x3), \
4 + (((mask) >> 6) & 0x3)); }) 4 + (((mask) >> 6) & 0x3))
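Finally, a sketch of the classic _mm_shuffle_ps pattern (assumes -msse, helper name illustrative); the low two selectors index into 'a' and the high two into 'b':

#include <xmmintrin.h>

/* Gather {a0, a1, b0, b1} into one register, a common transpose step. */
static __m128 merge_low_pairs(__m128 a, __m128 b) {
  return _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 1, 0));
}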
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].

View File

@ -237,17 +237,17 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
} }
#define _mm_roti_epi8(A, N) __extension__ ({ \ #define _mm_roti_epi8(A, N) \
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); }) (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))
#define _mm_roti_epi16(A, N) __extension__ ({ \ #define _mm_roti_epi16(A, N) \
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); }) (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))
#define _mm_roti_epi32(A, N) __extension__ ({ \ #define _mm_roti_epi32(A, N) \
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); }) (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))
#define _mm_roti_epi64(A, N) __extension__ ({ \ #define _mm_roti_epi64(A, N) \
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); }) (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))
static __inline__ __m128i __DEFAULT_FN_ATTRS static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shl_epi8(__m128i __A, __m128i __B) _mm_shl_epi8(__m128i __A, __m128i __B)
@ -297,37 +297,37 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
} }
#define _mm_com_epu8(A, B, N) __extension__ ({ \ #define _mm_com_epu8(A, B, N) \
(__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)); }) (__v16qi)(__m128i)(B), (N))
#define _mm_com_epu16(A, B, N) __extension__ ({ \ #define _mm_com_epu16(A, B, N) \
(__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)); }) (__v8hi)(__m128i)(B), (N))
#define _mm_com_epu32(A, B, N) __extension__ ({ \ #define _mm_com_epu32(A, B, N) \
(__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)); }) (__v4si)(__m128i)(B), (N))
#define _mm_com_epu64(A, B, N) __extension__ ({ \ #define _mm_com_epu64(A, B, N) \
(__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)); }) (__v2di)(__m128i)(B), (N))
#define _mm_com_epi8(A, B, N) __extension__ ({ \ #define _mm_com_epi8(A, B, N) \
(__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)); }) (__v16qi)(__m128i)(B), (N))
#define _mm_com_epi16(A, B, N) __extension__ ({ \ #define _mm_com_epi16(A, B, N) \
(__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)); }) (__v8hi)(__m128i)(B), (N))
#define _mm_com_epi32(A, B, N) __extension__ ({ \ #define _mm_com_epi32(A, B, N) \
(__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)); }) (__v4si)(__m128i)(B), (N))
#define _mm_com_epi64(A, B, N) __extension__ ({ \ #define _mm_com_epi64(A, B, N) \
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)); }) (__v2di)(__m128i)(B), (N))
#define _MM_PCOMCTRL_LT 0 #define _MM_PCOMCTRL_LT 0
#define _MM_PCOMCTRL_LE 1 #define _MM_PCOMCTRL_LE 1
@ -722,24 +722,24 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE); return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
} }
#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \ #define _mm_permute2_pd(X, Y, C, I) \
(__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), \ (__v2df)(__m128d)(Y), \
(__v2di)(__m128i)(C), (I)); }) (__v2di)(__m128i)(C), (I))
#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \ #define _mm256_permute2_pd(X, Y, C, I) \
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
(__v4df)(__m256d)(Y), \ (__v4df)(__m256d)(Y), \
(__v4di)(__m256i)(C), (I)); }) (__v4di)(__m256i)(C), (I))
#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \ #define _mm_permute2_ps(X, Y, C, I) \
(__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I)); }) (__v4si)(__m128i)(C), (I))
#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \ #define _mm256_permute2_ps(X, Y, C, I) \
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
(__v8sf)(__m256)(Y), \ (__v8sf)(__m256)(Y), \
(__v8si)(__m256i)(C), (I)); }) (__v8si)(__m256i)(C), (I))
static __inline__ __m128 __DEFAULT_FN_ATTRS static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_frcz_ss(__m128 __A) _mm_frcz_ss(__m128 __A)

View File

@ -1430,48 +1430,48 @@ float test_mm256_cvtss_f32(__m256 __a)
__m256 test_mm256_cmp_ps_true(__m256 a, __m256 b) { __m256 test_mm256_cmp_ps_true(__m256 a, __m256 b) {
// CHECK-LABEL: @test_mm256_cmp_ps_true // CHECK-LABEL: @test_mm256_cmp_ps_true
// CHECK: store <8 x float> <float 0xFFFFFFFFE0000000, // CHECK: ret <8 x float> <float 0xFFFFFFFFE0000000,
return _mm256_cmp_ps(a, b, _CMP_TRUE_UQ); return _mm256_cmp_ps(a, b, _CMP_TRUE_UQ);
} }
__m256 test_mm256_cmp_pd_true(__m256 a, __m256 b) { __m256d test_mm256_cmp_pd_true(__m256d a, __m256d b) {
// CHECK-LABEL: @test_mm256_cmp_pd_true // CHECK-LABEL: @test_mm256_cmp_pd_true
// CHECK: store <4 x double> <double 0xFFFFFFFFFFFFFFFF, // CHECK: ret <4 x double> <double 0xFFFFFFFFFFFFFFFF,
return _mm256_cmp_pd(a, b, _CMP_TRUE_UQ); return _mm256_cmp_pd(a, b, _CMP_TRUE_UQ);
} }
__m256 test_mm256_cmp_ps_false(__m256 a, __m256 b) { __m256 test_mm256_cmp_ps_false(__m256 a, __m256 b) {
// CHECK-LABEL: @test_mm256_cmp_ps_false // CHECK-LABEL: @test_mm256_cmp_ps_false
// CHECK: store <8 x float> zeroinitializer, <8 x float>* %tmp, align 32 // CHECK: ret <8 x float> zeroinitializer
return _mm256_cmp_ps(a, b, _CMP_FALSE_OQ); return _mm256_cmp_ps(a, b, _CMP_FALSE_OQ);
} }
__m256 test_mm256_cmp_pd_false(__m256 a, __m256 b) { __m256d test_mm256_cmp_pd_false(__m256d a, __m256d b) {
// CHECK-LABEL: @test_mm256_cmp_pd_false // CHECK-LABEL: @test_mm256_cmp_pd_false
// CHECK: store <4 x double> zeroinitializer, <4 x double>* %tmp, align 32 // CHECK: ret <4 x double> zeroinitializer
return _mm256_cmp_pd(a, b, _CMP_FALSE_OQ); return _mm256_cmp_pd(a, b, _CMP_FALSE_OQ);
} }
__m256 test_mm256_cmp_ps_strue(__m256 a, __m256 b) { __m256 test_mm256_cmp_ps_strue(__m256 a, __m256 b) {
// CHECK-LABEL: @test_mm256_cmp_ps_strue // CHECK-LABEL: @test_mm256_cmp_ps_strue
// CHECK: store <8 x float> <float 0xFFFFFFFFE0000000, // CHECK: ret <8 x float> <float 0xFFFFFFFFE0000000,
return _mm256_cmp_ps(a, b, _CMP_TRUE_US); return _mm256_cmp_ps(a, b, _CMP_TRUE_US);
} }
__m256 test_mm256_cmp_pd_strue(__m256 a, __m256 b) { __m256d test_mm256_cmp_pd_strue(__m256d a, __m256d b) {
// CHECK-LABEL: @test_mm256_cmp_pd_strue // CHECK-LABEL: @test_mm256_cmp_pd_strue
// CHECK: store <4 x double> <double 0xFFFFFFFFFFFFFFFF, // CHECK: ret <4 x double> <double 0xFFFFFFFFFFFFFFFF,
return _mm256_cmp_pd(a, b, _CMP_TRUE_US); return _mm256_cmp_pd(a, b, _CMP_TRUE_US);
} }
__m256 test_mm256_cmp_ps_sfalse(__m256 a, __m256 b) { __m256 test_mm256_cmp_ps_sfalse(__m256 a, __m256 b) {
// CHECK-LABEL: @test_mm256_cmp_ps_sfalse // CHECK-LABEL: @test_mm256_cmp_ps_sfalse
// CHECK: store <8 x float> zeroinitializer, <8 x float>* %tmp, align 32 // CHECK: ret <8 x float> zeroinitializer
return _mm256_cmp_ps(a, b, _CMP_FALSE_OS); return _mm256_cmp_ps(a, b, _CMP_FALSE_OS);
} }
__m256 test_mm256_cmp_pd_sfalse(__m256 a, __m256 b) { __m256d test_mm256_cmp_pd_sfalse(__m256d a, __m256d b) {
// CHECK-LABEL: @test_mm256_cmp_pd_sfalse // CHECK-LABEL: @test_mm256_cmp_pd_sfalse
// CHECK: store <4 x double> zeroinitializer, <4 x double>* %tmp, align 32 // CHECK: ret <4 x double> zeroinitializer
return _mm256_cmp_pd(a, b, _CMP_FALSE_OS); return _mm256_cmp_pd(a, b, _CMP_FALSE_OS);
} }

View File

@ -4,7 +4,5 @@
#include <tmmintrin.h> #include <tmmintrin.h>
__m64 test1(__m64 a, __m64 b, int c) { __m64 test1(__m64 a, __m64 b, int c) {
// FIXME: The "incompatible result type" error is due to pr10112 and should return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}}
// be removed when that is fixed.
return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}} expected-error {{incompatible result type}}
} }