[PowerPC] [Clang] Add SSE4 and BMI intrinsics implementation

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D119407
This commit is contained in:
Qiu Chaofan 2022-03-24 20:03:08 +08:00
parent b3fbbabdc1
commit 406bde9a15
16 changed files with 1650 additions and 85 deletions

View File

@ -164,6 +164,12 @@ set(ppc_wrapper_files
ppc_wrappers/pmmintrin.h
ppc_wrappers/tmmintrin.h
ppc_wrappers/smmintrin.h
ppc_wrappers/bmiintrin.h
ppc_wrappers/bmi2intrin.h
ppc_wrappers/immintrin.h
ppc_wrappers/nmmintrin.h
ppc_wrappers/x86intrin.h
ppc_wrappers/x86gprintrin.h
)
set(openmp_wrapper_files

View File

@ -0,0 +1,133 @@
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32(unsigned int __X, unsigned int __Y) {
return ((__X << (32 - __Y)) >> (32 - __Y));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
unsigned long long __res = (unsigned long long)__X * __Y;
*__P = (unsigned int)(__res >> 32);
return (unsigned int)__res;
}
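/* Usage sketch (standalone, not part of the header): _mulx_u32 returns
   the low half of the widening 32x32->64 product and stores the high
   half through __P.  Values and the helper name are illustrative. */
#include <assert.h>
void check_mulx_u32(void) {
  unsigned int hi;
  /* 0xFFFFFFFF * 2 == 0x1FFFFFFFE: low word 0xFFFFFFFE, high word 1. */
  unsigned int lo = _mulx_u32(0xFFFFFFFFu, 2u, &hi);
  assert(lo == 0xFFFFFFFEu && hi == 1u);
}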
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
return ((__X << (64 - __Y)) >> (64 - __Y));
}
/* __int128 requires the 64-bit base architecture. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64(unsigned long long __X, unsigned long long __Y,
unsigned long long *__P) {
unsigned __int128 __res = (unsigned __int128)__X * __Y;
*__P = (unsigned long long)(__res >> 64);
return (unsigned long long)__res;
}
#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64(unsigned long long __X, unsigned long long __M) {
unsigned long result = 0x0UL;
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c, t;
unsigned long p;
/* The pop-count of the mask gives the number of bits from the
source to process. This is also needed to shift bits from the
source into the correct position for the result. */
p = 64 - __builtin_popcountl(__M);
/* The loop iterates once per '1' bit in the mask, clearing
each mask bit as it is processed. */
while (m != 0) {
c = __builtin_clzl(m);
t = __X << (p - c);
m ^= (mask >> c);
result |= (t & (mask >> c));
p++;
}
return (result);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64(unsigned long long __X, unsigned long long __M) {
unsigned long p = 0x4040404040404040UL; // initial bit permute control
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c;
unsigned long result;
/* If the mask is constant and selects 8 bits or fewer, we can use
the Power8 bit-permute (bpermd) instruction. */
if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
/* If the pext mask is constant, the popcount is also constant,
so we can evaluate the following loop at compile time and
use a constant bit-permute vector. */
for (long i = 0; i < __builtin_popcountl(__M); i++) {
c = __builtin_clzl(m);
p = (p << 8) | c;
m ^= (mask >> c);
}
result = __builtin_bpermd(p, __X);
} else {
p = 64 - __builtin_popcountl(__M);
result = 0;
/* We could use a for loop here, but that combined with
-funroll-loops can expand to a lot of code. The while
loop avoids unrolling, and the compiler commons the xor
from clearing the mask bit with the (m != 0) test. The
result is a more compact loop setup and body. */
while (m != 0) {
unsigned long t;
c = __builtin_clzl(m);
t = (__X & (mask >> c)) >> (p - c);
m ^= (mask >> c);
result |= (t);
p++;
}
}
return (result);
}
/* These 32-bit implementations depend on the 64-bit pdep/pext
above, which require _ARCH_PWR7. */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32(unsigned int __X, unsigned int __Y) {
return _pdep_u64(__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32(unsigned int __X, unsigned int __Y) {
return _pext_u64(__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */
#endif /* BMI2INTRIN_H_ */
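/* A minimal self-check of the pdep/pext contract implemented above,
   assuming a 64-bit Power7+ target where these functions are defined.
   Mask, source value, and helper name are illustrative. */
#include <assert.h>
#include <x86gprintrin.h>
void check_pdep_pext(void) {
  unsigned long long mask = 0xF0F00000ULL; /* eight '1' bits */
  unsigned long long src = 0xABULL;        /* 1010_1011 */
  /* pdep scatters bit i of src into the i-th set position of mask,
     counting from the least significant end. */
  assert(_pdep_u64(src, mask) == 0xA0B00000ULL);
  /* pext gathers the masked bits back down: the inverse operation. */
  assert(_pext_u64(0xA0B00000ULL, mask) == src);
}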

View File

@ -0,0 +1,165 @@
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMIINTRIN_H_
#define BMIINTRIN_H_
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u16(unsigned short __X) {
return __builtin_ctz(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u32(unsigned int __X, unsigned int __Y) {
return (~__X & __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
return ((__X << (32 - (__L + __P))) >> (32 - __L));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u32(unsigned int __X, unsigned int __Y) {
unsigned int __P, __L;
__P = __Y & 0xFF;
__L = (__Y >> 8) & 0xFF;
return (_bextr_u32(__X, __P, __L));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u32(unsigned int __X) {
return (__X & -__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u32(unsigned int __X) {
return __blsi_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u32(unsigned int __X) {
return (__X ^ (__X - 1));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u32(unsigned int __X) {
return __blsmsk_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u32(unsigned int __X) {
return (__X & (__X - 1));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u32(unsigned int __X) {
return __blsr_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u32(unsigned int __X) {
return __builtin_ctz(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u32(unsigned int __X) {
return __builtin_ctz(__X);
}
/* Use the 64-bit shift, rotate, and count-leading-zeros
instructions for long long. */
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u64(unsigned long long __X, unsigned long long __Y) {
return (~__X & __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
return ((__X << (64 - (__L + __P))) >> (64 - __L));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u64(unsigned long long __X, unsigned long long __Y) {
unsigned int __P, __L;
__P = __Y & 0xFF;
__L = (__Y & 0xFF00) >> 8;
return (_bextr_u64(__X, __P, __L));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u64(unsigned long long __X) {
return __X & -__X;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u64(unsigned long long __X) {
return __blsi_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u64(unsigned long long __X) {
return (__X ^ (__X - 1));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u64(unsigned long long __X) {
return __blsmsk_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u64(unsigned long long __X) {
return (__X & (__X - 1));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u64(unsigned long long __X) {
return __blsr_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u64(unsigned long long __X) {
return __builtin_ctzll(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u64(unsigned long long __X) {
return __builtin_ctzll(__X);
}
#endif /* __PPC64__ */
#endif /* BMIINTRIN_H_ */
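/* Spot-check of the bit-manipulation identities above for one assumed
   input; the helper name and values are illustrative. */
#include <assert.h>
#include <x86gprintrin.h>
void check_bmi_identities(void) {
  unsigned int x = 0x58;               /* 0101_1000 */
  assert(__blsi_u32(x) == 0x08);       /* isolate lowest set bit */
  assert(__blsr_u32(x) == 0x50);       /* clear lowest set bit */
  assert(__blsmsk_u32(x) == 0x0F);     /* mask up to and including it */
  assert(__tzcnt_u32(x) == 3);         /* trailing zero count */
  assert(__andn_u32(x, 0xFF) == 0xA7); /* ~x & y */
}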

View File

@ -405,20 +405,10 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B)
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq (__A, __A);
d = (__v2du)vec_cmpeq (__B, __B);
#else
__v2du a, b;
__v2du c, d;
const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
a = (__v2du)vec_abs ((__v2df)__A);
b = (__v2du)vec_abs ((__v2df)__B);
c = (__v2du)vec_cmpgt (double_exp_mask, a);
d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
/* A != NAN and B != NAN. */
return ((__m128d)vec_and(c, d));
}
@ -861,7 +851,11 @@ _mm_cvtpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
#else
temp = vec_mergee (temp, temp);
#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@ -896,7 +890,11 @@ _mm_cvtpd_ps (__m128d __A)
: );
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
#else
temp = vec_mergee (temp, temp);
#endif
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@ -925,7 +923,11 @@ _mm_cvttpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
#else
temp = vec_mergee (temp, temp);
#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@ -1205,6 +1207,9 @@ _mm_loadl_pd (__m128d __A, double const *__B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
#ifdef _ARCH_PWR10
return vec_extractm ((__v2du) __A);
#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@ -1224,6 +1229,7 @@ _mm_movemask_pd (__m128d __A)
#else
return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@ -1434,6 +1440,7 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
return ((__m64)a * (__m64)b);
}
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
@ -1460,6 +1467,7 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
@ -1749,7 +1757,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B)
lshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (lshift, shmax);
result = vec_sl ((__v2du) __A, lshift);
result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@ -1843,7 +1851,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B)
rshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (rshift, shmax);
result = vec_sr ((__v2du) __A, rshift);
result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@ -1995,10 +2003,14 @@ _mm_min_epu8 (__m128i __A, __m128i __B)
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
/* Return a mask created from the most significant bit of each 8-bit
element in A. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
#ifdef _ARCH_PWR10
return vec_extractm ((__v16qu) __A);
#else
__vector unsigned long long result;
static const __vector unsigned char perm_mask =
{
@ -2015,6 +2027,7 @@ _mm_movemask_epi8 (__m128i __A)
#else
return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@ -2158,27 +2171,37 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
__v16qu a, b;
__v16qu vmin, vmax, vabsdiff;
__v16qu vabsdiff;
__v4si vsum;
const __v4su zero = { 0, 0, 0, 0 };
__v4si result;
a = (__v16qu) __A;
b = (__v16qu) __B;
vmin = vec_min (a, b);
vmax = vec_max (a, b);
#ifndef _ARCH_PWR9
__v16qu vmin = vec_min (a, b);
__v16qu vmax = vec_max (a, b);
vabsdiff = vec_sub (vmax, vmin);
#else
vabsdiff = vec_absd (a, b);
#endif
/* Sum four groups of bytes into integers. */
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
#ifdef __LITTLE_ENDIAN__
/* Sum across four integers with two integer results. */
asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero));
/* Note: vec_sum2s could be used here, but on little-endian, vector
shifts are added that are not needed for this use-case.
A vector shift to correctly position the 32-bit integer results
(currently at [0] and [2]) to [1] and [3] would then need to be
swapped back again since the desired results are two 64-bit
integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
#else
/* Sum across four integers with two integer results. */
result = vec_sum2s (vsum, (__vector signed int) zero);
/* Rotate the sums into the correct position. */
#ifdef __LITTLE_ENDIAN__
result = vec_sld (result, result, 4);
#else
result = vec_sld (result, result, 6);
#endif
/* Rotate the sums into the correct position. */
return (__m128i) result;
}
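/* Scalar model of the contract the vector sequence above implements:
   _mm_sad_epu8 yields two 64-bit sums of absolute byte differences,
   one per 8-byte half.  The reference function name is illustrative. */
#include <stdint.h>
static void sad_epu8_reference(const uint8_t a[16], const uint8_t b[16],
                               uint64_t out[2]) {
  out[0] = out[1] = 0;
  for (int i = 0; i < 16; i++) {
    int d = a[i] - b[i];
    out[i / 8] += (uint64_t)(d < 0 ? -d : d);
  }
}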

View File

@ -0,0 +1,27 @@
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef IMMINTRIN_H_
#define IMMINTRIN_H_
#include <x86gprintrin.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#endif /* IMMINTRIN_H_ */

View File

@ -0,0 +1,26 @@
/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#endif
#ifndef NMMINTRIN_H_
#define NMMINTRIN_H_
/* We just include the SSE4.1 header file. */
#include <smmintrin.h>
#endif /* NMMINTRIN_H_ */

View File

@ -111,17 +111,21 @@ _mm_hsub_pd (__m128d __X, __m128d __Y)
vec_mergel ((__v2df) __X, (__v2df)__Y));
}
#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)

View File

@ -34,77 +34,683 @@
#include <altivec.h>
#include <tmmintrin.h>
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8(__m128i __X, const int __N) {
return (unsigned char)((__v16qi)__X)[__N & 15];
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NINT \
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR \
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL \
(_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC \
(_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT \
(_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT \
(_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd (__m128d __A, int __rounding)
{
__v2df __r;
union {
double __fr;
long long __fpscr;
} __enables_save, __fpscr_save;
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Save enabled exceptions, disable all exceptions,
and preserve the rounding mode. */
#ifdef _ARCH_PWR9
__asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs ();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
}
switch (__rounding)
{
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl ();
__attribute__ ((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn (0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
__r = vec_rint ((__v2df) __A);
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
__builtin_set_fpscr_rn (__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
__r = vec_floor ((__v2df) __A);
break;
case _MM_FROUND_TO_POS_INF:
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
__r = vec_ceil ((__v2df) __A);
break;
case _MM_FROUND_TO_ZERO:
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
__r = vec_trunc ((__v2df) __A);
break;
case _MM_FROUND_CUR_DIRECTION:
__r = vec_rint ((__v2df) __A);
break;
}
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl ();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
}
return (__m128d) __r;
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32(__m128i __X, const int __N) {
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd (__m128d __A, __m128d __B, int __rounding)
{
__B = _mm_round_pd (__B, __rounding);
__v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] };
return (__m128d) __r;
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps (__m128 __A, int __rounding)
{
__v4sf __r;
union {
double __fr;
long long __fpscr;
} __enables_save, __fpscr_save;
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Save enabled exceptions, disable all exceptions,
and preserve the rounding mode. */
#ifdef _ARCH_PWR9
__asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs ();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
}
switch (__rounding)
{
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl ();
__attribute__ ((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn (0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
__r = vec_rint ((__v4sf) __A);
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
__builtin_set_fpscr_rn (__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
__r = vec_floor ((__v4sf) __A);
break;
case _MM_FROUND_TO_POS_INF:
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
__r = vec_ceil ((__v4sf) __A);
break;
case _MM_FROUND_TO_ZERO:
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
__r = vec_trunc ((__v4sf) __A);
break;
case _MM_FROUND_CUR_DIRECTION:
__r = vec_rint ((__v4sf) __A);
break;
}
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl ();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
}
return (__m128) __r;
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss (__m128 __A, __m128 __B, int __rounding)
{
__B = _mm_round_ps (__B, __rounding);
__v4sf __r = (__v4sf) __A;
__r[0] = ((__v4sf) __B)[0];
return (__m128) __r;
}
#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
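/* Usage sketch for the rounding entry points above (standalone, with
   this wrapper on the include path and -DNO_WARN_X86_INTRINSICS);
   values and the function name are illustrative. */
static __m128d round_examples(void) {
  __m128d v = _mm_set_pd(-2.5, 1.5); /* elements {1.5, -2.5} */
  /* _mm_ceil_pd(v) expands to the same call; result is {2.0, -2.0}. */
  __m128d up = _mm_round_pd(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
  /* Round-to-nearest-even also gives {2.0, -2.0} for these inputs. */
  __m128d near = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return _mm_add_pd(up, near);
}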
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
{
__v16qi result = (__v16qi)__A;
result [__N & 0xf] = __D;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
{
__v4si result = (__v4si)__A;
result [__N & 3] = __D;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
{
__v2di result = (__v2di)__A;
result [__N & 1] = __D;
return (__m128i) result;
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
return (unsigned char) ((__v16qi)__X)[__N & 15];
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
return ((__v4si)__X)[__N & 3];
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64(__m128i __X, const int __N) {
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
return ((__v2di)__X)[__N & 1];
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps(__m128 __X, const int __N) {
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
return ((__v4si)__X)[__N & 3];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
__v16qi __charmask = vec_splats((signed char)__imm8);
__charmask = vec_gb(__charmask);
__v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
__shortmask = vec_reve(__shortmask);
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
{
__v16qi __charmask = vec_splats ((signed char) __imm8);
__charmask = vec_gb (__charmask);
__v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
#ifdef __BIG_ENDIAN__
__shortmask = vec_reve (__shortmask);
#endif
return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
}
#endif
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
{
#ifdef _ARCH_PWR10
return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask);
#else
const __v16qu __seven = vec_splats ((unsigned char) 0x07);
__v16qu __lmask = vec_sra ((__v16qu) __mask, __seven);
return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask);
#endif
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
{
__v16qu __pcv[] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
};
__v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
return (__m128) __r;
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
{
#ifdef _ARCH_PWR10
return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask);
#else
const __v4si __zero = {0};
const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
#endif
}
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
{
__v16qu __pcv[] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
};
__v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
return (__m128d) __r;
}
#ifdef _ARCH_PWR8
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
{
#ifdef _ARCH_PWR10
return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask);
#else
const __v2di __zero = {0};
const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
#endif
}
#endif
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128 (__m128i __A, __m128i __B)
{
/* Note: This implementation does NOT set "zero" or "carry" flags. */
const __v16qu __zero = {0};
return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128 (__m128i __A, __m128i __B)
{
/* Note: This implementation does NOT set "zero" or "carry" flags. */
const __v16qu __zero = {0};
const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A);
return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128 (__m128i __A, __m128i __B)
{
/* Note: This implementation does NOT set "zero" or "carry" flags. */
return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0;
}
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
#define _mm_test_all_ones(V) \
_mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
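/* Truth-table sketch for the ptest helpers above; as the comments note,
   they return plain ints, no flags.  Values and names are illustrative. */
#include <assert.h>
static void check_ptest(void) {
  __m128i ones = _mm_set1_epi32(-1);
  __m128i zero = _mm_setzero_si128();
  assert(_mm_testz_si128(zero, ones) == 1); /* A & B == 0 */
  assert(_mm_testc_si128(ones, ones) == 1); /* ~A & B == 0 */
  assert(_mm_test_all_ones(ones) == 1);
  __m128i lo = _mm_set_epi64x(0, -1);       /* low half all-ones */
  assert(_mm_testnzc_si128(lo, ones) == 1); /* mixed ones and zeros */
}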
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
const __v16qu __seven = vec_splats((unsigned char)0x07);
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
__v16qi result = (__v16qi)__A;
result[__N & 0xf] = __D;
return (__m128i)result;
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
__v4si result = (__v4si)__A;
result[__N & 3] = __D;
return (__m128i)result;
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
__v2di result = (__v2di)__A;
result[__N & 1] = __D;
return (__m128i)result;
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v16qi) __A);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32 (__m128i __A)
{
__A = (__m128i) vec_unpackh ((__v16qi) __A);
return (__m128i) vec_unpackh ((__v8hi) __A);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64 (__m128i __A)
{
__A = (__m128i) vec_unpackh ((__v16qi) __A);
__A = (__m128i) vec_unpackh ((__v8hi) __A);
return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v8hi) __A);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64 (__m128i __A)
{
__A = (__m128i) vec_unpackh ((__v8hi) __A);
return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16 (__m128i __A)
{
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32 (__m128i __A)
{
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
__A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
__A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64 (__m128i __A)
{
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
__A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
__A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
__A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
__A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32 (__m128i __A)
{
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64 (__m128i __A)
{
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
__A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
__A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64 (__m128i __A)
{
const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v4su) __A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
/* Return horizontal packed word minimum and its index in bits [15:0]
and bits [18:16] respectively. */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16 (__m128i __A)
{
union __u
{
__m128i __m;
__v8hu __uh;
};
union __u __u = { .__m = __A }, __r = { .__m = {0} };
unsigned short __ridx = 0;
unsigned short __rmin = __u.__uh[__ridx];
for (unsigned long __i = 1; __i < 8; __i++)
{
if (__u.__uh[__i] < __rmin)
{
__rmin = __u.__uh[__i];
__ridx = __i;
}
}
__r.__uh[0] = __rmin;
__r.__uh[1] = __ridx;
return __r.__m;
}
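/* Illustrative check of the encoding described above: the minimum
   halfword lands in bits [15:0], its index in bits [18:16].  The
   helper name and input values are assumptions for the example. */
#include <assert.h>
static void check_minpos(void) {
  __m128i v = _mm_setr_epi16(9, 8, 7, 6, 5, 2, 3, 4);
  __m128i r = _mm_minpos_epu16(v);
  assert(_mm_extract_epi16(r, 0) == 2); /* minimum value */
  assert(_mm_extract_epi16(r, 1) == 5); /* its element index */
}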
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
}
#endif
#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _SMMINTRIN_H_ */
#endif /* SMMINTRIN_H_ */

View File

@ -339,6 +339,7 @@ _mm_shuffle_pi8 (__m64 __A, __m64 __B)
return (__m64) ((__v2du) (__C))[0];
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
@ -350,7 +351,9 @@ _mm_sign_epi8 (__m128i __A, __m128i __B)
__v16qi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
@ -362,7 +365,9 @@ _mm_sign_epi16 (__m128i __A, __m128i __B)
__v8hi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
@ -374,7 +379,9 @@ _mm_sign_epi32 (__m128i __A, __m128i __B)
__v4si __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
@ -385,7 +392,9 @@ _mm_sign_pi8 (__m64 __A, __m64 __B)
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
@ -396,7 +405,9 @@ _mm_sign_pi16 (__m64 __A, __m64 __B)
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
@ -407,6 +418,7 @@ _mm_sign_pi32 (__m64 __A, __m64 __B)
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))

View File

@ -0,0 +1,17 @@
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef X86GPRINTRIN_H_
#define X86GPRINTRIN_H_
#include <bmiintrin.h>
#include <bmi2intrin.h>
#endif /* X86GPRINTRIN_H_ */

View File

@ -0,0 +1,28 @@
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef X86INTRIN_H_
#define X86INTRIN_H_
#ifdef __ALTIVEC__
#include <immintrin.h>
#endif /* __ALTIVEC__ */
#endif /* X86INTRIN_H_ */

View File

@ -31,8 +31,8 @@
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
@ -881,7 +881,7 @@ _mm_cvtss_f32 (__m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
__m64 res = 0;
int res;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
@ -914,8 +914,8 @@ _mm_cvt_ss2si (__m128 __A)
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
__m64 res = 0;
#ifdef _ARCH_PWR8
long long res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
@ -1328,6 +1328,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
#ifdef _ARCH_PWR10
return vec_extractm ((__vector unsigned int) __A);
#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@ -1347,6 +1350,7 @@ _mm_movemask_ps (__m128 __A)
#else
return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@ -1553,6 +1557,7 @@ _m_pminub (__m64 __A, __m64 __B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
#ifdef __powerpc64__
unsigned long long p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL; // permute control for sign bits
@ -1560,6 +1565,18 @@ _mm_movemask_pi8 (__m64 __A)
0x3830282018100800UL; // permute control for sign bits
#endif
return __builtin_bpermd (p, __A);
#else
#ifdef __LITTLE_ENDIAN__
unsigned int mask = 0x20283038UL;
unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
#else
unsigned int mask = 0x38302820UL;
unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
#endif
return (r2 << 4) | r1;
#endif
}
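/* Scalar model of what both paths above compute: bit i of the result
   is the sign bit of byte i of the operand (byte 0 is the least
   significant byte).  The reference name is illustrative. */
static int movemask_pi8_reference(unsigned long long a) {
  int r = 0;
  for (int i = 0; i < 8; i++)
    r |= (int)((a >> (8 * i + 7)) & 1) << i;
  return r;
}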
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -1841,4 +1858,4 @@ do { \
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _XMMINTRIN_H_INCLUDED */
#endif /* XMMINTRIN_H_ */

View File

@ -5,6 +5,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE
// CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
// CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
// CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
@ -440,7 +443,8 @@ test_converts() {
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally i64 @_mm_cvtpd_pi32
@ -450,7 +454,8 @@ test_converts() {
// CHECK-LABEL: define available_externally <4 x float> @_mm_cvtpd_ps
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <4 x i32> asm "xvcvdpsp ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally <2 x double> @_mm_cvtpi32_pd
@ -530,7 +535,8 @@ test_converts() {
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvttpd_epi32
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally i64 @_mm_cvttpd_pi32
@ -756,12 +762,18 @@ test_move() {
// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: insertelement <2 x double> %{{[0-9a-zA-Z_.]+}}, double %[[EXT]], i32 0
// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_epi8
// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_epi8
// CHECK: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef <i8 120, i8 112, i8 104, i8 96, i8 88, i8 80, i8 72, i8 64, i8 56, i8 48, i8 40, i8 32, i8 24, i8 16, i8 8, i8 0>)
// CHECK-LE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK-BE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: trunc i64 %[[VAL]] to i32
// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_pd
// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_pd
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
// CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
@ -857,9 +869,9 @@ test_sad() {
// CHECK: call <16 x i8> @vec_max(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sub(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <4 x i32> @vec_sum4s(unsigned char vector[16], unsigned int vector[4])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// CHECK: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// CHECK-LE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 4)
// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 6)
// CHECK-LE: call <4 x i32> asm "vsum2sws $0,$1,$2", "=v,v,v"(<4 x i32> %11, <4 x i32> zeroinitializer)
// CHECK-BE: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)
void __attribute__((noinline))
test_set() {
@ -1086,7 +1098,7 @@ test_sll() {
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef zeroext 0)
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, <2 x i64> noundef <i64 64, i64 64>)
// CHECK: call <2 x i64> @vec_sl(unsigned long long vector[2], unsigned long long vector[2])
// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_slli_epi16
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
@ -1232,7 +1244,7 @@ test_srl() {
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 0)
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef <i64 64, i64 64>)
// CHECK: call <2 x i64> @vec_sr(unsigned long long vector[2], unsigned long long vector[2])
// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_srli_epi16
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16

View File

@ -10,6 +10,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
#include <smmintrin.h>
__m128 mn1, mn2;
@ -48,6 +53,10 @@ void __attribute__((noinline))
test_blend() {
_mm_blend_epi16(m1, m2, 0);
_mm_blendv_epi8(m1, m2, mi);
_mm_blend_ps(mn1, mn2, 0);
_mm_blendv_ps(mn1, mn2, mn1);
_mm_blend_pd(md1, md2, 0);
_mm_blendv_pd(md1, md2, md1);
}
// CHECK-LABEL: @test_blend
@ -62,10 +71,13 @@ test_blend() {
// BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])
// P10-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// P10: call <16 x i8> @vec_blendv(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7)
// CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16])
void __attribute__((noinline))
test_insert() {
@ -89,6 +101,239 @@ test_insert() {
// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1
// CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]]
void __attribute__((noinline))
test_convert() {
_mm_cvtepi16_epi32(m1);
_mm_cvtepi16_epi64(m1);
_mm_cvtepi32_epi64(m1);
_mm_cvtepi8_epi16(m1);
_mm_cvtepi8_epi32(m1);
_mm_cvtepi8_epi64(m1);
_mm_cvtepu16_epi32(m1);
_mm_cvtepu16_epi64(m1);
_mm_cvtepu32_epi64(m1);
_mm_cvtepu8_epi16(m1);
_mm_cvtepu8_epi32(m1);
_mm_cvtepu8_epi64(m1);
}
// CHECK-LABEL: @test_convert
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])
// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])
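Each widening convert above is either a chain of sign-extending unpacks or, for the unsigned cases, endian-aware merges with a zero vector so the zero bytes land in the high half of each widened element. A sketch of one of each flavor, assuming the altivec mapping the checks spell out (names hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Signed widen: vec_unpackh sign-extends the high (element-order) half.
static inline __m128i_s my_cvtepi16_epi32(__m128i_s __A) {
  return (__m128i_s)vec_unpackh((__vector signed short)__A);
}

// Unsigned widen: interleave with zeros; the operand order flips with
// endianness, which is why the LE and BE check lines differ above.
static inline __m128i_s my_cvtepu16_epi32(__m128i_s __A) {
  const __vector unsigned short __zero = vec_splats((unsigned short)0);
#ifdef __LITTLE_ENDIAN__
  return (__m128i_s)vec_mergeh((__vector unsigned short)__A, __zero);
#else
  return (__m128i_s)vec_mergeh(__zero, (__vector unsigned short)__A);
#endif
}
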
void __attribute__((noinline))
test_minmax() {
_mm_max_epi32(m1, m2);
_mm_max_epi8(m1, m2);
_mm_max_epu16(m1, m2);
_mm_max_epu32(m1, m2);
_mm_min_epi32(m1, m2);
_mm_min_epi8(m1, m2);
_mm_min_epu16(m1, m2);
_mm_min_epu32(m1, m2);
_mm_minpos_epu16(m1);
}
// CHECK-LABEL: @test_minmax
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false)
// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}}
// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: zext i16 %[[VEXT]] to i32
// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1
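Every min/max above is a single vec_min/vec_max call on the matching element type; only _mm_minpos_epu16, which has no VMX equivalent, falls back to the scalar scan reflected by the memset/extractelement/add checks. A sketch of the one-call case (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Lane-wise signed 32-bit maximum maps directly onto vec_max.
static inline __m128i_s my_max_epi32(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_max((__vector signed int)__A, (__vector signed int)__B);
}
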
void __attribute__((noinline))
test_round() {
_mm_round_ps(mn1, 0);
_mm_round_ss(mn1, mn2, 0);
_mm_round_pd(mn1, 0);
_mm_round_sd(mn1, mn2, 0);
}
// CHECK-LABEL: @test_round
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @vec_floor(float vector[4])
// CHECK: call <4 x float> @vec_ceil(float vector[4])
// CHECK: call <4 x float> @vec_trunc(float vector[4])
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @vec_floor(double vector[2])
// CHECK: call <2 x double> @vec_ceil(double vector[2])
// CHECK: call <2 x double> @vec_trunc(double vector[2])
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1
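Most of the checked lines are FPSCR bookkeeping: __builtin_mffs/__builtin_mffsl save the status register, __builtin_set_fpscr_rn and __builtin_mtfsf force and later restore the rounding control, and the empty asm blocks fence the vector value across those mode switches. The arithmetic core is a four-way dispatch. A simplified sketch of that core alone, with all the save/restore elided (name hypothetical; the mode encodings are the x86 _MM_FROUND_TO_* values):

#include <altivec.h>

// 0 = nearest, 1 = toward -inf, 2 = toward +inf, 3 = toward zero.
// The real header also handles _MM_FROUND_CUR_DIRECTION by rounding per
// the (saved and restored) FPSCR mode via vec_rint.
static inline __vector float my_round_ps_core(__vector float __A, int __mode) {
  switch (__mode & 0x3) {
  case 1:  return vec_floor(__A);
  case 2:  return vec_ceil(__A);
  case 3:  return vec_trunc(__A);
  default: return vec_rint(__A); /* nearest, with RN forced beforehand */
  }
}
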
void __attribute__((noinline))
test_testing() {
_mm_testc_si128(m1, m2);
_mm_testnzc_si128(m1, m2);
_mm_testz_si128(m1, m2);
}
// CHECK-LABEL: @test_testing
// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer)
// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32
// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer)
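Both predicates reduce to one vector logical op plus an all-zero compare, as the checks show: ZF-style testz is (A & B) == 0, and CF-style testc is (~A & B) == 0, hence the vec_nor. A sketch under that mapping (names hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// ZF analogue: nonzero iff (A & B) is all zeros.
static inline int my_testz_si128(__m128i_s __A, __m128i_s __B) {
  const __vector unsigned char __zero = vec_splats((unsigned char)0);
  return vec_all_eq(vec_and((__vector unsigned char)__A,
                            (__vector unsigned char)__B), __zero);
}

// CF analogue: nonzero iff (~A & B) is all zeros; vec_nor(A, A) forms ~A.
static inline int my_testc_si128(__m128i_s __A, __m128i_s __B) {
  const __vector unsigned char __zero = vec_splats((unsigned char)0);
  __vector unsigned char __notA =
      vec_nor((__vector unsigned char)__A, (__vector unsigned char)__A);
  return vec_all_eq(vec_and(__notA, (__vector unsigned char)__B), __zero);
}
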
void __attribute__((noinline))
test_compare() {
_mm_cmpeq_epi64(m1, m2);
_mm_cmpgt_epi64(m1, m2);
}
// CHECK-LABEL: @test_compare
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2])
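Both 64-bit compares are direct one-call mappings onto the Power8 doubleword compare builtins. A sketch of the equality case (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Lane-wise equality on two i64 lanes; result lanes are all-ones/all-zeros.
static inline __m128i_s my_cmpeq_epi64(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_cmpeq((__vector signed long long)__A,
                              (__vector signed long long)__B);
}
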
void __attribute__((noinline))
test_mul() {
_mm_mul_epi32(m1, m2);
_mm_mullo_epi32(m1, m2);
}
// CHECK-LABEL: @test_mul
// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4])
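_mm_mul_epi32 takes the even 32-bit lanes to full 64-bit products, which is a single vec_mule on signed int elements (the Power8 vmulesw family); _mm_mullo_epi32 keeps only the low 32 bits via vec_mul. A sketch of the widening case (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Even lanes of A and B -> two signed 64-bit products.
static inline __m128i_s my_mul_epi32(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_mule((__vector signed int)__A,
                             (__vector signed int)__B);
}
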
void __attribute__((noinline))
test_packus() {
_mm_packus_epi32(m1, m2);
}
// CHECK-LABEL: @test_packus
// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3)
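_mm_packus_epi32 is likewise a single saturating pack: eight signed 32-bit inputs narrowed to unsigned 16-bit. A sketch (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// vec_packsu saturates negative inputs to 0 and values above 65535 to 65535.
static inline __m128i_s my_packus_epi32(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_packsu((__vector signed int)__A,
                               (__vector signed int)__B);
}
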
// To test that smmintrin.h includes tmmintrin.h
void __attribute__((noinline))


@@ -0,0 +1,239 @@
// REQUIRES: powerpc-registered-target
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
#include <x86gprintrin.h>
unsigned short us;
unsigned ui;
unsigned long long ul;
void __attribute__((noinline))
test_bmiintrin() {
__tzcnt_u16(us);
__andn_u32(ui, ui);
_bextr_u32(ui, ui, ui);
__bextr_u32(ui, ui);
__blsi_u32(ui);
_blsi_u32(ui);
__blsmsk_u32(ui);
_blsmsk_u32(ui);
__blsr_u32(ui);
_blsr_u32(ui);
__tzcnt_u32(ui);
_tzcnt_u32(ui);
__andn_u64(ul, ul);
_bextr_u64(ul, ui, ui);
__bextr_u64(ul, ul);
__blsi_u64(ul);
_blsi_u64(ul);
__blsmsk_u64(ul);
_blsmsk_u64(ul);
__blsr_u64(ul);
_blsr_u64(ul);
__tzcnt_u64(ul);
_tzcnt_u64(ul);
}
// CHECK-LABEL: @test_bmiintrin
// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false)
// CHECK: trunc i32 %[[CALL]] to i16
// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1
// CHECK: and i32 %[[NEG]], %{{[0-9a-zA-Z._]+}}
// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]]
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8
// CHECK: and i32 %[[SHR]], 255
// CHECK: call zeroext i32 @_bextr_u32
// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %{{[0-9a-zA-Z._]+}}
// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call zeroext i32 @__blsi_u32
// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call zeroext i32 @__blsmsk_u32
// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call zeroext i32 @__blsr_u32
// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1
// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}}
// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]]
// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64
// CHECK: lshr i64 %[[SHL]], %[[EXT2]]
// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255
// CHECK: trunc i64 %[[AND]] to i32
// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8
// CHECK: trunc i64 %[[SHR]] to i32
// CHECK: call i64 @_bextr_u64
// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}}
// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @__blsi_u64
// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @__blsmsk_u64
// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @__blsr_u64
// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
// CHECK: sext i32 %[[CAST]] to i64
// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
// CHECK: sext i32 %[[CAST]] to i64
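None of these BMI1 helpers needs a PowerPC-specific instruction: the checks above are the classic bit identities plus count-trailing-zeros, and on PowerPC clang lowers __builtin_ctz to the zero-defined llvm.cttz(..., i1 false) form seen in the checks. A sketch of the identities (names hypothetical):

// Isolate the lowest set bit; 0U - x matches the checked `sub i32 0, ...`.
static inline unsigned int my_blsi_u32(unsigned int __X) {
  return __X & (0U - __X);
}

// Mask through the lowest set bit: x ^ (x - 1).
static inline unsigned int my_blsmsk_u32(unsigned int __X) {
  return __X ^ (__X - 1);
}

// Reset the lowest set bit: x & (x - 1).
static inline unsigned int my_blsr_u32(unsigned int __X) {
  return __X & (__X - 1);
}

// Trailing zero count; lowers to the checked llvm.cttz.i32.
static inline unsigned int my_tzcnt_u32(unsigned int __X) {
  return (unsigned int)__builtin_ctz(__X);
}
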
void __attribute__((noinline))
test_bmi2intrin() {
_bzhi_u32(ui, ui);
_mulx_u32(ui, ui, &ui);
_bzhi_u64(ul, ul);
_mulx_u64(ul, ul, &ul);
_pdep_u64(ul, ul);
_pext_u64(ul, ul);
_pdep_u32(ui, ui);
_pext_u32(ui, ui);
}
// CHECK-LABEL: @test_bmi2intrin
// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}})
// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32
// CHECK: trunc i64 %[[SHR]] to i32
// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32
// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i64 %[[SHL]], %[[SUB1]]
// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}})
// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64
// CHECK: trunc i128 %[[SHR]] to i64
// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64
// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]]
// CHECK: sext i32 %[[SUB]] to i64
// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32
// CHECK: sext i32 %[[CAST2]] to i64
// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]]
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]]
// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK-LABEL: define available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]]
// CHECK: [[TRUECOND]]:
// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8
// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: call i64 @llvm.ppc.bpermd
// CHECK: [[FALSECOND]]:
// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i64 %[[AND]], %[[SUB]]
// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]]
// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
// CHECK: trunc i64 %[[CALL]] to i32
// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
// CHECK: trunc i64 %[[CALL]] to i32
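Per the zext/call/trunc pattern checked above, the 32-bit pdep/pext carry no logic of their own; they widen to the 64-bit versions and truncate, which is safe because the widened mask has no bits above position 31. A sketch of that forwarding (wrapper names hypothetical; NO_WARN_X86_INTRINSICS silences the portability guard, as the RUN lines do):

#define NO_WARN_X86_INTRINSICS 1
#include <x86gprintrin.h>

static inline unsigned int my_pdep_u32(unsigned int __X, unsigned int __M) {
  return (unsigned int)_pdep_u64(__X, __M);
}

static inline unsigned int my_pext_u32(unsigned int __X, unsigned int __M) {
  return (unsigned int)_pext_u64(__X, __M);
}
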


@@ -9,6 +9,9 @@
// RUN: %clang -x c++ -fsyntax-only -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-P10
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
// RUN: %clang -x c++ -fsyntax-only -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
@@ -383,12 +386,12 @@ test_convert() {
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-LABEL: define available_externally signext i32 @_mm_cvtss_si32
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
// CHECK-P10: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 0
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 1
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 2
// CHECK-LABEL: define available_externally i64 @_mm_cvtss_si64
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctid $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
@ -681,9 +684,11 @@ test_move() {
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_ps
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 2113632, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
// CHECK-LE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK-LE: trunc i64 %[[EXT]] to i32
// CHECK-BE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 2113632> to <16 x i8>))
// CHECK-BE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-BE: trunc i64 %[[EXT]] to i32
// CHECK-P10: call zeroext i32 @vec_extractm(unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
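The LE and BE variants differ only in the vbpermq bit-gather control and in which doubleword holds the result, while P10 uses vec_extractm directly. For orientation, reference semantics only (a scalar restatement, not the vbpermq fast path the checks verify; name hypothetical):

typedef float __v4sf_s __attribute__((__vector_size__(16)));

// Collect the sign bit of each float lane into bits 0..3 of the result.
static inline int my_movemask_ps_ref(__v4sf_s __A) {
  union { __v4sf_s __v; unsigned int __w[4]; } __u = {__A};
  int __r = 0;
  for (int __i = 0; __i < 4; ++__i)
    __r |= (int)(__u.__w[__i] >> 31) << __i;
  return __r;
}
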
void __attribute__((noinline))
test_alt_name_move() {