[PowerPC] [Clang] Add SSE4 and BMI intrinsics implementation
Reviewed By: jsji Differential Revision: https://reviews.llvm.org/D119407
This commit is contained in:
parent
b3fbbabdc1
commit
406bde9a15
|
@ -164,6 +164,12 @@ set(ppc_wrapper_files
|
|||
ppc_wrappers/pmmintrin.h
|
||||
ppc_wrappers/tmmintrin.h
|
||||
ppc_wrappers/smmintrin.h
|
||||
ppc_wrappers/bmiintrin.h
|
||||
ppc_wrappers/bmi2intrin.h
|
||||
ppc_wrappers/immintrin.h
|
||||
ppc_wrappers/tmmintrin.h
|
||||
ppc_wrappers/x86intrin.h
|
||||
ppc_wrappers/x86gprintrin.h
|
||||
)
|
||||
|
||||
set(openmp_wrapper_files
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#if !defined X86GPRINTRIN_H_
|
||||
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef BMI2INTRIN_H_
|
||||
#define BMI2INTRIN_H_
|
||||
|
||||
/* BMI2 BZHI: zero the bits of __X at and above bit index __Y.
   The index is taken from the low 8 bits of __Y; an index >= 32 returns
   __X unchanged and an index of 0 returns 0, matching the x86 instruction.
   NOTE(review): the previous formulation ((__X << (32 - __Y)) >> (32 - __Y))
   shifted a 32-bit value by 32 when __Y == 0, which is undefined behavior.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  unsigned int __idx = __Y & 0xFF; /* BZHI reads only SRC2[7:0].  */
  if (__idx >= 32)
    return __X;
  return __X & (((unsigned int)1 << __idx) - 1);
}
|
||||
|
||||
/* BMI2 MULX (32-bit): multiply __X by __Y as full 64-bit product.
   The high 32 bits are stored through __P; the low 32 bits are returned.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __prod;
  __prod = (unsigned long long)__X * (unsigned long long)__Y;
  *__P = (unsigned int)(__prod >> 32);
  return (unsigned int)(__prod & 0xFFFFFFFFULL);
}
|
||||
|
||||
#ifdef __PPC64__
|
||||
/* BMI2 BZHI (64-bit): zero the bits of __X at and above bit index __Y.
   The index is taken from the low 8 bits of __Y; an index >= 64 returns
   __X unchanged and an index of 0 returns 0, matching the x86 instruction.
   NOTE(review): the previous formulation ((__X << (64 - __Y)) >> (64 - __Y))
   shifted a 64-bit value by 64 when __Y == 0, which is undefined behavior.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  unsigned long long __idx = __Y & 0xFF; /* BZHI reads only SRC2[7:0].  */
  if (__idx >= 64)
    return __X;
  return __X & ((1ULL << __idx) - 1);
}
|
||||
|
||||
/* __int128 requires base 64-bit. */
|
||||
/* BMI2 MULX (64-bit): multiply __X by __Y as a full 128-bit product.
   The high 64 bits are stored through __P; the low 64 bits are returned.
   (unsigned __int128 requires a 64-bit base architecture.)  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  unsigned __int128 __prod = (unsigned __int128)__X;
  __prod *= __Y;
  *__P = (unsigned long long)(__prod >> 64);
  return (unsigned long long)__prod;
}
|
||||
|
||||
#ifdef _ARCH_PWR7
|
||||
/* popcount and bpermd require power7 minimum. */
|
||||
/* BMI2 PDEP: deposit the low popcount(__M) bits of __X into the positions
   of the set bits of __M, lowest source bit to lowest mask bit.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __topbit = 0x8000000000000000UL;
  unsigned long __remaining = __M;
  unsigned long __deposited = 0UL;
  unsigned long __srcpos;

  /* The pop-count of the mask gives the number of source bits to process
     and the left-shift amount that lines the source up with the first
     (most significant) mask bit.  Mask bits are consumed MSB first and
     cleared as they are handled.  */
  for (__srcpos = 64 - __builtin_popcountl(__M); __remaining != 0;
       __srcpos++) {
    unsigned long __lz = __builtin_clzl(__remaining);
    unsigned long __bit = __topbit >> __lz;
    __remaining ^= __bit;
    __deposited |= (__X << (__srcpos - __lz)) & __bit;
  }
  return __deposited;
}
|
||||
|
||||
/* BMI2 PEXT: gather the bits of __X selected by the set bits of __M into
   the low-order bits of the result, lowest mask bit to lowest result bit.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __topbit = 0x8000000000000000UL;
  unsigned long __perm = 0x4040404040404040UL; /* bit-permute control seed */
  unsigned long __remaining = __M;
  unsigned long __lz;
  unsigned long __extracted;

  /* If the mask is a compile-time constant selecting at most 8 bits, the
     control-word loop below folds away at compile time and the POWER8
     bpermd (bit permute doubleword) instruction does the whole job.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    for (long __i = 0; __i < __builtin_popcountl(__M); __i++) {
      __lz = __builtin_clzl(__remaining);
      __perm = (__perm << 8) | __lz;
      __remaining ^= (__topbit >> __lz);
    }
    __extracted = __builtin_bpermd(__perm, __X);
  } else {
    /* General case: walk the mask bits MSB first, clearing each as it is
       handled.  A while loop (not for) is deliberate: combined with
       -funroll-loops a for loop can expand to a lot of code, and this
       shape lets the compiler common the xor with the (m != 0) test.  */
    unsigned long __dstpos = 64 - __builtin_popcountl(__M);
    __extracted = 0;
    while (__remaining != 0) {
      __lz = __builtin_clzl(__remaining);
      __remaining ^= (__topbit >> __lz);
      __extracted |= (__X & (__topbit >> __lz)) >> (__dstpos - __lz);
      __dstpos++;
    }
  }
  return __extracted;
}
|
||||
|
||||
/* these 32-bit implementations depend on 64-bit pdep/pext
|
||||
which depend on _ARCH_PWR7. */
|
||||
/* 32-bit PDEP: delegates to the 64-bit implementation (zero-extended
   operands give identical low-word results).  Depends on _ARCH_PWR7 via
   the 64-bit routines.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return (unsigned int)_pdep_u64(__X, __Y);
}

/* 32-bit PEXT: delegates to the 64-bit implementation.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return (unsigned int)_pext_u64(__X, __Y);
}
|
||||
#endif /* _ARCH_PWR7 */
|
||||
#endif /* __PPC64__ */
|
||||
|
||||
#endif /* BMI2INTRIN_H_ */
|
|
@ -0,0 +1,165 @@
|
|||
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#if !defined X86GPRINTRIN_H_
|
||||
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef BMIINTRIN_H_
|
||||
#define BMIINTRIN_H_
|
||||
|
||||
/* BMI TZCNT (16-bit): count trailing zero bits of __X.
   Returns 16 when __X is 0, matching the x86 TZCNT instruction.
   NOTE(review): the previous body called __builtin_ctz(0) for a zero
   input, which is undefined behavior.  */
extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u16(unsigned short __X) {
  return __X == 0 ? 16 : __builtin_ctz(__X);
}
|
||||
|
||||
/* BMI ANDN: the bits of __Y that are not set in __X (~__X & __Y).  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u32(unsigned int __X, unsigned int __Y) {
  unsigned int __notx = ~__X;
  return __Y & __notx;
}
|
||||
|
||||
/* BMI BEXTR (32-bit): extract __L bits of __X starting at bit __P.
   Matches x86 semantics: a zero length or a start index past bit 31
   yields 0; a length of 32 or more extracts everything from __P up.
   NOTE(review): the previous shift-based body
   ((__X << (32 - (__L + __P))) >> (32 - __L)) invoked undefined behavior
   for __L == 0 or __L + __P > 32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
  if (__P >= 32 || __L == 0)
    return 0;
  __X >>= __P;
  if (__L < 32)
    __X &= (((unsigned int)1 << __L) - 1);
  return __X;
}

/* GCC-style BEXTR: start index in bits 0-7 of __Y, length in bits 8-15.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u32(unsigned int __X, unsigned int __Y) {
  unsigned int __P, __L;
  __P = __Y & 0xFF;
  __L = (__Y >> 8) & 0xFF;
  return (_bextr_u32(__X, __P, __L));
}
|
||||
|
||||
/* BMI BLSI: isolate the lowest set bit of __X (0 if __X is 0).  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u32(unsigned int __X) {
  unsigned int __negx = 0U - __X;
  return __negx & __X;
}

/* Intel-style spelling of __blsi_u32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u32(unsigned int __X) {
  return __blsi_u32(__X);
}
|
||||
|
||||
/* BMI BLSMSK: mask of all bits up to and including the lowest set bit
   of __X (all-ones if __X is 0).  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u32(unsigned int __X) {
  unsigned int __below = __X - 1;
  return __below ^ __X;
}

/* Intel-style spelling of __blsmsk_u32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u32(unsigned int __X) {
  return __blsmsk_u32(__X);
}
|
||||
|
||||
/* BMI BLSR: clear the lowest set bit of __X.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u32(unsigned int __X) {
  unsigned int __below = __X - 1;
  return __X & __below;
}

/* Intel-style spelling of __blsr_u32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u32(unsigned int __X) {
  return __blsr_u32(__X);
}
|
||||
|
||||
/* BMI TZCNT (32-bit): count trailing zero bits of __X.
   Returns 32 when __X is 0, matching the x86 TZCNT instruction.
   NOTE(review): the previous body called __builtin_ctz(0) for a zero
   input, which is undefined behavior.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u32(unsigned int __X) {
  return __X == 0 ? 32 : __builtin_ctz(__X);
}

/* Intel-style spelling of __tzcnt_u32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u32(unsigned int __X) {
  return __tzcnt_u32(__X);
}
|
||||
|
||||
/* use the 64-bit shift, rotate, and count leading zeros instructions
|
||||
for long long. */
|
||||
#ifdef __PPC64__
|
||||
/* BMI ANDN (64-bit): the bits of __Y that are not set in __X.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u64(unsigned long long __X, unsigned long long __Y) {
  unsigned long long __notx = ~__X;
  return __Y & __notx;
}
|
||||
|
||||
/* BMI BEXTR (64-bit): extract __L bits of __X starting at bit __P.
   Matches x86 semantics: a zero length or a start index past bit 63
   yields 0; a length of 64 or more extracts everything from __P up.
   NOTE(review): the previous shift-based body
   ((__X << (64 - (__L + __P))) >> (64 - __L)) invoked undefined behavior
   for __L == 0 or __L + __P > 64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
  if (__P >= 64 || __L == 0)
    return 0;
  __X >>= __P;
  if (__L < 64)
    __X &= ((1ULL << __L) - 1);
  return __X;
}

/* GCC-style BEXTR: start index in bits 0-7 of __Y, length in bits 8-15.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u64(unsigned long long __X, unsigned long long __Y) {
  unsigned int __P, __L;
  __P = __Y & 0xFF;
  __L = (__Y & 0xFF00) >> 8;
  return (_bextr_u64(__X, __P, __L));
}
|
||||
|
||||
/* BMI BLSI (64-bit): isolate the lowest set bit of __X (0 if __X is 0).  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u64(unsigned long long __X) {
  unsigned long long __negx = 0ULL - __X;
  return __negx & __X;
}

/* Intel-style spelling of __blsi_u64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u64(unsigned long long __X) {
  return __blsi_u64(__X);
}
|
||||
|
||||
/* BMI BLSMSK (64-bit): mask of all bits up to and including the lowest
   set bit of __X (all-ones if __X is 0).  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u64(unsigned long long __X) {
  unsigned long long __below = __X - 1;
  return __below ^ __X;
}

/* Intel-style spelling of __blsmsk_u64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u64(unsigned long long __X) {
  return __blsmsk_u64(__X);
}
|
||||
|
||||
/* BMI BLSR (64-bit): clear the lowest set bit of __X.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u64(unsigned long long __X) {
  unsigned long long __below = __X - 1;
  return __X & __below;
}

/* Intel-style spelling of __blsr_u64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u64(unsigned long long __X) {
  return __blsr_u64(__X);
}
|
||||
|
||||
/* BMI TZCNT (64-bit): count trailing zero bits of __X.
   Returns 64 when __X is 0, matching the x86 TZCNT instruction.
   NOTE(review): the previous body called __builtin_ctzll(0) for a zero
   input, which is undefined behavior.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u64(unsigned long long __X) {
  return __X == 0 ? 64 : __builtin_ctzll(__X);
}

/* Intel-style spelling of __tzcnt_u64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u64(unsigned long long __X) {
  return __tzcnt_u64(__X);
}
|
||||
#endif /* __PPC64__ */
|
||||
|
||||
#endif /* BMIINTRIN_H_ */
|
|
@ -405,20 +405,10 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B)
|
|||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cmpord_pd (__m128d __A, __m128d __B)
|
||||
{
|
||||
#if _ARCH_PWR8
|
||||
__v2du c, d;
|
||||
/* Compare against self will return false (0's) if NAN. */
|
||||
c = (__v2du)vec_cmpeq (__A, __A);
|
||||
d = (__v2du)vec_cmpeq (__B, __B);
|
||||
#else
|
||||
__v2du a, b;
|
||||
__v2du c, d;
|
||||
const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
|
||||
a = (__v2du)vec_abs ((__v2df)__A);
|
||||
b = (__v2du)vec_abs ((__v2df)__B);
|
||||
c = (__v2du)vec_cmpgt (double_exp_mask, a);
|
||||
d = (__v2du)vec_cmpgt (double_exp_mask, b);
|
||||
#endif
|
||||
/* A != NAN and B != NAN. */
|
||||
return ((__m128d)vec_and(c, d));
|
||||
}
|
||||
|
@ -861,7 +851,11 @@ _mm_cvtpd_epi32 (__m128d __A)
|
|||
: );
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
temp = vec_mergeo (temp, temp);
|
||||
#else
|
||||
temp = vec_mergee (temp, temp);
|
||||
#endif
|
||||
result = (__v4si) vec_vpkudum ((__vector long long) temp,
|
||||
(__vector long long) vzero);
|
||||
#else
|
||||
|
@ -896,7 +890,11 @@ _mm_cvtpd_ps (__m128d __A)
|
|||
: );
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
temp = vec_mergeo (temp, temp);
|
||||
#else
|
||||
temp = vec_mergee (temp, temp);
|
||||
#endif
|
||||
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
|
||||
(__vector long long) vzero);
|
||||
#else
|
||||
|
@ -925,7 +923,11 @@ _mm_cvttpd_epi32 (__m128d __A)
|
|||
: );
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
temp = vec_mergeo (temp, temp);
|
||||
#else
|
||||
temp = vec_mergee (temp, temp);
|
||||
#endif
|
||||
result = (__v4si) vec_vpkudum ((__vector long long) temp,
|
||||
(__vector long long) vzero);
|
||||
#else
|
||||
|
@ -1205,6 +1207,9 @@ _mm_loadl_pd (__m128d __A, double const *__B)
|
|||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_movemask_pd (__m128d __A)
|
||||
{
|
||||
#ifdef _ARCH_PWR10
|
||||
return vec_extractm ((__v2du) __A);
|
||||
#else
|
||||
__vector unsigned long long result;
|
||||
static const __vector unsigned int perm_mask =
|
||||
{
|
||||
|
@ -1224,6 +1229,7 @@ _mm_movemask_pd (__m128d __A)
|
|||
#else
|
||||
return result[0];
|
||||
#endif
|
||||
#endif /* !_ARCH_PWR10 */
|
||||
}
|
||||
#endif /* _ARCH_PWR8 */
|
||||
|
||||
|
@ -1434,6 +1440,7 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
|
|||
return ((__m64)a * (__m64)b);
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mul_epu32 (__m128i __A, __m128i __B)
|
||||
{
|
||||
|
@ -1460,6 +1467,7 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
|
|||
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_slli_epi16 (__m128i __A, int __B)
|
||||
|
@ -1749,7 +1757,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B)
|
|||
lshift = vec_splat ((__v2du) __B, 0);
|
||||
shmask = vec_cmplt (lshift, shmax);
|
||||
result = vec_sl ((__v2du) __A, lshift);
|
||||
result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
|
||||
result = vec_sel ((__v2du) shmask, result, shmask);
|
||||
|
||||
return (__m128i) result;
|
||||
}
|
||||
|
@ -1843,7 +1851,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B)
|
|||
rshift = vec_splat ((__v2du) __B, 0);
|
||||
shmask = vec_cmplt (rshift, shmax);
|
||||
result = vec_sr ((__v2du) __A, rshift);
|
||||
result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
|
||||
result = vec_sel ((__v2du) shmask, result, shmask);
|
||||
|
||||
return (__m128i) result;
|
||||
}
|
||||
|
@ -1995,10 +2003,14 @@ _mm_min_epu8 (__m128i __A, __m128i __B)
|
|||
#ifdef _ARCH_PWR8
|
||||
/* Intrinsic functions that require PowerISA 2.07 minimum. */
|
||||
|
||||
/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
|
||||
/* Return a mask created from the most significant bit of each 8-bit
|
||||
element in A. */
|
||||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_movemask_epi8 (__m128i __A)
|
||||
{
|
||||
#ifdef _ARCH_PWR10
|
||||
return vec_extractm ((__v16qu) __A);
|
||||
#else
|
||||
__vector unsigned long long result;
|
||||
static const __vector unsigned char perm_mask =
|
||||
{
|
||||
|
@ -2015,6 +2027,7 @@ _mm_movemask_epi8 (__m128i __A)
|
|||
#else
|
||||
return result[0];
|
||||
#endif
|
||||
#endif /* !_ARCH_PWR10 */
|
||||
}
|
||||
#endif /* _ARCH_PWR8 */
|
||||
|
||||
|
@ -2158,27 +2171,37 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti
|
|||
_mm_sad_epu8 (__m128i __A, __m128i __B)
|
||||
{
|
||||
__v16qu a, b;
|
||||
__v16qu vmin, vmax, vabsdiff;
|
||||
__v16qu vabsdiff;
|
||||
__v4si vsum;
|
||||
const __v4su zero = { 0, 0, 0, 0 };
|
||||
__v4si result;
|
||||
|
||||
a = (__v16qu) __A;
|
||||
b = (__v16qu) __B;
|
||||
vmin = vec_min (a, b);
|
||||
vmax = vec_max (a, b);
|
||||
#ifndef _ARCH_PWR9
|
||||
__v16qu vmin = vec_min (a, b);
|
||||
__v16qu vmax = vec_max (a, b);
|
||||
vabsdiff = vec_sub (vmax, vmin);
|
||||
#else
|
||||
vabsdiff = vec_absd (a, b);
|
||||
#endif
|
||||
/* Sum four groups of bytes into integers. */
|
||||
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
/* Sum across four integers with two integer results. */
|
||||
asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero));
|
||||
/* Note: vec_sum2s could be used here, but on little-endian, vector
|
||||
shifts are added that are not needed for this use-case.
|
||||
A vector shift to correctly position the 32-bit integer results
|
||||
(currently at [0] and [2]) to [1] and [3] would then need to be
|
||||
swapped back again since the desired results are two 64-bit
|
||||
integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
|
||||
#else
|
||||
/* Sum across four integers with two integer results. */
|
||||
result = vec_sum2s (vsum, (__vector signed int) zero);
|
||||
/* Rotate the sums into the correct position. */
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
result = vec_sld (result, result, 4);
|
||||
#else
|
||||
result = vec_sld (result, result, 6);
|
||||
#endif
|
||||
/* Rotate the sums into the correct position. */
|
||||
return (__m128i) result;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef IMMINTRIN_H_
|
||||
#define IMMINTRIN_H_
|
||||
|
||||
#include <x86gprintrin.h>
|
||||
|
||||
#include <mmintrin.h>
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include <pmmintrin.h>
|
||||
|
||||
#include <tmmintrin.h>
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
#endif /* IMMINTRIN_H_ */
|
|
@ -0,0 +1,26 @@
|
|||
/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef NO_WARN_X86_INTRINSICS
|
||||
/* This header is distributed to simplify porting x86_64 code that
|
||||
makes explicit use of Intel intrinsics to powerpc64le.
|
||||
It is the user's responsibility to determine if the results are
|
||||
acceptable and make additional changes as necessary.
|
||||
Note that much code that uses Intel intrinsics can be rewritten in
|
||||
standard C or GNU C extensions, which are more portable and better
|
||||
optimized across multiple targets. */
|
||||
#endif
|
||||
|
||||
#ifndef NMMINTRIN_H_
|
||||
#define NMMINTRIN_H_
|
||||
|
||||
/* We just include SSE4.1 header file. */
|
||||
#include <smmintrin.h>
|
||||
|
||||
#endif /* NMMINTRIN_H_ */
|
|
@ -111,17 +111,21 @@ _mm_hsub_pd (__m128d __X, __m128d __Y)
|
|||
vec_mergel ((__v2df) __X, (__v2df)__Y));
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_movehdup_ps (__m128 __X)
|
||||
{
|
||||
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_moveldup_ps (__m128 __X)
|
||||
{
|
||||
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_loaddup_pd (double const *__P)
|
||||
|
|
|
@ -34,77 +34,683 @@
|
|||
#include <altivec.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
extern __inline int
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_epi8(__m128i __X, const int __N) {
|
||||
return (unsigned char)((__v16qi)__X)[__N & 15];
|
||||
/* Rounding mode macros. */
|
||||
#define _MM_FROUND_TO_NEAREST_INT 0x00
|
||||
#define _MM_FROUND_TO_ZERO 0x01
|
||||
#define _MM_FROUND_TO_POS_INF 0x02
|
||||
#define _MM_FROUND_TO_NEG_INF 0x03
|
||||
#define _MM_FROUND_CUR_DIRECTION 0x04
|
||||
|
||||
#define _MM_FROUND_NINT \
|
||||
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
|
||||
#define _MM_FROUND_FLOOR \
|
||||
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
|
||||
#define _MM_FROUND_CEIL \
|
||||
(_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
|
||||
#define _MM_FROUND_TRUNC \
|
||||
(_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
|
||||
#define _MM_FROUND_RINT \
|
||||
(_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
|
||||
#define _MM_FROUND_NEARBYINT \
|
||||
(_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
|
||||
|
||||
#define _MM_FROUND_RAISE_EXC 0x00
|
||||
#define _MM_FROUND_NO_EXC 0x08
|
||||
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_round_pd (__m128d __A, int __rounding)
|
||||
{
|
||||
__v2df __r;
|
||||
union {
|
||||
double __fr;
|
||||
long long __fpscr;
|
||||
} __enables_save, __fpscr_save;
|
||||
|
||||
if (__rounding & _MM_FROUND_NO_EXC)
|
||||
{
|
||||
/* Save enabled exceptions, disable all exceptions,
|
||||
and preserve the rounding mode. */
|
||||
#ifdef _ARCH_PWR9
|
||||
__asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
|
||||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||
#else
|
||||
__fpscr_save.__fr = __builtin_mffs ();
|
||||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||
__fpscr_save.__fpscr &= ~0xf8;
|
||||
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
|
||||
#endif
|
||||
/* Insert an artificial "read/write" reference to the variable
|
||||
read below, to ensure the compiler does not schedule
|
||||
a read/use of the variable before the FPSCR is modified, above.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : "+wa" (__A));
|
||||
}
|
||||
|
||||
switch (__rounding)
|
||||
{
|
||||
case _MM_FROUND_TO_NEAREST_INT:
|
||||
__fpscr_save.__fr = __builtin_mffsl ();
|
||||
__attribute__ ((fallthrough));
|
||||
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
||||
__builtin_set_fpscr_rn (0b00);
|
||||
/* Insert an artificial "read/write" reference to the variable
|
||||
read below, to ensure the compiler does not schedule
|
||||
a read/use of the variable before the FPSCR is modified, above.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : "+wa" (__A));
|
||||
|
||||
__r = vec_rint ((__v2df) __A);
|
||||
|
||||
/* Insert an artificial "read" reference to the variable written
|
||||
above, to ensure the compiler does not schedule the computation
|
||||
of the value after the manipulation of the FPSCR, below.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : : "wa" (__r));
|
||||
__builtin_set_fpscr_rn (__fpscr_save.__fpscr);
|
||||
break;
|
||||
case _MM_FROUND_TO_NEG_INF:
|
||||
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
||||
__r = vec_floor ((__v2df) __A);
|
||||
break;
|
||||
case _MM_FROUND_TO_POS_INF:
|
||||
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
||||
__r = vec_ceil ((__v2df) __A);
|
||||
break;
|
||||
case _MM_FROUND_TO_ZERO:
|
||||
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
||||
__r = vec_trunc ((__v2df) __A);
|
||||
break;
|
||||
case _MM_FROUND_CUR_DIRECTION:
|
||||
__r = vec_rint ((__v2df) __A);
|
||||
break;
|
||||
}
|
||||
if (__rounding & _MM_FROUND_NO_EXC)
|
||||
{
|
||||
/* Insert an artificial "read" reference to the variable written
|
||||
above, to ensure the compiler does not schedule the computation
|
||||
of the value after the manipulation of the FPSCR, below.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : : "wa" (__r));
|
||||
/* Restore enabled exceptions. */
|
||||
__fpscr_save.__fr = __builtin_mffsl ();
|
||||
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
||||
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
|
||||
}
|
||||
return (__m128d) __r;
|
||||
}
|
||||
|
||||
extern __inline int
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_epi32(__m128i __X, const int __N) {
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_round_sd (__m128d __A, __m128d __B, int __rounding)
|
||||
{
|
||||
__B = _mm_round_pd (__B, __rounding);
|
||||
__v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] };
|
||||
return (__m128d) __r;
|
||||
}
|
||||
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_round_ps (__m128 __A, int __rounding)
|
||||
{
|
||||
__v4sf __r;
|
||||
union {
|
||||
double __fr;
|
||||
long long __fpscr;
|
||||
} __enables_save, __fpscr_save;
|
||||
|
||||
if (__rounding & _MM_FROUND_NO_EXC)
|
||||
{
|
||||
/* Save enabled exceptions, disable all exceptions,
|
||||
and preserve the rounding mode. */
|
||||
#ifdef _ARCH_PWR9
|
||||
__asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
|
||||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||
#else
|
||||
__fpscr_save.__fr = __builtin_mffs ();
|
||||
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||
__fpscr_save.__fpscr &= ~0xf8;
|
||||
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
|
||||
#endif
|
||||
/* Insert an artificial "read/write" reference to the variable
|
||||
read below, to ensure the compiler does not schedule
|
||||
a read/use of the variable before the FPSCR is modified, above.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : "+wa" (__A));
|
||||
}
|
||||
|
||||
switch (__rounding)
|
||||
{
|
||||
case _MM_FROUND_TO_NEAREST_INT:
|
||||
__fpscr_save.__fr = __builtin_mffsl ();
|
||||
__attribute__ ((fallthrough));
|
||||
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
||||
__builtin_set_fpscr_rn (0b00);
|
||||
/* Insert an artificial "read/write" reference to the variable
|
||||
read below, to ensure the compiler does not schedule
|
||||
a read/use of the variable before the FPSCR is modified, above.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : "+wa" (__A));
|
||||
|
||||
__r = vec_rint ((__v4sf) __A);
|
||||
|
||||
/* Insert an artificial "read" reference to the variable written
|
||||
above, to ensure the compiler does not schedule the computation
|
||||
of the value after the manipulation of the FPSCR, below.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : : "wa" (__r));
|
||||
__builtin_set_fpscr_rn (__fpscr_save.__fpscr);
|
||||
break;
|
||||
case _MM_FROUND_TO_NEG_INF:
|
||||
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
||||
__r = vec_floor ((__v4sf) __A);
|
||||
break;
|
||||
case _MM_FROUND_TO_POS_INF:
|
||||
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
||||
__r = vec_ceil ((__v4sf) __A);
|
||||
break;
|
||||
case _MM_FROUND_TO_ZERO:
|
||||
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
||||
__r = vec_trunc ((__v4sf) __A);
|
||||
break;
|
||||
case _MM_FROUND_CUR_DIRECTION:
|
||||
__r = vec_rint ((__v4sf) __A);
|
||||
break;
|
||||
}
|
||||
if (__rounding & _MM_FROUND_NO_EXC)
|
||||
{
|
||||
/* Insert an artificial "read" reference to the variable written
|
||||
above, to ensure the compiler does not schedule the computation
|
||||
of the value after the manipulation of the FPSCR, below.
|
||||
This can be removed if and when GCC PR102783 is fixed.
|
||||
*/
|
||||
__asm__ ("" : : "wa" (__r));
|
||||
/* Restore enabled exceptions. */
|
||||
__fpscr_save.__fr = __builtin_mffsl ();
|
||||
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
||||
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
|
||||
}
|
||||
return (__m128) __r;
|
||||
}
|
||||
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_round_ss (__m128 __A, __m128 __B, int __rounding)
|
||||
{
|
||||
__B = _mm_round_ps (__B, __rounding);
|
||||
__v4sf __r = (__v4sf) __A;
|
||||
__r[0] = ((__v4sf) __B)[0];
|
||||
return (__m128) __r;
|
||||
}
|
||||
|
||||
#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
|
||||
#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
|
||||
|
||||
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
|
||||
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
|
||||
|
||||
#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
|
||||
#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
|
||||
|
||||
#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
|
||||
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
|
||||
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
|
||||
{
|
||||
__v16qi result = (__v16qi)__A;
|
||||
|
||||
result [__N & 0xf] = __D;
|
||||
|
||||
return (__m128i) result;
|
||||
}
|
||||
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
|
||||
{
|
||||
__v4si result = (__v4si)__A;
|
||||
|
||||
result [__N & 3] = __D;
|
||||
|
||||
return (__m128i) result;
|
||||
}
|
||||
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
|
||||
{
|
||||
__v2di result = (__v2di)__A;
|
||||
|
||||
result [__N & 1] = __D;
|
||||
|
||||
return (__m128i) result;
|
||||
}
|
||||
|
||||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_epi8 (__m128i __X, const int __N)
|
||||
{
|
||||
return (unsigned char) ((__v16qi)__X)[__N & 15];
|
||||
}
|
||||
|
||||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_epi32 (__m128i __X, const int __N)
|
||||
{
|
||||
return ((__v4si)__X)[__N & 3];
|
||||
}
|
||||
|
||||
extern __inline int
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_epi64(__m128i __X, const int __N) {
|
||||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_epi64 (__m128i __X, const int __N)
|
||||
{
|
||||
return ((__v2di)__X)[__N & 1];
|
||||
}
|
||||
|
||||
extern __inline int
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_ps(__m128 __X, const int __N) {
|
||||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_ps (__m128 __X, const int __N)
|
||||
{
|
||||
return ((__v4si)__X)[__N & 3];
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
|
||||
__v16qi __charmask = vec_splats((signed char)__imm8);
|
||||
__charmask = vec_gb(__charmask);
|
||||
__v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
|
||||
#ifdef __BIG_ENDIAN__
|
||||
__shortmask = vec_reve(__shortmask);
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
|
||||
{
|
||||
__v16qi __charmask = vec_splats ((signed char) __imm8);
|
||||
__charmask = vec_gb (__charmask);
|
||||
__v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
|
||||
#ifdef __BIG_ENDIAN__
|
||||
__shortmask = vec_reve (__shortmask);
|
||||
#endif
|
||||
return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
|
||||
}
|
||||
#endif
|
||||
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
|
||||
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
|
||||
{
|
||||
#ifdef _ARCH_PWR10
|
||||
return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask);
|
||||
#else
|
||||
const __v16qu __seven = vec_splats ((unsigned char) 0x07);
|
||||
__v16qu __lmask = vec_sra ((__v16qu) __mask, __seven);
|
||||
return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
|
||||
{
|
||||
__v16qu __pcv[] =
|
||||
{
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
|
||||
{ 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
|
||||
{ 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
|
||||
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
|
||||
{ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
|
||||
{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
|
||||
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
|
||||
{ 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
|
||||
{ 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
|
||||
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
|
||||
};
|
||||
__v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
|
||||
return (__m128) __r;
|
||||
}
|
||||
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
|
||||
{
|
||||
#ifdef _ARCH_PWR10
|
||||
return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask);
|
||||
#else
|
||||
const __v4si __zero = {0};
|
||||
const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
|
||||
return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
|
||||
{
|
||||
__v16qu __pcv[] =
|
||||
{
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
|
||||
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
|
||||
};
|
||||
__v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
|
||||
return (__m128d) __r;
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
|
||||
{
|
||||
#ifdef _ARCH_PWR10
|
||||
return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask);
|
||||
#else
|
||||
const __v2di __zero = {0};
|
||||
const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
|
||||
return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline int
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_testz_si128 (__m128i __A, __m128i __B)
|
||||
{
|
||||
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||
const __v16qu __zero = {0};
|
||||
return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero);
|
||||
}
|
||||
|
||||
extern __inline int
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_testc_si128 (__m128i __A, __m128i __B)
|
||||
{
|
||||
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||
const __v16qu __zero = {0};
|
||||
const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A);
|
||||
return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero);
|
||||
}
|
||||
|
||||
extern __inline int
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_testnzc_si128 (__m128i __A, __m128i __B)
|
||||
{
|
||||
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||
return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0;
|
||||
}
|
||||
|
||||
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
|
||||
|
||||
#define _mm_test_all_ones(V) \
|
||||
_mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
|
||||
|
||||
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_min_epi8 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
|
||||
const __v16qu __seven = vec_splats((unsigned char)0x07);
|
||||
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
|
||||
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_min_epu16 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
||||
__v16qi result = (__v16qi)__A;
|
||||
result[__N & 0xf] = __D;
|
||||
return (__m128i)result;
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_min_epi32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
||||
__v4si result = (__v4si)__A;
|
||||
result[__N & 3] = __D;
|
||||
return (__m128i)result;
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_min_epu32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
||||
__v2di result = (__v2di)__A;
|
||||
result[__N & 1] = __D;
|
||||
return (__m128i)result;
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_max_epi8 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_max_epu16 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_max_epi32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_max_epu32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mul_epi32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepi8_epi16 (__m128i __A)
|
||||
{
|
||||
return (__m128i) vec_unpackh ((__v16qi) __A);
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepi8_epi32 (__m128i __A)
|
||||
{
|
||||
__A = (__m128i) vec_unpackh ((__v16qi) __A);
|
||||
return (__m128i) vec_unpackh ((__v8hi) __A);
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepi8_epi64 (__m128i __A)
|
||||
{
|
||||
__A = (__m128i) vec_unpackh ((__v16qi) __A);
|
||||
__A = (__m128i) vec_unpackh ((__v8hi) __A);
|
||||
return (__m128i) vec_unpackh ((__v4si) __A);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepi16_epi32 (__m128i __A)
|
||||
{
|
||||
return (__m128i) vec_unpackh ((__v8hi) __A);
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepi16_epi64 (__m128i __A)
|
||||
{
|
||||
__A = (__m128i) vec_unpackh ((__v8hi) __A);
|
||||
return (__m128i) vec_unpackh ((__v4si) __A);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepi32_epi64 (__m128i __A)
|
||||
{
|
||||
return (__m128i) vec_unpackh ((__v4si) __A);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepu8_epi16 (__m128i __A)
|
||||
{
|
||||
const __v16qu __zero = {0};
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
|
||||
#else /* __BIG_ENDIAN__. */
|
||||
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
|
||||
#endif /* __BIG_ENDIAN__. */
|
||||
return __A;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepu8_epi32 (__m128i __A)
|
||||
{
|
||||
const __v16qu __zero = {0};
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
|
||||
__A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
|
||||
#else /* __BIG_ENDIAN__. */
|
||||
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
|
||||
__A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
|
||||
#endif /* __BIG_ENDIAN__. */
|
||||
return __A;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepu8_epi64 (__m128i __A)
|
||||
{
|
||||
const __v16qu __zero = {0};
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
|
||||
__A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
|
||||
__A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
|
||||
#else /* __BIG_ENDIAN__. */
|
||||
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
|
||||
__A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
|
||||
__A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
|
||||
#endif /* __BIG_ENDIAN__. */
|
||||
return __A;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepu16_epi32 (__m128i __A)
|
||||
{
|
||||
const __v8hu __zero = {0};
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
|
||||
#else /* __BIG_ENDIAN__. */
|
||||
__A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
|
||||
#endif /* __BIG_ENDIAN__. */
|
||||
return __A;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepu16_epi64 (__m128i __A)
|
||||
{
|
||||
const __v8hu __zero = {0};
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
|
||||
__A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
|
||||
#else /* __BIG_ENDIAN__. */
|
||||
__A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
|
||||
__A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
|
||||
#endif /* __BIG_ENDIAN__. */
|
||||
return __A;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtepu32_epi64 (__m128i __A)
|
||||
{
|
||||
const __v4su __zero = {0};
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__A = (__m128i) vec_mergeh ((__v4su) __A, __zero);
|
||||
#else /* __BIG_ENDIAN__. */
|
||||
__A = (__m128i) vec_mergeh (__zero, (__v4su) __A);
|
||||
#endif /* __BIG_ENDIAN__. */
|
||||
return __A;
|
||||
}
|
||||
|
||||
/* Return horizontal packed word minimum and its index in bits [15:0]
|
||||
and bits [18:16] respectively. */
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_minpos_epu16 (__m128i __A)
|
||||
{
|
||||
union __u
|
||||
{
|
||||
__m128i __m;
|
||||
__v8hu __uh;
|
||||
};
|
||||
union __u __u = { .__m = __A }, __r = { .__m = {0} };
|
||||
unsigned short __ridx = 0;
|
||||
unsigned short __rmin = __u.__uh[__ridx];
|
||||
for (unsigned long __i = 1; __i < 8; __i++)
|
||||
{
|
||||
if (__u.__uh[__i] < __rmin)
|
||||
{
|
||||
__rmin = __u.__uh[__i];
|
||||
__ridx = __i;
|
||||
}
|
||||
}
|
||||
__r.__uh[0] = __rmin;
|
||||
__r.__uh[1] = __ridx;
|
||||
return __r.__m;
|
||||
}
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_packus_epi32 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
#include_next <smmintrin.h>
|
||||
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
||||
*/
|
||||
|
||||
#endif /* _SMMINTRIN_H_ */
|
||||
#endif /* SMMINTRIN_H_ */
|
||||
|
|
|
@ -339,6 +339,7 @@ _mm_shuffle_pi8 (__m64 __A, __m64 __B)
|
|||
return (__m64) ((__v2du) (__C))[0];
|
||||
}
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sign_epi8 (__m128i __A, __m128i __B)
|
||||
|
@ -350,7 +351,9 @@ _mm_sign_epi8 (__m128i __A, __m128i __B)
|
|||
__v16qi __conv = vec_add (__selectneg, __selectpos);
|
||||
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sign_epi16 (__m128i __A, __m128i __B)
|
||||
|
@ -362,7 +365,9 @@ _mm_sign_epi16 (__m128i __A, __m128i __B)
|
|||
__v8hi __conv = vec_add (__selectneg, __selectpos);
|
||||
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sign_epi32 (__m128i __A, __m128i __B)
|
||||
|
@ -374,7 +379,9 @@ _mm_sign_epi32 (__m128i __A, __m128i __B)
|
|||
__v4si __conv = vec_add (__selectneg, __selectpos);
|
||||
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m64
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sign_pi8 (__m64 __A, __m64 __B)
|
||||
|
@ -385,7 +392,9 @@ _mm_sign_pi8 (__m64 __A, __m64 __B)
|
|||
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
|
||||
return (__m64) ((__v2du) (__C))[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m64
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sign_pi16 (__m64 __A, __m64 __B)
|
||||
|
@ -396,7 +405,9 @@ _mm_sign_pi16 (__m64 __A, __m64 __B)
|
|||
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
|
||||
return (__m64) ((__v2du) (__C))[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef _ARCH_PWR8
|
||||
extern __inline __m64
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sign_pi32 (__m64 __A, __m64 __B)
|
||||
|
@ -407,6 +418,7 @@ _mm_sign_pi32 (__m64 __A, __m64 __B)
|
|||
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
|
||||
return (__m64) ((__v2du) (__C))[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef X86GPRINTRIN_H_
|
||||
#define X86GPRINTRIN_H_
|
||||
|
||||
#include <bmiintrin.h>
|
||||
|
||||
#include <bmi2intrin.h>
|
||||
|
||||
#endif /* X86GPRINTRIN_H_ */
|
|
@ -0,0 +1,28 @@
|
|||
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef NO_WARN_X86_INTRINSICS
|
||||
/* This header is distributed to simplify porting x86_64 code that
|
||||
makes explicit use of Intel intrinsics to powerpc64le.
|
||||
It is the user's responsibility to determine if the results are
|
||||
acceptable and make additional changes as necessary.
|
||||
Note that much code that uses Intel intrinsics can be rewritten in
|
||||
standard C or GNU C extensions, which are more portable and better
|
||||
optimized across multiple targets. */
|
||||
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
|
||||
#endif
|
||||
|
||||
#ifndef X86INTRIN_H_
|
||||
#define X86INTRIN_H_
|
||||
|
||||
#ifdef __ALTIVEC__
|
||||
#include <immintrin.h>
|
||||
#endif /* __ALTIVEC__ */
|
||||
|
||||
#endif /* X86INTRIN_H_ */
|
|
@ -31,8 +31,8 @@
|
|||
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
|
||||
#endif
|
||||
|
||||
#ifndef _XMMINTRIN_H_INCLUDED
|
||||
#define _XMMINTRIN_H_INCLUDED
|
||||
#ifndef XMMINTRIN_H_
|
||||
#define XMMINTRIN_H_
|
||||
|
||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
||||
|
||||
|
@ -881,7 +881,7 @@ _mm_cvtss_f32 (__m128 __A)
|
|||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtss_si32 (__m128 __A)
|
||||
{
|
||||
__m64 res = 0;
|
||||
int res;
|
||||
#ifdef _ARCH_PWR8
|
||||
double dtmp;
|
||||
__asm__(
|
||||
|
@ -914,8 +914,8 @@ _mm_cvt_ss2si (__m128 __A)
|
|||
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtss_si64 (__m128 __A)
|
||||
{
|
||||
__m64 res = 0;
|
||||
#ifdef _ARCH_PWR8
|
||||
long long res;
|
||||
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
|
||||
double dtmp;
|
||||
__asm__(
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
|
@ -1328,6 +1328,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A)
|
|||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_movemask_ps (__m128 __A)
|
||||
{
|
||||
#ifdef _ARCH_PWR10
|
||||
return vec_extractm ((__vector unsigned int) __A);
|
||||
#else
|
||||
__vector unsigned long long result;
|
||||
static const __vector unsigned int perm_mask =
|
||||
{
|
||||
|
@ -1347,6 +1350,7 @@ _mm_movemask_ps (__m128 __A)
|
|||
#else
|
||||
return result[0];
|
||||
#endif
|
||||
#endif /* !_ARCH_PWR10 */
|
||||
}
|
||||
#endif /* _ARCH_PWR8 */
|
||||
|
||||
|
@ -1553,6 +1557,7 @@ _m_pminub (__m64 __A, __m64 __B)
|
|||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_movemask_pi8 (__m64 __A)
|
||||
{
|
||||
#ifdef __powerpc64__
|
||||
unsigned long long p =
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
0x0008101820283038UL; // permute control for sign bits
|
||||
|
@ -1560,6 +1565,18 @@ _mm_movemask_pi8 (__m64 __A)
|
|||
0x3830282018100800UL; // permute control for sign bits
|
||||
#endif
|
||||
return __builtin_bpermd (p, __A);
|
||||
#else
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
unsigned int mask = 0x20283038UL;
|
||||
unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
|
||||
unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
|
||||
#else
|
||||
unsigned int mask = 0x38302820UL;
|
||||
unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
|
||||
unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
|
||||
#endif
|
||||
return (r2 << 4) | r1;
|
||||
#endif
|
||||
}
|
||||
|
||||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -1841,4 +1858,4 @@ do { \
|
|||
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
||||
*/
|
||||
|
||||
#endif /* _XMMINTRIN_H_INCLUDED */
|
||||
#endif /* XMMINTRIN_H_ */
|
||||
|
|
|
@ -5,6 +5,9 @@
|
|||
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE
|
||||
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE
|
||||
|
||||
// CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
|
||||
// CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
|
||||
// CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
|
||||
|
@ -440,7 +443,8 @@ test_converts() {
|
|||
// CHECK: call <2 x double> @vec_rint(double vector[2])
|
||||
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
|
||||
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
|
||||
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
|
||||
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
|
||||
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_mm_cvtpd_pi32
|
||||
|
@ -450,7 +454,8 @@ test_converts() {
|
|||
// CHECK-LABEL: define available_externally <4 x float> @_mm_cvtpd_ps
|
||||
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
|
||||
// CHECK: call <4 x i32> asm "xvcvdpsp ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
|
||||
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
|
||||
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
|
||||
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x double> @_mm_cvtpi32_pd
|
||||
|
@ -530,7 +535,8 @@ test_converts() {
|
|||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvttpd_epi32
|
||||
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"
|
||||
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
|
||||
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
|
||||
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
|
||||
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_mm_cvttpd_pi32
|
||||
|
@ -756,12 +762,18 @@ test_move() {
|
|||
// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
|
||||
// CHECK: insertelement <2 x double> %{{[0-9a-zA-Z_.]+}}, double %[[EXT]], i32 0
|
||||
|
||||
// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_epi8
|
||||
// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_epi8
|
||||
// CHECK: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef <i8 120, i8 112, i8 104, i8 96, i8 88, i8 80, i8 72, i8 64, i8 56, i8 48, i8 40, i8 32, i8 24, i8 16, i8 8, i8 0>)
|
||||
// CHECK-LE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
|
||||
// CHECK-BE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
|
||||
// CHECK: trunc i64 %[[VAL]] to i32
|
||||
|
||||
// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_pd
|
||||
// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_pd
|
||||
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
|
||||
// CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
|
||||
|
@ -857,9 +869,9 @@ test_sad() {
|
|||
// CHECK: call <16 x i8> @vec_max(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call <16 x i8> @vec_sub(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call <4 x i32> @vec_sum4s(unsigned char vector[16], unsigned int vector[4])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
|
||||
// CHECK: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
|
||||
// CHECK-LE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 4)
|
||||
// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 6)
|
||||
// CHECK-LE: call <4 x i32> asm "vsum2sws $0,$1,$2", "=v,v,v"(<4 x i32> %11, <4 x i32> zeroinitializer)
|
||||
// CHECK-BE: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
|
||||
// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_set() {
|
||||
|
@ -1086,7 +1098,7 @@ test_sll() {
|
|||
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef zeroext 0)
|
||||
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, <2 x i64> noundef <i64 64, i64 64>)
|
||||
// CHECK: call <2 x i64> @vec_sl(unsigned long long vector[2], unsigned long long vector[2])
|
||||
// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
|
||||
// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_slli_epi16
|
||||
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
|
||||
|
@ -1232,7 +1244,7 @@ test_srl() {
|
|||
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 0)
|
||||
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef <i64 64, i64 64>)
|
||||
// CHECK: call <2 x i64> @vec_sr(unsigned long long vector[2], unsigned long long vector[2])
|
||||
// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
|
||||
// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_srli_epi16
|
||||
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
|
||||
|
|
|
@ -10,6 +10,11 @@
|
|||
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
|
||||
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
__m128 mn1, mn2;
|
||||
|
@ -48,6 +53,10 @@ void __attribute__((noinline))
|
|||
test_blend() {
|
||||
_mm_blend_epi16(m1, m2, 0);
|
||||
_mm_blendv_epi8(m1, m2, mi);
|
||||
_mm_blend_ps(mn1, mn2, 0);
|
||||
_mm_blendv_ps(mn1, mn2, mn1);
|
||||
_mm_blend_pd(md1, md2, 0);
|
||||
_mm_blendv_pd(md1, md2, md1);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_blend
|
||||
|
@ -62,10 +71,13 @@ test_blend() {
|
|||
// BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
|
||||
// CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])
|
||||
|
||||
// P10-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// P10: call <16 x i8> @vec_blendv(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7)
|
||||
// CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16])
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_insert() {
|
||||
|
@ -89,6 +101,239 @@ test_insert() {
|
|||
// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1
|
||||
// CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]]
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_convert() {
|
||||
_mm_cvtepi16_epi32(m1);
|
||||
_mm_cvtepi16_epi64(m1);
|
||||
_mm_cvtepi32_epi64(m1);
|
||||
_mm_cvtepi8_epi16(m1);
|
||||
_mm_cvtepi8_epi32(m1);
|
||||
_mm_cvtepi8_epi64(m1);
|
||||
_mm_cvtepu16_epi32(m1);
|
||||
_mm_cvtepu16_epi64(m1);
|
||||
_mm_cvtepu32_epi64(m1);
|
||||
_mm_cvtepu8_epi16(m1);
|
||||
_mm_cvtepu8_epi32(m1);
|
||||
_mm_cvtepu8_epi64(m1);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_convert
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
|
||||
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
|
||||
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
|
||||
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
|
||||
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
|
||||
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
|
||||
// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
|
||||
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
|
||||
// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
|
||||
// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
|
||||
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
|
||||
// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])
|
||||
// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_minmax() {
|
||||
_mm_max_epi32(m1, m2);
|
||||
_mm_max_epi8(m1, m2);
|
||||
_mm_max_epu16(m1, m2);
|
||||
_mm_max_epu32(m1, m2);
|
||||
_mm_min_epi32(m1, m2);
|
||||
_mm_min_epi8(m1, m2);
|
||||
_mm_min_epu16(m1, m2);
|
||||
_mm_min_epu32(m1, m2);
|
||||
_mm_minpos_epu16(m1);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_minmax
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false)
|
||||
// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}}
|
||||
// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
|
||||
// CHECK: zext i16 %[[VEXT]] to i32
|
||||
// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32
|
||||
// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
|
||||
// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_round() {
|
||||
_mm_round_ps(mn1, 0);
|
||||
_mm_round_ss(mn1, mn2, 0);
|
||||
_mm_round_pd(mn1, 0);
|
||||
_mm_round_sd(mn1, mn2, 0);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_round
|
||||
|
||||
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
|
||||
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
|
||||
// CHECK: call <4 x float> @vec_rint(float vector[4])
|
||||
// CHECK: call void asm sideeffect "", "^wa"
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x float> @vec_floor(float vector[4])
|
||||
// CHECK: call <4 x float> @vec_ceil(float vector[4])
|
||||
// CHECK: call <4 x float> @vec_trunc(float vector[4])
|
||||
// CHECK: call <4 x float> @vec_rint(float vector[4])
|
||||
// CHECK: call void asm sideeffect "", "^wa"
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
|
||||
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
|
||||
// CHECK: call <2 x double> @vec_rint(double vector[2])
|
||||
// CHECK: call void asm sideeffect "", "^wa"
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <2 x double> @vec_floor(double vector[2])
|
||||
// CHECK: call <2 x double> @vec_ceil(double vector[2])
|
||||
// CHECK: call <2 x double> @vec_trunc(double vector[2])
|
||||
// CHECK: call <2 x double> @vec_rint(double vector[2])
|
||||
// CHECK: call void asm sideeffect "", "^wa"
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
|
||||
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
|
||||
// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_testing() {
|
||||
_mm_testc_si128(m1, m2);
|
||||
_mm_testnzc_si128(m1, m2);
|
||||
_mm_testz_si128(m1, m2);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_testing
|
||||
|
||||
// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer)
|
||||
|
||||
// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32
|
||||
|
||||
// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
|
||||
// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer)
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_compare() {
|
||||
_mm_cmpeq_epi64(m1, m2);
|
||||
_mm_cmpgt_epi64(m1, m2);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_compare
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2])
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_mul() {
|
||||
_mm_mul_epi32(m1, m2);
|
||||
_mm_mullo_epi32(m1, m2);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_mul
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4])
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4])
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_packus() {
|
||||
_mm_packus_epi32(m1, m2);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_packus
|
||||
|
||||
// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3)
|
||||
|
||||
// To test smmintrin.h includes tmmintrin.h
|
||||
|
||||
void __attribute__((noinline))
|
||||
|
|
|
@ -0,0 +1,239 @@
|
|||
// REQUIRES: powerpc-registered-target
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
|
||||
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
|
||||
|
||||
#include <x86gprintrin.h>
|
||||
|
||||
unsigned short us;
|
||||
unsigned ui;
|
||||
unsigned long long ul;
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_bmiintrin() {
|
||||
__tzcnt_u16(us);
|
||||
__andn_u32(ui, ui);
|
||||
_bextr_u32(ui, ui, ui);
|
||||
__bextr_u32(ui, ui);
|
||||
__blsi_u32(ui);
|
||||
_blsi_u32(ui);
|
||||
__blsmsk_u32(ui);
|
||||
_blsmsk_u32(ui);
|
||||
__blsr_u32(ui);
|
||||
_blsr_u32(ui);
|
||||
__tzcnt_u32(ui);
|
||||
_tzcnt_u32(ui);
|
||||
__andn_u64(ul, ul);
|
||||
_bextr_u64(ul, ui, ui);
|
||||
__bextr_u64(ul, ul);
|
||||
__blsi_u64(ul);
|
||||
_blsi_u64(ul);
|
||||
__blsmsk_u64(ul);
|
||||
_blsmsk_u64(ul);
|
||||
__blsr_u64(ul);
|
||||
_blsr_u64(ul);
|
||||
__tzcnt_u64(ul);
|
||||
_tzcnt_u64(ul);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_bmiintrin
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false)
|
||||
// CHECK: trunc i32 %[[CALL]] to i16
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1
|
||||
// CHECK: and i32 %[[NEG]], %1
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]]
|
||||
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
|
||||
// CHECK: %[[SUB]]1 = sub i32 32, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: lshr i32 %[[SHL]], %[[SUB]]1
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255
|
||||
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8
|
||||
// CHECK: and i32 %[[SHR]], 255
|
||||
// CHECK: call zeroext i32 @_bextr_u32
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %1
|
||||
// CHECK: and i32 %0, %[[SUB]]
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call zeroext i32 @__blsi_u32
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
|
||||
// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call zeroext i32 @__blsmsk_u32
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
|
||||
// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call zeroext i32 @__blsr_u32
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1
|
||||
// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}}
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]]
|
||||
// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64
|
||||
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]]
|
||||
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64
|
||||
// CHECK: lshr i64 %[[SHL]], %[[EXT2]]
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255
|
||||
// CHECK: trunc i64 %[[AND]] to i32
|
||||
// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280
|
||||
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8
|
||||
// CHECK: trunc i64 %[[SHR]] to i32
|
||||
// CHECK: call i64 @_bextr_u64
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: and i64 %0, %[[SUB]]
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i64 @__blsi_u64
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
|
||||
// CHECK: xor i64 %0, %[[SUB]]
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i64 @__blsmsk_u64
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
|
||||
// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i64 @__blsr_u64
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
|
||||
// CHECK: sext i32 %[[CAST]] to i64
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
|
||||
// CHECK: sext i32 %[[CAST]] to i64
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_bmi2intrin() {
|
||||
_bzhi_u32(ui, ui);
|
||||
_mulx_u32(ui, ui, &ui);
|
||||
_bzhi_u64(ul, ul);
|
||||
_mulx_u64(ul, ul, &ul);
|
||||
_pdep_u64(ul, ul);
|
||||
_pext_u64(ul, ul);
|
||||
_pdep_u32(ui, ui);
|
||||
_pext_u32(ui, ui);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_bmi2intrin
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
|
||||
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
|
||||
// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
|
||||
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32
|
||||
// CHECK: trunc i64 %[[SHR]] to i32
|
||||
// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
|
||||
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: lshr i64 %[[SHL]], %[[SUB1]]
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
|
||||
// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
|
||||
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64
|
||||
// CHECK: trunc i128 %[[SHR]] to i64
|
||||
// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]]
|
||||
// CHECK: sext i32 %[[SUB]] to i64
|
||||
// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32
|
||||
// CHECK: sext i32 %[[CAST2]] to i64
|
||||
// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]]
|
||||
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
|
||||
// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
|
||||
// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]]
|
||||
// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]]
|
||||
// CHECK: [[TRUECOND]]:
|
||||
// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8
|
||||
// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
|
||||
// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1
|
||||
// CHECK: call i64 @llvm.ppc.bpermd
|
||||
// CHECK: [[FALSECOND]]:
|
||||
// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
|
||||
// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
|
||||
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: lshr i64 %[[AND]], %[[SUB]]
|
||||
// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]]
|
||||
// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
|
||||
// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
|
||||
// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
|
||||
// CHECK: trunc i64 %[[CALL]] to i32
|
||||
|
||||
// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
|
||||
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
|
||||
// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
|
||||
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
|
||||
// CHECK: trunc i64 %[[CALL]] to i32
|
|
@ -9,6 +9,9 @@
|
|||
// RUN: %clang -x c++ -fsyntax-only -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns
|
||||
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-P10
|
||||
|
||||
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
|
||||
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
|
||||
// RUN: %clang -x c++ -fsyntax-only -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
|
||||
|
@ -383,12 +386,12 @@ test_convert() {
|
|||
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
|
||||
|
||||
// CHECK-LABEL: define available_externally signext i32 @_mm_cvtss_si32
|
||||
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
|
||||
// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
|
||||
// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 0
|
||||
// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 1
|
||||
// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 2
|
||||
// CHECK: trunc i64 %{{[0-9a-zA-Z_.]+}} to i32
|
||||
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
|
||||
// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
|
||||
// CHECK-P10: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
|
||||
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 0
|
||||
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 1
|
||||
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 2
|
||||
|
||||
// CHECK-LABEL: define available_externally i64 @_mm_cvtss_si64
|
||||
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctid $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
|
||||
|
@ -681,9 +684,11 @@ test_move() {
|
|||
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_ps
|
||||
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 2113632, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
|
||||
// CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
|
||||
// CHECK-LE: trunc i64 %[[EXT]] to i32
|
||||
// CHECK-BE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 2113632> to <16 x i8>))
|
||||
// CHECK-BE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
|
||||
// CHECK: trunc i64 %[[EXT]] to i32
|
||||
// CHECK-BE: trunc i64 %[[EXT]] to i32
|
||||
// CHECK-P10: call zeroext i32 @vec_extractm(unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
|
||||
|
||||
void __attribute__((noinline))
|
||||
test_alt_name_move() {
|
||||
|
|
Loading…
Reference in New Issue