[PowerPC] [Clang] Add SSE4 and BMI intrinsics implementation

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D119407
This commit is contained in:
Qiu Chaofan 2022-03-24 20:03:08 +08:00
parent b3fbbabdc1
commit 406bde9a15
16 changed files with 1650 additions and 85 deletions

View File

@ -164,6 +164,12 @@ set(ppc_wrapper_files
ppc_wrappers/pmmintrin.h
ppc_wrappers/tmmintrin.h
ppc_wrappers/smmintrin.h
ppc_wrappers/bmiintrin.h
ppc_wrappers/bmi2intrin.h
ppc_wrappers/immintrin.h
ppc_wrappers/nmmintrin.h
ppc_wrappers/x86intrin.h
ppc_wrappers/x86gprintrin.h
)
set(openmp_wrapper_files

View File

@ -0,0 +1,133 @@
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32(unsigned int __X, unsigned int __Y) {
return ((__X << (32 - __Y)) >> (32 - __Y));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
unsigned long long __res = (unsigned long long)__X * __Y;
*__P = (unsigned int)(__res >> 32);
return (unsigned int)__res;
}
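/* Usage sketch (standalone, not part of the header): _mulx_u32 returns
   the low half of the widening 32x32->64 product and stores the high
   half through __P.  Values and the helper name are illustrative. */
#include <assert.h>
void check_mulx_u32(void) {
  unsigned int hi;
  /* 0xFFFFFFFF * 2 == 0x1FFFFFFFE: low word 0xFFFFFFFE, high word 1. */
  unsigned int lo = _mulx_u32(0xFFFFFFFFu, 2u, &hi);
  assert(lo == 0xFFFFFFFEu && hi == 1u);
}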
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
return ((__X << (64 - __Y)) >> (64 - __Y));
}
/* __int128 requires the 64-bit base architecture. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64(unsigned long long __X, unsigned long long __Y,
unsigned long long *__P) {
unsigned __int128 __res = (unsigned __int128)__X * __Y;
*__P = (unsigned long long)(__res >> 64);
return (unsigned long long)__res;
}
#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64(unsigned long long __X, unsigned long long __M) {
unsigned long result = 0x0UL;
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c, t;
unsigned long p;
/* The pop-count of the mask gives the number of bits from the
source to process. This is also needed to shift bits from the
source into the correct position for the result. */
p = 64 - __builtin_popcountl(__M);
/* The loop iterates once per '1' bit in the mask, clearing
each mask bit as it is processed. */
while (m != 0) {
c = __builtin_clzl(m);
t = __X << (p - c);
m ^= (mask >> c);
result |= (t & (mask >> c));
p++;
}
return (result);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64(unsigned long long __X, unsigned long long __M) {
unsigned long p = 0x4040404040404040UL; // initial bit permute control
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c;
unsigned long result;
/* If the mask is constant and selects 8 bits or fewer, we can use
the Power8 bit-permute (bpermd) instruction. */
if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
/* If the pext mask is constant, the popcount is also constant,
so we can evaluate the following loop at compile time and
use a constant bit-permute vector. */
for (long i = 0; i < __builtin_popcountl(__M); i++) {
c = __builtin_clzl(m);
p = (p << 8) | c;
m ^= (mask >> c);
}
result = __builtin_bpermd(p, __X);
} else {
p = 64 - __builtin_popcountl(__M);
result = 0;
/* We could use a for loop here, but that combined with
-funroll-loops can expand to a lot of code. The while
loop avoids unrolling, and the compiler commons the xor
from clearing the mask bit with the (m != 0) test. The
result is a more compact loop setup and body. */
while (m != 0) {
unsigned long t;
c = __builtin_clzl(m);
t = (__X & (mask >> c)) >> (p - c);
m ^= (mask >> c);
result |= (t);
p++;
}
}
return (result);
}
/* These 32-bit implementations depend on the 64-bit pdep/pext
above, which require _ARCH_PWR7. */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32(unsigned int __X, unsigned int __Y) {
return _pdep_u64(__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32(unsigned int __X, unsigned int __Y) {
return _pext_u64(__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */
#endif /* BMI2INTRIN_H_ */
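/* A minimal self-check of the pdep/pext contract implemented above,
   assuming a 64-bit Power7+ target where these functions are defined.
   Mask, source value, and helper name are illustrative. */
#include <assert.h>
#include <x86gprintrin.h>
void check_pdep_pext(void) {
  unsigned long long mask = 0xF0F00000ULL; /* eight '1' bits */
  unsigned long long src = 0xABULL;        /* 1010_1011 */
  /* pdep scatters bit i of src into the i-th set position of mask,
     counting from the least significant end. */
  assert(_pdep_u64(src, mask) == 0xA0B00000ULL);
  /* pext gathers the masked bits back down: the inverse operation. */
  assert(_pext_u64(0xA0B00000ULL, mask) == src);
}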

View File

@ -0,0 +1,165 @@
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMIINTRIN_H_
#define BMIINTRIN_H_
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u16(unsigned short __X) {
return __builtin_ctz(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u32(unsigned int __X, unsigned int __Y) {
return (~__X & __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
return ((__X << (32 - (__L + __P))) >> (32 - __L));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u32(unsigned int __X, unsigned int __Y) {
unsigned int __P, __L;
__P = __Y & 0xFF;
__L = (__Y >> 8) & 0xFF;
return (_bextr_u32(__X, __P, __L));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u32(unsigned int __X) {
return (__X & -__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u32(unsigned int __X) {
return __blsi_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u32(unsigned int __X) {
return (__X ^ (__X - 1));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u32(unsigned int __X) {
return __blsmsk_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u32(unsigned int __X) {
return (__X & (__X - 1));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u32(unsigned int __X) {
return __blsr_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u32(unsigned int __X) {
return __builtin_ctz(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u32(unsigned int __X) {
return __builtin_ctz(__X);
}
/* Use the 64-bit shift, rotate, and count-leading-zeros
instructions for long long. */
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u64(unsigned long long __X, unsigned long long __Y) {
return (~__X & __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
return ((__X << (64 - (__L + __P))) >> (64 - __L));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u64(unsigned long long __X, unsigned long long __Y) {
unsigned int __P, __L;
__P = __Y & 0xFF;
__L = (__Y & 0xFF00) >> 8;
return (_bextr_u64(__X, __P, __L));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u64(unsigned long long __X) {
return __X & -__X;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u64(unsigned long long __X) {
return __blsi_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u64(unsigned long long __X) {
return (__X ^ (__X - 1));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u64(unsigned long long __X) {
return __blsmsk_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u64(unsigned long long __X) {
return (__X & (__X - 1));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u64(unsigned long long __X) {
return __blsr_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u64(unsigned long long __X) {
return __builtin_ctzll(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u64(unsigned long long __X) {
return __builtin_ctzll(__X);
}
#endif /* __PPC64__ */
#endif /* BMIINTRIN_H_ */
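/* Spot-check of the bit-manipulation identities above for one assumed
   input; the helper name and values are illustrative. */
#include <assert.h>
#include <x86gprintrin.h>
void check_bmi_identities(void) {
  unsigned int x = 0x58;               /* 0101_1000 */
  assert(__blsi_u32(x) == 0x08);       /* isolate lowest set bit */
  assert(__blsr_u32(x) == 0x50);       /* clear lowest set bit */
  assert(__blsmsk_u32(x) == 0x0F);     /* mask up to and including it */
  assert(__tzcnt_u32(x) == 3);         /* trailing zero count */
  assert(__andn_u32(x, 0xFF) == 0xA7); /* ~x & y */
}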

View File

@ -405,20 +405,10 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B)
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq (__A, __A);
d = (__v2du)vec_cmpeq (__B, __B);
#else
__v2du a, b;
__v2du c, d;
const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
a = (__v2du)vec_abs ((__v2df)__A);
b = (__v2du)vec_abs ((__v2df)__B);
c = (__v2du)vec_cmpgt (double_exp_mask, a);
d = (__v2du)vec_cmpgt (double_exp_mask, b);
#endif
/* A != NAN and B != NAN. */
return ((__m128d)vec_and(c, d));
}
@ -861,7 +851,11 @@ _mm_cvtpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
#else
temp = vec_mergee (temp, temp);
#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@ -896,7 +890,11 @@ _mm_cvtpd_ps (__m128d __A)
: );
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
#else
temp = vec_mergee (temp, temp);
#endif
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@ -925,7 +923,11 @@ _mm_cvttpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
#else
temp = vec_mergee (temp, temp);
#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@ -1205,6 +1207,9 @@ _mm_loadl_pd (__m128d __A, double const *__B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
#ifdef _ARCH_PWR10
return vec_extractm ((__v2du) __A);
#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@ -1224,6 +1229,7 @@ _mm_movemask_pd (__m128d __A)
#else
return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@ -1434,6 +1440,7 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
return ((__m64)a * (__m64)b);
}
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
@ -1460,6 +1467,7 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
@ -1749,7 +1757,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B)
lshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (lshift, shmax);
result = vec_sl ((__v2du) __A, lshift);
result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@ -1843,7 +1851,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B)
rshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (rshift, shmax);
result = vec_sr ((__v2du) __A, rshift);
result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@ -1995,10 +2003,14 @@ _mm_min_epu8 (__m128i __A, __m128i __B)
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
/* Return a mask created from the most significant bit of each 8-bit
element in A. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
#ifdef _ARCH_PWR10
return vec_extractm ((__v16qu) __A);
#else
__vector unsigned long long result;
static const __vector unsigned char perm_mask =
{
@ -2015,6 +2027,7 @@ _mm_movemask_epi8 (__m128i __A)
#else
return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@ -2158,27 +2171,37 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
__v16qu a, b;
__v16qu vmin, vmax, vabsdiff;
__v16qu vabsdiff;
__v4si vsum;
const __v4su zero = { 0, 0, 0, 0 };
__v4si result;
a = (__v16qu) __A;
b = (__v16qu) __B;
vmin = vec_min (a, b);
vmax = vec_max (a, b);
#ifndef _ARCH_PWR9
__v16qu vmin = vec_min (a, b);
__v16qu vmax = vec_max (a, b);
vabsdiff = vec_sub (vmax, vmin);
#else
vabsdiff = vec_absd (a, b);
#endif
/* Sum four groups of bytes into integers. */
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
#ifdef __LITTLE_ENDIAN__
/* Sum across four integers with two integer results. */
asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero));
/* Note: vec_sum2s could be used here, but on little-endian, vector
shifts are added that are not needed for this use-case.
A vector shift to correctly position the 32-bit integer results
(currently at [0] and [2]) to [1] and [3] would then need to be
swapped back again since the desired results are two 64-bit
integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
#else
/* Sum across four integers with two integer results. */
result = vec_sum2s (vsum, (__vector signed int) zero);
/* Rotate the sums into the correct position. */
#ifdef __LITTLE_ENDIAN__
result = vec_sld (result, result, 4);
#else
result = vec_sld (result, result, 6);
#endif
/* Rotate the sums into the correct position. */
return (__m128i) result;
}
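/* Scalar model of the contract the vector sequence above implements:
   _mm_sad_epu8 yields two 64-bit sums of absolute byte differences,
   one per 8-byte half.  The reference function name is illustrative. */
#include <stdint.h>
static void sad_epu8_reference(const uint8_t a[16], const uint8_t b[16],
                               uint64_t out[2]) {
  out[0] = out[1] = 0;
  for (int i = 0; i < 16; i++) {
    int d = a[i] - b[i];
    out[i / 8] += (uint64_t)(d < 0 ? -d : d);
  }
}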

View File

@ -0,0 +1,27 @@
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef IMMINTRIN_H_
#define IMMINTRIN_H_
#include <x86gprintrin.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#endif /* IMMINTRIN_H_ */

View File

@ -0,0 +1,26 @@
/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#endif
#ifndef NMMINTRIN_H_
#define NMMINTRIN_H_
/* We just include the SSE4.1 header file. */
#include <smmintrin.h>
#endif /* NMMINTRIN_H_ */

View File

@ -111,17 +111,21 @@ _mm_hsub_pd (__m128d __X, __m128d __Y)
vec_mergel ((__v2df) __X, (__v2df)__Y));
}
#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)

View File

@ -34,77 +34,683 @@
#include <altivec.h>
#include <tmmintrin.h>
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8(__m128i __X, const int __N) {
return (unsigned char)((__v16qi)__X)[__N & 15];
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NINT \
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR \
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL \
(_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC \
(_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT \
(_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT \
(_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd (__m128d __A, int __rounding)
{
__v2df __r;
union {
double __fr;
long long __fpscr;
} __enables_save, __fpscr_save;
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Save enabled exceptions, disable all exceptions,
and preserve the rounding mode. */
#ifdef _ARCH_PWR9
__asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs ();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
}
switch (__rounding)
{
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl ();
__attribute__ ((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn (0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
__r = vec_rint ((__v2df) __A);
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
__builtin_set_fpscr_rn (__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
__r = vec_floor ((__v2df) __A);
break;
case _MM_FROUND_TO_POS_INF:
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
__r = vec_ceil ((__v2df) __A);
break;
case _MM_FROUND_TO_ZERO:
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
__r = vec_trunc ((__v2df) __A);
break;
case _MM_FROUND_CUR_DIRECTION:
__r = vec_rint ((__v2df) __A);
break;
}
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl ();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
}
return (__m128d) __r;
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32(__m128i __X, const int __N) {
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd (__m128d __A, __m128d __B, int __rounding)
{
__B = _mm_round_pd (__B, __rounding);
__v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] };
return (__m128d) __r;
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps (__m128 __A, int __rounding)
{
__v4sf __r;
union {
double __fr;
long long __fpscr;
} __enables_save, __fpscr_save;
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Save enabled exceptions, disable all exceptions,
and preserve the rounding mode. */
#ifdef _ARCH_PWR9
__asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs ();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
}
switch (__rounding)
{
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl ();
__attribute__ ((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn (0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : "+wa" (__A));
__r = vec_rint ((__v4sf) __A);
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
__builtin_set_fpscr_rn (__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
__r = vec_floor ((__v4sf) __A);
break;
case _MM_FROUND_TO_POS_INF:
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
__r = vec_ceil ((__v4sf) __A);
break;
case _MM_FROUND_TO_ZERO:
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
__r = vec_trunc ((__v4sf) __A);
break;
case _MM_FROUND_CUR_DIRECTION:
__r = vec_rint ((__v4sf) __A);
break;
}
if (__rounding & _MM_FROUND_NO_EXC)
{
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__ ("" : : "wa" (__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl ();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf (0b00000011, __fpscr_save.__fr);
}
return (__m128) __r;
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss (__m128 __A, __m128 __B, int __rounding)
{
__B = _mm_round_ps (__B, __rounding);
__v4sf __r = (__v4sf) __A;
__r[0] = ((__v4sf) __B)[0];
return (__m128) __r;
}
#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
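/* Usage sketch for the rounding entry points above (standalone, with
   this wrapper on the include path and -DNO_WARN_X86_INTRINSICS);
   values and the function name are illustrative. */
static __m128d round_examples(void) {
  __m128d v = _mm_set_pd(-2.5, 1.5); /* elements {1.5, -2.5} */
  /* _mm_ceil_pd(v) expands to the same call; result is {2.0, -2.0}. */
  __m128d up = _mm_round_pd(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
  /* Round-to-nearest-even also gives {2.0, -2.0} for these inputs. */
  __m128d near = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return _mm_add_pd(up, near);
}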
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
{
__v16qi result = (__v16qi)__A;
result [__N & 0xf] = __D;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
{
__v4si result = (__v4si)__A;
result [__N & 3] = __D;
return (__m128i) result;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
{
__v2di result = (__v2di)__A;
result [__N & 1] = __D;
return (__m128i) result;
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
return (unsigned char) ((__v16qi)__X)[__N & 15];
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
return ((__v4si)__X)[__N & 3];
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64(__m128i __X, const int __N) {
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
return ((__v2di)__X)[__N & 1];
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps(__m128 __X, const int __N) {
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
return ((__v4si)__X)[__N & 3];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
__v16qi __charmask = vec_splats((signed char)__imm8);
__charmask = vec_gb(__charmask);
__v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
__shortmask = vec_reve(__shortmask);
#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
{
__v16qi __charmask = vec_splats ((signed char) __imm8);
__charmask = vec_gb (__charmask);
__v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
#ifdef __BIG_ENDIAN__
__shortmask = vec_reve (__shortmask);
#endif
return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
}
#endif
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
{
#ifdef _ARCH_PWR10
return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask);
#else
const __v16qu __seven = vec_splats ((unsigned char) 0x07);
__v16qu __lmask = vec_sra ((__v16qu) __mask, __seven);
return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask);
#endif
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
{
__v16qu __pcv[] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
};
__v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
return (__m128) __r;
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
{
#ifdef _ARCH_PWR10
return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask);
#else
const __v4si __zero = {0};
const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
#endif
}
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
{
__v16qu __pcv[] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
};
__v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
return (__m128d) __r;
}
#ifdef _ARCH_PWR8
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
{
#ifdef _ARCH_PWR10
return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask);
#else
const __v2di __zero = {0};
const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
#endif
}
#endif
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128 (__m128i __A, __m128i __B)
{
/* Note: This implementation does NOT set "zero" or "carry" flags. */
const __v16qu __zero = {0};
return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128 (__m128i __A, __m128i __B)
{
/* Note: This implementation does NOT set "zero" or "carry" flags. */
const __v16qu __zero = {0};
const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A);
return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128 (__m128i __A, __m128i __B)
{
/* Note: This implementation does NOT set "zero" or "carry" flags. */
return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0;
}
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
#define _mm_test_all_ones(V) \
_mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
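/* Truth-table sketch for the ptest helpers above; as the comments note,
   they return plain ints, no flags.  Values and names are illustrative. */
#include <assert.h>
static void check_ptest(void) {
  __m128i ones = _mm_set1_epi32(-1);
  __m128i zero = _mm_setzero_si128();
  assert(_mm_testz_si128(zero, ones) == 1); /* A & B == 0 */
  assert(_mm_testc_si128(ones, ones) == 1); /* ~A & B == 0 */
  assert(_mm_test_all_ones(ones) == 1);
  __m128i lo = _mm_set_epi64x(0, -1);       /* low half all-ones */
  assert(_mm_testnzc_si128(lo, ones) == 1); /* mixed ones and zeros */
}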
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
const __v16qu __seven = vec_splats((unsigned char)0x07);
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
__v16qi result = (__v16qi)__A;
result[__N & 0xf] = __D;
return (__m128i)result;
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
__v4si result = (__v4si)__A;
result[__N & 3] = __D;
return (__m128i)result;
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
__v2di result = (__v2di)__A;
result[__N & 1] = __D;
return (__m128i)result;
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v16qi) __A);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32 (__m128i __A)
{
__A = (__m128i) vec_unpackh ((__v16qi) __A);
return (__m128i) vec_unpackh ((__v8hi) __A);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64 (__m128i __A)
{
__A = (__m128i) vec_unpackh ((__v16qi) __A);
__A = (__m128i) vec_unpackh ((__v8hi) __A);
return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v8hi) __A);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64 (__m128i __A)
{
__A = (__m128i) vec_unpackh ((__v8hi) __A);
return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16 (__m128i __A)
{
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32 (__m128i __A)
{
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
__A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
__A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64 (__m128i __A)
{
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
__A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
__A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
__A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
__A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32 (__m128i __A)
{
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64 (__m128i __A)
{
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
__A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
__A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64 (__m128i __A)
{
const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_mergeh ((__v4su) __A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i) vec_mergeh (__zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
/* Return horizontal packed word minimum and its index in bits [15:0]
and bits [18:16] respectively. */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16 (__m128i __A)
{
union __u
{
__m128i __m;
__v8hu __uh;
};
union __u __u = { .__m = __A }, __r = { .__m = {0} };
unsigned short __ridx = 0;
unsigned short __rmin = __u.__uh[__ridx];
for (unsigned long __i = 1; __i < 8; __i++)
{
if (__u.__uh[__i] < __rmin)
{
__rmin = __u.__uh[__i];
__ridx = __i;
}
}
__r.__uh[0] = __rmin;
__r.__uh[1] = __ridx;
return __r.__m;
}
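/* Illustrative check of the encoding described above: the minimum
   halfword lands in bits [15:0], its index in bits [18:16].  The
   helper name and input values are assumptions for the example. */
#include <assert.h>
static void check_minpos(void) {
  __m128i v = _mm_setr_epi16(9, 8, 7, 6, 5, 2, 3, 4);
  __m128i r = _mm_minpos_epu16(v);
  assert(_mm_extract_epi16(r, 0) == 2); /* minimum value */
  assert(_mm_extract_epi16(r, 1) == 5); /* its element index */
}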
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
}
#endif
#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _SMMINTRIN_H_ */
#endif /* SMMINTRIN_H_ */

View File

@ -339,6 +339,7 @@ _mm_shuffle_pi8 (__m64 __A, __m64 __B)
return (__m64) ((__v2du) (__C))[0];
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
@ -350,7 +351,9 @@ _mm_sign_epi8 (__m128i __A, __m128i __B)
__v16qi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
@ -362,7 +365,9 @@ _mm_sign_epi16 (__m128i __A, __m128i __B)
__v8hi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
@ -374,7 +379,9 @@ _mm_sign_epi32 (__m128i __A, __m128i __B)
__v4si __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
@ -385,7 +392,9 @@ _mm_sign_pi8 (__m64 __A, __m64 __B)
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
@ -396,7 +405,9 @@ _mm_sign_pi16 (__m64 __A, __m64 __B)
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
@ -407,6 +418,7 @@ _mm_sign_pi32 (__m64 __A, __m64 __B)
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))

View File

@ -0,0 +1,17 @@
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef X86GPRINTRIN_H_
#define X86GPRINTRIN_H_
#include <bmiintrin.h>
#include <bmi2intrin.h>
#endif /* X86GPRINTRIN_H_ */

View File

@ -0,0 +1,28 @@
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef X86INTRIN_H_
#define X86INTRIN_H_
#ifdef __ALTIVEC__
#include <immintrin.h>
#endif /* __ALTIVEC__ */
#endif /* X86INTRIN_H_ */

View File

@ -31,8 +31,8 @@
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
@ -881,7 +881,7 @@ _mm_cvtss_f32 (__m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
__m64 res = 0;
int res;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
@ -914,8 +914,8 @@ _mm_cvt_ss2si (__m128 __A)
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
__m64 res = 0;
#ifdef _ARCH_PWR8
long long res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
@ -1328,6 +1328,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
#ifdef _ARCH_PWR10
return vec_extractm ((__vector unsigned int) __A);
#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@ -1347,6 +1350,7 @@ _mm_movemask_ps (__m128 __A)
#else
return result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@ -1553,6 +1557,7 @@ _m_pminub (__m64 __A, __m64 __B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
#ifdef __powerpc64__
unsigned long long p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL; // permute control for sign bits
@ -1560,6 +1565,18 @@ _mm_movemask_pi8 (__m64 __A)
0x3830282018100800UL; // permute control for sign bits
#endif
return __builtin_bpermd (p, __A);
#else
#ifdef __LITTLE_ENDIAN__
unsigned int mask = 0x20283038UL;
unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
#else
unsigned int mask = 0x38302820UL;
unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
#endif
return (r2 << 4) | r1;
#endif
}
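/* Scalar model of what both paths above compute: bit i of the result
   is the sign bit of byte i of the operand (byte 0 is the least
   significant byte).  The reference name is illustrative. */
static int movemask_pi8_reference(unsigned long long a) {
  int r = 0;
  for (int i = 0; i < 8; i++)
    r |= (int)((a >> (8 * i + 7)) & 1) << i;
  return r;
}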
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -1841,4 +1858,4 @@ do { \
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _XMMINTRIN_H_INCLUDED */
#endif /* XMMINTRIN_H_ */

View File

@ -5,6 +5,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE
// CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
// CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
// CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
@ -440,7 +443,8 @@ test_converts() {
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally i64 @_mm_cvtpd_pi32
@ -450,7 +454,8 @@ test_converts() {
// CHECK-LABEL: define available_externally <4 x float> @_mm_cvtpd_ps
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <4 x i32> asm "xvcvdpsp ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally <2 x double> @_mm_cvtpi32_pd
@ -530,7 +535,8 @@ test_converts() {
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvttpd_epi32
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"
// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally i64 @_mm_cvttpd_pi32
@ -756,12 +762,18 @@ test_move() {
// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: insertelement <2 x double> %{{[0-9a-zA-Z_.]+}}, double %[[EXT]], i32 0
// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_epi8
// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_epi8
// CHECK: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef <i8 120, i8 112, i8 104, i8 96, i8 88, i8 80, i8 72, i8 64, i8 56, i8 48, i8 40, i8 32, i8 24, i8 16, i8 8, i8 0>)
// CHECK-LE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK-BE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: trunc i64 %[[VAL]] to i32
// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_pd
// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_pd
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
// CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
@ -857,9 +869,9 @@ test_sad() {
// CHECK: call <16 x i8> @vec_max(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sub(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <4 x i32> @vec_sum4s(unsigned char vector[16], unsigned int vector[4])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// CHECK: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// CHECK-LE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 4)
// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 6)
// CHECK-LE: call <4 x i32> asm "vsum2sws $0,$1,$2", "=v,v,v"(<4 x i32> %11, <4 x i32> zeroinitializer)
// CHECK-BE: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)
void __attribute__((noinline))
test_set() {
@ -1086,7 +1098,7 @@ test_sll() {
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef zeroext 0)
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, <2 x i64> noundef <i64 64, i64 64>)
// CHECK: call <2 x i64> @vec_sl(unsigned long long vector[2], unsigned long long vector[2])
// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_slli_epi16
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
@ -1232,7 +1244,7 @@ test_srl() {
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 0)
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef <i64 64, i64 64>)
// CHECK: call <2 x i64> @vec_sr(unsigned long long vector[2], unsigned long long vector[2])
// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_srli_epi16
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16

View File

@ -10,6 +10,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
#include <smmintrin.h>
__m128 mn1, mn2;
@ -48,6 +53,10 @@ void __attribute__((noinline))
test_blend() {
_mm_blend_epi16(m1, m2, 0);
_mm_blendv_epi8(m1, m2, mi);
_mm_blend_ps(mn1, mn2, 0);
_mm_blendv_ps(mn1, mn2, mn1);
_mm_blend_pd(md1, md2, 0);
_mm_blendv_pd(md1, md2, md1);
}
// CHECK-LABEL: @test_blend
@ -62,10 +71,13 @@ test_blend() {
// BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])
// P10-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// P10: call <16 x i8> @vec_blendv(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7)
// CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16])
void __attribute__((noinline))
test_insert() {
@ -89,6 +101,239 @@ test_insert() {
// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1
// CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]]
void __attribute__((noinline))
test_convert() {
_mm_cvtepi16_epi32(m1);
_mm_cvtepi16_epi64(m1);
_mm_cvtepi32_epi64(m1);
_mm_cvtepi8_epi16(m1);
_mm_cvtepi8_epi32(m1);
_mm_cvtepi8_epi64(m1);
_mm_cvtepu16_epi32(m1);
_mm_cvtepu16_epi64(m1);
_mm_cvtepu32_epi64(m1);
_mm_cvtepu8_epi16(m1);
_mm_cvtepu8_epi32(m1);
_mm_cvtepu8_epi64(m1);
}
// CHECK-LABEL: @test_convert
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])
// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])
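Each widening convert above is either a chain of sign-extending unpacks or, for the unsigned cases, endian-aware merges with a zero vector so the zero bytes land in the high half of each widened element. A sketch of one of each flavor, assuming the altivec mapping the checks spell out (names hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Signed widen: vec_unpackh sign-extends the high (element-order) half.
static inline __m128i_s my_cvtepi16_epi32(__m128i_s __A) {
  return (__m128i_s)vec_unpackh((__vector signed short)__A);
}

// Unsigned widen: interleave with zeros; the operand order flips with
// endianness, which is why the LE and BE check lines differ above.
static inline __m128i_s my_cvtepu16_epi32(__m128i_s __A) {
  const __vector unsigned short __zero = vec_splats((unsigned short)0);
#ifdef __LITTLE_ENDIAN__
  return (__m128i_s)vec_mergeh((__vector unsigned short)__A, __zero);
#else
  return (__m128i_s)vec_mergeh(__zero, (__vector unsigned short)__A);
#endif
}
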
void __attribute__((noinline))
test_minmax() {
_mm_max_epi32(m1, m2);
_mm_max_epi8(m1, m2);
_mm_max_epu16(m1, m2);
_mm_max_epu32(m1, m2);
_mm_min_epi32(m1, m2);
_mm_min_epi8(m1, m2);
_mm_min_epu16(m1, m2);
_mm_min_epu32(m1, m2);
_mm_minpos_epu16(m1);
}
// CHECK-LABEL: @test_minmax
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false)
// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}}
// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: zext i16 %[[VEXT]] to i32
// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1
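Every min/max above is a single vec_min/vec_max call on the matching element type; only _mm_minpos_epu16, which has no VMX equivalent, falls back to the scalar scan reflected by the memset/extractelement/add checks. A sketch of the one-call case (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Lane-wise signed 32-bit maximum maps directly onto vec_max.
static inline __m128i_s my_max_epi32(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_max((__vector signed int)__A, (__vector signed int)__B);
}
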
void __attribute__((noinline))
test_round() {
_mm_round_ps(mn1, 0);
_mm_round_ss(mn1, mn2, 0);
_mm_round_pd(mn1, 0);
_mm_round_sd(mn1, mn2, 0);
}
// CHECK-LABEL: @test_round
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @vec_floor(float vector[4])
// CHECK: call <4 x float> @vec_ceil(float vector[4])
// CHECK: call <4 x float> @vec_trunc(float vector[4])
// CHECK: call <4 x float> @vec_rint(float vector[4])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @vec_floor(double vector[2])
// CHECK: call <2 x double> @vec_ceil(double vector[2])
// CHECK: call <2 x double> @vec_trunc(double vector[2])
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: call void asm sideeffect "", "^wa"
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1
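Most of the checked lines are FPSCR bookkeeping: __builtin_mffs/__builtin_mffsl save the status register, __builtin_set_fpscr_rn and __builtin_mtfsf force and later restore the rounding control, and the empty asm blocks fence the vector value across those mode switches. The arithmetic core is a four-way dispatch. A simplified sketch of that core alone, with all the save/restore elided (name hypothetical; the mode encodings are the x86 _MM_FROUND_TO_* values):

#include <altivec.h>

// 0 = nearest, 1 = toward -inf, 2 = toward +inf, 3 = toward zero.
// The real header also handles _MM_FROUND_CUR_DIRECTION by rounding per
// the (saved and restored) FPSCR mode via vec_rint.
static inline __vector float my_round_ps_core(__vector float __A, int __mode) {
  switch (__mode & 0x3) {
  case 1:  return vec_floor(__A);
  case 2:  return vec_ceil(__A);
  case 3:  return vec_trunc(__A);
  default: return vec_rint(__A); /* nearest, with RN forced beforehand */
  }
}
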
void __attribute__((noinline))
test_testing() {
_mm_testc_si128(m1, m2);
_mm_testnzc_si128(m1, m2);
_mm_testz_si128(m1, m2);
}
// CHECK-LABEL: @test_testing
// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer)
// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32
// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer)
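Both predicates reduce to one vector logical op plus an all-zero compare, as the checks show: ZF-style testz is (A & B) == 0, and CF-style testc is (~A & B) == 0, hence the vec_nor. A sketch under that mapping (names hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// ZF analogue: nonzero iff (A & B) is all zeros.
static inline int my_testz_si128(__m128i_s __A, __m128i_s __B) {
  const __vector unsigned char __zero = vec_splats((unsigned char)0);
  return vec_all_eq(vec_and((__vector unsigned char)__A,
                            (__vector unsigned char)__B), __zero);
}

// CF analogue: nonzero iff (~A & B) is all zeros; vec_nor(A, A) forms ~A.
static inline int my_testc_si128(__m128i_s __A, __m128i_s __B) {
  const __vector unsigned char __zero = vec_splats((unsigned char)0);
  __vector unsigned char __notA =
      vec_nor((__vector unsigned char)__A, (__vector unsigned char)__A);
  return vec_all_eq(vec_and(__notA, (__vector unsigned char)__B), __zero);
}
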
void __attribute__((noinline))
test_compare() {
_mm_cmpeq_epi64(m1, m2);
_mm_cmpgt_epi64(m1, m2);
}
// CHECK-LABEL: @test_compare
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2])
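Both 64-bit compares are direct one-call mappings onto the Power8 doubleword compare builtins. A sketch of the equality case (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Lane-wise equality on two i64 lanes; result lanes are all-ones/all-zeros.
static inline __m128i_s my_cmpeq_epi64(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_cmpeq((__vector signed long long)__A,
                              (__vector signed long long)__B);
}
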
void __attribute__((noinline))
test_mul() {
_mm_mul_epi32(m1, m2);
_mm_mullo_epi32(m1, m2);
}
// CHECK-LABEL: @test_mul
// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4])
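_mm_mul_epi32 takes the even 32-bit lanes to full 64-bit products, which is a single vec_mule on signed int elements (the Power8 vmulesw family); _mm_mullo_epi32 keeps only the low 32 bits via vec_mul. A sketch of the widening case (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// Even lanes of A and B -> two signed 64-bit products.
static inline __m128i_s my_mul_epi32(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_mule((__vector signed int)__A,
                             (__vector signed int)__B);
}
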
void __attribute__((noinline))
test_packus() {
_mm_packus_epi32(m1, m2);
}
// CHECK-LABEL: @test_packus
// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3)
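_mm_packus_epi32 is likewise a single saturating pack: eight signed 32-bit inputs narrowed to unsigned 16-bit. A sketch (name hypothetical):

#include <altivec.h>

typedef long long __m128i_s __attribute__((__vector_size__(16)));

// vec_packsu saturates negative inputs to 0 and values above 65535 to 65535.
static inline __m128i_s my_packus_epi32(__m128i_s __A, __m128i_s __B) {
  return (__m128i_s)vec_packsu((__vector signed int)__A,
                               (__vector signed int)__B);
}
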
// To test that smmintrin.h includes tmmintrin.h
void __attribute__((noinline))


@@ -0,0 +1,239 @@
// REQUIRES: powerpc-registered-target
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
#include <x86gprintrin.h>
unsigned short us;
unsigned ui;
unsigned long long ul;
void __attribute__((noinline))
test_bmiintrin() {
__tzcnt_u16(us);
__andn_u32(ui, ui);
_bextr_u32(ui, ui, ui);
__bextr_u32(ui, ui);
__blsi_u32(ui);
_blsi_u32(ui);
__blsmsk_u32(ui);
_blsmsk_u32(ui);
__blsr_u32(ui);
_blsr_u32(ui);
__tzcnt_u32(ui);
_tzcnt_u32(ui);
__andn_u64(ul, ul);
_bextr_u64(ul, ui, ui);
__bextr_u64(ul, ul);
__blsi_u64(ul);
_blsi_u64(ul);
__blsmsk_u64(ul);
_blsmsk_u64(ul);
__blsr_u64(ul);
_blsr_u64(ul);
__tzcnt_u64(ul);
_tzcnt_u64(ul);
}
// CHECK-LABEL: @test_bmiintrin
// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false)
// CHECK: trunc i32 %[[CALL]] to i16
// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1
// CHECK: and i32 %[[NEG]], %{{[0-9a-zA-Z._]+}}
// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]]
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8
// CHECK: and i32 %[[SHR]], 255
// CHECK: call zeroext i32 @_bextr_u32
// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %{{[0-9a-zA-Z._]+}}
// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call zeroext i32 @__blsi_u32
// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call zeroext i32 @__blsmsk_u32
// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call zeroext i32 @__blsr_u32
// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1
// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}}
// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]]
// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64
// CHECK: lshr i64 %[[SHL]], %[[EXT2]]
// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255
// CHECK: trunc i64 %[[AND]] to i32
// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8
// CHECK: trunc i64 %[[SHR]] to i32
// CHECK: call i64 @_bextr_u64
// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}}
// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @__blsi_u64
// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @__blsmsk_u64
// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @__blsr_u64
// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
// CHECK: sext i32 %[[CAST]] to i64
// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
// CHECK: sext i32 %[[CAST]] to i64
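None of these BMI1 helpers needs a PowerPC-specific instruction: the checks above are the classic bit identities plus count-trailing-zeros, and on PowerPC clang lowers __builtin_ctz to the zero-defined llvm.cttz(..., i1 false) form seen in the checks. A sketch of the identities (names hypothetical):

// Isolate the lowest set bit; 0U - x matches the checked `sub i32 0, ...`.
static inline unsigned int my_blsi_u32(unsigned int __X) {
  return __X & (0U - __X);
}

// Mask through the lowest set bit: x ^ (x - 1).
static inline unsigned int my_blsmsk_u32(unsigned int __X) {
  return __X ^ (__X - 1);
}

// Reset the lowest set bit: x & (x - 1).
static inline unsigned int my_blsr_u32(unsigned int __X) {
  return __X & (__X - 1);
}

// Trailing zero count; lowers to the checked llvm.cttz.i32.
static inline unsigned int my_tzcnt_u32(unsigned int __X) {
  return (unsigned int)__builtin_ctz(__X);
}
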
void __attribute__((noinline))
test_bmi2intrin() {
_bzhi_u32(ui, ui);
_mulx_u32(ui, ui, &ui);
_bzhi_u64(ul, ul);
_mulx_u64(ul, ul, &ul);
_pdep_u64(ul, ul);
_pext_u64(ul, ul);
_pdep_u32(ui, ui);
_pext_u32(ui, ui);
}
// CHECK-LABEL: @test_bmi2intrin
// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}})
// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32
// CHECK: trunc i64 %[[SHR]] to i32
// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32
// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i64 %[[SHL]], %[[SUB1]]
// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}})
// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64
// CHECK: trunc i128 %[[SHR]] to i64
// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64
// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]]
// CHECK: sext i32 %[[SUB]] to i64
// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32
// CHECK: sext i32 %[[CAST2]] to i64
// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]]
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]]
// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK-LABEL: define available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]]
// CHECK: [[TRUECOND]]:
// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8
// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}}
// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK: call i64 @llvm.ppc.bpermd
// CHECK: [[FALSECOND]]:
// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: lshr i64 %[[AND]], %[[SUB]]
// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]]
// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
// CHECK: trunc i64 %[[CALL]] to i32
// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
// CHECK: trunc i64 %[[CALL]] to i32
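Per the zext/call/trunc pattern checked above, the 32-bit pdep/pext carry no logic of their own; they widen to the 64-bit versions and truncate, which is safe because the widened mask has no bits above position 31. A sketch of that forwarding (wrapper names hypothetical; NO_WARN_X86_INTRINSICS silences the portability guard, as the RUN lines do):

#define NO_WARN_X86_INTRINSICS 1
#include <x86gprintrin.h>

static inline unsigned int my_pdep_u32(unsigned int __X, unsigned int __M) {
  return (unsigned int)_pdep_u64(__X, __M);
}

static inline unsigned int my_pext_u32(unsigned int __X, unsigned int __M) {
  return (unsigned int)_pext_u64(__X, __M);
}
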


@@ -9,6 +9,9 @@
// RUN: %clang -x c++ -fsyntax-only -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-P10
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
// RUN: %clang -x c++ -fsyntax-only -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
@@ -383,12 +386,12 @@ test_convert() {
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-LABEL: define available_externally signext i32 @_mm_cvtss_si32
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
// CHECK-P10: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 0
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 1
// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 2
// CHECK-LABEL: define available_externally i64 @_mm_cvtss_si64
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctid $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
@ -681,9 +684,11 @@ test_move() {
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_ps
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 2113632, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
// CHECK-LE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK-LE: trunc i64 %[[EXT]] to i32
// CHECK-BE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 2113632> to <16 x i8>))
// CHECK-BE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-BE: trunc i64 %[[EXT]] to i32
// CHECK-P10: call zeroext i32 @vec_extractm(unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
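The LE and BE variants differ only in the vbpermq bit-gather control and in which doubleword holds the result, while P10 uses vec_extractm directly. For orientation, reference semantics only (a scalar restatement, not the vbpermq fast path the checks verify; name hypothetical):

typedef float __v4sf_s __attribute__((__vector_size__(16)));

// Collect the sign bit of each float lane into bits 0..3 of the result.
static inline int my_movemask_ps_ref(__v4sf_s __A) {
  union { __v4sf_s __v; unsigned int __w[4]; } __u = {__A};
  int __r = 0;
  for (int __i = 0; __i < 4; ++__i)
    __r |= (int)(__u.__w[__i] >> 31) << __i;
  return __r;
}
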
void __attribute__((noinline))
test_alt_name_move() {