diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index a3c0b08fa364..76ea66a5d3bf 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -133,6 +133,9 @@ set(ppc_wrapper_files
   ppc_wrappers/xmmintrin.h
   ppc_wrappers/mm_malloc.h
   ppc_wrappers/emmintrin.h
+  ppc_wrappers/pmmintrin.h
+  ppc_wrappers/tmmintrin.h
+  ppc_wrappers/smmintrin.h
 )
 
 set(openmp_wrapper_files
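The CMake hunk above installs three new PowerPC wrapper headers next to the existing ppc_wrappers (xmmintrin.h, emmintrin.h, mm_malloc.h). All three follow the same dispatch pattern, summarized in the sketch below. This sketch is a condensed restatement of the files that follow, not extra code from the patch; only the pmmintrin.h guard name is shown, and the other two headers substitute their own.

/* Condensed sketch of the wrapper pattern used by the three headers below. */
#ifndef NO_WARN_X86_INTRINSICS
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)
/* VMX/VSX-based implementations of the Intel intrinsics live here. */
#else
#include_next <pmmintrin.h> /* any other target keeps its usual header */
#endif

#endif /* PMMINTRIN_H_ */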
diff --git a/clang/lib/Headers/ppc_wrappers/pmmintrin.h b/clang/lib/Headers/ppc_wrappers/pmmintrin.h
new file mode 100644
index 000000000000..6d93383d5412
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/pmmintrin.h
@@ -0,0 +1,150 @@
+/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.
+
+   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
+   is a good match for most SIMD operations. However, the horizontal
+   add/sub operations require the data pairs to be permuted into
+   separate registers with vertical even/odd alignment for the operation,
+   and the addsub operation requires that the sign of only the
+   even-numbered elements be flipped (XORed with -0.0).
+   For larger blocks of code using these intrinsic implementations,
+   the compiler should be able to schedule instructions to avoid
+   additional latency.
+
+   In the specific case of the monitor and mwait instructions, there is
+   no direct equivalent in the PowerISA at this time, so those
+   intrinsics are not implemented.  */
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#endif
+
+#ifndef PMMINTRIN_H_
+#define PMMINTRIN_H_
+
+#if defined(__linux__) && defined(__ppc64__)
+
+/* We need definitions from the SSE2 and SSE header files.  */
+#include <emmintrin.h>
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_ps (__m128 __X, __m128 __Y)
+{
+  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
+  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
+  return (__m128) vec_add (__X, even_neg_Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_pd (__m128d __X, __m128d __Y)
+{
+  const __v2df even_n0 = {-0.0, 0.0};
+  __v2df even_neg_Y = vec_xor(__Y, even_n0);
+  return (__m128d) vec_add (__X, even_neg_Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_ps (__m128 __X, __m128 __Y)
+{
+  __vector unsigned char xform2 = {
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
+    };
+  __vector unsigned char xform1 = {
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
+    };
+  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
+                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_ps (__m128 __X, __m128 __Y)
+{
+  __vector unsigned char xform2 = {
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
+    };
+  __vector unsigned char xform1 = {
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
+    };
+  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
+                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
+                            vec_mergel ((__v2df) __X, (__v2df)__Y));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
+                            vec_mergel ((__v2df) __X, (__v2df)__Y));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehdup_ps (__m128 __X)
+{
+  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_moveldup_ps (__m128 __X)
+{
+  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loaddup_pd (double const *__P)
+{
+  return (__m128d) vec_splats (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movedup_pd (__m128d __X)
+{
+  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lddqu_si128 (__m128i const *__P)
+{
+  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
+}
+
+/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait.  */
+
+#else
+#include_next <pmmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* PMMINTRIN_H_ */
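To make the mapping described in the header comment concrete, here is a small usage sketch. It is illustrative only: the file name and compile invocation are assumptions modeled on the RUN lines of the tests added later in this patch (powerpc64le target, -mcpu=pwr8, -DNO_WARN_X86_INTRINSICS), and the expected values follow directly from the SSE3 definition that _mm_addsub_ps implements above with vec_xor and vec_add.

/* addsub_demo.c (hypothetical) -- on powerpc64le, build with something like:
   clang -target powerpc64le-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS addsub_demo.c */
#include <pmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);     /* lanes 0..3 = {1, 2, 3, 4} */
  __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); /* lanes 0..3 = {10, 20, 30, 40} */

  /* Even lanes are subtracted, odd lanes added; the wrapper gets this by
     XORing the even lanes of b with -0.0 and doing a single vec_add. */
  __m128 r = _mm_addsub_ps(a, b); /* {1-10, 2+20, 3-30, 4+40} = {-9, 22, -27, 44} */

  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}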
diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h
new file mode 100644
index 000000000000..56ef6ba76b06
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h
@@ -0,0 +1,85 @@
+/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.
+
+   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
+
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#error \
+    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef SMMINTRIN_H_
+#define SMMINTRIN_H_
+
+#if defined(__linux__) && defined(__ppc64__)
+
+#include <altivec.h>
+#include <emmintrin.h>
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_epi8(__m128i __X, const int __N) {
+  return (unsigned char)((__v16qi)__X)[__N & 15];
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_epi32(__m128i __X, const int __N) {
+  return ((__v4si)__X)[__N & 3];
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_epi64(__m128i __X, const int __N) {
+  return ((__v2di)__X)[__N & 1];
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_ps(__m128 __X, const int __N) {
+  return ((__v4si)__X)[__N & 3];
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
+  __v16qi __charmask = vec_splats((signed char)__imm8);
+  __charmask = vec_gb(__charmask);
+  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
+#ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve(__shortmask);
+#endif
+  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
+  const __v16qu __seven = vec_splats((unsigned char)0x07);
+  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
+  return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
+}
+
+#else
+#include_next <smmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* SMMINTRIN_H_ */
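_mm_blend_epi16 above expands the 8-bit immediate into a halfword select mask with vec_gb/vec_unpackh and then uses a single vec_sel. The sketch below spells out the selection rule it has to reproduce (bit i of the immediate picks halfword i from the second operand); the file name and expected lane values are illustrative, derived from the standard SSE4.1 semantics rather than from this patch.

/* blend_demo.c (hypothetical); build like the test RUN lines, with
   -DNO_WARN_X86_INTRINSICS on a powerpc64/powerpc64le Linux target. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set_epi16(17, 16, 15, 14, 13, 12, 11, 10); /* lanes 0..7 = 10..17 */
  __m128i b = _mm_set_epi16(27, 26, 25, 24, 23, 22, 21, 20); /* lanes 0..7 = 20..27 */

  /* Immediate 0x0F has bits 0-3 set, so lanes 0-3 come from b and lanes 4-7
     from a: expected result is {20, 21, 22, 23, 14, 15, 16, 17}. */
  __m128i r = _mm_blend_epi16(a, b, 0x0F);

  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]);
  printf("\n");
  return 0;
}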
diff --git a/clang/lib/Headers/ppc_wrappers/tmmintrin.h b/clang/lib/Headers/ppc_wrappers/tmmintrin.h
new file mode 100644
index 000000000000..b5a935d5e47e
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/tmmintrin.h
@@ -0,0 +1,495 @@
+/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef TMMINTRIN_H_
+#define TMMINTRIN_H_
+
+#if defined(__linux__) && defined(__ppc64__)
+
+#include <altivec.h>
+
+/* We need definitions from the SSE header files.  */
+#include <emmintrin.h>
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v8hi) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v4si) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v16qi) __A);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __A)
+{
+  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
+  return (__m64) ((__v2du) vec_abs (__B))[0];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __A)
+{
+  __v4si __B = (__v4si) (__v2du) { __A, __A };
+  return (__m64) ((__v2du) vec_abs (__B))[0];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __A)
+{
+  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
+  return (__m64) ((__v2du) vec_abs (__B))[0];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
+{
+  if (__builtin_constant_p (__count) && __count < 16)
+    {
+#ifdef __LITTLE_ENDIAN__
+      __A = (__m128i) vec_reve ((__v16qu) __A);
+      __B = (__m128i) vec_reve ((__v16qu) __B);
+#endif
+      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
+#ifdef __LITTLE_ENDIAN__
+      __A = (__m128i) vec_reve ((__v16qu) __A);
+#endif
+      return __A;
+    }
+
+  if (__count == 0)
+    return __B;
+
+  if (__count >= 16)
+    {
+      if (__count >= 32)
+        {
+          const __v16qu zero = { 0 };
+          return (__m128i) zero;
+        }
+      else
+        {
+          const __v16qu __shift =
+            vec_splats ((unsigned char) ((__count - 16) * 8));
+#ifdef __LITTLE_ENDIAN__
+          return (__m128i) vec_sro ((__v16qu) __A, __shift);
+#else
+          return (__m128i) vec_slo ((__v16qu) __A, __shift);
+#endif
+        }
+    }
+  else
+    {
+      const __v16qu __shiftA =
+        vec_splats ((unsigned char) ((16 - __count) * 8));
+      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
+#ifdef __LITTLE_ENDIAN__
+      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
+      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
+#else
+      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
+      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
+#endif
+      return
(__m128i) vec_or ((__v16qu) __A, (__v16qu) __B); + } +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count) +{ + if (__count < 16) + { + __v2du __C = { __B, __A }; +#ifdef __LITTLE_ENDIAN__ + const __v4su __shift = { __count << 3, 0, 0, 0 }; + __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift); +#else + const __v4su __shift = { 0, 0, 0, __count << 3 }; + __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift); +#endif + return (__m64) __C[0]; + } + else + { + const __m64 __zero = { 0 }; + return __zero; + } +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_add (__C, __D); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi32 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); + __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); + return (__m128i) vec_add (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi16 (__m64 __A, __m64 __B) +{ + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_add (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi32 (__m64 __A, __m64 __B) +{ + __v4si __C = (__v4si) (__v2du) { __A, __B }; + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; + __v4si __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_add (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_epi16 (__m128i __A, __m128i __B) +{ + __v4si __C = { 0 }, __D = { 0 }; + __C = vec_sum4s ((__v8hi) __A, __C); + __D = vec_sum4s ((__v8hi) __B, __D); + __C = (__v4si) vec_packs (__C, __D); + return (__m128i) __C; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_pi16 (__m64 __A, __m64 __B) +{ + const __v4si __zero = { 0 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v4si __D = vec_sum4s (__C, __zero); + __C = vec_packs (__D, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, 
(__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_sub (__C, __D); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi32 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); + __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); + return (__m128i) vec_sub (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi16 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v8hi __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_sub (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi32 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; + __v4si __C = (__v4si) (__v2du) { __A, __B }; + __v4si __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_sub (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_subs (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_pi16 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v8hi __D = vec_perm (__C, __C, __P); + __v8hi __E = vec_perm (__C, __C, __Q); + __C = vec_subs (__D, __E); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8 (__m128i __A, __m128i __B) +{ + const __v16qi __zero = { 0 }; + __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero); + __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B); + return (__m128i) vec_sel (__C, __zero, __select); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi8 (__m64 __A, __m64 __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __C = (__v16qi) (__v2du) { __A, __A }; + __v16qi __D = (__v16qi) (__v2du) { __B, __B }; + __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero); + __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D); + __C = vec_sel (__C, __zero, __select); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi8 (__m128i __A, __m128i __B) +{ + const __v16qi 
__zero = { 0 }; + __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero); + __v16qi __selectpos = + (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero)); + __v16qi __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi16 (__m128i __A, __m128i __B) +{ + const __v8hi __zero = { 0 }; + __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero); + __v8hi __selectpos = + (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero)); + __v8hi __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi32 (__m128i __A, __m128i __B) +{ + const __v4si __zero = { 0 }; + __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero); + __v4si __selectpos = + (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero)); + __v4si __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi8 (__m64 __A, __m64 __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __C = (__v16qi) (__v2du) { __A, __A }; + __v16qi __D = (__v16qi) (__v2du) { __B, __B }; + __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi16 (__m64 __A, __m64 __B) +{ + const __v8hi __zero = { 0 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __A }; + __v8hi __D = (__v8hi) (__v2du) { __B, __B }; + __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi32 (__m64 __A, __m64 __B) +{ + const __v4si __zero = { 0 }; + __v4si __C = (__v4si) (__v2du) { __A, __A }; + __v4si __D = (__v4si) (__v2du) { __B, __B }; + __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_epi16 (__m128i __A, __m128i __B) +{ + __v8hi __unsigned = vec_splats ((signed short) 0x00ff); + __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned); + __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned); + __v8hi __E = vec_unpackh ((__v16qi) __B); + __v8hi __F = vec_unpackl ((__v16qi) __B); + __C = vec_mul (__C, __E); + __D = vec_mul (__D, __F); + const __v16qu __odds = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __evens = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __E = vec_perm (__C, __D, __odds); + __F = vec_perm (__C, __D, __evens); + return (__m128i) vec_adds (__E, __F); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_pi16 (__m64 __A, __m64 __B) +{ + __v8hi __C = (__v8hi) (__v2du) { __A, __A }; + __C = vec_unpackl ((__v16qi) __C); + const __v8hi __unsigned = vec_splats ((signed short) 0x00ff); + __C = vec_and (__C, __unsigned); + __v8hi __D = (__v8hi) (__v2du) { __B, __B }; + __D = vec_unpackl ((__v16qi) __D); + __D = vec_mul (__C, __D); + const __v16qu __odds = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 
21, 24, 25, 28, 29 }; + const __v16qu __evens = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __C = vec_perm (__D, __D, __odds); + __D = vec_perm (__D, __D, __evens); + __C = vec_adds (__C, __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_epi16 (__m128i __A, __m128i __B) +{ + __v4si __C = vec_unpackh ((__v8hi) __A); + __v4si __D = vec_unpackh ((__v8hi) __B); + __C = vec_mul (__C, __D); + __D = vec_unpackl ((__v8hi) __A); + __v4si __E = vec_unpackl ((__v8hi) __B); + __D = vec_mul (__D, __E); + const __v4su __shift = vec_splats ((unsigned int) 14); + __C = vec_sr (__C, __shift); + __D = vec_sr (__D, __shift); + const __v4si __ones = vec_splats ((signed int) 1); + __C = vec_add (__C, __ones); + __C = vec_sr (__C, (__v4su) __ones); + __D = vec_add (__D, __ones); + __D = vec_sr (__D, (__v4su) __ones); + return (__m128i) vec_pack (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_pi16 (__m64 __A, __m64 __B) +{ + __v4si __C = (__v4si) (__v2du) { __A, __A }; + __C = vec_unpackh ((__v8hi) __C); + __v4si __D = (__v4si) (__v2du) { __B, __B }; + __D = vec_unpackh ((__v8hi) __D); + __C = vec_mul (__C, __D); + const __v4su __shift = vec_splats ((unsigned int) 14); + __C = vec_sr (__C, __shift); + const __v4si __ones = vec_splats ((signed int) 1); + __C = vec_add (__C, __ones); + __C = vec_sr (__C, (__v4su) __ones); + __v8hi __E = vec_pack (__C, __D); + return (__m64) ((__v2du) (__E))[0]; +} + +#else +#include_next +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* TMMINTRIN_H_ */ diff --git a/clang/test/CodeGen/ppc-pmmintrin.c b/clang/test/CodeGen/ppc-pmmintrin.c new file mode 100644 index 000000000000..ee4e89837444 --- /dev/null +++ b/clang/test/CodeGen/ppc-pmmintrin.c @@ -0,0 +1,153 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: powerpc-registered-target + +// RUN: %clang -S -emit-llvm -target powerpc64-gnu-linux -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s + +#include + +__m128d resd, md1, md2; +__m128 res, m1, m2; +__m128i resi, mi; +double *d; + +void __attribute__((noinline)) +test_pmmintrin() { + resd = _mm_addsub_pd(md1, md2); + res = _mm_addsub_ps(m1, m2); + resd = _mm_hadd_pd(md1, md2); + res = _mm_hadd_ps(m1, m2); + resd = _mm_hsub_pd(md1, md2); + res = _mm_hsub_ps(m1, m2); + resi = _mm_lddqu_si128(&mi); + resd = _mm_loaddup_pd(d); + resd = _mm_movedup_pd(md1); + res = _mm_movehdup_ps(m1); + res = _mm_moveldup_ps(m1); +} + +// CHECK-LABEL: @test_pmmintrin + +// CHECK: define available_externally <2 x double> @_mm_addsub_pd(<2 x double> [[REG1:[0-9a-zA-Z_%.]+]], <2 x double> [[REG2:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x double> [[REG1]], <2 x double>* [[REG3:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x double> [[REG2]], <2 x double>* [[REG4:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x double> , <2 x double>* [[REG5:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG7:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG4]], align 16 +// CHECK-NEXT: 
[[REG8:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_xor(double vector[2], double vector[2])(<2 x double> [[REG7]], <2 x double> ) +// CHECK-NEXT: store <2 x double> [[REG8]], <2 x double>* [[REG6:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG3]], align 16 +// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG6]], align 16 +// CHECK-NEXT: [[REG11:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_add(double vector[2], double vector[2])(<2 x double> [[REG9]], <2 x double> [[REG10]]) +// CHECK-NEXT: ret <2 x double> [[REG11]] + +// CHECK: define available_externally <4 x float> @_mm_addsub_ps(<4 x float> [[REG12:[0-9a-zA-Z_%.]+]], <4 x float> [[REG13:[0-9a-zA-Z_%.]+]]) +// CHECK: store <4 x float> [[REG12]], <4 x float>* [[REG14:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x float> [[REG13]], <4 x float>* [[REG15:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x float> , <4 x float>* [[REG16:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG15]], align 16 +// CHECK-NEXT: [[REG19:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_xor(float vector[4], float vector[4])(<4 x float> [[REG18]], <4 x float> ) +// CHECK-NEXT: store <4 x float> [[REG19]], <4 x float>* [[REG17:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG20:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG14]], align 16 +// CHECK-NEXT: [[REG21:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG17]], align 16 +// CHECK-NEXT: [[REG22:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_add(float vector[4], float vector[4])(<4 x float> [[REG20]], <4 x float> [[REG21]]) +// CHECK-NEXT: ret <4 x float> [[REG22]] + +// CHECK: define available_externally <2 x double> @_mm_hadd_pd(<2 x double> [[REG23:[0-9a-zA-Z_%.]+]], <2 x double> [[REG24:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x double> [[REG23]], <2 x double>* [[REG25:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x double> [[REG24]], <2 x double>* [[REG26:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG25]], align 16 +// CHECK-NEXT: [[REG28:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG26]], align 16 +// CHECK-NEXT: [[REG29:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_mergeh(double vector[2], double vector[2])(<2 x double> [[REG27]], <2 x double> [[REG28]]) +// CHECK-NEXT: [[REG30:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG25]], align 16 +// CHECK-NEXT: [[REG31:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG26]], align 16 +// CHECK-NEXT: [[REG32:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_mergel(double vector[2], double vector[2])(<2 x double> [[REG30]], <2 x double> [[REG31]]) +// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_add(double vector[2], double vector[2])(<2 x double> [[REG29]], <2 x double> [[REG32]]) +// CHECK-NEXT: ret <2 x double> [[REG33]] + +// CHECK: define available_externally <4 x float> @_mm_hadd_ps(<4 x float> [[REG34:[0-9a-zA-Z_%.]+]], <4 x float> [[REG35:[0-9a-zA-Z_%.]+]]) +// CHECK: store <4 x float> [[REG34]], <4 x float>* [[REG36:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x float> [[REG35]], <4 x float>* [[REG37:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG38:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG39:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG40:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG36]], align 
16 +// CHECK-NEXT: [[REG41:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG37]], align 16 +// CHECK-NEXT: [[REG42:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG38:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG43:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG40]], <4 x float> [[REG41]], <16 x i8> [[REG42]]) +// CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG36]], align 16 +// CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG37]], align 16 +// CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG39:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG47:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG44]], <4 x float> [[REG45]], <16 x i8> [[REG46]]) +// CHECK-NEXT: [[REG48:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_add(float vector[4], float vector[4])(<4 x float> [[REG43]], <4 x float> [[REG47]]) +// CHECK-NEXT: ret <4 x float> [[REG48]] + +// CHECK: define available_externally <2 x double> @_mm_hsub_pd(<2 x double> [[REG49:[0-9a-zA-Z_%.]+]], <2 x double> [[REG50:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x double> [[REG49]], <2 x double>* [[REG51:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x double> [[REG50]], <2 x double>* [[REG52:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG53:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG51]], align 16 +// CHECK-NEXT: [[REG54:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG52]], align 16 +// CHECK-NEXT: [[REG55:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_mergeh(double vector[2], double vector[2])(<2 x double> [[REG53]], <2 x double> [[REG54]]) +// CHECK-NEXT: [[REG56:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG51]], align 16 +// CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG52]], align 16 +// CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_mergel(double vector[2], double vector[2])(<2 x double> [[REG56]], <2 x double> [[REG57]]) +// CHECK-NEXT: [[REG59:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_sub(double vector[2], double vector[2])(<2 x double> [[REG55]], <2 x double> [[REG58]]) +// CHECK-NEXT: ret <2 x double> [[REG59]] + +// CHECK: define available_externally <4 x float> @_mm_hsub_ps(<4 x float> [[REG60:[0-9a-zA-Z_%.]+]], <4 x float> [[REG61:[0-9a-zA-Z_%.]+]]) +// CHECK: store <4 x float> [[REG60]], <4 x float>* [[REG62:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x float> [[REG61]], <4 x float>* [[REG63:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG64:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG65:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG66:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG62]], align 16 +// CHECK-NEXT: [[REG67:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG63]], align 16 +// CHECK-NEXT: [[REG68:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG64]], align 16 +// CHECK-NEXT: [[REG69:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG66]], <4 x float> [[REG67]], <16 x i8> [[REG68]]) +// CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG62]], align 16 +// CHECK-NEXT: [[REG71:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG63]], align 16 +// CHECK-NEXT: [[REG72:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* 
[[REG65]], align 16 +// CHECK-NEXT: [[REG73:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG70]], <4 x float> [[REG71]], <16 x i8> [[REG72]]) +// CHECK-NEXT: [[REG74:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sub(float vector[4], float vector[4])(<4 x float> [[REG69]], <4 x float> [[REG73]]) +// CHECK-NEXT: ret <4 x float> [[REG74]] + +// CHECK: define available_externally <2 x i64> @_mm_lddqu_si128(<2 x i64>* [[REG75:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64>* [[REG75]], <2 x i64>** [[REG76:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG77:[0-9a-zA-Z_%.]+]] = load <2 x i64>*, <2 x i64>** [[REG76]], align 8 +// CHECK-NEXT: [[REG78:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64>* [[REG77]] to i32* +// CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_vsx_ld(int, int const*)(i32 signext 0, i32* [[REG78]]) +// CHECK-NEXT: [[REG80:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG79]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG80]] + +// CHECK: define available_externally <2 x double> @_mm_loaddup_pd(double* [[REG81:[0-9a-zA-Z_%.]+]]) +// CHECK: store double* [[REG81]], double** [[REG82:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG83:[0-9a-zA-Z_%.]+]] = load double*, double** [[REG82]], align 8 +// CHECK-NEXT: [[REG84:[0-9a-zA-Z_%.]+]] = load double, double* [[REG83]], align 8 +// CHECK-NEXT: [[REG85:[0-9a-zA-Z_%.]+]] = call <2 x double> @vec_splats(double)(double [[REG84]]) +// CHECK-NEXT: ret <2 x double> [[REG85]] + +// CHECK: define available_externally <2 x double> @_mm_movedup_pd(<2 x double> [[REG86:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x double> [[REG86]], <2 x double>* [[REG87:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG88:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG87]], align 16 +// CHECK-NEXT: [[REG89:[0-9a-zA-Z_%.]+]] = load <2 x double>, <2 x double>* [[REG87]], align 16 +// CHECK-NEXT: [[REG90:[0-9a-zA-Z_%.]+]] = call <2 x double> @_mm_shuffle_pd(<2 x double> [[REG88]], <2 x double> [[REG89]], i32 signext 0) +// CHECK-NEXT: ret <2 x double> [[REG90]] + +// CHECK: define available_externally <4 x float> @_mm_movehdup_ps(<4 x float> [[REG91:[0-9a-zA-Z_%.]+]]) +// CHECK: store <4 x float> [[REG91]], <4 x float>* [[REG92:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG93:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG92]], align 16 +// CHECK-NEXT: [[REG94:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG93]] to <4 x i32> +// CHECK-NEXT: [[REG95:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG92]], align 16 +// CHECK-NEXT: [[REG96:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG95]] to <4 x i32> +// CHECK-NEXT: [[REG97:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mergeo(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG94]], <4 x i32> [[REG96]]) +// CHECK-NEXT: [[REG98:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG97]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[REG98]] + +// CHECK: define available_externally <4 x float> @_mm_moveldup_ps(<4 x float> [[REG99:[0-9a-zA-Z_%.]+]]) +// CHECK: store <4 x float> [[REG99]], <4 x float>* [[REG100:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG101:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG100]], align 16 +// CHECK-NEXT: [[REG102:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG101]] to <4 x i32> +// CHECK-NEXT: [[REG103:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG100]], align 16 +// CHECK-NEXT: [[REG104:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG103]] to <4 x i32> +// CHECK-NEXT: 
[[REG105:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mergee(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG102]], <4 x i32> [[REG104]]) +// CHECK-NEXT: [[REG106:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG105]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[REG106]] diff --git a/clang/test/CodeGen/ppc-smmintrin.c b/clang/test/CodeGen/ppc-smmintrin.c new file mode 100644 index 000000000000..e11656225a3d --- /dev/null +++ b/clang/test/CodeGen/ppc-smmintrin.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: powerpc-registered-target + +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,LE +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,BE + +#include + +__m128i mi, m1, m2; + +void __attribute__((noinline)) +test_extract() { + _mm_extract_epi8(mi, 0); + _mm_extract_epi32(mi, 0); + _mm_extract_epi64(mi, 0); + _mm_extract_ps(mi, 0); +} + +// CHECK-LABEL: @test_extract + +// CHECK: define available_externally signext i32 @_mm_extract_epi8(<2 x i64> [[REG1:[0-9a-zA-Z_%.]+]], i32 signext [[REG2:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG1]], <2 x i64>* [[REG3:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store i32 [[REG2]], i32* [[REG4:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: [[REG5:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG3]], align 16 +// CHECK-NEXT: [[REG6:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG5]] to <16 x i8> +// CHECK-NEXT: [[REG7:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG4]], align 4 +// CHECK-NEXT: [[REG8:[0-9a-zA-Z_%.]+]] = and i32 [[REG7]], 15 +// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = extractelement <16 x i8> [[REG6]], i32 [[REG8]] +// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = zext i8 [[REG9]] to i32 +// CHECK-NEXT: ret i32 [[REG10]] + +// CHECK: define available_externally signext i32 @_mm_extract_epi32(<2 x i64> [[REG11:[0-9a-zA-Z_%.]+]], i32 signext [[REG12:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG11]], <2 x i64>* [[REG13:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store i32 [[REG12]], i32* [[REG14:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG13]], align 16 +// CHECK-NEXT: [[REG16:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG15]] to <4 x i32> +// CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG14]], align 4 +// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = and i32 [[REG17]], 3 +// CHECK-NEXT: [[REG19:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG16]], i32 [[REG18]] +// CHECK-NEXT: ret i32 [[REG19]] + +// CHECK: define available_externally signext i32 @_mm_extract_epi64(<2 x i64> [[REG20:[0-9a-zA-Z_%.]+]], i32 signext [[REG21:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG20]], <2 x i64>* [[REG22:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store i32 [[REG21]], i32* [[REG23:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: [[REG24:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG22]], align 16 +// CHECK-NEXT: [[REG25:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG23]], align 4 +// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = and i32 [[REG25]], 1 +// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG24]], i32 [[REG26]] +// CHECK-NEXT: 
[[REG28:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG27]] to i32 +// CHECK-NEXT: ret i32 [[REG28]] + +// CHECK: define available_externally signext i32 @_mm_extract_ps(<4 x float> [[REG29:[0-9a-zA-Z_%.]+]], i32 signext [[REG30:[0-9a-zA-Z_%.]+]]) +// CHECK: store <4 x float> [[REG29]], <4 x float>* [[REG31:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store i32 [[REG30]], i32* [[REG32:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG31]], align 16 +// CHECK-NEXT: [[REG34:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG33]] to <4 x i32> +// CHECK-NEXT: [[REG35:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG32]], align 4 +// CHECK-NEXT: [[REG36:[0-9a-zA-Z_%.]+]] = and i32 [[REG35]], 3 +// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG34]], i32 [[REG36]] +// CHECK-NEXT: ret i32 [[REG37]] + +void __attribute__((noinline)) +test_blend() { + _mm_blend_epi16(m1, m2, 0); + _mm_blendv_epi8(m1, m2, mi); +} + +// CHECK-LABEL: @test_blend + +// CHECK: define available_externally <2 x i64> @_mm_blend_epi16(<2 x i64> [[REG38:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG39:[0-9a-zA-Z_%.]+]], i32 signext [[REG40:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG38]], <2 x i64>* [[REG41:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG39]], <2 x i64>* [[REG42:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store i32 [[REG40]], i32* [[REG43:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG43]], align 4 +// CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG44]] to i8 +// CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(signed char)(i8 signext [[REG45]]) +// CHECK-NEXT: store <16 x i8> [[REG46]], <16 x i8>* [[REG47:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG48:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG47]], align 16 +// CHECK-NEXT: [[REG49:[0-9a-zA-Z_%.]+]] = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> [[REG48]]) +// CHECK-NEXT: store <16 x i8> [[REG49]], <16 x i8>* [[REG47]], align 16 +// CHECK-NEXT: [[REG50:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG47]], align 16 +// CHECK-NEXT: [[REG51:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> [[REG50]]) +// CHECK-NEXT: store <8 x i16> [[REG51]], <8 x i16>* [[REG52:[0-9a-zA-Z_%.]+]], align 16 + +// BE: [[REG53:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG52]], align 16 +// BE-NEXT: [[REG54:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_reve(unsigned short vector[8])(<8 x i16> [[REG53]]) +// BE-NEXT: store <8 x i16> [[REG54]], <8 x i16>* [[REG52]], align 16 + +// CHECK: [[REG55:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG41]], align 16 +// CHECK-NEXT: [[REG56:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG55]] to <8 x i16> +// CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG42]], align 16 +// CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG57]] to <8 x i16> +// CHECK-NEXT: [[REG59:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG52]], align 16 +// CHECK-NEXT: [[REG60:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])(<8 x i16> [[REG56]], <8 x i16> [[REG58]], <8 x i16> [[REG59]]) +// CHECK-NEXT: [[REG61:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG60]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG61]] + +// CHECK: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> [[REG62:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG63:[0-9a-zA-Z_%.]+]], <2 x i64> 
[[REG64:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG62]], <2 x i64>* [[REG65:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG63]], <2 x i64>* [[REG66:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG64]], <2 x i64>* [[REG67:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG68:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 zeroext 7) +// CHECK-NEXT: store <16 x i8> [[REG68]], <16 x i8>* [[REG69:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG67]], align 16 +// CHECK-NEXT: [[REG71:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG70]] to <16 x i8> +// CHECK-NEXT: [[REG72:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG69]], align 16 +// CHECK-NEXT: [[REG73:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG71]], <16 x i8> [[REG72]]) +// CHECK-NEXT: store <16 x i8> [[REG73]], <16 x i8>* [[REG74:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG75:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG65]], align 16 +// CHECK-NEXT: [[REG76:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG75]] to <16 x i8> +// CHECK-NEXT: [[REG77:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG66]], align 16 +// CHECK-NEXT: [[REG78:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG77]] to <16 x i8> +// CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG74]], align 16 +// CHECK-NEXT: [[REG80:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG76]], <16 x i8> [[REG78]], <16 x i8> [[REG79]]) +// CHECK-NEXT: [[REG81:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG80]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG81]] diff --git a/clang/test/CodeGen/ppc-tmmintrin.c b/clang/test/CodeGen/ppc-tmmintrin.c new file mode 100644 index 000000000000..61453c6744f1 --- /dev/null +++ b/clang/test/CodeGen/ppc-tmmintrin.c @@ -0,0 +1,1061 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: powerpc-registered-target + +// RUN: %clang -S -emit-llvm -target powerpc64-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-BE +// RUN: %clang -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-LE + +#include + +__m64 res, m1, m2; +__m128i resi, mi1, mi2; + +void __attribute__((noinline)) +test_abs() { + resi = _mm_abs_epi16(mi1); + resi = _mm_abs_epi32(mi1); + resi = _mm_abs_epi8(mi1); + res = _mm_abs_pi16(m1); + res = _mm_abs_pi32(m1); + res = _mm_abs_pi8(m1); +} + +// CHECK-LABEL: @test_abs + +// CHECK: define available_externally <2 x i64> @_mm_abs_epi16(<2 x i64> [[REG1:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG1]], <2 x i64>* [[REG2:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG3:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG2]], align 16 +// CHECK-NEXT: [[REG4:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG3]] to <8 x i16> +// CHECK-NEXT: [[REG5:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_abs(short vector[8])(<8 x i16> [[REG4]]) +// CHECK-NEXT: [[REG6:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG5]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG6]] + +// CHECK: define available_externally <2 x i64> @_mm_abs_epi32(<2 
x i64> [[REG7:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG7]], <2 x i64>* [[REG8:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG8]], align 16 +// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG9]] to <4 x i32> +// CHECK-NEXT: [[REG11:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_abs(int vector[4])(<4 x i32> [[REG10]]) +// CHECK-NEXT: [[REG12:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG11]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG12]] + +// CHECK: define available_externally <2 x i64> @_mm_abs_epi8(<2 x i64> [[REG13:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG13]], <2 x i64>* [[REG14:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG14]], align 16 +// CHECK-NEXT: [[REG16:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG15]] to <16 x i8> +// CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_abs(signed char vector[16])(<16 x i8> [[REG16]]) +// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG17]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG18]] + +// CHECK: define available_externally i64 @_mm_abs_pi16(i64 [[REG19:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG19]], i64* [[REG20:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG21:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG20]], align 8 +// CHECK-NEXT: [[REG22:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG21]], i32 0 +// CHECK-NEXT: [[REG23:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG20]], align 8 +// CHECK-NEXT: [[REG24:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG22]], i64 [[REG23]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG24]], <2 x i64>* [[REG25:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG25]], align 16 +// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG26]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG27]], <8 x i16>* [[REG28:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG29:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG28]], align 16 +// CHECK-NEXT: [[REG30:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_abs(short vector[8])(<8 x i16> [[REG29]]) +// CHECK-NEXT: [[REG31:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG30]] to <2 x i64> +// CHECK-NEXT: [[REG32:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG31]], i32 0 +// CHECK-NEXT: ret i64 [[REG32]] + +// CHECK: define available_externally i64 @_mm_abs_pi32(i64 [[REG33:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG33]], i64* [[REG34:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG35:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG34]], align 8 +// CHECK-NEXT: [[REG36:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG35]], i32 0 +// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG34]], align 8 +// CHECK-NEXT: [[REG38:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG36]], i64 [[REG37]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG38]], <2 x i64>* [[REG39:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG40:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG39]], align 16 +// CHECK-NEXT: [[REG41:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG40]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG41]], <4 x i32>* [[REG42:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG43:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG42]], align 16 +// CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_abs(int vector[4])(<4 x i32> [[REG43]]) +// CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG44]] to <2 x i64> +// 
CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG45]], i32 0 +// CHECK-NEXT: ret i64 [[REG46]] + +// CHECK: define available_externally i64 @_mm_abs_pi8(i64 [[REG47:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG47]], i64* [[REG48:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG49:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG48]], align 8 +// CHECK-NEXT: [[REG50:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG49]], i32 0 +// CHECK-NEXT: [[REG51:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG48]], align 8 +// CHECK-NEXT: [[REG52:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG50]], i64 [[REG51]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG52]], <2 x i64>* [[REG53:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG54:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG53]], align 16 +// CHECK-NEXT: [[REG55:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG54]] to <16 x i8> +// CHECK-NEXT: store <16 x i8> [[REG55]], <16 x i8>* [[REG56:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG56]], align 16 +// CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_abs(signed char vector[16])(<16 x i8> [[REG57]]) +// CHECK-NEXT: [[REG59:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG58]] to <2 x i64> +// CHECK-NEXT: [[REG60:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG59]], i32 0 +// CHECK-NEXT: ret i64 [[REG60]] + +void __attribute__((noinline)) +test_alignr() { + resi = _mm_alignr_epi8(mi1, mi2, 1U); + res = _mm_alignr_pi8(m1, m2, 1U); +} + +// CHECK-LABEL: @test_alignr + +// CHECK: define available_externally <2 x i64> @_mm_alignr_epi8(<2 x i64> [[REG61:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG62:[0-9a-zA-Z_%.]+]], i32 zeroext [[REG63:[0-9a-zA-Z_%.]+]]) +// CHECK: [[REG64:[0-9a-zA-Z_%.]+]] = alloca i32, align 4 +// CHECK: store <2 x i64> [[REG61]], <2 x i64>* [[REG65:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG62]], <2 x i64>* [[REG66:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store i32 [[REG63]], i32* [[REG64:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: br i1 false, label %[[REG67:[0-9a-zA-Z_%.]+]], label %[[REG68:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG67]]: +// CHECK-NEXT: [[REG69:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4 +// CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = icmp ult i32 [[REG69]], 16 +// CHECK-NEXT: br i1 [[REG70]], label %[[REG71:[0-9a-zA-Z_%.]+]], label %[[REG68:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG71]]: +// CHECK-BE-NEXT: load <2 x i64>, <2 x i64>* [[REG66]], align 16 +// CHECK-BE: call <16 x i8> @vec_sld(unsigned char vector[16], unsigned char vector[16], unsigned int) +// CHECK-LE-NEXT: load <2 x i64>, <2 x i64>* [[REG65]], align 16 +// CHECK-LE: call <16 x i8> @vec_reve(unsigned char vector[16]) +// CHECK-LE: call <16 x i8> @vec_reve(unsigned char vector[16]) +// CHECk-LE: call <16 x i8> @vec_sld(unsigned char vector[16], unsigned char vector[16], unsigned int) +// CHECK-LE: call <16 x i8> @vec_reve(unsigned char vector[16]) +// CHECK: br label %[[REG72:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG68]]: +// CHECK-NEXT: [[REG73:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4 +// CHECK-NEXT: [[REG74:[0-9a-zA-Z_%.]+]] = icmp eq i32 [[REG73]], 0 +// CHECK-NEXT: br i1 [[REG74]], label %[[REG75:[0-9a-zA-Z_%.]+]], label %[[REG76:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG75]]: +// CHECK-NEXT: [[REG77:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG66]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG77]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16 +// CHECK-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]] + +// 
CHECK: [[REG76]]: +// CHECK-NEXT: [[REG78:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4 +// CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = icmp uge i32 [[REG78]], 16 +// CHECK-NEXT: br i1 [[REG79]], label %[[REG80:[0-9a-zA-Z_%.]+]], label %[[REG81:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG80]]: +// CHECK-NEXT: [[REG82:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4 +// CHECK-NEXT: [[REG83:[0-9a-zA-Z_%.]+]] = icmp uge i32 [[REG82]], 32 +// CHECK-NEXT: br i1 [[REG83]], label %[[REG84:[0-9a-zA-Z_%.]+]], label %[[REG85:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG84]]: +// CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG86:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> zeroinitializer, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16 +// CHECK-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG85]]: +// CHECK-NEXT: [[REG87:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4 +// CHECK-NEXT: [[REG88:[0-9a-zA-Z_%.]+]] = sub i32 [[REG87]], 16 +// CHECK-NEXT: [[REG89:[0-9a-zA-Z_%.]+]] = mul i32 [[REG88]], 8 +// CHECK-NEXT: [[REG90:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG89]] to i8 +// CHECK-NEXT: [[REG91:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 zeroext [[REG90]]) +// CHECK-NEXT: store <16 x i8> [[REG91]], <16 x i8>* [[REG92:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG93:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG65]], align 16 +// CHECK-NEXT: [[REG94:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG93]] to <16 x i8> +// CHECK-NEXT: [[REG95:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG92]], align 16 +// CHECK-BE-NEXT: [[REG96:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16]) +// CHECK-LE-NEXT: [[REG96:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16]) +// CHECK-NEXT: [[REG97:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG96]] to <2 x i64> +// CHECK-NEXT: store <2 x i64> [[REG97]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16 +// CHECK-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG81]]: +// CHECK-NEXT: [[REG98:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4 +// CHECK-NEXT: [[REG99:[0-9a-zA-Z_%.]+]] = sub i32 16, [[REG98]] +// CHECK-NEXT: [[REG100:[0-9a-zA-Z_%.]+]] = mul i32 [[REG99]], 8 + +// CHECK-BE: [[REG101:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG100]] to i8 +// CHECK-BE: [[REG102:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 zeroext [[REG101]]) +// CHECK-BE: mul i32 {{[0-9a-zA-Z_%.]+}}, 8 +// CHECK-BE: call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16]) +// CHECK-BE: call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16]) +// CHECK-BE: [[REG103:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_or(unsigned char vector[16], unsigned char vector[16]) +// CHECK-BE-NEXT: [[REG104:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG103]] to <2 x i64> +// CHECK-BE-NEXT: store <2 x i64> [[REG104]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16 +// CHECK-BE-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]] + +// CHECK-LE: [[REG105:[0-9a-zA-Z_%.]+]] = mul i32 {{[0-9a-zA-Z_%.]+}}, 8 +// CHECK-LE-NEXT: trunc i32 [[REG105]] to i8 +// CHECK-LE: call <16 x i8> @vec_splats(unsigned char) +// CHECK-LE: call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16]) +// CHECK-LE: call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16]) +// CHECK-LE: [[REG106:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_or(unsigned char vector[16], unsigned char vector[16]) +// CHECK-LE-NEXT: [[REG107:[0-9a-zA-Z_%.]+]] = 
bitcast <16 x i8> [[REG106]] to <2 x i64> +// CHECK-LE-NEXT: store <2 x i64> [[REG107]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16 + +// CHECK: [[REG72]]: +// CHECK-NEXT: [[REG108:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16 +// CHECK-NEXT: ret <2 x i64> [[REG108]] + +// CHECK: define available_externally i64 @_mm_alignr_pi8(i64 [[REG109:[0-9a-zA-Z_%.]+]], i64 [[REG110:[0-9a-zA-Z_%.]+]], i32 zeroext [[REG111:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG109]], i64* [[REG112:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG110]], i64* [[REG113:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i32 [[REG111]], i32* [[REG114:[0-9a-zA-Z_%.]+]], align 4 +// CHECK-NEXT: [[REG115:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG114]], align 4 +// CHECK-NEXT: [[REG116:[0-9a-zA-Z_%.]+]] = icmp ult i32 [[REG115]], 16 +// CHECK-NEXT: br i1 [[REG116]], label %[[REG117:[0-9a-zA-Z_%.]+]], label %[[REG118:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG117]]: +// CHECK-NEXT: [[REG119:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG113]], align 8 +// CHECK-NEXT: [[REG120:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG119]], i32 0 +// CHECK-NEXT: [[REG121:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG112]], align 8 +// CHECK-NEXT: [[REG122:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG120]], i64 [[REG121]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG122]], <2 x i64>* [[REG123:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG124:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG114]], align 4 +// CHECK-NEXT: [[REG125:[0-9a-zA-Z_%.]+]] = shl i32 [[REG124]], 3 +// CHECK-BE-NEXT: [[REG126:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> , i32 [[REG125]], i32 3 +// CHECK-LE-NEXT: [[REG127:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> undef, i32 [[REG125]], i32 0 +// CHECK-LE-NEXT: [[REG128:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG127]], i32 0, i32 1 +// CHECK-LE-NEXT: [[REG129:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG128]], i32 0, i32 2 +// CHECK-LE-NEXT: [[REG126:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG129]], i32 0, i32 3 +// CHECK-NEXT: store <4 x i32> [[REG126]], <4 x i32>* [[REG130:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG131:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG123]], align 16 +// CHECK-NEXT: [[REG132:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG131]] to <16 x i8> +// CHECK-NEXT: [[REG133:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG130]], align 16 +// CHECK-NEXT: [[REG134:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG133]] to <16 x i8> +// CHECK-BE-NEXT: [[REG135:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG132]], <16 x i8> [[REG134]]) +// CHECK-LE-NEXT: [[REG135:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG132]], <16 x i8> [[REG134]]) +// CHECK-NEXT: [[REG136:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG135]] to <2 x i64> +// CHECK-NEXT: store <2 x i64> [[REG136]], <2 x i64>* [[REG123]], align 16 +// CHECK-NEXT: [[REG137:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG123]], align 16 +// CHECK-NEXT: [[REG138:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG137]], i32 0 +// CHECK-NEXT: store i64 [[REG138]], i64* [[REG139:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: br label %[[REG140:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG118]]: +// CHECK-NEXT: store i64 0, i64* [[REG141:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 0, i64* [[REG139]], align 8 +// CHECK-NEXT: br label %[[REG140:[0-9a-zA-Z_%.]+]] + +// CHECK: [[REG140]]: 
+// CHECK-NEXT: [[REG142:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG139]], align 8 +// CHECK-NEXT: ret i64 [[REG142]] + +void __attribute__((noinline)) +test_hadd() { + resi = _mm_hadd_epi16(mi1, mi2); + resi = _mm_hadd_epi32(mi1, mi2); + res = _mm_hadd_pi16(m1, m2); + res = _mm_hadd_pi32(m1, m2); + resi = _mm_hadds_epi16(mi1, mi2); + res = _mm_hadds_pi16(m1, m2); +} + +// CHECK-LABEL: @test_hadd + +// CHECK: define available_externally <2 x i64> @_mm_hadd_epi16(<2 x i64> [[REG143:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG144:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG143]], <2 x i64>* [[REG145:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG144]], <2 x i64>* [[REG146:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG147:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG148:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG149:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG145]], align 16 +// CHECK-NEXT: [[REG150:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG149]] to <8 x i16> +// CHECK-NEXT: [[REG151:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG146]], align 16 +// CHECK-NEXT: [[REG152:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG151]] to <8 x i16> +// CHECK-NEXT: [[REG153:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG150]], <8 x i16> [[REG152]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG153]], <8 x i16>* [[REG154:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG155:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG145]], align 16 +// CHECK-NEXT: [[REG156:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG155]] to <8 x i16> +// CHECK-NEXT: [[REG157:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG146]], align 16 +// CHECK-NEXT: [[REG158:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG157]] to <8 x i16> +// CHECK-NEXT: [[REG159:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG156]], <8 x i16> [[REG158]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG159]], <8 x i16>* [[REG160:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG161:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG154]], align 16 +// CHECK-NEXT: [[REG162:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG160]], align 16 +// CHECK-NEXT: [[REG163:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_add(short vector[8], short vector[8])(<8 x i16> [[REG161]], <8 x i16> [[REG162]]) +// CHECK-NEXT: [[REG164:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG163]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG164]] + +// CHECK: define available_externally <2 x i64> @_mm_hadd_epi32(<2 x i64> [[REG165:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG166:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG165]], <2 x i64>* [[REG167:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG166]], <2 x i64>* [[REG168:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG169:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG170:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG171:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG167]], align 16 +// CHECK-NEXT: [[REG172:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG171]] to <4 x i32> +// CHECK-NEXT: [[REG173:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG168]], align 16 +// CHECK-NEXT: [[REG174:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG173]] to <4 x i32> +// CHECK-NEXT: [[REG175:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int 
vector[4], unsigned char vector[16])(<4 x i32> [[REG172]], <4 x i32> [[REG174]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG175]], <4 x i32>* [[REG176:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG177:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG167]], align 16 +// CHECK-NEXT: [[REG178:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG177]] to <4 x i32> +// CHECK-NEXT: [[REG179:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG168]], align 16 +// CHECK-NEXT: [[REG180:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG179]] to <4 x i32> +// CHECK-NEXT: [[REG181:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG178]], <4 x i32> [[REG180]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG181]], <4 x i32>* [[REG182:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG183:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG176]], align 16 +// CHECK-NEXT: [[REG184:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG182]], align 16 +// CHECK-NEXT: [[REG185:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG183]], <4 x i32> [[REG184]]) +// CHECK-NEXT: [[REG186:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG185]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG186]] + +// CHECK: define available_externally i64 @_mm_hadd_pi16(i64 [[REG187:[0-9a-zA-Z_%.]+]], i64 [[REG188:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG187]], i64* [[REG189:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG188]], i64* [[REG190:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG191:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG189]], align 8 +// CHECK-NEXT: [[REG192:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG191]], i32 0 +// CHECK-NEXT: [[REG193:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG190]], align 8 +// CHECK-NEXT: [[REG194:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG192]], i64 [[REG193]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG194]], <2 x i64>* [[REG195:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG196:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG195]], align 16 +// CHECK-NEXT: [[REG197:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG196]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG197]], <8 x i16>* [[REG198:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG199:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG200:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG201:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG202:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG203:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG201]], <8 x i16> [[REG202]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG203]], <8 x i16>* [[REG204:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG205:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG206:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG207:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG205]], <8 x i16> [[REG206]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG207]], <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG208:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG209:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG204]], align 16 +// 
CHECK-NEXT: [[REG210:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_add(short vector[8], short vector[8])(<8 x i16> [[REG208]], <8 x i16> [[REG209]]) +// CHECK-NEXT: store <8 x i16> [[REG210]], <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG211:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16 +// CHECK-NEXT: [[REG212:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG211]] to <2 x i64> +// CHECK-NEXT: [[REG213:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG212]], i32 1 +// CHECK-NEXT: ret i64 [[REG213]] + +// CHECK: define available_externally i64 @_mm_hadd_pi32(i64 [[REG214:[0-9a-zA-Z_%.]+]], i64 [[REG215:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG214]], i64* [[REG216:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG215]], i64* [[REG217:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG218:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG216]], align 8 +// CHECK-NEXT: [[REG219:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG218]], i32 0 +// CHECK-NEXT: [[REG220:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG217]], align 8 +// CHECK-NEXT: [[REG221:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG219]], i64 [[REG220]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG221]], <2 x i64>* [[REG222:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG223:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG222]], align 16 +// CHECK-NEXT: [[REG224:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG223]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG224]], <4 x i32>* [[REG225:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG226:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG227:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG228:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG229:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG230:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG228]], <4 x i32> [[REG229]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG230]], <4 x i32>* [[REG231:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG232:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG233:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG234:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG232]], <4 x i32> [[REG233]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG234]], <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG235:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG236:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG231]], align 16 +// CHECK-NEXT: [[REG237:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG235]], <4 x i32> [[REG236]]) +// CHECK-NEXT: store <4 x i32> [[REG237]], <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG238:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16 +// CHECK-NEXT: [[REG239:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG238]] to <2 x i64> +// CHECK-NEXT: [[REG240:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG239]], i32 1 +// CHECK-NEXT: ret i64 [[REG240]] + +// CHECK: define available_externally <2 x i64> @_mm_hadds_epi16(<2 x i64> [[REG241:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG242:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG241]], <2 x i64>* [[REG243:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store 
<2 x i64> [[REG242]], <2 x i64>* [[REG244:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG245:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG246:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG247:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG243]], align 16 +// CHECK-NEXT: [[REG248:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG247]] to <8 x i16> +// CHECK-NEXT: [[REG249:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG245]], align 16 +// CHECK-NEXT: [[REG250:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(short vector[8], int vector[4])(<8 x i16> [[REG248]], <4 x i32> [[REG249]]) +// CHECK-NEXT: store <4 x i32> [[REG250]], <4 x i32>* [[REG245]], align 16 +// CHECK-NEXT: [[REG251:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG244]], align 16 +// CHECK-NEXT: [[REG252:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG251]] to <8 x i16> +// CHECK-NEXT: [[REG253:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG246]], align 16 +// CHECK-NEXT: [[REG254:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(short vector[8], int vector[4])(<8 x i16> [[REG252]], <4 x i32> [[REG253]]) +// CHECK-NEXT: store <4 x i32> [[REG254]], <4 x i32>* [[REG246]], align 16 +// CHECK-NEXT: [[REG255:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG245]], align 16 +// CHECK-NEXT: [[REG256:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG246]], align 16 +// CHECK-NEXT: [[REG257:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_packs(int vector[4], int vector[4])(<4 x i32> [[REG255]], <4 x i32> [[REG256]]) +// CHECK-NEXT: [[REG258:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG257]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG258]], <4 x i32>* [[REG245]], align 16 +// CHECK-NEXT: [[REG259:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG245]], align 16 +// CHECK-NEXT: [[REG260:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG259]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG260]] + +// CHECK: define available_externally i64 @_mm_hadds_pi16(i64 [[REG261:[0-9a-zA-Z_%.]+]], i64 [[REG262:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG261]], i64* [[REG263:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG262]], i64* [[REG264:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG265:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG266:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG263]], align 8 +// CHECK-NEXT: [[REG267:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG266]], i32 0 +// CHECK-NEXT: [[REG268:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG264]], align 8 +// CHECK-NEXT: [[REG269:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG267]], i64 [[REG268]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG269]], <2 x i64>* [[REG270:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG271:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG270]], align 16 +// CHECK-NEXT: [[REG272:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG271]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG272]], <8 x i16>* [[REG273:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG274:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG273]], align 16 +// CHECK-NEXT: [[REG275:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(short vector[8], int vector[4])(<8 x i16> [[REG274]], <4 x i32> zeroinitializer) +// CHECK-NEXT: store <4 x i32> [[REG275]], <4 x i32>* [[REG276:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG277:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG276]], align 16 +// CHECK-NEXT: [[REG278:[0-9a-zA-Z_%.]+]] = load <4 
x i32>, <4 x i32>* [[REG276]], align 16 +// CHECK-NEXT: [[REG279:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_packs(int vector[4], int vector[4])(<4 x i32> [[REG277]], <4 x i32> [[REG278]]) +// CHECK-NEXT: store <8 x i16> [[REG279]], <8 x i16>* [[REG273]], align 16 +// CHECK-NEXT: [[REG280:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG273]], align 16 +// CHECK-NEXT: [[REG281:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG280]] to <2 x i64> +// CHECK-NEXT: [[REG282:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG281]], i32 1 +// CHECK-NEXT: ret i64 [[REG282]] + +void __attribute__((noinline)) +test_hsub() { + resi = _mm_hsub_epi16(mi1, mi2); + resi = _mm_hsub_epi32(mi1, mi2); + res = _mm_hsub_pi16(m1, m2); + res = _mm_hsub_pi32(m1, m2); + resi = _mm_hsubs_epi16(mi1, mi2); + res = _mm_hsubs_pi16(m1, m2); +} + +// CHECK-LABEL: @test_hsub + +// CHECK: define available_externally <2 x i64> @_mm_hsub_epi16(<2 x i64> [[REG283:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG284:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG283]], <2 x i64>* [[REG285:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG284]], <2 x i64>* [[REG286:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG287:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG288:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG289:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG285]], align 16 +// CHECK-NEXT: [[REG290:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG289]] to <8 x i16> +// CHECK-NEXT: [[REG291:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG286]], align 16 +// CHECK-NEXT: [[REG292:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG291]] to <8 x i16> +// CHECK-NEXT: [[REG293:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG290]], <8 x i16> [[REG292]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG293]], <8 x i16>* [[REG294:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG295:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG285]], align 16 +// CHECK-NEXT: [[REG296:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG295]] to <8 x i16> +// CHECK-NEXT: [[REG297:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG286]], align 16 +// CHECK-NEXT: [[REG298:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG297]] to <8 x i16> +// CHECK-NEXT: [[REG299:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG296]], <8 x i16> [[REG298]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG299]], <8 x i16>* [[REG300:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG301:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG294]], align 16 +// CHECK-NEXT: [[REG302:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG300]], align 16 +// CHECK-NEXT: [[REG303:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sub(short vector[8], short vector[8])(<8 x i16> [[REG301]], <8 x i16> [[REG302]]) +// CHECK-NEXT: [[REG304:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG303]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG304]] + +// CHECK: define available_externally <2 x i64> @_mm_hsub_epi32(<2 x i64> [[REG305:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG306:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG305]], <2 x i64>* [[REG307:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG306]], <2 x i64>* [[REG308:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG309:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG310:[0-9a-zA-Z_%.]+]], align 16 
+// CHECK-NEXT: [[REG311:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG307]], align 16 +// CHECK-NEXT: [[REG312:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG311]] to <4 x i32> +// CHECK-NEXT: [[REG313:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG308]], align 16 +// CHECK-NEXT: [[REG314:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG313]] to <4 x i32> +// CHECK-NEXT: [[REG315:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG312]], <4 x i32> [[REG314]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG315]], <4 x i32>* [[REG316:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG317:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG307]], align 16 +// CHECK-NEXT: [[REG318:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG317]] to <4 x i32> +// CHECK-NEXT: [[REG319:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG308]], align 16 +// CHECK-NEXT: [[REG320:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG319]] to <4 x i32> +// CHECK-NEXT: [[REG321:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG318]], <4 x i32> [[REG320]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG321]], <4 x i32>* [[REG322:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG323:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG316]], align 16 +// CHECK-NEXT: [[REG324:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG322]], align 16 +// CHECK-NEXT: [[REG325:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sub(int vector[4], int vector[4])(<4 x i32> [[REG323]], <4 x i32> [[REG324]]) +// CHECK-NEXT: [[REG326:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG325]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG326]] + +// CHECK: define available_externally i64 @_mm_hsub_pi16(i64 [[REG327:[0-9a-zA-Z_%.]+]], i64 [[REG328:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG327]], i64* [[REG329:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG328]], i64* [[REG330:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG331:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG332:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG333:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG329]], align 8 +// CHECK-NEXT: [[REG334:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG333]], i32 0 +// CHECK-NEXT: [[REG335:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG330]], align 8 +// CHECK-NEXT: [[REG336:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG334]], i64 [[REG335]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG336]], <2 x i64>* [[REG337:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG338:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG337]], align 16 +// CHECK-NEXT: [[REG339:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG338]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG339]], <8 x i16>* [[REG340:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG341:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG342:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG343:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG341]], <8 x i16> [[REG342]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG343]], <8 x i16>* [[REG344:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG345:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG346:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16 +// 
CHECK-NEXT: [[REG347:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG345]], <8 x i16> [[REG346]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG347]], <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG348:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG349:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG344]], align 16 +// CHECK-NEXT: [[REG350:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sub(short vector[8], short vector[8])(<8 x i16> [[REG348]], <8 x i16> [[REG349]]) +// CHECK-NEXT: store <8 x i16> [[REG350]], <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG351:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16 +// CHECK-NEXT: [[REG352:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG351]] to <2 x i64> +// CHECK-NEXT: [[REG353:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG352]], i32 1 +// CHECK-NEXT: ret i64 [[REG353]] + +// CHECK: define available_externally i64 @_mm_hsub_pi32(i64 [[REG354:[0-9a-zA-Z_%.]+]], i64 [[REG355:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG354]], i64* [[REG356:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG355]], i64* [[REG357:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG358:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG359:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG360:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG356]], align 8 +// CHECK-NEXT: [[REG361:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG360]], i32 0 +// CHECK-NEXT: [[REG362:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG357]], align 8 +// CHECK-NEXT: [[REG363:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG361]], i64 [[REG362]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG363]], <2 x i64>* [[REG364:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG365:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG364]], align 16 +// CHECK-NEXT: [[REG366:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG365]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG366]], <4 x i32>* [[REG367:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG368:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG369:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG370:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG368]], <4 x i32> [[REG369]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG370]], <4 x i32>* [[REG371:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG372:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG373:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG374:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG372]], <4 x i32> [[REG373]], <16 x i8> ) +// CHECK-NEXT: store <4 x i32> [[REG374]], <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG375:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG376:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG371]], align 16 +// CHECK-NEXT: [[REG377:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sub(int vector[4], int vector[4])(<4 x i32> [[REG375]], <4 x i32> [[REG376]]) +// CHECK-NEXT: store <4 x i32> [[REG377]], <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: [[REG378:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16 +// CHECK-NEXT: 
[[REG379:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG378]] to <2 x i64> +// CHECK-NEXT: [[REG380:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG379]], i32 1 +// CHECK-NEXT: ret i64 [[REG380]] + +// CHECK: define available_externally <2 x i64> @_mm_hsubs_epi16(<2 x i64> [[REG381:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG382:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG381]], <2 x i64>* [[REG383:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG382]], <2 x i64>* [[REG384:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG385:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG386:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG387:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG383]], align 16 +// CHECK-NEXT: [[REG388:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG387]] to <8 x i16> +// CHECK-NEXT: [[REG389:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG384]], align 16 +// CHECK-NEXT: [[REG390:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG389]] to <8 x i16> +// CHECK-NEXT: [[REG391:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG388]], <8 x i16> [[REG390]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG391]], <8 x i16>* [[REG392:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG393:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG383]], align 16 +// CHECK-NEXT: [[REG394:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG393]] to <8 x i16> +// CHECK-NEXT: [[REG395:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG384]], align 16 +// CHECK-NEXT: [[REG396:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG395]] to <8 x i16> +// CHECK-NEXT: [[REG397:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG394]], <8 x i16> [[REG396]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG397]], <8 x i16>* [[REG398:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG399:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG392]], align 16 +// CHECK-NEXT: [[REG400:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG398]], align 16 +// CHECK-NEXT: [[REG401:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_subs(short vector[8], short vector[8])(<8 x i16> [[REG399]], <8 x i16> [[REG400]]) +// CHECK-NEXT: [[REG402:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG401]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG402]] + +// CHECK: define available_externally i64 @_mm_hsubs_pi16(i64 [[REG403:[0-9a-zA-Z_%.]+]], i64 [[REG404:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG403]], i64* [[REG405:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG404]], i64* [[REG406:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG407:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG408:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG409:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG405]], align 8 +// CHECK-NEXT: [[REG410:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG409]], i32 0 +// CHECK-NEXT: [[REG411:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG406]], align 8 +// CHECK-NEXT: [[REG412:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG410]], i64 [[REG411]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG412]], <2 x i64>* [[REG413:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG414:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG413]], align 16 +// CHECK-NEXT: [[REG415:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG414]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG415]], <8 x i16>* 
[[REG416:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG417:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16 +// CHECK-NEXT: [[REG418:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16 +// CHECK-NEXT: [[REG419:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG417]], <8 x i16> [[REG418]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG419]], <8 x i16>* [[REG420:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG421:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16 +// CHECK-NEXT: [[REG422:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16 +// CHECK-NEXT: [[REG423:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG421]], <8 x i16> [[REG422]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG423]], <8 x i16>* [[REG424:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG425:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG420]], align 16 +// CHECK-NEXT: [[REG426:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG424]], align 16 +// CHECK-NEXT: [[REG427:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_subs(short vector[8], short vector[8])(<8 x i16> [[REG425]], <8 x i16> [[REG426]]) +// CHECK-NEXT: store <8 x i16> [[REG427]], <8 x i16>* [[REG416]], align 16 +// CHECK-NEXT: [[REG428:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16 +// CHECK-NEXT: [[REG429:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG428]] to <2 x i64> +// CHECK-NEXT: [[REG430:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG429]], i32 1 +// CHECK-NEXT: ret i64 [[REG430]] + +void __attribute__((noinline)) +test_shuffle() { + resi = _mm_shuffle_epi8(mi1, mi2); + res = _mm_shuffle_pi8(m1, m2); +} + +// CHECK-LABEL: @test_shuffle + +// CHECK: define available_externally <2 x i64> @_mm_shuffle_epi8(<2 x i64> [[REG431:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG432:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG431]], <2 x i64>* [[REG433:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG432]], <2 x i64>* [[REG434:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG435:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG436:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG434]], align 16 +// CHECK-NEXT: [[REG437:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG436]] to <16 x i8> +// CHECK-NEXT: [[REG438:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG437]], <16 x i8> zeroinitializer) +// CHECK-NEXT: store <16 x i8> [[REG438]], <16 x i8>* [[REG439:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG440:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG433]], align 16 +// CHECK-NEXT: [[REG441:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG440]] to <16 x i8> +// CHECK-NEXT: [[REG442:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG433]], align 16 +// CHECK-NEXT: [[REG443:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG442]] to <16 x i8> +// CHECK-NEXT: [[REG444:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG434]], align 16 +// CHECK-NEXT: [[REG445:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG444]] to <16 x i8> +// CHECK-NEXT: [[REG446:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_perm(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> [[REG441]], <16 x i8> [[REG443]], <16 x i8> [[REG445]]) +// CHECK-NEXT: store <16 x i8> [[REG446]], <16 x i8>* [[REG447:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: 
[[REG448:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG447]], align 16 +// CHECK-NEXT: [[REG449:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG439]], align 16 +// CHECK-NEXT: [[REG450:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], bool vector[16])(<16 x i8> [[REG448]], <16 x i8> zeroinitializer, <16 x i8> [[REG449]]) +// CHECK-NEXT: [[REG451:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG450]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG451]] + +// CHECK: define available_externally i64 @_mm_shuffle_pi8(i64 [[REG452:[0-9a-zA-Z_%.]+]], i64 [[REG453:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG452]], i64* [[REG454:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG453]], i64* [[REG455:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG456:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG457:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG454]], align 8 +// CHECK-NEXT: [[REG458:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG457]], i32 0 +// CHECK-NEXT: [[REG459:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG454]], align 8 +// CHECK-NEXT: [[REG460:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG458]], i64 [[REG459]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG460]], <2 x i64>* [[REG461:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG462:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG461]], align 16 +// CHECK-NEXT: [[REG463:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG462]] to <16 x i8> +// CHECK-NEXT: store <16 x i8> [[REG463]], <16 x i8>* [[REG464:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG465:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG455]], align 8 +// CHECK-NEXT: [[REG466:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG465]], i32 0 +// CHECK-NEXT: [[REG467:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG455]], align 8 +// CHECK-NEXT: [[REG468:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG466]], i64 [[REG467]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG468]], <2 x i64>* [[REG469:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG470:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG469]], align 16 +// CHECK-NEXT: [[REG471:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG470]] to <16 x i8> +// CHECK-NEXT: store <16 x i8> [[REG471]], <16 x i8>* [[REG472:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG473:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG472]], align 16 +// CHECK-NEXT: [[REG474:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG473]], <16 x i8> zeroinitializer) +// CHECK-NEXT: store <16 x i8> [[REG474]], <16 x i8>* [[REG475:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG476:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16 +// CHECK-NEXT: [[REG477:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16 +// CHECK-NEXT: [[REG478:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG472]], align 16 +// CHECK-NEXT: [[REG479:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_perm(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> [[REG476]], <16 x i8> [[REG477]], <16 x i8> [[REG478]]) +// CHECK-NEXT: store <16 x i8> [[REG479]], <16 x i8>* [[REG464]], align 16 +// CHECK-NEXT: [[REG480:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16 +// CHECK-NEXT: [[REG481:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG475]], align 16 +// CHECK-NEXT: [[REG482:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], bool 
vector[16])(<16 x i8> [[REG480]], <16 x i8> zeroinitializer, <16 x i8> [[REG481]]) +// CHECK-NEXT: store <16 x i8> [[REG482]], <16 x i8>* [[REG464]], align 16 +// CHECK-NEXT: [[REG483:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16 +// CHECK-NEXT: [[REG484:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG483]] to <2 x i64> +// CHECK-NEXT: [[REG485:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG484]], i32 0 +// CHECK-NEXT: ret i64 [[REG485]] + +void __attribute__((noinline)) +test_sign() { + resi = _mm_sign_epi8(mi1, mi2); + resi = _mm_sign_epi16(mi1, mi2); + resi = _mm_sign_epi32(mi1, mi2); + res = _mm_sign_pi8(m1, m2); + res = _mm_sign_pi16(m1, m2); + res = _mm_sign_pi32(m1, m2); +} + +// CHECK-LABEL: @test_sign + +// CHECK: define available_externally <2 x i64> @_mm_sign_epi8(<2 x i64> [[REG486:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG487:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG486]], <2 x i64>* [[REG488:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG487]], <2 x i64>* [[REG489:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG490:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG491:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG489]], align 16 +// CHECK-NEXT: [[REG492:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG491]] to <16 x i8> +// CHECK-NEXT: [[REG493:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG492]], <16 x i8> zeroinitializer) +// CHECK-NEXT: store <16 x i8> [[REG493]], <16 x i8>* [[REG494:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG495:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG489]], align 16 +// CHECK-NEXT: [[REG496:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG495]] to <16 x i8> +// CHECK-NEXT: [[REG497:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmpgt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG496]], <16 x i8> zeroinitializer) +// CHECK-NEXT: [[REG498:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_neg(signed char vector[16])(<16 x i8> [[REG497]]) +// CHECK-NEXT: store <16 x i8> [[REG498]], <16 x i8>* [[REG499:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG500:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG494]], align 16 +// CHECK-NEXT: [[REG501:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG499]], align 16 +// CHECK-NEXT: [[REG502:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_add(signed char vector[16], signed char vector[16])(<16 x i8> [[REG500]], <16 x i8> [[REG501]]) +// CHECK-NEXT: store <16 x i8> [[REG502]], <16 x i8>* [[REG503:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG504:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG488]], align 16 +// CHECK-NEXT: [[REG505:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG504]] to <16 x i8> +// CHECK-NEXT: [[REG506:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG503]], align 16 +// CHECK-NEXT: [[REG507:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_mul(signed char vector[16], signed char vector[16])(<16 x i8> [[REG505]], <16 x i8> [[REG506]]) +// CHECK-NEXT: [[REG508:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG507]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG508]] + +// CHECK: define available_externally <2 x i64> @_mm_sign_epi16(<2 x i64> [[REG509:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG510:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG509]], <2 x i64>* [[REG511:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG510]], <2 x i64>* [[REG512:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* 
[[REG513:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG514:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG512]], align 16 +// CHECK-NEXT: [[REG515:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG514]] to <8 x i16> +// CHECK-NEXT: [[REG516:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_cmplt(short vector[8], short vector[8])(<8 x i16> [[REG515]], <8 x i16> zeroinitializer) +// CHECK-NEXT: store <8 x i16> [[REG516]], <8 x i16>* [[REG517:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG518:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG512]], align 16 +// CHECK-NEXT: [[REG519:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG518]] to <8 x i16> +// CHECK-NEXT: [[REG520:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_cmpgt(short vector[8], short vector[8])(<8 x i16> [[REG519]], <8 x i16> zeroinitializer) +// CHECK-NEXT: [[REG521:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_neg(short vector[8])(<8 x i16> [[REG520]]) +// CHECK-NEXT: store <8 x i16> [[REG521]], <8 x i16>* [[REG522:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG523:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG517]], align 16 +// CHECK-NEXT: [[REG524:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG522]], align 16 +// CHECK-NEXT: [[REG525:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_add(short vector[8], short vector[8])(<8 x i16> [[REG523]], <8 x i16> [[REG524]]) +// CHECK-NEXT: store <8 x i16> [[REG525]], <8 x i16>* [[REG526:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG527:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG511]], align 16 +// CHECK-NEXT: [[REG528:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG527]] to <8 x i16> +// CHECK-NEXT: [[REG529:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG526]], align 16 +// CHECK-NEXT: [[REG530:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG528]], <8 x i16> [[REG529]]) +// CHECK-NEXT: [[REG531:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG530]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG531]] + +// CHECK: define available_externally <2 x i64> @_mm_sign_epi32(<2 x i64> [[REG532:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG533:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG532]], <2 x i64>* [[REG534:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG533]], <2 x i64>* [[REG535:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG536:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG537:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG535]], align 16 +// CHECK-NEXT: [[REG538:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG537]] to <4 x i32> +// CHECK-NEXT: [[REG539:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmplt(int vector[4], int vector[4])(<4 x i32> [[REG538]], <4 x i32> zeroinitializer) +// CHECK-NEXT: store <4 x i32> [[REG539]], <4 x i32>* [[REG540:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG541:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG535]], align 16 +// CHECK-NEXT: [[REG542:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG541]] to <4 x i32> +// CHECK-NEXT: [[REG543:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(int vector[4], int vector[4])(<4 x i32> [[REG542]], <4 x i32> zeroinitializer) +// CHECK-NEXT: [[REG544:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_neg(int vector[4])(<4 x i32> [[REG543]]) +// CHECK-NEXT: store <4 x i32> [[REG544]], <4 x i32>* [[REG545:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG546:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG540]], align 16 +// CHECK-NEXT: [[REG547:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG545]], align 16 +// CHECK-NEXT: 
[[REG548:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG546]], <4 x i32> [[REG547]]) +// CHECK-NEXT: store <4 x i32> [[REG548]], <4 x i32>* [[REG549:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG550:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG534]], align 16 +// CHECK-NEXT: [[REG551:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG550]] to <4 x i32> +// CHECK-NEXT: [[REG552:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG549]], align 16 +// CHECK-NEXT: [[REG553:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG551]], <4 x i32> [[REG552]]) +// CHECK-NEXT: [[REG554:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG553]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG554]] + +// CHECK: define available_externally i64 @_mm_sign_pi8(i64 [[REG555:[0-9a-zA-Z_%.]+]], i64 [[REG556:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG555]], i64* [[REG557:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG556]], i64* [[REG558:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG559:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG560:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG557]], align 8 +// CHECK-NEXT: [[REG561:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG560]], i32 0 +// CHECK-NEXT: [[REG562:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG557]], align 8 +// CHECK-NEXT: [[REG563:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG561]], i64 [[REG562]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG563]], <2 x i64>* [[REG564:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG565:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG564]], align 16 +// CHECK-NEXT: [[REG566:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG565]] to <16 x i8> +// CHECK-NEXT: store <16 x i8> [[REG566]], <16 x i8>* [[REG567:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG568:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG558]], align 8 +// CHECK-NEXT: [[REG569:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG568]], i32 0 +// CHECK-NEXT: [[REG570:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG558]], align 8 +// CHECK-NEXT: [[REG571:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG569]], i64 [[REG570]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG571]], <2 x i64>* [[REG572:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG573:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG572]], align 16 +// CHECK-NEXT: [[REG574:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG573]] to <16 x i8> +// CHECK-NEXT: store <16 x i8> [[REG574]], <16 x i8>* [[REG575:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG576:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG567]], align 16 +// CHECK-NEXT: [[REG577:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG576]] to <2 x i64> +// CHECK-NEXT: [[REG578:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG575]], align 16 +// CHECK-NEXT: [[REG579:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG578]] to <2 x i64> +// CHECK-NEXT: [[REG580:[0-9a-zA-Z_%.]+]] = call <2 x i64> @_mm_sign_epi8(<2 x i64> [[REG577]], <2 x i64> [[REG579]]) +// CHECK-NEXT: [[REG581:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG580]] to <16 x i8> +// CHECK-NEXT: store <16 x i8> [[REG581]], <16 x i8>* [[REG567]], align 16 +// CHECK-NEXT: [[REG582:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG567]], align 16 +// CHECK-NEXT: [[REG583:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG582]] to <2 x i64> +// CHECK-NEXT: [[REG584:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG583]], i32 0 +// CHECK-NEXT: ret i64 [[REG584]] + +// CHECK: define 
available_externally i64 @_mm_sign_pi16(i64 [[REG585:[0-9a-zA-Z_%.]+]], i64 [[REG586:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG585]], i64* [[REG587:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG586]], i64* [[REG588:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* [[REG589:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG590:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG587]], align 8 +// CHECK-NEXT: [[REG591:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG590]], i32 0 +// CHECK-NEXT: [[REG592:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG587]], align 8 +// CHECK-NEXT: [[REG593:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG591]], i64 [[REG592]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG593]], <2 x i64>* [[REG594:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG595:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG594]], align 16 +// CHECK-NEXT: [[REG596:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG595]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG596]], <8 x i16>* [[REG597:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG598:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG588]], align 8 +// CHECK-NEXT: [[REG599:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG598]], i32 0 +// CHECK-NEXT: [[REG600:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG588]], align 8 +// CHECK-NEXT: [[REG601:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG599]], i64 [[REG600]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG601]], <2 x i64>* [[REG602:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG603:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG602]], align 16 +// CHECK-NEXT: [[REG604:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG603]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG604]], <8 x i16>* [[REG605:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG606:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG597]], align 16 +// CHECK-NEXT: [[REG607:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG606]] to <2 x i64> +// CHECK-NEXT: [[REG608:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG605]], align 16 +// CHECK-NEXT: [[REG609:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG608]] to <2 x i64> +// CHECK-NEXT: [[REG610:[0-9a-zA-Z_%.]+]] = call <2 x i64> @_mm_sign_epi16(<2 x i64> [[REG607]], <2 x i64> [[REG609]]) +// CHECK-NEXT: [[REG611:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG610]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG611]], <8 x i16>* [[REG597]], align 16 +// CHECK-NEXT: [[REG612:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG597]], align 16 +// CHECK-NEXT: [[REG613:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG612]] to <2 x i64> +// CHECK-NEXT: [[REG614:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG613]], i32 0 +// CHECK-NEXT: ret i64 [[REG614]] + +// CHECK: define available_externally i64 @_mm_sign_pi32(i64 [[REG615:[0-9a-zA-Z_%.]+]], i64 [[REG616:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG615]], i64* [[REG617:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG616]], i64* [[REG618:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG619:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG620:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG617]], align 8 +// CHECK-NEXT: [[REG621:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG620]], i32 0 +// CHECK-NEXT: [[REG622:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG617]], align 8 +// CHECK-NEXT: [[REG623:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG621]], i64 [[REG622]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG623]], <2 x i64>* 
[[REG624:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG625:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG624]], align 16 +// CHECK-NEXT: [[REG626:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG625]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG626]], <4 x i32>* [[REG627:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG628:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG618]], align 8 +// CHECK-NEXT: [[REG629:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG628]], i32 0 +// CHECK-NEXT: [[REG630:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG618]], align 8 +// CHECK-NEXT: [[REG631:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG629]], i64 [[REG630]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG631]], <2 x i64>* [[REG632:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG633:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG632]], align 16 +// CHECK-NEXT: [[REG634:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG633]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG634]], <4 x i32>* [[REG635:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG636:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG627]], align 16 +// CHECK-NEXT: [[REG637:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG636]] to <2 x i64> +// CHECK-NEXT: [[REG638:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG635]], align 16 +// CHECK-NEXT: [[REG639:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG638]] to <2 x i64> +// CHECK-NEXT: [[REG640:[0-9a-zA-Z_%.]+]] = call <2 x i64> @_mm_sign_epi32(<2 x i64> [[REG637]], <2 x i64> [[REG639]]) +// CHECK-NEXT: [[REG641:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG640]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG641]], <4 x i32>* [[REG627]], align 16 +// CHECK-NEXT: [[REG642:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG627]], align 16 +// CHECK-NEXT: [[REG643:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG642]] to <2 x i64> +// CHECK-NEXT: [[REG644:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG643]], i32 0 +// CHECK-NEXT: ret i64 [[REG644]] + +void __attribute__((noinline)) +test_maddubs() { + resi = _mm_maddubs_epi16(mi1, mi2); + res = _mm_maddubs_pi16(m1, m2); +} + +// CHECK-LABEL: @test_maddubs + +// CHECK: define available_externally <2 x i64> @_mm_maddubs_epi16(<2 x i64> [[REG645:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG646:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG645]], <2 x i64>* [[REG647:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG646]], <2 x i64>* [[REG648:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG649:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_splats(short)(i16 signext 255) +// CHECK-NEXT: store <8 x i16> [[REG649]], <8 x i16>* [[REG650:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG651:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG647]], align 16 +// CHECK-NEXT: [[REG652:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG651]] to <16 x i8> +// CHECK-NEXT: [[REG653:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> [[REG652]]) +// CHECK-NEXT: [[REG654:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG650]], align 16 +// CHECK-NEXT: [[REG655:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_and(short vector[8], short vector[8])(<8 x i16> [[REG653]], <8 x i16> [[REG654]]) +// CHECK-NEXT: store <8 x i16> [[REG655]], <8 x i16>* [[REG656:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG657:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG647]], align 16 +// CHECK-NEXT: [[REG658:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG657]] to <16 x i8> +// CHECK-NEXT: [[REG659:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char 
vector[16])(<16 x i8> [[REG658]]) +// CHECK-NEXT: [[REG660:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG650]], align 16 +// CHECK-NEXT: [[REG661:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_and(short vector[8], short vector[8])(<8 x i16> [[REG659]], <8 x i16> [[REG660]]) +// CHECK-NEXT: store <8 x i16> [[REG661]], <8 x i16>* [[REG662:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG663:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG648]], align 16 +// CHECK-NEXT: [[REG664:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG663]] to <16 x i8> +// CHECK-NEXT: [[REG76:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> [[REG664]]) +// CHECK-NEXT: store <8 x i16> [[REG76]], <8 x i16>* [[REG665:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG666:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG648]], align 16 +// CHECK-NEXT: [[REG667:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG666]] to <16 x i8> +// CHECK-NEXT: [[REG668:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG667]]) +// CHECK-NEXT: store <8 x i16> [[REG668]], <8 x i16>* [[REG669:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG670:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG656]], align 16 +// CHECK-NEXT: [[REG671:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG665]], align 16 +// CHECK-NEXT: [[REG672:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG670]], <8 x i16> [[REG671]]) +// CHECK-NEXT: store <8 x i16> [[REG672]], <8 x i16>* [[REG656]], align 16 +// CHECK-NEXT: [[REG673:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG662]], align 16 +// CHECK-NEXT: [[REG674:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG669]], align 16 +// CHECK-NEXT: [[REG675:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG673]], <8 x i16> [[REG674]]) +// CHECK-NEXT: store <8 x i16> [[REG675]], <8 x i16>* [[REG662]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG676:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG677:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG678:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG656]], align 16 +// CHECK-NEXT: [[REG679:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG662]], align 16 +// CHECK-NEXT: [[REG680:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG678]], <8 x i16> [[REG679]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG680]], <8 x i16>* [[REG665]], align 16 +// CHECK-NEXT: [[REG681:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG656]], align 16 +// CHECK-NEXT: [[REG682:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG662]], align 16 +// CHECK-NEXT: [[REG683:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG681]], <8 x i16> [[REG682]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG683]], <8 x i16>* [[REG669]], align 16 +// CHECK-NEXT: [[REG684:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG665]], align 16 +// CHECK-NEXT: [[REG685:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG669]], align 16 +// CHECK-NEXT: [[REG686:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_adds(short vector[8], short vector[8])(<8 x i16> [[REG684]], <8 x i16> [[REG685]]) +// CHECK-NEXT: [[REG687:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG686]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG687]] + +// CHECK: define available_externally i64 @_mm_maddubs_pi16(i64 
[[REG688:[0-9a-zA-Z_%.]+]], i64 [[REG689:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG688]], i64* [[REG690:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG689]], i64* [[REG691:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG692:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG690]], align 8 +// CHECK-NEXT: [[REG75:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG692]], i32 0 +// CHECK-NEXT: [[REG693:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG690]], align 8 +// CHECK-NEXT: [[REG694:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG75]], i64 [[REG693]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG694]], <2 x i64>* [[REG80:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG695:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG80]], align 16 +// CHECK-NEXT: [[REG84:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG695]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG84]], <8 x i16>* [[REG696:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG697:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG698:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG697]] to <16 x i8> +// CHECK-NEXT: [[REG699:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG698]]) +// CHECK-NEXT: store <8 x i16> [[REG699]], <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG700:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_splats(short)(i16 signext 255) +// CHECK-NEXT: store <8 x i16> [[REG700]], <8 x i16>* [[REG701:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG702:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG703:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG701]], align 16 +// CHECK-NEXT: [[REG704:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_and(short vector[8], short vector[8])(<8 x i16> [[REG702]], <8 x i16> [[REG703]]) +// CHECK-NEXT: store <8 x i16> [[REG704]], <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG705:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG691]], align 8 +// CHECK-NEXT: [[REG706:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG705]], i32 0 +// CHECK-NEXT: [[REG707:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG691]], align 8 +// CHECK-NEXT: [[REG708:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG706]], i64 [[REG707]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG708]], <2 x i64>* [[REG709:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG710:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG709]], align 16 +// CHECK-NEXT: [[REG711:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG710]] to <8 x i16> +// CHECK-NEXT: store <8 x i16> [[REG711]], <8 x i16>* [[REG712:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG713:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG714:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG713]] to <16 x i8> +// CHECK-NEXT: [[REG715:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG714]]) +// CHECK-NEXT: store <8 x i16> [[REG715]], <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG716:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG717:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG718:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG716]], <8 x i16> [[REG717]]) +// CHECK-NEXT: store <8 x i16> [[REG718]], <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* [[REG719:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <16 x i8> , <16 x i8>* 
[[REG720:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG721:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG722:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG723:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG721]], <8 x i16> [[REG722]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG723]], <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG724:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG725:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG726:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG724]], <8 x i16> [[REG725]], <16 x i8> ) +// CHECK-NEXT: store <8 x i16> [[REG726]], <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG727:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG728:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16 +// CHECK-NEXT: [[REG729:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_adds(short vector[8], short vector[8])(<8 x i16> [[REG727]], <8 x i16> [[REG728]]) +// CHECK-NEXT: store <8 x i16> [[REG729]], <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG730:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16 +// CHECK-NEXT: [[REG731:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG730]] to <2 x i64> +// CHECK-NEXT: [[REG732:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG731]], i32 0 +// CHECK-NEXT: ret i64 [[REG732]] + +void __attribute__((noinline)) +test_mulhrs() { + resi = _mm_mulhrs_epi16(mi1, mi2); + res = _mm_mulhrs_pi16(m1, m2); +} + +// CHECK-LABEL: @test_mulhrs + +// CHECK: define available_externally <2 x i64> @_mm_mulhrs_epi16(<2 x i64> [[REG733:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG734:[0-9a-zA-Z_%.]+]]) +// CHECK: store <2 x i64> [[REG733]], <2 x i64>* [[REG735:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: store <2 x i64> [[REG734]], <2 x i64>* [[REG736:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG737:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG735]], align 16 +// CHECK-NEXT: [[REG738:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG737]] to <8 x i16> +// CHECK-NEXT: [[REG739:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG738]]) +// CHECK-NEXT: store <4 x i32> [[REG739]], <4 x i32>* [[REG740:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG741:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG736]], align 16 +// CHECK-NEXT: [[REG742:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG741]] to <8 x i16> +// CHECK-NEXT: [[REG743:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG742]]) +// CHECK-NEXT: store <4 x i32> [[REG743]], <4 x i32>* [[REG744:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG745:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG746:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG747:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG745]], <4 x i32> [[REG746]]) +// CHECK-NEXT: store <4 x i32> [[REG747]], <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG748:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG735]], align 16 +// CHECK-NEXT: [[REG749:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG748]] to <8 x i16> +// CHECK-NEXT: [[REG750:[0-9a-zA-Z_%.]+]] = call <4 x i32> 
@vec_unpackl(short vector[8])(<8 x i16> [[REG749]]) +// CHECK-NEXT: store <4 x i32> [[REG750]], <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG751:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG736]], align 16 +// CHECK-NEXT: [[REG752:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG751]] to <8 x i16> +// CHECK-NEXT: [[REG753:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackl(short vector[8])(<8 x i16> [[REG752]]) +// CHECK-NEXT: store <4 x i32> [[REG753]], <4 x i32>* [[REG754:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG755:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG756:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG754]], align 16 +// CHECK-NEXT: [[REG757:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG755]], <4 x i32> [[REG756]]) +// CHECK-NEXT: store <4 x i32> [[REG757]], <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG758:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(unsigned int)(i32 zeroext 14) +// CHECK-NEXT: store <4 x i32> [[REG758]], <4 x i32>* [[REG759:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG760:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG761:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG759]], align 16 +// CHECK-NEXT: [[REG762:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG760]], <4 x i32> [[REG761]]) +// CHECK-NEXT: store <4 x i32> [[REG762]], <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG763:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG764:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG759]], align 16 +// CHECK-NEXT: [[REG765:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG763]], <4 x i32> [[REG764]]) +// CHECK-NEXT: store <4 x i32> [[REG765]], <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG766:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(int)(i32 signext 1) +// CHECK-NEXT: store <4 x i32> [[REG766]], <4 x i32>* [[REG767:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG768:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG769:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16 +// CHECK-NEXT: [[REG770:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG768]], <4 x i32> [[REG769]]) +// CHECK-NEXT: store <4 x i32> [[REG770]], <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG771:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG772:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16 +// CHECK-NEXT: [[REG773:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG771]], <4 x i32> [[REG772]]) +// CHECK-NEXT: store <4 x i32> [[REG773]], <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG774:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG775:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16 +// CHECK-NEXT: [[REG776:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG774]], <4 x i32> [[REG775]]) +// CHECK-NEXT: store <4 x i32> [[REG776]], <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG777:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG778:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16 +// CHECK-NEXT: 
[[REG779:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG777]], <4 x i32> [[REG778]]) +// CHECK-NEXT: store <4 x i32> [[REG779]], <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG780:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16 +// CHECK-NEXT: [[REG781:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16 +// CHECK-NEXT: [[REG782:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_pack(int vector[4], int vector[4])(<4 x i32> [[REG780]], <4 x i32> [[REG781]]) +// CHECK-NEXT: [[REG783:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG782]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[REG783]] + +// CHECK: define available_externally i64 @_mm_mulhrs_pi16(i64 [[REG784:[0-9a-zA-Z_%.]+]], i64 [[REG785:[0-9a-zA-Z_%.]+]]) +// CHECK: store i64 [[REG784]], i64* [[REG786:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: store i64 [[REG785]], i64* [[REG787:[0-9a-zA-Z_%.]+]], align 8 +// CHECK-NEXT: [[REG788:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG786]], align 8 +// CHECK-NEXT: [[REG789:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG788]], i32 0 +// CHECK-NEXT: [[REG790:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG786]], align 8 +// CHECK-NEXT: [[REG791:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG789]], i64 [[REG790]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG791]], <2 x i64>* [[REG792:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG793:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG792]], align 16 +// CHECK-NEXT: [[REG794:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG793]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG794]], <4 x i32>* [[REG795:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG796:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16 +// CHECK-NEXT: [[REG797:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG796]] to <8 x i16> +// CHECK-NEXT: [[REG798:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG797]]) +// CHECK-NEXT: store <4 x i32> [[REG798]], <4 x i32>* [[REG795]], align 16 +// CHECK-NEXT: [[REG799:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG787]], align 8 +// CHECK-NEXT: [[REG800:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG799]], i32 0 +// CHECK-NEXT: [[REG801:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG787]], align 8 +// CHECK-NEXT: [[REG802:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG800]], i64 [[REG801]], i32 1 +// CHECK-NEXT: store <2 x i64> [[REG802]], <2 x i64>* [[REG803:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG804:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG803]], align 16 +// CHECK-NEXT: [[REG805:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG804]] to <4 x i32> +// CHECK-NEXT: store <4 x i32> [[REG805]], <4 x i32>* [[REG806:[0-9a-zA-Z_%.]+]], align 16 +// CHECK-NEXT: [[REG807:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG806]], align 16 +// CHECK-NEXT: [[REG808:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG807]] to <8 x i16> +// CHECK-NEXT: [[REG809:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG808]]) +// CHECK-NEXT: store <4 x i32> [[REG809]], <4 x i32>* [[REG806]], align 16 +// CHECK-NEXT: [[REG810:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16 +// CHECK-NEXT: [[REG811:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG806]], align 16 +// CHECK-NEXT: [[REG812:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG810]], <4 x i32> [[REG811]]) +// CHECK-NEXT: store <4 x i32> [[REG812]], <4 x i32>* [[REG795]], align 16 +// CHECK-NEXT: 
[[REG813:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(unsigned int)(i32 zeroext 14)
+// CHECK-NEXT: store <4 x i32> [[REG813]], <4 x i32>* [[REG814:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG815:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG816:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG814]], align 16
+// CHECK-NEXT: [[REG817:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG815]], <4 x i32> [[REG816]])
+// CHECK-NEXT: store <4 x i32> [[REG817]], <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG818:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(int)(i32 signext 1)
+// CHECK-NEXT: store <4 x i32> [[REG818]], <4 x i32>* [[REG819:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG820:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG821:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG819]], align 16
+// CHECK-NEXT: [[REG822:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG820]], <4 x i32> [[REG821]])
+// CHECK-NEXT: store <4 x i32> [[REG822]], <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG823:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG824:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG819]], align 16
+// CHECK-NEXT: [[REG825:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG823]], <4 x i32> [[REG824]])
+// CHECK-NEXT: store <4 x i32> [[REG825]], <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG826:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
+// CHECK-NEXT: [[REG827:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG806]], align 16
+// CHECK-NEXT: [[REG828:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_pack(int vector[4], int vector[4])(<4 x i32> [[REG826]], <4 x i32> [[REG827]])
+// CHECK-NEXT: store <8 x i16> [[REG828]], <8 x i16>* [[REG829:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG830:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG829]], align 16
+// CHECK-NEXT: [[REG831:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG830]] to <2 x i64>
+// CHECK-NEXT: [[REG832:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG831]], i32 0
+// CHECK-NEXT: ret i64 [[REG832]]
diff --git a/clang/test/Headers/ppc-intrinsics.c b/clang/test/Headers/ppc-intrinsics.c
new file mode 100644
index 000000000000..3b3753312e2f
--- /dev/null
+++ b/clang/test/Headers/ppc-intrinsics.c
@@ -0,0 +1,28 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+// expected-no-diagnostics
+
+// Don't include mm_malloc.h, it's system specific.
+#define _MM_MALLOC_H_INCLUDED
+
+// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr8 -target powerpc64-unknown-linux-gnu %s -Xclang -verify
+// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr8 -target powerpc64-unknown-linux-gnu %s -Xclang -verify -x c++
+
+// Since mm_malloc.h references system native stdlib.h, doing cross-compile
+// testing may cause unexpected problems. This would affect xmmintrin.h and
+// other following intrinsics headers. If there's need to test them using
+// cross-compile, please add -ffreestanding to compiler options, like
+// test/CodeGen/ppc-xmmintrin.c.
+
+// RUN: not %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
+
+#include <mmintrin.h>
+
+// Altivec must be enabled.
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+
+// CHECK-ERROR: {{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
diff --git a/clang/test/Headers/ppc-mmx-intrinsics.c b/clang/test/Headers/ppc-mmx-intrinsics.c
deleted file mode 100644
index 406694d1ad9a..000000000000
--- a/clang/test/Headers/ppc-mmx-intrinsics.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// REQUIRES: powerpc-registered-target
-
-// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr7 -target powerpc64-unknown-linux-gnu %s -Xclang -verify
-// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr7 -target powerpc64-unknown-linux-gnu %s -Xclang -verify -x c++
-// expected-no-diagnostics
-
-// RUN: not %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
-
-#include <mmintrin.h>
-// CHECK-ERROR: mmintrin.h:{{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
diff --git a/clang/test/Headers/ppc-sse-intrinsics.c b/clang/test/Headers/ppc-sse-intrinsics.c
deleted file mode 100644
index 91906f0b08c8..000000000000
--- a/clang/test/Headers/ppc-sse-intrinsics.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// REQUIRES: powerpc-registered-target
-
-// Since mm_malloc.h references system native stdlib.h, doing cross-compile
-// testing may cause unexpected problems. This would affect xmmintrin.h and
-// other following intrinsics headers. If there's need to test them using
-// cross-compile, please add -ffreestanding to compiler options, like
-// test/CodeGen/ppc-xmmintrin.c.
-
-// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm -DNO_WARN_X86_INTRINSICS %s -mcpu=pwr7 -Xclang -verify
-// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm -DNO_WARN_X86_INTRINSICS %s -mcpu=pwr7 -Xclang -verify -x c++
-// expected-no-diagnostics
-
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -mcpu=pwr7 -o /dev/null 2>&1 | FileCheck %s -check-prefix=SSE-ERROR
-
-// Don't include mm_malloc.h, it's system specific.
-#define _MM_MALLOC_H_INCLUDED
-
-// Altivec must be enabled.
-#include <xmmintrin.h>
-
-// SSE-ERROR: xmmintrin.h:{{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
diff --git a/clang/test/Headers/ppc-sse2-intrinsics.c b/clang/test/Headers/ppc-sse2-intrinsics.c
deleted file mode 100644
index f1581dd08bea..000000000000
--- a/clang/test/Headers/ppc-sse2-intrinsics.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// REQUIRES: powerpc-registered-target
-
-// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s -Xclang -verify
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s -Xclang -verify -x c++
-// expected-no-diagnostics
-
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -mcpu=pwr8 -o /dev/null 2>&1 | FileCheck %s -check-prefix=SSE2-ERROR
-
-// Don't include mm_malloc.h, it's system specific.
-#define _MM_MALLOC_H_INCLUDED
-
-#include <emmintrin.h>
-// SSE2-ERROR: xmmintrin.h:{{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
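Reader note (not part of the patch): the autogenerated CHECK lines for _mm_maddubs_epi16 above verify a widen/mask, multiply, permute, saturating-add sequence built from vec_splats, vec_unpackh/vec_unpackl, vec_and, vec_mul, vec_perm and vec_adds. The C sketch below is reconstructed from those CHECK lines only, to make the sequence easier to follow; it is not the verbatim ppc_wrappers/tmmintrin.h source. The function name, the typedefs, and in particular the two vec_perm byte masks are assumptions (the literal mask constants are not visible in this excerpt; the values shown select even- and odd-indexed halfword products in register element order).

/* Hedged sketch of the _mm_maddubs_epi16 sequence the test checks.
   Build with e.g. clang -mcpu=pwr8 on powerpc64/powerpc64le Linux. */
#include <altivec.h>

typedef __vector signed short  v8hi;
typedef __vector signed char   v16qi;
typedef __vector unsigned char v16qu;
typedef __vector long long     v2di;   /* stands in for __m128i here */

static inline v2di
maddubs_epi16_sketch (v2di __A, v2di __B)
{
  /* 0x00FF in every 16-bit lane, used to zero-extend the unsigned bytes. */
  v8hi __mask = vec_splats ((signed short) 0x00FF);
  v8hi __C = vec_and (vec_unpackh ((v16qi) __A), __mask);
  v8hi __D = vec_and (vec_unpackl ((v16qi) __A), __mask);
  /* The second operand's bytes are sign-extended. */
  v8hi __E = vec_unpackh ((v16qi) __B);
  v8hi __F = vec_unpackl ((v16qi) __B);
  /* 16 partial 16-bit products. */
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  /* Gather even- and odd-indexed products (mask values assumed). */
  const v16qu __even = {  0,  1,  4,  5,  8,  9, 12, 13,
                         16, 17, 20, 21, 24, 25, 28, 29 };
  const v16qu __odd  = {  2,  3,  6,  7, 10, 11, 14, 15,
                         18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __even);
  __F = vec_perm (__C, __D, __odd);
  /* Saturating add of neighbouring products, matching pmaddubsw. */
  return (v2di) vec_adds (__E, __F);
}

The permute-then-saturating-add step is what recreates the pairwise behaviour of pmaddubsw, which has no single VMX/VSX equivalent; the _mm_maddubs_pi16 CHECK block follows the same pattern on a 64-bit value splatted into both doublewords, so it is not sketched separately.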
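Similarly, the _mm_mulhrs_epi16 CHECK lines verify a rounded high-half multiply: widen both operands to 32 bits, multiply, shift right by 14, add 1, shift right by 1, then pack back to 16 bits, i.e. roughly (a*b + 0x4000) >> 15. The sketch below mirrors that call sequence as checked; it is a reconstruction, not the header's own code, and the names are placeholders.

/* Hedged sketch of the _mm_mulhrs_epi16 sequence the test checks. */
#include <altivec.h>

typedef __vector signed short v8hi;
typedef __vector signed int   v4si;
typedef __vector unsigned int v4su;
typedef __vector long long    v2di;   /* stands in for __m128i here */

static inline v2di
mulhrs_epi16_sketch (v2di __A, v2di __B)
{
  /* Widen to 32-bit lanes and form full products. */
  v4si __C = vec_mul (vec_unpackh ((v8hi) __A), vec_unpackh ((v8hi) __B));
  v4si __D = vec_mul (vec_unpackl ((v8hi) __A), vec_unpackl ((v8hi) __B));
  /* Round to the top 16 bits: >> 14, + 1, >> 1, as in the CHECK lines. */
  v4su __fourteen = vec_splats ((unsigned int) 14);
  v4si __one      = vec_splats ((signed int) 1);
  __C = vec_sr (vec_add (vec_sr (__C, __fourteen), __one), (v4su) __one);
  __D = vec_sr (vec_add (vec_sr (__D, __fourteen), __one), (v4su) __one);
  /* Narrow back to eight 16-bit results. */
  return (v2di) vec_pack (__C, __D);
}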
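As a usage note, code ported with these wrappers is compiled the way the RUN lines above show. A hypothetical example (file and function names invented for illustration):

/* port.c -- hypothetical example, not part of the patch. */
#include <tmmintrin.h>   /* pulls in the SSE2/SSE wrappers it depends on */

__m128i weighted_pairs (__m128i bytes, __m128i weights)
{
  /* Multiply unsigned bytes by signed weights and add adjacent pairs
     with signed saturation, as on x86. */
  return _mm_maddubs_epi16 (bytes, weights);
}

Compile roughly as the RUN lines do, e.g. clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr8 -target powerpc64-unknown-linux-gnu port.c. Omitting -DNO_WARN_X86_INTRINSICS triggers the #error the CHECK-ERROR lines expect; when cross-compiling without target system headers, adding -ffreestanding or pre-defining _MM_MALLOC_H_INCLUDED avoids the mm_malloc.h/stdlib.h dependency, as the test comments above note.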