[X86] Replace valignd/q builtins with appropriate __builtin_shufflevector.

llvm-svn: 287733
Craig Topper 2016-11-23 01:47:12 +00:00
parent ca1c5e0fb6
commit 6aefe00ccf
6 changed files with 119 additions and 102 deletions
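For context, a minimal sketch (not part of the commit) of the equivalence the new header macros rely on: valignd/valignq read a window of elements from the concatenation of the second source and the first source, starting at the immediate, and __builtin_shufflevector expresses the same selection with constant indices ((imm) & (NumElts-1)) + k, leaving the backend free to re-select valignd/valignq or a cheaper shuffle. The helper name rotate_down_epi32 and the use of clang's internal __v4si typedef below are illustrative assumptions, not code from this change.

#include <immintrin.h>

// Sketch only: equivalent to _mm_alignr_epi32(a, b, 1) as defined after this
// change -- elements 1..4 of the 8-element concatenation {b, a}.
static inline __m128i rotate_down_epi32(__m128i a, __m128i b) {
  return (__m128i)__builtin_shufflevector((__v4si)b, (__v4si)a, 1, 2, 3, 4);
}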

@@ -996,12 +996,6 @@ TARGET_BUILTIN(__builtin_ia32_vpermt2vard512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varps512_mask, "V16fV16iV16fV16fUs", "", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_vpermt2varpd512_mask, "V8dV8LLiV8dV8dUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_alignq512_mask, "V8LLiV8LLiV8LLiIiV8LLiUc", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_alignd512_mask, "V16iV16iV16iIiV16iUs", "", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_alignd128_mask, "V4iV4iV4iIiV4iUc","","avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignd256_mask, "V8iV8iV8iIiV8iUc","","avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq128_mask, "V2LLiV2LLiV2LLiIiV2LLiUc","","avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq256_mask, "V4LLiV4LLiV4LLiIiV4LLiUc","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2ddC*V2LLiUci","","avx512vl")
 TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2LLiV2LLiLLiC*V2LLiUci","","avx512vl")

@@ -3416,40 +3416,56 @@ _mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
 }
 
 #define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), (int)(I), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)-1); })
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
+                                   (__v8di)(__m512i)(A), \
+                                   ((int)(I) & 0x7) + 0, \
+                                   ((int)(I) & 0x7) + 1, \
+                                   ((int)(I) & 0x7) + 2, \
+                                   ((int)(I) & 0x7) + 3, \
+                                   ((int)(I) & 0x7) + 4, \
+                                   ((int)(I) & 0x7) + 5, \
+                                   ((int)(I) & 0x7) + 6, \
+                                   ((int)(I) & 0x7) + 7); })
 
 #define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
-  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), (int)(imm), \
-                                         (__v8di)(__m512i)(W), \
-                                         (__mmask8)(U)); })
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)(__m512i)(W)); })
 
 #define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
-  (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \
-                                         (__v8di)(__m512i)(B), (int)(imm), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)); })
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)_mm512_setzero_si512()); })
 
 #define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), (int)(I), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)-1); })
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
+                                   (__v16si)(__m512i)(A), \
+                                   ((int)(I) & 0xf) + 0, \
+                                   ((int)(I) & 0xf) + 1, \
+                                   ((int)(I) & 0xf) + 2, \
+                                   ((int)(I) & 0xf) + 3, \
+                                   ((int)(I) & 0xf) + 4, \
+                                   ((int)(I) & 0xf) + 5, \
+                                   ((int)(I) & 0xf) + 6, \
+                                   ((int)(I) & 0xf) + 7, \
+                                   ((int)(I) & 0xf) + 8, \
+                                   ((int)(I) & 0xf) + 9, \
+                                   ((int)(I) & 0xf) + 10, \
+                                   ((int)(I) & 0xf) + 11, \
+                                   ((int)(I) & 0xf) + 12, \
+                                   ((int)(I) & 0xf) + 13, \
+                                   ((int)(I) & 0xf) + 14, \
+                                   ((int)(I) & 0xf) + 15); })
 
 #define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), (int)(imm), \
-                                         (__v16si)(__m512i)(W), \
-                                         (__mmask16)(U)); })
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)(__m512i)(W)); })
 
 #define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
-  (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \
-                                         (__v16si)(__m512i)(B), (int)(imm), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)); })
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)_mm512_setzero_si512()); })
 
 /* Vector Extract */
 #define _mm512_extractf64x4_pd(A, I) __extension__ ({ \

@@ -8638,76 +8638,78 @@ _mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
 }
 
 #define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
-                                         (__v4si)(__m128i)(B), (int)(imm), \
-                                         (__v4si)_mm_undefined_si128(), \
-                                         (__mmask8)-1); })
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \
+                                   (__v4si)(__m128i)(A), \
+                                   ((int)(imm) & 0x3) + 0, \
+                                   ((int)(imm) & 0x3) + 1, \
+                                   ((int)(imm) & 0x3) + 2, \
+                                   ((int)(imm) & 0x3) + 3); })
 
 #define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
-                                         (__v4si)(__m128i)(B), (int)(imm), \
-                                         (__v4si)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)(__m128i)(W)); })
 
 #define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(A), \
-                                         (__v4si)(__m128i)(B), (int)(imm), \
-                                         (__v4si)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)_mm_setzero_si128()); })
 
 #define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
-                                         (__v8si)(__m256i)(B), (int)(imm), \
-                                         (__v8si)_mm256_undefined_si256(), \
-                                         (__mmask8)-1); })
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \
+                                   (__v8si)(__m256i)(A), \
+                                   ((int)(imm) & 0x7) + 0, \
+                                   ((int)(imm) & 0x7) + 1, \
+                                   ((int)(imm) & 0x7) + 2, \
+                                   ((int)(imm) & 0x7) + 3, \
+                                   ((int)(imm) & 0x7) + 4, \
+                                   ((int)(imm) & 0x7) + 5, \
+                                   ((int)(imm) & 0x7) + 6, \
+                                   ((int)(imm) & 0x7) + 7); })
 
 #define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
-                                         (__v8si)(__m256i)(B), (int)(imm), \
-                                         (__v8si)(__m256i)(W), \
-                                         (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)(__m256i)(W)); })
 
 #define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(A), \
-                                         (__v8si)(__m256i)(B), (int)(imm), \
-                                         (__v8si)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)_mm256_setzero_si256()); })
 
 #define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
-                                         (__v2di)(__m128i)(B), (int)(imm), \
-                                         (__v2di)_mm_setzero_di(), \
-                                         (__mmask8)-1); })
+  (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \
+                                   (__v2di)(__m128i)(A), \
+                                   ((int)(imm) & 0x1) + 0, \
+                                   ((int)(imm) & 0x1) + 1); })
 
 #define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
-                                         (__v2di)(__m128i)(B), (int)(imm), \
-                                         (__v2di)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)(__m128i)(W)); })
 
 #define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(A), \
-                                         (__v2di)(__m128i)(B), (int)(imm), \
-                                         (__v2di)_mm_setzero_di(), \
-                                         (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)_mm_setzero_di()); })
 
 #define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
-                                         (__v4di)(__m256i)(B), (int)(imm), \
-                                         (__v4di)_mm256_undefined_pd(), \
-                                         (__mmask8)-1); })
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \
+                                   (__v4di)(__m256i)(A), \
+                                   ((int)(imm) & 0x3) + 0, \
+                                   ((int)(imm) & 0x3) + 1, \
+                                   ((int)(imm) & 0x3) + 2, \
+                                   ((int)(imm) & 0x3) + 3); })
 
 #define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
-                                         (__v4di)(__m256i)(B), (int)(imm), \
-                                         (__v4di)(__m256i)(W), \
-                                         (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)(__m256i)(W)); })
 
 #define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(A), \
-                                         (__v4di)(__m256i)(B), (int)(imm), \
-                                         (__v4di)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)_mm256_setzero_si256()); })
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)

@@ -2155,12 +2155,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
   case X86::BI__builtin_ia32_palignr512_mask:
-  case X86::BI__builtin_ia32_alignq512_mask:
-  case X86::BI__builtin_ia32_alignd512_mask:
-  case X86::BI__builtin_ia32_alignd128_mask:
-  case X86::BI__builtin_ia32_alignd256_mask:
-  case X86::BI__builtin_ia32_alignq128_mask:
-  case X86::BI__builtin_ia32_alignq256_mask:
   case X86::BI__builtin_ia32_vcomisd:
   case X86::BI__builtin_ia32_vcomiss:
   case X86::BI__builtin_ia32_shuf_f32x4_mask:

@@ -393,42 +393,46 @@ __mmask16 test_mm512_knot(__mmask16 a)
 __m512i test_mm512_alignr_epi32(__m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   return _mm512_alignr_epi32(a, b, 2);
 }
 
 __m512i test_mm512_mask_alignr_epi32(__m512i w, __mmask16 u, __m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_mask_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> {{.*}}
   return _mm512_mask_alignr_epi32(w, u, a, b, 2);
 }
 
 __m512i test_mm512_maskz_alignr_epi32( __mmask16 u, __m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_maskz_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> {{.*}}
   return _mm512_maskz_alignr_epi32(u, a, b, 2);
 }
 
 __m512i test_mm512_alignr_epi64(__m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   return _mm512_alignr_epi64(a, b, 2);
 }
 
 __m512i test_mm512_mask_alignr_epi64(__m512i w, __mmask8 u, __m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_mask_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> {{.*}}
   return _mm512_mask_alignr_epi64(w, u, a, b, 2);
 }
 
 __m512i test_mm512_maskz_alignr_epi64( __mmask8 u, __m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_maskz_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> {{.*}}
   return _mm512_maskz_alignr_epi64(u, a, b, 2);
 }

@@ -6934,73 +6934,80 @@ __m256i test_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X)
 __m128i test_mm_alignr_epi32(__m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   return _mm_alignr_epi32(__A, __B, 1);
 }
 
 __m128i test_mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_alignr_epi32(__W, __U, __A, __B, 1);
 }
 
 __m128i test_mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_alignr_epi32(__U, __A, __B, 1);
 }
 
 __m256i test_mm256_alignr_epi32(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   return _mm256_alignr_epi32(__A, __B, 1);
 }
 
 __m256i test_mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_alignr_epi32(__W, __U, __A, __B, 1);
 }
 
 __m256i test_mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_alignr_epi32
-  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_alignr_epi32(__U, __A, __B, 1);
 }
 
 __m128i test_mm_alignr_epi64(__m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 2>
   return _mm_alignr_epi64(__A, __B, 1);
 }
 
 __m128i test_mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_alignr_epi64(__W, __U, __A, __B, 1);
 }
 
 __m128i test_mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_alignr_epi64(__U, __A, __B, 1);
 }
 
 __m256i test_mm256_alignr_epi64(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   return _mm256_alignr_epi64(__A, __B, 1);
 }
 
 __m256i test_mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_alignr_epi64(__W, __U, __A, __B, 1);
 }
 
 __m256i test_mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_alignr_epi64
-  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_alignr_epi64(__U, __A, __B, 1);
 }