AVX512: Masked move intrinsic implementation.

Implemented intrinsic for the follow instructions (reg move) : VMOVDQU8/16, VMOVDQA32/64, VMOVAPS/PD.

Differential Revision: http://reviews.llvm.org/D16316

llvm-svn: 258398
This commit is contained in:
Igor Breger 2016-01-21 14:18:11 +00:00
parent 21a30a42a9
commit 7a000f5bb2
13 changed files with 430 additions and 35 deletions

View File

@ -1996,7 +1996,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
GCCBuiltin<"__builtin_ia32_loadapd512_mask">, GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
Intrinsic<[llvm_v8f64_ty], Intrinsic<[llvm_v8f64_ty],
[llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>; [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
}
// Conditional move ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_move_ss : def int_x86_avx512_mask_move_ss :
GCCBuiltin<"__builtin_ia32_movss_mask">, GCCBuiltin<"__builtin_ia32_movss_mask">,
Intrinsic<[llvm_v4f32_ty], Intrinsic<[llvm_v4f32_ty],
@ -2007,6 +2010,84 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v2f64_ty], Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>; [IntrNoMem]>;
def int_x86_avx512_mask_mova_pd_128 :
GCCBuiltin<"__builtin_ia32_movapd128_mask">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_pd_256 :
GCCBuiltin<"__builtin_ia32_movapd256_mask">,
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_pd_512 :
GCCBuiltin<"__builtin_ia32_movapd512_mask">,
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_ps_128 :
GCCBuiltin<"__builtin_ia32_movaps128_mask">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_ps_256 :
GCCBuiltin<"__builtin_ia32_movaps256_mask">,
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_ps_512 :
GCCBuiltin<"__builtin_ia32_movaps512_mask">,
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_q_128 :
GCCBuiltin<"__builtin_ia32_movdqa64_128_mask">,
Intrinsic<[llvm_v2i64_ty],
[llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_q_256 :
GCCBuiltin<"__builtin_ia32_movdqa64_256_mask">,
Intrinsic<[llvm_v4i64_ty],
[llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_q_512 :
GCCBuiltin<"__builtin_ia32_movdqa64_512_mask">,
Intrinsic<[llvm_v8i64_ty],
[llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_d_128 :
GCCBuiltin<"__builtin_ia32_movdqa32_128_mask">,
Intrinsic<[llvm_v4i32_ty],
[llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_d_256 :
GCCBuiltin<"__builtin_ia32_movdqa32_256_mask">,
Intrinsic<[llvm_v8i32_ty],
[llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_mova_d_512 :
GCCBuiltin<"__builtin_ia32_movdqa32_512_mask">,
Intrinsic<[llvm_v16i32_ty],
[llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_movu_w_128 :
GCCBuiltin<"__builtin_ia32_movdquhi128_mask">,
Intrinsic<[llvm_v8i16_ty],
[llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_movu_w_256 :
GCCBuiltin<"__builtin_ia32_movdquhi256_mask">,
Intrinsic<[llvm_v16i16_ty],
[llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_movu_w_512 :
GCCBuiltin<"__builtin_ia32_movdquhi512_mask">,
Intrinsic<[llvm_v32i16_ty],
[llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_movu_b_128 :
GCCBuiltin<"__builtin_ia32_movdquqi128_mask">,
Intrinsic<[llvm_v16i8_ty],
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_movu_b_256 :
GCCBuiltin<"__builtin_ia32_movdquqi256_mask">,
Intrinsic<[llvm_v32i8_ty],
[llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_movu_b_512 :
GCCBuiltin<"__builtin_ia32_movdquqi512_mask">,
Intrinsic<[llvm_v64i8_ty],
[llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
} }
// Conditional store ops // Conditional store ops

View File

@ -2563,7 +2563,10 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src), (ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>, "${dst} {${mask}} {z}, $src}"),
[(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
(_.VT _.RC:$src),
_.ImmAllZerosV)))], _.ExeDomain>,
EVEX, EVEX_KZ; EVEX, EVEX_KZ;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable,
@ -2758,22 +2761,14 @@ def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
(VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
let AddedComplexity = 20 in { let AddedComplexity = 20 in {
def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
(bc_v8i64 (v16i32 immAllZerosV)))),
(VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>;
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))), (v8i64 VR512:$src))),
(VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
VK8), VR512:$src)>; VK8), VR512:$src)>;
def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
(v16i32 immAllZerosV))),
(VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>;
def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
(v16i32 VR512:$src))), (v16i32 VR512:$src))),
(VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; (VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
} }
// Move Int Doubleword to Packed Double Int // Move Int Doubleword to Packed Double Int

View File

@ -920,6 +920,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FMIN, X86ISD::FMIN_RND), X86ISD::FMIN, X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FMIN, X86ISD::FMIN_RND), X86ISD::FMIN, X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_mova_d_128, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_d_256, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_d_512, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_pd_128, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_pd_256, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_pd_512, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_ps_128, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_ps_256, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_ps_512, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_q_128, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_q_256, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mova_q_512, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK,
X86ISD::MOVDDUP, 0), X86ISD::MOVDDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK,
@ -942,6 +954,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::MOVSLDUP, 0), X86ISD::MOVSLDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK,
X86ISD::MOVSLDUP, 0), X86ISD::MOVSLDUP, 0),
X86_INTRINSIC_DATA(avx512_mask_movu_b_128, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_movu_b_256, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_movu_b_512, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_movu_w_128, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_movu_w_256, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_movu_w_512, BLEND, ISD::VSELECT, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,

View File

@ -87,7 +87,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} ; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0 ; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0
; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: retl ; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b %c = and <8 x i1>%a, %b
@ -315,7 +315,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: kmovw %eax, %k1 ; KNL_X32-NEXT: kmovw %eax, %k1
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0 ; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0
; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: addl $12, %esp
; KNL_X32-NEXT: retl ; KNL_X32-NEXT: retl

View File

@ -1820,13 +1820,9 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; SKX: ## BB#0: ; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 ; SKX-NEXT: vpsllw $7, %zmm2, %zmm2
; SKX-NEXT: vpmovb2m %zmm2, %k1 ; SKX-NEXT: vpmovb2m %zmm2, %k1
; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: vpxord %zmm3, %zmm3, %zmm3
; SKX-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
; SKX-NEXT: kshiftrq $32, %k1, %k1 ; SKX-NEXT: kshiftrq $32, %k1, %k1
; SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} ; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
; SKX-NEXT: retq ; SKX-NEXT: retq
%ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
ret <64 x i16> %ret ret <64 x i16> %ret

View File

@ -7001,6 +7001,70 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
ret <16 x i32> %res4 ret <16 x i32> %res4
} }
declare <8 x double> @llvm.x86.avx512.mask.mova.pd.512(<8 x double>, <8 x double>, i8)
define <8 x double>@test_int_x86_avx512_mask_mova_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_pd_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mova.pd.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
%res1 = call <8 x double> @llvm.x86.avx512.mask.mova.pd.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
%res2 = fadd <8 x double> %res, %res1
ret <8 x double> %res2
}
declare <16 x float> @llvm.x86.avx512.mask.mova.ps.512(<16 x float>, <16 x float>, i16)
define <16 x float>@test_int_x86_avx512_mask_mova_ps_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.mova.ps.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
%res1 = call <16 x float> @llvm.x86.avx512.mask.mova.ps.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
%res2 = fadd <16 x float> %res, %res1
ret <16 x float> %res2
}
declare <8 x i64> @llvm.x86.avx512.mask.mova.q.512(<8 x i64>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_mova_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.mova.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.mova.q.512(<8 x i64> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = add <8 x i64> %res, %res1
ret <8 x i64> %res2
}
declare <16 x i32> @llvm.x86.avx512.mask.mova.d.512(<16 x i32>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_mova_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.mova.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.mova.d.512(<16 x i32> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}
declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) { define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {

View File

@ -1389,9 +1389,7 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; SKX: ## BB#0: ; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 ; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1 ; SKX-NEXT: vpmovb2m %ymm1, %k1
; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq ; SKX-NEXT: retq
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret ret <32 x i16> %ret

View File

@ -216,7 +216,7 @@ define <16 x i32> @test_vbroadcast() {
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1 ; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1 ; ALL-NEXT: knotw %k1, %k1
; ALL-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq ; ALL-NEXT: retq
entry: entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32> %0 = sext <16 x i1> zeroinitializer to <16 x i32>

View File

@ -210,7 +210,7 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: knotw %k0, %k1 ; KNL-NEXT: knotw %k0, %k1
; KNL-NEXT: vmovdqu32 %zmm1, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: retq ; KNL-NEXT: retq
%sub_r = sub <16 x i32> %a, %b %sub_r = sub <16 x i32> %a, %b
%cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
@ -227,7 +227,7 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 ; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: knotw %k0, %k1 ; KNL-NEXT: knotw %k0, %k1
; KNL-NEXT: vmovdqu64 %zmm1, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: retq ; KNL-NEXT: retq
%sub_r = sub <8 x i64> %a, %b %sub_r = sub <8 x i64> %a, %b
%cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a

View File

@ -3194,3 +3194,54 @@ define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i1
call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
ret void ret void
} }
declare <32 x i16> @llvm.x86.avx512.mask.movu.w.512(<32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_movu_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_movu_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_movu_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.movu.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.movu.w.512(<32 x i16> %x0, <32 x i16> zeroinitializer, i32 %x2)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.mask.movu.b.512(<64 x i8>, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_movu_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_movu_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_movu_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.movu.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.movu.b.512(<64 x i8> %x0, <64 x i8> zeroinitializer, i64 %x2)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}

View File

@ -5163,3 +5163,67 @@ define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i1
call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1) call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
ret void ret void
} }
declare <8 x i16> @llvm.x86.avx512.mask.movu.w.128(<8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_movu_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movu_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.avx512.mask.movu.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.movu.w.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.movu.w.256(<16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_movu_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movu_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx512.mask.movu.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.movu.w.256(<16 x i16> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <16 x i8> @llvm.x86.avx512.mask.movu.b.128(<16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_movu_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movu_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqu8 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.avx512.mask.movu.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.movu.b.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res2 = add <16 x i8> %res, %res1
ret <16 x i8> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.movu.b.256(<32 x i8>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_movu_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movu_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.x86.avx512.mask.movu.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.movu.b.256(<32 x i8> %x0, <32 x i8> zeroinitializer, i32 %x2)
%res2 = add <32 x i8> %res, %res1
ret <32 x i8> %res2
}

View File

@ -7467,6 +7467,134 @@ define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32
ret <8 x i32> %res4 ret <8 x i32> %res4
} }
declare <2 x double> @llvm.x86.avx512.mask.mova.pd.128(<2 x double>, <2 x double>, i8)
define <2 x double>@test_int_x86_avx512_mask_mova_pd_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_pd_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.mova.pd.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.mova.pd.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
%res2 = fadd <2 x double> %res, %res1
ret <2 x double> %res2
}
declare <4 x double> @llvm.x86.avx512.mask.mova.pd.256(<4 x double>, <4 x double>, i8)
define <4 x double>@test_int_x86_avx512_mask_mova_pd_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_pd_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx512.mask.mova.pd.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.mova.pd.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
%res2 = fadd <4 x double> %res, %res1
ret <4 x double> %res2
}
declare <4 x float> @llvm.x86.avx512.mask.mova.ps.128(<4 x float>, <4 x float>, i8)
define <4 x float>@test_int_x86_avx512_mask_mova_ps_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_ps_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.mova.ps.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.mova.ps.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
%res2 = fadd <4 x float> %res, %res1
ret <4 x float> %res2
}
declare <8 x float> @llvm.x86.avx512.mask.mova.ps.256(<8 x float>, <8 x float>, i8)
define <8 x float>@test_int_x86_avx512_mask_mova_ps_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_ps_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.mova.ps.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
%res1 = call <8 x float> @llvm.x86.avx512.mask.mova.ps.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
%res2 = fadd <8 x float> %res, %res1
ret <8 x float> %res2
}
declare <2 x i64> @llvm.x86.avx512.mask.mova.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_mova_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_q_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.avx512.mask.mova.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.mova.q.128(<2 x i64> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = add <2 x i64> %res, %res1
ret <2 x i64> %res2
}
declare <4 x i64> @llvm.x86.avx512.mask.mova.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_mova_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_q_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.mask.mova.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.mova.q.256(<4 x i64> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = add <4 x i64> %res, %res1
ret <4 x i64> %res2
}
declare <4 x i32> @llvm.x86.avx512.mask.mova.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_mova_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.mova.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.mova.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = add <4 x i32> %res, %res1
ret <4 x i32> %res2
}
declare <8 x i32> @llvm.x86.avx512.mask.mova.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_mova_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_mova_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.mask.mova.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.mova.d.256(<8 x i32> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}
declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8) declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8)
define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) { define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {

View File

@ -75,12 +75,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F: # BB#0: ; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq ; AVX512F-NEXT: retq
; ;
@ -106,13 +106,13 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z} ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z} ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1 ; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq ; AVX512F-NEXT: retq
; ;
@ -164,12 +164,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F: # BB#0: ; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1 ; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq ; AVX512F-NEXT: retq
; ;
@ -311,8 +311,8 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: movb $51, %al
; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: kmovw %eax, %k2
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0