AVX512: VMOVDQU8/16/32/64 (load) intrinsic implementation.

Differential Revision: http://reviews.llvm.org/D16137

llvm-svn: 258657
This commit is contained in:
Igor Breger 2016-01-24 08:04:33 +00:00
parent 5dd7a2cc24
commit 1e5bafbc82
9 changed files with 335 additions and 15 deletions

View File

@ -3028,12 +3028,58 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">, def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty], Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
[IntrReadArgMem]>; [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_d_512 : GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">,
Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], def int_x86_avx512_mask_loadu_b_128 :
[IntrReadArgMem]>; GCCBuiltin<"__builtin_ia32_loaddquqi128_mask">,
def int_x86_avx512_mask_loadu_q_512 : GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">, Intrinsic<[llvm_v16i8_ty],
Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [llvm_ptr_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrReadArgMem]>;
[IntrReadArgMem]>; def int_x86_avx512_mask_loadu_b_256 :
GCCBuiltin<"__builtin_ia32_loaddquqi256_mask">,
Intrinsic<[llvm_v32i8_ty],
[llvm_ptr_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_b_512 :
GCCBuiltin<"__builtin_ia32_loaddquqi512_mask">,
Intrinsic<[llvm_v64i8_ty],
[llvm_ptr_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_w_128 :
GCCBuiltin<"__builtin_ia32_loaddquhi128_mask">,
Intrinsic<[llvm_v8i16_ty],
[llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_w_256 :
GCCBuiltin<"__builtin_ia32_loaddquhi256_mask">,
Intrinsic<[llvm_v16i16_ty],
[llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_w_512 :
GCCBuiltin<"__builtin_ia32_loaddquhi512_mask">,
Intrinsic<[llvm_v32i16_ty],
[llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_d_128 :
GCCBuiltin<"__builtin_ia32_loaddqusi128_mask">,
Intrinsic<[llvm_v4i32_ty],
[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_d_256 :
GCCBuiltin<"__builtin_ia32_loaddqusi256_mask">,
Intrinsic<[llvm_v8i32_ty],
[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_d_512 :
GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">,
Intrinsic<[llvm_v16i32_ty],
[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_q_128 :
GCCBuiltin<"__builtin_ia32_loaddqudi128_mask">,
Intrinsic<[llvm_v2i64_ty],
[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_q_256 :
GCCBuiltin<"__builtin_ia32_loaddqudi256_mask">,
Intrinsic<[llvm_v4i64_ty],
[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_loadu_q_512 :
GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">,
Intrinsic<[llvm_v8i64_ty],
[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_load_d_128 : def int_x86_avx512_mask_load_d_128 :
GCCBuiltin<"__builtin_ia32_movdqa32load128_mask">, GCCBuiltin<"__builtin_ia32_movdqa32load128_mask">,

View File

@ -20582,6 +20582,28 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
} }
} }
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
if (!Res.getNode())
return;
assert((N->getNumValues() <= Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Places new result values base on N result number.
// In some cases (LowerSINT_TO_FP for example) Res has more result values
// than original node, chain should be dropped(last value).
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
/// ReplaceNodeResults - Replace a node with an illegal result type /// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code. /// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N, void X86TargetLowering::ReplaceNodeResults(SDNode *N,

View File

@ -680,6 +680,14 @@ namespace llvm {
/// ///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
virtual void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
/// Replace the results of node with an illegal result /// Replace the results of node with an illegal result
/// type with new values built out of custom code. /// type with new values built out of custom code.
/// ///

View File

@ -2752,14 +2752,6 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
(v16i32 immAllZerosV), GR16:$mask)),
(VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
(bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
(VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
let AddedComplexity = 20 in { let AddedComplexity = 20 in {
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))), (v8i64 VR512:$src))),

View File

@ -156,12 +156,24 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_mask_load_q_128, LOADA, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_load_q_128, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_256, LOADA, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_load_q_256, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_512, LOADA, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_load_q_512, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_b_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_b_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_b_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_d_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_d_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_d_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_q_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_q_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_q_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_w_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_w_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_w_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0), X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,

View File

@ -6603,6 +6603,42 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %
ret <8 x i64> %res4 ret <8 x i64> %res4
} }
declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
%res4 = add <16 x i32> %res2, %res1
ret <16 x i32> %res4
}
declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
%res4 = add <8 x i64> %res2, %res1
ret <8 x i64> %res4
}
declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16) declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) { define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {

View File

@ -3087,6 +3087,66 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16>
ret <32 x i16> %res4 ret <32 x i16> %res4
} }
declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
%res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
%res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32) declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
@ -3244,4 +3304,3 @@ define <64 x i8>@test_int_x86_avx512_mask_movu_b_512(<64 x i8> %x0, <64 x i8> %x
%res2 = add <64 x i8> %res, %res1 %res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2 ret <64 x i8> %res2
} }

View File

@ -4947,6 +4947,78 @@ define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1
ret <8 x i16> %res4 ret <8 x i16> %res4
} }
declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0
; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1}
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z}
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1)
%res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0
; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1}
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z}
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1)
%res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0
; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1}
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z}
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1)
%res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
ret <16 x i8> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edx, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0
; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1}
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z}
; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1)
%res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask)
%res2 = add <32 x i8> %res, %res1
ret <32 x i8> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8) declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {

View File

@ -6672,6 +6672,79 @@ define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %
%res4 = add <4 x i64> %res3, %res2 %res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4 ret <4 x i64> %res4
} }
declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8*, <4 x i32>, i8)
define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0
; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1}
; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr2, <4 x i32> %res, i8 %mask)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask)
%res4 = add <4 x i32> %res2, %res1
ret <4 x i32> %res4
}
declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8*, <8 x i32>, i8)
define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0
; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1}
; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z}
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr2, <8 x i32> %res, i8 %mask)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask)
%res4 = add <8 x i32> %res2, %res1
ret <8 x i32> %res4
}
declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8*, <2 x i64>, i8)
define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0
; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1}
; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr2, <2 x i64> %res, i8 %mask)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask)
%res4 = add <2 x i64> %res2, %res1
ret <2 x i64> %res4
}
declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8*, <4 x i64>, i8)
define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0
; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1}
; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr2, <4 x i64> %res, i8 %mask)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask)
%res4 = add <4 x i64> %res2, %res1
ret <4 x i64> %res4
}
declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8) declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {