[AVX-512] Add 512-bit unmasked intrinsics for pmuldq and pmuludq so we can add them to InstCombine with the 128 and 256 bit versions.

The 128 and 256 bit masked intrinsics are currently unused by clang; the SSE and AVX2 unmasked intrinsics are used instead. The new 512-bit intrinsics will be used the same way. Then all masked versions will be removed and autoupgraded.
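The autoupgrade is a mechanical rewrite at the IR level: a call to the masked intrinsic becomes a call to the unmasked one followed by a select on the mask bits, which is the same pattern the new tests below check for. A sketch (value names are illustrative):

; before: the masked form carries passthru and mask operands
%r = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)

; after: unmasked intrinsic plus an explicit select
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%r = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru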

llvm-svn: 290573
Craig Topper 2016-12-27 03:46:05 +00:00
parent 03130d981c
commit 89b3e0223f
3 changed files with 244 additions and 0 deletions

include/llvm/IR/IntrinsicsX86.td

@@ -5202,9 +5202,13 @@ let TargetPrefix = "x86" in {
  def int_x86_avx512_mask_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512_mask">,
            Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
                       llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx512_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512">,
            Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512_mask">,
            Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
                       llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx512_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512">,
            Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512_mask">,
            Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                       llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
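At the IR level the new declarations mirror the unmasked SSE/AVX2 intrinsics that clang already uses for the narrower vectors. For comparison (the 128/256-bit declarations are the existing intrinsics, shown here only for reference):

declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>)
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>)
declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>)

declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>)
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>)
declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>)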

lib/Target/X86/X86IntrinsicsInfo.h

@@ -1522,6 +1522,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                     X86ISD::VPMADD52L, 0),
  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
                     X86ISD::VPMADD52L, 0),
  X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
  X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
  X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
  X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
  X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
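Each X86_INTRINSIC_DATA entry ties an intrinsic to a lowering strategy and an SDNode: INTR_TYPE_2OP means LowerINTRINSIC_WO_CHAIN rebuilds the call as a plain two-operand node (X86ISD::PMULDQ / X86ISD::PMULUDQ here), which instruction selection then matches to vpmuldq/vpmuludq, including the masked and broadcast forms exercised by the tests below.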

test/CodeGen/X86/avx512-intrinsics.ll

@@ -1513,6 +1513,125 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
ret <8 x double> %res
}
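The CHECK lines in these tests are verified with FileCheck against llc output; a RUN line along these lines drives the file (the exact triple and cpu are an assumption, inferred from the Darwin-style "## BB#0:" comments):

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s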
define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mul_epi32_rr:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rrk:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rrkz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mul_epi32_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmk:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmkz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mul_epi32_rmb:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
%b = bitcast <8 x i64> %b64 to <16 x i32>
%res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmbk:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
%b = bitcast <8 x i64> %b64 to <16 x i32>
%mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmbkz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
%b = bitcast <8 x i64> %b64 to <16 x i32>
%mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>)
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr:
; CHECK: ## BB#0:
@@ -1620,6 +1739,125 @@ define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask
declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mul_epu32_rr:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rrk:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rrkz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mul_epu32_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmk:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmkz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mul_epu32_rmb:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
%b = bitcast <8 x i64> %b64 to <16 x i32>
%res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmbk:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
%b = bitcast <8 x i64> %b64 to <16 x i32>
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
ret <8 x i64> %res
}
define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmbkz:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
%b = bitcast <8 x i64> %b64 to <16 x i32>
%mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>)
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr:
; CHECK: ## BB#0: