From e382c3fdcdced3144a07ee814186aeade613e553 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Tue, 10 Dec 2013 13:53:10 +0000 Subject: [PATCH] AVX-512: changed intrinsics for mask operations llvm-svn: 196918 --- llvm/include/llvm/IR/IntrinsicsX86.td | 39 +++++++++------------ llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++-- llvm/lib/Target/X86/X86InstrAVX512.td | 38 +++++++++++++------- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 40 ++++++++++++++++++---- llvm/test/CodeGen/X86/avx512-mask-op.ll | 13 ------- 5 files changed, 78 insertions(+), 58 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 2b66e3d9f4a4..b28e0e1f5c7b 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2643,37 +2643,30 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Mask instructions // 16-bit mask - def int_x86_kadd_v16i1 : GCCBuiltin<"__builtin_ia32_kaddw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], + def int_x86_avx512_kand_w : GCCBuiltin<"__builtin_ia32_kandhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_kand_v16i1 : GCCBuiltin<"__builtin_ia32_kandw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], + def int_x86_avx512_kandn_w : GCCBuiltin<"__builtin_ia32_kandnhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_kandn_v16i1 : GCCBuiltin<"__builtin_ia32_kandnw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], + def int_x86_avx512_knot_w : GCCBuiltin<"__builtin_ia32_knothi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_kor_w : GCCBuiltin<"__builtin_ia32_korhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_knot_v16i1 : GCCBuiltin<"__builtin_ia32_knotw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty], [IntrNoMem]>; - def int_x86_kor_v16i1 : GCCBuiltin<"__builtin_ia32_korw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], + def int_x86_avx512_kxor_w : GCCBuiltin<"__builtin_ia32_kxorhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_kxor_v16i1 : GCCBuiltin<"__builtin_ia32_kxorw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], + def int_x86_avx512_kxnor_w : GCCBuiltin<"__builtin_ia32_kxnorhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_kxnor_v16i1 : GCCBuiltin<"__builtin_ia32_kxnorw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], + def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_mask2int_v16i1 : GCCBuiltin<"__builtin_ia32_mask2intw">, - Intrinsic<[llvm_i32_ty], [llvm_v16i1_ty], [IntrNoMem]>; - def int_x86_int2mask_v16i1 : GCCBuiltin<"__builtin_ia32_int2maskw">, - Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_x86_kunpck_v16i1 : GCCBuiltin<"__builtin_ia32_kunpckbw">, - Intrinsic<[llvm_v16i1_ty], [llvm_v8i1_ty, llvm_v8i1_ty], - [IntrNoMem]>; - def int_x86_avx512_kortestz : GCCBuiltin<"__builtin_ia32_kortestz">, + def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_kortestc : GCCBuiltin<"__builtin_ia32_kortestc">, + def int_x86_avx512_kortestc_w : GCCBuiltin<"__builtin_ia32_kortestchi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e9301f22dd6d..23c9640399e5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11490,9 +11490,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - case Intrinsic::x86_avx512_kortestz: - case Intrinsic::x86_avx512_kortestc: { - unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B; + case Intrinsic::x86_avx512_kortestz_w: + case Intrinsic::x86_avx512_kortestc_w: { + unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B; SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, MVT::i8); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9bf3b5b91b0b..62e3630dc820 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -898,6 +898,15 @@ multiclass avx512_mask_unop_w opc, string OpcodeStr, defm KNOT : avx512_mask_unop_w<0x44, "knot", not>; +multiclass avx512_mask_unop_int { + let Predicates = [HasAVX512] in + def : Pat<(!cast("int_x86_avx512_"##IntName##"_w") + (i16 GR16:$src)), + (COPY_TO_REGCLASS (!cast(InstName##"Wrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>; +} +defm : avx512_mask_unop_int<"knot", "KNOT">; + def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; @@ -908,7 +917,7 @@ def : Pat<(not VK8:$src), (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; // Mask binary operation -// - KADD, KAND, KANDN, KOR, KXNOR, KXOR +// - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode> { let Predicates = [HasAVX512] in @@ -928,7 +937,6 @@ def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; let isCommutable = 1 in { - defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>; defm KAND : avx512_mask_binop_w<0x41, "kand", and>; let isCommutable = 0 in defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>; @@ -939,17 +947,19 @@ let isCommutable = 1 in { multiclass avx512_mask_binop_int { let Predicates = [HasAVX512] in - def : Pat<(!cast("int_x86_"##IntName##"_v16i1") - VK16:$src1, VK16:$src2), - (!cast(InstName##"Wrr") VK16:$src1, VK16:$src2)>; + def : Pat<(!cast("int_x86_avx512_"##IntName##"_w") + (i16 GR16:$src1), (i16 GR16:$src2)), + (COPY_TO_REGCLASS (!cast(InstName##"Wrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), + (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; } -defm : avx512_mask_binop_int<"kadd", "KADD">; defm : avx512_mask_binop_int<"kand", "KAND">; defm : avx512_mask_binop_int<"kandn", "KANDN">; defm : avx512_mask_binop_int<"kor", "KOR">; defm : avx512_mask_binop_int<"kxnor", "KXNOR">; defm : avx512_mask_binop_int<"kxor", "KXOR">; + // With AVX-512, 8-bit mask is promoted to 16-bit mask. multiclass avx512_binop_pat { let Predicates = [HasAVX512] in @@ -967,15 +977,15 @@ defm : avx512_binop_pat; // Mask unpacking multiclass avx512_mask_unpck opc, string OpcodeStr, - RegisterClass KRC1, RegisterClass KRC2> { + RegisterClass KRC> { let Predicates = [HasAVX512] in - def rr : I; } multiclass avx512_mask_unpck_bw opc, string OpcodeStr> { - defm BW : avx512_mask_unpck, + defm BW : avx512_mask_unpck, VEX_4V, VEX_L, OpSize, TB; } @@ -983,12 +993,14 @@ defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; multiclass avx512_mask_unpck_int { let Predicates = [HasAVX512] in - def : Pat<(!cast("int_x86_"##IntName##"_v16i1") - VK8:$src1, VK8:$src2), - (!cast(InstName##"BWrr") VK8:$src1, VK8:$src2)>; + def : Pat<(!cast("int_x86_avx512_"##IntName##"_bw") + (i16 GR16:$src1), (i16 GR16:$src2)), + (COPY_TO_REGCLASS (!cast(InstName##"BWrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), + (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; } +defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; // Mask bit testing multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index b2e639024fd9..cd67cd823e24 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1,23 +1,51 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone -; CHECK: test_kortestz +declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone +; CHECK-LABEL: test_kortestz ; CHECK: kortestw ; CHECK: sete define i32 @test_kortestz(i16 %a0, i16 %a1) { - %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1) + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1) ret i32 %res } -declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone -; CHECK: test_kortestc +declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone +; CHECK-LABEL: test_kortestc ; CHECK: kortestw ; CHECK: sbbl define i32 @test_kortestc(i16 %a0, i16 %a1) { - %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1) + %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1) ret i32 %res } +declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone +; CHECK-LABEL: test_kand +; CHECK: kandw +; CHECK: kandw +define i16 @test_kand(i16 %a0, i16 %a1) { + %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone +; CHECK-LABEL: test_knot +; CHECK: knotw +define i16 @test_knot(i16 %a0) { + %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +; CHECK-LABEL: unpckbw_test +; CHECK: kunpckbw +; CHECK:ret +define i16 @unpckbw_test(i16 %a0, i16 %a1) { + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + define <16 x float> @test_rcp_ps_512(<16 x float> %a0) { ; CHECK: vrcp14ps %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index ef5cb56d7284..923e72ec5f7e 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -33,19 +33,6 @@ define i16 @mand16(i16 %x, i16 %y) { ret i16 %ret } -; CHECK: unpckbw_test -; CHECK: kunpckbw -; CHECK:ret -declare <16 x i1> @llvm.x86.kunpck.v16i1(<8 x i1>, <8 x i1>) nounwind readnone - -define i16 @unpckbw_test(i8 %x, i8 %y) { - %m0 = bitcast i8 %x to <8 x i1> - %m1 = bitcast i8 %y to <8 x i1> - %k = tail call <16 x i1> @llvm.x86.kunpck.v16i1(<8 x i1> %m0, <8 x i1> %m1) - %r = bitcast <16 x i1> %k to i16 - ret i16 %r -} - ; CHECK: shuf_test1 ; CHECK: kshiftrw $8 ; CHECK:ret