From 47fc44e52e8e7bad1b901bfae4dc78dec048d5f1 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 16 Dec 2013 13:52:35 +0000 Subject: [PATCH] AVX-512: Added legal type MVT::i1 and VK1 register for it. Added scalar compare VCMPSS, VCMPSD. Implemented LowerSELECT for scalar FP operations. I replaced FSETCCss, FSETCCsd with one node type FSETCCs. Node extract_vector_elt(v16i1/v8i1, idx) returns an element of type i1. llvm-svn: 197384 --- .../X86DisassemblerDecoderCommon.h | 1 + llvm/lib/Target/X86/X86ISelLowering.cpp | 123 ++++++++++-------- llvm/lib/Target/X86/X86ISelLowering.h | 6 +- llvm/lib/Target/X86/X86InstrAVX512.td | 93 +++++++++++-- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 15 ++- llvm/lib/Target/X86/X86InstrInfo.cpp | 17 +-- llvm/lib/Target/X86/X86InstrInfo.td | 4 +- llvm/lib/Target/X86/X86InstrSSE.td | 8 +- llvm/lib/Target/X86/X86RegisterInfo.td | 2 + .../test/CodeGen/X86/avx512-insert-extract.ll | 22 ++-- llvm/test/CodeGen/X86/avx512-select.ll | 19 +++ llvm/test/CodeGen/X86/isint.ll | 4 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 7 + 13 files changed, 220 insertions(+), 101 deletions(-) diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index dd1719c64d76..44d61294d463 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -478,6 +478,7 @@ struct ContextDecision { ENUM_ENTRY(TYPE_XMM128, "16-byte") \ ENUM_ENTRY(TYPE_XMM256, "32-byte") \ ENUM_ENTRY(TYPE_XMM512, "64-byte") \ + ENUM_ENTRY(TYPE_VK1, "1-bit") \ ENUM_ENTRY(TYPE_VK8, "8-bit") \ ENUM_ENTRY(TYPE_VK16, "16-bit") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 538a116902e3..e87e7edab3c8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1306,9 
+1306,13 @@ void X86TargetLowering::resetOperationActions() { addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + addRegisterClass(MVT::i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::XOR, MVT::i1, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); @@ -1376,6 +1380,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); @@ -2221,6 +2227,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; + else if (RegVT == MVT::i1) + RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) @@ -7669,6 +7677,39 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Extract one bit from mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) { + SDValue Vec = Op.getOperand(0); + SDLoc dl(Vec); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + MVT EltVT = Op.getSimpleValueType(); + + assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + + // variable index can't be handled in mask registers, + // extend vector to VR512 + if (!isa(Idx)) { + MVT ExtVT = (VecVT == MVT::v8i1 ? 
MVT::v8i64 : MVT::v16i32); + SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtVT.getVectorElementType(), Ext, Idx); + return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + } + + unsigned IdxVal = cast(Idx)->getZExtValue(); + if (IdxVal) { + unsigned MaxShift = VecVT.getSizeInBits() - 1; + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(MaxShift - IdxVal, MVT::i8)); + Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + DAG.getConstant(MaxShift, MVT::i8)); + } + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i1, Vec, + DAG.getIntPtrConstant(0)); +} + SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { @@ -7676,6 +7717,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); + + if (Op.getSimpleValueType() == MVT::i1) + return ExtractBitFromMaskVector(Op, DAG); + if (!isa(Idx)) { if (VecVT.is512BitVector() || (VecVT.is256BitVector() && Subtarget->hasInt256() && @@ -9681,11 +9726,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast(Op1)) + SDLoc dl(Op0); + if (ConstantSDNode *C = dyn_cast(Op1)) { if (C->getAPIntValue() == 0) return EmitTest(Op0, X86CC, DAG); - SDLoc dl(Op0); + if (Op0.getValueType() == MVT::i1) { + Op0 = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, DAG.getConstant(-1, MVT::i1)); + return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op0, Op0); + } + } + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Do the comparison at i32 if it's smaller. 
This avoids subregister @@ -10121,7 +10172,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); + assert((VT == MVT::i8 || (Subtarget->hasAVX512() && VT == MVT::i1)) + && "SetCC type must be 8-bit or 1-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); @@ -10234,8 +10286,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (SSECC != 8) { - unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd; - SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1, + if (Subtarget->hasAVX512()) { + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, + DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); + } + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, MVT::i8)); SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); @@ -13774,8 +13830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; - case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; - case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; + case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -13870,7 +13925,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; - case X86ISD::KTEST: return "X86ISD::KTEST"; 
case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -16420,44 +16474,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } -/// Extract one bit from mask vector, like v16i1 or v8i1. -/// AVX-512 feature. -static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) { - SDValue Vec = N->getOperand(0); - SDLoc dl(Vec); - MVT VecVT = Vec.getSimpleValueType(); - SDValue Idx = N->getOperand(1); - MVT EltVT = N->getSimpleValueType(0); - - assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) || - "Unexpected operands in ExtractBitFromMaskVector"); - - // variable index - if (!isa(Idx)) { - MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); - SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - ExtVT.getVectorElementType(), Ext); - return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); - } - - unsigned IdxVal = cast(Idx)->getZExtValue(); - - MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits()); - unsigned MaxShift = VecVT.getSizeInBits() - 1; - Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec); - Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec, - DAG.getConstant(MaxShift - IdxVal, ScalarVT)); - Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec, - DAG.getConstant(MaxShift, ScalarVT)); - - if (VecVT == MVT::v16i1) { - Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec); - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec); - } - return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec); -} - /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// to a simple store and scalar loads to extract the elements. 
@@ -16469,10 +16485,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, SDValue InputVector = N->getOperand(0); - if (InputVector.getValueType().getVectorElementType() == MVT::i1 && - !DCI.isBeforeLegalize()) - return ExtractBitFromMaskVector(N, DAG); - // Detect whether we are trying to convert from mmx to i32 and the bitcast // from mmx to v2i32 has a single usage. if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && @@ -17616,17 +17628,16 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { bool is64BitFP = (CMP00.getValueType() == MVT::f64); - X86ISD::NodeType NTOperator = is64BitFP ? - X86ISD::FSETCCsd : X86ISD::FSETCCss; // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; - SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, + SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, DAG.getConstant(x86cc, MVT::i8)); - SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, + MVT IntVT = (is64BitFP ? MVT::i64 : MVT::i32); + SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF); - SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, - DAG.getConstant(1, MVT::i32)); + SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, + DAG.getConstant(1, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index bc3dd608da52..0b3495dc1b3b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -94,6 +94,9 @@ namespace llvm { /// operand, usually produced by a CMP instruction. 
SETCC, + /// X86 Select + SELECT, + // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 @@ -101,7 +104,7 @@ namespace llvm { /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. /// Operands are two FP values to compare; result is a mask of /// 0s or 1s. Generally DTRT for C/C++ with NaNs. - FSETCCss, FSETCCsd, + FSETCC, /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, /// result in an integer GPR. Needs masking for scalar result. @@ -314,7 +317,6 @@ namespace llvm { // OR/AND test for masks KORTEST, - KTEST, // Several flavors of instructions with vector shuffle behaviors. PALIGNR, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 4c114e2a40ce..2a4053327113 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -683,6 +683,42 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; } +//===----------------------------------------------------------------------===// +// Compare Instructions +//===----------------------------------------------------------------------===// + +// avx512_cmp_scalar - AVX512 CMPSS and CMPSD +multiclass avx512_cmp_scalar { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs VK1:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + IIC_SSE_ALU_F32S_RR>, EVEX_4V; + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set VK1:$dst, (OpNode (VT RC:$src1), + (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + let neverHasSideEffects = 1 in { + def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, + (outs VK1:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; + def rmi_alt 
: AVX512Ii8<0xC2, MRMSrcMem, + (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + } +} + +let Predicates = [HasAVX512] in { +defm VCMPSSZ : avx512_cmp_scalar, + XS; +defm VCMPSDZ : avx512_cmp_scalar, + XD, VEX_W; +} multiclass avx512_icmp_packed opc, string OpcodeStr, RegisterClass KRC, RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, @@ -734,10 +770,10 @@ multiclass avx512_icmp_cc opc, RegisterClass KRC, // Accept explicit immediate argument form instead of comparison code. let neverHasSideEffects = 1 in { def rri_alt : AVX512AIi8, EVEX_4V; def rmi_alt : AVX512AIi8, EVEX_4V; } } @@ -864,8 +900,14 @@ let Predicates = [HasAVX512] in { def : Pat<(store (v16i1 VK16:$src), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - def : Pat<(store (v8i1 VK8:$src), addr:$dst), - (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>; + def : Pat<(store VK8:$src, addr:$dst), + (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; + + def : Pat<(i1 (load addr:$src)), + (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>; + + def : Pat<(v8i1 (load addr:$src)), + (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; } // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { @@ -878,6 +920,12 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), sub_8bit)>; + + def : Pat<(i1 (extractelt VK16:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK1)>; + def : Pat<(i1 (extractelt VK8:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK1)>; + } // Mask unary operation @@ -945,6 +993,19 @@ let isCommutable = 1 in { defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>; } +def : Pat<(xor VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + +def : Pat<(or VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (KORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + +def : Pat<(not VK1:$src), + (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src, VK16), + (COPY_TO_REGCLASS (VCMPSSZrr (f32 (IMPLICIT_DEF)), + (f32 (IMPLICIT_DEF)), (i8 0)), VK16)), VK1)>; + multiclass avx512_mask_binop_int { let Predicates = [HasAVX512] in def : Pat<(!cast("int_x86_avx512_"##IntName##"_w") @@ -1016,7 +1077,10 @@ multiclass avx512_mask_testop_w opc, string OpcodeStr, SDNode OpNode> { } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; -defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>; + +def : Pat<(X86cmp VK1:$src1, VK1:$src2), + (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16))>; // Mask shift multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, @@ -1034,8 +1098,8 @@ multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, VEX, OpSize, TA, VEX_W; } -defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>; -defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>; +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>; // Mask setting all 0s or 1s multiclass avx512_mask_setop { @@ -1046,7 
+1110,7 @@ multiclass avx512_mask_setop { } multiclass avx512_mask_setop_w { - defm B : avx512_mask_setop; + defm B : avx512_mask_setop; defm W : avx512_mask_setop; } @@ -1341,6 +1405,12 @@ multiclass avx512_move_scalar , EVEX_4V, VEX_LIG; + let Constraints = "$src1 = $dst" in + def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), + !strconcat(asm, + "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), + [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, @@ -1359,6 +1429,13 @@ let ExeDomain = SSEPackedDouble in defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), + (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + +def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), + (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; // For the disassembler let isCodeGenOnly = 1 in { diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 1fed424fd6e0..b56ce1122dfb 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -59,8 +59,8 @@ def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; -def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; -def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; +//def 
X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -130,9 +130,14 @@ def X86IntCmpMask : SDTypeProfile<1, 2, def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>; def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>; -def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def X86CmpMaskCC : + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def X86CmpMaskCCScalar : + SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -155,10 +160,10 @@ def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; -def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; -def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, +def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>>; +def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index ae4982f404cd..90598a5cb548 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3015,6 +3015,11 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } +inline static bool MaskRegClassContains(unsigned Reg) { + return X86::VK8RegClass.contains(Reg) || + 
X86::VK16RegClass.contains(Reg) || + X86::VK1RegClass.contains(Reg); +} static unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { if (X86::VR128XRegClass.contains(DestReg, SrcReg) || @@ -3024,13 +3029,10 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if ((X86::VK8RegClass.contains(DestReg) || - X86::VK16RegClass.contains(DestReg)) && - (X86::VK8RegClass.contains(SrcReg) || - X86::VK16RegClass.contains(SrcReg))) + if (MaskRegClassContains(DestReg) && + MaskRegClassContains(SrcReg)) return X86::KMOVWkk; - if ((X86::VK8RegClass.contains(DestReg) || - X86::VK16RegClass.contains(DestReg)) && + if (MaskRegClassContains(DestReg) && (X86::GR32RegClass.contains(SrcReg) || X86::GR16RegClass.contains(SrcReg) || X86::GR8RegClass.contains(SrcReg))) { @@ -3040,8 +3042,7 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { if ((X86::GR32RegClass.contains(DestReg) || X86::GR16RegClass.contains(DestReg) || X86::GR8RegClass.contains(DestReg)) && - (X86::VK8RegClass.contains(SrcReg) || - X86::VK16RegClass.contains(SrcReg))) { + MaskRegClassContains(SrcReg)) { DestReg = getX86SubSuperRegister(DestReg, MVT::i32); return X86::KMOVWrk; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index cc60acb2b7d3..8d1a670b4709 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -23,8 +23,8 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3, def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; -def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, 
SDTCisVT<3, i8>]>; def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index bbf86cdd0253..d0e4989b0050 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2299,23 +2299,23 @@ multiclass sse12_cmp_scalar, XS, VEX_4V, VEX_LIG; -defm VCMPSD : sse12_cmp_scalar, // same latency as 32 bit compare XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { - defm CMPSS : sse12_cmp_scalar, XS; - defm CMPSD : sse12_cmp_scalar, diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index b8027283cc1f..8d79e13b1db8 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -463,9 +463,11 @@ def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 256, (sequence "YMM%u", 0, 31)>; +def VK1 : RegisterClass<"X86", [i1], 1, (sequence "K%u", 0, 7)>; def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>; def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>; +def VK1WM : RegisterClass<"X86", [i1], 1, (sub VK1, K0)>; def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>; def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>; diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index ef6359b4d9ee..64f2a197008f 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -99,27 +99,21 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind { } ;CHECK-LABEL: test11 -;CHECK: movl $260 -;CHECK: bextrl -;CHECK: movl $268 -;CHECK: bextrl +;CHECK: vpcmpltud +;CHECK: kshiftlw $11 +;CHECK: kshiftrw $15 +;CHECK: kxorw +;CHECK: kortestw +;CHECK: jne +;CHECK: ret +;CHECK: ret define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { %cmp_res 
= icmp ult <16 x i32> %a, %b %ia = extractelement <16 x i1> %cmp_res, i32 4 - %ib = extractelement <16 x i1> %cmp_res, i32 12 - br i1 %ia, label %A, label %B - A: ret <16 x i32>%b B: %c = add <16 x i32>%b, %a - br i1 %ib, label %C, label %D - C: - %c1 = sub <16 x i32>%c, %a - ret <16 x i32>%c1 - D: - %c2 = mul <16 x i32>%c, %a - ret <16 x i32>%c2 + ret <16 x i32>%c } diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index d2d6681fb422..83f46984781f 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -20,3 +20,22 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind { ret <8 x i64> %res } +; CHECK-LABEL: @select02 +; CHECK: cmpless %xmm0, %xmm3, %k1 +; CHECK-NEXT: vmovss %xmm2, {{.*}}%xmm1 {%k1} +; CHECK: ret +define float @select02(float %a, float %b, float %c, float %eps) { + %cmp = fcmp oge float %a, %eps + %cond = select i1 %cmp, float %c, float %b + ret float %cond +} + +; CHECK-LABEL: @select03 +; CHECK: cmplesd %xmm0, %xmm3, %k1 +; CHECK-NEXT: vmovsd %xmm2, {{.*}}%xmm1 {%k1} +; CHECK: ret +define double @select03(double %a, double %b, double %c, double %eps) { + %cmp = fcmp oge double %a, %eps + %cond = select i1 %cmp, double %c, double %b + ret double %cond +} diff --git a/llvm/test/CodeGen/X86/isint.ll b/llvm/test/CodeGen/X86/isint.ll index 4a98e63f38fc..ce3f13578480 100644 --- a/llvm/test/CodeGen/X86/isint.ll +++ b/llvm/test/CodeGen/X86/isint.ll @@ -8,8 +8,8 @@ define i32 @isint_return(double %d) nounwind { %e = sitofp i32 %i to double ; CHECK: cmpeqsd %c = fcmp oeq double %d, %e -; CHECK-NEXT: movd -; CHECK-NEXT: andl +; CHECK-NEXT: movq +; CHECK-NEXT: andq %z = zext i1 %c to i32 ret i32 %z } diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 708e72d36e1b..6b5123a49633 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1269,6 +1269,8 @@ 
OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("VR256", TYPE_XMM256) TYPE("VR256X", TYPE_XMM256) TYPE("VR512", TYPE_XMM512) + TYPE("VK1", TYPE_VK1) + TYPE("VK1WM", TYPE_VK1) TYPE("VK8", TYPE_VK8) TYPE("VK8WM", TYPE_VK8) TYPE("VK16", TYPE_VK16) @@ -1340,6 +1342,7 @@ OperandEncoding RecognizableInstr::rmRegisterEncodingFromString ENCODING("VR256", ENCODING_RM) ENCODING("VR256X", ENCODING_RM) ENCODING("VR512", ENCODING_RM) + ENCODING("VK1", ENCODING_RM) ENCODING("VK8", ENCODING_RM) ENCODING("VK16", ENCODING_RM) errs() << "Unhandled R/M register encoding " << s << "\n"; @@ -1367,8 +1370,10 @@ OperandEncoding RecognizableInstr::roRegisterEncodingFromString ENCODING("FR64X", ENCODING_REG) ENCODING("FR32X", ENCODING_REG) ENCODING("VR512", ENCODING_REG) + ENCODING("VK1", ENCODING_REG) ENCODING("VK8", ENCODING_REG) ENCODING("VK16", ENCODING_REG) + ENCODING("VK1WM", ENCODING_REG) ENCODING("VK8WM", ENCODING_REG) ENCODING("VK16WM", ENCODING_REG) errs() << "Unhandled reg/opcode register encoding " << s << "\n"; @@ -1389,6 +1394,7 @@ OperandEncoding RecognizableInstr::vvvvRegisterEncodingFromString ENCODING("VR128X", ENCODING_VVVV) ENCODING("VR256X", ENCODING_VVVV) ENCODING("VR512", ENCODING_VVVV) + ENCODING("VK1", ENCODING_VVVV) ENCODING("VK8", ENCODING_VVVV) ENCODING("VK16", ENCODING_VVVV) errs() << "Unhandled VEX.vvvv register encoding " << s << "\n"; @@ -1398,6 +1404,7 @@ OperandEncoding RecognizableInstr::vvvvRegisterEncodingFromString OperandEncoding RecognizableInstr::writemaskRegisterEncodingFromString (const std::string &s, bool hasOpSizePrefix) { + ENCODING("VK1WM", ENCODING_WRITEMASK) ENCODING("VK8WM", ENCODING_WRITEMASK) ENCODING("VK16WM", ENCODING_WRITEMASK) errs() << "Unhandled mask register encoding " << s << "\n";