diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 15b60ba5bf6e..3ef7b2c7697b 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2131,6 +2131,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
   case X86ISD::GlobalBaseReg:
     return getGlobalBaseReg();
 
+  case X86ISD::SHRUNKBLEND: {
+    // SHRUNKBLEND selects like a regular VSELECT.
+    SDValue VSelect = CurDAG->getNode(
+        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+        Node->getOperand(1), Node->getOperand(2));
+    ReplaceUses(SDValue(Node, 0), VSelect);
+    SelectCode(VSelect.getNode());
+    // We already called ReplaceUses.
+    return nullptr;
+  }
+
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bde948174d71..53d9f9134299 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19025,6 +19025,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
@@ -22618,22 +22619,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       // build_vector of constants. This will be taken care in a later
      // condition.
      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
-      VT != MVT::v8i16)) {
+      VT != MVT::v8i16) &&
+      // Don't optimize vector of constants. Those are handled by
+      // the generic code and all the bits must be properly set for
+      // the generic optimizer.
+      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
     // Don't optimize vector selects that map to mask-registers.
     if (BitWidth == 1)
       return SDValue();
 
-    // Check all uses of that condition operand to check whether it will be
-    // consumed by non-BLEND instructions, which may depend on all bits are set
-    // properly.
-    for (SDNode::use_iterator I = Cond->use_begin(),
-         E = Cond->use_end(); I != E; ++I)
-      if (I->getOpcode() != ISD::VSELECT)
-        // TODO: Add other opcodes eventually lowered into BLEND.
-        return SDValue();
-
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -22641,13 +22637,45 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                           DCI.isBeforeLegalizeOps());
     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
-        (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
-                                  TLO) &&
-         // Don't optimize vector of constants. Those are handled by
-         // the generic code and all the bits must be properly set for
-         // the generic optimizer.
-         !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
-      DCI.CommitTargetLoweringOpt(TLO);
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                 TLO)) {
+      // If we changed the computation somewhere in the DAG, this change
+      // will affect all users of Cond.
+      // Make sure it is fine and update all the nodes so that we do not
+      // use the generic VSELECT anymore. Otherwise, we may perform
+      // wrong optimizations as we messed up with the actual expectation
+      // for the vector boolean values.
+      if (Cond != TLO.Old) {
+        // Check all uses of that condition operand to check whether it will be
+        // consumed by non-BLEND instructions, which may depend on all bits
+        // being set properly.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          if (I->getOpcode() != ISD::VSELECT)
+            // TODO: Add other opcodes eventually lowered into BLEND.
+            return SDValue();
+
+        // Update all the users of the condition, before committing the change,
+        // so that the VSELECT optimizations that expect the correct vector
+        // boolean value will not be triggered.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          DAG.ReplaceAllUsesOfValueWith(
+              SDValue(*I, 0),
+              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+                          Cond, I->getOperand(1), I->getOperand(2)));
+        DCI.CommitTargetLoweringOpt(TLO);
+        return SDValue();
+      }
+      // At this point, only Cond is changed. Change the condition
+      // just for N to keep the opportunity to optimize all other
+      // users their own way.
+      DAG.ReplaceAllUsesOfValueWith(
+          SDValue(N, 0),
+          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+                      TLO.New, N->getOperand(1), N->getOperand(2)));
+      return SDValue();
+    }
   }
 
   // We should generate an X86ISD::BLENDI from a vselect if its argument
@@ -22661,7 +22689,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // Iff we find this pattern and the build_vectors are built from
   // constants, we translate the vselect into a shuffle_vector that we
   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
-  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+  if ((N->getOpcode() == ISD::VSELECT ||
+       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+      !DCI.isBeforeLegalize()) {
     SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
     if (Shuffle.getNode())
       return Shuffle;
@@ -24854,7 +24884,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::EXTRACT_VECTOR_ELT:
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   case ISD::VSELECT:
-  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:
+  case X86ISD::SHRUNKBLEND:
+    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 35e132b944a1..2737703bd6db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -190,6 +190,11 @@ namespace llvm {
       /// BLENDI - Blend where the selector is an immediate.
       BLENDI,
 
+      /// SHRUNKBLEND - Blend where the condition has been shrunk.
+      /// This is used to emphasize that the condition mask is
+      /// no longer valid for generic VSELECT optimizations.
+      SHRUNKBLEND,
+
       /// ADDSUB - Combined add and sub on an FP vector.
       ADDSUB,
 
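
The semantics that make this combine legal can be sketched outside of LLVM: a variable blend such as (V)BLENDVPS/BLENDVPD only reads the sign bit of each lane of its condition mask, so the combine is free to throw the other bits away, while any other user of the same mask (a plain AND, for instance) still expects the canonical all-ones/all-zeros lanes. The standalone C++ model below only illustrates that asymmetry; it is not part of the patch, and the V4 alias and the blendv/and_mask helpers are invented for the sketch.

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of a 4 x i32 condition mask; purely illustrative.
using V4 = std::array<uint32_t, 4>;

// BLENDV-style semantics: only the sign bit of each mask lane is read.
static V4 blendv(const V4 &mask, const V4 &a, const V4 &b) {
  V4 r{};
  for (int i = 0; i < 4; ++i)
    r[i] = (mask[i] & 0x80000000u) ? b[i] : a[i];
  return r;
}

// A bitwise AND user of the same mask reads every bit.
static V4 and_mask(const V4 &mask, const V4 &v) {
  V4 r{};
  for (int i = 0; i < 4; ++i)
    r[i] = mask[i] & v[i];
  return r;
}

int main() {
  V4 fullMask   = {0xffffffffu, 0, 0xffffffffu, 0}; // canonical vector boolean
  V4 shrunkMask = {0x80000000u, 0, 0x80000000u, 0}; // only the sign bits kept
  V4 a = {1, 2, 3, 4}, b = {5, 6, 7, 8};

  // A blend cannot tell the two masks apart, so shrinking is safe for it...
  assert(blendv(fullMask, a, b) == blendv(shrunkMask, a, b));
  // ...but an AND of the mask can, so it must keep seeing the full mask.
  assert(and_mask(fullMask, b) != and_mask(shrunkMask, b));
  return 0;
}

This is why the combine above only demands APInt::getHighBitsSet(BitWidth, 1) of the condition, and why, once the shrink happens, the affected selects are retagged as X86ISD::SHRUNKBLEND so that later VSELECT combines stop treating the condition as a full boolean vector.
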
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index 7022b71336fd..0a3ed7e4b32e 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -315,9 +315,9 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
 ; SSE41-LABEL: vsel_double8:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm5[0,1]
 ; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm3 = xmm7[0,1]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_double8:
@@ -353,10 +353,10 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
 ;
 ; SSE41-LABEL: vsel_i648:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm5[0,1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm3 = xmm7[0,1]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vsel_i648:
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 7926b0c71f12..0c0f4bbf992a 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -25,3 +25,61 @@ body:
   store <4 x i16> %predphi42, <4 x i16>* %b, align 8
   ret void
 }
+
+; Improve code coverage.
+;
+; When shrinking the condition used in the select to match a blend, this
+; test case exercises the path where the modified node is not the root
+; of the condition.
+;
+; CHECK-LABEL: test2:
+; CHECK: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
+; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
+; CHECK: vblendvpd [[MASK]]
+; CHECK: retq
+define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+bb:
+  %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
+  %tmp1888 = load double** %arrayidx1928, align 8
+  %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> , <4 x double> 
+  %tmp1900 = bitcast double* %tmp1888 to <4 x double>*
+  store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
+  ret void
+}
+
+; For this test, we used to optimize the conditional mask for the blend, i.e.,
+; we shrunk some of its bits.
+; However, this same mask was used in another select (%predphi31) that turned out
+; to be optimized into an and. In that case, the conditional mask was wrong.
+;
+; Make sure that the and is fed by the original mask.
+;
+;
+
+; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
+; cannot express that SHRUNK_MASK must not be equal to ORIG_MASK. Otherwise,
+; even a faulty pattern would pass!
+;
+; CHECK-LABEL: test3:
+; Compute the original mask.
+; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
+; Shrink the bits of the mask.
+; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
+; Use the shrunk mask in the blend.
+; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the original mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}}
+; CHECK: retq
+define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
+  %tmp6 = srem <4 x i32> %induction30, 
+  %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
+  %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
+  %predphi31 = select <4 x i1> %tmp7, <4 x i16> , <4 x i16> zeroinitializer
+
+  store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
+  store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
+  ret void
+}
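
Taken together, test2 and test3 cover the two branches added to PerformSELECTCombine: in test2 the shrink rewrites a node underneath the shared condition, so every VSELECT user has to be retagged as SHRUNKBLEND (or the combine must back off); in test3 only the condition itself is replaced, so the new shrunk mask is wired into the blend alone and the and keeps the original mask. The toy C++ sketch below mirrors those two situations; the Mask and SelectUser types are invented for the illustration and are not LLVM's SDNode API.

#include <cassert>
#include <memory>

// Toy stand-ins for DAG nodes; these are not LLVM types.
struct Mask {
  bool full = true; // true: canonical all-ones/all-zeros lanes; false: shrunk
};

struct SelectUser {
  std::shared_ptr<Mask> cond;
  bool isBlend;                     // a blend only reads the sign bits
  bool seesValidMask() const { return isBlend || cond->full; }
};

int main() {
  // test2-like path (Cond != TLO.Old): the shrink rewrites a node below the
  // shared condition, an in-place change that every user observes.
  auto shared = std::make_shared<Mask>();
  SelectUser blendA{shared, true}, blendB{shared, true};
  shared->full = false;
  // Legal only because every user is a blend; the combine checks this and
  // retags them as SHRUNKBLEND so later combines stop assuming a full mask.
  assert(blendA.seesValidMask() && blendB.seesValidMask());

  // test3-like path (only Cond changes): a new shrunk mask (TLO.New) is wired
  // into the blend alone, while the and-like user keeps the original mask.
  auto orig = std::make_shared<Mask>();
  auto shrunk = std::make_shared<Mask>();
  shrunk->full = false;
  SelectUser blend{shrunk, true};   // vblendvps consumes SHRUNK_MASK
  SelectUser andUser{orig, false};  // vpand keeps ORIG_MASK
  assert(blend.seesValidMask() && andUser.seesValidMask());
  return 0;
}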