diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5355053241a4..0ea98784eb3e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1556,6 +1556,7 @@ void X86TargetLowering::resetOperationActions() {
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
@@ -11615,6 +11616,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
                                           SelectionDAG &DAG) {
   MVT ElementType = VT.getVectorElementType();
 
+  // Fold this packed shift into its first operand if ShiftAmt is 0.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
   // Check for ShiftAmt >= element width
   if (ShiftAmt >= ElementType.getSizeInBits()) {
     if (Opc == X86ISD::VSRAI)
@@ -18484,6 +18489,55 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();
+  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    EVT VT = Op0.getValueType();
+    assert(VT.isVector() && "Expected a vector type!");
+
+    if (isa<BuildVectorSDNode>(Op1))
+      Op1 = Op1.getOperand(0);
+
+    if (!isa<ConstantSDNode>(Op1))
+      return SDValue();
+
+    EVT SVT = VT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+
+    ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+    const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
+    uint64_t ShAmt = C.getZExtValue();
+
+    // Don't try to convert this shift into a ISD::SRA if the shift
+    // count is bigger than or equal to the element size.
+    if (ShAmt >= SVTBits)
+      return SDValue();
+
+    // Trivial case: if the shift count is zero, then fold this
+    // into the first operand.
+    if (ShAmt == 0)
+      return Op0;
+
+    // Replace this packed shift intrinsic with a target independent
+    // shift dag node.
+    SDValue Splat = DAG.getConstant(C, VT);
+    return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+  }
+  }
+}
+
 /// PerformMulCombine - Optimize a single multiply with constant into two
 /// in order to implement it with two cheaper instructions, e.g.
 /// LEA + SHL, LEA + LEA.
@@ -20304,6 +20358,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG);
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll
new file mode 100644
index 000000000000..0560a8d6ae25
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
+
+; Verify that the backend correctly combines AVX2 builtin intrinsics.
+
+
+define <8 x i32> @test_psra_1(<8 x i32> %A) {
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 3, i32 0, i32 0, i32 0>)
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2)
+  ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: vpsrad $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_2(<16 x i16> %A) {
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2)
+  ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: vpsraw $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_3(<16 x i16> %A) {
+  %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
+  ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: vpsraw
+; CHECK: ret
+
+define <8 x i32> @test_psra_4(<8 x i32> %A) {
+  %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
+  ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: vpsrad
+; CHECK: ret
+
+
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)
+
diff --git a/llvm/test/CodeGen/X86/combine-sse2-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse2-intrinsics.ll
new file mode 100644
index 000000000000..fa500e5d8d67
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-sse2-intrinsics.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Verify that the backend correctly combines SSE2 builtin intrinsics.
+
+
+define <4 x i32> @test_psra_1(<4 x i32> %A) {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 3, i32 0, i32 0, i32 0>)
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2)
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: psrad $8, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_psra_2(<8 x i16> %A) {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3)
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2)
+  ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: psraw $8, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_psra_3(<4 x i32> %A) {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: psrad
+; CHECK: ret
+
+
+define <8 x i16> @test_psra_4(<8 x i16> %A) {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
+  ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+
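
A minimal illustration of the intended effect, not part of the patch: because the combine rewrites each of these intrinsics into a target-independent ISD::SRA with a splat constant, the generic DAG combiner can also merge consecutive constant shifts, so two psrai.d calls by 3 and 2 should select to a single psrad by 5. The function name below is hypothetical and the IR is a sketch under the same RUN lines as combine-sse2-intrinsics.ll.

; Sketch only, assuming the combine above is applied; not part of the committed tests.
define <4 x i32> @sketch_psra_merge(<4 x i32> %A) {
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
  %2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 2)
  ret <4 x i32> %2
}
; Expected codegen once both intrinsics become ISD::SRA nodes and fold together:
; CHECK-LABEL: sketch_psra_merge
; CHECK: psrad $5, %xmm0
; CHECK-NEXT: ret

declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)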