From cd3c1c4a16e8dc8a5e5981b78f4c6ac46b9667c0 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Wed, 5 Dec 2012 09:24:57 +0000 Subject: [PATCH] Simplified BLEND pattern matching for shuffles. Generate VPBLENDD for AVX2 and VPBLENDW for v16i16 type on AVX2. llvm-svn: 169366 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 89 +++++++++----------- llvm/lib/Target/X86/X86ISelLowering.h | 8 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 4 +- llvm/lib/Target/X86/X86InstrSSE.td | 45 +++++----- llvm/test/CodeGen/X86/avx2-shuffle.ll | 57 +++++++++++-- llvm/test/CodeGen/X86/vec_shuffle-20.ll | 2 +- 6 files changed, 121 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 81e8a7bd88d1..b3ff4ee98e3d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5641,64 +5641,53 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - MVT VT = SVOp->getValueType(0).getSimpleVT(); + EVT VT = SVOp->getValueType(0); + EVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); - if (!Subtarget->hasSSE41()) + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) return SDValue(); - unsigned ISDNo = 0; - MVT OpTy; + // Check the mask for BLEND and build the value. + unsigned MaskValue = 0; + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems-1)/8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v8i16: - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v8i16; - break; - case MVT::v4i32: - case MVT::v4f32: - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v4f32; - break; - case MVT::v2i64: - case MVT::v2f64: - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v2f64; - break; - case MVT::v8i32: - case MVT::v8f32: - if (!Subtarget->hasFp256()) - return SDValue(); - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v8f32; - break; - case MVT::v4i64: - case MVT::v4f64: - if (!Subtarget->hasFp256()) - return SDValue(); - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v4f64; - break; - } - assert(ISDNo && "Invalid Op Number"); + // Blend for v16i16 should be symetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { - unsigned MaskVals = 0; - - for (unsigned i = 0; i != NumElems; ++i) { + int SndLaneEltIdx = (NumLanes == 2) ? + SVOp->getMaskElt(i + NumElemsInLane) : -1; int EltIdx = SVOp->getMaskElt(i); - if (EltIdx == (int)i || EltIdx < 0) - MaskVals |= (1<hasInt256())) { + BlendVT = EVT::getVectorVT(*DAG.getContext(), + EVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, + DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } @@ -11972,9 +11961,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; - case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; - case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; - case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; + case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 2988cee03715..e830c5fef62a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -176,13 +176,11 @@ namespace llvm { /// PSIGN - Copy integer sign. PSIGN, - /// BLENDV - Blend where the selector is an XMM. + /// BLENDV - Blend where the selector is a register. BLENDV, - /// BLENDxx - Blend where the selector is an immediate. - BLENDPW, - BLENDPS, - BLENDPD, + /// BLENDI - Blend where the selector is an immediate. + BLENDI, /// HADD - Integer horizontal add. HADD, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 73ba0011df1b..09ab99516698 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -187,9 +187,7 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; -def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>; -def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>; -def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>; +def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 229e8b263f9d..da06bf525bf7 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6891,31 +6891,31 @@ let Predicates = [HasAVX] in { (v4f64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2), + def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2), (imm:$mask))), - (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>; - def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2), + (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>; + def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2), (imm:$mask))), - (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>; + (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>; - def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2), + def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), (imm:$mask))), - (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2), + (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), (imm:$mask))), - (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), (imm:$mask))), - (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>; + (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; } let Predicates = [HasAVX2] in { def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), (v32i8 VR256:$src2))), - (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2), + (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>; + def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), (imm:$mask))), - (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>; + (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } /// SS41I_ternary_int - SSE 4.1 ternary operator @@ -6979,15 +6979,15 @@ let Predicates = [UseSSE41] in { (v2f64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; - def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2), + def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), (imm:$mask))), - (PBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2), + (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), (imm:$mask))), - (BLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>; - def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), (imm:$mask))), - (BLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>; + (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; } @@ -7873,6 +7873,13 @@ defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, VR256, memopv4i64, i256mem>, VEX_L; } +def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), + imm:$mask)), + (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>; +def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), + imm:$mask)), + (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>; + //===----------------------------------------------------------------------===// // VPBROADCAST - Load from memory and broadcast to all elements of the // destination operand diff --git a/llvm/test/CodeGen/X86/avx2-shuffle.ll b/llvm/test/CodeGen/X86/avx2-shuffle.ll index a414e6880c32..cf319cb7fe1d 100644 --- a/llvm/test/CodeGen/X86/avx2-shuffle.ll +++ b/llvm/test/CodeGen/X86/avx2-shuffle.ll @@ -4,15 +4,62 @@ ; The mask for the vpblendw instruction needs to be identical for both halves ; of the YMM. Need to use two vpblendw instructions. -; CHECK: blendw1 -; CHECK: vpblendw -; CHECK: vpblendw +; CHECK: vpblendw_test1 +; mask = 10010110,b = 150,d +; CHECK: vpblendw $150, %ymm ; CHECK: ret -define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> +define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { + %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %t } +; CHECK: vpblendw_test2 +; mask1 = 00010110 = 22 +; mask2 = 10000000 = 128 +; CHECK: vpblendw $128, %xmm +; CHECK: vpblendw $22, %xmm +; CHECK: vinserti128 +; CHECK: ret +define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { + %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %t +} + +; CHECK: blend_test1 +; CHECK: vpblendd +; CHECK: ret +define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { + %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + ret <8 x i32> %t +} + +; CHECK: blend_test2 +; CHECK: vpblendd +; CHECK: ret +define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { + %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + ret <8 x i32> %t +} + + +; CHECK: blend_test3 +; CHECK: vblendps +; CHECK: ret +define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline { + %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %t +} + +; CHECK: blend_test4 +; CHECK: vblendpd +; CHECK: ret +define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline { + %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> + ret <4 x i64> %t +} + ; CHECK: vpshufhw $27, %ymm define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp { entry: diff --git a/llvm/test/CodeGen/X86/vec_shuffle-20.ll b/llvm/test/CodeGen/X86/vec_shuffle-20.ll index 976cd1835b40..b6b8ba6f846a 100644 --- a/llvm/test/CodeGen/X86/vec_shuffle-20.ll +++ b/llvm/test/CodeGen/X86/vec_shuffle-20.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 3 +; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind { entry: