From cd3c1c4a16e8dc8a5e5981b78f4c6ac46b9667c0 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Wed, 5 Dec 2012 09:24:57 +0000
Subject: [PATCH] Simplified BLEND pattern matching for shuffles. Generate
 VPBLENDD for AVX2 and VPBLENDW for v16i16 type on AVX2.

llvm-svn: 169366
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      | 89 +++++++++-----------
 llvm/lib/Target/X86/X86ISelLowering.h        |  8 +-
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |  4 +-
 llvm/lib/Target/X86/X86InstrSSE.td           | 45 +++++-----
 llvm/test/CodeGen/X86/avx2-shuffle.ll        | 57 +++++++++++--
 llvm/test/CodeGen/X86/vec_shuffle-20.ll      |  2 +-
 6 files changed, 121 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 81e8a7bd88d1..b3ff4ee98e3d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5641,64 +5641,53 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   DebugLoc dl = SVOp->getDebugLoc();
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  EVT VT = SVOp->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
-  if (!Subtarget->hasSSE41())
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
     return SDValue();
 
-  unsigned ISDNo = 0;
-  MVT OpTy;
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems-1)/8 + 1; 
+  unsigned NumElemsInLane = NumElems / NumLanes;
 
-  switch (VT.SimpleTy) {
-  default: return SDValue();
-  case MVT::v8i16:
-    ISDNo = X86ISD::BLENDPW;
-    OpTy = MVT::v8i16;
-    break;
-  case MVT::v4i32:
-  case MVT::v4f32:
-    ISDNo = X86ISD::BLENDPS;
-    OpTy = MVT::v4f32;
-    break;
-  case MVT::v2i64:
-  case MVT::v2f64:
-    ISDNo = X86ISD::BLENDPD;
-    OpTy = MVT::v2f64;
-    break;
-  case MVT::v8i32:
-  case MVT::v8f32:
-    if (!Subtarget->hasFp256())
-      return SDValue();
-    ISDNo = X86ISD::BLENDPS;
-    OpTy = MVT::v8f32;
-    break;
-  case MVT::v4i64:
-  case MVT::v4f64:
-    if (!Subtarget->hasFp256())
-      return SDValue();
-    ISDNo = X86ISD::BLENDPD;
-    OpTy = MVT::v4f64;
-    break;
-  }
-  assert(ISDNo && "Invalid Op Number");
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
 
-  unsigned MaskVals = 0;
-
-  for (unsigned i = 0; i != NumElems; ++i) {
+    int SndLaneEltIdx = (NumLanes == 2) ? 
+      SVOp->getMaskElt(i + NumElemsInLane) : -1;
     int EltIdx = SVOp->getMaskElt(i);
-    if (EltIdx == (int)i || EltIdx < 0)
-      MaskVals |= (1<<i);
-    else if (EltIdx == (int)(i + NumElems))
-      continue; // Bit is set to zero;
-    else
+
+    if ((EltIdx == -1 || EltIdx == (int)i) && 
+        (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) && 
+        (SndLaneEltIdx == -1 || 
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1<<i);
+    else 
       return SDValue();
   }
 
-  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
-  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
-  SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
-                             DAG.getConstant(MaskVals, MVT::i32));
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  EVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = EVT::getVectorVT(*DAG.getContext(), 
+                              EVT::getFloatingPointVT(EltVT.getSizeInBits()), 
+                              NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
+  }
+  
+  SDValue Ret =  DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                             DAG.getConstant(MaskValue, MVT::i32));
   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
 }
 
@@ -11972,9 +11961,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   case X86ISD::BLENDV:             return "X86ISD::BLENDV";
-  case X86ISD::BLENDPW:            return "X86ISD::BLENDPW";
-  case X86ISD::BLENDPS:            return "X86ISD::BLENDPS";
-  case X86ISD::BLENDPD:            return "X86ISD::BLENDPD";
+  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
   case X86ISD::FHADD:              return "X86ISD::FHADD";
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2988cee03715..e830c5fef62a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -176,13 +176,11 @@ namespace llvm {
       /// PSIGN - Copy integer sign.
       PSIGN,
 
-      /// BLENDV - Blend where the selector is an XMM.
+      /// BLENDV - Blend where the selector is a register.
       BLENDV,
 
-      /// BLENDxx - Blend where the selector is an immediate.
-      BLENDPW,
-      BLENDPS,
-      BLENDPD,
+      /// BLENDI - Blend where the selector is an immediate.
+      BLENDI,
 
       /// HADD - Integer horizontal add.
       HADD,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 73ba0011df1b..09ab99516698 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -187,9 +187,7 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 
-def X86Blendpw   : SDNode<"X86ISD::BLENDPW",   SDTBlend>;
-def X86Blendps   : SDNode<"X86ISD::BLENDPS",   SDTBlend>;
-def X86Blendpd   : SDNode<"X86ISD::BLENDPD",   SDTBlend>;
+def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
 def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
 def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
 def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 229e8b263f9d..da06bf525bf7 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6891,31 +6891,31 @@ let Predicates = [HasAVX] in {
                             (v4f64 VR256:$src2))),
             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 
-  def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2),
+  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
                                (imm:$mask))),
-            (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>;
-  def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2),
+            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
                                (imm:$mask))),
-            (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>;
+            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
 
-  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                                (imm:$mask))),
-            (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
-  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                                (imm:$mask))),
-            (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
-  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                                (imm:$mask))),
-            (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
+            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
 }
 
 let Predicates = [HasAVX2] in {
   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                             (v32i8 VR256:$src2))),
-            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2),
+            (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
                                (imm:$mask))),
-            (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>;
+            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
 }
 
 /// SS41I_ternary_int - SSE 4.1 ternary operator
@@ -6979,15 +6979,15 @@ let Predicates = [UseSSE41] in {
                             (v2f64 VR128:$src2))),
             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
 
-  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                                (imm:$mask))),
-            (PBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
-  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                                (imm:$mask))),
-            (BLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
-  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                                (imm:$mask))),
-            (BLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
+            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
 
 }
 
@@ -7873,6 +7873,13 @@ defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
                                     VR256, memopv4i64, i256mem>, VEX_L;
 }
 
+def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+                  imm:$mask)),
+          (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
+                  imm:$mask)),
+          (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
 //===----------------------------------------------------------------------===//
 // VPBROADCAST - Load from memory and broadcast to all elements of the
 //               destination operand
diff --git a/llvm/test/CodeGen/X86/avx2-shuffle.ll b/llvm/test/CodeGen/X86/avx2-shuffle.ll
index a414e6880c32..cf319cb7fe1d 100644
--- a/llvm/test/CodeGen/X86/avx2-shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx2-shuffle.ll
@@ -4,15 +4,62 @@
 ; The mask for the vpblendw instruction needs to be identical for both halves
 ; of the YMM. Need to use two vpblendw instructions.
 
-; CHECK: blendw1
-; CHECK: vpblendw
-; CHECK: vpblendw
+; CHECK: vpblendw_test1
+; mask = 10010110,b = 150,d
+; CHECK: vpblendw  $150, %ymm
 ; CHECK: ret
-define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3,  i32 20, i32 5,  i32 6,  i32 23, 
+                                                               i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
   ret <16 x i16> %t
 }
 
+; CHECK: vpblendw_test2
+; mask1 = 00010110 = 22
+; mask2 = 10000000 = 128
+; CHECK: vpblendw  $128, %xmm
+; CHECK: vpblendw  $22, %xmm
+; CHECK: vinserti128
+; CHECK: ret
+define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, 
+                                                               i32 8, i32 9,  i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i16> %t
+}
+
+; CHECK: blend_test1
+; CHECK: vpblendd
+; CHECK: ret
+define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
+  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+  ret <8 x i32> %t
+}
+
+; CHECK: blend_test2
+; CHECK: vpblendd
+; CHECK: ret
+define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
+  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+  ret <8 x i32> %t
+}
+
+
+; CHECK: blend_test3
+; CHECK: vblendps
+; CHECK: ret
+define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline {
+  %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
+  ret <8 x float> %t
+}
+
+; CHECK: blend_test4
+; CHECK: vblendpd
+; CHECK: ret
+define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
+  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  ret <4 x i64> %t
+}
+
 ; CHECK: vpshufhw $27, %ymm
 define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
 entry:
diff --git a/llvm/test/CodeGen/X86/vec_shuffle-20.ll b/llvm/test/CodeGen/X86/vec_shuffle-20.ll
index 976cd1835b40..b6b8ba6f846a 100644
--- a/llvm/test/CodeGen/X86/vec_shuffle-20.ll
+++ b/llvm/test/CodeGen/X86/vec_shuffle-20.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 3
+; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
 
 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind  {
 entry: