AVX2 optimization.

Added generation of the VPSHUFB instruction for <32 x i8> vector shuffles when possible. llvm-svn: 163312
2012-09-06 12:42:01 +00:00 · 2012-09-06 12:42:01 +00:00 · 42777877c2
parent 3ecf916c33
commit 42777877c2
2 changed files with 51 additions and 0 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -6011,6 +6011,40 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
 }
 /// LowerVECTOR_SHUFFLEv32i8 - Try to lower a single-input v32i8 shuffle to one
 /// X86ISD::PSHUFB node driven by a constant BUILD_VECTOR control vector.
 ///
 /// A null SDValue is returned (so the caller can try something else) when the
 /// result type is not v32i8, the subtarget has no AVX2, the second shuffle
 /// operand is not undef, or any result byte would have to be taken from the
 /// opposite 16-byte half of the source.
 static
 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                  SelectionDAG &DAG,
                                  const X86TargetLowering &TLI) {
  if (SVOp->getValueType(0) != MVT::v32i8)
    return SDValue();
  if (!TLI.getSubtarget()->hasAVX2())
    return SDValue();
  // Only the unary form is handled: operand 1 has to be undef.
  if (SVOp->getOperand(1).getOpcode() != ISD::UNDEF)
    return SDValue();

  DebugLoc DL = SVOp->getDebugLoc();
  SDValue Src = SVOp->getOperand(0);
  ArrayRef<int> Mask = SVOp->getMask();

  // One i8 control constant per result byte.
  SmallVector<SDValue,32> Control;
  for (unsigned Dst = 0; Dst != 32; ++Dst) {
    int Sel = Mask[Dst];
    int CtlByte;
    if (Sel >= 0 && Sel < 32) {
      // The selected byte and the destination byte must sit in the same
      // 16-byte half; otherwise this lowering does not apply.
      const bool SelInHigh = Sel >= 16;
      const bool DstInHigh = Dst >= 16;
      if (SelInHigh != DstInHigh)
        return SDValue();
      // Keep only the index within the half.
      CtlByte = Sel & 0xf;
    } else {
      // Undef mask entry, or an index into the (undef) second operand:
      // use a control byte with the top bit set, which PSHUFB turns into a
      // zero result byte.
      CtlByte = 0x80;
    }
    Control.push_back(DAG.getConstant(CtlByte, MVT::i8));
  }

  SDValue ControlVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
                                   &Control[0], 32);
  return DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, Src, ControlVec);
 }
 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
 /// done when every pair / quad of shuffle mask elements point to elements in
@ -6837,6 +6871,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return NewOp;
  }
  if (VT == MVT::v32i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }
  // Handle all 128-bit wide vectors with 4 elements, and match them with
  // several different shuffle types.
  if (NumElems == 4 && VT.is128BitVector())
--- a/llvm/test/CodeGen/X86/avx2-shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx2-shuffle.ll
@ -26,3 +26,14 @@ entry:
  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %shuffle.i
 }
 ; A <32 x i8> shuffle of a single input whose indices never leave their own
 ; 16-byte half (elements 0-15 select from 0-15, elements 16-31 select from
 ; 16-31 or are undef) is expected to become one vpshufb taking its control
 ; mask from memory.
 ; CHECK: vpshufb_test
 ; CHECK: vpshufb {{.*\(%r.*}}, %ymm
 ; CHECK: ret
 define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
                                                                i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
                                                                i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
  ret <32 x i8> %S
 }