[X86][SSE] Add shuffle combine support for OR(PSHUFB,PSHUFB) style patterns.

If each element is zero in one (or both) of the inputs, we can combine the two shuffles into a single shuffle mask.
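For illustration, a minimal standalone sketch of the per-element merge rule that the new ISD::OR handling applies (the helper name mergeOrMasks and the std::optional return are inventions for this example, not LLVM APIs):

#include <optional>
#include <vector>

// Sentinels mirroring the X86 shuffle combiner's conventions.
constexpr int SM_SentinelUndef = -1; // lane is undefined
constexpr int SM_SentinelZero = -2;  // lane is known zero

// Merge the (equal-length) masks of OR(shuffle0, shuffle1). Succeeds only if
// every lane is zero/undef on at least one side, so the OR reduces to picking
// the surviving lane; NumInputs0 offsets indices into the second shuffle's
// inputs, which are concatenated after the first shuffle's inputs.
std::optional<std::vector<int>> mergeOrMasks(const std::vector<int> &Mask0,
                                             const std::vector<int> &Mask1,
                                             int NumInputs0) {
  int MaskSize = (int)Mask0.size();
  std::vector<int> Merged;
  for (int i = 0; i != MaskSize; ++i) {
    if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
      Merged.push_back(SM_SentinelUndef);
    else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
      Merged.push_back(SM_SentinelZero);
    else if (Mask1[i] == SM_SentinelZero)
      Merged.push_back(Mask0[i]);                         // lane from shuffle0
    else if (Mask0[i] == SM_SentinelZero)
      Merged.push_back(Mask1[i] + MaskSize * NumInputs0); // lane from shuffle1
    else
      return std::nullopt; // both lanes may be non-zero: not a shuffle
  }
  return Merged;
}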

llvm-svn: 339686
Simon Pilgrim 2018-08-14 16:00:05 +00:00
parent c84155ff6c
commit ed55138247
4 changed files with 112 additions and 112 deletions


@@ -4406,6 +4406,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
  case X86ISD::VPERMV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::OR:
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
@@ -6166,6 +6167,12 @@ static bool setTargetShuffleZeroElements(SDValue N,
  return true;
}

// Forward declaration (for getFauxShuffleMask recursive check).
static bool resolveTargetShuffleInputs(SDValue Op,
                                       SmallVectorImpl<SDValue> &Inputs,
                                       SmallVectorImpl<int> &Mask,
                                       const SelectionDAG &DAG);

// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
@@ -6219,6 +6226,43 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::OR: {
    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
    // is a valid shuffle index.
    SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
    SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
      return false;
    SmallVector<int, 64> SrcMask0, SrcMask1;
    SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
    if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
        !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
      return false;
    // TODO - Add support for more than 2 inputs.
    if ((SrcInputs0.size() + SrcInputs1.size()) > 2)
      return false;
    int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    SmallVector<int, 64> Mask0, Mask1;
    scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
    scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
    for (int i = 0; i != MaskSize; ++i) {
      if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
        Mask.push_back(SM_SentinelUndef);
      else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
        Mask.push_back(SM_SentinelZero);
      else if (Mask1[i] == SM_SentinelZero)
        Mask.push_back(Mask0[i]);
      else if (Mask0[i] == SM_SentinelZero)
        Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
      else
        return false;
    }
    for (SDValue &Op : SrcInputs0)
      Ops.push_back(Op);
    for (SDValue &Op : SrcInputs1)
      Ops.push_back(Op);
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a vector,
    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
@@ -35216,6 +35260,15 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return R;

  // Attempt to recursively combine an OR of shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
      return Res;
  }

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();
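Note that before the per-element merge in getFauxShuffleMask, the two source masks are first widened to a common size with scaleShuffleMask. A minimal sketch of that widening step (assuming the scale factor divides evenly; scaleMask is our illustrative name, not LLVM's template):

#include <cassert>
#include <vector>

constexpr int kSentinelUndef = -1; // lane is undefined
constexpr int kSentinelZero = -2;  // lane is known zero

// Widen a shuffle mask by Scale: source index m expands to the Scale
// consecutive fine-grained indices m*Scale .. m*Scale+Scale-1, while
// undef/zero sentinels are simply repeated.
std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  assert(Scale >= 1 && "expected a positive scale factor");
  std::vector<int> Scaled;
  Scaled.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int j = 0; j != Scale; ++j)
      Scaled.push_back(M < 0 ? M : M * Scale + j); // sentinels stay sentinels
  return Scaled;
}

// Example: scaleMask(2, {1, kSentinelZero}) yields {2, 3, -2, -2}.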


@@ -745,57 +745,53 @@ define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm3
; SSE2-NEXT: movq %xmm3, (%rsi)
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm4
; SSE2-NEXT: movq %xmm4, (%rsi)
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4]
; SSE2-NEXT: packuswb %xmm0, %xmm3
; SSE2-NEXT: movq %xmm3, (%rdx)
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
; SSE2-NEXT: packuswb %xmm0, %xmm4
; SSE2-NEXT: movq %xmm4, (%rdx)
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]


@@ -891,18 +891,12 @@ define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
; X32-LABEL: combine_pshufb_pshufb_or_as_blend:
; X32: # %bb.0:
; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; X32-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_as_blend:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; X64-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -913,18 +907,12 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1
define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; X32-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; X32: # %bb.0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; X32-NEXT: vpor %ymm1, %ymm0, %ymm0
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; X64: # %bb.0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
%2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)


@@ -669,42 +669,19 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @combine_pshufb_pshufb_or_as_blend(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[8,9,10,11,12,13,14,15]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
; AVX-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
%3 = or <16 x i8> %1, %2
@@ -712,28 +689,14 @@ define <16 x i8> @combine_pshufb_pshufb_or_as_blend(<16 x i8> %a0, <16 x i8> %a1
}
define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
; SSE-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)