[X86][SSE] Avoid duplicate shuffle input sources in combineX86ShufflesRecursively

rL339686 added the case where a faux shuffle might have repeated shuffle inputs coming from either side of the OR().

This patch improves the insertion of the inputs into the source ops lists to account for this, as well as making it trivial to add support for shuffles with more than 2 inputs in the future.

llvm-svn: 339696
This commit is contained in:
Simon Pilgrim 2018-08-14 17:22:37 +00:00
parent 2d437f6b02
commit 2ce3d6e135
3 changed files with 36 additions and 36 deletions

View File

@ -30279,23 +30279,26 @@ static SDValue combineX86ShufflesRecursively(
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
int InputIdx0 = -1, InputIdx1 = -1;
for (int i = 0, e = Ops.size(); i < e; ++i) {
SDValue BC = peekThroughBitcasts(Ops[i]);
if (Input0 && BC == peekThroughBitcasts(Input0))
InputIdx0 = i;
if (Input1 && BC == peekThroughBitcasts(Input1))
InputIdx1 = i;
}
auto AddOp = [&Ops](SDValue Input, int InsertionPoint = -1) -> int {
if (!Input)
return -1;
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
if (InputBC == peekThroughBitcasts(Ops[i]))
return i;
// Match failed - should we replace an existing Op?
if (InsertionPoint >= 0) {
Ops[InsertionPoint] = Input;
return InsertionPoint;
}
// Add to the end of the Ops list.
Ops.push_back(Input);
return Ops.size() - 1;
};
if (Input0 && InputIdx0 < 0) {
InputIdx0 = SrcOpIndex;
Ops[SrcOpIndex] = Input0;
}
if (Input1 && InputIdx1 < 0) {
InputIdx1 = Ops.size();
Ops.push_back(Input1);
}
int InputIdx0 = AddOp(Input0, SrcOpIndex);
int InputIdx1 = AddOp(Input1);
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||

View File

@ -923,18 +923,12 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8>
define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_pshufb_or_pshufb:
; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
; X32-NEXT: vpor %ymm0, %ymm1, %ymm0
; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_pshufb:
; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)

View File

@ -707,20 +707,23 @@ define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8>
define <16 x i8> @combine_pshufb_pshufb_or_pshufb(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pshufb_or_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastss %xmm0, %xmm0
; AVX512F-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
%3 = or <16 x i8> %1, %2