[X86][SSE] combineX86ShufflesRecursively - don't bother merging shuffles with empty roots. NFCI.
This doesn't affect actual codegen, but is a minor refactor toward fixing PR43024 where we need to avoid excess changes (folding zeroables etc.) to the shuffle mask at Depth == 0.
This commit is contained in:
parent
91661573fd
commit
8f29e4407c
|
@ -33037,7 +33037,8 @@ static SDValue combineX86ShufflesRecursively(
|
|||
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
|
||||
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
|
||||
const X86Subtarget &Subtarget) {
|
||||
assert(RootMask.size() > 0 && (RootMask.size() > 1 || RootMask[0] == 0) &&
|
||||
assert(RootMask.size() > 0 &&
|
||||
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
|
||||
"Illegal shuffle root mask");
|
||||
|
||||
// Bound the depth of our recursive combine because this is ultimately
|
||||
|
@ -33072,104 +33073,116 @@ static SDValue combineX86ShufflesRecursively(
|
|||
|
||||
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
|
||||
|
||||
// Add the inputs to the Ops list, avoiding duplicates.
|
||||
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
|
||||
SmallVector<int, 64> Mask;
|
||||
SmallVector<SDValue, 16> Ops;
|
||||
|
||||
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
|
||||
// Attempt to find an existing match.
|
||||
SDValue InputBC = peekThroughBitcasts(Input);
|
||||
for (int i = 0, e = Ops.size(); i < e; ++i)
|
||||
if (InputBC == peekThroughBitcasts(Ops[i]))
|
||||
return i;
|
||||
// Match failed - should we replace an existing Op?
|
||||
if (InsertionPoint >= 0) {
|
||||
Ops[InsertionPoint] = Input;
|
||||
return InsertionPoint;
|
||||
// We don't need to merge masks if the root is empty.
|
||||
bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
|
||||
if (EmptyRoot) {
|
||||
Mask = OpMask;
|
||||
Ops.append(OpInputs.begin(), OpInputs.end());
|
||||
} else {
|
||||
// Add the inputs to the Ops list, avoiding duplicates.
|
||||
Ops.append(SrcOps.begin(), SrcOps.end());
|
||||
|
||||
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
|
||||
// Attempt to find an existing match.
|
||||
SDValue InputBC = peekThroughBitcasts(Input);
|
||||
for (int i = 0, e = Ops.size(); i < e; ++i)
|
||||
if (InputBC == peekThroughBitcasts(Ops[i]))
|
||||
return i;
|
||||
// Match failed - should we replace an existing Op?
|
||||
if (InsertionPoint >= 0) {
|
||||
Ops[InsertionPoint] = Input;
|
||||
return InsertionPoint;
|
||||
}
|
||||
// Add to the end of the Ops list.
|
||||
Ops.push_back(Input);
|
||||
return Ops.size() - 1;
|
||||
};
|
||||
|
||||
SmallVector<int, 2> OpInputIdx;
|
||||
for (SDValue OpInput : OpInputs)
|
||||
OpInputIdx.push_back(
|
||||
AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
|
||||
|
||||
assert(((RootMask.size() > OpMask.size() &&
|
||||
RootMask.size() % OpMask.size() == 0) ||
|
||||
(OpMask.size() > RootMask.size() &&
|
||||
OpMask.size() % RootMask.size() == 0) ||
|
||||
OpMask.size() == RootMask.size()) &&
|
||||
"The smaller number of elements must divide the larger.");
|
||||
|
||||
// This function can be performance-critical, so we rely on the power-of-2
|
||||
// knowledge that we have about the mask sizes to replace div/rem ops with
|
||||
// bit-masks and shifts.
|
||||
assert(isPowerOf2_32(RootMask.size()) &&
|
||||
"Non-power-of-2 shuffle mask sizes");
|
||||
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
|
||||
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
|
||||
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
|
||||
|
||||
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
|
||||
unsigned RootRatio =
|
||||
std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
|
||||
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
|
||||
assert((RootRatio == 1 || OpRatio == 1) &&
|
||||
"Must not have a ratio for both incoming and op masks!");
|
||||
|
||||
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
|
||||
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
|
||||
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
|
||||
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
|
||||
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
|
||||
|
||||
Mask.resize(MaskWidth, SM_SentinelUndef);
|
||||
|
||||
// Merge this shuffle operation's mask into our accumulated mask. Note that
|
||||
// this shuffle's mask will be the first applied to the input, followed by
|
||||
// the root mask to get us all the way to the root value arrangement. The
|
||||
// reason for this order is that we are recursing up the operation chain.
|
||||
for (unsigned i = 0; i < MaskWidth; ++i) {
|
||||
unsigned RootIdx = i >> RootRatioLog2;
|
||||
if (RootMask[RootIdx] < 0) {
|
||||
// This is a zero or undef lane, we're done.
|
||||
Mask[i] = RootMask[RootIdx];
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned RootMaskedIdx =
|
||||
RootRatio == 1
|
||||
? RootMask[RootIdx]
|
||||
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
|
||||
|
||||
// Just insert the scaled root mask value if it references an input other
|
||||
// than the SrcOp we're currently inserting.
|
||||
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
|
||||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
|
||||
Mask[i] = RootMaskedIdx;
|
||||
continue;
|
||||
}
|
||||
|
||||
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
|
||||
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
|
||||
if (OpMask[OpIdx] < 0) {
|
||||
// The incoming lanes are zero or undef, it doesn't matter which ones we
|
||||
// are using.
|
||||
Mask[i] = OpMask[OpIdx];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
|
||||
unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
|
||||
: (OpMask[OpIdx] << OpRatioLog2) +
|
||||
(RootMaskedIdx & (OpRatio - 1));
|
||||
|
||||
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
|
||||
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
|
||||
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
|
||||
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
|
||||
|
||||
Mask[i] = OpMaskedIdx;
|
||||
}
|
||||
// Add to the end of the Ops list.
|
||||
Ops.push_back(Input);
|
||||
return Ops.size() - 1;
|
||||
};
|
||||
|
||||
SmallVector<int, 2> OpInputIdx;
|
||||
for (SDValue OpInput : OpInputs)
|
||||
OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
|
||||
|
||||
assert(((RootMask.size() > OpMask.size() &&
|
||||
RootMask.size() % OpMask.size() == 0) ||
|
||||
(OpMask.size() > RootMask.size() &&
|
||||
OpMask.size() % RootMask.size() == 0) ||
|
||||
OpMask.size() == RootMask.size()) &&
|
||||
"The smaller number of elements must divide the larger.");
|
||||
|
||||
// This function can be performance-critical, so we rely on the power-of-2
|
||||
// knowledge that we have about the mask sizes to replace div/rem ops with
|
||||
// bit-masks and shifts.
|
||||
assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
|
||||
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
|
||||
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
|
||||
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
|
||||
|
||||
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
|
||||
unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
|
||||
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
|
||||
assert((RootRatio == 1 || OpRatio == 1) &&
|
||||
"Must not have a ratio for both incoming and op masks!");
|
||||
|
||||
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
|
||||
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
|
||||
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
|
||||
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
|
||||
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
|
||||
|
||||
SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
|
||||
|
||||
// Merge this shuffle operation's mask into our accumulated mask. Note that
|
||||
// this shuffle's mask will be the first applied to the input, followed by the
|
||||
// root mask to get us all the way to the root value arrangement. The reason
|
||||
// for this order is that we are recursing up the operation chain.
|
||||
for (unsigned i = 0; i < MaskWidth; ++i) {
|
||||
unsigned RootIdx = i >> RootRatioLog2;
|
||||
if (RootMask[RootIdx] < 0) {
|
||||
// This is a zero or undef lane, we're done.
|
||||
Mask[i] = RootMask[RootIdx];
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned RootMaskedIdx =
|
||||
RootRatio == 1
|
||||
? RootMask[RootIdx]
|
||||
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
|
||||
|
||||
// Just insert the scaled root mask value if it references an input other
|
||||
// than the SrcOp we're currently inserting.
|
||||
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
|
||||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
|
||||
Mask[i] = RootMaskedIdx;
|
||||
continue;
|
||||
}
|
||||
|
||||
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
|
||||
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
|
||||
if (OpMask[OpIdx] < 0) {
|
||||
// The incoming lanes are zero or undef, it doesn't matter which ones we
|
||||
// are using.
|
||||
Mask[i] = OpMask[OpIdx];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
|
||||
unsigned OpMaskedIdx =
|
||||
OpRatio == 1
|
||||
? OpMask[OpIdx]
|
||||
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
|
||||
|
||||
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
|
||||
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
|
||||
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
|
||||
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
|
||||
|
||||
Mask[i] = OpMaskedIdx;
|
||||
}
|
||||
|
||||
// Remove unused/repeated shuffle source ops.
|
||||
|
|
Loading…
Reference in New Issue