[x86] Fix PR22706 where we would incorrectly try to lower a v32i8 dynamic
blend as legal.

We made the same mistake in two different places. Whenever we are custom
lowering a v32i8 blend we need to check whether we are custom lowering
it only for constant conditions that can be shuffled, or whether we
actually have AVX2 and full dynamic blending support on bytes. Both are
fixed, with comments added to make it clear what is going on and a new
test case.

llvm-svn: 230695
This commit is contained in:
Chandler Carruth 2015-02-26 22:15:34 +00:00
parent b6cd5fe918
commit 653773d004
2 changed files with 41 additions and 13 deletions

View File

@ -10126,24 +10126,31 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasSSE41()) if (!Subtarget->hasSSE41())
return SDValue(); return SDValue();
// Some types for vselect were previously set to Expand, not Legal or // Only some types will be legal on some subtargets. If we can emit a legal
// Custom. Return an empty SDValue so we fall-through to Expand, after // VSELECT-matching blend, return Op, but if we need to expand, return
// the Custom lowering phase. // a null value.
MVT VT = Op.getSimpleValueType(); switch (Op.getSimpleValueType().SimpleTy) {
switch (VT.SimpleTy) {
default: default:
break; // Most of the vector types have blends past SSE4.1.
return Op;
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
if (Subtarget->hasAVX2())
return Op;
return SDValue();
case MVT::v8i16: case MVT::v8i16:
case MVT::v16i16: case MVT::v16i16:
// AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget->hasBWI() && Subtarget->hasVLX()) if (Subtarget->hasBWI() && Subtarget->hasVLX())
break; return Op;
// FIXME: We should custom lower this by fixing the condition and using i8
// blends.
return SDValue(); return SDValue();
} }
// We couldn't create a "Blend with immediate" node.
// This node should still be legal, but we'll have to emit a blendv*
// instruction.
return Op;
} }
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
@ -20784,7 +20791,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// lowered. // lowered.
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue(); return SDValue();
if (!Subtarget->hasSSE41() || VT == MVT::v16i16 || VT == MVT::v8i16) // FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getScalarType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
!Subtarget->hasAVX2())
return SDValue(); return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");

View File

@ -79,3 +79,14 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8 store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
ret void ret void
} }
; We shouldn't try to lower this directly using VSELECT because we don't have
; vpblendvb in AVX1, only in AVX2. Instead, it should be expanded.
;
; CHECK-LABEL: PR22706:
; CHECK: vpcmpgtb
; CHECK: vpcmpgtb
define <32 x i8> @PR22706(<32 x i1> %x) {
; Dynamic per-byte select on a 256-bit vector: without AVX2's vpblendvb this
; must be expanded (two vpcmpgtb halves), not lowered as a legal VSELECT.
  %tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %tmp
}