This commit contains a few changes that had to go in together.

1. Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   (and also scalar_to_vector).

2. Xor/and/or are indifferent to the swizzle operation (a shuffle of one
   source). Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A, B))

3. Optimize swizzles of shuffles: shuff(shuff(x, y), undef) -> shuff(x, y).

4. Fix an X86ISelLowering optimization that was very bitcast-sensitive.

Code that was previously compiled to this:

movd    (%rsi), %xmm0
movdqa  .LCPI0_0(%rip), %xmm2
pshufb  %xmm2, %xmm0
movd    (%rdi), %xmm1
pshufb  %xmm2, %xmm1
pxor    %xmm0, %xmm1
pshufb  .LCPI0_1(%rip), %xmm1
movd    %xmm1, (%rdi)
ret

Now compiles to this:

movl    (%rsi), %eax
xorl    %eax, (%rdi)
ret
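
The IR for this example is essentially the pull_bitcast test added by
this commit (see the last file below): a xor of two <4 x i8> values that
are only vectors because of their load/store types.

define void @pull_bitcast(<4 x i8>* %pA, <4 x i8>* %pB) {
  %A = load <4 x i8>* %pA
  %B = load <4 x i8>* %pB
  %C = xor <4 x i8> %A, %B
  store <4 x i8> %C, <4 x i8>* %pA
  ret void
}

Previously the illegal <4 x i8> operands were shuffled into legal types
before the xor (the pshufb/pxor sequence above); with change (1) the xor
is performed directly on the underlying i32 load.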

llvm-svn: 153848
Nadav Rotem, 2012-04-01 19:31:22 +00:00
parent ac19edd2b0
commit b078350872
8 changed files with 127 additions and 22 deletions


@@ -2336,6 +2336,68 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
                        ORNode, N0.getOperand(1));
   }
 
+  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
+  // Only perform this optimization after type legalization and before
+  // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
+  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
+  // we don't want to undo this promotion.
+  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
+  // on scalars.
+  if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      && Level == AfterLegalizeVectorOps) {
+    SDValue In0 = N0.getOperand(0);
+    SDValue In1 = N1.getOperand(0);
+    EVT In0Ty = In0.getValueType();
+    EVT In1Ty = In1.getValueType();
+    // If both incoming values are integers, and the original types are the
+    // same.
+    if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
+      SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), In0Ty, In0, In1);
+      SDValue BC = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
+      return BC;
+    }
+  }
+
+  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
+  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
+  // If both shuffles use the same mask, and both shuffle within a single
+  // vector, then it is worthwhile to move the swizzle after the operation.
+  // The type-legalizer generates this pattern when loading illegal
+  // vector types from memory. In many cases this allows additional shuffle
+  // optimizations.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
+    SDValue In0 = SVN0->getOperand(0);
+    SDValue In1 = SVN1->getOperand(0);
+    EVT In0Ty = In0.getValueType();
+    EVT In1Ty = In1.getValueType();
+
+    unsigned NumElts = VT.getVectorNumElements();
+    // Check that both shuffles are swizzles.
+    bool SingleVecShuff = (N0.getOperand(1).getOpcode() == ISD::UNDEF &&
+                           N1.getOperand(1).getOpcode() == ISD::UNDEF);
+
+    // Check that both shuffles use the same mask. The masks are known to be of
+    // the same length because the result vector type is the same.
+    bool SameMask = true;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx0 = SVN0->getMaskElt(i);
+      int Idx1 = SVN1->getMaskElt(i);
+      if (Idx0 != Idx1) {
+        SameMask = false;
+        break;
+      }
+    }
+
+    if (SameMask && SingleVecShuff && In0Ty == In1Ty) {
+      SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT, In0, In1);
+      SDValue Shuff = DAG.getVectorShuffle(VT, N->getDebugLoc(), Op,
+                                           DAG.getUNDEF(VT),
+                                           &SVN0->getMask()[0]);
+      AddToWorkList(Op.getNode());
+      return Shuff;
+    }
+  }
+
   return SDValue();
 }
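
In IR terms, the swizzle-moving combine above amounts to the following
equivalence (a hand-written sketch, not part of the commit):

  %sa = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %sb = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %r  = xor <4 x i32> %sa, %sb

is rewritten to operate first and swizzle once:

  %x = xor <4 x i32> %a, %b
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

This is safe because xor/and/or are lane-wise and both shuffles apply the
same single-source mask.
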
@@ -7721,6 +7783,36 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       return N0;
     }
   }
 
+  // If this shuffle node is simply a swizzle of another shuffle node,
+  // optimize shuffle(shuffle(x, y), undef) -> shuffle(x, y).
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+      N1.getOpcode() == ISD::UNDEF) {
+
+    SmallVector<int, 8> NewMask;
+    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
+
+    EVT InVT = N0.getValueType();
+    int InNumElts = InVT.getVectorNumElements();
+
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      // If we access the second (undef) operand then this index can be
+      // canonicalized to undef as well.
+      if (Idx >= InNumElts)
+        Idx = -1;
+
+      // Next, this index comes from the first value, which is the incoming
+      // shuffle. Adopt the incoming index.
+      if (Idx >= 0)
+        Idx = OtherSV->getMaskElt(Idx);
+
+      NewMask.push_back(Idx);
+    }
+
+    return DAG.getVectorShuffle(VT, N->getDebugLoc(), OtherSV->getOperand(0),
+                                OtherSV->getOperand(1), &NewMask[0]);
+  }
+
   return SDValue();
 }
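
On concrete masks, the composition performed by the loop above looks like
this (a hand-written sketch, not part of the commit):

  %s1 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %s2 = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>

folds to a single shuffle of the original operands:

  %s2 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 5, i32 0, i32 4>

Outer-mask lanes that would read from the second (undef) operand become
-1 (undef) in the new mask, as handled in the loop.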


@@ -14000,13 +14000,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
-  if (Mask.getOpcode() != ISD::BITCAST ||
-      X.getOpcode() != ISD::BITCAST ||
-      Y.getOpcode() != ISD::BITCAST)
-    return SDValue();
-
   // Look through mask bitcast.
-  Mask = Mask.getOperand(0);
+  if (Mask.getOpcode() == ISD::BITCAST)
+    Mask = Mask.getOperand(0);
+  if (X.getOpcode() == ISD::BITCAST)
+    X = X.getOperand(0);
+  if (Y.getOpcode() == ISD::BITCAST)
+    Y = Y.getOperand(0);
 
   EVT MaskVT = Mask.getValueType();
 
   // Validate that the Mask operand is a vector sra node.
@@ -14027,8 +14028,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   // Now we know we at least have a pblendvb with the mask val. See if
   // we can form a psignb/w/d.
   // psign = x.type == y.type == mask.type && y = sub(0, x);
-  X = X.getOperand(0);
-  Y = Y.getOperand(0);
   if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
       ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
       X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
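
For context, PerformOrCombine matches a vector-select shaped dag,
or(and(mask, x), and(~mask, y)), and forms a psign when y is sub(0, x)
and the mask is a vector sra that splats the sign bit. In IR the shape is
roughly the following (a hand-written sketch of the pattern only; the
exact operand checks are in the code above):

  %m    = ashr <4 x i32> %a, <i32 31, i32 31, i32 31, i32 31>  ; splat sign bits
  %notm = xor <4 x i32> %m, <i32 -1, i32 -1, i32 -1, i32 -1>
  %negx = sub <4 x i32> zeroinitializer, %x
  %t0   = and <4 x i32> %m, %negx
  %t1   = and <4 x i32> %notm, %x
  %r    = or <4 x i32> %t0, %t1

Previously the matcher insisted that Mask, X, and Y each be wrapped in a
bitcast; since the new DAG combines can strip those bitcasts, the
look-through is now optional so psign/pblendvb formation still fires.
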


@@ -273,7 +273,7 @@ define arm_aapcs_vfpcc i32 @t10() nounwind {
 entry:
 ; CHECK: t10:
 ; CHECK: vmov.i32 q[[Q0:[0-9]+]], #0x3f000000
-; CHECK: vmul.f32 q8, q8, d0[0]
+; CHECK: vmul.f32 q8, q8, d[[DREG:[0-1]+]]
 ; CHECK: vadd.f32 q8, q8, q8
 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
 %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]


@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=cellspu -o %t1.s
-; RUN: grep rot %t1.s | count 86
+; RUN: grep rot %t1.s | count 85
 ; RUN: grep roth %t1.s | count 8
 ; RUN: grep roti.*5 %t1.s | count 1
 ; RUN: grep roti.*27 %t1.s | count 1


@@ -3,14 +3,14 @@
 target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK: ltstore
-;CHECK: pshufd
-;CHECK: pshufd
-;CHECK: ret
-define void @ltstore() {
+;CHECK: movq
+;CHECK-NEXT: movq
+;CHECK-NEXT: ret
+define void @ltstore(<4 x i32>* %pIn, <2 x i32>* %pOut) {
 entry:
-  %in = load <4 x i32>* undef
+  %in = load <4 x i32>* %pIn
   %j = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  store <2 x i32> %j, <2 x i32>* undef
+  store <2 x i32> %j, <2 x i32>* %pOut
   ret void
 }


@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; Check that we perform a scalar XOR on i32.
+; CHECK: pull_bitcast
+; CHECK: xorl
+; CHECK: ret
+
+define void @pull_bitcast(<4 x i8>* %pA, <4 x i8>* %pB) {
+  %A = load <4 x i8>* %pA
+  %B = load <4 x i8>* %pB
+  %C = xor <4 x i8> %A, %B
+  store <4 x i8> %C, <4 x i8>* %pA
+  ret void
+}


@@ -27,11 +27,11 @@ entry:
 define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
 entry:
 ; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
+; CHECK: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: ret
 %0 = bitcast <8 x i32>* %source to <4 x i32>*
 %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
 %tmp2 = load <4 x i32>* %arrayidx, align 16


@@ -33,7 +33,7 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK: shuf3:
-; CHECK: shufps
+; CHECK: shufd
 %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
 %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>