[CodeGenPrepare] Teach when it is profitable to speculate calls to @llvm.cttz/ctlz.

If the control flow is modelling an if-statement where the only instruction in
the 'then' basic block (excluding the terminator) is a call to cttz/ctlz,
CodeGenPrepare can try to speculate the cttz/ctlz call and simplify the control
flow graph.

Example:
\code
entry:
  %cmp = icmp eq i64 %val, 0
  br i1 %cmp, label %end.bb, label %then.bb

then.bb:
  %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
  br label %end.bb

end.bb:
  %cond = phi i64 [ %c, %then.bb ], [ 64, %entry]
\code

In this example, basic block %then.bb is taken if value %val is not zero.
Also, the phi node in %end.bb would propagate the size-of in bits of %val
only if %val is equal to zero.

With this patch, CodeGenPrepare will try to hoist the call to cttz from %then.bb
into basic block %entry only if cttz is cheap to speculate for the target.

Added two new hooks in TargetLowering.h to let targets customize the behavior
(i.e. decide whether it is cheap or not to speculate calls to cttz/ctlz). The
two new methods are 'isCheapToSpeculateCtlz' and 'isCheapToSpeculateCttz'.
By default, both methods return 'false'.
On X86, method 'isCheapToSpeculateCtlz' returns true only if the target has
LZCNT. Method 'isCheapToSpeculateCttz' only returns true if the target has BMI.

Differential Revision: http://reviews.llvm.org/D6728

llvm-svn: 224899
This commit is contained in:
Andrea Di Biagio 2014-12-28 11:07:35 +00:00
parent 738e58799c
commit 22ee3f63b9
5 changed files with 415 additions and 0 deletions

View File

@ -249,6 +249,16 @@ public:
return true;
}
/// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
virtual bool isCheapToSpeculateCttz() const {
return false;
}
/// \brief Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual bool isCheapToSpeculateCtlz() const {
return false;
}
/// \brief Return if the target supports combining a
/// chain like:
/// \code

View File

@ -3978,6 +3978,119 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
}
// See if we can speculate calls to intrinsic cttz/ctlz.
//
// Example:
// entry:
// ...
// %cmp = icmp eq i64 %val, 0
// br i1 %cmp, label %end.bb, label %then.bb
//
// then.bb:
// %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
// br label %EndBB
//
// end.bb:
// %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ]
//
// ==>
//
// entry:
// ...
// %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false)
//
static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
assert(BrInst->isConditional() && "Expected a conditional branch!");
BasicBlock *ThenBB = BrInst->getSuccessor(1);
BasicBlock *EndBB = BrInst->getSuccessor(0);
// See if ThenBB contains only one instruction (excluding the
// terminator and DbgInfoIntrinsic calls).
IntrinsicInst *II = nullptr;
for (BasicBlock::iterator I = ThenBB->begin(),
E = std::prev(ThenBB->end()); I != E; ++I) {
// Skip debug info.
if (isa<DbgInfoIntrinsic>(I))
continue;
if (II)
// Avoid speculating more than one instruction.
return false;
// See if this is a call to intrinsic cttz/ctlz.
if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
// Avoid speculating expensive intrinsic calls.
if (!TLI.isCheapToSpeculateCttz())
return false;
}
else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
// Avoid speculating expensive intrinsic calls.
if (!TLI.isCheapToSpeculateCtlz())
return false;
} else
return false;
II = cast<IntrinsicInst>(I);
}
// Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
BasicBlock *EntryBB = BrInst->getParent();
for (BasicBlock::iterator I = EndBB->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I) {
Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
Value *OrigV = PN->getIncomingValueForBlock(EntryBB);
if (!OrigV || ThenV != II)
return false;
if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
unsigned BitWidth = ThenV->getType()->getIntegerBitWidth();
// Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz
// intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits
// of the value in input to the cttz/ctlz.
if (CInt->getValue() != BitWidth)
return false;
// Hoist the call to cttz/ctlz from ThenBB into EntryBB.
EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
ThenBB->begin(), std::prev(ThenBB->end()));
// Update PN setting ThenV as the incoming value from both 'EntryBB'
// and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
// phi node if all the incoming values are the same.
PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);
// Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
Type *Ty = II->getArgOperand(0)->getType();
Value *Args[] = { II->getArgOperand(0),
ConstantInt::getFalse(II->getContext()) };
Module *M = EntryBB->getParent()->getParent();
Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
IRBuilder<> Builder(BrInst);
Instruction *NewI = Builder.CreateCall(IF, Args);
// Replace the old call to cttz/ctlz.
II->replaceAllUsesWith(NewI);
II->eraseFromParent();
}
// Update BrInst condition so that the branch to EndBB is always taken.
// Later on, method 'ConstantFoldTerminator' will simplify this branch
// replacing it with a direct branch to 'EndBB'.
// As a side effect, CodeGenPrepare will attempt to simplify the control
// flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into
// 'EndBB' (calling method 'EliminateFallThrough').
BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext()));
return true;
}
}
return false;
}
/// Some targets can do store(extractelement) with one instruction.
/// Try to push the extractelement towards the stores when the target
/// has this feature and this is profitable.
@ -4130,6 +4243,34 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
if (isa<ExtractElementInst>(I))
return OptimizeExtractElementInst(I);
if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
if (TLI && BI->isConditional() && BI->getCondition()->hasOneUse()) {
// Check if the branch condition compares a value agaist zero.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
if (ICI->getPredicate() == ICmpInst::ICMP_EQ &&
match(ICI->getOperand(1), m_Zero())) {
BasicBlock *ThenBB = BI->getSuccessor(1);
BasicBlock *EndBB = BI->getSuccessor(0);
// Check if ThenBB is only reachable from this basic block; also,
// check if EndBB has more than one predecessor.
if (ThenBB->getSinglePredecessor() &&
!EndBB->getSinglePredecessor()) {
TerminatorInst *TI = ThenBB->getTerminator();
if (TI->getNumSuccessors() == 1 && TI->getSuccessor(0) == EndBB &&
// Try to speculate calls to intrinsic cttz/ctlz from 'ThenBB'.
OptimizeBranchInst(BI, *TLI)) {
ModifiedDT = true;
return true;
}
}
}
}
}
return false;
}
return false;
}

View File

@ -3882,6 +3882,16 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Don't try to speculate cttz if we can't directly use TZCNT.
return Subtarget->hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Don't try to speculate ctlz if we can't directly use LZCNT.
return Subtarget->hasLZCNT();
}
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {

View File

@ -633,6 +633,10 @@ namespace llvm {
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;

View File

@ -0,0 +1,250 @@
; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI
; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT
; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC
define i64 @test1(i64 %A) {
; ALL-LABEL: @test1(
; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
; LZCNT-NEXT: ret i64 [[CTLZ]]
; BMI: icmp eq i64 %A, 0
; BMI: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
; GENERIC: icmp eq i64 %A, 0
; GENERIC: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
entry:
%tobool = icmp eq i64 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
ret i64 %cond
}
define i32 @test2(i32 %A) {
; ALL-LABEL: @test2(
; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %A, i1 false)
; LZCNT-NEXT: ret i32 [[CTLZ]]
; BMI: icmp eq i32 %A, 0
; BMI: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
; GENERIC: icmp eq i32 %A, 0
; GENERIC: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
entry:
%tobool = icmp eq i32 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
ret i32 %cond
}
define signext i16 @test3(i16 signext %A) {
; ALL-LABEL: @test3(
; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i16 @llvm.ctlz.i16(i16 %A, i1 false)
; LZCNT-NEXT: ret i16 [[CTLZ]]
; BMI: icmp eq i16 %A, 0
; BMI: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
; GENERIC: icmp eq i16 %A, 0
; GENERIC: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
entry:
%tobool = icmp eq i16 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
ret i16 %cond
}
define i64 @test1b(i64 %A) {
; ALL-LABEL: @test1b(
; LZCNT: icmp eq i64 %A, 0
; LZCNT: call i64 @llvm.cttz.i64(i64 %A, i1 true)
; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %A, i1 false)
; BMI-NEXT: ret i64 [[CTTZ]]
; GENERIC: icmp eq i64 %A, 0
; GENERIC: call i64 @llvm.cttz.i64(i64 %A, i1 true)
entry:
%tobool = icmp eq i64 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
ret i64 %cond
}
define i32 @test2b(i32 %A) {
; ALL-LABEL: @test2b(
; LZCNT: icmp eq i32 %A, 0
; LZCNT: call i32 @llvm.cttz.i32(i32 %A, i1 true)
; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %A, i1 false)
; BMI-NEXT: ret i32 [[CTTZ]]
; GENERIC: icmp eq i32 %A, 0
; GENERIC: call i32 @llvm.cttz.i32(i32 %A, i1 true)
entry:
%tobool = icmp eq i32 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
ret i32 %cond
}
define signext i16 @test3b(i16 signext %A) {
; ALL-LABEL: @test3b(
; LZCNT: icmp eq i16 %A, 0
; LZCNT: call i16 @llvm.cttz.i16(i16 %A, i1 true)
; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i16 @llvm.cttz.i16(i16 %A, i1 false)
; BMI-NEXT: ret i16 [[CTTZ]]
; GENERIC: icmp eq i16 %A, 0
; GENERIC: call i16 @llvm.cttz.i16(i16 %A, i1 true)
entry:
%tobool = icmp eq i16 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
ret i16 %cond
}
define i64 @test1c(i64 %A) {
; ALL-LABEL: @test1c(
; ALL: icmp eq i64 %A, 0
; ALL: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
entry:
%tobool = icmp eq i64 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
ret i64 %cond
}
define i32 @test2c(i32 %A) {
; ALL-LABEL: @test2c(
; ALL: icmp eq i32 %A, 0
; ALL: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
entry:
%tobool = icmp eq i32 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
ret i32 %cond
}
define signext i16 @test3c(i16 signext %A) {
; ALL-LABEL: @test3c(
; ALL: icmp eq i16 %A, 0
; ALL: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
entry:
%tobool = icmp eq i16 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
ret i16 %cond
}
define i64 @test1d(i64 %A) {
; ALL-LABEL: @test1d(
; ALL: icmp eq i64 %A, 0
; ALL: call i64 @llvm.cttz.i64(i64 %A, i1 true)
entry:
%tobool = icmp eq i64 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
ret i64 %cond
}
define i32 @test2d(i32 %A) {
; ALL-LABEL: @test2d(
; ALL: icmp eq i32 %A, 0
; ALL: call i32 @llvm.cttz.i32(i32 %A, i1 true)
entry:
%tobool = icmp eq i32 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
ret i32 %cond
}
define signext i16 @test3d(i16 signext %A) {
; ALL-LABEL: @test3d(
; ALL: icmp eq i16 %A, 0
; ALL: call i16 @llvm.cttz.i16(i16 %A, i1 true)
entry:
%tobool = icmp eq i16 %A, 0
br i1 %tobool, label %cond.end, label %cond.true
cond.true: ; preds = %entry
%0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
br label %cond.end
cond.end: ; preds = %entry, %cond.true
%cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
ret i16 %cond
}
declare i64 @llvm.ctlz.i64(i64, i1)
declare i32 @llvm.ctlz.i32(i32, i1)
declare i16 @llvm.ctlz.i16(i16, i1)
declare i64 @llvm.cttz.i64(i64, i1)
declare i32 @llvm.cttz.i32(i32, i1)
declare i16 @llvm.cttz.i16(i16, i1)