[JumpThreading] Re-enable JumpThreading for guards

Summary:
JumpThreading for guards feature has been reverted at https://reviews.llvm.org/rL295200
due to the following problem: the feature used the following algorithm for detection of
diamond patters:

1. Find a block with 2 predecessors;
2. Check that these blocks have a common single parent;
3. Check that the parent's terminator is a branch instruction.

The problem is that these checks are insufficient. They may pass for a non-diamond
construction in case if those two predecessors are actually the same block. This may
happen if parent's terminator is a br (either conditional or unconditional) to a block
that ends with "switch" instruction with exactly two branches going to one block.

This patch re-enables the JumpThreading for guards and fixes this issue by adding the
check that those found predecessors are actually different blocks. This guarantees that
parent's terminator is a conditional branch with exactly 2 different successors, which
is now ensured by assertions. It also adds two more tests for this situation (with parent's
terminator being a conditional and an unconditional branch).

Patch by Max Kazantsev!

Reviewers: anna, sanjoy, reames

Reviewed By: sanjoy

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30036

llvm-svn: 295410
This commit is contained in:
Sanjoy Das 2017-02-17 04:21:14 +00:00
parent a0921f68ab
commit 8b859c26ec
6 changed files with 439 additions and 14 deletions

View File

@ -23,6 +23,7 @@
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"
namespace llvm {
@ -62,6 +63,7 @@ class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = false;
bool HasGuards = false;
#ifdef NDEBUG
SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
#else
@ -122,6 +124,9 @@ public:
bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);
bool ProcessGuards(BasicBlock *BB);
bool ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, BranchInst *BI);
private:
BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
const char *Suffix);

View File

@ -245,6 +245,16 @@ Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
ValueToValueMapTy &VMap);
/// Split edge between BB and PredBB and duplicate all non-Phi instructions
/// from BB between its beginning and the StopAt instruction into the split
/// block. Phi nodes are not duplicated, but their uses are handled correctly:
/// we replace them with the uses of corresponding Phi inputs. ValueMapping
/// is used to map the original instructions from BB to their newly-created
/// copies. Returns the split block.
BasicBlock *
DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
Instruction *StopAt,
ValueToValueMapTy &ValueMapping);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CLONING_H

View File

@ -30,11 +30,13 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@ -169,6 +171,9 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
auto *GuardDecl = F.getParent()->getFunction(
Intrinsic::getName(Intrinsic::experimental_guard));
HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
@ -238,10 +243,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
return EverChanged;
}
/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
/// thread across it. Stop scanning the block when passing the threshold.
static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
/// Return the cost of duplicating a piece of this block from first non-phi
/// and before StopAt instruction to thread across it. Stop scanning the block
/// when exceeding the threshold. If duplication is impossible, returns ~0U.
static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
Instruction *StopAt,
unsigned Threshold) {
assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
@ -249,15 +257,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// branch, so they shouldn't count against the duplication cost.
unsigned Bonus = 0;
const TerminatorInst *BBTerm = BB->getTerminator();
// Threading through a switch statement is particularly profitable. If this
// block ends in a switch, decrease its cost to make it more likely to happen.
if (isa<SwitchInst>(BBTerm))
Bonus = 6;
if (BB->getTerminator() == StopAt) {
// Threading through a switch statement is particularly profitable. If this
// block ends in a switch, decrease its cost to make it more likely to
// happen.
if (isa<SwitchInst>(StopAt))
Bonus = 6;
// The same holds for indirect branches, but slightly more so.
if (isa<IndirectBrInst>(BBTerm))
Bonus = 8;
// The same holds for indirect branches, but slightly more so.
if (isa<IndirectBrInst>(StopAt))
Bonus = 8;
}
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
@ -266,7 +276,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
for (; !isa<TerminatorInst>(I); ++I) {
for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
@ -712,6 +722,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (TryToUnfoldSelectInCurrBB(BB))
return true;
// Look if we can propagate guards to predecessors.
if (HasGuards && ProcessGuards(BB))
return true;
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
@ -1466,7 +1480,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
return false;
}
unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
unsigned JumpThreadCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@ -1754,7 +1769,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
return false;
}
unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
unsigned DuplicationCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
@ -2019,3 +2035,130 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
return false;
}
/// Try to propagate a guard from the current BB into one of its predecessors
/// in case if another branch of execution implies that the condition of this
/// guard is always true. Currently we only process the simplest case that
/// looks like:
///
/// Start:
/// %cond = ...
/// br i1 %cond, label %T1, label %F1
/// T1:
/// br label %Merge
/// F1:
/// br label %Merge
/// Merge:
/// %condGuard = ...
/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
///
/// And cond either implies condGuard or !condGuard. In this case all the
/// instructions before the guard can be duplicated in both branches, and the
/// guard is then threaded to one of them.
bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
using namespace PatternMatch;
// We only want to deal with two predecessors.
BasicBlock *Pred1, *Pred2;
auto PI = pred_begin(BB), PE = pred_end(BB);
if (PI == PE)
return false;
Pred1 = *PI++;
if (PI == PE)
return false;
Pred2 = *PI++;
if (PI != PE)
return false;
if (Pred1 == Pred2)
return false;
// Try to thread one of the guards of the block.
// TODO: Look up deeper than to immediate predecessor?
auto *Parent = Pred1->getSinglePredecessor();
if (!Parent || Parent != Pred2->getSinglePredecessor())
return false;
if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
for (auto &I : *BB)
if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
return true;
return false;
}
/// Try to propagate the guard from BB which is the lower block of a diamond
/// to one of its branches, in case if diamond's condition implies guard's
/// condition.
bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
BranchInst *BI) {
assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
assert(BI->isConditional() && "Unconditional branch has 2 successors?");
Value *GuardCond = Guard->getArgOperand(0);
Value *BranchCond = BI->getCondition();
BasicBlock *TrueDest = BI->getSuccessor(0);
BasicBlock *FalseDest = BI->getSuccessor(1);
auto &DL = BB->getModule()->getDataLayout();
bool TrueDestIsSafe = false;
bool FalseDestIsSafe = false;
// True dest is safe if BranchCond => GuardCond.
auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
if (Impl && *Impl)
TrueDestIsSafe = true;
else {
// False dest is safe if !BranchCond => GuardCond.
Impl =
isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
if (Impl && *Impl)
FalseDestIsSafe = true;
}
if (!TrueDestIsSafe && !FalseDestIsSafe)
return false;
BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
ValueToValueMapTy UnguardedMapping, GuardedMapping;
Instruction *AfterGuard = Guard->getNextNode();
unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
if (Cost > BBDupThreshold)
return false;
// Duplicate all instructions before the guard and the guard itself to the
// branch where implication is not proved.
GuardedBlock = DuplicateInstructionsInSplitBetween(
BB, GuardedBlock, AfterGuard, GuardedMapping);
assert(GuardedBlock && "Could not create the guarded block?");
// Duplicate all instructions before the guard in the unguarded branch.
// Since we have successfully duplicated the guarded block and this block
// has fewer instructions, we expect it to succeed.
UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
Guard, UnguardedMapping);
assert(UnguardedBlock && "Could not create the unguarded block?");
DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
<< GuardedBlock->getName() << "\n");
// Some instructions before the guard may still have uses. For them, we need
// to create Phi nodes merging their copies in both guarded and unguarded
// branches. Those instructions that have no uses can be just removed.
SmallVector<Instruction *, 4> ToRemove;
for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
if (!isa<PHINode>(&*BI))
ToRemove.push_back(&*BI);
Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
assert(InsertionPoint && "Empty block?");
// Substitute with Phis & remove.
for (auto *Inst : reverse(ToRemove)) {
if (!Inst->use_empty()) {
PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
NewPN->insertBefore(InsertionPoint);
Inst->replaceAllUsesWith(NewPN);
}
Inst->eraseFromParent();
}
return true;
}

View File

@ -747,3 +747,40 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
return NewLoop;
}
/// \brief Duplicate non-Phi instructions from the beginning of block up to
/// StopAt instruction into a split block between BB and its predecessor.
BasicBlock *
llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
Instruction *StopAt,
ValueToValueMapTy &ValueMapping) {
// We are going to have to map operands from the original BB block to the new
// copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
// account for entry from PredBB.
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
BasicBlock *NewBB = SplitEdge(PredBB, BB);
NewBB->setName(PredBB->getName() + ".split");
Instruction *NewTerm = NewBB->getTerminator();
// Clone the non-phi instructions of BB into NewBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
for (; StopAt != &*BI; ++BI) {
Instruction *New = BI->clone();
New->setName(BI->getName());
New->insertBefore(NewTerm);
ValueMapping[&*BI] = New;
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
auto I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
}
}
return NewBB;
}

View File

@ -0,0 +1,183 @@
; RUN: opt < %s -jump-threading -dce -S | FileCheck %s
declare void @llvm.experimental.guard(i1, ...)
declare i32 @f1()
declare i32 @f2()
define i32 @branch_implies_guard(i32 %a) {
; CHECK-LABEL: @branch_implies_guard(
%cond = icmp slt i32 %a, 10
br i1 %cond, label %T1, label %F1
T1:
; CHECK: T1.split
; CHECK: %v1 = call i32 @f1()
; CHECK-NEXT: %retVal
; CHECK-NEXT: br label %Merge
%v1 = call i32 @f1()
br label %Merge
F1:
; CHECK: F1.split
; CHECK: %v2 = call i32 @f2()
; CHECK-NEXT: %retVal
; CHECK-NEXT: %condGuard
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
; CHECK-NEXT: br label %Merge
%v2 = call i32 @f2()
br label %Merge
Merge:
; CHECK: Merge
; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard(
%retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
%retVal = add i32 %retPhi, 10
%condGuard = icmp slt i32 %a, 20
call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
ret i32 %retVal
}
define i32 @not_branch_implies_guard(i32 %a) {
; CHECK-LABEL: @not_branch_implies_guard(
%cond = icmp slt i32 %a, 20
br i1 %cond, label %T1, label %F1
T1:
; CHECK: T1.split:
; CHECK-NEXT: %v1 = call i32 @f1()
; CHECK-NEXT: %retVal
; CHECK-NEXT: %condGuard
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
; CHECK-NEXT: br label %Merge
%v1 = call i32 @f1()
br label %Merge
F1:
; CHECK: F1.split:
; CHECK-NEXT: %v2 = call i32 @f2()
; CHECK-NEXT: %retVal
; CHECK-NEXT: br label %Merge
%v2 = call i32 @f2()
br label %Merge
Merge:
; CHECK: Merge
; CHECK-NOT: call void(i1, ...) @llvm.experimental.guard(
%retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
%retVal = add i32 %retPhi, 10
%condGuard = icmp sgt i32 %a, 10
call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
ret i32 %retVal
}
define i32 @branch_overlaps_guard(i32 %a) {
; CHECK-LABEL: @branch_overlaps_guard(
%cond = icmp slt i32 %a, 20
br i1 %cond, label %T1, label %F1
T1:
; CHECK: T1:
; CHECK-NEXT: %v1 = call i32 @f1()
; CHECK-NEXT: br label %Merge
%v1 = call i32 @f1()
br label %Merge
F1:
; CHECK: F1:
; CHECK-NEXT: %v2 = call i32 @f2()
; CHECK-NEXT: br label %Merge
%v2 = call i32 @f2()
br label %Merge
Merge:
; CHECK: Merge
; CHECK: %condGuard = icmp slt i32 %a, 10
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
%retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
%retVal = add i32 %retPhi, 10
%condGuard = icmp slt i32 %a, 10
call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
ret i32 %retVal
}
define i32 @branch_doesnt_overlap_guard(i32 %a) {
; CHECK-LABEL: @branch_doesnt_overlap_guard(
%cond = icmp slt i32 %a, 10
br i1 %cond, label %T1, label %F1
T1:
; CHECK: T1:
; CHECK-NEXT: %v1 = call i32 @f1()
; CHECK-NEXT: br label %Merge
%v1 = call i32 @f1()
br label %Merge
F1:
; CHECK: F1:
; CHECK-NEXT: %v2 = call i32 @f2()
; CHECK-NEXT: br label %Merge
%v2 = call i32 @f2()
br label %Merge
Merge:
; CHECK: Merge
; CHECK: %condGuard = icmp sgt i32 %a, 20
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
%retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
%retVal = add i32 %retPhi, 10
%condGuard = icmp sgt i32 %a, 20
call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
ret i32 %retVal
}
define i32 @not_a_diamond1(i32 %a, i1 %cond1) {
; CHECK-LABEL: @not_a_diamond1(
br i1 %cond1, label %Pred, label %Exit
Pred:
; CHECK: Pred:
; CHECK-NEXT: switch i32 %a, label %Exit
switch i32 %a, label %Exit [
i32 10, label %Merge
i32 20, label %Merge
]
Merge:
; CHECK: Merge:
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
; CHECK-NEXT: br label %Exit
call void(i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
br label %Exit
Exit:
; CHECK: Exit:
; CHECK-NEXT: ret i32 %a
ret i32 %a
}
define void @not_a_diamond2(i32 %a, i1 %cond1) {
; CHECK-LABEL: @not_a_diamond2(
br label %Parent
Merge:
call void(i1, ...) @llvm.experimental.guard(i1 %cond1)[ "deopt"() ]
ret void
Pred:
; CHECK-NEXT: Pred:
; CHECK-NEXT: switch i32 %a, label %Exit
switch i32 %a, label %Exit [
i32 10, label %Merge
i32 20, label %Merge
]
Parent:
br label %Pred
Exit:
; CHECK: Merge:
; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
; CHECK-NEXT: ret void
ret void
}

View File

@ -201,6 +201,53 @@ TEST_F(CloneInstruction, CallingConvention) {
delete F2;
}
TEST_F(CloneInstruction, DuplicateInstructionsToSplit) {
Type *ArgTy1[] = {Type::getInt32PtrTy(context)};
FunctionType *FT = FunctionType::get(Type::getVoidTy(context), ArgTy1, false);
V = new Argument(Type::getInt32Ty(context));
Function *F = Function::Create(FT, Function::ExternalLinkage);
BasicBlock *BB1 = BasicBlock::Create(context, "", F);
IRBuilder<> Builder1(BB1);
BasicBlock *BB2 = BasicBlock::Create(context, "", F);
IRBuilder<> Builder2(BB2);
Builder1.CreateBr(BB2);
Instruction *AddInst = cast<Instruction>(Builder2.CreateAdd(V, V));
Instruction *MulInst = cast<Instruction>(Builder2.CreateMul(AddInst, V));
Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
Builder2.CreateRetVoid();
ValueToValueMapTy Mapping;
auto Split = DuplicateInstructionsInSplitBetween(BB2, BB1, SubInst, Mapping);
EXPECT_TRUE(Split);
EXPECT_EQ(Mapping.size(), 2u);
EXPECT_TRUE(Mapping.find(AddInst) != Mapping.end());
EXPECT_TRUE(Mapping.find(MulInst) != Mapping.end());
auto AddSplit = dyn_cast<Instruction>(Mapping[AddInst]);
EXPECT_TRUE(AddSplit);
EXPECT_EQ(AddSplit->getOperand(0), V);
EXPECT_EQ(AddSplit->getOperand(1), V);
EXPECT_EQ(AddSplit->getParent(), Split);
auto MulSplit = dyn_cast<Instruction>(Mapping[MulInst]);
EXPECT_TRUE(MulSplit);
EXPECT_EQ(MulSplit->getOperand(0), AddSplit);
EXPECT_EQ(MulSplit->getOperand(1), V);
EXPECT_EQ(MulSplit->getParent(), Split);
EXPECT_EQ(AddSplit->getNextNode(), MulSplit);
EXPECT_EQ(MulSplit->getNextNode(), Split->getTerminator());
delete F;
}
class CloneFunc : public ::testing::Test {
protected:
void SetUp() override {