[LICM] Hoist loads that are dominated by invariant.start intrinsic, and are invariant in the loop.
Summary: We can hoist out loads that are dominated by invariant.start, to the preheader. We conservatively assume the load is variant, if we see a corresponding use of invariant.start (it could be an invariant.end or an escaping call). Reviewers: mkuper, sanjoy, reames Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D29331 llvm-svn: 293887
This commit is contained in:
parent
fc19a8ff07
commit
7f4b26e189
|
@ -81,6 +81,11 @@ static cl::opt<bool>
|
||||||
DisablePromotion("disable-licm-promotion", cl::Hidden,
|
DisablePromotion("disable-licm-promotion", cl::Hidden,
|
||||||
cl::desc("Disable memory promotion in LICM pass"));
|
cl::desc("Disable memory promotion in LICM pass"));
|
||||||
|
|
||||||
|
static cl::opt<uint32_t> MaxNumUsesTraversed(
|
||||||
|
"licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
|
||||||
|
cl::desc("Max num uses visited for identifying load "
|
||||||
|
"invariance in loop using invariant start (default = 8)"));
|
||||||
|
|
||||||
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
|
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
|
||||||
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
|
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
|
||||||
const LoopSafetyInfo *SafetyInfo);
|
const LoopSafetyInfo *SafetyInfo);
|
||||||
|
@ -480,6 +485,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
|
||||||
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
|
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return true if LI is invariant within scope of the loop. LI is invariant if
|
||||||
|
// CurLoop is dominated by an invariant.start representing the same memory location
|
||||||
|
// and size as the memory location LI loads from, and also the invariant.start
|
||||||
|
// has no uses.
|
||||||
|
static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
|
||||||
|
Loop *CurLoop) {
|
||||||
|
Value *Addr = LI->getOperand(0);
|
||||||
|
const DataLayout &DL = LI->getModule()->getDataLayout();
|
||||||
|
const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
|
||||||
|
cast<PointerType>(Addr->getType())->getElementType());
|
||||||
|
|
||||||
|
// if the type is i8 addrspace(x)*, we know this is the type of
|
||||||
|
// llvm.invariant.start operand
|
||||||
|
auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
|
||||||
|
LI->getPointerAddressSpace());
|
||||||
|
unsigned BitcastsVisited = 0;
|
||||||
|
// Look through bitcasts until we reach the i8* type (this is invariant.start
|
||||||
|
// operand type).
|
||||||
|
while (Addr->getType() != PtrInt8Ty) {
|
||||||
|
auto *BC = dyn_cast<BitCastInst>(Addr);
|
||||||
|
// Avoid traversing high number of bitcast uses.
|
||||||
|
if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
|
||||||
|
return false;
|
||||||
|
Addr = BC->getOperand(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned UsesVisited = 0;
|
||||||
|
// Traverse all uses of the load operand value, to see if invariant.start is
|
||||||
|
// one of the uses, and whether it dominates the load instruction.
|
||||||
|
for (auto *U : Addr->users()) {
|
||||||
|
// Avoid traversing for Load operand with high number of users.
|
||||||
|
if (++UsesVisited > MaxNumUsesTraversed)
|
||||||
|
return false;
|
||||||
|
IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
|
||||||
|
// If there are escaping uses of invariant.start instruction, the load maybe
|
||||||
|
// non-invariant.
|
||||||
|
if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
|
||||||
|
II->hasNUsesOrMore(1))
|
||||||
|
continue;
|
||||||
|
unsigned InvariantSizeInBits =
|
||||||
|
cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
|
||||||
|
// Confirm the invariant.start location size contains the load operand size
|
||||||
|
// in bits. Also, the invariant.start should dominate the load, and we
|
||||||
|
// should not hoist the load out of a loop that contains this dominating
|
||||||
|
// invariant.start.
|
||||||
|
if (LocSizeInBits <= InvariantSizeInBits &&
|
||||||
|
DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
|
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
|
||||||
Loop *CurLoop, AliasSetTracker *CurAST,
|
Loop *CurLoop, AliasSetTracker *CurAST,
|
||||||
LoopSafetyInfo *SafetyInfo,
|
LoopSafetyInfo *SafetyInfo,
|
||||||
|
@ -496,6 +554,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
|
||||||
if (LI->getMetadata(LLVMContext::MD_invariant_load))
|
if (LI->getMetadata(LLVMContext::MD_invariant_load))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
// This checks for an invariant.start dominating the load.
|
||||||
|
if (isLoadInvariantInLoop(LI, DT, CurLoop))
|
||||||
|
return true;
|
||||||
|
|
||||||
// Don't hoist loads which have may-aliased stores in loop.
|
// Don't hoist loads which have may-aliased stores in loop.
|
||||||
uint64_t Size = 0;
|
uint64_t Size = 0;
|
||||||
if (LI->getType()->isSized())
|
if (LI->getType()->isSized())
|
||||||
|
|
|
@ -149,3 +149,174 @@ latch:
|
||||||
return:
|
return:
|
||||||
ret i32 %sum
|
ret i32 %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
|
||||||
|
declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind
|
||||||
|
declare void @escaping.invariant.start({}*) nounwind
|
||||||
|
; invariant.start dominates the load, and in this scope, the
|
||||||
|
; load is invariant. So, we can hoist the `addrld` load out of the loop.
|
||||||
|
define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) {
|
||||||
|
; CHECK-LABEL: @test_fence
|
||||||
|
; CHECK-LABEL: entry
|
||||||
|
; CHECK: invariant.start
|
||||||
|
; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
; CHECK: br label %loop
|
||||||
|
entry:
|
||||||
|
%gep = getelementptr inbounds i8, i8* %addr, i64 8
|
||||||
|
%addr.i = bitcast i8* %gep to i32 *
|
||||||
|
store atomic i32 5, i32 * %addr.i unordered, align 8
|
||||||
|
fence release
|
||||||
|
%invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
|
||||||
|
br label %loop
|
||||||
|
|
||||||
|
loop:
|
||||||
|
%indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
|
||||||
|
%sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
|
||||||
|
%volload = load atomic i8, i8* %volatile unordered, align 8
|
||||||
|
fence acquire
|
||||||
|
%volchk = icmp eq i8 %volload, 0
|
||||||
|
%addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
%sel = select i1 %volchk, i32 0, i32 %addrld
|
||||||
|
%sum.next = add i32 %sel, %sum
|
||||||
|
%indvar.next = add i32 %indvar, 1
|
||||||
|
%cond = icmp slt i32 %indvar.next, %n
|
||||||
|
br i1 %cond, label %loop, label %loopexit
|
||||||
|
|
||||||
|
loopexit:
|
||||||
|
ret i32 %sum
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
; Same as test above, but the load is no longer invariant (presence of
|
||||||
|
; invariant.end). We cannot hoist the addrld out of loop.
|
||||||
|
define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) {
|
||||||
|
; CHECK-LABEL: @test_fence1
|
||||||
|
; CHECK-LABEL: entry
|
||||||
|
; CHECK: invariant.start
|
||||||
|
; CHECK-NEXT: invariant.end
|
||||||
|
; CHECK-NEXT: br label %loop
|
||||||
|
entry:
|
||||||
|
%gep = getelementptr inbounds i8, i8* %addr, i64 8
|
||||||
|
%addr.i = bitcast i8* %gep to i32 *
|
||||||
|
store atomic i32 5, i32 * %addr.i unordered, align 8
|
||||||
|
fence release
|
||||||
|
%invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
|
||||||
|
call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep)
|
||||||
|
br label %loop
|
||||||
|
|
||||||
|
loop:
|
||||||
|
%indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
|
||||||
|
%sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
|
||||||
|
%volload = load atomic i8, i8* %volatile unordered, align 8
|
||||||
|
fence acquire
|
||||||
|
%volchk = icmp eq i8 %volload, 0
|
||||||
|
%addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
%sel = select i1 %volchk, i32 0, i32 %addrld
|
||||||
|
%sum.next = add i32 %sel, %sum
|
||||||
|
%indvar.next = add i32 %indvar, 1
|
||||||
|
%cond = icmp slt i32 %indvar.next, %n
|
||||||
|
br i1 %cond, label %loop, label %loopexit
|
||||||
|
|
||||||
|
loopexit:
|
||||||
|
ret i32 %sum
|
||||||
|
}
|
||||||
|
|
||||||
|
; same as test above, but instead of invariant.end, we have the result of
|
||||||
|
; invariant.start escaping through a call. We cannot hoist the load.
|
||||||
|
define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) {
|
||||||
|
; CHECK-LABEL: @test_fence2
|
||||||
|
; CHECK-LABEL: entry
|
||||||
|
; CHECK-NOT: load
|
||||||
|
; CHECK: br label %loop
|
||||||
|
entry:
|
||||||
|
%gep = getelementptr inbounds i8, i8* %addr, i64 8
|
||||||
|
%addr.i = bitcast i8* %gep to i32 *
|
||||||
|
store atomic i32 5, i32 * %addr.i unordered, align 8
|
||||||
|
fence release
|
||||||
|
%invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
|
||||||
|
call void @escaping.invariant.start({}* %invst)
|
||||||
|
br label %loop
|
||||||
|
|
||||||
|
loop:
|
||||||
|
%indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
|
||||||
|
%sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
|
||||||
|
%volload = load atomic i8, i8* %volatile unordered, align 8
|
||||||
|
fence acquire
|
||||||
|
%volchk = icmp eq i8 %volload, 0
|
||||||
|
%addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
%sel = select i1 %volchk, i32 0, i32 %addrld
|
||||||
|
%sum.next = add i32 %sel, %sum
|
||||||
|
%indvar.next = add i32 %indvar, 1
|
||||||
|
%cond = icmp slt i32 %indvar.next, %n
|
||||||
|
br i1 %cond, label %loop, label %loopexit
|
||||||
|
|
||||||
|
loopexit:
|
||||||
|
ret i32 %sum
|
||||||
|
}
|
||||||
|
|
||||||
|
; FIXME: invariant.start dominates the load, and in this scope, the
|
||||||
|
; load is invariant. So, we can hoist the `addrld` load out of the loop.
|
||||||
|
; Consider the loadoperand addr.i bitcasted before being passed to
|
||||||
|
; invariant.start
|
||||||
|
define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) {
|
||||||
|
; CHECK-LABEL: @test_fence3
|
||||||
|
; CHECK-LABEL: entry
|
||||||
|
; CHECK: invariant.start
|
||||||
|
; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
; CHECK: br label %loop
|
||||||
|
entry:
|
||||||
|
%addr.i = getelementptr inbounds i32, i32* %addr, i64 8
|
||||||
|
%gep = bitcast i32* %addr.i to i8 *
|
||||||
|
store atomic i32 5, i32 * %addr.i unordered, align 8
|
||||||
|
fence release
|
||||||
|
%invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
|
||||||
|
br label %loop
|
||||||
|
|
||||||
|
loop:
|
||||||
|
%indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
|
||||||
|
%sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
|
||||||
|
%volload = load atomic i8, i8* %volatile unordered, align 8
|
||||||
|
fence acquire
|
||||||
|
%volchk = icmp eq i8 %volload, 0
|
||||||
|
%addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
%sel = select i1 %volchk, i32 0, i32 %addrld
|
||||||
|
%sum.next = add i32 %sel, %sum
|
||||||
|
%indvar.next = add i32 %indvar, 1
|
||||||
|
%cond = icmp slt i32 %indvar.next, %n
|
||||||
|
br i1 %cond, label %loop, label %loopexit
|
||||||
|
|
||||||
|
loopexit:
|
||||||
|
ret i32 %sum
|
||||||
|
}
|
||||||
|
|
||||||
|
; We should not hoist the addrld out of the loop.
|
||||||
|
define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) {
|
||||||
|
; CHECK-LABEL: @test_fence4
|
||||||
|
; CHECK-LABEL: entry
|
||||||
|
; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
; CHECK: br label %loop
|
||||||
|
entry:
|
||||||
|
%addr.i = getelementptr inbounds i32, i32* %addr, i64 8
|
||||||
|
%gep = bitcast i32* %addr.i to i8 *
|
||||||
|
br label %loop
|
||||||
|
|
||||||
|
loop:
|
||||||
|
%indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
|
||||||
|
%sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
|
||||||
|
store atomic i32 5, i32 * %addr.i unordered, align 8
|
||||||
|
fence release
|
||||||
|
%invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
|
||||||
|
%volload = load atomic i8, i8* %volatile unordered, align 8
|
||||||
|
fence acquire
|
||||||
|
%volchk = icmp eq i8 %volload, 0
|
||||||
|
%addrld = load atomic i32, i32* %addr.i unordered, align 8
|
||||||
|
%sel = select i1 %volchk, i32 0, i32 %addrld
|
||||||
|
%sum.next = add i32 %sel, %sum
|
||||||
|
%indvar.next = add i32 %indvar, 1
|
||||||
|
%cond = icmp slt i32 %indvar.next, %n
|
||||||
|
br i1 %cond, label %loop, label %loopexit
|
||||||
|
|
||||||
|
loopexit:
|
||||||
|
ret i32 %sum
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue