Adds the ability to use an epilog remainder loop during loop unrolling and makes

this the default behavior.

Patch by Evgeny Stupachenko (evstupac@gmail.com).

Differential Revision: http://reviews.llvm.org/D18158

llvm-svn: 265388
This commit is contained in:
David L Kreitzer 2016-04-05 12:19:35 +00:00
parent 849045f2aa
commit 188de5ae69
15 changed files with 501 additions and 167 deletions

View File

@ -34,10 +34,11 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime,
LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC, bool PreserveLCSSA);
bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
bool AllowExpensiveTripCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
bool PreserveLCSSA);
bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
bool AllowExpensiveTripCount,
bool UseEpilogRemainder, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
bool PreserveLCSSA);
MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
}

View File

@ -44,6 +44,11 @@ using namespace llvm;
STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
/// Hidden command-line flag, enabled by default, that chooses where the
/// remainder ('left over') iterations of a runtime-unrolled loop are placed.
/// Its value is forwarded to UnrollRuntimeLoopRemainder as UseEpilogRemainder:
/// true requests an epilog remainder, false requests a prolog remainder.
static cl::opt<bool>
UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
"with epilog instead of prolog."));
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
static inline void remapInstruction(Instruction *I,
@ -288,12 +293,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
"convergent "
"operation.");
});
// Don't output the runtime loop prolog if Count is a multiple of
// TripMultiple. Such a prolog is never needed, and is unsafe if the loop
// Don't output the runtime loop remainder if Count is a multiple of
// TripMultiple. Such a remainder is never needed, and is unsafe if the loop
// contains a convergent instruction.
if (RuntimeTripCount && TripMultiple % Count != 0 &&
!UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT,
PreserveLCSSA))
!UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
UnrollRuntimeEpilog, LI, SE, DT,
PreserveLCSSA))
return false;
// Notify ScalarEvolution that the loop will be substantially changed,

View File

@ -16,8 +16,8 @@
// case, we need to generate code to execute these 'left over' iterations.
//
// The current strategy generates an if-then-else sequence prior to the
// unrolled loop to execute the 'left over' iterations. Other strategies
// include generate a loop before or after the unrolled loop.
// unrolled loop to execute the 'left over' iterations before or after the
// unrolled loop.
//
//===----------------------------------------------------------------------===//
@ -60,33 +60,35 @@ STATISTIC(NumRuntimeUnrolled,
/// than the unroll factor.
///
static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
BasicBlock *OrigPH, BasicBlock *NewPH,
ValueToValueMapTy &VMap, DominatorTree *DT,
LoopInfo *LI, bool PreserveLCSSA) {
BasicBlock *PrologExit, BasicBlock *PreHeader,
BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
// Create a PHI node for each outgoing value from the original loop
// (which means it is an outgoing value from the prolog code too).
// The new PHI node is inserted in the prolog end basic block.
// The new PHI name is added as an operand of a PHI node in either
// The new PHI node value is added as an operand of a PHI node in either
// the loop header or the loop exit block.
for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
SBI != SBE; ++SBI) {
for (BasicBlock::iterator BBI = (*SBI)->begin();
PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
for (BasicBlock *Succ : successors(Latch)) {
for (Instruction &BBI : *Succ) {
PHINode *PN = dyn_cast<PHINode>(&BBI);
// Exit when we passed all PHI nodes.
if (!PN)
break;
// Add a new PHI node to the prolog end block and add the
// appropriate incoming values.
PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
PrologEnd->getTerminator());
PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
PrologExit->getFirstNonPHI());
// Adding a value to the new PHI node from the original loop preheader.
// This is the value that skips all the prolog code.
if (L->contains(PN)) {
NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
PreHeader);
} else {
NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH);
NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
}
Value *V = PN->getIncomingValueForBlock(Latch);
@ -97,22 +99,22 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
}
// Adding a value to the new PHI node from the last prolog block
// that was created.
NewPN->addIncoming(V, LastPrologBB);
NewPN->addIncoming(V, PrologLatch);
// Update the existing PHI node operand with the value from the
// new PHI node. How this is done depends on if the existing
// PHI node is in the original loop block, or the exit block.
if (L->contains(PN)) {
PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
} else {
PN->addIncoming(NewPN, PrologEnd);
PN->addIncoming(NewPN, PrologExit);
}
}
}
// Create a branch around the original loop, which is taken if there are no
// iterations remaining to be executed after running the prologue.
Instruction *InsertPt = PrologEnd->getTerminator();
Instruction *InsertPt = PrologExit->getTerminator();
IRBuilder<> B(InsertPt);
assert(Count != 0 && "nonsensical Count!");
@ -126,25 +128,152 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
BasicBlock *Exit = L->getUniqueExitBlock();
assert(Exit && "Loop must have a single exit block only");
// Split the exit to maintain loop canonicalization guarantees
SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI,
PreserveLCSSA);
// Add the branch to the exit block (around the unrolled loop)
B.CreateCondBr(BrLoopExit, Exit, NewPH);
B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
InsertPt->eraseFromParent();
}
/// Connect the unrolling epilog code to the original loop.
/// The unrolling epilog code contains code to execute the
/// 'extra' iterations if the run-time trip count modulo the
/// unroll count is non-zero.
///
/// This function performs the following:
/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
/// - Create PHI nodes at the unrolling loop exit to combine
/// values that exit the unrolling loop code and jump around it.
/// - Update PHI operands in the epilog loop by the new PHI nodes
/// - Branch around the epilog loop if extra iters (ModVal) is zero.
///
/// \param L the loop being unrolled; it must have a latch.
/// \param ModVal the number of extra iterations; the epilog is entered only
/// when this value is non-zero.
/// \param NewExit the exit block of the unrolling loop; its terminator is
/// replaced here by the conditional branch to the epilog.
/// \param Exit the exit block reached after the epilog (or directly from
/// NewExit when there are no extra iterations).
/// \param PreHeader the block that branches around the unrolling loop to
/// NewExit.
/// \param EpilogPreHeader the block that enters the epilog code.
/// \param NewPreHeader the incoming block of the unrolling loop's header PHIs
/// for values entering the loop.
/// \param VMap maps values and blocks of L to their clones in the epilog.
/// \param DT, LI, PreserveLCSSA forwarded to SplitBlockPredecessors when the
/// exit block is split.
///
static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
BasicBlock *Exit, BasicBlock *PreHeader,
BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
ValueToValueMapTy &VMap, DominatorTree *DT,
LoopInfo *LI, bool PreserveLCSSA) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
// The clone of the latch is the last block of the epilog code.
BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
// Loop structure should be the following:
//
// PreHeader
// NewPreHeader
// Header
// ...
// Latch
// NewExit (PN)
// EpilogPreHeader
// EpilogHeader
// ...
// EpilogLatch
// Exit (EpilogPN)
// Update PHI nodes at NewExit and Exit.
for (Instruction &BBI : *NewExit) {
PHINode *PN = dyn_cast<PHINode>(&BBI);
// Exit when we passed all PHI nodes.
if (!PN)
break;
// PN should be used in another PHI located in the Exit block, as
// Exit was split by SplitBlockPredecessors into Exit and NewExit.
// Basically it should look like:
// NewExit:
// PN = PHI [I, Latch]
// ...
// Exit:
// EpilogPN = PHI [PN, EpilogPreHeader]
//
// The incoming block is EpilogPreHeader instead of NewExit because
// NewExit was split one more time to get EpilogPreHeader.
assert(PN->hasOneUse() && "The phi should have 1 use");
PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
// Add the incoming value for the PreHeader edge, i.e. the branch around
// the loop. No loop iteration has produced a value on that path, so undef
// is used.
PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
Value *V = PN->getIncomingValueForBlock(Latch);
Instruction *I = dyn_cast<Instruction>(V);
if (I && L->contains(I))
// If the value comes from an instruction in the loop, use its VMap clone.
V = VMap[I];
// For an instruction outside the loop, a constant, or an undefined value,
// insert the value itself.
EpilogPN->addIncoming(V, EpilogLatch);
assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
"EpilogPN should have EpilogPreHeader incoming block");
// Change EpilogPreHeader incoming block to NewExit.
EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
NewExit);
// Now PHIs should look like:
// NewExit:
// PN = PHI [I, Latch], [undef, PreHeader]
// ...
// Exit:
// EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
}
// Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
// Update corresponding PHI nodes in epilog loop.
for (BasicBlock *Succ : successors(Latch)) {
// Skip this as we already updated phis in exit blocks.
if (!L->contains(Succ))
continue;
for (Instruction &BBI : *Succ) {
PHINode *PN = dyn_cast<PHINode>(&BBI);
// Exit when we passed all PHI nodes.
if (!PN)
break;
// Add new PHI nodes to the loop exit block and update epilog
// PHIs with the new PHI values.
PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
NewExit->getFirstNonPHI());
// Adding a value to the new PHI node from the unrolling loop preheader.
NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
// Adding a value to the new PHI node from the unrolling loop latch.
NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
// Update the existing PHI node operand with the value from the new PHI
// node. The corresponding instruction in the epilog loop should be a PHI.
PHINode *VPN = cast<PHINode>(VMap[&BBI]);
VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
}
}
// The new conditional branch replaces NewExit's current terminator.
Instruction *InsertPt = NewExit->getTerminator();
IRBuilder<> B(InsertPt);
// True when there are extra iterations left, i.e. the epilog must run.
Value *BrLoopExit = B.CreateIsNotNull(ModVal);
assert(Exit && "Loop must have a single exit block only");
// Split the exit to maintain loop canonicalization guarantees
SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
PreserveLCSSA);
// Add the branch around the epilog loop: enter EpilogPreHeader when ModVal
// is non-zero, otherwise go straight to Exit.
B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
InsertPt->eraseFromParent();
}
/// Create a clone of the blocks in a loop and connect them together.
/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
/// loop will be created including all cloned blocks, and the iterator of it
/// switches to count NewIter down to 0.
/// If CreateRemainderLoop is false, loop structure will not be cloned,
/// otherwise a new loop will be created including all cloned blocks, and the
/// iterator of it switches to count NewIter down to 0.
/// The cloned blocks should be inserted between InsertTop and InsertBot.
/// If loop structure is cloned InsertTop should be new preheader, InsertBot
/// new loop exit.
///
static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
static void CloneLoopBlocks(Loop *L, Value *NewIter,
const bool CreateRemainderLoop,
const bool UseEpilogRemainder,
BasicBlock *InsertTop, BasicBlock *InsertBot,
BasicBlock *Preheader,
std::vector<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
LoopInfo *LI) {
BasicBlock *Preheader = L->getLoopPreheader();
StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
Function *F = Header->getParent();
@ -152,7 +281,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
Loop *NewLoop = nullptr;
Loop *ParentLoop = L->getParentLoop();
if (!UnrollProlog) {
if (CreateRemainderLoop) {
NewLoop = new Loop();
if (ParentLoop)
ParentLoop->addChildLoop(NewLoop);
@ -163,7 +292,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
// For each block in the original loop, create a new copy,
// and update the value map with the newly created values.
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
NewBlocks.push_back(NewBB);
if (NewLoop)
@ -179,16 +308,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
}
if (Latch == *BB) {
// For the last block, if UnrollProlog is true, create a direct jump to
// InsertBot. If not, create a loop back to cloned head.
// For the last block, if CreateRemainderLoop is false, create a direct
// jump to InsertBot. If not, create a loop back to cloned head.
VMap.erase((*BB)->getTerminator());
BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
IRBuilder<> Builder(LatchBR);
if (UnrollProlog) {
if (!CreateRemainderLoop) {
Builder.CreateBr(InsertBot);
} else {
PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
suffix + ".iter",
FirstLoopBB->getFirstNonPHI());
Value *IdxSub =
Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
@ -207,9 +337,15 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
// cloned loop.
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
if (UnrollProlog) {
VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
if (!CreateRemainderLoop) {
if (UseEpilogRemainder) {
unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
NewPHI->setIncomingBlock(idx, InsertTop);
NewPHI->removeIncomingValue(Latch, false);
} else {
VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
}
} else {
unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
NewPHI->setIncomingBlock(idx, InsertTop);
@ -254,7 +390,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
}
}
/// Insert code in the prolog code when unrolling a loop with a
/// Insert code in the prolog/epilog code when unrolling a loop with a
/// run-time trip-count.
///
/// This method assumes that the loop unroll factor is total number
@ -266,6 +402,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
/// the switch instruction is generated.
///
/// ***Prolog case***
/// extraiters = tripcount % loopfactor
/// if (extraiters == 0) jump Loop:
/// else jump Prol
@ -277,17 +414,35 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
/// ...
/// End:
///
bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
bool AllowExpensiveTripCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
bool PreserveLCSSA) {
// For now, only unroll loops that contain a single exit.
/// ***Epilog case***
/// extraiters = tripcount % loopfactor
/// if (extraiters == tripcount) jump LoopExit:
/// unroll_iters = tripcount - extraiters
/// Loop: LoopBody; (executes unroll_iter times);
/// unroll_iter -= 1
/// if (unroll_iter != 0) jump Loop:
/// LoopExit:
/// if (extraiters == 0) jump EpilExit:
/// Epil: LoopBody; (executes extraiters times)
/// extraiters -= 1 // Omitted if unroll factor is 2.
/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
/// EpilExit:
bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
bool AllowExpensiveTripCount,
bool UseEpilogRemainder,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, bool PreserveLCSSA) {
// for now, only unroll loops that contain a single exit
if (!L->getExitingBlock())
return false;
// Make sure the loop is in canonical form, and there is a single
// exit block only.
if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
if (!L->isLoopSimplifyForm())
return false;
BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
if (!Exit)
return false;
// Use Scalar Evolution to compute the trip count. This allows more loops to
@ -311,8 +466,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
return false;
BasicBlock *Header = L->getHeader();
BasicBlock *PH = L->getLoopPreheader();
BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
BasicBlock *PreHeader = L->getLoopPreheader();
BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
const DataLayout &DL = Header->getModule()->getDataLayout();
SCEVExpander Expander(*SE, DL, "loop-unroll");
if (!AllowExpensiveTripCount &&
@ -330,26 +485,75 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
SE->forgetLoop(ParentLoop);
BasicBlock *Latch = L->getLoopLatch();
// It helps to split the original preheader twice, one for the end of the
// prolog code and one for a new loop preheader.
BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
PreHeaderBR = cast<BranchInst>(PH->getTerminator());
// Loop structure is the following:
//
// PreHeader
// Header
// ...
// Latch
// Exit
BasicBlock *NewPreHeader;
BasicBlock *NewExit = nullptr;
BasicBlock *PrologExit = nullptr;
BasicBlock *EpilogPreHeader = nullptr;
BasicBlock *PrologPreHeader = nullptr;
if (UseEpilogRemainder) {
// If epilog remainder
// Split PreHeader to insert a branch around loop for unrolling.
NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
NewPreHeader->setName(PreHeader->getName() + ".new");
// Split Exit to create phi nodes from branch above.
SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
DT, LI, PreserveLCSSA);
// Split NewExit to insert epilog remainder loop.
EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
} else {
// If prolog remainder
// Split the original preheader twice to insert prolog remainder loop
PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
PrologPreHeader->setName(Header->getName() + ".prol.preheader");
PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
DT, LI);
PrologExit->setName(Header->getName() + ".prol.loopexit");
// Split PrologExit to get NewPreHeader.
NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
NewPreHeader->setName(PreHeader->getName() + ".new");
}
// Loop structure should be the following:
// Epilog Prolog
//
// PreHeader PreHeader
// *NewPreHeader *PrologPreHeader
// Header *PrologExit
// ... *NewPreHeader
// Latch Header
// *NewExit ...
// *EpilogPreHeader Latch
// Exit Exit
// Calculate conditions for branch around loop for unrolling
// in epilog case and around prolog remainder loop in prolog case.
// Compute the number of extra iterations required, which is:
// extra iterations = run-time trip count % (loop unroll factor + 1)
// extra iterations = run-time trip count % loop unroll factor
PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
PreHeaderBR);
Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
PreHeaderBR);
IRBuilder<> B(PreHeaderBR);
Value *ModVal;
// Calculate ModVal = (BECount + 1) % Count.
// Note that TripCount is BECount + 1.
if (isPowerOf2_32(Count)) {
// When Count is a power of 2 we don't need BECount for the epilog case;
// however, we'll need it for a branch around the unrolling loop in the prolog case.
ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
// 1. There are no iterations to be run in the prologue loop.
// 1. There are no iterations to be run in the prolog/epilog loop.
// OR
// 2. The addition computing TripCount overflowed.
//
@ -371,18 +575,18 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
ConstantInt::get(BECount->getType(), Count),
"xtraiter");
}
Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
// Branch to either the extra iterations or the cloned/unrolled loop.
// We will fix up the true branch label when adding loop body copies.
B.CreateCondBr(BranchVal, PEnd, PEnd);
assert(PreHeaderBR->isUnconditional() &&
PreHeaderBR->getSuccessor(0) == PEnd &&
"CFG edges in Preheader are not correct");
Value *CmpOperand =
UseEpilogRemainder ? TripCount :
ConstantInt::get(TripCount->getType(), 0);
Value *BranchVal = B.CreateICmpNE(ModVal, CmpOperand, "lcmp.mod");
BasicBlock *FirstLoop = UseEpilogRemainder ? NewPreHeader : PrologPreHeader;
BasicBlock *SecondLoop = UseEpilogRemainder ? NewExit : PrologExit;
// Branch to either remainder (extra iterations) loop or unrolling loop.
B.CreateCondBr(BranchVal, FirstLoop, SecondLoop);
PreHeaderBR->eraseFromParent();
Function *F = Header->getParent();
// Get an ordered list of blocks in the loop to help with the ordering of the
// cloned blocks in the prolog code.
// cloned blocks in the prolog/epilog code
LoopBlocksDFS LoopBlocks(L);
LoopBlocks.perform(LI);
@ -394,17 +598,38 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
std::vector<BasicBlock *> NewBlocks;
ValueToValueMapTy VMap;
bool UnrollPrologue = Count == 2;
// For unroll factor 2 the remainder loop would have exactly 1 iteration.
// Do not create a 1-iteration loop.
bool CreateRemainderLoop = (Count != 2);
// Clone all the basic blocks in the loop. If Count is 2, we don't clone
// the loop, otherwise we create a cloned loop to execute the extra
// iterations. This function adds the appropriate CFG connections.
CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
VMap, LI);
BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);
// Insert the cloned blocks into the function just before the original loop.
F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(),
NewBlocks[0]->getIterator(), F->end());
// Insert the cloned blocks into the function.
F->getBasicBlockList().splice(InsertBot->getIterator(),
F->getBasicBlockList(),
NewBlocks[0]->getIterator(),
F->end());
// Loop structure should be the following:
// Epilog Prolog
//
// PreHeader PreHeader
// NewPreHeader PrologPreHeader
// Header PrologHeader
// ... ...
// Latch PrologLatch
// NewExit PrologExit
// EpilogPreHeader NewPreHeader
// EpilogHeader Header
// ... ...
// EpilogLatch Latch
// Exit Exit
// Rewrite the cloned instruction operands to use the values created when the
// clone is created.
@ -415,11 +640,38 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
}
}
// Connect the prolog code to the original loop and update the
// PHI functions.
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI,
PreserveLCSSA);
if (UseEpilogRemainder) {
// Connect the epilog code to the original loop and update the
// PHI functions.
ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
EpilogPreHeader, NewPreHeader, VMap, DT, LI,
PreserveLCSSA);
// Update the counter in the loop being unrolled.
// It should be a multiple of Count.
IRBuilder<> B2(NewPreHeader->getTerminator());
Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
B2.SetInsertPoint(LatchBR);
PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
Header->getFirstNonPHI());
Value *IdxSub =
B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
NewIdx->getName() + ".nsub");
Value *IdxCmp;
if (LatchBR->getSuccessor(0) == Header)
IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
else
IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
NewIdx->addIncoming(TestVal, NewPreHeader);
NewIdx->addIncoming(IdxSub, Latch);
LatchBR->setCondition(IdxCmp);
} else {
// Connect the prolog code to the original loop and update the
// PHI functions.
ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
VMap, DT, LI, PreserveLCSSA);
}
NumRuntimeUnrolled++;
return true;
}

View File

@ -1,13 +1,21 @@
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
; Tests for unrolling loops with run-time trip counts
; CHECK: %xtraiter = and i32 %n
; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0
; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
; EPILOG: %xtraiter = and i32 %n
; EPILOG: %lcmp.mod = icmp ne i32 %xtraiter, %n
; EPILOG: br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa
; CHECK: for.body.prol:
; CHECK: for.body:
; PROLOG: %xtraiter = and i32 %n
; PROLOG: %lcmp.mod = icmp ne i32 %xtraiter, 0
; PROLOG: br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
; EPILOG: for.body:
; EPILOG: for.body.epil:
; PROLOG: for.body.prol:
; PROLOG: for.body:
define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
entry:

View File

@ -1,4 +1,5 @@
; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s
; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
define void @unroll_opt_for_size() nounwind optsize {
entry:
br label %loop
@ -13,11 +14,17 @@ exit:
ret void
}
; CHECK-LABEL: @unroll_opt_for_size
; CHECK: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK: icmp
; EPILOG-LABEL: @unroll_opt_for_size
; EPILOG: add
; EPILOG-NEXT: add
; EPILOG-NEXT: add
; EPILOG: icmp
; PROLOG-LABEL: @unroll_opt_for_size
; PROLOG: add
; PROLOG-NEXT: add
; PROLOG-NEXT: add
; PROLOG: icmp
define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
entry:
@ -40,8 +47,13 @@ for.end: ; preds = %for.body, %entry
ret i32 %sum.0.lcssa
}
; CHECK-LABEL: @test
; CHECK: for.body.prol{{.*}}:
; CHECK: for.body:
; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
; EPILOG-LABEL: @test
; EPILOG: for.body:
; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit{{.*}}, label %for.body
; EPILOG: for.body.epil{{.*}}:
; PROLOG-LABEL: @test
; PROLOG: for.body.prol{{.*}}:
; PROLOG: for.body:
; PROLOG: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body

View File

@ -14,9 +14,9 @@ for.body: ; preds = %for.body, %entry
exit: ; preds = %for.body
%ret = phi x86_mmx [ undef, %for.body ]
; CHECK: %[[ret_unr:.*]] = phi x86_mmx [ undef,
; CHECK: %[[ret_ph:.*]] = phi x86_mmx [ undef,
; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_unr]], {{.*}} ], [ %[[ret_ph]]
; CHECK: %[[ret_ph:.*]] = phi x86_mmx [ undef, %entry
; CHECK: %[[ret_ph1:.*]] = phi x86_mmx [ undef,
; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_ph]], {{.*}} ], [ %[[ret_ph1]],
; CHECK: ret x86_mmx %[[ret]]
ret x86_mmx %ret
}

View File

@ -34,7 +34,7 @@ define i32 @test2(i64* %loc, i64 %conv7) {
; CHECK: udiv
; CHECK: udiv
; CHECK-NOT: udiv
; CHECK-LABEL: for.body.prol
; CHECK-LABEL: for.body
entry:
%rem0 = load i64, i64* %loc, align 8
%ExpensiveComputation = udiv i64 %rem0, 42 ; <<< Extra computations are added to the trip-count expression

View File

@ -1,18 +1,30 @@
; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s
; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Tests for unrolling loops with run-time trip counts
; CHECK: %xtraiter = and i32 %n
; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0
; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
; EPILOG: %xtraiter = and i32 %n
; EPILOG: %lcmp.mod = icmp ne i32 %xtraiter, %n
; EPILOG: br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa
; PROLOG: %xtraiter = and i32 %n
; PROLOG: %lcmp.mod = icmp ne i32 %xtraiter, 0
; PROLOG: br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
; EPILOG: for.body.epil:
; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.unr, %for.body.epil.preheader ]
; EPILOG: %epil.iter.sub = sub i32 %epil.iter, 1
; EPILOG: %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0
; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0
; PROLOG: for.body.prol:
; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
; PROLOG: %prol.iter.sub = sub i32 %prol.iter, 1
; PROLOG: %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit, !llvm.loop !0
; CHECK: for.body.prol:
; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]
; CHECK: %prol.iter.sub = sub i32 %prol.iter, 1
; CHECK: %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !0
define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
entry:
@ -39,8 +51,11 @@ for.end: ; preds = %for.body, %entry
; Still try to completely unroll loops with compile-time trip counts
; even if the -unroll-runtime is specified
; CHECK: for.body:
; CHECK-NOT: for.body.prol:
; EPILOG: for.body:
; EPILOG-NOT: for.body.epil:
; PROLOG: for.body:
; PROLOG-NOT: for.body.prol:
define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
entry:
@ -64,7 +79,8 @@ for.end: ; preds = %for.body
; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now
; if the -unroll-runtime option is turned on
; CHECK: bb72.2:
; EPILOG: bb72.2:
; PROLOG: bb72.2:
define void @foo(i32 %trips) {
entry:
@ -86,8 +102,11 @@ cond_true138:
; Test run-time unrolling for a loop that counts down by -2.
; CHECK: for.body.prol:
; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
; EPILOG: for.body.epil:
; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa
; PROLOG: for.body.prol:
; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit
define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
entry:
@ -116,8 +135,11 @@ for.end: ; preds = %for.cond.for.end_cr
}
; Test run-time unrolling disable metadata.
; CHECK: for.body:
; CHECK-NOT: for.body.prol:
; EPILOG: for.body:
; EPILOG-NOT: for.body.epil:
; PROLOG: for.body:
; PROLOG-NOT: for.body.prol:
define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
entry:
@ -148,6 +170,8 @@ for.end: ; preds = %for.cond.for.end_cr
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: !0 = distinct !{!0, !1}
; CHECK: !1 = !{!"llvm.loop.unroll.disable"}
; EPILOG: !0 = distinct !{!0, !1}
; EPILOG: !1 = !{!"llvm.loop.unroll.disable"}
; PROLOG: !0 = distinct !{!0, !1}
; PROLOG: !1 = !{!"llvm.loop.unroll.disable"}

View File

@ -1,19 +1,35 @@
; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s
; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
; This tests that setting the unroll count works
; CHECK: for.body.preheader:
; CHECK: br {{.*}} label %for.body.prol, label %for.body.preheader.split, !dbg [[PH_LOC:![0-9]+]]
; CHECK: for.body.prol:
; CHECK: br label %for.body.preheader.split, !dbg [[BODY_LOC:![0-9]+]]
; CHECK: for.body.preheader.split:
; CHECK: br {{.*}} label %for.end.loopexit, label %for.body.preheader.split.split, !dbg [[PH_LOC]]
; CHECK: for.body:
; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]]
; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
; CHECK-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
; CHECK-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
; EPILOG: for.body.preheader:
; EPILOG: br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa, !dbg [[PH_LOC:![0-9]+]]
; EPILOG: for.body:
; EPILOG: br i1 %niter.ncmp.1, label %for.end.loopexit.unr-lcssa.loopexit, label %for.body, !dbg [[BODY_LOC:![0-9]+]]
; EPILOG-NOT: br i1 %niter.ncmp.2, label %for.end.loopexit{{.*}}, label %for.body
; EPILOG: for.body.epil.preheader:
; EPILOG: br label %for.body.epil, !dbg [[EXIT_LOC:![0-9]+]]
; EPILOG: for.body.epil:
; EPILOG: br label %for.end.loopexit.epilog-lcssa, !dbg [[BODY_LOC:![0-9]+]]
; EPILOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
; EPILOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
; EPILOG-DAG: [[EXIT_LOC]] = !DILocation(line: 103, column: 1, scope: !{{.*}})
; PROLOG: for.body.preheader:
; PROLOG: br {{.*}} label %for.body.prol.preheader, label %for.body.prol.loopexit, !dbg [[PH_LOC:![0-9]+]]
; PROLOG: for.body.prol:
; PROLOG: br label %for.body.prol.loopexit, !dbg [[BODY_LOC:![0-9]+]]
; PROLOG: for.body.prol.loopexit:
; PROLOG: br {{.*}} label %for.end.loopexit, label %for.body.preheader.new, !dbg [[PH_LOC]]
; PROLOG: for.body:
; PROLOG: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]]
; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
; PROLOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
; PROLOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly !dbg !6 {
entry:

View File

@ -1,12 +1,18 @@
; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s
; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
; Choose a smaller, power-of-two, unroll count if the loop is too large.
; This test makes sure we're not unrolling 'odd' counts
; CHECK: for.body.prol:
; CHECK: for.body:
; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
; EPILOG: for.body:
; EPILOG: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body
; EPILOG-NOT: br i1 %niter.ncmp.4, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body
; EPILOG: for.body.epil:
; PROLOG: for.body.prol:
; PROLOG: for.body:
; PROLOG: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
entry:

View File

@ -1,13 +1,21 @@
; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s
; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -O2 -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
; Check that the runtime unrolling prologue (or epilogue) can be promoted by the LICM pass.
; CHECK: entry:
; CHECK: %xtraiter
; CHECK: %lcmp.mod
; CHECK: loop1:
; CHECK: br i1 %lcmp.mod
; CHECK: loop2.prol:
; EPILOG: entry:
; EPILOG: %xtraiter
; EPILOG: %lcmp.mod
; EPILOG: loop1:
; EPILOG: br i1 %lcmp.mod
; EPILOG: loop2.epil:
; PROLOG: entry:
; PROLOG: %xtraiter
; PROLOG: %lcmp.mod
; PROLOG: loop1:
; PROLOG: br i1 %lcmp.mod
; PROLOG: loop2.prol:
define void @unroll(i32 %iter, i32* %addr1, i32* %addr2) nounwind {
entry:

View File

@ -11,9 +11,6 @@ entry:
%cmp1 = icmp eq i3 %n, 0
br i1 %cmp1, label %for.end, label %for.body
; UNROLL-16-NOT: for.body.prol:
; UNROLL-4: for.body.prol:
for.body: ; preds = %for.body, %entry
; UNROLL-16-LABEL: for.body:
; UNROLL-4-LABEL: for.body:
@ -39,6 +36,10 @@ for.body: ; preds = %for.body, %entry
; UNROLL-16-LABEL: for.end
; UNROLL-4-LABEL: for.end
; UNROLL-16-NOT: for.body.epil:
; UNROLL-4: for.body.epil:
for.end: ; preds = %for.body, %entry
%sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ]
ret i3 %sum.0.lcssa

View File

@ -13,13 +13,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: entry:
; CHECK-NEXT: %0 = add i32 %N, 1
; CHECK-NEXT: %xtraiter = and i32 %0, 1
; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split
; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, %0
; CHECK-NEXT: br i1 %lcmp.mod, label %entry.new, label %while.end.unr-lcssa
; CHECK: while.body.prol:
; CHECK: br label %entry.split
; CHECK: while.body.epil:
; CHECK: br label %while.end.epilog-lcssa
; CHECK: entry.split:
; CHECK: while.end.epilog-lcssa:
; Function Attrs: nounwind readnone ssp uwtable
define i32 @foo(i32 %N) {

View File

@ -4,14 +4,14 @@
; RUN: opt < %s -O2 -S | FileCheck %s
; After loop unroll:
; %dec18 = add nsw i32 %dec18.in, -1
; %niter.nsub = add nsw i32 %niter, -1
; ...
; %dec18.1 = add nsw i32 %dec18, -1
; %niter.nsub.1 = add nsw i32 %niter.nsub, -1
; should be merged to:
; %dec18.1 = add nsw i32 %dec18.in, -2
; %niter.nsub.1 = add i32 %niter, -2
;
; CHECK-LABEL: @_Z3fn1v(
; CHECK: %dec18.1 = add nsw i32 %dec18.in, -2
; CHECK: %niter.nsub.1 = add i32 %niter, -2
; ModuleID = '<stdin>'
target triple = "x86_64-unknown-linux-gnu"

View File

@ -171,10 +171,6 @@ for.end: ; preds = %for.body, %entry
; should be duplicated (original and 4x unrolled).
;
; CHECK-LABEL: @runtime_loop_with_count4(
; CHECK: for.body.prol:
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
; CHECK: for.body
; CHECK: store
; CHECK: store
@ -182,6 +178,10 @@ for.end: ; preds = %for.body, %entry
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
; CHECK: for.body.epil:
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) {
entry:
%cmp3 = icmp sgt i32 %b, 0
@ -287,10 +287,6 @@ for.end: ; preds = %for.body
; (original and 8x).
;
; CHECK-LABEL: @runtime_loop_with_enable(
; CHECK: for.body.prol:
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
; CHECK: for.body:
; CHECK: store i32
; CHECK: store i32
@ -302,6 +298,10 @@ for.end: ; preds = %for.body
; CHECK: store i32
; CHECK-NOT: store i32
; CHECK: br i1
; CHECK: for.body.epil:
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) {
entry:
%cmp3 = icmp sgt i32 %b, 0
@ -328,16 +328,16 @@ for.end: ; preds = %for.body, %entry
; should be duplicated (original and 3x unrolled).
;
; CHECK-LABEL: @runtime_loop_with_count3(
; CHECK: for.body.prol:
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
; CHECK: for.body
; CHECK: store
; CHECK: store
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
; CHECK: for.body.epil:
; CHECK: store
; CHECK-NOT: store
; CHECK: br i1
define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) {
entry:
%cmp3 = icmp sgt i32 %b, 0