Adds the ability to use an epilog remainder loop during loop unrolling and makes this the default behavior.

Patch by Evgeny Stupachenko (evstupac@gmail.com). Differential Revision: http://reviews.llvm.org/D18158 llvm-svn: 265388
2016-04-05 12:19:35 +00:00 · 2016-04-05 12:19:35 +00:00 · 188de5ae69
parent 849045f2aa
commit 188de5ae69
15 changed files with 501 additions and 167 deletions
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@ -34,10 +34,11 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime,
                LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                AssumptionCache *AC, bool PreserveLCSSA);

-bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
-                             bool AllowExpensiveTripCount, LoopInfo *LI,
-                             ScalarEvolution *SE, DominatorTree *DT,
-                             bool PreserveLCSSA);
+bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
+                                bool AllowExpensiveTripCount,
+                                bool UseEpilogRemainder, LoopInfo *LI,
+                                ScalarEvolution *SE, DominatorTree *DT,
+                                bool PreserveLCSSA);

 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
 }
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@ -44,6 +44,11 @@ using namespace llvm;
 STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
 STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");

+static cl::opt<bool>
+UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden,
+                    cl::desc("Allow runtime unrolled loops to be unrolled "
+                             "with epilog instead of prolog."));
+
 /// Convert the instruction operands from referencing the current values into
 /// those specified by VMap.
 static inline void remapInstruction(Instruction *I,
@ -288,12 +293,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
               "convergent "
               "operation.");
      });
-  // Don't output the runtime loop prolog if Count is a multiple of
-  // TripMultiple.  Such a prolog is never needed, and is unsafe if the loop
+  // Don't output the runtime loop remainder if Count is a multiple of
+  // TripMultiple.  Such a remainder is never needed, and is unsafe if the loop
  // contains a convergent instruction.
  if (RuntimeTripCount && TripMultiple % Count != 0 &&
-      !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT,
-                               PreserveLCSSA))
+      !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
+                                  UnrollRuntimeEpilog, LI, SE, DT, 
+                                  PreserveLCSSA))
    return false;

  // Notify ScalarEvolution that the loop will be substantially changed,
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@ -16,8 +16,8 @@
 // case, we need to generate code to execute these 'left over' iterations.
 //
 // The current strategy generates an if-then-else sequence prior to the
-// unrolled loop to execute the 'left over' iterations.  Other strategies
-// include generate a loop before or after the unrolled loop.
+// unrolled loop to execute the 'left over' iterations before or after the
+// unrolled loop.
 //
 //===----------------------------------------------------------------------===//

@ -60,33 +60,35 @@ STATISTIC(NumRuntimeUnrolled,
 ///   than the unroll factor.
 ///
 static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
-                          BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
-                          BasicBlock *OrigPH, BasicBlock *NewPH,
-                          ValueToValueMapTy &VMap, DominatorTree *DT,
-                          LoopInfo *LI, bool PreserveLCSSA) {
+                          BasicBlock *PrologExit, BasicBlock *PreHeader,
+                          BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
+                          DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
  BasicBlock *Latch = L->getLoopLatch();
  assert(Latch && "Loop must have a latch");
+  BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);

  // Create a PHI node for each outgoing value from the original loop
  // (which means it is an outgoing value from the prolog code too).
  // The new PHI node is inserted in the prolog end basic block.
-  // The new PHI name is added as an operand of a PHI node in either
+  // The new PHI node value is added as an operand of a PHI node in either
  // the loop header or the loop exit block.
-  for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
-       SBI != SBE; ++SBI) {
-    for (BasicBlock::iterator BBI = (*SBI)->begin();
-         PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
-
+  for (BasicBlock *Succ : successors(Latch)) {
+    for (Instruction &BBI : *Succ) {
+      PHINode *PN = dyn_cast<PHINode>(&BBI);
+      // Exit when we passed all PHI nodes.
+      if (!PN)
+        break;
      // Add a new PHI node to the prolog end block and add the
      // appropriate incoming values.
-      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
-                                       PrologEnd->getTerminator());
+      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+                                       PrologExit->getFirstNonPHI());
      // Adding a value to the new PHI node from the original loop preheader.
      // This is the value that skips all the prolog code.
      if (L->contains(PN)) {
-        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
+        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
+                           PreHeader);
      } else {
-        NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH);
+        NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
      }

      Value *V = PN->getIncomingValueForBlock(Latch);
@ -97,22 +99,22 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
      }
      // Adding a value to the new PHI node from the last prolog block
      // that was created.
-      NewPN->addIncoming(V, LastPrologBB);
+      NewPN->addIncoming(V, PrologLatch);

      // Update the existing PHI node operand with the value from the
      // new PHI node.  How this is done depends on if the existing
      // PHI node is in the original loop block, or the exit block.
      if (L->contains(PN)) {
-        PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
+        PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
      } else {
-        PN->addIncoming(NewPN, PrologEnd);
+        PN->addIncoming(NewPN, PrologExit);
      }
    }
  }

  // Create a branch around the original loop, which is taken if there are no
  // iterations remaining to be executed after running the prologue.
-  Instruction *InsertPt = PrologEnd->getTerminator();
+  Instruction *InsertPt = PrologExit->getTerminator();
  IRBuilder<> B(InsertPt);

  assert(Count != 0 && "nonsensical Count!");
@ -126,25 +128,152 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
  BasicBlock *Exit = L->getUniqueExitBlock();
  assert(Exit && "Loop must have a single exit block only");
  // Split the exit to maintain loop canonicalization guarantees
-  SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
+  SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
  SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI,
                         PreserveLCSSA);
  // Add the branch to the exit block (around the unrolled loop)
-  B.CreateCondBr(BrLoopExit, Exit, NewPH);
+  B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
+  InsertPt->eraseFromParent();
+}
+
+/// Connect the unrolling epilog code to the original loop.
+/// The unrolling epilog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit to combine
+///   values that exit the unrolling loop code and jump around it.
+/// - Update PHI operands in the epilog loop with the new PHI nodes
+/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+///
+static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
+                          BasicBlock *Exit, BasicBlock *PreHeader,
+                          BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
+                          ValueToValueMapTy &VMap, DominatorTree *DT,
+                          LoopInfo *LI, bool PreserveLCSSA)  {
+  BasicBlock *Latch = L->getLoopLatch();
+  assert(Latch && "Loop must have a latch");
+  BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
+
+  // Loop structure should be the following:
+  //
+  // PreHeader
+  // NewPreHeader
+  //   Header
+  //   ...
+  //   Latch
+  // NewExit (PN)
+  // EpilogPreHeader
+  //   EpilogHeader
+  //   ...
+  //   EpilogLatch
+  // Exit (EpilogPN)
+
+  // Update PHI nodes at NewExit and Exit.
+  for (Instruction &BBI : *NewExit) {
+    PHINode *PN = dyn_cast<PHINode>(&BBI);
+    // Exit when we passed all PHI nodes.
+    if (!PN)
+      break;
+    // PN should be used in another PHI located in Exit block as
+    // Exit was split by SplitBlockPredecessors into Exit and NewExit
+    // Basically it should look like:
+    // NewExit:
+    //   PN = PHI [I, Latch]
+    // ...
+    // Exit:
+    //   EpilogPN = PHI [PN, EpilogPreHeader]
+    //
+    // There is EpilogPreHeader incoming block instead of NewExit as
+    // NewExit was split one more time to get EpilogPreHeader.
+    assert(PN->hasOneUse() && "The phi should have 1 use");
+    PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
+    assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
+
+    // Add incoming PreHeader from branch around the Loop
+    PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+
+    Value *V = PN->getIncomingValueForBlock(Latch);
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (I && L->contains(I))
+      // If value comes from an instruction in the loop add VMap value.
+      V = VMap[I];
+    // For the instruction out of the loop, constant or undefined value
+    // insert value itself.
+    EpilogPN->addIncoming(V, EpilogLatch);
+
+    assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
+          "EpilogPN should have EpilogPreHeader incoming block");
+    // Change EpilogPreHeader incoming block to NewExit.
+    EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
+                               NewExit);
+    // Now PHIs should look like:
+    // NewExit:
+    //   PN = PHI [I, Latch], [undef, PreHeader]
+    // ...
+    // Exit:
+    //   EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
+  }
+
+  // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
+  // Update corresponding PHI nodes in epilog loop.
+  for (BasicBlock *Succ : successors(Latch)) {
+    // Skip this as we already updated phis in exit blocks.
+    if (!L->contains(Succ))
+      continue;
+    for (Instruction &BBI : *Succ) {
+      PHINode *PN = dyn_cast<PHINode>(&BBI);
+      // Exit when we passed all PHI nodes.
+      if (!PN)
+        break;
+      // Add new PHI nodes to the loop exit block and update epilog
+      // PHIs with the new PHI values.
+      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+                                       NewExit->getFirstNonPHI());
+      // Adding a value to the new PHI node from the unrolling loop preheader.
+      NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
+      // Adding a value to the new PHI node from the unrolling loop latch.
+      NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
+
+      // Update the existing PHI node operand with the value from the new PHI
+      // node.  Corresponding instruction in epilog loop should be PHI.
+      PHINode *VPN = cast<PHINode>(VMap[&BBI]);
+      VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
+    }
+  }
+
+  Instruction *InsertPt = NewExit->getTerminator();
+  IRBuilder<> B(InsertPt);
+  Value *BrLoopExit = B.CreateIsNotNull(ModVal);
+  assert(Exit && "Loop must have a single exit block only");
+  // Split the exit to maintain loop canonicalization guarantees
+  SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+  SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
+                         PreserveLCSSA);
+  // Enter the epilog if ModVal is non-zero, else branch to Exit around it.
+  B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
  InsertPt->eraseFromParent();
 }

 /// Create a clone of the blocks in a loop and connect them together.
-/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
-/// loop will be created including all cloned blocks, and the iterator of it
-/// switches to count NewIter down to 0.
+/// If CreateRemainderLoop is false, loop structure will not be cloned,
+/// otherwise a new loop will be created including all cloned blocks, and the
+/// iterator of it switches to count NewIter down to 0.
+/// The cloned blocks should be inserted between InsertTop and InsertBot.
+/// If loop structure is cloned InsertTop should be new preheader, InsertBot
+/// new loop exit.
 ///
-static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
+static void CloneLoopBlocks(Loop *L, Value *NewIter,
+                            const bool CreateRemainderLoop,
+                            const bool UseEpilogRemainder,
                            BasicBlock *InsertTop, BasicBlock *InsertBot,
+                            BasicBlock *Preheader,
                            std::vector<BasicBlock *> &NewBlocks,
                            LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
                            LoopInfo *LI) {
-  BasicBlock *Preheader = L->getLoopPreheader();
+  StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  Function *F = Header->getParent();
@ -152,7 +281,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
  LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  Loop *NewLoop = nullptr;
  Loop *ParentLoop = L->getParentLoop();
-  if (!UnrollProlog) {
+  if (CreateRemainderLoop) {
    NewLoop = new Loop();
    if (ParentLoop)
      ParentLoop->addChildLoop(NewLoop);
@ -163,7 +292,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
  // For each block in the original loop, create a new copy,
  // and update the value map with the newly created values.
  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
-    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
+    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
    NewBlocks.push_back(NewBB);

    if (NewLoop)
@ -179,16 +308,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
    }

    if (Latch == *BB) {
-      // For the last block, if UnrollProlog is true, create a direct jump to
-      // InsertBot. If not, create a loop back to cloned head.
+      // For the last block, if CreateRemainderLoop is false, create a direct
+      // jump to InsertBot. If not, create a loop back to cloned head.
      VMap.erase((*BB)->getTerminator());
      BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
      BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
      IRBuilder<> Builder(LatchBR);
-      if (UnrollProlog) {
+      if (!CreateRemainderLoop) {
        Builder.CreateBr(InsertBot);
      } else {
-        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
+        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+                                          suffix + ".iter",
                                          FirstLoopBB->getFirstNonPHI());
        Value *IdxSub =
            Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
@ -207,9 +337,15 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
  // cloned loop.
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
-    if (UnrollProlog) {
-      VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
-      cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+    if (!CreateRemainderLoop) {
+      if (UseEpilogRemainder) {
+        unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+        NewPHI->setIncomingBlock(idx, InsertTop);
+        NewPHI->removeIncomingValue(Latch, false);
+      } else {
+        VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
+        cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+      }
    } else {
      unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
      NewPHI->setIncomingBlock(idx, InsertTop);
@ -254,7 +390,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
  }
 }

-/// Insert code in the prolog code when unrolling a loop with a
+/// Insert code in the prolog/epilog code when unrolling a loop with a
 /// run-time trip-count.
 ///
 /// This method assumes that the loop unroll factor is total number
@ -266,6 +402,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
 /// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
 /// the switch instruction is generated.
 ///
+/// ***Prolog case***
 ///        extraiters = tripcount % loopfactor
 ///        if (extraiters == 0) jump Loop:
 ///        else jump Prol
@ -277,17 +414,35 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
 /// ...
 /// End:
 ///
-bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
-                                   bool AllowExpensiveTripCount, LoopInfo *LI,
-                                   ScalarEvolution *SE, DominatorTree *DT,
-                                   bool PreserveLCSSA) {
-  // For now, only unroll loops that contain a single exit.
+/// ***Epilog case***
+///        extraiters = tripcount % loopfactor
+///        if (extraiters == tripcount) jump LoopExit:
+///        unroll_iter = tripcount - extraiters
+/// Loop:  LoopBody; (executes unroll_iter times);
+///        unroll_iter -= 1
+///        if (unroll_iter != 0) jump Loop:
+/// LoopExit:
+///        if (extraiters == 0) jump EpilExit:
+/// Epil:  LoopBody; (executes extraiters times)
+///        extraiters -= 1                 // Omitted if unroll factor is 2.
+///        if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
+/// EpilExit:
+
+bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
+                                      bool AllowExpensiveTripCount,
+                                      bool UseEpilogRemainder,
+                                      LoopInfo *LI, ScalarEvolution *SE,
+                                      DominatorTree *DT, bool PreserveLCSSA) {
+  // For now, only unroll loops that contain a single exit.
  if (!L->getExitingBlock())
    return false;

  // Make sure the loop is in canonical form, and there is a single
  // exit block only.
-  if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
+  if (!L->isLoopSimplifyForm())
+    return false;
+  BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
+  if (!Exit)
    return false;

  // Use Scalar Evolution to compute the trip count. This allows more loops to
@ -311,8 +466,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
    return false;

  BasicBlock *Header = L->getHeader();
-  BasicBlock *PH = L->getLoopPreheader();
-  BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
+  BasicBlock *PreHeader = L->getLoopPreheader();
+  BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  const DataLayout &DL = Header->getModule()->getDataLayout();
  SCEVExpander Expander(*SE, DL, "loop-unroll");
  if (!AllowExpensiveTripCount &&
@ -330,26 +485,75 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
    SE->forgetLoop(ParentLoop);

  BasicBlock *Latch = L->getLoopLatch();
-  // It helps to split the original preheader twice, one for the end of the
-  // prolog code and one for a new loop preheader.
-  BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
-  BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
-  PreHeaderBR = cast<BranchInst>(PH->getTerminator());

+  // Loop structure is the following:
+  //
+  // PreHeader
+  //   Header
+  //   ...
+  //   Latch
+  // Exit
+
+  BasicBlock *NewPreHeader;
+  BasicBlock *NewExit = nullptr;
+  BasicBlock *PrologExit = nullptr;
+  BasicBlock *EpilogPreHeader = nullptr;
+  BasicBlock *PrologPreHeader = nullptr;
+
+  if (UseEpilogRemainder) {
+    // If epilog remainder
+    // Split PreHeader to insert a branch around loop for unrolling.
+    NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
+    NewPreHeader->setName(PreHeader->getName() + ".new");
+    // Split Exit to create phi nodes from branch above.
+    SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+    NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
+                                     DT, LI, PreserveLCSSA);
+    // Split NewExit to insert epilog remainder loop.
+    EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
+    EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+  } else {
+    // If prolog remainder
+    // Split the original preheader twice to insert prolog remainder loop
+    PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
+    PrologPreHeader->setName(Header->getName() + ".prol.preheader");
+    PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
+                            DT, LI);
+    PrologExit->setName(Header->getName() + ".prol.loopexit");
+    // Split PrologExit to get NewPreHeader.
+    NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
+    NewPreHeader->setName(PreHeader->getName() + ".new");
+  }
+  // Loop structure should be the following:
+  //  Epilog             Prolog
+  //
+  // PreHeader         PreHeader
+  // *NewPreHeader     *PrologPreHeader
+  //   Header          *PrologExit
+  //   ...             *NewPreHeader
+  //   Latch             Header
+  // *NewExit            ...
+  // *EpilogPreHeader    Latch
+  // Exit              Exit
+
+  // Calculate conditions for branch around loop for unrolling
+  // in epilog case and around prolog remainder loop in prolog case.
  // Compute the number of extra iterations required, which is:
-  //  extra iterations = run-time trip count % (loop unroll factor + 1)
+  //  extra iterations = run-time trip count % loop unroll factor
+  PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                                            PreHeaderBR);
  Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
                                          PreHeaderBR);
-
  IRBuilder<> B(PreHeaderBR);
  Value *ModVal;
  // Calculate ModVal = (BECount + 1) % Count.
  // Note that TripCount is BECount + 1.
  if (isPowerOf2_32(Count)) {
+    // When Count is a power of 2 we don't need BECount for the epilog case;
+    // we still need it for the branch around the loop in the prolog case.
    ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
-    //  1. There are no iterations to be run in the prologue loop.
+    //  1. There are no iterations to be run in the prolog/epilog loop.
    // OR
    //  2. The addition computing TripCount overflowed.
    //
@ -371,18 +575,18 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
                          ConstantInt::get(BECount->getType(), Count),
                          "xtraiter");
  }
-  Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
-
-  // Branch to either the extra iterations or the cloned/unrolled loop.
-  // We will fix up the true branch label when adding loop body copies.
-  B.CreateCondBr(BranchVal, PEnd, PEnd);
-  assert(PreHeaderBR->isUnconditional() &&
-         PreHeaderBR->getSuccessor(0) == PEnd &&
-         "CFG edges in Preheader are not correct");
+  Value *CmpOperand =
+      UseEpilogRemainder ? TripCount :
+                           ConstantInt::get(TripCount->getType(), 0);
+  Value *BranchVal = B.CreateICmpNE(ModVal, CmpOperand, "lcmp.mod");
+  BasicBlock *FirstLoop = UseEpilogRemainder ? NewPreHeader : PrologPreHeader;
+  BasicBlock *SecondLoop = UseEpilogRemainder ? NewExit : PrologExit;
+  // Branch to either remainder (extra iterations) loop or unrolling loop.
+  B.CreateCondBr(BranchVal, FirstLoop, SecondLoop);
  PreHeaderBR->eraseFromParent();
  Function *F = Header->getParent();
  // Get an ordered list of blocks in the loop to help with the ordering of the
-  // cloned blocks in the prolog code.
+  // cloned blocks in the prolog/epilog code
  LoopBlocksDFS LoopBlocks(L);
  LoopBlocks.perform(LI);

@ -394,17 +598,38 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
  std::vector<BasicBlock *> NewBlocks;
  ValueToValueMapTy VMap;

-  bool UnrollPrologue = Count == 2;
+  // For unroll factor 2 the remainder loop will have 1 iteration.
+  // Do not create a 1-iteration loop.
+  bool CreateRemainderLoop = (Count != 2);

  // Clone all the basic blocks in the loop. If Count is 2, we don't clone
  // the loop, otherwise we create a cloned loop to execute the extra
  // iterations. This function adds the appropriate CFG connections.
-  CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
-                  VMap, LI);
+  BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
+  BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
+  CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
+                  InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);

-  // Insert the cloned blocks into the function just before the original loop.
-  F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(),
-                                NewBlocks[0]->getIterator(), F->end());
+  // Insert the cloned blocks into the function.
+  F->getBasicBlockList().splice(InsertBot->getIterator(),
+                                F->getBasicBlockList(),
+                                NewBlocks[0]->getIterator(),
+                                F->end());
+
+  // Loop structure should be the following:
+  //  Epilog             Prolog
+  //
+  // PreHeader         PreHeader
+  // NewPreHeader      PrologPreHeader
+  //   Header            PrologHeader
+  //   ...               ...
+  //   Latch             PrologLatch
+  // NewExit           PrologExit
+  // EpilogPreHeader   NewPreHeader
+  //   EpilogHeader      Header
+  //   ...               ...
+  //   EpilogLatch       Latch
+  // Exit              Exit

  // Rewrite the cloned instruction operands to use the values created when the
  // clone is created.
@ -415,11 +640,38 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
    }
  }

-  // Connect the prolog code to the original loop and update the
-  // PHI functions.
-  BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
-  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI,
-                PreserveLCSSA);
+  if (UseEpilogRemainder) {
+    // Connect the epilog code to the original loop and update the
+    // PHI functions.
+    ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
+                  EpilogPreHeader, NewPreHeader, VMap, DT, LI,
+                  PreserveLCSSA);
+
+    // Update counter in loop for unrolling.
+    // It should be a multiple of Count.
+    IRBuilder<> B2(NewPreHeader->getTerminator());
+    Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
+    BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+    B2.SetInsertPoint(LatchBR);
+    PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
+                                      Header->getFirstNonPHI());
+    Value *IdxSub =
+        B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+                     NewIdx->getName() + ".nsub");
+    Value *IdxCmp;
+    if (LatchBR->getSuccessor(0) == Header)
+      IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
+    else
+      IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
+    NewIdx->addIncoming(TestVal, NewPreHeader);
+    NewIdx->addIncoming(IdxSub, Latch);
+    LatchBR->setCondition(IdxCmp);
+  } else {
+    // Connect the prolog code to the original loop and update the
+    // PHI functions.
+    ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
+                  VMap, DT, LI, PreserveLCSSA);
+  }
  NumRuntimeUnrolled++;
  return true;
 }
--- a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
@ -1,13 +1,21 @@
-; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s -check-prefix=EPILOG
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG

 ; Tests for unrolling loops with run-time trip counts

-; CHECK:  %xtraiter = and i32 %n
-; CHECK:  %lcmp.mod = icmp ne i32 %xtraiter, 0
-; CHECK:  br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
+; EPILOG:  %xtraiter = and i32 %n
+; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, %n
+; EPILOG:  br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa

-; CHECK:  for.body.prol:
-; CHECK:  for.body:
+; PROLOG:  %xtraiter = and i32 %n
+; PROLOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
+; PROLOG:  br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
+
+; EPILOG:  for.body:
+; EPILOG:  for.body.epil:
+
+; PROLOG:  for.body.prol:
+; PROLOG:  for.body:

 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
--- a/llvm/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
@ -1,4 +1,5 @@
-; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s
+; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s -check-prefix=EPILOG
+; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
 define void @unroll_opt_for_size() nounwind optsize {
 entry:
  br label %loop
@ -13,11 +14,17 @@ exit:
  ret void
 }

-; CHECK-LABEL: @unroll_opt_for_size
-; CHECK:      add
-; CHECK-NEXT: add
-; CHECK-NEXT: add
-; CHECK: icmp
+; EPILOG-LABEL: @unroll_opt_for_size
+; EPILOG:      add
+; EPILOG-NEXT: add
+; EPILOG-NEXT: add
+; EPILOG: icmp
+
+; PROLOG-LABEL: @unroll_opt_for_size
+; PROLOG:      add
+; PROLOG-NEXT: add
+; PROLOG-NEXT: add
+; PROLOG: icmp

 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
@ -40,8 +47,13 @@ for.end:                                          ; preds = %for.body, %entry
  ret i32 %sum.0.lcssa
 }

-; CHECK-LABEL: @test
-; CHECK: for.body.prol{{.*}}:
-; CHECK: for.body:
-; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
+; EPILOG-LABEL: @test
+; EPILOG: for.body:
+; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit{{.*}}, label %for.body
+; EPILOG: for.body.epil{{.*}}:
+
+; PROLOG-LABEL: @test
+; PROLOG: for.body.prol{{.*}}:
+; PROLOG: for.body:
+; PROLOG: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body

--- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
+++ b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
@ -14,9 +14,9 @@ for.body:                                         ; preds = %for.body, %entry

 exit:                                             ; preds = %for.body
  %ret = phi x86_mmx [ undef, %for.body ]
-  ; CHECK: %[[ret_unr:.*]] = phi x86_mmx [ undef,
-  ; CHECK: %[[ret_ph:.*]]  = phi x86_mmx [ undef,
-  ; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_unr]], {{.*}} ], [ %[[ret_ph]]
+  ; CHECK: %[[ret_ph:.*]] = phi x86_mmx [ undef, %entry
+  ; CHECK: %[[ret_ph1:.*]]  = phi x86_mmx [ undef,
+  ; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_ph]], {{.*}} ], [ %[[ret_ph1]],
  ; CHECK: ret x86_mmx %[[ret]]
  ret x86_mmx %ret
 }
--- a/llvm/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll
+++ b/llvm/test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll
@ -34,7 +34,7 @@ define i32 @test2(i64* %loc, i64 %conv7) {
 ; CHECK: udiv
 ; CHECK: udiv
 ; CHECK-NOT: udiv
-; CHECK-LABEL: for.body.prol
+; CHECK-LABEL: for.body
 entry:
  %rem0 = load i64, i64* %loc, align 8
  %ExpensiveComputation = udiv i64 %rem0, 42 ; <<< Extra computations are added to the trip-count expression
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop.ll
@ -1,18 +1,30 @@
-; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

 ; Tests for unrolling loops with run-time trip counts

-; CHECK: %xtraiter = and i32 %n
-; CHECK:  %lcmp.mod = icmp ne i32 %xtraiter, 0
-; CHECK:  br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split
+; EPILOG: %xtraiter = and i32 %n
+; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, %n
+; EPILOG:  br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa
+
+; PROLOG: %xtraiter = and i32 %n
+; PROLOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
+; PROLOG:  br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
+
+; EPILOG: for.body.epil:
+; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ],  [ %indvars.iv.unr, %for.body.epil.preheader ]
+; EPILOG:  %epil.iter.sub = sub i32 %epil.iter, 1
+; EPILOG:  %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0
+; EPILOG:  br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0
+
+; PROLOG: for.body.prol:
+; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+; PROLOG:  %prol.iter.sub = sub i32 %prol.iter, 1
+; PROLOG:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
+; PROLOG:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit, !llvm.loop !0

-; CHECK: for.body.prol:
-; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]
-; CHECK:  %prol.iter.sub = sub i32 %prol.iter, 1
-; CHECK:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
-; CHECK:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !0

 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
@ -39,8 +51,11 @@ for.end:                                          ; preds = %for.body, %entry
 ; Still try to completely unroll loops with compile-time trip counts
 ; even if the -unroll-runtime is specified

-; CHECK: for.body:
-; CHECK-NOT: for.body.prol:
+; EPILOG: for.body:
+; EPILOG-NOT: for.body.epil:
+
+; PROLOG: for.body:
+; PROLOG-NOT: for.body.prol:

 define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
 entry:
@ -64,7 +79,8 @@ for.end:                                          ; preds = %for.body
 ; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now
 ; if the -unroll-runtime option is turned on

-; CHECK: bb72.2:
+; EPILOG: bb72.2:
+; PROLOG: bb72.2:

 define void @foo(i32 %trips) {
 entry:
@ -86,8 +102,11 @@ cond_true138:

 ; Test run-time unrolling for a loop that counts down by -2.

-; CHECK: for.body.prol:
-; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
+; EPILOG: for.body.epil:
+; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa
+
+; PROLOG: for.body.prol:
+; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit

 define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
@ -116,8 +135,11 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 }

 ; Test run-time unrolling disable metadata.
-; CHECK: for.body:
-; CHECK-NOT: for.body.prol:
+; EPILOG: for.body:
+; EPILOG-NOT: for.body.epil:
+
+; PROLOG: for.body:
+; PROLOG-NOT: for.body.prol:

 define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
@ -148,6 +170,8 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.unroll.runtime.disable"}

-; CHECK: !0 = distinct !{!0, !1}
-; CHECK: !1 = !{!"llvm.loop.unroll.disable"}
+; EPILOG: !0 = distinct !{!0, !1}
+; EPILOG: !1 = !{!"llvm.loop.unroll.disable"}

+; PROLOG: !0 = distinct !{!0, !1}
+; PROLOG: !1 = !{!"llvm.loop.unroll.disable"}
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop1.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop1.ll
@ -1,19 +1,35 @@
-; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s -check-prefix=EPILOG
+; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG

 ; This tests that setting the unroll count works

-; CHECK: for.body.preheader:
-; CHECK:   br {{.*}} label %for.body.prol, label %for.body.preheader.split, !dbg [[PH_LOC:![0-9]+]]
-; CHECK: for.body.prol:
-; CHECK:   br label %for.body.preheader.split, !dbg [[BODY_LOC:![0-9]+]]
-; CHECK: for.body.preheader.split:
-; CHECK:   br {{.*}} label %for.end.loopexit, label %for.body.preheader.split.split, !dbg [[PH_LOC]]
-; CHECK: for.body:
-; CHECK:   br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]]
-; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body

-; CHECK-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
-; CHECK-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
+; EPILOG: for.body.preheader:
+; EPILOG:   br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa, !dbg [[PH_LOC:![0-9]+]]
+; EPILOG: for.body:
+; EPILOG:   br i1 %niter.ncmp.1, label %for.end.loopexit.unr-lcssa.loopexit, label %for.body, !dbg [[BODY_LOC:![0-9]+]]
+; EPILOG-NOT: br i1 %niter.ncmp.2, label %for.end.loopexit{{.*}}, label %for.body
+; EPILOG: for.body.epil.preheader:
+; EPILOG:   br label %for.body.epil, !dbg [[EXIT_LOC:![0-9]+]]
+; EPILOG: for.body.epil:
+; EPILOG:   br label %for.end.loopexit.epilog-lcssa, !dbg [[BODY_LOC:![0-9]+]]
+
+; EPILOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
+; EPILOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})
+; EPILOG-DAG: [[EXIT_LOC]] = !DILocation(line: 103, column: 1, scope: !{{.*}})
+
+; PROLOG: for.body.preheader:
+; PROLOG:   br {{.*}} label %for.body.prol.preheader, label %for.body.prol.loopexit, !dbg [[PH_LOC:![0-9]+]]
+; PROLOG: for.body.prol:
+; PROLOG:   br label %for.body.prol.loopexit, !dbg [[BODY_LOC:![0-9]+]]
+; PROLOG: for.body.prol.loopexit:
+; PROLOG:   br {{.*}} label %for.end.loopexit, label %for.body.preheader.new, !dbg [[PH_LOC]]
+; PROLOG: for.body:
+; PROLOG:   br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]]
+; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
+
+; PROLOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}})
+; PROLOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}})

 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly !dbg !6 {
 entry:
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop2.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop2.ll
@ -1,12 +1,18 @@
-; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s  -check-prefix=EPILOG
+; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG

 ; Choose a smaller, power-of-two, unroll count if the loop is too large.
 ; This test makes sure we're not unrolling 'odd' counts

-; CHECK: for.body.prol:
-; CHECK: for.body:
-; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
-; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
+; EPILOG: for.body:
+; EPILOG: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body
+; EPILOG-NOT: br i1 %niter.ncmp.4, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body
+; EPILOG: for.body.epil:
+
+; PROLOG: for.body.prol:
+; PROLOG: for.body:
+; PROLOG: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
+; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body

 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop4.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop4.ll
@ -1,13 +1,21 @@
-; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s
+; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG
+; RUN: opt < %s -S -O2 -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG

 ; Check runtime unrolling prologue can be promoted by LICM pass.

-; CHECK: entry:
-; CHECK: %xtraiter
-; CHECK: %lcmp.mod
-; CHECK: loop1:
-; CHECK: br i1 %lcmp.mod
-; CHECK: loop2.prol:
+; EPILOG: entry:
+; EPILOG: %xtraiter
+; EPILOG: %lcmp.mod
+; EPILOG: loop1:
+; EPILOG: br i1 %lcmp.mod
+; EPILOG: loop2.epil:
+
+; PROLOG: entry:
+; PROLOG: %xtraiter
+; PROLOG: %lcmp.mod
+; PROLOG: loop1:
+; PROLOG: br i1 %lcmp.mod
+; PROLOG: loop2.prol:

 define void @unroll(i32 %iter, i32* %addr1, i32* %addr2) nounwind {
 entry:
--- a/llvm/test/Transforms/LoopUnroll/runtime-loop5.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-loop5.ll
@ -11,9 +11,6 @@ entry:
  %cmp1 = icmp eq i3 %n, 0
  br i1 %cmp1, label %for.end, label %for.body

-; UNROLL-16-NOT: for.body.prol:
-; UNROLL-4: for.body.prol:
-
 for.body:                                         ; preds = %for.body, %entry
 ; UNROLL-16-LABEL: for.body:
 ; UNROLL-4-LABEL: for.body:
@ -39,6 +36,10 @@ for.body:                                         ; preds = %for.body, %entry

 ; UNROLL-16-LABEL: for.end
 ; UNROLL-4-LABEL: for.end
+
+; UNROLL-16-NOT: for.body.epil:
+; UNROLL-4: for.body.epil:
+
 for.end:                                          ; preds = %for.body, %entry
  %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ]
  ret i3 %sum.0.lcssa
--- a/llvm/test/Transforms/LoopUnroll/tripcount-overflow.ll
+++ b/llvm/test/Transforms/LoopUnroll/tripcount-overflow.ll
@ -13,13 +13,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: entry:
 ; CHECK-NEXT: %0 = add i32 %N, 1
 ; CHECK-NEXT: %xtraiter = and i32 %0, 1
-; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0
-; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split
+; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, %0
+; CHECK-NEXT: br i1 %lcmp.mod, label %entry.new, label %while.end.unr-lcssa

-; CHECK: while.body.prol:
-; CHECK: br label %entry.split
+; CHECK: while.body.epil:
+; CHECK: br label %while.end.epilog-lcssa

-; CHECK: entry.split:
+; CHECK: while.end.epilog-lcssa:

 ; Function Attrs: nounwind readnone ssp uwtable
 define i32 @foo(i32 %N) {
--- a/llvm/test/Transforms/LoopUnroll/unroll-cleanup.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-cleanup.ll
@ -4,14 +4,14 @@
 ; RUN: opt < %s -O2 -S | FileCheck %s

 ; After loop unroll:
-;       %dec18 = add nsw i32 %dec18.in, -1
+;       %niter.nsub = add nsw i32 %niter, -1
 ;       ...
-;       %dec18.1 = add nsw i32 %dec18, -1
+;       %niter.nsub.1 = add nsw i32 %niter.nsub, -1
 ; should be merged to:
-;       %dec18.1 = add nsw i32 %dec18.in, -2
+;       %niter.nsub.1 = add i32 %niter, -2
 ;
 ; CHECK-LABEL: @_Z3fn1v(
-; CHECK: %dec18.1 = add nsw i32 %dec18.in, -2
+; CHECK: %niter.nsub.1 = add i32 %niter, -2

 ; ModuleID = '<stdin>'
 target triple = "x86_64-unknown-linux-gnu"
--- a/llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-pragmas.ll
@ -171,10 +171,6 @@ for.end:                                          ; preds = %for.body, %entry
 ; should be duplicated (original and 4x unrolled).
 ;
 ; CHECK-LABEL: @runtime_loop_with_count4(
-; CHECK: for.body.prol:
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: br i1
 ; CHECK: for.body
 ; CHECK: store
 ; CHECK: store
@ -182,6 +178,10 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK: store
 ; CHECK-NOT: store
 ; CHECK: br i1
+; CHECK: for.body.epil:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1
 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) {
 entry:
  %cmp3 = icmp sgt i32 %b, 0
@ -287,10 +287,6 @@ for.end:                                          ; preds = %for.body
 ; (original and 8x).
 ;
 ; CHECK-LABEL: @runtime_loop_with_enable(
-; CHECK: for.body.prol:
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: br i1
 ; CHECK: for.body:
 ; CHECK: store i32
 ; CHECK: store i32
@ -302,6 +298,10 @@ for.end:                                          ; preds = %for.body
 ; CHECK: store i32
 ; CHECK-NOT: store i32
 ; CHECK: br i1
+; CHECK: for.body.epil:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1
 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) {
 entry:
  %cmp3 = icmp sgt i32 %b, 0
@ -328,16 +328,16 @@ for.end:                                          ; preds = %for.body, %entry
 ; should be duplicated (original and 3x unrolled).
 ;
 ; CHECK-LABEL: @runtime_loop_with_count3(
-; CHECK: for.body.prol:
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: br i1
 ; CHECK: for.body
 ; CHECK: store
 ; CHECK: store
 ; CHECK: store
 ; CHECK-NOT: store
 ; CHECK: br i1
+; CHECK: for.body.epil:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1
 define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) {
 entry:
  %cmp3 = icmp sgt i32 %b, 0