From 4cbcbb0761325099ff63927ef8bf36e97dc43c7f Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Tue, 29 May 2018 20:15:38 +0000
Subject: [PATCH] [LoopInstSimplify] Re-implement the core logic of
 loop-instsimplify to be both simpler and substantially more efficient.

Rather than use a hand-rolled iteration technique that isn't quite the
same as RPO, use the pre-built RPO loop body traversal utility.

Once visiting the loop body in RPO, we can assert that we visit defs
before uses reliably. When this is the case, the only need to iterate is
when simplifying a def that is used by a PHI node along a back-edge.
With this patch, the first pass over the loop body is just a complete
simplification of every instruction across the loop body. When we
encounter a use of a simplified instruction that stems from a PHI node
in the loop body that has already been visited (due to some cyclic CFG,
potentially the loop itself, or a nested loop, or unstructured control
flow), we recall that specific PHI node for the second iteration.
Nothing else needs to be preserved from iteration to iteration.

On the second and later iterations, only instructions known to have
simplified inputs are considered, each time starting from a set of PHIs
that had simplified inputs along the backedges.

Dead instructions are collected along the way, but deleted in a batch at
the end of each iteration making the iterations themselves substantially
simpler. This uses a new batch API for recursively deleting dead
instructions.

This alsa changes the routine to visit subloops. Because simplification
is fundamentally transitive, we may need to visit the entire loop body,
including subloops, to handle knock-on simplification.

I've added a basic test file that helps demonstrate that all of these
changes work. It includes both straight-forward loops with
simplifications as well as interesting PHI-structures, CFG-structures,
and a nested loop case.

Differential Revision: https://reviews.llvm.org/D47407

llvm-svn: 333461
---
 llvm/include/llvm/Analysis/Utils/Local.h      |  12 +
 .../Transforms/Scalar/LoopInstSimplify.cpp    | 212 +++++++++---------
 llvm/lib/Transforms/Utils/Local.cpp           |  33 ++-
 .../test/Transforms/LoopInstSimplify/basic.ll | 164 ++++++++++++++
 4 files changed, 303 insertions(+), 118 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInstSimplify/basic.ll
diff --git a/llvm/include/llvm/Analysis/Utils/Local.h b/llvm/include/llvm/Analysis/Utils/Local.h
index 42a654d3ddd7..fb6444f72eb7 100644
--- a/llvm/include/llvm/Analysis/Utils/Local.h
+++ b/llvm/include/llvm/Analysis/Utils/Local.h
@@ -141,6 +141,18 @@ bool wouldInstructionBeTriviallyDead(Instruction *I,
 bool RecursivelyDeleteTriviallyDeadInstructions(Value *V,
                                         const TargetLibraryInfo *TLI = nullptr);
 
+/// Delete all of the instructions in `DeadInsts`, and all other instructions
+/// that deleting these in turn causes to be trivially dead.
+///
+/// The initial instructions in the provided vector must all have empty use
+/// lists and satisfy `isInstructionTriviallyDead`.
+///
+/// `DeadInsts` will be used as scratch storage for this routine and will be
+/// empty afterward.
+void RecursivelyDeleteTriviallyDeadInstructions(
+    SmallVectorImpl<Instruction *> &DeadInsts,
+    const TargetLibraryInfo *TLI = nullptr);
+
 /// If the specified value is an effectively dead PHI node, due to being a
 /// def-use chain of single-use nodes that either forms a cycle or is terminated
 /// by a trivially dead instruction, delete it. If that makes any of its
diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 661c4fe906b9..da46be92f2e6 100644
--- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/Utils/Local.h"
@@ -45,118 +46,116 @@ using namespace llvm;
 
 STATISTIC(NumSimplified, "Number of redundant instructions simplified");
 
-static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
-                             AssumptionCache *AC,
-                             const TargetLibraryInfo *TLI) {
-  SmallVector<BasicBlock *, 8> ExitBlocks;
-  L->getUniqueExitBlocks(ExitBlocks);
-  array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+                             AssumptionCache &AC,
+                             const TargetLibraryInfo &TLI) {
+  const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+  SimplifyQuery SQ(DL, &TLI, &DT, &AC);
 
+  // On the first pass over the loop body we try to simplify every instruction.
+  // On subsequent passes, we can restrict this to only simplifying instructions
+  // where the inputs have been updated. We end up needing two sets: one
+  // containing the instructions we are simplifying in *this* pass, and one for
+  // the instructions we will want to simplify in the *next* pass. We use
+  // pointers so we can swap between two stably allocated sets.
   SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
 
-  // The bit we are stealing from the pointer represents whether this basic
-  // block is the header of a subloop, in which case we only process its phis.
-  using WorklistItem = PointerIntPair<BasicBlock *, 1>;
-  SmallVector<WorklistItem, 16> VisitStack;
-  SmallPtrSet<BasicBlock *, 32> Visited;
+  // Track the PHI nodes that have already been visited during each iteration so
+  // that we can identify when it is necessary to iterate.
+  SmallPtrSet<PHINode *, 4> VisitedPHIs;
+
+  // While simplifying we may discover dead code or cause code to become dead.
+  // Keep track of all such instructions and we will delete them at the end.
+  SmallVector<Instruction *, 8> DeadInsts;
+
+  // First we want to create an RPO traversal of the loop body. By processing in
+  // RPO we can ensure that definitions are processed prior to uses (for non PHI
+  // uses) in all cases. This ensures we maximize the simplifications in each
+  // iteration over the loop and minimizes the possible causes for continuing to
+  // iterate.
+  LoopBlocksRPO RPOT(&L);
+  RPOT.perform(&LI);
 
   bool Changed = false;
-  bool LocalChanged;
-  do {
-    LocalChanged = false;
-
-    VisitStack.clear();
-    Visited.clear();
-
-    VisitStack.push_back(WorklistItem(L->getHeader(), false));
-
-    while (!VisitStack.empty()) {
-      WorklistItem Item = VisitStack.pop_back_val();
-      BasicBlock *BB = Item.getPointer();
-      bool IsSubloopHeader = Item.getInt();
-      const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
-      // Simplify instructions in the current basic block.
-      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
-        Instruction *I = &*BI++;
-
-        // The first time through the loop ToSimplify is empty and we try to
-        // simplify all instructions. On later iterations ToSimplify is not
-        // empty and we only bother simplifying instructions that are in it.
-        if (!ToSimplify->empty() && !ToSimplify->count(I))
-          continue;
-
-        // Don't bother simplifying unused instructions.
-        if (!I->use_empty()) {
-          Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
-          if (V && LI->replacementPreservesLCSSAForm(I, V)) {
-            // Mark all uses for resimplification next time round the loop.
-            for (User *U : I->users())
-              Next->insert(cast<Instruction>(U));
-
-            I->replaceAllUsesWith(V);
-            LocalChanged = true;
-            ++NumSimplified;
-          }
-        }
-        if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
-          // RecursivelyDeleteTriviallyDeadInstruction can remove more than one
-          // instruction, so simply incrementing the iterator does not work.
-          // When instructions get deleted re-iterate instead.
-          BI = BB->begin();
-          BE = BB->end();
-          LocalChanged = true;
-        }
-
-        if (IsSubloopHeader && !isa<PHINode>(I))
-          break;
-      }
-
-      // Add all successors to the worklist, except for loop exit blocks and the
-      // bodies of subloops. We visit the headers of loops so that we can
-      // process
-      // their phis, but we contract the rest of the subloop body and only
-      // follow
-      // edges leading back to the original loop.
-      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
-           ++SI) {
-        BasicBlock *SuccBB = *SI;
-        if (!Visited.insert(SuccBB).second)
-          continue;
-
-        const Loop *SuccLoop = LI->getLoopFor(SuccBB);
-        if (SuccLoop && SuccLoop->getHeader() == SuccBB &&
-            L->contains(SuccLoop)) {
-          VisitStack.push_back(WorklistItem(SuccBB, true));
-
-          SmallVector<BasicBlock *, 8> SubLoopExitBlocks;
-          SuccLoop->getExitBlocks(SubLoopExitBlocks);
-
-          for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
-            BasicBlock *ExitBB = SubLoopExitBlocks[i];
-            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
-              VisitStack.push_back(WorklistItem(ExitBB, false));
-          }
+  for (;;) {
+    for (BasicBlock *BB : RPOT) {
+      for (Instruction &I : *BB) {
+        if (auto *PI = dyn_cast<PHINode>(&I))
+          VisitedPHIs.insert(PI);
 
+        if (I.use_empty()) {
+          if (isInstructionTriviallyDead(&I, &TLI))
+            DeadInsts.push_back(&I);
           continue;
         }
 
-        bool IsExitBlock =
-            std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB);
-        if (IsExitBlock)
+        // We special case the first iteration which we can detect due to the
+        // empty `ToSimplify` set.
+        bool IsFirstIteration = ToSimplify->empty();
+
+        if (!IsFirstIteration && !ToSimplify->count(&I))
           continue;
 
-        VisitStack.push_back(WorklistItem(SuccBB, false));
+        Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+        if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
+          continue;
+
+        for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+             UI != UE;) {
+          Use &U = *UI++;
+          auto *UserI = cast<Instruction>(U.getUser());
+          U.set(V);
+
+          // If the instruction is used by a PHI node we have already processed
+          // we'll need to iterate on the loop body to converge, so add it to
+          // the next set.
+          if (auto *UserPI = dyn_cast<PHINode>(UserI))
+            if (VisitedPHIs.count(UserPI)) {
+              Next->insert(UserPI);
+              continue;
+            }
+
+          // If we are only simplifying targeted instructions and the user is an
+          // instruction in the loop body, add it to our set of targeted
+          // instructions. Because we process defs before uses (outside of PHIs)
+          // we won't have visited it yet.
+          //
+          // We also skip any uses outside of the loop being simplified. Those
+          // should always be PHI nodes due to LCSSA form, and we don't want to
+          // try to simplify those away.
+          assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+                 "Uses outside the loop should be PHI nodes due to LCSSA!");
+          if (!IsFirstIteration && L.contains(UserI))
+            ToSimplify->insert(UserI);
+        }
+
+        assert(I.use_empty() && "Should always have replaced all uses!");
+        if (isInstructionTriviallyDead(&I, &TLI))
+          DeadInsts.push_back(&I);
+        ++NumSimplified;
+        Changed = true;
       }
     }
 
-    // Place the list of instructions to simplify on the next loop iteration
-    // into ToSimplify.
-    std::swap(ToSimplify, Next);
-    Next->clear();
+    // Delete any dead instructions found thus far now that we've finished an
+    // iteration over all instructions in all the loop blocks.
+    if (!DeadInsts.empty()) {
+      Changed = true;
+      RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI);
+    }
 
-    Changed |= LocalChanged;
-  } while (LocalChanged);
+    // If we never found a PHI that needs to be simplified in the next
+    // iteration, we're done.
+    if (Next->empty())
+      break;
+
+    // Otherwise, put the next set in place for the next iteration and reset it
+    // and the visited PHIs for that iteration.
+    std::swap(Next, ToSimplify);
+    Next->clear();
+    VisitedPHIs.clear();
+    DeadInsts.clear();
+  }
 
   return Changed;
 }
@@ -174,21 +173,20 @@ public:
   bool runOnLoop(Loop *L, LPPassManager &LPM) override {
     if (skipLoop(L))
       return false;
-    DominatorTreeWrapperPass *DTWP =
-        getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-    DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-    AssumptionCache *AC =
-        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    AssumptionCache &AC =
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
             *L->getHeader()->getParent());
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetLibraryInfo &TLI =
+        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
-    return SimplifyLoopInst(L, DT, LI, AC, TLI);
+    return simplifyLoopInst(*L, DT, LI, AC, TLI);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.setPreservesCFG();
     getLoopAnalysisUsage(AU);
@@ -200,7 +198,7 @@ public:
 PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
                                             LoopStandardAnalysisResults &AR,
                                             LPMUpdater &) {
-  if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
+  if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI))
     return PreservedAnalyses::all();
 
   auto PA = getLoopPassPreservedAnalyses();
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 0be05d81fbda..38a9d34aece8 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -434,18 +434,31 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
 
   SmallVector<Instruction*, 16> DeadInsts;
   DeadInsts.push_back(I);
+  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
 
-  do {
-    I = DeadInsts.pop_back_val();
-    salvageDebugInfo(*I);
+  return true;
+}
+
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(
+    SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI) {
+  // Process the dead instruction list until empty.
+  while (!DeadInsts.empty()) {
+    Instruction &I = *DeadInsts.pop_back_val();
+    assert(I.use_empty() && "Instructions with uses are not dead.");
+    assert(isInstructionTriviallyDead(&I, TLI) &&
+           "Live instruction found in dead worklist!");
+
+    // Don't lose the debug info while deleting the instructions.
+    salvageDebugInfo(I);
 
     // Null out all of the instruction's operands to see if any operand becomes
     // dead as we go.
-    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-      Value *OpV = I->getOperand(i);
-      I->setOperand(i, nullptr);
+    for (Use &OpU : I.operands()) {
+      Value *OpV = OpU.get();
+      OpU.set(nullptr);
 
-      if (!OpV->use_empty()) continue;
+      if (!OpV->use_empty())
+        continue;
 
       // If the operand is an instruction that became dead as we nulled out the
       // operand, and if it is 'trivially' dead, delete it in a future loop
@@ -455,10 +468,8 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
           DeadInsts.push_back(OpI);
     }
 
-    I->eraseFromParent();
-  } while (!DeadInsts.empty());
-
-  return true;
+    I.eraseFromParent();
+  }
 }
 
 /// areAllUsesEqual - Check whether the uses of a value are all the same.
diff --git a/llvm/test/Transforms/LoopInstSimplify/basic.ll b/llvm/test/Transforms/LoopInstSimplify/basic.ll
new file mode 100644
index 000000000000..7e6cafd85576
--- /dev/null
+++ b/llvm/test/Transforms/LoopInstSimplify/basic.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -passes=loop-instsimplify | FileCheck %s
+
+; Test very basic folding and propagation occurs within a loop body. This should
+; collapse to the loop iteration structure and the LCSSA PHI node.
+define i32 @test1(i32 %n, i32 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %x.add = add nsw i32 %x, 0
+  %x.sub = sub i32 %x.add, 0
+  %x.and = and i32 %x.sub, -1
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.and, %loop ]
+  ret i32 %x.lcssa
+}
+
+; Test basic loop structure that still has a simplification feed a prior PHI.
+define i32 @test2(i32 %n, i32 %x) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.next, %loop ]
+  %x.next = add nsw i32 %x.loop, 0
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop ]
+  ret i32 %x.lcssa
+}
+
+; Test a diamond CFG with inner PHI nodes.
+define i32 @test3(i32 %n, i32 %x) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[X_CMP:%.*]] = icmp slt i32 [[I]], 42
+; CHECK-NEXT:    br i1 [[X_CMP]], label [[LOOP_LHS:%.*]], label [[LOOP_RHS:%.*]]
+; CHECK:       loop.lhs:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.rhs:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.phi, %loop.latch ]
+  %x.add = add nsw i32 %x.loop, 0
+  %x.cmp = icmp slt i32 %i, 42
+  br i1 %x.cmp, label %loop.lhs, label %loop.rhs
+
+loop.lhs:
+  %x.l.add = add nsw i32 %x.add, 0
+  br label %loop.latch
+
+loop.rhs:
+  %x.r.sub = sub nsw i32 %x.add, 0
+  br label %loop.latch
+
+loop.latch:
+  %x.phi = phi i32 [ %x.l.add, %loop.lhs ], [ %x.r.sub, %loop.rhs ]
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop.latch ]
+  ret i32 %x.lcssa
+}
+
+; Test an inner loop that is only simplified when processing the outer loop, and
+; an outer loop only simplified when processing the inner loop.
+define i32 @test4(i32 %n, i32 %m, i32 %x) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_INNER:%.*]]
+; CHECK:       loop.inner:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER]] ]
+; CHECK-NEXT:    [[J_NEXT]] = add nsw i32 [[J]], 1
+; CHECK-NEXT:    [[J_CMP:%.*]] = icmp slt i32 [[J_NEXT]], [[M:%.*]]
+; CHECK-NEXT:    br i1 [[J_CMP]], label [[LOOP_INNER]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.inner.lcssa, %loop.latch ]
+  %x.add = add nsw i32 %x.loop, 0
+  br label %loop.inner
+
+loop.inner:
+  %j = phi i32 [ 0, %loop ], [ %j.next, %loop.inner ]
+  %x.inner.loop = phi i32 [ %x.add, %loop ], [ %x.inner.add, %loop.inner ]
+  %x.inner.add = add nsw i32 %x.inner.loop, 0
+  %j.next = add nsw i32 %j, 1
+  %j.cmp = icmp slt i32 %j.next, %m
+  br i1 %j.cmp, label %loop.inner, label %loop.latch
+
+loop.latch:
+  %x.inner.lcssa = phi i32 [ %x.inner.loop, %loop.inner ]
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop.latch ]
+  ret i32 %x.lcssa
+}