From fac1498067bdacffdf5a01198fd7ba180c31680e Mon Sep 17 00:00:00 2001
From: Hanchen Ye
Date: Mon, 14 Dec 2020 23:41:08 -0600
Subject: [PATCH] [LoopPipelining] move automatic flattening to this pass;
 [QoREstimation] support AffineIfOp estimation, known issue: dependencies
 between load/store operations should be considered in scheduling

---
 include/Analysis/QoREstimation.h          |   7 +-
 lib/Analysis/QoREstimation.cpp            | 224 +++++++++++++++-------
 lib/Transforms/LoopPipelining.cpp         |  24 ++-
 test/Analysis/QoREstimation/test_for.mlir |   4 +-
 4 files changed, 180 insertions(+), 79 deletions(-)

diff --git a/include/Analysis/QoREstimation.h b/include/Analysis/QoREstimation.h
index 1fa632f..4ea42eb 100644
--- a/include/Analysis/QoREstimation.h
+++ b/include/Analysis/QoREstimation.h
@@ -134,18 +134,21 @@ public:
   using HLSCppVisitorBase::visitOp;
 
   bool visitOp(AffineForOp op);
+  bool visitOp(AffineIfOp op);
+  bool visitOp(ArrayOp op);
 
   void getBlockMemInfo(Block &block, LoadStoreDict &info);
 
   unsigned getLoadStoreSchedule(Operation *op, unsigned begin,
                                 MemPortDicts &dicts);
+  void updateChildBlockSchedule(Block &block, unsigned begin);
   unsigned getBlockSchedule(Block &block);
 
   unsigned getResMinII(AffineForOp forOp, LoadStoreDict dict);
   unsigned getDepMinII(AffineForOp forOp, LoadStoreDict dict);
 
-  void estimateFunc(FuncOp func);
-  void estimateBlock(Block &block);
+  bool estimateFunc(FuncOp func);
+  bool estimateBlock(Block &block);
 };
 
 } // namespace scalehls
diff --git a/lib/Analysis/QoREstimation.cpp b/lib/Analysis/QoREstimation.cpp
index 38d3785..c913073 100644
--- a/lib/Analysis/QoREstimation.cpp
+++ b/lib/Analysis/QoREstimation.cpp
@@ -7,6 +7,7 @@
 #include "Dialect/HLSCpp/HLSCpp.h"
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
+#include "mlir/Analysis/Liveness.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/IR/Operation.h"
@@ -124,7 +125,7 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
       rdwrPort = 1;
     else {
       rdwrPort = 2;
-      arrayOp.emitError("unsupported storage type.");
+      // arrayOp.emitError("unsupported storage type.");
     }
     PortInfo portInfo(rdPort, wrPort, rdwrPort);
     memPort.push_back(portInfo);
@@ -175,7 +176,30 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
   return begin;
 }
 
-/// Calculate scheduling information of the block.
+void HLSCppEstimator::updateChildBlockSchedule(Block &block, unsigned begin) {
+  for (auto &op : block) {
+    unsigned newBegin = begin;
+    unsigned newEnd = begin;
+
+    // Update the schedule of all operations in the child block.
+    if (getUIntAttrValue(&op, "schedule_end")) {
+      newBegin += getUIntAttrValue(&op, "schedule_begin");
+      newEnd += getUIntAttrValue(&op, "schedule_end");
+      setAttrValue(&op, "schedule_begin", newBegin);
+      setAttrValue(&op, "schedule_end", newEnd);
+    }
+
+    // Recursively apply to all child blocks.
+    if (op.getNumRegions()) {
+      for (auto &region : op.getRegions()) {
+        for (auto &block : region.getBlocks())
+          updateChildBlockSchedule(block, begin);
+      }
+    }
+  }
+}
+
+/// Schedule the block with ASAP algorithm.
 unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
   unsigned blockEnd = 0;
   MemPortDicts dicts;
@@ -191,24 +215,60 @@ unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
       begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
     }
 
-    // Insert new pipeline stages to the memory port dicts.
-    while (begin >= dicts.size()) {
-      MemPortDict memPortDict;
-      dicts.push_back(memPortDict);
-    }
-
-    // Handle load/store operations, ensure the current schedule meets memory
-    // port limitation.
-    if (isa<AffineLoadOp, AffineStoreOp>(op)) {
-      begin = getLoadStoreSchedule(&op, begin, dicts);
-      end = begin + 1;
-    }
     // Handle loop operations.
-    else if (auto forOp = dyn_cast<AffineForOp>(op)) {
+    if (auto forOp = dyn_cast<AffineForOp>(op)) {
+      // Live ins of the for loop body will also impact the schedule begin.
+      Liveness liveness(block.getParentOp());
+      for (auto liveIn : liveness.getLiveIn(&forOp.getLoopBody().front())) {
+        if (auto defOp = liveIn.getDefiningOp())
+          begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
+      }
+
+      // Update the schedule of all operations in the loop body.
+      updateChildBlockSchedule(forOp.getLoopBody().front(), begin);
+
       // Child loop is considered as a large node, and two extra clock cycles
       // will be required to enter and exit the child loop.
       end = begin + getUIntAttrValue(forOp, "latency") + 2;
     }
+
+    // Handle if operations.
+    else if (auto ifOp = dyn_cast<AffineIfOp>(op)) {
+      // Live ins of the if body will also impact the schedule begin.
+      Liveness liveness(block.getParentOp());
+      for (auto liveIn : liveness.getLiveIn(ifOp.getThenBlock())) {
+        if (auto defOp = liveIn.getDefiningOp())
+          begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
+      }
+
+      if (ifOp.hasElse()) {
+        for (auto liveIn : liveness.getLiveIn(ifOp.getElseBlock())) {
+          if (auto defOp = liveIn.getDefiningOp())
+            begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
+        }
+        // Update the schedule of all operations in the else block.
+        updateChildBlockSchedule(*ifOp.getElseBlock(), begin);
+      }
+
+      // Update the schedule of all operations in the then block.
+      updateChildBlockSchedule(*ifOp.getThenBlock(), begin);
+
+      end = begin + getUIntAttrValue(ifOp, "latency");
+    }
+
+    // Handle load/store operations.
+    else if (isa<AffineLoadOp, AffineStoreOp>(op)) {
+      // Insert new schedule level to the memory port dicts.
+      while (begin >= dicts.size()) {
+        MemPortDict memPortDict;
+        dicts.push_back(memPortDict);
+      }
+
+      // Ensure the current schedule meets memory port limitation.
+      begin = getLoadStoreSchedule(&op, begin, dicts);
+      end = begin + 1;
+    }
+
     // Default case. All normal expressions and operations will be handled by
     // this branch.
     else {
@@ -253,6 +313,10 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoreDict dict) {
         accessNum = 2;
       else if (storageType == "ram_1p")
         accessNum = 1;
+      else {
+        accessNum = 2;
+        // arrayOp.emitError("unsupported storage type.");
+      }
 
       // The rationale here is an undetermined partition access will
       // introduce a large mux which will avoid Vivado HLS to process any
@@ -364,8 +428,24 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
 
 bool HLSCppEstimator::visitOp(AffineForOp op) {
   auto &body = op.getLoopBody();
-  if (body.getBlocks().size() != 1)
+  if (body.getBlocks().size() != 1) {
     op.emitError("has zero or more than one basic blocks.");
+    return false;
+  }
+
+  // Recursively estimate all contained operations.
+  if (!estimateBlock(body.front()))
+    return false;
+
+  // Set an attribute indicating the trip count. For now, we assume all
+  // loops have static loop bound.
+  if (auto tripCount = getConstantTripCount(op))
+    setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
+  else {
+    setAttrValue(op, "trip_count", (unsigned)0);
+    op.emitError("has undetermined trip count");
+    return false;
+  }
 
   // If the current loop is annotated as pipeline, extra dependency and II
   // analysis will be executed.
@@ -388,13 +468,10 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
     return true;
   }
 
-  // Recursively estimate all inner loops.
-  estimateBlock(body.front());
-
-  // This simply means the current loop can be flattened into the child loop
-  // pipeline. This will increase the flattened loop trip count without
-  // changing the iteration latency. Note that this will be propogated above
-  // until meeting an imperfect loop.
+  // This means the current loop can be flattened into the child loop. If the
+  // child loop is pipelined, this will increase the flattened loop trip count
+  // without changing the iteration latency. Note that this will be propagated
+  // above until meeting an imperfect loop.
   if (getBoolAttrValue(op, "flatten")) {
     if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
       // This means the inner loop is pipelined, because otherwise II will be
@@ -434,71 +511,70 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
   return true;
 }
 
-void HLSCppEstimator::estimateBlock(Block &block) {
+bool HLSCppEstimator::visitOp(AffineIfOp op) {
+  auto thenBlock = op.getThenBlock();
+  if (!estimateBlock(*thenBlock))
+    return false;
+
+  LoadStoreDict dict;
+  getBlockMemInfo(*thenBlock, dict);
+  auto latency = getBlockSchedule(*thenBlock);
+
+  // Handle else block if required.
+  if (op.hasElse()) {
+    auto elseBlock = op.getElseBlock();
+    if (!estimateBlock(*elseBlock))
+      return false;
+
+    getBlockMemInfo(*elseBlock, dict);
+    latency = max(latency, getBlockSchedule(*elseBlock));
+  }
+
+  setAttrValue(op, "latency", latency);
+  return true;
+}
+
+bool HLSCppEstimator::visitOp(ArrayOp op) {
+  unsigned partitionNum = 1;
+  if (op.partition()) {
+    auto rank = op.getType().cast<MemRefType>().getRank();
+    for (unsigned i = 0; i < rank; ++i) {
+      if (auto factor = getPartitionFactor(op, i))
+        partitionNum *= factor;
+    }
+  }
+  setAttrValue(op, "partition_num", partitionNum);
+  return true;
+}
+
+bool HLSCppEstimator::estimateBlock(Block &block) {
   for (auto &op : block) {
     if (dispatchVisitor(&op))
       continue;
-    op.emitError("can't be correctly estimated.");
+    else {
+      op.emitError("can't be correctly estimated.");
+      return false;
+    }
   }
+  return true;
 }
 
-void HLSCppEstimator::estimateFunc(FuncOp func) {
-  if (func.getBlocks().size() != 1)
+bool HLSCppEstimator::estimateFunc(FuncOp func) {
+  if (func.getBlocks().size() != 1) {
     func.emitError("has zero or more than one basic blocks.");
+    return false;
+  }
 
-  // Extract all static parameters and current pragma configurations.
-  func.walk([&](ArrayOp op) {
-    unsigned factor = 1;
-    if (getBoolAttrValue(op, "partition")) {
-      for (unsigned i = 0, e = op.getType().cast<MemRefType>().getRank(); i < e;
-           ++i)
-        factor *= getPartitionFactor(op, i);
-    }
-    setAttrValue(op, "partition_num", factor);
-  });
-
-  func.walk([&](AffineForOp op) {
-    // We assume loop contains a single basic block.
-    auto &body = op.getLoopBody();
-    if (body.getBlocks().size() != 1)
-      op.emitError("has zero or more than one basic blocks.");
-
-    // Set an attribute indicating the trip count. For now, we assume all
-    // loops have static loop bound.
-    if (auto tripCount = getConstantTripCount(op))
-      setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
-    else {
-      setAttrValue(op, "trip_count", (unsigned)0);
-      op.emitError("has variable trip count");
-    }
-
-    // Set attributes indicating this loop can be flatten or not.
-    unsigned opNum = 0;
-    unsigned forNum = 0;
-    bool innerFlatten = false;
-
-    for (auto &bodyOp : body.front()) {
-      if (!isa<AffineYieldOp>(bodyOp))
-        opNum++;
-      if (isa<AffineForOp>(bodyOp)) {
-        forNum++;
-        innerFlatten = getBoolAttrValue(&bodyOp, "flatten");
-      }
-    }
-
-    if (forNum == 0 || (opNum == 1 && innerFlatten))
-      setAttrValue(op, "flatten", true);
-    else
-      setAttrValue(op, "flatten", false);
-  });
-
-  estimateBlock(func.front());
+  // Recursively estimate all contained operations.
+  if (!estimateBlock(func.front()))
+    return false;
 
   LoadStoreDict dict;
   getBlockMemInfo(func.front(), dict);
 
   auto latency = getBlockSchedule(func.front());
   setAttrValue(func, "latency", latency);
+  return true;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/LoopPipelining.cpp b/lib/Transforms/LoopPipelining.cpp
index b8c4a11..38e565c 100644
--- a/lib/Transforms/LoopPipelining.cpp
+++ b/lib/Transforms/LoopPipelining.cpp
@@ -34,11 +34,33 @@ void LoopPipelining::runOnOperation() {
 
     targetLoop.setAttr("pipeline", builder.getBoolAttr(true));
 
-    // All intter loops of the pipelined loop are automatically unrolled.
+    // All inner loops of the pipelined loop are automatically unrolled.
     targetLoop.walk([&](mlir::AffineForOp loop) {
      if (loop != targetLoop)
        loopUnrollFull(loop);
    });
+
+    // All outer loops that perfectly nest the pipelined loop can be flattened.
+    forOp.walk([&](mlir::AffineForOp loop) {
+      unsigned opNum = 0;
+      unsigned forNum = 0;
+      bool innerFlatten = false;
+
+      for (auto &bodyOp : loop.getLoopBody().front()) {
+        if (!isa<AffineYieldOp>(bodyOp))
+          opNum++;
+        if (isa<AffineForOp>(bodyOp)) {
+          forNum++;
+          if (auto flatten = bodyOp.getAttrOfType<BoolAttr>("flatten"))
+            innerFlatten = flatten.getValue();
+        }
+      }
+
+      if (forNum == 0 || (opNum == 1 && innerFlatten))
+        loop.setAttr("flatten", builder.getBoolAttr(true));
+      else
+        loop.setAttr("flatten", builder.getBoolAttr(false));
+    });
   }
 
   // Canonicalize the IR after loop unrolling.
diff --git a/test/Analysis/QoREstimation/test_for.mlir b/test/Analysis/QoREstimation/test_for.mlir
index 2ec864a..8b86e66 100644
--- a/test/Analysis/QoREstimation/test_for.mlir
+++ b/test/Analysis/QoREstimation/test_for.mlir
@@ -9,8 +9,8 @@ func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attribu
       affine.for %k = 0 to 4 {
         %0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
         %1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
-        %2 = addi %1, %1 : index
-        affine.store %2, %array1[%i, %j+1, %k] : memref<16x4x4xindex>
+        %2 = addi %0, %1 : index
+        affine.store %2, %array1[%i, %j, %k+1] : memref<16x4x4xindex>
       } {pipeline = false, unroll = false, flatten = false}
     } {pipeline = false, unroll = false, flatten = false}
   } {pipeline = false, unroll = false, flatten = false}
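
For reference, the flattening logic above leans on the usual pipelined-loop
latency model, latency = iterLatency + II * (tripCount - 1): flattening a
perfect loop nest multiplies the pipeline's trip count while leaving the
iteration latency and II unchanged, so the pipeline drain and the two extra
enter/exit cycles are paid once instead of once per outer iteration. The
standalone C++ sketch below only illustrates that arithmetic; the names
(pipelinedLatency, iterLatency, ii, tripCounts) and the numbers are
hypothetical and are not taken from this patch.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // A pipelined loop starts one iteration every `ii` cycles; the final
    // iteration still needs the full `iterLatency` cycles to drain.
    uint64_t pipelinedLatency(uint64_t iterLatency, uint64_t ii,
                              uint64_t tripCount) {
      return iterLatency + ii * (tripCount - 1);
    }

    int main() {
      uint64_t iterLatency = 10, ii = 2;
      std::vector<uint64_t> tripCounts = {16, 4}; // {outer, inner}

      // Unflattened: each outer iteration pays the inner pipeline's drain
      // plus the two extra cycles to enter and exit the child loop.
      uint64_t inner = pipelinedLatency(iterLatency, ii, tripCounts[1]);
      uint64_t unflattened = tripCounts[0] * (inner + 2);

      // Flattened: a single pipeline whose trip count is the product of the
      // perfectly nested trip counts; the iteration latency is unchanged.
      uint64_t flattened =
          pipelinedLatency(iterLatency, ii, tripCounts[0] * tripCounts[1]);

      std::cout << "unflattened: " << unflattened << " cycles\n"; // 16 * 18 = 288
      std::cout << "flattened:   " << flattened << " cycles\n";   // 10 + 2 * 63 = 136
      return 0;
    }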