[QoREstimation] unroll loops in IR now! support a real partition-aware loop scheduling

2020-10-10 02:33:26 -05:00 · 2020-10-10 02:33:26 -05:00 · cbfb623c67
parent 28926c5cc2
commit cbfb623c67
3 changed files with 323 additions and 300 deletions
--- a/include/Analysis/QoREstimation.h
+++ b/include/Analysis/QoREstimation.h
@ -9,6 +9,7 @@
 #include "Visitor.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/TypeSwitch.h"

 namespace mlir {
@ -23,24 +24,46 @@ public:
  explicit HLSCppToolBase(OpBuilder &builder) : builder(builder) {}

  /// Get value methods.
+  /// Fetch the integer attribute `name` of `op` as a signed value. Yields -1
+  /// when `op` carries no IntegerAttr under that name.
+  int64_t getIntAttrValue(Operation *op, StringRef name) {
+    auto intAttr = op->getAttrOfType<IntegerAttr>(name);
+    if (!intAttr)
+      return -1;
+    return intAttr.getInt();
+  }
+
  unsigned getUIntAttrValue(Operation *op, StringRef name) {
-    return op->getAttrOfType<IntegerAttr>(name).getUInt();
+    // An operation without the requested IntegerAttr yields 0.
+    auto uintAttr = op->getAttrOfType<IntegerAttr>(name);
+    if (!uintAttr)
+      return 0;
+    return uintAttr.getUInt();
  }

  bool getBoolAttrValue(Operation *op, StringRef name) {
-    return op->getAttrOfType<BoolAttr>(name).getValue();
+    // An operation without the requested BoolAttr yields false.
+    auto boolAttr = op->getAttrOfType<BoolAttr>(name);
+    if (!boolAttr)
+      return false;
+    return boolAttr.getValue();
  }

  StringRef getStrAttrValue(Operation *op, StringRef name) {
-    return op->getAttrOfType<StringAttr>(name).getValue();
+    // An operation without the requested StringAttr yields an empty string.
+    auto strAttr = op->getAttrOfType<StringAttr>(name);
+    if (!strAttr)
+      return "";
+    return strAttr.getValue();
  }

-  StringRef getPartitionType(ArrayOp *op, unsigned dim) {
-    return op->partition_type()[dim].cast<StringAttr>().getValue();
+  /// Return the partition type string of dimension `dim` of `op`, or an empty
+  /// string when that entry of partition_type() is not a StringAttr.
+  StringRef getPartitionType(ArrayOp op, unsigned dim) {
+    // dyn_cast (not cast) so that a non-string entry reaches the fallback
+    // below instead of tripping the assertion inside cast<>.
+    if (auto attr = op.partition_type()[dim].dyn_cast<StringAttr>())
+      return attr.getValue();
+    else
+      return "";
  }

-  unsigned getPartitionFactor(ArrayOp *op, unsigned dim) {
-    return op->partition_factor()[dim].cast<IntegerAttr>().getUInt();
+  /// Return the partition factor of dimension `dim` of `op`, or 0 when that
+  /// entry of partition_factor() is not an IntegerAttr.
+  unsigned getPartitionFactor(ArrayOp op, unsigned dim) {
+    // dyn_cast (not cast) so that a non-integer entry reaches the fallback
+    // below instead of tripping the assertion inside cast<>.
+    if (auto attr = op.partition_factor()[dim].dyn_cast<IntegerAttr>())
+      return attr.getUInt();
+    else
+      return 0;
  }

  /// Set value methods.
@ -48,6 +71,10 @@ public:
    op->setAttr(name, builder.getUI32IntegerAttr(value));
  }

+  void setAttrValue(Operation *op, StringRef name, int32_t value) {
+    op->setAttr(name, builder.getI32IntegerAttr(value));
+  }
+
  void setAttrValue(Operation *op, StringRef name, bool value) {
    op->setAttr(name, builder.getBoolAttr(value));
  }
@ -65,11 +92,10 @@ public:
    return getAffineDimExpr(value, builder.getContext());
  }

-  AffineExpr getConstExpr(unsigned value) {
+  AffineExpr getConstExpr(int64_t value) {
    return getAffineConstantExpr(value, builder.getContext());
  }

-private:
  OpBuilder &builder;
 };

@ -97,20 +123,30 @@ public:
 // HLSCppEstimator Class Declaration
 //===----------------------------------------------------------------------===//

-// For storing the scheduled time stamp of operations.
-using OpScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
+// Records the number of unoccupied memory ports of each kind.
+struct PortNum {
+  PortNum(unsigned rdPort = 0, unsigned wrPort = 0, unsigned rdwrPort = 0)
+      : rdPort(rdPort), wrPort(wrPort), rdwrPort(rdwrPort) {}

-// For storing each memory access operations indexed by its targed memory
-// value symbol.
-using MemAccess = SmallVector<Operation *, 4>;
-using MemAccessDict = llvm::SmallDenseMap<Value, MemAccess, 16>;
+  unsigned rdPort;
+  unsigned wrPort;
+  unsigned rdwrPort;
+};

-// For storing memory access and schedule information of pipelined region.
-struct PipelineInfo {
-  PipelineInfo(unsigned baseII) : II(baseII) {}
+// For storing ports number information of each memory instance.
+using MemPort = llvm::SmallDenseMap<Operation *, SmallVector<PortNum, 16>, 8>;

-  unsigned II;
-  OpScheduleMap opScheduleMap;
+// For storing MemPort indexed by the pipeline stage (a basic block).
+using MemPortList = SmallVector<MemPort, 16>;
+
+// For storing the memory access operations (AffineLoadOp and AffineStoreOp),
+// indexed by the array instance (ArrayOp) they access.
+using MemAccess = SmallVector<Operation *, 16>;
+using MemAccessDict = llvm::SmallDenseMap<Operation *, MemAccess, 8>;
+
+// An aggregate information structure for storing memory load and store
+// MemAccessDict in the scope of loop/function/other region.
+struct MemInfo {
  MemAccessDict memLoadDict;
  MemAccessDict memStoreDict;
 };
@ -126,11 +162,6 @@ struct InductionInfo {
 };
 using InductionInfoList = SmallVector<InductionInfo, 8>;

-// This records the number of accesses for each partition.
-using AccessNum = SmallVector<unsigned, 16>;
-// This records the AccessNum of each dimension of an array.
-using AccessNumList = SmallVector<AccessNum, 8>;
-
 class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
                        public HLSCppToolBase {
 public:
@ -143,14 +174,12 @@ public:
  bool visitOp(AffineForOp op);
  bool visitOp(AffineIfOp op);

-  void setBlockSchedule(Block &block, unsigned opSchedule,
-                        OpScheduleMap &opScheduleMap);
-  unsigned getBlockSchedule(Block &block, bool innerFlatten,
-                            OpScheduleMap &opScheduleMap);
+  int32_t getPartitionIdx(AffineMap map, ArrayOp op);
+  void getMemInfo(Block &block, MemInfo &info);

-  void getPipelineInfo(Block &block, PipelineInfo &info);
-
-  template <typename OpType> void getAccessNum(OpType op, ArrayOp arrayOp);
+  unsigned getLoadStoreSchedule(Operation *op, ArrayOp arrayOp,
+                                MemPortList &memPortList, unsigned begin);
+  unsigned getBlockSchedule(Block &block, MemInfo memInfo);

  void estimateOperation(Operation *op);
  void estimateFunc(FuncOp func);
--- a/lib/Analysis/QoREstimation.cpp
+++ b/lib/Analysis/QoREstimation.cpp
@ -5,6 +5,7 @@
 #include "Analysis/QoREstimation.h"
 #include "Analysis/Passes.h"
 #include "Dialect/HLSCpp/HLSCpp.h"
+#include "mlir/IR/PatternMatch.h"

 using namespace std;
 using namespace mlir;
@ -16,14 +17,41 @@ using namespace hlscpp;
 //===----------------------------------------------------------------------===//

 bool HLSCppAnalyzer::visitOp(AffineForOp op) {
+  // If the current loop is annotated as unroll, all inner loops and itself are
+  // automatically unrolled.
+  if (getBoolAttrValue(op, "unroll")) {
+    op.emitRemark("this loop and all inner loops are automatically unrolled.");
+    op.walk([&](AffineForOp forOp) {
+      if (forOp.getLoopBody().getBlocks().size() != 1)
+        op.emitError("has zero or more than one basic blocks.");
+      loopUnrollFull(forOp);
+    });
+    return true;
+  }
+
+  // If the current loop is annotated as pipeline, all inner loops are
+  // automatically unrolled.
+  if (getBoolAttrValue(op, "pipeline")) {
+    op.emitRemark("all inner loops are automatically unrolled.");
+    op.walk([&](AffineForOp forOp) {
+      if (forOp != op) {
+        if (forOp.getLoopBody().getBlocks().size() != 1)
+          op.emitError("has zero or more than one basic blocks.");
+        loopUnrollFull(forOp);
+      }
+    });
+  }
+
+  // We assume loop contains a single basic block.
  auto &body = op.getLoopBody();
  if (body.getBlocks().size() != 1)
    op.emitError("has zero or more than one basic blocks.");

-  // Recursively analyze all childs.
+  // Recursively analyze all inner loops.
  analyzeBlock(body.front());

-  // Set an attribute indicating trip count.
+  // Set an attribute indicating the trip count. For now, we assume all loops
+  // have static loop bound.
  if (!op.hasConstantLowerBound() || !op.hasConstantUpperBound())
    op.emitError("has variable upper or lower bound.");

@ -31,25 +59,24 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
      (op.getConstantUpperBound() - op.getConstantLowerBound()) / op.getStep();
  setAttrValue(op, "trip_count", tripCount);

-  // Set attributes indicating this loop is perfect or not.
+  // Set an attribute indicating whether this loop can be flattened or not.
  unsigned opNum = 0;
-  unsigned childNum = 0;
-  bool childPerfect = false;
+  unsigned forNum = 0;
+  bool innerFlatten = false;
+
  for (auto &bodyOp : body.front()) {
    if (!isa<AffineYieldOp>(bodyOp))
      opNum += 1;
-    if (auto child = dyn_cast<AffineForOp>(bodyOp)) {
-      childNum += 1;
-      childPerfect = getBoolAttrValue(child, "perfect");
+    if (isa<AffineForOp>(bodyOp)) {
+      forNum += 1;
+      innerFlatten = getBoolAttrValue(&bodyOp, "flatten");
    }
  }

-  if (opNum == 1 && childNum == 1 && childPerfect)
-    setAttrValue(op, "perfect", true);
-  else if (childNum == 0)
-    setAttrValue(op, "perfect", true);
+  if (forNum == 0 || (opNum == 1 && innerFlatten))
+    setAttrValue(op, "flatten", true);
  else
-    setAttrValue(op, "perfect", false);
+    setAttrValue(op, "flatten", false);

  return true;
 }
@ -105,205 +132,200 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
  llvm::outs() << latency << "\n";
 }

-void HLSCppEstimator::setBlockSchedule(Block &block, unsigned opSchedule,
-                                       OpScheduleMap &opScheduleMap) {
+/// Calculate the flattened partition index addressed by a memory access whose
+/// subscripts are the results of `map` on the array `op`.
+///
+/// Each result is reduced to a per-dimension partition index: modulo the
+/// factor for "cyclic", floor division by the block size for "block", and 0
+/// for any other type or for an unpartitioned array. The per-dimension indices
+/// are then combined with dimension 0 as the least significant digit.
+///
+/// Returns -1 when a dimension does not fold to a constant or has a zero
+/// partition factor. The value is only returned; attaching it to an operation
+/// as an attribute is left to the caller.
+int32_t HLSCppEstimator::getPartitionIdx(AffineMap map, ArrayOp op) {
+  int32_t partitionIdx = 0;
+  unsigned accumFactor = 1;
+  unsigned dim = 0;
+  for (auto expr : map.getResults()) {
+    auto idxExpr = getConstExpr(0);
+    unsigned factor = 1;
+    if (op.partition()) {
+      auto type = getPartitionType(op, dim);
+      factor = getPartitionFactor(op, dim);
+
+      // A zero factor (which is also what getPartitionFactor() falls back to)
+      // cannot be indexed and would make the "block" size computation below
+      // divide by zero, so the partition is reported as unknown.
+      if (factor == 0)
+        return -1;
+
+      if (type == "cyclic")
+        idxExpr = expr % getConstExpr(factor);
+      else if (type == "block") {
+        auto size = op.getType().cast<ShapedType>().getShape()[dim];
+        idxExpr = expr.floorDiv(getConstExpr((size + factor - 1) / factor));
+      }
+    }
+
+    // The partition can only be determined statically when the index
+    // expression folds to a constant.
+    auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>();
+    if (!constExpr)
+      return -1;
+
+    // accumFactor is 1 for dimension 0, so no special case is needed there.
+    partitionIdx += constExpr.getValue() * accumFactor;
+    accumFactor *= factor;
+    dim += 1;
+  }
+  return partitionIdx;
+}
+
+/// Collect every AffineLoadOp and AffineStoreOp found directly in `block` into
+/// `info`, keyed by the ArrayOp that defines the accessed memref, and annotate
+/// each access with a "partition_index" attribute computed by
+/// getPartitionIdx(). The memref of every access must be defined by an ArrayOp
+/// (enforced by cast<>).
+void HLSCppEstimator::getMemInfo(Block &block, MemInfo &info) {
  for (auto &op : block) {
-    if (auto child = dyn_cast<AffineForOp>(op))
-      setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
-    opScheduleMap[&op] = opSchedule;
+    if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
+      auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
+      info.memLoadDict[arrayOp].push_back(loadOp);
+      setAttrValue(loadOp, "partition_index",
+                   getPartitionIdx(loadOp.getAffineMap(), arrayOp));
+      // TODO: consider RAW, WAR, WAW dependency for scheduling.
+
+    } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
+      auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
+      // Stores are recorded in the store dictionary so that loads and stores
+      // stay distinguishable for later dependency analysis.
+      info.memStoreDict[arrayOp].push_back(storeOp);
+      setAttrValue(storeOp, "partition_index",
+                   getPartitionIdx(storeOp.getAffineMap(), arrayOp));
+    }
  }
 }

-unsigned HLSCppEstimator::getBlockSchedule(Block &block, bool innerUnroll,
-                                           OpScheduleMap &opScheduleMap) {
-  unsigned blockSchedule = 0;
+/// Find the earliest pipeline stage, starting from `begin`, in which the
+/// load/store `op` can access `arrayOp` without exceeding the memory ports of
+/// the partition it addresses, reserve one port there, and return that stage.
+///
+/// `memPortList[stage][arrayOp]` holds the still-unoccupied ports of every
+/// partition of `arrayOp` in `stage`; missing stages are appended on demand.
+/// The partition is read from the "partition_index" attribute of `op`. When it
+/// is unknown (-1) or does not name an existing partition, the access is
+/// conservatively treated as occupying every port of the array, so it is only
+/// placed in a stage where the array has not been accessed yet.
+unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, ArrayOp arrayOp,
+                                               MemPortList &memPortList,
+                                               unsigned begin) {
+  auto partitionIdx = getIntAttrValue(op, "partition_index");
+
+  // Loads draw from the read-only ports and stores from the write-only ports;
+  // both may fall back to a shared read/write port.
+  const bool isStore = isa<AffineStoreOp>(op);
+
+  // Move on to the next stage, creating it when it does not exist yet. This
+  // may reallocate memPortList, so references into it must be re-fetched.
+  auto advanceStage = [&]() {
+    if (++begin >= memPortList.size())
+      memPortList.push_back(MemPort());
+  };
+
+  // Make sure the starting stage itself exists.
+  while (begin >= memPortList.size())
+    memPortList.push_back(MemPort());
+
+  // Try to avoid memory port violation until a legal schedule is found. Every
+  // newly created stage offers fresh ports, so this loop always terminates.
+  while (true) {
+    auto &partitionPortNum = memPortList[begin][arrayOp];
+
+    // If the memory has not been occupied in the current stage, it is
+    // initialized according to its storage type, with one PortNum structure
+    // per partition.
+    const bool memEmpty = partitionPortNum.empty();
+    if (memEmpty) {
+      unsigned factor = 1;
+      if (getBoolAttrValue(arrayOp, "partition")) {
+        unsigned rank = arrayOp.getType().cast<ShapedType>().getRank();
+        // A zero factor is clamped to one so that at least one partition
+        // always exists.
+        for (unsigned dim = 0; dim < rank; ++dim)
+          factor *= max(getPartitionFactor(arrayOp, dim), 1u);
+      }
+
+      auto storageType = getStrAttrValue(arrayOp, "storage_type");
+      PortNum freshPorts;
+      if (storageType == "ram_s2p")
+        freshPorts = PortNum(/*rdPort=*/1, /*wrPort=*/1, /*rdwrPort=*/0);
+      else if (storageType == "ram_2p" || storageType == "ram_t2p")
+        freshPorts = PortNum(/*rdPort=*/0, /*wrPort=*/0, /*rdwrPort=*/2);
+      else if (storageType == "ram_1p")
+        freshPorts = PortNum(/*rdPort=*/0, /*wrPort=*/0, /*rdwrPort=*/1);
+      else {
+        freshPorts = PortNum(/*rdPort=*/0, /*wrPort=*/0, /*rdwrPort=*/2);
+        arrayOp.emitError("unsupported storage type.");
+      }
+      partitionPortNum.assign(factor, freshPorts);
+    }
+
+    // TODO: When the partition index can't be determined, this operation is
+    // considered to occupy all ports of the array.
+    const bool idxKnown =
+        partitionIdx >= 0 &&
+        static_cast<size_t>(partitionIdx) < partitionPortNum.size();
+    if (!idxKnown) {
+      if (memEmpty) {
+        // Claim every port of every partition in this stage.
+        for (auto &portNum : partitionPortNum)
+          portNum = PortNum();
+        break;
+      }
+      // The array is already accessed in this stage; try the next one.
+      advanceStage();
+      continue;
+    }
+
+    // Check whether the current stage meets the memory port limitation. If
+    // not, the schedule is increased by one stage.
+    auto &portNum = partitionPortNum[partitionIdx];
+    unsigned &dedicatedPort = isStore ? portNum.wrPort : portNum.rdPort;
+    if (dedicatedPort > 0) {
+      dedicatedPort -= 1;
+      break;
+    }
+    if (portNum.rdwrPort > 0) {
+      portNum.rdwrPort -= 1;
+      break;
+    }
+    advanceStage();
+  }
+  return begin;
+}
+
+unsigned HLSCppEstimator::getBlockSchedule(Block &block, MemInfo memInfo) {
+  unsigned blockEnd = 0;
+  MemPortList memPortList;

  for (auto &op : block) {
-    unsigned opSchedule = 0;
-
-    // Add the latest scheduled time among all predecessors.
+    // Find the latest predecessor dominating the current operation. This should
+    // be considered as the earliest stage that the current operation can be
+    // scheduled.
+    unsigned begin = 0;
+    unsigned end = 0;
    for (auto operand : op.getOperands()) {
      if (operand.getKind() != Value::Kind::BlockArgument)
-        opSchedule = max(opSchedule, opScheduleMap[operand.getDefiningOp()]);
+        begin = max(begin,
+                    getUIntAttrValue(operand.getDefiningOp(), "schedule_end"));
    }

-    // Add latency of the current operation.
-    unsigned childSchedule = 0;
-    if (auto child = dyn_cast<AffineForOp>(op)) {
-      if (innerUnroll) {
-        setAttrValue(child, "unroll", true);
-        setAttrValue(child, "flatten", false);
-        childSchedule = getBlockSchedule(child.getRegion().front(),
-                                         /*innerUnroll=*/true, opScheduleMap);
-      } else {
-        // Two extra clock cycles will be required to enter and exit child loop.
-        opSchedule += getUIntAttrValue(child, "latency") + 2;
-        setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
-      }
-    } else {
-      // For now we make a simple assumption tha all standard operations has an
-      // unit latency.
-      // TODO: Support estimation from profiling data.
-      opSchedule += 1;
+    // Insert new pipeline stages.
+    while (begin >= memPortList.size()) {
+      MemPort memPort;
+      memPortList.push_back(memPort);
    }

-    opScheduleMap[&op] = opSchedule;
-    blockSchedule = max({blockSchedule, childSchedule, opSchedule});
-  }
-  return blockSchedule;
-}
-
-void HLSCppEstimator::getPipelineInfo(Block &block, PipelineInfo &info) {
-  for (auto &op : block) {
-    // Handle load operations and RAW dependencies.
+    // Handle load operations, ensure the current schedule meets memory port
+    // limitation.
    if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
-      for (auto prevOp : info.memStoreDict[loadOp.getMemRef()]) {
-        unsigned RAWLatency =
-            info.opScheduleMap[loadOp] - info.opScheduleMap[prevOp];
-        info.II = max(info.II, RAWLatency);
-      }
-      info.memLoadDict[loadOp.getMemRef()].push_back(loadOp);
+      auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
+      begin = getLoadStoreSchedule(loadOp, arrayOp, memPortList, begin);
+      end = begin + 1;
    }
-
-    // Handle Store operations and RAW/WAW dependencies.
+    // Handle store operations.
    else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
-      for (auto prevOp : info.memLoadDict[storeOp.getMemRef()]) {
-        unsigned WARLatency =
-            info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
-        info.II = max(info.II, WARLatency);
-      }
-      for (auto prevOp : info.memStoreDict[storeOp.getMemRef()]) {
-        unsigned WAWLatency =
-            info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
-        info.II = max(info.II, WAWLatency);
-      }
-      info.memStoreDict[storeOp.getMemRef()].push_back(storeOp);
+      auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
+      begin = getLoadStoreSchedule(storeOp, arrayOp, memPortList, begin);
+      end = begin + 1;
+    }
+    // Handle loop operations.
+    else if (auto forOp = dyn_cast<AffineForOp>(op)) {
+      // Child loop is considered as a large node, and two extra clock cycles
+      // will be required to enter and exit the child loop.
+      end = begin + getUIntAttrValue(forOp, "latency") + 2;
+    }
+    // Default case. All normal expressions and operations will be handled by
+    // this branch.
+    else {
+      // TODO: For now, we assume all operations take one clock cycle to
+      // execute, should support to accept profiling data.
+      end = begin + 1;
    }

-    // Recursively handle child loops.
-    else if (auto child = dyn_cast<AffineForOp>(op))
-      getPipelineInfo(child.getRegion().front(), info);
-  }
-}
-
-template <typename OpType>
-void HLSCppEstimator::getAccessNum(OpType op, ArrayOp arrayOp) {
-  InductionInfoList inductionInfoList;
-  SmallVector<AffineExpr, 8> replacements;
-  SmallVector<unsigned, 8> unrollDims;
-  unsigned unrollTripCount = 1;
-
-  // Collect loop information, including induction & unroll information,
-  // and etc. Note that we assume all operands are dims.
-  unsigned operandIdx = 0;
-  for (auto operand : op.getMapOperands()) {
-    if (auto forOp = getForInductionVarOwner(operand)) {
-      auto lowerBound = forOp.getConstantLowerBound();
-      auto upperBound = forOp.getConstantUpperBound();
-      auto step = forOp.getStep();
-      inductionInfoList.push_back(InductionInfo(lowerBound, upperBound, step));
-
-      auto unroll = getBoolAttrValue(forOp, "unroll");
-      auto tripCount = getUIntAttrValue(forOp, "trip_count");
-      if (unroll) {
-        unrollDims.push_back(operandIdx);
-        unrollTripCount *= tripCount;
-      }
-
-      if (unroll)
-        replacements.push_back(getConstExpr(lowerBound));
-      else
-        replacements.push_back(getDimExpr(operandIdx));
-    } else
-      op.emitError("has index constructed by dynamic values.");
-    operandIdx += 1;
-  }
-
-  // Initialize number of accesses for each partition of each array
-  // dimension as zero.
-  AccessNumList accessNumList;
-  for (auto dim : unrollDims) {
-    AccessNum accessNum;
-    if (arrayOp.partition()) {
-      for (unsigned i = 0; i < getPartitionFactor(&arrayOp, dim); ++i)
-        accessNum.push_back(0);
-    } else
-      accessNum.push_back(0);
-    accessNumList.push_back(accessNum);
-  }
-
-  // Trace all possible index to find potential violations regarding
-  // memory ports number. Violations may cause increasement of iteration
-  // latency or initial interval. This will update the accessNumList.
-  for (unsigned i = 0; i < unrollTripCount; ++i) {
-
-    // Calculate number of accesses for each partition of each array dimension.
-    unsigned idx = 0;
-    for (auto dim : unrollDims) {
-      AffineExpr expr = op.getAffineMap().getResult(dim);
-      auto indexExpr = expr.replaceDimsAndSymbols(replacements, {});
-
-      // Calculate which partition is falled in.
-      if (arrayOp.partition()) {
-        auto type = getPartitionType(&arrayOp, dim);
-        auto factor = getPartitionFactor(&arrayOp, dim);
-        if (type == "cyclic")
-          indexExpr = indexExpr % getConstExpr(factor);
-        else if (type == "block") {
-          auto dimSize = arrayOp.getType().cast<ShapedType>().getShape()[dim];
-          indexExpr =
-              indexExpr.floorDiv(getConstExpr((dimSize + factor - 1) / factor));
-        }
-      } else
-        indexExpr = getConstExpr(0);
-
-      // According to partition information.
-      if (auto constExpr = indexExpr.dyn_cast<AffineConstantExpr>()) {
-        auto partitionId = constExpr.getValue();
-        accessNumList[idx][partitionId] += 1;
-      } else {
-      }
-      idx += 1;
-    }
-
-    // Update replacement.
-    unsigned order = 0;
-    for (auto dim : unrollDims) {
-      auto value = replacements[dim].cast<AffineConstantExpr>().getValue();
-
-      // The little-end value will always increase with a stride of
-      // step.
-      if (order == 0)
-        value += inductionInfoList[dim].step;
-
-      // The value of the current dimension should return to lowerBound
-      // if is greater or equal to upperBound.
-      if (value >= inductionInfoList[dim].upperBound) {
-        value = inductionInfoList[dim].lowerBound;
-
-        // Update the value of the next dimension.
-        if (order < unrollDims.size() - 1) {
-          auto nextDim = unrollDims[order + 1];
-          auto nextValue =
-              replacements[nextDim].cast<AffineConstantExpr>().getValue();
-          nextValue += inductionInfoList[nextDim].step;
-          replacements[nextDim] = getConstExpr(nextValue);
-        }
-      }
-
-      // Update the value of the current dimension.
-      replacements[dim] = getConstExpr(value);
-      order += 1;
-    }
-  }
-
-  // update
-  for (auto accessNum : accessNumList) {
-    llvm::outs() << "new dim\n";
-    for (auto num : accessNum) {
-      llvm::outs() << num << "\n";
-    }
+    setAttrValue(&op, "schedule_begin", begin);
+    setAttrValue(&op, "schedule_end", end);
+    blockEnd = max(blockEnd, end);
  }
+  return blockEnd;
 }

 bool HLSCppEstimator::visitOp(AffineForOp op) {
@ -311,95 +333,58 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
  if (body.getBlocks().size() != 1)
    op.emitError("has zero or more than one basic blocks.");

-  // If loop is unrolled, all inner loops will be unrolled accordingly.
-  if (getBoolAttrValue(op, "unroll")) {
-    setAttrValue(op, "pipeline", false);
-    setAttrValue(op, "flatten", false);
-    op.emitRemark("all inner loops are automatically unrolled.");
-
-    OpScheduleMap opScheduleMap;
-    auto latency =
-        getBlockSchedule(body.front(), /*innerUnroll=*/true, opScheduleMap);
-    setAttrValue(op, "latency", latency);
-    return true;
-  }
-
-  // If loop is pipelined, the pipelined loop will be estimated as a whole since
-  // all loops inside of a pipeline will be automatically fully unrolled.
+  // If the current loop is annotated as pipeline, extra dependency and II
+  // analysis will be executed.
  if (getBoolAttrValue(op, "pipeline")) {
-    setAttrValue(op, "flatten", true);
-    op.emitRemark("all inner loops are automatically unrolled.");
+    MemInfo memInfo;
+    getMemInfo(body.front(), memInfo);

    // Calculate latency of each iteration.
-    PipelineInfo pipelineInfo(/*baseII=*/1);
-    auto iterLatency = getBlockSchedule(body.front(), /*innerUnroll=*/true,
-                                        pipelineInfo.opScheduleMap);
+    auto iterLatency = getBlockSchedule(body.front(), memInfo);
    setAttrValue(op, "iter_latency", iterLatency);

    // For now we make a simple assumption that II is equal to 1.
    auto tripCount = getUIntAttrValue(op, "trip_count");
    setAttrValue(op, "flatten_trip_count", tripCount);

-    // Collect pipeline information including II and memory access information.
-    getPipelineInfo(body.front(), pipelineInfo);
-
-    // Calculate latency and II considering memory ports violations.
-    for (auto &memLoad : pipelineInfo.memLoadDict) {
-      auto arrayOp = dyn_cast<ArrayOp>(memLoad.first.getDefiningOp());
-      if (!arrayOp)
-        op.emitError("is accessing an array that is not defined by ArrayOp.");
-
-      for (auto loadOp : memLoad.second) {
-        getAccessNum<AffineLoadOp>(cast<AffineLoadOp>(loadOp), arrayOp);
-      }
-    }
-
-    setAttrValue(op, "init_interval", pipelineInfo.II);
-    setAttrValue(op, "latency",
-                 iterLatency + pipelineInfo.II * (tripCount - 1));
+    setAttrValue(op, "init_interval", (unsigned)1);
+    setAttrValue(op, "latency", iterLatency + 1 * (tripCount - 1));
    return true;
  }

-  // If the loop is not pipelined or unrolled, the estimation is different and
-  // requires to recursively enter each child loop for estimating the overall
-  // latency of the current loop.
+  // Recursively estimate all inner loops.
  estimateBlock(body.front());

  // This simply means the current loop can be flattened into the child loop
  // pipeline. This will increase the flattened loop trip count without
  // changing the iteration latency. Note that this will be propogated above
  // until meeting an imperfect loop.
-  if (getBoolAttrValue(op, "perfect")) {
-    if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
-      if (getBoolAttrValue(child, "flatten")) {
-        setAttrValue(op, "flatten", true);
-        op.emitRemark("this loop is flattened into its child loop.");
+  if (getBoolAttrValue(op, "flatten")) {
+    auto child = cast<AffineForOp>(op.getLoopBody().front().front());
+    op.emitRemark("this loop is flattened into its inner loop.");

-        auto II = getUIntAttrValue(child, "init_interval");
-        auto iterLatency = getUIntAttrValue(child, "iter_latency");
-        auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
-                                getUIntAttrValue(op, "trip_count");
+    auto II = getUIntAttrValue(child, "init_interval");
+    auto iterLatency = getUIntAttrValue(child, "iter_latency");
+    auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
+                            getUIntAttrValue(op, "trip_count");

-        setAttrValue(op, "init_interval", II);
-        setAttrValue(op, "iter_latency", iterLatency);
-        setAttrValue(op, "flatten_trip_count", flattenTripCount);
+    setAttrValue(op, "init_interval", II);
+    setAttrValue(op, "iter_latency", iterLatency);
+    setAttrValue(op, "flatten_trip_count", flattenTripCount);

-        setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
-        return true;
-      }
-    }
+    setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
  }
+  // Default case, aka !pipeline && !flatten.
+  else {
+    MemInfo memInfo;
+    getMemInfo(body.front(), memInfo);

-  // Default case, aka !unroll && !pipeline && !(perfect && child.flatten).
-  setAttrValue(op, "flatten", false);
+    auto iterLatency = getBlockSchedule(body.front(), memInfo);
+    setAttrValue(op, "iter_latency", iterLatency);

-  OpScheduleMap opScheduleMap;
-  auto iterLatency =
-      getBlockSchedule(body.front(), /*innerUnroll=*/false, opScheduleMap);
-  setAttrValue(op, "iter_latency", iterLatency);
-
-  unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
-  setAttrValue(op, "latency", latency);
+    unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
+    setAttrValue(op, "latency", latency);
+  }
  return true;
 }

@ -419,9 +404,10 @@ void HLSCppEstimator::estimateFunc(FuncOp func) {

  estimateBlock(func.front());

-  OpScheduleMap opScheduleMap;
-  auto latency =
-      getBlockSchedule(func.front(), /*innerUnroll=*/false, opScheduleMap);
+  MemInfo memInfo;
+  getMemInfo(func.front(), memInfo);
+
+  auto latency = getBlockSchedule(func.front(), memInfo);
  setAttrValue(func, "latency", latency);
 }

@ -447,6 +433,16 @@ struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
    HLSCppAnalyzer analyzer(builder);
    analyzer.analyzeModule(getOperation());

+    // Canonicalize the analyzed IR.
+    OwningRewritePatternList patterns;
+
+    auto *context = &getContext();
+    for (auto *op : context->getRegisteredOperations())
+      op->getCanonicalizationPatterns(patterns, context);
+
+    Operation *op = getOperation();
+    applyPatternsAndFoldGreedily(op->getRegions(), patterns);
+
    // Estimate performance and resource utilization.
    HLSCppEstimator estimator(builder, targetSpec, opLatency);
    estimator.estimateModule(getOperation());
--- a/test/Analysis/QoREstimation/test_for.mlir
+++ b/test/Analysis/QoREstimation/test_for.mlir
@ -2,14 +2,12 @@

 // CHECK-LABEL: func @test_for
 func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
-  %array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
-  %array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
-  //"hlscpp.array_pragma" (%arg0) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
-  //"hlscpp.array_pragma" (%arg1) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
+  %array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
+  %array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
  affine.for %i = 0 to 16 {
    affine.for %j = 0 to 4 {
-      affine.for %k = 0 to 4 {
-        %0 = affine.load %array0[%i, %j, %i + %k] : memref<16x4x4xindex>
+      affine.for %k = 0 to 4{
+        %0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
        %1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
        %2 = muli %0, %1 : index
        affine.store %2, %array1[%i, %j, %k] : memref<16x4x4xindex>