From 9cd80630e0691c0b61433cc45ad4244e05909954 Mon Sep 17 00:00:00 2001
From: Hanchen Ye <hanchenye@gmail.com>
Date: Sat, 23 Jan 2021 23:25:13 -0600
Subject: [PATCH] [MultipleLevelDSE] support more fine-grained tiling strategy
 searching

---
 include/scalehls/Analysis/QoREstimation.h |   4 +-
 lib/Transforms/MultipleLevelDSE.cpp       | 179 +++++++++++++++-------
 lib/Transforms/PartialAffineLoopTile.cpp  |   2 +-
 3 files changed, 126 insertions(+), 59 deletions(-)
diff --git a/include/scalehls/Analysis/QoREstimation.h b/include/scalehls/Analysis/QoREstimation.h
index c88a3bd..d0170fe 100644
--- a/include/scalehls/Analysis/QoREstimation.h
+++ b/include/scalehls/Analysis/QoREstimation.h
@@ -22,7 +22,7 @@ class HLSCppEstimator
     : public HLSCppVisitorBase<HLSCppEstimator, bool, int64_t>,
       public HLSCppAnalysisBase {
 public:
-  explicit HLSCppEstimator(FuncOp &func, LatencyMap &latencyMap)
+  explicit HLSCppEstimator(FuncOp func, LatencyMap &latencyMap)
       : HLSCppAnalysisBase(OpBuilder(func)), func(func),
         latencyMap(latencyMap) {
     getFuncDependencies();
@@ -133,7 +133,7 @@ public:
   void reverseSchedule();
   void estimateFunc();
 
-  FuncOp &func;
+  FuncOp func;
   DependsMap dependsMap;
   PortsMapDict portsMapDict;
   LatencyMap &latencyMap;
diff --git a/lib/Transforms/MultipleLevelDSE.cpp b/lib/Transforms/MultipleLevelDSE.cpp
index 10aaee8..b2a1b86 100644
--- a/lib/Transforms/MultipleLevelDSE.cpp
+++ b/lib/Transforms/MultipleLevelDSE.cpp
@@ -33,23 +33,13 @@ static int64_t getInnerParallelism(AffineForOp forOp) {
   return std::max(count, (int64_t)1);
 }
 
-/// Clean up all attributes annotated for scheduling in the function for the
-/// convenience of other transforms.
-// static void cleanScheduleAttributes(FuncOp func) {
-//   func.walk([&](Operation *op) {
-//     op->removeAttr("schedule_begin");
-//     op->removeAttr("schedule_end");
-//     op->removeAttr("partition_index");
-//   });
-// }
-
 //===----------------------------------------------------------------------===//
 // Optimizer Class Declaration
 //===----------------------------------------------------------------------===//
 
 class HLSCppOptimizer : public HLSCppAnalysisBase {
 public:
-  explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP)
+  explicit HLSCppOptimizer(FuncOp func, LatencyMap &latencyMap, int64_t numDSP)
       : HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
         numDSP(numDSP) {
     // TODO: only insert affine-related patterns.
@@ -61,14 +51,16 @@ public:
 
   using TileSizes = SmallVector<unsigned, 8>;
 
-  void emitDebugInfo(FuncOp &targetFunc, StringRef message);
-  void applyLoopTilingStrategy(FuncOp &targetFunc,
+  void emitDebugInfo(FuncOp targetFunc, StringRef message);
+  void applyLoopTilingStrategy(FuncOp targetFunc,
                                ArrayRef<TileSizes> tileSizesList);
+  void updateTileSizesAtHead(TileSizes &tileSizes, const TileSizes &tripCounts,
+                             unsigned &head);
 
   /// This is a temporary approach that does not scale.
   void applyMultipleLevelDSE();
 
-  FuncOp &func;
+  FuncOp func;
   LatencyMap &latencyMap;
   int64_t numDSP;
   FrozenRewritePatternList patterns;
@@ -78,7 +70,7 @@ public:
 // Optimizer Class Definition
 //===----------------------------------------------------------------------===//
 
-void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
+void HLSCppOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
   LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
              auto dsp = getIntAttrValue(targetFunc, "dsp");
 
@@ -88,7 +80,7 @@ void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
 }
 
 void HLSCppOptimizer::applyLoopTilingStrategy(
-    FuncOp &targetFunc, ArrayRef<TileSizes> tileSizesList) {
+    FuncOp targetFunc, ArrayRef<TileSizes> tileSizesList) {
   AffineLoopBands targetBands;
   getLoopBands(targetFunc.front(), targetBands);
 
@@ -99,10 +91,8 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
   applyPatternsAndFoldGreedily(targetFunc, patterns);
 
   // Apply loop pipelining.
-  for (auto band : targetBands) {
-    auto pipelineLoop = band[band.size() / 2 - 1];
-    applyLoopPipelining(pipelineLoop, builder);
-  }
+  for (auto &band : targetBands)
+    applyLoopPipelining(band[band.size() / 2 - 1], builder);
   applyPatternsAndFoldGreedily(targetFunc, patterns);
 
   // Apply general optimizations and array partition.
@@ -113,11 +103,45 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
   applyPatternsAndFoldGreedily(targetFunc, patterns);
 
   // Estimate performance and resource utilization.
+  LLVM_DEBUG(llvm::dbgs() << "Current tiling strategy:\n"; idx = 0;
+             for (auto tileSizes
+                  : tileSizesList) {
+               llvm::dbgs() << "Loop band " << Twine(idx++) << ":";
+               for (auto size : tileSizes) {
+                 llvm::dbgs() << " " << Twine(size);
+               }
+               llvm::dbgs() << "\n";
+             });
   HLSCppEstimator(targetFunc, latencyMap).estimateFunc();
   emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general "
                             "optimizations, and array partition.");
 }
 
+/// Update tile sizes by a factor of 2 at the head location.
+void HLSCppOptimizer::updateTileSizesAtHead(TileSizes &tileSizes,
+                                            const TileSizes &tripCounts,
+                                            unsigned &head) {
+  assert(tileSizes.size() == tripCounts.size() &&
+         "unexpected input tile sizes");
+
+  for (unsigned e = tileSizes.size(); head < e; ++head) {
+    auto size = tileSizes[head];
+    auto tripCount = tripCounts[head];
+
+    // At this stage, size must be 1 or a number which is divisible
+    // by tripCount. We need to find the update factor now.
+    if (size < tripCount) {
+      unsigned factor = 2;
+      while (tripCount % (size * factor) != 0)
+        factor++;
+
+      size *= factor;
+      tileSizes[head] = size;
+      break;
+    }
+  }
+}
+
 /// This is a temporary approach that does not scale.
 void HLSCppOptimizer::applyMultipleLevelDSE() {
   HLSCppEstimator(func, latencyMap).estimateFunc();
@@ -280,60 +304,103 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
   //===--------------------------------------------------------------------===//
 
   // Holding trip counts of all loops in each loop band.
-  std::vector<TileSizes> targetTileSizesList;
+  std::vector<TileSizes> tripCountsList;
   // Holding the current tiling sizes of each loop band.
-  std::vector<TileSizes> currentTileSizesList;
+  std::vector<TileSizes> tileSizesList;
   // Holding the current loop tiling location in each loop band.
-  SmallVector<unsigned, 8> headLocationList;
+  SmallVector<unsigned, 8> headLocList;
 
   // Initialize all design vectors.
   for (auto band : targetBands) {
-    TileSizes targetSizes;
-    TileSizes baseSizes;
+    TileSizes tripCounts;
+    TileSizes sizes;
     for (auto loop : band) {
-      targetSizes.push_back(getIntAttrValue(loop, "trip_count"));
-      baseSizes.push_back(1);
+      tripCounts.push_back(getIntAttrValue(loop, "trip_count"));
+      sizes.push_back(1);
     }
-    targetTileSizesList.push_back(targetSizes);
-    currentTileSizesList.push_back(baseSizes);
-    headLocationList.push_back(0);
+    tripCountsList.push_back(tripCounts);
+    tileSizesList.push_back(sizes);
+    headLocList.push_back(0);
   }
 
-  // For recording the minimum latency and best tiling strategy.
-  unsigned minLatency = getIntAttrValue(func, "latency");
-  std::vector<TileSizes> bestTileSizesList;
+  LLVM_DEBUG(llvm::dbgs() << "3. Search for the best tiling strategy.\n";);
+  applyLoopTilingStrategy(func, tileSizesList);
 
   // TODO: more fined grained and comprehensive dse.
-  unsigned tolerantCount = 0;
+  unsigned minLatency = getIntAttrValue(func, "latency");
+  unsigned targetNum = targetBands.size();
   while (true) {
-    // Clone the current function and apply the current tiling strategy.
-    auto tmpFunc = func.clone();
-    applyLoopTilingStrategy(tmpFunc, currentTileSizesList);
+    // If there're more than one loop bands in the function, we'll first try to
+    // update the tiling size of ALL target loop bands with a factor of 2. This
+    // is for reducing the DSE complexity.
+    if (targetNum > 1) {
+      std::vector<TileSizes> newTileSizesList = tileSizesList;
+      SmallVector<unsigned, 8> newHeadLocList = headLocList;
 
-    // If the resource constaints are not met or the latency is not increased,
-    // increase the tolerant counter by 1.
-    auto latency = getIntAttrValue(tmpFunc, "latency");
-    if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) {
-      if (latency < minLatency) {
+      for (unsigned i = 0; i < targetNum; ++i)
+        updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i],
+                              newHeadLocList[i]);
+
+      auto tmpFunc = func.clone();
+      applyLoopTilingStrategy(tmpFunc, newTileSizesList);
+
+      // If the resource constaints are not met or the latency is not increased,
+      // we try more fine grained strategy. Otherwise, we accept the new tile
+      // strategy and head location, and enter the next iteration. We set a
+      // threshold 0.95 here to avoid glitches.
+      // TODO: fine tune the exit condition.
+      auto latency = getIntAttrValue(tmpFunc, "latency");
+      auto dsp = getIntAttrValue(tmpFunc, "dsp");
+
+      if (dsp <= numDSP && latency < minLatency * 0.95) {
+        tileSizesList = newTileSizesList;
+        headLocList = newHeadLocList;
         minLatency = latency;
-        bestTileSizesList = currentTileSizesList;
-        tolerantCount = 0;
-      } else
-        tolerantCount++;
+        continue;
+      }
+    }
 
-      // If the tolerant counter is larger than a threshold, we'll stop to
-      // increase the tiling size.
-      if (tolerantCount > 1)
-        break;
-      // else
-      //   currentTileSize *= 2;
-    } else
+    // Walk through all loop bands in the function and update tiling strategy
+    // one by one.
+    bool hasUpdated = false;
+    for (unsigned i = 0; i < targetNum; ++i) {
+      // TODO: This is not efficient. As our estimation can be conducted in a
+      // more structural way, we should only focus on the current loop rather
+      // than the whole function. But for now this makes sense because we are
+      // only focusing on computation kernel level algorithms that typcially
+      // only have handy loop bands.
+      for (unsigned head = headLocList[i], e = tileSizesList[i].size();
+           head < e; ++head) {
+        // Only update the tiling strategy and head location of the current
+        // loop band.
+        std::vector<TileSizes> newTileSizesList = tileSizesList;
+        updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i], head);
+
+        auto tmpFunc = func.clone();
+        applyLoopTilingStrategy(tmpFunc, newTileSizesList);
+
+        auto latency = getIntAttrValue(tmpFunc, "latency");
+        auto dsp = getIntAttrValue(tmpFunc, "dsp");
+
+        if (dsp <= numDSP && latency < minLatency * 0.95) {
+          tileSizesList = newTileSizesList;
+          headLocList[i] = head;
+          minLatency = latency;
+
+          hasUpdated = true;
+          break;
+        }
+      }
+    }
+
+    // If no loop band is updated, break the searching.
+    if (!hasUpdated)
       break;
   }
 
-  // Finally, apply the best tiling strategy.
-  LLVM_DEBUG(llvm::dbgs() << "Found the best tiling strategy.\n";);
-  applyLoopTilingStrategy(func, bestTileSizesList);
+  // Finally, we found the best tiling strategy.
+  LLVM_DEBUG(llvm::dbgs() << "4. Apply the best tiling strategy.\n";);
+  applyLoopTilingStrategy(func, tileSizesList);
 }
 
 namespace {
diff --git a/lib/Transforms/PartialAffineLoopTile.cpp b/lib/Transforms/PartialAffineLoopTile.cpp
index 4026038..a1132c3 100644
--- a/lib/Transforms/PartialAffineLoopTile.cpp
+++ b/lib/Transforms/PartialAffineLoopTile.cpp
@@ -56,7 +56,7 @@ struct PartialAffineLoopTile
     for (auto *op : func.getContext()->getRegisteredOperations())
       op->getCanonicalizationPatterns(patterns, func.getContext());
 
-    applyPatternsAndFoldGreedily(func.getRegion(), std::move(patterns));
+    applyPatternsAndFoldGreedily(func, std::move(patterns));
   }
 };
 } // namespace