From 9cd80630e0691c0b61433cc45ad4244e05909954 Mon Sep 17 00:00:00 2001 From: Hanchen Ye Date: Sat, 23 Jan 2021 23:25:13 -0600 Subject: [PATCH] [MultipleLevelDSE] support more fine-grained tiling strategy searching --- include/scalehls/Analysis/QoREstimation.h | 4 +- lib/Transforms/MultipleLevelDSE.cpp | 179 +++++++++++++++------- lib/Transforms/PartialAffineLoopTile.cpp | 2 +- 3 files changed, 126 insertions(+), 59 deletions(-) diff --git a/include/scalehls/Analysis/QoREstimation.h b/include/scalehls/Analysis/QoREstimation.h index c88a3bd..d0170fe 100644 --- a/include/scalehls/Analysis/QoREstimation.h +++ b/include/scalehls/Analysis/QoREstimation.h @@ -22,7 +22,7 @@ class HLSCppEstimator : public HLSCppVisitorBase, public HLSCppAnalysisBase { public: - explicit HLSCppEstimator(FuncOp &func, LatencyMap &latencyMap) + explicit HLSCppEstimator(FuncOp func, LatencyMap &latencyMap) : HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap) { getFuncDependencies(); @@ -133,7 +133,7 @@ public: void reverseSchedule(); void estimateFunc(); - FuncOp &func; + FuncOp func; DependsMap dependsMap; PortsMapDict portsMapDict; LatencyMap &latencyMap; diff --git a/lib/Transforms/MultipleLevelDSE.cpp b/lib/Transforms/MultipleLevelDSE.cpp index 10aaee8..b2a1b86 100644 --- a/lib/Transforms/MultipleLevelDSE.cpp +++ b/lib/Transforms/MultipleLevelDSE.cpp @@ -33,23 +33,13 @@ static int64_t getInnerParallelism(AffineForOp forOp) { return std::max(count, (int64_t)1); } -/// Clean up all attributes annotated for scheduling in the function for the -/// convenience of other transforms. 
-// static void cleanScheduleAttributes(FuncOp func) { -// func.walk([&](Operation *op) { -// op->removeAttr("schedule_begin"); -// op->removeAttr("schedule_end"); -// op->removeAttr("partition_index"); -// }); -// } - //===----------------------------------------------------------------------===// // Optimizer Class Declaration //===----------------------------------------------------------------------===// class HLSCppOptimizer : public HLSCppAnalysisBase { public: - explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP) + explicit HLSCppOptimizer(FuncOp func, LatencyMap &latencyMap, int64_t numDSP) : HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap), numDSP(numDSP) { // TODO: only insert affine-related patterns. @@ -61,14 +51,16 @@ public: using TileSizes = SmallVector; - void emitDebugInfo(FuncOp &targetFunc, StringRef message); - void applyLoopTilingStrategy(FuncOp &targetFunc, + void emitDebugInfo(FuncOp targetFunc, StringRef message); + void applyLoopTilingStrategy(FuncOp targetFunc, ArrayRef tileSizesList); + void updateTileSizesAtHead(TileSizes &tileSizes, const TileSizes &tripCounts, + unsigned &head); /// This is a temporary approach that does not scale. 
void applyMultipleLevelDSE(); - FuncOp &func; + FuncOp func; LatencyMap &latencyMap; int64_t numDSP; FrozenRewritePatternList patterns; @@ -78,7 +70,7 @@ public: // Optimizer Class Definition //===----------------------------------------------------------------------===// -void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) { +void HLSCppOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) { LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency"); auto dsp = getIntAttrValue(targetFunc, "dsp"); @@ -88,7 +80,7 @@ void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) { } void HLSCppOptimizer::applyLoopTilingStrategy( - FuncOp &targetFunc, ArrayRef tileSizesList) { + FuncOp targetFunc, ArrayRef tileSizesList) { AffineLoopBands targetBands; getLoopBands(targetFunc.front(), targetBands); @@ -99,10 +91,8 @@ void HLSCppOptimizer::applyLoopTilingStrategy( applyPatternsAndFoldGreedily(targetFunc, patterns); // Apply loop pipelining. - for (auto band : targetBands) { - auto pipelineLoop = band[band.size() / 2 - 1]; - applyLoopPipelining(pipelineLoop, builder); - } + for (auto &band : targetBands) + applyLoopPipelining(band[band.size() / 2 - 1], builder); applyPatternsAndFoldGreedily(targetFunc, patterns); // Apply general optimizations and array partition. @@ -113,11 +103,45 @@ void HLSCppOptimizer::applyLoopTilingStrategy( applyPatternsAndFoldGreedily(targetFunc, patterns); // Estimate performance and resource utilization. 
+ LLVM_DEBUG(llvm::dbgs() << "Current tiling strategy:\n"; idx = 0; + for (auto tileSizes + : tileSizesList) { + llvm::dbgs() << "Loop band " << Twine(idx++) << ":"; + for (auto size : tileSizes) { + llvm::dbgs() << " " << Twine(size); + } + llvm::dbgs() << "\n"; + }); HLSCppEstimator(targetFunc, latencyMap).estimateFunc(); emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general " "optimizations, and array partition."); } +/// Update the tile size at the head location by the smallest factor (>= 2) that keeps it a divisor of the trip count. +void HLSCppOptimizer::updateTileSizesAtHead(TileSizes &tileSizes, + const TileSizes &tripCounts, + unsigned &head) { + assert(tileSizes.size() == tripCounts.size() && + "unexpected input tile sizes"); + + for (unsigned e = tileSizes.size(); head < e; ++head) { + auto size = tileSizes[head]; + auto tripCount = tripCounts[head]; + + // At this stage, size must be 1 or a number which is a divisor of + // tripCount. We need to find the update factor now. + if (size < tripCount) { + unsigned factor = 2; + while (tripCount % (size * factor) != 0) + factor++; + + size *= factor; + tileSizes[head] = size; + break; + } + } +} + /// This is a temporary approach that does not scale. void HLSCppOptimizer::applyMultipleLevelDSE() { HLSCppEstimator(func, latencyMap).estimateFunc(); @@ -280,60 +304,103 @@ void HLSCppOptimizer::applyMultipleLevelDSE() { //===--------------------------------------------------------------------===// // Holding trip counts of all loops in each loop band. - std::vector targetTileSizesList; + std::vector tripCountsList; // Holding the current tiling sizes of each loop band. - std::vector currentTileSizesList; + std::vector tileSizesList; // Holding the current loop tiling location in each loop band. - SmallVector headLocationList; + SmallVector headLocList; // Initialize all design vectors. 
for (auto band : targetBands) { - TileSizes targetSizes; - TileSizes baseSizes; + TileSizes tripCounts; + TileSizes sizes; for (auto loop : band) { - targetSizes.push_back(getIntAttrValue(loop, "trip_count")); - baseSizes.push_back(1); + tripCounts.push_back(getIntAttrValue(loop, "trip_count")); + sizes.push_back(1); } - targetTileSizesList.push_back(targetSizes); - currentTileSizesList.push_back(baseSizes); - headLocationList.push_back(0); + tripCountsList.push_back(tripCounts); + tileSizesList.push_back(sizes); + headLocList.push_back(0); } - // For recording the minimum latency and best tiling strategy. - unsigned minLatency = getIntAttrValue(func, "latency"); - std::vector bestTileSizesList; + LLVM_DEBUG(llvm::dbgs() << "3. Search for the best tiling strategy.\n";); + applyLoopTilingStrategy(func, tileSizesList); // TODO: more fined grained and comprehensive dse. - unsigned tolerantCount = 0; + unsigned minLatency = getIntAttrValue(func, "latency"); + unsigned targetNum = targetBands.size(); while (true) { - // Clone the current function and apply the current tiling strategy. - auto tmpFunc = func.clone(); - applyLoopTilingStrategy(tmpFunc, currentTileSizesList); + // If there're more than one loop bands in the function, we'll first try to + // update the tiling size of ALL target loop bands with a factor of 2. This + // is for reducing the DSE complexity. + if (targetNum > 1) { + std::vector newTileSizesList = tileSizesList; + SmallVector newHeadLocList = headLocList; - // If the resource constaints are not met or the latency is not increased, - // increase the tolerant counter by 1. 
- auto latency = getIntAttrValue(tmpFunc, "latency"); - if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) { - if (latency < minLatency) { + for (unsigned i = 0; i < targetNum; ++i) + updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i], + newHeadLocList[i]); + + auto tmpFunc = func.clone(); + applyLoopTilingStrategy(tmpFunc, newTileSizesList); + + // If the resource constraints are not met or the latency is not reduced, + // we try a more fine-grained strategy. Otherwise, we accept the new tile + // strategy and head location, and enter the next iteration. We set a + // threshold 0.95 here to avoid glitches. + // TODO: fine tune the exit condition. + auto latency = getIntAttrValue(tmpFunc, "latency"); + auto dsp = getIntAttrValue(tmpFunc, "dsp"); + + if (dsp <= numDSP && latency < minLatency * 0.95) { + tileSizesList = newTileSizesList; + headLocList = newHeadLocList; minLatency = latency; - bestTileSizesList = currentTileSizesList; - tolerantCount = 0; - } else - tolerantCount++; + continue; + } + } - // If the tolerant counter is larger than a threshold, we'll stop to - // increase the tiling size. - if (tolerantCount > 1) - break; - // else - // currentTileSize *= 2; - } else + // Walk through all loop bands in the function and update tiling strategy + // one by one. + bool hasUpdated = false; + for (unsigned i = 0; i < targetNum; ++i) { + // TODO: This is not efficient. As our estimation can be conducted in a + // more structural way, we should only focus on the current loop rather + // than the whole function. But for now this makes sense because we are + // only focusing on computation kernel level algorithms that typically + // only have a handful of loop bands. + for (unsigned head = headLocList[i], e = tileSizesList[i].size(); + head < e; ++head) { + // Only update the tiling strategy and head location of the current + // loop band. 
+ std::vector newTileSizesList = tileSizesList; + updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i], head); + + auto tmpFunc = func.clone(); + applyLoopTilingStrategy(tmpFunc, newTileSizesList); + + auto latency = getIntAttrValue(tmpFunc, "latency"); + auto dsp = getIntAttrValue(tmpFunc, "dsp"); + + if (dsp <= numDSP && latency < minLatency * 0.95) { + tileSizesList = newTileSizesList; + headLocList[i] = head; + minLatency = latency; + + hasUpdated = true; + break; + } + } + } + + // If no loop band is updated, break the searching. + if (!hasUpdated) break; } - // Finally, apply the best tiling strategy. - LLVM_DEBUG(llvm::dbgs() << "Found the best tiling strategy.\n";); - applyLoopTilingStrategy(func, bestTileSizesList); + // Finally, we found the best tiling strategy. + LLVM_DEBUG(llvm::dbgs() << "4. Apply the best tiling strategy.\n";); + applyLoopTilingStrategy(func, tileSizesList); } namespace { diff --git a/lib/Transforms/PartialAffineLoopTile.cpp b/lib/Transforms/PartialAffineLoopTile.cpp index 4026038..a1132c3 100644 --- a/lib/Transforms/PartialAffineLoopTile.cpp +++ b/lib/Transforms/PartialAffineLoopTile.cpp @@ -56,7 +56,7 @@ struct PartialAffineLoopTile for (auto *op : func.getContext()->getRegisteredOperations()) op->getCanonicalizationPatterns(patterns, func.getContext()); - applyPatternsAndFoldGreedily(func.getRegion(), std::move(patterns)); + applyPatternsAndFoldGreedily(func, std::move(patterns)); } }; } // namespace