From 1e1cd2f8285de3c4776d64f25e24a4e424e8af7d Mon Sep 17 00:00:00 2001 From: Hanchen Ye Date: Wed, 20 Jan 2021 00:56:40 -0600 Subject: [PATCH] [AffineLoopOrderOpt] complete the impl of this pass; move getLoopBandFromLeaf/Root to analysis utils, update several pass accordingly --- README.md | 18 +--- include/scalehls/Analysis/Utils.h | 3 + lib/Analysis/Utils.cpp | 36 ++++++++ lib/Transforms/AffineLoopOrderOpt.cpp | 109 ++++++++++++++++++++---- lib/Transforms/AffineLoopPerfection.cpp | 23 +++-- lib/Transforms/LoopPipelining.cpp | 43 +++++----- lib/Transforms/MultipleLevelDSE.cpp | 41 +-------- 7 files changed, 170 insertions(+), 103 deletions(-) diff --git a/README.md b/README.md index cf949bb..36072ef 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ $ cd $SCALEHLS_DIR $ # Loop and pragma-level optimizations, performance estimation, and C++ code generation. $ scalehls-opt samples/polybench/syrk.mlir \ -affine-loop-perfection -remove-variable-bound -affine-loop-normalize \ - -partial-affine-loop-tile="tile-level=1 tile-size=4" \ + -affine-loop-order-opt -partial-affine-loop-tile="tile-level=1 tile-size=2" \ -legalize-to-hlscpp="top-func=test_syrk" -loop-pipelining="pipeline-level=1" \ -affine-store-forward -simplify-memref-access -array-partition -cse -canonicalize \ -qor-estimation="target-spec=config/target-spec.ini" \ @@ -47,18 +47,8 @@ $ scalehls-opt samples/polybench/syrk.mlir \ $ # Benchmark generation, dataflow-level optimization, HLSKernel lowering and bufferization. $ benchmark-gen -type "cnn" -config "config/cnn-config.ini" -number 1 \ - | scalehls-opt -legalize-dataflow -split-function \ + | scalehls-opt -legalize-dataflow="min-gran=2 insert-copy=true" -split-function \ -hlskernel-bufferize -hlskernel-to-affine -func-bufferize -canonicalize - -$ # Put them together. -$ benchmark-gen -type "cnn" -config "config/cnn-config.ini" -number 1 \ - | scalehls-opt -legalize-dataflow -split-function \ - -hlskernel-bufferize -hlskernel-to-affine -func-bufferize \ - -affine-loop-perfection -affine-loop-normalize \ - -legalize-to-hlscpp="top-func=auto_gen_cnn" \ - -affine-store-forward -simplify-memref-access -cse -canonicalize \ - -qor-estimation="target-spec=config/target-spec.ini" \ - | scalehls-translate -emit-hlscpp ``` ## Integration with ONNX-MLIR @@ -83,9 +73,9 @@ $ dot -Tpng resnet18.gv > resnet18.png $ # Legalize the output of ONNX-MLIR, optimize and emit C++ code. $ scalehls-opt resnet18.mlir -legalize-onnx -affine-loop-normalize -canonicalize \ - -legalize-dataflow="min-gran=2 insert-copy=false" -split-function \ + -legalize-dataflow="min-gran=3 insert-copy=true" -split-function \ -convert-linalg-to-affine-loops -affine-loop-fusion \ - -legalize-to-hlscpp="top-func=main_graph" \ + -legalize-to-hlscpp="top-func=main_graph" -loop-pipelining -canonicalize \ | scalehls-translate -emit-hlscpp ``` diff --git a/include/scalehls/Analysis/Utils.h b/include/scalehls/Analysis/Utils.h index 054e37b..7d7ad61 100644 --- a/include/scalehls/Analysis/Utils.h +++ b/include/scalehls/Analysis/Utils.h @@ -95,6 +95,9 @@ int64_t getPartitionFactors(MemRefType memrefType, /// contained by the input operation. unsigned getChildLoopNum(Operation *op); +AffineForOp getLoopBandFromRoot(AffineForOp forOp, AffineLoopBand &band); +AffineForOp getLoopBandFromLeaf(AffineForOp forOp, AffineLoopBand &band); + } // namespace scalehls } // namespace mlir diff --git a/lib/Analysis/Utils.cpp b/lib/Analysis/Utils.cpp index 93498eb..a9a60c2 100644 --- a/lib/Analysis/Utils.cpp +++ b/lib/Analysis/Utils.cpp @@ -161,3 +161,39 @@ unsigned scalehls::getChildLoopNum(Operation *op) { return childNum; } + +AffineForOp scalehls::getLoopBandFromRoot(AffineForOp forOp, + AffineLoopBand &band) { + auto currentLoop = forOp; + while (true) { + band.push_back(currentLoop); + + if (getChildLoopNum(currentLoop) == 1) + currentLoop = *currentLoop.getOps().begin(); + else + break; + } + return band.back(); +} + +AffineForOp scalehls::getLoopBandFromLeaf(AffineForOp forOp, + AffineLoopBand &band) { + AffineLoopBand reverseBand; + + auto currentLoop = forOp; + while (true) { + reverseBand.push_back(currentLoop); + + auto parentLoop = currentLoop->getParentOfType(); + if (!parentLoop) + break; + + if (getChildLoopNum(parentLoop) == 1) + currentLoop = parentLoop; + else + break; + } + + band.append(reverseBand.rbegin(), reverseBand.rend()); + return band.front(); +} \ No newline at end of file diff --git a/lib/Transforms/AffineLoopOrderOpt.cpp b/lib/Transforms/AffineLoopOrderOpt.cpp index 91748dc..2282410 100644 --- a/lib/Transforms/AffineLoopOrderOpt.cpp +++ b/lib/Transforms/AffineLoopOrderOpt.cpp @@ -6,6 +6,7 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/Utils.h" +#include "mlir/Transforms/LoopUtils.h" #include "scalehls/Analysis/Utils.h" #include "scalehls/Transforms/Passes.h" @@ -14,24 +15,44 @@ using namespace scalehls; namespace { struct AffineLoopOrderOpt : public AffineLoopOrderOptBase { - void runOnOperation() override {} + void runOnOperation() override { + auto func = getOperation(); + auto builder = OpBuilder(func); + + // Collect all target loop bands. + AffineLoopBands targetBands; + func.walk([&](AffineForOp loop) { + if (getChildLoopNum(loop) == 0) { + AffineLoopBand band; + getLoopBandFromLeaf(loop, band); + targetBands.push_back(band); + } + }); + + // Apply loop order optimization to each loop band. + for (auto band : targetBands) + applyAffineLoopOrderOpt(band, builder); + } }; } // namespace bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band, OpBuilder &builder) { auto &loopBlock = band.back().getLoopBody().front(); - auto depth = band.size(); + auto bandDepth = band.size(); // Collect all load and store operations for each memory in the loop block, // and calculate the number of common surrouding loops for later uses. - MemAccessesMap map; - getMemAccessesMap(loopBlock, map); + MemAccessesMap loadStoresMap; + getMemAccessesMap(loopBlock, loadStoresMap); auto commonLoopDepth = getNumCommonSurroundingLoops( *loopBlock.begin(), *std::next(loopBlock.begin())); + // A map of dependency distances indexed by the loop in the band. + llvm::SmallDenseMap distanceMap; + // Traverse all memories in the loop block. - for (auto pair : map) { + for (auto pair : loadStoresMap) { auto loadStores = pair.second; // Find all dependencies associated to the current memory. @@ -44,26 +65,84 @@ bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band, FlatAffineConstraints depConstrs; SmallVector depComps; - for (unsigned loopDepth = commonLoopDepth - depth + 1; - loopDepth <= commonLoopDepth + 1; ++loopDepth) { + // Only the loops in the loop band will be checked. + for (unsigned depth = commonLoopDepth - bandDepth + 1; + depth <= commonLoopDepth + 1; ++depth) { + DependenceResult result = checkMemrefAccessDependence( - srcAccess, dstAccess, loopDepth, &depConstrs, &depComps, + srcAccess, dstAccess, depth, &depConstrs, &depComps, /*allowRAR=*/false); if (hasDependence(result)) { - // llvm::outs() << "\n----------\n"; - // llvm::outs() << *srcOp << " -> " << *dstOp << "\n"; - // llvm::outs() << "depth: " << loopDepth << ", distance: "; - // for (auto dep : depComps) - // llvm::outs() << "(" << dep.lb.getValue() << "," - // << dep.ub.getValue() << "), "; - // llvm::outs() << "\n"; + auto depComp = depComps[depth - 1]; + + auto targetLoop = depComp.op; + unsigned minPosDistance = + std::max(depComp.lb.getValue(), (int64_t)1); + + // Only positive distance will be considered, keep the minimum + // distance in the distance map. + if (depComp.ub.getValue() > 0) { + if (distanceMap.count(targetLoop)) { + auto currentDistance = distanceMap[targetLoop]; + distanceMap[targetLoop] = + std::min(currentDistance, minPosDistance); + } else + distanceMap[targetLoop] = minPosDistance; + } } } } dstIndex++; } } + + // Permute the target loops one by one. + for (unsigned i = 0, e = distanceMap.size(); i < e; ++i) { + // Find the loop with the smallest dependency distance. The rationale is + // small dependency distance tends to increase the achievable II when + // applying loop pipelining. + Operation *targetLoop = nullptr; + unsigned count = 0; + for (auto pair : distanceMap) { + if (count == 0) + targetLoop = pair.first; + else if (pair.second < distanceMap[targetLoop]) + targetLoop = pair.first; + count++; + } + + // Remove the target loop from the distance map as it will be handled in + // this iteration. + distanceMap.erase(targetLoop); + + // Find the current location of the target loop in the loop band. + unsigned targetLoopLoc = + std::find(band.begin(), band.end(), targetLoop) - band.begin(); + + // Permute the target loop to an as outer as possible position. + for (unsigned dstLoc = 0; dstLoc < targetLoopLoc; ++dstLoc) { + SmallVector permMap; + + // Construct permutation map. + for (unsigned loc = 0; loc < bandDepth; ++loc) { + if (loc < dstLoc) + permMap.push_back(loc); + else if (loc < targetLoopLoc) + permMap.push_back(loc + 1); + else if (loc == targetLoopLoc) + permMap.push_back(dstLoc); + else + permMap.push_back(loc); + } + + // Check the validation of the current permutation. + if (isValidLoopInterchangePermutation(band, permMap)) { + permuteLoops(band, permMap); + break; + } + } + } return true; } diff --git a/lib/Transforms/AffineLoopPerfection.cpp b/lib/Transforms/AffineLoopPerfection.cpp index 154c16f..688b366 100644 --- a/lib/Transforms/AffineLoopPerfection.cpp +++ b/lib/Transforms/AffineLoopPerfection.cpp @@ -18,20 +18,17 @@ struct AffineLoopPerfection auto func = getOperation(); auto builder = OpBuilder(func); - // Walk through all loops. - for (auto forOp : func.getOps()) { - // Collect all loops that: (1) is the innermost loop (contains zero child - // loop nest); or (2) contains more than one child loop nest. - SmallVector targetLoops; - forOp.walk([&](AffineForOp loop) { - if (getChildLoopNum(loop) != 1) - targetLoops.push_back(loop); - }); + // Collect all loops that: (1) is the innermost loop (contains zero child + // loop nest); or (2) contains more than one child loop nest. + SmallVector targetLoops; + func.walk([&](AffineForOp loop) { + if (getChildLoopNum(loop) != 1) + targetLoops.push_back(loop); + }); - // Apply loop perfection to each target loop. - for (auto loop : targetLoops) - applyAffineLoopPerfection(loop, builder); - } + // Apply loop perfection to each target loop. + for (auto loop : targetLoops) + applyAffineLoopPerfection(loop, builder); } }; } // namespace diff --git a/lib/Transforms/LoopPipelining.cpp b/lib/Transforms/LoopPipelining.cpp index 400ca5e..1ed43fa 100644 --- a/lib/Transforms/LoopPipelining.cpp +++ b/lib/Transforms/LoopPipelining.cpp @@ -18,32 +18,29 @@ struct LoopPipelining : public LoopPipeliningBase { auto func = getOperation(); auto builder = OpBuilder(func); - // Walk through all loops. - for (auto forOp : func.getOps()) { - // Collect all innermost loops. - SmallVector innermostLoops; - forOp.walk([&](AffineForOp loop) { - if (getChildLoopNum(loop) == 0) - innermostLoops.push_back(loop); - }); + // Collect all innermost loops. + SmallVector innermostLoops; + func.walk([&](AffineForOp loop) { + if (getChildLoopNum(loop) == 0) + innermostLoops.push_back(loop); + }); - // Apply loop pipelining to coresponding level of each innermost loop. - for (auto loop : innermostLoops) { - auto currentLoop = loop; - unsigned loopLevel = 0; - while (true) { - auto parentLoop = currentLoop->getParentOfType(); + // Apply loop pipelining to coresponding level of each innermost loop. + for (auto loop : innermostLoops) { + auto currentLoop = loop; + unsigned loopLevel = 0; + while (true) { + auto parentLoop = currentLoop->getParentOfType(); - // If meet the outermost loop, pipeline the current loop. - if (!parentLoop || pipelineLevel == loopLevel) { - applyLoopPipelining(currentLoop, builder); - break; - } - - // Move to the next loop level. - currentLoop = parentLoop; - ++loopLevel; + // If meet the outermost loop, pipeline the current loop. + if (!parentLoop || pipelineLevel == loopLevel) { + applyLoopPipelining(currentLoop, builder); + break; } + + // Move to the next loop level. + currentLoop = parentLoop; + ++loopLevel; } } diff --git a/lib/Transforms/MultipleLevelDSE.cpp b/lib/Transforms/MultipleLevelDSE.cpp index 6b7c255..9774a0a 100644 --- a/lib/Transforms/MultipleLevelDSE.cpp +++ b/lib/Transforms/MultipleLevelDSE.cpp @@ -15,42 +15,6 @@ using namespace scalehls; // Helper methods //===----------------------------------------------------------------------===// -static AffineForOp getLoopBandFromRoot(AffineForOp forOp, - AffineLoopBand &band) { - auto currentLoop = forOp; - while (true) { - band.push_back(currentLoop); - - if (getChildLoopNum(currentLoop) == 1) - currentLoop = *currentLoop.getOps().begin(); - else - break; - } - return band.back(); -} - -static AffineForOp getLoopBandFromLeaf(AffineForOp forOp, - AffineLoopBand &band) { - AffineLoopBand reverseBand; - - auto currentLoop = forOp; - while (true) { - reverseBand.push_back(currentLoop); - - auto parentLoop = currentLoop->getParentOfType(); - if (!parentLoop) - break; - - if (getChildLoopNum(parentLoop) == 1) - currentLoop = parentLoop; - else - break; - } - - band.append(reverseBand.rbegin(), reverseBand.rend()); - return band.front(); -} - static int64_t getInnerParallelism(AffineForOp forOp) { int64_t count = 0; for (auto loop : forOp.getOps()) { @@ -137,6 +101,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() { while (!targetLoops.empty()) { SmallVector candidateLoops; + llvm::SmallDenseMap parallelismMap; // Collect all candidate loops. Here, only loops whose innermost loop has // more than one inner loops will be considered as a candidate. @@ -147,7 +112,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() { // Calculate the overall introduced parallelism if the innermost loop of // the current loop band is pipelined. auto parallelism = getInnerParallelism(innermostLoop); - setAttrValue(innermostLoop, "inner_parallelism", parallelism); + parallelismMap[innermostLoop] = parallelism; // Collect all candidate loops into an ordered vector. The loop indicating // the largest parallelism will show in the front. @@ -156,7 +121,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() { candidateLoops.push_back(innermostLoop); else for (auto &candidate : candidateLoops) { - if (parallelism > getIntAttrValue(candidate, "inner_parallelism")) { + if (parallelism > parallelismMap[candidate]) { candidateLoops.insert(&candidate, innermostLoop); break; }