[AffineLoopOrderOpt] Complete the implementation of this pass; move getLoopBandFromLeaf/Root to the analysis utils and update several passes accordingly

This commit is contained in:
Hanchen Ye 2021-01-20 00:56:40 -06:00
parent d3d13e0bd0
commit 1e1cd2f828
7 changed files with 170 additions and 103 deletions

View File

@ -39,7 +39,7 @@ $ cd $SCALEHLS_DIR
$ # Loop and pragma-level optimizations, performance estimation, and C++ code generation.
$ scalehls-opt samples/polybench/syrk.mlir \
-affine-loop-perfection -remove-variable-bound -affine-loop-normalize \
-partial-affine-loop-tile="tile-level=1 tile-size=4" \
-affine-loop-order-opt -partial-affine-loop-tile="tile-level=1 tile-size=2" \
-legalize-to-hlscpp="top-func=test_syrk" -loop-pipelining="pipeline-level=1" \
-affine-store-forward -simplify-memref-access -array-partition -cse -canonicalize \
-qor-estimation="target-spec=config/target-spec.ini" \
@ -47,18 +47,8 @@ $ scalehls-opt samples/polybench/syrk.mlir \
$ # Benchmark generation, dataflow-level optimization, HLSKernel lowering and bufferization.
$ benchmark-gen -type "cnn" -config "config/cnn-config.ini" -number 1 \
| scalehls-opt -legalize-dataflow -split-function \
| scalehls-opt -legalize-dataflow="min-gran=2 insert-copy=true" -split-function \
-hlskernel-bufferize -hlskernel-to-affine -func-bufferize -canonicalize
$ # Put them together.
$ benchmark-gen -type "cnn" -config "config/cnn-config.ini" -number 1 \
| scalehls-opt -legalize-dataflow -split-function \
-hlskernel-bufferize -hlskernel-to-affine -func-bufferize \
-affine-loop-perfection -affine-loop-normalize \
-legalize-to-hlscpp="top-func=auto_gen_cnn" \
-affine-store-forward -simplify-memref-access -cse -canonicalize \
-qor-estimation="target-spec=config/target-spec.ini" \
| scalehls-translate -emit-hlscpp
```
## Integration with ONNX-MLIR
@ -83,9 +73,9 @@ $ dot -Tpng resnet18.gv > resnet18.png
$ # Legalize the output of ONNX-MLIR, optimize and emit C++ code.
$ scalehls-opt resnet18.mlir -legalize-onnx -affine-loop-normalize -canonicalize \
-legalize-dataflow="min-gran=2 insert-copy=false" -split-function \
-legalize-dataflow="min-gran=3 insert-copy=true" -split-function \
-convert-linalg-to-affine-loops -affine-loop-fusion \
-legalize-to-hlscpp="top-func=main_graph" \
-legalize-to-hlscpp="top-func=main_graph" -loop-pipelining -canonicalize \
| scalehls-translate -emit-hlscpp
```

View File

@ -95,6 +95,9 @@ int64_t getPartitionFactors(MemRefType memrefType,
/// contained by the input operation.
unsigned getChildLoopNum(Operation *op);
AffineForOp getLoopBandFromRoot(AffineForOp forOp, AffineLoopBand &band);
AffineForOp getLoopBandFromLeaf(AffineForOp forOp, AffineLoopBand &band);
} // namespace scalehls
} // namespace mlir

View File

@ -161,3 +161,39 @@ unsigned scalehls::getChildLoopNum(Operation *op) {
return childNum;
}
/// Collect a loop band by descending from `forOp`.
///
/// `forOp` is appended to `band` first; then, as long as the most recently
/// appended loop has exactly one child loop according to getChildLoopNum(),
/// the first AffineForOp found in its body is appended as well.
///
/// \param forOp the loop at which the descent starts.
/// \param band  output container; loops are appended outermost-first.
/// \returns the last loop appended to `band` (the innermost loop reached).
AffineForOp scalehls::getLoopBandFromRoot(AffineForOp forOp,
                                          AffineLoopBand &band) {
  band.push_back(forOp);

  while (getChildLoopNum(band.back()) == 1) {
    // Copy the child handle before pushing so that no reference into `band`
    // is held across the insertion.
    auto childLoop = *band.back().getOps<AffineForOp>().begin();
    band.push_back(childLoop);
  }

  return band.back();
}
/// Collect a loop band by ascending from `forOp`.
///
/// Starting at `forOp`, the enclosing AffineForOp is visited repeatedly. The
/// ascent stops when there is no enclosing AffineForOp, or when the enclosing
/// loop does not have exactly one child loop according to getChildLoopNum().
/// The visited loops are appended to `band` outermost-first.
///
/// \param forOp the loop at which the ascent starts.
/// \param band  output container; the discovered loops are appended to it.
/// \returns the outermost loop of the band discovered by this call.
AffineForOp scalehls::getLoopBandFromLeaf(AffineForOp forOp,
                                          AffineLoopBand &band) {
  // Loops are recorded innermost-first while walking outwards.
  AffineLoopBand reverseBand;
  auto currentLoop = forOp;
  while (true) {
    reverseBand.push_back(currentLoop);

    auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
    if (!parentLoop || getChildLoopNum(parentLoop) != 1)
      break;
    currentLoop = parentLoop;
  }

  band.append(reverseBand.rbegin(), reverseBand.rend());

  // Return the root of the band found here. Using band.front() would yield an
  // unrelated loop if the caller passed a `band` that already held entries.
  return reverseBand.back();
}

View File

@ -6,6 +6,7 @@
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Transforms/LoopUtils.h"
#include "scalehls/Analysis/Utils.h"
#include "scalehls/Transforms/Passes.h"
@ -14,24 +15,44 @@ using namespace scalehls;
namespace {
struct AffineLoopOrderOpt : public AffineLoopOrderOptBase<AffineLoopOrderOpt> {
void runOnOperation() override {}
void runOnOperation() override {
auto func = getOperation();
auto builder = OpBuilder(func);
// Collect all target loop bands.
AffineLoopBands targetBands;
func.walk([&](AffineForOp loop) {
if (getChildLoopNum(loop) == 0) {
AffineLoopBand band;
getLoopBandFromLeaf(loop, band);
targetBands.push_back(band);
}
});
// Apply loop order optimization to each loop band.
for (auto band : targetBands)
applyAffineLoopOrderOpt(band, builder);
}
};
} // namespace
bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band,
OpBuilder &builder) {
auto &loopBlock = band.back().getLoopBody().front();
auto depth = band.size();
auto bandDepth = band.size();
// Collect all load and store operations for each memory in the loop block,
// and calculate the number of common surrounding loops for later use.
MemAccessesMap map;
getMemAccessesMap(loopBlock, map);
MemAccessesMap loadStoresMap;
getMemAccessesMap(loopBlock, loadStoresMap);
auto commonLoopDepth = getNumCommonSurroundingLoops(
*loopBlock.begin(), *std::next(loopBlock.begin()));
// A map of dependency distances indexed by the loop in the band.
llvm::SmallDenseMap<Operation *, unsigned, 4> distanceMap;
// Traverse all memories in the loop block.
for (auto pair : map) {
for (auto pair : loadStoresMap) {
auto loadStores = pair.second;
// Find all dependencies associated to the current memory.
@ -44,26 +65,84 @@ bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band,
FlatAffineConstraints depConstrs;
SmallVector<DependenceComponent, 2> depComps;
for (unsigned loopDepth = commonLoopDepth - depth + 1;
loopDepth <= commonLoopDepth + 1; ++loopDepth) {
// Only the loops in the loop band will be checked.
for (unsigned depth = commonLoopDepth - bandDepth + 1;
depth <= commonLoopDepth + 1; ++depth) {
DependenceResult result = checkMemrefAccessDependence(
srcAccess, dstAccess, loopDepth, &depConstrs, &depComps,
srcAccess, dstAccess, depth, &depConstrs, &depComps,
/*allowRAR=*/false);
if (hasDependence(result)) {
// llvm::outs() << "\n----------\n";
// llvm::outs() << *srcOp << " -> " << *dstOp << "\n";
// llvm::outs() << "depth: " << loopDepth << ", distance: ";
// for (auto dep : depComps)
// llvm::outs() << "(" << dep.lb.getValue() << ","
// << dep.ub.getValue() << "), ";
// llvm::outs() << "\n";
auto depComp = depComps[depth - 1];
auto targetLoop = depComp.op;
unsigned minPosDistance =
std::max(depComp.lb.getValue(), (int64_t)1);
// Only positive distance will be considered, keep the minimum
// distance in the distance map.
if (depComp.ub.getValue() > 0) {
if (distanceMap.count(targetLoop)) {
auto currentDistance = distanceMap[targetLoop];
distanceMap[targetLoop] =
std::min(currentDistance, minPosDistance);
} else
distanceMap[targetLoop] = minPosDistance;
}
}
}
}
dstIndex++;
}
}
// Permute the target loops one by one.
for (unsigned i = 0, e = distanceMap.size(); i < e; ++i) {
// Find the loop with the smallest dependency distance. The rationale is
// small dependency distance tends to increase the achievable II when
// applying loop pipelining.
Operation *targetLoop = nullptr;
unsigned count = 0;
for (auto pair : distanceMap) {
if (count == 0)
targetLoop = pair.first;
else if (pair.second < distanceMap[targetLoop])
targetLoop = pair.first;
count++;
}
// Remove the target loop from the distance map as it will be handled in
// this iteration.
distanceMap.erase(targetLoop);
// Find the current location of the target loop in the loop band.
unsigned targetLoopLoc =
std::find(band.begin(), band.end(), targetLoop) - band.begin();
// Permute the target loop to the outermost position possible.
for (unsigned dstLoc = 0; dstLoc < targetLoopLoc; ++dstLoc) {
SmallVector<unsigned, 4> permMap;
// Construct permutation map.
for (unsigned loc = 0; loc < bandDepth; ++loc) {
if (loc < dstLoc)
permMap.push_back(loc);
else if (loc < targetLoopLoc)
permMap.push_back(loc + 1);
else if (loc == targetLoopLoc)
permMap.push_back(dstLoc);
else
permMap.push_back(loc);
}
// Check the validity of the current permutation.
if (isValidLoopInterchangePermutation(band, permMap)) {
permuteLoops(band, permMap);
break;
}
}
}
return true;
}

View File

@ -18,20 +18,17 @@ struct AffineLoopPerfection
auto func = getOperation();
auto builder = OpBuilder(func);
// Walk through all loops.
for (auto forOp : func.getOps<AffineForOp>()) {
// Collect all loops that: (1) is the innermost loop (contains zero child
// loop nest); or (2) contains more than one child loop nest.
SmallVector<AffineForOp, 4> targetLoops;
forOp.walk([&](AffineForOp loop) {
if (getChildLoopNum(loop) != 1)
targetLoops.push_back(loop);
});
// Collect all loops that: (1) is the innermost loop (contains zero child
// loop nest); or (2) contains more than one child loop nest.
SmallVector<AffineForOp, 4> targetLoops;
func.walk([&](AffineForOp loop) {
if (getChildLoopNum(loop) != 1)
targetLoops.push_back(loop);
});
// Apply loop perfection to each target loop.
for (auto loop : targetLoops)
applyAffineLoopPerfection(loop, builder);
}
// Apply loop perfection to each target loop.
for (auto loop : targetLoops)
applyAffineLoopPerfection(loop, builder);
}
};
} // namespace

View File

@ -18,32 +18,29 @@ struct LoopPipelining : public LoopPipeliningBase<LoopPipelining> {
auto func = getOperation();
auto builder = OpBuilder(func);
// Walk through all loops.
for (auto forOp : func.getOps<AffineForOp>()) {
// Collect all innermost loops.
SmallVector<AffineForOp, 4> innermostLoops;
forOp.walk([&](AffineForOp loop) {
if (getChildLoopNum(loop) == 0)
innermostLoops.push_back(loop);
});
// Collect all innermost loops.
SmallVector<AffineForOp, 4> innermostLoops;
func.walk([&](AffineForOp loop) {
if (getChildLoopNum(loop) == 0)
innermostLoops.push_back(loop);
});
// Apply loop pipelining to the corresponding level of each innermost loop.
for (auto loop : innermostLoops) {
auto currentLoop = loop;
unsigned loopLevel = 0;
while (true) {
auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
// Apply loop pipelining to the corresponding level of each innermost loop.
for (auto loop : innermostLoops) {
auto currentLoop = loop;
unsigned loopLevel = 0;
while (true) {
auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
// If meet the outermost loop, pipeline the current loop.
if (!parentLoop || pipelineLevel == loopLevel) {
applyLoopPipelining(currentLoop, builder);
break;
}
// Move to the next loop level.
currentLoop = parentLoop;
++loopLevel;
// If meet the outermost loop, pipeline the current loop.
if (!parentLoop || pipelineLevel == loopLevel) {
applyLoopPipelining(currentLoop, builder);
break;
}
// Move to the next loop level.
currentLoop = parentLoop;
++loopLevel;
}
}

View File

@ -15,42 +15,6 @@ using namespace scalehls;
// Helper methods
//===----------------------------------------------------------------------===//
/// Collect a loop band by descending from `forOp`.
///
/// Each visited loop is appended to `band`. The descent continues into the
/// first AffineForOp of the current loop's body while getChildLoopNum()
/// reports exactly one child loop, and stops otherwise.
///
/// \param forOp the loop at which the descent starts.
/// \param band  output container; loops are appended outermost-first.
/// \returns the last loop appended to `band` (the innermost loop reached).
static AffineForOp getLoopBandFromRoot(AffineForOp forOp,
                                       AffineLoopBand &band) {
  for (auto loop = forOp;; loop = *loop.getOps<AffineForOp>().begin()) {
    band.push_back(loop);
    // Any child count other than one ends the band at this loop.
    if (getChildLoopNum(loop) != 1)
      break;
  }
  return band.back();
}
/// Collect a loop band by ascending from `forOp`.
///
/// Starting at `forOp`, the enclosing AffineForOp is visited repeatedly. The
/// ascent stops when there is no enclosing AffineForOp, or when the enclosing
/// loop does not have exactly one child loop according to getChildLoopNum().
/// The visited loops are appended to `band` outermost-first.
///
/// \param forOp the loop at which the ascent starts.
/// \param band  output container; the discovered loops are appended to it.
/// \returns the outermost loop of the band discovered by this call.
static AffineForOp getLoopBandFromLeaf(AffineForOp forOp,
                                       AffineLoopBand &band) {
  // Loops are recorded innermost-first while walking outwards.
  AffineLoopBand reverseBand;
  for (auto currentLoop = forOp;;) {
    reverseBand.push_back(currentLoop);

    auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
    if (!parentLoop || getChildLoopNum(parentLoop) != 1)
      break;
    currentLoop = parentLoop;
  }

  band.append(reverseBand.rbegin(), reverseBand.rend());

  // Return the root of the band found here rather than band.front(), which
  // would be an unrelated loop if `band` was non-empty on entry.
  return reverseBand.back();
}
static int64_t getInnerParallelism(AffineForOp forOp) {
int64_t count = 0;
for (auto loop : forOp.getOps<AffineForOp>()) {
@ -137,6 +101,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
while (!targetLoops.empty()) {
SmallVector<AffineForOp, 8> candidateLoops;
llvm::SmallDenseMap<Operation *, int64_t, 8> parallelismMap;
// Collect all candidate loops. Here, only loops whose innermost loop has
// more than one inner loops will be considered as a candidate.
@ -147,7 +112,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
// Calculate the overall introduced parallelism if the innermost loop of
// the current loop band is pipelined.
auto parallelism = getInnerParallelism(innermostLoop);
setAttrValue(innermostLoop, "inner_parallelism", parallelism);
parallelismMap[innermostLoop] = parallelism;
// Collect all candidate loops into an ordered vector. The loop indicating
// the largest parallelism will show in the front.
@ -156,7 +121,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
candidateLoops.push_back(innermostLoop);
else
for (auto &candidate : candidateLoops) {
if (parallelism > getIntAttrValue(candidate, "inner_parallelism")) {
if (parallelism > parallelismMap[candidate]) {
candidateLoops.insert(&candidate, innermostLoop);
break;
}