[AffineLoopOrderOpt] complete the impl of this pass; move getLoopBandFromLeaf/Root to analysis utils, update several passes accordingly
This commit is contained in:
parent
d3d13e0bd0
commit
1e1cd2f828
18
README.md
18
README.md
|
@ -39,7 +39,7 @@ $ cd $SCALEHLS_DIR
|
|||
$ # Loop and pragma-level optimizations, performance estimation, and C++ code generation.
|
||||
$ scalehls-opt samples/polybench/syrk.mlir \
|
||||
-affine-loop-perfection -remove-variable-bound -affine-loop-normalize \
|
||||
-partial-affine-loop-tile="tile-level=1 tile-size=4" \
|
||||
-affine-loop-order-opt -partial-affine-loop-tile="tile-level=1 tile-size=2" \
|
||||
-legalize-to-hlscpp="top-func=test_syrk" -loop-pipelining="pipeline-level=1" \
|
||||
-affine-store-forward -simplify-memref-access -array-partition -cse -canonicalize \
|
||||
-qor-estimation="target-spec=config/target-spec.ini" \
|
||||
|
@ -47,18 +47,8 @@ $ scalehls-opt samples/polybench/syrk.mlir \
|
|||
|
||||
$ # Benchmark generation, dataflow-level optimization, HLSKernel lowering and bufferization.
|
||||
$ benchmark-gen -type "cnn" -config "config/cnn-config.ini" -number 1 \
|
||||
| scalehls-opt -legalize-dataflow -split-function \
|
||||
| scalehls-opt -legalize-dataflow="min-gran=2 insert-copy=true" -split-function \
|
||||
-hlskernel-bufferize -hlskernel-to-affine -func-bufferize -canonicalize
|
||||
|
||||
$ # Put them together.
|
||||
$ benchmark-gen -type "cnn" -config "config/cnn-config.ini" -number 1 \
|
||||
| scalehls-opt -legalize-dataflow -split-function \
|
||||
-hlskernel-bufferize -hlskernel-to-affine -func-bufferize \
|
||||
-affine-loop-perfection -affine-loop-normalize \
|
||||
-legalize-to-hlscpp="top-func=auto_gen_cnn" \
|
||||
-affine-store-forward -simplify-memref-access -cse -canonicalize \
|
||||
-qor-estimation="target-spec=config/target-spec.ini" \
|
||||
| scalehls-translate -emit-hlscpp
|
||||
```
|
||||
|
||||
## Integration with ONNX-MLIR
|
||||
|
@ -83,9 +73,9 @@ $ dot -Tpng resnet18.gv > resnet18.png
|
|||
|
||||
$ # Legalize the output of ONNX-MLIR, optimize and emit C++ code.
|
||||
$ scalehls-opt resnet18.mlir -legalize-onnx -affine-loop-normalize -canonicalize \
|
||||
-legalize-dataflow="min-gran=2 insert-copy=false" -split-function \
|
||||
-legalize-dataflow="min-gran=3 insert-copy=true" -split-function \
|
||||
-convert-linalg-to-affine-loops -affine-loop-fusion \
|
||||
-legalize-to-hlscpp="top-func=main_graph" \
|
||||
-legalize-to-hlscpp="top-func=main_graph" -loop-pipelining -canonicalize \
|
||||
| scalehls-translate -emit-hlscpp
|
||||
```
|
||||
|
||||
|
|
|
@ -95,6 +95,9 @@ int64_t getPartitionFactors(MemRefType memrefType,
|
|||
/// contained by the input operation.
|
||||
unsigned getChildLoopNum(Operation *op);
|
||||
|
||||
/// Starting from forOp, append forOp and each successive single child loop to
/// band, stopping at the first loop whose child-loop count (as reported by
/// getChildLoopNum) is not one. Returns the last loop of band.
AffineForOp getLoopBandFromRoot(AffineForOp forOp, AffineLoopBand &band);
|
||||
/// Starting from forOp, walk up through enclosing loops while each parent has
/// exactly one child loop, then append the collected loops to band ordered from
/// outermost to innermost. Returns the first loop of band.
AffineForOp getLoopBandFromLeaf(AffineForOp forOp, AffineLoopBand &band);
|
||||
|
||||
} // namespace scalehls
|
||||
} // namespace mlir
|
||||
|
||||
|
|
|
@ -161,3 +161,39 @@ unsigned scalehls::getChildLoopNum(Operation *op) {
|
|||
|
||||
return childNum;
|
||||
}
|
||||
|
||||
/// Collect a loop band by descending from forOp. Each visited loop is appended
/// to band (band is not cleared first, so any existing entries are kept). The
/// descent continues only while the current loop has exactly one child loop
/// according to getChildLoopNum. Returns band.back(), i.e. the last loop that
/// was visited.
AffineForOp scalehls::getLoopBandFromRoot(AffineForOp forOp,
|
||||
AffineLoopBand &band) {
|
||||
auto currentLoop = forOp;
|
||||
while (true) {
|
||||
band.push_back(currentLoop);
|
||||

||||
// Step into the first AffineForOp found among the current loop's ops; stop
// as soon as the child-loop count is anything other than one.
if (getChildLoopNum(currentLoop) == 1)
|
||||
currentLoop = *currentLoop.getOps<AffineForOp>().begin();
|
||||
else
|
||||
break;
|
||||
}
|
||||
return band.back();
|
||||
}
|
||||
|
||||
/// Collect a loop band by ascending from forOp. Loops are gathered from the
/// starting loop upwards and then appended to band in reverse, so the newly
/// added entries run from the outermost collected loop down to forOp. Returns
/// band.front().
/// NOTE(review): band is appended to rather than cleared, so band.front() is
/// the outermost collected loop only when band is empty on entry.
AffineForOp scalehls::getLoopBandFromLeaf(AffineForOp forOp,
|
||||
AffineLoopBand &band) {
|
||||
// Loops are collected innermost-first here and reversed when appended below.
AffineLoopBand reverseBand;
|
||||

||||
auto currentLoop = forOp;
|
||||
while (true) {
|
||||
reverseBand.push_back(currentLoop);
|
||||

||||
// Stop when there is no enclosing AffineForOp.
auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
|
||||
if (!parentLoop)
|
||||
break;
|
||||

||||
// Only keep ascending while the parent has exactly one child loop (per
// getChildLoopNum); otherwise the parent is left out of the band.
if (getChildLoopNum(parentLoop) == 1)
|
||||
currentLoop = parentLoop;
|
||||
else
|
||||
break;
|
||||
}
|
||||

||||
band.append(reverseBand.rbegin(), reverseBand.rend());
|
||||
return band.front();
|
||||
}
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "mlir/Analysis/AffineAnalysis.h"
|
||||
#include "mlir/Analysis/Utils.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "scalehls/Analysis/Utils.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
|
||||
|
@ -14,24 +15,44 @@ using namespace scalehls;
|
|||
|
||||
namespace {
|
||||
struct AffineLoopOrderOpt : public AffineLoopOrderOptBase<AffineLoopOrderOpt> {
|
||||
void runOnOperation() override {}
|
||||
void runOnOperation() override {
|
||||
auto func = getOperation();
|
||||
auto builder = OpBuilder(func);
|
||||
|
||||
// Collect all target loop bands.
|
||||
AffineLoopBands targetBands;
|
||||
func.walk([&](AffineForOp loop) {
|
||||
if (getChildLoopNum(loop) == 0) {
|
||||
AffineLoopBand band;
|
||||
getLoopBandFromLeaf(loop, band);
|
||||
targetBands.push_back(band);
|
||||
}
|
||||
});
|
||||
|
||||
// Apply loop order optimization to each loop band.
|
||||
for (auto band : targetBands)
|
||||
applyAffineLoopOrderOpt(band, builder);
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band,
|
||||
OpBuilder &builder) {
|
||||
auto &loopBlock = band.back().getLoopBody().front();
|
||||
auto depth = band.size();
|
||||
auto bandDepth = band.size();
|
||||
|
||||
// Collect all load and store operations for each memory in the loop block,
|
||||
// and calculate the number of common surrounding loops for later use.
|
||||
MemAccessesMap map;
|
||||
getMemAccessesMap(loopBlock, map);
|
||||
MemAccessesMap loadStoresMap;
|
||||
getMemAccessesMap(loopBlock, loadStoresMap);
|
||||
auto commonLoopDepth = getNumCommonSurroundingLoops(
|
||||
*loopBlock.begin(), *std::next(loopBlock.begin()));
|
||||
|
||||
// A map of dependency distances indexed by the loop in the band.
|
||||
llvm::SmallDenseMap<Operation *, unsigned, 4> distanceMap;
|
||||
|
||||
// Traverse all memories in the loop block.
|
||||
for (auto pair : map) {
|
||||
for (auto pair : loadStoresMap) {
|
||||
auto loadStores = pair.second;
|
||||
|
||||
// Find all dependencies associated to the current memory.
|
||||
|
@ -44,26 +65,84 @@ bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band,
|
|||
FlatAffineConstraints depConstrs;
|
||||
SmallVector<DependenceComponent, 2> depComps;
|
||||
|
||||
for (unsigned loopDepth = commonLoopDepth - depth + 1;
|
||||
loopDepth <= commonLoopDepth + 1; ++loopDepth) {
|
||||
// Only the loops in the loop band will be checked.
|
||||
for (unsigned depth = commonLoopDepth - bandDepth + 1;
|
||||
depth <= commonLoopDepth + 1; ++depth) {
|
||||
|
||||
DependenceResult result = checkMemrefAccessDependence(
|
||||
srcAccess, dstAccess, loopDepth, &depConstrs, &depComps,
|
||||
srcAccess, dstAccess, depth, &depConstrs, &depComps,
|
||||
/*allowRAR=*/false);
|
||||
|
||||
if (hasDependence(result)) {
|
||||
// llvm::outs() << "\n----------\n";
|
||||
// llvm::outs() << *srcOp << " -> " << *dstOp << "\n";
|
||||
// llvm::outs() << "depth: " << loopDepth << ", distance: ";
|
||||
// for (auto dep : depComps)
|
||||
// llvm::outs() << "(" << dep.lb.getValue() << ","
|
||||
// << dep.ub.getValue() << "), ";
|
||||
// llvm::outs() << "\n";
|
||||
auto depComp = depComps[depth - 1];
|
||||
|
||||
auto targetLoop = depComp.op;
|
||||
unsigned minPosDistance =
|
||||
std::max(depComp.lb.getValue(), (int64_t)1);
|
||||
|
||||
// Only positive distance will be considered, keep the minimum
|
||||
// distance in the distance map.
|
||||
if (depComp.ub.getValue() > 0) {
|
||||
if (distanceMap.count(targetLoop)) {
|
||||
auto currentDistance = distanceMap[targetLoop];
|
||||
distanceMap[targetLoop] =
|
||||
std::min(currentDistance, minPosDistance);
|
||||
} else
|
||||
distanceMap[targetLoop] = minPosDistance;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dstIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
// Permute the target loops one by one.
|
||||
for (unsigned i = 0, e = distanceMap.size(); i < e; ++i) {
|
||||
// Find the loop with the smallest dependency distance. The rationale is
|
||||
// small dependency distance tends to increase the achievable II when
|
||||
// applying loop pipelining.
|
||||
Operation *targetLoop = nullptr;
|
||||
unsigned count = 0;
|
||||
for (auto pair : distanceMap) {
|
||||
if (count == 0)
|
||||
targetLoop = pair.first;
|
||||
else if (pair.second < distanceMap[targetLoop])
|
||||
targetLoop = pair.first;
|
||||
count++;
|
||||
}
|
||||
|
||||
// Remove the target loop from the distance map as it will be handled in
|
||||
// this iteration.
|
||||
distanceMap.erase(targetLoop);
|
||||
|
||||
// Find the current location of the target loop in the loop band.
|
||||
unsigned targetLoopLoc =
|
||||
std::find(band.begin(), band.end(), targetLoop) - band.begin();
|
||||
|
||||
// Permute the target loop to the outermost position that is valid.
|
||||
for (unsigned dstLoc = 0; dstLoc < targetLoopLoc; ++dstLoc) {
|
||||
SmallVector<unsigned, 4> permMap;
|
||||
|
||||
// Construct permutation map.
|
||||
for (unsigned loc = 0; loc < bandDepth; ++loc) {
|
||||
if (loc < dstLoc)
|
||||
permMap.push_back(loc);
|
||||
else if (loc < targetLoopLoc)
|
||||
permMap.push_back(loc + 1);
|
||||
else if (loc == targetLoopLoc)
|
||||
permMap.push_back(dstLoc);
|
||||
else
|
||||
permMap.push_back(loc);
|
||||
}
|
||||
|
||||
// Check the validity of the current permutation.
|
||||
if (isValidLoopInterchangePermutation(band, permMap)) {
|
||||
permuteLoops(band, permMap);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -18,20 +18,17 @@ struct AffineLoopPerfection
|
|||
auto func = getOperation();
|
||||
auto builder = OpBuilder(func);
|
||||
|
||||
// Walk through all loops.
|
||||
for (auto forOp : func.getOps<AffineForOp>()) {
|
||||
// Collect all loops that: (1) is the innermost loop (contains zero child
|
||||
// loop nest); or (2) contains more than one child loop nest.
|
||||
SmallVector<AffineForOp, 4> targetLoops;
|
||||
forOp.walk([&](AffineForOp loop) {
|
||||
if (getChildLoopNum(loop) != 1)
|
||||
targetLoops.push_back(loop);
|
||||
});
|
||||
// Collect all loops that: (1) is the innermost loop (contains zero child
|
||||
// loop nest); or (2) contains more than one child loop nest.
|
||||
SmallVector<AffineForOp, 4> targetLoops;
|
||||
func.walk([&](AffineForOp loop) {
|
||||
if (getChildLoopNum(loop) != 1)
|
||||
targetLoops.push_back(loop);
|
||||
});
|
||||
|
||||
// Apply loop perfection to each target loop.
|
||||
for (auto loop : targetLoops)
|
||||
applyAffineLoopPerfection(loop, builder);
|
||||
}
|
||||
// Apply loop perfection to each target loop.
|
||||
for (auto loop : targetLoops)
|
||||
applyAffineLoopPerfection(loop, builder);
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
|
|
@ -18,32 +18,29 @@ struct LoopPipelining : public LoopPipeliningBase<LoopPipelining> {
|
|||
auto func = getOperation();
|
||||
auto builder = OpBuilder(func);
|
||||
|
||||
// Walk through all loops.
|
||||
for (auto forOp : func.getOps<AffineForOp>()) {
|
||||
// Collect all innermost loops.
|
||||
SmallVector<AffineForOp, 4> innermostLoops;
|
||||
forOp.walk([&](AffineForOp loop) {
|
||||
if (getChildLoopNum(loop) == 0)
|
||||
innermostLoops.push_back(loop);
|
||||
});
|
||||
// Collect all innermost loops.
|
||||
SmallVector<AffineForOp, 4> innermostLoops;
|
||||
func.walk([&](AffineForOp loop) {
|
||||
if (getChildLoopNum(loop) == 0)
|
||||
innermostLoops.push_back(loop);
|
||||
});
|
||||
|
||||
// Apply loop pipelining to the corresponding level of each innermost loop.
|
||||
for (auto loop : innermostLoops) {
|
||||
auto currentLoop = loop;
|
||||
unsigned loopLevel = 0;
|
||||
while (true) {
|
||||
auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
|
||||
// Apply loop pipelining to the corresponding level of each innermost loop.
|
||||
for (auto loop : innermostLoops) {
|
||||
auto currentLoop = loop;
|
||||
unsigned loopLevel = 0;
|
||||
while (true) {
|
||||
auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
|
||||
|
||||
// If meet the outermost loop, pipeline the current loop.
|
||||
if (!parentLoop || pipelineLevel == loopLevel) {
|
||||
applyLoopPipelining(currentLoop, builder);
|
||||
break;
|
||||
}
|
||||
|
||||
// Move to the next loop level.
|
||||
currentLoop = parentLoop;
|
||||
++loopLevel;
|
||||
// If meet the outermost loop, pipeline the current loop.
|
||||
if (!parentLoop || pipelineLevel == loopLevel) {
|
||||
applyLoopPipelining(currentLoop, builder);
|
||||
break;
|
||||
}
|
||||
|
||||
// Move to the next loop level.
|
||||
currentLoop = parentLoop;
|
||||
++loopLevel;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -15,42 +15,6 @@ using namespace scalehls;
|
|||
// Helper methods
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
static AffineForOp getLoopBandFromRoot(AffineForOp forOp,
|
||||
AffineLoopBand &band) {
|
||||
auto currentLoop = forOp;
|
||||
while (true) {
|
||||
band.push_back(currentLoop);
|
||||
|
||||
if (getChildLoopNum(currentLoop) == 1)
|
||||
currentLoop = *currentLoop.getOps<AffineForOp>().begin();
|
||||
else
|
||||
break;
|
||||
}
|
||||
return band.back();
|
||||
}
|
||||
|
||||
static AffineForOp getLoopBandFromLeaf(AffineForOp forOp,
|
||||
AffineLoopBand &band) {
|
||||
AffineLoopBand reverseBand;
|
||||
|
||||
auto currentLoop = forOp;
|
||||
while (true) {
|
||||
reverseBand.push_back(currentLoop);
|
||||
|
||||
auto parentLoop = currentLoop->getParentOfType<AffineForOp>();
|
||||
if (!parentLoop)
|
||||
break;
|
||||
|
||||
if (getChildLoopNum(parentLoop) == 1)
|
||||
currentLoop = parentLoop;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
band.append(reverseBand.rbegin(), reverseBand.rend());
|
||||
return band.front();
|
||||
}
|
||||
|
||||
static int64_t getInnerParallelism(AffineForOp forOp) {
|
||||
int64_t count = 0;
|
||||
for (auto loop : forOp.getOps<AffineForOp>()) {
|
||||
|
@ -137,6 +101,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
|
|||
|
||||
while (!targetLoops.empty()) {
|
||||
SmallVector<AffineForOp, 8> candidateLoops;
|
||||
llvm::SmallDenseMap<Operation *, int64_t, 8> parallelismMap;
|
||||
|
||||
// Collect all candidate loops. Here, only loops whose innermost loop has
|
||||
// more than one inner loops will be considered as a candidate.
|
||||
|
@ -147,7 +112,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
|
|||
// Calculate the overall introduced parallelism if the innermost loop of
|
||||
// the current loop band is pipelined.
|
||||
auto parallelism = getInnerParallelism(innermostLoop);
|
||||
setAttrValue(innermostLoop, "inner_parallelism", parallelism);
|
||||
parallelismMap[innermostLoop] = parallelism;
|
||||
|
||||
// Collect all candidate loops into an ordered vector. The loop indicating
|
||||
// the largest parallelism will show in the front.
|
||||
|
@ -156,7 +121,7 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
|
|||
candidateLoops.push_back(innermostLoop);
|
||||
else
|
||||
for (auto &candidate : candidateLoops) {
|
||||
if (parallelism > getIntAttrValue(candidate, "inner_parallelism")) {
|
||||
if (parallelism > parallelismMap[candidate]) {
|
||||
candidateLoops.insert(&candidate, innermostLoop);
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue