[MultipleLevelDSE] support to simplify loop nest by loop pipelining

This commit is contained in:
Hanchen Ye 2021-01-18 20:17:18 -06:00
parent 4a594c9303
commit 0ac1bd5d4a
2 changed files with 173 additions and 48 deletions

View File

@ -847,8 +847,7 @@ struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
// Read configuration file.
INIReader spec(targetSpec);
if (spec.ParseError())
module->emitError(
"target spec file parse fail, please pass in correct file path\n");
emitError(module.getLoc(), "target spec file parse fail\n");
// Collect profiling latency data.
LatencyMap latencyMap;

View File

@ -10,22 +10,21 @@
using namespace std;
using namespace mlir;
using namespace scalehls;
using namespace hlscpp;
namespace {
struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
void runOnOperation() override;
};
} // namespace
//===----------------------------------------------------------------------===//
// Helper methods
//===----------------------------------------------------------------------===//
static void getSeqLoopBand(AffineForOp forOp,
SmallVector<AffineForOp, 4> &loopBand) {
using AffineLoopBand = SmallVector<AffineForOp, 4>;
using AffineLoopBands = SmallVector<AffineLoopBand, 4>;
static AffineForOp getSeqLoopBand(AffineForOp forOp, AffineLoopBand &loopBand) {
auto currentLoop = forOp;
while (true) {
auto childLoopNum = getChildLoopNum(currentLoop);
// Only if the current loop has zero or one child, it will be pushed back to
// the loop band.
// Only if the current loop has zero or one child, it will be pushed back
// to the loop band.
if (childLoopNum == 1)
loopBand.push_back(currentLoop);
else {
@ -36,12 +35,13 @@ static void getSeqLoopBand(AffineForOp forOp,
// Update the current loop.
currentLoop = *currentLoop.getOps<AffineForOp>().begin();
}
return loopBand.back();
}
static int64_t getChildLoopsTripCount(AffineForOp forOp) {
static int64_t getInnerParallelism(AffineForOp forOp) {
int64_t count = 0;
for (auto loop : forOp.getOps<AffineForOp>()) {
auto innerCount = getChildLoopsTripCount(loop);
auto innerCount = getInnerParallelism(loop);
if (auto trip = getConstantTripCount(loop))
count += trip.getValue() * innerCount;
else
@ -49,53 +49,179 @@ static int64_t getChildLoopsTripCount(AffineForOp forOp) {
}
// If the current loop is the innermost loop, the count should be one.
return max(count, (int64_t)1);
return std::max(count, (int64_t)1);
}
/// This is a temporary approach.
static void applyMultipleLevelDSE(FuncOp func, OpBuilder &builder,
LatencyMap &latencyMap, int64_t numDSP) {
//===--------------------------------------------------------------------===//
// Try function pipelining
//===--------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Optimizer Class Declaration and Definition
//===----------------------------------------------------------------------===//
// HLSCppEstimator estimator(func, latencyMap);
// estimator.estimateFunc();
class HLSCppOptimizer : public HLSCppAnalysisBase {
public:
explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP)
: HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
numDSP(numDSP) {}
// builder.setInsertionPoint(func);
/// This is a temporary approach that does not scale.
void applyMultipleLevelDSE();
//===--------------------------------------------------------------------===//
//
//===--------------------------------------------------------------------===//
FuncOp &func;
LatencyMap &latencyMap;
int64_t numDSP;
};
for (auto loop : func.getOps<AffineForOp>()) {
SmallVector<AffineForOp, 4> loopBand;
getSeqLoopBand(loop, loopBand);
/// This is a temporary approach that does not scale.
void HLSCppOptimizer::applyMultipleLevelDSE() {
// Try function pipelining.
auto tmpFunc = func.clone();
applyFuncPipelining(func, builder);
llvm::outs() << getChildLoopsTripCount(loopBand.back()) << "\n";
// Estimate the pipelined function.
HLSCppEstimator estimator(func, latencyMap);
estimator.estimateFunc();
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) {
applyFuncPipelining(func, builder);
return;
}
// Simplify loop nests by pipelining. Take the following loops as an
// example, where each node represents one sequential loop nest (LN). In the
// simplification, we'll first try to pipeline LN1 and LN6. Suppose pipelining
// LN6 meets the resource constraints while pipelining LN1 does not, we'll
// pipeline LN6 (fully unrolling LN7 and LN8) and keep LN1 untouched. In the
// next step, we'll look into LN1 and check whether LN2 can be pipelined.
// Suppose pipelining LN2 meets the resource constraints, we'll pipeline LN2
// (fully unrolling LN3 and LN4). Note that in this simplification, all LNs
// that don't contain any other LNs will not be pipelined, such as LN5. Their
// optimization will be explored later. This procedure will be recursively
// applied to inner LNs until no eligible LN exists.
//
// LN1 LN6
// | |
// / \ / \
// LN2 LN5 LN7 LN8
// |
// / \
// LN3 LN4
//
// After the simplification, the loop becomes the following one, where LN1 has
// been proven untouchable, as loop pipelining is the primary optimization that
// consumes the least extra resources. Formally, in the simplified function,
// all non-leaf LNs are untouchable (LN1) and only leaf LNs can be further
// optimized (LN2, LN5, and LN6).
//
// LN1 LN6
// |
// / \
// LN2 LN5
//
// TODO: there is a large design space in this simplification.
auto funcForOps = func.getOps<AffineForOp>();
auto targetLoops =
SmallVector<AffineForOp, 8>(funcForOps.begin(), funcForOps.end());
while (!targetLoops.empty()) {
SmallVector<AffineForOp, 8> candidateLoops;
// Collect all candidate loops. Here, only loops whose innermost loop has
// more than one inner loop will be considered as candidates.
for (auto target : targetLoops) {
AffineLoopBand loopBand;
auto innermostLoop = getSeqLoopBand(target, loopBand);
// Calculate the overall introduced parallelism if the innermost loop of
// the current loop band is pipelined.
auto parallelism = getInnerParallelism(innermostLoop);
setAttrValue(innermostLoop, "inner_parallelism", parallelism);
// Collect all candidate loops into an ordered vector. The loop with the
// largest parallelism will appear at the front of the vector.
if (parallelism > 1) {
if (candidateLoops.empty())
candidateLoops.push_back(innermostLoop);
else
for (auto &candidate : candidateLoops) {
if (parallelism > getIntAttrValue(candidate, "inner_parallelism")) {
candidateLoops.insert(&candidate, innermostLoop);
break;
}
// Push back if having the smallest parallelism.
if (&candidate == candidateLoops.end())
candidateLoops.push_back(innermostLoop);
}
}
}
// Since all target loops have been handled, clear the targetLoops vector.
targetLoops.clear();
// Traverse all candidates to check whether applying loop pipelining violates
// the resource constraints. If so, add all inner loops into targetLoops.
// Otherwise, pipeline the candidate.
for (auto &candidate : candidateLoops) {
// Create a temporary function.
setAttrValue(candidate, "opt_flag", true);
auto tmpFunc = func.clone();
// Find the candidate loop in the temporary function and apply loop
// pipelining to it.
tmpFunc.walk([&](AffineForOp loop) {
if (getIntAttrValue(loop, "opt_flag")) {
applyLoopPipelining(loop, builder);
return;
}
});
// Estimate the temporary function.
auto estimator = HLSCppEstimator(tmpFunc, latencyMap);
estimator.estimateFunc();
// Pipeline the candidate loop or delve into child loops.
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP)
applyLoopPipelining(candidate, builder);
else {
auto childForOps = candidate.getOps<AffineForOp>();
targetLoops.append(childForOps.begin(), childForOps.end());
}
candidate.removeAttr("opt_flat");
}
}
// Optimize leaf loop nests. Different optimization combinations will be
// applied to each leaf LN, and the best one that meets the resource
// constraints will be picked as the final solution.
// TODO: apply different optimizations to different leaf LNs.
}
void MultipleLevelDSE::runOnOperation() {
auto module = getOperation();
auto builder = OpBuilder(module);
namespace {
struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
void runOnOperation() override {
auto module = getOperation();
// Read configuration file.
INIReader spec(targetSpec);
if (spec.ParseError())
module->emitError(
"target spec file parse fail, please pass in correct file path\n");
// Read configuration file.
INIReader spec(targetSpec);
if (spec.ParseError())
emitError(module.getLoc(), "target spec file parse fail\n");
// Collect profiling data, where default values are based on PYNQ-Z1 board.
LatencyMap latencyMap;
getLatencyMap(spec, latencyMap);
auto numDSP = spec.GetInteger("specification", "dsp", 220);
// Collect profiling data, where default values are based on PYNQ-Z1 board.
LatencyMap latencyMap;
getLatencyMap(spec, latencyMap);
auto numDSP = spec.GetInteger("specification", "dsp", 220);
for (auto func : module.getOps<FuncOp>())
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
if (topFunction.getValue())
applyMultipleLevelDSE(func, builder, latencyMap, numDSP);
}
// Optimize the top function.
for (auto func : module.getOps<FuncOp>())
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
if (topFunction.getValue()) {
auto optimizer = HLSCppOptimizer(func, latencyMap, numDSP);
optimizer.applyMultipleLevelDSE();
}
}
};
} // namespace
std::unique_ptr<mlir::Pass> scalehls::createMultipleLevelDSEPass() {
return std::make_unique<MultipleLevelDSE>();