[MultipleLevelDSE] support to simplify loop nest by loop pipelining
This commit is contained in:
parent
4a594c9303
commit
0ac1bd5d4a
|
@ -847,8 +847,7 @@ struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
|
|||
// Read configuration file.
|
||||
INIReader spec(targetSpec);
|
||||
if (spec.ParseError())
|
||||
module->emitError(
|
||||
"target spec file parse fail, please pass in correct file path\n");
|
||||
emitError(module.getLoc(), "target spec file parse fail\n");
|
||||
|
||||
// Collect profiling latency data.
|
||||
LatencyMap latencyMap;
|
||||
|
|
|
@ -10,22 +10,21 @@
|
|||
using namespace std;
|
||||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
using namespace hlscpp;
|
||||
|
||||
namespace {
|
||||
struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
|
||||
void runOnOperation() override;
|
||||
};
|
||||
} // namespace
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Helper methods
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
static void getSeqLoopBand(AffineForOp forOp,
|
||||
SmallVector<AffineForOp, 4> &loopBand) {
|
||||
using AffineLoopBand = SmallVector<AffineForOp, 4>;
|
||||
using AffineLoopBands = SmallVector<AffineLoopBand, 4>;
|
||||
|
||||
static AffineForOp getSeqLoopBand(AffineForOp forOp, AffineLoopBand &loopBand) {
|
||||
auto currentLoop = forOp;
|
||||
while (true) {
|
||||
auto childLoopNum = getChildLoopNum(currentLoop);
|
||||
|
||||
// Only if the current loop has zero or one child, it will be pushed back to
|
||||
// the loop band.
|
||||
// Only if the current loop has zero or one child, it will be pushed back
|
||||
// to the loop band.
|
||||
if (childLoopNum == 1)
|
||||
loopBand.push_back(currentLoop);
|
||||
else {
|
||||
|
@ -36,12 +35,13 @@ static void getSeqLoopBand(AffineForOp forOp,
|
|||
// Update the current loop.
|
||||
currentLoop = *currentLoop.getOps<AffineForOp>().begin();
|
||||
}
|
||||
return loopBand.back();
|
||||
}
|
||||
|
||||
static int64_t getChildLoopsTripCount(AffineForOp forOp) {
|
||||
static int64_t getInnerParallelism(AffineForOp forOp) {
|
||||
int64_t count = 0;
|
||||
for (auto loop : forOp.getOps<AffineForOp>()) {
|
||||
auto innerCount = getChildLoopsTripCount(loop);
|
||||
auto innerCount = getInnerParallelism(loop);
|
||||
if (auto trip = getConstantTripCount(loop))
|
||||
count += trip.getValue() * innerCount;
|
||||
else
|
||||
|
@ -49,53 +49,179 @@ static int64_t getChildLoopsTripCount(AffineForOp forOp) {
|
|||
}
|
||||
|
||||
// If the current loop is innermost loop, count should be one.
|
||||
return max(count, (int64_t)1);
|
||||
return std::max(count, (int64_t)1);
|
||||
}
|
||||
|
||||
/// This is a temporary approach.
|
||||
static void applyMultipleLevelDSE(FuncOp func, OpBuilder &builder,
|
||||
LatencyMap &latencyMap, int64_t numDSP) {
|
||||
//===--------------------------------------------------------------------===//
|
||||
// Try function pipelining
|
||||
//===--------------------------------------------------------------------===//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Optimizer Class Declaration and Definition
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// HLSCppEstimator estimator(func, latencyMap);
|
||||
// estimator.estimateFunc();
|
||||
class HLSCppOptimizer : public HLSCppAnalysisBase {
|
||||
public:
|
||||
explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP)
|
||||
: HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
|
||||
numDSP(numDSP) {}
|
||||
|
||||
// builder.setInsertionPoint(func);
|
||||
/// This is a temporary approach that does not scale.
|
||||
void applyMultipleLevelDSE();
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
//
|
||||
//===--------------------------------------------------------------------===//
|
||||
FuncOp &func;
|
||||
LatencyMap &latencyMap;
|
||||
int64_t numDSP;
|
||||
};
|
||||
|
||||
for (auto loop : func.getOps<AffineForOp>()) {
|
||||
SmallVector<AffineForOp, 4> loopBand;
|
||||
getSeqLoopBand(loop, loopBand);
|
||||
/// This is a temporary approach that does not scale.
|
||||
void HLSCppOptimizer::applyMultipleLevelDSE() {
|
||||
// Try function pipelining.
|
||||
auto tmpFunc = func.clone();
|
||||
applyFuncPipelining(func, builder);
|
||||
|
||||
llvm::outs() << getChildLoopsTripCount(loopBand.back()) << "\n";
|
||||
// Estimate the pipelined function.
|
||||
HLSCppEstimator estimator(func, latencyMap);
|
||||
estimator.estimateFunc();
|
||||
|
||||
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) {
|
||||
applyFuncPipelining(func, builder);
|
||||
return;
|
||||
}
|
||||
|
||||
// Simplify loop nests by pipelining. If we take the following loops as
|
||||
// example, where each node represents one sequential loop nest (LN). In the
|
||||
// simplification, we'll first try to pipeline LN1 and LN6. Suppose pipelining
|
||||
// LN6 meets the resource constraints while pipelining LN1 does not, we'll pipeline
|
||||
// LN6 (fully unroll LN7 and LN8) and keep LN1 untouched. In the next step,
|
||||
// we'll look into LN1 and check whether LN2 can be pipelined. Suppose
|
||||
// pipelining LN2 meets the resource constraints, we'll pipeline LN2 (fully
|
||||
// unroll LN3 and LN4). Note that in this simplification, all LNs that don't
|
||||
// contain any LNs will not be pipelined, such as LN5. Their optimization will
|
||||
// be explored later. This procedure will be recursively applied to inner LNs
|
||||
// until no eligible LN exists.
|
||||
//
|
||||
// LN1 LN6
|
||||
// | |
|
||||
// / \ / \
|
||||
// LN2 LN5 LN7 LN8
|
||||
// |
|
||||
// / \
|
||||
// LN3 LN4
|
||||
//
|
||||
// After the simplification, the loop becomes the following one, where LN1 has
|
||||
// been proved untouchable as loop pipelining is the primary optimization that
|
||||
// consumes the least extra resources. Formally, in the simplified function,
|
||||
// all non-leaf LNs are untouchable (LN1) and only leaf LNs can be further
|
||||
// optimized (LN2, LN5, and LN6).
|
||||
//
|
||||
// LN1 LN6
|
||||
// |
|
||||
// / \
|
||||
// LN2 LN5
|
||||
//
|
||||
// TODO: there is a large design space in this simplification.
|
||||
|
||||
auto funcForOps = func.getOps<AffineForOp>();
|
||||
auto targetLoops =
|
||||
SmallVector<AffineForOp, 8>(funcForOps.begin(), funcForOps.end());
|
||||
|
||||
while (!targetLoops.empty()) {
|
||||
SmallVector<AffineForOp, 8> candidateLoops;
|
||||
|
||||
// Collect all candidate loops. Here, only loops whose innermost loop has
|
||||
// more than one inner loop will be considered as a candidate.
|
||||
for (auto target : targetLoops) {
|
||||
AffineLoopBand loopBand;
|
||||
auto innermostLoop = getSeqLoopBand(target, loopBand);
|
||||
|
||||
// Calculate the overall introduced parallelism if the innermost loop of
|
||||
// the current loop band is pipelined.
|
||||
auto parallelism = getInnerParallelism(innermostLoop);
|
||||
setAttrValue(innermostLoop, "inner_parallelism", parallelism);
|
||||
|
||||
// Collect all candidate loops into an ordered vector. The loop indicating
|
||||
// the largest parallelism will show in the front.
|
||||
if (parallelism > 1) {
|
||||
if (candidateLoops.empty())
|
||||
candidateLoops.push_back(innermostLoop);
|
||||
else
|
||||
for (auto &candidate : candidateLoops) {
|
||||
if (parallelism > getIntAttrValue(candidate, "inner_parallelism")) {
|
||||
candidateLoops.insert(&candidate, innermostLoop);
|
||||
break;
|
||||
}
|
||||
|
||||
// Push back if having the smallest parallelism.
|
||||
if (&candidate == candidateLoops.end())
|
||||
candidateLoops.push_back(innermostLoop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Since all target loops have been handled, clear the targetLoops vector.
|
||||
targetLoops.clear();
|
||||
|
||||
// Traverse all candidates to check whether applying loop pipelining has
|
||||
// violation with the resource constraints. If so, add all inner loops into
|
||||
// targetLoops. Otherwise, pipeline the candidate.
|
||||
for (auto &candidate : candidateLoops) {
|
||||
// Create a temporary function.
|
||||
setAttrValue(candidate, "opt_flag", true);
|
||||
auto tmpFunc = func.clone();
|
||||
|
||||
// Find the candidate loop in the temporary function and apply loop
|
||||
// pipelining to it.
|
||||
tmpFunc.walk([&](AffineForOp loop) {
|
||||
if (getIntAttrValue(loop, "opt_flag")) {
|
||||
applyLoopPipelining(loop, builder);
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
// Estimate the temporary function.
|
||||
auto estimator = HLSCppEstimator(tmpFunc, latencyMap);
|
||||
estimator.estimateFunc();
|
||||
|
||||
// Pipeline the candidate loop or delve into child loops.
|
||||
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP)
|
||||
applyLoopPipelining(candidate, builder);
|
||||
else {
|
||||
auto childForOps = candidate.getOps<AffineForOp>();
|
||||
targetLoops.append(childForOps.begin(), childForOps.end());
|
||||
}
|
||||
|
||||
candidate.removeAttr("opt_flat");
|
||||
}
|
||||
}
|
||||
|
||||
// Optimize leaf loop nests. Different optimization combinations will be
|
||||
// applied to each leaf LN, and the best one which meets the resource
|
||||
// constraints will be picked as the final solution.
|
||||
// TODO: apply different optimizations to different leaf LNs.
|
||||
}
|
||||
|
||||
void MultipleLevelDSE::runOnOperation() {
|
||||
auto module = getOperation();
|
||||
auto builder = OpBuilder(module);
|
||||
namespace {
|
||||
struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
|
||||
void runOnOperation() override {
|
||||
auto module = getOperation();
|
||||
|
||||
// Read configuration file.
|
||||
INIReader spec(targetSpec);
|
||||
if (spec.ParseError())
|
||||
module->emitError(
|
||||
"target spec file parse fail, please pass in correct file path\n");
|
||||
// Read configuration file.
|
||||
INIReader spec(targetSpec);
|
||||
if (spec.ParseError())
|
||||
emitError(module.getLoc(), "target spec file parse fail\n");
|
||||
|
||||
// Collect profiling data, where default values are based on PYNQ-Z1 board.
|
||||
LatencyMap latencyMap;
|
||||
getLatencyMap(spec, latencyMap);
|
||||
auto numDSP = spec.GetInteger("specification", "dsp", 220);
|
||||
// Collect profiling data, where default values are based on PYNQ-Z1 board.
|
||||
LatencyMap latencyMap;
|
||||
getLatencyMap(spec, latencyMap);
|
||||
auto numDSP = spec.GetInteger("specification", "dsp", 220);
|
||||
|
||||
for (auto func : module.getOps<FuncOp>())
|
||||
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
||||
if (topFunction.getValue())
|
||||
applyMultipleLevelDSE(func, builder, latencyMap, numDSP);
|
||||
}
|
||||
// Optimize the top function.
|
||||
for (auto func : module.getOps<FuncOp>())
|
||||
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
||||
if (topFunction.getValue()) {
|
||||
auto optimizer = HLSCppOptimizer(func, latencyMap, numDSP);
|
||||
optimizer.applyMultipleLevelDSE();
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
std::unique_ptr<mlir::Pass> scalehls::createMultipleLevelDSEPass() {
|
||||
return std::make_unique<MultipleLevelDSE>();
|
||||
|
|
Loading…
Reference in New Issue