[MultipleLevelDSE] support ds initialization and pareto frontier update; support ds dump; remove ProfileDesignSpace pass; [PartialAffineLoopTile] become fully in-place tiling
This commit is contained in:
parent
390333e5ec
commit
3e3435dc84
|
@ -22,10 +22,9 @@ public:
|
|||
int64_t numDSP)
|
||||
: ScaleHLSAnalysisBase(builder), estimator(estimator), numDSP(numDSP) {}
|
||||
|
||||
void emitDebugInfo(FuncOp targetFunc, StringRef message);
|
||||
|
||||
/// This is a temporary approach that does not scale.
|
||||
void applyMultipleLevelDSE(FuncOp func);
|
||||
void applyMultipleLevelDSE(FuncOp func, raw_ostream &os,
|
||||
unsigned maxInitializeParallel = 16);
|
||||
|
||||
ScaleHLSEstimator &estimator;
|
||||
int64_t numDSP;
|
||||
|
|
|
@ -19,7 +19,6 @@ namespace scalehls {
|
|||
|
||||
/// Design space exploration pass.
|
||||
std::unique_ptr<Pass> createMultipleLevelDSEPass();
|
||||
std::unique_ptr<Pass> createProfileDesignSpacePass();
|
||||
|
||||
/// Dataflow optimization passes.
|
||||
std::unique_ptr<Pass> createLegalizeDataflowPass();
|
||||
|
|
|
@ -25,30 +25,14 @@ def MultipleLevelDSE : Pass<"multiple-level-dse", "ModuleOp"> {
|
|||
|
||||
let constructor = "mlir::scalehls::createMultipleLevelDSEPass()";
|
||||
|
||||
let options = [
|
||||
Option<"targetSpec", "target-spec", "std::string",
|
||||
/*default=*/"\"../config/target-spec.ini\"",
|
||||
"File path: target backend specifications and configurations">
|
||||
];
|
||||
}
|
||||
|
||||
def ProfileDesignSpace : Pass<"profile-design-space", "ModuleOp"> {
|
||||
let summary = "Optimize HLS design at multiple abstraction level";
|
||||
let description = [{
|
||||
This profile-design-space pass will profile the partial design space and
|
||||
output clock cycle and resource utilization estimation results.
|
||||
}];
|
||||
|
||||
let constructor = "mlir::scalehls::createProfileDesignSpacePass()";
|
||||
|
||||
let options = [
|
||||
Option<"targetSpec", "target-spec", "std::string",
|
||||
/*default=*/"\"../config/target-spec.ini\"",
|
||||
"File path: target backend specifications and configurations">,
|
||||
Option<"outputFile", "output-file", "std::string",
|
||||
/*default=*/"\"-\"", "File path: the output file path of profiling">,
|
||||
Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"1",
|
||||
"Positive number: the maximum tiling parallelism of the profiling">
|
||||
/*default=*/"\"-\"", "File path: the output file path">,
|
||||
Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"16",
|
||||
"Positive number: the maximum parallelism of initialization">
|
||||
];
|
||||
}
|
||||
|
||||
|
|
|
@ -30,21 +30,21 @@ bool applyRemoveVariableBound(AffineLoopBand &band);
|
|||
/// passed in because the post-tiling optimizations have to take function as
|
||||
/// target, e.g. canonicalizer and array partition.
|
||||
bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
|
||||
int64_t targetII);
|
||||
unsigned targetII);
|
||||
|
||||
/// Apply optimization strategy to a function.
|
||||
bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
|
||||
ArrayRef<int64_t> targetIIs);
|
||||
ArrayRef<unsigned> targetIIs);
|
||||
|
||||
/// Apply loop tiling to the input loop band and return the location of the
|
||||
/// original innermost loop in the tiled loop band. If tile is failed, -1 will
|
||||
/// be returned.
|
||||
int64_t applyLoopTiling(AffineLoopBand &band, TileList tileList);
|
||||
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
|
||||
/// the innermost loop with the original loop order. Return the location of the
|
||||
/// innermost tile-space loop.
|
||||
Optional<unsigned> applyLoopTiling(AffineLoopBand &band, TileList tileList);
|
||||
|
||||
/// Apply loop pipelining to the pipelineLoc of the input loop band, all inner
|
||||
/// loops are automatically fully unrolled.
|
||||
bool applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc,
|
||||
int64_t targetII);
|
||||
bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
|
||||
unsigned targetII);
|
||||
|
||||
/// Fully unroll all loops insides of a loop block.
|
||||
bool applyFullyLoopUnrolling(Block &block);
|
||||
|
|
|
@ -181,6 +181,7 @@ static unsigned getChildLoopNum(Operation *op) {
|
|||
|
||||
/// Get the whole loop band given the innermost loop and return it in "band".
|
||||
static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) {
|
||||
band.clear();
|
||||
AffineLoopBand reverseBand;
|
||||
|
||||
auto currentLoop = forOp;
|
||||
|
@ -204,6 +205,7 @@ static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) {
|
|||
/// Meanwhile, the return value is the innermost loop of this loop band.
|
||||
AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp,
|
||||
AffineLoopBand &band) {
|
||||
band.clear();
|
||||
auto currentLoop = forOp;
|
||||
while (true) {
|
||||
band.push_back(currentLoop);
|
||||
|
@ -222,6 +224,7 @@ AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp,
|
|||
/// loops are collected.
|
||||
void scalehls::getLoopBands(Block &block, AffineLoopBands &bands,
|
||||
bool allowHavingChilds) {
|
||||
bands.clear();
|
||||
block.walk([&](AffineForOp loop) {
|
||||
auto childNum = getChildLoopNum(loop);
|
||||
|
||||
|
|
|
@ -32,8 +32,8 @@ bool scalehls::applyFullyLoopUnrolling(Block &block) {
|
|||
|
||||
/// Apply loop pipelining to the input loop, all inner loops are automatically
|
||||
/// fully unrolled.
|
||||
bool scalehls::applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc,
|
||||
int64_t targetII) {
|
||||
bool scalehls::applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
|
||||
unsigned targetII) {
|
||||
auto targetLoop = band[pipelineLoc];
|
||||
|
||||
// All inner loops of the pipelined loop are automatically unrolled.
|
||||
|
|
|
@ -12,61 +12,57 @@
|
|||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
/// Apply loop tiling to the input loop band and return the location of the
|
||||
/// original innermost loop in the tiled loop band. If tile is failed, -1 will
|
||||
/// be returned.
|
||||
int64_t scalehls::applyLoopTiling(AffineLoopBand &band, TileList tileList) {
|
||||
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
|
||||
/// the innermost loop with the original loop order. Return the location of the
|
||||
/// innermost tile-space loop.
|
||||
Optional<unsigned> scalehls::applyLoopTiling(AffineLoopBand &band,
|
||||
TileList tileList) {
|
||||
if (!isPerfectlyNested(band))
|
||||
return -1;
|
||||
|
||||
// Collect each loop location that is fully tiled and can be eliminated.
|
||||
SmallVector<unsigned, 8> fullyTiledLoops;
|
||||
unsigned pipelineLoc = 0;
|
||||
unsigned loc = 0;
|
||||
for (auto loop : band) {
|
||||
if (auto tripCount = getConstantTripCount(loop)) {
|
||||
if (tripCount.getValue() == tileList[loc])
|
||||
fullyTiledLoops.push_back(loc);
|
||||
else
|
||||
pipelineLoc = loc;
|
||||
} else
|
||||
return -1;
|
||||
++loc;
|
||||
}
|
||||
|
||||
// If all loops are fully tiled, keep the last loop untouched.
|
||||
if (fullyTiledLoops.size() == band.size()) {
|
||||
fullyTiledLoops.pop_back();
|
||||
pipelineLoc = band.size() - 1;
|
||||
}
|
||||
return Optional<unsigned>();
|
||||
|
||||
// Loop tiling.
|
||||
AffineLoopBand tiledBand;
|
||||
if (failed(tilePerfectlyNested(band, tileList, &tiledBand)))
|
||||
return -1;
|
||||
band = tiledBand;
|
||||
return Optional<unsigned>();
|
||||
|
||||
auto builder = OpBuilder(band.back());
|
||||
// Record the band size and clear the original loop band.
|
||||
auto originalBandSize = band.size();
|
||||
band.clear();
|
||||
|
||||
// Remove fully tiled loops.
|
||||
for (auto loc : fullyTiledLoops) {
|
||||
auto loop = band[loc];
|
||||
// Remove redundant loops in the tiled loop band.
|
||||
auto builder = OpBuilder(tiledBand.back());
|
||||
unsigned erasedLoopNum = 0;
|
||||
unsigned loc = 0;
|
||||
|
||||
// Create an affine apply operation generating a constant zero.
|
||||
builder.setInsertionPoint(loop);
|
||||
auto constZero = builder.create<AffineApplyOp>(
|
||||
loop.getLoc(), builder.getConstantAffineMap(0), ValueRange({}));
|
||||
loop.getInductionVar().replaceAllUsesWith(constZero);
|
||||
for (auto loop : tiledBand) {
|
||||
if (erasedLoopNum >= originalBandSize - 1 || loc >= originalBandSize ||
|
||||
getConstantTripCount(loop).getValue() > 1) {
|
||||
// All tile-space loops which have a trip count larger than 1 and all
|
||||
// intra-tile loops are pushed back. Meanwhile, we are not willing to see
|
||||
// all tile-space loops removed since in that case many analysis and
|
||||
// transforms will become very hard. Thereby we record the number of
|
||||
// erased loop so far and always keep at least one tile-space loop
|
||||
// remained in the loop band even if it has a trip count of 1.
|
||||
band.push_back(loop);
|
||||
} else {
|
||||
// Create an affine apply operation to represent the lower bound.
|
||||
builder.setInsertionPoint(loop);
|
||||
auto newIterVar = builder.create<AffineApplyOp>(
|
||||
loop.getLoc(), loop.getLowerBoundMap(), loop.getLowerBoundOperands());
|
||||
loop.getInductionVar().replaceAllUsesWith(newIterVar);
|
||||
|
||||
// Move all operation except the terminator to the outside.
|
||||
auto &parentBlock = loop->getBlock()->getOperations();
|
||||
auto &loopBlock = loop.getBody()->getOperations();
|
||||
parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(),
|
||||
std::prev(loopBlock.end()));
|
||||
loop.erase();
|
||||
// Move all operation except the terminator to the outside.
|
||||
auto &parentBlock = loop->getBlock()->getOperations();
|
||||
auto &loopBlock = loop.getBody()->getOperations();
|
||||
parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(),
|
||||
std::prev(loopBlock.end()));
|
||||
loop.erase();
|
||||
++erasedLoopNum;
|
||||
}
|
||||
++loc;
|
||||
}
|
||||
|
||||
return pipelineLoc;
|
||||
return band.size() - originalBandSize - 1;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
|
|
@ -6,9 +6,12 @@
|
|||
|
||||
#include "scalehls/Transforms/MultipleLevelDSE.h"
|
||||
#include "mlir/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Support/FileUtilities.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include <numeric>
|
||||
#include <pthread.h>
|
||||
|
||||
#define DEBUG_TYPE "scalehls"
|
||||
|
@ -16,26 +19,12 @@
|
|||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Helper methods
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Recursively compute the parallelism nested inside of "forOp". Each affine
/// loop found directly in the body of "forOp" contributes its own inner
/// parallelism, multiplied by its trip count when that trip count is a known
/// constant. The result is never smaller than one.
static int64_t getInnerParallelism(AffineForOp forOp) {
  int64_t total = 0;

  for (auto childLoop : forOp.getOps<AffineForOp>()) {
    int64_t contribution = getInnerParallelism(childLoop);

    // Only scale by the trip count when it is statically known.
    if (auto tripCount = getConstantTripCount(childLoop))
      contribution *= tripCount.getValue();

    total += contribution;
  }

  // An innermost loop has no child loops, in which case one is reported.
  return total < 1 ? (int64_t)1 : total;
}
|
||||
|
||||
using TileConfig = unsigned;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Helper Methods and Classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace {
|
||||
struct DesignPoint {
|
||||
public:
|
||||
|
@ -122,10 +111,47 @@ public:
|
|||
return sqrtf(distanceSquare);
|
||||
}
|
||||
|
||||
void findParetoFrontiers() {}
|
||||
/// Update paretoPoints to remove design points that are not pareto frontiers.
|
||||
void updateParetoPoints() {
|
||||
// Sort the pareto points with in an ascending order of latency and the an
|
||||
// ascending order of dsp number.
|
||||
auto latencyThenDspNum = [&](const DesignPoint &a, const DesignPoint &b) {
|
||||
return (a.latency < b.latency ||
|
||||
(a.latency == b.latency && a.dspNum < b.dspNum));
|
||||
};
|
||||
std::sort(paretoPoints.begin(), paretoPoints.end(), latencyThenDspNum);
|
||||
|
||||
// Find pareto frontiers. After the sorting, the first design point must be
|
||||
// a pareto point.
|
||||
auto paretoPoint = paretoPoints[0];
|
||||
auto paretoLatency = paretoPoint.latency;
|
||||
auto paretoDspNum = paretoPoint.dspNum;
|
||||
SmallVector<DesignPoint, 16> frontiers;
|
||||
|
||||
for (auto point : paretoPoints) {
|
||||
auto tmpLatency = point.latency;
|
||||
auto tmpDspNum = point.dspNum;
|
||||
|
||||
if (tmpDspNum < paretoDspNum) {
|
||||
frontiers.push_back(point);
|
||||
|
||||
paretoPoint = point;
|
||||
paretoLatency = tmpLatency;
|
||||
paretoDspNum = tmpDspNum;
|
||||
|
||||
} else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency)
|
||||
frontiers.push_back(point);
|
||||
}
|
||||
|
||||
paretoPoints = frontiers;
|
||||
}
|
||||
|
||||
/// Evaluate all design points under the given tile config.
|
||||
bool evaluateTileConfig(TileConfig config) {
|
||||
// If the current tile config is already estimated, return true.
|
||||
if (!unestimatedTileConfigs.count(config))
|
||||
return true;
|
||||
|
||||
/// Explore all design points under the given tile config.
|
||||
bool exploreTileConfig(TileConfig config) {
|
||||
// Clone a temporary loop band by cloning the outermost loop.
|
||||
auto tmpOuterLoop = band.front().clone();
|
||||
AffineLoopBand tmpBand;
|
||||
|
@ -149,7 +175,7 @@ public:
|
|||
|
||||
// Apply the current tiling config and start the estimation. Note that after
|
||||
// optimization, tmpBand is optimized in place and becomes a new loop band.
|
||||
if (!applyOptStrategy(tmpBand, func, tileList, 1))
|
||||
if (!applyOptStrategy(tmpBand, func, tileList, (unsigned)1))
|
||||
return false;
|
||||
tmpOuterLoop = tmpBand.front();
|
||||
estimator.estimateLoop(tmpOuterLoop);
|
||||
|
@ -173,17 +199,63 @@ public:
|
|||
paretoPoints.push_back(point);
|
||||
}
|
||||
|
||||
// Erase the temporary loop band and return.
|
||||
// Erase the temporary loop band and annotate the current tile config as
|
||||
// estimated.
|
||||
tmpOuterLoop.erase();
|
||||
unestimatedTileConfigs.erase(config);
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Initialize the design space.
|
||||
void initializeDesignSpace(unsigned maxInitializeParallel) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "3.1 Initializing the design space...\n";);
|
||||
|
||||
for (TileConfig config = 0; config < validTileConfigNum; ++config) {
|
||||
unestimatedTileConfigs.erase(config);
|
||||
auto tileList = getTileList(config);
|
||||
|
||||
// We only evaluate the design points whose overall parallelism is smaller
|
||||
// than the maxInitializeParallel to improve the efficiency.
|
||||
auto parallel = std::accumulate(tileList.begin(), tileList.end(),
|
||||
(unsigned)1, std::multiplies<unsigned>());
|
||||
if (parallel > maxInitializeParallel)
|
||||
continue;
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << config << ",");
|
||||
evaluateTileConfig(config);
|
||||
}
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << "\n\n");
|
||||
updateParetoPoints();
|
||||
}
|
||||
|
||||
/// Dump pareto and non-pareto points which have been evaluated in the design
|
||||
/// space to a csv output file.
|
||||
void dumpDesignSpace(raw_ostream &os) {
|
||||
// Print header row.
|
||||
for (unsigned i = 0; i < tripCountList.size(); ++i)
|
||||
os << "l" << i << ",";
|
||||
os << "ii,cycle,dsp,type\n";
|
||||
|
||||
// Print pareto design points.
|
||||
for (auto point : paretoPoints) {
|
||||
for (auto size : getTileList(point.tileConfig))
|
||||
os << size << ",";
|
||||
os << point.targetII << "," << point.latency << "," << point.dspNum
|
||||
<< ",pareto\n";
|
||||
}
|
||||
|
||||
// Print all design points.
|
||||
for (auto point : allPoints) {
|
||||
for (auto size : getTileList(point.tileConfig))
|
||||
os << size << ",";
|
||||
os << point.targetII << "," << point.latency << "," << point.dspNum
|
||||
<< ",non-pareto\n";
|
||||
}
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << "Design space is dumped to output file.\n\n");
|
||||
}
|
||||
|
||||
/// Stores current pareto frontiers and all evaluated design points.
|
||||
SmallVector<DesignPoint, 16> paretoPoints;
|
||||
SmallVector<DesignPoint, 16> allPoints;
|
||||
|
||||
|
@ -208,25 +280,37 @@ public:
|
|||
};
|
||||
} // namespace
|
||||
|
||||
/// Recursively compute the parallelism nested inside of "forOp". Each affine
/// loop found directly in the body of "forOp" contributes its own inner
/// parallelism, multiplied by its trip count when that trip count is a known
/// constant. The result is never smaller than one.
static int64_t getInnerParallelism(AffineForOp forOp) {
  int64_t total = 0;

  for (auto childLoop : forOp.getOps<AffineForOp>()) {
    int64_t contribution = getInnerParallelism(childLoop);

    // Only scale by the trip count when it is statically known.
    if (auto tripCount = getConstantTripCount(childLoop))
      contribution *= tripCount.getValue();

    total += contribution;
  }

  // An innermost loop has no child loops, in which case one is reported.
  return total < 1 ? (int64_t)1 : total;
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Optimizer Class Definition
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Print "message" followed by the "latency" and "dsp" integer attributes of
/// "targetFunc" to the debug stream. Only active when debug output for this
/// DEBUG_TYPE is enabled.
void ScaleHLSOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
  LLVM_DEBUG({
    auto currentLatency = getIntAttrValue(targetFunc, "latency");
    auto currentDsp = getIntAttrValue(targetFunc, "dsp");

    auto &debugStream = llvm::dbgs();
    debugStream << message << "\n";
    debugStream << "Current latency is " << Twine(currentLatency)
                << ", DSP utilization is " << Twine(currentDsp) << ".\n\n";
  });
}
|
||||
|
||||
/// This is a temporary approach that does not scale.
|
||||
void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
|
||||
void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func, raw_ostream &os,
|
||||
unsigned maxInitializeParallel) {
|
||||
estimator.estimateFunc(func);
|
||||
if (getIntAttrValue(func, "dsp") > numDSP)
|
||||
return;
|
||||
emitDebugInfo(func, "Start multiple level design space exploration.");
|
||||
|
||||
LLVM_DEBUG(auto latency = getIntAttrValue(func, "latency");
|
||||
auto dspNum = getIntAttrValue(func, "dsp");
|
||||
|
||||
llvm::dbgs() << "\nStart the design space exploration.\n";
|
||||
llvm::dbgs() << "Initial clock cycle is " << Twine(latency)
|
||||
<< ", DSP usage is " << Twine(dspNum) << ".\n\n";);
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// STAGE 1: Simplify Loop Nests Structure
|
||||
|
@ -331,7 +415,7 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
|
|||
estimator.estimateFunc(func);
|
||||
if (getIntAttrValue(func, "dsp") > numDSP)
|
||||
return;
|
||||
emitDebugInfo(func, "1. Simplify loop nests structure.");
|
||||
LLVM_DEBUG(llvm::dbgs() << "1. Simplify loop nests structure.\n\n");
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// STAGE 2: Loop Bands Optimization
|
||||
|
@ -357,21 +441,19 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
|
|||
estimator.estimateFunc(func);
|
||||
if (getIntAttrValue(func, "dsp") > numDSP)
|
||||
return;
|
||||
emitDebugInfo(func, "2. Apply loop perfection, remove variable bound, and "
|
||||
"loop order opt.");
|
||||
LLVM_DEBUG(llvm::dbgs() << "2. Apply loop perfection, loop order opt, and "
|
||||
"remove variable loop bound.\n\n");
|
||||
|
||||
//===--------------------------------------------------------------------===//
|
||||
// STAGE 3: Loop Bands Tiling and Finalization
|
||||
// STAGE 3: Search for pareto frontiers
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << "3. Search for pareto design points...\n\n";);
|
||||
|
||||
for (unsigned i = 0; i < targetNum; ++i) {
|
||||
auto space = DesignSpace(func, targetBands[i], estimator);
|
||||
space.exploreTileConfig(1);
|
||||
|
||||
for (auto point : space.paretoPoints) {
|
||||
llvm::outs() << "latency: " << point.latency
|
||||
<< ", dsp_num: " << point.dspNum
|
||||
<< ", ii: " << point.targetII << "\n";
|
||||
}
|
||||
space.initializeDesignSpace(maxInitializeParallel);
|
||||
space.dumpDesignSpace(os);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -392,6 +474,12 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
|
|||
getLatencyMap(spec, latencyMap);
|
||||
int64_t numDSP = ceil(spec.GetInteger("specification", "dsp", 220) * 1.1);
|
||||
|
||||
// Parse output file.
|
||||
std::string errorMessage;
|
||||
auto output = mlir::openOutputFile(outputFile, &errorMessage);
|
||||
if (!output)
|
||||
emitError(module.getLoc(), errorMessage);
|
||||
|
||||
// Initialize an performance and resource estimator.
|
||||
auto estimator = ScaleHLSEstimator(builder, latencyMap);
|
||||
auto optimizer = ScaleHLSOptimizer(builder, estimator, numDSP);
|
||||
|
@ -400,7 +488,9 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
|
|||
for (auto func : module.getOps<FuncOp>())
|
||||
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
||||
if (topFunction.getValue())
|
||||
optimizer.applyMultipleLevelDSE(func);
|
||||
optimizer.applyMultipleLevelDSE(func, output->os(), maxParallel);
|
||||
|
||||
output->keep();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
|
|
@ -1,201 +0,0 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Copyright 2020-2021 The ScaleHLS Authors.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Support/FileUtilities.h"
|
||||
#include "scalehls/Analysis/QoREstimation.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
|
||||
#define DEBUG_TYPE "scalehls"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
/// Currently only support single loop band profiling.
|
||||
static void applyProfiling(FuncOp func, raw_ostream &os,
|
||||
ScaleHLSEstimator &estimator, unsigned maxParallel) {
|
||||
if (!dyn_cast<AffineForOp>(func.front().front())) {
|
||||
func.emitError("first operation is not loop");
|
||||
return;
|
||||
}
|
||||
|
||||
// Helper function for fetching the target loop band.
|
||||
auto getFirstBand = [&](FuncOp targetFunc) {
|
||||
// Get the first loop band as target.
|
||||
auto target = dyn_cast<AffineForOp>(targetFunc.front().front());
|
||||
AffineLoopBand band;
|
||||
getLoopBandFromOutermost(target, band);
|
||||
return band;
|
||||
};
|
||||
|
||||
// Perfect and optimize loop order of the target loop band.
|
||||
auto band = getFirstBand(func);
|
||||
auto loopNum = band.size();
|
||||
applyAffineLoopPerfection(band);
|
||||
applyAffineLoopOrderOpt(band);
|
||||
applyRemoveVariableBound(band);
|
||||
|
||||
// Initialize tile size and trip count vector.
|
||||
auto tileList = TileList(loopNum, 1);
|
||||
auto tripCounts = TileList();
|
||||
unsigned iterations = 1;
|
||||
for (unsigned loc = 0; loc < loopNum; ++loc) {
|
||||
auto tripCount = getConstantTripCount(band[loc]).getValue();
|
||||
tripCounts.push_back(tripCount);
|
||||
iterations *= (log2(tripCount) + 1);
|
||||
os << "l" << loc << ",";
|
||||
}
|
||||
os << "ii,cycle,dsp,pareto\n";
|
||||
|
||||
// Storing all design points.
|
||||
using DesignPoint = SmallVector<int64_t, 8>;
|
||||
std::vector<DesignPoint> designPoints;
|
||||
|
||||
// Traverse each tile size configuration.
|
||||
for (unsigned i = 0; i < iterations - 1; ++i) {
|
||||
for (unsigned loc = 0; loc < loopNum; ++loc) {
|
||||
auto &tileSize = tileList[loc];
|
||||
if (loc == 0)
|
||||
tileSize *= 2;
|
||||
else if (tileList[loc - 1] > tripCounts[loc - 1])
|
||||
tileSize *= 2;
|
||||
}
|
||||
|
||||
unsigned iterNum = 1;
|
||||
unsigned parallel = 1;
|
||||
for (unsigned loc = 0; loc < loopNum; ++loc) {
|
||||
auto &tileSize = tileList[loc];
|
||||
if (tileSize > tripCounts[loc])
|
||||
tileSize = 1;
|
||||
iterNum *= tripCounts[loc] / tileSize;
|
||||
parallel *= tileSize;
|
||||
LLVM_DEBUG(llvm::dbgs() << tileSize << ", ";);
|
||||
}
|
||||
LLVM_DEBUG(llvm::dbgs() << "\n";);
|
||||
|
||||
if (parallel > maxParallel)
|
||||
continue;
|
||||
|
||||
// Apply tiling strategy.
|
||||
auto tmpFunc = func.clone();
|
||||
applyOptStrategy(tmpFunc, tileList, 1);
|
||||
estimator.estimateFunc(tmpFunc);
|
||||
auto tmpLoop = getFirstBand(tmpFunc).back();
|
||||
|
||||
// Fetch latency and resource utilization.
|
||||
auto II = estimator.getIntAttrValue(tmpLoop, "ii");
|
||||
auto iterLatency = estimator.getIntAttrValue(tmpLoop, "iter_latency");
|
||||
auto shareDspNum = estimator.getIntAttrValue(tmpLoop, "share_dsp");
|
||||
auto noShareDspNum = estimator.getIntAttrValue(tmpLoop, "noshare_dsp");
|
||||
|
||||
// Improve target II until II is equal to iteration latency.
|
||||
for (auto tmpII = II; tmpII <= iterLatency; ++tmpII) {
|
||||
auto tmpDspNum = std::max(shareDspNum, noShareDspNum / tmpII);
|
||||
auto tmpLatency = iterLatency + tmpII * (iterNum - 1);
|
||||
|
||||
auto point = SmallVector<int64_t, 8>(tileList.begin(), tileList.end());
|
||||
point.append({tmpII, tmpLatency, tmpDspNum});
|
||||
designPoints.push_back(point);
|
||||
|
||||
if (iterNum == 1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort all design points by latency.
|
||||
auto compareLatency = [&](const DesignPoint &a, const DesignPoint &b) {
|
||||
return a[loopNum + 1] < b[loopNum + 1];
|
||||
};
|
||||
std::sort(designPoints.begin(), designPoints.end(), compareLatency);
|
||||
|
||||
// Sort all design points with the same latency by dsp number.
|
||||
auto compareDspNum = [&](const DesignPoint &a, const DesignPoint &b) {
|
||||
return a[loopNum + 2] < b[loopNum + 2];
|
||||
};
|
||||
for (auto i = designPoints.begin(); i < designPoints.end();) {
|
||||
auto j = i;
|
||||
for (; j < designPoints.end(); ++j)
|
||||
if ((*i)[loopNum + 1] != (*j)[loopNum + 1])
|
||||
break;
|
||||
std::sort(i, j, compareDspNum);
|
||||
i = j;
|
||||
}
|
||||
|
||||
// Find pareto frontiers. After the sorting, the first design point must be a
|
||||
// pareto point.
|
||||
auto paretoPoint = designPoints[0];
|
||||
auto paretoLatency = paretoPoint[loopNum + 1];
|
||||
auto paretoDspNum = paretoPoint[loopNum + 2];
|
||||
std::vector<DesignPoint> paretoPoints;
|
||||
|
||||
for (auto point : designPoints) {
|
||||
auto tmpLatency = point[loopNum + 1];
|
||||
auto tmpDspNum = point[loopNum + 2];
|
||||
|
||||
if (tmpDspNum < paretoDspNum) {
|
||||
paretoPoints.push_back(point);
|
||||
paretoPoint = point;
|
||||
paretoLatency = tmpLatency;
|
||||
paretoDspNum = tmpDspNum;
|
||||
} else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency)
|
||||
paretoPoints.push_back(point);
|
||||
}
|
||||
|
||||
// Print all pareto design points.
|
||||
for (auto point : paretoPoints) {
|
||||
for (auto element : point)
|
||||
os << element << ",";
|
||||
os << "pareto\n";
|
||||
}
|
||||
|
||||
// Print all design points.
|
||||
for (auto point : designPoints) {
|
||||
for (auto element : point)
|
||||
os << element << ",";
|
||||
os << "non-pareto\n";
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct ProfileDesignSpace : public ProfileDesignSpaceBase<ProfileDesignSpace> {
|
||||
void runOnOperation() override {
|
||||
auto module = getOperation();
|
||||
auto builder = Builder(module);
|
||||
|
||||
// Read configuration file.
|
||||
INIReader spec(targetSpec);
|
||||
if (spec.ParseError())
|
||||
emitError(module.getLoc(), "target spec file parse fail\n");
|
||||
|
||||
// Collect profiling latency data, where default values are based on Xilinx
|
||||
// PYNQ-Z1 board.
|
||||
LatencyMap latencyMap;
|
||||
getLatencyMap(spec, latencyMap);
|
||||
|
||||
// Initialize an performance and resource estimator.
|
||||
auto estimator = ScaleHLSEstimator(builder, latencyMap);
|
||||
|
||||
// Optimize the top function.
|
||||
for (auto func : module.getOps<FuncOp>())
|
||||
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
||||
if (topFunction.getValue()) {
|
||||
std::string errorMessage;
|
||||
auto output = mlir::openOutputFile(outputFile, &errorMessage);
|
||||
if (!output)
|
||||
emitError(module.getLoc(), errorMessage);
|
||||
|
||||
applyProfiling(func, output->os(), estimator, maxParallel);
|
||||
output->keep();
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/// Create a new instance of the ProfileDesignSpace pass.
std::unique_ptr<Pass> scalehls::createProfileDesignSpacePass() {
|
||||
return std::make_unique<ProfileDesignSpace>();
|
||||
}
|
|
@ -33,18 +33,18 @@ static void addPassPipeline(PassManager &pm) {
|
|||
/// passed in because the post-tiling optimizations have to take function as
|
||||
/// target, e.g. canonicalizer and array partition.
|
||||
bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,
|
||||
TileList tileList, int64_t targetII) {
|
||||
TileList tileList, unsigned targetII) {
|
||||
// By design the input function must be the ancestor of the input loop band.
|
||||
if (!func->isProperAncestor(band.front()))
|
||||
return false;
|
||||
|
||||
// Apply loop tiling.
|
||||
auto pipelineLoc = applyLoopTiling(band, tileList);
|
||||
if (pipelineLoc == -1)
|
||||
if (!pipelineLoc)
|
||||
return false;
|
||||
|
||||
// Apply loop pipelining.
|
||||
if (!applyLoopPipelining(band, pipelineLoc, targetII))
|
||||
if (!applyLoopPipelining(band, pipelineLoc.getValue(), targetII))
|
||||
return false;
|
||||
|
||||
// Apply general optimizations and array partition.
|
||||
|
@ -59,17 +59,17 @@ bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,
|
|||
|
||||
/// Apply optimization strategy to a function.
|
||||
bool scalehls::applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
|
||||
ArrayRef<int64_t> targetIIs) {
|
||||
ArrayRef<unsigned> targetIIs) {
|
||||
AffineLoopBands bands;
|
||||
getLoopBands(func.front(), bands);
|
||||
|
||||
// Apply loop tiling and pipelining to all loop bands.
|
||||
for (unsigned i = 0, e = bands.size(); i < e; ++i) {
|
||||
auto pipelineLoc = applyLoopTiling(bands[i], tileLists[i]);
|
||||
if (pipelineLoc == -1)
|
||||
if (!pipelineLoc)
|
||||
return false;
|
||||
|
||||
if (!applyLoopPipelining(bands[i], pipelineLoc, targetIIs[i]))
|
||||
if (!applyLoopPipelining(bands[i], pipelineLoc.getValue(), targetIIs[i]))
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
// RUN: scalehls-opt -profile-design-space="target-spec=../../config/target-spec.ini max-parallel=32" %s | FileCheck %s
|
||||
|
||||
// CHECK-LABEL: func @test
|
||||
// Function with an empty body (only a return), used as the input of this test.
func @test() {
|
||||
return
|
||||
}
|
Loading…
Reference in New Issue