[MultipleLevelDSE] support ds initialization and pareto frontier update; support ds dump; remove ProfileDesignSpace pass; [PartialAffineLoopTile] become fully in-place tiling

This commit is contained in:
Hanchen Ye 2021-02-12 14:54:36 -06:00
parent 390333e5ec
commit 3e3435dc84
11 changed files with 201 additions and 337 deletions

View File

@ -22,10 +22,9 @@ public:
int64_t numDSP)
: ScaleHLSAnalysisBase(builder), estimator(estimator), numDSP(numDSP) {}
void emitDebugInfo(FuncOp targetFunc, StringRef message);
/// This is a temporary approach that does not scale.
void applyMultipleLevelDSE(FuncOp func);
void applyMultipleLevelDSE(FuncOp func, raw_ostream &os,
unsigned maxInitializeParallel = 16);
ScaleHLSEstimator &estimator;
int64_t numDSP;

View File

@ -19,7 +19,6 @@ namespace scalehls {
/// Design space exploration pass.
std::unique_ptr<Pass> createMultipleLevelDSEPass();
std::unique_ptr<Pass> createProfileDesignSpacePass();
/// Dataflow optimization passes.
std::unique_ptr<Pass> createLegalizeDataflowPass();

View File

@ -25,30 +25,14 @@ def MultipleLevelDSE : Pass<"multiple-level-dse", "ModuleOp"> {
let constructor = "mlir::scalehls::createMultipleLevelDSEPass()";
let options = [
Option<"targetSpec", "target-spec", "std::string",
/*default=*/"\"../config/target-spec.ini\"",
"File path: target backend specifications and configurations">
];
}
def ProfileDesignSpace : Pass<"profile-design-space", "ModuleOp"> {
let summary = "Optimize HLS design at multiple abstraction level";
let description = [{
This profile-design-space pass will profile the partial design space and
output clock cycle and resource utilization estimation results.
}];
let constructor = "mlir::scalehls::createProfileDesignSpacePass()";
let options = [
Option<"targetSpec", "target-spec", "std::string",
/*default=*/"\"../config/target-spec.ini\"",
"File path: target backend specifications and configurations">,
Option<"outputFile", "output-file", "std::string",
/*default=*/"\"-\"", "File path: the output file path of profiling">,
Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"1",
"Positive number: the maximum tiling parallelism of the profiling">
/*default=*/"\"-\"", "File path: the output file path">,
Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"16",
"Positive number: the maximum parallelism of initialization">
];
}

View File

@ -30,21 +30,21 @@ bool applyRemoveVariableBound(AffineLoopBand &band);
/// passed in because the post-tiling optimizations have to take function as
/// target, e.g. canonicalizer and array partition.
bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
int64_t targetII);
unsigned targetII);
/// Apply optimization strategy to a function.
bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
ArrayRef<int64_t> targetIIs);
ArrayRef<unsigned> targetIIs);
/// Apply loop tiling to the input loop band and return the location of the
/// original innermost loop in the tiled loop band. If tile is failed, -1 will
/// be returned.
int64_t applyLoopTiling(AffineLoopBand &band, TileList tileList);
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
/// the innermost loop with the original loop order. Return the location of the
/// innermost tile-space loop.
Optional<unsigned> applyLoopTiling(AffineLoopBand &band, TileList tileList);
/// Apply loop pipelining to the pipelineLoc of the input loop band, all inner
/// loops are automatically fully unrolled.
bool applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc,
int64_t targetII);
bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
unsigned targetII);
/// Fully unroll all loops insides of a loop block.
bool applyFullyLoopUnrolling(Block &block);

View File

@ -181,6 +181,7 @@ static unsigned getChildLoopNum(Operation *op) {
/// Get the whole loop band given the innermost loop and return it in "band".
static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) {
band.clear();
AffineLoopBand reverseBand;
auto currentLoop = forOp;
@ -204,6 +205,7 @@ static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) {
/// Meanwhile, the return value is the innermost loop of this loop band.
AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp,
AffineLoopBand &band) {
band.clear();
auto currentLoop = forOp;
while (true) {
band.push_back(currentLoop);
@ -222,6 +224,7 @@ AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp,
/// loops are collected.
void scalehls::getLoopBands(Block &block, AffineLoopBands &bands,
bool allowHavingChilds) {
bands.clear();
block.walk([&](AffineForOp loop) {
auto childNum = getChildLoopNum(loop);

View File

@ -32,8 +32,8 @@ bool scalehls::applyFullyLoopUnrolling(Block &block) {
/// Apply loop pipelining to the input loop, all inner loops are automatically
/// fully unrolled.
bool scalehls::applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc,
int64_t targetII) {
bool scalehls::applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
unsigned targetII) {
auto targetLoop = band[pipelineLoc];
// All inner loops of the pipelined loop are automatically unrolled.

View File

@ -12,61 +12,57 @@
using namespace mlir;
using namespace scalehls;
/// Apply loop tiling to the input loop band and return the location of the
/// original innermost loop in the tiled loop band. If tile is failed, -1 will
/// be returned.
int64_t scalehls::applyLoopTiling(AffineLoopBand &band, TileList tileList) {
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
/// the innermost loop with the original loop order. Return the location of the
/// innermost tile-space loop.
Optional<unsigned> scalehls::applyLoopTiling(AffineLoopBand &band,
TileList tileList) {
if (!isPerfectlyNested(band))
return -1;
// Collect each loop location that is fully tiled and can be eliminated.
SmallVector<unsigned, 8> fullyTiledLoops;
unsigned pipelineLoc = 0;
unsigned loc = 0;
for (auto loop : band) {
if (auto tripCount = getConstantTripCount(loop)) {
if (tripCount.getValue() == tileList[loc])
fullyTiledLoops.push_back(loc);
else
pipelineLoc = loc;
} else
return -1;
++loc;
}
// If all loops are fully tiled, keep the last loop untouched.
if (fullyTiledLoops.size() == band.size()) {
fullyTiledLoops.pop_back();
pipelineLoc = band.size() - 1;
}
return Optional<unsigned>();
// Loop tiling.
AffineLoopBand tiledBand;
if (failed(tilePerfectlyNested(band, tileList, &tiledBand)))
return -1;
band = tiledBand;
return Optional<unsigned>();
auto builder = OpBuilder(band.back());
// Record the band size and clear the original loop band.
auto originalBandSize = band.size();
band.clear();
// Remove fully tiled loops.
for (auto loc : fullyTiledLoops) {
auto loop = band[loc];
// Remove redundant loops in the tiled loop band.
auto builder = OpBuilder(tiledBand.back());
unsigned erasedLoopNum = 0;
unsigned loc = 0;
// Create an affine apply operation generating a constant zero.
builder.setInsertionPoint(loop);
auto constZero = builder.create<AffineApplyOp>(
loop.getLoc(), builder.getConstantAffineMap(0), ValueRange({}));
loop.getInductionVar().replaceAllUsesWith(constZero);
for (auto loop : tiledBand) {
if (erasedLoopNum >= originalBandSize - 1 || loc >= originalBandSize ||
getConstantTripCount(loop).getValue() > 1) {
// All tile-space loops which have a trip count larger than 1 and all
// intra-tile loops are pushed back. Meanwhile, we are not willing to see
// all tile-space loops removed since in that case many analysis and
// transforms will become very hard. Thereby we record the number of
// erased loop so far and always keep at least one tile-space loop
// remained in the loop band even if it has a trip count of 1.
band.push_back(loop);
} else {
// Create an affine apply operation to represent the lower bound.
builder.setInsertionPoint(loop);
auto newIterVar = builder.create<AffineApplyOp>(
loop.getLoc(), loop.getLowerBoundMap(), loop.getLowerBoundOperands());
loop.getInductionVar().replaceAllUsesWith(newIterVar);
// Move all operation except the terminator to the outside.
auto &parentBlock = loop->getBlock()->getOperations();
auto &loopBlock = loop.getBody()->getOperations();
parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(),
std::prev(loopBlock.end()));
loop.erase();
// Move all operation except the terminator to the outside.
auto &parentBlock = loop->getBlock()->getOperations();
auto &loopBlock = loop.getBody()->getOperations();
parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(),
std::prev(loopBlock.end()));
loop.erase();
++erasedLoopNum;
}
++loc;
}
return pipelineLoc;
return band.size() - originalBandSize - 1;
}
namespace {

View File

@ -6,9 +6,12 @@
#include "scalehls/Transforms/MultipleLevelDSE.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Support/FileUtilities.h"
#include "scalehls/Transforms/Passes.h"
#include "scalehls/Transforms/Utils.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ToolOutputFile.h"
#include <numeric>
#include <pthread.h>
#define DEBUG_TYPE "scalehls"
@ -16,26 +19,12 @@
using namespace mlir;
using namespace scalehls;
//===----------------------------------------------------------------------===//
// Helper methods
//===----------------------------------------------------------------------===//
static int64_t getInnerParallelism(AffineForOp forOp) {
int64_t count = 0;
for (auto loop : forOp.getOps<AffineForOp>()) {
auto innerCount = getInnerParallelism(loop);
if (auto trip = getConstantTripCount(loop))
count += trip.getValue() * innerCount;
else
count += innerCount;
}
// If the current loop is innermost loop, count should be one.
return std::max(count, (int64_t)1);
}
using TileConfig = unsigned;
//===----------------------------------------------------------------------===//
// Helper Methods and Classes
//===----------------------------------------------------------------------===//
namespace {
struct DesignPoint {
public:
@ -122,10 +111,47 @@ public:
return sqrtf(distanceSquare);
}
void findParetoFrontiers() {}
/// Update paretoPoints in place, removing design points that are not pareto
/// frontiers, i.e. points dominated by another point with both lower latency
/// and lower DSP usage.
/// NOTE(review): assumes paretoPoints is non-empty — paretoPoints[0] below is
/// UB otherwise; confirm all callers guarantee at least one evaluated point.
void updateParetoPoints() {
  // Sort the pareto points in an ascending order of latency and then an
  // ascending order of dsp number.
  auto latencyThenDspNum = [&](const DesignPoint &a, const DesignPoint &b) {
    return (a.latency < b.latency ||
            (a.latency == b.latency && a.dspNum < b.dspNum));
  };
  std::sort(paretoPoints.begin(), paretoPoints.end(), latencyThenDspNum);
  // Find pareto frontiers. After the sorting, the first design point must be
  // a pareto point.
  auto paretoPoint = paretoPoints[0];
  auto paretoLatency = paretoPoint.latency;
  auto paretoDspNum = paretoPoint.dspNum;
  SmallVector<DesignPoint, 16> frontiers;
  for (auto point : paretoPoints) {
    auto tmpLatency = point.latency;
    auto tmpDspNum = point.dspNum;
    if (tmpDspNum < paretoDspNum) {
      // After sorting, latency is non-decreasing, so a strictly smaller DSP
      // number identifies a new trade-off point on the frontier.
      frontiers.push_back(point);
      paretoPoint = point;
      paretoLatency = tmpLatency;
      paretoDspNum = tmpDspNum;
    } else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency)
      // Keep ties that exactly match the current frontier point (this also
      // admits the first point of the sorted list).
      frontiers.push_back(point);
  }
  paretoPoints = frontiers;
}
/// Evaluate all design points under the given tile config.
bool evaluateTileConfig(TileConfig config) {
// If the current tile config is already estimated, return true.
if (!unestimatedTileConfigs.count(config))
return true;
/// Explore all design points under the given tile config.
bool exploreTileConfig(TileConfig config) {
// Clone a temporary loop band by cloning the outermost loop.
auto tmpOuterLoop = band.front().clone();
AffineLoopBand tmpBand;
@ -149,7 +175,7 @@ public:
// Apply the current tiling config and start the estimation. Note that after
// optimization, tmpBand is optimized in place and becomes a new loop band.
if (!applyOptStrategy(tmpBand, func, tileList, 1))
if (!applyOptStrategy(tmpBand, func, tileList, (unsigned)1))
return false;
tmpOuterLoop = tmpBand.front();
estimator.estimateLoop(tmpOuterLoop);
@ -173,17 +199,63 @@ public:
paretoPoints.push_back(point);
}
// Erase the temporary loop band and return.
// Erase the temporary loop band and annotate the current tile config as
// estimated.
tmpOuterLoop.erase();
unestimatedTileConfigs.erase(config);
return true;
}
/// Initialize the design space.
void initializeDesignSpace(unsigned maxInitializeParallel) {
LLVM_DEBUG(llvm::dbgs() << "3.1 Initializing the design space...\n";);
for (TileConfig config = 0; config < validTileConfigNum; ++config) {
unestimatedTileConfigs.erase(config);
auto tileList = getTileList(config);
// We only evaluate the design points whose overall parallelism is smaller
// than the maxInitializeParallel to improve the efficiency.
auto parallel = std::accumulate(tileList.begin(), tileList.end(),
(unsigned)1, std::multiplies<unsigned>());
if (parallel > maxInitializeParallel)
continue;
LLVM_DEBUG(llvm::dbgs() << config << ",");
evaluateTileConfig(config);
}
LLVM_DEBUG(llvm::dbgs() << "\n\n");
updateParetoPoints();
}
/// Dump pareto and non-pareto points which have been evaluated in the design
/// space to a csv output file.
void dumpDesignSpace(raw_ostream &os) {
  // Print header row: one tile-size column per loop in the band, followed by
  // the target II, estimated clock cycles, DSP usage, and point type.
  for (unsigned i = 0; i < tripCountList.size(); ++i)
    os << "l" << i << ",";
  os << "ii,cycle,dsp,type\n";
  // Print pareto design points.
  for (auto point : paretoPoints) {
    for (auto size : getTileList(point.tileConfig))
      os << size << ",";
    os << point.targetII << "," << point.latency << "," << point.dspNum
       << ",pareto\n";
  }
  // Print all design points.
  // NOTE(review): frontier points presumably also live in allPoints, so they
  // appear twice (once per label) — confirm this is intended by consumers of
  // the csv.
  for (auto point : allPoints) {
    for (auto size : getTileList(point.tileConfig))
      os << size << ",";
    os << point.targetII << "," << point.latency << "," << point.dspNum
       << ",non-pareto\n";
  }
  LLVM_DEBUG(llvm::dbgs() << "Design space is dumped to output file.\n\n");
}
/// Stores current pareto frontiers and all evaluated design points.
SmallVector<DesignPoint, 16> paretoPoints;
SmallVector<DesignPoint, 16> allPoints;
@ -208,25 +280,37 @@ public:
};
} // namespace
/// Recursively compute the amount of parallelism available inside "forOp":
/// for each child loop, its constant trip count (when known) multiplied by
/// the parallelism of its own body, summed over sibling loops. A loop without
/// a constant trip count contributes only its inner parallelism.
static int64_t getInnerParallelism(AffineForOp forOp) {
  int64_t count = 0;
  for (auto loop : forOp.getOps<AffineForOp>()) {
    auto innerCount = getInnerParallelism(loop);
    if (auto trip = getConstantTripCount(loop))
      count += trip.getValue() * innerCount;
    else
      count += innerCount;
  }
  // If the current loop is innermost loop, count should be one.
  return std::max(count, (int64_t)1);
}
//===----------------------------------------------------------------------===//
// Optimizer Class Definition
//===----------------------------------------------------------------------===//
/// Emit "message" to the debug stream together with the current "latency" and
/// "dsp" attribute values of "targetFunc". Entirely compiled out in release
/// builds (LLVM_DEBUG is a no-op without -debug).
void ScaleHLSOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
  LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
             auto dsp = getIntAttrValue(targetFunc, "dsp");
             llvm::dbgs() << message << "\n";
             llvm::dbgs() << "Current latency is " << Twine(latency)
                          << ", DSP utilization is " << Twine(dsp) << ".\n\n";);
}
/// This is a temporary approach that does not scale.
void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func, raw_ostream &os,
unsigned maxInitializeParallel) {
estimator.estimateFunc(func);
if (getIntAttrValue(func, "dsp") > numDSP)
return;
emitDebugInfo(func, "Start multiple level design space exploration.");
LLVM_DEBUG(auto latency = getIntAttrValue(func, "latency");
auto dspNum = getIntAttrValue(func, "dsp");
llvm::dbgs() << "\nStart the design space exploration.\n";
llvm::dbgs() << "Initial clock cycle is " << Twine(latency)
<< ", DSP usage is " << Twine(dspNum) << ".\n\n";);
//===--------------------------------------------------------------------===//
// STAGE 1: Simplify Loop Nests Structure
@ -331,7 +415,7 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
estimator.estimateFunc(func);
if (getIntAttrValue(func, "dsp") > numDSP)
return;
emitDebugInfo(func, "1. Simplify loop nests structure.");
LLVM_DEBUG(llvm::dbgs() << "1. Simplify loop nests structure.\n\n");
//===--------------------------------------------------------------------===//
// STAGE 2: Loop Bands Optimization
@ -357,21 +441,19 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
estimator.estimateFunc(func);
if (getIntAttrValue(func, "dsp") > numDSP)
return;
emitDebugInfo(func, "2. Apply loop perfection, remove variable bound, and "
"loop order opt.");
LLVM_DEBUG(llvm::dbgs() << "2. Apply loop perfection, loop order opt, and "
"remove variable loop bound.\n\n");
//===--------------------------------------------------------------------===//
// STAGE 3: Loop Bands Tiling and Finalization
// STAGE 3: Search for pareto frontiers
//===--------------------------------------------------------------------===//
LLVM_DEBUG(llvm::dbgs() << "3. Search for pareto design points...\n\n";);
for (unsigned i = 0; i < targetNum; ++i) {
auto space = DesignSpace(func, targetBands[i], estimator);
space.exploreTileConfig(1);
for (auto point : space.paretoPoints) {
llvm::outs() << "latency: " << point.latency
<< ", dsp_num: " << point.dspNum
<< ", ii: " << point.targetII << "\n";
}
space.initializeDesignSpace(maxInitializeParallel);
space.dumpDesignSpace(os);
}
}
@ -392,6 +474,12 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
getLatencyMap(spec, latencyMap);
int64_t numDSP = ceil(spec.GetInteger("specification", "dsp", 220) * 1.1);
// Parse output file.
std::string errorMessage;
auto output = mlir::openOutputFile(outputFile, &errorMessage);
if (!output)
emitError(module.getLoc(), errorMessage);
// Initialize an performance and resource estimator.
auto estimator = ScaleHLSEstimator(builder, latencyMap);
auto optimizer = ScaleHLSOptimizer(builder, estimator, numDSP);
@ -400,7 +488,9 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
for (auto func : module.getOps<FuncOp>())
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
if (topFunction.getValue())
optimizer.applyMultipleLevelDSE(func);
optimizer.applyMultipleLevelDSE(func, output->os(), maxParallel);
output->keep();
}
};
} // namespace

View File

@ -1,201 +0,0 @@
//===----------------------------------------------------------------------===//
//
// Copyright 2020-2021 The ScaleHLS Authors.
//
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Support/FileUtilities.h"
#include "scalehls/Analysis/QoREstimation.h"
#include "scalehls/Transforms/Passes.h"
#include "scalehls/Transforms/Utils.h"
#include "llvm/Support/ToolOutputFile.h"
#define DEBUG_TYPE "scalehls"
using namespace mlir;
using namespace scalehls;
/// Currently only support single loop band profiling: sweeps power-of-two
/// tile-size configurations of the first loop band of "func", estimates each
/// point with "estimator", and writes a csv of all points plus the pareto
/// frontier to "os". Configurations whose total parallelism exceeds
/// "maxParallel" are skipped.
static void applyProfiling(FuncOp func, raw_ostream &os,
                           ScaleHLSEstimator &estimator, unsigned maxParallel) {
  // The profiled band must be the very first operation of the function body.
  if (!dyn_cast<AffineForOp>(func.front().front())) {
    func.emitError("first operation is not loop");
    return;
  }
  // Helper function for fetching the target loop band.
  auto getFirstBand = [&](FuncOp targetFunc) {
    // Get the first loop band as target.
    auto target = dyn_cast<AffineForOp>(targetFunc.front().front());
    AffineLoopBand band;
    getLoopBandFromOutermost(target, band);
    return band;
  };
  // Perfect and optimize loop order of the target loop band.
  auto band = getFirstBand(func);
  auto loopNum = band.size();
  applyAffineLoopPerfection(band);
  applyAffineLoopOrderOpt(band);
  applyRemoveVariableBound(band);
  // Initialize tile size and trip count vector. Each loop admits
  // log2(tripCount) + 1 power-of-two tile sizes (1, 2, 4, ..., tripCount), so
  // "iterations" is the total number of tile configurations to enumerate.
  // NOTE(review): getConstantTripCount(...).getValue() asserts if a loop has
  // a non-constant trip count even after applyRemoveVariableBound — confirm.
  auto tileList = TileList(loopNum, 1);
  auto tripCounts = TileList();
  unsigned iterations = 1;
  for (unsigned loc = 0; loc < loopNum; ++loc) {
    auto tripCount = getConstantTripCount(band[loc]).getValue();
    tripCounts.push_back(tripCount);
    iterations *= (log2(tripCount) + 1);
    // Emit one csv header column per loop.
    os << "l" << loc << ",";
  }
  os << "ii,cycle,dsp,pareto\n";
  // Storing all design points. Each point is the tile sizes followed by
  // {II, latency, dspNum} at indices loopNum, loopNum+1, loopNum+2.
  using DesignPoint = SmallVector<int64_t, 8>;
  std::vector<DesignPoint> designPoints;
  // Traverse each tile size configuration.
  for (unsigned i = 0; i < iterations - 1; ++i) {
    // Advance tileList like a mixed-radix counter: double the first tile
    // size, carrying into the next loop once the previous one overflows its
    // trip count (overflowed sizes are reset to 1 below).
    for (unsigned loc = 0; loc < loopNum; ++loc) {
      auto &tileSize = tileList[loc];
      if (loc == 0)
        tileSize *= 2;
      else if (tileList[loc - 1] > tripCounts[loc - 1])
        tileSize *= 2;
    }
    unsigned iterNum = 1;
    unsigned parallel = 1;
    for (unsigned loc = 0; loc < loopNum; ++loc) {
      auto &tileSize = tileList[loc];
      // Wrap overflowed tile sizes back to 1 (completes the carry above).
      if (tileSize > tripCounts[loc])
        tileSize = 1;
      iterNum *= tripCounts[loc] / tileSize;
      parallel *= tileSize;
      LLVM_DEBUG(llvm::dbgs() << tileSize << ", ";);
    }
    LLVM_DEBUG(llvm::dbgs() << "\n";);
    // Skip configurations whose overall parallelism exceeds the budget.
    if (parallel > maxParallel)
      continue;
    // Apply tiling strategy. Work on a clone so the original function is
    // never mutated by the profiling.
    auto tmpFunc = func.clone();
    applyOptStrategy(tmpFunc, tileList, 1);
    estimator.estimateFunc(tmpFunc);
    auto tmpLoop = getFirstBand(tmpFunc).back();
    // Fetch latency and resource utilization.
    auto II = estimator.getIntAttrValue(tmpLoop, "ii");
    auto iterLatency = estimator.getIntAttrValue(tmpLoop, "iter_latency");
    auto shareDspNum = estimator.getIntAttrValue(tmpLoop, "share_dsp");
    auto noShareDspNum = estimator.getIntAttrValue(tmpLoop, "noshare_dsp");
    // Improve target II until II is equal to iteration latency. Each II value
    // trades DSP usage (resource sharing) against total latency.
    for (auto tmpII = II; tmpII <= iterLatency; ++tmpII) {
      auto tmpDspNum = std::max(shareDspNum, noShareDspNum / tmpII);
      auto tmpLatency = iterLatency + tmpII * (iterNum - 1);
      auto point = SmallVector<int64_t, 8>(tileList.begin(), tileList.end());
      point.append({tmpII, tmpLatency, tmpDspNum});
      designPoints.push_back(point);
      // With a single iteration, II does not affect latency: one point only.
      if (iterNum == 1)
        break;
    }
  }
  // Sort all design points by latency.
  auto compareLatency = [&](const DesignPoint &a, const DesignPoint &b) {
    return a[loopNum + 1] < b[loopNum + 1];
  };
  std::sort(designPoints.begin(), designPoints.end(), compareLatency);
  // Sort all design points with the same latency by dsp number.
  auto compareDspNum = [&](const DesignPoint &a, const DesignPoint &b) {
    return a[loopNum + 2] < b[loopNum + 2];
  };
  for (auto i = designPoints.begin(); i < designPoints.end();) {
    // [i, j) covers one run of points sharing the same latency.
    auto j = i;
    for (; j < designPoints.end(); ++j)
      if ((*i)[loopNum + 1] != (*j)[loopNum + 1])
        break;
    std::sort(i, j, compareDspNum);
    i = j;
  }
  // Find pareto frontiers. After the sorting, the first design point must be a
  // pareto point.
  // NOTE(review): designPoints[0] is UB when every configuration was skipped
  // by the maxParallel filter — confirm at least one point always survives.
  auto paretoPoint = designPoints[0];
  auto paretoLatency = paretoPoint[loopNum + 1];
  auto paretoDspNum = paretoPoint[loopNum + 2];
  std::vector<DesignPoint> paretoPoints;
  for (auto point : designPoints) {
    auto tmpLatency = point[loopNum + 1];
    auto tmpDspNum = point[loopNum + 2];
    if (tmpDspNum < paretoDspNum) {
      // Latency is non-decreasing, so a strictly smaller DSP number is a new
      // frontier point.
      paretoPoints.push_back(point);
      paretoPoint = point;
      paretoLatency = tmpLatency;
      paretoDspNum = tmpDspNum;
    } else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency)
      paretoPoints.push_back(point);
  }
  // Print all pareto design points.
  for (auto point : paretoPoints) {
    for (auto element : point)
      os << element << ",";
    os << "pareto\n";
  }
  // Print all design points.
  for (auto point : designPoints) {
    for (auto element : point)
      os << element << ",";
    os << "non-pareto\n";
  }
}
namespace {
/// Pass driver: reads the target specification, builds an estimator, and
/// profiles the design space of the module's top function, writing the
/// results to the configured output file.
struct ProfileDesignSpace : public ProfileDesignSpaceBase<ProfileDesignSpace> {
  void runOnOperation() override {
    auto module = getOperation();
    auto builder = Builder(module);
    // Read configuration file.
    // NOTE(review): emitError does not stop the pass — on a parse failure the
    // pass continues with INIReader defaults; confirm this is intended.
    INIReader spec(targetSpec);
    if (spec.ParseError())
      emitError(module.getLoc(), "target spec file parse fail\n");
    // Collect profiling latency data, where default values are based on Xilinx
    // PYNQ-Z1 board.
    LatencyMap latencyMap;
    getLatencyMap(spec, latencyMap);
    // Initialize a performance and resource estimator.
    auto estimator = ScaleHLSEstimator(builder, latencyMap);
    // Profile the top function (identified by the "top_function" attribute).
    for (auto func : module.getOps<FuncOp>())
      if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
        if (topFunction.getValue()) {
          std::string errorMessage;
          auto output = mlir::openOutputFile(outputFile, &errorMessage);
          if (!output)
            emitError(module.getLoc(), errorMessage);
          applyProfiling(func, output->os(), estimator, maxParallel);
          // Keep the output file on disk (ToolOutputFile deletes it
          // otherwise).
          output->keep();
        }
  }
};
} // namespace
/// Factory for the profile-design-space pass registered in Passes.td.
std::unique_ptr<Pass> scalehls::createProfileDesignSpacePass() {
  return std::make_unique<ProfileDesignSpace>();
}

View File

@ -33,18 +33,18 @@ static void addPassPipeline(PassManager &pm) {
/// passed in because the post-tiling optimizations have to take function as
/// target, e.g. canonicalizer and array partition.
bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,
TileList tileList, int64_t targetII) {
TileList tileList, unsigned targetII) {
// By design the input function must be the ancestor of the input loop band.
if (!func->isProperAncestor(band.front()))
return false;
// Apply loop tiling.
auto pipelineLoc = applyLoopTiling(band, tileList);
if (pipelineLoc == -1)
if (!pipelineLoc)
return false;
// Apply loop pipelining.
if (!applyLoopPipelining(band, pipelineLoc, targetII))
if (!applyLoopPipelining(band, pipelineLoc.getValue(), targetII))
return false;
// Apply general optimizations and array partition.
@ -59,17 +59,17 @@ bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,
/// Apply optimization strategy to a function.
bool scalehls::applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
ArrayRef<int64_t> targetIIs) {
ArrayRef<unsigned> targetIIs) {
AffineLoopBands bands;
getLoopBands(func.front(), bands);
// Apply loop tiling and pipelining to all loop bands.
for (unsigned i = 0, e = bands.size(); i < e; ++i) {
auto pipelineLoc = applyLoopTiling(bands[i], tileLists[i]);
if (pipelineLoc == -1)
if (!pipelineLoc)
return false;
if (!applyLoopPipelining(bands[i], pipelineLoc, targetIIs[i]))
if (!applyLoopPipelining(bands[i], pipelineLoc.getValue(), targetIIs[i]))
return false;
}

View File

@ -1,6 +0,0 @@
// RUN: scalehls-opt -profile-design-space="target-spec=../../config/target-spec.ini max-parallel=32" %s | FileCheck %s
// CHECK-LABEL: func @test
func @test() {
return
}