diff --git a/include/scalehls/Transforms/MultipleLevelDSE.h b/include/scalehls/Transforms/MultipleLevelDSE.h index 6eaec3d..24915db 100644 --- a/include/scalehls/Transforms/MultipleLevelDSE.h +++ b/include/scalehls/Transforms/MultipleLevelDSE.h @@ -22,10 +22,9 @@ public: int64_t numDSP) : ScaleHLSAnalysisBase(builder), estimator(estimator), numDSP(numDSP) {} - void emitDebugInfo(FuncOp targetFunc, StringRef message); - /// This is a temporary approach that does not scale. - void applyMultipleLevelDSE(FuncOp func); + void applyMultipleLevelDSE(FuncOp func, raw_ostream &os, + unsigned maxInitializeParallel = 16); ScaleHLSEstimator &estimator; int64_t numDSP; diff --git a/include/scalehls/Transforms/Passes.h b/include/scalehls/Transforms/Passes.h index 23808b1..ea95823 100644 --- a/include/scalehls/Transforms/Passes.h +++ b/include/scalehls/Transforms/Passes.h @@ -19,7 +19,6 @@ namespace scalehls { /// Design space exploration pass. std::unique_ptr createMultipleLevelDSEPass(); -std::unique_ptr createProfileDesignSpacePass(); /// Dataflow optimization passes. std::unique_ptr createLegalizeDataflowPass(); diff --git a/include/scalehls/Transforms/Passes.td b/include/scalehls/Transforms/Passes.td index a46763d..bdc459a 100644 --- a/include/scalehls/Transforms/Passes.td +++ b/include/scalehls/Transforms/Passes.td @@ -25,30 +25,14 @@ def MultipleLevelDSE : Pass<"multiple-level-dse", "ModuleOp"> { let constructor = "mlir::scalehls::createMultipleLevelDSEPass()"; - let options = [ - Option<"targetSpec", "target-spec", "std::string", - /*default=*/"\"../config/target-spec.ini\"", - "File path: target backend specifications and configurations"> - ]; -} - -def ProfileDesignSpace : Pass<"profile-design-space", "ModuleOp"> { - let summary = "Optimize HLS design at multiple abstraction level"; - let description = [{ - This profile-design-space pass will profile the partial design space and - output clock cycle and resource utilization estimation results. - }]; - - let constructor = "mlir::scalehls::createProfileDesignSpacePass()"; - let options = [ Option<"targetSpec", "target-spec", "std::string", /*default=*/"\"../config/target-spec.ini\"", "File path: target backend specifications and configurations">, Option<"outputFile", "output-file", "std::string", - /*default=*/"\"-\"", "File path: the output file path of profiling">, - Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"1", - "Positive number: the maximum tiling parallelism of the profiling"> + /*default=*/"\"-\"", "File path: the output file path">, + Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"16", + "Positive number: the maximum parallelism of initialization"> ]; } diff --git a/include/scalehls/Transforms/Utils.h b/include/scalehls/Transforms/Utils.h index 70aab2e..f1dbd24 100644 --- a/include/scalehls/Transforms/Utils.h +++ b/include/scalehls/Transforms/Utils.h @@ -30,21 +30,21 @@ bool applyRemoveVariableBound(AffineLoopBand &band); /// passed in because the post-tiling optimizations have to take function as /// target, e.g. canonicalizer and array partition. bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList, - int64_t targetII); + unsigned targetII); /// Apply optimization strategy to a function. bool applyOptStrategy(FuncOp func, ArrayRef tileLists, - ArrayRef targetIIs); + ArrayRef targetIIs); -/// Apply loop tiling to the input loop band and return the location of the -/// original innermost loop in the tiled loop band. If tile is failed, -1 will -/// be returned. -int64_t applyLoopTiling(AffineLoopBand &band, TileList tileList); +/// Apply loop tiling to the input loop band and sink all intra-tile loops to +/// the innermost loop with the original loop order. Return the location of the +/// innermost tile-space loop. +Optional applyLoopTiling(AffineLoopBand &band, TileList tileList); /// Apply loop pipelining to the pipelineLoc of the input loop band, all inner /// loops are automatically fully unrolled. -bool applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc, - int64_t targetII); +bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc, + unsigned targetII); /// Fully unroll all loops insides of a loop block. bool applyFullyLoopUnrolling(Block &block); diff --git a/lib/Analysis/Utils.cpp b/lib/Analysis/Utils.cpp index 48655a4..846a537 100644 --- a/lib/Analysis/Utils.cpp +++ b/lib/Analysis/Utils.cpp @@ -181,6 +181,7 @@ static unsigned getChildLoopNum(Operation *op) { /// Get the whole loop band given the innermost loop and return it in "band". static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) { + band.clear(); AffineLoopBand reverseBand; auto currentLoop = forOp; @@ -204,6 +205,7 @@ static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) { /// Meanwhile, the return value is the innermost loop of this loop band. AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp, AffineLoopBand &band) { + band.clear(); auto currentLoop = forOp; while (true) { band.push_back(currentLoop); @@ -222,6 +224,7 @@ AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp, /// loops are collected. void scalehls::getLoopBands(Block &block, AffineLoopBands &bands, bool allowHavingChilds) { + bands.clear(); block.walk([&](AffineForOp loop) { auto childNum = getChildLoopNum(loop); diff --git a/lib/Transforms/Directive/LoopPipelining.cpp b/lib/Transforms/Directive/LoopPipelining.cpp index cecda09..56f5bd2 100644 --- a/lib/Transforms/Directive/LoopPipelining.cpp +++ b/lib/Transforms/Directive/LoopPipelining.cpp @@ -32,8 +32,8 @@ bool scalehls::applyFullyLoopUnrolling(Block &block) { /// Apply loop pipelining to the input loop, all inner loops are automatically /// fully unrolled. -bool scalehls::applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc, - int64_t targetII) { +bool scalehls::applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc, + unsigned targetII) { auto targetLoop = band[pipelineLoc]; // All inner loops of the pipelined loop are automatically unrolled. diff --git a/lib/Transforms/Loop/PartialAffineLoopTile.cpp b/lib/Transforms/Loop/PartialAffineLoopTile.cpp index 02d7c5c..b5e5fce 100644 --- a/lib/Transforms/Loop/PartialAffineLoopTile.cpp +++ b/lib/Transforms/Loop/PartialAffineLoopTile.cpp @@ -12,61 +12,57 @@ using namespace mlir; using namespace scalehls; -/// Apply loop tiling to the input loop band and return the location of the -/// original innermost loop in the tiled loop band. If tile is failed, -1 will -/// be returned. -int64_t scalehls::applyLoopTiling(AffineLoopBand &band, TileList tileList) { +/// Apply loop tiling to the input loop band and sink all intra-tile loops to +/// the innermost loop with the original loop order. Return the location of the +/// innermost tile-space loop. +Optional scalehls::applyLoopTiling(AffineLoopBand &band, + TileList tileList) { if (!isPerfectlyNested(band)) - return -1; - - // Collect each loop location that is fully tiled and can be eliminated. - SmallVector fullyTiledLoops; - unsigned pipelineLoc = 0; - unsigned loc = 0; - for (auto loop : band) { - if (auto tripCount = getConstantTripCount(loop)) { - if (tripCount.getValue() == tileList[loc]) - fullyTiledLoops.push_back(loc); - else - pipelineLoc = loc; - } else - return -1; - ++loc; - } - - // If all loops are fully tiled, keep the last loop untouched. - if (fullyTiledLoops.size() == band.size()) { - fullyTiledLoops.pop_back(); - pipelineLoc = band.size() - 1; - } + return Optional(); // Loop tiling. AffineLoopBand tiledBand; if (failed(tilePerfectlyNested(band, tileList, &tiledBand))) - return -1; - band = tiledBand; + return Optional(); - auto builder = OpBuilder(band.back()); + // Record the band size and clear the original loop band. + auto originalBandSize = band.size(); + band.clear(); - // Remove fully tiled loops. - for (auto loc : fullyTiledLoops) { - auto loop = band[loc]; + // Remove redundant loops in the tiled loop band. + auto builder = OpBuilder(tiledBand.back()); + unsigned erasedLoopNum = 0; + unsigned loc = 0; - // Create an affine apply operation generating a constant zero. - builder.setInsertionPoint(loop); - auto constZero = builder.create( - loop.getLoc(), builder.getConstantAffineMap(0), ValueRange({})); - loop.getInductionVar().replaceAllUsesWith(constZero); + for (auto loop : tiledBand) { + if (erasedLoopNum >= originalBandSize - 1 || loc >= originalBandSize || + getConstantTripCount(loop).getValue() > 1) { + // All tile-space loops which have a trip count larger than 1 and all + // intra-tile loops are pushed back. Meanwhile, we are not willing to see + // all tile-space loops removed since in that case many analysis and + // transforms will become very hard. Thereby we record the number of + // erased loop so far and always keep at least one tile-space loop + // remained in the loop band even if it has a trip count of 1. + band.push_back(loop); + } else { + // Create an affine apply operation to represent the lower bound. + builder.setInsertionPoint(loop); + auto newIterVar = builder.create( + loop.getLoc(), loop.getLowerBoundMap(), loop.getLowerBoundOperands()); + loop.getInductionVar().replaceAllUsesWith(newIterVar); - // Move all operation except the terminator to the outside. - auto &parentBlock = loop->getBlock()->getOperations(); - auto &loopBlock = loop.getBody()->getOperations(); - parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(), - std::prev(loopBlock.end())); - loop.erase(); + // Move all operation except the terminator to the outside. + auto &parentBlock = loop->getBlock()->getOperations(); + auto &loopBlock = loop.getBody()->getOperations(); + parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(), + std::prev(loopBlock.end())); + loop.erase(); + ++erasedLoopNum; + } + ++loc; } - return pipelineLoc; + return band.size() - originalBandSize - 1; } namespace { diff --git a/lib/Transforms/MultipleLevelDSE.cpp b/lib/Transforms/MultipleLevelDSE.cpp index ff736a1..2c61d2d 100644 --- a/lib/Transforms/MultipleLevelDSE.cpp +++ b/lib/Transforms/MultipleLevelDSE.cpp @@ -6,9 +6,12 @@ #include "scalehls/Transforms/MultipleLevelDSE.h" #include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Support/FileUtilities.h" #include "scalehls/Transforms/Passes.h" #include "scalehls/Transforms/Utils.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ToolOutputFile.h" +#include #include #define DEBUG_TYPE "scalehls" @@ -16,26 +19,12 @@ using namespace mlir; using namespace scalehls; -//===----------------------------------------------------------------------===// -// Helper methods -//===----------------------------------------------------------------------===// - -static int64_t getInnerParallelism(AffineForOp forOp) { - int64_t count = 0; - for (auto loop : forOp.getOps()) { - auto innerCount = getInnerParallelism(loop); - if (auto trip = getConstantTripCount(loop)) - count += trip.getValue() * innerCount; - else - count += innerCount; - } - - // If the current loop is innermost loop, count should be one. - return std::max(count, (int64_t)1); -} - using TileConfig = unsigned; +//===----------------------------------------------------------------------===// +// Helper Methods and Classes +//===----------------------------------------------------------------------===// + namespace { struct DesignPoint { public: @@ -122,10 +111,47 @@ public: return sqrtf(distanceSquare); } - void findParetoFrontiers() {} + /// Update paretoPoints to remove design points that are not pareto frontiers. + void updateParetoPoints() { + // Sort the pareto points with in an ascending order of latency and the an + // ascending order of dsp number. + auto latencyThenDspNum = [&](const DesignPoint &a, const DesignPoint &b) { + return (a.latency < b.latency || + (a.latency == b.latency && a.dspNum < b.dspNum)); + }; + std::sort(paretoPoints.begin(), paretoPoints.end(), latencyThenDspNum); + + // Find pareto frontiers. After the sorting, the first design point must be + // a pareto point. + auto paretoPoint = paretoPoints[0]; + auto paretoLatency = paretoPoint.latency; + auto paretoDspNum = paretoPoint.dspNum; + SmallVector frontiers; + + for (auto point : paretoPoints) { + auto tmpLatency = point.latency; + auto tmpDspNum = point.dspNum; + + if (tmpDspNum < paretoDspNum) { + frontiers.push_back(point); + + paretoPoint = point; + paretoLatency = tmpLatency; + paretoDspNum = tmpDspNum; + + } else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency) + frontiers.push_back(point); + } + + paretoPoints = frontiers; + } + + /// Evaluate all design points under the given tile config. + bool evaluateTileConfig(TileConfig config) { + // If the current tile config is already estimated, return true. + if (!unestimatedTileConfigs.count(config)) + return true; - /// Explore all design points under the given tile config. - bool exploreTileConfig(TileConfig config) { // Clone a temporary loop band by cloning the outermost loop. auto tmpOuterLoop = band.front().clone(); AffineLoopBand tmpBand; @@ -149,7 +175,7 @@ public: // Apply the current tiling config and start the estimation. Note that after // optimization, tmpBand is optimized in place and becomes a new loop band. - if (!applyOptStrategy(tmpBand, func, tileList, 1)) + if (!applyOptStrategy(tmpBand, func, tileList, (unsigned)1)) return false; tmpOuterLoop = tmpBand.front(); estimator.estimateLoop(tmpOuterLoop); @@ -173,17 +199,63 @@ public: paretoPoints.push_back(point); } - // Erase the temporary loop band and return. + // Erase the temporary loop band and annotate the current tile config as + // estimated. tmpOuterLoop.erase(); + unestimatedTileConfigs.erase(config); return true; } + /// Initialize the design space. void initializeDesignSpace(unsigned maxInitializeParallel) { + LLVM_DEBUG(llvm::dbgs() << "3.1 Initializing the design space...\n";); + for (TileConfig config = 0; config < validTileConfigNum; ++config) { - unestimatedTileConfigs.erase(config); + auto tileList = getTileList(config); + + // We only evaluate the design points whose overall parallelism is smaller + // than the maxInitializeParallel to improve the efficiency. + auto parallel = std::accumulate(tileList.begin(), tileList.end(), + (unsigned)1, std::multiplies()); + if (parallel > maxInitializeParallel) + continue; + + LLVM_DEBUG(llvm::dbgs() << config << ","); + evaluateTileConfig(config); } + + LLVM_DEBUG(llvm::dbgs() << "\n\n"); + updateParetoPoints(); } + /// Dump pareto and non-pareto points which have been evaluated in the design + /// space to a csv output file. + void dumpDesignSpace(raw_ostream &os) { + // Print header row. + for (unsigned i = 0; i < tripCountList.size(); ++i) + os << "l" << i << ","; + os << "ii,cycle,dsp,type\n"; + + // Print pareto design points. + for (auto point : paretoPoints) { + for (auto size : getTileList(point.tileConfig)) + os << size << ","; + os << point.targetII << "," << point.latency << "," << point.dspNum + << ",pareto\n"; + } + + // Print all design points. + for (auto point : allPoints) { + for (auto size : getTileList(point.tileConfig)) + os << size << ","; + os << point.targetII << "," << point.latency << "," << point.dspNum + << ",non-pareto\n"; + } + + LLVM_DEBUG(llvm::dbgs() << "Design space is dumped to output file.\n\n"); + } + + /// Stores current pareto frontiers and all evaluated design points. SmallVector paretoPoints; SmallVector allPoints; @@ -208,25 +280,37 @@ public: }; } // namespace +static int64_t getInnerParallelism(AffineForOp forOp) { + int64_t count = 0; + for (auto loop : forOp.getOps()) { + auto innerCount = getInnerParallelism(loop); + if (auto trip = getConstantTripCount(loop)) + count += trip.getValue() * innerCount; + else + count += innerCount; + } + + // If the current loop is innermost loop, count should be one. + return std::max(count, (int64_t)1); +} + //===----------------------------------------------------------------------===// // Optimizer Class Definition //===----------------------------------------------------------------------===// -void ScaleHLSOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) { - LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency"); - auto dsp = getIntAttrValue(targetFunc, "dsp"); - - llvm::dbgs() << message << "\n"; - llvm::dbgs() << "Current latency is " << Twine(latency) - << ", DSP utilization is " << Twine(dsp) << ".\n\n";); -} - /// This is a temporary approach that does not scale. -void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) { +void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func, raw_ostream &os, + unsigned maxInitializeParallel) { estimator.estimateFunc(func); if (getIntAttrValue(func, "dsp") > numDSP) return; - emitDebugInfo(func, "Start multiple level design space exploration."); + + LLVM_DEBUG(auto latency = getIntAttrValue(func, "latency"); + auto dspNum = getIntAttrValue(func, "dsp"); + + llvm::dbgs() << "\nStart the design space exploration.\n"; + llvm::dbgs() << "Initial clock cycle is " << Twine(latency) + << ", DSP usage is " << Twine(dspNum) << ".\n\n";); //===--------------------------------------------------------------------===// // STAGE 1: Simplify Loop Nests Structure @@ -331,7 +415,7 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) { estimator.estimateFunc(func); if (getIntAttrValue(func, "dsp") > numDSP) return; - emitDebugInfo(func, "1. Simplify loop nests structure."); + LLVM_DEBUG(llvm::dbgs() << "1. Simplify loop nests structure.\n\n"); //===--------------------------------------------------------------------===// // STAGE 2: Loop Bands Optimization @@ -357,21 +441,19 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) { estimator.estimateFunc(func); if (getIntAttrValue(func, "dsp") > numDSP) return; - emitDebugInfo(func, "2. Apply loop perfection, remove variable bound, and " - "loop order opt."); + LLVM_DEBUG(llvm::dbgs() << "2. Apply loop perfection, loop order opt, and " + "remove variable loop bound.\n\n"); //===--------------------------------------------------------------------===// - // STAGE 3: Loop Bands Tiling and Finalization + // STAGE 3: Search for pareto frontiers //===--------------------------------------------------------------------===// + + LLVM_DEBUG(llvm::dbgs() << "3. Search for pareto design points...\n\n";); + for (unsigned i = 0; i < targetNum; ++i) { auto space = DesignSpace(func, targetBands[i], estimator); - space.exploreTileConfig(1); - - for (auto point : space.paretoPoints) { - llvm::outs() << "latency: " << point.latency - << ", dsp_num: " << point.dspNum - << ", ii: " << point.targetII << "\n"; - } + space.initializeDesignSpace(maxInitializeParallel); + space.dumpDesignSpace(os); } } @@ -392,6 +474,12 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase { getLatencyMap(spec, latencyMap); int64_t numDSP = ceil(spec.GetInteger("specification", "dsp", 220) * 1.1); + // Parse output file. + std::string errorMessage; + auto output = mlir::openOutputFile(outputFile, &errorMessage); + if (!output) + emitError(module.getLoc(), errorMessage); + // Initialize an performance and resource estimator. auto estimator = ScaleHLSEstimator(builder, latencyMap); auto optimizer = ScaleHLSOptimizer(builder, estimator, numDSP); @@ -400,7 +488,9 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase { for (auto func : module.getOps()) if (auto topFunction = func->getAttrOfType("top_function")) if (topFunction.getValue()) - optimizer.applyMultipleLevelDSE(func); + optimizer.applyMultipleLevelDSE(func, output->os(), maxParallel); + + output->keep(); } }; } // namespace diff --git a/lib/Transforms/ProfileDesignSpace.cpp b/lib/Transforms/ProfileDesignSpace.cpp deleted file mode 100644 index 870e65b..0000000 --- a/lib/Transforms/ProfileDesignSpace.cpp +++ /dev/null @@ -1,201 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Copyright 2020-2021 The ScaleHLS Authors. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Analysis/LoopAnalysis.h" -#include "mlir/Support/FileUtilities.h" -#include "scalehls/Analysis/QoREstimation.h" -#include "scalehls/Transforms/Passes.h" -#include "scalehls/Transforms/Utils.h" -#include "llvm/Support/ToolOutputFile.h" - -#define DEBUG_TYPE "scalehls" - -using namespace mlir; -using namespace scalehls; - -/// Currently only support single loop band profiling. -static void applyProfiling(FuncOp func, raw_ostream &os, - ScaleHLSEstimator &estimator, unsigned maxParallel) { - if (!dyn_cast(func.front().front())) { - func.emitError("first operation is not loop"); - return; - } - - // Helper function for fetching the target loop band. - auto getFirstBand = [&](FuncOp targetFunc) { - // Get the first loop band as target. - auto target = dyn_cast(targetFunc.front().front()); - AffineLoopBand band; - getLoopBandFromOutermost(target, band); - return band; - }; - - // Perfect and optimize loop order of the target loop band. - auto band = getFirstBand(func); - auto loopNum = band.size(); - applyAffineLoopPerfection(band); - applyAffineLoopOrderOpt(band); - applyRemoveVariableBound(band); - - // Initialize tile size and trip count vector. - auto tileList = TileList(loopNum, 1); - auto tripCounts = TileList(); - unsigned iterations = 1; - for (unsigned loc = 0; loc < loopNum; ++loc) { - auto tripCount = getConstantTripCount(band[loc]).getValue(); - tripCounts.push_back(tripCount); - iterations *= (log2(tripCount) + 1); - os << "l" << loc << ","; - } - os << "ii,cycle,dsp,pareto\n"; - - // Storing all design points. - using DesignPoint = SmallVector; - std::vector designPoints; - - // Traverse each tile size configuration. - for (unsigned i = 0; i < iterations - 1; ++i) { - for (unsigned loc = 0; loc < loopNum; ++loc) { - auto &tileSize = tileList[loc]; - if (loc == 0) - tileSize *= 2; - else if (tileList[loc - 1] > tripCounts[loc - 1]) - tileSize *= 2; - } - - unsigned iterNum = 1; - unsigned parallel = 1; - for (unsigned loc = 0; loc < loopNum; ++loc) { - auto &tileSize = tileList[loc]; - if (tileSize > tripCounts[loc]) - tileSize = 1; - iterNum *= tripCounts[loc] / tileSize; - parallel *= tileSize; - LLVM_DEBUG(llvm::dbgs() << tileSize << ", ";); - } - LLVM_DEBUG(llvm::dbgs() << "\n";); - - if (parallel > maxParallel) - continue; - - // Apply tiling strategy. - auto tmpFunc = func.clone(); - applyOptStrategy(tmpFunc, tileList, 1); - estimator.estimateFunc(tmpFunc); - auto tmpLoop = getFirstBand(tmpFunc).back(); - - // Fetch latency and resource utilization. - auto II = estimator.getIntAttrValue(tmpLoop, "ii"); - auto iterLatency = estimator.getIntAttrValue(tmpLoop, "iter_latency"); - auto shareDspNum = estimator.getIntAttrValue(tmpLoop, "share_dsp"); - auto noShareDspNum = estimator.getIntAttrValue(tmpLoop, "noshare_dsp"); - - // Improve target II until II is equal to iteration latency. - for (auto tmpII = II; tmpII <= iterLatency; ++tmpII) { - auto tmpDspNum = std::max(shareDspNum, noShareDspNum / tmpII); - auto tmpLatency = iterLatency + tmpII * (iterNum - 1); - - auto point = SmallVector(tileList.begin(), tileList.end()); - point.append({tmpII, tmpLatency, tmpDspNum}); - designPoints.push_back(point); - - if (iterNum == 1) - break; - } - } - - // Sort all design points by latency. - auto compareLatency = [&](const DesignPoint &a, const DesignPoint &b) { - return a[loopNum + 1] < b[loopNum + 1]; - }; - std::sort(designPoints.begin(), designPoints.end(), compareLatency); - - // Sort all design points with the same latency by dsp number. - auto compareDspNum = [&](const DesignPoint &a, const DesignPoint &b) { - return a[loopNum + 2] < b[loopNum + 2]; - }; - for (auto i = designPoints.begin(); i < designPoints.end();) { - auto j = i; - for (; j < designPoints.end(); ++j) - if ((*i)[loopNum + 1] != (*j)[loopNum + 1]) - break; - std::sort(i, j, compareDspNum); - i = j; - } - - // Find pareto frontiers. After the sorting, the first design point must be a - // pareto point. - auto paretoPoint = designPoints[0]; - auto paretoLatency = paretoPoint[loopNum + 1]; - auto paretoDspNum = paretoPoint[loopNum + 2]; - std::vector paretoPoints; - - for (auto point : designPoints) { - auto tmpLatency = point[loopNum + 1]; - auto tmpDspNum = point[loopNum + 2]; - - if (tmpDspNum < paretoDspNum) { - paretoPoints.push_back(point); - paretoPoint = point; - paretoLatency = tmpLatency; - paretoDspNum = tmpDspNum; - } else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency) - paretoPoints.push_back(point); - } - - // Print all pareto design points. - for (auto point : paretoPoints) { - for (auto element : point) - os << element << ","; - os << "pareto\n"; - } - - // Print all design points. - for (auto point : designPoints) { - for (auto element : point) - os << element << ","; - os << "non-pareto\n"; - } -} - -namespace { -struct ProfileDesignSpace : public ProfileDesignSpaceBase { - void runOnOperation() override { - auto module = getOperation(); - auto builder = Builder(module); - - // Read configuration file. - INIReader spec(targetSpec); - if (spec.ParseError()) - emitError(module.getLoc(), "target spec file parse fail\n"); - - // Collect profiling latency data, where default values are based on Xilinx - // PYNQ-Z1 board. - LatencyMap latencyMap; - getLatencyMap(spec, latencyMap); - - // Initialize an performance and resource estimator. - auto estimator = ScaleHLSEstimator(builder, latencyMap); - - // Optimize the top function. - for (auto func : module.getOps()) - if (auto topFunction = func->getAttrOfType("top_function")) - if (topFunction.getValue()) { - std::string errorMessage; - auto output = mlir::openOutputFile(outputFile, &errorMessage); - if (!output) - emitError(module.getLoc(), errorMessage); - - applyProfiling(func, output->os(), estimator, maxParallel); - output->keep(); - } - } -}; -} // namespace - -std::unique_ptr scalehls::createProfileDesignSpacePass() { - return std::make_unique(); -} diff --git a/lib/Transforms/Utils.cpp b/lib/Transforms/Utils.cpp index b8bbd3a..17493ca 100644 --- a/lib/Transforms/Utils.cpp +++ b/lib/Transforms/Utils.cpp @@ -33,18 +33,18 @@ static void addPassPipeline(PassManager &pm) { /// passed in because the post-tiling optimizations have to take function as /// target, e.g. canonicalizer and array partition. bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func, - TileList tileList, int64_t targetII) { + TileList tileList, unsigned targetII) { // By design the input function must be the ancestor of the input loop band. if (!func->isProperAncestor(band.front())) return false; // Apply loop tiling. auto pipelineLoc = applyLoopTiling(band, tileList); - if (pipelineLoc == -1) + if (!pipelineLoc) return false; // Apply loop pipelining. - if (!applyLoopPipelining(band, pipelineLoc, targetII)) + if (!applyLoopPipelining(band, pipelineLoc.getValue(), targetII)) return false; // Apply general optimizations and array partition. @@ -59,17 +59,17 @@ bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func, /// Apply optimization strategy to a function. bool scalehls::applyOptStrategy(FuncOp func, ArrayRef tileLists, - ArrayRef targetIIs) { + ArrayRef targetIIs) { AffineLoopBands bands; getLoopBands(func.front(), bands); // Apply loop tiling and pipelining to all loop bands. for (unsigned i = 0, e = bands.size(); i < e; ++i) { auto pipelineLoc = applyLoopTiling(bands[i], tileLists[i]); - if (pipelineLoc == -1) + if (!pipelineLoc) return false; - if (!applyLoopPipelining(bands[i], pipelineLoc, targetIIs[i])) + if (!applyLoopPipelining(bands[i], pipelineLoc.getValue(), targetIIs[i])) return false; } diff --git a/test/Analysis/profile_design_space.mlir b/test/Analysis/profile_design_space.mlir deleted file mode 100644 index 5a957c4..0000000 --- a/test/Analysis/profile_design_space.mlir +++ /dev/null @@ -1,6 +0,0 @@ -// RUN: scalehls-opt -profile-design-space="target-spec=../../config/target-spec.ini max-parallel=32" %s | FileCheck %s - -// CHECK-LABEL: func @test -func @test() { - return -}