[MultipleLevelDSE] remove greedy dse algorithm; [ProfileDesignSpace] move this pass to Transforms folder due to dependence issue; update all CMakeLists
This commit is contained in:
parent
2beeff1cd5
commit
d7a3456fc2
|
@ -18,7 +18,6 @@ namespace mlir {
|
|||
namespace scalehls {
|
||||
|
||||
std::unique_ptr<Pass> createQoREstimationPass();
|
||||
std::unique_ptr<Pass> createProfileDesignSpacePass();
|
||||
|
||||
void registerAnalysisPasses();
|
||||
|
||||
|
|
|
@ -27,25 +27,4 @@ def QoREstimation : Pass<"qor-estimation", "ModuleOp"> {
|
|||
];
|
||||
}
|
||||
|
||||
def ProfileDesignSpace : Pass<"profile-design-space", "ModuleOp"> {
|
||||
let summary = "Optimize HLS design at multiple abstraction level";
|
||||
let description = [{
|
||||
This profile-design-space pass will profile the partial design space and
|
||||
output clock cycle and resource utilization estimation results.
|
||||
}];
|
||||
|
||||
let constructor = "mlir::scalehls::createProfileDesignSpacePass()";
|
||||
|
||||
let options = [
|
||||
Option<"targetSpec", "target-spec", "std::string",
|
||||
/*default=*/"\"../config/target-spec.ini\"",
|
||||
"File path: target backend specifications and configurations">,
|
||||
Option<"profileFile", "profile-file", "std::string",
|
||||
/*default=*/"\"-\"", "File path: the output file path of profiling">,
|
||||
Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"1",
|
||||
"Positive number: the maximum tiling parallelism of the profiling">
|
||||
];
|
||||
}
|
||||
|
||||
|
||||
#endif // SCALEHLS_ANALYSIS_PASSES_TD
|
||||
|
|
|
@ -7,9 +7,7 @@
|
|||
#ifndef SCALEHLS_TRANSFORMS_MULTIPLELEVELDSE_H
|
||||
#define SCALEHLS_TRANSFORMS_MULTIPLELEVELDSE_H
|
||||
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "scalehls/Analysis/QoREstimation.h"
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
|
||||
namespace mlir {
|
||||
namespace scalehls {
|
||||
|
@ -22,55 +20,15 @@ class ScaleHLSOptimizer : public ScaleHLSAnalysisBase {
|
|||
public:
|
||||
explicit ScaleHLSOptimizer(Builder &builder, ScaleHLSEstimator &estimator,
|
||||
int64_t numDSP)
|
||||
: ScaleHLSAnalysisBase(builder), estimator(estimator), numDSP(numDSP) {
|
||||
// TODO: only insert affine-related patterns.
|
||||
OwningRewritePatternList owningPatterns;
|
||||
for (auto *op : builder.getContext()->getRegisteredOperations())
|
||||
op->getCanonicalizationPatterns(owningPatterns, builder.getContext());
|
||||
patterns = std::move(owningPatterns);
|
||||
}
|
||||
|
||||
enum LoopState { HOT = 0, COLD = 1, FROZEN = 2 };
|
||||
using BandState = SmallVector<LoopState, 8>;
|
||||
|
||||
bool loopBandIsFrozen(BandState bandState) {
|
||||
for (auto loopState : bandState)
|
||||
if (loopState != LoopState::FROZEN)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool loopBandIsColdOrFrozen(BandState bandState) {
|
||||
for (auto loopState : bandState)
|
||||
if (loopState == LoopState::HOT)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool loopBandIsOneHot(BandState bandState) {
|
||||
unsigned hotNum = 0;
|
||||
for (auto loopState : bandState)
|
||||
if (loopState == LoopState::HOT)
|
||||
hotNum++;
|
||||
|
||||
if (hotNum == 1)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
: ScaleHLSAnalysisBase(builder), estimator(estimator), numDSP(numDSP) {}
|
||||
|
||||
void emitDebugInfo(FuncOp targetFunc, StringRef message);
|
||||
void emitTilingInfo(FuncOp targetFunc, ArrayRef<TileSizes> tileSizesList);
|
||||
|
||||
bool incrTileSizeAtLoc(TileSizes &tileSizes, TileSizes &tripCounts,
|
||||
unsigned &loc);
|
||||
|
||||
/// This is a temporary approach that does not scale.
|
||||
void applyMultipleLevelDSE(FuncOp func);
|
||||
|
||||
ScaleHLSEstimator &estimator;
|
||||
int64_t numDSP;
|
||||
FrozenRewritePatternList patterns;
|
||||
};
|
||||
|
||||
} // namespace scalehls
|
||||
|
|
|
@ -19,6 +19,7 @@ namespace scalehls {
|
|||
|
||||
/// Design space exploration pass.
|
||||
std::unique_ptr<Pass> createMultipleLevelDSEPass();
|
||||
std::unique_ptr<Pass> createProfileDesignSpacePass();
|
||||
|
||||
/// Dataflow optimization passes.
|
||||
std::unique_ptr<Pass> createLegalizeDataflowPass();
|
||||
|
|
|
@ -32,6 +32,26 @@ def MultipleLevelDSE : Pass<"multiple-level-dse", "ModuleOp"> {
|
|||
];
|
||||
}
|
||||
|
||||
def ProfileDesignSpace : Pass<"profile-design-space", "ModuleOp"> {
|
||||
let summary = "Optimize HLS design at multiple abstraction level";
|
||||
let description = [{
|
||||
This profile-design-space pass will profile the partial design space and
|
||||
output clock cycle and resource utilization estimation results.
|
||||
}];
|
||||
|
||||
let constructor = "mlir::scalehls::createProfileDesignSpacePass()";
|
||||
|
||||
let options = [
|
||||
Option<"targetSpec", "target-spec", "std::string",
|
||||
/*default=*/"\"../config/target-spec.ini\"",
|
||||
"File path: target backend specifications and configurations">,
|
||||
Option<"outputFile", "output-file", "std::string",
|
||||
/*default=*/"\"-\"", "File path: the output file path of profiling">,
|
||||
Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"1",
|
||||
"Positive number: the maximum tiling parallelism of the profiling">
|
||||
];
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Dataflow Optimization Passes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -5,8 +5,4 @@ add_mlir_library(MLIRScaleHLSAnalysis
|
|||
|
||||
DEPENDS
|
||||
MLIRScaleHLSAnalysisIncGen
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRHLSCpp
|
||||
MLIRHLSKernel
|
||||
)
|
||||
|
|
|
@ -5,8 +5,4 @@ add_mlir_library(MLIRScaleHLSConversion
|
|||
|
||||
DEPENDS
|
||||
MLIRScaleHLSConversionIncGen
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRHLSCpp
|
||||
MLIRHLSKernel
|
||||
)
|
||||
|
|
|
@ -5,8 +5,4 @@ add_mlir_library(MLIRScaleHLSTransforms
|
|||
|
||||
DEPENDS
|
||||
MLIRScaleHLSTransformsIncGen
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRHLSCpp
|
||||
MLIRHLSKernel
|
||||
)
|
||||
|
|
|
@ -47,48 +47,8 @@ void ScaleHLSOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
|
|||
<< ", DSP utilization is " << Twine(dsp) << ".\n\n";);
|
||||
}
|
||||
|
||||
void ScaleHLSOptimizer::emitTilingInfo(FuncOp targetFunc,
|
||||
ArrayRef<TileSizes> tileSizesList) {
|
||||
// Estimate performance and resource utilization.
|
||||
estimator.estimateFunc(targetFunc);
|
||||
LLVM_DEBUG(llvm::dbgs() << "Current tiling strategy:\n";
|
||||
for (unsigned idx = 0; idx < tileSizesList.size(); ++idx) {
|
||||
auto tileSizes = tileSizesList[idx];
|
||||
llvm::dbgs() << "Loop band " << Twine(idx) << ":";
|
||||
|
||||
for (auto size : tileSizes)
|
||||
llvm::dbgs() << " " << Twine(size);
|
||||
llvm::dbgs() << "\n";
|
||||
});
|
||||
|
||||
emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, generic IR "
|
||||
"opts, and array partition.");
|
||||
}
|
||||
|
||||
bool ScaleHLSOptimizer::incrTileSizeAtLoc(TileSizes &tileSizes,
|
||||
TileSizes &tripCounts,
|
||||
unsigned &loc) {
|
||||
auto size = tileSizes[loc];
|
||||
auto tripCount = tripCounts[loc];
|
||||
|
||||
if (size >= tripCount || tripCount % size != 0)
|
||||
return false;
|
||||
|
||||
// Find the minimum factor that can be applied.
|
||||
unsigned factor = 2;
|
||||
while (tripCount % (size * factor) != 0)
|
||||
factor++;
|
||||
|
||||
// Increase and update tile size.
|
||||
size *= factor;
|
||||
tileSizes[loc] = size;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// This is a temporary approach that does not scale.
|
||||
void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
|
||||
// Canonicalize the function and start the dse.
|
||||
applyPatternsAndFoldGreedily(func, patterns);
|
||||
estimator.estimateFunc(func);
|
||||
if (getIntAttrValue(func, "dsp") > numDSP)
|
||||
return;
|
||||
|
@ -229,187 +189,6 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
|
|||
//===--------------------------------------------------------------------===//
|
||||
// STAGE 3: Loop Bands Tiling and Finalization
|
||||
//===--------------------------------------------------------------------===//
|
||||
|
||||
// Hold trip counts of all loops in each loop band, this can also be
|
||||
// considered as maxTileSizesList.
|
||||
std::vector<TileSizes> tripCountsList;
|
||||
// Hold the loop number in each loop band.
|
||||
SmallVector<unsigned, 8> loopNumList;
|
||||
|
||||
// Hold the current tiling sizes of each loop band. This is the main design
|
||||
// vector which will evolve in the procedure of DSE.
|
||||
std::vector<TileSizes> tileSizesList;
|
||||
std::vector<int64_t> targetIIList;
|
||||
// Hold the DSE status of all loops in each loop band.
|
||||
std::vector<BandState> BandStateList;
|
||||
|
||||
// Initialize all lists.
|
||||
for (auto band : targetBands) {
|
||||
TileSizes tripCounts;
|
||||
for (auto loop : band)
|
||||
tripCounts.push_back(getIntAttrValue(loop, "trip_count"));
|
||||
|
||||
// These two lists will not be modified in the DSE.
|
||||
tripCountsList.push_back(tripCounts);
|
||||
loopNumList.push_back(band.size());
|
||||
|
||||
// These two lists will evolve in the DSE.
|
||||
tileSizesList.push_back(TileSizes(band.size(), 1));
|
||||
targetIIList.push_back(1);
|
||||
BandStateList.push_back(BandState(band.size(), LoopState::COLD));
|
||||
}
|
||||
|
||||
// Try and record the non-tiling performance.
|
||||
auto nonTileFunc = func.clone();
|
||||
applyOptStrategy(nonTileFunc, tileSizesList, targetIIList);
|
||||
emitTilingInfo(func, tileSizesList);
|
||||
unsigned minLatency = getIntAttrValue(nonTileFunc, "latency");
|
||||
|
||||
if (getIntAttrValue(nonTileFunc, "dsp") > numDSP)
|
||||
return;
|
||||
nonTileFunc.erase();
|
||||
LLVM_DEBUG(llvm::dbgs() << "3. Search for the best tiling strategy.\n";);
|
||||
|
||||
// Main loop for design space exploration.
|
||||
unsigned iteration = 0;
|
||||
while (true) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Iteration " << iteration++ << ":\n\n";);
|
||||
bool isAllFrozen = true;
|
||||
// Walk through each target loop band.
|
||||
for (unsigned i = 0; i < targetNum; ++i) {
|
||||
auto &bandState = BandStateList[i];
|
||||
|
||||
// Update state of the current loop band.
|
||||
for (unsigned loc = 0; loc < loopNumList[i]; ++loc)
|
||||
if (tileSizesList[i][loc] >= tripCountsList[i][loc])
|
||||
bandState[loc] = LoopState::FROZEN;
|
||||
|
||||
// If all loops in the current loop band are frozen, continue and visit
|
||||
// next loop band.
|
||||
if (loopBandIsFrozen(bandState))
|
||||
continue;
|
||||
isAllFrozen = false;
|
||||
|
||||
// If all loops in the current loop band are cold or frozen, walk through
|
||||
// all loop levels and heat the best one to hot state.
|
||||
if (loopBandIsColdOrFrozen(bandState)) {
|
||||
unsigned bestLoc = 0;
|
||||
unsigned bestLatency = UINT_MAX;
|
||||
|
||||
for (unsigned loc = 0; loc < loopNumList[i]; ++loc) {
|
||||
if (bandState[loc] == LoopState::FROZEN)
|
||||
continue;
|
||||
|
||||
// Increase the tile size of current location.
|
||||
auto tmpTileSizesList = tileSizesList;
|
||||
if (incrTileSizeAtLoc(tmpTileSizesList[i], tripCountsList[i], loc)) {
|
||||
// Try to apply the new tile size.
|
||||
auto tmpFunc = func.clone();
|
||||
if (applyOptStrategy(tmpFunc, tmpTileSizesList, targetIIList)) {
|
||||
emitTilingInfo(tmpFunc, tmpTileSizesList);
|
||||
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||
auto dsp = getIntAttrValue(tmpFunc, "dsp");
|
||||
|
||||
if (dsp < numDSP && latency < bestLatency * 0.95) {
|
||||
bestLoc = loc;
|
||||
bestLatency = latency;
|
||||
}
|
||||
// Move to the next location.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// If the current loop cannot be further tiled, set it as frozen.
|
||||
bandState[loc] = LoopState::FROZEN;
|
||||
}
|
||||
|
||||
if (bestLatency != UINT_MAX) {
|
||||
// Heat the best loop location. If the best latency is already better
|
||||
// than the minimum found latency, apply it. Otherwise, only heat the
|
||||
// location.
|
||||
bandState[bestLoc] = LoopState::HOT;
|
||||
if (bestLatency < minLatency * 0.95) {
|
||||
incrTileSizeAtLoc(tileSizesList[i], tripCountsList[i], bestLoc);
|
||||
minLatency = bestLatency;
|
||||
}
|
||||
} else {
|
||||
// If cannot find a proper tiling strategy for the current loop band,
|
||||
// freeze all loops.
|
||||
for (unsigned loc = 0; loc < loopNumList[i]; ++loc)
|
||||
bandState[loc] = LoopState::FROZEN;
|
||||
}
|
||||
// Move to the next DSE iteration.
|
||||
continue;
|
||||
}
|
||||
|
||||
// For now, there should be only one loop location in the HOT state.
|
||||
if (loopBandIsOneHot(bandState)) {
|
||||
unsigned hotLoc = 0;
|
||||
for (unsigned loc = 0; loc < loopNumList[i]; ++loc)
|
||||
if (bandState[loc] == LoopState::HOT)
|
||||
hotLoc = loc;
|
||||
|
||||
unsigned lastLatency = minLatency;
|
||||
unsigned tolerantCounter = 0;
|
||||
|
||||
// Increase the tile size of current location until the latency is
|
||||
// improved or tile size cannot be further increased.
|
||||
auto tmpTileSizesList = tileSizesList;
|
||||
while (true) {
|
||||
// If the latency has not been improved for more than a certain
|
||||
// number of iterations, stop increasing the tile size.
|
||||
if (tolerantCounter > 1) {
|
||||
bandState[hotLoc] = LoopState::FROZEN;
|
||||
break;
|
||||
}
|
||||
|
||||
// Try to increase the tile size.
|
||||
if (incrTileSizeAtLoc(tmpTileSizesList[i], tripCountsList[i],
|
||||
hotLoc)) {
|
||||
// Try to apply the new tile size.
|
||||
auto tmpFunc = func.clone();
|
||||
if (applyOptStrategy(tmpFunc, tmpTileSizesList, targetIIList)) {
|
||||
emitTilingInfo(tmpFunc, tmpTileSizesList);
|
||||
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||
auto dsp = getIntAttrValue(tmpFunc, "dsp");
|
||||
|
||||
if (dsp < numDSP && latency < minLatency * 0.95) {
|
||||
// If a new minimum latency is found, apply it.
|
||||
tileSizesList = tmpTileSizesList;
|
||||
minLatency = latency;
|
||||
break;
|
||||
} else if (dsp < numDSP && latency < lastLatency * 0.95) {
|
||||
// If the latency is better than the last iteration, even if it
|
||||
// is not the minimum latency, continue to try on the hot loop
|
||||
// location.
|
||||
lastLatency = latency;
|
||||
tolerantCounter = 0;
|
||||
continue;
|
||||
} else {
|
||||
// If the latency is worse than the last iteration, increase the
|
||||
// tolerant counter by 1 and continue with the next tile size.
|
||||
lastLatency = latency;
|
||||
tolerantCounter++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the hot location cannot contribute to the improvement of
|
||||
// latency, set it as frozen.
|
||||
bandState[hotLoc] = LoopState::FROZEN;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (isAllFrozen)
|
||||
break;
|
||||
}
|
||||
|
||||
// Finally, we found the best tiling strategy.
|
||||
LLVM_DEBUG(llvm::dbgs() << "4. Apply the best tiling strategy.\n";);
|
||||
applyOptStrategy(func, tileSizesList, targetIIList);
|
||||
emitTilingInfo(func, tileSizesList);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
|
||||
#include "mlir/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Support/FileUtilities.h"
|
||||
#include "scalehls/Analysis/Passes.h"
|
||||
#include "scalehls/Analysis/QoREstimation.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
|
||||
|
@ -185,7 +185,7 @@ struct ProfileDesignSpace : public ProfileDesignSpaceBase<ProfileDesignSpace> {
|
|||
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
||||
if (topFunction.getValue()) {
|
||||
std::string errorMessage;
|
||||
auto output = mlir::openOutputFile(profileFile, &errorMessage);
|
||||
auto output = mlir::openOutputFile(outputFile, &errorMessage);
|
||||
if (!output)
|
||||
emitError(module.getLoc(), errorMessage);
|
||||
|
|
@ -1,12 +1,14 @@
|
|||
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
|
||||
|
||||
set(LIBS
|
||||
add_llvm_tool(benchmark-gen
|
||||
benchmark-gen.cpp
|
||||
)
|
||||
|
||||
llvm_update_compile_flags(benchmark-gen)
|
||||
|
||||
target_link_libraries(benchmark-gen
|
||||
PRIVATE
|
||||
${dialect_libs}
|
||||
|
||||
MLIRHLSKernel
|
||||
)
|
||||
|
||||
add_llvm_executable(benchmark-gen benchmark-gen.cpp)
|
||||
|
||||
llvm_update_compile_flags(benchmark-gen)
|
||||
target_link_libraries(benchmark-gen PRIVATE ${LIBS})
|
||||
|
|
|
@ -1,10 +1,16 @@
|
|||
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
|
||||
get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
|
||||
|
||||
set(LIBS
|
||||
add_llvm_tool(scalehls-opt
|
||||
scalehls-opt.cpp
|
||||
)
|
||||
|
||||
llvm_update_compile_flags(scalehls-opt)
|
||||
|
||||
target_link_libraries(scalehls-opt
|
||||
PRIVATE
|
||||
${dialect_libs}
|
||||
${conversion_libs}
|
||||
Threads::Threads
|
||||
MLIROptLib
|
||||
|
||||
MLIRHLSCpp
|
||||
|
@ -12,9 +18,6 @@ set(LIBS
|
|||
MLIRScaleHLSConversion
|
||||
MLIRScaleHLSTransforms
|
||||
MLIRScaleHLSAnalysis
|
||||
|
||||
Threads::Threads
|
||||
)
|
||||
|
||||
add_llvm_executable(scalehls-opt scalehls-opt.cpp)
|
||||
|
||||
llvm_update_compile_flags(scalehls-opt)
|
||||
target_link_libraries(scalehls-opt PRIVATE ${LIBS})
|
||||
|
|
|
@ -1,21 +1,21 @@
|
|||
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
|
||||
get_property(translation_libs GLOBAL PROPERTY MLIR_TRANSLATION_LIBS)
|
||||
|
||||
set(LLVM_LINK_COMPONENTS
|
||||
Support
|
||||
)
|
||||
|
||||
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
|
||||
get_property(translation_libs GLOBAL PROPERTY MLIR_TRANSLATION_LIBS)
|
||||
add_llvm_tool(scalehls-translate
|
||||
scalehls-translate.cpp
|
||||
)
|
||||
|
||||
set(LIBS
|
||||
llvm_update_compile_flags(scalehls-translate)
|
||||
|
||||
target_link_libraries(scalehls-translate
|
||||
PRIVATE
|
||||
${dialect_libs}
|
||||
${translation_libs}
|
||||
|
||||
MLIRScaleHLSEmitHLSCpp
|
||||
MLIRScaleHLSAnalysis
|
||||
)
|
||||
|
||||
add_llvm_executable(scalehls-translate scalehls-translate.cpp)
|
||||
|
||||
llvm_update_compile_flags(scalehls-translate)
|
||||
target_link_libraries(scalehls-translate PRIVATE ${LIBS})
|
||||
|
||||
mlir_check_link_libraries(scalehls-translate)
|
||||
|
|
Loading…
Reference in New Issue