[MultipleLevelDSE] support more fine-grained tiling strategy searching
This commit is contained in:
parent
7641af0c04
commit
9cd80630e0
|
@ -22,7 +22,7 @@ class HLSCppEstimator
|
||||||
: public HLSCppVisitorBase<HLSCppEstimator, bool, int64_t>,
|
: public HLSCppVisitorBase<HLSCppEstimator, bool, int64_t>,
|
||||||
public HLSCppAnalysisBase {
|
public HLSCppAnalysisBase {
|
||||||
public:
|
public:
|
||||||
explicit HLSCppEstimator(FuncOp &func, LatencyMap &latencyMap)
|
explicit HLSCppEstimator(FuncOp func, LatencyMap &latencyMap)
|
||||||
: HLSCppAnalysisBase(OpBuilder(func)), func(func),
|
: HLSCppAnalysisBase(OpBuilder(func)), func(func),
|
||||||
latencyMap(latencyMap) {
|
latencyMap(latencyMap) {
|
||||||
getFuncDependencies();
|
getFuncDependencies();
|
||||||
|
@ -133,7 +133,7 @@ public:
|
||||||
void reverseSchedule();
|
void reverseSchedule();
|
||||||
void estimateFunc();
|
void estimateFunc();
|
||||||
|
|
||||||
FuncOp &func;
|
FuncOp func;
|
||||||
DependsMap dependsMap;
|
DependsMap dependsMap;
|
||||||
PortsMapDict portsMapDict;
|
PortsMapDict portsMapDict;
|
||||||
LatencyMap &latencyMap;
|
LatencyMap &latencyMap;
|
||||||
|
|
|
@ -33,23 +33,13 @@ static int64_t getInnerParallelism(AffineForOp forOp) {
|
||||||
return std::max(count, (int64_t)1);
|
return std::max(count, (int64_t)1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Clean up all attributes annotated for scheduling in the function for the
|
|
||||||
/// convenience of other transforms.
|
|
||||||
// static void cleanScheduleAttributes(FuncOp func) {
|
|
||||||
// func.walk([&](Operation *op) {
|
|
||||||
// op->removeAttr("schedule_begin");
|
|
||||||
// op->removeAttr("schedule_end");
|
|
||||||
// op->removeAttr("partition_index");
|
|
||||||
// });
|
|
||||||
// }
|
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// Optimizer Class Declaration
|
// Optimizer Class Declaration
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
class HLSCppOptimizer : public HLSCppAnalysisBase {
|
class HLSCppOptimizer : public HLSCppAnalysisBase {
|
||||||
public:
|
public:
|
||||||
explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP)
|
explicit HLSCppOptimizer(FuncOp func, LatencyMap &latencyMap, int64_t numDSP)
|
||||||
: HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
|
: HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
|
||||||
numDSP(numDSP) {
|
numDSP(numDSP) {
|
||||||
// TODO: only insert affine-related patterns.
|
// TODO: only insert affine-related patterns.
|
||||||
|
@ -61,14 +51,16 @@ public:
|
||||||
|
|
||||||
using TileSizes = SmallVector<unsigned, 8>;
|
using TileSizes = SmallVector<unsigned, 8>;
|
||||||
|
|
||||||
void emitDebugInfo(FuncOp &targetFunc, StringRef message);
|
void emitDebugInfo(FuncOp targetFunc, StringRef message);
|
||||||
void applyLoopTilingStrategy(FuncOp &targetFunc,
|
void applyLoopTilingStrategy(FuncOp targetFunc,
|
||||||
ArrayRef<TileSizes> tileSizesList);
|
ArrayRef<TileSizes> tileSizesList);
|
||||||
|
void updateTileSizesAtHead(TileSizes &tileSizes, const TileSizes &tripCounts,
|
||||||
|
unsigned &head);
|
||||||
|
|
||||||
/// This is a temporary approach that does not scale.
|
/// This is a temporary approach that does not scale.
|
||||||
void applyMultipleLevelDSE();
|
void applyMultipleLevelDSE();
|
||||||
|
|
||||||
FuncOp &func;
|
FuncOp func;
|
||||||
LatencyMap &latencyMap;
|
LatencyMap &latencyMap;
|
||||||
int64_t numDSP;
|
int64_t numDSP;
|
||||||
FrozenRewritePatternList patterns;
|
FrozenRewritePatternList patterns;
|
||||||
|
@ -78,7 +70,7 @@ public:
|
||||||
// Optimizer Class Definition
|
// Optimizer Class Definition
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
|
void HLSCppOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
|
||||||
LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
|
LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
|
||||||
auto dsp = getIntAttrValue(targetFunc, "dsp");
|
auto dsp = getIntAttrValue(targetFunc, "dsp");
|
||||||
|
|
||||||
|
@ -88,7 +80,7 @@ void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void HLSCppOptimizer::applyLoopTilingStrategy(
|
void HLSCppOptimizer::applyLoopTilingStrategy(
|
||||||
FuncOp &targetFunc, ArrayRef<TileSizes> tileSizesList) {
|
FuncOp targetFunc, ArrayRef<TileSizes> tileSizesList) {
|
||||||
AffineLoopBands targetBands;
|
AffineLoopBands targetBands;
|
||||||
getLoopBands(targetFunc.front(), targetBands);
|
getLoopBands(targetFunc.front(), targetBands);
|
||||||
|
|
||||||
|
@ -99,10 +91,8 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
|
||||||
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
||||||
|
|
||||||
// Apply loop pipelining.
|
// Apply loop pipelining.
|
||||||
for (auto band : targetBands) {
|
for (auto &band : targetBands)
|
||||||
auto pipelineLoop = band[band.size() / 2 - 1];
|
applyLoopPipelining(band[band.size() / 2 - 1], builder);
|
||||||
applyLoopPipelining(pipelineLoop, builder);
|
|
||||||
}
|
|
||||||
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
||||||
|
|
||||||
// Apply general optimizations and array partition.
|
// Apply general optimizations and array partition.
|
||||||
|
@ -113,11 +103,45 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
|
||||||
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
||||||
|
|
||||||
// Estimate performance and resource utilization.
|
// Estimate performance and resource utilization.
|
||||||
|
LLVM_DEBUG(llvm::dbgs() << "Current tiling strategy:\n"; idx = 0;
|
||||||
|
for (auto tileSizes
|
||||||
|
: tileSizesList) {
|
||||||
|
llvm::dbgs() << "Loop band " << Twine(idx++) << ":";
|
||||||
|
for (auto size : tileSizes) {
|
||||||
|
llvm::dbgs() << " " << Twine(size);
|
||||||
|
}
|
||||||
|
llvm::dbgs() << "\n";
|
||||||
|
});
|
||||||
HLSCppEstimator(targetFunc, latencyMap).estimateFunc();
|
HLSCppEstimator(targetFunc, latencyMap).estimateFunc();
|
||||||
emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general "
|
emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general "
|
||||||
"optimizations, and array partition.");
|
"optimizations, and array partition.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Multiply the tile size at the head location by the smallest factor (>= 2) that still divides the trip count.
|
||||||
|
void HLSCppOptimizer::updateTileSizesAtHead(TileSizes &tileSizes,
|
||||||
|
const TileSizes &tripCounts,
|
||||||
|
unsigned &head) {
|
||||||
|
assert(tileSizes.size() == tripCounts.size() &&
|
||||||
|
"unexpected input tile sizes");
|
||||||
|
|
||||||
|
for (unsigned e = tileSizes.size(); head < e; ++head) {
|
||||||
|
auto size = tileSizes[head];
|
||||||
|
auto tripCount = tripCounts[head];
|
||||||
|
|
||||||
|
// At this stage, size must be 1 or a number that evenly divides
|
||||||
|
// tripCount. We need to find the update factor now.
|
||||||
|
if (size < tripCount) {
|
||||||
|
unsigned factor = 2;
|
||||||
|
while (tripCount % (size * factor) != 0)
|
||||||
|
factor++;
|
||||||
|
|
||||||
|
size *= factor;
|
||||||
|
tileSizes[head] = size;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// This is a temporary approach that does not scale.
|
/// This is a temporary approach that does not scale.
|
||||||
void HLSCppOptimizer::applyMultipleLevelDSE() {
|
void HLSCppOptimizer::applyMultipleLevelDSE() {
|
||||||
HLSCppEstimator(func, latencyMap).estimateFunc();
|
HLSCppEstimator(func, latencyMap).estimateFunc();
|
||||||
|
@ -280,60 +304,103 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
|
||||||
//===--------------------------------------------------------------------===//
|
//===--------------------------------------------------------------------===//
|
||||||
|
|
||||||
// Holding trip counts of all loops in each loop band.
|
// Holding trip counts of all loops in each loop band.
|
||||||
std::vector<TileSizes> targetTileSizesList;
|
std::vector<TileSizes> tripCountsList;
|
||||||
// Holding the current tiling sizes of each loop band.
|
// Holding the current tiling sizes of each loop band.
|
||||||
std::vector<TileSizes> currentTileSizesList;
|
std::vector<TileSizes> tileSizesList;
|
||||||
// Holding the current loop tiling location in each loop band.
|
// Holding the current loop tiling location in each loop band.
|
||||||
SmallVector<unsigned, 8> headLocationList;
|
SmallVector<unsigned, 8> headLocList;
|
||||||
|
|
||||||
// Initialize all design vectors.
|
// Initialize all design vectors.
|
||||||
for (auto band : targetBands) {
|
for (auto band : targetBands) {
|
||||||
TileSizes targetSizes;
|
TileSizes tripCounts;
|
||||||
TileSizes baseSizes;
|
TileSizes sizes;
|
||||||
for (auto loop : band) {
|
for (auto loop : band) {
|
||||||
targetSizes.push_back(getIntAttrValue(loop, "trip_count"));
|
tripCounts.push_back(getIntAttrValue(loop, "trip_count"));
|
||||||
baseSizes.push_back(1);
|
sizes.push_back(1);
|
||||||
}
|
}
|
||||||
targetTileSizesList.push_back(targetSizes);
|
tripCountsList.push_back(tripCounts);
|
||||||
currentTileSizesList.push_back(baseSizes);
|
tileSizesList.push_back(sizes);
|
||||||
headLocationList.push_back(0);
|
headLocList.push_back(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// For recording the minimum latency and best tiling strategy.
|
LLVM_DEBUG(llvm::dbgs() << "3. Search for the best tiling strategy.\n";);
|
||||||
unsigned minLatency = getIntAttrValue(func, "latency");
|
applyLoopTilingStrategy(func, tileSizesList);
|
||||||
std::vector<TileSizes> bestTileSizesList;
|
|
||||||
|
|
||||||
// TODO: more fine-grained and comprehensive DSE.
|
// TODO: more fine-grained and comprehensive DSE.
|
||||||
unsigned tolerantCount = 0;
|
unsigned minLatency = getIntAttrValue(func, "latency");
|
||||||
|
unsigned targetNum = targetBands.size();
|
||||||
while (true) {
|
while (true) {
|
||||||
// Clone the current function and apply the current tiling strategy.
|
// If there is more than one loop band in the function, we'll first try to
|
||||||
auto tmpFunc = func.clone();
|
// update the tiling size of ALL target loop bands with a factor of 2. This
|
||||||
applyLoopTilingStrategy(tmpFunc, currentTileSizesList);
|
// is for reducing the DSE complexity.
|
||||||
|
if (targetNum > 1) {
|
||||||
|
std::vector<TileSizes> newTileSizesList = tileSizesList;
|
||||||
|
SmallVector<unsigned, 8> newHeadLocList = headLocList;
|
||||||
|
|
||||||
// If the resource constraints are not met or the latency is not decreased,
|
for (unsigned i = 0; i < targetNum; ++i)
|
||||||
// increase the tolerant counter by 1.
|
updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i],
|
||||||
auto latency = getIntAttrValue(tmpFunc, "latency");
|
newHeadLocList[i]);
|
||||||
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) {
|
|
||||||
if (latency < minLatency) {
|
auto tmpFunc = func.clone();
|
||||||
|
applyLoopTilingStrategy(tmpFunc, newTileSizesList);
|
||||||
|
|
||||||
|
// If the resource constraints are not met or the latency is not decreased,
|
||||||
|
// we try more fine grained strategy. Otherwise, we accept the new tile
|
||||||
|
// strategy and head location, and enter the next iteration. We set a
|
||||||
|
// threshold 0.95 here to avoid glitches.
|
||||||
|
// TODO: fine tune the exit condition.
|
||||||
|
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||||
|
auto dsp = getIntAttrValue(tmpFunc, "dsp");
|
||||||
|
|
||||||
|
if (dsp <= numDSP && latency < minLatency * 0.95) {
|
||||||
|
tileSizesList = newTileSizesList;
|
||||||
|
headLocList = newHeadLocList;
|
||||||
minLatency = latency;
|
minLatency = latency;
|
||||||
bestTileSizesList = currentTileSizesList;
|
continue;
|
||||||
tolerantCount = 0;
|
}
|
||||||
} else
|
}
|
||||||
tolerantCount++;
|
|
||||||
|
|
||||||
// If the tolerant counter is larger than a threshold, we'll stop to
|
// Walk through all loop bands in the function and update tiling strategy
|
||||||
// increase the tiling size.
|
// one by one.
|
||||||
if (tolerantCount > 1)
|
bool hasUpdated = false;
|
||||||
break;
|
for (unsigned i = 0; i < targetNum; ++i) {
|
||||||
// else
|
// TODO: This is not efficient. As our estimation can be conducted in a
|
||||||
// currentTileSize *= 2;
|
// more structural way, we should only focus on the current loop rather
|
||||||
} else
|
// than the whole function. But for now this makes sense because we are
|
||||||
|
// only focusing on computation kernel level algorithms that typically
|
||||||
|
// only have handy loop bands.
|
||||||
|
for (unsigned head = headLocList[i], e = tileSizesList[i].size();
|
||||||
|
head < e; ++head) {
|
||||||
|
// Only update the tiling strategy and head location of the current
|
||||||
|
// loop band.
|
||||||
|
std::vector<TileSizes> newTileSizesList = tileSizesList;
|
||||||
|
updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i], head);
|
||||||
|
|
||||||
|
auto tmpFunc = func.clone();
|
||||||
|
applyLoopTilingStrategy(tmpFunc, newTileSizesList);
|
||||||
|
|
||||||
|
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||||
|
auto dsp = getIntAttrValue(tmpFunc, "dsp");
|
||||||
|
|
||||||
|
if (dsp <= numDSP && latency < minLatency * 0.95) {
|
||||||
|
tileSizesList = newTileSizesList;
|
||||||
|
headLocList[i] = head;
|
||||||
|
minLatency = latency;
|
||||||
|
|
||||||
|
hasUpdated = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no loop band is updated, break the searching.
|
||||||
|
if (!hasUpdated)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finally, apply the best tiling strategy.
|
// Finally, we found the best tiling strategy.
|
||||||
LLVM_DEBUG(llvm::dbgs() << "Found the best tiling strategy.\n";);
|
LLVM_DEBUG(llvm::dbgs() << "4. Apply the best tiling strategy.\n";);
|
||||||
applyLoopTilingStrategy(func, bestTileSizesList);
|
applyLoopTilingStrategy(func, tileSizesList);
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
|
@ -56,7 +56,7 @@ struct PartialAffineLoopTile
|
||||||
for (auto *op : func.getContext()->getRegisteredOperations())
|
for (auto *op : func.getContext()->getRegisteredOperations())
|
||||||
op->getCanonicalizationPatterns(patterns, func.getContext());
|
op->getCanonicalizationPatterns(patterns, func.getContext());
|
||||||
|
|
||||||
applyPatternsAndFoldGreedily(func.getRegion(), std::move(patterns));
|
applyPatternsAndFoldGreedily(func, std::move(patterns));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
Loading…
Reference in New Issue