[MultipleLevelDSE] support more fine-grained tiling strategy searching

This commit is contained in:
Hanchen Ye 2021-01-23 23:25:13 -06:00
parent 7641af0c04
commit 9cd80630e0
3 changed files with 126 additions and 59 deletions

View File

@ -22,7 +22,7 @@ class HLSCppEstimator
: public HLSCppVisitorBase<HLSCppEstimator, bool, int64_t>, : public HLSCppVisitorBase<HLSCppEstimator, bool, int64_t>,
public HLSCppAnalysisBase { public HLSCppAnalysisBase {
public: public:
explicit HLSCppEstimator(FuncOp &func, LatencyMap &latencyMap) explicit HLSCppEstimator(FuncOp func, LatencyMap &latencyMap)
: HLSCppAnalysisBase(OpBuilder(func)), func(func), : HLSCppAnalysisBase(OpBuilder(func)), func(func),
latencyMap(latencyMap) { latencyMap(latencyMap) {
getFuncDependencies(); getFuncDependencies();
@ -133,7 +133,7 @@ public:
void reverseSchedule(); void reverseSchedule();
void estimateFunc(); void estimateFunc();
FuncOp &func; FuncOp func;
DependsMap dependsMap; DependsMap dependsMap;
PortsMapDict portsMapDict; PortsMapDict portsMapDict;
LatencyMap &latencyMap; LatencyMap &latencyMap;

View File

@ -33,23 +33,13 @@ static int64_t getInnerParallelism(AffineForOp forOp) {
return std::max(count, (int64_t)1); return std::max(count, (int64_t)1);
} }
/// Clean up all attributes annotated for scheduling in the function for the
/// convenience of other transforms.
// static void cleanScheduleAttributes(FuncOp func) {
// func.walk([&](Operation *op) {
// op->removeAttr("schedule_begin");
// op->removeAttr("schedule_end");
// op->removeAttr("partition_index");
// });
// }
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// Optimizer Class Declaration // Optimizer Class Declaration
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
class HLSCppOptimizer : public HLSCppAnalysisBase { class HLSCppOptimizer : public HLSCppAnalysisBase {
public: public:
explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP) explicit HLSCppOptimizer(FuncOp func, LatencyMap &latencyMap, int64_t numDSP)
: HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap), : HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
numDSP(numDSP) { numDSP(numDSP) {
// TODO: only insert affine-related patterns. // TODO: only insert affine-related patterns.
@ -61,14 +51,16 @@ public:
using TileSizes = SmallVector<unsigned, 8>; using TileSizes = SmallVector<unsigned, 8>;
void emitDebugInfo(FuncOp &targetFunc, StringRef message); void emitDebugInfo(FuncOp targetFunc, StringRef message);
void applyLoopTilingStrategy(FuncOp &targetFunc, void applyLoopTilingStrategy(FuncOp targetFunc,
ArrayRef<TileSizes> tileSizesList); ArrayRef<TileSizes> tileSizesList);
void updateTileSizesAtHead(TileSizes &tileSizes, const TileSizes &tripCounts,
unsigned &head);
/// This is a temporary approach that does not scale. /// This is a temporary approach that does not scale.
void applyMultipleLevelDSE(); void applyMultipleLevelDSE();
FuncOp &func; FuncOp func;
LatencyMap &latencyMap; LatencyMap &latencyMap;
int64_t numDSP; int64_t numDSP;
FrozenRewritePatternList patterns; FrozenRewritePatternList patterns;
@ -78,7 +70,7 @@ public:
// Optimizer Class Definition // Optimizer Class Definition
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) { void HLSCppOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency"); LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
auto dsp = getIntAttrValue(targetFunc, "dsp"); auto dsp = getIntAttrValue(targetFunc, "dsp");
@ -88,7 +80,7 @@ void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
} }
void HLSCppOptimizer::applyLoopTilingStrategy( void HLSCppOptimizer::applyLoopTilingStrategy(
FuncOp &targetFunc, ArrayRef<TileSizes> tileSizesList) { FuncOp targetFunc, ArrayRef<TileSizes> tileSizesList) {
AffineLoopBands targetBands; AffineLoopBands targetBands;
getLoopBands(targetFunc.front(), targetBands); getLoopBands(targetFunc.front(), targetBands);
@ -99,10 +91,8 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
applyPatternsAndFoldGreedily(targetFunc, patterns); applyPatternsAndFoldGreedily(targetFunc, patterns);
// Apply loop pipelining. // Apply loop pipelining.
for (auto band : targetBands) { for (auto &band : targetBands)
auto pipelineLoop = band[band.size() / 2 - 1]; applyLoopPipelining(band[band.size() / 2 - 1], builder);
applyLoopPipelining(pipelineLoop, builder);
}
applyPatternsAndFoldGreedily(targetFunc, patterns); applyPatternsAndFoldGreedily(targetFunc, patterns);
// Apply general optimizations and array partition. // Apply general optimizations and array partition.
@ -113,11 +103,45 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
applyPatternsAndFoldGreedily(targetFunc, patterns); applyPatternsAndFoldGreedily(targetFunc, patterns);
// Estimate performance and resource utilization. // Estimate performance and resource utilization.
LLVM_DEBUG(llvm::dbgs() << "Current tiling strategy:\n"; idx = 0;
for (auto tileSizes
: tileSizesList) {
llvm::dbgs() << "Loop band " << Twine(idx++) << ":";
for (auto size : tileSizes) {
llvm::dbgs() << " " << Twine(size);
}
llvm::dbgs() << "\n";
});
HLSCppEstimator(targetFunc, latencyMap).estimateFunc(); HLSCppEstimator(targetFunc, latencyMap).estimateFunc();
emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general " emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general "
"optimizations, and array partition."); "optimizations, and array partition.");
} }
/// Grow the tile size of the first growable loop at or after `head` to its
/// next larger divisor of the loop's trip count. `head` is advanced to the
/// position that was actually updated (or one past the end when every size
/// has already reached its trip count), so the search resumes there next
/// time. Note the growth factor is the smallest workable multiplier >= 2,
/// which may exceed 2 for non-power-of-two trip counts.
void HLSCppOptimizer::updateTileSizesAtHead(TileSizes &tileSizes,
                                            const TileSizes &tripCounts,
                                            unsigned &head) {
  assert(tileSizes.size() == tripCounts.size() &&
         "unexpected input tile sizes");

  while (head < tileSizes.size()) {
    unsigned curSize = tileSizes[head];
    unsigned curTripCount = tripCounts[head];

    // A fully tiled loop cannot grow further; move on to the next one.
    if (curSize >= curTripCount) {
      ++head;
      continue;
    }

    // Sizes start at 1 and are only ever multiplied by divisor-preserving
    // factors, so `curSize` divides `curTripCount` here. Find the smallest
    // multiplier (at least 2) that keeps the enlarged size a divisor of the
    // trip count; the search terminates because curTripCount / curSize works.
    unsigned multiplier = 2;
    while (curTripCount % (curSize * multiplier) != 0)
      ++multiplier;

    tileSizes[head] = curSize * multiplier;
    return;
  }
}
/// This is a temporary approach that does not scale. /// This is a temporary approach that does not scale.
void HLSCppOptimizer::applyMultipleLevelDSE() { void HLSCppOptimizer::applyMultipleLevelDSE() {
HLSCppEstimator(func, latencyMap).estimateFunc(); HLSCppEstimator(func, latencyMap).estimateFunc();
@ -280,60 +304,103 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
//===--------------------------------------------------------------------===// //===--------------------------------------------------------------------===//
// Holding trip counts of all loops in each loop band. // Holding trip counts of all loops in each loop band.
std::vector<TileSizes> targetTileSizesList; std::vector<TileSizes> tripCountsList;
// Holding the current tiling sizes of each loop band. // Holding the current tiling sizes of each loop band.
std::vector<TileSizes> currentTileSizesList; std::vector<TileSizes> tileSizesList;
// Holding the current loop tiling location in each loop band. // Holding the current loop tiling location in each loop band.
SmallVector<unsigned, 8> headLocationList; SmallVector<unsigned, 8> headLocList;
// Initialize all design vectors. // Initialize all design vectors.
for (auto band : targetBands) { for (auto band : targetBands) {
TileSizes targetSizes; TileSizes tripCounts;
TileSizes baseSizes; TileSizes sizes;
for (auto loop : band) { for (auto loop : band) {
targetSizes.push_back(getIntAttrValue(loop, "trip_count")); tripCounts.push_back(getIntAttrValue(loop, "trip_count"));
baseSizes.push_back(1); sizes.push_back(1);
} }
targetTileSizesList.push_back(targetSizes); tripCountsList.push_back(tripCounts);
currentTileSizesList.push_back(baseSizes); tileSizesList.push_back(sizes);
headLocationList.push_back(0); headLocList.push_back(0);
} }
// For recording the minimum latency and best tiling strategy. LLVM_DEBUG(llvm::dbgs() << "3. Search for the best tiling strategy.\n";);
unsigned minLatency = getIntAttrValue(func, "latency"); applyLoopTilingStrategy(func, tileSizesList);
std::vector<TileSizes> bestTileSizesList;
// TODO: more fine-grained and comprehensive DSE. // TODO: more fine-grained and comprehensive DSE.
unsigned tolerantCount = 0; unsigned minLatency = getIntAttrValue(func, "latency");
unsigned targetNum = targetBands.size();
while (true) { while (true) {
// Clone the current function and apply the current tiling strategy. // If there're more than one loop bands in the function, we'll first try to
auto tmpFunc = func.clone(); // update the tiling size of ALL target loop bands with a factor of 2. This
applyLoopTilingStrategy(tmpFunc, currentTileSizesList); // is for reducing the DSE complexity.
if (targetNum > 1) {
std::vector<TileSizes> newTileSizesList = tileSizesList;
SmallVector<unsigned, 8> newHeadLocList = headLocList;
// If the resource constraints are not met or the latency is not increased, for (unsigned i = 0; i < targetNum; ++i)
// increase the tolerant counter by 1. updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i],
auto latency = getIntAttrValue(tmpFunc, "latency"); newHeadLocList[i]);
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) {
if (latency < minLatency) { auto tmpFunc = func.clone();
applyLoopTilingStrategy(tmpFunc, newTileSizesList);
// If the resource constraints are not met or the latency is not increased,
// we try a more fine-grained strategy. Otherwise, we accept the new tile
// strategy and head location, and enter the next iteration. We set a
// threshold 0.95 here to avoid glitches.
// TODO: fine tune the exit condition.
auto latency = getIntAttrValue(tmpFunc, "latency");
auto dsp = getIntAttrValue(tmpFunc, "dsp");
if (dsp <= numDSP && latency < minLatency * 0.95) {
tileSizesList = newTileSizesList;
headLocList = newHeadLocList;
minLatency = latency; minLatency = latency;
bestTileSizesList = currentTileSizesList; continue;
tolerantCount = 0; }
} else }
tolerantCount++;
// If the tolerant counter is larger than a threshold, we'll stop to // Walk through all loop bands in the function and update tiling strategy
// increase the tiling size. // one by one.
if (tolerantCount > 1) bool hasUpdated = false;
break; for (unsigned i = 0; i < targetNum; ++i) {
// else // TODO: This is not efficient. As our estimation can be conducted in a
// currentTileSize *= 2; // more structural way, we should only focus on the current loop rather
} else // than the whole function. But for now this makes sense because we are
// only focusing on computation kernel-level algorithms that typically
// only have handy loop bands.
for (unsigned head = headLocList[i], e = tileSizesList[i].size();
head < e; ++head) {
// Only update the tiling strategy and head location of the current
// loop band.
std::vector<TileSizes> newTileSizesList = tileSizesList;
updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i], head);
auto tmpFunc = func.clone();
applyLoopTilingStrategy(tmpFunc, newTileSizesList);
auto latency = getIntAttrValue(tmpFunc, "latency");
auto dsp = getIntAttrValue(tmpFunc, "dsp");
if (dsp <= numDSP && latency < minLatency * 0.95) {
tileSizesList = newTileSizesList;
headLocList[i] = head;
minLatency = latency;
hasUpdated = true;
break;
}
}
}
// If no loop band is updated, break the searching.
if (!hasUpdated)
break; break;
} }
// Finally, apply the best tiling strategy. // Finally, we found the best tiling strategy.
LLVM_DEBUG(llvm::dbgs() << "Found the best tiling strategy.\n";); LLVM_DEBUG(llvm::dbgs() << "4. Apply the best tiling strategy.\n";);
applyLoopTilingStrategy(func, bestTileSizesList); applyLoopTilingStrategy(func, tileSizesList);
} }
namespace { namespace {

View File

@ -56,7 +56,7 @@ struct PartialAffineLoopTile
for (auto *op : func.getContext()->getRegisteredOperations()) for (auto *op : func.getContext()->getRegisteredOperations())
op->getCanonicalizationPatterns(patterns, func.getContext()); op->getCanonicalizationPatterns(patterns, func.getContext());
applyPatternsAndFoldGreedily(func.getRegion(), std::move(patterns)); applyPatternsAndFoldGreedily(func, std::move(patterns));
} }
}; };
} // namespace } // namespace