[MultipleLevelDSE] support more fine-grained tiling strategy searching
This commit is contained in:
parent
7641af0c04
commit
9cd80630e0
|
@ -22,7 +22,7 @@ class HLSCppEstimator
|
|||
: public HLSCppVisitorBase<HLSCppEstimator, bool, int64_t>,
|
||||
public HLSCppAnalysisBase {
|
||||
public:
|
||||
explicit HLSCppEstimator(FuncOp &func, LatencyMap &latencyMap)
|
||||
explicit HLSCppEstimator(FuncOp func, LatencyMap &latencyMap)
|
||||
: HLSCppAnalysisBase(OpBuilder(func)), func(func),
|
||||
latencyMap(latencyMap) {
|
||||
getFuncDependencies();
|
||||
|
@ -133,7 +133,7 @@ public:
|
|||
void reverseSchedule();
|
||||
void estimateFunc();
|
||||
|
||||
FuncOp &func;
|
||||
FuncOp func;
|
||||
DependsMap dependsMap;
|
||||
PortsMapDict portsMapDict;
|
||||
LatencyMap &latencyMap;
|
||||
|
|
|
@ -33,23 +33,13 @@ static int64_t getInnerParallelism(AffineForOp forOp) {
|
|||
return std::max(count, (int64_t)1);
|
||||
}
|
||||
|
||||
/// Clean up all attributes annotated for scheduling in the function for the
|
||||
/// convenience of other transforms.
|
||||
// static void cleanScheduleAttributes(FuncOp func) {
|
||||
// func.walk([&](Operation *op) {
|
||||
// op->removeAttr("schedule_begin");
|
||||
// op->removeAttr("schedule_end");
|
||||
// op->removeAttr("partition_index");
|
||||
// });
|
||||
// }
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Optimizer Class Declaration
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class HLSCppOptimizer : public HLSCppAnalysisBase {
|
||||
public:
|
||||
explicit HLSCppOptimizer(FuncOp &func, LatencyMap &latencyMap, int64_t numDSP)
|
||||
explicit HLSCppOptimizer(FuncOp func, LatencyMap &latencyMap, int64_t numDSP)
|
||||
: HLSCppAnalysisBase(OpBuilder(func)), func(func), latencyMap(latencyMap),
|
||||
numDSP(numDSP) {
|
||||
// TODO: only insert affine-related patterns.
|
||||
|
@ -61,14 +51,16 @@ public:
|
|||
|
||||
using TileSizes = SmallVector<unsigned, 8>;
|
||||
|
||||
void emitDebugInfo(FuncOp &targetFunc, StringRef message);
|
||||
void applyLoopTilingStrategy(FuncOp &targetFunc,
|
||||
void emitDebugInfo(FuncOp targetFunc, StringRef message);
|
||||
void applyLoopTilingStrategy(FuncOp targetFunc,
|
||||
ArrayRef<TileSizes> tileSizesList);
|
||||
void updateTileSizesAtHead(TileSizes &tileSizes, const TileSizes &tripCounts,
|
||||
unsigned &head);
|
||||
|
||||
/// This is a temporary approach that does not scale.
|
||||
void applyMultipleLevelDSE();
|
||||
|
||||
FuncOp &func;
|
||||
FuncOp func;
|
||||
LatencyMap &latencyMap;
|
||||
int64_t numDSP;
|
||||
FrozenRewritePatternList patterns;
|
||||
|
@ -78,7 +70,7 @@ public:
|
|||
// Optimizer Class Definition
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
|
||||
void HLSCppOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
|
||||
LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
|
||||
auto dsp = getIntAttrValue(targetFunc, "dsp");
|
||||
|
||||
|
@ -88,7 +80,7 @@ void HLSCppOptimizer::emitDebugInfo(FuncOp &targetFunc, StringRef message) {
|
|||
}
|
||||
|
||||
void HLSCppOptimizer::applyLoopTilingStrategy(
|
||||
FuncOp &targetFunc, ArrayRef<TileSizes> tileSizesList) {
|
||||
FuncOp targetFunc, ArrayRef<TileSizes> tileSizesList) {
|
||||
AffineLoopBands targetBands;
|
||||
getLoopBands(targetFunc.front(), targetBands);
|
||||
|
||||
|
@ -99,10 +91,8 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
|
|||
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
||||
|
||||
// Apply loop pipelining.
|
||||
for (auto band : targetBands) {
|
||||
auto pipelineLoop = band[band.size() / 2 - 1];
|
||||
applyLoopPipelining(pipelineLoop, builder);
|
||||
}
|
||||
for (auto &band : targetBands)
|
||||
applyLoopPipelining(band[band.size() / 2 - 1], builder);
|
||||
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
||||
|
||||
// Apply general optimizations and array partition.
|
||||
|
@ -113,11 +103,45 @@ void HLSCppOptimizer::applyLoopTilingStrategy(
|
|||
applyPatternsAndFoldGreedily(targetFunc, patterns);
|
||||
|
||||
// Estimate performance and resource utilization.
|
||||
LLVM_DEBUG(llvm::dbgs() << "Current tiling strategy:\n"; idx = 0;
|
||||
for (auto tileSizes
|
||||
: tileSizesList) {
|
||||
llvm::dbgs() << "Loop band " << Twine(idx++) << ":";
|
||||
for (auto size : tileSizes) {
|
||||
llvm::dbgs() << " " << Twine(size);
|
||||
}
|
||||
llvm::dbgs() << "\n";
|
||||
});
|
||||
HLSCppEstimator(targetFunc, latencyMap).estimateFunc();
|
||||
emitDebugInfo(targetFunc, "Apply loop tiling and pipelining, general "
|
||||
"optimizations, and array partition.");
|
||||
}
|
||||
|
||||
/// Update tile sizes by a factor of 2 at the head location.
|
||||
void HLSCppOptimizer::updateTileSizesAtHead(TileSizes &tileSizes,
|
||||
const TileSizes &tripCounts,
|
||||
unsigned &head) {
|
||||
assert(tileSizes.size() == tripCounts.size() &&
|
||||
"unexpected input tile sizes");
|
||||
|
||||
for (unsigned e = tileSizes.size(); head < e; ++head) {
|
||||
auto size = tileSizes[head];
|
||||
auto tripCount = tripCounts[head];
|
||||
|
||||
// At this stage, size must be 1 or a number which is divisible
|
||||
// by tripCount. We need to find the update factor now.
|
||||
if (size < tripCount) {
|
||||
unsigned factor = 2;
|
||||
while (tripCount % (size * factor) != 0)
|
||||
factor++;
|
||||
|
||||
size *= factor;
|
||||
tileSizes[head] = size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a temporary approach that does not scale.
|
||||
void HLSCppOptimizer::applyMultipleLevelDSE() {
|
||||
HLSCppEstimator(func, latencyMap).estimateFunc();
|
||||
|
@ -280,60 +304,103 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
|
|||
//===--------------------------------------------------------------------===//
|
||||
|
||||
// Holding trip counts of all loops in each loop band.
|
||||
std::vector<TileSizes> targetTileSizesList;
|
||||
std::vector<TileSizes> tripCountsList;
|
||||
// Holding the current tiling sizes of each loop band.
|
||||
std::vector<TileSizes> currentTileSizesList;
|
||||
std::vector<TileSizes> tileSizesList;
|
||||
// Holding the current loop tiling location in each loop band.
|
||||
SmallVector<unsigned, 8> headLocationList;
|
||||
SmallVector<unsigned, 8> headLocList;
|
||||
|
||||
// Initialize all design vectors.
|
||||
for (auto band : targetBands) {
|
||||
TileSizes targetSizes;
|
||||
TileSizes baseSizes;
|
||||
TileSizes tripCounts;
|
||||
TileSizes sizes;
|
||||
for (auto loop : band) {
|
||||
targetSizes.push_back(getIntAttrValue(loop, "trip_count"));
|
||||
baseSizes.push_back(1);
|
||||
tripCounts.push_back(getIntAttrValue(loop, "trip_count"));
|
||||
sizes.push_back(1);
|
||||
}
|
||||
targetTileSizesList.push_back(targetSizes);
|
||||
currentTileSizesList.push_back(baseSizes);
|
||||
headLocationList.push_back(0);
|
||||
tripCountsList.push_back(tripCounts);
|
||||
tileSizesList.push_back(sizes);
|
||||
headLocList.push_back(0);
|
||||
}
|
||||
|
||||
// For recording the minimum latency and best tiling strategy.
|
||||
unsigned minLatency = getIntAttrValue(func, "latency");
|
||||
std::vector<TileSizes> bestTileSizesList;
|
||||
LLVM_DEBUG(llvm::dbgs() << "3. Search for the best tiling strategy.\n";);
|
||||
applyLoopTilingStrategy(func, tileSizesList);
|
||||
|
||||
// TODO: more fined grained and comprehensive dse.
|
||||
unsigned tolerantCount = 0;
|
||||
unsigned minLatency = getIntAttrValue(func, "latency");
|
||||
unsigned targetNum = targetBands.size();
|
||||
while (true) {
|
||||
// Clone the current function and apply the current tiling strategy.
|
||||
auto tmpFunc = func.clone();
|
||||
applyLoopTilingStrategy(tmpFunc, currentTileSizesList);
|
||||
// If there're more than one loop bands in the function, we'll first try to
|
||||
// update the tiling size of ALL target loop bands with a factor of 2. This
|
||||
// is for reducing the DSE complexity.
|
||||
if (targetNum > 1) {
|
||||
std::vector<TileSizes> newTileSizesList = tileSizesList;
|
||||
SmallVector<unsigned, 8> newHeadLocList = headLocList;
|
||||
|
||||
// If the resource constaints are not met or the latency is not increased,
|
||||
// increase the tolerant counter by 1.
|
||||
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||
if (getIntAttrValue(tmpFunc, "dsp") <= numDSP) {
|
||||
if (latency < minLatency) {
|
||||
for (unsigned i = 0; i < targetNum; ++i)
|
||||
updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i],
|
||||
newHeadLocList[i]);
|
||||
|
||||
auto tmpFunc = func.clone();
|
||||
applyLoopTilingStrategy(tmpFunc, newTileSizesList);
|
||||
|
||||
// If the resource constaints are not met or the latency is not increased,
|
||||
// we try more fine grained strategy. Otherwise, we accept the new tile
|
||||
// strategy and head location, and enter the next iteration. We set a
|
||||
// threshold 0.95 here to avoid glitches.
|
||||
// TODO: fine tune the exit condition.
|
||||
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||
auto dsp = getIntAttrValue(tmpFunc, "dsp");
|
||||
|
||||
if (dsp <= numDSP && latency < minLatency * 0.95) {
|
||||
tileSizesList = newTileSizesList;
|
||||
headLocList = newHeadLocList;
|
||||
minLatency = latency;
|
||||
bestTileSizesList = currentTileSizesList;
|
||||
tolerantCount = 0;
|
||||
} else
|
||||
tolerantCount++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// If the tolerant counter is larger than a threshold, we'll stop to
|
||||
// increase the tiling size.
|
||||
if (tolerantCount > 1)
|
||||
break;
|
||||
// else
|
||||
// currentTileSize *= 2;
|
||||
} else
|
||||
// Walk through all loop bands in the function and update tiling strategy
|
||||
// one by one.
|
||||
bool hasUpdated = false;
|
||||
for (unsigned i = 0; i < targetNum; ++i) {
|
||||
// TODO: This is not efficient. As our estimation can be conducted in a
|
||||
// more structural way, we should only focus on the current loop rather
|
||||
// than the whole function. But for now this makes sense because we are
|
||||
// only focusing on computation kernel level algorithms that typcially
|
||||
// only have handy loop bands.
|
||||
for (unsigned head = headLocList[i], e = tileSizesList[i].size();
|
||||
head < e; ++head) {
|
||||
// Only update the tiling strategy and head location of the current
|
||||
// loop band.
|
||||
std::vector<TileSizes> newTileSizesList = tileSizesList;
|
||||
updateTileSizesAtHead(newTileSizesList[i], tripCountsList[i], head);
|
||||
|
||||
auto tmpFunc = func.clone();
|
||||
applyLoopTilingStrategy(tmpFunc, newTileSizesList);
|
||||
|
||||
auto latency = getIntAttrValue(tmpFunc, "latency");
|
||||
auto dsp = getIntAttrValue(tmpFunc, "dsp");
|
||||
|
||||
if (dsp <= numDSP && latency < minLatency * 0.95) {
|
||||
tileSizesList = newTileSizesList;
|
||||
headLocList[i] = head;
|
||||
minLatency = latency;
|
||||
|
||||
hasUpdated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If no loop band is updated, break the searching.
|
||||
if (!hasUpdated)
|
||||
break;
|
||||
}
|
||||
|
||||
// Finally, apply the best tiling strategy.
|
||||
LLVM_DEBUG(llvm::dbgs() << "Found the best tiling strategy.\n";);
|
||||
applyLoopTilingStrategy(func, bestTileSizesList);
|
||||
// Finally, we found the best tiling strategy.
|
||||
LLVM_DEBUG(llvm::dbgs() << "4. Apply the best tiling strategy.\n";);
|
||||
applyLoopTilingStrategy(func, tileSizesList);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
|
|
@ -56,7 +56,7 @@ struct PartialAffineLoopTile
|
|||
for (auto *op : func.getContext()->getRegisteredOperations())
|
||||
op->getCanonicalizationPatterns(patterns, func.getContext());
|
||||
|
||||
applyPatternsAndFoldGreedily(func.getRegion(), std::move(patterns));
|
||||
applyPatternsAndFoldGreedily(func, std::move(patterns));
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
|
Loading…
Reference in New Issue