[QoREstimation] Basic BRAM estimation with array partitioning

This commit is contained in:
Paul Jeong 2021-01-11 03:03:45 -06:00 committed by Hanchen Ye
parent e47205e779
commit 5b8b38b732
3 changed files with 69 additions and 17 deletions

View File

@ -48,9 +48,27 @@ public:
// For storing the DSP resource utilization indexed by the schedule level.
using ResourceMap = DenseMap<int64_t, int64_t>;
/// For storing all resource types.
struct Resource {
int64_t bram;
int64_t dsp;
int64_t ff;
int64_t lut;
Resource(int64_t bram = 0, int64_t dsp = 0, int64_t ff = 0, int64_t lut = 0)
: bram(bram), dsp(dsp), ff(ff), lut(lut) {}
};
/// Collect all dependencies detected in the function.
void getFuncDependencies();
void setResourceValue(Operation *op, Resource resource) {
setAttrValue(op, "bram", resource.bram);
setAttrValue(op, "dsp", resource.dsp);
setAttrValue(op, "ff", resource.ff);
setAttrValue(op, "lut", resource.lut);
}
void setScheduleValue(Operation *op, int64_t begin, int64_t end) {
setAttrValue(op, "schedule_begin", begin);
setAttrValue(op, "schedule_end", end);
@ -107,7 +125,7 @@ public:
/// Block scheduler and estimator.
int64_t getResourceMap(Block &block, ResourceMap &addFMap,
ResourceMap &mulFMap);
int64_t estimateResource(Block &block, int64_t interval = -1);
Resource estimateResource(Block &block, int64_t interval = -1);
Optional<std::pair<int64_t, int64_t>> estimateBlock(Block &block,
int64_t begin);
void reverseSchedule();

View File

@ -507,7 +507,7 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
setScheduleValue(op, begin, begin + latency + 2);
// Estimate the loop block resource utilization.
setAttrValue(op, "dsp", estimateResource(loopBlock, II));
setResourceValue(op, estimateResource(loopBlock, II));
return true;
}
@ -548,7 +548,7 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
setAttrValue(op, "latency", latency);
setScheduleValue(op, begin, begin + latency + 2);
setAttrValue(op, "dsp", estimateResource(loopBlock));
setResourceValue(op, estimateResource(loopBlock));
return true;
}
@ -602,6 +602,7 @@ bool HLSCppEstimator::visitOp(mlir::CallOp op, int64_t begin) {
// Block Scheduler and Estimator
//===----------------------------------------------------------------------===//
// TODO: rename this function.
int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap,
ResourceMap &mulFMap) {
int64_t loopResource = 0;
@ -632,7 +633,8 @@ int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap,
return loopResource;
}
int64_t HLSCppEstimator::estimateResource(Block &block, int64_t interval) {
HLSCppEstimator::Resource HLSCppEstimator::estimateResource(Block &block,
int64_t interval) {
ResourceMap addFMap;
ResourceMap mulFMap;
auto loopResource = getResourceMap(block, addFMap, mulFMap);
@ -657,17 +659,24 @@ int64_t HLSCppEstimator::estimateResource(Block &block, int64_t interval) {
totalAddF /= (latencyMap["fadd"] + 1);
totalMulF /= (latencyMap["fmul"] + 1);
// We assume the loop resource utilization cannot be shared. Therefore, the
// overall resource utilization is loops' plus other operations'. According to
// profiling, floating-point add and multiply will consume 2 and 3 DSP units,
// respectively.
auto dsp = loopResource + maxAddF * 2 + maxMulF * 3;
// If the block is pipelined (interval is positive), the minimum resource
// utilization is determined by interval. We assume the loop resource
// utilization cannot be shared. Therefore, the overall resource utilization
// is loops' plus other operations'. According to profiling, floating-point
// add and multiply will consume 2 and 3 DSP units, respectively.
// utilization is determined by interval.
if (interval > 0) {
auto minResource = (totalAddF * 2 + totalMulF * 3) / interval;
return loopResource + max(maxAddF * 2 + maxMulF * 3, minResource);
dsp = loopResource + max(maxAddF * 2 + maxMulF * 3, minResource);
}
return loopResource + maxAddF * 2 + maxMulF * 3;
// TODO: estimate BRAM, FF, and LUT utilization.
int64_t bram = 0;
int64_t ff = 0;
int64_t lut = 0;
return Resource(bram, dsp, ff, lut);
}
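The DSP estimate above follows a simple sharing model: DSPs used by nested loops are never shared, each floating-point add occupies 2 DSP slices and each multiply 3, and the unpipelined count is driven by the maximum number of adds and multiplies scheduled in any single cycle. When the block is pipelined, the total operation count divided by the initiation interval gives a lower bound that the estimate cannot fall below. The following is a standalone re-derivation of that formula with made-up operation counts, not the estimator itself.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical numbers: at most 4 fadds and 2 fmuls issue in one cycle,
  // 12 fadds and 6 fmuls run in total, no nested-loop DSPs, and the block
  // is pipelined with an initiation interval of 4.
  int64_t loopResource = 0, maxAddF = 4, maxMulF = 2;
  int64_t totalAddF = 12, totalMulF = 6, interval = 4;

  // Unpipelined estimate: peak concurrent operators at 2 DSPs per fadd and
  // 3 DSPs per fmul.
  int64_t dsp = loopResource + maxAddF * 2 + maxMulF * 3; // 14

  // Pipelined lower bound: total DSP work spread across the interval.
  if (interval > 0) {
    int64_t minResource = (totalAddF * 2 + totalMulF * 3) / interval;          // 10
    dsp = loopResource + std::max(maxAddF * 2 + maxMulF * 3, minResource);     // 14
  }
  std::cout << "estimated DSPs: " << dsp << "\n";
  return 0;
}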
/// Estimate the latency of a block with ALAP scheduling strategy, return the
@ -759,20 +768,21 @@ void HLSCppEstimator::reverseSchedule() {
}
void HLSCppEstimator::estimateFunc() {
// Collect all memory access operations for later use.
MemAccessesMap map;
getMemAccessesMap(func.front(), map);
// Recursively estimate blocks in the function.
if (auto schedule = estimateBlock(func.front(), 0)) {
auto latency = schedule.getValue().second;
setAttrValue(func, "latency", latency);
// TODO: support dataflow interval estimation.
if (getBoolAttrValue(func, "dataflow")) {
// TODO: support dataflow interval estimation.
}
// TODO: support CallOp inside of the function.
if (getBoolAttrValue(func, "pipeline")) {
// Collect all memory access operations for calculating II.
MemAccessesMap map;
getMemAccessesMap(func.front(), map);
// Calculate initial interval.
auto II = max(getResMinII(map), getDepMinII(func, map));
setAttrValue(func, "interval", II);
}
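For a pipelined function the initiation interval is taken as the larger of two lower bounds which, judging from the names getResMinII and getDepMinII, correspond to the usual resource-constrained and dependence-constrained minimum II of modulo scheduling. A toy computation with hypothetical numbers, assuming that interpretation:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical bounds: a single-port memory serving 6 accesses per
  // iteration gives a resource-constrained bound of ceil(6 / 1) = 6; a
  // loop-carried dependence with latency 8 and distance 2 gives a
  // dependence-constrained bound of ceil(8 / 2) = 4.
  int64_t resMinII = (6 + 1 - 1) / 1;
  int64_t depMinII = (8 + 2 - 1) / 2;
  int64_t II = std::max(resMinII, depMinII); // 6
  std::cout << "II = " << II << "\n";
  return 0;
}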
@ -791,7 +801,28 @@ void HLSCppEstimator::estimateFunc() {
// Estimate the resource utilization of the function.
auto interval = getIntAttrValue(func, "interval");
setAttrValue(func, "dsp", estimateResource(func.front(), interval));
auto resource = estimateResource(func.front(), interval);
// Calculate the function memrefs BRAM utilization.
int64_t numBram = 0;
for (auto &pair : map) {
auto memrefType = pair.first.getType().cast<MemRefType>();
auto partitionNum = getPartitionFactors(memrefType);
auto storageType = MemoryKind(memrefType.getMemorySpace());
if (storageType == MemoryKind::BRAM_1P ||
storageType == MemoryKind::BRAM_S2P ||
storageType == MemoryKind::BRAM_T2P) {
// Total size in bits: element type bit width times the number of elements.
// TODO: handle index types.
int64_t memrefSize =
memrefType.getElementTypeBitWidth() * memrefType.getNumElements();
numBram += ((memrefSize + 18000 - 1) / 18000) * partitionNum;
}
}
resource.bram += numBram;
setResourceValue(func, resource);
// TODO: estimate BRAM and LUT utilization.
}
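The BRAM count for each on-chip memref is obtained by sizing the whole array in bits, rounding up to 18Kb block-RAM units, and multiplying by the partition factor returned by getPartitionFactors, since every partition becomes a separate physical memory. Note that this basic model sizes each partition at the full memref, so it is a conservative upper bound. A worked example with a hypothetical 64x64 f32 array partitioned by an overall factor of 4:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical memref: 64 x 64 x f32, overall partition factor 4,
  // mapped to 18Kb block RAM as in the estimator above.
  int64_t elementBits = 32, numElements = 64 * 64, partitionNum = 4;
  int64_t memrefSize = elementBits * numElements;              // 131072 bits
  int64_t bramPerPartition = (memrefSize + 18000 - 1) / 18000; // ceil -> 8
  int64_t numBram = bramPerPartition * partitionNum;           // 32
  std::cout << "estimated BRAM-18K blocks: " << numBram << "\n";
  return 0;
}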

View File

@ -65,6 +65,9 @@ bool scalehls::applyArrayPartition(FuncOp func, OpBuilder &builder) {
partitions.push_back(PartitionInfo(PartitionKind::NONE, 1));
}
// TODO: the issue is that the same dimension of different memref accesses can
// represent different values. Therefore, the two memref access maps need to
// be merged somehow to keep things correct.
// Find the best partition solution for each dimension of the memref.
for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
// Collect all array access indices of the current dimension.