[QoREstimation] consider loop/function interval in the DSP resource estimation

This commit is contained in:
Hanchen Ye 2021-01-10 20:48:54 -06:00
parent 9cc5f3abdc
commit 996c9cff8a
3 changed files with 71 additions and 56 deletions

View File

@ -108,7 +108,7 @@ public:
/// Block scheduler and estimator.
int64_t getResourceMap(Block &block, ResourceMap &addFMap,
ResourceMap &mulFMap);
int64_t estimateResource(Block &block);
int64_t estimateResource(Block &block, int64_t interval = -1);
Optional<std::pair<int64_t, int64_t>> estimateBlock(Block &block,
int64_t begin);
void reverseSchedule();

View File

@ -443,6 +443,39 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
setAttrValue(op, "trip_count", (int64_t)1);
}
// If the current loop is annotated as pipelined loop, extra dependency and
// resource aware II analysis will be executed.
if (getBoolAttrValue(op, "pipeline")) {
// Collect load and store operations in the loop block for solving possible
// carried dependencies.
// TODO: include CallOps, how? Maybe we need to somehow analyze the memory
// access behavior of the CallOp.
MemAccessesMap map;
getMemAccessesMap(loopBlock, map);
// Calculate initial interval.
auto II = max(getResMinII(map), getDepMinII(op, map));
// auto II = max({getOpMinII(op), getResMinII(map), getDepMinII(op, map)});
setAttrValue(op, "init_interval", II);
auto tripCount = getIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", tripCount);
// Calculate latency of each iteration.
auto iterLatency = end - begin;
setAttrValue(op, "iter_latency", iterLatency);
auto latency = iterLatency + II * (tripCount - 1);
setAttrValue(op, "latency", latency);
// Entering and leaving a loop will consume extra 2 clock cycles.
setScheduleValue(op, begin, begin + latency + 2);
// Estimate the loop block resource utilization.
setAttrValue(op, "dsp", estimateResource(loopBlock, II));
return true;
}
// If the current loop is annotated as flatten, it will be flattened into the
// child pipelined loop. This will increase the flattened loop trip count
// without changing the iteration latency.
@ -471,50 +504,16 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
return true;
}
// Estimate the loop block resource utilization.
auto resource = estimateResource(loopBlock);
// Calculate latency of each iteration.
// Default case (not flattened or pipelined), calculate latency and resource
// utilization accordingly.
auto iterLatency = end - begin;
setAttrValue(op, "iter_latency", iterLatency);
// If the current loop is annotated as pipelined loop, extra dependency and
// resource aware II analysis will be executed.
if (getBoolAttrValue(op, "pipeline")) {
// Collect load and store operations in the loop block for solving possible
// carried dependencies.
// TODO: include CallOps, how? Maybe we need to somehow analyze the memory
// access behavior of the CallOp.
MemAccessesMap map;
getMemAccessesMap(loopBlock, map);
// Calculate initial interval.
auto II = max(getResMinII(map), getDepMinII(op, map));
// auto II = max({getOpMinII(op), getResMinII(map), getDepMinII(op, map)});
setAttrValue(op, "init_interval", II);
auto tripCount = getIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", tripCount);
auto latency = iterLatency + II * (tripCount - 1);
setAttrValue(op, "latency", latency);
// Entering and leaving a loop will consume extra 2 clock cycles.
setScheduleValue(op, begin, begin + latency + 2);
// TODO: For pipelined loop, we also need to consider the II when estimating
// resource utilization.
setAttrValue(op, "dsp", resource);
return true;
}
// Default case (not flattened or pipelined), calculate latency and resource
// utilization accordingly.
auto latency = iterLatency * getIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
setScheduleValue(op, begin, begin + latency + 2);
setAttrValue(op, "dsp", resource);
setAttrValue(op, "dsp", estimateResource(loopBlock));
return true;
}
@ -598,24 +597,41 @@ int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap,
return loopResource;
}
// Estimate the DSP utilization of `block`: the loop resource returned by
// getResourceMap plus the floating-point add/multiply demand derived from the
// per-schedule-level entries of addFMap and mulFMap. When `interval` is
// positive, an interval-based lower bound on the add/multiply demand is also
// taken into account.
// NOTE(review): this span is a rendered diff; removed and added lines (e.g. the
// two signatures and the old/new `for` headers) appear interleaved without
// +/- markers.
int64_t HLSCppEstimator::estimateResource(Block &block) {
int64_t HLSCppEstimator::estimateResource(Block &block, int64_t interval) {
ResourceMap addFMap;
ResourceMap mulFMap;
auto loopResource = getResourceMap(block, addFMap, mulFMap);
// Find the max resource utilization across all schedule levels, and
// accumulate the per-level counts into the totals.
int64_t maxAddF = 0;
for (auto level : addFMap)
int64_t totalAddF = 0;
for (auto level : addFMap) {
maxAddF = max(maxAddF, level.second);
totalAddF += level.second;
}
int64_t maxMulF = 0;
for (auto level : mulFMap)
int64_t totalMulF = 0;
for (auto level : mulFMap) {
maxMulF = max(maxMulF, level.second);
totalMulF += level.second;
}
// Calculate the total fadd and fmul number as each operation will cover
// {latency + 1} scheduling level.
// NOTE(review): these are integer divisions and truncate toward zero.
totalAddF /= (latencyMap["fadd"] + 1);
totalMulF /= (latencyMap["fmul"] + 1);
// If the block is pipelined (interval is positive), the minimum resource
// utilization is determined by interval. We assume the loop resource
// utilization cannot be shared. Therefore, the overall resource utilization
// is loops' plus other operations'. According to profiling, floating-point
// add and multiply will consume 2 and 3 DSP units, respectively.
if (interval > 0) {
// NOTE(review): integer division truncates, so this lower bound is rounded
// down — confirm whether rounding up is intended.
auto minResource = (totalAddF * 2 + totalMulF * 3) / interval;
return loopResource + max(maxAddF * 2 + maxMulF * 3, minResource);
}
// We assume the loop resource utilization cannot be shared. Therefore, the
// overall resource utilization is loops' plus other operations'. According to
// profiling, floating-point add and multiply will consume 2 and 3 DSP units,
// respectively.
return loopResource + maxAddF * 2 + maxMulF * 3;
}
@ -716,16 +732,14 @@ void HLSCppEstimator::estimateFunc() {
// TODO: support dataflow interval estimation.
// TODO: support CallOp inside of the function.
if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) {
if (attr.getValue()) {
// Collect all memory access operations for calculating II.
MemAccessesMap map;
getMemAccessesMap(func.front(), map);
if (getBoolAttrValue(func, "pipeline")) {
// Collect all memory access operations for calculating II.
MemAccessesMap map;
getMemAccessesMap(func.front(), map);
// Calculate initial interval.
auto II = max(getResMinII(map), getDepMinII(func, map));
setAttrValue(func, "interval", II);
}
// Calculate initial interval.
auto II = max(getResMinII(map), getDepMinII(func, map));
setAttrValue(func, "interval", II);
}
// Scheduled levels of all operations are reversed in this method, because
@ -741,7 +755,8 @@ void HLSCppEstimator::estimateFunc() {
}
// Estimate the resource utilization of the function.
setAttrValue(func, "dsp", estimateResource(func.front()));
auto interval = getIntAttrValue(func, "interval");
setAttrValue(func, "dsp", estimateResource(func.front(), interval));
// TODO: estimate BRAM and LUT utilization.
}

View File

@ -8,7 +8,7 @@ func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16
%0 = affine.load %C[%i, %j] : memref<16x16xf32>
%1 = mulf %beta, %0 : f32
affine.store %1, %C[%i, %j] : memref<16x16xf32>
affine.for %k = 0 to 8 {
affine.for %k = 0 to 16 {
%2 = affine.load %A[%i, %k] : memref<16x16xf32>
%3 = affine.load %A[%j, %k] : memref<16x16xf32>
%4 = affine.load %C[%i, %j] : memref<16x16xf32>