[QoREstimation] consider loop/function interval in the DSP resource estimation

2021-01-10 20:48:54 -06:00 · 2021-01-10 20:48:54 -06:00 · 996c9cff8a
parent 9cc5f3abdc
commit 996c9cff8a
3 changed files with 71 additions and 56 deletions
--- a/include/Analysis/QoREstimation.h
+++ b/include/Analysis/QoREstimation.h
@ -108,7 +108,7 @@ public:
  /// Block scheduler and estimator.
  int64_t getResourceMap(Block &block, ResourceMap &addFMap,
                         ResourceMap &mulFMap);
-  int64_t estimateResource(Block &block);
+  int64_t estimateResource(Block &block, int64_t interval = -1);
  Optional<std::pair<int64_t, int64_t>> estimateBlock(Block &block,
                                                      int64_t begin);
  void reverseSchedule();
--- a/lib/Analysis/QoREstimation.cpp
+++ b/lib/Analysis/QoREstimation.cpp
@ -443,6 +443,39 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
    setAttrValue(op, "trip_count", (int64_t)1);
  }

+  // If the current loop is annotated as pipelined loop, extra dependency and
+  // resource aware II analysis will be executed.
+  if (getBoolAttrValue(op, "pipeline")) {
+    // Collect load and store operations in the loop block for solving possible
+    // carried dependencies.
+    // TODO: include CallOps, how? Maybe we need to somehow analyze the memory
+    // access behavior of the CallOp.
+    MemAccessesMap map;
+    getMemAccessesMap(loopBlock, map);
+
+    // Calculate initial interval.
+    auto II = max(getResMinII(map), getDepMinII(op, map));
+    // auto II = max({getOpMinII(op), getResMinII(map), getDepMinII(op, map)});
+    setAttrValue(op, "init_interval", II);
+
+    auto tripCount = getIntAttrValue(op, "trip_count");
+    setAttrValue(op, "flatten_trip_count", tripCount);
+
+    // Calculate latency of each iteration.
+    auto iterLatency = end - begin;
+    setAttrValue(op, "iter_latency", iterLatency);
+
+    auto latency = iterLatency + II * (tripCount - 1);
+    setAttrValue(op, "latency", latency);
+
+    // Entering and leaving a loop will consume extra 2 clock cycles.
+    setScheduleValue(op, begin, begin + latency + 2);
+
+    // Estimate the loop block resource utilization.
+    setAttrValue(op, "dsp", estimateResource(loopBlock, II));
+    return true;
+  }
+
  // If the current loop is annotated as flatten, it will be flattened into the
  // child pipelined loop. This will increase the flattened loop trip count
  // without changing the iteration latency.
@ -471,50 +504,16 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
    return true;
  }

-  // Estimate the loop block resource utilization.
-  auto resource = estimateResource(loopBlock);
-
-  // Calculate latency of each iteration.
+  // Default case (not flattened or pipelined), calculate latency and resource
+  // utilization accordingly.
  auto iterLatency = end - begin;
  setAttrValue(op, "iter_latency", iterLatency);

-  // If the current loop is annotated as pipelined loop, extra dependency and
-  // resource aware II analysis will be executed.
-  if (getBoolAttrValue(op, "pipeline")) {
-    // Collect load and store operations in the loop block for solving possible
-    // carried dependencies.
-    // TODO: include CallOps, how? Maybe we need to somehow analyze the memory
-    // access behavior of the CallOp.
-    MemAccessesMap map;
-    getMemAccessesMap(loopBlock, map);
-
-    // Calculate initial interval.
-    auto II = max(getResMinII(map), getDepMinII(op, map));
-    // auto II = max({getOpMinII(op), getResMinII(map), getDepMinII(op, map)});
-    setAttrValue(op, "init_interval", II);
-
-    auto tripCount = getIntAttrValue(op, "trip_count");
-    setAttrValue(op, "flatten_trip_count", tripCount);
-
-    auto latency = iterLatency + II * (tripCount - 1);
-    setAttrValue(op, "latency", latency);
-
-    // Entering and leaving a loop will consume extra 2 clock cycles.
-    setScheduleValue(op, begin, begin + latency + 2);
-
-    // TODO: For pipelined loop, we also need to consider the II when estimating
-    // resource utilization.
-    setAttrValue(op, "dsp", resource);
-    return true;
-  }
-
-  // Default case (not flattend or pipelined), calculate latency and resource
-  // utilization accordingly.
  auto latency = iterLatency * getIntAttrValue(op, "trip_count");
  setAttrValue(op, "latency", latency);

  setScheduleValue(op, begin, begin + latency + 2);
-  setAttrValue(op, "dsp", resource);
+  setAttrValue(op, "dsp", estimateResource(loopBlock));
  return true;
 }

@ -598,24 +597,41 @@ int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap,
  return loopResource;
 }

-int64_t HLSCppEstimator::estimateResource(Block &block) {
+int64_t HLSCppEstimator::estimateResource(Block &block, int64_t interval) {
  ResourceMap addFMap;
  ResourceMap mulFMap;
  auto loopResource = getResourceMap(block, addFMap, mulFMap);

  // Find the max resource utilization across all schedule levels.
  int64_t maxAddF = 0;
-  for (auto level : addFMap)
+  int64_t totalAddF = 0;
+  for (auto level : addFMap) {
    maxAddF = max(maxAddF, level.second);
+    totalAddF += level.second;
+  }

  int64_t maxMulF = 0;
-  for (auto level : mulFMap)
+  int64_t totalMulF = 0;
+  for (auto level : mulFMap) {
    maxMulF = max(maxMulF, level.second);
+    totalMulF += level.second;
+  }
+
+  // Calculate the total number of fadd and fmul operations, as each operation
+  // will cover {latency + 1} scheduling levels.
+  totalAddF /= (latencyMap["fadd"] + 1);
+  totalMulF /= (latencyMap["fmul"] + 1);
+
+  // If the block is pipelined (interval is positive), the minimum resource
+  // utilization is determined by interval. We assume the loop resource
+  // utilization cannot be shared. Therefore, the overall resource utilization
+  // is loops' plus other operations'. According to profiling, floating-point
+  // add and multiply will consume 2 and 3 DSP units, respectively.
+  if (interval > 0) {
+    auto minResource = (totalAddF * 2 + totalMulF * 3) / interval;
+    return loopResource + max(maxAddF * 2 + maxMulF * 3, minResource);
+  }

-  // We assume the loop resource utilization cannot be shared. Therefore, the
-  // overall resource utilization is loops' plus other operstions'. According to
-  // profiling, floating-point add and muliply will consume 2 and 3 DSP units,
-  // respectively.
  return loopResource + maxAddF * 2 + maxMulF * 3;
 }

@ -716,16 +732,14 @@ void HLSCppEstimator::estimateFunc() {
    // TODO: support dataflow interval estimation.

    // TODO: support CallOp inside of the function.
-    if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) {
-      if (attr.getValue()) {
-        // Collect all memory access operations for calculating II.
-        MemAccessesMap map;
-        getMemAccessesMap(func.front(), map);
+    if (getBoolAttrValue(func, "pipeline")) {
+      // Collect all memory access operations for calculating II.
+      MemAccessesMap map;
+      getMemAccessesMap(func.front(), map);

-        // Calculate initial interval.
-        auto II = max(getResMinII(map), getDepMinII(func, map));
-        setAttrValue(func, "interval", II);
-      }
+      // Calculate initial interval.
+      auto II = max(getResMinII(map), getDepMinII(func, map));
+      setAttrValue(func, "interval", II);
    }

    // Scheduled levels of all operations are reversed in this method, because
@ -741,7 +755,8 @@ void HLSCppEstimator::estimateFunc() {
  }

  // Estimate the resource utilization of the function.
-  setAttrValue(func, "dsp", estimateResource(func.front()));
+  auto interval = getIntAttrValue(func, "interval");
+  setAttrValue(func, "dsp", estimateResource(func.front(), interval));
  // TODO: estimate BRAM and LUT utilization.
 }

--- a/samples/polybench/syrk.mlir
+++ b/samples/polybench/syrk.mlir
@ -8,7 +8,7 @@ func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16
      %0 = affine.load %C[%i, %j] : memref<16x16xf32>
      %1 = mulf %beta, %0 : f32
      affine.store %1, %C[%i, %j] : memref<16x16xf32>
-      affine.for %k = 0 to 8 {
+      affine.for %k = 0 to 16 {
        %2 = affine.load %A[%i, %k] : memref<16x16xf32>
        %3 = affine.load %A[%j, %k] : memref<16x16xf32>
        %4 = affine.load %C[%i, %j] : memref<16x16xf32>