[QoREstimation] design parameters data structure; initial impl of longer path search

2020-09-23 18:23:43 -05:00 · 2020-09-23 18:23:43 -05:00 · ce293ee4c5
parent 9049f0cec8
commit ce293ee4c5
3 changed files with 130 additions and 65 deletions
--- a/include/Dialect/HLSCpp/Attributes.td
+++ b/include/Dialect/HLSCpp/Attributes.td
@ -50,19 +50,4 @@ def LegalPartitionType : AttrConstraint<Or<[

 def PartitionTypeAttr : Confined<StrAttr, [LegalPartitionType]> {}

-//===----------------------------------------------------------------------===//
-// Pragma bind_op Constraints
-//===----------------------------------------------------------------------===//
-
-def LegalOpImpl : AttrConstraint<Or<[
-  CPred<"$_self.cast<StringAttr>().getValue() == \"dsp\"">,
-  CPred<"$_self.cast<StringAttr>().getValue() == \"fabric\"">,
-  CPred<"$_self.cast<StringAttr>().getValue() == \"meddsp\"">,
-  CPred<"$_self.cast<StringAttr>().getValue() == \"fulldsp\"">,
-  CPred<"$_self.cast<StringAttr>().getValue() == \"maxdsp\"">,
-  CPred<"$_self.cast<StringAttr>().getValue() == \"primitivedsp\"">
-]>>;
-
-def OpImplAttr : Confined<StrAttr, [LegalOpImpl]> {}
-
 #endif // SCALEHLS_DIALECT_HLSCPP_ATTRIBUTES_TD
--- a/lib/Transforms/PragmaDSE.cpp
+++ b/lib/Transforms/PragmaDSE.cpp
@ -6,8 +6,10 @@
 #include "Transforms/INIReader.h"
 #include "Transforms/Passes.h"

+using namespace std;
 using namespace mlir;
 using namespace scalehls;
+using namespace hlscpp;

 namespace {
 struct PragmaDSE : public PragmaDSEBase<PragmaDSE> {
--- a/lib/Transforms/QoREstimation.cpp
+++ b/lib/Transforms/QoREstimation.cpp
@ -6,66 +6,153 @@
 #include "Transforms/INIReader.h"
 #include "Transforms/Passes.h"

+using namespace std;
 using namespace mlir;
 using namespace scalehls;
+using namespace hlscpp;
+
+/// This class includes all possible parameters kind for "processes" (function,
+/// for/parallel loop, and if).
+enum class ProcParam {
+  // Pragam configurations.
+  EnablePipeline,
+  InitialInterval,
+  UnrollFactor,
+
+  // Performance parameters.
+  LoopBound,
+  IterLatency,
+  Latency,
+
+  // Resource parameters.
+  LUT,
+  DSP,
+  BRAM
+};
+
+/// This class includes all possible parameters kind for memories (memref,
+/// tensor, and vector).
+enum class MemParam {
+  // Pragma configurations.
+  StorageType,
+  StorageImpl,
+  PartitionType,
+  PartitionFactor,
+  InterfaceMode,
+
+  // Performance parameters.
+  ReadNum,
+  WriteNum,
+  ReadPorts,
+  WritePorts,
+  DepdcyLatency,
+  DepdcyDistance,
+
+  // Resource parameters.
+  LUT,
+  BRAM
+};

-/*
 namespace {
 class QoREstimator {
 public:
-  explicit QoREstimator(std::string toolConfigPath, std::string opLatencyPath) {
-    INIReader toolConfig(toolConfigPath);
-    if (toolConfig.ParseError())
-      llvm::outs() << "error: Tool configuration file parse fail.\n";
+  explicit QoREstimator(std::string targetSpecPath, std::string opLatencyPath);

-    INIReader opLatency(opLatencyPath);
-    if (opLatency.ParseError())
-      llvm::outs() << "error: Op latency file parse fail.\n";
-
-    auto freq = toolConfig.Get("config", "frequency", "200MHz");
-    auto latency = opLatency.GetInteger(freq, "op", 0);
-    llvm::outs() << latency << "\n";
+  /// Get parameters.
+  unsigned getMemParam(Value *mem, MemParam kind) {
+    return memParams[mem][(unsigned)kind];
+  }
+  unsigned getProcParam(Operation *proc, ProcParam kind) {
+    return procParams[proc][(unsigned)kind];
  }

-  void estimateLoop(AffineForOp loop);
+  /// These methods can extract static parameters and pragma configurations (if
+  /// applicable) of the input CDFG, and update them in procParams or memParams.
+  void analyzePragma(ModuleOp module);
+  void analyzeModule(ModuleOp module);
+
+  /// These methods can estimate the performance and resource utilization of a
+  /// specific MLIR structure, and update them in procParams or memroyParams.
+  void estimateAffineFor(AffineForOp affineFor);
+  void estimateAffineParallel(AffineParallelOp affineParallel);
+  void estimateAffineIf(AffineIfOp affineIf);
  void estimateFunc(FuncOp func);
  void estimateModule(ModuleOp module);

 private:
+  DenseMap<Operation *, SmallVector<unsigned, 9>> procParams;
+  DenseMap<Value *, SmallVector<unsigned, 13>> memParams;
+
+  // Set parameters.
+  void setMemParam(Value *mem, unsigned kind, unsigned param) {
+    memParams[mem][(unsigned)kind] = param;
+  }
+  void setProcParam(Operation *proc, MemParam kind, unsigned param) {
+    procParams[proc][(unsigned)kind] = param;
+  }
 };
 } // namespace

+/// Estimator constructor.
+QoREstimator::QoREstimator(std::string targetSpecPath,
+                           std::string opLatencyPath) {
+  INIReader targetSpec(targetSpecPath);
+  if (targetSpec.ParseError())
+    llvm::outs() << "error: target spec file parse fail, please refer to "
+                    "--help option and pass in correct file path\n";
+
+  INIReader opLatency(opLatencyPath);
+  if (opLatency.ParseError())
+    llvm::outs() << "error: Op latency file parse fail, please refer to "
+                    "--help option and pass in correct file path\n";
+
+  auto freq = targetSpec.Get("config", "frequency", "200MHz");
+  auto latency = opLatency.GetInteger(freq, "op", 0);
+  llvm::outs() << latency << "\n";
+}
+
+/// This method will search the longest path in a DAG block using a ASAP (As
+/// Soon As Possible) manner. Loop, function, if, and other operation owning
+/// regions will be considered as a whole.
+unsigned searchLongestPath(Block &block) {
+  DenseMap<Value, unsigned> valueReadyTime;
+  unsigned blockReadyTime = 0;
+  for (auto &op : block) {
+
+    // Calculate ready time of all predecessors.
+    unsigned allPredsReadyTime = 0;
+    for (auto operand : op.getOperands()) {
+      if (operand.getKind() == Value::Kind::BlockArgument)
+        continue;
+      else if (operand.getParentBlock() != &block)
+        continue;
+      else
+        allPredsReadyTime = max(allPredsReadyTime, valueReadyTime[operand]);
+    }
+
+    // Calculate ready time of the current operation.
+    unsigned opReadyTime = allPredsReadyTime + 1;
+    for (auto result : op.getResults())
+      valueReadyTime[result] = opReadyTime;
+
+    // Update block ready time.
+    blockReadyTime = max(blockReadyTime, opReadyTime);
+  }
+  return blockReadyTime;
+}
+
 /// For now, estimation for unrolled loops are following the analytical model
 /// of COMBA, which is suspected to be wrong. Meanwhile, we assume the absence
 /// of function call in the loop body.
-void QoREstimator::estimateLoop(AffineForOp loop) {
-  auto &body = loop.getLoopBody();
+void QoREstimator::estimateAffineFor(AffineForOp affineFor) {
+  auto &body = affineFor.getLoopBody();
  if (body.getBlocks().size() != 1)
-    loop.emitError("has zero or more than one basic blocks.");
+    affineFor.emitError("has zero or more than one basic blocks.");

-  auto paramOp = dyn_cast<hlscpp::LoopParamOp>(body.front().front());
-  if (!paramOp) {
-    loop.emitError("doesn't have parameter operations as front.");
-    return;
-  }
-
-  // TODO: a simple AEAP scheduling.
-  unsigned iterLatency = paramOp.getNonprocLatency();
  for (auto &op : body.front()) {
-    if (auto subLoop = dyn_cast<mlir::AffineForOp>(op)) {
-      estimateLoop(subLoop);
-      auto subParamOp =
-          dyn_cast<hlscpp::LoopParamOp>(subLoop.getLoopBody().front().front());
-      iterLatency += subParamOp.getLatency();
-    }
+    if (auto subAffineFor = dyn_cast<mlir::AffineForOp>(op))
+      estimateAffineFor(subAffineFor);
  }
-
-  unsigned latency = iterLatency;
-  // When loop is not completely unrolled.
-  if (paramOp.getLoopBound() > 1)
-    latency = iterLatency * paramOp.getLoopBound() * paramOp.getUnrollFactor();
-  auto builder = Builder(paramOp.getContext());
-  paramOp.setAttr("latency", builder.getUI32IntegerAttr(latency));
 }

 /// For now, function pipelining and task-level dataflow optimizations are not
@ -74,25 +161,18 @@ void QoREstimator::estimateFunc(FuncOp func) {
  if (func.getBlocks().size() != 1)
    func.emitError("has zero or more than one basic blocks.");

-  auto paramOp = dyn_cast<FuncParamOp>(func.front().front());
-  if (!paramOp) {
-    func.emitError("doesn't have parameter operations as front.");
-    return;
-  }
-
  // Recursively estimate latency of sub-elements, including functions and
  // loops. These sub-elements will be considered as a normal node in the CDFG
  // for function latency estimzation.
  for (auto &op : func.front()) {
    if (auto subFunc = dyn_cast<FuncOp>(op))
      estimateFunc(subFunc);
-    else if (auto subLoop = dyn_cast<AffineForOp>(op))
-      estimateLoop(subLoop);
+    else if (auto subAffineFor = dyn_cast<AffineForOp>(op))
+      estimateAffineFor(subAffineFor);
  }

  // Estimate function latency.
-  for (auto &op : func.front()) {
-  }
+  llvm::outs() << searchLongestPath(func.front()) << "\n";
 }

 void QoREstimator::estimateModule(ModuleOp module) {
@ -104,12 +184,10 @@ void QoREstimator::estimateModule(ModuleOp module) {
  }
 }

-*/
-
 namespace {
 struct QoREstimation : public QoREstimationBase<QoREstimation> {
  void runOnOperation() override {
-    // QoREstimator(toolConfig, opLatency).estimateModule(getOperation());
+    QoREstimator(targetSpec, opLatency).estimateModule(getOperation());
  }
 };
 } // namespace