//===----------------------------------------------------------------------===//
//
// Copyright 2020-2021 The ScaleHLS Authors.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Support/FileUtilities.h"
#include "scalehls/Transforms/Estimator.h"
#include "scalehls/Transforms/Passes.h"
#include "llvm/Support/MemoryBuffer.h"

using namespace std;
using namespace mlir;
using namespace scalehls;
using namespace hls;

//===----------------------------------------------------------------------===//
// LoadOp and StoreOp Related Methods
//===----------------------------------------------------------------------===//

/// Calculate the overall partition index.
void ScaleHLSEstimator::getPartitionIndices(Operation *op) {
  auto builder = Builder(op);
  auto access = MemRefAccess(op);
  auto memrefType = access.memref.getType().cast<MemRefType>();

  // If the layout map is an identity map, the memory is not partitioned.
  auto layoutMap = memrefType.getLayout().getAffineMap();
  if (layoutMap.isIdentity()) {
    auto partitionIndices = SmallVector<int64_t, 8>(memrefType.getRank(), 0);
    op->setAttr("partition_indices", builder.getI64ArrayAttr(partitionIndices));
    return;
  }

  AffineValueMap accessMap;
  access.getAccessMap(&accessMap);

  // Replace all dims in the memory access AffineMap with (step * dims). This
  // will ensure the "cyclic" array partition can be correctly detected.
  SmallVector<AffineExpr, 4> dimReplacements;
  SmallVector<AffineExpr, 4> symReplacements;

  unsigned operandIdx = 0;
  for (auto operand : accessMap.getOperands()) {
    if (operandIdx < accessMap.getNumDims()) {
      int64_t step = 1;
      if (isForInductionVar(operand))
        step = getForInductionVarOwner(operand).getStep();

      dimReplacements.push_back(step * builder.getAffineDimExpr(operandIdx));
    } else {
      symReplacements.push_back(
          builder.getAffineSymbolExpr(operandIdx - accessMap.getNumDims()));
    }
    ++operandIdx;
  }

  auto newMap = accessMap.getAffineMap().replaceDimsAndSymbols(
      dimReplacements, symReplacements, accessMap.getNumDims(),
      accessMap.getNumSymbols());

  // Compose the access map with the layout map.
  auto composeMap = layoutMap.compose(newMap);

  // Collect partition factors.
  SmallVector<int64_t, 8> factors;
  getPartitionFactors(memrefType, &factors);

  // Calculate the partition index of this load/store operation honoring the
  // partition strategy applied.
  SmallVector<int64_t, 8> partitionIndices;
  int64_t maxMuxSize = 1;
  bool hasUncertainIdx = false;

  for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
    auto idxExpr = composeMap.getResult(dim);

    if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>())
      partitionIndices.push_back(constExpr.getValue());
    else {
      partitionIndices.push_back(-1);
      maxMuxSize = max(maxMuxSize, factors[dim]);
      hasUncertainIdx = true;
    }
  }

  op->setAttr("partition_indices", builder.getI64ArrayAttr(partitionIndices));
  if (hasUncertainIdx)
    op->setAttr("max_mux_size", builder.getI64IntegerAttr(maxMuxSize));
}

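// Illustrative example for getPartitionIndices (hypothetical shapes, not
// tied to any specific test case): for a memref cyclically partitioned by a
// factor of 2 along dimension 0, the layout map is roughly
// (d0) -> (d0 mod 2, ...). An access a[i] inside a loop with step 2 is
// rewritten to (2 * d0), which composes with the layout map to the constant
// 0, so the access always hits partition 0. With step 1 the composed index
// stays non-constant, so the dimension is recorded as -1 and a multiplexer
// of size factors[dim] = 2 is assumed.
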
/// Schedule a load/store operation honoring the memory port limitation.
void ScaleHLSEstimator::estimateLoadStoreTiming(Operation *op, int64_t begin) {
  auto access = MemRefAccess(op);
  auto memref = access.memref;
  auto memrefType = memref.getType().cast<MemRefType>();

  // No port limitation for single-element memories as they are implemented
  // with registers.
  if (memrefType.getNumElements() == 1) {
    setTiming(op, begin, begin + 1, 1, 1);
    return;
  }

  SmallVector<int64_t, 8> factors;
  auto partitionNum = getPartitionFactors(memrefType, &factors);
  auto storageType = MemoryKind(memrefType.getMemorySpaceAsInt());
  auto partitionIndices = getIntArrayAttrValue(op, "partition_indices");

  // Try to avoid memory port violation until a legal schedule is found. Since
  // an infinite length schedule cannot be generated, this loop is guaranteed
  // to terminate.
  int64_t resMinII = 1;
  for (;; ++begin) {
    auto &memPortInfos = memPortInfosMap[begin][memref];

    // If the memory has not been occupied by the current schedule level, it
    // should be initialized according to its storage type. Note that each
    // partition should have one PortInfo structure, where the default case is
    // BRAM_S2P.
    if (memPortInfos.empty())
      for (unsigned p = 0; p < partitionNum; ++p) {
        MemPortInfo info;

        if (storageType == MemoryKind::BRAM_1P)
          info.rdwrPort = 1;
        else if (storageType == MemoryKind::BRAM_T2P)
          info.rdwrPort = 2;
        else if (storageType == MemoryKind::BRAM_S2P)
          info.rdPort = 1, info.wrPort = 1;
        else if (storageType == MemoryKind::DRAM)
          info.rdPort = UINT_MAX, info.wrPort = UINT_MAX;
        else
          llvm_unreachable("unknown memory kind");

        memPortInfos.push_back(info);
      }

    // Indicate whether the memory access operation is successfully scheduled
    // in the current schedule level.
    bool successFlag = true;

    // Walk through all partitions to check whether the current partition is
    // occupied and whether the available memory ports are enough to schedule
    // the current memory access operation.
    for (int64_t idx = 0; idx < partitionNum; ++idx) {
      bool isOccupied = true;
      int64_t accumFactor = 1;

      for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
        // If the index is -1, all ports in the current partition will be
        // occupied and a multiplexer will be generated in HLS.
        if (partitionIndices[dim] != -1 &&
            idx / accumFactor % factors[dim] != partitionIndices[dim]) {
          isOccupied = false;
          break;
        }
        accumFactor *= factors[dim];
      }

      if (isOccupied) {
        auto &info = memPortInfos[idx];
        if (isa<AffineReadOpInterface>(op)) {
          bool hasIdenticalAccess = false;
          // The rationale is that as long as the current read operation has
          // identical memory access information with any scheduled read
          // operation, the schedule will succeed.
          for (auto rdAccess : info.rdAccesses) {
            if (access == rdAccess &&
                op->getBlock() == rdAccess.opInst->getBlock())
              hasIdenticalAccess = true;
          }

          if (hasIdenticalAccess)
            continue;

          if (info.rdPort > 0) {
            info.rdPort--;
            info.rdAccesses.push_back(access);
          } else if (info.rdwrPort > 0) {
            info.rdwrPort--;
            info.rdAccesses.push_back(access);
          } else {
            successFlag = false;
            break;
          }
        } else if (isa<AffineWriteOpInterface>(op)) {
          if (info.wrPort > 0)
            info.wrPort--;
          else if (info.rdwrPort > 0)
            info.rdwrPort--;
          else {
            successFlag = false;
            break;
          }
        }
      }
    }

    if (successFlag)
      break;
    ++resMinII;
  }

  if (isa<AffineReadOpInterface>(op))
    setTiming(op, begin, begin + 2, 2, 1);
  else
    setTiming(op, begin, begin + 1, 1, 1);
}

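// Illustrative scheduling walk-through for estimateLoadStoreTiming
// (hypothetical numbers): with a BRAM_S2P partition (1 read port and 1 write
// port), two distinct reads of the same partition cannot share schedule
// level `begin`. The first read consumes the read port; the second finds
// rdPort == 0 and rdwrPort == 0, so successFlag turns false, `begin` is
// bumped, and the read is retried at the next level, where a fresh
// MemPortInfo is initialized.
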
//===----------------------------------------------------------------------===//
// AffineForOp Related Methods
//===----------------------------------------------------------------------===//

static int64_t getMaxMuxSize(Operation *op) {
  if (auto maxMuxSize = op->getAttrOfType<IntegerAttr>("max_mux_size"))
    return maxMuxSize.getInt();
  else
    return 1;
}

static bool isNoTouch(Operation *op) {
  if (auto noTouch = op->getAttrOfType<BoolAttr>("no_touch"))
    if (noTouch.getValue())
      return true;

  return false;
}

int64_t ScaleHLSEstimator::getResMinII(int64_t begin, int64_t end,
                                       MemAccessesMap &map) {
  int64_t II = 1;
  for (auto &pair : map) {
    auto memref = pair.first;
    auto memrefType = memref.getType().cast<MemRefType>();
    auto partitionNum = getPartitionFactors(memrefType);
    auto storageType = MemoryKind(memrefType.getMemorySpaceAsInt());

    // FIXME: Study how Vivado HLS handles AXI interfaces.
    if (storageType == MemoryKind::DRAM)
      continue;

    auto accessNum = SmallVector<int64_t, 16>(partitionNum, 0);
    // Prepare for the BRAM_S2P memory kind.
    auto writeNum = SmallVector<int64_t, 16>(partitionNum, 0);

    // TODO: fine-tune for BRAM_T2P.
    for (int64_t level = begin; level < end; ++level)
      if (memPortInfosMap.count(level))
        if (memPortInfosMap[level].count(memref)) {
          auto &memPortInfos = memPortInfosMap[level][memref];

          for (int64_t idx = 0; idx < partitionNum; ++idx) {
            auto &info = memPortInfos[idx];
            if (storageType == MemoryKind::BRAM_1P && info.rdwrPort < 1)
              ++accessNum[idx];
            else if (storageType == MemoryKind::BRAM_T2P && info.rdwrPort < 2)
              ++accessNum[idx];
            else if (info.rdPort < 1)
              ++accessNum[idx];
            else if (info.wrPort < 1)
              ++writeNum[idx];
          }
        }

    II = max({II, *std::max_element(writeNum.begin(), writeNum.end()),
              *std::max_element(accessNum.begin(), accessNum.end())});
  }
  return II;
}

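// Illustrative calculation for getResMinII (hypothetical numbers): if one
// BRAM_S2P partition is fully read-occupied (rdPort < 1) at 3 of the
// schedule levels in [begin, end), its accessNum is 3, so the
// resource-constrained minimum II is at least 3: each pipeline iteration
// needs 3 cycles' worth of that partition's single read port.
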
/// Calculate the minimum dependency II of a function.
int64_t ScaleHLSEstimator::getDepMinII(int64_t II, FuncOp func,
                                       MemAccessesMap &map) {
  for (auto &pair : map) {
    auto loadStores = pair.second;

    // Walk through each pair of dependency source and destination.
    for (auto dstOp : loadStores)
      for (auto srcOp : loadStores) {
        if (dstOp == srcOp)
          continue;

        // If the delay is not larger than the current II, stop and continue
        // because the minimum distance is one.
        auto delay = getTiming(dstOp).getEnd() - getTiming(srcOp).getBegin();
        if (delay <= II)
          continue;

        // In some cases RAR is not considered as a dependency in Vivado HLS.
        auto srcMuxSize = getMaxMuxSize(srcOp);
        auto dstMuxSize = getMaxMuxSize(dstOp);
        if (isa<AffineReadOpInterface>(srcOp) && srcMuxSize <= 3 &&
            isa<AffineReadOpInterface>(dstOp) && dstMuxSize <= 3)
          continue;

        // The distance is always 1, thus the minimum II is equal to the
        // delay.
        // TODO: need more case study.
        if (MemRefAccess(srcOp) == MemRefAccess(dstOp))
          II = max(II, delay);
      }
  }
  return II;
}

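// Illustrative case for the function-level II (hypothetical numbers): two
// identical accesses inside a pipelined function body, where the source
// begins at cycle 1 and the destination ends at cycle 5, give delay = 4.
// Since identical accesses are assumed to have distance 1, the
// function-level II is raised to 4.
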
/// Calculate the minimum dependency II of a loop.
int64_t ScaleHLSEstimator::getDepMinII(int64_t II, AffineForOp forOp,
                                       MemAccessesMap &map) {
  AffineLoopBand band;
  getLoopIVs(forOp.front(), &band);

  // Find all loop levels whose dependencies need to be checked.
  SmallVector<unsigned, 8> loopDepths;
  for (unsigned i = 1, e = band.size(); i <= e; ++i) {
    auto loop = band[i - 1];
    auto loopDirect = getLoopDirective(loop);
    if (!hasParallelAttr(forOp) && loopDirect)
      if (loopDirect.getFlatten() || loopDirect.getPipeline())
        loopDepths.push_back(i);
  }

  for (auto &pair : map) {
    auto loadStores = pair.second;

    // Walk through each pair of source and destination.
    unsigned dstIndex = 1;
    for (auto dstOp : loadStores) {
      auto srcOps = SmallVector<Operation *, 16>(
          llvm::drop_begin(loadStores, dstIndex++));

      for (auto it = srcOps.rbegin(); it != srcOps.rend(); ++it) {
        auto srcOp = *it;

        // If the delay is not larger than the current II, stop and continue
        // because the minimum distance is one.
        auto delay = getTiming(dstOp).getEnd() - getTiming(srcOp).getBegin();
        if (delay <= II)
          continue;

        // In some cases RAR is not considered as a dependency in Vivado HLS.
        auto srcMuxSize = getMaxMuxSize(srcOp);
        auto dstMuxSize = getMaxMuxSize(dstOp);
        if (isa<AffineReadOpInterface>(srcOp) && srcMuxSize <= 3 &&
            isa<AffineReadOpInterface>(dstOp) && dstMuxSize <= 3)
          continue;

        // Now we must check whether a carried dependency exists and calculate
        // the dependency distance if required.
        MemRefAccess dstAccess(dstOp);
        MemRefAccess srcAccess(srcOp);

        // If depAnalysis is not set, we analyze the dependency only when the
        // two memref accesses are identical.
        if (!depAnalysis && dstAccess != srcAccess)
          continue;

        for (auto depth : loopDepths) {
          FlatAffineValueConstraints depConstrs;
          SmallVector<DependenceComponent, 2> depComps;

          DependenceResult result = checkMemrefAccessDependence(
              srcAccess, dstAccess, depth, &depConstrs, &depComps,
              /*allowRAR=*/true);

          if (hasDependence(result)) {
            int64_t distance = 0;

            if (dstMuxSize > 3 || srcMuxSize > 3) {
              // If the two memory accesses are identical or one of them is
              // implemented as a separate function call, the dependency
              // exists.
              distance = 1;
            } else {
              SmallVector<int64_t, 8> accumTrips;
              accumTrips.push_back(1);

              // Calculate the distance of this dependency.
              for (auto i = depComps.rbegin(); i < depComps.rend(); ++i) {
                auto dep = *i;
                auto loop = cast<AffineForOp>(dep.op);

                auto ub = dep.ub.getValue();
                auto lb = dep.lb.getValue();

                // If ub is not less than zero, calculate the minimum positive
                // distance. Otherwise, set the distance to negative and
                // break.
                if (ub >= 0)
                  distance +=
                      accumTrips.back() * max(lb, (int64_t)0) / loop.getStep();
                else {
                  distance = -1;
                  break;
                }
                accumTrips.push_back(accumTrips.back() *
                                     getAverageTripCount(loop).getValue());
              }
            }

            // We only consider intra-dependencies with a positive distance.
            if (distance > 0) {
              int64_t minII = std::ceil((float)delay / distance);
              II = max(II, minII);
            }
          }
        }
      }
    }
  }
  return II;
}

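// Illustrative arithmetic for the loop-level II (hypothetical numbers): if a
// loop-carried dependency has delay = 6 cycles and distance = 2 iterations,
// the pipeline must satisfy II >= ceil(6 / 2) = 3; iteration i+2 may only
// read a value 6 cycles after iteration i starts producing it, i.e. 3 cycles
// per iteration.
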
bool ScaleHLSEstimator::visitOp(AffineForOp op, int64_t begin) {
  // If a loop is marked as no_touch, directly infer the schedule end with the
  // existing latency.
  if (isNoTouch(op)) {
    auto timing = getTiming(op);
    auto resource = getResource(op);

    if (timing && resource) {
      auto latency = timing.getLatency();
      setTiming(op, begin, begin + latency, latency, latency);
      return true;
    }
  }

  // Set an attribute indicating the trip count. For now, we assume all loops
  // have static loop bounds.
  auto optionalTripCount = getAverageTripCount(op);
  if (!optionalTripCount)
    return false;
  auto tripCount = optionalTripCount.getValue();

  // Estimate the contained loop block.
  auto &loopBlock = *op.getBody();
  auto timing = estimateBlock(loopBlock, begin);
  if (!timing)
    return false;

  assert(begin == timing.getBegin() && "unexpected estimation result");
  auto end = timing.getEnd();

  // Handle pipelined or flattened loops.
  if (auto loopDirect = getLoopDirective(op)) {
    // If the current loop is annotated as a pipelined loop, extra dependency-
    // and resource-aware II analysis will be executed.
    if (loopDirect.getPipeline()) {
      // Collect load and store operations in the loop block for solving
      // possible carried dependencies.
      // TODO: include CallOps, how? It seems dependencies always exist for
      // all CallOps no matter their access patterns.
      MemAccessesMap map;
      getMemAccessesMap(loopBlock, map);

      // Calculate the initiation interval.
      auto targetII = loopDirect.getTargetII();
      auto resII = getResMinII(begin, end, map);
      auto depII = getDepMinII(max(targetII, resII), op, map);
      auto II = max({targetII, resII, depII});

      // Calculate the latency of each iteration and update loop information.
      auto iterLatency = end - begin;
      setLoopInfo(op, tripCount, iterLatency, II);

      // Entering and leaving a loop consume an extra 2 clock cycles.
      auto latency = iterLatency + II * (tripCount - 1) + 2;
      setTiming(op, begin, begin + latency, latency, latency);

      // Once the loop is pipelined, the resource sharing scheme is different.
      // Specifically, all operators are shared inside of II cycles. Therefore,
      // we need to update the numOperatorMap here.
      for (auto &pair : totalNumOperatorMap)
        pair.second = (pair.second + II - 1) / II;

      for (auto i = begin; i < end; ++i)
        numOperatorMap.erase(i);
      numOperatorMap[begin] = totalNumOperatorMap;
      return true;
    }

    // If the current loop is annotated as flatten, it will be flattened into
    // the child pipelined loop. This will increase the flattened loop trip
    // count without changing the iteration latency.
    else if (loopDirect.getFlatten()) {
      auto child = dyn_cast<AffineForOp>(op.getBody()->front());
      assert(child && "the first contained operation is not a loop");
      auto childLoopInfo = getLoopInfo(child);

      // A flattened loop shares the same II with its child loop. Calculate
      // the latency of each iteration and update loop information.
      auto iterLatency = childLoopInfo.getIterLatency();
      auto flattenTripCount = childLoopInfo.getFlattenTripCount() * tripCount;
      auto II = childLoopInfo.getMinII();
      setLoopInfo(op, flattenTripCount, iterLatency, II);

      auto latency = iterLatency + II * (flattenTripCount - 1) + 2;
      setTiming(op, begin, begin + latency, latency, latency);
      return true;
    }
  }

  // Default case (not flattened or pipelined): calculate the latency and
  // resource utilization accordingly.
  auto iterLatency = end - begin;
  setLoopInfo(op, tripCount, iterLatency, iterLatency);

  auto latency = iterLatency * tripCount + 2;
  setTiming(op, begin, begin + latency, latency, latency);
  return true;
}

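// Illustrative latency arithmetic for a pipelined loop (hypothetical
// numbers): with tripCount = 10, iterLatency = 5, and II = 2, the loop is
// estimated as 5 + 2 * (10 - 1) + 2 = 25 cycles, where the trailing 2
// accounts for loop entry and exit overhead.
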
//===----------------------------------------------------------------------===//
// Other Operation Handlers
//===----------------------------------------------------------------------===//

bool ScaleHLSEstimator::visitOp(AffineIfOp op, int64_t begin) {
  auto end = begin;
  auto thenBlock = op.getThenBlock();

  // Estimate the then block.
  if (auto timing = estimateBlock(*thenBlock, begin))
    end = max(end, timing.getEnd());
  else
    return false;

  // Handle the else block if required.
  if (op.hasElse()) {
    auto elseBlock = op.getElseBlock();

    if (auto timing = estimateBlock(*elseBlock, begin))
      end = max(end, timing.getEnd());
    else
      return false;
  }

  // In our assumption, AffineIfOp is completely transparent. Therefore, we
  // set a dummy schedule begin here.
  setTiming(op, begin, end, end - begin, 0);
  return true;
}

bool ScaleHLSEstimator::visitOp(scf::IfOp op, int64_t begin) {
  auto end = begin;
  auto thenBlock = op.thenBlock();

  // Estimate the then block.
  if (auto timing = estimateBlock(*thenBlock, begin))
    end = max(end, timing.getEnd());
  else
    return false;

  // Handle the else block if required.
  if (auto elseBlock = op.elseBlock()) {
    if (auto timing = estimateBlock(*elseBlock, begin))
      end = max(end, timing.getEnd());
    else
      return false;
  }

  // In our assumption, scf::IfOp is completely transparent. Therefore, we
  // set a dummy schedule begin here.
  setTiming(op, begin, end, end - begin, 0);
  return true;
}

bool ScaleHLSEstimator::visitOp(func::CallOp op, int64_t begin) {
  auto callee = SymbolTable::lookupNearestSymbolFrom(op, op.getCalleeAttr());
  auto subFunc = dyn_cast<FuncOp>(callee);
  assert(subFunc && "callable is not a function operation");

  ScaleHLSEstimator estimator(latencyMap, dspUsageMap, depAnalysis);
  estimator.estimateFunc(subFunc);

  // We assume entering and leaving the sub-function require an extra 2 clock
  // cycles.
  if (auto timing = getTiming(subFunc)) {
    auto latency = timing.getLatency();
    setTiming(op, begin, begin + latency, latency, timing.getInterval());
    setResource(op, getResource(subFunc));
    return true;
  } else
    return false;
}

//===----------------------------------------------------------------------===//
// Block Scheduler and Estimator
//===----------------------------------------------------------------------===//

// Get the pointer of the srcOp's parent loop, which should be located at the
// same level with any parent loop of dstOp.
static Operation *getSameLevelDstOp(Operation *srcOp, Operation *dstOp) {
  // If srcOp and dstOp are already at the same level, return the dstOp.
  if (checkSameLevel(srcOp, dstOp))
    return dstOp;

  // Helper to get all surrounding AffineForOps. AffineIfOps are skipped.
  auto getSurroundFors =
      ([&](Operation *op, SmallVector<Operation *, 4> &nests) {
        nests.push_back(op);
        auto currentOp = op;
        while (true) {
          auto parentOp = currentOp->getParentOp();
          if (isa<AffineForOp>(parentOp)) {
            nests.push_back(parentOp);
            currentOp = parentOp;
          } else if (isa<AffineIfOp, scf::IfOp>(parentOp))
            currentOp = parentOp;
          else
            break;
        }
      });

  SmallVector<Operation *, 4> srcNests;
  SmallVector<Operation *, 4> dstNests;

  getSurroundFors(srcOp, srcNests);
  getSurroundFors(dstOp, dstNests);

  // If any parent of srcOp (or itself) and any parent of dstOp (or itself)
  // are at the same level, return the pointer.
  for (auto src : srcNests)
    for (auto dst : dstNests)
      if (checkSameLevel(src, dst))
        return dst;

  return nullptr;
}

/// Estimate the latency of a block with an ALAP scheduling strategy and
/// return the estimated timing attribute.
TimingAttr ScaleHLSEstimator::estimateBlock(Block &block, int64_t begin) {
  if (!isa<AffineIfOp, scf::IfOp>(block.getParentOp()))
    totalNumOperatorMap.clear();

  auto blockBegin = begin;
  auto blockEnd = begin;

  // Reversely walk through all operations in the block.
  for (auto i = block.rbegin(), e = block.rend(); i != e; ++i) {
    auto op = &*i;
    auto opBegin = begin;
    auto opEnd = begin;

    // Calculate the partition indices of memory load and store operations.
    if (isa<AffineLoadOp, AffineStoreOp>(op))
      getPartitionIndices(op);

    // Find the latest arrived successor depending on the current operation.
    for (auto user : op->getUsers()) {
      auto sameLevelUser = getSameLevelDstOp(op, user);
      opBegin = max(opBegin, getTiming(sameLevelUser).getEnd());
    }

    // A loop shouldn't overlap with any other scheduled operation. The
    // rationale here is that in Vivado HLS, a loop will always be blocked by
    // other operations before it, even if no actual dependency exists between
    // them.
    if (isa<mlir::AffineForOp>(op))
      opBegin = max(opBegin, blockEnd);

    // Check memory dependencies of the operation and update the schedule
    // level.
    for (auto operand : op->getOperands()) {
      if (operand.getType().isa<MemRefType>())
        // All users of the same memref value have the possibility to share a
        // dependency with the current operation.
        for (auto depOp : operand.getUsers()) {
          // If the depOp has not been scheduled or its schedule level will
          // not impact the current operation's scheduling, stop and continue.
          auto sameLevelDstOp = getSameLevelDstOp(op, depOp);
          auto depOpTiming = getTiming(sameLevelDstOp);
          if (!depOpTiming)
            continue;

          auto depOpEnd = depOpTiming.getEnd();
          if (depOpEnd <= opBegin || !DT.properlyDominates(op, depOp))
            continue;

          // If either the depOp or the current operation is a function call,
          // a dependency exists and the schedule level should be updated.
          if (isa<func::CallOp, memref::CopyOp>(op) ||
              isa<func::CallOp, memref::CopyOp>(depOp)) {
            opBegin = max(opBegin, depOpEnd);
            continue;
          }

          // Now both the depOp and the current operation must be memory
          // load/store operations.
          auto opMuxSize = getMaxMuxSize(op);
          auto depOpMuxSize = getMaxMuxSize(depOp);

          // In Vivado HLS, a memory access with an undetermined partition
          // index will be implemented as a function call with a multiplexer
          // if the partition factor is larger than 3. A function call has a
          // dependency with any load/store operation including RAR.
          if (isa<AffineReadOpInterface>(op) && opMuxSize <= 3 &&
              isa<AffineReadOpInterface>(depOp) && depOpMuxSize <= 3)
            continue;

          // Now we must check whether any dependency exists between the two
          // operations. If so, update the scheduling level.
          auto opAccess = MemRefAccess(op);
          auto depOpAccess = MemRefAccess(depOp);

          // If depAnalysis is not set, we analyze the dependency only when
          // the two memref accesses are identical.
          if (!depAnalysis && opAccess != depOpAccess)
            continue;

          AffineLoopBand commonLoops;
          auto loopDepth = getCommonSurroundingLoops(op, depOp, &commonLoops);

          for (unsigned depth = 1; depth <= loopDepth + 1; ++depth) {
            // Skip all parallel loop levels.
            if (depth != loopDepth + 1 &&
                hasParallelAttr(commonLoops[depth - 1]))
              continue;

            FlatAffineValueConstraints dependConstrs;
            DependenceResult result = checkMemrefAccessDependence(
                opAccess, depOpAccess, depth, &dependConstrs,
                /*dependenceComponents=*/nullptr, /*allowRAR=*/true);

            if (hasDependence(result)) {
              opBegin = max(opBegin, depOpEnd);
              break;
            }
          }
        }
    }

    // Estimate the current operation.
    if (dispatchVisitor(op, opBegin))
      opEnd = max(opEnd, getTiming(op).getEnd());
    else {
      op->emitError("Failed to estimate op");
      return TimingAttr();
    }

    // Update the block schedule end and begin.
    if (i == block.rbegin())
      blockBegin = opBegin;
    else
      blockBegin = min(blockBegin, opBegin);

    blockEnd = max(blockEnd, opEnd);
  }

  return TimingAttr::get(block.getParentOp()->getContext(), blockBegin,
                         blockEnd, blockEnd - blockBegin,
                         blockEnd - blockBegin);
}

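// Illustrative ALAP step (hypothetical numbers): when visiting an operation
// whose only user was already scheduled with a reversed end level of 7,
// opBegin becomes 7, so in reversed time the operation starts only after its
// user finishes; reverseTiming() later flips these reversed levels into
// forward cycle numbers.
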
/// Get the innermost surrounding operation, either an AffineForOp or a
/// FuncOp. In this method, AffineIfOp is transparent as well.
static Operation *getSurroundingOp(Operation *op) {
  auto currentOp = op;
  while (true) {
    auto parentOp = currentOp->getParentOp();
    if (isa<AffineIfOp, scf::IfOp>(parentOp))
      currentOp = parentOp;
    else if (isa<AffineForOp, FuncOp>(parentOp))
      return parentOp;
    else
      return nullptr;
  }
}

void ScaleHLSEstimator::reverseTiming(Block &block) {
  block.walk([&](Operation *op) {
    // Get the schedule level.
    if (auto timing = getTiming(op)) {
      auto begin = timing.getBegin();
      auto end = timing.getEnd();
      auto latency = timing.getLatency();
      auto interval = timing.getInterval();

      // Reverse the schedule level.
      if (auto srd = getSurroundingOp(op)) {
        if (isa<AffineForOp, scf::ForOp>(srd)) {
          auto srdBegin = getTiming(srd).getBegin();

          // Handle normal cases.
          auto iterLatency = getLoopInfo(srd).getIterLatency();
          setTiming(op, srdBegin + iterLatency - end,
                    srdBegin + iterLatency - begin, latency, interval);

          // Handle flattened surrounding loops.
          if (auto srdDirect = getLoopDirective(srd)) {
            if (srdDirect.getFlatten())
              setTiming(op, srdBegin, srdBegin + latency, latency, interval);
          }
        } else if (isa<FuncOp>(srd)) {
          auto srdLatency = getTiming(srd).getLatency() - 2;
          setTiming(op, srdLatency - end, srdLatency - begin, latency,
                    interval);
        } else
          op->emitError("unexpected surrounding operation");
      }
    }
  });
}

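// Illustrative flip performed by reverseTiming (hypothetical numbers):
// inside a loop with srdBegin = 0 and iterLatency = 5, a reversed timing of
// [begin, end) = [3, 5) becomes the forward timing
// [0 + 5 - 5, 0 + 5 - 3) = [0, 2), preserving the latency of 2.
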
void ScaleHLSEstimator::initEstimator(Block &block) {
  // Clear global maps and scheduling information.
  memPortInfosMap.clear();
  numOperatorMap.clear();

  block.walk([&](Operation *op) {
    if (!isNoTouch(op)) {
      op->removeAttr("resource");
      op->removeAttr("timing");
      op->removeAttr("loop_info");
    }
  });
}

ResourceAttr ScaleHLSEstimator::calculateResource(Operation *funcOrLoop) {
  // Calculate the static DSP and BRAM utilization.
  int64_t dspNum = 0;
  int64_t bramNum = 0;
  funcOrLoop->walk([&](Operation *op) {
    if (isa<func::CallOp>(op) || isNoTouch(op)) {
      // TODO: For now, we consider the resource utilization of sub-functions
      // to be static and not shareable. This is actually not true: resources
      // can be shared between different sub-functions to some extent, but
      // that sharing scheme has not been characterized by the estimator.
      if (auto resource = getResource(op))
        dspNum += resource.getDsp();

    } else if (isa<memref::AllocaOp, memref::AllocOp>(op)) {
      auto memrefType = op->getResult(0).getType().cast<MemRefType>();
      if (memrefType.getNumElements() > 1) {
        auto partitionNum = getPartitionFactors(memrefType);
        auto storageType = MemoryKind(memrefType.getMemorySpaceAsInt());

        // TODO: Support URAM and interface BRAMs?
        if (storageType == MemoryKind::BRAM_1P ||
            storageType == MemoryKind::BRAM_S2P ||
            storageType == MemoryKind::BRAM_T2P) {
          // Multiply the number of elements by the bit width of the element
          // type.
          // TODO: handle index types.
          int64_t memrefSize = memrefType.getElementTypeBitWidth() *
                               memrefType.getNumElements() / partitionNum;
          bramNum += ((memrefSize + 18000 - 1) / 18000) * partitionNum;
        }
      }
    }
  });

  auto timing = getTiming(funcOrLoop);
  assert(timing && "timing has not been estimated");

  llvm::StringMap<int64_t> operatorNums;
  for (auto level : numOperatorMap) {
    if (level.first < timing.getBegin() || level.first >= timing.getEnd())
      continue;

    for (auto &nameAndNum : level.second) {
      auto &num = operatorNums[nameAndNum.first()];
      num = max(num, nameAndNum.second);
    }
  }
  for (auto &nameAndNum : operatorNums)
    dspNum += dspUsageMap[nameAndNum.first()] * nameAndNum.second;

  return ResourceAttr::get(funcOrLoop->getContext(), 0, dspNum, bramNum);
}

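// Illustrative BRAM arithmetic (hypothetical shape): a 1024-element f32
// memref partitioned into 2 banks holds 32 * 1024 / 2 = 16384 bits per bank.
// Each bank rounds up to ceil(16384 / 18000) = 1 BRAM18K, so the allocation
// costs 1 * 2 = 2 BRAMs in total.
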
void ScaleHLSEstimator::estimateFunc(FuncOp func) {
  initEstimator(func.front());
  DT = DominanceInfo(func);

  // Collect all memory access operations for later use.
  MemAccessesMap map;
  getMemAccessesMap(func.front(), map);

  // Recursively estimate blocks in the function.
  auto timing = estimateBlock(func.front());
  if (!timing)
    return;

  auto latency = timing.getEnd() + 2;
  auto interval = latency;

  // Handle dataflowed or pipelined functions.
  if (auto funcDirect = getFuncDirective(func)) {
    if (funcDirect.getDataflow()) {
      interval = 1;
      for (auto callOp : func.getOps<func::CallOp>()) {
        auto subFuncLatency =
            getTiming(callOp).getEnd() - getTiming(callOp).getBegin();
        interval = max(interval, subFuncLatency);
      }

    } else if (funcDirect.getPipeline()) {
      // TODO: support CallOp inside of the function.
      auto targetInterval = funcDirect.getTargetInterval();
      auto resInterval = getResMinII(0, timing.getEnd(), map);
      auto depInterval =
          getDepMinII(max(targetInterval, resInterval), func, map);
      interval = max({targetInterval, resInterval, depInterval});
      // TODO: Tune numOperatorMap like visitOp(AffineForOp op);
    }
  }

  // Estimate and set the timing and resource attributes.
  setTiming(func, 0, latency, latency, interval);
  setResource(func, calculateResource(func));

  // The scheduled levels of all operations are reversed in this method
  // because we have done the ALAP scheduling in a reverse order. Note that
  // after the reverse, the annotated scheduling level of each operation is a
  // relative level of the nearest surrounding AffineForOp or FuncOp.
  reverseTiming(func.front());
}

void ScaleHLSEstimator::estimateLoop(AffineForOp loop, FuncOp func) {
  initEstimator(func.getBody().front());
  DT = DominanceInfo(loop);
  visitOp(loop, 0);
  setResource(loop, calculateResource(loop));
}

//===----------------------------------------------------------------------===//
// Entry of scalehls-opt
//===----------------------------------------------------------------------===//

void scalehls::getLatencyMap(llvm::json::Object *config,
                             llvm::StringMap<int64_t> &latencyMap) {
  auto frequency =
      config->getObject(config->getString("frequency").getValueOr("100MHz"));

  latencyMap["fadd"] = frequency->getInteger("fadd").getValueOr(4);
  latencyMap["fmul"] = frequency->getInteger("fmul").getValueOr(3);
  latencyMap["fdiv"] = frequency->getInteger("fdiv").getValueOr(15);
  latencyMap["fcmp"] = frequency->getInteger("fcmp").getValueOr(1);
  latencyMap["fexp"] = frequency->getInteger("fexp").getValueOr(8);
}

void scalehls::getDspUsageMap(llvm::json::Object *config,
                              llvm::StringMap<int64_t> &dspUsageMap) {
  auto dspUsage = config->getObject("dsp_usage");

  dspUsageMap["fadd"] = dspUsage->getInteger("fadd").getValueOr(2);
  dspUsageMap["fmul"] = dspUsage->getInteger("fmul").getValueOr(3);
  dspUsageMap["fdiv"] = dspUsage->getInteger("fdiv").getValueOr(0);
  dspUsageMap["fcmp"] = dspUsage->getInteger("fcmp").getValueOr(0);
  dspUsageMap["fexp"] = dspUsage->getInteger("fexp").getValueOr(7);
}

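// Hypothetical shape of the target spec JSON consumed above (the key names
// "frequency", "100MHz", and "dsp_usage" are taken from the code; the
// numbers are the illustrative defaults used when a key is absent, not
// measured data):
//
//   {
//     "frequency": "100MHz",
//     "100MHz": { "fadd": 4, "fmul": 3, "fdiv": 15, "fcmp": 1, "fexp": 8 },
//     "dsp_usage": { "fadd": 2, "fmul": 3, "fdiv": 0, "fcmp": 0, "fexp": 7 }
//   }
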
namespace {
struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
  QoREstimation() = default;
  QoREstimation(std::string qorTargetSpec) { targetSpec = qorTargetSpec; }

  void runOnOperation() override {
    auto module = getOperation();

    // Read the target specification JSON file.
    std::string errorMessage;
    auto configFile = mlir::openInputFile(targetSpec, &errorMessage);
    if (!configFile) {
      llvm::errs() << errorMessage << "\n";
      return signalPassFailure();
    }

    // Parse the JSON file into memory.
    auto config = llvm::json::parse(configFile->getBuffer());
    if (!config) {
      llvm::errs() << "failed to parse the target spec json file\n";
      return signalPassFailure();
    }
    auto configObj = config.get().getAsObject();
    if (!configObj) {
      llvm::errs() << "expected an object in the target spec json file, "
                      "found something else\n";
      return signalPassFailure();
    }

    // Collect profiling latency and DSP usage data, where the default values
    // are based on the Xilinx PYNQ-Z1 board.
    llvm::StringMap<int64_t> latencyMap;
    getLatencyMap(configObj, latencyMap);
    llvm::StringMap<int64_t> dspUsageMap;
    getDspUsageMap(configObj, dspUsageMap);

    // Estimate performance and resource utilization. If any other functions
    // are called by the top function, they will be estimated in the procedure
    // of estimating the top function.
    for (auto func : module.getOps<FuncOp>())
      if (hasTopFuncAttr(func))
        ScaleHLSEstimator(latencyMap, dspUsageMap, true).estimateFunc(func);
  }
};
} // namespace

std::unique_ptr<Pass>
scalehls::createQoREstimationPass(std::string qorTargetSpec) {
  return std::make_unique<QoREstimation>(qorTargetSpec);
}