From 512c842908091a6540f9d0446ae2e33b0fd4052a Mon Sep 17 00:00:00 2001
From: Hanchen Ye
Date: Thu, 17 Dec 2020 06:10:48 -0600
Subject: [PATCH] [QoREstimator] thoroughly rewrite the estimator with new strategy and structure

---
 include/Analysis/QoREstimation.h   | 102 ++--
 lib/Analysis/QoREstimation.cpp     | 724 ++++++++++++++++------------
 lib/Conversion/ConvertToHLSCpp.cpp |   4 +-
 lib/Transforms/ArrayPartition.cpp  |  16 +-
 lib/Transforms/LoopPipelining.cpp  |  36 +-
 5 files changed, 491 insertions(+), 391 deletions(-)

diff --git a/include/Analysis/QoREstimation.h b/include/Analysis/QoREstimation.h
index 4ea42eb..70b6222 100644
--- a/include/Analysis/QoREstimation.h
+++ b/include/Analysis/QoREstimation.h
@@ -7,6 +7,8 @@
 
 #include "Dialect/HLSCpp/Visitor.h"
 #include "INIReader.h"
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopUtils.h"
@@ -21,10 +23,12 @@ namespace scalehls {
 
 class HLSCppToolBase {
 public:
-  explicit HLSCppToolBase(OpBuilder &builder) : builder(builder) {}
+  explicit HLSCppToolBase(OpBuilder builder) : builder(builder) {}
 
-  /// Get value methods.
-  int64_t getIntAttrValue(Operation *op, StringRef name) {
+  OpBuilder builder;
+
+  /// Get attribute value methods.
+  int32_t getIntAttrValue(Operation *op, StringRef name) {
     if (auto attr = op->getAttrOfType<IntegerAttr>(name))
       return attr.getInt();
     else
@@ -52,6 +56,7 @@ public:
     return "";
   }
 
+  /// Get partition information methods.
   StringRef getPartitionType(ArrayOp op, unsigned dim) {
     if (auto attr = op.partition_type()[dim].cast<StringAttr>())
       return attr.getValue();
@@ -66,15 +71,15 @@ public:
     return 0;
   }
 
-  /// Set value methods.
-  void setAttrValue(Operation *op, StringRef name, unsigned value) {
-    op->setAttr(name, builder.getUI32IntegerAttr(value));
-  }
-
+  /// Set attribute value methods.
   void setAttrValue(Operation *op, StringRef name, int32_t value) {
     op->setAttr(name, builder.getI32IntegerAttr(value));
   }
 
+  void setAttrValue(Operation *op, StringRef name, unsigned value) {
+    op->setAttr(name, builder.getUI32IntegerAttr(value));
+  }
+
   void setAttrValue(Operation *op, StringRef name, bool value) {
     op->setAttr(name, builder.getBoolAttr(value));
   }
@@ -82,21 +87,6 @@ public:
   void setAttrValue(Operation *op, StringRef name, StringRef value) {
     op->setAttr(name, builder.getStringAttr(value));
   }
-
-  /// Get expression methods.
-  AffineExpr getSymbolExpr(unsigned value) {
-    return getAffineSymbolExpr(value, builder.getContext());
-  }
-
-  AffineExpr getDimExpr(unsigned value) {
-    return getAffineDimExpr(value, builder.getContext());
-  }
-
-  AffineExpr getConstExpr(int64_t value) {
-    return getAffineConstantExpr(value, builder.getContext());
-  }
-
-  OpBuilder &builder;
 };
 
 //===----------------------------------------------------------------------===//
@@ -104,9 +94,13 @@ public:
 //===----------------------------------------------------------------------===//
 
 // For storing all memory access operations (including AffineLoadOp and
-// AffineStoreOp) indexed by the array instantce (ArrayOp).
-using LoadStore = SmallVector<Operation *, 16>;
-using LoadStoreDict = llvm::SmallDenseMap<Operation *, LoadStore>;
+// AffineStoreOp) indexed by the array instance (ArrayOp).
+using LoadStores = SmallVector<Operation *, 16>;
+using LoadStoresMap = DenseMap<Operation *, LoadStores>;
+
+// For storing the source operations of all dependencies, indexed by the
+// destination operation.
+using Depends = SmallVector<Operation *, 16>;
+using DependsMap = DenseMap<Operation *, Depends>;
 
 // Indicate the unoccupied memory ports number.
 struct PortInfo {
@@ -118,37 +112,49 @@ struct PortInfo {
   unsigned rdwrPort;
 };
 
-// For storing ports number information of each memory instance.
-using MemPort = SmallVector<PortInfo, 16>;
-using MemPortDict = llvm::SmallDenseMap<Operation *, MemPort>;
+// For storing ports number of all partitions indexed by the array instance
+// (ArrayOp).
+using Ports = SmallVector<PortInfo, 16>;
+using PortsMap = DenseMap<Operation *, Ports>;
 
-// For storing MemPort indexed by the pipeline stage.
-using MemPortDicts = SmallVector<MemPortDict, 16>;
+// For storing PortsMap indexed by the scheduling level.
+using PortsMapDict = DenseMap<unsigned, PortsMap>;
 
-class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
-                        public HLSCppToolBase {
+class HLSCppEstimator
+    : public HLSCppVisitorBase<HLSCppEstimator, Optional<unsigned>, unsigned>,
+      public HLSCppToolBase {
 public:
-  explicit HLSCppEstimator(OpBuilder &builder, std::string targetSpecPath);
-
-  bool visitUnhandledOp(Operation *op) { return true; }
+  explicit HLSCppEstimator(FuncOp &func)
+      : HLSCppToolBase(OpBuilder(func)), func(func), liveness(Liveness(func)) {
+    getFuncMemRefDepends();
+  }
 
+  void getFuncMemRefDepends();
 
   using HLSCppVisitorBase::visitOp;
-  bool visitOp(AffineForOp op);
-  bool visitOp(AffineIfOp op);
-  bool visitOp(ArrayOp op);
+  Optional<unsigned> visitUnhandledOp(Operation *op, unsigned begin) {
+    // Default latency of any unhandled operation is 1.
+    return begin + 1;
+  }
 
-  void getBlockMemInfo(Block &block, LoadStoreDict &info);
+  int32_t getPartitionIndex(Operation *op);
+  unsigned getLoadStoreSchedule(Operation *op, unsigned begin);
+  Optional<unsigned> visitOp(AffineLoadOp op, unsigned begin);
+  Optional<unsigned> visitOp(AffineStoreOp op, unsigned begin);
 
-  unsigned getLoadStoreSchedule(Operation *op, unsigned begin,
-                                MemPortDicts &dicts);
-  void updateChildBlockSchedule(Block &block, unsigned begin);
-  unsigned getBlockSchedule(Block &block);
+  unsigned getResMinII(AffineForOp forOp, LoadStoresMap &map);
+  unsigned getDepMinII(AffineForOp forOp, LoadStoresMap &map);
+  Optional<unsigned> visitOp(AffineForOp op, unsigned begin);
 
-  unsigned getResMinII(AffineForOp forOp, LoadStoreDict dict);
-  unsigned getDepMinII(AffineForOp forOp, LoadStoreDict dict);
+  Optional<unsigned> visitOp(AffineIfOp op, unsigned begin);
+  Optional<unsigned> visitOp(ArrayOp op, unsigned begin);
 
-  bool estimateFunc(FuncOp func);
-  bool estimateBlock(Block &block);
+  Optional<unsigned> estimateBlock(Block &block, unsigned blockBegin);
+  void estimateFunc();
+
+  FuncOp &func;
+  Liveness liveness;
+  DependsMap dependsMap;
+  PortsMapDict portsMapDict;
 };
 
 } // namespace scalehls
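The estimator now threads the current schedule level through every visitor and returns the level reached after the operation, with an empty Optional signalling failure. A minimal stand-alone sketch of this dispatch shape (simplified stand-ins only; the real CRTP base is generated in Dialect/HLSCpp/Visitor.h and dispatches on MLIR operation types):

    #include <optional>

    // Schematic stand-in for the generated visitor base: dispatch() forwards
    // the operation plus the current schedule level to the matching handler
    // of the derived class, which returns the level after the operation.
    template <typename Derived> struct VisitorBase {
      std::optional<unsigned> dispatch(int opKind, unsigned begin) {
        auto *impl = static_cast<Derived *>(this);
        switch (opKind) {
        case 0: return impl->visitLoad(begin);
        case 1: return impl->visitStore(begin);
        default: return impl->visitUnhandled(begin);
        }
      }
    };

    struct Estimator : VisitorBase<Estimator> {
      std::optional<unsigned> visitLoad(unsigned begin) { return begin + 1; }
      std::optional<unsigned> visitStore(unsigned begin) { return begin + 1; }
      // Mirrors visitUnhandledOp() above: default latency is one cycle.
      std::optional<unsigned> visitUnhandled(unsigned begin) { return begin + 1; }
    };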
diff --git a/lib/Analysis/QoREstimation.cpp b/lib/Analysis/QoREstimation.cpp
index c913073..3ad1513 100644
--- a/lib/Analysis/QoREstimation.cpp
+++ b/lib/Analysis/QoREstimation.cpp
@@ -5,14 +5,11 @@
 #include "Analysis/QoREstimation.h"
 #include "Analysis/Passes.h"
 #include "Dialect/HLSCpp/HLSCpp.h"
-#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
-#include "mlir/Analysis/Liveness.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using namespace std;
 using namespace mlir;
@@ -20,97 +17,246 @@ using namespace scalehls;
 using namespace hlscpp;
 
 //===----------------------------------------------------------------------===//
-// HLSCppEstimator Class Definition
+// Helpers
 //===----------------------------------------------------------------------===//
 
-/// Estimator constructor.
-HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath)
-    : HLSCppToolBase(builder) {
+// Check whether lhsOp and rhsOp are at the same scheduling level. In this
+// check, AffineIfOps are transparent.
+static bool checkSameLevel(Operation *lhsOp, Operation *rhsOp) {
+  // If lhsOp and rhsOp are already at the same level, return true.
+  if (lhsOp->getBlock() == rhsOp->getBlock())
+    return true;
 
-  INIReader targetSpec(targetSpecPath);
-  if (targetSpec.ParseError())
-    llvm::outs() << "error: target spec file parse fail, please refer to "
-                    "--help option and pass in correct file path\n";
+  // Get all nested parent AffineIfOps, including lhsOp and rhsOp themselves.
+  auto getNests = [&](Operation *op, SmallVector<Operation *, 4> &nests) {
+    nests.push_back(op);
+    auto currentOp = op;
+    while (true) {
+      if (auto parentOp = currentOp->getParentOfType<AffineIfOp>()) {
+        nests.push_back(parentOp);
+        currentOp = parentOp;
+      } else
+        break;
+    }
+  };
 
-  // TODO: Support estimator initiation from profiling data.
-  auto freq = targetSpec.Get("spec", "frequency", "200MHz");
-  auto latency = targetSpec.GetInteger(freq, "op", 0);
-  llvm::outs() << latency << "\n";
+  SmallVector<Operation *, 4> lhsNests;
+  SmallVector<Operation *, 4> rhsNests;
+
+  getNests(lhsOp, lhsNests);
+  getNests(rhsOp, rhsNests);
+
+  // If any parent of lhsOp and any parent of rhsOp are at the same level,
+  // return true.
+  for (auto lhs : lhsNests)
+    for (auto rhs : rhsNests)
+      if (lhs->getBlock() == rhs->getBlock())
+        return true;
+
+  return false;
 }
 
-/// Collect memory access information of the block.
-void HLSCppEstimator::getBlockMemInfo(Block &block, LoadStoreDict &dict) {
-  // Walk through all load/store operations in the current block.
+/// Get all nested parent AffineForOps. Since AffineIfOps are transparent,
+/// AffineIfOps are skipped during the procedure.
+static void getLoopNests(Operation *op, SmallVector<Operation *, 4> &nests) {
+  auto currentOp = op;
+  while (true) {
+    if (auto parentOp = currentOp->getParentOfType<AffineForOp>()) {
+      nests.push_back(parentOp);
+      currentOp = parentOp;
+    } else if (auto parentOp = currentOp->getParentOfType<AffineIfOp>())
+      currentOp = parentOp;
+    else
+      break;
+  }
+}
+
+/// Get the definition ArrayOp given any memory access operation.
+static ArrayOp getArrayOp(Operation *op) {
+  auto defOp = MemRefAccess(op).memref.getDefiningOp();
+  assert(defOp && "MemRef is block argument");
+
+  auto arrayOp = dyn_cast<ArrayOp>(defOp);
+  assert(arrayOp && "MemRef is not defined by ArrayOp");
+
+  return arrayOp;
+}
+
+/// Collect all load and store operations in the block.
+static void getLoadStoresMap(Block &block, LoadStoresMap &map) {
   block.walk([&](Operation *op) {
-    if (isa<AffineLoadOp, AffineStoreOp>(op)) {
-      auto memAccess = MemRefAccess(op);
-      auto arrayOp = cast<ArrayOp>(memAccess.memref.getDefiningOp());
-
-      AffineValueMap accessMap;
-      memAccess.getAccessMap(&accessMap);
-
-      dict[arrayOp].push_back(op);
-
-      // Calculate the partition index of this load/store operation honoring
-      // the partition strategy applied.
-      int32_t partitionIdx = 0;
-      unsigned accumFactor = 1;
-      unsigned dim = 0;
-      for (auto expr : accessMap.getAffineMap().getResults()) {
-        auto idxExpr = getConstExpr(0);
-        unsigned factor = 1;
-        if (arrayOp.partition()) {
-          auto type = getPartitionType(arrayOp, dim);
-          factor = getPartitionFactor(arrayOp, dim);
-
-          if (type == "cyclic")
-            idxExpr = expr % getConstExpr(factor);
-          else if (type == "block") {
-            auto size = arrayOp.getType().cast<ShapedType>().getShape()[dim];
-            idxExpr = expr.floorDiv(getConstExpr((size + factor - 1) / factor));
-          }
-        }
-        if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>()) {
-          if (dim == 0)
-            partitionIdx = constExpr.getValue();
-          else
-            partitionIdx += constExpr.getValue() * accumFactor;
-        } else {
-          partitionIdx = -1;
-          break;
-        }
-
-        accumFactor *= factor;
-        dim++;
-      }
-
-      // Set partition index attribute.
-      setAttrValue(op, "partition_index", partitionIdx);
-    }
+    if (isa<AffineLoadOp, AffineStoreOp>(op))
+      map[getArrayOp(op)].push_back(op);
   });
 }
 
-/// Calculate load/store operation schedule honoring the memory ports number
-/// limitation. This method will be called by getBlockSchedule method.
-unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
-                                               MemPortDicts &dicts) {
-  auto memAccess = MemRefAccess(op);
-  auto arrayOp = cast<ArrayOp>(memAccess.memref.getDefiningOp());
+//===----------------------------------------------------------------------===//
+// MemRef Dependency Collection Methods
+//===----------------------------------------------------------------------===//
 
-  auto partitionIdx = getIntAttrValue(op, "partition_index");
+/// Get the common loop depth shared by lhsOp and rhsOp.
+static unsigned getCommonLoopDepth(Operation *lhsOp, Operation *rhsOp) {
+  // Collect all parent nested loops.
+  SmallVector<Operation *, 4> lhsLoopNests;
+  SmallVector<Operation *, 4> rhsLoopNests;
+
+  getLoopNests(lhsOp, lhsLoopNests);
+  getLoopNests(rhsOp, rhsLoopNests);
+
+  // Calculate common loop depth.
+  auto lhsDepth = lhsLoopNests.size();
+  auto rhsDepth = rhsLoopNests.size();
+  unsigned commonLoopDepth = 0;
+
+  for (unsigned i = 0, e = min(lhsDepth, rhsDepth); i < e; ++i) {
+    if (lhsLoopNests[lhsDepth - 1 - i] == rhsLoopNests[rhsDepth - 1 - i])
+      commonLoopDepth++;
+    else
+      break;
+  }
+
+  return commonLoopDepth;
+}
+
+/// Collect all dependencies detected in the function.
+void HLSCppEstimator::getFuncMemRefDepends() {
+  LoadStoresMap loadStoresMap;
+  getLoadStoresMap(func.front(), loadStoresMap);
+
+  // Walk through all ArrayOp - LoadOp/StoreOp pairs.
+  for (auto &pair : loadStoresMap) {
+    auto loadStores = pair.second;
+
+    // Walk through each pair of source and destination. Note that for intra
+    // dependencies, srcOp always comes before dstOp.
+    unsigned srcIndex = 1;
+    for (auto srcOp : loadStores) {
+      MemRefAccess srcAccess(srcOp);
+      for (auto dstOp : llvm::drop_begin(loadStores, srcIndex)) {
+        MemRefAccess dstAccess(dstOp);
+
+        bool dependFlag = false;
+        auto commonLoopDepth = getCommonLoopDepth(srcOp, dstOp);
+        for (unsigned depth = 1; depth <= commonLoopDepth; ++depth) {
+          // Initialize constraints and components.
+          FlatAffineConstraints dependConstrs;
+          SmallVector<DependenceComponent, 2> dependComps;
+
+          // Check dependency at the current depth. A dependence found at any
+          // depth establishes the relation, hence the |= accumulation.
+          DependenceResult result = checkMemrefAccessDependence(
+              srcAccess, dstAccess, depth, &dependConstrs, &dependComps);
+          dependFlag |= hasDependence(result);
+        }
+
+        // All dependencies are pushed into the dependsMap output.
+        if (dependFlag)
+          dependsMap[dstOp].push_back(srcOp);
+      }
+      srcIndex++;
+    }
+  }
+}
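The pairwise walk above checks each ordered (srcOp, dstOp) pair exactly once. A stand-alone sketch of the same collection pattern, with a hypothetical hasDependence() in place of the MLIR affine dependence check:

    #include <map>
    #include <vector>

    // Schematic form of getFuncMemRefDepends(): for every ordered pair
    // (src, dst) with src textually before dst, record dst -> src when a
    // dependence exists. hasDependence() here is an arbitrary stand-in.
    static bool hasDependence(int src, int dst) { return (dst - src) % 3 == 0; }

    std::map<int, std::vector<int>> buildDependsMap(const std::vector<int> &ops) {
      std::map<int, std::vector<int>> dependsMap;
      for (unsigned i = 0; i < ops.size(); ++i)
        for (unsigned j = i + 1; j < ops.size(); ++j) // drop_begin(ops, i + 1)
          if (hasDependence(ops[i], ops[j]))
            dependsMap[ops[j]].push_back(ops[i]);
      return dependsMap;
    }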
 
+//===----------------------------------------------------------------------===//
+// LoadOp and StoreOp Related Methods
+//===----------------------------------------------------------------------===//
+
+// Get the pointer of srcOp's parent loop, which should be located at the same
+// level as one of dstOp's parent loops.
+static Operation *getSameLevelSourceOp(Operation *srcOp, Operation *dstOp) {
+  // If srcOp and dstOp are already at the same level, return srcOp.
+  if (checkSameLevel(srcOp, dstOp))
+    return srcOp;
+
+  SmallVector<Operation *, 4> srcNests;
+  SmallVector<Operation *, 4> dstNests;
+  srcNests.push_back(srcOp);
+  dstNests.push_back(dstOp);
+
+  getLoopNests(srcOp, srcNests);
+  getLoopNests(dstOp, dstNests);
+
+  // If any parent of srcOp (or itself) and any parent of dstOp (or itself) are
+  // at the same level, return the pointer.
+  for (auto src : srcNests)
+    for (auto dst : dstNests)
+      if (checkSameLevel(src, dst))
+        return src;
+
+  return nullptr;
+}
+
+/// Calculate the overall partition index.
+int32_t HLSCppEstimator::getPartitionIndex(Operation *op) {
+  auto arrayOp = getArrayOp(op);
+  AffineValueMap accessMap;
+  MemRefAccess(op).getAccessMap(&accessMap);
+
+  // Calculate the partition index of this load/store operation honoring the
+  // partition strategy applied.
+  int32_t partitionIdx = 0;
+  unsigned accumFactor = 1;
+  unsigned dim = 0;
+
+  for (auto expr : accessMap.getAffineMap().getResults()) {
+    auto idxExpr = builder.getAffineConstantExpr(0);
+    unsigned factor = 1;
+
+    if (arrayOp.partition()) {
+      auto type = getPartitionType(arrayOp, dim);
+      factor = getPartitionFactor(arrayOp, dim);
+
+      if (type == "cyclic")
+        idxExpr = expr % builder.getAffineConstantExpr(factor);
+      else if (type == "block") {
+        auto size = arrayOp.getType().cast<ShapedType>().getShape()[dim];
+        idxExpr = expr.floorDiv(
+            builder.getAffineConstantExpr((size + factor - 1) / factor));
+      }
+    }
+
+    if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>()) {
+      if (dim == 0)
+        partitionIdx = constExpr.getValue();
+      else
+        partitionIdx += constExpr.getValue() * accumFactor;
+    } else {
+      partitionIdx = -1;
+      break;
+    }
+
+    accumFactor *= factor;
+    dim++;
+  }
+  return partitionIdx;
+}
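As a sanity check on the arithmetic above, a stand-alone model of getPartitionIndex() for the case where every access index is a compile-time constant (hypothetical helper, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Per-dimension index: cyclic uses idx mod factor, block uses
    // idx / ceil(size / factor); dimensions accumulate little-endian,
    // scaled by the product of the preceding partition factors.
    int64_t partitionIndex(const int64_t *idx, const int64_t *size,
                           const unsigned *factor, const bool *cyclic,
                           unsigned rank) {
      int64_t result = 0;
      int64_t accumFactor = 1;
      for (unsigned dim = 0; dim < rank; ++dim) {
        int64_t blockSize = (size[dim] + factor[dim] - 1) / factor[dim];
        int64_t local = cyclic[dim] ? idx[dim] % factor[dim]
                                    : idx[dim] / blockSize;
        result += local * accumFactor;
        accumFactor *= factor[dim];
      }
      return result;
    }

    int main() {
      // A 16x16 array, cyclically partitioned by 4 in dim 0 and block
      // partitioned by 2 in dim 1; access (5, 9) lands in partition
      // (5 % 4) + (9 / 8) * 4 = 1 + 4 = 5.
      int64_t idx[] = {5, 9}, size[] = {16, 16};
      unsigned factor[] = {4, 2};
      bool cyclic[] = {true, false};
      assert(partitionIndex(idx, size, factor, cyclic, 2) == 5);
      return 0;
    }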
 
+/// Schedule load/store operation honoring the memory ports number limitation.
+unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin) {
+  // Check dependencies of the operation and update the schedule level.
+  for (auto srcOp : dependsMap[op]) {
+    auto sameLevelSrcOp = getSameLevelSourceOp(srcOp, op);
+    begin = max(getUIntAttrValue(sameLevelSrcOp, "schedule_end"), begin);
+  }
+
+  // Calculate partition index.
+  auto partitionIdx = getPartitionIndex(op);
+  setAttrValue(op, "partition_index", partitionIdx);
 
+  auto arrayOp = getArrayOp(op);
   auto partitionNum = getUIntAttrValue(arrayOp, "partition_num");
   auto storageType = getStrAttrValue(arrayOp, "storage_type");
 
-  // Try to avoid memory port violation until a legal schedule is found.
-  // Since an infinite length pipeline can be generated, this while loop can
-  // be proofed to have an end.
+  // Try to avoid memory port violation until a legal schedule is found. Since
+  // the schedule level can grow without bound, this while loop is guaranteed
+  // to terminate.
   while (true) {
-    auto memPort = dicts[begin][arrayOp];
+    auto memPort = portsMapDict[begin][arrayOp];
     bool memPortEmpty = memPort.empty();
 
-    // If the memory has not been occupied by the current stage, it should
-    // be initialized according to its storage type. Note that each
-    // partition should have one PortNum structure.
+    // If the memory has not been occupied by the current schedule level, it
+    // should be initialized according to its storage type. Note that each
+    // partition should have one PortInfo structure.
     if (memPortEmpty) {
       for (unsigned p = 0; p < partitionNum; ++p) {
         unsigned rdPort = 0;
@@ -125,167 +271,72 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
           rdwrPort = 1;
         else {
           rdwrPort = 2;
-          // arrayOp.emitError("unsupported storage type.");
         }
-        PortInfo portInfo(rdPort, wrPort, rdwrPort);
-        memPort.push_back(portInfo);
+
+        memPort.push_back(PortInfo(rdPort, wrPort, rdwrPort));
       }
     }
 
-    // TODO: When partition index can't be determined, this operation will be
-    // considered to occupy all ports.
+    // Indicate whether the operation is successfully scheduled in the current
+    // schedule level.
+    bool successFlag = false;
+
     if (partitionIdx == -1) {
+      // When the partition index can't be determined, this operation must
+      // occupy all ports in the scheduled level.
       if (memPortEmpty) {
         for (unsigned p = 0; p < partitionNum; ++p) {
           memPort[p].rdPort = 0;
           memPort[p].wrPort = 0;
           memPort[p].rdwrPort = 0;
         }
-        dicts[begin][arrayOp] = memPort;
-        break;
-      } else {
-        if (++begin >= dicts.size()) {
-          MemPortDict memPortDict;
-          dicts.push_back(memPortDict);
-        }
+        successFlag = true;
       }
-    }
-
-    // Find whether the current schedule meets memory port limitation. If
-    // not, the schedule will increase by 1.
-    PortInfo portInfo = memPort[partitionIdx];
-    if (isa<AffineLoadOp>(op) && portInfo.rdPort > 0) {
-      memPort[partitionIdx].rdPort -= 1;
-      dicts[begin][arrayOp] = memPort;
-      break;
-    } else if (isa<AffineStoreOp>(op) && portInfo.wrPort > 0) {
-      memPort[partitionIdx].wrPort -= 1;
-      dicts[begin][arrayOp] = memPort;
-      break;
-    } else if (portInfo.rdwrPort > 0) {
-      memPort[partitionIdx].rdwrPort -= 1;
-      dicts[begin][arrayOp] = memPort;
-      break;
     } else {
-      if (++begin >= dicts.size()) {
-        MemPortDict memPortDict;
-        dicts.push_back(memPortDict);
+      // When the partition index can be determined, figure out whether the
+      // current schedule meets the memory port limitation.
+      PortInfo portInfo = memPort[partitionIdx];
+      if (isa<AffineLoadOp>(op) && portInfo.rdPort > 0) {
+        memPort[partitionIdx].rdPort -= 1;
+        successFlag = true;
+
+      } else if (isa<AffineStoreOp>(op) && portInfo.wrPort > 0) {
+        memPort[partitionIdx].wrPort -= 1;
+        successFlag = true;
+
+      } else if (portInfo.rdwrPort > 0) {
+        memPort[partitionIdx].rdwrPort -= 1;
+        successFlag = true;
       }
     }
+
+    // If succeeded, break the while loop. Otherwise increase the schedule
+    // level by 1 and continue to try.
+    if (successFlag) {
+      portsMapDict[begin][arrayOp] = memPort;
+      break;
+    } else
+      begin++;
   }
-  return begin;
+
+  // Memory load/store operation always consumes 1 clock cycle.
+  return begin + 1;
 }
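The port bookkeeping can be exercised in isolation. A sketch under common BRAM assumptions; the storage-type table is abbreviated in the hunk above, so the freePorts() mapping here is illustrative rather than authoritative:

    #include <cassert>
    #include <string>

    struct PortInfo {
      unsigned rdPort, wrPort, rdwrPort;
    };

    // Hypothetical mapping from a storage type to its free ports per
    // partition, mirroring the initialization loop above.
    PortInfo freePorts(const std::string &storageType) {
      if (storageType == "ram_1p")
        return {0, 0, 1}; // one shared read/write port
      if (storageType == "ram_s2p")
        return {1, 1, 0}; // one read port, one write port
      return {0, 0, 2};   // true dual-port and default: two read/write ports
    }

    // A load fits in the current level if a read or read/write port is free.
    bool tryScheduleLoad(PortInfo &p) {
      if (p.rdPort) { --p.rdPort; return true; }
      if (p.rdwrPort) { --p.rdwrPort; return true; }
      return false;
    }

    int main() {
      PortInfo p = freePorts("ram_1p");
      assert(tryScheduleLoad(p));  // first load takes the only port
      assert(!tryScheduleLoad(p)); // second load must move to the next level
      return 0;
    }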
 
-void HLSCppEstimator::updateChildBlockSchedule(Block &block, unsigned begin) {
-  for (auto &op : block) {
-    unsigned newBegin = begin;
-    unsigned newEnd = begin;
-
-    // Update the schedule of all operations in the child block.
-    if (getUIntAttrValue(&op, "schedule_end")) {
-      newBegin += getUIntAttrValue(&op, "schedule_begin");
-      newEnd += getUIntAttrValue(&op, "schedule_end");
-      setAttrValue(&op, "schedule_begin", newBegin);
-      setAttrValue(&op, "schedule_end", newEnd);
-    }
-
-    // Recursively apply to all child blocks.
-    if (op.getNumRegions()) {
-      for (auto &region : op.getRegions()) {
-        for (auto &block : region.getBlocks())
-          updateChildBlockSchedule(block, begin);
-      }
-    }
-  }
+Optional<unsigned> HLSCppEstimator::visitOp(AffineLoadOp op, unsigned begin) {
+  return getLoadStoreSchedule(op, begin);
 }
 
-/// Schedule the block with ASAP algorithm.
-unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
-  unsigned blockEnd = 0;
-  MemPortDicts dicts;
-
-  for (auto &op : block) {
-    // Find the latest predecessor dominating the current operation. This
-    // should be considered as the earliest stage that the current operation
-    // can be scheduled.
-    unsigned begin = 0;
-    unsigned end = 0;
-    for (auto operand : op.getOperands()) {
-      if (auto defOp = operand.getDefiningOp())
-        begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
-    }
-
-    // Handle loop operations.
-    if (auto forOp = dyn_cast<AffineForOp>(op)) {
-      // Live ins of the for loop body will also impact the schedule begin.
-      Liveness liveness(block.getParentOp());
-      for (auto liveIn : liveness.getLiveIn(&forOp.getLoopBody().front())) {
-        if (auto defOp = liveIn.getDefiningOp())
-          begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
-      }
-
-      // Update the schedule of all operations in the loop body.
-      updateChildBlockSchedule(forOp.getLoopBody().front(), begin);
-
-      // Child loop is considered as a large node, and two extra clock cycles
-      // will be required to enter and exit the child loop.
-      end = begin + getUIntAttrValue(forOp, "latency") + 2;
-    }
-
-    // Handle if operations.
-    else if (auto ifOp = dyn_cast<AffineIfOp>(op)) {
-      // Live ins of the if body will also impact the schedule begin.
-      Liveness liveness(block.getParentOp());
-      for (auto liveIn : liveness.getLiveIn(ifOp.getThenBlock())) {
-        if (auto defOp = liveIn.getDefiningOp())
-          begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
-      }
-
-      if (ifOp.hasElse()) {
-        for (auto liveIn : liveness.getLiveIn(ifOp.getElseBlock())) {
-          if (auto defOp = liveIn.getDefiningOp())
-            begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
-        }
-        // Update the schedule of all operations in the else block.
-        updateChildBlockSchedule(*ifOp.getElseBlock(), begin);
-      }
-
-      // Update the schedule of all operations in the then block.
-      updateChildBlockSchedule(*ifOp.getThenBlock(), begin);
-
-      end = begin + getUIntAttrValue(ifOp, "latency");
-    }
-
-    // Handle load/store operations.
-    else if (isa<AffineLoadOp, AffineStoreOp>(op)) {
-      // Insert new schedule level to the memory port dicts.
-      while (begin >= dicts.size()) {
-        MemPortDict memPortDict;
-        dicts.push_back(memPortDict);
-      }
-
-      // Ensure the current schedule meets memory port limitation.
-      begin = getLoadStoreSchedule(&op, begin, dicts);
-      end = begin + 1;
-    }
-
-    // Default case. All normal expressions and operations will be handled by
-    // this branch.
-    else {
-      // TODO: For now, we assume all operations take one clock cycle to
-      // execute, should support to accept profiling data.
-      end = begin + 1;
-    }
-
-    setAttrValue(&op, "schedule_begin", begin);
-    setAttrValue(&op, "schedule_end", end);
-    blockEnd = max(blockEnd, end);
-  }
-  return blockEnd;
+Optional<unsigned> HLSCppEstimator::visitOp(AffineStoreOp op, unsigned begin) {
+  return getLoadStoreSchedule(op, begin);
 }
 
+//===----------------------------------------------------------------------===//
+// AffineForOp Related Methods
+//===----------------------------------------------------------------------===//
+
 /// Calculate the minimum resource II.
-unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoreDict dict) {
+unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &dict) {
   unsigned II = 1;
 
   for (auto &pair : dict) {
@@ -354,7 +405,7 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoreDict dict) {
 }
 
 /// Calculate the minimum dependency II.
-unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
+unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &dict) {
   unsigned II = 1;
 
   // Collect start and end level of the pipeline.
@@ -375,8 +426,9 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
     auto loadStores = pair.second;
 
     // Walk through each pair of source and destination, and each loop level
-    // that are pipelined.
-    for (auto loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) {
+    // that is pipelined. Note that for inter-iteration dependencies, dstOp
+    // always comes before srcOp.
+    for (unsigned loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) {
       unsigned dstIndex = 1;
       for (auto dstOp : loadStores) {
         MemRefAccess dstAccess(dstOp);
@@ -393,7 +445,7 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
           if (hasDependence(result)) {
             SmallVector<unsigned, 8> flattenTripCounts;
             flattenTripCounts.push_back(1);
-            unsigned distance = 0;
+            int64_t distance = 0;
 
             // Calculate the distance of this dependency.
             for (auto it = depComps.rbegin(); it < depComps.rend(); ++it) {
@@ -413,7 +465,7 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
             unsigned delay = getUIntAttrValue(srcOp, "schedule_begin") -
                              getUIntAttrValue(dstOp, "schedule_begin");
 
-            if (distance != 0) {
+            if (distance > 0) {
               unsigned minII = ceil((float)delay / distance);
               II = max(II, minII);
             }
@@ -426,37 +478,48 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
   return II;
 }
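The constraint behind getDepMinII() is the classic recurrence bound: a dependence whose source is scheduled delay cycles after its destination, and which is carried across distance loop iterations, forces II >= ceil(delay / distance). A worked check:

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Minimum II imposed by one loop-carried dependence, as in getDepMinII().
    unsigned depMinII(unsigned delay, int64_t distance) {
      if (distance <= 0)
        return 1;
      return std::max(1u, (unsigned)std::ceil((float)delay / distance));
    }

    int main() {
      // A 5-cycle dependence carried across 2 iterations caps throughput at
      // one iteration every ceil(5 / 2) = 3 cycles.
      assert(depMinII(5, 2) == 3);
      // A dependence within the same iteration (distance 0) adds no bound.
      assert(depMinII(5, 0) == 1);
      return 0;
    }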
 
-bool HLSCppEstimator::visitOp(AffineForOp op) {
-  auto &body = op.getLoopBody();
-  if (body.getBlocks().size() != 1) {
+Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
+  if (op.getLoopBody().getBlocks().size() != 1) {
     op.emitError("has zero or more than one basic blocks.");
-    return false;
+    return Optional<unsigned>();
   }
 
-  // Recursively estimate all contained operations.
-  if (!estimateBlock(body.front()))
-    return false;
-
-  // Set an attribute indicating the trip count. For now, we assume all
-  // loops have static loop bound.
+  // Set an attribute indicating the trip count. For now, we assume all loops
+  // have static loop bound.
   if (auto tripCount = getConstantTripCount(op))
     setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
   else {
     setAttrValue(op, "trip_count", (unsigned)0);
     op.emitError("has undetermined trip count");
-    return false;
+    return Optional<unsigned>();
   }
 
-  // If the current loop is annotated as pipeline, extra dependency and II
-  // analysis will be executed.
-  if (getBoolAttrValue(op, "pipeline")) {
-    LoadStoreDict dict;
-    getBlockMemInfo(body.front(), dict);
+  unsigned end = begin;
+  auto &loopBlock = op.getLoopBody().front();
 
+  // Live ins will impact the scheduling.
+  for (auto liveIn : liveness.getLiveIn(&loopBlock))
+    if (auto defOp = liveIn.getDefiningOp())
+      begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
+
+  // Estimate the loop block.
+  if (auto esti = estimateBlock(loopBlock, begin))
+    end = max(end, esti.getValue());
+  else
+    return Optional<unsigned>();
+
+  // If the current loop is annotated as pipeline, extra dependency- and
+  // resource-aware II analysis will be executed.
+  if (getBoolAttrValue(op, "pipeline")) {
     // Calculate latency of each iteration.
-    auto iterLatency = getBlockSchedule(body.front());
+    auto iterLatency = end - begin;
     setAttrValue(op, "iter_latency", iterLatency);
 
+    // Collect load and store operations in the loop block for estimating the
+    // achievable initial interval.
+    LoadStoresMap dict;
+    getLoadStoresMap(loopBlock, dict);
+
     // Calculate initial interval.
     auto II = max(getResMinII(op, dict), getDepMinII(op, dict));
     setAttrValue(op, "init_interval", II);
@@ -464,117 +527,143 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
     auto tripCount = getUIntAttrValue(op, "trip_count");
     setAttrValue(op, "flatten_trip_count", tripCount);
 
-    setAttrValue(op, "latency", iterLatency + II * (tripCount - 1));
-    return true;
+    auto latency = iterLatency + II * (tripCount - 1);
+    setAttrValue(op, "latency", latency);
+
+    // Entering and leaving a loop will consume extra 2 clock cycles.
+    return begin + latency + 2;
   }
 
-  // This means the current loop can be flattened into the child loop. If the
-  // child loop is pipelined, this will increase the flattened loop trip count
-  // without changing the iteration latency. Note that this will be propogated
-  // above until meeting an imperfect loop.
+  // If the current loop is annotated as flatten, it will be flattened into
+  // the child pipelined loop. This will increase the flattened loop trip count
+  // without changing the iteration latency.
   if (getBoolAttrValue(op, "flatten")) {
-    if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
-      // This means the inner loop is pipelined, because otherwise II will be
-      // equal to zero. So that in this case, this loop will be flattened into
-      // the inner pipelined loop.
-      if (auto II = getUIntAttrValue(child, "init_interval")) {
-        setAttrValue(op, "init_interval", II);
+    auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front());
+    assert(child && "the first contained operation is not a loop");
 
-        auto iterLatency = getUIntAttrValue(child, "iter_latency");
-        setAttrValue(op, "iter_latency", iterLatency);
+    auto iterLatency = getUIntAttrValue(child, "iter_latency");
+    setAttrValue(op, "iter_latency", iterLatency);
 
-        auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
-                                getUIntAttrValue(op, "trip_count");
-        setAttrValue(op, "flatten_trip_count", flattenTripCount);
+    auto II = getUIntAttrValue(child, "init_interval");
+    setAttrValue(op, "init_interval", II);
 
-        setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
-      } else {
-        auto iterLatency = getUIntAttrValue(child, "latency");
-        setAttrValue(op, "iter_latency", iterLatency);
+    auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
+                            getUIntAttrValue(op, "trip_count");
+    setAttrValue(op, "flatten_trip_count", flattenTripCount);
 
-        unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
-        setAttrValue(op, "latency", latency);
-      }
-      return true;
-    }
+    auto latency = iterLatency + II * (flattenTripCount - 1);
+    setAttrValue(op, "latency", latency);
+
+    // Since the loop is flattened, it will no longer be entered and left.
+    return begin + latency;
   }
 
-  // Default case, aka !pipeline && !flatten.
-  LoadStoreDict dict;
-  getBlockMemInfo(body.front(), dict);
-
-  auto iterLatency = getBlockSchedule(body.front());
+  // Default case, calculate latency of each iteration.
+  auto iterLatency = end - begin;
   setAttrValue(op, "iter_latency", iterLatency);
 
   unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
   setAttrValue(op, "latency", latency);
-  return true;
+
+  return begin + latency + 2;
 }
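The latency algebra above is easy to verify by hand; a small self-checking example of iterLatency + II * (tripCount - 1), including the effect of flattening on the trip count:

    #include <cassert>

    // Latency of a pipelined loop as computed in visitOp(AffineForOp):
    // one full iteration, plus II cycles for each remaining iteration.
    unsigned pipelinedLatency(unsigned iterLatency, unsigned II,
                              unsigned tripCount) {
      return iterLatency + II * (tripCount - 1);
    }

    int main() {
      // 10-cycle iterations, II = 2, 100 iterations: 10 + 2 * 99 = 208
      // cycles, or 210 once the 2-cycle loop enter/exit overhead is added.
      assert(pipelinedLatency(10, 2, 100) == 208);
      // Flattening a 4-trip outer loop multiplies the trip count instead of
      // nesting the latency: 10 + 2 * (400 - 1) = 808 cycles, with no extra
      // enter/exit cost for the flattened loop.
      assert(pipelinedLatency(10, 2, 400) == 808);
      return 0;
    }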
 
-bool HLSCppEstimator::visitOp(AffineIfOp op) {
-  auto thenBlock = op.getThenBlock();
-  if (!estimateBlock(*thenBlock))
-    return false;
+//===----------------------------------------------------------------------===//
+// Other Operation Handlers
+//===----------------------------------------------------------------------===//
 
-  LoadStoreDict dict;
-  getBlockMemInfo(*thenBlock, dict);
-  auto latency = getBlockSchedule(*thenBlock);
+Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
+  unsigned end = begin;
+  auto thenBlock = op.getThenBlock();
+
+  // Live ins will impact the scheduling.
+  for (auto liveIn : liveness.getLiveIn(thenBlock))
+    if (auto defOp = liveIn.getDefiningOp())
+      begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
+
+  // Estimate the then block.
+  if (auto esti = estimateBlock(*thenBlock, begin))
+    end = max(end, esti.getValue());
+  else
+    return Optional<unsigned>();
 
   // Handle else block if required.
   if (op.hasElse()) {
     auto elseBlock = op.getElseBlock();
-    if (!estimateBlock(*elseBlock))
-      return false;
 
-    getBlockMemInfo(*elseBlock, dict);
-    latency = max(latency, getBlockSchedule(*elseBlock));
+    for (auto liveIn : liveness.getLiveIn(elseBlock))
+      if (auto defOp = liveIn.getDefiningOp())
+        begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
+
+    if (auto esti = estimateBlock(*elseBlock, begin))
+      end = max(end, esti.getValue());
+    else
+      return Optional<unsigned>();
   }
 
-  setAttrValue(op, "latency", latency);
-  return true;
+  return end;
 }
 
-bool HLSCppEstimator::visitOp(ArrayOp op) {
+Optional<unsigned> HLSCppEstimator::visitOp(ArrayOp op, unsigned begin) {
+  // Annotate the total partition number of the array.
   unsigned partitionNum = 1;
   if (op.partition()) {
     auto rank = op.getType().cast<ShapedType>().getRank();
-    for (unsigned i = 0; i < rank; ++i) {
-      if (auto factor = getPartitionFactor(op, i))
+    for (unsigned dim = 0; dim < rank; ++dim) {
+      if (auto factor = getPartitionFactor(op, dim))
         partitionNum *= factor;
     }
   }
   setAttrValue(op, "partition_num", partitionNum);
-  return true;
+
+  // ArrayOp is a dummy memory instance which does not consume any clock
+  // cycles.
+  return begin;
 }
 
-bool HLSCppEstimator::estimateBlock(Block &block) {
+//===----------------------------------------------------------------------===//
+// Block Scheduler and Estimator
+//===----------------------------------------------------------------------===//
+
+/// Estimate the latency of a block with ASAP scheduling strategy.
+Optional<unsigned> HLSCppEstimator::estimateBlock(Block &block,
+                                                  unsigned blockBegin) {
+  unsigned blockEnd = blockBegin;
+
   for (auto &op : block) {
-    if (dispatchVisitor(&op))
-      continue;
-    else {
-      op.emitError("can't be correctly estimated.");
-      return false;
-    }
+    unsigned begin = blockBegin;
+    unsigned end = blockBegin;
+
+    // Find the latest arrived predecessor dominating the current operation.
+    // This should be considered as the earliest possible scheduling level
+    // at which the current operation can be scheduled.
+    for (auto operand : op.getOperands())
+      if (auto defOp = operand.getDefiningOp())
+        begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
+
+    // Estimate the current operation.
+    if (auto esti = dispatchVisitor(&op, begin))
+      end = max(end, esti.getValue());
+    else
+      return Optional<unsigned>();
+
+    setAttrValue(&op, "schedule_begin", begin);
+    setAttrValue(&op, "schedule_end", end);
+
+    blockEnd = max(blockEnd, end);
   }
-  return true;
+  return blockEnd;
 }
 
-bool HLSCppEstimator::estimateFunc(FuncOp func) {
-  if (func.getBlocks().size() != 1) {
+void HLSCppEstimator::estimateFunc() {
+  if (func.getBlocks().size() != 1)
     func.emitError("has zero or more than one basic blocks.");
-    return false;
-  }
 
-  // Recursively estimate all contained operations.
-  if (!estimateBlock(func.front()))
-    return false;
-
-  LoadStoreDict dict;
-  getBlockMemInfo(func.front(), dict);
-
-  auto latency = getBlockSchedule(func.front());
-  setAttrValue(func, "latency", latency);
-  return true;
+  // Recursively estimate blocks in the function.
+  if (auto esti = estimateBlock(func.front(), 0))
+    setAttrValue(func, "latency", esti.getValue());
+  else
+    setAttrValue(func, "latency", "unknown");
 }
 
 //===----------------------------------------------------------------------===//
@@ -584,13 +673,22 @@ bool HLSCppEstimator::estimateFunc(FuncOp func) {
 
 namespace {
 struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
   void runOnOperation() override {
-    auto module = getOperation();
-    auto builder = OpBuilder(module);
+    // Read configuration file.
+    INIReader spec(targetSpec);
+    if (spec.ParseError())
+      llvm::outs() << "error: failed to parse the target spec file, please "
+                      "refer to the --help option and pass in a valid file "
+                      "path\n";
+
+    // TODO: Support estimator initialization from profiling data,
+    // constructing a unique data structure for holding latency and resource
+    // information.
+    auto freq = spec.Get("spec", "frequency", "200MHz");
+    auto latency = spec.GetInteger(freq, "op", 0);
 
     // Estimate performance and resource utilization.
-    HLSCppEstimator estimator(builder, targetSpec);
-    for (auto func : module.getOps<FuncOp>())
-      estimator.estimateFunc(func);
+    for (auto func : getOperation().getOps<FuncOp>()) {
+      HLSCppEstimator estimator(func);
+      estimator.estimateFunc();
+    }
   }
 };
 } // namespace
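The keys read above imply a target spec shaped roughly as follows. This is illustrative only: besides the [spec] section, the frequency key, and the per-frequency op latency that the pass actually queries, every name here is an assumption:

    ; hypothetical-target-spec.ini
    [spec]
    frequency = 200MHz

    ; one section per frequency, holding per-operation latencies in cycles
    [200MHz]
    op = 1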
diff --git a/lib/Conversion/ConvertToHLSCpp.cpp b/lib/Conversion/ConvertToHLSCpp.cpp
index ba4c0bc..a5188a9 100644
--- a/lib/Conversion/ConvertToHLSCpp.cpp
+++ b/lib/Conversion/ConvertToHLSCpp.cpp
@@ -117,8 +117,8 @@ void ConvertToHLSCpp::runOnOperation() {
       if (!forOp.getAttr("pipeline"))
         forOp.setAttr("pipeline", builder.getBoolAttr(false));
 
-      if (!forOp.getAttr("unroll"))
-        forOp.setAttr("unroll", builder.getBoolAttr(false));
+      // if (!forOp.getAttr("unroll"))
+      //   forOp.setAttr("unroll", builder.getBoolAttr(false));
 
       if (!forOp.getAttr("flatten"))
         forOp.setAttr("flatten", builder.getBoolAttr(false));
diff --git a/lib/Transforms/ArrayPartition.cpp b/lib/Transforms/ArrayPartition.cpp
index 6c285d0..1cefbbe 100644
--- a/lib/Transforms/ArrayPartition.cpp
+++ b/lib/Transforms/ArrayPartition.cpp
@@ -34,8 +34,8 @@ static mlir::AffineForOp getPipelineLoop(mlir::AffineForOp root) {
 }
 
 template <typename OpType>
-static void applyArrayPartition(LoadStoreDict &dict, OpBuilder &builder) {
-  for (auto pair : dict) {
+static void applyArrayPartition(LoadStoresMap &map, OpBuilder &builder) {
+  for (auto pair : map) {
     auto arrayOp = cast<ArrayOp>(pair.first);
     auto arrayType = arrayOp.getType().cast<MemRefType>();
     auto arrayAccesses = pair.second;
@@ -116,21 +116,21 @@ void ArrayPartition::runOnOperation() {
   for (auto forOp : func.getOps<mlir::AffineForOp>()) {
     if (auto outermost = getPipelineLoop(forOp)) {
       // Collect memory access information.
-      LoadStoreDict loadDict;
+      LoadStoresMap loadMap;
       outermost.walk([&](mlir::AffineLoadOp loadOp) {
         auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
-        loadDict[arrayOp].push_back(loadOp);
+        loadMap[arrayOp].push_back(loadOp);
       });
 
-      LoadStoreDict storeDict;
+      LoadStoresMap storeMap;
       outermost.walk([&](mlir::AffineStoreOp storeOp) {
         auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
-        storeDict[arrayOp].push_back(storeOp);
+        storeMap[arrayOp].push_back(storeOp);
       });
 
       // Apply array partition pragma.
-      applyArrayPartition<mlir::AffineLoadOp>(loadDict, builder);
-      applyArrayPartition<mlir::AffineStoreOp>(storeDict, builder);
+      applyArrayPartition<mlir::AffineLoadOp>(loadMap, builder);
+      applyArrayPartition<mlir::AffineStoreOp>(storeMap, builder);
     }
   }
 }
diff --git a/lib/Transforms/LoopPipelining.cpp b/lib/Transforms/LoopPipelining.cpp
index 38e565c..bcfffa8 100644
--- a/lib/Transforms/LoopPipelining.cpp
+++ b/lib/Transforms/LoopPipelining.cpp
@@ -41,26 +41,22 @@ void LoopPipelining::runOnOperation() {
     });
 
     // All outer loops that perfectly nest the pipelined loop can be flattened.
-    forOp.walk([&](mlir::AffineForOp loop) {
-      unsigned opNum = 0;
-      unsigned forNum = 0;
-      bool innerFlatten = false;
-
-      for (auto &bodyOp : loop.getLoopBody().front()) {
-        if (!isa<mlir::AffineForOp>(bodyOp))
-          opNum++;
-        if (isa<mlir::AffineForOp>(bodyOp)) {
-          forNum++;
-          if (auto flatten = bodyOp.getAttrOfType<BoolAttr>("flatten"))
-            innerFlatten = flatten.getValue();
-        }
-      }
-
-      if (forNum == 0 || (opNum == 1 && innerFlatten))
-        loop.setAttr("flatten", builder.getBoolAttr(true));
-      else
-        loop.setAttr("flatten", builder.getBoolAttr(false));
-    });
+    SmallVector<mlir::AffineForOp, 4> flattenedLoops;
+    flattenedLoops.push_back(targetLoop);
+    while (true) {
+      auto currentLoop = flattenedLoops.back();
+      if (auto outerLoop = currentLoop.getParentOfType<mlir::AffineForOp>()) {
+        // Only if the current loop is the only child loop of the outer loop
+        // can the outer loop be flattened into the current loop.
+        auto &body = outerLoop.getLoopBody().front();
+        if (&body.front() == currentLoop && body.getOperations().size() == 2) {
+          flattenedLoops.push_back(outerLoop);
+          outerLoop.setAttr("flatten", builder.getBoolAttr(true));
+        } else
+          break;
+      } else
+        break;
+    }
   }
 
   // Canonicalize the IR after loop unrolling.
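The perfect-nesting test above (child loop first in the body, and exactly two operations counting the implicit terminator) can be mimicked on a toy structure:

    #include <cassert>
    #include <vector>

    // Toy model of the flattening walk: a loop is flattenable into its child
    // when the child loop is the first op of its body and the body holds
    // exactly two ops (the child loop plus the implicit affine.yield).
    struct Loop {
      std::vector<Loop *> body; // child loops, in order
      unsigned nonLoopOps = 1;  // terminator only, for a perfect nest
      bool isPerfectNestOver(const Loop *child) const {
        return !body.empty() && body.front() == child &&
               body.size() + nonLoopOps == 2;
      }
    };

    int main() {
      Loop inner, outer;
      outer.body.push_back(&inner);
      assert(outer.isPerfectNestOver(&inner)); // flatten outer into inner
      outer.nonLoopOps = 2;                    // extra op breaks perfectness
      assert(!outer.isPerfectNestOver(&inner));
      return 0;
    }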