[QoREstimator] thoroughly rewrite the estimator with new strategy and structure

Hanchen Ye 2020-12-17 06:10:48 -06:00
parent fac1498067
commit 512c842908
5 changed files with 491 additions and 391 deletions

View File

@ -7,6 +7,8 @@
#include "Dialect/HLSCpp/Visitor.h"
#include "INIReader.h"
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/Liveness.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
@ -21,10 +23,12 @@ namespace scalehls {
class HLSCppToolBase {
public:
explicit HLSCppToolBase(OpBuilder &builder) : builder(builder) {}
explicit HLSCppToolBase(OpBuilder builder) : builder(builder) {}
/// Get value methods.
int64_t getIntAttrValue(Operation *op, StringRef name) {
OpBuilder builder;
/// Get attribute value methods.
int32_t getIntAttrValue(Operation *op, StringRef name) {
if (auto attr = op->getAttrOfType<IntegerAttr>(name))
return attr.getInt();
else
@ -52,6 +56,7 @@ public:
return "";
}
/// Get partition information methods.
StringRef getPartitionType(ArrayOp op, unsigned dim) {
if (auto attr = op.partition_type()[dim].cast<StringAttr>())
return attr.getValue();
@ -66,15 +71,15 @@ public:
return 0;
}
/// Set value methods.
void setAttrValue(Operation *op, StringRef name, unsigned value) {
op->setAttr(name, builder.getUI32IntegerAttr(value));
}
/// Set attribute value methods.
void setAttrValue(Operation *op, StringRef name, int32_t value) {
op->setAttr(name, builder.getI32IntegerAttr(value));
}
void setAttrValue(Operation *op, StringRef name, unsigned value) {
op->setAttr(name, builder.getUI32IntegerAttr(value));
}
void setAttrValue(Operation *op, StringRef name, bool value) {
op->setAttr(name, builder.getBoolAttr(value));
}
@ -82,21 +87,6 @@ public:
void setAttrValue(Operation *op, StringRef name, StringRef value) {
op->setAttr(name, builder.getStringAttr(value));
}
/// Get expression methods.
AffineExpr getSymbolExpr(unsigned value) {
return getAffineSymbolExpr(value, builder.getContext());
}
AffineExpr getDimExpr(unsigned value) {
return getAffineDimExpr(value, builder.getContext());
}
AffineExpr getConstExpr(int64_t value) {
return getAffineConstantExpr(value, builder.getContext());
}
OpBuilder &builder;
};
//===----------------------------------------------------------------------===//
@ -104,9 +94,13 @@ public:
//===----------------------------------------------------------------------===//
// For storing all memory access operations (including AffineLoadOp and
// AffineStoreOp) indexed by the array instantce (ArrayOp).
using LoadStore = SmallVector<Operation *, 16>;
using LoadStoreDict = llvm::SmallDenseMap<Operation *, LoadStore, 8>;
// AffineStoreOp) indexed by the array instance (ArrayOp).
using LoadStores = SmallVector<Operation *, 16>;
using LoadStoresMap = DenseMap<Operation *, LoadStores>;
// For storing the source operations that an operation depends on, indexed by
// the destination (depending) operation.
using Depends = SmallVector<Operation *, 16>;
using DependsMap = DenseMap<Operation *, Depends>;
// Indicates the number of unoccupied memory ports.
struct PortInfo {
@ -118,37 +112,49 @@ struct PortInfo {
unsigned rdwrPort;
};
// For storing ports number information of each memory instance.
using MemPort = SmallVector<PortInfo, 16>;
using MemPortDict = llvm::SmallDenseMap<Operation *, MemPort, 8>;
// For storing the port information of all partitions, indexed by the array
// instance (ArrayOp).
using Ports = SmallVector<PortInfo, 16>;
using PortsMap = DenseMap<Operation *, Ports>;
// For storing MemPort indexed by the pipeline stage.
using MemPortDicts = SmallVector<MemPortDict, 16>;
// For storing PortsMap indexed by the scheduling level.
using PortsMapDict = DenseMap<unsigned, PortsMap>;
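As a rough illustration of how these nested maps are meant to be indexed (schedule level, then array, then partition), here is a minimal self-contained sketch that models DenseMap and PortInfo with standard-library stand-ins; the keys and port counts are hypothetical:

#include <cassert>
#include <map>
#include <vector>

struct PortInfoSketch {
  unsigned rdPort, wrPort, rdwrPort;
};

// portsMapDict[level][arrayId] holds one PortInfoSketch per partition.
using PortsSketch = std::vector<PortInfoSketch>;
using PortsMapSketch = std::map<int, PortsSketch>;
using PortsMapDictSketch = std::map<unsigned, PortsMapSketch>;

int main() {
  PortsMapDictSketch portsMapDict;
  // Two partitions of a dual-port-like memory at schedule level 3.
  portsMapDict[3][0] = {{1, 0, 1}, {1, 0, 1}};
  // A load on partition 1 at level 3 consumes one read port.
  portsMapDict[3][0][1].rdPort -= 1;
  assert(portsMapDict[3][0][1].rdPort == 0);
  return 0;
}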
class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
public HLSCppToolBase {
class HLSCppEstimator
: public HLSCppVisitorBase<HLSCppEstimator, Optional<unsigned>, unsigned>,
public HLSCppToolBase {
public:
explicit HLSCppEstimator(OpBuilder &builder, std::string targetSpecPath);
bool visitUnhandledOp(Operation *op) { return true; }
explicit HLSCppEstimator(FuncOp &func)
: HLSCppToolBase(OpBuilder(func)), func(func), liveness(Liveness(func)) {
getFuncMemRefDepends();
}
void getFuncMemRefDepends();
using HLSCppVisitorBase::visitOp;
bool visitOp(AffineForOp op);
bool visitOp(AffineIfOp op);
bool visitOp(ArrayOp op);
Optional<unsigned> visitUnhandledOp(Operation *op, unsigned begin) {
// Default latency of any unhandled operation is 1.
return begin + 1;
}
void getBlockMemInfo(Block &block, LoadStoreDict &info);
int32_t getPartitionIndex(Operation *op);
unsigned getLoadStoreSchedule(Operation *op, unsigned begin);
Optional<unsigned> visitOp(AffineLoadOp op, unsigned begin);
Optional<unsigned> visitOp(AffineStoreOp op, unsigned begin);
unsigned getLoadStoreSchedule(Operation *op, unsigned begin,
MemPortDicts &dicts);
void updateChildBlockSchedule(Block &block, unsigned begin);
unsigned getBlockSchedule(Block &block);
unsigned getResMinII(AffineForOp forOp, LoadStoresMap &map);
unsigned getDepMinII(AffineForOp forOp, LoadStoresMap &map);
Optional<unsigned> visitOp(AffineForOp op, unsigned begin);
unsigned getResMinII(AffineForOp forOp, LoadStoreDict dict);
unsigned getDepMinII(AffineForOp forOp, LoadStoreDict dict);
Optional<unsigned> visitOp(AffineIfOp op, unsigned begin);
Optional<unsigned> visitOp(ArrayOp op, unsigned begin);
bool estimateFunc(FuncOp func);
bool estimateBlock(Block &block);
Optional<unsigned> estimateBlock(Block &block, unsigned blockBegin);
void estimateFunc();
FuncOp &func;
Liveness liveness;
DependsMap dependsMap;
PortsMapDict portsMapDict;
};
} // namespace scalehls

View File

@ -5,14 +5,11 @@
#include "Analysis/QoREstimation.h"
#include "Analysis/Passes.h"
#include "Dialect/HLSCpp/HLSCpp.h"
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/Liveness.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using namespace std;
using namespace mlir;
@ -20,97 +17,246 @@ using namespace scalehls;
using namespace hlscpp;
//===----------------------------------------------------------------------===//
// HLSCppEstimator Class Definition
// Helpers
//===----------------------------------------------------------------------===//
/// Estimator constructor.
HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath)
: HLSCppToolBase(builder) {
// Check whether lhsOp and rhsOp are at the same scheduling level. In this
// check, AffineIfOps are transparent.
static bool checkSameLevel(Operation *lhsOp, Operation *rhsOp) {
// If lhsOp and rhsOp are already at the same level, return true.
if (lhsOp->getBlock() == rhsOp->getBlock())
return true;
INIReader targetSpec(targetSpecPath);
if (targetSpec.ParseError())
llvm::outs() << "error: target spec file parse fail, please refer to "
"--help option and pass in correct file path\n";
// Get all nested parent AffineIfOps, including lhsOp and rhsOp.
auto getNests = ([&](Operation *op, SmallVector<Operation *, 4> &nests) {
nests.push_back(op);
auto currentOp = op;
while (true) {
if (auto parentOp = currentOp->getParentOfType<AffineIfOp>()) {
nests.push_back(parentOp);
currentOp = parentOp;
} else
break;
}
});
// TODO: Support estimator initiation from profiling data.
auto freq = targetSpec.Get("spec", "frequency", "200MHz");
auto latency = targetSpec.GetInteger(freq, "op", 0);
llvm::outs() << latency << "\n";
SmallVector<Operation *, 4> lhsNests;
SmallVector<Operation *, 4> rhsNests;
getNests(lhsOp, lhsNests);
getNests(rhsOp, rhsNests);
// If any parent of lhsOp and any parent of rhsOp are at the same level,
// return true.
for (auto lhs : lhsNests)
for (auto rhs : rhsNests)
if (lhs->getBlock() == rhs->getBlock())
return true;
return false;
}
/// Collect memory access information of the block.
void HLSCppEstimator::getBlockMemInfo(Block &block, LoadStoreDict &dict) {
// Walk through all load/store operations in the current block.
/// Get all nested parent AffineForOps. Since AffineIfOps are transparent,
/// AffineIfOps are skipped during the procedure.
static void getLoopNests(Operation *op, SmallVector<Operation *, 4> &nests) {
auto currentOp = op;
while (true) {
if (auto parentOp = currentOp->getParentOfType<AffineForOp>()) {
nests.push_back(parentOp);
currentOp = parentOp;
} else if (auto parentOp = currentOp->getParentOfType<AffineIfOp>())
currentOp = parentOp;
else
break;
}
}
/// Get the definition ArrayOp given any memory access operation.
static ArrayOp getArrayOp(Operation *op) {
auto defOp = MemRefAccess(op).memref.getDefiningOp();
assert(defOp && "MemRef is block argument");
auto arrayOp = dyn_cast<ArrayOp>(defOp);
assert(arrayOp && "MemRef is not defined by ArrayOp");
return arrayOp;
}
/// Collect all load and store operations in the block.
static void getLoadStoresMap(Block &block, LoadStoresMap &map) {
block.walk([&](Operation *op) {
if (isa<mlir::AffineReadOpInterface, mlir::AffineWriteOpInterface>(op)) {
auto memAccess = MemRefAccess(op);
auto arrayOp = cast<ArrayOp>(memAccess.memref.getDefiningOp());
AffineValueMap accessMap;
memAccess.getAccessMap(&accessMap);
dict[arrayOp].push_back(op);
// Calculate the partition index of this load/store operation honoring the
// partition strategy applied.
int32_t partitionIdx = 0;
unsigned accumFactor = 1;
unsigned dim = 0;
for (auto expr : accessMap.getAffineMap().getResults()) {
auto idxExpr = getConstExpr(0);
unsigned factor = 1;
if (arrayOp.partition()) {
auto type = getPartitionType(arrayOp, dim);
factor = getPartitionFactor(arrayOp, dim);
if (type == "cyclic")
idxExpr = expr % getConstExpr(factor);
else if (type == "block") {
auto size = arrayOp.getType().cast<ShapedType>().getShape()[dim];
idxExpr = expr.floorDiv(getConstExpr((size + factor - 1) / factor));
}
}
if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>()) {
if (dim == 0)
partitionIdx = constExpr.getValue();
else
partitionIdx += constExpr.getValue() * accumFactor;
} else {
partitionIdx = -1;
break;
}
accumFactor *= factor;
dim++;
}
// Set partition index attribute.
setAttrValue(op, "partition_index", partitionIdx);
}
if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
map[getArrayOp(op)].push_back(op);
});
}
/// Calculate load/store operation schedule honoring the memory ports number
/// limitation. This method will be called by getBlockSchedule method.
unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
MemPortDicts &dicts) {
auto memAccess = MemRefAccess(op);
auto arrayOp = cast<ArrayOp>(memAccess.memref.getDefiningOp());
//===----------------------------------------------------------------------===//
// MemRef Dependency Collection Methods
//===----------------------------------------------------------------------===//
auto partitionIdx = getIntAttrValue(op, "partition_index");
/// Get the common loop depth shared by lhsOp and rhsOp.
static unsigned getCommonLoopDepth(Operation *lhsOp, Operation *rhsOp) {
// Collect all parent nested loops.
SmallVector<Operation *, 4> lhsLoopNests;
SmallVector<Operation *, 4> rhsLoopNests;
getLoopNests(lhsOp, lhsLoopNests);
getLoopNests(rhsOp, rhsLoopNests);
// Calculate common loop depth.
auto lhsDepth = lhsLoopNests.size();
auto rhsDepth = rhsLoopNests.size();
unsigned commonLoopDepth = 0;
for (unsigned i = 0, e = min(lhsDepth, rhsDepth); i < e; ++i) {
if (lhsLoopNests[lhsDepth - 1 - i] == rhsLoopNests[rhsDepth - 1 - i])
commonLoopDepth++;
else
break;
}
return commonLoopDepth;
}
/// Collect all dependencies detected in the function.
void HLSCppEstimator::getFuncMemRefDepends() {
LoadStoresMap loadStoresMap;
getLoadStoresMap(func.front(), loadStoresMap);
// Walk through all ArrayOp - LoadOp/StoreOp pairs.
for (auto &pair : loadStoresMap) {
auto loadStores = pair.second;
// Walk through each pair of source and destination. Note that for intra
// dependencies, srcOp is always before dstOp.
unsigned srcIndex = 1;
for (auto srcOp : loadStores) {
MemRefAccess srcAccess(srcOp);
for (auto dstOp : llvm::drop_begin(loadStores, srcIndex)) {
MemRefAccess dstAccess(dstOp);
bool dependFlag = false;
auto commonLoopDepth = getCommonLoopDepth(srcOp, dstOp);
for (unsigned depth = 1; depth <= commonLoopDepth; ++depth) {
// Initialize constraints and components.
FlatAffineConstraints dependConstrs;
SmallVector<DependenceComponent, 2> dependComps;
// Check dependency.
DependenceResult result = checkMemrefAccessDependence(
srcAccess, dstAccess, depth, &dependConstrs, &dependComps);
dependFlag = hasDependence(result);
}
// All dependencies are pushed into the dependsMap output.
if (dependFlag)
dependsMap[dstOp].push_back(srcOp);
}
srcIndex++;
}
}
}
//===----------------------------------------------------------------------===//
// LoadOp and StoreOp Related Methods
//===----------------------------------------------------------------------===//
// Get the pointer to srcOp's parent loop that is located at the same level as
// one of dstOp's parent loops.
static Operation *getSameLevelSourceOp(Operation *srcOp, Operation *dstOp) {
// If srcOp and dstOp are already at the same level, return the srcOp.
if (checkSameLevel(srcOp, dstOp))
return srcOp;
SmallVector<Operation *, 4> srcNests;
SmallVector<Operation *, 4> dstNests;
srcNests.push_back(srcOp);
dstNests.push_back(dstOp);
getLoopNests(srcOp, srcNests);
getLoopNests(dstOp, dstNests);
// If any parent of srcOp (or itself) and any parent of dstOp (or itself) are
// at the same level, return the pointer.
for (auto src : srcNests)
for (auto dst : dstNests)
if (checkSameLevel(src, dst))
return src;
return nullptr;
}
/// Calculate the overall partition index.
int32_t HLSCppEstimator::getPartitionIndex(Operation *op) {
auto arrayOp = getArrayOp(op);
AffineValueMap accessMap;
MemRefAccess(op).getAccessMap(&accessMap);
// Calculate the partition index of this load/store operation honoring the
// partition strategy applied.
int32_t partitionIdx = 0;
unsigned accumFactor = 1;
unsigned dim = 0;
for (auto expr : accessMap.getAffineMap().getResults()) {
auto idxExpr = builder.getAffineConstantExpr(0);
unsigned factor = 1;
if (arrayOp.partition()) {
auto type = getPartitionType(arrayOp, dim);
factor = getPartitionFactor(arrayOp, dim);
if (type == "cyclic")
idxExpr = expr % builder.getAffineConstantExpr(factor);
else if (type == "block") {
auto size = arrayOp.getType().cast<ShapedType>().getShape()[dim];
idxExpr = expr.floorDiv(
builder.getAffineConstantExpr((size + factor - 1) / factor));
}
}
if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>()) {
if (dim == 0)
partitionIdx = constExpr.getValue();
else
partitionIdx += constExpr.getValue() * accumFactor;
} else {
partitionIdx = -1;
break;
}
accumFactor *= factor;
dim++;
}
return partitionIdx;
}
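To make the index arithmetic above concrete, here is a minimal standalone sketch of the same flattening scheme, assuming all access indices are compile-time constants; the helper name and test values are illustrative only:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// One entry per dimension: constant access index, partition type ("cyclic"
// or "block"), partition factor, and array size along that dimension.
static int64_t flatPartitionIndex(const std::vector<int64_t> &index,
                                  const std::vector<std::string> &type,
                                  const std::vector<int64_t> &factor,
                                  const std::vector<int64_t> &size) {
  int64_t partitionIdx = 0;
  int64_t accumFactor = 1;
  for (unsigned dim = 0; dim < index.size(); ++dim) {
    int64_t idx = 0;
    if (type[dim] == "cyclic")
      idx = index[dim] % factor[dim];
    else if (type[dim] == "block")
      idx = index[dim] / ((size[dim] + factor[dim] - 1) / factor[dim]);
    partitionIdx += idx * accumFactor;
    accumFactor *= factor[dim];
  }
  return partitionIdx;
}

int main() {
  // An 8x16 array, cyclic factor 4 on dim 0 and block factor 2 on dim 1:
  // access [5][9] maps to partition (5 % 4) + (9 / 8) * 4 = 5.
  assert(flatPartitionIndex({5, 9}, {"cyclic", "block"}, {4, 2}, {8, 16}) == 5);
  return 0;
}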
/// Schedule a load/store operation honoring the memory port limitation.
unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin) {
// Check dependencies of the operation and update schedule level.
for (auto srcOp : dependsMap[op]) {
auto sameLevelSrcOp = getSameLevelSourceOp(srcOp, op);
begin = max(getUIntAttrValue(sameLevelSrcOp, "schedule_end"), begin);
}
// Calculate partition index.
auto partitionIdx = getPartitionIndex(op);
setAttrValue(op, "partition_index", partitionIdx);
auto arrayOp = getArrayOp(op);
auto partitionNum = getUIntAttrValue(arrayOp, "partition_num");
auto storageType = getStrAttrValue(arrayOp, "storage_type");
// Try to avoid memory port violation until a legal schedule is found.
// Since an infinite length pipeline can be generated, this while loop can
// be proofed to have an end.
// Try to avoid memory port violations until a legal schedule is found. Since
// an unoccupied schedule level can always be reached, this while loop is
// guaranteed to terminate.
while (true) {
auto memPort = dicts[begin][arrayOp];
auto memPort = portsMapDict[begin][arrayOp];
bool memPortEmpty = memPort.empty();
// If the memory has not been occupied by the current stage, it should
// be initialized according to its storage type. Note that each
// partition should have one PortNum structure.
// If the memory has not been occupied by the current schedule level, it
// should be initialized according to its storage type. Note that each
// partition should have one PortInfo structure.
if (memPortEmpty) {
for (unsigned p = 0; p < partitionNum; ++p) {
unsigned rdPort = 0;
@ -125,167 +271,72 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
rdwrPort = 1;
else {
rdwrPort = 2;
// arrayOp.emitError("unsupported storage type.");
}
PortInfo portInfo(rdPort, wrPort, rdwrPort);
memPort.push_back(portInfo);
memPort.push_back(PortInfo(rdPort, wrPort, rdwrPort));
}
}
// TODO: When partition index can't be determined, this operation will be
// considered to occupy all ports.
// Indicate whether the operation is successfully scheduled in the current
// schedule level.
bool successFlag = false;
if (partitionIdx == -1) {
// When the partition index cannot be determined, this operation must occupy
// all ports in the current schedule level.
if (memPortEmpty) {
for (unsigned p = 0; p < partitionNum; ++p) {
memPort[p].rdPort = 0;
memPort[p].wrPort = 0;
memPort[p].rdwrPort = 0;
}
dicts[begin][arrayOp] = memPort;
break;
} else {
if (++begin >= dicts.size()) {
MemPortDict memPortDict;
dicts.push_back(memPortDict);
}
successFlag = true;
}
}
// Find whether the current schedule meets memory port limitation. If
// not, the schedule will increase by 1.
PortInfo portInfo = memPort[partitionIdx];
if (isa<AffineLoadOp>(op) && portInfo.rdPort > 0) {
memPort[partitionIdx].rdPort -= 1;
dicts[begin][arrayOp] = memPort;
break;
} else if (isa<AffineStoreOp>(op) && portInfo.wrPort > 0) {
memPort[partitionIdx].wrPort -= 1;
dicts[begin][arrayOp] = memPort;
break;
} else if (portInfo.rdwrPort > 0) {
memPort[partitionIdx].rdwrPort -= 1;
dicts[begin][arrayOp] = memPort;
break;
} else {
if (++begin >= dicts.size()) {
MemPortDict memPortDict;
dicts.push_back(memPortDict);
// When partition index can be determined, figure out whether the current
// schedule meets memory port limitation.
PortInfo portInfo = memPort[partitionIdx];
if (isa<AffineLoadOp>(op) && portInfo.rdPort > 0) {
memPort[partitionIdx].rdPort -= 1;
successFlag = true;
} else if (isa<AffineStoreOp>(op) && portInfo.wrPort > 0) {
memPort[partitionIdx].wrPort -= 1;
successFlag = true;
} else if (portInfo.rdwrPort > 0) {
memPort[partitionIdx].rdwrPort -= 1;
successFlag = true;
}
}
// If succeeded, break the while loop. Otherwise, increase the schedule level
// by 1 and try again.
if (successFlag) {
portsMapDict[begin][arrayOp] = memPort;
break;
} else
begin++;
}
return begin;
// Memory load/store operation always consumes 1 clock cycle.
return begin + 1;
}
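The retry strategy of the while loop can be seen in isolation with a single-partition, single-port toy model; this sketch is only an analogue of the logic above, with hypothetical port counts and levels:

#include <cassert>
#include <map>

// Each level holds the remaining ports of one partition. scheduleLoad()
// mirrors the bumping strategy: if no port is left at `begin`, try the next
// schedule level, and so on until a free port is found.
static unsigned scheduleLoad(std::map<unsigned, int> &freePorts, unsigned begin,
                             int portsPerLevel) {
  while (true) {
    if (freePorts.find(begin) == freePorts.end())
      freePorts[begin] = portsPerLevel; // fresh level: full ports available
    if (freePorts[begin] > 0) {
      freePorts[begin] -= 1;
      return begin + 1; // a load/store consumes one clock cycle
    }
    ++begin; // port conflict: retry at the next level
  }
}

int main() {
  std::map<unsigned, int> freePorts;
  // Two loads of the same partition of a single-port memory, both ready at 2.
  assert(scheduleLoad(freePorts, 2, /*portsPerLevel=*/1) == 3);
  assert(scheduleLoad(freePorts, 2, /*portsPerLevel=*/1) == 4); // bumped to 3
  return 0;
}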
void HLSCppEstimator::updateChildBlockSchedule(Block &block, unsigned begin) {
for (auto &op : block) {
unsigned newBegin = begin;
unsigned newEnd = begin;
// Update the schedule of all operations in the child block.
if (getUIntAttrValue(&op, "schedule_end")) {
newBegin += getUIntAttrValue(&op, "schedule_begin");
newEnd += getUIntAttrValue(&op, "schedule_end");
setAttrValue(&op, "schedule_begin", newBegin);
setAttrValue(&op, "schedule_end", newEnd);
}
// Recursively apply to all child blocks.
if (op.getNumRegions()) {
for (auto &region : op.getRegions()) {
for (auto &block : region.getBlocks())
updateChildBlockSchedule(block, begin);
}
}
}
Optional<unsigned> HLSCppEstimator::visitOp(AffineLoadOp op, unsigned begin) {
return getLoadStoreSchedule(op, begin);
}
/// Schedule the block with ASAP algorithm.
unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
unsigned blockEnd = 0;
MemPortDicts dicts;
for (auto &op : block) {
// Find the latest predecessor dominating the current operation. This
// should be considered as the earliest stage that the current operation
// can be scheduled.
unsigned begin = 0;
unsigned end = 0;
for (auto operand : op.getOperands()) {
if (auto defOp = operand.getDefiningOp())
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
}
// Handle loop operations.
if (auto forOp = dyn_cast<AffineForOp>(op)) {
// Live ins of the for loop body will also impact the schedule begin.
Liveness liveness(block.getParentOp());
for (auto liveIn : liveness.getLiveIn(&forOp.getLoopBody().front())) {
if (auto defOp = liveIn.getDefiningOp())
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
}
// Update the schedule of all operations in the loop body.
updateChildBlockSchedule(forOp.getLoopBody().front(), begin);
// Child loop is considered as a large node, and two extra clock cycles
// will be required to enter and exit the child loop.
end = begin + getUIntAttrValue(forOp, "latency") + 2;
}
// Handle if operations.
else if (auto ifOp = dyn_cast<AffineIfOp>(op)) {
// Live ins of the if body will also impact the schedule begin.
Liveness liveness(block.getParentOp());
for (auto liveIn : liveness.getLiveIn(ifOp.getThenBlock())) {
if (auto defOp = liveIn.getDefiningOp())
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
}
if (ifOp.hasElse()) {
for (auto liveIn : liveness.getLiveIn(ifOp.getElseBlock())) {
if (auto defOp = liveIn.getDefiningOp())
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
}
// Update the schedule of all operations in the else block.
updateChildBlockSchedule(*ifOp.getElseBlock(), begin);
}
// Update the schedule of all operations in the then block.
updateChildBlockSchedule(*ifOp.getThenBlock(), begin);
end = begin + getUIntAttrValue(ifOp, "latency");
}
// Handle load/store operations.
else if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
// Insert new schedule level to the memory port dicts.
while (begin >= dicts.size()) {
MemPortDict memPortDict;
dicts.push_back(memPortDict);
}
// Ensure the current schedule meets memory port limitation.
begin = getLoadStoreSchedule(&op, begin, dicts);
end = begin + 1;
}
// Default case. All normal expressions and operations will be handled by
// this branch.
else {
// TODO: For now, we assume all operations take one clock cycle to
// execute, should support to accept profiling data.
end = begin + 1;
}
setAttrValue(&op, "schedule_begin", begin);
setAttrValue(&op, "schedule_end", end);
blockEnd = max(blockEnd, end);
}
return blockEnd;
Optional<unsigned> HLSCppEstimator::visitOp(AffineStoreOp op, unsigned begin) {
return getLoadStoreSchedule(op, begin);
}
//===----------------------------------------------------------------------===//
// AffineForOp Related Methods
//===----------------------------------------------------------------------===//
/// Calculate the minimum resource II.
unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoreDict dict) {
unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &dict) {
unsigned II = 1;
for (auto &pair : dict) {
@ -354,7 +405,7 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoreDict dict) {
}
/// Calculate the minimum dependency II.
unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &dict) {
unsigned II = 1;
// Collect start and end level of the pipeline.
@ -375,8 +426,9 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
auto loadStores = pair.second;
// Walk through each pair of source and destination, and each loop level
// that are pipelined.
for (auto loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) {
// that are pipelined. Note that for inter-dependencies, dstOp always appears
// before srcOp.
for (unsigned loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) {
unsigned dstIndex = 1;
for (auto dstOp : loadStores) {
MemRefAccess dstAccess(dstOp);
@ -393,7 +445,7 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
if (hasDependence(result)) {
SmallVector<unsigned, 2> flattenTripCounts;
flattenTripCounts.push_back(1);
unsigned distance = 0;
int64_t distance = 0;
// Calculate the distance of this dependency.
for (auto it = depComps.rbegin(); it < depComps.rend(); ++it) {
@ -413,7 +465,7 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
unsigned delay = getUIntAttrValue(srcOp, "schedule_begin") -
getUIntAttrValue(dstOp, "schedule_begin");
if (distance != 0) {
if (distance > 0) {
unsigned minII = ceil((float)delay / distance);
II = max(II, minII);
}
@ -426,37 +478,48 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
return II;
}
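A worked example of the dependency bound applied above: for a positive loop-carried distance, minII = ceil(delay / distance), where delay is the schedule gap between the dependent accesses. The helper and the numbers below are hypothetical:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Dependency-constrained II for one src/dst pair: the source access starts
// `delay` cycles after a destination access that is `distance` iterations
// earlier in the pipelined loop.
static unsigned depMinII(unsigned delay, int64_t distance) {
  if (distance <= 0)
    return 1; // no positive loop-carried distance: no II constraint
  return std::max(1u, (unsigned)std::ceil((float)delay / distance));
}

int main() {
  // A store feeding a load 2 iterations later, 6 cycles apart in the schedule.
  assert(depMinII(6, 2) == 3);
  // Distance 0 (same iteration) does not constrain the II.
  assert(depMinII(4, 0) == 1);
  return 0;
}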
bool HLSCppEstimator::visitOp(AffineForOp op) {
auto &body = op.getLoopBody();
if (body.getBlocks().size() != 1) {
Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
if (op.getLoopBody().getBlocks().size() != 1) {
op.emitError("has zero or more than one basic blocks.");
return false;
return Optional<unsigned>();
}
// Recursively estimate all contained operations.
if (!estimateBlock(body.front()))
return false;
// Set an attribute indicating the trip count. For now, we assume all
// loops have static loop bound.
// Set an attribute indicating the trip count. For now, we assume all loops
// have static loop bound.
if (auto tripCount = getConstantTripCount(op))
setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
else {
setAttrValue(op, "trip_count", (unsigned)0);
op.emitError("has undetermined trip count");
return false;
return Optional<unsigned>();
}
// If the current loop is annotated as pipeline, extra dependency and II
// analysis will be executed.
if (getBoolAttrValue(op, "pipeline")) {
LoadStoreDict dict;
getBlockMemInfo(body.front(), dict);
unsigned end = begin;
auto &loopBlock = op.getLoopBody().front();
// Live ins will impact the scheduling.
for (auto liveIn : liveness.getLiveIn(&loopBlock))
if (auto defOp = liveIn.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
// Estimate the loop block.
if (auto esti = estimateBlock(loopBlock, begin))
end = max(end, esti.getValue());
else
return Optional<unsigned>();
// If the current loop is annotated as pipeline, extra dependency and
// resource aware II analysis will be executed.
if (getBoolAttrValue(op, "pipeline")) {
// Calculate latency of each iteration.
auto iterLatency = getBlockSchedule(body.front());
auto iterLatency = end - begin;
setAttrValue(op, "iter_latency", iterLatency);
// Collect load and store operations in the loop block for estimating the
// achievable initial interval.
LoadStoresMap dict;
getLoadStoresMap(loopBlock, dict);
// Calculate initial interval.
auto II = max(getResMinII(op, dict), getDepMinII(op, dict));
setAttrValue(op, "init_interval", II);
@ -464,117 +527,143 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
auto tripCount = getUIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", tripCount);
setAttrValue(op, "latency", iterLatency + II * (tripCount - 1));
return true;
auto latency = iterLatency + II * (tripCount - 1);
setAttrValue(op, "latency", latency);
// Entering and leaving a loop will consume 2 extra clock cycles.
return begin + latency + 2;
}
// This means the current loop can be flattened into the child loop. If the
// child loop is pipelined, this will increase the flattened loop trip count
// without changing the iteration latency. Note that this will be propogated
// above until meeting an imperfect loop.
// If the current loop is annotated as flatten, it will be flattened into
// the child pipelined loop. This will increase the flattened loop trip count
// without changing the iteration latency.
if (getBoolAttrValue(op, "flatten")) {
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
// This means the inner loop is pipelined, because otherwise II will be
// equal to zero. So that in this case, this loop will be flattened into
// the inner pipelined loop.
if (auto II = getUIntAttrValue(child, "init_interval")) {
setAttrValue(op, "init_interval", II);
auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front());
assert(child && "the first containing operation is not a loop");
auto iterLatency = getUIntAttrValue(child, "iter_latency");
setAttrValue(op, "iter_latency", iterLatency);
auto iterLatency = getUIntAttrValue(child, "iter_latency");
setAttrValue(op, "iter_latency", iterLatency);
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
getUIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", flattenTripCount);
auto II = getUIntAttrValue(child, "init_interval");
setAttrValue(op, "init_interval", II);
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
} else {
auto iterLatency = getUIntAttrValue(child, "latency");
setAttrValue(op, "iter_latency", iterLatency);
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
getUIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", flattenTripCount);
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
}
return true;
}
auto latency = iterLatency + II * (flattenTripCount - 1);
setAttrValue(op, "latency", latency);
// Since the loop is flattened, it will no longer be entered and left.
return begin + latency;
}
// Default case, aka !pipeline && !flatten.
LoadStoreDict dict;
getBlockMemInfo(body.front(), dict);
auto iterLatency = getBlockSchedule(body.front());
// Default case, calculate latency of each iteration.
auto iterLatency = end - begin;
setAttrValue(op, "iter_latency", iterLatency);
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
return true;
return begin + latency + 2;
}
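A quick numeric check of the latency formula used in the pipeline and flatten branches above; the iteration latency, II, and trip counts are hypothetical:

#include <cassert>

// Latency model of a pipelined loop: the last iteration is issued after
// II * (tripCount - 1) cycles and takes iterLatency cycles to drain.
static unsigned pipelinedLoopLatency(unsigned iterLatency, unsigned II,
                                     unsigned tripCount) {
  return iterLatency + II * (tripCount - 1);
}

int main() {
  // 10 iterations, 5-cycle iteration latency, II = 2: 5 + 2 * 9 = 23.
  assert(pipelinedLoopLatency(5, 2, 10) == 23);
  // A perfectly nested outer loop (trip count 4) flattened into it only
  // scales the flattened trip count: 5 + 2 * (4 * 10 - 1) = 83.
  assert(pipelinedLoopLatency(5, 2, 4 * 10) == 83);
  return 0;
}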
bool HLSCppEstimator::visitOp(AffineIfOp op) {
auto thenBlock = op.getThenBlock();
if (!estimateBlock(*thenBlock))
return false;
//===----------------------------------------------------------------------===//
// Other Operation Handlers
//===----------------------------------------------------------------------===//
LoadStoreDict dict;
getBlockMemInfo(*thenBlock, dict);
auto latency = getBlockSchedule(*thenBlock);
Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
unsigned end = begin;
auto thenBlock = op.getThenBlock();
// Live ins will impact the scheduling.
for (auto liveIn : liveness.getLiveIn(thenBlock))
if (auto defOp = liveIn.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
// Estimate then block.
if (auto esti = estimateBlock(*thenBlock, begin))
end = max(end, esti.getValue());
else
return Optional<unsigned>();
// Handle else block if required.
if (op.hasElse()) {
auto elseBlock = op.getElseBlock();
if (!estimateBlock(*elseBlock))
return false;
getBlockMemInfo(*elseBlock, dict);
latency = max(latency, getBlockSchedule(*elseBlock));
for (auto liveIn : liveness.getLiveIn(elseBlock))
if (auto defOp = liveIn.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
if (auto esti = estimateBlock(*elseBlock, begin))
end = max(end, esti.getValue());
else
return Optional<unsigned>();
}
setAttrValue(op, "latency", latency);
return true;
return end;
}
bool HLSCppEstimator::visitOp(ArrayOp op) {
Optional<unsigned> HLSCppEstimator::visitOp(ArrayOp op, unsigned begin) {
// Annotate the total partition number of the array.
unsigned partitionNum = 1;
if (op.partition()) {
auto rank = op.getType().cast<ShapedType>().getRank();
for (unsigned i = 0; i < rank; ++i) {
if (auto factor = getPartitionFactor(op, i))
for (unsigned dim = 0; dim < rank; ++dim) {
if (auto factor = getPartitionFactor(op, dim))
partitionNum *= factor;
}
}
setAttrValue(op, "partition_num", partitionNum);
return true;
// ArrayOp is a dummy memory instance which does not consume any clock
// cycles.
return begin;
}
bool HLSCppEstimator::estimateBlock(Block &block) {
//===----------------------------------------------------------------------===//
// Block Scheduler and Estimator
//===----------------------------------------------------------------------===//
/// Estimate the latency of a block with ASAP scheduling strategy.
Optional<unsigned> HLSCppEstimator::estimateBlock(Block &block,
unsigned blockBegin) {
unsigned blockEnd = blockBegin;
for (auto &op : block) {
if (dispatchVisitor(&op))
continue;
else {
op.emitError("can't be correctly estimated.");
return false;
}
unsigned begin = blockBegin;
unsigned end = blockBegin;
// Find the latest-arriving predecessor dominating the current operation.
// This is the earliest possible scheduling level at which the current
// operation can be scheduled.
for (auto operand : op.getOperands())
if (auto defOp = operand.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
// Estimate the current operation.
if (auto esti = dispatchVisitor(&op, begin))
end = max(end, esti.getValue());
else
return Optional<unsigned>();
setAttrValue(&op, "schedule_begin", begin);
setAttrValue(&op, "schedule_end", end);
blockEnd = max(blockEnd, end);
}
return true;
return blockEnd;
}
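The ASAP strategy of this block scheduler can be illustrated with a toy dependence graph; the sketch below mirrors the begin/end bookkeeping using plain strings instead of MLIR operations, with made-up op names:

#include <algorithm>
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  struct ToyOp {
    std::string name;
    std::vector<std::string> operands;
  };
  // d depends on c, which depends on both a and b.
  std::vector<ToyOp> block = {{"a", {}}, {"b", {}}, {"c", {"a", "b"}}, {"d", {"c"}}};
  std::map<std::string, unsigned> scheduleEnd;
  unsigned blockEnd = 0;
  for (auto &op : block) {
    unsigned begin = 0;
    // Each op starts at the latest schedule_end of its operands' defining ops.
    for (auto &operand : op.operands)
      begin = std::max(begin, scheduleEnd[operand]);
    unsigned end = begin + 1; // every toy op takes one cycle
    scheduleEnd[op.name] = end;
    blockEnd = std::max(blockEnd, end);
  }
  // a, b run in cycle 0-1; c waits for both (1-2); d waits for c (2-3).
  assert(blockEnd == 3);
  return 0;
}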
bool HLSCppEstimator::estimateFunc(FuncOp func) {
if (func.getBlocks().size() != 1) {
void HLSCppEstimator::estimateFunc() {
if (func.getBlocks().size() != 1)
func.emitError("has zero or more than one basic blocks.");
return false;
}
// Recursively estimate all contained operations.
if (!estimateBlock(func.front()))
return false;
LoadStoreDict dict;
getBlockMemInfo(func.front(), dict);
auto latency = getBlockSchedule(func.front());
setAttrValue(func, "latency", latency);
return true;
// Recursively estimate blocks in the function.
if (auto esti = estimateBlock(func.front(), 0))
setAttrValue(func, "latency", esti.getValue());
else
setAttrValue(func, "latency", "unknown");
}
//===----------------------------------------------------------------------===//
@ -584,13 +673,22 @@ bool HLSCppEstimator::estimateFunc(FuncOp func) {
namespace {
struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
void runOnOperation() override {
auto module = getOperation();
auto builder = OpBuilder(module);
// Read configuration file.
INIReader spec(targetSpec);
if (spec.ParseError())
llvm::outs() << "error: target spec file parse fail, please refer to "
"--help option and pass in correct file path\n";
// TODO: Support estimator initialization from profiling data, constructing a
// unique data structure for holding latency and resource information.
auto freq = spec.Get("spec", "frequency", "200MHz");
auto latency = spec.GetInteger(freq, "op", 0);
// Estimate performance and resource utilization.
HLSCppEstimator estimator(builder, targetSpec);
for (auto func : module.getOps<FuncOp>())
estimator.estimateFunc(func);
for (auto func : getOperation().getOps<FuncOp>()) {
HLSCppEstimator estimator(func);
estimator.estimateFunc();
}
}
};
} // namespace
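Judging only from the two reads above, the target spec file is an INI with a [spec] section naming the frequency and a per-frequency section carrying operation latencies. A hypothetical minimal example consistent with those two calls (the real spec file may contain different sections and keys):

[spec]
frequency = 100MHz

[100MHz]
op = 1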

View File

@ -117,8 +117,8 @@ void ConvertToHLSCpp::runOnOperation() {
if (!forOp.getAttr("pipeline"))
forOp.setAttr("pipeline", builder.getBoolAttr(false));
if (!forOp.getAttr("unroll"))
forOp.setAttr("unroll", builder.getBoolAttr(false));
// if (!forOp.getAttr("unroll"))
// forOp.setAttr("unroll", builder.getBoolAttr(false));
if (!forOp.getAttr("flatten"))
forOp.setAttr("flatten", builder.getBoolAttr(false));

View File

@ -34,8 +34,8 @@ static mlir::AffineForOp getPipelineLoop(mlir::AffineForOp root) {
}
template <typename OpType>
static void applyArrayPartition(LoadStoreDict &dict, OpBuilder &builder) {
for (auto pair : dict) {
static void applyArrayPartition(LoadStoresMap &map, OpBuilder &builder) {
for (auto pair : map) {
auto arrayOp = cast<ArrayOp>(pair.first);
auto arrayType = arrayOp.getType().cast<MemRefType>();
auto arrayAccesses = pair.second;
@ -116,21 +116,21 @@ void ArrayPartition::runOnOperation() {
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
if (auto outermost = getPipelineLoop(forOp)) {
// Collect memory access information.
LoadStoreDict loadDict;
LoadStoresMap loadMap;
outermost.walk([&](mlir::AffineLoadOp loadOp) {
auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
loadDict[arrayOp].push_back(loadOp);
loadMap[arrayOp].push_back(loadOp);
});
LoadStoreDict storeDict;
LoadStoresMap storeMap;
outermost.walk([&](mlir::AffineStoreOp storeOp) {
auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
storeDict[arrayOp].push_back(storeOp);
storeMap[arrayOp].push_back(storeOp);
});
// Apply array partition pragma.
applyArrayPartition<mlir::AffineLoadOp>(loadDict, builder);
applyArrayPartition<mlir::AffineStoreOp>(storeDict, builder);
applyArrayPartition<mlir::AffineLoadOp>(loadMap, builder);
applyArrayPartition<mlir::AffineStoreOp>(storeMap, builder);
}
}
}

View File

@ -41,26 +41,22 @@ void LoopPipelining::runOnOperation() {
});
// All outer loops that perfectly nest the pipelined loop can be flattened.
forOp.walk([&](mlir::AffineForOp loop) {
unsigned opNum = 0;
unsigned forNum = 0;
bool innerFlatten = false;
for (auto &bodyOp : loop.getLoopBody().front()) {
if (!isa<AffineYieldOp>(bodyOp))
opNum++;
if (isa<AffineForOp>(bodyOp)) {
forNum++;
if (auto flatten = bodyOp.getAttrOfType<BoolAttr>("flatten"))
innerFlatten = flatten.getValue();
}
}
if (forNum == 0 || (opNum == 1 && innerFlatten))
loop.setAttr("flatten", builder.getBoolAttr(true));
else
loop.setAttr("flatten", builder.getBoolAttr(false));
});
SmallVector<mlir::AffineForOp, 4> flattenedLoops;
flattenedLoops.push_back(targetLoop);
while (true) {
auto currentLoop = flattenedLoops.back();
if (auto outerLoop = currentLoop.getParentOfType<mlir::AffineForOp>()) {
// The outer loop can be flattened into the current loop only if the current
// loop is its only child loop.
auto &body = outerLoop.getLoopBody().front();
if (&body.front() == currentLoop && body.getOperations().size() == 2) {
flattenedLoops.push_back(outerLoop);
outerLoop.setAttr("flatten", builder.getBoolAttr(true));
} else
break;
} else
break;
}
}
// Canonicalize the IR after loop unrolling.