[QoREstimation] fix bugs and incorrect behaviors in the implementation; [Passes] add some TODOs

Hanchen Ye 2020-12-17 17:05:26 -06:00
parent 512c842908
commit 18e5f434bb
5 changed files with 132 additions and 119 deletions


@@ -87,6 +87,20 @@ public:
void setAttrValue(Operation *op, StringRef name, StringRef value) {
op->setAttr(name, builder.getStringAttr(value));
}
/// Schedule attribute related methods.
void setScheduleValue(Operation *op, unsigned begin, unsigned end) {
setAttrValue(op, "schedule_begin", begin);
setAttrValue(op, "schedule_end", end);
}
unsigned getLatencyValue(Operation *op) {
if (auto latency = getUIntAttrValue(op, "latency"))
return latency;
else
return getUIntAttrValue(op, "schedule_end") -
getUIntAttrValue(op, "schedule_begin");
}
};
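A minimal standalone sketch of the latency fallback above, with a plain std::map standing in for MLIR's attribute storage (all names here are illustrative stand-ins, not the pass's API):

    #include <cassert>
    #include <map>
    #include <string>

    // Hypothetical stand-in for per-operation unsigned attributes.
    using AttrMap = std::map<std::string, unsigned>;

    void setScheduleValue(AttrMap &attrs, unsigned begin, unsigned end) {
      attrs["schedule_begin"] = begin;
      attrs["schedule_end"] = end;
    }

    // Prefer an explicit nonzero "latency" attribute; otherwise fall back
    // to the scheduled interval, mirroring getLatencyValue above.
    unsigned getLatencyValue(const AttrMap &attrs) {
      if (auto it = attrs.find("latency"); it != attrs.end() && it->second)
        return it->second;
      return attrs.at("schedule_end") - attrs.at("schedule_begin");
    }

    int main() {
      AttrMap op;
      setScheduleValue(op, 3, 5);
      assert(getLatencyValue(op) == 2); // falls back to end - begin
      op["latency"] = 4;
      assert(getLatencyValue(op) == 4); // explicit attribute wins
      return 0;
    }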
//===----------------------------------------------------------------------===//
@@ -133,6 +147,7 @@ public:
using HLSCppVisitorBase::visitOp;
Optional<unsigned> visitUnhandledOp(Operation *op, unsigned begin) {
// Default latency of any unhandled operation is 1.
setScheduleValue(op, begin, begin + 1);
return begin + 1;
}
@@ -148,7 +163,8 @@ public:
Optional<unsigned> visitOp(AffineIfOp op, unsigned begin);
Optional<unsigned> visitOp(ArrayOp op, unsigned begin);
Optional<unsigned> estimateBlock(Block &block, unsigned blockBegin);
Optional<std::pair<unsigned, unsigned>> estimateBlock(Block &block,
unsigned begin);
void estimateFunc();
FuncOp &func;


@@ -71,6 +71,31 @@ static void getLoopNests(Operation *op, SmallVector<Operation *, 4> &nests) {
}
}
// Get the pointer of srcOp's parent loop, which should be located at the
// same level as one of dstOp's parent loops.
static Operation *getSameLevelSourceOp(Operation *srcOp, Operation *dstOp) {
// If srcOp and dstOp are already at the same level, return the srcOp.
if (checkSameLevel(srcOp, dstOp))
return srcOp;
SmallVector<Operation *, 4> srcNests;
SmallVector<Operation *, 4> dstNests;
srcNests.push_back(srcOp);
dstNests.push_back(dstOp);
getLoopNests(srcOp, srcNests);
getLoopNests(dstOp, dstNests);
// If any parent of srcOp (or itself) and any parent of dstOp (or itself) are
// at the same level, return the pointer.
for (auto src : srcNests)
for (auto dst : dstNests)
if (checkSameLevel(src, dst))
return src;
return nullptr;
}
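As a self-contained illustration of the ancestor search above (a toy parent-pointer node replaces MLIR operations, and checkSameLevel is reduced to "same enclosing parent"; this is a sketch under those assumptions, not the pass's code):

    #include <vector>

    // Toy IR node: each op only records its enclosing loop, if any.
    struct Op { Op *parent = nullptr; };

    static bool checkSameLevel(Op *a, Op *b) { return a->parent == b->parent; }

    // Walk both ancestor chains (each op included) and return the src-side
    // ancestor that shares a level with some dst-side ancestor.
    static Op *getSameLevelSourceOp(Op *srcOp, Op *dstOp) {
      std::vector<Op *> srcNests, dstNests;
      for (Op *op = srcOp; op; op = op->parent) srcNests.push_back(op);
      for (Op *op = dstOp; op; op = op->parent) dstNests.push_back(op);
      for (Op *src : srcNests)
        for (Op *dst : dstNests)
          if (checkSameLevel(src, dst))
            return src;
      return nullptr; // ops live in unrelated regions
    }

    int main() {
      Op outer, srcLoop{&outer}, src{&srcLoop}, dst{&outer};
      // src sits one loop deeper than dst; its parent loop is the match.
      return getSameLevelSourceOp(&src, &dst) == &srcLoop ? 0 : 1;
    }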
/// Get the definition ArrayOp given any memory access operation.
static ArrayOp getArrayOp(Operation *op) {
auto defOp = MemRefAccess(op).memref.getDefiningOp();
@@ -84,10 +109,15 @@ static ArrayOp getArrayOp(Operation *op) {
/// Collect all load and store operations in the block.
static void getLoadStoresMap(Block &block, LoadStoresMap &map) {
block.walk([&](Operation *op) {
for (auto &op : block) {
if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
map[getArrayOp(op)].push_back(op);
});
map[getArrayOp(&op)].push_back(&op);
else if (op.getNumRegions()) {
for (auto &region : op.getRegions())
for (auto &block : region)
getLoadStoresMap(block, map);
}
}
}
//===----------------------------------------------------------------------===//
@@ -137,7 +167,7 @@ void HLSCppEstimator::getFuncMemRefDepends() {
bool dependFlag = false;
auto commonLoopDepth = getCommonLoopDepth(srcOp, dstOp);
for (unsigned depth = 1; depth <= commonLoopDepth; ++depth) {
for (unsigned depth = 1; depth <= commonLoopDepth + 1; ++depth) {
// Initialize constraints and components.
FlatAffineConstraints dependConstrs;
SmallVector<DependenceComponent, 2> dependComps;
@@ -161,31 +191,6 @@ void HLSCppEstimator::getFuncMemRefDepends() {
// LoadOp and StoreOp Related Methods
//===----------------------------------------------------------------------===//
// Get the pointer of the scrOp's parent loop, which should locate at the same
// level with dstOp's any parent loop.
static Operation *getSameLevelSourceOp(Operation *srcOp, Operation *dstOp) {
// If srcOp and dstOp are already at the same level, return the srcOp.
if (checkSameLevel(srcOp, dstOp))
return srcOp;
SmallVector<Operation *, 4> srcNests;
SmallVector<Operation *, 4> dstNests;
srcNests.push_back(srcOp);
dstNests.push_back(dstOp);
getLoopNests(srcOp, srcNests);
getLoopNests(dstOp, dstNests);
// If any parent of srcOp (or itself) and any parent of dstOp (or itself) are
// at the same level, return the pointer.
for (auto src : srcNests)
for (auto dst : dstNests)
if (checkSameLevel(src, dst))
return src;
return nullptr;
}
/// Calculate the overall partition index.
int32_t HLSCppEstimator::getPartitionIndex(Operation *op) {
auto arrayOp = getArrayOp(op);
@@ -320,6 +325,7 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin) {
}
// Memory load/store operation always consumes 1 clock cycle.
setScheduleValue(op, begin, begin + 1);
return begin + 1;
}
@@ -336,11 +342,11 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineStoreOp op, unsigned begin) {
//===----------------------------------------------------------------------===//
/// Calculate the minimum resource II.
unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &dict) {
unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &map) {
unsigned II = 1;
for (auto &pair : dict) {
auto arrayOp = cast<ArrayOp>(pair.first);
for (auto &pair : map) {
auto arrayOp = pair.first;
unsigned partitionNum = getUIntAttrValue(arrayOp, "partition_num");
auto storageType = getStrAttrValue(arrayOp, "storage_type");
@@ -354,25 +360,21 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &dict) {
auto loadStores = pair.second;
for (auto op : loadStores) {
// Calculate resource-aware minimal II.
auto partitionIdx = getIntAttrValue(op, "partition_index");
if (partitionIdx == -1) {
unsigned accessNum = 1;
unsigned accessNum = 2;
if (storageType == "ram_s2p")
accessNum = 1;
else if (storageType == "ram_2p" || "ram_t2p")
accessNum = 2;
else if (storageType == "ram_1p")
accessNum = 1;
else {
else if (storageType == "ram_2p" || storageType == "ram_t2p" ||
storageType == "")
accessNum = 2;
// arrayOp.emitError("unsupported storage type.");
}
// The rationale here is an undetermined partition access will
// introduce a large mux which will avoid Vivado HLS to process any
// concurrent data access among all partitions. This is equivalent to
// increase read or write number for all partitions.
// The rationale here is that an undetermined partition access introduces a
// large mux, which prevents Vivado HLS from exploiting any concurrent data
// accesses among the partitions. This is equivalent to increasing the read
// or write number for all partitions.
for (unsigned p = 0, e = partitionNum; p < e; ++p) {
if (isa<AffineLoadOp>(op))
readNum[p] += accessNum;
@@ -386,18 +388,18 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &dict) {
}
unsigned minII = 1;
if (storageType == "ram_s2p") {
if (storageType == "ram_s2p")
minII = max({minII, *std::max_element(readNum.begin(), readNum.end()),
*std::max_element(writeNum.begin(), writeNum.end())});
} else if (storageType == "ram_2p" || storageType == "ram_t2p") {
for (unsigned i = 0, e = partitionNum; i < e; ++i) {
minII = max(minII, (readNum[i] + writeNum[i] + 1) / 2);
}
} else if (storageType == "ram_1p") {
for (unsigned i = 0, e = partitionNum; i < e; ++i) {
else if (storageType == "ram_1p")
for (unsigned i = 0, e = partitionNum; i < e; ++i)
minII = max(minII, readNum[i] + writeNum[i]);
}
}
else if (storageType == "ram_2p" || storageType == "ram_t2p" ||
storageType == "")
for (unsigned i = 0, e = partitionNum; i < e; ++i)
minII = max(minII, (readNum[i] + writeNum[i] + 1) / 2);
II = max(II, minII);
}
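The storage-type dispatch above condenses to the following standalone sketch. Per-partition read/write counts are passed in directly instead of being accumulated from IR attributes, so treat it as an approximation of getResMinII, not a drop-in:

    #include <algorithm>
    #include <string>
    #include <vector>

    unsigned resMinII(const std::string &storageType, unsigned partitionNum,
                      const std::vector<unsigned> &readNum,
                      const std::vector<unsigned> &writeNum) {
      unsigned minII = 1;
      if (storageType == "ram_s2p") {
        // One dedicated read port and one dedicated write port: reads and
        // writes are only constrained by their own worst partition.
        minII = std::max({minII,
                          *std::max_element(readNum.begin(), readNum.end()),
                          *std::max_element(writeNum.begin(), writeNum.end())});
      } else if (storageType == "ram_1p") {
        // A single port serializes every access to a partition.
        for (unsigned i = 0; i < partitionNum; ++i)
          minII = std::max(minII, readNum[i] + writeNum[i]);
      } else {
        // ram_2p, ram_t2p, or unspecified: two ports share all accesses.
        for (unsigned i = 0; i < partitionNum; ++i)
          minII = std::max(minII, (readNum[i] + writeNum[i] + 1) / 2);
      }
      return minII;
    }

    int main() {
      // Two partitions of a single-port RAM; partition 0 sees 3 reads + 1 write.
      return resMinII("ram_1p", 2, {3, 1}, {1, 0}) == 4 ? 0 : 1;
    }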
@@ -405,7 +407,7 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoresMap &dict) {
}
/// Calculate the minimum dependency II.
unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &dict) {
unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &map) {
unsigned II = 1;
// Collect start and end level of the pipeline.
@@ -422,7 +424,7 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &dict) {
break;
}
for (auto &pair : dict) {
for (auto &pair : map) {
auto loadStores = pair.second;
// Walk through each pair of source and destination, and each loop level
@@ -462,8 +464,8 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &dict) {
flattenTripCounts.push_back(flattenTripCounts.back() * tripCount);
}
unsigned delay = getUIntAttrValue(srcOp, "schedule_begin") -
getUIntAttrValue(dstOp, "schedule_begin");
unsigned delay = getUIntAttrValue(srcOp, "schedule_end") -
getUIntAttrValue(dstOp, "schedule_end");
if (distance > 0) {
unsigned minII = ceil((float)delay / distance);
@@ -479,33 +481,34 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoresMap &dict) {
}
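The carried-dependence bound computed above reduces to II >= ceil(delay / distance). A hedged standalone sketch, taking the fixed schedule_end-based delay as an input:

    #include <algorithm>
    #include <cmath>

    // A dependence spanning `distance` loop iterations must absorb `delay`
    // schedule cycles between its source and destination, so it bounds the
    // initiation interval from below.
    unsigned depMinII(unsigned delay, unsigned distance) {
      if (distance == 0)
        return 1; // loop-independent dependences do not constrain the II
      return std::max(1u, (unsigned)std::ceil((float)delay / (float)distance));
    }

    int main() {
      // A delay of 6 cycles carried over 2 iterations forces II >= 3.
      return depMinII(6, 2) == 3 ? 0 : 1;
    }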
Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
if (op.getLoopBody().getBlocks().size() != 1) {
op.emitError("has zero or more than one basic block.");
return Optional<unsigned>();
}
// Set an attribute indicating the trip count. For now, we assume all loops
// have static loop bounds.
if (auto tripCount = getConstantTripCount(op))
setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
else {
setAttrValue(op, "trip_count", (unsigned)0);
op.emitError("has undetermined trip count");
else
return Optional<unsigned>();
}
unsigned end = begin;
auto &loopBlock = op.getLoopBody().front();
// Live ins will impact the scheduling.
for (auto liveIn : liveness.getLiveIn(&loopBlock))
if (auto defOp = liveIn.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
// Collect load and store operations in the loop block for solving possible
// dependencies.
LoadStoresMap map;
getLoadStoresMap(loopBlock, map);
// Check dependencies of all load/store operations and update schedule level.
for (auto pair : map)
for (auto dstOp : pair.second)
for (auto srcOp : dependsMap[dstOp]) {
auto sameLevelSrcOp = getSameLevelSourceOp(srcOp, dstOp);
begin = max(getUIntAttrValue(sameLevelSrcOp, "schedule_end"), begin);
}
// Estimate the loop block.
if (auto esti = estimateBlock(loopBlock, begin))
end = max(end, esti.getValue());
else
if (auto schedule = estimateBlock(loopBlock, begin)) {
begin = max(begin, schedule.getValue().first);
end = max(end, schedule.getValue().second);
} else
return Optional<unsigned>();
// If the current loop is annotated as pipeline, extra dependency and
@@ -515,13 +518,8 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
auto iterLatency = end - begin;
setAttrValue(op, "iter_latency", iterLatency);
// Collect load and store operations in the loop block for estimating the
// achievable initial interval.
LoadStoresMap dict;
getLoadStoresMap(loopBlock, dict);
// Calculate initial interval.
auto II = max(getResMinII(op, dict), getDepMinII(op, dict));
auto II = max(getResMinII(op, map), getDepMinII(op, map));
setAttrValue(op, "init_interval", II);
auto tripCount = getUIntAttrValue(op, "trip_count");
@@ -531,6 +529,7 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
setAttrValue(op, "latency", latency);
// Entering and leaving a loop will consume an extra 2 clock cycles.
setScheduleValue(op, begin, begin + latency + 2);
return begin + latency + 2;
}
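The latency formula itself is elided between the hunk lines above; the sketch below assumes the standard HLS pipeline model, latency = iterLatency + II * (tripCount - 1), which is consistent with the attributes being set here:

    // Assumed model (not shown in the hunk): the first iteration takes
    // iterLatency cycles, each later iteration issues II cycles after the
    // previous one, and entering/leaving the loop adds 2 more cycles.
    unsigned pipelinedLoopEnd(unsigned begin, unsigned iterLatency, unsigned II,
                              unsigned tripCount) {
      unsigned latency = iterLatency + II * (tripCount - 1);
      return begin + latency + 2;
    }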
@@ -555,6 +554,7 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
setAttrValue(op, "latency", latency);
// Since the loop is flattened, it will no longer be entered and left.
setScheduleValue(op, begin, begin + latency);
return begin + latency;
}
@@ -565,6 +565,7 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
setScheduleValue(op, begin, begin + latency + 2);
return begin + latency + 2;
}
@@ -576,14 +577,9 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
unsigned end = begin;
auto thenBlock = op.getThenBlock();
// Live ins will impact the scheduling.
for (auto liveIn : liveness.getLiveIn(thenBlock))
if (auto defOp = liveIn.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
// Estimate then block.
if (auto esti = estimateBlock(*thenBlock, begin))
end = max(end, esti.getValue());
if (auto schedule = estimateBlock(*thenBlock, begin))
end = max(end, schedule.getValue().second);
else
return Optional<unsigned>();
@@ -591,16 +587,15 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
if (op.hasElse()) {
auto elseBlock = op.getElseBlock();
for (auto liveIn : liveness.getLiveIn(elseBlock))
if (auto defOp = liveIn.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
if (auto esti = estimateBlock(*elseBlock, begin))
end = max(end, esti.getValue());
if (auto schedule = estimateBlock(*elseBlock, begin))
end = max(end, schedule.getValue().second);
else
return Optional<unsigned>();
}
// We assume AffineIfOp is completely transparent to the schedule; therefore,
// we set a dummy schedule begin here.
setScheduleValue(op, end, end);
return end;
}
@@ -616,8 +611,8 @@ Optional<unsigned> HLSCppEstimator::visitOp(ArrayOp op, unsigned begin) {
}
setAttrValue(op, "partition_num", partitionNum);
// ArrayOp is a dummy memory instance which does not consume any clock
// cycles.
// ArrayOp is a dummy memory instance which does not consume any clock cycles.
setScheduleValue(op, begin, begin);
return begin;
}
@@ -625,45 +620,44 @@ Optional<unsigned> HLSCppEstimator::visitOp(ArrayOp op, unsigned begin) {
// Block Scheduler and Estimator
//===----------------------------------------------------------------------===//
/// Estimate the latency of a block with ASAP scheduling strategy.
Optional<unsigned> HLSCppEstimator::estimateBlock(Block &block,
unsigned blockBegin) {
unsigned blockEnd = blockBegin;
/// Estimate the latency of a block with the ASAP scheduling strategy, and
/// return a pair of the block's schedule begin and end.
Optional<std::pair<unsigned, unsigned>>
HLSCppEstimator::estimateBlock(Block &block, unsigned begin) {
unsigned blockBegin = begin;
unsigned blockEnd = begin;
for (auto &op : block) {
unsigned begin = blockBegin;
unsigned end = blockBegin;
unsigned opBegin = begin;
unsigned opEnd = begin;
// Find the latest arrived predecessor dominating the current operation.
// This should be considered as the earliest possible scheduling level
// that the current operation can be scheduled.
// This should be considered as the earliest possible scheduling level at
// which the current operation can be scheduled.
for (auto operand : op.getOperands())
if (auto defOp = operand.getDefiningOp())
begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
opBegin = max(opBegin, getUIntAttrValue(defOp, "schedule_end"));
// Estimate the current operation.
if (auto esti = dispatchVisitor(&op, begin))
end = max(end, esti.getValue());
if (auto scheduleEnd = dispatchVisitor(&op, opBegin))
opEnd = max(opEnd, scheduleEnd.getValue());
else
return Optional<unsigned>();
return Optional<std::pair<unsigned, unsigned>>();
setAttrValue(&op, "schedule_begin", begin);
setAttrValue(&op, "schedule_end", end);
blockEnd = max(blockEnd, end);
// Update the block schedule begin and end.
blockBegin = min(blockBegin, opBegin);
blockEnd = max(blockEnd, opEnd);
}
return blockEnd;
return std::pair<unsigned, unsigned>(blockBegin, blockEnd);
}
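The same ASAP recurrence as a self-contained sketch: each op starts once its latest producer finishes, and the block's begin/end are folded over all ops. Ops are reduced here to a latency plus producer indices; the attribute bookkeeping is omitted:

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // An op is (latency, indices of earlier ops it consumes).
    using ToyOp = std::pair<unsigned, std::vector<std::size_t>>;

    std::pair<unsigned, unsigned> estimateBlock(const std::vector<ToyOp> &ops,
                                                unsigned begin) {
      std::vector<unsigned> endOf(ops.size());
      unsigned blockBegin = begin, blockEnd = begin;
      for (std::size_t i = 0; i < ops.size(); ++i) {
        unsigned opBegin = begin;
        for (std::size_t p : ops[i].second) // latest-arriving operand dominates
          opBegin = std::max(opBegin, endOf[p]);
        endOf[i] = opBegin + ops[i].first;
        blockBegin = std::min(blockBegin, opBegin);
        blockEnd = std::max(blockEnd, endOf[i]);
      }
      return {blockBegin, blockEnd};
    }

    int main() {
      // Three ops with latencies 2, 3, 1; the last consumes the first two.
      auto schedule = estimateBlock({{2, {}}, {3, {}}, {1, {0, 1}}}, 0);
      return schedule.second == 4 ? 0 : 1; // op 2 starts at 3, ends at 4
    }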
void HLSCppEstimator::estimateFunc() {
if (func.getBlocks().size() != 1)
func.emitError("has zero or more than one basic block.");
// Recursively estimate blocks in the function.
if (auto esti = estimateBlock(func.front(), 0))
setAttrValue(func, "latency", esti.getValue());
if (auto schedule = estimateBlock(func.front(), 0))
setAttrValue(func, "latency",
schedule.getValue().second - schedule.getValue().first);
else
setAttrValue(func, "latency", "unknown");
setAttrValue(func, "latency", -1);
}
//===----------------------------------------------------------------------===//


@@ -114,6 +114,7 @@ void ArrayPartition::runOnOperation() {
// Apply array partition.
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
// TODO: support imperfect loops.
if (auto outermost = getPipelineLoop(forOp)) {
// Collect memory access information.
LoadStoresMap loadMap;


@@ -25,6 +25,7 @@ void LoopPipelining::runOnOperation() {
// Walk through loops in the function.
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
// TODO: support more fine-grained pipeline insertion configuration.
SmallVector<mlir::AffineForOp, 4> nestedLoops;
forOp.walk([&](mlir::AffineForOp loop) { nestedLoops.push_back(loop); });


@@ -26,12 +26,13 @@ void RemoveVarLoopBound::runOnOperation() {
// Walk through all functions and loops.
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
SmallVector<mlir::AffineForOp, 4> nestedLoops;
// TODO: support imperfect loops.
getPerfectlyNestedLoops(nestedLoops, forOp);
SmallVector<Value, 4> inductionVars;
for (auto loop : nestedLoops) {
// TODO: support affine expression with more than one operand as
// variable loop bound.
// TODO: support affine expressions with more than one operand as a variable
// loop bound.
// TODO: support removing variable lower bounds.
if (!loop.hasConstantUpperBound() &&
loop.getUpperBoundMap().getResult(0).getKind() ==