[QoREstimator] support DSP estimation (#18)
This commit is contained in:
parent
85c47e98e3
commit
4c1cc9e4b1
|
@ -24,7 +24,7 @@ public:
|
|||
if (auto attr = op.partition_type()[dim].cast<StringAttr>())
|
||||
return attr.getValue();
|
||||
else
|
||||
return "";
|
||||
return StringRef();
|
||||
}
|
||||
|
||||
unsigned getPartitionFactor(hlscpp::ArrayOp op, unsigned dim) {
|
||||
|
@ -60,7 +60,7 @@ public:
|
|||
if (auto attr = op->getAttrOfType<StringAttr>(name))
|
||||
return attr.getValue();
|
||||
else
|
||||
return "";
|
||||
return StringRef();
|
||||
}
|
||||
|
||||
/// Set attribute value methods.
|
||||
|
|
|
@ -26,7 +26,7 @@ using LatencyMap = llvm::StringMap<unsigned>;
|
|||
|
||||
namespace {
|
||||
class HLSCppEstimator
|
||||
: public HLSCppVisitorBase<HLSCppEstimator, Optional<unsigned>, unsigned>,
|
||||
: public HLSCppVisitorBase<HLSCppEstimator, bool, unsigned>,
|
||||
public HLSCppAnalysisBase {
|
||||
public:
|
||||
explicit HLSCppEstimator(FuncOp &func, LatencyMap &latencyMap)
|
||||
|
@ -35,6 +35,10 @@ public:
|
|||
getFuncDependencies();
|
||||
}
|
||||
|
||||
// For storing all dependencies indexed by the dependency source operation.
|
||||
using Depends = SmallVector<Operation *, 16>;
|
||||
using DependsMap = DenseMap<Operation *, Depends>;
|
||||
|
||||
// Indicate the unoccupied memory ports number.
|
||||
struct PortInfo {
|
||||
PortInfo(unsigned rdPort = 0, unsigned wrPort = 0, unsigned rdwrPort = 0)
|
||||
|
@ -44,17 +48,14 @@ public:
|
|||
unsigned wrPort;
|
||||
unsigned rdwrPort;
|
||||
};
|
||||
|
||||
// For storing ports number of all partitions indexed by the array (ArrayOp).
|
||||
using Ports = SmallVector<PortInfo, 16>;
|
||||
using PortsMap = DenseMap<Operation *, Ports>;
|
||||
|
||||
// For storing PortsMap indexed by the scheduling level.
|
||||
using PortsMapDict = DenseMap<unsigned, PortsMap>;
|
||||
|
||||
// For storing all dependent operations indexed by the source operation.
|
||||
using Depends = SmallVector<Operation *, 16>;
|
||||
using DependsMap = DenseMap<Operation *, Depends>;
|
||||
// For storing the DSP resource utilization indexed by the schedule level.
|
||||
using ResourceMap = DenseMap<unsigned, unsigned>;
|
||||
|
||||
/// Collect all dependencies detected in the function.
|
||||
void getFuncDependencies();
|
||||
|
@ -65,38 +66,37 @@ public:
|
|||
}
|
||||
|
||||
using HLSCppVisitorBase::visitOp;
|
||||
Optional<unsigned> visitUnhandledOp(Operation *op, unsigned begin) {
|
||||
bool visitUnhandledOp(Operation *op, unsigned begin) {
|
||||
// Default latency of any unhandled operation is 0.
|
||||
setScheduleValue(op, begin, begin);
|
||||
return begin;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// LoadOp and StoreOp related methods.
|
||||
int32_t getPartitionIndex(Operation *op);
|
||||
unsigned getLoadStoreSchedule(Operation *op, unsigned begin);
|
||||
Optional<unsigned> visitOp(AffineLoadOp op, unsigned begin) {
|
||||
return getLoadStoreSchedule(op, begin);
|
||||
void estimateLoadStore(Operation *op, unsigned begin);
|
||||
bool visitOp(AffineLoadOp op, unsigned begin) {
|
||||
return estimateLoadStore(op, begin), true;
|
||||
}
|
||||
Optional<unsigned> visitOp(AffineStoreOp op, unsigned begin) {
|
||||
return getLoadStoreSchedule(op, begin);
|
||||
bool visitOp(AffineStoreOp op, unsigned begin) {
|
||||
return estimateLoadStore(op, begin), true;
|
||||
}
|
||||
|
||||
/// AffineForOp related methods.
|
||||
// unsigned getOpMinII(AffineForOp forOp);
|
||||
unsigned getResMinII(MemAccessesMap &map);
|
||||
unsigned getDepMinII(AffineForOp forOp, MemAccessesMap &map);
|
||||
Optional<unsigned> visitOp(AffineForOp op, unsigned begin);
|
||||
bool visitOp(AffineForOp op, unsigned begin);
|
||||
|
||||
/// Other operation handlers.
|
||||
Optional<unsigned> visitOp(AffineIfOp op, unsigned begin);
|
||||
Optional<unsigned> visitOp(CallOp op, unsigned begin);
|
||||
bool visitOp(AffineIfOp op, unsigned begin);
|
||||
bool visitOp(CallOp op, unsigned begin);
|
||||
|
||||
/// Handle operations with profiled latency.
|
||||
#define HANDLE(OPTYPE, KEYNAME) \
|
||||
Optional<unsigned> visitOp(OPTYPE op, unsigned begin) { \
|
||||
auto end = begin + latencyMap[KEYNAME] + 1; \
|
||||
setScheduleValue(op, begin, end); \
|
||||
return end; \
|
||||
bool visitOp(OPTYPE op, unsigned begin) { \
|
||||
setScheduleValue(op, begin, begin + latencyMap[KEYNAME] + 1); \
|
||||
return true; \
|
||||
}
|
||||
HANDLE(AddFOp, "fadd");
|
||||
HANDLE(MulFOp, "fmul");
|
||||
|
@ -105,6 +105,9 @@ public:
|
|||
#undef HANDLE
|
||||
|
||||
/// Block scheduler and estimator.
|
||||
unsigned getResourceMap(Block &block, ResourceMap &addFMap,
|
||||
ResourceMap &mulFMap);
|
||||
unsigned estimateResource(Block &block);
|
||||
Optional<std::pair<unsigned, unsigned>> estimateBlock(Block &block,
|
||||
unsigned begin);
|
||||
void reverseSchedule();
|
||||
|
@ -258,7 +261,7 @@ int32_t HLSCppEstimator::getPartitionIndex(Operation *op) {
|
|||
}
|
||||
|
||||
/// Schedule load/store operation honoring the memory ports number limitation.
|
||||
unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin) {
|
||||
void HLSCppEstimator::estimateLoadStore(Operation *op, unsigned begin) {
|
||||
// Calculate partition index.
|
||||
auto partitionIdx = getPartitionIndex(op);
|
||||
setAttrValue(op, "partition_index", partitionIdx);
|
||||
|
@ -340,11 +343,10 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin) {
|
|||
begin++;
|
||||
}
|
||||
|
||||
unsigned end = begin + 1;
|
||||
if (isa<AffineReadOpInterface>(op))
|
||||
end++;
|
||||
setScheduleValue(op, begin, end);
|
||||
return end;
|
||||
setScheduleValue(op, begin, begin + 2);
|
||||
else
|
||||
setScheduleValue(op, begin, begin + 1);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -501,56 +503,26 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) {
|
|||
return II;
|
||||
}
|
||||
|
||||
Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
|
||||
// Set an attribute indicating the trip count. For now, we assume all loops
|
||||
// have static loop bound.
|
||||
if (auto tripCount = getConstantTripCount(op))
|
||||
setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
|
||||
else
|
||||
return Optional<unsigned>();
|
||||
|
||||
bool HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
|
||||
unsigned end = begin;
|
||||
auto &loopBlock = op.getLoopBody().front();
|
||||
|
||||
// Collect load and store operations in the loop block for solving possible
|
||||
// dependencies.
|
||||
// TODO: include CallOps, how? Maybe we need to somehow analyze the memory
|
||||
// access behavior of the CallOp.
|
||||
MemAccessesMap map;
|
||||
getMemAccessesMap(loopBlock, map);
|
||||
|
||||
// Estimate the loop block.
|
||||
if (auto schedule = estimateBlock(loopBlock, begin)) {
|
||||
end = max(end, schedule.getValue().second);
|
||||
begin = max(begin, schedule.getValue().first);
|
||||
} else
|
||||
return Optional<unsigned>();
|
||||
return false;
|
||||
|
||||
// If the current loop is annotated as pipeline, extra dependency and
|
||||
// resource aware II analysis will be executed.
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
// Calculate latency of each iteration.
|
||||
auto iterLatency = end - begin;
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
// Set an attribute indicating the trip count. For now, we assume all loops
|
||||
// have static loop bound.
|
||||
if (auto tripCount = getConstantTripCount(op))
|
||||
setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
|
||||
else
|
||||
return false;
|
||||
|
||||
// Calculate initial interval.
|
||||
auto II = max(getResMinII(map), getDepMinII(op, map));
|
||||
// auto II = max({getOpMinII(op), getResMinII(map), getDepMinII(op, map)});
|
||||
setAttrValue(op, "init_interval", II);
|
||||
|
||||
auto tripCount = getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "flatten_trip_count", tripCount);
|
||||
|
||||
auto latency = iterLatency + II * (tripCount - 1);
|
||||
setAttrValue(op, "latency", latency);
|
||||
|
||||
// Entering and leaving a loop will consume extra 2 clock cycles.
|
||||
setScheduleValue(op, begin, begin + latency + 2);
|
||||
return begin + latency + 2;
|
||||
}
|
||||
|
||||
// If the current loop is annotated as flatten, it will be flattened into
|
||||
// the child pipelined loop. This will increase the flattened loop trip count
|
||||
// If the current loop is annotated as flatten, it will be flattened into the
|
||||
// child pipelined loop. This will increase the flattened loop trip count
|
||||
// without changing the iteration latency.
|
||||
if (getBoolAttrValue(op, "flatten")) {
|
||||
auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front());
|
||||
|
@ -571,25 +543,64 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineForOp op, unsigned begin) {
|
|||
|
||||
// Since the loop is flattened, it will no longer be entered and left.
|
||||
setScheduleValue(op, begin, begin + latency);
|
||||
return begin + latency;
|
||||
|
||||
// The resource utilization of flattened loop is equal to its child's.
|
||||
setAttrValue(op, "dsp", getUIntAttrValue(child, "dsp"));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Default case, calculate latency of each iteration.
|
||||
// Estimate the loop block resource utilization.
|
||||
auto resource = estimateResource(loopBlock);
|
||||
|
||||
// Calculate latency of each iteration.
|
||||
auto iterLatency = end - begin;
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
|
||||
// If the current loop is annotated as pipelined loop, extra dependency and
|
||||
// resource aware II analysis will be executed.
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
// Collect load and store operations in the loop block for solving possible
|
||||
// carried dependencies.
|
||||
// TODO: include CallOps, how? Maybe we need to somehow analyze the memory
|
||||
// access behavior of the CallOp.
|
||||
MemAccessesMap map;
|
||||
getMemAccessesMap(loopBlock, map);
|
||||
|
||||
// Calculate initial interval.
|
||||
auto II = max(getResMinII(map), getDepMinII(op, map));
|
||||
// auto II = max({getOpMinII(op), getResMinII(map), getDepMinII(op, map)});
|
||||
setAttrValue(op, "init_interval", II);
|
||||
|
||||
auto tripCount = getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "flatten_trip_count", tripCount);
|
||||
|
||||
auto latency = iterLatency + II * (tripCount - 1);
|
||||
setAttrValue(op, "latency", latency);
|
||||
|
||||
// Entering and leaving a loop will consume extra 2 clock cycles.
|
||||
setScheduleValue(op, begin, begin + latency + 2);
|
||||
|
||||
// TODO: For pipelined loop, we also need to consider the II when estimating
|
||||
// resource utilization.
|
||||
setAttrValue(op, "dsp", resource);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Default case (not flattend or pipelined), calculate latency and resource
|
||||
// utilization accordingly.
|
||||
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "latency", latency);
|
||||
|
||||
setScheduleValue(op, begin, begin + latency + 2);
|
||||
return begin + latency + 2;
|
||||
setAttrValue(op, "dsp", resource);
|
||||
return true;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Other Operation Handlers
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
|
||||
bool HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
|
||||
unsigned end = begin;
|
||||
auto thenBlock = op.getThenBlock();
|
||||
|
||||
|
@ -597,7 +608,7 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
|
|||
if (auto schedule = estimateBlock(*thenBlock, begin))
|
||||
end = max(end, schedule.getValue().second);
|
||||
else
|
||||
return Optional<unsigned>();
|
||||
return false;
|
||||
|
||||
// Handle else block if required.
|
||||
if (op.hasElse()) {
|
||||
|
@ -606,16 +617,16 @@ Optional<unsigned> HLSCppEstimator::visitOp(AffineIfOp op, unsigned begin) {
|
|||
if (auto schedule = estimateBlock(*elseBlock, begin))
|
||||
end = max(end, schedule.getValue().second);
|
||||
else
|
||||
return Optional<unsigned>();
|
||||
return false;
|
||||
}
|
||||
|
||||
// In our assumption, AffineIfOp is completely transparent. Therefore, we set
|
||||
// a dummy schedule begin here.
|
||||
setScheduleValue(op, end, end);
|
||||
return end;
|
||||
return true;
|
||||
}
|
||||
|
||||
Optional<unsigned> HLSCppEstimator::visitOp(mlir::CallOp op, unsigned begin) {
|
||||
bool HLSCppEstimator::visitOp(mlir::CallOp op, unsigned begin) {
|
||||
auto callee = SymbolTable::lookupSymbolIn(func.getParentOp(), op.getCallee());
|
||||
auto subFunc = dyn_cast<FuncOp>(callee);
|
||||
assert(subFunc && "callable is not a function operation");
|
||||
|
@ -626,15 +637,66 @@ Optional<unsigned> HLSCppEstimator::visitOp(mlir::CallOp op, unsigned begin) {
|
|||
// We assume enter and leave the subfunction require extra 2 clock cycles.
|
||||
if (auto subLatency = getUIntAttrValue(subFunc, "latency")) {
|
||||
setScheduleValue(op, begin, begin + subLatency + 2);
|
||||
return begin + subLatency + 1;
|
||||
return true;
|
||||
} else
|
||||
return Optional<unsigned>();
|
||||
return false;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Block Scheduler and Estimator
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
unsigned HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap,
|
||||
ResourceMap &mulFMap) {
|
||||
unsigned loopResource = 0;
|
||||
for (auto &op : block) {
|
||||
auto begin = getUIntAttrValue(&op, "schedule_begin");
|
||||
auto end = getUIntAttrValue(&op, "schedule_end");
|
||||
|
||||
// Accumulate the resource utilization of each operation.
|
||||
if (isa<AddFOp>(op))
|
||||
for (unsigned i = begin; i < end; ++i)
|
||||
addFMap[i]++;
|
||||
|
||||
else if (isa<MulFOp>(op))
|
||||
for (unsigned i = begin; i < end; ++i)
|
||||
mulFMap[i]++;
|
||||
|
||||
else if (isa<AffineForOp>(op))
|
||||
loopResource += getUIntAttrValue(&op, "dsp");
|
||||
|
||||
else if (auto ifOp = dyn_cast<AffineIfOp>(op)) {
|
||||
// AffineIfOp is transparent during scheduling, thus here we recursively
|
||||
// enter each if block.
|
||||
loopResource += getResourceMap(*ifOp.getThenBlock(), addFMap, mulFMap);
|
||||
if (ifOp.hasElse())
|
||||
loopResource += getResourceMap(*ifOp.getElseBlock(), addFMap, mulFMap);
|
||||
}
|
||||
}
|
||||
return loopResource;
|
||||
}
|
||||
|
||||
unsigned HLSCppEstimator::estimateResource(Block &block) {
|
||||
ResourceMap addFMap;
|
||||
ResourceMap mulFMap;
|
||||
unsigned loopResource = getResourceMap(block, addFMap, mulFMap);
|
||||
|
||||
// Find the max resource utilization across all schedule levels.
|
||||
unsigned maxAddF = 0;
|
||||
for (auto level : addFMap)
|
||||
maxAddF = max(maxAddF, level.second);
|
||||
|
||||
unsigned maxMulF = 0;
|
||||
for (auto level : mulFMap)
|
||||
maxMulF = max(maxMulF, level.second);
|
||||
|
||||
// We assume the loop resource utilization cannot be shared. Therefore, the
|
||||
// overall resource utilization is loops' plus other operstions'. According to
|
||||
// profiling, floating-point add and muliply will consume 2 and 3 DSP units,
|
||||
// respectively.
|
||||
return loopResource + maxAddF * 2 + maxMulF * 3;
|
||||
}
|
||||
|
||||
/// Estimate the latency of a block with ALAP scheduling strategy, return the
|
||||
/// end level of schedule. Meanwhile, the input begin will also be updated if
|
||||
/// required (typically happens in AffineForOps).
|
||||
|
@ -663,8 +725,8 @@ HLSCppEstimator::estimateBlock(Block &block, unsigned begin) {
|
|||
}
|
||||
|
||||
// Estimate the current operation.
|
||||
if (auto scheduleEnd = dispatchVisitor(op, opBegin))
|
||||
opEnd = max(opEnd, scheduleEnd.getValue());
|
||||
if (dispatchVisitor(op, opBegin))
|
||||
opEnd = max(opEnd, getUIntAttrValue(op, "schedule_end"));
|
||||
else
|
||||
return Optional<std::pair<unsigned, unsigned>>();
|
||||
|
||||
|
@ -724,6 +786,9 @@ void HLSCppEstimator::estimateFunc() {
|
|||
// variable loop bound.
|
||||
setAttrValue(func, "latency", std::string("unknown"));
|
||||
}
|
||||
|
||||
// Estimate the resource utilization of the function.
|
||||
setAttrValue(func, "dsp", estimateResource(func.front()));
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -1413,10 +1413,10 @@ void ModuleEmitter::emitInfoAndNewLine(Operation *op) {
|
|||
os << "," << end.getUInt() << ")";
|
||||
|
||||
// Print loop information.
|
||||
if (auto interval = op->getAttrOfType<IntegerAttr>("init_interval"))
|
||||
os << ", interval=" << interval.getUInt();
|
||||
if (auto iteration = op->getAttrOfType<IntegerAttr>("iter_latency"))
|
||||
os << ", iteration=" << iteration.getUInt();
|
||||
if (auto II = op->getAttrOfType<IntegerAttr>("init_interval"))
|
||||
os << ", II=" << II.getUInt();
|
||||
if (auto latency = op->getAttrOfType<IntegerAttr>("iter_latency"))
|
||||
os << ", latency=" << latency.getUInt();
|
||||
|
||||
os << "\n";
|
||||
}
|
||||
|
@ -1449,7 +1449,10 @@ void ModuleEmitter::emitFunction(FuncOp func) {
|
|||
os << "/// This is top function.\n";
|
||||
|
||||
if (auto latency = func.getAttrOfType<IntegerAttr>("latency"))
|
||||
os << "/// Function latency is " << latency.getUInt() << ".\n";
|
||||
os << "/// Latency=" << latency.getUInt() << "\n";
|
||||
|
||||
if (auto dsp = func.getAttrOfType<IntegerAttr>("dsp"))
|
||||
os << "/// DSP=" << dsp.getUInt() << "\n";
|
||||
|
||||
// Emit function signature.
|
||||
os << "void " << func.getName() << "(\n";
|
||||
|
|
Loading…
Reference in New Issue