[QoREstimation] Implement the getAccessNum method: it takes partition information and counts the number of accesses to each partition of each dimension for each load/store op, which helps to better estimate II and iteration latency
This commit is contained in:
parent
58655710f5
commit
28926c5cc2
|
@ -35,6 +35,14 @@ public:
|
|||
return op->getAttrOfType<StringAttr>(name).getValue();
|
||||
}
|
||||
|
||||
StringRef getPartitionType(ArrayOp *op, unsigned dim) {
|
||||
return op->partition_type()[dim].cast<StringAttr>().getValue();
|
||||
}
|
||||
|
||||
unsigned getPartitionFactor(ArrayOp *op, unsigned dim) {
|
||||
return op->partition_factor()[dim].cast<IntegerAttr>().getUInt();
|
||||
}
|
||||
|
||||
/// Set value methods.
|
||||
void setAttrValue(Operation *op, StringRef name, unsigned value) {
|
||||
op->setAttr(name, builder.getUI32IntegerAttr(value));
|
||||
|
@ -48,6 +56,19 @@ public:
|
|||
op->setAttr(name, builder.getStringAttr(value));
|
||||
}
|
||||
|
||||
/// Get expression methods.
|
||||
AffineExpr getSymbolExpr(unsigned value) {
|
||||
return getAffineSymbolExpr(value, builder.getContext());
|
||||
}
|
||||
|
||||
AffineExpr getDimExpr(unsigned value) {
|
||||
return getAffineDimExpr(value, builder.getContext());
|
||||
}
|
||||
|
||||
AffineExpr getConstExpr(unsigned value) {
|
||||
return getAffineConstantExpr(value, builder.getContext());
|
||||
}
|
||||
|
||||
private:
|
||||
OpBuilder &builder;
|
||||
};
|
||||
|
@ -61,8 +82,6 @@ class HLSCppAnalyzer : public HLSCppVisitorBase<HLSCppAnalyzer, bool>,
|
|||
public:
|
||||
explicit HLSCppAnalyzer(OpBuilder &builder) : HLSCppToolBase(builder) {}
|
||||
|
||||
bool inPipeline;
|
||||
|
||||
bool visitUnhandledOp(Operation *op) { return true; }
|
||||
|
||||
using HLSCppVisitorBase::visitOp;
|
||||
|
@ -78,45 +97,61 @@ public:
|
|||
// HLSCppEstimator Class Declaration
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// For storing the scheduled time stamp of operations.
|
||||
using OpScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
|
||||
|
||||
// For storing the memory access operations indexed by their target memory
|
||||
// value symbol.
|
||||
using MemAccess = SmallVector<Operation *, 4>;
|
||||
using MemAccessDict = llvm::SmallDenseMap<Value, MemAccess, 16>;
|
||||
|
||||
// For storing memory access and schedule information of pipelined region.
|
||||
struct PipelineInfo {
|
||||
PipelineInfo(unsigned baseII) : II(baseII) {}
|
||||
|
||||
unsigned II;
|
||||
OpScheduleMap opScheduleMap;
|
||||
MemAccessDict memLoadDict;
|
||||
MemAccessDict memStoreDict;
|
||||
};
|
||||
|
||||
// For storing loop induction information.
|
||||
struct InductionInfo {
|
||||
InductionInfo(unsigned lowerBound, unsigned upperBound, unsigned step)
|
||||
: lowerBound(lowerBound), upperBound(upperBound), step(step) {}
|
||||
|
||||
unsigned lowerBound;
|
||||
unsigned upperBound;
|
||||
unsigned step;
|
||||
};
|
||||
using InductionInfoList = SmallVector<InductionInfo, 8>;
|
||||
|
||||
// This records the number of accesses for each partition.
|
||||
using AccessNum = SmallVector<unsigned, 16>;
|
||||
// This records the AccessNum of each dimension of an array.
|
||||
using AccessNumList = SmallVector<AccessNum, 8>;
|
||||
|
||||
class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
|
||||
public HLSCppToolBase {
|
||||
public:
|
||||
explicit HLSCppEstimator(OpBuilder &builder, std::string targetSpecPath,
|
||||
std::string opLatencyPath);
|
||||
|
||||
// For storing the scheduled time stamp of operations;
|
||||
using ScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
|
||||
|
||||
// For storing the memory access operations indexed by their target memory
|
||||
// value symbol.
|
||||
using MemAccess = std::pair<Value, Operation *>;
|
||||
using MemAccessList = SmallVector<MemAccess, 16>;
|
||||
|
||||
// For storing required memory ports for each partition of each array.
|
||||
using MemPort = SmallVector<unsigned, 16>;
|
||||
using MemPortMap = llvm::SmallDenseMap<Value, MemPort, 16>;
|
||||
|
||||
// This flag indicates that currently the estimator is in a pipelined region,
|
||||
// which will impact the estimation strategy.
|
||||
bool inPipeline;
|
||||
|
||||
bool visitUnhandledOp(Operation *op) { return true; }
|
||||
|
||||
using HLSCppVisitorBase::visitOp;
|
||||
/// These methods can estimate the performance and resource utilization of a
|
||||
/// specific MLIR structure, and update them in procParams or memoryParams.
|
||||
bool visitOp(AffineForOp op);
|
||||
bool visitOp(AffineIfOp op);
|
||||
|
||||
/// These methods are used for searching longest path in a DAG.
|
||||
void alignBlockSchedule(Block &block, ScheduleMap &opScheduleMap,
|
||||
unsigned opSchedule);
|
||||
unsigned getBlockSchedule(Block &block, ScheduleMap &opScheduleMap);
|
||||
unsigned getBlockII(Block &block, ScheduleMap &opScheduleMap,
|
||||
MemAccessList &memLoadList, MemAccessList &memStoreList,
|
||||
unsigned initInterval);
|
||||
void setBlockSchedule(Block &block, unsigned opSchedule,
|
||||
OpScheduleMap &opScheduleMap);
|
||||
unsigned getBlockSchedule(Block &block, bool innerFlatten,
|
||||
OpScheduleMap &opScheduleMap);
|
||||
|
||||
void getPipelineInfo(Block &block, PipelineInfo &info);
|
||||
|
||||
template <typename OpType> void getAccessNum(OpType op, ArrayOp arrayOp);
|
||||
|
||||
/// MLIR component estimators.
|
||||
void estimateOperation(Operation *op);
|
||||
void estimateFunc(FuncOp func);
|
||||
void estimateBlock(Block &block);
|
||||
|
|
|
@ -22,6 +22,7 @@ def ArrayPragmaOp : HLSCppOp<"array_pragma", [PragmaOpInterface]> {
|
|||
DefaultValuedAttr<PositiveUI32Attr, "1024"> : $interface_depth,
|
||||
|
||||
// BindStorage-related attributes.
|
||||
DefaultValuedAttr<BoolAttr, "false"> : $storage,
|
||||
DefaultValuedAttr<StorageTypeAttr, "ram_2p"> : $storage_type,
|
||||
DefaultValuedAttr<StorageImplAttr, "bram"> : $storage_impl,
|
||||
|
||||
|
@ -50,8 +51,9 @@ def LoopPragmaOp : HLSCppOp<"loop_pragma", [
|
|||
DefaultValuedAttr<BoolAttr, "false"> : $pipeline,
|
||||
DefaultValuedAttr<PositiveUI32Attr, "1"> : $pipeline_II,
|
||||
|
||||
// Unroll-related attributes.
|
||||
DefaultValuedAttr<PositiveUI32Attr, "1"> : $unroll_factor
|
||||
// Loop-related attributes.
|
||||
DefaultValuedAttr<BoolAttr, "false"> : $flatten,
|
||||
DefaultValuedAttr<BoolAttr, "false"> : $unroll
|
||||
);
|
||||
|
||||
let assemblyFormat = [{attr-dict}];
|
||||
|
|
|
@ -31,18 +31,19 @@ def ArrayOp : HLSCppOp<"array", [SameOperandsAndResultType]> {
|
|||
let arguments = (ins Type<IsShapedTypePred> : $input,
|
||||
|
||||
// Interface-related attributes.
|
||||
OptionalAttr<BoolAttr> : $interface,
|
||||
OptionalAttr<InterfaceModeAttr> : $interface_mode,
|
||||
OptionalAttr<PositiveUI32Attr> : $interface_depth,
|
||||
DefaultValuedAttr<BoolAttr, "false"> : $interface,
|
||||
DefaultValuedAttr<InterfaceModeAttr, "m_axi"> : $interface_mode,
|
||||
DefaultValuedAttr<PositiveUI32Attr, "1024"> : $interface_depth,
|
||||
|
||||
// BindStorage-related attributes.
|
||||
OptionalAttr<StorageTypeAttr> : $storage_type,
|
||||
OptionalAttr<StorageImplAttr> : $storage_impl,
|
||||
DefaultValuedAttr<BoolAttr, "false"> : $storage,
|
||||
DefaultValuedAttr<StorageTypeAttr, "ram_2p"> : $storage_type,
|
||||
DefaultValuedAttr<StorageImplAttr, "bram"> : $storage_impl,
|
||||
|
||||
// ArrayPartition-related attributes.
|
||||
OptionalAttr<BoolAttr> : $partition,
|
||||
OptionalAttr<PartitionTypeArrayAttr> : $partition_type,
|
||||
OptionalAttr<PositiveUI32ArrayAttr> : $partition_factor
|
||||
DefaultValuedAttr<BoolAttr, "false"> : $partition,
|
||||
DefaultValuedAttr<PartitionTypeArrayAttr, "{}"> : $partition_type,
|
||||
DefaultValuedAttr<PositiveUI32ArrayAttr, "{}"> : $partition_factor
|
||||
);
|
||||
|
||||
let results = (outs Type<IsShapedTypePred> : $output);
|
||||
|
|
|
@ -11,10 +11,6 @@ using namespace mlir;
|
|||
using namespace scalehls;
|
||||
using namespace hlscpp;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Utils
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// HLSCppAnalyzer Class Definition
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -27,33 +23,30 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
|
|||
// Recursively analyze all children.
|
||||
analyzeBlock(body.front());
|
||||
|
||||
// Set an attribute indicating iteration number .
|
||||
// Set an attribute indicating trip count.
|
||||
if (!op.hasConstantLowerBound() || !op.hasConstantUpperBound())
|
||||
op.emitError("has variable upper or lower bound.");
|
||||
|
||||
unsigned iterNumber =
|
||||
(op.getConstantUpperBound() - op.getConstantLowerBound()) /
|
||||
getUIntAttrValue(op, "unroll_factor") / op.getStep();
|
||||
unsigned tripCount =
|
||||
(op.getConstantUpperBound() - op.getConstantLowerBound()) / op.getStep();
|
||||
setAttrValue(op, "trip_count", tripCount);
|
||||
|
||||
setAttrValue(op, "iter_number", iterNumber);
|
||||
|
||||
// Set an attribute indicating this loop is perfect or not.
|
||||
// Set attributes indicating this loop is perfect or not.
|
||||
unsigned opNum = 0;
|
||||
unsigned loopNum = 0;
|
||||
unsigned childNum = 0;
|
||||
bool childPerfect = false;
|
||||
for (auto &bodyOp : body.front()) {
|
||||
if (!isa<AffineYieldOp>(bodyOp))
|
||||
opNum += 1;
|
||||
|
||||
if (auto child = dyn_cast<AffineForOp>(bodyOp)) {
|
||||
loopNum += 1;
|
||||
childNum += 1;
|
||||
childPerfect = getBoolAttrValue(child, "perfect");
|
||||
}
|
||||
}
|
||||
|
||||
if (opNum == 1 && loopNum == 1 && childPerfect)
|
||||
if (opNum == 1 && childNum == 1 && childPerfect)
|
||||
setAttrValue(op, "perfect", true);
|
||||
else if (loopNum == 0)
|
||||
else if (childNum == 0)
|
||||
setAttrValue(op, "perfect", true);
|
||||
else
|
||||
setAttrValue(op, "perfect", false);
|
||||
|
@ -96,8 +89,6 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
|
|||
string opLatencyPath)
|
||||
: HLSCppToolBase(builder) {
|
||||
|
||||
inPipeline = false;
|
||||
|
||||
INIReader targetSpec(targetSpecPath);
|
||||
if (targetSpec.ParseError())
|
||||
llvm::outs() << "error: target spec file parse fail, please refer to "
|
||||
|
@ -114,18 +105,17 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
|
|||
llvm::outs() << latency << "\n";
|
||||
}
|
||||
|
||||
void HLSCppEstimator::alignBlockSchedule(Block &block,
|
||||
ScheduleMap &opScheduleMap,
|
||||
unsigned opSchedule) {
|
||||
void HLSCppEstimator::setBlockSchedule(Block &block, unsigned opSchedule,
|
||||
OpScheduleMap &opScheduleMap) {
|
||||
for (auto &op : block) {
|
||||
if (auto child = dyn_cast<mlir::AffineForOp>(op))
|
||||
alignBlockSchedule(child.getRegion().front(), opScheduleMap, opSchedule);
|
||||
if (auto child = dyn_cast<AffineForOp>(op))
|
||||
setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
|
||||
opScheduleMap[&op] = opSchedule;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned HLSCppEstimator::getBlockSchedule(Block &block,
|
||||
ScheduleMap &opScheduleMap) {
|
||||
unsigned HLSCppEstimator::getBlockSchedule(Block &block, bool innerUnroll,
|
||||
OpScheduleMap &opScheduleMap) {
|
||||
unsigned blockSchedule = 0;
|
||||
|
||||
for (auto &op : block) {
|
||||
|
@ -139,14 +129,17 @@ unsigned HLSCppEstimator::getBlockSchedule(Block &block,
|
|||
|
||||
// Add latency of the current operation.
|
||||
unsigned childSchedule = 0;
|
||||
if (auto child = dyn_cast<mlir::AffineForOp>(op)) {
|
||||
opSchedule += getUIntAttrValue(child, "latency");
|
||||
if (inPipeline)
|
||||
childSchedule =
|
||||
getBlockSchedule(child.getRegion().front(), opScheduleMap);
|
||||
else
|
||||
alignBlockSchedule(child.getRegion().front(), opScheduleMap,
|
||||
opSchedule);
|
||||
if (auto child = dyn_cast<AffineForOp>(op)) {
|
||||
if (innerUnroll) {
|
||||
setAttrValue(child, "unroll", true);
|
||||
setAttrValue(child, "flatten", false);
|
||||
childSchedule = getBlockSchedule(child.getRegion().front(),
|
||||
/*innerUnroll=*/true, opScheduleMap);
|
||||
} else {
|
||||
// Two extra clock cycles will be required to enter and exit child loop.
|
||||
opSchedule += getUIntAttrValue(child, "latency") + 2;
|
||||
setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
|
||||
}
|
||||
} else {
|
||||
// For now we make a simple assumption that all standard operations have a
|
||||
// unit latency.
|
||||
|
@ -160,52 +153,157 @@ unsigned HLSCppEstimator::getBlockSchedule(Block &block,
|
|||
return blockSchedule;
|
||||
}
|
||||
|
||||
unsigned HLSCppEstimator::getBlockII(Block &block, ScheduleMap &opScheduleMap,
|
||||
MemAccessList &memLoadList,
|
||||
MemAccessList &memStoreList,
|
||||
unsigned initInterval) {
|
||||
void HLSCppEstimator::getPipelineInfo(Block &block, PipelineInfo &info) {
|
||||
for (auto &op : block) {
|
||||
|
||||
// Handle load operations.
|
||||
// Handle load operations and RAW dependencies.
|
||||
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
|
||||
for (auto memStore : memStoreList) {
|
||||
if (loadOp.getMemRef() == memStore.first) {
|
||||
// TODO: For now, we simply assume the distance between dependency
|
||||
// always takes 1. Thus the II is equal to the latency between
|
||||
// dependency.
|
||||
unsigned RAWLatency =
|
||||
opScheduleMap[loadOp] - opScheduleMap[memStore.second];
|
||||
initInterval = max(initInterval, RAWLatency);
|
||||
}
|
||||
for (auto prevOp : info.memStoreDict[loadOp.getMemRef()]) {
|
||||
unsigned RAWLatency =
|
||||
info.opScheduleMap[loadOp] - info.opScheduleMap[prevOp];
|
||||
info.II = max(info.II, RAWLatency);
|
||||
}
|
||||
memLoadList.push_back(MemAccess(loadOp.getMemRef(), loadOp));
|
||||
info.memLoadDict[loadOp.getMemRef()].push_back(loadOp);
|
||||
}
|
||||
|
||||
// Handle Store operations.
|
||||
// Handle store operations and WAR/WAW dependencies.
|
||||
else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
|
||||
for (auto memStore : memStoreList) {
|
||||
if (loadOp.getMemRef() == memStore.first) {
|
||||
unsigned WAWLatency =
|
||||
opScheduleMap[storeOp] - opScheduleMap[memStore.second];
|
||||
initInterval = max(initInterval, WAWLatency);
|
||||
}
|
||||
for (auto prevOp : info.memLoadDict[storeOp.getMemRef()]) {
|
||||
unsigned WARLatency =
|
||||
info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
|
||||
info.II = max(info.II, WARLatency);
|
||||
}
|
||||
for (auto memLoad : memLoadList) {
|
||||
if (storeOp.getMemRef() == memLoad.first) {
|
||||
unsigned WARLatency =
|
||||
opScheduleMap[storeOp] - opScheduleMap[memLoad.second];
|
||||
initInterval = max(initInterval, WARLatency);
|
||||
}
|
||||
for (auto prevOp : info.memStoreDict[storeOp.getMemRef()]) {
|
||||
unsigned WAWLatency =
|
||||
info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
|
||||
info.II = max(info.II, WAWLatency);
|
||||
}
|
||||
memStoreList.push_back(MemAccess(storeOp.getMemRef(), storeOp));
|
||||
info.memStoreDict[storeOp.getMemRef()].push_back(storeOp);
|
||||
}
|
||||
|
||||
// Recursively handle child loops.
|
||||
else if (auto child = dyn_cast<AffineForOp>(op))
|
||||
initInterval = getBlockII(child.getRegion().front(), opScheduleMap,
|
||||
memLoadList, memStoreList, initInterval);
|
||||
getPipelineInfo(child.getRegion().front(), info);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OpType>
|
||||
void HLSCppEstimator::getAccessNum(OpType op, ArrayOp arrayOp) {
|
||||
InductionInfoList inductionInfoList;
|
||||
SmallVector<AffineExpr, 8> replacements;
|
||||
SmallVector<unsigned, 8> unrollDims;
|
||||
unsigned unrollTripCount = 1;
|
||||
|
||||
// Collect loop information, including induction & unroll information,
|
||||
// etc. Note that we assume all operands are dims.
|
||||
unsigned operandIdx = 0;
|
||||
for (auto operand : op.getMapOperands()) {
|
||||
if (auto forOp = getForInductionVarOwner(operand)) {
|
||||
auto lowerBound = forOp.getConstantLowerBound();
|
||||
auto upperBound = forOp.getConstantUpperBound();
|
||||
auto step = forOp.getStep();
|
||||
inductionInfoList.push_back(InductionInfo(lowerBound, upperBound, step));
|
||||
|
||||
auto unroll = getBoolAttrValue(forOp, "unroll");
|
||||
auto tripCount = getUIntAttrValue(forOp, "trip_count");
|
||||
if (unroll) {
|
||||
unrollDims.push_back(operandIdx);
|
||||
unrollTripCount *= tripCount;
|
||||
}
|
||||
|
||||
if (unroll)
|
||||
replacements.push_back(getConstExpr(lowerBound));
|
||||
else
|
||||
replacements.push_back(getDimExpr(operandIdx));
|
||||
} else
|
||||
op.emitError("has index constructed by dynamic values.");
|
||||
operandIdx += 1;
|
||||
}
|
||||
|
||||
return initInterval;
|
||||
// Initialize number of accesses for each partition of each array
|
||||
// dimension as zero.
|
||||
AccessNumList accessNumList;
|
||||
for (auto dim : unrollDims) {
|
||||
AccessNum accessNum;
|
||||
if (arrayOp.partition()) {
|
||||
for (unsigned i = 0; i < getPartitionFactor(&arrayOp, dim); ++i)
|
||||
accessNum.push_back(0);
|
||||
} else
|
||||
accessNum.push_back(0);
|
||||
accessNumList.push_back(accessNum);
|
||||
}
|
||||
|
||||
// Trace all possible index to find potential violations regarding
|
||||
// memory ports number. Violations may cause an increase of iteration
|
||||
// latency or initiation interval. This will update the accessNumList.
|
||||
for (unsigned i = 0; i < unrollTripCount; ++i) {
|
||||
|
||||
// Calculate number of accesses for each partition of each array dimension.
|
||||
unsigned idx = 0;
|
||||
for (auto dim : unrollDims) {
|
||||
AffineExpr expr = op.getAffineMap().getResult(dim);
|
||||
auto indexExpr = expr.replaceDimsAndSymbols(replacements, {});
|
||||
|
||||
// Calculate which partition the index falls into.
|
||||
if (arrayOp.partition()) {
|
||||
auto type = getPartitionType(&arrayOp, dim);
|
||||
auto factor = getPartitionFactor(&arrayOp, dim);
|
||||
if (type == "cyclic")
|
||||
indexExpr = indexExpr % getConstExpr(factor);
|
||||
else if (type == "block") {
|
||||
auto dimSize = arrayOp.getType().cast<ShapedType>().getShape()[dim];
|
||||
indexExpr =
|
||||
indexExpr.floorDiv(getConstExpr((dimSize + factor - 1) / factor));
|
||||
}
|
||||
} else
|
||||
indexExpr = getConstExpr(0);
|
||||
|
||||
// If the partition index is a constant, count one access for that partition.
|
||||
if (auto constExpr = indexExpr.dyn_cast<AffineConstantExpr>()) {
|
||||
auto partitionId = constExpr.getValue();
|
||||
accessNumList[idx][partitionId] += 1;
|
||||
} else {
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
|
||||
// Update replacement.
|
||||
unsigned order = 0;
|
||||
for (auto dim : unrollDims) {
|
||||
auto value = replacements[dim].cast<AffineConstantExpr>().getValue();
|
||||
|
||||
// The little-end value will always increase with a stride of
|
||||
// step.
|
||||
if (order == 0)
|
||||
value += inductionInfoList[dim].step;
|
||||
|
||||
// The value of the current dimension should return to lowerBound
|
||||
// if it is greater than or equal to upperBound.
|
||||
if (value >= inductionInfoList[dim].upperBound) {
|
||||
value = inductionInfoList[dim].lowerBound;
|
||||
|
||||
// Update the value of the next dimension.
|
||||
if (order < unrollDims.size() - 1) {
|
||||
auto nextDim = unrollDims[order + 1];
|
||||
auto nextValue =
|
||||
replacements[nextDim].cast<AffineConstantExpr>().getValue();
|
||||
nextValue += inductionInfoList[nextDim].step;
|
||||
replacements[nextDim] = getConstExpr(nextValue);
|
||||
}
|
||||
}
|
||||
|
||||
// Update the value of the current dimension.
|
||||
replacements[dim] = getConstExpr(value);
|
||||
order += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Print the number of accesses of each partition for each dimension.
|
||||
for (auto accessNum : accessNumList) {
|
||||
llvm::outs() << "new dim\n";
|
||||
for (auto num : accessNum) {
|
||||
llvm::outs() << num << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool HLSCppEstimator::visitOp(AffineForOp op) {
|
||||
|
@ -213,87 +311,95 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
|
|||
if (body.getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
inPipeline = true;
|
||||
// If loop is unrolled, all inner loops will be unrolled accordingly.
|
||||
if (getBoolAttrValue(op, "unroll")) {
|
||||
setAttrValue(op, "pipeline", false);
|
||||
setAttrValue(op, "flatten", false);
|
||||
op.emitRemark("all inner loops are automatically unrolled.");
|
||||
|
||||
ScheduleMap opScheduleMap;
|
||||
auto iterLatency = getBlockSchedule(body.front(), opScheduleMap);
|
||||
getUIntAttrValue(op, "iter_latency");
|
||||
|
||||
// For now we make a simple assumption that II is equal to 1.
|
||||
auto iterNumber = getUIntAttrValue(op, "iter_number");
|
||||
setAttrValue(op, "pipeline_iter", iterNumber);
|
||||
|
||||
// Calculate initial interval.
|
||||
MemAccessList memLoadList;
|
||||
MemAccessList memStoreList;
|
||||
unsigned initInterval = 1;
|
||||
initInterval = getBlockII(body.front(), opScheduleMap, memLoadList,
|
||||
memStoreList, initInterval);
|
||||
|
||||
// Calculate initial interval caused by limited memory ports. For now, we
|
||||
// just consider the memory access inside of the pipeline region, aka the
|
||||
// extra memory ports caused by unroll optimization out of the pipeline
|
||||
// region are not calculated.
|
||||
MemPortMap memLoadPortMap;
|
||||
MemPortMap memStorePortMap;
|
||||
for (auto &op : body.front()) {
|
||||
}
|
||||
|
||||
setAttrValue(op, "pipeline_II", initInterval);
|
||||
setAttrValue(op, "latency", iterLatency + initInterval * (iterNumber - 1));
|
||||
OpScheduleMap opScheduleMap;
|
||||
auto latency =
|
||||
getBlockSchedule(body.front(), /*innerUnroll=*/true, opScheduleMap);
|
||||
setAttrValue(op, "latency", latency);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the loop is not pipelined, the estimation is much different and requires
|
||||
// to recursively enter each child loop for estimating the overall latency of
|
||||
// the current loop.
|
||||
else {
|
||||
// Recursively estimate each operation, mainly AffineFor operation will be
|
||||
// differently handled for now.
|
||||
estimateBlock(body.front());
|
||||
// If loop is pipelined, the pipelined loop will be estimated as a whole since
|
||||
// all loops inside of a pipeline will be automatically fully unrolled.
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
setAttrValue(op, "flatten", true);
|
||||
op.emitRemark("all inner loops are automatically unrolled.");
|
||||
|
||||
// This simply means the current loop can be merged into the child loop
|
||||
// pipeline. This will increase the total IterNumber without changing the
|
||||
// IterLatency.
|
||||
if (inPipeline && getBoolAttrValue(op, "perfect")) {
|
||||
if (auto child = dyn_cast<AffineForOp>(
|
||||
std::next(op.getLoopBody().front().begin()))) {
|
||||
auto initInterval = getUIntAttrValue(child, "pipeline_II");
|
||||
auto iterLatency = getUIntAttrValue(child, "iter_latency");
|
||||
auto pipeIterNumber = getUIntAttrValue(child, "pipeline_iter") *
|
||||
getUIntAttrValue(op, "iter_number");
|
||||
// Calculate latency of each iteration.
|
||||
PipelineInfo pipelineInfo(/*baseII=*/1);
|
||||
auto iterLatency = getBlockSchedule(body.front(), /*innerUnroll=*/true,
|
||||
pipelineInfo.opScheduleMap);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
|
||||
setAttrValue(op, "pipeline_II", initInterval);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
setAttrValue(op, "pipeline_iter", pipeIterNumber);
|
||||
// For now we make a simple assumption that II is equal to 1.
|
||||
auto tripCount = getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "flatten_trip_count", tripCount);
|
||||
|
||||
setAttrValue(op, "latency",
|
||||
iterLatency + initInterval * (pipeIterNumber - 1));
|
||||
} else {
|
||||
inPipeline = false;
|
||||
op.emitError("is not a perfect loop.");
|
||||
// Collect pipeline information including II and memory access information.
|
||||
getPipelineInfo(body.front(), pipelineInfo);
|
||||
|
||||
// Calculate latency and II considering memory ports violations.
|
||||
for (auto &memLoad : pipelineInfo.memLoadDict) {
|
||||
auto arrayOp = dyn_cast<ArrayOp>(memLoad.first.getDefiningOp());
|
||||
if (!arrayOp)
|
||||
op.emitError("is accessing an array that is not defined by ArrayOp.");
|
||||
|
||||
for (auto loadOp : memLoad.second) {
|
||||
getAccessNum<AffineLoadOp>(cast<AffineLoadOp>(loadOp), arrayOp);
|
||||
}
|
||||
}
|
||||
|
||||
// This branch takes care of all unpipelined or imperfect loops.
|
||||
else {
|
||||
inPipeline = false;
|
||||
setAttrValue(op, "init_interval", pipelineInfo.II);
|
||||
setAttrValue(op, "latency",
|
||||
iterLatency + pipelineInfo.II * (tripCount - 1));
|
||||
return true;
|
||||
}
|
||||
|
||||
ScheduleMap opScheduleMap;
|
||||
auto iterLatency = getBlockSchedule(body.front(), opScheduleMap);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
// If the loop is not pipelined or unrolled, the estimation is different and
|
||||
// requires to recursively enter each child loop for estimating the overall
|
||||
// latency of the current loop.
|
||||
estimateBlock(body.front());
|
||||
|
||||
// For now we follow the COMBA approach for unrolled loops.
|
||||
unsigned latency = iterLatency;
|
||||
if (getUIntAttrValue(op, "iter_number") != 1)
|
||||
latency *= getUIntAttrValue(op, "iter_number") *
|
||||
getUIntAttrValue(op, "unroll_factor");
|
||||
setAttrValue(op, "latency", latency);
|
||||
// This simply means the current loop can be flattened into the child loop
|
||||
// pipeline. This will increase the flattened loop trip count without
|
||||
// changing the iteration latency. Note that this will be propagated above
|
||||
// until meeting an imperfect loop.
|
||||
if (getBoolAttrValue(op, "perfect")) {
|
||||
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
|
||||
if (getBoolAttrValue(child, "flatten")) {
|
||||
setAttrValue(op, "flatten", true);
|
||||
op.emitRemark("this loop is flattened into its child loop.");
|
||||
|
||||
// TODO: Calculate initial interval.
|
||||
setAttrValue(op, "iter_latency", (unsigned)1);
|
||||
auto II = getUIntAttrValue(child, "init_interval");
|
||||
auto iterLatency = getUIntAttrValue(child, "iter_latency");
|
||||
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
|
||||
getUIntAttrValue(op, "trip_count");
|
||||
|
||||
setAttrValue(op, "init_interval", II);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
setAttrValue(op, "flatten_trip_count", flattenTripCount);
|
||||
|
||||
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Default case, aka !unroll && !pipeline && !(perfect && child.flatten).
|
||||
setAttrValue(op, "flatten", false);
|
||||
|
||||
OpScheduleMap opScheduleMap;
|
||||
auto iterLatency =
|
||||
getBlockSchedule(body.front(), /*innerUnroll=*/false, opScheduleMap);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
|
||||
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "latency", latency);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -313,8 +419,9 @@ void HLSCppEstimator::estimateFunc(FuncOp func) {
|
|||
|
||||
estimateBlock(func.front());
|
||||
|
||||
ScheduleMap opScheduleMap;
|
||||
auto latency = getBlockSchedule(func.front(), opScheduleMap);
|
||||
OpScheduleMap opScheduleMap;
|
||||
auto latency =
|
||||
getBlockSchedule(func.front(), /*innerUnroll=*/false, opScheduleMap);
|
||||
setAttrValue(func, "latency", latency);
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,8 @@ static void convertBlock(Block &block) {
|
|||
bool insertArrayOp = false;
|
||||
if (operand.getKind() == Value::Kind::BlockArgument)
|
||||
insertArrayOp = true;
|
||||
else if (!isa<ArrayOp>(operand.getDefiningOp())) {
|
||||
else if (!isa<ArrayOp>(operand.getDefiningOp()) &&
|
||||
!isa<AssignOp>(operand.getDefiningOp())) {
|
||||
insertArrayOp = true;
|
||||
if (!arrayType.hasStaticShape())
|
||||
operand.getDefiningOp()->emitError(
|
||||
|
@ -51,8 +52,7 @@ static void convertBlock(Block &block) {
|
|||
// bram. Other attributes are not set here since they requires more
|
||||
// analysis to be determined.
|
||||
arrayOp.setAttr("interface", builder.getBoolAttr(false));
|
||||
arrayOp.setAttr("storage_type", builder.getStringAttr("ram_1p"));
|
||||
arrayOp.setAttr("storage_impl", builder.getStringAttr("bram"));
|
||||
arrayOp.setAttr("storage", builder.getBoolAttr(false));
|
||||
arrayOp.setAttr("partition", builder.getBoolAttr(false));
|
||||
}
|
||||
}
|
||||
|
@ -64,8 +64,8 @@ static void convertBlock(Block &block) {
|
|||
|
||||
// Set loop pragma attributes.
|
||||
forOp.setAttr("pipeline", builder.getBoolAttr(false));
|
||||
forOp.setAttr("pipeline_II", builder.getUI32IntegerAttr(1));
|
||||
forOp.setAttr("unroll_factor", builder.getUI32IntegerAttr(1));
|
||||
forOp.setAttr("unroll", builder.getBoolAttr(false));
|
||||
forOp.setAttr("flatten", builder.getBoolAttr(false));
|
||||
|
||||
convertBlock(forOp.getLoopBody().front());
|
||||
}
|
||||
|
|
|
@ -1034,20 +1034,18 @@ void ModuleEmitter::emitArray(ArrayOp *op) {}
|
|||
|
||||
/// Pragma operation emitters.
|
||||
void ModuleEmitter::emitLoopPragma(LoopPragmaOp *op) {
|
||||
indent();
|
||||
os << "#pragma HLS unroll";
|
||||
// TODO: default factor.
|
||||
os << " factor=" << op->unroll_factor();
|
||||
os << " skip_exit_check\n";
|
||||
|
||||
indent();
|
||||
os << "#pragma HLS pipeline";
|
||||
if (op->pipeline()) {
|
||||
if (op->pipeline())
|
||||
os << " II=" << op->pipeline_II();
|
||||
os << " rewind\n";
|
||||
} else
|
||||
else
|
||||
os << " off\n";
|
||||
|
||||
if (op->unroll()) {
|
||||
indent();
|
||||
os << "#pragma HLS unroll\n";
|
||||
}
|
||||
|
||||
// An empty line.
|
||||
os << "\n";
|
||||
}
|
||||
|
|
|
@ -2,19 +2,19 @@
|
|||
|
||||
// CHECK-LABEL: func @test_for
|
||||
func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
|
||||
%array0 = "hlscpp.array"(%arg0) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array1 = "hlscpp.array"(%arg1) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
//"hlscpp.array_pragma" (%arg0) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
|
||||
//"hlscpp.array_pragma" (%arg1) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
|
||||
affine.for %i = 0 to 16 {
|
||||
affine.for %j = 0 to 4 {
|
||||
affine.for %k = 0 to 4 {
|
||||
%0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
|
||||
%0 = affine.load %array0[%i, %j, %i + %k] : memref<16x4x4xindex>
|
||||
%1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
|
||||
%2 = muli %0, %1 : index
|
||||
affine.store %2, %array1[%i, %j, %k] : memref<16x4x4xindex>
|
||||
} {pipeline = false, pipeline_II = 1 : ui32, unroll_factor = 1 : ui32}
|
||||
} {pipeline = false, pipeline_II = 1 : ui32, unroll_factor = 1 : ui32}
|
||||
} {pipeline = false, pipeline_II = 1 : ui32, unroll_factor = 1 : ui32}
|
||||
} {pipeline = false, unroll = false, flatten = false}
|
||||
} {pipeline = true, unroll = false, flatten = false}
|
||||
} {pipeline = false, unroll = false, flatten = false}
|
||||
return
|
||||
}
|
||||
|
|
|
@ -3,13 +3,15 @@
|
|||
// CHECK-LABEL: func @test_conversion(
|
||||
// CHECK-SAME: %arg0: f32, %arg1: memref<16xf32>) -> (f32, memref<16xf32>, i32, tensor<2x2xi32>) attributes {dataflow = false} {
|
||||
func @test_conversion(%arg0: f32, %arg1: memref<16xf32>) -> (f32, memref<16xf32>, i32, tensor<2x2xi32>) {
|
||||
// CHECK: %[[VAL_0:.*]] = "hlscpp.array"(%[[ARG_1:.*]]) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (memref<16xf32>) -> memref<16xf32>
|
||||
// CHECK: %[[VAL_0:.*]] = "hlscpp.array"(%[[ARG_1:.*]]) {interface = false, partition = false, storage = false} : (memref<16xf32>) -> memref<16xf32>
|
||||
%c11_i32 = constant 11 : i32
|
||||
%cst = constant dense<[[11, 0], [0, -42]]> : tensor<2x2xi32>
|
||||
|
||||
// CHECK: %[[VAL_1:.*]] = "hlscpp.array"(%cst) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (tensor<2x2xi32>) -> tensor<2x2xi32>
|
||||
// CHECK: %[[VAL_2:.*]] = "hlscpp.assign"(%[[ARG_0:.*]]) : (f32) -> f32
|
||||
// CHECK: %[[VAL_2:.*]] = "hlscpp.assign"(%c11_i32) : (i32) -> i32
|
||||
// CHECK: return %[[VAL_2:.*]], %[[VAL_0:.*]], %[[VAL_3:.*]], %[[VAL_1:.*]] : f32, memref<16xf32>, i32, tensor<2x2xi32>
|
||||
// CHECK: %[[VAL_1:.*]] = "hlscpp.array"(%cst) {interface = false, partition = false, storage = false} : (tensor<2x2xi32>) -> tensor<2x2xi32>
|
||||
// CHECK: %[[VAL_2:.*]] = "hlscpp.assign"(%arg0) : (f32) -> f32
|
||||
// CHECK: %[[VAL_3:.*]] = "hlscpp.assign"(%[[VAL_0:.*]]) : (memref<16xf32>) -> memref<16xf32>
|
||||
// CHECK: %[[VAL_4:.*]] = "hlscpp.assign"(%c11_i32) : (i32) -> i32
|
||||
// CHECK: %[[VAL_5:.*]] = "hlscpp.assign"(%[[VAL_1:.*]]) : (tensor<2x2xi32>) -> tensor<2x2xi32>
|
||||
// CHECK: return %[[VAL_2:.*]], %[[VAL_3:.*]], %[[VAL_4:.*]], %[[VAL_5:.*]] : f32, memref<16xf32>, i32, tensor<2x2xi32>
|
||||
return %arg0, %arg1, %c11_i32, %cst : f32, memref<16xf32>, i32, tensor<2x2xi32>
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue