[QoREstimation] Implement the getAccessNum method. It takes partition information and counts the number of accesses to each partition of each dimension for every load/store op, which will help to better estimate the II and iteration latency.
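For reference, the partition mapping at the core of getAccessNum boils down to the following computation; this is a minimal standalone sketch (plain integers instead of AffineExprs, helper name hypothetical), not part of this commit:

// Map a constant affine index to a partition ID, mirroring the cyclic/block
// handling in the getAccessNum implementation below.
unsigned getPartitionId(int64_t index, llvm::StringRef type, unsigned factor,
int64_t dimSize) {
if (type == "cyclic")
return index % factor; // neighboring elements map to different partitions
if (type == "block") // contiguous chunks of size ceil(dimSize / factor)
return index / ((dimSize + factor - 1) / factor);
return 0; // array not partitioned along this dimension
}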

Hanchen Ye 2020-10-08 02:45:44 -05:00
parent 58655710f5
commit 28926c5cc2
8 changed files with 343 additions and 198 deletions

@@ -35,6 +35,14 @@ public:
return op->getAttrOfType<StringAttr>(name).getValue();
}
StringRef getPartitionType(ArrayOp *op, unsigned dim) {
return op->partition_type()[dim].cast<StringAttr>().getValue();
}
unsigned getPartitionFactor(ArrayOp *op, unsigned dim) {
return op->partition_factor()[dim].cast<IntegerAttr>().getUInt();
}
/// Set value methods.
void setAttrValue(Operation *op, StringRef name, unsigned value) {
op->setAttr(name, builder.getUI32IntegerAttr(value));
@@ -48,6 +56,19 @@ public:
op->setAttr(name, builder.getStringAttr(value));
}
/// Get expression methods.
AffineExpr getSymbolExpr(unsigned value) {
return getAffineSymbolExpr(value, builder.getContext());
}
AffineExpr getDimExpr(unsigned value) {
return getAffineDimExpr(value, builder.getContext());
}
AffineExpr getConstExpr(unsigned value) {
return getAffineConstantExpr(value, builder.getContext());
}
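// Illustrative usage (not part of this commit): these helpers compose into
// the partition-mapping expressions used by getAccessNum, e.g. a cyclic
// mapping of dimension 0 with factor 4:
// auto expr = getDimExpr(0) % getConstExpr(4);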
private:
OpBuilder &builder;
};
@@ -61,8 +82,6 @@ class HLSCppAnalyzer : public HLSCppVisitorBase<HLSCppAnalyzer, bool>,
public:
explicit HLSCppAnalyzer(OpBuilder &builder) : HLSCppToolBase(builder) {}
bool inPipeline;
bool visitUnhandledOp(Operation *op) { return true; }
using HLSCppVisitorBase::visitOp;
@@ -78,45 +97,61 @@ public:
// HLSCppEstimator Class Declaration
//===----------------------------------------------------------------------===//
// For storing the scheduled time stamp of operations.
using OpScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
// For storing each memory access operation, indexed by its target memory
// value symbol.
using MemAccess = SmallVector<Operation *, 4>;
using MemAccessDict = llvm::SmallDenseMap<Value, MemAccess, 16>;
// For storing memory access and schedule information of a pipelined region.
struct PipelineInfo {
PipelineInfo(unsigned baseII) : II(baseII) {}
unsigned II;
OpScheduleMap opScheduleMap;
MemAccessDict memLoadDict;
MemAccessDict memStoreDict;
};
// For storing loop induction information.
struct InductionInfo {
InductionInfo(unsigned lowerBound, unsigned upperBound, unsigned step)
: lowerBound(lowerBound), upperBound(upperBound), step(step) {}
unsigned lowerBound;
unsigned upperBound;
unsigned step;
};
using InductionInfoList = SmallVector<InductionInfo, 8>;
// This records the number of accesses for each partition.
using AccessNum = SmallVector<unsigned, 16>;
// This records the AccessNum of each dimension of an array.
using AccessNumList = SmallVector<AccessNum, 8>;
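// Illustrative example (not part of this commit): for a load with two
// unrolled dimensions, each partitioned with a factor of 2, an
// AccessNumList of {{2, 2}, {4, 0}} means dimension 0 spreads its accesses
// evenly across both partitions, while dimension 1 always hits its first
// partition.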
class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
public HLSCppToolBase {
public:
explicit HLSCppEstimator(OpBuilder &builder, std::string targetSpecPath,
std::string opLatencyPath);
// For storing the scheduled time stamp of operations.
using ScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
// For storing each memory access operation, indexed by its target memory
// value symbol.
using MemAccess = std::pair<Value, Operation *>;
using MemAccessList = SmallVector<MemAccess, 16>;
// For storing required memory ports for each partition of each array.
using MemPort = SmallVector<unsigned, 16>;
using MemPortMap = llvm::SmallDenseMap<Value, MemPort, 16>;
// This flag indicates that the estimator is currently in a pipelined region,
// which affects the estimation strategy.
bool inPipeline;
bool visitUnhandledOp(Operation *op) { return true; }
using HLSCppVisitorBase::visitOp;
/// These methods can estimate the performance and resource utilization of a
/// specific MLIR structure, and update them in procParams or memoryParams.
bool visitOp(AffineForOp op);
bool visitOp(AffineIfOp op);
/// These methods are used for searching longest path in a DAG.
void alignBlockSchedule(Block &block, ScheduleMap &opScheduleMap,
unsigned opSchedule);
unsigned getBlockSchedule(Block &block, ScheduleMap &opScheduleMap);
unsigned getBlockII(Block &block, ScheduleMap &opScheduleMap,
MemAccessList &memLoadList, MemAccessList &memStoreList,
unsigned initInterval);
void setBlockSchedule(Block &block, unsigned opSchedule,
OpScheduleMap &opScheduleMap);
unsigned getBlockSchedule(Block &block, bool innerFlatten,
OpScheduleMap &opScheduleMap);
void getPipelineInfo(Block &block, PipelineInfo &info);
template <typename OpType> void getAccessNum(OpType op, ArrayOp arrayOp);
/// MLIR component estimators.
void estimateOperation(Operation *op);
void estimateFunc(FuncOp func);
void estimateBlock(Block &block);

@@ -22,6 +22,7 @@ def ArrayPragmaOp : HLSCppOp<"array_pragma", [PragmaOpInterface]> {
DefaultValuedAttr<PositiveUI32Attr, "1024"> : $interface_depth,
// BindStorage-related attributes.
DefaultValuedAttr<BoolAttr, "false"> : $storage,
DefaultValuedAttr<StorageTypeAttr, "ram_2p"> : $storage_type,
DefaultValuedAttr<StorageImplAttr, "bram"> : $storage_impl,
@@ -50,8 +51,9 @@ def LoopPragmaOp : HLSCppOp<"loop_pragma", [
DefaultValuedAttr<BoolAttr, "false"> : $pipeline,
DefaultValuedAttr<PositiveUI32Attr, "1"> : $pipeline_II,
// Unroll-related attributes.
DefaultValuedAttr<PositiveUI32Attr, "1"> : $unroll_factor
// Loop-related attributes.
DefaultValuedAttr<BoolAttr, "false"> : $flatten,
DefaultValuedAttr<BoolAttr, "false"> : $unroll
);
let assemblyFormat = [{attr-dict}];

@@ -31,18 +31,19 @@ def ArrayOp : HLSCppOp<"array", [SameOperandsAndResultType]> {
let arguments = (ins Type<IsShapedTypePred> : $input,
// Interface-related attributes.
OptionalAttr<BoolAttr> : $interface,
OptionalAttr<InterfaceModeAttr> : $interface_mode,
OptionalAttr<PositiveUI32Attr> : $interface_depth,
DefaultValuedAttr<BoolAttr, "false"> : $interface,
DefaultValuedAttr<InterfaceModeAttr, "m_axi"> : $interface_mode,
DefaultValuedAttr<PositiveUI32Attr, "1024"> : $interface_depth,
// BindStorage-related attributes.
OptionalAttr<StorageTypeAttr> : $storage_type,
OptionalAttr<StorageImplAttr> : $storage_impl,
DefaultValuedAttr<BoolAttr, "false"> : $storage,
DefaultValuedAttr<StorageTypeAttr, "ram_2p"> : $storage_type,
DefaultValuedAttr<StorageImplAttr, "bram"> : $storage_impl,
// ArrayPartition-related attributes.
OptionalAttr<BoolAttr> : $partition,
OptionalAttr<PartitionTypeArrayAttr> : $partition_type,
OptionalAttr<PositiveUI32ArrayAttr> : $partition_factor
DefaultValuedAttr<BoolAttr, "false"> : $partition,
DefaultValuedAttr<PartitionTypeArrayAttr, "{}"> : $partition_type,
DefaultValuedAttr<PositiveUI32ArrayAttr, "{}"> : $partition_factor
);
let results = (outs Type<IsShapedTypePred> : $output);

@@ -11,10 +11,6 @@ using namespace mlir;
using namespace scalehls;
using namespace hlscpp;
//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// HLSCppAnalyzer Class Definition
//===----------------------------------------------------------------------===//
@@ -27,33 +23,30 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
// Recursively analyze all children.
analyzeBlock(body.front());
// Set an attribute indicating iteration number.
// Set an attribute indicating trip count.
if (!op.hasConstantLowerBound() || !op.hasConstantUpperBound())
op.emitError("has variable upper or lower bound.");
unsigned iterNumber =
(op.getConstantUpperBound() - op.getConstantLowerBound()) /
getUIntAttrValue(op, "unroll_factor") / op.getStep();
unsigned tripCount =
(op.getConstantUpperBound() - op.getConstantLowerBound()) / op.getStep();
setAttrValue(op, "trip_count", tripCount);
setAttrValue(op, "iter_number", iterNumber);
// Set an attribute indicating whether this loop is perfect.
// Set attributes indicating whether this loop is perfect.
unsigned opNum = 0;
unsigned loopNum = 0;
unsigned childNum = 0;
bool childPerfect = false;
for (auto &bodyOp : body.front()) {
if (!isa<AffineYieldOp>(bodyOp))
opNum += 1;
if (auto child = dyn_cast<AffineForOp>(bodyOp)) {
loopNum += 1;
childNum += 1;
childPerfect = getBoolAttrValue(child, "perfect");
}
}
if (opNum == 1 && loopNum == 1 && childPerfect)
if (opNum == 1 && childNum == 1 && childPerfect)
setAttrValue(op, "perfect", true);
else if (loopNum == 0)
else if (childNum == 0)
setAttrValue(op, "perfect", true);
else
setAttrValue(op, "perfect", false);
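// In other words, a loop is considered perfect when its body contains
// either no child loop at all, or exactly one op which is itself a perfect
// child loop; any extra op alongside a child loop makes the nest imperfect.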
@@ -96,8 +89,6 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
string opLatencyPath)
: HLSCppToolBase(builder) {
inPipeline = false;
INIReader targetSpec(targetSpecPath);
if (targetSpec.ParseError())
llvm::outs() << "error: target spec file parse fail, please refer to "
@@ -114,18 +105,17 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
llvm::outs() << latency << "\n";
}
void HLSCppEstimator::alignBlockSchedule(Block &block,
ScheduleMap &opScheduleMap,
unsigned opSchedule) {
void HLSCppEstimator::setBlockSchedule(Block &block, unsigned opSchedule,
OpScheduleMap &opScheduleMap) {
for (auto &op : block) {
if (auto child = dyn_cast<mlir::AffineForOp>(op))
alignBlockSchedule(child.getRegion().front(), opScheduleMap, opSchedule);
if (auto child = dyn_cast<AffineForOp>(op))
setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
opScheduleMap[&op] = opSchedule;
}
}
unsigned HLSCppEstimator::getBlockSchedule(Block &block,
ScheduleMap &opScheduleMap) {
unsigned HLSCppEstimator::getBlockSchedule(Block &block, bool innerUnroll,
OpScheduleMap &opScheduleMap) {
unsigned blockSchedule = 0;
for (auto &op : block) {
@@ -139,14 +129,17 @@ unsigned HLSCppEstimator::getBlockSchedule(Block &block,
// Add latency of the current operation.
unsigned childSchedule = 0;
if (auto child = dyn_cast<mlir::AffineForOp>(op)) {
opSchedule += getUIntAttrValue(child, "latency");
if (inPipeline)
childSchedule =
getBlockSchedule(child.getRegion().front(), opScheduleMap);
else
alignBlockSchedule(child.getRegion().front(), opScheduleMap,
opSchedule);
if (auto child = dyn_cast<AffineForOp>(op)) {
if (innerUnroll) {
setAttrValue(child, "unroll", true);
setAttrValue(child, "flatten", false);
childSchedule = getBlockSchedule(child.getRegion().front(),
/*innerUnroll=*/true, opScheduleMap);
} else {
// Two extra clock cycles will be required to enter and exit the child loop.
opSchedule += getUIntAttrValue(child, "latency") + 2;
setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
}
} else {
// For now we make a simple assumption that all standard operations have
// unit latency.
@@ -160,52 +153,157 @@ unsigned HLSCppEstimator::getBlockSchedule(Block &block,
return blockSchedule;
}
unsigned HLSCppEstimator::getBlockII(Block &block, ScheduleMap &opScheduleMap,
MemAccessList &memLoadList,
MemAccessList &memStoreList,
unsigned initInterval) {
void HLSCppEstimator::getPipelineInfo(Block &block, PipelineInfo &info) {
for (auto &op : block) {
// Handle load operations.
// Handle load operations and RAW dependencies.
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
for (auto memStore : memStoreList) {
if (loadOp.getMemRef() == memStore.first) {
// TODO: For now, we simply assume the dependency distance is always 1.
// Thus the II is equal to the latency between the two dependent
// operations.
unsigned RAWLatency =
opScheduleMap[loadOp] - opScheduleMap[memStore.second];
initInterval = max(initInterval, RAWLatency);
}
for (auto prevOp : info.memStoreDict[loadOp.getMemRef()]) {
unsigned RAWLatency =
info.opScheduleMap[loadOp] - info.opScheduleMap[prevOp];
info.II = max(info.II, RAWLatency);
}
memLoadList.push_back(MemAccess(loadOp.getMemRef(), loadOp));
info.memLoadDict[loadOp.getMemRef()].push_back(loadOp);
}
// Handle store operations.
// Handle store operations and WAR/WAW dependencies.
else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
for (auto memStore : memStoreList) {
if (loadOp.getMemRef() == memStore.first) {
unsigned WAWLatency =
opScheduleMap[storeOp] - opScheduleMap[memStore.second];
initInterval = max(initInterval, WAWLatency);
}
for (auto prevOp : info.memLoadDict[storeOp.getMemRef()]) {
unsigned WARLatency =
info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
info.II = max(info.II, WARLatency);
}
for (auto memLoad : memLoadList) {
if (storeOp.getMemRef() == memLoad.first) {
unsigned WARLatency =
opScheduleMap[storeOp] - opScheduleMap[memLoad.second];
initInterval = max(initInterval, WARLatency);
}
for (auto prevOp : info.memStoreDict[storeOp.getMemRef()]) {
unsigned WAWLatency =
info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
info.II = max(info.II, WAWLatency);
}
memStoreList.push_back(MemAccess(storeOp.getMemRef(), storeOp));
info.memStoreDict[storeOp.getMemRef()].push_back(storeOp);
}
// Recursively handle child loops.
else if (auto child = dyn_cast<AffineForOp>(op))
initInterval = getBlockII(child.getRegion().front(), opScheduleMap,
memLoadList, memStoreList, initInterval);
getPipelineInfo(child.getRegion().front(), info);
}
}
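// Recurrence-II reasoning behind the code above (illustrative note): with
// the dependence distance assumed to be 1, the recurrence-constrained II is
// II >= ceil(latency(src -> dst) / distance) = schedule(dst) - schedule(src),
// which is exactly the maximum taken over the RAW/WAR/WAW pairs above.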
template <typename OpType>
void HLSCppEstimator::getAccessNum(OpType op, ArrayOp arrayOp) {
InductionInfoList inductionInfoList;
SmallVector<AffineExpr, 8> replacements;
SmallVector<unsigned, 8> unrollDims;
unsigned unrollTripCount = 1;
// Collect loop information, including induction and unroll information.
// Note that we assume all operands are dims.
unsigned operandIdx = 0;
for (auto operand : op.getMapOperands()) {
if (auto forOp = getForInductionVarOwner(operand)) {
auto lowerBound = forOp.getConstantLowerBound();
auto upperBound = forOp.getConstantUpperBound();
auto step = forOp.getStep();
inductionInfoList.push_back(InductionInfo(lowerBound, upperBound, step));
auto unroll = getBoolAttrValue(forOp, "unroll");
auto tripCount = getUIntAttrValue(forOp, "trip_count");
if (unroll) {
unrollDims.push_back(operandIdx);
unrollTripCount *= tripCount;
}
if (unroll)
replacements.push_back(getConstExpr(lowerBound));
else
replacements.push_back(getDimExpr(operandIdx));
} else
op.emitError("has index constructed by dynamic values.");
operandIdx += 1;
}
return initInterval;
// Initialize the number of accesses for each partition of each array
// dimension to zero.
AccessNumList accessNumList;
for (auto dim : unrollDims) {
AccessNum accessNum;
if (arrayOp.partition()) {
for (unsigned i = 0; i < getPartitionFactor(&arrayOp, dim); ++i)
accessNum.push_back(0);
} else
accessNum.push_back(0);
accessNumList.push_back(accessNum);
}
// Trace all possible indices to find potential violations of the memory
// port number. Violations may increase the iteration latency or initial
// interval. This will update the accessNumList.
for (unsigned i = 0; i < unrollTripCount; ++i) {
// Calculate number of accesses for each partition of each array dimension.
unsigned idx = 0;
for (auto dim : unrollDims) {
AffineExpr expr = op.getAffineMap().getResult(dim);
auto indexExpr = expr.replaceDimsAndSymbols(replacements, {});
// Calculate which partition the index falls in.
if (arrayOp.partition()) {
auto type = getPartitionType(&arrayOp, dim);
auto factor = getPartitionFactor(&arrayOp, dim);
if (type == "cyclic")
indexExpr = indexExpr % getConstExpr(factor);
else if (type == "block") {
auto dimSize = arrayOp.getType().cast<ShapedType>().getShape()[dim];
indexExpr =
indexExpr.floorDiv(getConstExpr((dimSize + factor - 1) / factor));
}
} else
indexExpr = getConstExpr(0);
// Count the access according to the partition information.
if (auto constExpr = indexExpr.dyn_cast<AffineConstantExpr>()) {
auto partitionId = constExpr.getValue();
accessNumList[idx][partitionId] += 1;
} else {
// TODO: handle non-constant indices.
}
idx += 1;
}
// Update the replacements, advancing the unrolled dims like an odometer.
unsigned order = 0;
for (auto dim : unrollDims) {
auto value = replacements[dim].cast<AffineConstantExpr>().getValue();
// The least-significant unrolled dimension always advances by its
// step.
if (order == 0)
value += inductionInfoList[dim].step;
// The value of the current dimension returns to lowerBound once it
// is greater than or equal to upperBound.
if (value >= inductionInfoList[dim].upperBound) {
value = inductionInfoList[dim].lowerBound;
// Update the value of the next dimension.
if (order < unrollDims.size() - 1) {
auto nextDim = unrollDims[order + 1];
auto nextValue =
replacements[nextDim].cast<AffineConstantExpr>().getValue();
nextValue += inductionInfoList[nextDim].step;
replacements[nextDim] = getConstExpr(nextValue);
}
}
// Update the value of the current dimension.
replacements[dim] = getConstExpr(value);
order += 1;
}
}
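// Traversal example (illustrative note): for two unrolled dims, both with
// bounds [0, 2) and step 1, the replacements enumerate (0, 0) -> (1, 0) ->
// (0, 1) -> (1, 1), i.e. an odometer over the unrolled iteration space.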
// TODO: update the iteration latency and II accordingly. For now, print
// the collected access numbers for debugging.
for (auto accessNum : accessNumList) {
llvm::outs() << "new dim\n";
for (auto num : accessNum) {
llvm::outs() << num << "\n";
}
}
}
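// Possible follow-up (illustrative note, not part of this commit): from the
// collected access numbers, a port-limited lower bound of the II could be
// derived per partition, e.g. for a single-port memory:
// II_port >= max over all partitions of ceil(accessNum / portNum).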
bool HLSCppEstimator::visitOp(AffineForOp op) {
@@ -213,87 +311,95 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
if (body.getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks.");
if (getBoolAttrValue(op, "pipeline")) {
inPipeline = true;
// If the loop is unrolled, all inner loops will be unrolled accordingly.
if (getBoolAttrValue(op, "unroll")) {
setAttrValue(op, "pipeline", false);
setAttrValue(op, "flatten", false);
op.emitRemark("all inner loops are automatically unrolled.");
ScheduleMap opScheduleMap;
auto iterLatency = getBlockSchedule(body.front(), opScheduleMap);
getUIntAttrValue(op, "iter_latency");
// For now we make a simple assumption that II is equal to 1.
auto iterNumber = getUIntAttrValue(op, "iter_number");
setAttrValue(op, "pipeline_iter", iterNumber);
// Calculate initial interval.
MemAccessList memLoadList;
MemAccessList memStoreList;
unsigned initInterval = 1;
initInterval = getBlockII(body.front(), opScheduleMap, memLoadList,
memStoreList, initInterval);
// Calculate the initial interval caused by limited memory ports. For now,
// we only consider memory accesses inside of the pipeline region, i.e., the
// extra memory ports required by unroll optimizations outside of the
// pipeline region are not counted.
MemPortMap memLoadPortMap;
MemPortMap memStorePortMap;
for (auto &op : body.front()) {
}
setAttrValue(op, "pipeline_II", initInterval);
setAttrValue(op, "latency", iterLatency + initInterval * (iterNumber - 1));
OpScheduleMap opScheduleMap;
auto latency =
getBlockSchedule(body.front(), /*innerUnroll=*/true, opScheduleMap);
setAttrValue(op, "latency", latency);
return true;
}
// If the loop is not pipelined, the estimation is much different and
// requires recursively entering each child loop to estimate the overall
// latency of the current loop.
else {
// Recursively estimate each operation; mainly, AffineFor operations are
// handled differently for now.
estimateBlock(body.front());
// If the loop is pipelined, it will be estimated as a whole since all loops
// inside of a pipeline are automatically fully unrolled.
if (getBoolAttrValue(op, "pipeline")) {
setAttrValue(op, "flatten", true);
op.emitRemark("all inner loops are automatically unrolled.");
// This simply means the current loop can be merged into the child loop
// pipeline. This will increase the total IterNumber without changing the
// IterLatency.
if (inPipeline && getBoolAttrValue(op, "perfect")) {
if (auto child = dyn_cast<AffineForOp>(
std::next(op.getLoopBody().front().begin()))) {
auto initInterval = getUIntAttrValue(child, "pipeline_II");
auto iterLatency = getUIntAttrValue(child, "iter_latency");
auto pipeIterNumber = getUIntAttrValue(child, "pipeline_iter") *
getUIntAttrValue(op, "iter_number");
// Calculate latency of each iteration.
PipelineInfo pipelineInfo(/*baseII=*/1);
auto iterLatency = getBlockSchedule(body.front(), /*innerUnroll=*/true,
pipelineInfo.opScheduleMap);
setAttrValue(op, "iter_latency", iterLatency);
setAttrValue(op, "pipeline_II", initInterval);
setAttrValue(op, "iter_latency", iterLatency);
setAttrValue(op, "pipeline_iter", pipeIterNumber);
// For now we make a simple assumption that II is equal to 1.
auto tripCount = getUIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", tripCount);
setAttrValue(op, "latency",
iterLatency + initInterval * (pipeIterNumber - 1));
} else {
inPipeline = false;
op.emitError("is not a perfect loop.");
// Collect pipeline information, including the II and memory accesses.
getPipelineInfo(body.front(), pipelineInfo);
// Calculate latency and II considering memory port violations.
for (auto &memLoad : pipelineInfo.memLoadDict) {
auto arrayOp = dyn_cast<ArrayOp>(memLoad.first.getDefiningOp());
if (!arrayOp)
op.emitError("is accessing an array that is not defined by ArrayOp.");
for (auto loadOp : memLoad.second) {
getAccessNum<AffineLoadOp>(cast<AffineLoadOp>(loadOp), arrayOp);
}
}
// This branch takes care of all unpipelined or imperfect loops.
else {
inPipeline = false;
setAttrValue(op, "init_interval", pipelineInfo.II);
setAttrValue(op, "latency",
iterLatency + pipelineInfo.II * (tripCount - 1));
return true;
}
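// Worked example (illustrative note): iterLatency = 5, II = 2, and
// tripCount = 10 give latency = 5 + 2 * (10 - 1) = 23 cycles for the
// pipelined loop.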
ScheduleMap opScheduleMap;
auto iterLatency = getBlockSchedule(body.front(), opScheduleMap);
setAttrValue(op, "iter_latency", iterLatency);
// If the loop is not pipelined or unrolled, the estimation is different and
// requires recursively entering each child loop to estimate the overall
// latency of the current loop.
estimateBlock(body.front());
// For now we follow the COMBA approach for unrolled loops.
unsigned latency = iterLatency;
if (getUIntAttrValue(op, "iter_number") != 1)
latency *= getUIntAttrValue(op, "iter_number") *
getUIntAttrValue(op, "unroll_factor");
setAttrValue(op, "latency", latency);
// This simply means the current loop can be flattened into the child loop
// pipeline. This will increase the flattened loop trip count without
// changing the iteration latency. Note that this will be propagated
// upward until an imperfect loop is met.
if (getBoolAttrValue(op, "perfect")) {
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
if (getBoolAttrValue(child, "flatten")) {
setAttrValue(op, "flatten", true);
op.emitRemark("this loop is flattened into its child loop.");
// TODO: Calculate initial interval.
setAttrValue(op, "iter_latency", (unsigned)1);
auto II = getUIntAttrValue(child, "init_interval");
auto iterLatency = getUIntAttrValue(child, "iter_latency");
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
getUIntAttrValue(op, "trip_count");
setAttrValue(op, "init_interval", II);
setAttrValue(op, "iter_latency", iterLatency);
setAttrValue(op, "flatten_trip_count", flattenTripCount);
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
return true;
}
}
}
// Default case, aka !unroll && !pipeline && !(perfect && child.flatten).
setAttrValue(op, "flatten", false);
OpScheduleMap opScheduleMap;
auto iterLatency =
getBlockSchedule(body.front(), /*innerUnroll=*/false, opScheduleMap);
setAttrValue(op, "iter_latency", iterLatency);
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
return true;
}
@@ -313,8 +419,9 @@ void HLSCppEstimator::estimateFunc(FuncOp func) {
estimateBlock(func.front());
ScheduleMap opScheduleMap;
auto latency = getBlockSchedule(func.front(), opScheduleMap);
OpScheduleMap opScheduleMap;
auto latency =
getBlockSchedule(func.front(), /*innerUnroll=*/false, opScheduleMap);
setAttrValue(func, "latency", latency);
}

@@ -32,7 +32,8 @@ static void convertBlock(Block &block) {
bool insertArrayOp = false;
if (operand.getKind() == Value::Kind::BlockArgument)
insertArrayOp = true;
else if (!isa<ArrayOp>(operand.getDefiningOp())) {
else if (!isa<ArrayOp>(operand.getDefiningOp()) &&
!isa<AssignOp>(operand.getDefiningOp())) {
insertArrayOp = true;
if (!arrayType.hasStaticShape())
operand.getDefiningOp()->emitError(
@@ -51,8 +52,7 @@ static void convertBlock(Block &block) {
// bram. Other attributes are not set here since they require more
// analysis to be determined.
arrayOp.setAttr("interface", builder.getBoolAttr(false));
arrayOp.setAttr("storage_type", builder.getStringAttr("ram_1p"));
arrayOp.setAttr("storage_impl", builder.getStringAttr("bram"));
arrayOp.setAttr("storage", builder.getBoolAttr(false));
arrayOp.setAttr("partition", builder.getBoolAttr(false));
}
}
@@ -64,8 +64,8 @@ static void convertBlock(Block &block) {
// Set loop pragma attributes.
forOp.setAttr("pipeline", builder.getBoolAttr(false));
forOp.setAttr("pipeline_II", builder.getUI32IntegerAttr(1));
forOp.setAttr("unroll_factor", builder.getUI32IntegerAttr(1));
forOp.setAttr("unroll", builder.getBoolAttr(false));
forOp.setAttr("flatten", builder.getBoolAttr(false));
convertBlock(forOp.getLoopBody().front());
}

@@ -1034,20 +1034,18 @@ void ModuleEmitter::emitArray(ArrayOp *op) {}
/// Pragma operation emitters.
void ModuleEmitter::emitLoopPragma(LoopPragmaOp *op) {
indent();
os << "#pragma HLS unroll";
// TODO: default factor.
os << " factor=" << op->unroll_factor();
os << " skip_exit_check\n";
indent();
os << "#pragma HLS pipeline";
if (op->pipeline()) {
if (op->pipeline())
os << " II=" << op->pipeline_II();
os << " rewind\n";
} else
else
os << " off\n";
if (op->unroll()) {
indent();
os << "#pragma HLS unroll\n";
}
// An empty line.
os << "\n";
}
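// Emitted output when pipeline is off and unroll is on (illustrative note):
// #pragma HLS pipeline off
// #pragma HLS unroll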

@@ -2,19 +2,19 @@
// CHECK-LABEL: func @test_for
func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
%array0 = "hlscpp.array"(%arg0) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
%array1 = "hlscpp.array"(%arg1) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
%array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
%array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
//"hlscpp.array_pragma" (%arg0) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
//"hlscpp.array_pragma" (%arg1) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
affine.for %i = 0 to 16 {
affine.for %j = 0 to 4 {
affine.for %k = 0 to 4 {
%0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
%0 = affine.load %array0[%i, %j, %i + %k] : memref<16x4x4xindex>
%1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
%2 = muli %0, %1 : index
affine.store %2, %array1[%i, %j, %k] : memref<16x4x4xindex>
} {pipeline = false, pipeline_II = 1 : ui32, unroll_factor = 1 : ui32}
} {pipeline = false, pipeline_II = 1 : ui32, unroll_factor = 1 : ui32}
} {pipeline = false, pipeline_II = 1 : ui32, unroll_factor = 1 : ui32}
} {pipeline = false, unroll = false, flatten = false}
} {pipeline = true, unroll = false, flatten = false}
} {pipeline = false, unroll = false, flatten = false}
return
}

@@ -3,13 +3,15 @@
// CHECK-LABEL: func @test_conversion(
// CHECK-SAME: %arg0: f32, %arg1: memref<16xf32>) -> (f32, memref<16xf32>, i32, tensor<2x2xi32>) attributes {dataflow = false} {
func @test_conversion(%arg0: f32, %arg1: memref<16xf32>) -> (f32, memref<16xf32>, i32, tensor<2x2xi32>) {
// CHECK: %[[VAL_0:.*]] = "hlscpp.array"(%[[ARG_1:.*]]) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (memref<16xf32>) -> memref<16xf32>
// CHECK: %[[VAL_0:.*]] = "hlscpp.array"(%[[ARG_1:.*]]) {interface = false, partition = false, storage = false} : (memref<16xf32>) -> memref<16xf32>
%c11_i32 = constant 11 : i32
%cst = constant dense<[[11, 0], [0, -42]]> : tensor<2x2xi32>
// CHECK: %[[VAL_1:.*]] = "hlscpp.array"(%cst) {interface = false, partition = false, storage_impl = "bram", storage_type = "ram_1p"} : (tensor<2x2xi32>) -> tensor<2x2xi32>
// CHECK: %[[VAL_2:.*]] = "hlscpp.assign"(%[[ARG_0:.*]]) : (f32) -> f32
// CHECK: %[[VAL_2:.*]] = "hlscpp.assign"(%c11_i32) : (i32) -> i32
// CHECK: return %[[VAL_2:.*]], %[[VAL_0:.*]], %[[VAL_3:.*]], %[[VAL_1:.*]] : f32, memref<16xf32>, i32, tensor<2x2xi32>
// CHECK: %[[VAL_1:.*]] = "hlscpp.array"(%cst) {interface = false, partition = false, storage = false} : (tensor<2x2xi32>) -> tensor<2x2xi32>
// CHECK: %[[VAL_2:.*]] = "hlscpp.assign"(%arg0) : (f32) -> f32
// CHECK: %[[VAL_3:.*]] = "hlscpp.assign"(%[[VAL_0:.*]]) : (memref<16xf32>) -> memref<16xf32>
// CHECK: %[[VAL_4:.*]] = "hlscpp.assign"(%c11_i32) : (i32) -> i32
// CHECK: %[[VAL_5:.*]] = "hlscpp.assign"(%[[VAL_1:.*]]) : (tensor<2x2xi32>) -> tensor<2x2xi32>
// CHECK: return %[[VAL_2:.*]], %[[VAL_3:.*]], %[[VAL_4:.*]], %[[VAL_5:.*]] : f32, memref<16xf32>, i32, tensor<2x2xi32>
return %arg0, %arg1, %c11_i32, %cst : f32, memref<16xf32>, i32, tensor<2x2xi32>
}