[QoREstimation] Unroll loops in the IR now! Support real partition-aware loop scheduling

Hanchen Ye 2020-10-10 02:33:26 -05:00
parent 28926c5cc2
commit cbfb623c67
3 changed files with 323 additions and 300 deletions


@@ -9,6 +9,7 @@
#include "Visitor.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/TypeSwitch.h"
namespace mlir {
@@ -23,24 +24,46 @@ public:
explicit HLSCppToolBase(OpBuilder &builder) : builder(builder) {}
/// Get value methods.
int64_t getIntAttrValue(Operation *op, StringRef name) {
if (auto attr = op->getAttrOfType<IntegerAttr>(name))
return attr.getInt();
else
return -1;
}
unsigned getUIntAttrValue(Operation *op, StringRef name) {
return op->getAttrOfType<IntegerAttr>(name).getUInt();
if (auto attr = op->getAttrOfType<IntegerAttr>(name))
return attr.getUInt();
else
return 0;
}
bool getBoolAttrValue(Operation *op, StringRef name) {
return op->getAttrOfType<BoolAttr>(name).getValue();
if (auto attr = op->getAttrOfType<BoolAttr>(name))
return attr.getValue();
else
return false;
}
StringRef getStrAttrValue(Operation *op, StringRef name) {
return op->getAttrOfType<StringAttr>(name).getValue();
if (auto attr = op->getAttrOfType<StringAttr>(name))
return attr.getValue();
else
return "";
}
StringRef getPartitionType(ArrayOp *op, unsigned dim) {
return op->partition_type()[dim].cast<StringAttr>().getValue();
StringRef getPartitionType(ArrayOp op, unsigned dim) {
if (auto attr = op.partition_type()[dim].dyn_cast<StringAttr>())
return attr.getValue();
else
return "";
}
unsigned getPartitionFactor(ArrayOp *op, unsigned dim) {
return op->partition_factor()[dim].cast<IntegerAttr>().getUInt();
unsigned getPartitionFactor(ArrayOp op, unsigned dim) {
if (auto attr = op.partition_factor()[dim].dyn_cast<IntegerAttr>())
return attr.getUInt();
else
return 0;
}
/// Set value methods.
@@ -48,6 +71,10 @@ public:
op->setAttr(name, builder.getUI32IntegerAttr(value));
}
void setAttrValue(Operation *op, StringRef name, int32_t value) {
op->setAttr(name, builder.getI32IntegerAttr(value));
}
void setAttrValue(Operation *op, StringRef name, bool value) {
op->setAttr(name, builder.getBoolAttr(value));
}
@@ -65,11 +92,10 @@ public:
return getAffineDimExpr(value, builder.getContext());
}
AffineExpr getConstExpr(unsigned value) {
AffineExpr getConstExpr(int64_t value) {
return getAffineConstantExpr(value, builder.getContext());
}
private:
OpBuilder &builder;
};
@@ -97,20 +123,30 @@ public:
// HLSCppEstimator Class Declaration
//===----------------------------------------------------------------------===//
// For storing the scheduled time stamp of operations.
using OpScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
// Indicates the number of unoccupied memory ports.
struct PortNum {
PortNum(unsigned rdPort = 0, unsigned wrPort = 0, unsigned rdwrPort = 0)
: rdPort(rdPort), wrPort(wrPort), rdwrPort(rdwrPort) {}
// For storing each memory access operation, indexed by its target memory
// value symbol.
using MemAccess = SmallVector<Operation *, 4>;
using MemAccessDict = llvm::SmallDenseMap<Value, MemAccess, 16>;
unsigned rdPort;
unsigned wrPort;
unsigned rdwrPort;
};
// For storing memory access and schedule information of a pipelined region.
struct PipelineInfo {
PipelineInfo(unsigned baseII) : II(baseII) {}
// For storing the port count information of each memory instance.
using MemPort = llvm::SmallDenseMap<Operation *, SmallVector<PortNum, 16>, 8>;
unsigned II;
OpScheduleMap opScheduleMap;
// For storing MemPort indexed by the pipeline stage (a basic block).
using MemPortList = SmallVector<MemPort, 16>;
// For storing each memory access operation (including AffineLoadOp and
// AffineStoreOp), indexed by the array instance (ArrayOp).
using MemAccess = SmallVector<Operation *, 16>;
using MemAccessDict = llvm::SmallDenseMap<Operation *, MemAccess, 8>;
// An aggregate structure for storing the memory load and store
// MemAccessDicts in the scope of a loop, function, or other region.
struct MemInfo {
MemAccessDict memLoadDict;
MemAccessDict memStoreDict;
};
@@ -126,11 +162,6 @@ struct InductionInfo {
};
using InductionInfoList = SmallVector<InductionInfo, 8>;
// This records the number of accesses for each partition.
using AccessNum = SmallVector<unsigned, 16>;
// This records the AccessNum of each dimension of an array.
using AccessNumList = SmallVector<AccessNum, 8>;
class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
public HLSCppToolBase {
public:
@@ -143,14 +174,12 @@ public:
bool visitOp(AffineForOp op);
bool visitOp(AffineIfOp op);
void setBlockSchedule(Block &block, unsigned opSchedule,
OpScheduleMap &opScheduleMap);
unsigned getBlockSchedule(Block &block, bool innerFlatten,
OpScheduleMap &opScheduleMap);
int32_t getPartitionIdx(AffineMap map, ArrayOp op);
void getMemInfo(Block &block, MemInfo &info);
void getPipelineInfo(Block &block, PipelineInfo &info);
template <typename OpType> void getAccessNum(OpType op, ArrayOp arrayOp);
unsigned getLoadStoreSchedule(Operation *op, ArrayOp arrayOp,
MemPortList &memPortList, unsigned begin);
unsigned getBlockSchedule(Block &block, MemInfo memInfo);
void estimateOperation(Operation *op);
void estimateFunc(FuncOp func);


@@ -5,6 +5,7 @@
#include "Analysis/QoREstimation.h"
#include "Analysis/Passes.h"
#include "Dialect/HLSCpp/HLSCpp.h"
#include "mlir/IR/PatternMatch.h"
using namespace std;
using namespace mlir;
@@ -16,14 +17,41 @@ using namespace hlscpp;
//===----------------------------------------------------------------------===//
bool HLSCppAnalyzer::visitOp(AffineForOp op) {
// If the current loop is annotated as unroll, the loop itself and all of its
// inner loops are automatically unrolled.
if (getBoolAttrValue(op, "unroll")) {
op.emitRemark("this loop and all inner loops are automatically unrolled.");
op.walk([&](AffineForOp forOp) {
if (forOp.getLoopBody().getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks.");
loopUnrollFull(forOp);
});
return true;
}
// If the current loop is annotated as pipeline, all inner loops are
// automatically unrolled.
if (getBoolAttrValue(op, "pipeline")) {
op.emitRemark("all inner loops are automatically unrolled.");
op.walk([&](AffineForOp forOp) {
if (forOp != op) {
if (forOp.getLoopBody().getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks.");
loopUnrollFull(forOp);
}
});
}
// We assume the loop contains a single basic block.
auto &body = op.getLoopBody();
if (body.getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks.");
// Recursively analyze all children.
// Recursively analyze all inner loops.
analyzeBlock(body.front());
// Set an attribute indicating trip count.
// Set an attribute indicating the trip count. For now, we assume all loops
// have static loop bounds.
if (!op.hasConstantLowerBound() || !op.hasConstantUpperBound())
op.emitError("has variable upper or lower bound.");
@@ -31,25 +59,24 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
(op.getConstantUpperBound() - op.getConstantLowerBound()) / op.getStep();
setAttrValue(op, "trip_count", tripCount);
// Set attributes indicating this loop is perfect or not.
// Set attributes indicating whether this loop can be flattened or not.
unsigned opNum = 0;
unsigned childNum = 0;
bool childPerfect = false;
unsigned forNum = 0;
bool innerFlatten = false;
for (auto &bodyOp : body.front()) {
if (!isa<AffineYieldOp>(bodyOp))
opNum += 1;
if (auto child = dyn_cast<AffineForOp>(bodyOp)) {
childNum += 1;
childPerfect = getBoolAttrValue(child, "perfect");
if (isa<AffineForOp>(bodyOp)) {
forNum += 1;
innerFlatten = getBoolAttrValue(&bodyOp, "flatten");
}
}
if (opNum == 1 && childNum == 1 && childPerfect)
setAttrValue(op, "perfect", true);
else if (childNum == 0)
setAttrValue(op, "perfect", true);
if (forNum == 0 || (opNum == 1 && innerFlatten))
setAttrValue(op, "flatten", true);
else
setAttrValue(op, "perfect", false);
setAttrValue(op, "flatten", false);
return true;
}
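The flatten rule above works bottom-up: a loop is marked flatten = true when its body contains no inner loop at all, or contains exactly one operation overall which is an inner loop already marked flatten (i.e., a perfect nest). A minimal standalone sketch of the same recursion over a toy loop tree, in plain C++ with hypothetical LoopNode/canFlatten names rather than the analyzer's MLIR types:

#include <vector>

// Hypothetical stand-in for an affine.for op: its body holds inner loops plus
// a count of other (non-loop, non-yield) operations.
struct LoopNode {
  std::vector<LoopNode> innerLoops;
  unsigned otherOpNum; // non-loop, non-yield operations in the body
};

// Mirrors the analyzer's rule: flattenable if the body has no inner loop, or
// exactly one operation overall which is a flattenable inner loop.
bool canFlatten(const LoopNode &loop) {
  if (loop.innerLoops.empty())
    return true;
  unsigned opNum = loop.otherOpNum + loop.innerLoops.size();
  return opNum == 1 && loop.innerLoops.size() == 1 &&
         canFlatten(loop.innerLoops.front());
}

int main() {
  LoopNode inner{{}, 2};       // innermost body: two compute ops, no loops
  LoopNode middle{{inner}, 0}; // perfect wrapper around the inner loop
  LoopNode outer{{middle}, 0}; // perfect wrapper around the middle loop
  return canFlatten(outer) ? 0 : 1; // returns 0: the whole nest is flattenable
}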
@@ -105,205 +132,200 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
llvm::outs() << latency << "\n";
}
void HLSCppEstimator::setBlockSchedule(Block &block, unsigned opSchedule,
OpScheduleMap &opScheduleMap) {
/// Calculate the partition index according to the affine map of a memory access
/// operation, and store the result as an attribute.
int32_t HLSCppEstimator::getPartitionIdx(AffineMap map, ArrayOp op) {
int32_t partitionIdx = 0;
unsigned accumFactor = 1;
unsigned dim = 0;
for (auto expr : map.getResults()) {
auto idxExpr = getConstExpr(0);
unsigned factor = 1;
if (op.partition()) {
auto type = getPartitionType(op, dim);
factor = getPartitionFactor(op, dim);
if (type == "cyclic")
idxExpr = expr % getConstExpr(factor);
else if (type == "block") {
auto size = op.getType().cast<ShapedType>().getShape()[dim];
idxExpr = expr.floorDiv(getConstExpr((size + factor - 1) / factor));
}
}
if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>()) {
if (dim == 0)
partitionIdx = constExpr.getValue();
else
partitionIdx += constExpr.getValue() * accumFactor;
} else {
partitionIdx = -1;
break;
}
accumFactor *= factor;
dim += 1;
}
return partitionIdx;
}
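To make the index arithmetic above concrete: for each array dimension, a cyclic partition maps index i to i mod factor, a block partition maps it to i / ceil(size / factor), and the per-dimension results are folded into one flat partition id with an accumulating factor. Below is a simplified standalone sketch of the same computation, assuming the access indices are already plain constants (no AffineExpr involved; the DimPartition/flatPartitionIdx names are hypothetical):

#include <cstdint>
#include <vector>

enum class PartitionKind { None, Cyclic, Block };

struct DimPartition {
  PartitionKind kind;
  unsigned factor; // partition factor of this dimension
  int64_t size;    // dimension size, used by block partitioning
};

// Simplified mirror of getPartitionIdx for constant indices: returns the flat
// partition index an access with the given per-dimension indices falls into.
int64_t flatPartitionIdx(const std::vector<int64_t> &indices,
                         const std::vector<DimPartition> &dims) {
  int64_t partitionIdx = 0;
  int64_t accumFactor = 1;
  for (unsigned d = 0; d < indices.size(); ++d) {
    int64_t idx = 0;
    unsigned factor = 1;
    if (dims[d].kind == PartitionKind::Cyclic) {
      factor = dims[d].factor;
      idx = indices[d] % factor;
    } else if (dims[d].kind == PartitionKind::Block) {
      factor = dims[d].factor;
      idx = indices[d] / ((dims[d].size + factor - 1) / factor);
    }
    partitionIdx += idx * accumFactor;
    accumFactor *= factor;
  }
  return partitionIdx;
}

// Example: a 16x4x4 array partitioned cyclically with factors [1, 1, 4].
// Access [i, j, k] lands in partition k % 4; e.g. [2, 3, 3] -> partition 3.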
void HLSCppEstimator::getMemInfo(Block &block, MemInfo &info) {
for (auto &op : block) {
if (auto child = dyn_cast<AffineForOp>(op))
setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
opScheduleMap[&op] = opSchedule;
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
info.memLoadDict[arrayOp].push_back(loadOp);
setAttrValue(loadOp, "partition_index",
getPartitionIdx(loadOp.getAffineMap(), arrayOp));
// TODO: consider RAW, WAR, and WAW dependencies for scheduling.
} else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
info.memStoreDict[arrayOp].push_back(storeOp);
setAttrValue(storeOp, "partition_index",
getPartitionIdx(storeOp.getAffineMap(), arrayOp));
}
}
}
unsigned HLSCppEstimator::getBlockSchedule(Block &block, bool innerUnroll,
OpScheduleMap &opScheduleMap) {
unsigned blockSchedule = 0;
unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, ArrayOp arrayOp,
MemPortList &memPortList,
unsigned begin) {
auto partitionIdx = getIntAttrValue(op, "partition_index");
// Try to avoid memory port violations until a legal schedule is found.
// Since a pipeline of arbitrary length can be generated, this while loop is
// guaranteed to terminate.
while (true) {
auto partitionPortNum = memPortList[begin][arrayOp];
bool memEmpty = false;
// Partition factor.
unsigned factor = 1;
// If the memory has not been occupied in the current stage, it should
// be initialized according to its storage type. Note that each
// partition should have one PortNum structure.
if (partitionPortNum.empty()) {
memEmpty = true;
if (getBoolAttrValue(arrayOp, "partition")) {
for (unsigned dim = 0;
dim < arrayOp.getType().cast<ShapedType>().getRank(); ++dim)
factor *= getPartitionFactor(arrayOp, dim);
}
auto storageType = getStrAttrValue(arrayOp, "storage_type");
for (unsigned p = 0; p < factor; ++p) {
unsigned rdPort = 0;
unsigned wrPort = 0;
unsigned rdwrPort = 0;
if (storageType == "ram_s2p")
rdPort = 1, wrPort = 1;
else if (storageType == "ram_2p" || storageType == "ram_t2p")
rdwrPort = 2;
else if (storageType == "ram_1p")
rdwrPort = 1;
else {
rdwrPort = 2;
arrayOp.emitError("unsupported storage type.");
}
PortNum portNum(rdPort, wrPort, rdwrPort);
partitionPortNum.push_back(portNum);
}
}
// TODO: When the partition index cannot be determined, this operation is
// considered to occupy all ports of all partitions.
if (partitionIdx == -1) {
if (memEmpty) {
for (unsigned p = 0; p < factor; ++p) {
partitionPortNum[p].rdPort = 0;
partitionPortNum[p].wrPort = 0;
partitionPortNum[p].rdwrPort = 0;
}
memPortList[begin][arrayOp] = partitionPortNum;
break;
} else {
if (++begin >= memPortList.size()) {
MemPort memPort;
memPortList.push_back(memPort);
}
}
}
// Check whether the current schedule meets the memory port limitation. If
// not, the schedule is delayed by one stage.
if (partitionPortNum[partitionIdx].rdPort > 0) {
partitionPortNum[partitionIdx].rdPort -= 1;
memPortList[begin][arrayOp] = partitionPortNum;
break;
} else if (partitionPortNum[partitionIdx].rdwrPort > 0) {
partitionPortNum[partitionIdx].rdwrPort -= 1;
memPortList[begin][arrayOp] = partitionPortNum;
break;
} else {
if (++begin >= memPortList.size()) {
MemPort memPort;
memPortList.push_back(memPort);
}
}
}
return begin;
}
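The port budgets implied by each storage type (ram_s2p: one read plus one write port per partition; ram_2p and ram_t2p: two read/write ports; ram_1p: one read/write port) feed a greedy rule: try the earliest candidate stage, and if the target partition has no free port there, retry one stage later, growing the pipeline as needed. A compact standalone sketch of that rule, tracking only per-stage, per-partition read ports (hypothetical names, not the estimator's MemPortList API):

#include <vector>

// Free read ports left per pipeline stage and per partition:
// stagePorts[stage][partition] = remaining ports in that stage.
using StagePorts = std::vector<std::vector<unsigned>>;

// Greedily schedule one load of `partition`, starting the search at stage
// `begin`; returns the stage it was placed in, growing the pipeline whenever
// every existing stage is already saturated.
unsigned schedulePortLimitedLoad(StagePorts &stagePorts, unsigned partition,
                                 unsigned begin, unsigned portsPerPartition,
                                 unsigned numPartitions) {
  while (true) {
    if (begin >= stagePorts.size())
      stagePorts.emplace_back(numPartitions, portsPerPartition); // new stage
    if (stagePorts[begin][partition] > 0) {
      --stagePorts[begin][partition]; // occupy one port in this stage
      return begin;
    }
    ++begin; // port conflict: delay the access by one stage
  }
}

// Example: one partition with a single port (ram_1p-like). Three loads that
// all want stage 0 end up in stages 0, 1 and 2.
int main() {
  StagePorts ports;
  unsigned s0 = schedulePortLimitedLoad(ports, 0, 0, /*ports=*/1, /*parts=*/1);
  unsigned s1 = schedulePortLimitedLoad(ports, 0, 0, 1, 1);
  unsigned s2 = schedulePortLimitedLoad(ports, 0, 0, 1, 1);
  return (s0 == 0 && s1 == 1 && s2 == 2) ? 0 : 1;
}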
unsigned HLSCppEstimator::getBlockSchedule(Block &block, MemInfo memInfo) {
unsigned blockEnd = 0;
MemPortList memPortList;
for (auto &op : block) {
unsigned opSchedule = 0;
// Add the latest scheduled time among all predecessors.
// Find the latest predecessor dominating the current operation. This should
// be considered as the earliest stage at which the current operation can be
// scheduled.
unsigned begin = 0;
unsigned end = 0;
for (auto operand : op.getOperands()) {
if (operand.getKind() != Value::Kind::BlockArgument)
opSchedule = max(opSchedule, opScheduleMap[operand.getDefiningOp()]);
begin = max(begin,
getUIntAttrValue(operand.getDefiningOp(), "schedule_end"));
}
// Add latency of the current operation.
unsigned childSchedule = 0;
if (auto child = dyn_cast<AffineForOp>(op)) {
if (innerUnroll) {
setAttrValue(child, "unroll", true);
setAttrValue(child, "flatten", false);
childSchedule = getBlockSchedule(child.getRegion().front(),
/*innerUnroll=*/true, opScheduleMap);
} else {
// Two extra clock cycles will be required to enter and exit the child loop.
opSchedule += getUIntAttrValue(child, "latency") + 2;
setBlockSchedule(child.getRegion().front(), opSchedule, opScheduleMap);
}
} else {
// For now we make the simple assumption that all standard operations have
// unit latency.
// TODO: Support estimation from profiling data.
opSchedule += 1;
// Insert new pipeline stages.
while (begin >= memPortList.size()) {
MemPort memPort;
memPortList.push_back(memPort);
}
opScheduleMap[&op] = opSchedule;
blockSchedule = max({blockSchedule, childSchedule, opSchedule});
}
return blockSchedule;
}
void HLSCppEstimator::getPipelineInfo(Block &block, PipelineInfo &info) {
for (auto &op : block) {
// Handle load operations and RAW dependencies.
// Handle load operations and ensure the current schedule meets the memory
// port limitation.
if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
for (auto prevOp : info.memStoreDict[loadOp.getMemRef()]) {
unsigned RAWLatency =
info.opScheduleMap[loadOp] - info.opScheduleMap[prevOp];
info.II = max(info.II, RAWLatency);
}
info.memLoadDict[loadOp.getMemRef()].push_back(loadOp);
auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
begin = getLoadStoreSchedule(loadOp, arrayOp, memPortList, begin);
end = begin + 1;
}
// Handle Store operations and RAW/WAW dependencies.
// Handle store operations.
else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
for (auto prevOp : info.memLoadDict[storeOp.getMemRef()]) {
unsigned WARLatency =
info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
info.II = max(info.II, WARLatency);
}
for (auto prevOp : info.memStoreDict[storeOp.getMemRef()]) {
unsigned WAWLatency =
info.opScheduleMap[storeOp] - info.opScheduleMap[prevOp];
info.II = max(info.II, WAWLatency);
}
info.memStoreDict[storeOp.getMemRef()].push_back(storeOp);
auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
begin = getLoadStoreSchedule(storeOp, arrayOp, memPortList, begin);
end = begin + 1;
}
// Handle loop operations.
else if (auto forOp = dyn_cast<AffineForOp>(op)) {
// A child loop is considered as one large node, and two extra clock cycles
// are required to enter and exit it.
end = begin + getUIntAttrValue(forOp, "latency") + 2;
}
// Default case. All normal expressions and operations will be handled by
// this branch.
else {
// TODO: For now, we assume all operations take one clock cycle to
// execute; profiling data should eventually be supported.
end = begin + 1;
}
// Recursively handle child loops.
else if (auto child = dyn_cast<AffineForOp>(op))
getPipelineInfo(child.getRegion().front(), info);
}
}
template <typename OpType>
void HLSCppEstimator::getAccessNum(OpType op, ArrayOp arrayOp) {
InductionInfoList inductionInfoList;
SmallVector<AffineExpr, 8> replacements;
SmallVector<unsigned, 8> unrollDims;
unsigned unrollTripCount = 1;
// Collect loop information, including induction & unroll information,
// and etc. Note that we assume all operands are dims.
unsigned operandIdx = 0;
for (auto operand : op.getMapOperands()) {
if (auto forOp = getForInductionVarOwner(operand)) {
auto lowerBound = forOp.getConstantLowerBound();
auto upperBound = forOp.getConstantUpperBound();
auto step = forOp.getStep();
inductionInfoList.push_back(InductionInfo(lowerBound, upperBound, step));
auto unroll = getBoolAttrValue(forOp, "unroll");
auto tripCount = getUIntAttrValue(forOp, "trip_count");
if (unroll) {
unrollDims.push_back(operandIdx);
unrollTripCount *= tripCount;
}
if (unroll)
replacements.push_back(getConstExpr(lowerBound));
else
replacements.push_back(getDimExpr(operandIdx));
} else
op.emitError("has index constructed by dynamic values.");
operandIdx += 1;
}
// Initialize number of accesses for each partition of each array
// dimension as zero.
AccessNumList accessNumList;
for (auto dim : unrollDims) {
AccessNum accessNum;
if (arrayOp.partition()) {
for (unsigned i = 0; i < getPartitionFactor(&arrayOp, dim); ++i)
accessNum.push_back(0);
} else
accessNum.push_back(0);
accessNumList.push_back(accessNum);
}
// Trace all possible indices to find potential violations of the memory
// port number. Violations may increase the iteration latency or the
// initiation interval. This will update the accessNumList.
for (unsigned i = 0; i < unrollTripCount; ++i) {
// Calculate number of accesses for each partition of each array dimension.
unsigned idx = 0;
for (auto dim : unrollDims) {
AffineExpr expr = op.getAffineMap().getResult(dim);
auto indexExpr = expr.replaceDimsAndSymbols(replacements, {});
// Calculate which partition the access falls into.
if (arrayOp.partition()) {
auto type = getPartitionType(&arrayOp, dim);
auto factor = getPartitionFactor(&arrayOp, dim);
if (type == "cyclic")
indexExpr = indexExpr % getConstExpr(factor);
else if (type == "block") {
auto dimSize = arrayOp.getType().cast<ShapedType>().getShape()[dim];
indexExpr =
indexExpr.floorDiv(getConstExpr((dimSize + factor - 1) / factor));
}
} else
indexExpr = getConstExpr(0);
// Accumulate the access count according to the partition information.
if (auto constExpr = indexExpr.dyn_cast<AffineConstantExpr>()) {
auto partitionId = constExpr.getValue();
accessNumList[idx][partitionId] += 1;
} else {
}
idx += 1;
}
// Update replacement.
unsigned order = 0;
for (auto dim : unrollDims) {
auto value = replacements[dim].cast<AffineConstantExpr>().getValue();
// The least-significant dimension value always increases with a stride of
// one step.
if (order == 0)
value += inductionInfoList[dim].step;
// The value of the current dimension should return to lowerBound
// if it is greater than or equal to upperBound.
if (value >= inductionInfoList[dim].upperBound) {
value = inductionInfoList[dim].lowerBound;
// Update the value of the next dimension.
if (order < unrollDims.size() - 1) {
auto nextDim = unrollDims[order + 1];
auto nextValue =
replacements[nextDim].cast<AffineConstantExpr>().getValue();
nextValue += inductionInfoList[nextDim].step;
replacements[nextDim] = getConstExpr(nextValue);
}
}
// Update the value of the current dimension.
replacements[dim] = getConstExpr(value);
order += 1;
}
}
// Dump the collected access counts for debugging.
for (auto accessNum : accessNumList) {
llvm::outs() << "new dim\n";
for (auto num : accessNum) {
llvm::outs() << num << "\n";
}
setAttrValue(&op, "schedule_begin", begin);
setAttrValue(&op, "schedule_end", end);
blockEnd = max(blockEnd, end);
}
return blockEnd;
}
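Aside from the port handling, the scheduling above is plain ASAP list scheduling over the block: an operation's begin stage is the maximum schedule_end over the defining operations of its operands, its end stage adds its own latency (one cycle for ordinary operations, latency plus two for a child loop), and the block latency is the maximum end over all operations. A toy standalone illustration of that recurrence (hypothetical Op struct and scheduleBlock helper, not the estimator's types):

#include <algorithm>
#include <vector>

struct Op {
  std::vector<int> operands; // indices of defining ops, -1 for block arguments
  unsigned latency;          // 1 for ordinary ops, loopLatency + 2 for loops
  unsigned begin = 0, end = 0;
};

// ASAP schedule of a straight-line block (ops listed in definition order);
// returns the block latency.
unsigned scheduleBlock(std::vector<Op> &ops) {
  unsigned blockEnd = 0;
  for (auto &op : ops) {
    unsigned begin = 0;
    for (int def : op.operands)
      if (def >= 0)
        begin = std::max(begin, ops[def].end); // earliest legal stage
    op.begin = begin;
    op.end = begin + op.latency;
    blockEnd = std::max(blockEnd, op.end);
  }
  return blockEnd;
}

// Example: a chain load -> load -> mul -> store with unit latencies schedules
// to a block latency of 4, while two independent loads both start at stage 0.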
bool HLSCppEstimator::visitOp(AffineForOp op) {
@@ -311,95 +333,58 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
if (body.getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks.");
// If loop is unrolled, all inner loops will be unrolled accordingly.
if (getBoolAttrValue(op, "unroll")) {
setAttrValue(op, "pipeline", false);
setAttrValue(op, "flatten", false);
op.emitRemark("all inner loops are automatically unrolled.");
OpScheduleMap opScheduleMap;
auto latency =
getBlockSchedule(body.front(), /*innerUnroll=*/true, opScheduleMap);
setAttrValue(op, "latency", latency);
return true;
}
// If loop is pipelined, the pipelined loop will be estimated as a whole since
// all loops inside of a pipeline will be automatically fully unrolled.
// If the current loop is annotated as pipeline, extra dependency and II
// analysis will be executed.
if (getBoolAttrValue(op, "pipeline")) {
setAttrValue(op, "flatten", true);
op.emitRemark("all inner loops are automatically unrolled.");
MemInfo memInfo;
getMemInfo(body.front(), memInfo);
// Calculate latency of each iteration.
PipelineInfo pipelineInfo(/*baseII=*/1);
auto iterLatency = getBlockSchedule(body.front(), /*innerUnroll=*/true,
pipelineInfo.opScheduleMap);
auto iterLatency = getBlockSchedule(body.front(), memInfo);
setAttrValue(op, "iter_latency", iterLatency);
// For now we make a simple assumption that II is equal to 1.
auto tripCount = getUIntAttrValue(op, "trip_count");
setAttrValue(op, "flatten_trip_count", tripCount);
// Collect pipeline information including II and memory access information.
getPipelineInfo(body.front(), pipelineInfo);
// Calculate latency and II considering memory port violations.
for (auto &memLoad : pipelineInfo.memLoadDict) {
auto arrayOp = dyn_cast<ArrayOp>(memLoad.first.getDefiningOp());
if (!arrayOp)
op.emitError("is accessing an array that is not defined by ArrayOp.");
for (auto loadOp : memLoad.second) {
getAccessNum<AffineLoadOp>(cast<AffineLoadOp>(loadOp), arrayOp);
}
}
setAttrValue(op, "init_interval", pipelineInfo.II);
setAttrValue(op, "latency",
iterLatency + pipelineInfo.II * (tripCount - 1));
setAttrValue(op, "init_interval", (unsigned)1);
setAttrValue(op, "latency", iterLatency + 1 * (tripCount - 1));
return true;
}
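The latency attribute written here follows the usual pipelined-loop model: with iteration latency L, initiation interval II, and trip count N, the total latency is L + II * (N - 1); for now II is fixed at 1. A quick numeric check with made-up numbers:

#include <cassert>

int main() {
  // Illustrative numbers only: a 16-iteration loop whose body takes 5 cycles,
  // pipelined with II = 1, finishes in 5 + 1 * (16 - 1) = 20 cycles.
  unsigned iterLatency = 5, II = 1, tripCount = 16;
  unsigned latency = iterLatency + II * (tripCount - 1);
  assert(latency == 20);
  return 0;
}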
// If the loop is not pipelined or unrolled, the estimation is different and
// requires recursively entering each child loop to estimate the overall
// latency of the current loop.
// Recursively estimate all inner loops.
estimateBlock(body.front());
// This simply means the current loop can be flattened into the child loop
// pipeline. This will increase the flattened loop trip count without
// changing the iteration latency. Note that this will be propagated upward
// until an imperfect loop is met.
if (getBoolAttrValue(op, "perfect")) {
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
if (getBoolAttrValue(child, "flatten")) {
setAttrValue(op, "flatten", true);
op.emitRemark("this loop is flattened into its child loop.");
if (getBoolAttrValue(op, "flatten")) {
auto child = cast<AffineForOp>(op.getLoopBody().front().front());
op.emitRemark("this loop is flattened into its inner loop.");
auto II = getUIntAttrValue(child, "init_interval");
auto iterLatency = getUIntAttrValue(child, "iter_latency");
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
getUIntAttrValue(op, "trip_count");
auto II = getUIntAttrValue(child, "init_interval");
auto iterLatency = getUIntAttrValue(child, "iter_latency");
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
getUIntAttrValue(op, "trip_count");
setAttrValue(op, "init_interval", II);
setAttrValue(op, "iter_latency", iterLatency);
setAttrValue(op, "flatten_trip_count", flattenTripCount);
setAttrValue(op, "init_interval", II);
setAttrValue(op, "iter_latency", iterLatency);
setAttrValue(op, "flatten_trip_count", flattenTripCount);
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
return true;
}
}
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
}
// Default case, aka !pipeline && !flatten.
else {
MemInfo memInfo;
getMemInfo(body.front(), memInfo);
// Default case, aka !unroll && !pipeline && !(perfect && child.flatten).
setAttrValue(op, "flatten", false);
auto iterLatency = getBlockSchedule(body.front(), memInfo);
setAttrValue(op, "iter_latency", iterLatency);
OpScheduleMap opScheduleMap;
auto iterLatency =
getBlockSchedule(body.front(), /*innerUnroll=*/false, opScheduleMap);
setAttrValue(op, "iter_latency", iterLatency);
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
setAttrValue(op, "latency", latency);
}
return true;
}
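For the flatten case, the parent loop inherits the child pipeline's II and iteration latency and only scales the flattened trip count, so a nest flattened into one pipeline costs roughly L + II * (N_outer * N_inner - 1) cycles instead of paying the loop enter/exit overhead on every outer iteration. A small numeric sketch with made-up numbers:

#include <cassert>

int main() {
  // Illustrative numbers only: a 16 x 4 nest flattened into one pipeline with
  // II = 1 and a 5-cycle iteration latency.
  unsigned iterLatency = 5, II = 1, outerTripCount = 16, innerTripCount = 4;
  unsigned flattenTripCount = outerTripCount * innerTripCount;  // 64
  unsigned latency = iterLatency + II * (flattenTripCount - 1); // 68 cycles
  assert(latency == 68);
  return 0;
}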
@@ -419,9 +404,10 @@ void HLSCppEstimator::estimateFunc(FuncOp func) {
estimateBlock(func.front());
OpScheduleMap opScheduleMap;
auto latency =
getBlockSchedule(func.front(), /*innerUnroll=*/false, opScheduleMap);
MemInfo memInfo;
getMemInfo(func.front(), memInfo);
auto latency = getBlockSchedule(func.front(), memInfo);
setAttrValue(func, "latency", latency);
}
@@ -447,6 +433,16 @@ struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
HLSCppAnalyzer analyzer(builder);
analyzer.analyzeModule(getOperation());
// Canonicalize the analyzed IR.
OwningRewritePatternList patterns;
auto *context = &getContext();
for (auto *op : context->getRegisteredOperations())
op->getCanonicalizationPatterns(patterns, context);
Operation *op = getOperation();
applyPatternsAndFoldGreedily(op->getRegions(), patterns);
// Estimate performance and resource utilization.
HLSCppEstimator estimator(builder, targetSpec, opLatency);
estimator.estimateModule(getOperation());


@@ -2,14 +2,12 @@
// CHECK-LABEL: func @test_for
func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
%array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
%array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
//"hlscpp.array_pragma" (%arg0) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
//"hlscpp.array_pragma" (%arg1) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
%array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
%array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
affine.for %i = 0 to 16 {
affine.for %j = 0 to 4 {
affine.for %k = 0 to 4 {
%0 = affine.load %array0[%i, %j, %i + %k] : memref<16x4x4xindex>
affine.for %k = 0 to 4 {
%0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
%1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
%2 = muli %0, %1 : index
affine.store %2, %array1[%i, %j, %k] : memref<16x4x4xindex>