[QoREstimation] unroll loops in IR now! support a real partition-aware loop scheduling
This commit is contained in:
parent
28926c5cc2
commit
cbfb623c67
|
@ -9,6 +9,7 @@
|
|||
#include "Visitor.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/ADT/TypeSwitch.h"
|
||||
|
||||
namespace mlir {
|
||||
|
@ -23,24 +24,46 @@ public:
|
|||
explicit HLSCppToolBase(OpBuilder &builder) : builder(builder) {}
|
||||
|
||||
/// Get value methods.
|
||||
/// Return the value of the IntegerAttr called `name` on `op`, or -1 when `op`
/// has no IntegerAttr under that name.
int64_t getIntAttrValue(Operation *op, StringRef name) {
  auto intAttr = op->getAttrOfType<IntegerAttr>(name);
  return intAttr ? intAttr.getInt() : -1;
}
|
||||
|
||||
/// Return the value of the IntegerAttr called `name` on `op` read as an
/// unsigned integer, or 0 when `op` has no IntegerAttr under that name.
unsigned getUIntAttrValue(Operation *op, StringRef name) {
  auto uintAttr = op->getAttrOfType<IntegerAttr>(name);
  if (!uintAttr)
    return 0;
  return uintAttr.getUInt();
}
|
||||
|
||||
/// Return the value of the BoolAttr called `name` on `op`, or false when `op`
/// has no BoolAttr under that name.
bool getBoolAttrValue(Operation *op, StringRef name) {
  auto boolAttr = op->getAttrOfType<BoolAttr>(name);
  if (!boolAttr)
    return false;
  return boolAttr.getValue();
}
|
||||
|
||||
/// Return the value of the StringAttr called `name` on `op`, or an empty
/// string when `op` has no StringAttr under that name.
StringRef getStrAttrValue(Operation *op, StringRef name) {
  auto strAttr = op->getAttrOfType<StringAttr>(name);
  if (!strAttr)
    return "";
  return strAttr.getValue();
}
|
||||
|
||||
/// Return the partition type recorded for dimension `dim` of `op`, or an
/// empty string when the partition_type array is missing, has no entry for
/// `dim`, or the entry is not a StringAttr.
StringRef getPartitionType(ArrayOp op, unsigned dim) {
  auto types = op.partition_type();
  // Guard the index: the array may be absent or shorter than the array rank.
  if (!types || dim >= types.size())
    return "";
  // dyn_cast (unlike cast) yields a null attribute on a type mismatch, which
  // is what makes the fallback below reachable.
  if (auto attr = types[dim].dyn_cast<StringAttr>())
    return attr.getValue();
  return "";
}
|
||||
|
||||
/// Return the partition factor recorded for dimension `dim` of `op`, or 0
/// when the partition_factor array is missing, has no entry for `dim`, or the
/// entry is not an IntegerAttr.
unsigned getPartitionFactor(ArrayOp op, unsigned dim) {
  auto factors = op.partition_factor();
  // Guard the index: the array may be absent or shorter than the array rank.
  if (!factors || dim >= factors.size())
    return 0;
  // dyn_cast (unlike cast) yields a null attribute on a type mismatch, which
  // is what makes the fallback below reachable.
  if (auto attr = factors[dim].dyn_cast<IntegerAttr>())
    return attr.getUInt();
  return 0;
}
|
||||
|
||||
/// Set value methods.
|
||||
|
@ -48,6 +71,10 @@ public:
|
|||
op->setAttr(name, builder.getUI32IntegerAttr(value));
|
||||
}
|
||||
|
||||
/// Set attribute `name` of `op` to `value`, encoded as a 32-bit integer
/// attribute created through the builder.
void setAttrValue(Operation *op, StringRef name, int32_t value) {
  auto i32Attr = builder.getI32IntegerAttr(value);
  op->setAttr(name, i32Attr);
}
|
||||
|
||||
void setAttrValue(Operation *op, StringRef name, bool value) {
|
||||
op->setAttr(name, builder.getBoolAttr(value));
|
||||
}
|
||||
|
@ -65,11 +92,10 @@ public:
|
|||
return getAffineDimExpr(value, builder.getContext());
|
||||
}
|
||||
|
||||
/// Build an affine constant expression holding `value` in the builder's
/// context.
AffineExpr getConstExpr(int64_t value) {
  auto *context = builder.getContext();
  return getAffineConstantExpr(value, context);
}
|
||||
|
||||
private:
|
||||
OpBuilder &builder;
|
||||
};
|
||||
|
||||
|
@ -97,20 +123,30 @@ public:
|
|||
// HLSCppEstimator Class Declaration
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// For storing the scheduled time stamp of operations.
|
||||
using OpScheduleMap = llvm::SmallDenseMap<Operation *, unsigned, 16>;
|
||||
// Number of memory ports that are still unoccupied, split into read-only,
// write-only and read/write ports.
struct PortNum {
  unsigned rdPort;
  unsigned wrPort;
  unsigned rdwrPort;

  // All counts default to zero, i.e. no free port.
  PortNum(unsigned rdPort = 0, unsigned wrPort = 0, unsigned rdwrPort = 0)
      : rdPort{rdPort}, wrPort{wrPort}, rdwrPort{rdwrPort} {}
};
|
||||
|
||||
// For storing memory access and schedule information of pipelined region.
|
||||
struct PipelineInfo {
|
||||
PipelineInfo(unsigned baseII) : II(baseII) {}
|
||||
// For storing ports number information of each memory instance.
|
||||
using MemPort = llvm::SmallDenseMap<Operation *, SmallVector<PortNum, 16>, 8>;
|
||||
|
||||
unsigned II;
|
||||
OpScheduleMap opScheduleMap;
|
||||
// For storing MemPort indexed by the pipeline stage (a basic block).
|
||||
using MemPortList = SmallVector<MemPort, 16>;
|
||||
|
||||
// For storing each memory access operations (including AffineLoadOp and
|
||||
// AffineStoreOp) indexed by the array instantce (ArrayOp).
|
||||
using MemAccess = SmallVector<Operation *, 16>;
|
||||
using MemAccessDict = llvm::SmallDenseMap<Operation *, MemAccess, 8>;
|
||||
|
||||
// An aggregate information structure for storing memory load and store
|
||||
// MemAccessDict in the scope of loop/function/other region.
|
||||
struct MemInfo {
|
||||
MemAccessDict memLoadDict;
|
||||
MemAccessDict memStoreDict;
|
||||
};
|
||||
|
@ -126,11 +162,6 @@ struct InductionInfo {
|
|||
};
|
||||
using InductionInfoList = SmallVector<InductionInfo, 8>;
|
||||
|
||||
// This records the number of accesses for each partition.
|
||||
using AccessNum = SmallVector<unsigned, 16>;
|
||||
// This records the AccessNum of each dimension of an array.
|
||||
using AccessNumList = SmallVector<AccessNum, 8>;
|
||||
|
||||
class HLSCppEstimator : public HLSCppVisitorBase<HLSCppEstimator, bool>,
|
||||
public HLSCppToolBase {
|
||||
public:
|
||||
|
@ -143,14 +174,12 @@ public:
|
|||
bool visitOp(AffineForOp op);
|
||||
bool visitOp(AffineIfOp op);
|
||||
|
||||
void setBlockSchedule(Block &block, unsigned opSchedule,
|
||||
OpScheduleMap &opScheduleMap);
|
||||
unsigned getBlockSchedule(Block &block, bool innerFlatten,
|
||||
OpScheduleMap &opScheduleMap);
|
||||
int32_t getPartitionIdx(AffineMap map, ArrayOp op);
|
||||
void getMemInfo(Block &block, MemInfo &info);
|
||||
|
||||
void getPipelineInfo(Block &block, PipelineInfo &info);
|
||||
|
||||
template <typename OpType> void getAccessNum(OpType op, ArrayOp arrayOp);
|
||||
unsigned getLoadStoreSchedule(Operation *op, ArrayOp arrayOp,
|
||||
MemPortList &memPortList, unsigned begin);
|
||||
unsigned getBlockSchedule(Block &block, MemInfo memInfo);
|
||||
|
||||
void estimateOperation(Operation *op);
|
||||
void estimateFunc(FuncOp func);
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#include "Analysis/QoREstimation.h"
|
||||
#include "Analysis/Passes.h"
|
||||
#include "Dialect/HLSCpp/HLSCpp.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace mlir;
|
||||
|
@ -16,14 +17,41 @@ using namespace hlscpp;
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
bool HLSCppAnalyzer::visitOp(AffineForOp op) {
|
||||
// If the current loop is annotated as unroll, all inner loops and itself are
|
||||
// automatically unrolled.
|
||||
if (getBoolAttrValue(op, "unroll")) {
|
||||
op.emitRemark("this loop and all inner loops are automatically unrolled.");
|
||||
op.walk([&](AffineForOp forOp) {
|
||||
if (forOp.getLoopBody().getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
loopUnrollFull(forOp);
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the current loop is annotated as pipeline, all inner loops are
|
||||
// automatically unrolled.
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
op.emitRemark("all inner loops are automatically unrolled.");
|
||||
op.walk([&](AffineForOp forOp) {
|
||||
if (forOp != op) {
|
||||
if (forOp.getLoopBody().getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
loopUnrollFull(forOp);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// We assume loop contains a single basic block.
|
||||
auto &body = op.getLoopBody();
|
||||
if (body.getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
|
||||
// Recursively analyze all childs.
|
||||
// Recursively analyze all inner loops.
|
||||
analyzeBlock(body.front());
|
||||
|
||||
// Set an attribute indicating trip count.
|
||||
// Set an attribute indicating the trip count. For now, we assume all loops
|
||||
// have static loop bound.
|
||||
if (!op.hasConstantLowerBound() || !op.hasConstantUpperBound())
|
||||
op.emitError("has variable upper or lower bound.");
|
||||
|
||||
|
@ -31,25 +59,24 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
|
|||
(op.getConstantUpperBound() - op.getConstantLowerBound()) / op.getStep();
|
||||
setAttrValue(op, "trip_count", tripCount);
|
||||
|
||||
// Set attributes indicating this loop is perfect or not.
|
||||
// Set attributes indicating this loop can be flatten or not.
|
||||
unsigned opNum = 0;
|
||||
unsigned childNum = 0;
|
||||
bool childPerfect = false;
|
||||
unsigned forNum = 0;
|
||||
bool innerFlatten = false;
|
||||
|
||||
for (auto &bodyOp : body.front()) {
|
||||
if (!isa<AffineYieldOp>(bodyOp))
|
||||
opNum += 1;
|
||||
if (auto child = dyn_cast<AffineForOp>(bodyOp)) {
|
||||
childNum += 1;
|
||||
childPerfect = getBoolAttrValue(child, "perfect");
|
||||
if (isa<AffineForOp>(bodyOp)) {
|
||||
forNum += 1;
|
||||
innerFlatten = getBoolAttrValue(&bodyOp, "flatten");
|
||||
}
|
||||
}
|
||||
|
||||
if (opNum == 1 && childNum == 1 && childPerfect)
|
||||
setAttrValue(op, "perfect", true);
|
||||
else if (childNum == 0)
|
||||
setAttrValue(op, "perfect", true);
|
||||
if (forNum == 0 || (opNum == 1 && innerFlatten))
|
||||
setAttrValue(op, "flatten", true);
|
||||
else
|
||||
setAttrValue(op, "perfect", false);
|
||||
setAttrValue(op, "flatten", false);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -105,205 +132,200 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
|
|||
llvm::outs() << latency << "\n";
|
||||
}
|
||||
|
||||
void HLSCppEstimator::setBlockSchedule(Block &block, unsigned opSchedule,
|
||||
OpScheduleMap &opScheduleMap) {
|
||||
/// Calculate the partition index according to the affine map of a memory access
|
||||
/// operation, and store the results as attribute.
|
||||
int32_t HLSCppEstimator::getPartitionIdx(AffineMap map, ArrayOp op) {
|
||||
int32_t partitionIdx = 0;
|
||||
unsigned accumFactor = 1;
|
||||
unsigned dim = 0;
|
||||
for (auto expr : map.getResults()) {
|
||||
auto idxExpr = getConstExpr(0);
|
||||
unsigned factor = 1;
|
||||
if (op.partition()) {
|
||||
auto type = getPartitionType(op, dim);
|
||||
factor = getPartitionFactor(op, dim);
|
||||
|
||||
if (type == "cyclic")
|
||||
idxExpr = expr % getConstExpr(factor);
|
||||
else if (type == "block") {
|
||||
auto size = op.getType().cast<ShapedType>().getShape()[dim];
|
||||
idxExpr = expr.floorDiv(getConstExpr((size + factor - 1) / factor));
|
||||
}
|
||||
}
|
||||
if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>()) {
|
||||
if (dim == 0)
|
||||
partitionIdx = constExpr.getValue();
|
||||
else
|
||||
partitionIdx += constExpr.getValue() * accumFactor;
|
||||
} else {
|
||||
partitionIdx = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
accumFactor *= factor;
|
||||
dim += 1;
|
||||
}
|
||||
return partitionIdx;
|
||||
}
|
||||
|
||||
/// Collect the affine load and store operations located directly in `block`
/// into `info`, grouped by the ArrayOp defining the accessed memref, and
/// annotate each access with a "partition_index" attribute.
///
/// Accesses whose memref is not defined by an ArrayOp are reported with an
/// error and skipped. Nested regions are not visited.
void HLSCppEstimator::getMemInfo(Block &block, MemInfo &info) {
  for (auto &op : block) {
    if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
      // getDefiningOp() is null for block arguments, hence dyn_cast_or_null.
      auto arrayOp =
          dyn_cast_or_null<ArrayOp>(loadOp.getMemRef().getDefiningOp());
      if (!arrayOp) {
        loadOp.emitError(
            "is accessing an array that is not defined by ArrayOp.");
        continue;
      }
      info.memLoadDict[arrayOp].push_back(loadOp);
      setAttrValue(loadOp, "partition_index",
                   getPartitionIdx(loadOp.getAffineMap(), arrayOp));

    } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
      auto arrayOp =
          dyn_cast_or_null<ArrayOp>(storeOp.getMemRef().getDefiningOp());
      if (!arrayOp) {
        storeOp.emitError(
            "is accessing an array that is not defined by ArrayOp.");
        continue;
      }
      // Stores belong in the store dictionary, not the load dictionary.
      info.memStoreDict[arrayOp].push_back(storeOp);
      setAttrValue(storeOp, "partition_index",
                   getPartitionIdx(storeOp.getAffineMap(), arrayOp));
    }
  }
}
|
||||
|
||||
unsigned HLSCppEstimator::getBlockSchedule(Block &block, bool innerUnroll,
|
||||
OpScheduleMap &opScheduleMap) {
|
||||
unsigned blockSchedule = 0;
|
||||
/// Find the earliest stage, not earlier than `begin`, in which the memory
/// access `op` on `arrayOp` can be issued without exceeding the number of
/// ports of the accessed partition, reserve the port in `memPortList`, and
/// return that stage.
///
/// The partition is read from the "partition_index" attribute of `op`. When
/// it is unknown (-1 or outside the partition range) the access is treated
/// conservatively as occupying every port of every partition, so it is only
/// placed in a stage where the array has not been touched yet. `memPortList`
/// is extended with new stages as needed, which guarantees termination.
unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, ArrayOp arrayOp,
                                               MemPortList &memPortList,
                                               unsigned begin) {
  auto partitionIdx = getIntAttrValue(op, "partition_index");
  // Stores draw from the write-only ports, every other access from the
  // read-only ports; both may fall back to the read/write ports.
  bool isStore = isa<AffineStoreOp>(op);

  while (true) {
    // Make sure the candidate stage exists before looking it up.
    while (begin >= memPortList.size())
      memPortList.push_back(MemPort());

    // Edited in place. The reference is re-fetched on every iteration because
    // growing memPortList may relocate the stored maps.
    auto &partitionPortNum = memPortList[begin][arrayOp];
    bool memEmpty = partitionPortNum.empty();

    // If the memory has not been occupied in this stage yet, initialize one
    // PortNum per partition according to the storage type.
    if (memEmpty) {
      unsigned factor = 1;
      if (getBoolAttrValue(arrayOp, "partition")) {
        unsigned rank = arrayOp.getType().cast<ShapedType>().getRank();
        for (unsigned dim = 0; dim < rank; ++dim) {
          // A zero factor would leave the array without any partition.
          unsigned dimFactor = getPartitionFactor(arrayOp, dim);
          factor *= dimFactor == 0 ? 1 : dimFactor;
        }
      }

      unsigned rdPort = 0;
      unsigned wrPort = 0;
      unsigned rdwrPort = 0;
      auto storageType = getStrAttrValue(arrayOp, "storage_type");
      if (storageType == "ram_s2p")
        rdPort = 1, wrPort = 1;
      else if (storageType == "ram_2p" || storageType == "ram_t2p")
        rdwrPort = 2;
      else if (storageType == "ram_1p")
        rdwrPort = 1;
      else {
        rdwrPort = 2;
        arrayOp.emitError("unsupported storage type.");
      }
      partitionPortNum.assign(factor, PortNum(rdPort, wrPort, rdwrPort));
    }

    bool idxKnown =
        partitionIdx >= 0 &&
        partitionIdx < static_cast<int64_t>(partitionPortNum.size());

    // TODO: When the partition index can't be determined, this operation is
    // considered to occupy all ports of all partitions.
    if (!idxKnown) {
      if (memEmpty) {
        for (auto &portNum : partitionPortNum)
          portNum = PortNum();
        break;
      }
      // The array is already in use in this stage; try the next one.
      ++begin;
      continue;
    }

    // Check whether the current stage meets the memory port limitation. If
    // not, move on to the next stage.
    auto &portNum = partitionPortNum[partitionIdx];
    unsigned &dedicatedPort = isStore ? portNum.wrPort : portNum.rdPort;
    if (dedicatedPort > 0) {
      dedicatedPort -= 1;
      break;
    }
    if (portNum.rdwrPort > 0) {
      portNum.rdwrPort -= 1;
      break;
    }
    ++begin;
  }
  return begin;
}
|
||||
|
||||
/// Schedule every operation located directly in `block` and return the stage
/// at which the last operation finishes.
///
/// Each operation starts no earlier than the "schedule_end" of the operations
/// defining its operands. Affine loads/stores are additionally delayed until
/// a memory port of the accessed array is free; a nested affine.for occupies
/// its "latency" attribute plus two stages; everything else takes one stage.
/// The result is recorded on each operation as the "schedule_begin" and
/// "schedule_end" attributes. `memInfo` is currently not consulted.
unsigned HLSCppEstimator::getBlockSchedule(Block &block, MemInfo memInfo) {
  unsigned blockEnd = 0;
  MemPortList memPortList;

  for (auto &op : block) {
    // Find the latest predecessor of the current operation. Its end is the
    // earliest stage in which the current operation can be scheduled. Block
    // arguments have no defining operation and impose no constraint.
    unsigned begin = 0;
    unsigned end = 0;
    for (auto operand : op.getOperands()) {
      if (auto *defOp = operand.getDefiningOp())
        begin = max(begin, getUIntAttrValue(defOp, "schedule_end"));
    }

    // Insert new pipeline stages.
    while (begin >= memPortList.size()) {
      MemPort memPort;
      memPortList.push_back(memPort);
    }

    // Memory accesses must respect the port limitation of the target array.
    Value memref;
    if (auto loadOp = dyn_cast<AffineLoadOp>(op))
      memref = loadOp.getMemRef();
    else if (auto storeOp = dyn_cast<AffineStoreOp>(op))
      memref = storeOp.getMemRef();

    if (memref) {
      // getDefiningOp() is null for block arguments, hence dyn_cast_or_null.
      if (auto arrayOp = dyn_cast_or_null<ArrayOp>(memref.getDefiningOp()))
        begin = getLoadStoreSchedule(&op, arrayOp, memPortList, begin);
      else
        op.emitError("is accessing an array that is not defined by ArrayOp.");
      end = begin + 1;
    }
    // Handle loop operations.
    else if (auto forOp = dyn_cast<AffineForOp>(op)) {
      // Child loop is considered as a large node, and two extra clock cycles
      // will be required to enter and exit the child loop.
      end = begin + getUIntAttrValue(forOp, "latency") + 2;
    }
    // Default case. All normal expressions and operations are handled here.
    else {
      // TODO: For now, we assume all operations take one clock cycle to
      // execute, should support to accept profiling data.
      end = begin + 1;
    }

    setAttrValue(&op, "schedule_begin", begin);
    setAttrValue(&op, "schedule_end", end);
    blockEnd = max(blockEnd, end);
  }
  return blockEnd;
}
|
||||
|
||||
bool HLSCppEstimator::visitOp(AffineForOp op) {
|
||||
|
@ -311,95 +333,58 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
|
|||
if (body.getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
|
||||
// If loop is unrolled, all inner loops will be unrolled accordingly.
|
||||
if (getBoolAttrValue(op, "unroll")) {
|
||||
setAttrValue(op, "pipeline", false);
|
||||
setAttrValue(op, "flatten", false);
|
||||
op.emitRemark("all inner loops are automatically unrolled.");
|
||||
|
||||
OpScheduleMap opScheduleMap;
|
||||
auto latency =
|
||||
getBlockSchedule(body.front(), /*innerUnroll=*/true, opScheduleMap);
|
||||
setAttrValue(op, "latency", latency);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If loop is pipelined, the pipelined loop will be estimated as a whole since
|
||||
// all loops inside of a pipeline will be automatically fully unrolled.
|
||||
// If the current loop is annotated as pipeline, extra dependency and II
|
||||
// analysis will be executed.
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
setAttrValue(op, "flatten", true);
|
||||
op.emitRemark("all inner loops are automatically unrolled.");
|
||||
MemInfo memInfo;
|
||||
getMemInfo(body.front(), memInfo);
|
||||
|
||||
// Calculate latency of each iteration.
|
||||
PipelineInfo pipelineInfo(/*baseII=*/1);
|
||||
auto iterLatency = getBlockSchedule(body.front(), /*innerUnroll=*/true,
|
||||
pipelineInfo.opScheduleMap);
|
||||
auto iterLatency = getBlockSchedule(body.front(), memInfo);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
|
||||
// For now we make a simple assumption that II is equal to 1.
|
||||
auto tripCount = getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "flatten_trip_count", tripCount);
|
||||
|
||||
// Collect pipeline information including II and memory access information.
|
||||
getPipelineInfo(body.front(), pipelineInfo);
|
||||
|
||||
// Calculate latency and II considering memory ports violations.
|
||||
for (auto &memLoad : pipelineInfo.memLoadDict) {
|
||||
auto arrayOp = dyn_cast<ArrayOp>(memLoad.first.getDefiningOp());
|
||||
if (!arrayOp)
|
||||
op.emitError("is accessing an array that is not defined by ArrayOp.");
|
||||
|
||||
for (auto loadOp : memLoad.second) {
|
||||
getAccessNum<AffineLoadOp>(cast<AffineLoadOp>(loadOp), arrayOp);
|
||||
}
|
||||
}
|
||||
|
||||
setAttrValue(op, "init_interval", pipelineInfo.II);
|
||||
setAttrValue(op, "latency",
|
||||
iterLatency + pipelineInfo.II * (tripCount - 1));
|
||||
setAttrValue(op, "init_interval", (unsigned)1);
|
||||
setAttrValue(op, "latency", iterLatency + 1 * (tripCount - 1));
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the loop is not pipelined or unrolled, the estimation is different and
|
||||
// requires to recursively enter each child loop for estimating the overall
|
||||
// latency of the current loop.
|
||||
// Recursively estimate all inner loops.
|
||||
estimateBlock(body.front());
|
||||
|
||||
// This simply means the current loop can be flattened into the child loop
|
||||
// pipeline. This will increase the flattened loop trip count without
|
||||
// changing the iteration latency. Note that this will be propogated above
|
||||
// until meeting an imperfect loop.
|
||||
if (getBoolAttrValue(op, "perfect")) {
|
||||
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
|
||||
if (getBoolAttrValue(child, "flatten")) {
|
||||
setAttrValue(op, "flatten", true);
|
||||
op.emitRemark("this loop is flattened into its child loop.");
|
||||
if (getBoolAttrValue(op, "flatten")) {
|
||||
auto child = cast<AffineForOp>(op.getLoopBody().front().front());
|
||||
op.emitRemark("this loop is flattened into its inner loop.");
|
||||
|
||||
auto II = getUIntAttrValue(child, "init_interval");
|
||||
auto iterLatency = getUIntAttrValue(child, "iter_latency");
|
||||
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
|
||||
getUIntAttrValue(op, "trip_count");
|
||||
auto II = getUIntAttrValue(child, "init_interval");
|
||||
auto iterLatency = getUIntAttrValue(child, "iter_latency");
|
||||
auto flattenTripCount = getUIntAttrValue(child, "flatten_trip_count") *
|
||||
getUIntAttrValue(op, "trip_count");
|
||||
|
||||
setAttrValue(op, "init_interval", II);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
setAttrValue(op, "flatten_trip_count", flattenTripCount);
|
||||
setAttrValue(op, "init_interval", II);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
setAttrValue(op, "flatten_trip_count", flattenTripCount);
|
||||
|
||||
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
setAttrValue(op, "latency", iterLatency + II * (flattenTripCount - 1));
|
||||
}
|
||||
// Default case, aka !pipeline && !flatten.
|
||||
else {
|
||||
MemInfo memInfo;
|
||||
getMemInfo(body.front(), memInfo);
|
||||
|
||||
// Default case, aka !unroll && !pipeline && !(perfect && child.flatten).
|
||||
setAttrValue(op, "flatten", false);
|
||||
auto iterLatency = getBlockSchedule(body.front(), memInfo);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
|
||||
OpScheduleMap opScheduleMap;
|
||||
auto iterLatency =
|
||||
getBlockSchedule(body.front(), /*innerUnroll=*/false, opScheduleMap);
|
||||
setAttrValue(op, "iter_latency", iterLatency);
|
||||
|
||||
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "latency", latency);
|
||||
unsigned latency = iterLatency * getUIntAttrValue(op, "trip_count");
|
||||
setAttrValue(op, "latency", latency);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -419,9 +404,10 @@ void HLSCppEstimator::estimateFunc(FuncOp func) {
|
|||
|
||||
estimateBlock(func.front());
|
||||
|
||||
OpScheduleMap opScheduleMap;
|
||||
auto latency =
|
||||
getBlockSchedule(func.front(), /*innerUnroll=*/false, opScheduleMap);
|
||||
MemInfo memInfo;
|
||||
getMemInfo(func.front(), memInfo);
|
||||
|
||||
auto latency = getBlockSchedule(func.front(), memInfo);
|
||||
setAttrValue(func, "latency", latency);
|
||||
}
|
||||
|
||||
|
@ -447,6 +433,16 @@ struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
|
|||
HLSCppAnalyzer analyzer(builder);
|
||||
analyzer.analyzeModule(getOperation());
|
||||
|
||||
// Canonicalize the analyzed IR.
|
||||
OwningRewritePatternList patterns;
|
||||
|
||||
auto *context = &getContext();
|
||||
for (auto *op : context->getRegisteredOperations())
|
||||
op->getCanonicalizationPatterns(patterns, context);
|
||||
|
||||
Operation *op = getOperation();
|
||||
applyPatternsAndFoldGreedily(op->getRegions(), patterns);
|
||||
|
||||
// Estimate performance and resource utilization.
|
||||
HLSCppEstimator estimator(builder, targetSpec, opLatency);
|
||||
estimator.estimateModule(getOperation());
|
||||
|
|
|
@ -2,14 +2,12 @@
|
|||
|
||||
// CHECK-LABEL: func @test_for
|
||||
func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
|
||||
%array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition = false} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
//"hlscpp.array_pragma" (%arg0) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
|
||||
//"hlscpp.array_pragma" (%arg1) {partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[4 : ui32, 2 : ui32, 4 : ui32], storage_type="ram_2p", interface=true, interface_mode="bram"} : (memref<16x4x4xindex>) -> ()
|
||||
%array0 = "hlscpp.array"(%arg0) {interface = false, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array1 = "hlscpp.array"(%arg1) {interface = false, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_1p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
affine.for %i = 0 to 16 {
|
||||
affine.for %j = 0 to 4 {
|
||||
affine.for %k = 0 to 4 {
|
||||
%0 = affine.load %array0[%i, %j, %i + %k] : memref<16x4x4xindex>
|
||||
affine.for %k = 0 to 4{
|
||||
%0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
|
||||
%1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
|
||||
%2 = muli %0, %1 : index
|
||||
affine.store %2, %array1[%i, %j, %k] : memref<16x4x4xindex>
|
||||
|
|
Loading…
Reference in New Issue