[LoopPipelining] move automatic flattening to this pass; [QoREstimation] support AffineIfOp estimation, known issue: dependencies between load/store operations should be considered in scheduling
This commit is contained in:
parent
93b5a2641e
commit
fac1498067
|
@ -134,18 +134,21 @@ public:
|
||||||
|
|
||||||
using HLSCppVisitorBase::visitOp;
|
using HLSCppVisitorBase::visitOp;
|
||||||
bool visitOp(AffineForOp op);
|
bool visitOp(AffineForOp op);
|
||||||
|
bool visitOp(AffineIfOp op);
|
||||||
|
bool visitOp(ArrayOp op);
|
||||||
|
|
||||||
void getBlockMemInfo(Block &block, LoadStoreDict &info);
|
void getBlockMemInfo(Block &block, LoadStoreDict &info);
|
||||||
|
|
||||||
unsigned getLoadStoreSchedule(Operation *op, unsigned begin,
|
unsigned getLoadStoreSchedule(Operation *op, unsigned begin,
|
||||||
MemPortDicts &dicts);
|
MemPortDicts &dicts);
|
||||||
|
void updateChildBlockSchedule(Block &block, unsigned begin);
|
||||||
unsigned getBlockSchedule(Block &block);
|
unsigned getBlockSchedule(Block &block);
|
||||||
|
|
||||||
unsigned getResMinII(AffineForOp forOp, LoadStoreDict dict);
|
unsigned getResMinII(AffineForOp forOp, LoadStoreDict dict);
|
||||||
unsigned getDepMinII(AffineForOp forOp, LoadStoreDict dict);
|
unsigned getDepMinII(AffineForOp forOp, LoadStoreDict dict);
|
||||||
|
|
||||||
void estimateFunc(FuncOp func);
|
bool estimateFunc(FuncOp func);
|
||||||
void estimateBlock(Block &block);
|
bool estimateBlock(Block &block);
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace scalehls
|
} // namespace scalehls
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#include "Dialect/HLSCpp/HLSCpp.h"
|
#include "Dialect/HLSCpp/HLSCpp.h"
|
||||||
#include "mlir/Analysis/AffineAnalysis.h"
|
#include "mlir/Analysis/AffineAnalysis.h"
|
||||||
#include "mlir/Analysis/AffineStructures.h"
|
#include "mlir/Analysis/AffineStructures.h"
|
||||||
|
#include "mlir/Analysis/Liveness.h"
|
||||||
#include "mlir/Analysis/LoopAnalysis.h"
|
#include "mlir/Analysis/LoopAnalysis.h"
|
||||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||||
#include "mlir/IR/Operation.h"
|
#include "mlir/IR/Operation.h"
|
||||||
|
@ -124,7 +125,7 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
|
||||||
rdwrPort = 1;
|
rdwrPort = 1;
|
||||||
else {
|
else {
|
||||||
rdwrPort = 2;
|
rdwrPort = 2;
|
||||||
arrayOp.emitError("unsupported storage type.");
|
// arrayOp.emitError("unsupported storage type.");
|
||||||
}
|
}
|
||||||
PortInfo portInfo(rdPort, wrPort, rdwrPort);
|
PortInfo portInfo(rdPort, wrPort, rdwrPort);
|
||||||
memPort.push_back(portInfo);
|
memPort.push_back(portInfo);
|
||||||
|
@ -175,7 +176,30 @@ unsigned HLSCppEstimator::getLoadStoreSchedule(Operation *op, unsigned begin,
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate scheduling information of the block.
|
void HLSCppEstimator::updateChildBlockSchedule(Block &block, unsigned begin) {
|
||||||
|
for (auto &op : block) {
|
||||||
|
unsigned newBegin = begin;
|
||||||
|
unsigned newEnd = begin;
|
||||||
|
|
||||||
|
// Update the schedule of all operations in the child block.
|
||||||
|
if (getUIntAttrValue(&op, "schedule_end")) {
|
||||||
|
newBegin += getUIntAttrValue(&op, "schedule_begin");
|
||||||
|
newEnd += getUIntAttrValue(&op, "schedule_end");
|
||||||
|
setAttrValue(&op, "schedule_begin", newBegin);
|
||||||
|
setAttrValue(&op, "schedule_end", newEnd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recursively apply to all child blocks.
|
||||||
|
if (op.getNumRegions()) {
|
||||||
|
for (auto &region : op.getRegions()) {
|
||||||
|
for (auto &block : region.getBlocks())
|
||||||
|
updateChildBlockSchedule(block, begin);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Schedule the block with ASAP algorithm.
|
||||||
unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
|
unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
|
||||||
unsigned blockEnd = 0;
|
unsigned blockEnd = 0;
|
||||||
MemPortDicts dicts;
|
MemPortDicts dicts;
|
||||||
|
@ -191,24 +215,60 @@ unsigned HLSCppEstimator::getBlockSchedule(Block &block) {
|
||||||
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
|
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert new pipeline stages to the memory port dicts.
|
|
||||||
while (begin >= dicts.size()) {
|
|
||||||
MemPortDict memPortDict;
|
|
||||||
dicts.push_back(memPortDict);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle load/store operations, ensure the current schedule meets memory
|
|
||||||
// port limitation.
|
|
||||||
if (isa<mlir::AffineReadOpInterface, mlir::AffineWriteOpInterface>(op)) {
|
|
||||||
begin = getLoadStoreSchedule(&op, begin, dicts);
|
|
||||||
end = begin + 1;
|
|
||||||
}
|
|
||||||
// Handle loop operations.
|
// Handle loop operations.
|
||||||
else if (auto forOp = dyn_cast<AffineForOp>(op)) {
|
if (auto forOp = dyn_cast<AffineForOp>(op)) {
|
||||||
|
// Live ins of the for loop body will also impact the schedule begin.
|
||||||
|
Liveness liveness(block.getParentOp());
|
||||||
|
for (auto liveIn : liveness.getLiveIn(&forOp.getLoopBody().front())) {
|
||||||
|
if (auto defOp = liveIn.getDefiningOp())
|
||||||
|
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update the schedule of all operations in the loop body.
|
||||||
|
updateChildBlockSchedule(forOp.getLoopBody().front(), begin);
|
||||||
|
|
||||||
// Child loop is considered as a large node, and two extra clock cycles
|
// Child loop is considered as a large node, and two extra clock cycles
|
||||||
// will be required to enter and exit the child loop.
|
// will be required to enter and exit the child loop.
|
||||||
end = begin + getUIntAttrValue(forOp, "latency") + 2;
|
end = begin + getUIntAttrValue(forOp, "latency") + 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle if operations.
|
||||||
|
else if (auto ifOp = dyn_cast<AffineIfOp>(op)) {
|
||||||
|
// Live ins of the if body will also impact the schedule begin.
|
||||||
|
Liveness liveness(block.getParentOp());
|
||||||
|
for (auto liveIn : liveness.getLiveIn(ifOp.getThenBlock())) {
|
||||||
|
if (auto defOp = liveIn.getDefiningOp())
|
||||||
|
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ifOp.hasElse()) {
|
||||||
|
for (auto liveIn : liveness.getLiveIn(ifOp.getElseBlock())) {
|
||||||
|
if (auto defOp = liveIn.getDefiningOp())
|
||||||
|
begin = max(getUIntAttrValue(defOp, "schedule_end"), begin);
|
||||||
|
}
|
||||||
|
// Update the schedule of all operations in the else block.
|
||||||
|
updateChildBlockSchedule(*ifOp.getElseBlock(), begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update the schedule of all operations in the then block.
|
||||||
|
updateChildBlockSchedule(*ifOp.getThenBlock(), begin);
|
||||||
|
|
||||||
|
end = begin + getUIntAttrValue(ifOp, "latency");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle load/store operations.
|
||||||
|
else if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
|
||||||
|
// Insert new schedule level to the memory port dicts.
|
||||||
|
while (begin >= dicts.size()) {
|
||||||
|
MemPortDict memPortDict;
|
||||||
|
dicts.push_back(memPortDict);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the current schedule meets memory port limitation.
|
||||||
|
begin = getLoadStoreSchedule(&op, begin, dicts);
|
||||||
|
end = begin + 1;
|
||||||
|
}
|
||||||
|
|
||||||
// Default case. All normal expressions and operations will be handled by
|
// Default case. All normal expressions and operations will be handled by
|
||||||
// this branch.
|
// this branch.
|
||||||
else {
|
else {
|
||||||
|
@ -253,6 +313,10 @@ unsigned HLSCppEstimator::getResMinII(AffineForOp forOp, LoadStoreDict dict) {
|
||||||
accessNum = 2;
|
accessNum = 2;
|
||||||
else if (storageType == "ram_1p")
|
else if (storageType == "ram_1p")
|
||||||
accessNum = 1;
|
accessNum = 1;
|
||||||
|
else {
|
||||||
|
accessNum = 2;
|
||||||
|
// arrayOp.emitError("unsupported storage type.");
|
||||||
|
}
|
||||||
|
|
||||||
// The rationale here is an undetermined partition access will
|
// The rationale here is an undetermined partition access will
|
||||||
// introduce a large mux which will avoid Vivado HLS to process any
|
// introduce a large mux which will avoid Vivado HLS to process any
|
||||||
|
@ -364,8 +428,24 @@ unsigned HLSCppEstimator::getDepMinII(AffineForOp forOp, LoadStoreDict dict) {
|
||||||
|
|
||||||
bool HLSCppEstimator::visitOp(AffineForOp op) {
|
bool HLSCppEstimator::visitOp(AffineForOp op) {
|
||||||
auto &body = op.getLoopBody();
|
auto &body = op.getLoopBody();
|
||||||
if (body.getBlocks().size() != 1)
|
if (body.getBlocks().size() != 1) {
|
||||||
op.emitError("has zero or more than one basic blocks.");
|
op.emitError("has zero or more than one basic blocks.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recursively estimate all contained operations.
|
||||||
|
if (!estimateBlock(body.front()))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Set an attribute indicating the trip count. For now, we assume all
|
||||||
|
// loops have static loop bound.
|
||||||
|
if (auto tripCount = getConstantTripCount(op))
|
||||||
|
setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
|
||||||
|
else {
|
||||||
|
setAttrValue(op, "trip_count", (unsigned)0);
|
||||||
|
op.emitError("has undetermined trip count");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// If the current loop is annotated as pipeline, extra dependency and II
|
// If the current loop is annotated as pipeline, extra dependency and II
|
||||||
// analysis will be executed.
|
// analysis will be executed.
|
||||||
|
@ -388,13 +468,10 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Recursively estimate all inner loops.
|
// This means the current loop can be flattened into the child loop. If the
|
||||||
estimateBlock(body.front());
|
// child loop is pipelined, this will increase the flattened loop trip count
|
||||||
|
// without changing the iteration latency. Note that this will be propagated
|
||||||
// This simply means the current loop can be flattened into the child loop
|
// above until meeting an imperfect loop.
|
||||||
// pipeline. This will increase the flattened loop trip count without
|
|
||||||
// changing the iteration latency. Note that this will be propagated above
|
|
||||||
// until meeting an imperfect loop.
|
|
||||||
if (getBoolAttrValue(op, "flatten")) {
|
if (getBoolAttrValue(op, "flatten")) {
|
||||||
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
|
if (auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front())) {
|
||||||
// This means the inner loop is pipelined, because otherwise II will be
|
// This means the inner loop is pipelined, because otherwise II will be
|
||||||
|
@ -434,71 +511,70 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HLSCppEstimator::estimateBlock(Block &block) {
|
bool HLSCppEstimator::visitOp(AffineIfOp op) {
|
||||||
|
auto thenBlock = op.getThenBlock();
|
||||||
|
if (!estimateBlock(*thenBlock))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
LoadStoreDict dict;
|
||||||
|
getBlockMemInfo(*thenBlock, dict);
|
||||||
|
auto latency = getBlockSchedule(*thenBlock);
|
||||||
|
|
||||||
|
// Handle else block if required.
|
||||||
|
if (op.hasElse()) {
|
||||||
|
auto elseBlock = op.getElseBlock();
|
||||||
|
if (!estimateBlock(*elseBlock))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
getBlockMemInfo(*elseBlock, dict);
|
||||||
|
latency = max(latency, getBlockSchedule(*elseBlock));
|
||||||
|
}
|
||||||
|
|
||||||
|
setAttrValue(op, "latency", latency);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HLSCppEstimator::visitOp(ArrayOp op) {
|
||||||
|
unsigned partitionNum = 1;
|
||||||
|
if (op.partition()) {
|
||||||
|
auto rank = op.getType().cast<ShapedType>().getRank();
|
||||||
|
for (unsigned i = 0; i < rank; ++i) {
|
||||||
|
if (auto factor = getPartitionFactor(op, i))
|
||||||
|
partitionNum *= factor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setAttrValue(op, "partition_num", partitionNum);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HLSCppEstimator::estimateBlock(Block &block) {
|
||||||
for (auto &op : block) {
|
for (auto &op : block) {
|
||||||
if (dispatchVisitor(&op))
|
if (dispatchVisitor(&op))
|
||||||
continue;
|
continue;
|
||||||
op.emitError("can't be correctly estimated.");
|
else {
|
||||||
|
op.emitError("can't be correctly estimated.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HLSCppEstimator::estimateFunc(FuncOp func) {
|
bool HLSCppEstimator::estimateFunc(FuncOp func) {
|
||||||
if (func.getBlocks().size() != 1)
|
if (func.getBlocks().size() != 1) {
|
||||||
func.emitError("has zero or more than one basic blocks.");
|
func.emitError("has zero or more than one basic blocks.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// Extract all static parameters and current pragma configurations.
|
// Recursively estimate all contained operations.
|
||||||
func.walk([&](ArrayOp op) {
|
if (!estimateBlock(func.front()))
|
||||||
unsigned factor = 1;
|
return false;
|
||||||
if (getBoolAttrValue(op, "partition")) {
|
|
||||||
for (unsigned i = 0, e = op.getType().cast<ShapedType>().getRank(); i < e;
|
|
||||||
++i)
|
|
||||||
factor *= getPartitionFactor(op, i);
|
|
||||||
}
|
|
||||||
setAttrValue(op, "partition_num", factor);
|
|
||||||
});
|
|
||||||
|
|
||||||
func.walk([&](AffineForOp op) {
|
|
||||||
// We assume loop contains a single basic block.
|
|
||||||
auto &body = op.getLoopBody();
|
|
||||||
if (body.getBlocks().size() != 1)
|
|
||||||
op.emitError("has zero or more than one basic blocks.");
|
|
||||||
|
|
||||||
// Set an attribute indicating the trip count. For now, we assume all
|
|
||||||
// loops have static loop bound.
|
|
||||||
if (auto tripCount = getConstantTripCount(op))
|
|
||||||
setAttrValue(op, "trip_count", (unsigned)tripCount.getValue());
|
|
||||||
else {
|
|
||||||
setAttrValue(op, "trip_count", (unsigned)0);
|
|
||||||
op.emitError("has variable trip count");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set attributes indicating this loop can be flatten or not.
|
|
||||||
unsigned opNum = 0;
|
|
||||||
unsigned forNum = 0;
|
|
||||||
bool innerFlatten = false;
|
|
||||||
|
|
||||||
for (auto &bodyOp : body.front()) {
|
|
||||||
if (!isa<AffineYieldOp>(bodyOp))
|
|
||||||
opNum++;
|
|
||||||
if (isa<AffineForOp>(bodyOp)) {
|
|
||||||
forNum++;
|
|
||||||
innerFlatten = getBoolAttrValue(&bodyOp, "flatten");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (forNum == 0 || (opNum == 1 && innerFlatten))
|
|
||||||
setAttrValue(op, "flatten", true);
|
|
||||||
else
|
|
||||||
setAttrValue(op, "flatten", false);
|
|
||||||
});
|
|
||||||
|
|
||||||
estimateBlock(func.front());
|
|
||||||
|
|
||||||
LoadStoreDict dict;
|
LoadStoreDict dict;
|
||||||
getBlockMemInfo(func.front(), dict);
|
getBlockMemInfo(func.front(), dict);
|
||||||
|
|
||||||
auto latency = getBlockSchedule(func.front());
|
auto latency = getBlockSchedule(func.front());
|
||||||
setAttrValue(func, "latency", latency);
|
setAttrValue(func, "latency", latency);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
|
@ -34,11 +34,33 @@ void LoopPipelining::runOnOperation() {
|
||||||
|
|
||||||
targetLoop.setAttr("pipeline", builder.getBoolAttr(true));
|
targetLoop.setAttr("pipeline", builder.getBoolAttr(true));
|
||||||
|
|
||||||
// All intter loops of the pipelined loop are automatically unrolled.
|
// All inner loops of the pipelined loop are automatically unrolled.
|
||||||
targetLoop.walk([&](mlir::AffineForOp loop) {
|
targetLoop.walk([&](mlir::AffineForOp loop) {
|
||||||
if (loop != targetLoop)
|
if (loop != targetLoop)
|
||||||
loopUnrollFull(loop);
|
loopUnrollFull(loop);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// All outer loops that perfect nest the pipelined loop can be flattened.
|
||||||
|
forOp.walk([&](mlir::AffineForOp loop) {
|
||||||
|
unsigned opNum = 0;
|
||||||
|
unsigned forNum = 0;
|
||||||
|
bool innerFlatten = false;
|
||||||
|
|
||||||
|
for (auto &bodyOp : loop.getLoopBody().front()) {
|
||||||
|
if (!isa<AffineYieldOp>(bodyOp))
|
||||||
|
opNum++;
|
||||||
|
if (isa<AffineForOp>(bodyOp)) {
|
||||||
|
forNum++;
|
||||||
|
if (auto flatten = bodyOp.getAttrOfType<BoolAttr>("flatten"))
|
||||||
|
innerFlatten = flatten.getValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (forNum == 0 || (opNum == 1 && innerFlatten))
|
||||||
|
loop.setAttr("flatten", builder.getBoolAttr(true));
|
||||||
|
else
|
||||||
|
loop.setAttr("flatten", builder.getBoolAttr(false));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Canonicalize the IR after loop unrolling.
|
// Canonicalize the IR after loop unrolling.
|
||||||
|
|
|
@ -9,8 +9,8 @@ func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attribu
|
||||||
affine.for %k = 0 to 4 {
|
affine.for %k = 0 to 4 {
|
||||||
%0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
|
%0 = affine.load %array0[%i, %j, %k] : memref<16x4x4xindex>
|
||||||
%1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
|
%1 = affine.load %array1[%i, %j, %k] : memref<16x4x4xindex>
|
||||||
%2 = addi %1, %1 : index
|
%2 = addi %0, %1 : index
|
||||||
affine.store %2, %array1[%i, %j+1, %k] : memref<16x4x4xindex>
|
affine.store %2, %array1[%i, %j, %k+1] : memref<16x4x4xindex>
|
||||||
} {pipeline = false, unroll = false, flatten = false}
|
} {pipeline = false, unroll = false, flatten = false}
|
||||||
} {pipeline = false, unroll = false, flatten = false}
|
} {pipeline = false, unroll = false, flatten = false}
|
||||||
} {pipeline = false, unroll = false, flatten = false}
|
} {pipeline = false, unroll = false, flatten = false}
|
||||||
|
|
Loading…
Reference in New Issue