[FuncPipelining] comprehensively support function pipelining in emitter, estimator, and array partition pass

Hanchen Ye 2021-01-10 18:43:40 -06:00
parent 5bf4d7943e
commit 9cc5f3abdc
8 changed files with 99 additions and 19 deletions

View File

@@ -84,6 +84,7 @@ public:
   /// AffineForOp related methods.
   // unsigned getOpMinII(AffineForOp forOp);
   int64_t getResMinII(MemAccessesMap &map);
+  int64_t getDepMinII(FuncOp func, MemAccessesMap &map);
   int64_t getDepMinII(AffineForOp forOp, MemAccessesMap &map);
   bool visitOp(AffineForOp op, int64_t begin);
@@ -98,6 +99,7 @@ public:
     return true;                                                              \
   }
   HANDLE(AddFOp, "fadd");
+  HANDLE(SubFOp, "fadd");
   HANDLE(MulFOp, "fmul");
   HANDLE(DivFOp, "fdiv");
   HANDLE(CmpFOp, "fcmp");
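The new SubFOp entry reuses the "fadd" resource string because floating-point subtraction runs on the same adder unit as addition. As a rough standalone sketch of the op-to-unit table these HANDLE entries encode (plain C++ with illustrative names, not the real MLIR visitor):

#include <iostream>
#include <map>
#include <string>

// Sketch only: map each op kind to the floating-point unit that implements
// it. SubFOp deliberately shares the "fadd" unit with AddFOp.
#define HANDLE(OPTYPE, UNIT) {#OPTYPE, UNIT}

int main() {
  std::map<std::string, std::string> fpUnit = {
      HANDLE(AddFOp, "fadd"), HANDLE(SubFOp, "fadd"), HANDLE(MulFOp, "fmul"),
      HANDLE(DivFOp, "fdiv"), HANDLE(CmpFOp, "fcmp")};
  for (const auto &entry : fpUnit)
    std::cout << entry.first << " -> " << entry.second << "\n";
  return 0;
}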

View File

@@ -322,7 +322,39 @@ int64_t HLSCppEstimator::getResMinII(MemAccessesMap &map) {
   return II;
 }
 
-/// Calculate the minimum dependency II.
+/// Calculate the minimum dependency II of the function.
+int64_t HLSCppEstimator::getDepMinII(FuncOp func, MemAccessesMap &map) {
+  int64_t II = 1;
+
+  for (auto &pair : map) {
+    auto loadStores = pair.second;
+
+    // Walk through each pair of source and destination. Note that here dstOp
+    // is always before srcOp.
+    int64_t dstIndex = 1;
+    for (auto dstOp : loadStores) {
+      for (auto srcOp : llvm::drop_begin(loadStores, dstIndex)) {
+        // We ignore RAR pairs.
+        if (isa<AffineReadOpInterface>(dstOp) &&
+            isa<AffineReadOpInterface>(srcOp))
+          continue;
+        if (MemRefAccess(dstOp) == MemRefAccess(srcOp)) {
+          int64_t delay = getIntAttrValue(dstOp, "schedule_end") -
+                          getIntAttrValue(srcOp, "schedule_begin");
+
+          // The dependency distance is always 1, so the minimum II equals
+          // the delay.
+          int64_t minII = delay;
+          II = max(II, minII);
+        }
+      }
+      dstIndex++;
+    }
+  }
+  return II;
+}
+
+/// Calculate the minimum dependency II of the loop.
 int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) {
   int64_t II = 1;
 
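To make the new function-level calculation concrete, here is a self-contained sketch of the same pairwise scan with the MLIR machinery replaced by a plain struct (all names are illustrative). Because a pipelined function restarts every II cycles, the dependency distance of any same-address pair is one invocation, so each pair forces II to be at least the delay from the source's begin time to the destination's end time:

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch only: accesses are assumed to hit the same address and be listed in
// schedule order, so every (dst, src) pair with dst before src models an
// inter-invocation dependency with distance 1.
struct MemAccess {
  bool isRead;
  int64_t scheduleBegin, scheduleEnd;
};

int64_t getDepMinII(const std::vector<MemAccess> &accesses) {
  int64_t II = 1;
  for (size_t dst = 0; dst < accesses.size(); ++dst)
    for (size_t src = dst + 1; src < accesses.size(); ++src) {
      // Read-after-read pairs carry no dependency.
      if (accesses[dst].isRead && accesses[src].isRead)
        continue;
      int64_t delay = accesses[dst].scheduleEnd - accesses[src].scheduleBegin;
      II = std::max(II, delay);
    }
  return II;
}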
@@ -344,8 +376,7 @@ int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) {
     auto loadStores = pair.second;
 
     // Walk through each pair of source and destination, and each loop level
-    // that are pipelined. Note that for inter-dependency, dstOp is always
-    // before srcOp.
+    // that are pipelined. Note that here dstOp is always before srcOp.
     for (unsigned loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) {
       int64_t dstIndex = 1;
       for (auto dstOp : loadStores) {
@@ -545,7 +576,7 @@ int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap,
     auto end = getIntAttrValue(&op, "schedule_end");
 
     // Accumulate the resource utilization of each operation.
-    if (isa<AddFOp>(op))
+    if (isa<AddFOp, SubFOp>(op))
       for (unsigned i = begin; i < end; ++i)
         addFMap[i]++;
@@ -682,6 +713,21 @@ void HLSCppEstimator::estimateFunc() {
   auto latency = schedule.getValue().second;
   setAttrValue(func, "latency", latency);
 
   // TODO: support dataflow interval estimation.
+  // TODO: support CallOp inside of the function.
+  if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) {
+    if (attr.getValue()) {
+      // Collect all memory access operations for calculating II.
+      MemAccessesMap map;
+      getMemAccessesMap(func.front(), map);
+
+      // Calculate the initiation interval.
+      auto II = max(getResMinII(map), getDepMinII(func, map));
+      setAttrValue(func, "interval", II);
+    }
+  }
+
   // Scheduled levels of all operations are reversed in this method, because
   // we have done the ALAP scheduling in a reverse order. Note that after the
   // reverse, the annotated scheduling level of each operation is a relative
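For illustration, with hypothetical numbers: three accesses to a single-port memref per invocation give getResMinII = 3; a same-address store/load pair whose schedule_end and schedule_begin lie 4 cycles apart gives getDepMinII = 4; the function is then annotated with interval = max(3, 4) = 4.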

View File

@@ -1434,6 +1434,16 @@ void ModuleEmitter::emitFunctionPragmas(FuncOp func, ArrayRef<Value> portList) {
     }
   }
 
+  if (auto pipeline = func.getAttrOfType<BoolAttr>("pipeline")) {
+    if (pipeline.getValue()) {
+      indent();
+      os << "#pragma HLS pipeline\n";
+
+      // An empty line.
+      os << "\n";
+    }
+  }
+
   // Only the top function should emit interface pragmas.
   if (auto topFunction = func.getAttrOfType<BoolAttr>("top_function")) {
     if (topFunction.getValue()) {
@@ -1489,8 +1499,12 @@ void ModuleEmitter::emitFunction(FuncOp func) {
     if (top.getValue())
       os << "/// This is top function.\n";
 
-  if (auto latency = func.getAttrOfType<IntegerAttr>("latency"))
-    os << "/// Latency=" << latency.getInt() << "\n";
+  if (auto latency = func.getAttrOfType<IntegerAttr>("latency")) {
+    os << "/// Latency=" << latency.getInt();
+    if (auto interval = func.getAttrOfType<IntegerAttr>("interval"))
+      os << ", interval=" << interval.getInt();
+    os << "\n";
+  }
 
   if (auto dsp = func.getAttrOfType<IntegerAttr>("dsp"))
     os << "/// DSP=" << dsp.getInt() << "\n";

View File

@@ -25,22 +25,32 @@ struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> {
 } // namespace
 
 bool scalehls::applyArrayPartition(FuncOp func, OpBuilder &builder) {
-  // Only memory accesses in pipelined loops will be executed in parallel.
-  SmallVector<AffineForOp, 4> pipelinedLoops;
-  func.walk([&](AffineForOp loop) {
-    if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline"))
-      if (attr.getValue())
-        pipelinedLoops.push_back(loop);
-  });
+  // Check whether the input function is pipelined.
+  bool funcPipeline = false;
+  if (auto attr = func.getAttrOfType<BoolAttr>("pipeline"))
+    if (attr.getValue())
+      funcPipeline = true;
+
+  // Only memory accesses in pipelined loops or functions are executed in
+  // parallel and thus require partitioning.
+  SmallVector<Block *, 4> pipelinedBlocks;
+  if (funcPipeline)
+    pipelinedBlocks.push_back(&func.front());
+  else
+    func.walk([&](AffineForOp loop) {
+      if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline"))
+        if (attr.getValue())
+          pipelinedBlocks.push_back(&loop.getLoopBody().front());
+    });
 
   // Storing the partition information of each memref.
   using PartitionInfo = std::pair<PartitionKind, int64_t>;
   DenseMap<Value, SmallVector<PartitionInfo, 4>> partitionsMap;
 
   // Traverse all pipelined loops.
-  for (auto loop : pipelinedLoops) {
+  for (auto block : pipelinedBlocks) {
     MemAccessesMap accessesMap;
-    getMemAccessesMap(loop.getLoopBody().front(), accessesMap);
+    getMemAccessesMap(*block, accessesMap);
 
     for (auto pair : accessesMap) {
       auto memref = pair.first;
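With function pipelining, the partition scope becomes the whole (fully unrolled) function body rather than individual loop bodies, so every access in that block may fire in the same initiation interval. As a toy illustration of why this forces partitioning (this is not the pass's actual algorithm; all names are made up), the following finds the smallest cyclic factor under which no two parallel accesses to a single-port array hit the same bank:

#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

// Sketch only: indices of all accesses to one array that execute in parallel.
// A cyclic partition with the returned factor gives each access its own bank,
// when such a factor exists.
int64_t minCyclicFactor(const std::vector<int64_t> &parallelIndices) {
  for (int64_t factor = 1;; ++factor) {
    std::map<int64_t, int64_t> bankLoad;
    int64_t worst = 0;
    for (int64_t idx : parallelIndices)
      worst = std::max(worst, ++bankLoad[idx % factor]);
    // Stop at one access per bank, or give up once the factor covers all
    // accesses (duplicate indices can never be separated).
    if (worst <= 1 || factor >= static_cast<int64_t>(parallelIndices.size()))
      return factor;
  }
}

For example, parallel accesses to indices {0, 1, 2, 3} need a cyclic factor of 4 before all four can proceed in one cycle.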

View File

@@ -38,6 +38,7 @@ bool scalehls::applyFuncPipelining(FuncOp func, OpBuilder &builder) {
   func.walk([&](AffineForOp loop) { loopUnrollFull(loop); });
 
   func.setAttr("pipeline", builder.getBoolAttr(true));
   func.setAttr("dataflow", builder.getBoolAttr(false));
 
+  // For now, this method always succeeds.
   return true;

View File

@@ -57,7 +57,8 @@ void PartialAffineLoopTile::runOnOperation() {
       else
         permMap.push_back(i - realTileLevel);
     }
 
-    permuteLoops(nestedLoops, permMap);
+    if (isValidLoopInterchangePermutation(nestedLoops, permMap))
+      permuteLoops(nestedLoops, permMap);
   }
 }

View File

@@ -2,15 +2,15 @@
 // CHECK: module {
 #map = affine_map<(d0) -> (d0 + 1)>
 
-func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x8xf32>, %C: memref<16x16xf32>) {
+func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16xf32>) {
   affine.for %i = 0 to 16 {
     affine.for %j = 0 to #map(%i) {
       %0 = affine.load %C[%i, %j] : memref<16x16xf32>
       %1 = mulf %beta, %0 : f32
       affine.store %1, %C[%i, %j] : memref<16x16xf32>
       affine.for %k = 0 to 8 {
-        %2 = affine.load %A[%i, %k] : memref<16x8xf32>
-        %3 = affine.load %A[%j, %k] : memref<16x8xf32>
+        %2 = affine.load %A[%i, %k] : memref<16x16xf32>
+        %3 = affine.load %A[%j, %k] : memref<16x16xf32>
         %4 = affine.load %C[%i, %j] : memref<16x16xf32>
         %5 = mulf %alpha, %2 : f32
         %6 = mulf %5, %3 : f32
View File

@@ -0,0 +1,6 @@
+// RUN: scalehls-opt -func-pipelining %s | FileCheck %s
+
+// CHECK-LABEL: func @test_for
+func @test_for() {
+  return
+}