diff --git a/include/Analysis/QoREstimation.h b/include/Analysis/QoREstimation.h index 261f227..ea1e5fc 100644 --- a/include/Analysis/QoREstimation.h +++ b/include/Analysis/QoREstimation.h @@ -84,6 +84,7 @@ public: /// AffineForOp related methods. // unsigned getOpMinII(AffineForOp forOp); int64_t getResMinII(MemAccessesMap &map); + int64_t getDepMinII(FuncOp func, MemAccessesMap &map); int64_t getDepMinII(AffineForOp forOp, MemAccessesMap &map); bool visitOp(AffineForOp op, int64_t begin); @@ -98,6 +99,7 @@ public: return true; \ } HANDLE(AddFOp, "fadd"); + HANDLE(SubFOp, "fadd"); HANDLE(MulFOp, "fmul"); HANDLE(DivFOp, "fdiv"); HANDLE(CmpFOp, "fcmp"); diff --git a/lib/Analysis/QoREstimation.cpp b/lib/Analysis/QoREstimation.cpp index 0f55798..9f99223 100644 --- a/lib/Analysis/QoREstimation.cpp +++ b/lib/Analysis/QoREstimation.cpp @@ -322,7 +322,39 @@ int64_t HLSCppEstimator::getResMinII(MemAccessesMap &map) { return II; } -/// Calculate the minimum dependency II. +/// Calculate the minimum dependency II of function. +int64_t HLSCppEstimator::getDepMinII(FuncOp func, MemAccessesMap &map) { + int64_t II = 1; + + for (auto &pair : map) { + auto loadStores = pair.second; + + // Walk through each pair of source and destination. Note that here dstOp is + // always before srcOp. + int64_t dstIndex = 1; + for (auto dstOp : loadStores) { + for (auto srcOp : llvm::drop_begin(loadStores, dstIndex)) { + // We ignore RAR pairs. + if (isa<AffineReadOpInterface>(dstOp) && + isa<AffineReadOpInterface>(srcOp)) + continue; + + if (MemRefAccess(dstOp) == MemRefAccess(srcOp)) { + float delay = getIntAttrValue(dstOp, "schedule_end") - + getIntAttrValue(srcOp, "schedule_begin"); + + // Distance is always 1. Therefore, the minimum II is equal to delay. + int64_t minII = delay; + II = max(II, minII); + } + } + dstIndex++; + } + } + return II; +} + +/// Calculate the minimum dependency II of loop. 
int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) { int64_t II = 1; @@ -344,8 +376,7 @@ int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) { auto loadStores = pair.second; // Walk through each pair of source and destination, and each loop level - // that are pipelined. Note that for inter-dependency, dstOp is always - // before srcOp. + // that are pipelined. Note that here dstOp is always before srcOp. for (unsigned loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) { int64_t dstIndex = 1; for (auto dstOp : loadStores) { @@ -545,7 +576,7 @@ int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap, auto end = getIntAttrValue(&op, "schedule_end"); // Accumulate the resource utilization of each operation. - if (isa<AddFOp>(op)) + if (isa<AddFOp, SubFOp>(op)) for (unsigned i = begin; i < end; ++i) addFMap[i]++; @@ -682,6 +713,21 @@ void HLSCppEstimator::estimateFunc() { auto latency = schedule.getValue().second; setAttrValue(func, "latency", latency); + // TODO: support dataflow interval estimation. + + // TODO: support CallOp inside of the function. + if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) { + if (attr.getValue()) { + // Collect all memory access operations for calculating II. + MemAccessesMap map; + getMemAccessesMap(func.front(), map); + + // Calculate initial interval. + auto II = max(getResMinII(map), getDepMinII(func, map)); + setAttrValue(func, "interval", II); + } + } + // Scheduled levels of all operations are reversed in this method, because // we have done the ALAP scheduling in a reverse order. 
Note that after the // reverse, the annotated scheduling level of each operation is a relative diff --git a/lib/EmitHLSCpp/EmitHLSCpp.cpp b/lib/EmitHLSCpp/EmitHLSCpp.cpp index a2d39d4..255d2d8 100644 --- a/lib/EmitHLSCpp/EmitHLSCpp.cpp +++ b/lib/EmitHLSCpp/EmitHLSCpp.cpp @@ -1434,6 +1434,16 @@ void ModuleEmitter::emitFunctionPragmas(FuncOp func, ArrayRef<Value> portList) { } } + if (auto pipeline = func.getAttrOfType<BoolAttr>("pipeline")) { + if (pipeline.getValue()) { + indent(); + os << "#pragma HLS pipeline\n"; + + // An empty line. + os << "\n"; + } + } + // Only top function should emit interface pragmas. if (auto topFunction = func.getAttrOfType<BoolAttr>("top_function")) { if (topFunction.getValue()) { @@ -1489,8 +1499,12 @@ void ModuleEmitter::emitFunction(FuncOp func) { if (top.getValue()) os << "/// This is top function.\n"; - if (auto latency = func.getAttrOfType<IntegerAttr>("latency")) - os << "/// Latency=" << latency.getInt() << "\n"; + if (auto latency = func.getAttrOfType<IntegerAttr>("latency")) { + os << "/// Latency=" << latency.getInt(); + if (auto interval = func.getAttrOfType<IntegerAttr>("interval")) + os << ", interval=" << interval.getInt(); + os << "\n"; + } if (auto dsp = func.getAttrOfType<IntegerAttr>("dsp")) os << "/// DSP=" << dsp.getInt() << "\n"; diff --git a/lib/Transforms/ArrayPartition.cpp b/lib/Transforms/ArrayPartition.cpp index dfda41c..17b7408 100644 --- a/lib/Transforms/ArrayPartition.cpp +++ b/lib/Transforms/ArrayPartition.cpp @@ -25,22 +25,32 @@ struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> { } // namespace bool scalehls::applyArrayPartition(FuncOp func, OpBuilder &builder) { - // Only memory accesses in pipelined loops will be executed in parallel. - SmallVector<AffineForOp, 4> pipelinedLoops; - func.walk([&](AffineForOp loop) { - if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline")) - if (attr.getValue()) - pipelinedLoops.push_back(loop); - }); + // Check whether the input function is pipelined. 
+ bool funcPipeline = false; + if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) + if (attr.getValue()) + funcPipeline = true; + + // Only memory accesses in pipelined loops or function will be executed in + // parallel and required to partition. + SmallVector<Block *, 4> pipelinedBlocks; + if (funcPipeline) + pipelinedBlocks.push_back(&func.front()); + else + func.walk([&](AffineForOp loop) { + if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline")) + if (attr.getValue()) + pipelinedBlocks.push_back(&loop.getLoopBody().front()); + }); // Storing the partition information of each memref. using PartitionInfo = std::pair<PartitionKind, int64_t>; DenseMap<Value, SmallVector<PartitionInfo, 4>> partitionsMap; // Traverse all pipelined loops. - for (auto loop : pipelinedLoops) { + for (auto block : pipelinedBlocks) { MemAccessesMap accessesMap; - getMemAccessesMap(loop.getLoopBody().front(), accessesMap); + getMemAccessesMap(*block, accessesMap); for (auto pair : accessesMap) { auto memref = pair.first; diff --git a/lib/Transforms/FuncPipelining.cpp b/lib/Transforms/FuncPipelining.cpp index 0ef4e91..e80263e 100644 --- a/lib/Transforms/FuncPipelining.cpp +++ b/lib/Transforms/FuncPipelining.cpp @@ -38,6 +38,7 @@ bool scalehls::applyFuncPipelining(FuncOp func, OpBuilder &builder) { func.walk([&](AffineForOp loop) { loopUnrollFull(loop); }); func.setAttr("pipeline", builder.getBoolAttr(true)); + func.setAttr("dataflow", builder.getBoolAttr(false)); // For now, this method will always success. 
return true; diff --git a/lib/Transforms/PartialAffineLoopTile.cpp b/lib/Transforms/PartialAffineLoopTile.cpp index 40bec16..a20c494 100644 --- a/lib/Transforms/PartialAffineLoopTile.cpp +++ b/lib/Transforms/PartialAffineLoopTile.cpp @@ -57,7 +57,8 @@ void PartialAffineLoopTile::runOnOperation() { else permMap.push_back(i - realTileLevel); } - permuteLoops(nestedLoops, permMap); + if (isValidLoopInterchangePermutation(nestedLoops, permMap)) + permuteLoops(nestedLoops, permMap); } } diff --git a/samples/polybench/syrk.mlir b/samples/polybench/syrk.mlir index bc49fcc..0906f34 100644 --- a/samples/polybench/syrk.mlir +++ b/samples/polybench/syrk.mlir @@ -2,15 +2,15 @@ // CHECK: module { #map = affine_map<(d0) -> (d0 + 1)> -func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x8xf32>, %C: memref<16x16xf32>) { +func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16xf32>) { affine.for %i = 0 to 16 { affine.for %j = 0 to #map(%i) { %0 = affine.load %C[%i, %j] : memref<16x16xf32> %1 = mulf %beta, %0 : f32 affine.store %1, %C[%i, %j] : memref<16x16xf32> affine.for %k = 0 to 8 { - %2 = affine.load %A[%i, %k] : memref<16x8xf32> - %3 = affine.load %A[%j, %k] : memref<16x8xf32> + %2 = affine.load %A[%i, %k] : memref<16x16xf32> + %3 = affine.load %A[%j, %k] : memref<16x16xf32> %4 = affine.load %C[%i, %j] : memref<16x16xf32> %5 = mulf %alpha, %2 : f32 %6 = mulf %5, %3 : f32 diff --git a/test/Transforms/test_func_pipelining.mlir b/test/Transforms/test_func_pipelining.mlir new file mode 100644 index 0000000..42ee4e2 --- /dev/null +++ b/test/Transforms/test_func_pipelining.mlir @@ -0,0 +1,6 @@ +// RUN: scalehls-opt -func-pipelining %s | FileCheck %s + +// CHECK-LABEL: func @test_for +func @test_for() { + return +}