[ArrayPartition] resolve an issue where non-pipelined loops are not considered; [Transforms] add an applyFullyUnrollAndPartition() utility; [Samples] revert gemm, syrk, syr2k back...
This commit is contained in:
parent
4c9708d239
commit
dc3ef5ee67
|
@ -3,14 +3,14 @@
|
|||
max_init_parallel=16
|
||||
|
||||
# Positive number: the maximum DSE iteration number.
|
||||
max_iter_num=16
|
||||
max_iter_num=50
|
||||
|
||||
# Positive float number: the maximum distance of the neighbor search.
|
||||
max_distance=1.0
|
||||
max_distance=4.0
|
||||
|
||||
[specification]
|
||||
frequency=100MHz
|
||||
dsp=1024
|
||||
dsp=512
|
||||
bram=280
|
||||
lut=13300
|
||||
|
||||
|
|
|
@ -26,16 +26,6 @@ bool applyAffineLoopOrderOpt(AffineLoopBand &band, bool reverse = false);
|
|||
/// Try to rectangularize the input band.
|
||||
bool applyRemoveVariableBound(AffineLoopBand &band);
|
||||
|
||||
/// Apply optimization strategy to a loop band. The ancestor function is also
|
||||
/// passed in because the post-tiling optimizations have to take function as
|
||||
/// target, e.g. canonicalizer and array partition.
|
||||
bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
|
||||
unsigned targetII);
|
||||
|
||||
/// Apply optimization strategy to a function.
|
||||
bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
|
||||
ArrayRef<unsigned> targetIIs);
|
||||
|
||||
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
|
||||
/// the innermost loop with the original loop order. Return the location of the
|
||||
/// innermost tile-space loop.
|
||||
|
@ -49,6 +39,18 @@ bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
|
|||
/// Fully unroll all loops inside of a loop block.
|
||||
bool applyFullyLoopUnrolling(Block &block);
|
||||
|
||||
bool applyFullyUnrollAndPartition(Block &block, FuncOp func);
|
||||
|
||||
/// Apply optimization strategy to a loop band. The ancestor function is also
|
||||
/// passed in because the post-tiling optimizations have to take function as
|
||||
/// target, e.g. canonicalizer and array partition.
|
||||
bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
|
||||
unsigned targetII);
|
||||
|
||||
/// Apply optimization strategy to a function.
|
||||
bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
|
||||
ArrayRef<unsigned> targetIIs);
|
||||
|
||||
} // namespace scalehls
|
||||
} // namespace mlir
|
||||
|
||||
|
|
|
@ -14,6 +14,8 @@ using namespace mlir;
|
|||
using namespace scalehls;
|
||||
using namespace hlscpp;
|
||||
|
||||
/// TODO: support passing in a partition strategy.
|
||||
|
||||
static bool applyArrayPartition(FuncOp func) {
|
||||
// Check whether the input function is pipelined.
|
||||
bool funcPipeline = false;
|
||||
|
@ -21,24 +23,26 @@ static bool applyArrayPartition(FuncOp func) {
|
|||
if (attr.getValue())
|
||||
funcPipeline = true;
|
||||
|
||||
// Only memory accesses in pipelined loops or function will be executed in
|
||||
// parallel and required to partition.
|
||||
SmallVector<Block *, 4> pipelinedBlocks;
|
||||
// Collect target basic blocks to be considered.
|
||||
SmallVector<Block *, 4> targetBlocks;
|
||||
if (funcPipeline)
|
||||
pipelinedBlocks.push_back(&func.front());
|
||||
else
|
||||
func.walk([&](AffineForOp loop) {
|
||||
if (auto attr = loop->getAttrOfType<BoolAttr>("pipeline"))
|
||||
if (attr.getValue())
|
||||
pipelinedBlocks.push_back(loop.getBody());
|
||||
});
|
||||
targetBlocks.push_back(&func.front());
|
||||
else {
|
||||
// Collect all target loop bands.
|
||||
AffineLoopBands targetBands;
|
||||
getLoopBands(func.front(), targetBands);
|
||||
|
||||
// Take the body of the last loop of each band as a target block.
|
||||
for (auto &band : targetBands)
|
||||
targetBlocks.push_back(band.back().getBody());
|
||||
}
|
||||
|
||||
// Storing the partition information of each memref.
|
||||
using PartitionInfo = std::pair<PartitionKind, int64_t>;
|
||||
DenseMap<Value, SmallVector<PartitionInfo, 4>> partitionsMap;
|
||||
|
||||
// Traverse all target blocks.
|
||||
for (auto block : pipelinedBlocks) {
|
||||
for (auto block : targetBlocks) {
|
||||
MemAccessesMap accessesMap;
|
||||
getMemAccessesMap(*block, accessesMap);
|
||||
|
||||
|
|
|
@ -4,32 +4,12 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
/// Fully unroll all loops insides of a block.
|
||||
bool scalehls::applyFullyLoopUnrolling(Block &block) {
|
||||
// Try 8 iterations before exiting.
|
||||
for (auto i = 0; i < 8; ++i) {
|
||||
bool hasFullyUnrolled = true;
|
||||
block.walk([&](AffineForOp loop) {
|
||||
if (failed(loopUnrollFull(loop)))
|
||||
hasFullyUnrolled = false;
|
||||
});
|
||||
|
||||
if (hasFullyUnrolled)
|
||||
break;
|
||||
|
||||
if (i == 7)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Apply loop pipelining to the input loop, all inner loops are automatically
|
||||
/// fully unrolled.
|
||||
bool scalehls::applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
/// TODO: support passing in a permutation map.
|
||||
|
||||
/// Optimize loop order. Loops associated with memory access dependencies are
|
||||
/// moved to an as outer as possible location of the input loop band. If
|
||||
/// "reverse" is true, as inner as possible.
|
||||
|
|
|
@ -544,12 +544,12 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
|
|||
auto innermostLoop = getLoopBandFromOutermost(target, loopBand);
|
||||
|
||||
// Calculate the overall introduced parallelism if the innermost loop of
|
||||
// the current loop band is pipelined.
|
||||
// the current loop band is fully unrolled.
|
||||
auto parallelism = getInnerParallelism(innermostLoop);
|
||||
|
||||
// Collect all candidate loops into a vector, we'll ignore too large
|
||||
// parallelism as unrolling them typically introduces very high cost.
|
||||
if (parallelism > 1 && parallelism < 128)
|
||||
if (parallelism > 1 && parallelism < 256)
|
||||
candidateLoops.push_back(
|
||||
std::pair<int64_t, AffineForOp>(parallelism, innermostLoop));
|
||||
}
|
||||
|
@ -560,9 +560,9 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
|
|||
// Sort the candidate loops.
|
||||
std::sort(candidateLoops.begin(), candidateLoops.end());
|
||||
|
||||
// Traverse all candidates to check whether applying loop pipelining has
|
||||
// violation with the resource constraints. If so, add all inner loops into
|
||||
// targetLoops. Otherwise, pipeline the candidate.
|
||||
// Traverse all candidates to check whether applying fully loop unrolling
|
||||
// violates the resource constraints. If so, add all inner loops
|
||||
// into targetLoops. Otherwise, fully unroll the candidate.
|
||||
for (auto pair : candidateLoops) {
|
||||
auto candidate = pair.second;
|
||||
|
||||
|
@ -570,11 +570,11 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
|
|||
setAttrValue(candidate, "opt_flag", true);
|
||||
auto tmpFunc = func.clone();
|
||||
|
||||
// Find the candidate loop in the temporary function and apply loop
|
||||
// pipelining to it.
|
||||
// Find the candidate loop in the temporary function and apply fully loop
|
||||
// unrolling to it.
|
||||
tmpFunc.walk([&](AffineForOp loop) {
|
||||
if (getIntAttrValue(loop, "opt_flag")) {
|
||||
applyFullyLoopUnrolling(*loop.getBody());
|
||||
applyFullyUnrollAndPartition(*loop.getBody(), tmpFunc);
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
@ -582,9 +582,9 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
|
|||
// Estimate the temporary function.
|
||||
estimator.estimateFunc(tmpFunc);
|
||||
|
||||
// Pipeline the candidate loop or delve into child loops.
|
||||
// Fully unroll the candidate loop or delve into child loops.
|
||||
if (getIntAttrValue(tmpFunc, "dsp") <= maxDspNum)
|
||||
applyFullyLoopUnrolling(*candidate.getBody());
|
||||
applyFullyUnrollAndPartition(*candidate.getBody(), func);
|
||||
else {
|
||||
auto childForOps = candidate.getOps<AffineForOp>();
|
||||
targetLoops.append(childForOps.begin(), childForOps.end());
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
#include "mlir/Pass/PassManager.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
#include "scalehls/Conversion/Passes.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
|
@ -13,6 +14,25 @@
|
|||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
/// Fully unroll all loops insides of a block.
|
||||
bool scalehls::applyFullyLoopUnrolling(Block &block) {
|
||||
// Try 8 iterations before exiting.
|
||||
for (auto i = 0; i < 8; ++i) {
|
||||
bool hasFullyUnrolled = true;
|
||||
block.walk([&](AffineForOp loop) {
|
||||
if (failed(loopUnrollFull(loop)))
|
||||
hasFullyUnrolled = false;
|
||||
});
|
||||
|
||||
if (hasFullyUnrolled)
|
||||
break;
|
||||
|
||||
if (i == 7)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void addPassPipeline(PassManager &pm) {
|
||||
// To factor out the redundant AffineApply/AffineIf operations.
|
||||
pm.addPass(createCanonicalizerPass());
|
||||
|
@ -30,6 +50,18 @@ static void addPassPipeline(PassManager &pm) {
|
|||
pm.addPass(createArrayPartitionPass());
|
||||
}
|
||||
|
||||
/// Fully unroll all loops inside of `block`, then run the pass pipeline built
/// by addPassPipeline() (general optimizations and array partition) on `func`.
/// Returns true only if both the unrolling and the pass pipeline succeeded.
bool scalehls::applyFullyUnrollAndPartition(Block &block, FuncOp func) {
  // Remember whether unrolling succeeded instead of discarding the result.
  // The pipeline below is still run in either case, so the transformation
  // applied to `func` is the same as before; only the reported status is
  // more accurate.
  const bool unrolled = applyFullyLoopUnrolling(block);

  // Apply general optimizations and array partition.
  PassManager optPM(func.getContext(), "func");
  addPassPipeline(optPM);
  if (failed(optPM.run(func)))
    return false;

  return unrolled;
}
|
||||
|
||||
/// Apply optimization strategy to a loop band. The ancestor function is also
|
||||
/// passed in because the post-tiling optimizations have to take function as
|
||||
/// target, e.g. canonicalizer and array partition.
|
||||
|
|
|
@ -4,9 +4,7 @@ func @gemm(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %B: memref<16x16xf32>
|
|||
%0 = affine.load %C[%i, %j] : memref<16x16xf32>
|
||||
%1 = mulf %beta, %0 : f32
|
||||
affine.store %1, %C[%i, %j] : memref<16x16xf32>
|
||||
}
|
||||
affine.for %k = 0 to 16 {
|
||||
affine.for %j = 0 to 16 {
|
||||
affine.for %k = 0 to 16 {
|
||||
%2 = affine.load %A[%i, %k] : memref<16x16xf32>
|
||||
%3 = affine.load %B[%k, %j] : memref<16x16xf32>
|
||||
%4 = affine.load %C[%i, %j] : memref<16x16xf32>
|
||||
|
|
|
@ -5,9 +5,7 @@ func @syr2k(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %B: memref<16x16xf32
|
|||
%0 = affine.load %C[%i, %j] : memref<16x16xf32>
|
||||
%1 = mulf %beta, %0 : f32
|
||||
affine.store %1, %C[%i, %j] : memref<16x16xf32>
|
||||
}
|
||||
affine.for %k = 0 to 16 {
|
||||
affine.for %j = 0 to #map(%i) {
|
||||
affine.for %k = 0 to 16 {
|
||||
%2 = affine.load %A[%i, %k] : memref<16x16xf32>
|
||||
%3 = affine.load %B[%j, %k] : memref<16x16xf32>
|
||||
%4 = affine.load %B[%i, %k] : memref<16x16xf32>
|
||||
|
|
|
@ -5,9 +5,7 @@ func @syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16xf32>
|
|||
%0 = affine.load %C[%i, %j] : memref<16x16xf32>
|
||||
%1 = mulf %beta, %0 : f32
|
||||
affine.store %1, %C[%i, %j] : memref<16x16xf32>
|
||||
}
|
||||
affine.for %k = 0 to 16 {
|
||||
affine.for %j = 0 to #map(%i) {
|
||||
affine.for %k = 0 to 16 {
|
||||
%2 = affine.load %A[%i, %k] : memref<16x16xf32>
|
||||
%3 = affine.load %A[%j, %k] : memref<16x16xf32>
|
||||
%4 = affine.load %C[%i, %j] : memref<16x16xf32>
|
||||
|
|
Loading…
Reference in New Issue