[ArrayPartition] resolve an issue where non-pipelined loops were not considered; [Transforms] add an applyFullyUnrollAndPartition() util; [Samples] reverse gemm, syrk, syr2k back...

Hanchen Ye 2021-03-16 23:57:18 -05:00
parent 4c9708d239
commit dc3ef5ee67
10 changed files with 77 additions and 63 deletions

View File

@ -3,14 +3,14 @@
max_init_parallel=16
# Positive number: the maximum DSE iteration number.
-max_iter_num=16
+max_iter_num=50
# Positive float number: the maximum distance of the neighbor search.
-max_distance=1.0
+max_distance=4.0
[specification]
frequency=100MHz
-dsp=1024
+dsp=512
bram=280
lut=13300
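For context, a minimal conceptual sketch of how such DSE parameters could bound a neighbor-search loop; the struct and function names here are illustrative assumptions, not the actual ScaleHLS DSE implementation:

struct DseConfig {
  int maxInitParallel = 16; // cap on the initial parallelism
  int maxIterNum = 50;      // maximum number of DSE iterations
  float maxDistance = 4.0f; // neighbor-search radius in the design space
};

// Evaluate up to maxIterNum candidates, only accepting neighbors that lie
// within maxDistance of the current best design point.
template <typename Point, typename Eval, typename Dist, typename Next>
Point exploreDesignSpace(Point start, const DseConfig &cfg, Eval evaluate,
                         Dist distance, Next nextNeighbor) {
  Point best = start;
  double bestCost = evaluate(best);
  for (int i = 0; i < cfg.maxIterNum; ++i) {
    Point cand = nextNeighbor(best);
    if (distance(best, cand) > cfg.maxDistance)
      continue; // outside the allowed neighborhood
    double cost = evaluate(cand);
    if (cost < bestCost) {
      best = cand;
      bestCost = cost;
    }
  }
  return best;
}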

View File

@ -26,16 +26,6 @@ bool applyAffineLoopOrderOpt(AffineLoopBand &band, bool reverse = false);
/// Try to rectangularize the input band.
bool applyRemoveVariableBound(AffineLoopBand &band);
-/// Apply optimization strategy to a loop band. The ancestor function is also
-/// passed in because the post-tiling optimizations have to take function as
-/// target, e.g. canonicalizer and array partition.
-bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
-                      unsigned targetII);
-/// Apply optimization strategy to a function.
-bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
-                      ArrayRef<unsigned> targetIIs);
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
/// the innermost loop with the original loop order. Return the location of the
/// innermost tile-space loop.
@ -49,6 +39,18 @@ bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
/// Fully unroll all loops inside a loop block.
bool applyFullyLoopUnrolling(Block &block);
+bool applyFullyUnrollAndPartition(Block &block, FuncOp func);
+
+/// Apply optimization strategy to a loop band. The ancestor function is also
+/// passed in because the post-tiling optimizations have to take function as
+/// target, e.g. canonicalizer and array partition.
+bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
+                      unsigned targetII);
+
+/// Apply optimization strategy to a function.
+bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
+                      ArrayRef<unsigned> targetIIs);
} // namespace scalehls
} // namespace mlir
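As a usage illustration for the declarations above, a hypothetical caller that tiles every band of a function and requests a pipeline II of 1. This is a sketch, not code from the commit, and it assumes TileList is a vector of per-loop tile sizes, as the signatures suggest:

#include "scalehls/Transforms/Utils.h"

using namespace mlir;
using namespace scalehls;

// Tile each loop of every band by 2 and target II = 1 for each band.
bool optimizeAllBands(FuncOp func) {
  AffineLoopBands bands;
  getLoopBands(func.front(), bands); // collect the perfectly nested bands

  SmallVector<TileList, 4> tileLists;
  SmallVector<unsigned, 4> targetIIs;
  for (auto &band : bands) {
    tileLists.push_back(TileList(band.size(), 2)); // tile size 2 per loop
    targetIIs.push_back(1);
  }
  return applyOptStrategy(func, tileLists, targetIIs);
}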

View File

@ -14,6 +14,8 @@ using namespace mlir;
using namespace scalehls;
using namespace hlscpp;
+/// TODO: support passing in a partition strategy.
static bool applyArrayPartition(FuncOp func) {
  // Check whether the input function is pipelined.
  bool funcPipeline = false;
@ -21,24 +23,26 @@ static bool applyArrayPartition(FuncOp func) {
    if (attr.getValue())
      funcPipeline = true;

-  // Only memory accesses in pipelined loops or function will be executed in
-  // parallel and required to partition.
-  SmallVector<Block *, 4> pipelinedBlocks;
+  // Collect target basic blocks to be considered.
+  SmallVector<Block *, 4> targetBlocks;
  if (funcPipeline)
-    pipelinedBlocks.push_back(&func.front());
-  else
-    func.walk([&](AffineForOp loop) {
-      if (auto attr = loop->getAttrOfType<BoolAttr>("pipeline"))
-        if (attr.getValue())
-          pipelinedBlocks.push_back(loop.getBody());
-    });
+    targetBlocks.push_back(&func.front());
+  else {
+    // Collect all target loop bands.
+    AffineLoopBands targetBands;
+    getLoopBands(func.front(), targetBands);
+
+    // Collect the innermost block of each loop band.
+    for (auto &band : targetBands)
+      targetBlocks.push_back(band.back().getBody());
+  }

  // Store the partition information of each memref.
  using PartitionInfo = std::pair<PartitionKind, int64_t>;
  DenseMap<Value, SmallVector<PartitionInfo, 4>> partitionsMap;

  // Traverse all target blocks.
-  for (auto block : pipelinedBlocks) {
+  for (auto block : targetBlocks) {
    MemAccessesMap accessesMap;
    getMemAccessesMap(*block, accessesMap);
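Downstream of this hunk, each memref's accesses are folded into a PartitionInfo per array dimension. As a rough illustration of the kind of rule involved, a self-contained sketch with stand-in types; the real ScaleHLS heuristic analyzes affine access patterns and is more involved:

#include <cstdint>
#include <utility>

// Stand-ins for the ScaleHLS types named above (illustrative only).
enum class PartitionKind { None, Block, Cyclic };
using PartitionInfo = std::pair<PartitionKind, int64_t>;

// Simplified rule: if N parallel accesses step through a dimension with unit
// stride, cyclic partitioning by N resolves the port conflicts; a larger
// constant stride suggests block partitioning instead.
PartitionInfo choosePartition(int64_t numParallelAccesses, int64_t stride) {
  if (numParallelAccesses <= 1)
    return {PartitionKind::None, 1};
  if (stride == 1)
    return {PartitionKind::Cyclic, numParallelAccesses};
  return {PartitionKind::Block, numParallelAccesses};
}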

View File

@ -4,32 +4,12 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Transforms/LoopUtils.h"
#include "scalehls/Transforms/Passes.h"
#include "scalehls/Transforms/Utils.h"
using namespace mlir;
using namespace scalehls;
-/// Fully unroll all loops inside a block.
-bool scalehls::applyFullyLoopUnrolling(Block &block) {
-  // Try 8 iterations before exiting.
-  for (auto i = 0; i < 8; ++i) {
-    bool hasFullyUnrolled = true;
-    block.walk([&](AffineForOp loop) {
-      if (failed(loopUnrollFull(loop)))
-        hasFullyUnrolled = false;
-    });
-    if (hasFullyUnrolled)
-      break;
-    if (i == 7)
-      return false;
-  }
-  return true;
-}
/// Apply loop pipelining to the input loop; all inner loops are automatically
/// fully unrolled.
bool scalehls::applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,

View File

@ -15,6 +15,8 @@
using namespace mlir;
using namespace scalehls;
+/// TODO: support passing in a permutation map.
/// Optimize the loop order. Loops associated with memory access dependencies
/// are moved as far toward the outside of the input loop band as possible. If
/// "reverse" is true, they are moved as far toward the inside as possible.

View File

@ -544,12 +544,12 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
    auto innermostLoop = getLoopBandFromOutermost(target, loopBand);

    // Calculate the overall introduced parallelism if the innermost loop of
-    // the current loop band is pipelined.
+    // the current loop band is fully unrolled.
    auto parallelism = getInnerParallelism(innermostLoop);

    // Collect all candidate loops into a vector; we ignore too-large
    // parallelism, as unrolling such loops typically introduces very high cost.
-    if (parallelism > 1 && parallelism < 128)
+    if (parallelism > 1 && parallelism < 256)
      candidateLoops.push_back(
          std::pair<int64_t, AffineForOp>(parallelism, innermostLoop));
  }
@ -560,9 +560,9 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
  // Sort the candidate loops.
  std::sort(candidateLoops.begin(), candidateLoops.end());

-  // Traverse all candidates to check whether applying loop pipelining has
-  // violation with the resource constraints. If so, add all inner loops into
-  // targetLoops. Otherwise, pipeline the candidate.
+  // Traverse all candidates to check whether applying full loop unrolling
+  // violates the resource constraints. If so, add all inner loops into
+  // targetLoops. Otherwise, fully unroll the candidate.
  for (auto pair : candidateLoops) {
    auto candidate = pair.second;
@ -570,11 +570,11 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
    setAttrValue(candidate, "opt_flag", true);
    auto tmpFunc = func.clone();

-    // Find the candidate loop in the temporary function and apply loop
-    // pipelining to it.
+    // Find the candidate loop in the temporary function and apply full loop
+    // unrolling to it.
    tmpFunc.walk([&](AffineForOp loop) {
      if (getIntAttrValue(loop, "opt_flag")) {
-        applyFullyLoopUnrolling(*loop.getBody());
+        applyFullyUnrollAndPartition(*loop.getBody(), tmpFunc);
        return;
      }
    });
@ -582,9 +582,9 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
    // Estimate the temporary function.
    estimator.estimateFunc(tmpFunc);

-    // Pipeline the candidate loop or delve into child loops.
+    // Fully unroll the candidate loop or delve into child loops.
    if (getIntAttrValue(tmpFunc, "dsp") <= maxDspNum)
-      applyFullyLoopUnrolling(*candidate.getBody());
+      applyFullyUnrollAndPartition(*candidate.getBody(), func);
    else {
      auto childForOps = candidate.getOps<AffineForOp>();
      targetLoops.append(childForOps.begin(), childForOps.end());
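The getInnerParallelism metric used above estimates how much parallelism full unrolling would expose below a loop. One plausible definition, sketched here as an assumption rather than the actual ScaleHLS implementation, is a trip-count product over the nested loops:

#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"

using namespace mlir;

// Illustrative only: sum over child loops of (trip count x parallelism of
// the child's own body); a loop without children contributes a factor of 1.
static int64_t getInnerParallelismSketch(AffineForOp loop) {
  int64_t count = 0;
  for (auto child : loop.getOps<AffineForOp>()) {
    int64_t tripCount = 1;
    if (auto constTripCount = getConstantTripCount(child))
      tripCount = (int64_t)constTripCount.getValue();
    count += tripCount * getInnerParallelismSketch(child);
  }
  return count == 0 ? 1 : count;
}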

View File

@ -6,6 +6,7 @@
#include "scalehls/Transforms/Utils.h"
#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "scalehls/Conversion/Passes.h"
#include "scalehls/Transforms/Passes.h"
@ -13,6 +14,25 @@
using namespace mlir;
using namespace scalehls;
+/// Fully unroll all loops inside a block.
+bool scalehls::applyFullyLoopUnrolling(Block &block) {
+  // Try 8 iterations before exiting.
+  for (auto i = 0; i < 8; ++i) {
+    bool hasFullyUnrolled = true;
+    block.walk([&](AffineForOp loop) {
+      if (failed(loopUnrollFull(loop)))
+        hasFullyUnrolled = false;
+    });
+    if (hasFullyUnrolled)
+      break;
+    if (i == 7)
+      return false;
+  }
+  return true;
+}
static void addPassPipeline(PassManager &pm) {
  // To factor out the redundant AffineApply/AffineIf operations.
  pm.addPass(createCanonicalizerPass());
@ -30,6 +50,18 @@ static void addPassPipeline(PassManager &pm) {
  pm.addPass(createArrayPartitionPass());
}

+bool scalehls::applyFullyUnrollAndPartition(Block &block, FuncOp func) {
+  applyFullyLoopUnrolling(block);
+
+  // Apply general optimizations and array partition.
+  PassManager optPM(func.getContext(), "func");
+  addPassPipeline(optPM);
+  if (failed(optPM.run(func)))
+    return false;
+  return true;
+}
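A typical call site, sketched under the assumption of a hypothetical "unroll_flag" marker attribute (mirroring the opt_flag pattern used by the DSE above):

// Unroll-and-partition the body of every loop the DSE has marked.
func.walk([&](AffineForOp loop) {
  if (auto attr = loop->getAttrOfType<BoolAttr>("unroll_flag")) // assumed attr
    if (attr.getValue())
      (void)applyFullyUnrollAndPartition(*loop.getBody(), func);
});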
/// Apply optimization strategy to a loop band. The ancestor function is also
/// passed in because the post-tiling optimizations have to take function as
/// target, e.g. canonicalizer and array partition.

View File

@ -4,9 +4,7 @@ func @gemm(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %B: memref<16x16xf32>
      %0 = affine.load %C[%i, %j] : memref<16x16xf32>
      %1 = mulf %beta, %0 : f32
      affine.store %1, %C[%i, %j] : memref<16x16xf32>
-    }
-    affine.for %k = 0 to 16 {
-      affine.for %j = 0 to 16 {
+      affine.for %k = 0 to 16 {
        %2 = affine.load %A[%i, %k] : memref<16x16xf32>
        %3 = affine.load %B[%k, %j] : memref<16x16xf32>
        %4 = affine.load %C[%i, %j] : memref<16x16xf32>

View File

@ -5,9 +5,7 @@ func @syr2k(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %B: memref<16x16xf32
      %0 = affine.load %C[%i, %j] : memref<16x16xf32>
      %1 = mulf %beta, %0 : f32
      affine.store %1, %C[%i, %j] : memref<16x16xf32>
-    }
-    affine.for %k = 0 to 16 {
-      affine.for %j = 0 to #map(%i) {
+      affine.for %k = 0 to 16 {
        %2 = affine.load %A[%i, %k] : memref<16x16xf32>
        %3 = affine.load %B[%j, %k] : memref<16x16xf32>
        %4 = affine.load %B[%i, %k] : memref<16x16xf32>

View File

@ -5,9 +5,7 @@ func @syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16xf32>
      %0 = affine.load %C[%i, %j] : memref<16x16xf32>
      %1 = mulf %beta, %0 : f32
      affine.store %1, %C[%i, %j] : memref<16x16xf32>
-    }
-    affine.for %k = 0 to 16 {
-      affine.for %j = 0 to #map(%i) {
+      affine.for %k = 0 to 16 {
        %2 = affine.load %A[%i, %k] : memref<16x16xf32>
        %3 = affine.load %A[%j, %k] : memref<16x16xf32>
        %4 = affine.load %C[%i, %j] : memref<16x16xf32>