[Transforms] Refactor the applyLoopTiling API to make it more robust

This commit is contained in:
Hanchen Ye 2022-02-28 20:05:20 -06:00
parent 10252043d4
commit 690a2d2eaa
9 changed files with 122 additions and 213 deletions

View File

@ -64,10 +64,12 @@ bool applyAffineLoopOrderOpt(AffineLoopBand &band,
bool applyRemoveVariableBound(AffineLoopBand &band);
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
/// the innermost loop with the original loop order. Return the location of the
/// innermost tile-space loop.
Optional<unsigned> applyLoopTiling(AffineLoopBand &band, TileList tileList,
bool simplify = true);
/// the innermost loop with the original loop order. If "tileOrderOpt" is true,
/// the order of all tile-space loops is optimized after tiling. If
/// "unrollPointLoops" is true, all intra-tile loops (also called point loops)
/// are fully unrolled after tiling.
bool applyLoopTiling(AffineLoopBand &band, TileList tileList,
bool tileOrderOpt = true, bool unrollPointLoops = true);
bool applyLegalizeToHLSCpp(FuncOp func, bool topFunc);
@ -76,22 +78,24 @@ bool applyLegalizeToHLSCpp(FuncOp func, bool topFunc);
bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
unsigned targetII);
/// Apply simplification optimizations.
bool applySimplificationOpts(FuncOp func);
/// Fully unroll all loops inside of a loop block.
bool applyFullyLoopUnrolling(Block &block);
bool applyFullyUnrollAndPartition(Block &block, FuncOp func);
bool applyMemoryAccessOpt(FuncOp func);
bool applyFullyLoopUnrolling(Block &block, unsigned maxIterNum = 10);
/// Apply the specified array partition factors and kinds.
bool applyArrayPartition(Value array, ArrayRef<unsigned> factors,
ArrayRef<hlscpp::PartitionKind> kinds,
bool updateFuncSignature = true);
/// Find the suitable array partition factors and kinds for all arrays in the
/// targeted function.
bool applyAutoArrayPartition(FuncOp func);
/// Apply optimization strategy to a loop band. The ancestor function is
/// also passed in because the post-tiling optimizations have to take
/// function as target, e.g. canonicalizer and array partition.
/// Apply optimization strategy to a loop band. The ancestor function is also
/// passed in because the post-tiling optimizations have to take function as
/// target, e.g. canonicalizer and array partition.
bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
unsigned targetII);

View File

@ -171,13 +171,12 @@ static bool loopVarBoundRemoval(PyAffineLoopBand band) {
/// Return true if the tiling succeeded, and false otherwise.
static int64_t loopTiling(PyAffineLoopBand band, py::object factorsObject,
bool simplify) {
static bool loopTiling(PyAffineLoopBand band, py::object factorsObject,
bool tileOrderOpt, bool unrollPointLoops) {
py::gil_scoped_release();
llvm::SmallVector<unsigned, 8> factors;
getVectorFromUnsignedNpArray(factorsObject.ptr(), factors);
auto loc = applyLoopTiling(band.get(), factors, simplify);
return loc.hasValue() ? loc.getValue() : -1;
return applyLoopTiling(band.get(), factors, tileOrderOpt, unrollPointLoops);
}
static bool loopPipelining(PyAffineLoopBand band, int64_t pipelineLoc,
@ -200,12 +199,12 @@ static bool legalizeToHLSCpp(MlirOperation op, bool topFunc) {
return applyLegalizeToHLSCpp(func, topFunc);
}
static bool memoryAccessOpt(MlirOperation op) {
static bool simplificationOpts(MlirOperation op) {
py::gil_scoped_release();
auto func = dyn_cast<FuncOp>(unwrap(op));
if (!func)
throw SetPyError(PyExc_ValueError, "targeted operation not a function");
return applyMemoryAccessOpt(func);
return applySimplificationOpts(func);
}
static bool autoArrayPartition(MlirOperation op) {
@ -273,7 +272,7 @@ PYBIND11_MODULE(_scalehls, m) {
// Function transform APIs.
m.def("legalize_to_hlscpp", &legalizeToHLSCpp);
m.def("memory_access_opt", &memoryAccessOpt);
m.def("memory_access_opt", &simplificationOpts);
m.def("auto_array_partition", &autoArrayPartition);
// Array transform APIs.

View File

@ -45,6 +45,7 @@ static void updateSubFuncs(FuncOp func, Builder builder) {
});
}
/// Apply the specified array partition factors and kinds.
bool scalehls::applyArrayPartition(Value array, ArrayRef<unsigned> factors,
ArrayRef<hlscpp::PartitionKind> kinds,
bool updateFuncSignature) {
@ -203,6 +204,8 @@ getDimAccessMaps(Operation *op, AffineValueMap valueMap, int64_t dim) {
return maps;
}
/// Find the suitable array partition factors and kinds for all arrays in the
/// targeted function.
bool scalehls::applyAutoArrayPartition(FuncOp func) {
// Check whether the input function is pipelined.
bool funcPipeline = false;

View File

@ -5,10 +5,8 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "scalehls/Transforms/Passes.h"
#include "scalehls/Transforms/Utils.h"
@ -16,109 +14,42 @@
using namespace mlir;
using namespace scalehls;
static IntegerSet simplify(IntegerSet set) { return simplifyIntegerSet(set); }
/// Performs basic affine map simplifications.
static AffineMap simplify(AffineMap map) {
MutableAffineMap mMap(map);
mMap.simplify();
return mMap.getAffineMap();
}
/// Utility to simplify an affine attribute and update its entry in the parent
/// operation if necessary.
template <typename AttrT>
static void
simplifyAndUpdateAttr(Operation *op, StringAttr name, AttrT attr,
DenseMap<Attribute, Attribute> &simplifiedAttrs) {
auto &simplified = simplifiedAttrs[attr];
if (simplified == attr)
return;
// This is a newly encountered attribute.
if (!simplified) {
// Try to simplify the value of the attribute.
auto value = attr.getValue();
auto simplifiedValue = simplify(value);
if (simplifiedValue == value) {
simplified = attr;
return;
}
simplified = AttrT::get(simplifiedValue);
}
// Simplification was successful, so update the attribute.
op->setAttr(name, simplified);
}
static void simplifyAffineStructures(Block &block) {
auto context = block.front().getContext();
DenseMap<Attribute, Attribute> simplifiedAttrs;
RewritePatternSet patterns(context);
AffineApplyOp::getCanonicalizationPatterns(patterns, context);
AffineForOp::getCanonicalizationPatterns(patterns, context);
AffineIfOp::getCanonicalizationPatterns(patterns, context);
FrozenRewritePatternSet frozenPatterns(std::move(patterns));
// The simplification of affine attributes will likely simplify the op. Try to
// fold/apply canonicalization patterns when we have affine dialect ops.
SmallVector<Operation *> opsToSimplify;
block.walk([&](Operation *op) {
for (auto attr : op->getAttrs()) {
if (auto mapAttr = attr.getValue().dyn_cast<AffineMapAttr>())
simplifyAndUpdateAttr(op, attr.getName(), mapAttr, simplifiedAttrs);
else if (auto setAttr = attr.getValue().dyn_cast<IntegerSetAttr>())
simplifyAndUpdateAttr(op, attr.getName(), setAttr, simplifiedAttrs);
}
if (isa<AffineForOp, AffineIfOp, AffineApplyOp>(op))
opsToSimplify.push_back(op);
});
applyOpPatternsAndFold(opsToSimplify, frozenPatterns, /*strict=*/true);
}
/// Apply loop tiling to the input loop band and sink all intra-tile loops to
/// the innermost loop with the original loop order. Return the location of the
/// innermost tile-space loop.
Optional<unsigned> scalehls::applyLoopTiling(AffineLoopBand &band,
TileList tileList, bool simplify) {
/// the innermost loop with the original loop order. If "tileOrderOpt" is true,
/// the order of all tile-space loops is optimized after tiling. If
/// "unrollPointLoops" is true, all intra-tile loops (also called point loops)
/// are fully unrolled after tiling.
bool scalehls::applyLoopTiling(AffineLoopBand &band, TileList tileList,
bool tileOrderOpt, bool unrollPointLoops) {
assert(!band.empty() && "no loops provided");
if (!isPerfectlyNested(band))
return Optional<unsigned>();
return false;
// Loop tiling.
// Record the original band size and attributes to make use of later.
auto originalBandSize = band.size();
SmallVector<LoopDirectiveAttr, 6> bandAttrs;
for (auto loop : band)
bandAttrs.push_back(getLoopDirective(loop));
// Apply loop tiling.
AffineLoopBand tiledBand;
if (failed(tilePerfectlyNested(band, tileList, &tiledBand)))
return Optional<unsigned>();
return false;
// Simplify the tiled loop band if required.
if (simplify) {
band.clear();
unsigned simplifiedBandSize = 0;
for (unsigned i = 0, e = tiledBand.size(); i < e; ++i) {
auto loop = tiledBand[i];
Optional<uint64_t> tripCount = getConstantTripCount(loop);
if (i < originalBandSize - 1 || simplifiedBandSize > 0 || !tripCount ||
tripCount.getValue() != 1)
(void)normalizeAffineFor(loop);
if (loop && !loop.getLoopBody().empty()) {
band.push_back(loop);
if (i < originalBandSize)
++simplifiedBandSize;
}
}
simplifyAffineStructures(*band.front().getBody());
return simplifiedBandSize - 1;
}
// Otherwise, directly return the tiled loop band.
// Get all tile-space loops and reannotate the attributes.
band = tiledBand;
return originalBandSize - 1;
band.resize(originalBandSize);
for (auto zip : llvm::zip(band, bandAttrs))
if (std::get<1>(zip))
setLoopDirective(std::get<0>(zip), std::get<1>(zip));
// Apply loop order optimization and point loops unrolling if required.
if (tileOrderOpt)
applyAffineLoopOrderOpt(band);
if (unrollPointLoops)
applyFullyLoopUnrolling(*band.back().getBody());
return true;
}
namespace {
@ -159,13 +90,9 @@ struct AffineLoopUnrollAndPipeline
sizes.push_back(1);
}
// Apply loop tiling and extract the tile loops if applicable.
if (auto tileLoc = applyLoopTiling(band, sizes))
band.resize(tileLoc.getValue() + 1);
// Apply loop order optimization and pipelining.
if (loopOrderOpt)
applyAffineLoopOrderOpt(band);
// Apply loop unrolling and pipelining.
applyLoopTiling(band, sizes, /*tileOrderOpt=*/loopOrderOpt.getValue(),
/*unrollPointLoops=*/true);
applyLoopPipelining(band, band.size() - 1, (unsigned)1);
}
}

View File

@ -676,7 +676,9 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
// unrolling to it.
tmpFunc.walk([&](AffineForOp loop) {
if (loop->getAttrOfType<BoolAttr>("opt_flag")) {
applyFullyUnrollAndPartition(*loop.getBody(), tmpFunc);
applyFullyLoopUnrolling(*loop.getBody());
applySimplificationOpts(tmpFunc);
applyAutoArrayPartition(tmpFunc);
return;
}
});
@ -685,9 +687,11 @@ bool ScaleHLSOptimizer::simplifyLoopNests(FuncOp func) {
estimator.estimateFunc(tmpFunc);
// Fully unroll the candidate loop or delve into child loops.
if (getResource(tmpFunc).getDsp() <= maxDspNum)
applyFullyUnrollAndPartition(*candidate.getBody(), func);
else {
if (getResource(tmpFunc).getDsp() <= maxDspNum) {
applyFullyLoopUnrolling(*candidate.getBody());
applySimplificationOpts(func);
applyAutoArrayPartition(func);
} else {
auto childForOps = candidate.getOps<AffineForOp>();
targetLoops.append(childForOps.begin(), childForOps.end());
}

View File

@ -83,7 +83,9 @@ void scalehls::registerScaleHLSPyTorchPipeline() {
scalehls::createAffineLoopUnrollAndPipelinePass(loopUnrollSize));
}
// Memory accessing simplifications.
// Apply simplifications.
pm.addPass(mlir::createAffineLoopNormalizePass());
pm.addPass(mlir::createSimplifyAffineStructuresPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(scalehls::createSimplifyAffineIfPass());
pm.addPass(scalehls::createAffineStoreForwardPass());

View File

@ -6,6 +6,7 @@
#include "scalehls/Transforms/Utils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#include "scalehls/Transforms/Passes.h"
@ -84,10 +85,36 @@ void scalehls::setFuncDirective(Operation *op, bool pipeline,
// Loop transform utils
//===----------------------------------------------------------------------===//
/// Append the simplification passes to the given pass manager. The passes are
/// added in a fixed order: affine clean-ups first, then memory access
/// simplifications, then CSE and the initial interval reduction.
static void addSimplificationPipeline(PassManager &pm) {
// Factor out redundant affine operations.
pm.addPass(createAffineLoopNormalizePass());
pm.addPass(createSimplifyAffineStructuresPass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createSimplifyAffineIfPass());
// Simplify memory accesses. Note that the store forwarding is non-trivial and
// has a worst case complexity of O(n^2).
pm.addPass(createAffineStoreForwardPass());
pm.addPass(createSimplifyMemrefAccessPass());
// Generic common subexpression elimination.
pm.addPass(createCSEPass());
// NOTE(review): presumably tries to lower the achievable initiation interval;
// confirm against the pass definition.
pm.addPass(createReduceInitialIntervalPass());
}
/// Run the simplification pipeline on the given function. Returns true when
/// the pass manager ran to completion, and false when it reported a failure.
bool scalehls::applySimplificationOpts(FuncOp func) {
  // The pass manager is anchored on functions, matching the op it is run on.
  PassManager simplifyPM(func.getContext(), "builtin.func");
  addSimplificationPipeline(simplifyPM);
  return !failed(simplifyPM.run(func));
}
/// Fully unroll all loops inside of a block.
bool scalehls::applyFullyLoopUnrolling(Block &block) {
// Try 8 iterations before exiting.
for (auto i = 0; i < 8; ++i) {
bool scalehls::applyFullyLoopUnrolling(Block &block, unsigned maxIterNum) {
for (unsigned i = 0; i < maxIterNum; ++i) {
bool hasFullyUnrolled = true;
block.walk([&](AffineForOp loop) {
if (failed(loopUnrollFull(loop)))
@ -103,46 +130,6 @@ bool scalehls::applyFullyLoopUnrolling(Block &block) {
return true;
}
/// Append the generic clean-up passes to the given pass manager. The passes
/// are added in a fixed order: canonicalization first, then memory access
/// simplifications, then CSE and the initial interval reduction.
static void addPassPipeline(PassManager &pm) {
// Factor out redundant AffineApply/AffineIf operations.
pm.addPass(createCanonicalizerPass());
pm.addPass(createSimplifyAffineIfPass());
// Simplify memory accesses. Note that the store forwarding is non-trivial and
// has a worst case complexity of O(n^2).
pm.addPass(createAffineStoreForwardPass());
pm.addPass(createSimplifyMemrefAccessPass());
// Generic common subexpression elimination.
pm.addPass(createCSEPass());
// NOTE(review): presumably tries to lower the achievable initiation interval;
// confirm against the pass definition.
pm.addPass(createReduceInitialIntervalPass());
}
/// Run the generic clean-up pipeline on the given function. Returns true when
/// the pass manager ran to completion, and false when it reported a failure.
bool scalehls::applyMemoryAccessOpt(FuncOp func) {
  // The pass manager is anchored on functions, matching the op it is run on.
  PassManager cleanupPM(func.getContext(), "builtin.func");
  addPassPipeline(cleanupPM);
  return !failed(cleanupPM.run(func));
}
/// Fully unroll the loops inside of the block, run the generic clean-up
/// pipeline on the function, and then apply automatic array partitioning.
/// Returns false, skipping the array partitioning, if the pipeline fails;
/// returns true otherwise.
bool scalehls::applyFullyUnrollAndPartition(Block &block, FuncOp func) {
  // The result of the unrolling is intentionally not checked here.
  applyFullyLoopUnrolling(block);

  // Run the generic optimizations on the enclosing function.
  PassManager cleanupPM(func.getContext(), "builtin.func");
  addPassPipeline(cleanupPM);
  const bool pipelineOk = !failed(cleanupPM.run(func));

  // Array partitioning only runs when the pipeline succeeded.
  if (pipelineOk)
    applyAutoArrayPartition(func);
  return pipelineOk;
}
/// Apply optimization strategy to a loop band. The ancestor function is also
/// passed in because the post-tiling optimizations have to take function as
/// target, e.g. canonicalizer and array partition.
@ -152,27 +139,22 @@ bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,
if (!func->isProperAncestor(band.front()))
return false;
// Apply loop tiling.
auto pipelineLoopLoc = applyLoopTiling(band, tileList);
if (!pipelineLoopLoc)
return false;
// Apply LegalizeToHLSCpp conversion.
applyLegalizeToHLSCpp(func, /*isTopFunc=*/true);
// Apply loop tiling.
if (!applyLoopTiling(band, tileList, /*tileOrderOpt=*/false,
/*unrollPointLoops=*/true))
return false;
// Apply loop pipelining.
if (!applyLoopPipelining(band, pipelineLoopLoc.getValue(), targetII))
if (!applyLoopPipelining(band, band.size() - 1, targetII))
return false;
// Apply generic optimizations.
PassManager optPM(func.getContext(), "builtin.func");
addPassPipeline(optPM);
if (failed(optPM.run(func)))
return false;
// Apply the best suitable array partition strategy to the function.
// Apply memory access optimizations and the best suitable array partition
// strategy to the function.
applySimplificationOpts(func);
applyAutoArrayPartition(func);
return true;
}
@ -181,32 +163,24 @@ bool scalehls::applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
ArrayRef<unsigned> targetIIs) {
AffineLoopBands bands;
getLoopBands(func.front(), bands);
// Apply loop tiling and pipelining to all loop bands.
SmallVector<unsigned, 4> pipelineLoopLocs;
for (unsigned i = 0, e = bands.size(); i < e; ++i) {
auto pipelineLoopLoc = applyLoopTiling(bands[i], tileLists[i]);
if (!pipelineLoopLoc)
return false;
pipelineLoopLocs.push_back(pipelineLoopLoc.getValue());
}
assert(bands.size() == tileLists.size() && bands.size() == targetIIs.size() &&
"unexpected size of tile lists or target IIs");
// Apply LegalizeToHLSCpp conversion.
applyLegalizeToHLSCpp(func, /*isTopFunc=*/true);
for (unsigned i = 0, e = bands.size(); i < e; ++i) {
if (!applyLoopPipelining(bands[i], pipelineLoopLocs[i], targetIIs[i]))
// Apply loop tiling to all loop bands.
for (unsigned i = 0, e = bands.size(); i < e; ++i)
if (!applyLoopTiling(bands[i], tileLists[i]))
return false;
}
// Apply generic optimizations.
PassManager optPM(func.getContext(), "builtin.func");
addPassPipeline(optPM);
if (failed(optPM.run(func)))
return false;
for (unsigned i = 0, e = bands.size(); i < e; ++i)
if (!applyLoopPipelining(bands[i], bands[i].size() - 1, targetIIs[i]))
return false;
// Apply the best suitable array partition strategy to the function.
// Apply memory access optimizations and the best suitable array partition
// strategy to the function.
applySimplificationOpts(func);
applyAutoArrayPartition(func);
return true;
}

View File

@ -1,22 +1,19 @@
// RUN: scalehls-opt -affine-loop-unroll-and-pipeline="unroll-size=2 loop-order-opt=false" %s | FileCheck %s
// CHECK: #map0 = affine_map<(d0, d1) -> (d0 + d1 * 2)>
// CHECK: #map1 = affine_map<(d0) -> (d0 + 1)>
// CHECK: #map = affine_map<(d0) -> (d0 + 1)>
// CHECK: #set0 = affine_set<(d0, d1) : (d0 - d1 >= 0)>
// CHECK: #set1 = affine_set<(d0) : (d0 == 0)>
#set0 = affine_set<(d0, d1) : (d0 - d1 >= 0)>
#set1 = affine_set<(d0) : (d0 == 0)>
module {
func @test_syrk(%arg0: f32, %arg1: f32, %arg2: memref<16x16xf32>, %arg3: memref<16x16xf32>) {
// CHECK: affine.for %arg4 = 0 to 8 {
// CHECK: affine.for %arg4 = 0 to 16 step 2 {
// CHECK: affine.for %arg5 = 0 to 16 {
// CHECK: affine.for %arg6 = 0 to 16 {
// CHECK-NOT: affine.for %arg7 = 0 to 2 {
affine.for %arg4 = 0 to 16 {
affine.for %arg5 = 0 to 16 {
affine.for %arg6 = 0 to 16 {
// CHECK: %0 = affine.apply #map0(%c0, %arg4)
// CHECK: affine.if #set0(%arg5, %arg6) {
affine.if #set0(%arg5, %arg6) {
%0 = affine.load %arg3[%arg5, %arg6] : memref<16x16xf32>
%1 = arith.mulf %arg1, %0 : f32
@ -31,9 +28,7 @@ module {
%7 = arith.addf %6, %4 : f32
affine.store %7, %arg3[%arg5, %arg6] : memref<16x16xf32>
}
// CHECK: %1 = affine.apply #map1(%c0)
// CHECK: %2 = affine.apply #map0(%1, %arg4)
// CHECK: affine.if #set0(%arg5, %arg6) {
// CHECK: %0 = affine.apply #map(%arg4)
}
}
}

View File

@ -71,10 +71,11 @@ def main():
# Note: We use the trip count to generate this example "factors".
factors = np.ones(band.depth, dtype=int)
factors[-1] = band.get_trip_count(band.depth - 1) / 4
loc = scalehls.loop_tiling(band, factors, True) # simplify = True
# tileOrderOpt = False, unrollPointLoops = True
scalehls.loop_tiling(band, factors, False, True)
# Apply loop pipelining. All loops inside of the pipelined loop are fully unrolled.
scalehls.loop_pipelining(band, loc, 3) # targetII = 3
scalehls.loop_pipelining(band, band.depth - 1, 3) # targetII = 3
# Traverse all arrays in the function.
arrays = scalehls.ArrayList(func)
@ -93,7 +94,7 @@ def main():
scalehls.legalize_to_hlscpp(
func, func.sym_name.value == opts.function)
# Optimize memory accesses through store forwarding, etc.
# Apply simplifications.
scalehls.memory_access_opt(func)
# Apply suitable array partition strategies through analyzing the array access pattern.