[LegalizeDataflow] Support linalg ops; Make this pass more robust; [SplitFunction] Support linalg ops
This commit is contained in:
parent
cc324c4b45
commit
bdf72c8cc3
|
@ -7,69 +7,130 @@
|
|||
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
|
||||
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/IR/Dominance.h"
|
||||
#include "scalehls/Transforms/Passes.h"
|
||||
#include "scalehls/Transforms/Utils.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace scalehls;
|
||||
|
||||
// For storing the intermediate memory and successor loops indexed by the
|
||||
// predecessor loop.
|
||||
using Successors = SmallVector<std::pair<Value, Operation *>, 2>;
|
||||
using SuccessorsMap = DenseMap<Operation *, Successors>;
|
||||
// A dataflow use includes the intermediate value and the user operation, which
|
||||
// is similar to the concept of OpOperand in the SSA graph.
|
||||
using DataflowUse = std::pair<Value, Operation *>;
|
||||
using DataflowUseRange = llvm::iterator_range<const DataflowUse *>;
|
||||
|
||||
static void getSuccessorsMap(Block &block, SuccessorsMap &map) {
|
||||
DenseMap<Operation *, SmallPtrSet<Value, 2>> memsMap;
|
||||
DenseMap<Value, SmallPtrSet<Operation *, 2>> loopsMap;
|
||||
// A mapping from an operation to all its dataflow uses.
|
||||
using DataflowUsesMap =
|
||||
llvm::SmallDenseMap<Operation *, SmallVector<DataflowUse, 4>, 64>;
|
||||
|
||||
// TODO: for now we only consider store/load operations.
|
||||
for (auto loop : block.getOps<AffineForOp>())
|
||||
loop.walk([&](Operation *op) {
|
||||
if (auto affineStore = dyn_cast<AffineStoreOp>(op))
|
||||
memsMap[loop].insert(affineStore.getMemRef());
|
||||
namespace {
|
||||
struct DataflowGraph {
|
||||
DataflowGraph(FuncOp func);
|
||||
|
||||
else if (auto store = dyn_cast<memref::StoreOp>(op))
|
||||
memsMap[loop].insert(store.getMemRef());
|
||||
const DataflowUseRange getUses(Operation *node) const {
|
||||
const auto &uses = usesMap.lookup(node);
|
||||
return llvm::make_range(uses.begin(), uses.end());
|
||||
}
|
||||
|
||||
else if (auto affineLoad = dyn_cast<AffineLoadOp>(op))
|
||||
loopsMap[affineLoad.getMemRef()].insert(loop);
|
||||
llvm::SmallDenseSet<Operation *, 4> getBundledNodes(Operation *node) const {
|
||||
llvm::SmallDenseSet<Operation *, 4> bundledNodes;
|
||||
for (auto use : getUses(node))
|
||||
for (auto updater : updatersMap.lookup(use.first))
|
||||
bundledNodes.insert(updater);
|
||||
return bundledNodes;
|
||||
}
|
||||
|
||||
else if (auto load = dyn_cast<memref::LoadOp>(op))
|
||||
loopsMap[load.getMemRef()].insert(loop);
|
||||
bool hasNode(Operation *node) const { return nodes.count(node); }
|
||||
|
||||
private:
|
||||
// Hold all nodes in the dataflow graph.
|
||||
llvm::SmallDenseSet<Operation *, 64> nodes;
|
||||
|
||||
// Hold the uses mapping.
|
||||
DataflowUsesMap usesMap;
|
||||
|
||||
// Hold the mapping from an intermediate value to all its updaters. Because in
|
||||
// the context of coarse-grained dataflow, intermediate value such as memory
|
||||
// can be written by more than one operation.
|
||||
llvm::SmallDenseMap<Value, llvm::SmallDenseSet<Operation *, 2>> updatersMap;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
DataflowGraph::DataflowGraph(FuncOp func) {
|
||||
// Results map of each operation.
|
||||
DenseMap<Operation *, llvm::SmallDenseSet<Value, 2>> resultsMap;
|
||||
|
||||
for (auto &op : func.front()) {
|
||||
// Handle Linalg dialect operations.
|
||||
if (isa<linalg::LinalgDialect>(op.getDialect())) {
|
||||
if (auto copy = dyn_cast<linalg::CopyOp>(op)) {
|
||||
resultsMap[&op].insert(copy.getTarget());
|
||||
updatersMap[copy.getTarget()].insert(&op);
|
||||
|
||||
} else {
|
||||
auto generic = dyn_cast<linalg::GenericOp>(op);
|
||||
if (!generic || !generic.hasBufferSemantics()) {
|
||||
op.emitOpError("found ungeneralized or unbufferized linalg ops");
|
||||
return;
|
||||
}
|
||||
for (auto result : generic.getOutputOperands()) {
|
||||
resultsMap[&op].insert(result->get());
|
||||
updatersMap[result->get()].insert(&op);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle memory stores. Child regions are recursively traversed, such that
|
||||
// for and if operations are considered as a node of the dataflow.
|
||||
op.walk([&](Operation *child) {
|
||||
// TODO: Support transfer write?
|
||||
if (auto affineStore = dyn_cast<mlir::AffineWriteOpInterface>(child)) {
|
||||
resultsMap[&op].insert(affineStore.getMemRef());
|
||||
updatersMap[affineStore.getMemRef()].insert(&op);
|
||||
|
||||
} else if (auto store = dyn_cast<memref::StoreOp>(child)) {
|
||||
resultsMap[&op].insert(store.getMemRef());
|
||||
updatersMap[store.getMemRef()].insert(&op);
|
||||
}
|
||||
});
|
||||
|
||||
// Find successors of all operations. Since this is a dataflow analysis, this
|
||||
// traverse will not enter any control flow operations.
|
||||
for (auto &op : block.getOperations()) {
|
||||
// TODO: Some operations are dataflow source, which will not be scheduled.
|
||||
if (isa<memref::AllocOp, memref::AllocaOp, arith::ConstantOp,
|
||||
bufferization::ToTensorOp, bufferization::ToMemrefOp>(op))
|
||||
// Handle normal SSA results.
|
||||
for (auto result : op.getResults()) {
|
||||
resultsMap[&op].insert(result);
|
||||
if (result.getType().isa<MemRefType>())
|
||||
updatersMap[result].insert(&op);
|
||||
}
|
||||
}
|
||||
|
||||
// Get the dominance tree for later use.
|
||||
DominanceInfo DT(func);
|
||||
|
||||
// Find successors of all operations.
|
||||
for (auto &op : func.front()) {
|
||||
// TODO: Some operations are dataflow source/sink nodes, which will not be
|
||||
// scheduled. Any other operations should appear here?
|
||||
if (isa<memref::GetGlobalOp, memref::AllocOp, memref::AllocaOp,
|
||||
bufferization::ToMemrefOp, arith::ConstantOp, ReturnOp>(op))
|
||||
continue;
|
||||
nodes.insert(&op);
|
||||
|
||||
// Collect all memref results if the current operation is a loop.
|
||||
auto mems = memsMap.lookup(&op);
|
||||
SmallVector<Value, 2> results(mems.begin(), mems.end());
|
||||
|
||||
// Collect all returned shaped type results.
|
||||
for (auto result : op.getResults())
|
||||
if (result.getType().isa<ShapedType>())
|
||||
results.push_back(result);
|
||||
|
||||
// Traverse all produced results.
|
||||
for (auto result : results) {
|
||||
for (auto user : loopsMap.lookup(result)) {
|
||||
// If the successor loop not only loads from the memory, but also store
|
||||
// to the memory, it is not considered as a successor.
|
||||
if (user == &op || memsMap.lookup(user).count(result))
|
||||
continue;
|
||||
map[&op].push_back(std::pair<Value, Operation *>(result, user));
|
||||
}
|
||||
|
||||
for (auto result : resultsMap.lookup(&op)) {
|
||||
for (auto user : result.getUsers()) {
|
||||
// User must be an operation in the block.
|
||||
if (user != block.findAncestorOpInBlock(*user))
|
||||
// If the same block user doesn't exist, or is not properly dominated,
|
||||
// or is also an updater of the result, continue.
|
||||
auto sameBlockUser = func.front().findAncestorOpInBlock(*user);
|
||||
if (!sameBlockUser || isa<ReturnOp>(sameBlockUser) ||
|
||||
!DT.properlyDominates(&op, sameBlockUser) ||
|
||||
updatersMap.lookup(result).count(sameBlockUser))
|
||||
continue;
|
||||
map[&op].push_back(std::pair<Value, Operation *>(result, user));
|
||||
|
||||
// Only push back non-exist uses.
|
||||
// TODO: Create a DenseMapInfo struct to make use of SmallDenseSet.
|
||||
auto &uses = usesMap[&op];
|
||||
auto newUse = DataflowUse({result, sameBlockUser});
|
||||
if (llvm::find(uses, newUse) == uses.end())
|
||||
uses.push_back(newUse);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -78,44 +139,37 @@ static void getSuccessorsMap(Block &block, SuccessorsMap &map) {
|
|||
static bool applyLegalizeDataflow(FuncOp func, int64_t minGran,
|
||||
bool insertCopy) {
|
||||
auto builder = OpBuilder(func);
|
||||
|
||||
SuccessorsMap successorsMap;
|
||||
getSuccessorsMap(func.front(), successorsMap);
|
||||
DataflowGraph graph(func);
|
||||
|
||||
llvm::SmallDenseMap<int64_t, int64_t, 16> dataflowToMerge;
|
||||
|
||||
// Walk through all dataflow operations in a reversed order for establishing a
|
||||
// ALAP scheduling.
|
||||
// Walk through all dataflow operations in a reversed order for establishing
|
||||
// an ALAP scheduling.
|
||||
for (auto i = func.front().rbegin(); i != func.front().rend(); ++i) {
|
||||
auto op = &*i;
|
||||
// TODO: Here, we assume all dataflow operations should have successor.
|
||||
if (successorsMap.count(op)) {
|
||||
int64_t dataflowLevel = 0;
|
||||
if (!graph.hasNode(op))
|
||||
continue;
|
||||
|
||||
// Walk through all successor ops.
|
||||
for (auto pair : successorsMap[op]) {
|
||||
auto successor = pair.second;
|
||||
if (isa<ReturnOp>(successor))
|
||||
continue;
|
||||
|
||||
if (auto attr = successor->getAttrOfType<IntegerAttr>("dataflow_level"))
|
||||
dataflowLevel = std::max(dataflowLevel, attr.getInt());
|
||||
// Walk through all successor ops.
|
||||
int64_t dataflowLevel = 0;
|
||||
for (auto bundledNode : graph.getBundledNodes(op))
|
||||
for (const auto &use : graph.getUses(bundledNode))
|
||||
if (auto a = use.second->getAttrOfType<IntegerAttr>("dataflow_level"))
|
||||
dataflowLevel = std::max(dataflowLevel, a.getInt());
|
||||
else {
|
||||
op->emitError("has unexpected successor, legalization failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Set an attribute for indicating the scheduled dataflow level.
|
||||
op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(),
|
||||
dataflowLevel + 1));
|
||||
// Set an attribute for indicating the scheduled dataflow level.
|
||||
op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(),
|
||||
dataflowLevel + 1));
|
||||
|
||||
// Eliminate bypass paths if detected.
|
||||
for (auto pair : successorsMap[op]) {
|
||||
auto value = pair.first;
|
||||
auto successor = pair.second;
|
||||
if (isa<ReturnOp>(successor))
|
||||
continue;
|
||||
// Eliminate bypass paths if detected.
|
||||
for (auto bundledNode : graph.getBundledNodes(op))
|
||||
for (auto use : graph.getUses(bundledNode)) {
|
||||
auto value = use.first;
|
||||
auto successor = use.second;
|
||||
|
||||
auto successorDataflowLevel =
|
||||
successor->getAttrOfType<IntegerAttr>("dataflow_level").getInt();
|
||||
|
@ -134,14 +188,13 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran,
|
|||
builder.setInsertionPoint(successor);
|
||||
for (auto i = dataflowLevel; i > successorDataflowLevel; --i) {
|
||||
// Create CopyOp.
|
||||
Value newValue;
|
||||
Operation *copyOp;
|
||||
auto valueType = value.getType().dyn_cast<MemRefType>();
|
||||
assert(valueType && "only support memref type now, will introduce "
|
||||
"TOSA dialect for tackling tensor operators");
|
||||
newValue = builder.create<memref::AllocOp>(op->getLoc(), valueType);
|
||||
copyOp = builder.create<linalg::CopyOp>(op->getLoc(), values.back(),
|
||||
newValue);
|
||||
assert(valueType && "only support memref type, please pass Affine "
|
||||
"or bufferized Linalg IR as input");
|
||||
auto newValue =
|
||||
builder.create<memref::AllocOp>(op->getLoc(), valueType);
|
||||
auto copyOp = builder.create<linalg::CopyOp>(
|
||||
op->getLoc(), values.back(), newValue);
|
||||
|
||||
// Set CopyOp dataflow level.
|
||||
copyOp->setAttr("dataflow_level",
|
||||
|
@ -164,7 +217,6 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran,
|
|||
dataflowToMerge[successorDataflowLevel] = dataflowLevel;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect all operations in each dataflow level.
|
||||
|
|
|
@ -35,7 +35,6 @@ static bool applySplitFunction(FuncOp func, ArrayRef<Operation *> ops,
|
|||
for (auto op : ops)
|
||||
for (auto result : op->getResults()) {
|
||||
internalValues.insert(result);
|
||||
|
||||
if (isLiveOut(result)) {
|
||||
outputTypes.push_back(result.getType());
|
||||
outputValues.push_back(result);
|
||||
|
@ -52,9 +51,13 @@ static bool applySplitFunction(FuncOp func, ArrayRef<Operation *> ops,
|
|||
for (auto op : ops) {
|
||||
// Push back all operands and liveins as candidates.
|
||||
SmallVector<Value, 8> inputCandidates(op->getOperands());
|
||||
if (auto loop = dyn_cast<AffineForOp>(op)) {
|
||||
auto liveIns = liveness.getLiveIn(loop.getBody());
|
||||
inputCandidates.append(liveIns.begin(), liveIns.end());
|
||||
for (auto ®ion : op->getRegions()) {
|
||||
auto entryBlock = ®ion.front();
|
||||
auto args = entryBlock->getArguments();
|
||||
|
||||
for (auto liveIn : liveness.getLiveIn(entryBlock))
|
||||
if (llvm::find(args, liveIn) == args.end())
|
||||
inputCandidates.push_back(liveIn);
|
||||
}
|
||||
|
||||
for (auto input : inputCandidates) {
|
||||
|
|
|
@ -57,10 +57,12 @@ struct MemrefStoreRewritePattern : public OpRewritePattern<memref::StoreOp> {
|
|||
bool scalehls::applyLegalizeToHLSCpp(FuncOp func, bool isTopFunc) {
|
||||
auto builder = OpBuilder(func);
|
||||
|
||||
// We constain functions to only contain one block.
|
||||
if (func.getBlocks().size() != 1)
|
||||
// We constrain functions to only contain one block.
|
||||
if (!llvm::hasSingleElement(func))
|
||||
func.emitError("has zero or more than one basic blocks.");
|
||||
|
||||
// TODO: Make sure there's no memref store/load or scf operations?
|
||||
|
||||
// Set function pragma attributes.
|
||||
if (auto fd = getFuncDirective(func))
|
||||
setFuncDirective(func, fd.getPipeline(), fd.getTargetInterval(),
|
||||
|
|
|
@ -43,9 +43,8 @@ void scalehls::registerScaleHLSPassPipeline() {
|
|||
|
||||
// Adapt the model from torch-mlir or onnx-mlir front-end.
|
||||
if (opts.frontend == "torch") {
|
||||
pm.addPass(mlir::createLinalgGeneralizationPass());
|
||||
pm.addPass(mlir::createLinalgBufferizePass());
|
||||
pm.addPass(mlir::createFuncBufferizePass());
|
||||
pm.addPass(mlir::createLinalgGeneralizationPass());
|
||||
pm.addPass(mlir::createCanonicalizerPass());
|
||||
} else if (opts.frontend == "onnx") {
|
||||
pm.addPass(scalehls::createLegalizeOnnxPass());
|
||||
|
@ -57,9 +56,9 @@ void scalehls::registerScaleHLSPassPipeline() {
|
|||
|
||||
// Graph-level optimizations.
|
||||
if (dataflowGran) {
|
||||
pm.addPass(scalehls::createSimplifyGraphPass());
|
||||
pm.addPass(scalehls::createLegalizeDataflowPass(dataflowGran));
|
||||
pm.addPass(scalehls::createSplitFunctionPass());
|
||||
pm.addPass(scalehls::createSimplifyGraphPass());
|
||||
pm.addPass(mlir::createConvertLinalgToAffineLoopsPass());
|
||||
pm.addPass(mlir::createCanonicalizerPass());
|
||||
}
|
||||
|
|
|
@ -581,7 +581,7 @@ bool ScaleHLSEstimator::visitOp(CallOp op, int64_t begin) {
|
|||
// Block Scheduler and Estimator
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Get the pointer of the scrOp's parent loop, which should locat at the same
|
||||
// Get the pointer of the srcOp's parent loop, which should locate at the same
|
||||
// level with dstOp's any parent loop.
|
||||
static Operation *getSameLevelDstOp(Operation *srcOp, Operation *dstOp) {
|
||||
// If srcOp and dstOp are already at the same level, return the srcOp.
|
||||
|
|
|
@ -7,6 +7,7 @@ $ # Parse PyTorch model to Linalg dialect (with mlir_venv activated).
|
|||
$ python3 export_resnet18_mlir.py | torch-mlir-opt \
|
||||
-torchscript-module-to-torch-backend-pipeline="optimize=true" \
|
||||
-torch-backend-to-linalg-on-tensors-backend-pipeline="optimize=true" \
|
||||
-linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops create-deallocs=false" \
|
||||
-canonicalize > resnet18.mlir
|
||||
|
||||
$ # Optimize the model and emit C++ code (not working, will be fixed soon).
|
||||
|
|
Loading…
Reference in New Issue