diff --git a/lib/Transforms/Graph/LegalizeDataflow.cpp b/lib/Transforms/Graph/LegalizeDataflow.cpp index 5586c45..7eba812 100644 --- a/lib/Transforms/Graph/LegalizeDataflow.cpp +++ b/lib/Transforms/Graph/LegalizeDataflow.cpp @@ -7,69 +7,130 @@ #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Dominance.h" #include "scalehls/Transforms/Passes.h" #include "scalehls/Transforms/Utils.h" using namespace mlir; using namespace scalehls; -// For storing the intermediate memory and successor loops indexed by the -// predecessor loop. -using Successors = SmallVector, 2>; -using SuccessorsMap = DenseMap; +// A dataflow use includes the intermediate value and the user operation, which +// is similar to the concept of OpOperand in the SSA graph. +using DataflowUse = std::pair; +using DataflowUseRange = llvm::iterator_range; -static void getSuccessorsMap(Block &block, SuccessorsMap &map) { - DenseMap> memsMap; - DenseMap> loopsMap; +// A mapping from an operation to all its dataflow uses. +using DataflowUsesMap = + llvm::SmallDenseMap, 64>; - // TODO: for now we only consider store/load operations. 
- for (auto loop : block.getOps()) - loop.walk([&](Operation *op) { - if (auto affineStore = dyn_cast(op)) - memsMap[loop].insert(affineStore.getMemRef()); +namespace { +struct DataflowGraph { + DataflowGraph(FuncOp func); - else if (auto store = dyn_cast(op)) - memsMap[loop].insert(store.getMemRef()); + const DataflowUseRange getUses(Operation *node) const { + const auto &uses = usesMap.lookup(node); + return llvm::make_range(uses.begin(), uses.end()); + } - else if (auto affineLoad = dyn_cast(op)) - loopsMap[affineLoad.getMemRef()].insert(loop); + llvm::SmallDenseSet getBundledNodes(Operation *node) const { + llvm::SmallDenseSet bundledNodes; + for (auto use : getUses(node)) + for (auto updater : updatersMap.lookup(use.first)) + bundledNodes.insert(updater); + return bundledNodes; + } - else if (auto load = dyn_cast(op)) - loopsMap[load.getMemRef()].insert(loop); + bool hasNode(Operation *node) const { return nodes.count(node); } + +private: + // Hold all nodes in the dataflow graph. + llvm::SmallDenseSet nodes; + + // Hold the uses mapping. + DataflowUsesMap usesMap; + + // Hold the mapping from an intermediate value to all its updaters. Because in + // the context of coarse-grained dataflow, an intermediate value such as memory + // can be written by more than one operation. + llvm::SmallDenseMap> updatersMap; +}; +} // namespace + +DataflowGraph::DataflowGraph(FuncOp func) { + // Results map of each operation. + DenseMap> resultsMap; + + for (auto &op : func.front()) { + // Handle Linalg dialect operations. 
+ if (isa(op.getDialect())) { + if (auto copy = dyn_cast(op)) { + resultsMap[&op].insert(copy.getTarget()); + updatersMap[copy.getTarget()].insert(&op); + + } else { + auto generic = dyn_cast(op); + if (!generic || !generic.hasBufferSemantics()) { + op.emitOpError("found ungeneralized or unbufferized linalg ops"); + return; + } + for (auto result : generic.getOutputOperands()) { + resultsMap[&op].insert(result->get()); + updatersMap[result->get()].insert(&op); + } + } + continue; + } + + // Handle memory stores. Child regions are recursively traversed, such that + // for and if operations are considered as nodes of the dataflow. + op.walk([&](Operation *child) { + // TODO: Support transfer write? + if (auto affineStore = dyn_cast(child)) { + resultsMap[&op].insert(affineStore.getMemRef()); + updatersMap[affineStore.getMemRef()].insert(&op); + + } else if (auto store = dyn_cast(child)) { + resultsMap[&op].insert(store.getMemRef()); + updatersMap[store.getMemRef()].insert(&op); + } }); - // Find successors of all operations. Since this is a dataflow analysis, this - // traverse will not enter any control flow operations. - for (auto &op : block.getOperations()) { - // TODO: Some operations are dataflow source, which will not be scheduled. - if (isa(op)) + // Handle normal SSA results. + for (auto result : op.getResults()) { + resultsMap[&op].insert(result); + if (result.getType().isa()) + updatersMap[result].insert(&op); + } + } + + // Get the dominance tree for later use. + DominanceInfo DT(func); + + // Find successors of all operations. + for (auto &op : func.front()) { + // TODO: Some operations are dataflow source/sink nodes, which will not be + // scheduled. Any other operations should appear here? + if (isa(op)) continue; + nodes.insert(&op); - // Collect all memref results if the current operation is a loop. - auto mems = memsMap.lookup(&op); - SmallVector results(mems.begin(), mems.end()); - - // Collect all returned shaped type results. 
- for (auto result : op.getResults()) - if (result.getType().isa()) - results.push_back(result); - - // Traverse all produced results. - for (auto result : results) { - for (auto user : loopsMap.lookup(result)) { - // If the successor loop not only loads from the memory, but also store - // to the memory, it is not considered as a successor. - if (user == &op || memsMap.lookup(user).count(result)) - continue; - map[&op].push_back(std::pair(result, user)); - } - + for (auto result : resultsMap.lookup(&op)) { for (auto user : result.getUsers()) { - // User must be an operation in the block. - if (user != block.findAncestorOpInBlock(*user)) + // If the same block user doesn't exist, or is not properly dominated, + // or is also an updater of the result, continue. + auto sameBlockUser = func.front().findAncestorOpInBlock(*user); + if (!sameBlockUser || isa(sameBlockUser) || + !DT.properlyDominates(&op, sameBlockUser) || + updatersMap.lookup(result).count(sameBlockUser)) continue; - map[&op].push_back(std::pair(result, user)); + + // Only push back non-exist uses. + // TODO: Create a DenseMapInfo struct to make use SmallDenseSet. + auto &uses = usesMap[&op]; + auto newUse = DataflowUse({result, sameBlockUser}); + if (llvm::find(uses, newUse) == uses.end()) + uses.push_back(newUse); } } } @@ -78,44 +139,37 @@ static void getSuccessorsMap(Block &block, SuccessorsMap &map) { static bool applyLegalizeDataflow(FuncOp func, int64_t minGran, bool insertCopy) { auto builder = OpBuilder(func); - - SuccessorsMap successorsMap; - getSuccessorsMap(func.front(), successorsMap); + DataflowGraph graph(func); llvm::SmallDenseMap dataflowToMerge; - // Walk through all dataflow operations in a reversed order for establishing a - // ALAP scheduling. + // Walk through all dataflow operations in a reversed order for establishing + // a ALAP scheduling. 
for (auto i = func.front().rbegin(); i != func.front().rend(); ++i) { auto op = &*i; - // TODO: Here, we assume all dataflow operations should have successor. - if (successorsMap.count(op)) { - int64_t dataflowLevel = 0; + if (!graph.hasNode(op)) + continue; - // Walk through all successor ops. - for (auto pair : successorsMap[op]) { - auto successor = pair.second; - if (isa(successor)) - continue; - - if (auto attr = successor->getAttrOfType("dataflow_level")) - dataflowLevel = std::max(dataflowLevel, attr.getInt()); + // Walk through all successor ops. + int64_t dataflowLevel = 0; + for (auto bundledNode : graph.getBundledNodes(op)) + for (const auto &use : graph.getUses(bundledNode)) + if (auto a = use.second->getAttrOfType("dataflow_level")) + dataflowLevel = std::max(dataflowLevel, a.getInt()); else { op->emitError("has unexpected successor, legalization failed"); return false; } - } - // Set an attribute for indicating the scheduled dataflow level. - op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(), - dataflowLevel + 1)); + // Set an attribute for indicating the scheduled dataflow level. + op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(), + dataflowLevel + 1)); - // Eliminate bypass paths if detected. - for (auto pair : successorsMap[op]) { - auto value = pair.first; - auto successor = pair.second; - if (isa(successor)) - continue; + // Eliminate bypass paths if detected. + for (auto bundledNode : graph.getBundledNodes(op)) + for (auto use : graph.getUses(bundledNode)) { + auto value = use.first; + auto successor = use.second; auto successorDataflowLevel = successor->getAttrOfType("dataflow_level").getInt(); @@ -134,14 +188,13 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran, builder.setInsertionPoint(successor); for (auto i = dataflowLevel; i > successorDataflowLevel; --i) { // Create CopyOp. 
- Value newValue; - Operation *copyOp; auto valueType = value.getType().dyn_cast(); - assert(valueType && "only support memref type now, will introduce " - "TOSA dialect for tackling tensor operators"); - newValue = builder.create(op->getLoc(), valueType); - copyOp = builder.create(op->getLoc(), values.back(), - newValue); + assert(valueType && "only support memref type, please pass Affine " + "or bufferized Linalg IR as input"); + auto newValue = + builder.create(op->getLoc(), valueType); + auto copyOp = builder.create( + op->getLoc(), values.back(), newValue); // Set CopyOp dataflow level. copyOp->setAttr("dataflow_level", @@ -164,7 +217,6 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran, dataflowToMerge[successorDataflowLevel] = dataflowLevel; } } - } } // Collect all operations in each dataflow level. diff --git a/lib/Transforms/Graph/SplitFunction.cpp b/lib/Transforms/Graph/SplitFunction.cpp index 3d89d2d..8456d8a 100644 --- a/lib/Transforms/Graph/SplitFunction.cpp +++ b/lib/Transforms/Graph/SplitFunction.cpp @@ -35,7 +35,6 @@ static bool applySplitFunction(FuncOp func, ArrayRef ops, for (auto op : ops) for (auto result : op->getResults()) { internalValues.insert(result); - if (isLiveOut(result)) { outputTypes.push_back(result.getType()); outputValues.push_back(result); @@ -52,9 +51,13 @@ static bool applySplitFunction(FuncOp func, ArrayRef ops, for (auto op : ops) { // Push back all operands and liveins as candidates. 
SmallVector inputCandidates(op->getOperands()); - if (auto loop = dyn_cast(op)) { - auto liveIns = liveness.getLiveIn(loop.getBody()); - inputCandidates.append(liveIns.begin(), liveIns.end()); + for (auto ®ion : op->getRegions()) { + auto entryBlock = ®ion.front(); + auto args = entryBlock->getArguments(); + + for (auto liveIn : liveness.getLiveIn(entryBlock)) + if (llvm::find(args, liveIn) == args.end()) + inputCandidates.push_back(liveIn); } for (auto input : inputCandidates) { diff --git a/lib/Transforms/LegalizeToHLSCpp.cpp b/lib/Transforms/LegalizeToHLSCpp.cpp index 1035d35..c89f62a 100644 --- a/lib/Transforms/LegalizeToHLSCpp.cpp +++ b/lib/Transforms/LegalizeToHLSCpp.cpp @@ -57,10 +57,12 @@ struct MemrefStoreRewritePattern : public OpRewritePattern { bool scalehls::applyLegalizeToHLSCpp(FuncOp func, bool isTopFunc) { auto builder = OpBuilder(func); - // We constain functions to only contain one block. - if (func.getBlocks().size() != 1) + // We constrain functions to only contain one block. + if (!llvm::hasSingleElement(func)) func.emitError("has zero or more than one basic blocks."); + // TODO: Make sure there's no memref store/load or scf operations? + // Set function pragma attributes. if (auto fd = getFuncDirective(func)) setFuncDirective(func, fd.getPipeline(), fd.getTargetInterval(), diff --git a/lib/Transforms/Passes.cpp b/lib/Transforms/Passes.cpp index 08a0d71..d0a637e 100644 --- a/lib/Transforms/Passes.cpp +++ b/lib/Transforms/Passes.cpp @@ -43,9 +43,8 @@ void scalehls::registerScaleHLSPassPipeline() { // Adapt the model from torch-mlir or onnx-mlir front-end. 
if (opts.frontend == "torch") { - pm.addPass(mlir::createLinalgGeneralizationPass()); pm.addPass(mlir::createLinalgBufferizePass()); - pm.addPass(mlir::createFuncBufferizePass()); + pm.addPass(mlir::createLinalgGeneralizationPass()); pm.addPass(mlir::createCanonicalizerPass()); } else if (opts.frontend == "onnx") { pm.addPass(scalehls::createLegalizeOnnxPass()); @@ -57,9 +56,9 @@ void scalehls::registerScaleHLSPassPipeline() { // Graph-level optimizations. if (dataflowGran) { - pm.addPass(scalehls::createSimplifyGraphPass()); pm.addPass(scalehls::createLegalizeDataflowPass(dataflowGran)); pm.addPass(scalehls::createSplitFunctionPass()); + pm.addPass(scalehls::createSimplifyGraphPass()); pm.addPass(mlir::createConvertLinalgToAffineLoopsPass()); pm.addPass(mlir::createCanonicalizerPass()); } diff --git a/lib/Transforms/QoREstimation.cpp b/lib/Transforms/QoREstimation.cpp index ec96679..5da6e31 100644 --- a/lib/Transforms/QoREstimation.cpp +++ b/lib/Transforms/QoREstimation.cpp @@ -581,7 +581,7 @@ bool ScaleHLSEstimator::visitOp(CallOp op, int64_t begin) { // Block Scheduler and Estimator //===----------------------------------------------------------------------===// -// Get the pointer of the scrOp's parent loop, which should locat at the same +// Get the pointer of the srcOp's parent loop, which should locate at the same // level with dstOp's any parent loop. static Operation *getSameLevelDstOp(Operation *srcOp, Operation *dstOp) { // If srcOp and dstOp are already at the same level, return the srcOp. diff --git a/samples/pytorch/torch-mlir/README.md b/samples/pytorch/torch-mlir/README.md index ad37a19..8a2de4a 100644 --- a/samples/pytorch/torch-mlir/README.md +++ b/samples/pytorch/torch-mlir/README.md @@ -7,6 +7,7 @@ $ # Parse PyTorch model to Linalg dialect (with mlir_venv activated). 
$ python3 export_resnet18_mlir.py | torch-mlir-opt \ -torchscript-module-to-torch-backend-pipeline="optimize=true" \ -torch-backend-to-linalg-on-tensors-backend-pipeline="optimize=true" \ + -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops create-deallocs=false" \ -canonicalize > resnet18.mlir $ # Optimize the model and emit C++ code (not working, will be fixed soon).