diff --git a/include/scalehls/Transforms/Passes.h b/include/scalehls/Transforms/Passes.h index 73acf3c..a9c6d26 100644 --- a/include/scalehls/Transforms/Passes.h +++ b/include/scalehls/Transforms/Passes.h @@ -31,11 +31,9 @@ std::unique_ptr createMultipleLevelDSEPass(std::string dseTargetSpec); /// Graph optimization passes. std::unique_ptr createFakeQuantizePass(); std::unique_ptr createSimplifyTosaGraphPass(); -std::unique_ptr createLegalizeDataflowPass(); -std::unique_ptr -createLegalizeDataflowPass(unsigned dataflowGran, - bool dataflowInsertCopy = true); -std::unique_ptr createSplitFunctionPass(); +std::unique_ptr createFuncDataflowPass(); +std::unique_ptr createFuncDataflowPass(unsigned dataflowGran, + bool dataflowInsertCopy = true); std::unique_ptr createConvertCopyToAffineLoopsPass(); /// Runtime-related passes. diff --git a/include/scalehls/Transforms/Passes.td b/include/scalehls/Transforms/Passes.td index 0521755..001c7fe 100644 --- a/include/scalehls/Transforms/Passes.td +++ b/include/scalehls/Transforms/Passes.td @@ -77,15 +77,17 @@ def SimplifyTosaGraph : Pass<"scalehls-simplify-tosa-graph", "FuncOp"> { let constructor = "mlir::scalehls::createSimplifyTosaGraphPass()"; } -def LegalizeDataflow : Pass<"scalehls-legalize-dataflow", "FuncOp"> { - let summary = "Legalize the dataflow scheduling"; +def FuncDataflow : Pass<"scalehls-func-dataflow", "ModuleOp"> { + let summary = "Apply dataflow to functions"; let description = [{ - This legalize-dataflow pass will legalize the dataflow scheduling to meet + This func-dataflow pass will first legalize the dataflow scheduling to meet the requirements of the dataflow pragma: 1) single-producer single-consumer; - 2) no bypass paths. + 2) no bypass paths. Then, it will split operations/loops scheduled at the + same dataflow level into a separate sub-function and apply the dataflow + directive to the top function. }]; - let constructor = "mlir::scalehls::createLegalizeDataflowPass()"; + let constructor = "mlir::scalehls::createFuncDataflowPass()"; let options = [ Option<"insertCopy", "insert-copy", "bool", /*default=*/"true", @@ -95,17 +97,6 @@ def LegalizeDataflow : Pass<"scalehls-legalize-dataflow", "FuncOp"> { ]; } -def SplitFunction : Pass<"scalehls-split-function", "ModuleOp"> { - let summary = "Split function for enabling the dataflow pragma"; - let description = [{ - This split-function pass will split operations/loops scheduled at the same - dataflow level into a separate sub-function for applying the dataflow pragma - to the top function. - }]; - - let constructor = "mlir::scalehls::createSplitFunctionPass()"; -} - //===----------------------------------------------------------------------===// // Runtime-related Passes //===----------------------------------------------------------------------===// diff --git a/include/scalehls/Transforms/Utils.h b/include/scalehls/Transforms/Utils.h index 73af74a..105d288 100644 --- a/include/scalehls/Transforms/Utils.h +++ b/include/scalehls/Transforms/Utils.h @@ -96,13 +96,8 @@ bool applyLegalizeToHLSCpp(FuncOp func, bool topFunc, bool axiInterf = false); // Graph transform utils //===----------------------------------------------------------------------===// -/// Legalize the dataflow of "block", whose parent operation must be a function -/// or affine loop. Return false if the legalization failed, for example, the -/// dataflow has cycles. -bool applyLegalizeDataflow(Block &block, int64_t minGran, bool insertCopy); - -/// Split each dataflow stage of "block" into a separate sub-function. 
-bool applySplitFunction(Block &block); +/// Apply dataflow (coarse-grained pipeline) to the block. +bool applyDataflow(Block &block, unsigned minGran, bool insertCopy); /// Apply optimization strategy to a loop band. The ancestor function is also /// passed in because the post-tiling optimizations have to take function as diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 36ca884..10c33d7 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -4,9 +4,8 @@ add_mlir_library(MLIRScaleHLSTransforms Directive/FuncPipelining.cpp Directive/LoopPipelining.cpp Graph/FakeQuantize.cpp - Graph/LegalizeDataflow.cpp + Graph/FuncDataflow.cpp Graph/SimplifyTosaGraph.cpp - Graph/SplitFunction.cpp Loop/AffineLoopOrderOpt.cpp Loop/AffineLoopPerfection.cpp Loop/AffineLoopTile.cpp diff --git a/lib/Transforms/Graph/FuncDataflow.cpp b/lib/Transforms/Graph/FuncDataflow.cpp new file mode 100644 index 0000000..9cfcabc --- /dev/null +++ b/lib/Transforms/Graph/FuncDataflow.cpp @@ -0,0 +1,456 @@ +//===----------------------------------------------------------------------===// +// +// Copyright 2020-2021 The ScaleHLS Authors. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Liveness.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Tosa/IR/TosaOps.h" +#include "mlir/IR/Dominance.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "scalehls/Dialect/HLSCpp/HLSCpp.h" +#include "scalehls/Transforms/Passes.h" +#include "scalehls/Transforms/Utils.h" + +using namespace mlir; +using namespace scalehls; + +/// A dataflow use includes the intermediate value and the user operation, which +/// is similar to the concept of OpOperand in the SSA graph. +using DataflowUse = std::pair; +using DataflowUses = SmallVector; + +/// A mapping from an operation to all its dataflow uses. +using DataflowUsesMap = llvm::SmallDenseMap; + +namespace { +struct DataflowGraph { + DataflowGraph(Block &block); + + bool hasNode(Operation *node) const { return nodes.count(node); } + DataflowUses getNodeUses(Operation *node) const { + return usesMap.lookup(node); + } + +private: + // Hold all nodes in the dataflow graph. + llvm::SmallDenseSet nodes; + + // Hold the uses mapping. + DataflowUsesMap usesMap; +}; +} // namespace + +DataflowGraph::DataflowGraph(Block &block) { + // Results map of each operation. + DenseMap> resultsMap; + + for (auto &op : block) { + // Handle Linalg dialect operations. + if (isa(op.getDialect())) { + auto generic = dyn_cast(op); + if (!generic || !generic.hasBufferSemantics()) { + op.emitOpError("found ungeneralized or unbufferized linalg ops"); + return; + } + for (auto result : generic.getOutputOperands()) + resultsMap[&op].insert(result->get()); + continue; + } + + // Handle copy operations. + if (auto copy = dyn_cast(op)) + resultsMap[&op].insert(copy.getTarget()); + + // Handle memory stores. Child regions are recursively traversed, such that + // for and if operations are considered as a node of the dataflow. + op.walk([&](Operation *child) { + // TODO: Support transfer write? + if (auto affineStore = dyn_cast(child)) { + resultsMap[&op].insert(affineStore.getMemRef()); + + } else if (auto store = dyn_cast(child)) + resultsMap[&op].insert(store.getMemRef()); + }); + + // Handle normal SSA results. 
+ for (auto result : op.getResults()) + resultsMap[&op].insert(result); + } + + // Get the dominace tree for later use. + DominanceInfo DT(block.getParentOp()); + + // Find successors of all operations. + for (auto &op : block) { + // TODO: Some operations are dataflow source/sink/call node, which will not + // be scheduled. Any other operations should appear here? + if (isa(op)) + continue; + nodes.insert(&op); + + for (auto result : resultsMap.lookup(&op)) { + for (auto user : result.getUsers()) { + // If the same block user doesn't exist, or is not properly dominated, + // or is also an updater of the result, continue. + auto sameBlockUser = block.findAncestorOpInBlock(*user); + if (!sameBlockUser || isa(sameBlockUser) || + !DT.properlyDominates(&op, sameBlockUser)) + continue; + + // Only push back non-exist uses. + // TODO: Create a DenseMapInfo struct to make use SmallDenseSet. + auto &uses = usesMap[&op]; + auto newUse = DataflowUse({result, sameBlockUser}); + if (llvm::find(uses, newUse) == uses.end()) + uses.push_back(newUse); + } + } + } +} + +/// Legalize the dataflow of "block", whose parent operation must be a function +/// or affine loop. Return false if the legalization failed, for example, the +/// dataflow has cycles. +static bool applyLegalizeDataflow(Block &block, int64_t minGran, + bool insertCopy) { + auto builder = OpBuilder(block.getParentOp()); + DataflowGraph graph(block); + + llvm::SmallDenseMap map; + llvm::SmallDenseMap dataflowToMerge; + + // Walk through all dataflow operations in a reversed order for establishing + // a ALAP scheduling. + for (auto it = block.rbegin(); it != block.rend(); ++it) { + auto op = &*it; + if (!graph.hasNode(op)) + continue; + + // Walk through all uses and schedule the dataflow level. + int64_t dataflowLevel = 0; + for (auto use : graph.getNodeUses(op)) { + if (!map.count(use.second)) + return op->emitOpError("has unexpected use, legalize failed"), false; + dataflowLevel = std::max(dataflowLevel, map.lookup(use.second)); + } + map[op] = dataflowLevel + 1; + + // Eliminate bypass paths if detected. + for (auto use : graph.getNodeUses(op)) { + auto value = use.first; + auto successor = use.second; + + // Continue if bypass path does not exist. + auto successorDataflowLevel = map.lookup(successor); + if (dataflowLevel == successorDataflowLevel) + continue; + + // If insert-copy is set, insert CopyOp to the bypass path. Otherwise, + // record all the bypass paths in dataflowToMerge. + if (insertCopy) { + // Insert CopyOps if required. + SmallVector values; + values.push_back(value); + + builder.setInsertionPoint(successor); + for (auto i = dataflowLevel; i > successorDataflowLevel; --i) { + // Create and set the dataflow level of CopyOp. + Value newValue; + Operation *copyOp; + if (auto type = value.getType().dyn_cast()) { + newValue = builder.create(op->getLoc(), type); + copyOp = builder.create(op->getLoc(), values.back(), + newValue); + } else { + copyOp = builder.create( + op->getLoc(), value.getType(), values.back()); + newValue = copyOp->getResult(0); + } + map[copyOp] = i; + + // Chain created CopyOps. + if (i == successorDataflowLevel + 1) + value.replaceUsesWithIf(newValue, [&](OpOperand &use) { + return successor->isAncestor(use.getOwner()); + }); + else + values.push_back(newValue); + } + } else { + // Always retain the longest merge path. 
+ auto dst = dataflowToMerge.lookup(successorDataflowLevel); + dataflowToMerge[successorDataflowLevel] = std::max(dst, dataflowLevel); + } + } + } + + // Merge dataflow levels according to the bypasses and minimum granularity. + if (minGran != 1 || !insertCopy) { + // Collect all operations in each dataflow level. + DenseMap> dataflowOps; + for (auto &op : block.getOperations()) + if (map.count(&op)) + dataflowOps[map.lookup(&op)].push_back(&op); + + unsigned newLevel = 1; + unsigned toMerge = minGran; + for (unsigned i = 1, e = dataflowOps.size(); i <= e; ++i) { + // If the current level is the start point of a bypass, refresh toMerge. + // Otherwise, decrease toMerge by 1. + if (auto dst = dataflowToMerge.lookup(i)) + toMerge = dst - i; + else + toMerge--; + + // Annotate all ops in the current level to the new level. + for (auto op : dataflowOps[i]) + op->setAttr("dataflow_level", + builder.getIntegerAttr(builder.getI64Type(), newLevel)); + + // Update toMerge and newLevel if required. + if (toMerge == 0) { + toMerge = minGran; + ++newLevel; + } + } + } else { + for (auto pair : map) + pair.first->setAttr( + "dataflow_level", + builder.getIntegerAttr(builder.getI64Type(), pair.second)); + } + return true; +} + +static bool createSubFunction(Block &block, ArrayRef ops, + StringRef name, OpBuilder &builder) { + Liveness liveness(block.getParentOp()); + + // A helper that checks whether a value is a liveout value. + auto isLiveOut = [&](Value value) { + return any_of(value.getUsers(), [&](auto user) { + return all_of(ops, [&](auto op) { return !op->isAncestor(user); }); + }); + }; + + // Output types and values of the sub-function. + SmallVector outputTypes; + SmallVector outputValues; + + // Internal values of the sub-function. + llvm::SmallDenseSet internalValues; + + for (auto op : ops) + for (auto result : op->getResults()) { + internalValues.insert(result); + if (isLiveOut(result)) { + outputTypes.push_back(result.getType()); + outputValues.push_back(result); + } + } + + // Input types and values of the sub-function. + SmallVector inputTypes; + SmallVector inputValues; + + // Local buffers of the sub-function. + llvm::SmallDenseSet localOps; + + for (auto op : ops) { + // Push back all operands and liveins as candidates. + SmallVector inputCandidates(op->getOperands()); + for (auto ®ion : op->getRegions()) { + auto entryBlock = ®ion.front(); + auto args = entryBlock->getArguments(); + + for (auto liveIn : liveness.getLiveIn(entryBlock)) + if (llvm::find(args, liveIn) == args.end()) + inputCandidates.push_back(liveIn); + } + + for (auto input : inputCandidates) { + // If the current input is a induction variable or internal value, it + // doesn't needs to be passed in as argument. + if (isForInductionVar(input) || internalValues.count(input)) + continue; + + if (auto defOp = input.getDefiningOp()) { + // If the current input is not a liveout and it's defined by an memref + // alloc/alloca/get_global or tensor_init op, it is a local buffer and + // can be localized later. + if (!isLiveOut(input) && + isa(defOp)) { + localOps.insert(defOp); + continue; + } + + // Since we have localized all tosa constant operations, we can safely + // insert a constant as a local op here. + if (isa(defOp)) { + localOps.insert(defOp); + continue; + } + } + + // Only unique inputs will be added. + if (llvm::find(inputValues, input) != inputValues.end()) + continue; + + inputTypes.push_back(input.getType()); + inputValues.push_back(input); + } + } + + // Create a new function for the current dataflow level. 
+ auto loc = builder.getUnknownLoc(); + builder.setInsertionPoint(block.getParent()->getParentOfType()); + auto subFunc = builder.create( + loc, name, builder.getFunctionType(inputTypes, outputTypes)); + + // Create a function call and reconnect all inputs and outputs. + builder.setInsertionPointAfter(ops.back()); + auto call = builder.create(loc, subFunc, inputValues); + unsigned outputIdx = 0; + for (auto result : call.getResults()) + outputValues[outputIdx++].replaceAllUsesWith(result); + + // Create new return operation in the new created function. + auto entry = subFunc.addEntryBlock(); + builder.setInsertionPointToEnd(entry); + auto returnOp = builder.create(loc, outputValues); + + // Move local buffers into the new created function. + for (auto localOp : localOps) + localOp->moveBefore(&subFunc.front().front()); + + // Move same level operations into the new created function. + for (auto op : ops) { + op->moveBefore(returnOp); + op->removeAttr("dataflow_level"); + } + + // Connect operands to the arguments of the new created function. + for (unsigned i = 0, e = inputValues.size(); i < e; ++i) + inputValues[i].replaceUsesWithIf( + entry->getArgument(i), + [&](OpOperand &use) { return subFunc->isAncestor(use.getOwner()); }); + + return true; +} + +/// Split each dataflow stage of "block" into a separate sub-function. +static bool applySplitFunction(Block &block) { + auto builder = OpBuilder(block.getParentOp()); + + // Collect all constants that have more than one use. + SmallVector constants; + block.walk([&](tosa::ConstOp constant) { + if (!constant->hasOneUse()) + constants.push_back(constant); + }); + // Localize constants to each of its use. + for (auto constant : constants) { + for (auto &use : llvm::make_early_inc_range(constant->getUses())) { + auto cloneConstant = constant->clone(); + builder.setInsertionPoint(use.getOwner()); + builder.insert(cloneConstant); + use.set(cloneConstant->getResult(0)); + } + } + + // Split sub-functions. + DenseMap> dataflowOps; + for (auto &op : block) + if (auto attr = op.getAttrOfType("dataflow_level")) + dataflowOps[attr.getInt()].push_back(&op); + + for (auto pair : dataflowOps) { + auto name = "dataflow" + std::to_string(pair.first); + if (!createSubFunction(block, pair.second, name, builder)) + return false; + } + return true; +} + +namespace { +/// The tosa reshape to tensor reshape conversion. +struct ReshapeOpRewritePattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::ReshapeOp reshape, + PatternRewriter &rewriter) const override { + rewriter.setInsertionPoint(reshape); + auto newShapeType = RankedTensorType::get( + {(int64_t)reshape.new_shape().size()}, rewriter.getI32Type()); + auto newShapeArray = llvm::to_vector<8>( + llvm::map_range(reshape.new_shape(), [&](Attribute attr) { + return APInt(32, attr.cast().getInt()); + })); + auto newShapeAttr = DenseIntElementsAttr::get(newShapeType, newShapeArray); + + auto newShape = + rewriter.create(reshape.getLoc(), newShapeAttr); + rewriter.replaceOpWithNewOp(reshape, reshape.getType(), + reshape.input1(), newShape); + return success(); + } +}; +} // namespace + +/// Apply dataflow (coarse-grained pipeline) to the block. 
+bool scalehls::applyDataflow(Block &block, unsigned minGran, bool insertCopy) { + if (!applyLegalizeDataflow(block, minGran, insertCopy)) + return false; + if (!applySplitFunction(block)) + return false; + + auto parentOp = block.getParentOp(); + if (isa(parentOp)) + setFuncDirective(parentOp, false, 1, true); + else if (isa(parentOp)) + setLoopDirective(parentOp, false, 1, true, false); + else + return false; + + return true; +} + +namespace { +struct FuncDataflow : public FuncDataflowBase { + FuncDataflow() = default; + FuncDataflow(unsigned dataflowGran, bool dataflowInsertCopy) { + minGran = dataflowGran; + insertCopy = dataflowInsertCopy; + } + + void runOnOperation() override { + auto module = getOperation(); + + // Split each functions in the module. + for (auto func : llvm::make_early_inc_range(module.getOps())) + applyDataflow(func.front(), minGran, insertCopy); + + // Simplify copy and assign operations generated by LegalizeDataflow. + auto context = module.getContext(); + mlir::RewritePatternSet patterns(context); + patterns.add(context); + hlscpp::AssignOp::getCanonicalizationPatterns(patterns, context); + (void)applyPatternsAndFoldGreedily(module, std::move(patterns)); + } +}; +} // namespace + +std::unique_ptr scalehls::createFuncDataflowPass() { + return std::make_unique(); +} +std::unique_ptr +scalehls::createFuncDataflowPass(unsigned dataflowGran, + bool dataflowInsertCopy) { + return std::make_unique(dataflowGran, dataflowInsertCopy); +} diff --git a/lib/Transforms/Graph/LegalizeDataflow.cpp b/lib/Transforms/Graph/LegalizeDataflow.cpp deleted file mode 100644 index 064bc9d..0000000 --- a/lib/Transforms/Graph/LegalizeDataflow.cpp +++ /dev/null @@ -1,252 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Copyright 2020-2021 The ScaleHLS Authors. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/Dialect/Tosa/IR/TosaOps.h" -#include "mlir/IR/Dominance.h" -#include "scalehls/Transforms/Passes.h" -#include "scalehls/Transforms/Utils.h" - -using namespace mlir; -using namespace scalehls; - -// A dataflow use includes the intermediate value and the user operation, which -// is similar to the concept of OpOperand in the SSA graph. -using DataflowUse = std::pair; -using DataflowUses = SmallVector; - -// A mapping from an operation to all its dataflow uses. -using DataflowUsesMap = llvm::SmallDenseMap; - -namespace { -struct DataflowGraph { - DataflowGraph(Block &block); - - bool hasNode(Operation *node) const { return nodes.count(node); } - DataflowUses getNodeUses(Operation *node) const { - return usesMap.lookup(node); - } - -private: - // Hold all nodes in the dataflow graph. - llvm::SmallDenseSet nodes; - - // Hold the uses mapping. - DataflowUsesMap usesMap; -}; -} // namespace - -DataflowGraph::DataflowGraph(Block &block) { - // Results map of each operation. - DenseMap> resultsMap; - - for (auto &op : block) { - // Handle Linalg dialect operations. - if (isa(op.getDialect())) { - auto generic = dyn_cast(op); - if (!generic || !generic.hasBufferSemantics()) { - op.emitOpError("found ungeneralized or unbufferized linalg ops"); - return; - } - for (auto result : generic.getOutputOperands()) - resultsMap[&op].insert(result->get()); - continue; - } - - // Handle copy operations. 
- if (auto copy = dyn_cast(op)) - resultsMap[&op].insert(copy.getTarget()); - - // Handle memory stores. Child regions are recursively traversed, such that - // for and if operations are considered as a node of the dataflow. - op.walk([&](Operation *child) { - // TODO: Support transfer write? - if (auto affineStore = dyn_cast(child)) { - resultsMap[&op].insert(affineStore.getMemRef()); - - } else if (auto store = dyn_cast(child)) - resultsMap[&op].insert(store.getMemRef()); - }); - - // Handle normal SSA results. - for (auto result : op.getResults()) - resultsMap[&op].insert(result); - } - - // Get the dominace tree for later use. - DominanceInfo DT(block.getParentOp()); - - // Find successors of all operations. - for (auto &op : block) { - // TODO: Some operations are dataflow source/sink/call node, which will not - // be scheduled. Any other operations should appear here? - if (isa(op)) - continue; - nodes.insert(&op); - - for (auto result : resultsMap.lookup(&op)) { - for (auto user : result.getUsers()) { - // If the same block user doesn't exist, or is not properly dominated, - // or is also an updater of the result, continue. - auto sameBlockUser = block.findAncestorOpInBlock(*user); - if (!sameBlockUser || isa(sameBlockUser) || - !DT.properlyDominates(&op, sameBlockUser)) - continue; - - // Only push back non-exist uses. - // TODO: Create a DenseMapInfo struct to make use SmallDenseSet. - auto &uses = usesMap[&op]; - auto newUse = DataflowUse({result, sameBlockUser}); - if (llvm::find(uses, newUse) == uses.end()) - uses.push_back(newUse); - } - } - } -} - -/// Legalize the dataflow of "block", whose parent operation must be a function -/// or affine loop. Return false if the legalization failed, for example, the -/// dataflow has cycles. -bool scalehls::applyLegalizeDataflow(Block &block, int64_t minGran, - bool insertCopy) { - auto builder = OpBuilder(block.getParentOp()); - DataflowGraph graph(block); - - llvm::SmallDenseMap map; - llvm::SmallDenseMap dataflowToMerge; - - // Walk through all dataflow operations in a reversed order for establishing - // a ALAP scheduling. - for (auto it = block.rbegin(); it != block.rend(); ++it) { - auto op = &*it; - if (!graph.hasNode(op)) - continue; - - // Walk through all uses and schedule the dataflow level. - int64_t dataflowLevel = 0; - for (auto use : graph.getNodeUses(op)) { - if (!map.count(use.second)) - return op->emitOpError("has unexpected use, legalize failed"), false; - dataflowLevel = std::max(dataflowLevel, map.lookup(use.second)); - } - map[op] = dataflowLevel + 1; - - // Eliminate bypass paths if detected. - for (auto use : graph.getNodeUses(op)) { - auto value = use.first; - auto successor = use.second; - - // Continue if bypass path does not exist. - auto successorDataflowLevel = map.lookup(successor); - if (dataflowLevel == successorDataflowLevel) - continue; - - // If insert-copy is set, insert CopyOp to the bypass path. Otherwise, - // record all the bypass paths in dataflowToMerge. - if (insertCopy) { - // Insert CopyOps if required. - SmallVector values; - values.push_back(value); - - builder.setInsertionPoint(successor); - for (auto i = dataflowLevel; i > successorDataflowLevel; --i) { - // Create and set the dataflow level of CopyOp. 
- Value newValue; - Operation *copyOp; - if (auto type = value.getType().dyn_cast()) { - newValue = builder.create(op->getLoc(), type); - copyOp = builder.create(op->getLoc(), values.back(), - newValue); - } else { - copyOp = builder.create( - op->getLoc(), value.getType(), values.back()); - newValue = copyOp->getResult(0); - } - map[copyOp] = i; - - // Chain created CopyOps. - if (i == successorDataflowLevel + 1) - value.replaceUsesWithIf(newValue, [&](OpOperand &use) { - return successor->isAncestor(use.getOwner()); - }); - else - values.push_back(newValue); - } - } else { - // Always retain the longest merge path. - auto dst = dataflowToMerge.lookup(successorDataflowLevel); - dataflowToMerge[successorDataflowLevel] = std::max(dst, dataflowLevel); - } - } - } - - // Merge dataflow levels according to the bypasses and minimum granularity. - if (minGran != 1 || !insertCopy) { - // Collect all operations in each dataflow level. - DenseMap> dataflowOps; - for (auto &op : block.getOperations()) - if (map.count(&op)) - dataflowOps[map.lookup(&op)].push_back(&op); - - unsigned newLevel = 1; - unsigned toMerge = minGran; - for (unsigned i = 1, e = dataflowOps.size(); i <= e; ++i) { - // If the current level is the start point of a bypass, refresh toMerge. - // Otherwise, decrease toMerge by 1. - if (auto dst = dataflowToMerge.lookup(i)) - toMerge = dst - i; - else - toMerge--; - - // Annotate all ops in the current level to the new level. - for (auto op : dataflowOps[i]) - op->setAttr("dataflow_level", - builder.getIntegerAttr(builder.getI64Type(), newLevel)); - - // Update toMerge and newLevel if required. - if (toMerge == 0) { - toMerge = minGran; - ++newLevel; - } - } - } else { - for (auto pair : map) - pair.first->setAttr( - "dataflow_level", - builder.getIntegerAttr(builder.getI64Type(), pair.second)); - } - return true; -} - -namespace { -struct LegalizeDataflow : public LegalizeDataflowBase { - LegalizeDataflow() = default; - LegalizeDataflow(unsigned dataflowGran, bool dataflowInsertCopy) { - minGran = dataflowGran; - insertCopy = dataflowInsertCopy; - } - - void runOnOperation() override { - auto func = getOperation(); - applyLegalizeDataflow(func.front(), minGran, insertCopy); - setFuncDirective(func, false, 1, true); - } -}; -} // namespace - -std::unique_ptr scalehls::createLegalizeDataflowPass() { - return std::make_unique(); -} -std::unique_ptr -scalehls::createLegalizeDataflowPass(unsigned dataflowGran, - bool dataflowInsertCopy) { - return std::make_unique(dataflowGran, dataflowInsertCopy); -} diff --git a/lib/Transforms/Graph/SplitFunction.cpp b/lib/Transforms/Graph/SplitFunction.cpp deleted file mode 100644 index 443e2ff..0000000 --- a/lib/Transforms/Graph/SplitFunction.cpp +++ /dev/null @@ -1,216 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Copyright 2020-2021 The ScaleHLS Authors. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Analysis/Liveness.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Tosa/IR/TosaOps.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "scalehls/Dialect/HLSCpp/HLSCpp.h" -#include "scalehls/Transforms/Passes.h" -#include "scalehls/Transforms/Utils.h" - -using namespace mlir; -using namespace scalehls; - -static bool createSubFunction(Block &block, ArrayRef ops, - StringRef name, OpBuilder &builder) { - Liveness liveness(block.getParentOp()); - - // A helper that checks whether a value is a liveout value. 
- auto isLiveOut = [&](Value value) { - return any_of(value.getUsers(), [&](auto user) { - return all_of(ops, [&](auto op) { return !op->isAncestor(user); }); - }); - }; - - // Output types and values of the sub-function. - SmallVector outputTypes; - SmallVector outputValues; - - // Internal values of the sub-function. - llvm::SmallDenseSet internalValues; - - for (auto op : ops) - for (auto result : op->getResults()) { - internalValues.insert(result); - if (isLiveOut(result)) { - outputTypes.push_back(result.getType()); - outputValues.push_back(result); - } - } - - // Input types and values of the sub-function. - SmallVector inputTypes; - SmallVector inputValues; - - // Local buffers of the sub-function. - llvm::SmallDenseSet localOps; - - for (auto op : ops) { - // Push back all operands and liveins as candidates. - SmallVector inputCandidates(op->getOperands()); - for (auto ®ion : op->getRegions()) { - auto entryBlock = ®ion.front(); - auto args = entryBlock->getArguments(); - - for (auto liveIn : liveness.getLiveIn(entryBlock)) - if (llvm::find(args, liveIn) == args.end()) - inputCandidates.push_back(liveIn); - } - - for (auto input : inputCandidates) { - // If the current input is a induction variable or internal value, it - // doesn't needs to be passed in as argument. - if (isForInductionVar(input) || internalValues.count(input)) - continue; - - if (auto defOp = input.getDefiningOp()) { - // If the current input is not a liveout and it's defined by an memref - // alloc/alloca/get_global or tensor_init op, it is a local buffer and - // can be localized later. - if (!isLiveOut(input) && - isa(defOp)) { - localOps.insert(defOp); - continue; - } - - // Since we have localized all tosa constant operations, we can safely - // insert a constant as a local op here. - if (isa(defOp)) { - localOps.insert(defOp); - continue; - } - } - - // Only unique inputs will be added. - if (llvm::find(inputValues, input) != inputValues.end()) - continue; - - inputTypes.push_back(input.getType()); - inputValues.push_back(input); - } - } - - // Create a new function for the current dataflow level. - auto loc = builder.getUnknownLoc(); - builder.setInsertionPoint(block.getParent()->getParentOfType()); - auto subFunc = builder.create( - loc, name, builder.getFunctionType(inputTypes, outputTypes)); - - // Create a function call and reconnect all inputs and outputs. - builder.setInsertionPointAfter(ops.back()); - auto call = builder.create(loc, subFunc, inputValues); - unsigned outputIdx = 0; - for (auto result : call.getResults()) - outputValues[outputIdx++].replaceAllUsesWith(result); - - // Create new return operation in the new created function. - auto entry = subFunc.addEntryBlock(); - builder.setInsertionPointToEnd(entry); - auto returnOp = builder.create(loc, outputValues); - - // Move local buffers into the new created function. - for (auto localOp : localOps) - localOp->moveBefore(&subFunc.front().front()); - - // Move same level operations into the new created function. - for (auto op : ops) { - op->moveBefore(returnOp); - op->removeAttr("dataflow_level"); - } - - // Connect operands to the arguments of the new created function. - for (unsigned i = 0, e = inputValues.size(); i < e; ++i) - inputValues[i].replaceUsesWithIf( - entry->getArgument(i), - [&](OpOperand &use) { return subFunc->isAncestor(use.getOwner()); }); - - return true; -} - -/// Split each dataflow stage of "block" into a separate sub-function. 
-bool scalehls::applySplitFunction(Block &block) { - auto builder = OpBuilder(block.getParentOp()); - - // Collect all constants that have more than one use. - SmallVector constants; - block.walk([&](tosa::ConstOp constant) { - if (!constant->hasOneUse()) - constants.push_back(constant); - }); - // Localize constants to each of its use. - for (auto constant : constants) { - for (auto &use : llvm::make_early_inc_range(constant->getUses())) { - auto cloneConstant = constant->clone(); - builder.setInsertionPoint(use.getOwner()); - builder.insert(cloneConstant); - use.set(cloneConstant->getResult(0)); - } - } - - // Split sub-functions. - DenseMap> dataflowOps; - for (auto &op : block) - if (auto attr = op.getAttrOfType("dataflow_level")) - dataflowOps[attr.getInt()].push_back(&op); - - for (auto pair : dataflowOps) { - auto name = "dataflow" + std::to_string(pair.first); - if (!createSubFunction(block, pair.second, name, builder)) - return false; - } - return true; -} - -namespace { -/// The tosa reshape to tensor reshape conversion. -struct ReshapeOpRewritePattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tosa::ReshapeOp reshape, - PatternRewriter &rewriter) const override { - rewriter.setInsertionPoint(reshape); - auto newShapeType = RankedTensorType::get( - {(int64_t)reshape.new_shape().size()}, rewriter.getI32Type()); - auto newShapeArray = llvm::to_vector<8>( - llvm::map_range(reshape.new_shape(), [&](Attribute attr) { - return APInt(32, attr.cast().getInt()); - })); - auto newShapeAttr = DenseIntElementsAttr::get(newShapeType, newShapeArray); - - auto newShape = - rewriter.create(reshape.getLoc(), newShapeAttr); - rewriter.replaceOpWithNewOp(reshape, reshape.getType(), - reshape.input1(), newShape); - return success(); - } -}; -} // namespace - -namespace { -struct SplitFunction : public SplitFunctionBase { - void runOnOperation() override { - auto module = getOperation(); - auto context = module.getContext(); - - // Split each functions in the module. - for (auto func : llvm::make_early_inc_range(module.getOps())) - applySplitFunction(func.front()); - - // Simplify copy and assign operations generated by LegalizeDataflow. - mlir::RewritePatternSet patterns(context); - // TODO: This reshape op rewriting should be factored out! It's quite weird - // to see this as a part of SplitFunction. - patterns.add(context); - hlscpp::AssignOp::getCanonicalizationPatterns(patterns, context); - (void)applyPatternsAndFoldGreedily(module, std::move(patterns)); - } -}; -} // namespace - -std::unique_ptr scalehls::createSplitFunctionPass() { - return std::make_unique(); -} diff --git a/lib/Transforms/Passes.cpp b/lib/Transforms/Passes.cpp index de7c857..c604d98 100644 --- a/lib/Transforms/Passes.cpp +++ b/lib/Transforms/Passes.cpp @@ -62,9 +62,8 @@ void scalehls::registerScaleHLSDSEPipeline() { // If AXI interfaces are created, we need to dataflow the program to // hide the latency of data load/store from/to external memories. 
if (opts.hlsAxiInterf) { - pm.addPass(scalehls::createLegalizeDataflowPass( + pm.addPass(scalehls::createFuncDataflowPass( /*dataflowGran=*/(unsigned)1, /*dataflowInsertCopy=*/false)); - pm.addPass(scalehls::createSplitFunctionPass()); pm.addPass(scalehls::createConvertCopyToAffineLoopsPass()); } @@ -129,8 +128,7 @@ void scalehls::registerScaleHLSPyTorchPipeline() { pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(scalehls::createSimplifyTosaGraphPass()); if (dataflowGran) - pm.addPass(scalehls::createLegalizeDataflowPass(dataflowGran)); - pm.addPass(scalehls::createSplitFunctionPass()); + pm.addPass(scalehls::createFuncDataflowPass(dataflowGran)); pm.addPass(tosa::createTosaToLinalgNamed()); pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(tosa::createTosaToLinalg()); diff --git a/test/Transforms/Graph/legalize_dataflow.mlir b/test/Transforms/Graph/func_dataflow.mlir similarity index 54% rename from test/Transforms/Graph/legalize_dataflow.mlir rename to test/Transforms/Graph/func_dataflow.mlir index dff7f8d..24065f9 100644 --- a/test/Transforms/Graph/legalize_dataflow.mlir +++ b/test/Transforms/Graph/func_dataflow.mlir @@ -1,8 +1,52 @@ -// RUN: scalehls-opt -scalehls-legalize-dataflow="min-gran=3 insert-copy=true" %s | FileCheck %s +// RUN: scalehls-opt -scalehls-func-dataflow="min-gran=3 insert-copy=true" %s | FileCheck %s module { + // CHECK: func @dataflow2(%arg0: tensor<1x32x32x64xi8>) -> tensor<1x1x64xi8> { + // CHECK: %1 = "tosa.avg_pool2d" + // CHECK: %2 = "tosa.transpose" + // CHECK: %3 = tensor.reshape + // CHECK: return %3 : tensor<1x1x64xi8> + // CHECK: } + + // CHECK: func @dataflow4(%arg0: tensor<1x32x32x64xi8>) -> (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) { + // CHECK: %2 = "tosa.clamp" + // CHECK: %3 = "tosa.conv2d" + // CHECK: %4 = "tosa.clamp" + // CHECK: %5 = "hlscpp.assign" + // CHECK: return %4, %5 + // CHECK: } + + // CHECK: func @dataflow1(%arg0: tensor<1x1x64xi8>) -> tensor<1x10xi8> { + // CHECK: %2 = "tosa.matmul" + // CHECK: %3 = tensor.reshape + // CHECK: %4 = "tosa.add" + // CHECK: return %4 + // CHECK: } + + // CHECK: func @dataflow3(%arg0: tensor<1x32x32x64xi8>, %arg1: tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> { + // CHECK: %2 = "tosa.conv2d" + // CHECK: %3 = "hlscpp.assign" + // CHECK: %4 = "tosa.add" + // CHECK: %5 = "tosa.clamp" + // CHECK: return %5 + // CHECK: } + + // CHECK: func @dataflow5(%arg0: tensor<1x3x32x32xi8>) -> tensor<1x32x32x64xi8> { + // CHECK: %3 = "tosa.transpose" + // CHECK: %4 = "tosa.conv2d" + // CHECK: return %4 + // CHECK: } + // CHECK: func @forward(%arg0: tensor<1x3x32x32xi8>) -> tensor<1x10xi8> attributes {func_directive = #hlscpp.fd} { func @forward(%arg0: tensor<1x3x32x32xi8>) -> tensor<1x10xi8> { + // CHECK-NOT: %0 = "tosa.const"() {value = dense<0> : tensor<1x10xi8>} : () -> tensor<1x10xi8> + // CHECK-NOT: %1 = "tosa.const"() {value = dense<1> : tensor<1x64x10xi8>} : () -> tensor<1x64x10xi8> + // CHECK-NOT: %2 = "tosa.const"() {value = dense<2> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> + // CHECK-NOT: %3 = "tosa.const"() {value = dense<3> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> + // CHECK-NOT: %4 = "tosa.const"() {value = dense<4> : tensor<64x3x3x3xi8>} : () -> tensor<64x3x3x3xi8> + // CHECK-NOT: %5 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + // CHECK-NOT: %6 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + // CHECK-NOT: %7 = "tosa.const"() {value = dense<5> : tensor<64xi8>} : () -> tensor<64xi8> %0 
= "tosa.const"() {value = dense<0> : tensor<1x10xi8>} : () -> tensor<1x10xi8> %1 = "tosa.const"() {value = dense<1> : tensor<1x64x10xi8>} : () -> tensor<1x64x10xi8> %2 = "tosa.const"() {value = dense<2> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> @@ -12,43 +56,31 @@ module { %6 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %7 = "tosa.const"() {value = dense<5> : tensor<64xi8>} : () -> tensor<64xi8> - // CHECK: %8 = "tosa.transpose"(%arg0, %6) - // CHECK-SAME: dataflow_level = 5 + // CHECK: %0 = call @dataflow5(%arg0) : (tensor<1x3x32x32xi8>) -> tensor<1x32x32x64xi8> %8 = "tosa.transpose"(%arg0, %6) : (tensor<1x3x32x32xi8>, tensor<4xi32>) -> tensor<1x32x32x3xi8> %9 = "tosa.conv2d"(%8, %4, %7) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]} : (tensor<1x32x32x3xi8>, tensor<64x3x3x3xi8>, tensor<64xi8>) -> tensor<1x32x32x64xi8> - // CHECK: %10 = "tosa.clamp"(%9) - // CHECK-SAME: dataflow_level = 4 + // CHECK: %1:2 = call @dataflow4(%0) : (tensor<1x32x32x64xi8>) -> (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) %10 = "tosa.clamp"(%9) {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> %11 = "tosa.conv2d"(%10, %3, %7) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]} : (tensor<1x32x32x64xi8>, tensor<64x3x3x64xi8>, tensor<64xi8>) -> tensor<1x32x32x64xi8> %12 = "tosa.clamp"(%11) {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - - // CHECK: %13 = "tosa.conv2d"(%12, %2, %7) - // CHECK-SAME: dataflow_level = 3 %13 = "tosa.conv2d"(%12, %2, %7) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]} : (tensor<1x32x32x64xi8>, tensor<64x3x3x64xi8>, tensor<64xi8>) -> tensor<1x32x32x64xi8> - // CHECK: %14 = "hlscpp.assign"(%10) - // CHECK-SAME: dataflow_level = 4 - // CHECK: %15 = "hlscpp.assign"(%14) - // CHECK-SAME: dataflow_level = 4 - // CHECK: %16 = "hlscpp.assign"(%15) - // CHECK-SAME: dataflow_level = 3 - // CHECK: %17 = "tosa.add"(%13, %16) - // CHECK-SAME: dataflow_level = 3 + // CHECK: %2 = call @dataflow3(%1#0, %1#1) : (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> %14 = "tosa.add"(%13, %10) : (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> %15 = "tosa.clamp"(%14) {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - // CHECK: %19 = "tosa.avg_pool2d"(%18) - // CHECK-SAME: dataflow_level = 2 + // CHECK: %3 = call @dataflow2(%2) : (tensor<1x32x32x64xi8>) -> tensor<1x1x64xi8> %16 = "tosa.avg_pool2d"(%15) {kernel = [32, 32], pad = [0, 0, 0, 0], quantization_info = {input_zp = 0 : i32, output_zp = 0 : i32}, stride = [32, 32]} : (tensor<1x32x32x64xi8>) -> tensor<1x1x1x64xi8> %17 = "tosa.transpose"(%16, %5) : (tensor<1x1x1x64xi8>, tensor<4xi32>) -> tensor<1x64x1x1xi8> %18 = "tosa.reshape"(%17) {new_shape = [1, 1, 64]} : (tensor<1x64x1x1xi8>) -> tensor<1x1x64xi8> - // CHECK: %22 = "tosa.matmul"(%21, %1) - // CHECK-SAME: dataflow_level = 1 + // CHECK: %4 = call @dataflow1(%3) : (tensor<1x1x64xi8>) -> tensor<1x10xi8> %19 = "tosa.matmul"(%18, %1) {quantization_info = {a_zp = 0 : i32, b_zp = 
0 : i32}} : (tensor<1x1x64xi8>, tensor<1x64x10xi8>) -> tensor<1x1x10xi8> %20 = "tosa.reshape"(%19) {new_shape = [1, 10]} : (tensor<1x1x10xi8>) -> tensor<1x10xi8> %21 = "tosa.add"(%20, %0) : (tensor<1x10xi8>, tensor<1x10xi8>) -> tensor<1x10xi8> + + // CHECK: return %4 : tensor<1x10xi8> return %21 : tensor<1x10xi8> } } diff --git a/test/Transforms/Graph/split_function.mlir b/test/Transforms/Graph/split_function.mlir deleted file mode 100644 index 4a56b24..0000000 --- a/test/Transforms/Graph/split_function.mlir +++ /dev/null @@ -1,89 +0,0 @@ -// RUN: scalehls-opt -scalehls-split-function %s | FileCheck %s - -module { - // CHECK: func @dataflow2(%arg0: tensor<1x32x32x64xi8>) -> tensor<1x1x64xi8> { - // CHECK: %1 = "tosa.avg_pool2d" - // CHECK: %2 = "tosa.transpose" - // CHECK: %3 = tensor.reshape - // CHECK: return %3 : tensor<1x1x64xi8> - // CHECK: } - - // CHECK: func @dataflow4(%arg0: tensor<1x32x32x64xi8>) -> (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) { - // CHECK: %2 = "tosa.clamp" - // CHECK: %3 = "tosa.conv2d" - // CHECK: %4 = "tosa.clamp" - // CHECK: %5 = "hlscpp.assign" - // CHECK: return %4, %5 - // CHECK: } - - // CHECK: func @dataflow1(%arg0: tensor<1x1x64xi8>) -> tensor<1x10xi8> { - // CHECK: %2 = "tosa.matmul" - // CHECK: %3 = tensor.reshape - // CHECK: %4 = "tosa.add" - // CHECK: return %4 - // CHECK: } - - // CHECK: func @dataflow3(%arg0: tensor<1x32x32x64xi8>, %arg1: tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> { - // CHECK: %2 = "tosa.conv2d" - // CHECK: %3 = "hlscpp.assign" - // CHECK: %4 = "tosa.add" - // CHECK: %5 = "tosa.clamp" - // CHECK: return %5 - // CHECK: } - - // CHECK: func @dataflow5(%arg0: tensor<1x3x32x32xi8>) -> tensor<1x32x32x64xi8> { - // CHECK: %3 = "tosa.transpose" - // CHECK: %4 = "tosa.conv2d" - // CHECK: return %4 - // CHECK: } - - // CHECK: func @forward(%arg0: tensor<1x3x32x32xi8>) -> tensor<1x10xi8> attributes {func_directive = #hlscpp.fd} { - func @forward(%arg0: tensor<1x3x32x32xi8>) -> tensor<1x10xi8> attributes {func_directive = #hlscpp.fd} { - // CHECK-NOT: %0 = "tosa.const"() {value = dense<0> : tensor<1x10xi8>} : () -> tensor<1x10xi8> - // CHECK-NOT: %1 = "tosa.const"() {value = dense<1> : tensor<1x64x10xi8>} : () -> tensor<1x64x10xi8> - // CHECK-NOT: %2 = "tosa.const"() {value = dense<2> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> - // CHECK-NOT: %3 = "tosa.const"() {value = dense<3> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> - // CHECK-NOT: %4 = "tosa.const"() {value = dense<4> : tensor<64x3x3x3xi8>} : () -> tensor<64x3x3x3xi8> - // CHECK-NOT: %5 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> - // CHECK-NOT: %6 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - // CHECK-NOT: %7 = "tosa.const"() {value = dense<5> : tensor<64xi8>} : () -> tensor<64xi8> - %0 = "tosa.const"() {value = dense<0> : tensor<1x10xi8>} : () -> tensor<1x10xi8> - %1 = "tosa.const"() {value = dense<1> : tensor<1x64x10xi8>} : () -> tensor<1x64x10xi8> - %2 = "tosa.const"() {value = dense<2> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> - %3 = "tosa.const"() {value = dense<3> : tensor<64x3x3x64xi8>} : () -> tensor<64x3x3x64xi8> - %4 = "tosa.const"() {value = dense<4> : tensor<64x3x3x3xi8>} : () -> tensor<64x3x3x3xi8> - %5 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> - %6 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - %7 = "tosa.const"() {value = dense<5> : tensor<64xi8>} : () -> 
tensor<64xi8> - - // CHECK: %0 = call @dataflow5(%arg0) : (tensor<1x3x32x32xi8>) -> tensor<1x32x32x64xi8> - %8 = "tosa.transpose"(%arg0, %6) {dataflow_level = 5 : i64} : (tensor<1x3x32x32xi8>, tensor<4xi32>) -> tensor<1x32x32x3xi8> - %9 = "tosa.conv2d"(%8, %4, %7) {dataflow_level = 5 : i64, dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]} : (tensor<1x32x32x3xi8>, tensor<64x3x3x3xi8>, tensor<64xi8>) -> tensor<1x32x32x64xi8> - - // CHECK: %1:2 = call @dataflow4(%0) : (tensor<1x32x32x64xi8>) -> (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) - %10 = "tosa.clamp"(%9) {dataflow_level = 4 : i64, max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - %11 = "tosa.conv2d"(%10, %3, %7) {dataflow_level = 4 : i64, dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]} : (tensor<1x32x32x64xi8>, tensor<64x3x3x64xi8>, tensor<64xi8>) -> tensor<1x32x32x64xi8> - %12 = "tosa.clamp"(%11) {dataflow_level = 4 : i64, max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - %13 = "tosa.conv2d"(%12, %2, %7) {dataflow_level = 3 : i64, dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]} : (tensor<1x32x32x64xi8>, tensor<64x3x3x64xi8>, tensor<64xi8>) -> tensor<1x32x32x64xi8> - %14 = "hlscpp.assign"(%10) {dataflow_level = 4 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - %15 = "hlscpp.assign"(%14) {dataflow_level = 4 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - - // CHECK: %2 = call @dataflow3(%1#0, %1#1) : (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - %16 = "hlscpp.assign"(%15) {dataflow_level = 3 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - %17 = "tosa.add"(%13, %16) {dataflow_level = 3 : i64} : (tensor<1x32x32x64xi8>, tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - %18 = "tosa.clamp"(%17) {dataflow_level = 3 : i64, max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x32x64xi8>) -> tensor<1x32x32x64xi8> - - // CHECK: %3 = call @dataflow2(%2) : (tensor<1x32x32x64xi8>) -> tensor<1x1x64xi8> - %19 = "tosa.avg_pool2d"(%18) {dataflow_level = 2 : i64, kernel = [32, 32], pad = [0, 0, 0, 0], quantization_info = {input_zp = 0 : i32, output_zp = 0 : i32}, stride = [32, 32]} : (tensor<1x32x32x64xi8>) -> tensor<1x1x1x64xi8> - %20 = "tosa.transpose"(%19, %5) {dataflow_level = 2 : i64} : (tensor<1x1x1x64xi8>, tensor<4xi32>) -> tensor<1x64x1x1xi8> - %21 = "tosa.reshape"(%20) {dataflow_level = 2 : i64, new_shape = [1, 1, 64]} : (tensor<1x64x1x1xi8>) -> tensor<1x1x64xi8> - - // CHECK: %4 = call @dataflow1(%3) : (tensor<1x1x64xi8>) -> tensor<1x10xi8> - %22 = "tosa.matmul"(%21, %1) {dataflow_level = 1 : i64, quantization_info = {a_zp = 0 : i32, b_zp = 0 : i32}} : (tensor<1x1x64xi8>, tensor<1x64x10xi8>) -> tensor<1x1x10xi8> - %23 = "tosa.reshape"(%22) {dataflow_level = 1 : i64, new_shape = [1, 10]} : (tensor<1x1x10xi8>) -> tensor<1x10xi8> - %24 = "tosa.add"(%23, %0) {dataflow_level = 1 : i64} : (tensor<1x10xi8>, tensor<1x10xi8>) -> tensor<1x10xi8> - - // CHECK: return %4 : tensor<1x10xi8> - return %24 : tensor<1x10xi8> - } -}
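For reference, the renamed pass is driven the same way the updated pipeline hunks in lib/Transforms/Passes.cpp drive it. The sketch below is illustrative only: createFuncDataflowPass, the min-gran/insert-copy options, and the scalehls-opt invocation are taken from this diff, while the runFuncDataflow helper name and the surrounding driver boilerplate are assumptions.

// Minimal sketch of scheduling the new func-dataflow pass on a module,
// mirroring the registerScaleHLSPyTorchPipeline change in this diff.
// The pass now anchors on ModuleOp, since it outlines sub-functions and
// applies the dataflow directive to the top function.
#include "mlir/Pass/PassManager.h"
#include "scalehls/Transforms/Passes.h"

static mlir::LogicalResult runFuncDataflow(mlir::ModuleOp module,
                                           unsigned dataflowGran) {
  mlir::PassManager pm(module.getContext());
  if (dataflowGran)
    pm.addPass(mlir::scalehls::createFuncDataflowPass(dataflowGran));
  // Command-line equivalent (from the updated FileCheck test):
  //   scalehls-opt -scalehls-func-dataflow="min-gran=3 insert-copy=true"
  return pm.run(module);
}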