[LegalizeDataflow] Support linalg ops; Make this pass more robust; [SplitFunction] Support linalg ops

Hanchen Ye 2022-02-21 19:41:03 -06:00
parent cc324c4b45
commit bdf72c8cc3
6 changed files with 147 additions and 90 deletions

View File

@ -7,69 +7,130 @@
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Dominance.h"
#include "scalehls/Transforms/Passes.h"
#include "scalehls/Transforms/Utils.h"
using namespace mlir;
using namespace scalehls;
// For storing the intermediate memory and successor loops indexed by the
// predecessor loop.
using Successors = SmallVector<std::pair<Value, Operation *>, 2>;
using SuccessorsMap = DenseMap<Operation *, Successors>;
// A dataflow use includes the intermediate value and the user operation, which
// is similar to the concept of OpOperand in the SSA graph.
using DataflowUse = std::pair<Value, Operation *>;
using DataflowUseRange = llvm::iterator_range<const DataflowUse *>;
static void getSuccessorsMap(Block &block, SuccessorsMap &map) {
DenseMap<Operation *, SmallPtrSet<Value, 2>> memsMap;
DenseMap<Value, SmallPtrSet<Operation *, 2>> loopsMap;
// A mapping from an operation to all its dataflow uses.
using DataflowUsesMap =
llvm::SmallDenseMap<Operation *, SmallVector<DataflowUse, 4>, 64>;
// TODO: for now we only consider store/load operations.
for (auto loop : block.getOps<AffineForOp>())
loop.walk([&](Operation *op) {
if (auto affineStore = dyn_cast<AffineStoreOp>(op))
memsMap[loop].insert(affineStore.getMemRef());
namespace {
struct DataflowGraph {
DataflowGraph(FuncOp func);
else if (auto store = dyn_cast<memref::StoreOp>(op))
memsMap[loop].insert(store.getMemRef());
const DataflowUseRange getUses(Operation *node) const {
const auto &uses = usesMap.lookup(node);
return llvm::make_range(uses.begin(), uses.end());
}
else if (auto affineLoad = dyn_cast<AffineLoadOp>(op))
loopsMap[affineLoad.getMemRef()].insert(loop);
llvm::SmallDenseSet<Operation *, 4> getBundledNodes(Operation *node) const {
llvm::SmallDenseSet<Operation *, 4> bundledNodes;
for (auto use : getUses(node))
for (auto updater : updatersMap.lookup(use.first))
bundledNodes.insert(updater);
return bundledNodes;
}
else if (auto load = dyn_cast<memref::LoadOp>(op))
loopsMap[load.getMemRef()].insert(loop);
bool hasNode(Operation *node) const { return nodes.count(node); }
private:
// Hold all nodes in the dataflow graph.
llvm::SmallDenseSet<Operation *, 64> nodes;
// Hold the uses mapping.
DataflowUsesMap usesMap;
// Hold the mapping from an intermediate value to all its updaters. In the
// context of coarse-grained dataflow, an intermediate value such as a memory
// can be written by more than one operation.
llvm::SmallDenseMap<Value, llvm::SmallDenseSet<Operation *, 2>> updatersMap;
};
} // namespace
DataflowGraph::DataflowGraph(FuncOp func) {
// Results map of each operation.
DenseMap<Operation *, llvm::SmallDenseSet<Value, 2>> resultsMap;
for (auto &op : func.front()) {
// Handle Linalg dialect operations.
if (isa<linalg::LinalgDialect>(op.getDialect())) {
if (auto copy = dyn_cast<linalg::CopyOp>(op)) {
resultsMap[&op].insert(copy.getTarget());
updatersMap[copy.getTarget()].insert(&op);
} else {
auto generic = dyn_cast<linalg::GenericOp>(op);
if (!generic || !generic.hasBufferSemantics()) {
op.emitOpError("found ungeneralized or unbufferized linalg ops");
return;
}
for (auto result : generic.getOutputOperands()) {
resultsMap[&op].insert(result->get());
updatersMap[result->get()].insert(&op);
}
}
continue;
}
// Handle memory stores. Child regions are recursively traversed, such that
// for and if operations are considered as nodes of the dataflow.
op.walk([&](Operation *child) {
// TODO: Support transfer write?
if (auto affineStore = dyn_cast<mlir::AffineWriteOpInterface>(child)) {
resultsMap[&op].insert(affineStore.getMemRef());
updatersMap[affineStore.getMemRef()].insert(&op);
} else if (auto store = dyn_cast<memref::StoreOp>(child)) {
resultsMap[&op].insert(store.getMemRef());
updatersMap[store.getMemRef()].insert(&op);
}
});
// Find successors of all operations. Since this is a dataflow analysis, this
// traversal will not enter any control flow operations.
for (auto &op : block.getOperations()) {
// TODO: Some operations are dataflow sources, which will not be scheduled.
if (isa<memref::AllocOp, memref::AllocaOp, arith::ConstantOp,
bufferization::ToTensorOp, bufferization::ToMemrefOp>(op))
// Handle normal SSA results.
for (auto result : op.getResults()) {
resultsMap[&op].insert(result);
if (result.getType().isa<MemRefType>())
updatersMap[result].insert(&op);
}
}
// Get the dominance tree for later use.
DominanceInfo DT(func);
// Find successors of all operations.
for (auto &op : func.front()) {
// TODO: Some operations are dataflow source/sink nodes, which will not be
// scheduled. Should any other operations appear here?
if (isa<memref::GetGlobalOp, memref::AllocOp, memref::AllocaOp,
bufferization::ToMemrefOp, arith::ConstantOp, ReturnOp>(op))
continue;
nodes.insert(&op);
// Collect all memref results if the current operation is a loop.
auto mems = memsMap.lookup(&op);
SmallVector<Value, 2> results(mems.begin(), mems.end());
// Collect all returned shaped type results.
for (auto result : op.getResults())
if (result.getType().isa<ShapedType>())
results.push_back(result);
// Traverse all produced results.
for (auto result : results) {
for (auto user : loopsMap.lookup(result)) {
// If the successor loop not only loads from the memory but also stores
// to the memory, it is not considered a successor.
if (user == &op || memsMap.lookup(user).count(result))
continue;
map[&op].push_back(std::pair<Value, Operation *>(result, user));
}
for (auto result : resultsMap.lookup(&op)) {
for (auto user : result.getUsers()) {
// User must be an operation in the block.
if (user != block.findAncestorOpInBlock(*user))
// If the same-block user doesn't exist, is not properly dominated by the
// current op, or is also an updater of the result, skip it.
auto sameBlockUser = func.front().findAncestorOpInBlock(*user);
if (!sameBlockUser || isa<ReturnOp>(sameBlockUser) ||
!DT.properlyDominates(&op, sameBlockUser) ||
updatersMap.lookup(result).count(sameBlockUser))
continue;
map[&op].push_back(std::pair<Value, Operation *>(result, user));
// Only push back uses that don't already exist.
// TODO: Create a DenseMapInfo struct so that a SmallDenseSet can be used here.
auto &uses = usesMap[&op];
auto newUse = DataflowUse({result, sameBlockUser});
if (llvm::find(uses, newUse) == uses.end())
uses.push_back(newUse);
}
}
}
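As an aside, here is a minimal usage sketch of the DataflowGraph helper above (the dumpDataflowSummary name and the printing are illustrative only, not part of this commit): it builds the graph for a function and reports, for each node, how many dataflow uses and bundled updater nodes were recorded.

static void dumpDataflowSummary(FuncOp func) {
  // Build the dataflow graph exactly as the pass below does.
  DataflowGraph graph(func);
  for (auto &op : func.front()) {
    // Source/sink operations are never registered as nodes.
    if (!graph.hasNode(&op))
      continue;
    auto uses = graph.getUses(&op);
    llvm::errs() << op.getName().getStringRef() << ": "
                 << std::distance(uses.begin(), uses.end()) << " uses, "
                 << graph.getBundledNodes(&op).size() << " bundled nodes\n";
  }
}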
@ -78,44 +139,37 @@ static void getSuccessorsMap(Block &block, SuccessorsMap &map) {
static bool applyLegalizeDataflow(FuncOp func, int64_t minGran,
bool insertCopy) {
auto builder = OpBuilder(func);
SuccessorsMap successorsMap;
getSuccessorsMap(func.front(), successorsMap);
DataflowGraph graph(func);
llvm::SmallDenseMap<int64_t, int64_t, 16> dataflowToMerge;
// Walk through all dataflow operations in reverse order to establish an ALAP
// schedule.
for (auto i = func.front().rbegin(); i != func.front().rend(); ++i) {
auto op = &*i;
// TODO: Here, we assume all dataflow operations should have successors.
if (successorsMap.count(op)) {
int64_t dataflowLevel = 0;
if (!graph.hasNode(op))
continue;
// Walk through all successor ops.
for (auto pair : successorsMap[op]) {
auto successor = pair.second;
if (isa<ReturnOp>(successor))
continue;
if (auto attr = successor->getAttrOfType<IntegerAttr>("dataflow_level"))
dataflowLevel = std::max(dataflowLevel, attr.getInt());
// Walk through all successor ops.
int64_t dataflowLevel = 0;
for (auto bundledNode : graph.getBundledNodes(op))
for (const auto &use : graph.getUses(bundledNode))
if (auto a = use.second->getAttrOfType<IntegerAttr>("dataflow_level"))
dataflowLevel = std::max(dataflowLevel, a.getInt());
else {
op->emitError("has unexpected successor, legalization failed");
return false;
}
}
// Set an attribute for indicating the scheduled dataflow level.
op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(),
dataflowLevel + 1));
// Eliminate bypass paths if detected.
for (auto pair : successorsMap[op]) {
auto value = pair.first;
auto successor = pair.second;
if (isa<ReturnOp>(successor))
continue;
// Eliminate bypass paths if detected.
for (auto bundledNode : graph.getBundledNodes(op))
for (auto use : graph.getUses(bundledNode)) {
auto value = use.first;
auto successor = use.second;
auto successorDataflowLevel =
successor->getAttrOfType<IntegerAttr>("dataflow_level").getInt();
@ -134,14 +188,13 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran,
builder.setInsertionPoint(successor);
for (auto i = dataflowLevel; i > successorDataflowLevel; --i) {
// Create CopyOp.
Value newValue;
Operation *copyOp;
auto valueType = value.getType().dyn_cast<MemRefType>();
assert(valueType && "only support memref type now, will introduce "
"TOSA dialect for tackling tensor operators");
newValue = builder.create<memref::AllocOp>(op->getLoc(), valueType);
copyOp = builder.create<linalg::CopyOp>(op->getLoc(), values.back(),
newValue);
assert(valueType && "only support memref type, please pass Affine "
"or bufferized Linalg IR as input");
auto newValue =
builder.create<memref::AllocOp>(op->getLoc(), valueType);
auto copyOp = builder.create<linalg::CopyOp>(
op->getLoc(), values.back(), newValue);
// Set CopyOp dataflow level.
copyOp->setAttr("dataflow_level",
@ -164,7 +217,6 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran,
dataflowToMerge[successorDataflowLevel] = dataflowLevel;
}
}
}
}
// Collect all operations in each dataflow level.
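The rest of the pass is truncated here. Purely as an illustration of what the level collection mentioned in the comment above could look like (a sketch, not the commit's actual code), the scheduled levels can be recovered from the attribute set earlier:

// Group the scheduled operations of the function by their dataflow level.
llvm::SmallDenseMap<int64_t, SmallVector<Operation *, 8>, 8> levelToOps;
for (auto &op : func.front())
  if (auto level = op.getAttrOfType<IntegerAttr>("dataflow_level"))
    levelToOps[level.getInt()].push_back(&op);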

View File

@ -35,7 +35,6 @@ static bool applySplitFunction(FuncOp func, ArrayRef<Operation *> ops,
for (auto op : ops)
for (auto result : op->getResults()) {
internalValues.insert(result);
if (isLiveOut(result)) {
outputTypes.push_back(result.getType());
outputValues.push_back(result);
@ -52,9 +51,13 @@ static bool applySplitFunction(FuncOp func, ArrayRef<Operation *> ops,
for (auto op : ops) {
// Push back all operands and liveins as candidates.
SmallVector<Value, 8> inputCandidates(op->getOperands());
if (auto loop = dyn_cast<AffineForOp>(op)) {
auto liveIns = liveness.getLiveIn(loop.getBody());
inputCandidates.append(liveIns.begin(), liveIns.end());
for (auto &region : op->getRegions()) {
auto entryBlock = &region.front();
auto args = entryBlock->getArguments();
for (auto liveIn : liveness.getLiveIn(entryBlock))
if (llvm::find(args, liveIn) == args.end())
inputCandidates.push_back(liveIn);
}
for (auto input : inputCandidates) {
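Separately, the isLiveOut helper used above is defined elsewhere in this file; one plausible way such a check could be written (an assumption about its behavior, not the actual implementation) is:

// A result is live-out of the candidate op set if any user falls outside it.
// A more careful version would map each user to its ancestor in the function
// block before the membership test, so users nested inside the ops count too.
static bool hasUserOutsideSet(Value result, ArrayRef<Operation *> ops) {
  return llvm::any_of(result.getUsers(), [&](Operation *user) {
    return llvm::find(ops, user) == ops.end();
  });
}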

View File

@ -57,10 +57,12 @@ struct MemrefStoreRewritePattern : public OpRewritePattern<memref::StoreOp> {
bool scalehls::applyLegalizeToHLSCpp(FuncOp func, bool isTopFunc) {
auto builder = OpBuilder(func);
// We constain functions to only contain one block.
if (func.getBlocks().size() != 1)
// We constrain functions to only contain one block.
if (!llvm::hasSingleElement(func))
func.emitError("has zero or more than one basic blocks.");
// TODO: Make sure there are no memref store/load or scf operations?
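The diff does not show whether the pass returns after emitting this error. Assuming it is meant to abort on a multi-block function, the usual shape of such a guard would be (a sketch, not necessarily what the file does):

if (!llvm::hasSingleElement(func)) {
  // Bail out instead of continuing with an ill-formed function.
  func.emitError("expected the function to contain exactly one block");
  return false;
}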
// Set function pragma attributes.
if (auto fd = getFuncDirective(func))
setFuncDirective(func, fd.getPipeline(), fd.getTargetInterval(),

View File

@ -43,9 +43,8 @@ void scalehls::registerScaleHLSPassPipeline() {
// Adapt the model from torch-mlir or onnx-mlir front-end.
if (opts.frontend == "torch") {
pm.addPass(mlir::createLinalgGeneralizationPass());
pm.addPass(mlir::createLinalgBufferizePass());
pm.addPass(mlir::createFuncBufferizePass());
pm.addPass(mlir::createLinalgGeneralizationPass());
pm.addPass(mlir::createCanonicalizerPass());
} else if (opts.frontend == "onnx") {
pm.addPass(scalehls::createLegalizeOnnxPass());
@ -57,9 +56,9 @@ void scalehls::registerScaleHLSPassPipeline() {
// Graph-level optimizations.
if (dataflowGran) {
pm.addPass(scalehls::createSimplifyGraphPass());
pm.addPass(scalehls::createLegalizeDataflowPass(dataflowGran));
pm.addPass(scalehls::createSplitFunctionPass());
pm.addPass(scalehls::createSimplifyGraphPass());
pm.addPass(mlir::createConvertLinalgToAffineLoopsPass());
pm.addPass(mlir::createCanonicalizerPass());
}
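For orientation, a rough standalone equivalent of this graph-level portion driven by mlir::PassManager could look like the sketch below. The pass-constructor calls are taken from the lines above; module, dataflowGran, and the module-vs-function nesting are assumptions, and some passes may need addNestedPass<FuncOp>(...) in practice.

// Build and run (a subset of) the graph-level pipeline on a module.
mlir::PassManager pm(module.getContext());
pm.addPass(scalehls::createSimplifyGraphPass());
pm.addPass(scalehls::createLegalizeDataflowPass(dataflowGran));
pm.addPass(scalehls::createSplitFunctionPass());
pm.addPass(mlir::createConvertLinalgToAffineLoopsPass());
pm.addPass(mlir::createCanonicalizerPass());
if (failed(pm.run(module)))
  llvm::errs() << "graph-level pipeline failed\n";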

View File

@ -581,7 +581,7 @@ bool ScaleHLSEstimator::visitOp(CallOp op, int64_t begin) {
// Block Scheduler and Estimator
//===----------------------------------------------------------------------===//
// Get the pointer of the srcOp's parent loop, which should locate at the same
// level as any parent loop of dstOp.
static Operation *getSameLevelDstOp(Operation *srcOp, Operation *dstOp) {
// If srcOp and dstOp are already at the same level, return the srcOp.

View File

@ -7,6 +7,7 @@ $ # Parse PyTorch model to Linalg dialect (with mlir_venv activated).
$ python3 export_resnet18_mlir.py | torch-mlir-opt \
-torchscript-module-to-torch-backend-pipeline="optimize=true" \
-torch-backend-to-linalg-on-tensors-backend-pipeline="optimize=true" \
-linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops create-deallocs=false" \
-canonicalize > resnet18.mlir
$ # Optimize the model and emit C++ code (not working, will be fixed soon).