diff --git a/lib/Transforms/Graph/LegalizeDataflow.cpp b/lib/Transforms/Graph/LegalizeDataflow.cpp index 5586c45..7eba812 100644 --- a/lib/Transforms/Graph/LegalizeDataflow.cpp +++ b/lib/Transforms/Graph/LegalizeDataflow.cpp @@ -7,69 +7,130 @@ #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Dominance.h" #include "scalehls/Transforms/Passes.h" #include "scalehls/Transforms/Utils.h" using namespace mlir; using namespace scalehls; -// For storing the intermediate memory and successor loops indexed by the -// predecessor loop. -using Successors = SmallVector, 2>; -using SuccessorsMap = DenseMap; +// A dataflow use includes the intermediate value and the user operation, which +// is similar to the concept of OpOperand in the SSA graph. +using DataflowUse = std::pair; +using DataflowUseRange = llvm::iterator_range; -static void getSuccessorsMap(Block &block, SuccessorsMap &map) { - DenseMap> memsMap; - DenseMap> loopsMap; +// A mapping from an operation to all its dataflow uses. +using DataflowUsesMap = + llvm::SmallDenseMap, 64>; - // TODO: for now we only consider store/load operations. 
- for (auto loop : block.getOps()) - loop.walk([&](Operation *op) { - if (auto affineStore = dyn_cast(op)) - memsMap[loop].insert(affineStore.getMemRef()); +namespace { +struct DataflowGraph { + DataflowGraph(FuncOp func); - else if (auto store = dyn_cast(op)) - memsMap[loop].insert(store.getMemRef()); + const DataflowUseRange getUses(Operation *node) const { + const auto &uses = usesMap.lookup(node); + return llvm::make_range(uses.begin(), uses.end()); + } - else if (auto affineLoad = dyn_cast(op)) - loopsMap[affineLoad.getMemRef()].insert(loop); + llvm::SmallDenseSet getBundledNodes(Operation *node) const { + llvm::SmallDenseSet bundledNodes; + for (auto use : getUses(node)) + for (auto updater : updatersMap.lookup(use.first)) + bundledNodes.insert(updater); + return bundledNodes; + } - else if (auto load = dyn_cast(op)) - loopsMap[load.getMemRef()].insert(loop); + bool hasNode(Operation *node) const { return nodes.count(node); } + +private: + // Hold all nodes in the dataflow graph. + llvm::SmallDenseSet nodes; + + // Hold the uses mapping. + DataflowUsesMap usesMap; + + // Hold the mapping from an intermediate value to all its updaters. Because in + // the context of coarse-grained dataflow, an intermediate value such as memory + // can be written by more than one operation. + llvm::SmallDenseMap> updatersMap; +}; +} // namespace + +DataflowGraph::DataflowGraph(FuncOp func) { + // Results map of each operation. + DenseMap> resultsMap; + + for (auto &op : func.front()) { + // Handle Linalg dialect operations. 
+ if (isa(op.getDialect())) { + if (auto copy = dyn_cast(op)) { + resultsMap[&op].insert(copy.getTarget()); + updatersMap[copy.getTarget()].insert(&op); + + } else { + auto generic = dyn_cast(op); + if (!generic || !generic.hasBufferSemantics()) { + op.emitOpError("found ungeneralized or unbufferized linalg ops"); + return; + } + for (auto result : generic.getOutputOperands()) { + resultsMap[&op].insert(result->get()); + updatersMap[result->get()].insert(&op); + } + } + continue; + } + + // Handle memory stores. Child regions are recursively traversed, such that + // for and if operations are considered as nodes of the dataflow. + op.walk([&](Operation *child) { + // TODO: Support transfer write? + if (auto affineStore = dyn_cast(child)) { + resultsMap[&op].insert(affineStore.getMemRef()); + updatersMap[affineStore.getMemRef()].insert(&op); + + } else if (auto store = dyn_cast(child)) { + resultsMap[&op].insert(store.getMemRef()); + updatersMap[store.getMemRef()].insert(&op); + } }); - // Find successors of all operations. Since this is a dataflow analysis, this - // traverse will not enter any control flow operations. - for (auto &op : block.getOperations()) { - // TODO: Some operations are dataflow source, which will not be scheduled. - if (isa(op)) + // Handle normal SSA results. + for (auto result : op.getResults()) { + resultsMap[&op].insert(result); + if (result.getType().isa()) + updatersMap[result].insert(&op); + } + } + + // Get the dominance tree for later use. + DominanceInfo DT(func); + + // Find successors of all operations. + for (auto &op : func.front()) { + // TODO: Some operations are dataflow source/sink nodes, which will not be + // scheduled. Any other operations should appear here? + if (isa(op)) continue; + nodes.insert(&op); - // Collect all memref results if the current operation is a loop. - auto mems = memsMap.lookup(&op); - SmallVector results(mems.begin(), mems.end()); - - // Collect all returned shaped type results. 
- for (auto result : op.getResults()) - if (result.getType().isa()) - results.push_back(result); - - // Traverse all produced results. - for (auto result : results) { - for (auto user : loopsMap.lookup(result)) { - // If the successor loop not only loads from the memory, but also store - // to the memory, it is not considered as a successor. - if (user == &op || memsMap.lookup(user).count(result)) - continue; - map[&op].push_back(std::pair(result, user)); - } - + for (auto result : resultsMap.lookup(&op)) { for (auto user : result.getUsers()) { - // User must be an operation in the block. - if (user != block.findAncestorOpInBlock(*user)) + // If the same block user doesn't exist, or is not properly dominated, + // or is also an updater of the result, continue. + auto sameBlockUser = func.front().findAncestorOpInBlock(*user); + if (!sameBlockUser || isa(sameBlockUser) || + !DT.properlyDominates(&op, sameBlockUser) || + updatersMap.lookup(result).count(sameBlockUser)) continue; - map[&op].push_back(std::pair(result, user)); + + // Only push back non-exist uses. + // TODO: Create a DenseMapInfo struct to make use SmallDenseSet. + auto &uses = usesMap[&op]; + auto newUse = DataflowUse({result, sameBlockUser}); + if (llvm::find(uses, newUse) == uses.end()) + uses.push_back(newUse); } } } @@ -78,44 +139,37 @@ static void getSuccessorsMap(Block &block, SuccessorsMap &map) { static bool applyLegalizeDataflow(FuncOp func, int64_t minGran, bool insertCopy) { auto builder = OpBuilder(func); - - SuccessorsMap successorsMap; - getSuccessorsMap(func.front(), successorsMap); + DataflowGraph graph(func); llvm::SmallDenseMap dataflowToMerge; - // Walk through all dataflow operations in a reversed order for establishing a - // ALAP scheduling. + // Walk through all dataflow operations in a reversed order for establishing + // a ALAP scheduling. 
for (auto i = func.front().rbegin(); i != func.front().rend(); ++i) { auto op = &*i; - // TODO: Here, we assume all dataflow operations should have successor. - if (successorsMap.count(op)) { - int64_t dataflowLevel = 0; + if (!graph.hasNode(op)) + continue; - // Walk through all successor ops. - for (auto pair : successorsMap[op]) { - auto successor = pair.second; - if (isa(successor)) - continue; - - if (auto attr = successor->getAttrOfType("dataflow_level")) - dataflowLevel = std::max(dataflowLevel, attr.getInt()); + // Walk through all successor ops. + int64_t dataflowLevel = 0; + for (auto bundledNode : graph.getBundledNodes(op)) + for (const auto &use : graph.getUses(bundledNode)) + if (auto a = use.second->getAttrOfType("dataflow_level")) + dataflowLevel = std::max(dataflowLevel, a.getInt()); else { op->emitError("has unexpected successor, legalization failed"); return false; } - } - // Set an attribute for indicating the scheduled dataflow level. - op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(), - dataflowLevel + 1)); + // Set an attribute for indicating the scheduled dataflow level. + op->setAttr("dataflow_level", builder.getIntegerAttr(builder.getI64Type(), + dataflowLevel + 1)); - // Eliminate bypass paths if detected. - for (auto pair : successorsMap[op]) { - auto value = pair.first; - auto successor = pair.second; - if (isa(successor)) - continue; + // Eliminate bypass paths if detected. + for (auto bundledNode : graph.getBundledNodes(op)) + for (auto use : graph.getUses(bundledNode)) { + auto value = use.first; + auto successor = use.second; auto successorDataflowLevel = successor->getAttrOfType("dataflow_level").getInt(); @@ -134,14 +188,13 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran, builder.setInsertionPoint(successor); for (auto i = dataflowLevel; i > successorDataflowLevel; --i) { // Create CopyOp. 
- Value newValue; - Operation *copyOp; auto valueType = value.getType().dyn_cast(); - assert(valueType && "only support memref type now, will introduce " - "TOSA dialect for tackling tensor operators"); - newValue = builder.create(op->getLoc(), valueType); - copyOp = builder.create(op->getLoc(), values.back(), - newValue); + assert(valueType && "only support memref type, please pass Affine " + "or bufferized Linalg IR as input"); + auto newValue = + builder.create(op->getLoc(), valueType); + auto copyOp = builder.create( + op->getLoc(), values.back(), newValue); // Set CopyOp dataflow level. copyOp->setAttr("dataflow_level", @@ -164,7 +217,6 @@ static bool applyLegalizeDataflow(FuncOp func, int64_t minGran, dataflowToMerge[successorDataflowLevel] = dataflowLevel; } } - } } // Collect all operations in each dataflow level. diff --git a/lib/Transforms/Graph/SplitFunction.cpp b/lib/Transforms/Graph/SplitFunction.cpp index 3d89d2d..8456d8a 100644 --- a/lib/Transforms/Graph/SplitFunction.cpp +++ b/lib/Transforms/Graph/SplitFunction.cpp @@ -35,7 +35,6 @@ static bool applySplitFunction(FuncOp func, ArrayRef ops, for (auto op : ops) for (auto result : op->getResults()) { internalValues.insert(result); - if (isLiveOut(result)) { outputTypes.push_back(result.getType()); outputValues.push_back(result); @@ -52,9 +51,13 @@ static bool applySplitFunction(FuncOp func, ArrayRef ops, for (auto op : ops) { // Push back all operands and liveins as candidates. 
SmallVector inputCandidates(op->getOperands()); - if (auto loop = dyn_cast(op)) { - auto liveIns = liveness.getLiveIn(loop.getBody()); - inputCandidates.append(liveIns.begin(), liveIns.end()); + for (auto ®ion : op->getRegions()) { + auto entryBlock = ®ion.front(); + auto args = entryBlock->getArguments(); + + for (auto liveIn : liveness.getLiveIn(entryBlock)) + if (llvm::find(args, liveIn) == args.end()) + inputCandidates.push_back(liveIn); } for (auto input : inputCandidates) { diff --git a/lib/Transforms/LegalizeToHLSCpp.cpp b/lib/Transforms/LegalizeToHLSCpp.cpp index 1035d35..c89f62a 100644 --- a/lib/Transforms/LegalizeToHLSCpp.cpp +++ b/lib/Transforms/LegalizeToHLSCpp.cpp @@ -57,10 +57,12 @@ struct MemrefStoreRewritePattern : public OpRewritePattern { bool scalehls::applyLegalizeToHLSCpp(FuncOp func, bool isTopFunc) { auto builder = OpBuilder(func); - // We constain functions to only contain one block. - if (func.getBlocks().size() != 1) + // We constrain functions to only contain one block. + if (!llvm::hasSingleElement(func)) func.emitError("has zero or more than one basic blocks."); + // TODO: Make sure there's no memref store/load or scf operations? + // Set function pragma attributes. if (auto fd = getFuncDirective(func)) setFuncDirective(func, fd.getPipeline(), fd.getTargetInterval(), diff --git a/lib/Transforms/Passes.cpp b/lib/Transforms/Passes.cpp index 08a0d71..d0a637e 100644 --- a/lib/Transforms/Passes.cpp +++ b/lib/Transforms/Passes.cpp @@ -43,9 +43,8 @@ void scalehls::registerScaleHLSPassPipeline() { // Adapt the model from torch-mlir or onnx-mlir front-end. 
if (opts.frontend == "torch") { - pm.addPass(mlir::createLinalgGeneralizationPass()); pm.addPass(mlir::createLinalgBufferizePass()); - pm.addPass(mlir::createFuncBufferizePass()); + pm.addPass(mlir::createLinalgGeneralizationPass()); pm.addPass(mlir::createCanonicalizerPass()); } else if (opts.frontend == "onnx") { pm.addPass(scalehls::createLegalizeOnnxPass()); @@ -57,9 +56,9 @@ void scalehls::registerScaleHLSPassPipeline() { // Graph-level optimizations. if (dataflowGran) { - pm.addPass(scalehls::createSimplifyGraphPass()); pm.addPass(scalehls::createLegalizeDataflowPass(dataflowGran)); pm.addPass(scalehls::createSplitFunctionPass()); + pm.addPass(scalehls::createSimplifyGraphPass()); pm.addPass(mlir::createConvertLinalgToAffineLoopsPass()); pm.addPass(mlir::createCanonicalizerPass()); } diff --git a/lib/Transforms/QoREstimation.cpp b/lib/Transforms/QoREstimation.cpp index ec96679..5da6e31 100644 --- a/lib/Transforms/QoREstimation.cpp +++ b/lib/Transforms/QoREstimation.cpp @@ -581,7 +581,7 @@ bool ScaleHLSEstimator::visitOp(CallOp op, int64_t begin) { // Block Scheduler and Estimator //===----------------------------------------------------------------------===// -// Get the pointer of the scrOp's parent loop, which should locat at the same +// Get the pointer of the srcOp's parent loop, which should locate at the same // level with dstOp's any parent loop. static Operation *getSameLevelDstOp(Operation *srcOp, Operation *dstOp) { // If srcOp and dstOp are already at the same level, return the srcOp. diff --git a/samples/pytorch/torch-mlir/README.md b/samples/pytorch/torch-mlir/README.md index ad37a19..8a2de4a 100644 --- a/samples/pytorch/torch-mlir/README.md +++ b/samples/pytorch/torch-mlir/README.md @@ -7,6 +7,7 @@ $ # Parse PyTorch model to Linalg dialect (with mlir_venv activated). 
$ python3 export_resnet18_mlir.py | torch-mlir-opt \ -torchscript-module-to-torch-backend-pipeline="optimize=true" \ -torch-backend-to-linalg-on-tensors-backend-pipeline="optimize=true" \ + -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops create-deallocs=false" \ -canonicalize > resnet18.mlir $ # Optimize the model and emit C++ code (not working, will be fixed soon).