diff --git a/include/Analysis/QoREstimation.h b/include/Analysis/QoREstimation.h index 261f227..ea1e5fc 100644 --- a/include/Analysis/QoREstimation.h +++ b/include/Analysis/QoREstimation.h @@ -84,6 +84,7 @@ public: /// AffineForOp related methods. // unsigned getOpMinII(AffineForOp forOp); int64_t getResMinII(MemAccessesMap &map); + int64_t getDepMinII(FuncOp func, MemAccessesMap &map); int64_t getDepMinII(AffineForOp forOp, MemAccessesMap &map); bool visitOp(AffineForOp op, int64_t begin); @@ -98,6 +99,7 @@ public: return true; \ } HANDLE(AddFOp, "fadd"); + HANDLE(SubFOp, "fadd"); HANDLE(MulFOp, "fmul"); HANDLE(DivFOp, "fdiv"); HANDLE(CmpFOp, "fcmp"); diff --git a/lib/Analysis/QoREstimation.cpp b/lib/Analysis/QoREstimation.cpp index 0f55798..9f99223 100644 --- a/lib/Analysis/QoREstimation.cpp +++ b/lib/Analysis/QoREstimation.cpp @@ -322,7 +322,39 @@ int64_t HLSCppEstimator::getResMinII(MemAccessesMap &map) { return II; } -/// Calculate the minimum dependency II. +/// Calculate the minimum dependency II of function. +int64_t HLSCppEstimator::getDepMinII(FuncOp func, MemAccessesMap &map) { + int64_t II = 1; + + for (auto &pair : map) { + auto loadStores = pair.second; + + // Walk through each pair of source and destination. Note that here dstOp is + // always before srcOp. + int64_t dstIndex = 1; + for (auto dstOp : loadStores) { + for (auto srcOp : llvm::drop_begin(loadStores, dstIndex)) { + // We ignore RAR pairs. + if (isa<AffineReadOpInterface>(dstOp) && + isa<AffineReadOpInterface>(srcOp)) + continue; + + if (MemRefAccess(dstOp) == MemRefAccess(srcOp)) { + float delay = getIntAttrValue(dstOp, "schedule_end") - + getIntAttrValue(srcOp, "schedule_begin"); + + // Distance is always 1. Therefore, the minimum II is equal to delay. + int64_t minII = delay; + II = max(II, minII); + } + } + dstIndex++; + } + } + return II; +} + +/// Calculate the minimum dependency II of loop. 
int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) { int64_t II = 1; @@ -344,8 +376,7 @@ int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) { auto loadStores = pair.second; // Walk through each pair of source and destination, and each loop level - // that are pipelined. Note that for inter-dependency, dstOp is always - // before srcOp. + // that are pipelined. Note that here dstOp is always before srcOp. for (unsigned loopDepth = startLevel; loopDepth <= endLevel; ++loopDepth) { int64_t dstIndex = 1; for (auto dstOp : loadStores) { @@ -545,7 +576,7 @@ int64_t HLSCppEstimator::getResourceMap(Block &block, ResourceMap &addFMap, auto end = getIntAttrValue(&op, "schedule_end"); // Accumulate the resource utilization of each operation. - if (isa<AddFOp>(op)) + if (isa<AddFOp, SubFOp>(op)) for (unsigned i = begin; i < end; ++i) addFMap[i]++; @@ -682,6 +713,21 @@ void HLSCppEstimator::estimateFunc() { auto latency = schedule.getValue().second; setAttrValue(func, "latency", latency); + // TODO: support dataflow interval estimation. + + // TODO: support CallOp inside of the function. + if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) { + if (attr.getValue()) { + // Collect all memory access operations for calculating II. + MemAccessesMap map; + getMemAccessesMap(func.front(), map); + + // Calculate initial interval. + auto II = max(getResMinII(map), getDepMinII(func, map)); + setAttrValue(func, "interval", II); + } + } + // Scheduled levels of all operations are reversed in this method, because // we have done the ALAP scheduling in a reverse order. 
Note that after the // reverse, the annotated scheduling level of each operation is a relative diff --git a/lib/EmitHLSCpp/EmitHLSCpp.cpp b/lib/EmitHLSCpp/EmitHLSCpp.cpp index a2d39d4..255d2d8 100644 --- a/lib/EmitHLSCpp/EmitHLSCpp.cpp +++ b/lib/EmitHLSCpp/EmitHLSCpp.cpp @@ -1434,6 +1434,16 @@ void ModuleEmitter::emitFunctionPragmas(FuncOp func, ArrayRef<Value> portList) { } } + if (auto pipeline = func.getAttrOfType<BoolAttr>("pipeline")) { + if (pipeline.getValue()) { + indent(); + os << "#pragma HLS pipeline\n"; + + // An empty line. + os << "\n"; + } + } + // Only top function should emit interface pragmas. if (auto topFunction = func.getAttrOfType<BoolAttr>("top_function")) { if (topFunction.getValue()) { @@ -1489,8 +1499,12 @@ void ModuleEmitter::emitFunction(FuncOp func) { if (top.getValue()) os << "/// This is top function.\n"; - if (auto latency = func.getAttrOfType<IntegerAttr>("latency")) - os << "/// Latency=" << latency.getInt() << "\n"; + if (auto latency = func.getAttrOfType<IntegerAttr>("latency")) { + os << "/// Latency=" << latency.getInt(); + if (auto interval = func.getAttrOfType<IntegerAttr>("interval")) + os << ", interval=" << interval.getInt(); + os << "\n"; + } if (auto dsp = func.getAttrOfType<IntegerAttr>("dsp")) os << "/// DSP=" << dsp.getInt() << "\n"; diff --git a/lib/Transforms/ArrayPartition.cpp b/lib/Transforms/ArrayPartition.cpp index dfda41c..17b7408 100644 --- a/lib/Transforms/ArrayPartition.cpp +++ b/lib/Transforms/ArrayPartition.cpp @@ -25,22 +25,32 @@ struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> { } // namespace bool scalehls::applyArrayPartition(FuncOp func, OpBuilder &builder) { - // Only memory accesses in pipelined loops will be executed in parallel. - SmallVector<AffineForOp, 4> pipelinedLoops; - func.walk([&](AffineForOp loop) { - if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline")) - if (attr.getValue()) - pipelinedLoops.push_back(loop); - }); + // Check whether the input function is pipelined. 
+ bool funcPipeline = false; + if (auto attr = func.getAttrOfType<BoolAttr>("pipeline")) + if (attr.getValue()) + funcPipeline = true; + + // Only memory accesses in pipelined loops or function will be executed in + // parallel and required to partition. + SmallVector<Block *, 4> pipelinedBlocks; + if (funcPipeline) + pipelinedBlocks.push_back(&func.front()); + else + func.walk([&](AffineForOp loop) { + if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline")) + if (attr.getValue()) + pipelinedBlocks.push_back(&loop.getLoopBody().front()); + }); // Storing the partition information of each memref. using PartitionInfo = std::pair<PartitionKind, int64_t>; DenseMap<Value, SmallVector<PartitionInfo, 4>> partitionsMap; // Traverse all pipelined loops. - for (auto loop : pipelinedLoops) { + for (auto block : pipelinedBlocks) { MemAccessesMap accessesMap; - getMemAccessesMap(loop.getLoopBody().front(), accessesMap); + getMemAccessesMap(*block, accessesMap); for (auto pair : accessesMap) { auto memref = pair.first; diff --git a/lib/Transforms/FuncPipelining.cpp b/lib/Transforms/FuncPipelining.cpp index 0ef4e91..e80263e 100644 --- a/lib/Transforms/FuncPipelining.cpp +++ b/lib/Transforms/FuncPipelining.cpp @@ -38,6 +38,7 @@ bool scalehls::applyFuncPipelining(FuncOp func, OpBuilder &builder) { func.walk([&](AffineForOp loop) { loopUnrollFull(loop); }); func.setAttr("pipeline", builder.getBoolAttr(true)); + func.setAttr("dataflow", builder.getBoolAttr(false)); // For now, this method will always success. 
return true; diff --git a/lib/Transforms/PartialAffineLoopTile.cpp b/lib/Transforms/PartialAffineLoopTile.cpp index 40bec16..a20c494 100644 --- a/lib/Transforms/PartialAffineLoopTile.cpp +++ b/lib/Transforms/PartialAffineLoopTile.cpp @@ -57,7 +57,8 @@ void PartialAffineLoopTile::runOnOperation() { else permMap.push_back(i - realTileLevel); } - permuteLoops(nestedLoops, permMap); + if (isValidLoopInterchangePermutation(nestedLoops, permMap)) + permuteLoops(nestedLoops, permMap); } } diff --git a/samples/polybench/syrk.mlir b/samples/polybench/syrk.mlir index bc49fcc..0906f34 100644 --- a/samples/polybench/syrk.mlir +++ b/samples/polybench/syrk.mlir @@ -2,15 +2,15 @@ // CHECK: module { #map = affine_map<(d0) -> (d0 + 1)> -func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x8xf32>, %C: memref<16x16xf32>) { +func @test_syrk(%alpha: f32, %beta: f32, %A: memref<16x16xf32>, %C: memref<16x16xf32>) { affine.for %i = 0 to 16 { affine.for %j = 0 to #map(%i) { %0 = affine.load %C[%i, %j] : memref<16x16xf32> %1 = mulf %beta, %0 : f32 affine.store %1, %C[%i, %j] : memref<16x16xf32> affine.for %k = 0 to 8 { - %2 = affine.load %A[%i, %k] : memref<16x8xf32> - %3 = affine.load %A[%j, %k] : memref<16x8xf32> + %2 = affine.load %A[%i, %k] : memref<16x16xf32> + %3 = affine.load %A[%j, %k] : memref<16x16xf32> %4 = affine.load %C[%i, %j] : memref<16x16xf32> %5 = mulf %alpha, %2 : f32 %6 = mulf %5, %3 : f32 diff --git a/test/Transforms/test_func_pipelining.mlir b/test/Transforms/test_func_pipelining.mlir new file mode 100644 index 0000000..42ee4e2 --- /dev/null +++ b/test/Transforms/test_func_pipelining.mlir @@ -0,0 +1,6 @@ +// RUN: scalehls-opt -func-pipelining %s | FileCheck %s + +// CHECK-LABEL: func @test_for +func @test_for() { + return +}