[MultipleLevelDSE] support design space initialization and pareto frontier update; support design space dump; remove the ProfileDesignSpace pass; [PartialAffineLoopTile] tiling is now applied fully in place

2021-02-12 14:54:36 -06:00 · 2021-02-12 14:54:36 -06:00 · 3e3435dc84
parent 390333e5ec
commit 3e3435dc84
11 changed files with 201 additions and 337 deletions
--- a/include/scalehls/Transforms/MultipleLevelDSE.h
+++ b/include/scalehls/Transforms/MultipleLevelDSE.h
@ -22,10 +22,9 @@ public:
                             int64_t numDSP)
      : ScaleHLSAnalysisBase(builder), estimator(estimator), numDSP(numDSP) {}

-  void emitDebugInfo(FuncOp targetFunc, StringRef message);
-
  /// This is a temporary approach that does not scale.
-  void applyMultipleLevelDSE(FuncOp func);
+  void applyMultipleLevelDSE(FuncOp func, raw_ostream &os,
+                             unsigned maxInitializeParallel = 16);

  ScaleHLSEstimator &estimator;
  int64_t numDSP;
--- a/include/scalehls/Transforms/Passes.h
+++ b/include/scalehls/Transforms/Passes.h
@ -19,7 +19,6 @@ namespace scalehls {

 /// Design space exploration pass.
 std::unique_ptr<Pass> createMultipleLevelDSEPass();
-std::unique_ptr<Pass> createProfileDesignSpacePass();

 /// Dataflow optimization passes.
 std::unique_ptr<Pass> createLegalizeDataflowPass();
--- a/include/scalehls/Transforms/Passes.td
+++ b/include/scalehls/Transforms/Passes.td
@ -25,30 +25,14 @@ def MultipleLevelDSE : Pass<"multiple-level-dse", "ModuleOp"> {

  let constructor = "mlir::scalehls::createMultipleLevelDSEPass()";

-  let options = [
-    Option<"targetSpec", "target-spec", "std::string",
-           /*default=*/"\"../config/target-spec.ini\"",
-           "File path: target backend specifications and configurations">
-  ];
-}
-
-def ProfileDesignSpace : Pass<"profile-design-space", "ModuleOp"> {
-  let summary = "Optimize HLS design at multiple abstraction level";
-  let description = [{
-    This profile-design-space pass will profile the partial design space and
-    output clock cycle and resource utilization estimation results.
-  }];
-
-  let constructor = "mlir::scalehls::createProfileDesignSpacePass()";
-
  let options = [
    Option<"targetSpec", "target-spec", "std::string",
           /*default=*/"\"../config/target-spec.ini\"",
           "File path: target backend specifications and configurations">,
    Option<"outputFile", "output-file", "std::string",
-           /*default=*/"\"-\"", "File path: the output file path of profiling">,
-    Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"1",
-           "Positive number: the maximum tiling parallelism of the profiling">
+           /*default=*/"\"-\"", "File path: the output file path">,
+    Option<"maxParallel", "max-parallel", "unsigned", /*default=*/"16",
+           "Positive number: the maximum parallelism of initialization">
  ];
 }

--- a/include/scalehls/Transforms/Utils.h
+++ b/include/scalehls/Transforms/Utils.h
@ -30,21 +30,21 @@ bool applyRemoveVariableBound(AffineLoopBand &band);
 /// passed in because the post-tiling optimizations have to take function as
 /// target, e.g. canonicalizer and array partition.
 bool applyOptStrategy(AffineLoopBand &band, FuncOp func, TileList tileList,
-                      int64_t targetII);
+                      unsigned targetII);

 /// Apply optimization strategy to a function.
 bool applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
-                      ArrayRef<int64_t> targetIIs);
+                      ArrayRef<unsigned> targetIIs);

-/// Apply loop tiling to the input loop band and return the location of the
-/// original innermost loop in the tiled loop band. If tile is failed, -1 will
-/// be returned.
-int64_t applyLoopTiling(AffineLoopBand &band, TileList tileList);
+/// Apply loop tiling to the input loop band and sink all intra-tile loops to
+/// the innermost loop with the original loop order. Return the location of the
+/// innermost tile-space loop.
+Optional<unsigned> applyLoopTiling(AffineLoopBand &band, TileList tileList);

 /// Apply loop pipelining to the pipelineLoc of the input loop band, all inner
 /// loops are automatically fully unrolled.
-bool applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc,
-                         int64_t targetII);
+bool applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
+                         unsigned targetII);

 /// Fully unroll all loops insides of a loop block.
 bool applyFullyLoopUnrolling(Block &block);
--- a/lib/Analysis/Utils.cpp
+++ b/lib/Analysis/Utils.cpp
@ -181,6 +181,7 @@ static unsigned getChildLoopNum(Operation *op) {

 /// Get the whole loop band given the innermost loop and return it in "band".
 static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) {
+  band.clear();
  AffineLoopBand reverseBand;

  auto currentLoop = forOp;
@ -204,6 +205,7 @@ static void getLoopBandFromInnermost(AffineForOp forOp, AffineLoopBand &band) {
 /// Meanwhile, the return value is the innermost loop of this loop band.
 AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp,
                                               AffineLoopBand &band) {
+  band.clear();
  auto currentLoop = forOp;
  while (true) {
    band.push_back(currentLoop);
@ -222,6 +224,7 @@ AffineForOp scalehls::getLoopBandFromOutermost(AffineForOp forOp,
 /// loops are collected.
 void scalehls::getLoopBands(Block &block, AffineLoopBands &bands,
                            bool allowHavingChilds) {
+  bands.clear();
  block.walk([&](AffineForOp loop) {
    auto childNum = getChildLoopNum(loop);

--- a/lib/Transforms/Directive/LoopPipelining.cpp
+++ b/lib/Transforms/Directive/LoopPipelining.cpp
@ -32,8 +32,8 @@ bool scalehls::applyFullyLoopUnrolling(Block &block) {

 /// Apply loop pipelining to the input loop, all inner loops are automatically
 /// fully unrolled.
-bool scalehls::applyLoopPipelining(AffineLoopBand &band, int64_t pipelineLoc,
-                                   int64_t targetII) {
+bool scalehls::applyLoopPipelining(AffineLoopBand &band, unsigned pipelineLoc,
+                                   unsigned targetII) {
  auto targetLoop = band[pipelineLoc];

  // All inner loops of the pipelined loop are automatically unrolled.
--- a/lib/Transforms/Loop/PartialAffineLoopTile.cpp
+++ b/lib/Transforms/Loop/PartialAffineLoopTile.cpp
@ -12,61 +12,57 @@
 using namespace mlir;
 using namespace scalehls;

-/// Apply loop tiling to the input loop band and return the location of the
-/// original innermost loop in the tiled loop band. If tile is failed, -1 will
-/// be returned.
-int64_t scalehls::applyLoopTiling(AffineLoopBand &band, TileList tileList) {
+/// Apply loop tiling to the input loop band and sink all intra-tile loops to
+/// the innermost loop with the original loop order. Return the location of the
+/// innermost tile-space loop.
+Optional<unsigned> scalehls::applyLoopTiling(AffineLoopBand &band,
+                                             TileList tileList) {
  if (!isPerfectlyNested(band))
-    return -1;
-
-  // Collect each loop location that is fully tiled and can be eliminated.
-  SmallVector<unsigned, 8> fullyTiledLoops;
-  unsigned pipelineLoc = 0;
-  unsigned loc = 0;
-  for (auto loop : band) {
-    if (auto tripCount = getConstantTripCount(loop)) {
-      if (tripCount.getValue() == tileList[loc])
-        fullyTiledLoops.push_back(loc);
-      else
-        pipelineLoc = loc;
-    } else
-      return -1;
-    ++loc;
-  }
-
-  // If all loops are fully tiled, keep the last loop untouched.
-  if (fullyTiledLoops.size() == band.size()) {
-    fullyTiledLoops.pop_back();
-    pipelineLoc = band.size() - 1;
-  }
+    return Optional<unsigned>();

  // Loop tiling.
  AffineLoopBand tiledBand;
  if (failed(tilePerfectlyNested(band, tileList, &tiledBand)))
-    return -1;
-  band = tiledBand;
+    return Optional<unsigned>();

-  auto builder = OpBuilder(band.back());
+  // Record the band size and clear the original loop band.
+  auto originalBandSize = band.size();
+  band.clear();

-  // Remove fully tiled loops.
-  for (auto loc : fullyTiledLoops) {
-    auto loop = band[loc];
+  // Remove redundant loops in the tiled loop band.
+  auto builder = OpBuilder(tiledBand.back());
+  unsigned erasedLoopNum = 0;
+  unsigned loc = 0;

-    // Create an affine apply operation generating a constant zero.
-    builder.setInsertionPoint(loop);
-    auto constZero = builder.create<AffineApplyOp>(
-        loop.getLoc(), builder.getConstantAffineMap(0), ValueRange({}));
-    loop.getInductionVar().replaceAllUsesWith(constZero);
+  for (auto loop : tiledBand) {
+    if (erasedLoopNum >= originalBandSize - 1 || loc >= originalBandSize ||
+        getConstantTripCount(loop).getValue() > 1) {
+      // All tile-space loops which have a trip count larger than 1 and all
+      // intra-tile loops are pushed back. Meanwhile, we do not want all
+      // tile-space loops to be removed, since that would make many analyses
+      // and transforms very hard. Therefore we record the number of loops
+      // erased so far and always keep at least one tile-space loop in the
+      // loop band, even if it has a trip count of 1.
+      band.push_back(loop);
+    } else {
+      // Create an affine apply operation to represent the lower bound.
+      builder.setInsertionPoint(loop);
+      auto newIterVar = builder.create<AffineApplyOp>(
+          loop.getLoc(), loop.getLowerBoundMap(), loop.getLowerBoundOperands());
+      loop.getInductionVar().replaceAllUsesWith(newIterVar);

-    // Move all operation except the terminator to the outside.
-    auto &parentBlock = loop->getBlock()->getOperations();
-    auto &loopBlock = loop.getBody()->getOperations();
-    parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(),
-                       std::prev(loopBlock.end()));
-    loop.erase();
+      // Move all operation except the terminator to the outside.
+      auto &parentBlock = loop->getBlock()->getOperations();
+      auto &loopBlock = loop.getBody()->getOperations();
+      parentBlock.splice(loop->getIterator(), loopBlock, loopBlock.begin(),
+                         std::prev(loopBlock.end()));
+      loop.erase();
+      ++erasedLoopNum;
+    }
+    ++loc;
  }

-  return pipelineLoc;
+  return band.size() - originalBandSize - 1;
 }

 namespace {
--- a/lib/Transforms/MultipleLevelDSE.cpp
+++ b/lib/Transforms/MultipleLevelDSE.cpp
@ -6,9 +6,12 @@

 #include "scalehls/Transforms/MultipleLevelDSE.h"
 #include "mlir/Analysis/LoopAnalysis.h"
+#include "mlir/Support/FileUtilities.h"
 #include "scalehls/Transforms/Passes.h"
 #include "scalehls/Transforms/Utils.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include <numeric>
 #include <pthread.h>

 #define DEBUG_TYPE "scalehls"
@ -16,26 +19,12 @@
 using namespace mlir;
 using namespace scalehls;

-//===----------------------------------------------------------------------===//
-// Helper methods
-//===----------------------------------------------------------------------===//
-
-static int64_t getInnerParallelism(AffineForOp forOp) {
-  int64_t count = 0;
-  for (auto loop : forOp.getOps<AffineForOp>()) {
-    auto innerCount = getInnerParallelism(loop);
-    if (auto trip = getConstantTripCount(loop))
-      count += trip.getValue() * innerCount;
-    else
-      count += innerCount;
-  }
-
-  // If the current loop is innermost loop, count should be one.
-  return std::max(count, (int64_t)1);
-}
-
 using TileConfig = unsigned;

+//===----------------------------------------------------------------------===//
+// Helper Methods and Classes
+//===----------------------------------------------------------------------===//
+
 namespace {
 struct DesignPoint {
 public:
@ -122,10 +111,47 @@ public:
    return sqrtf(distanceSquare);
  }

-  void findParetoFrontiers() {}
+  /// Update paretoPoints to remove design points that are not pareto frontiers.
+  /// The points are sorted by latency and then by dsp number; a point is kept
+  /// if its dsp number is strictly smaller than that of every point sorted
+  /// before it, or if it ties with the latest frontier on both latency and dsp
+  /// number. If paretoPoints is empty, it is left untouched.
+  void updateParetoPoints() {
+    // Nothing to prune when no design point has been recorded, e.g. if every
+    // tile config was skipped or failed to be evaluated. This also guards the
+    // front() accesses below.
+    if (paretoPoints.empty())
+      return;
+
+    // Sort the pareto points in ascending order of latency, and then in
+    // ascending order of dsp number for points with the same latency.
+    auto latencyThenDspNum = [](const DesignPoint &a, const DesignPoint &b) {
+      return (a.latency < b.latency ||
+              (a.latency == b.latency && a.dspNum < b.dspNum));
+    };
+    std::sort(paretoPoints.begin(), paretoPoints.end(), latencyThenDspNum);
+
+    // Find pareto frontiers. After the sorting, the first design point must be
+    // a pareto point, so it seeds the current frontier.
+    auto paretoLatency = paretoPoints.front().latency;
+    auto paretoDspNum = paretoPoints.front().dspNum;
+    SmallVector<DesignPoint, 16> frontiers;
+
+    for (const auto &point : paretoPoints) {
+      if (point.dspNum < paretoDspNum) {
+        // Fewer DSPs than every point sorted before it: a new frontier.
+        frontiers.push_back(point);
+        paretoLatency = point.latency;
+        paretoDspNum = point.dspNum;
+
+      } else if (point.dspNum == paretoDspNum &&
+                 point.latency == paretoLatency) {
+        // Exact tie with the latest frontier. The first point always takes
+        // this branch, which is how the seed itself gets recorded.
+        frontiers.push_back(point);
+      }
+    }
+
+    paretoPoints = frontiers;
+  }
+
+  /// Evaluate all design points under the given tile config.
+  bool evaluateTileConfig(TileConfig config) {
+    // If the current tile config is already estimated, return true.
+    if (!unestimatedTileConfigs.count(config))
+      return true;

-  /// Explore all design points under the given tile config.
-  bool exploreTileConfig(TileConfig config) {
    // Clone a temporary loop band by cloning the outermost loop.
    auto tmpOuterLoop = band.front().clone();
    AffineLoopBand tmpBand;
@ -149,7 +175,7 @@ public:

    // Apply the current tiling config and start the estimation. Note that after
    // optimization, tmpBand is optimized in place and becomes a new loop band.
-    if (!applyOptStrategy(tmpBand, func, tileList, 1))
+    if (!applyOptStrategy(tmpBand, func, tileList, (unsigned)1))
      return false;
    tmpOuterLoop = tmpBand.front();
    estimator.estimateLoop(tmpOuterLoop);
@ -173,17 +199,63 @@ public:
      paretoPoints.push_back(point);
    }

-    // Erase the temporary loop band and return.
+    // Erase the temporary loop band and annotate the current tile config as
+    // estimated.
    tmpOuterLoop.erase();
+    unestimatedTileConfigs.erase(config);
    return true;
  }

+  /// Initialize the design space.
  void initializeDesignSpace(unsigned maxInitializeParallel) {
+    LLVM_DEBUG(llvm::dbgs() << "3.1 Initializing the design space...\n";);
+
    for (TileConfig config = 0; config < validTileConfigNum; ++config) {
-      unestimatedTileConfigs.erase(config);
+      auto tileList = getTileList(config);
+
+      // We only evaluate the design points whose overall parallelism is smaller
+      // than the maxInitializeParallel to improve the efficiency.
+      auto parallel = std::accumulate(tileList.begin(), tileList.end(),
+                                      (unsigned)1, std::multiplies<unsigned>());
+      if (parallel > maxInitializeParallel)
+        continue;
+
+      LLVM_DEBUG(llvm::dbgs() << config << ",");
+      evaluateTileConfig(config);
    }
+
+    LLVM_DEBUG(llvm::dbgs() << "\n\n");
+    updateParetoPoints();
  }

+  /// Dump the evaluated design points to "os" in csv format: a header row,
+  /// then every point of paretoPoints tagged "pareto", then every point of
+  /// allPoints tagged "non-pareto".
+  void dumpDesignSpace(raw_ostream &os) {
+    // Header row: one "l<i>" column per entry of tripCountList, followed by
+    // the fixed result columns.
+    for (unsigned idx = 0, e = tripCountList.size(); idx != e; ++idx)
+      os << "l" << idx << ",";
+    os << "ii,cycle,dsp,type\n";
+
+    // Emit one row per design point: its tile sizes, then the target II,
+    // latency and dsp number, then the given row suffix.
+    auto printRows = [&](const SmallVector<DesignPoint, 16> &points,
+                         const char *suffix) {
+      for (const auto &point : points) {
+        for (auto size : getTileList(point.tileConfig))
+          os << size << ",";
+        os << point.targetII << "," << point.latency << "," << point.dspNum
+           << suffix;
+      }
+    };
+
+    // Pareto design points come first, followed by all design points.
+    printRows(paretoPoints, ",pareto\n");
+    printRows(allPoints, ",non-pareto\n");
+
+    LLVM_DEBUG(llvm::dbgs() << "Design space is dumped to output file.\n\n");
+  }
+
+  /// Stores current pareto frontiers and all evaluated design points.
  SmallVector<DesignPoint, 16> paretoPoints;
  SmallVector<DesignPoint, 16> allPoints;

@ -208,25 +280,37 @@ public:
 };
 } // namespace

+/// Recursively accumulate a count over the loops directly nested in "forOp":
+/// each child loop contributes its own recursive count, multiplied by its trip
+/// count when that trip count is constant. A loop without child loops yields 1.
+static int64_t getInnerParallelism(AffineForOp forOp) {
+  int64_t total = 0;
+  for (auto childLoop : forOp.getOps<AffineForOp>()) {
+    auto childCount = getInnerParallelism(childLoop);
+
+    // Only a constant trip count can scale the child's contribution.
+    auto tripCount = getConstantTripCount(childLoop);
+    if (!tripCount) {
+      total += childCount;
+      continue;
+    }
+    total += tripCount.getValue() * childCount;
+  }
+
+  // An innermost loop has no child loops, in which case the count is one.
+  return total < 1 ? (int64_t)1 : total;
+}
+
 //===----------------------------------------------------------------------===//
 // Optimizer Class Definition
 //===----------------------------------------------------------------------===//

-void ScaleHLSOptimizer::emitDebugInfo(FuncOp targetFunc, StringRef message) {
-  LLVM_DEBUG(auto latency = getIntAttrValue(targetFunc, "latency");
-             auto dsp = getIntAttrValue(targetFunc, "dsp");
-
-             llvm::dbgs() << message << "\n";
-             llvm::dbgs() << "Current latency is " << Twine(latency)
-                          << ", DSP utilization is " << Twine(dsp) << ".\n\n";);
-}
-
 /// This is a temporary approach that does not scale.
-void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
+void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func, raw_ostream &os,
+                                              unsigned maxInitializeParallel) {
  estimator.estimateFunc(func);
  if (getIntAttrValue(func, "dsp") > numDSP)
    return;
-  emitDebugInfo(func, "Start multiple level design space exploration.");
+
+  LLVM_DEBUG(auto latency = getIntAttrValue(func, "latency");
+             auto dspNum = getIntAttrValue(func, "dsp");
+
+             llvm::dbgs() << "\nStart the design space exploration.\n";
+             llvm::dbgs() << "Initial clock cycle is " << Twine(latency)
+                          << ", DSP usage is " << Twine(dspNum) << ".\n\n";);

  //===--------------------------------------------------------------------===//
  // STAGE 1: Simplify Loop Nests Structure
@ -331,7 +415,7 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
  estimator.estimateFunc(func);
  if (getIntAttrValue(func, "dsp") > numDSP)
    return;
-  emitDebugInfo(func, "1. Simplify loop nests structure.");
+  LLVM_DEBUG(llvm::dbgs() << "1. Simplify loop nests structure.\n\n");

  //===--------------------------------------------------------------------===//
  // STAGE 2: Loop Bands Optimization
@ -357,21 +441,19 @@ void ScaleHLSOptimizer::applyMultipleLevelDSE(FuncOp func) {
  estimator.estimateFunc(func);
  if (getIntAttrValue(func, "dsp") > numDSP)
    return;
-  emitDebugInfo(func, "2. Apply loop perfection, remove variable bound, and "
-                      "loop order opt.");
+  LLVM_DEBUG(llvm::dbgs() << "2. Apply loop perfection, loop order opt, and "
+                             "remove variable loop bound.\n\n");

  //===--------------------------------------------------------------------===//
-  // STAGE 3: Loop Bands Tiling and Finalization
+  // STAGE 3: Search for pareto frontiers
  //===--------------------------------------------------------------------===//
+
+  LLVM_DEBUG(llvm::dbgs() << "3. Search for pareto design points...\n\n";);
+
  for (unsigned i = 0; i < targetNum; ++i) {
    auto space = DesignSpace(func, targetBands[i], estimator);
-    space.exploreTileConfig(1);
-
-    for (auto point : space.paretoPoints) {
-      llvm::outs() << "latency: " << point.latency
-                   << ", dsp_num: " << point.dspNum
-                   << ", ii: " << point.targetII << "\n";
-    }
+    space.initializeDesignSpace(maxInitializeParallel);
+    space.dumpDesignSpace(os);
  }
 }

@ -392,6 +474,12 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
    getLatencyMap(spec, latencyMap);
    int64_t numDSP = ceil(spec.GetInteger("specification", "dsp", 220) * 1.1);

+    // Open the output file. If it cannot be opened, report the error, mark the
+    // pass as failed and bail out: the returned handle is null in that case
+    // and must not be dereferenced by the code below.
+    std::string errorMessage;
+    auto output = mlir::openOutputFile(outputFile, &errorMessage);
+    if (!output) {
+      emitError(module.getLoc(), errorMessage);
+      signalPassFailure();
+      return;
+    }
+
    // Initialize an performance and resource estimator.
    auto estimator = ScaleHLSEstimator(builder, latencyMap);
    auto optimizer = ScaleHLSOptimizer(builder, estimator, numDSP);
@ -400,7 +488,9 @@ struct MultipleLevelDSE : public MultipleLevelDSEBase<MultipleLevelDSE> {
    for (auto func : module.getOps<FuncOp>())
      if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
        if (topFunction.getValue())
-          optimizer.applyMultipleLevelDSE(func);
+          optimizer.applyMultipleLevelDSE(func, output->os(), maxParallel);
+
+    output->keep();
  }
 };
 } // namespace
--- a/lib/Transforms/ProfileDesignSpace.cpp
+++ b/lib/Transforms/ProfileDesignSpace.cpp
@ -1,201 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Copyright 2020-2021 The ScaleHLS Authors.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Analysis/LoopAnalysis.h"
-#include "mlir/Support/FileUtilities.h"
-#include "scalehls/Analysis/QoREstimation.h"
-#include "scalehls/Transforms/Passes.h"
-#include "scalehls/Transforms/Utils.h"
-#include "llvm/Support/ToolOutputFile.h"
-
-#define DEBUG_TYPE "scalehls"
-
-using namespace mlir;
-using namespace scalehls;
-
-/// Currently only support single loop band profiling.
-static void applyProfiling(FuncOp func, raw_ostream &os,
-                           ScaleHLSEstimator &estimator, unsigned maxParallel) {
-  if (!dyn_cast<AffineForOp>(func.front().front())) {
-    func.emitError("first operation is not loop");
-    return;
-  }
-
-  // Helper function for fetching the target loop band.
-  auto getFirstBand = [&](FuncOp targetFunc) {
-    // Get the first loop band as target.
-    auto target = dyn_cast<AffineForOp>(targetFunc.front().front());
-    AffineLoopBand band;
-    getLoopBandFromOutermost(target, band);
-    return band;
-  };
-
-  // Perfect and optimize loop order of the target loop band.
-  auto band = getFirstBand(func);
-  auto loopNum = band.size();
-  applyAffineLoopPerfection(band);
-  applyAffineLoopOrderOpt(band);
-  applyRemoveVariableBound(band);
-
-  // Initialize tile size and trip count vector.
-  auto tileList = TileList(loopNum, 1);
-  auto tripCounts = TileList();
-  unsigned iterations = 1;
-  for (unsigned loc = 0; loc < loopNum; ++loc) {
-    auto tripCount = getConstantTripCount(band[loc]).getValue();
-    tripCounts.push_back(tripCount);
-    iterations *= (log2(tripCount) + 1);
-    os << "l" << loc << ",";
-  }
-  os << "ii,cycle,dsp,pareto\n";
-
-  // Storing all design points.
-  using DesignPoint = SmallVector<int64_t, 8>;
-  std::vector<DesignPoint> designPoints;
-
-  // Traverse each tile size configuration.
-  for (unsigned i = 0; i < iterations - 1; ++i) {
-    for (unsigned loc = 0; loc < loopNum; ++loc) {
-      auto &tileSize = tileList[loc];
-      if (loc == 0)
-        tileSize *= 2;
-      else if (tileList[loc - 1] > tripCounts[loc - 1])
-        tileSize *= 2;
-    }
-
-    unsigned iterNum = 1;
-    unsigned parallel = 1;
-    for (unsigned loc = 0; loc < loopNum; ++loc) {
-      auto &tileSize = tileList[loc];
-      if (tileSize > tripCounts[loc])
-        tileSize = 1;
-      iterNum *= tripCounts[loc] / tileSize;
-      parallel *= tileSize;
-      LLVM_DEBUG(llvm::dbgs() << tileSize << ", ";);
-    }
-    LLVM_DEBUG(llvm::dbgs() << "\n";);
-
-    if (parallel > maxParallel)
-      continue;
-
-    // Apply tiling strategy.
-    auto tmpFunc = func.clone();
-    applyOptStrategy(tmpFunc, tileList, 1);
-    estimator.estimateFunc(tmpFunc);
-    auto tmpLoop = getFirstBand(tmpFunc).back();
-
-    // Fetch latency and resource utilization.
-    auto II = estimator.getIntAttrValue(tmpLoop, "ii");
-    auto iterLatency = estimator.getIntAttrValue(tmpLoop, "iter_latency");
-    auto shareDspNum = estimator.getIntAttrValue(tmpLoop, "share_dsp");
-    auto noShareDspNum = estimator.getIntAttrValue(tmpLoop, "noshare_dsp");
-
-    // Improve target II until II is equal to iteration latency.
-    for (auto tmpII = II; tmpII <= iterLatency; ++tmpII) {
-      auto tmpDspNum = std::max(shareDspNum, noShareDspNum / tmpII);
-      auto tmpLatency = iterLatency + tmpII * (iterNum - 1);
-
-      auto point = SmallVector<int64_t, 8>(tileList.begin(), tileList.end());
-      point.append({tmpII, tmpLatency, tmpDspNum});
-      designPoints.push_back(point);
-
-      if (iterNum == 1)
-        break;
-    }
-  }
-
-  // Sort all design points by latency.
-  auto compareLatency = [&](const DesignPoint &a, const DesignPoint &b) {
-    return a[loopNum + 1] < b[loopNum + 1];
-  };
-  std::sort(designPoints.begin(), designPoints.end(), compareLatency);
-
-  // Sort all design points with the same latency by dsp number.
-  auto compareDspNum = [&](const DesignPoint &a, const DesignPoint &b) {
-    return a[loopNum + 2] < b[loopNum + 2];
-  };
-  for (auto i = designPoints.begin(); i < designPoints.end();) {
-    auto j = i;
-    for (; j < designPoints.end(); ++j)
-      if ((*i)[loopNum + 1] != (*j)[loopNum + 1])
-        break;
-    std::sort(i, j, compareDspNum);
-    i = j;
-  }
-
-  // Find pareto frontiers. After the sorting, the first design point must be a
-  // pareto point.
-  auto paretoPoint = designPoints[0];
-  auto paretoLatency = paretoPoint[loopNum + 1];
-  auto paretoDspNum = paretoPoint[loopNum + 2];
-  std::vector<DesignPoint> paretoPoints;
-
-  for (auto point : designPoints) {
-    auto tmpLatency = point[loopNum + 1];
-    auto tmpDspNum = point[loopNum + 2];
-
-    if (tmpDspNum < paretoDspNum) {
-      paretoPoints.push_back(point);
-      paretoPoint = point;
-      paretoLatency = tmpLatency;
-      paretoDspNum = tmpDspNum;
-    } else if (tmpDspNum == paretoDspNum && tmpLatency == paretoLatency)
-      paretoPoints.push_back(point);
-  }
-
-  // Print all pareto design points.
-  for (auto point : paretoPoints) {
-    for (auto element : point)
-      os << element << ",";
-    os << "pareto\n";
-  }
-
-  // Print all design points.
-  for (auto point : designPoints) {
-    for (auto element : point)
-      os << element << ",";
-    os << "non-pareto\n";
-  }
-}
-
-namespace {
-struct ProfileDesignSpace : public ProfileDesignSpaceBase<ProfileDesignSpace> {
-  void runOnOperation() override {
-    auto module = getOperation();
-    auto builder = Builder(module);
-
-    // Read configuration file.
-    INIReader spec(targetSpec);
-    if (spec.ParseError())
-      emitError(module.getLoc(), "target spec file parse fail\n");
-
-    // Collect profiling latency data, where default values are based on Xilinx
-    // PYNQ-Z1 board.
-    LatencyMap latencyMap;
-    getLatencyMap(spec, latencyMap);
-
-    // Initialize an performance and resource estimator.
-    auto estimator = ScaleHLSEstimator(builder, latencyMap);
-
-    // Optimize the top function.
-    for (auto func : module.getOps<FuncOp>())
-      if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
-        if (topFunction.getValue()) {
-          std::string errorMessage;
-          auto output = mlir::openOutputFile(outputFile, &errorMessage);
-          if (!output)
-            emitError(module.getLoc(), errorMessage);
-
-          applyProfiling(func, output->os(), estimator, maxParallel);
-          output->keep();
-        }
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> scalehls::createProfileDesignSpacePass() {
-  return std::make_unique<ProfileDesignSpace>();
-}
--- a/lib/Transforms/Utils.cpp
+++ b/lib/Transforms/Utils.cpp
@ -33,18 +33,18 @@ static void addPassPipeline(PassManager &pm) {
 /// passed in because the post-tiling optimizations have to take function as
 /// target, e.g. canonicalizer and array partition.
 bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,
-                                TileList tileList, int64_t targetII) {
+                                TileList tileList, unsigned targetII) {
  // By design the input function must be the ancestor of the input loop band.
  if (!func->isProperAncestor(band.front()))
    return false;

  // Apply loop tiling.
  auto pipelineLoc = applyLoopTiling(band, tileList);
-  if (pipelineLoc == -1)
+  if (!pipelineLoc)
    return false;

  // Apply loop pipelining.
-  if (!applyLoopPipelining(band, pipelineLoc, targetII))
+  if (!applyLoopPipelining(band, pipelineLoc.getValue(), targetII))
    return false;

  // Apply general optimizations and array partition.
@ -59,17 +59,17 @@ bool scalehls::applyOptStrategy(AffineLoopBand &band, FuncOp func,

 /// Apply optimization strategy to a function.
 bool scalehls::applyOptStrategy(FuncOp func, ArrayRef<TileList> tileLists,
-                                ArrayRef<int64_t> targetIIs) {
+                                ArrayRef<unsigned> targetIIs) {
  AffineLoopBands bands;
  getLoopBands(func.front(), bands);

  // Apply loop tiling and pipelining to all loop bands.
  for (unsigned i = 0, e = bands.size(); i < e; ++i) {
    auto pipelineLoc = applyLoopTiling(bands[i], tileLists[i]);
-    if (pipelineLoc == -1)
+    if (!pipelineLoc)
      return false;

-    if (!applyLoopPipelining(bands[i], pipelineLoc, targetIIs[i]))
+    if (!applyLoopPipelining(bands[i], pipelineLoc.getValue(), targetIIs[i]))
      return false;
  }

--- a/test/Analysis/profile_design_space.mlir
+++ b/test/Analysis/profile_design_space.mlir
@ -1,6 +0,0 @@
-// RUN: scalehls-opt -profile-design-space="target-spec=../../config/target-spec.ini max-parallel=32" %s | FileCheck %s
-
-// CHECK-LABEL: func @test
-func @test() {
-  return
-}