fix bugs related to dependency analysis; [AffineLoopOrderOpt] add initial impl of applyAffineLoopOrderOpt method
This commit is contained in:
parent
7c1130452e
commit
d3d13e0bd0
|
@ -7,8 +7,7 @@
|
||||||
#ifndef SCALEHLS_ANALYSIS_UTILS_H
|
#ifndef SCALEHLS_ANALYSIS_UTILS_H
|
||||||
#define SCALEHLS_ANALYSIS_UTILS_H
|
#define SCALEHLS_ANALYSIS_UTILS_H
|
||||||
|
|
||||||
#include "mlir/IR/Builders.h"
|
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||||
#include "mlir/IR/Operation.h"
|
|
||||||
|
|
||||||
namespace mlir {
|
namespace mlir {
|
||||||
namespace scalehls {
|
namespace scalehls {
|
||||||
|
@ -62,6 +61,9 @@ public:
|
||||||
// Helper methods
|
// Helper methods
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
using AffineLoopBand = SmallVector<AffineForOp, 4>;
|
||||||
|
using AffineLoopBands = SmallVector<AffineLoopBand, 4>;
|
||||||
|
|
||||||
// For storing all affine memory access operations (including CallOp,
|
// For storing all affine memory access operations (including CallOp,
|
||||||
// AffineLoadOp, and AffineStoreOp) indexed by the corresponding memref.
|
// AffineLoadOp, and AffineStoreOp) indexed by the corresponding memref.
|
||||||
using MemAccesses = SmallVector<Operation *, 16>;
|
using MemAccesses = SmallVector<Operation *, 16>;
|
||||||
|
|
|
@ -7,8 +7,8 @@
|
||||||
#ifndef SCALEHLS_TRANSFORMS_PASSES_H
|
#ifndef SCALEHLS_TRANSFORMS_PASSES_H
|
||||||
#define SCALEHLS_TRANSFORMS_PASSES_H
|
#define SCALEHLS_TRANSFORMS_PASSES_H
|
||||||
|
|
||||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
|
||||||
#include "mlir/Pass/Pass.h"
|
#include "mlir/Pass/Pass.h"
|
||||||
|
#include "scalehls/Analysis/Utils.h"
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
namespace mlir {
|
namespace mlir {
|
||||||
|
@ -34,7 +34,7 @@ bool applyAffineLoopPerfection(AffineForOp loop, OpBuilder &builder);
|
||||||
/// Apply remove variable bound to all inner loops of the input loop.
|
/// Apply remove variable bound to all inner loops of the input loop.
|
||||||
bool applyRemoveVariableBound(AffineForOp loop, OpBuilder &builder);
|
bool applyRemoveVariableBound(AffineForOp loop, OpBuilder &builder);
|
||||||
|
|
||||||
bool applyAffineLoopOrderOpt(AffineForOp loop, OpBuilder &builder);
|
bool applyAffineLoopOrderOpt(AffineLoopBand band, OpBuilder &builder);
|
||||||
|
|
||||||
/// Apply loop pipelining to the input loop, all inner loops are automatically
|
/// Apply loop pipelining to the input loop, all inner loops are automatically
|
||||||
/// fully unrolled.
|
/// fully unrolled.
|
||||||
|
|
|
@ -426,18 +426,26 @@ int64_t HLSCppEstimator::getDepMinII(AffineForOp forOp, MemAccessesMap &map) {
|
||||||
if (srcMuxSize > 2 || dstMuxSize > 2)
|
if (srcMuxSize > 2 || dstMuxSize > 2)
|
||||||
distance = 1;
|
distance = 1;
|
||||||
else {
|
else {
|
||||||
SmallVector<int64_t, 2> accumTripCounts;
|
SmallVector<int64_t, 8> accumTrips;
|
||||||
accumTripCounts.push_back(1);
|
accumTrips.push_back(1);
|
||||||
|
|
||||||
// Calculate the distance of this dependency.
|
// Calculate the distance of this dependency.
|
||||||
for (auto i = depComps.rbegin(); i < depComps.rend(); ++i) {
|
for (auto i = depComps.rbegin(); i < depComps.rend(); ++i) {
|
||||||
auto dep = *i;
|
auto dep = *i;
|
||||||
auto tripCount = getIntAttrValue(dep.op, "trip_count");
|
auto tripCount = getIntAttrValue(dep.op, "trip_count");
|
||||||
|
auto ub = dep.ub.getValue();
|
||||||
|
auto lb = dep.lb.getValue();
|
||||||
|
|
||||||
if (dep.lb)
|
// If ub is non-negative, calculate the minimum positive
|
||||||
distance += accumTripCounts.back() * dep.lb.getValue();
|
// distance. Otherwise, set the distance to -1 and break.
|
||||||
|
if (ub >= 0)
|
||||||
|
distance += accumTrips.back() * max(lb, (int64_t)0);
|
||||||
|
else {
|
||||||
|
distance = -1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
accumTripCounts.push_back(accumTripCounts.back() * tripCount);
|
accumTrips.push_back(accumTrips.back() * tripCount);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -481,8 +489,8 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
|
||||||
// If the current loop is annotated as pipelined loop, extra dependency and
|
// If the current loop is annotated as pipelined loop, extra dependency and
|
||||||
// resource aware II analysis will be executed.
|
// resource aware II analysis will be executed.
|
||||||
if (getBoolAttrValue(op, "pipeline")) {
|
if (getBoolAttrValue(op, "pipeline")) {
|
||||||
// Collect load and store operations in the loop block for solving possible
|
// Collect load and store operations in the loop block for solving
|
||||||
// carried dependencies.
|
// possible carried dependencies.
|
||||||
// TODO: include CallOps, how? It seems dependencies always exist for all
|
// TODO: include CallOps, how? It seems dependencies always exist for all
|
||||||
// CallOps not matter its access pattern.
|
// CallOps not matter its access pattern.
|
||||||
MemAccessesMap map;
|
MemAccessesMap map;
|
||||||
|
@ -510,9 +518,9 @@ bool HLSCppEstimator::visitOp(AffineForOp op, int64_t begin) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the current loop is annotated as flatten, it will be flattened into the
|
// If the current loop is annotated as flatten, it will be flattened into
|
||||||
// child pipelined loop. This will increase the flattened loop trip count
|
// the child pipelined loop. This will increase the flattened loop trip
|
||||||
// without changing the iteration latency.
|
// count without changing the iteration latency.
|
||||||
if (getBoolAttrValue(op, "flatten")) {
|
if (getBoolAttrValue(op, "flatten")) {
|
||||||
auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front());
|
auto child = dyn_cast<AffineForOp>(op.getLoopBody().front().front());
|
||||||
assert(child && "the first containing operation is not a loop");
|
assert(child && "the first containing operation is not a loop");
|
||||||
|
@ -575,8 +583,8 @@ bool HLSCppEstimator::visitOp(AffineIfOp op, int64_t begin) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// In our assumption, AffineIfOp is completely transparent. Therefore, we set
|
// In our assumption, AffineIfOp is completely transparent. Therefore, we
|
||||||
// a dummy schedule begin here.
|
// set a dummy schedule begin here.
|
||||||
setScheduleValue(op, end, end);
|
setScheduleValue(op, end, end);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -659,9 +667,9 @@ HLSCppEstimator::Resource HLSCppEstimator::estimateResource(Block &block,
|
||||||
totalFmul /= (latencyMap["fmul"] + 1);
|
totalFmul /= (latencyMap["fmul"] + 1);
|
||||||
|
|
||||||
// We assume the loop resource utilization cannot be shared. Therefore, the
|
// We assume the loop resource utilization cannot be shared. Therefore, the
|
||||||
// overall resource utilization is loops' plus other operstions'. According to
|
// overall resource utilization is loops' plus other operations'. According
|
||||||
// profiling, floating-point add and muliply will consume 2 and 3 DSP units,
|
// to profiling, floating-point add and multiply will consume 2 and 3 DSP
|
||||||
// respectively.
|
// units, respectively.
|
||||||
auto dsp = loopDSPNum + maxFadd * 2 + maxFmul * 3;
|
auto dsp = loopDSPNum + maxFadd * 2 + maxFmul * 3;
|
||||||
|
|
||||||
// If the block is pipelined (interval is positive), the minimum resource
|
// If the block is pipelined (interval is positive), the minimum resource
|
||||||
|
@ -787,14 +795,14 @@ void HLSCppEstimator::estimateFunc() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scheduled levels of all operations are reversed in this method, because
|
// Scheduled levels of all operations are reversed in this method, because
|
||||||
// we have done the ALAP scheduling in a reverse order. Note that after the
|
// we have done the ALAP scheduling in a reverse order. Note that after
|
||||||
// reverse, the annotated scheduling level of each operation is a relative
|
// the reverse, the annotated scheduling level of each operation is a
|
||||||
// level of the nearest surrounding AffineForOp or FuncOp.
|
// relative level of the nearest surrounding AffineForOp or FuncOp.
|
||||||
reverseSchedule();
|
reverseSchedule();
|
||||||
} else {
|
} else {
|
||||||
// Scheduling failed due to early error.
|
// Scheduling failed due to early error.
|
||||||
// TODO: further refinement and try the best to avoid failing, e.g. support
|
// TODO: further refinement and try the best to avoid failing, e.g.
|
||||||
// variable loop bound.
|
// support variable loop bound.
|
||||||
setAttrValue(func, "latency", std::string("unknown"));
|
setAttrValue(func, "latency", std::string("unknown"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -856,9 +864,9 @@ struct QoREstimation : public scalehls::QoREstimationBase<QoREstimation> {
|
||||||
for (auto func : module.getOps<FuncOp>())
|
for (auto func : module.getOps<FuncOp>())
|
||||||
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
if (auto topFunction = func->getAttrOfType<BoolAttr>("top_function"))
|
||||||
if (topFunction.getValue()) {
|
if (topFunction.getValue()) {
|
||||||
// Estimate the top function. If any other functions are called by the
|
// Estimate the top function. If any other functions are called by
|
||||||
// top function, it will be estimated in the procedure of estimating
|
// the top function, it will be estimated in the procedure of
|
||||||
// the top function.
|
// estimating the top function.
|
||||||
HLSCppEstimator estimator(func, latencyMap);
|
HLSCppEstimator estimator(func, latencyMap);
|
||||||
estimator.estimateFunc();
|
estimator.estimateFunc();
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,6 @@
|
||||||
|
|
||||||
#include "scalehls/Analysis/Utils.h"
|
#include "scalehls/Analysis/Utils.h"
|
||||||
#include "mlir/Analysis/AffineAnalysis.h"
|
#include "mlir/Analysis/AffineAnalysis.h"
|
||||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
using namespace scalehls;
|
using namespace scalehls;
|
||||||
|
|
|
@ -4,6 +4,9 @@
|
||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "mlir/Analysis/AffineAnalysis.h"
|
||||||
|
#include "mlir/Analysis/Utils.h"
|
||||||
|
#include "scalehls/Analysis/Utils.h"
|
||||||
#include "scalehls/Transforms/Passes.h"
|
#include "scalehls/Transforms/Passes.h"
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
@ -15,7 +18,52 @@ struct AffineLoopOrderOpt : public AffineLoopOrderOptBase<AffineLoopOrderOpt> {
|
||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
bool scalehls::applyAffineLoopOrderOpt(AffineForOp loop, OpBuilder &builder) {
|
bool scalehls::applyAffineLoopOrderOpt(AffineLoopBand band,
|
||||||
|
OpBuilder &builder) {
|
||||||
|
auto &loopBlock = band.back().getLoopBody().front();
|
||||||
|
auto depth = band.size();
|
||||||
|
|
||||||
|
// Collect all load and store operations for each memory in the loop block,
|
||||||
|
// and calculate the number of common surrounding loops for later use.
|
||||||
|
MemAccessesMap map;
|
||||||
|
getMemAccessesMap(loopBlock, map);
|
||||||
|
auto commonLoopDepth = getNumCommonSurroundingLoops(
|
||||||
|
*loopBlock.begin(), *std::next(loopBlock.begin()));
|
||||||
|
|
||||||
|
// Traverse all memories in the loop block.
|
||||||
|
for (auto pair : map) {
|
||||||
|
auto loadStores = pair.second;
|
||||||
|
|
||||||
|
// Find all dependencies associated to the current memory.
|
||||||
|
int64_t dstIndex = 1;
|
||||||
|
for (auto dstOp : loadStores) {
|
||||||
|
for (auto srcOp : llvm::drop_begin(loadStores, dstIndex)) {
|
||||||
|
MemRefAccess dstAccess(dstOp);
|
||||||
|
MemRefAccess srcAccess(srcOp);
|
||||||
|
|
||||||
|
FlatAffineConstraints depConstrs;
|
||||||
|
SmallVector<DependenceComponent, 2> depComps;
|
||||||
|
|
||||||
|
for (unsigned loopDepth = commonLoopDepth - depth + 1;
|
||||||
|
loopDepth <= commonLoopDepth + 1; ++loopDepth) {
|
||||||
|
DependenceResult result = checkMemrefAccessDependence(
|
||||||
|
srcAccess, dstAccess, loopDepth, &depConstrs, &depComps,
|
||||||
|
/*allowRAR=*/false);
|
||||||
|
|
||||||
|
if (hasDependence(result)) {
|
||||||
|
// llvm::outs() << "\n----------\n";
|
||||||
|
// llvm::outs() << *srcOp << " -> " << *dstOp << "\n";
|
||||||
|
// llvm::outs() << "depth: " << loopDepth << ", distance: ";
|
||||||
|
// for (auto dep : depComps)
|
||||||
|
// llvm::outs() << "(" << dep.lb.getValue() << ","
|
||||||
|
// << dep.ub.getValue() << "), ";
|
||||||
|
// llvm::outs() << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dstIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,9 +15,6 @@ using namespace scalehls;
|
||||||
// Helper methods
|
// Helper methods
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
using AffineLoopBand = SmallVector<AffineForOp, 4>;
|
|
||||||
using AffineLoopBands = SmallVector<AffineLoopBand, 4>;
|
|
||||||
|
|
||||||
static AffineForOp getLoopBandFromRoot(AffineForOp forOp,
|
static AffineForOp getLoopBandFromRoot(AffineForOp forOp,
|
||||||
AffineLoopBand &band) {
|
AffineLoopBand &band) {
|
||||||
auto currentLoop = forOp;
|
auto currentLoop = forOp;
|
||||||
|
@ -218,13 +215,16 @@ void HLSCppOptimizer::applyMultipleLevelDSE() {
|
||||||
AffineLoopBand band;
|
AffineLoopBand band;
|
||||||
getLoopBandFromLeaf(loop, band);
|
getLoopBandFromLeaf(loop, band);
|
||||||
targetBands.push_back(band);
|
targetBands.push_back(band);
|
||||||
|
|
||||||
// Loop perfection and remove variable bound are always applied for the
|
|
||||||
// convenience of polyhedral optimizations.
|
|
||||||
applyAffineLoopPerfection(band.back(), builder);
|
|
||||||
applyRemoveVariableBound(band.front(), builder);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Loop perfection, remove variable bound, and loop order optimization are
|
||||||
|
// always applied for the convenience of polyhedral optimizations.
|
||||||
|
for (auto band : targetBands) {
|
||||||
|
applyAffineLoopPerfection(band.back(), builder);
|
||||||
|
applyRemoveVariableBound(band.front(), builder);
|
||||||
|
applyAffineLoopOrderOpt(band, builder);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
|
@ -40,11 +40,13 @@ bool scalehls::applySimplifyMemrefAccess(FuncOp func) {
|
||||||
auto secondAccess = MemRefAccess(secondOp);
|
auto secondAccess = MemRefAccess(secondOp);
|
||||||
auto secondIsRead = isa<AffineReadOpInterface>(secondOp);
|
auto secondIsRead = isa<AffineReadOpInterface>(secondOp);
|
||||||
|
|
||||||
auto sameLevelOps = checkSameLevel(firstOp, secondOp);
|
// Check whether the two operations statically have the same access.
|
||||||
|
if (firstAccess == secondAccess) {
|
||||||
// Check whether the two operations statically have the same access
|
// If the two operations are at different loop levels, break.
|
||||||
// element while at the same level.
|
// TODO: memory access operation hoisting?
|
||||||
if ((firstAccess == secondAccess) && sameLevelOps) {
|
auto sameLevelOps = checkSameLevel(firstOp, secondOp);
|
||||||
|
if (!sameLevelOps)
|
||||||
|
break;
|
||||||
|
|
||||||
// If the second operation's access direction is different with the
|
// If the second operation's access direction is different with the
|
||||||
// first operation, the first operation is known not redundant.
|
// first operation, the first operation is known not redundant.
|
||||||
|
@ -80,9 +82,10 @@ bool scalehls::applySimplifyMemrefAccess(FuncOp func) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Find possible dependencies.
|
// Find possible dependencies. If dependency appears, the first is no
|
||||||
|
// longer able to be simplified.
|
||||||
unsigned nsLoops = getNumCommonSurroundingLoops(*firstOp, *secondOp);
|
unsigned nsLoops = getNumCommonSurroundingLoops(*firstOp, *secondOp);
|
||||||
bool dependencyFlag = false;
|
bool foundDependence = false;
|
||||||
|
|
||||||
for (unsigned depth = 1; depth <= nsLoops + 1; ++depth) {
|
for (unsigned depth = 1; depth <= nsLoops + 1; ++depth) {
|
||||||
FlatAffineConstraints dependenceConstraints;
|
FlatAffineConstraints dependenceConstraints;
|
||||||
|
@ -94,19 +97,23 @@ bool scalehls::applySimplifyMemrefAccess(FuncOp func) {
|
||||||
|
|
||||||
// Only zero distance dependencies are considered here.
|
// Only zero distance dependencies are considered here.
|
||||||
if (hasDependence(result)) {
|
if (hasDependence(result)) {
|
||||||
int64_t distance = 0;
|
bool hasZeroDistance = true;
|
||||||
for (auto dep : dependenceComponents)
|
|
||||||
if (dep.lb)
|
|
||||||
distance += std::abs(dep.lb.getValue());
|
|
||||||
|
|
||||||
if (distance == 0) {
|
for (auto dep : dependenceComponents)
|
||||||
dependencyFlag = true;
|
if (dep.lb.getValue() > 0 || dep.ub.getValue() < 0) {
|
||||||
|
hasZeroDistance = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasZeroDistance) {
|
||||||
|
foundDependence = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dependencyFlag)
|
// If any dependence is found, break.
|
||||||
|
if (foundDependence)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
// RUN: scalehls-opt -qor-estimation="target-spec=../../config/target-spec.ini" %s | FileCheck %s
|
// RUN: scalehls-opt -qor-estimation="target-spec=../../config/target-spec.ini" %s | FileCheck %s
|
||||||
|
|
||||||
// CHECK-LABEL: func @test_qor_estimation
|
// CHECK-LABEL: func @qor_estimation
|
||||||
func @qor_estimation() {
|
func @qor_estimation() {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue