[ArrayPartition] Implement this pass. Note that, due to the limited AffineExpr simplification mechanism in MLIR, the pass cannot yet handle overly complicated cases.
This commit is contained in:
parent
03b2695fe7
commit
f9bf38240e
|
@ -21,7 +21,8 @@ def PositiveUI32ArrayAttr : TypedArrayAttrBase<PositiveUI32Attr, ""> {}
|
|||
// String enum attribute naming the partition kind of one array dimension.
// "none" marks a dimension that is left unpartitioned.
def PartitionTypeAttr : StrEnumAttr<"PartitionType", "", [
  StrEnumAttrCase<"cyclic", 0>,
  StrEnumAttrCase<"block", 1>,
  StrEnumAttrCase<"complete", 2>,
  StrEnumAttrCase<"none", 3>
]> {
  let cppNamespace = "::mlir::scalehls::hlscpp";
}
|
||||
|
|
|
@ -33,11 +33,13 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
|
|||
// If the current loop is annotated as unroll, all inner loops and itself are
|
||||
// automatically unrolled.
|
||||
if (getBoolAttrValue(op, "unroll")) {
|
||||
op.emitRemark("this loop and all inner loops are automatically unrolled.");
|
||||
op.walk([&](AffineForOp forOp) {
|
||||
if (forOp.getLoopBody().getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
loopUnrollFull(forOp);
|
||||
if (failed(loopUnrollFull(forOp))) {
|
||||
forOp.emitError("failed to be fully unrolled.");
|
||||
return;
|
||||
}
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
@ -45,12 +47,14 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
|
|||
// If the current loop is annotated as pipeline, all inner loops are
|
||||
// automatically unrolled.
|
||||
if (getBoolAttrValue(op, "pipeline")) {
|
||||
op.emitRemark("all inner loops are automatically unrolled.");
|
||||
op.walk([&](AffineForOp forOp) {
|
||||
if (forOp != op) {
|
||||
if (forOp.getLoopBody().getBlocks().size() != 1)
|
||||
op.emitError("has zero or more than one basic blocks.");
|
||||
loopUnrollFull(forOp);
|
||||
if (failed(loopUnrollFull(forOp))) {
|
||||
forOp.emitError("failed to be fully unrolled.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -129,6 +133,7 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
|
|||
string opLatencyPath)
|
||||
: HLSCppToolBase(builder) {
|
||||
|
||||
/*
|
||||
INIReader targetSpec(targetSpecPath);
|
||||
if (targetSpec.ParseError())
|
||||
llvm::outs() << "error: target spec file parse fail, please refer to "
|
||||
|
@ -143,6 +148,7 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
|
|||
auto freq = targetSpec.Get("spec", "frequency", "200MHz");
|
||||
auto latency = opLatency.GetInteger(freq, "op", 0);
|
||||
llvm::outs() << latency << "\n";
|
||||
*/
|
||||
}
|
||||
|
||||
/// Calculate the partition index according to the affine map of a memory access
|
||||
|
@ -467,8 +473,6 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
|
|||
// equal to zero. So that in this case, this loop will be flattened into
|
||||
// the inner pipelined loop.
|
||||
if (auto II = getUIntAttrValue(child, "init_interval")) {
|
||||
op.emitRemark("this loop is flattened into its inner loop.");
|
||||
|
||||
setAttrValue(op, "init_interval", II);
|
||||
|
||||
auto iterLatency = getUIntAttrValue(child, "iter_latency");
|
||||
|
|
|
@ -18,59 +18,6 @@ public:
|
|||
};
|
||||
} // namespace
|
||||
|
||||
/// Walk every operation of `block` (recursing into affine.for bodies) and
/// wrap each ShapedType operand in an ArrayOp unless it is already produced by
/// an ArrayOp or AssignOp. Every affine.for encountered additionally receives
/// the default loop pragma attributes.
static void convertBlock(Block &block) {
  for (auto &op : block) {
    // ArrayOps themselves never get their operand wrapped again.
    if (isa<ArrayOp>(op))
      continue;
    auto builder = OpBuilder(&op);

    // An ArrayOp is inserted after each ShapedType value coming from a
    // declaration or from the function signature.
    for (auto operand : op.getOperands()) {
      auto shapedType = operand.getType().dyn_cast<ShapedType>();
      if (!shapedType)
        continue;

      // Block arguments are always wrapped. Values defined by an operation
      // are wrapped unless that operation is an ArrayOp or AssignOp.
      if (operand.getKind() != Value::Kind::BlockArgument) {
        if (isa<ArrayOp>(operand.getDefiningOp()) ||
            isa<AssignOp>(operand.getDefiningOp()))
          continue;

        if (!shapedType.hasStaticShape())
          operand.getDefiningOp()->emitError(
              "is unranked or has dynamic shape which is illegal.");
      }

      // Create the ArrayOp right after the value and redirect every other
      // user of the value to the ArrayOp result.
      builder.setInsertionPointAfterValue(operand);
      auto wrapper =
          builder.create<ArrayOp>(op.getLoc(), operand.getType(), operand);
      operand.replaceAllUsesExcept(wrapper.getResult(),
                                   SmallPtrSet<Operation *, 1>{wrapper});

      // Default array pragma attributes: all disabled. The remaining
      // attributes need more analysis and are therefore not set here.
      auto disabled = builder.getBoolAttr(false);
      wrapper.setAttr("interface", disabled);
      wrapper.setAttr("storage", disabled);
      wrapper.setAttr("partition", disabled);
    }

    auto forOp = dyn_cast<AffineForOp>(op);
    if (!forOp)
      continue;

    if (forOp.getLoopBody().getBlocks().size() != 1)
      forOp.emitError("has zero or more than one basic blocks");

    // Default loop pragma attributes: all disabled.
    auto disabled = builder.getBoolAttr(false);
    forOp.setAttr("pipeline", disabled);
    forOp.setAttr("unroll", disabled);
    forOp.setAttr("flatten", disabled);

    // Recurse into the loop body.
    convertBlock(forOp.getLoopBody().front());
  }
}
|
||||
|
||||
void ConvertToHLSCpp::runOnOperation() {
|
||||
for (auto func : getOperation().getOps<FuncOp>()) {
|
||||
auto b = OpBuilder(func);
|
||||
|
@ -101,7 +48,55 @@ void ConvertToHLSCpp::runOnOperation() {
|
|||
func.emitError("doesn't have a return as terminator.");
|
||||
|
||||
// Recursively convert every for loop body blocks.
|
||||
convertBlock(func.front());
|
||||
func.walk([&](Operation *op) {
|
||||
auto builder = OpBuilder(op);
|
||||
|
||||
// ArrayOp will be inserted after each ShapedType value from declaration
|
||||
// or function signature.
|
||||
for (auto operand : op->getOperands()) {
|
||||
if (auto arrayType = operand.getType().dyn_cast<ShapedType>()) {
|
||||
bool insertArrayOp = false;
|
||||
if (operand.getKind() == Value::Kind::BlockArgument)
|
||||
insertArrayOp = true;
|
||||
else if (!isa<ArrayOp>(operand.getDefiningOp()) &&
|
||||
!isa<AssignOp>(operand.getDefiningOp())) {
|
||||
insertArrayOp = true;
|
||||
if (!arrayType.hasStaticShape())
|
||||
operand.getDefiningOp()->emitError(
|
||||
"is unranked or has dynamic shape which is illegal.");
|
||||
}
|
||||
|
||||
if (isa<ArrayOp>(op))
|
||||
insertArrayOp = false;
|
||||
|
||||
if (insertArrayOp) {
|
||||
// Insert array operation and set attributes.
|
||||
builder.setInsertionPointAfterValue(operand);
|
||||
auto arrayOp = builder.create<ArrayOp>(op->getLoc(),
|
||||
operand.getType(), operand);
|
||||
operand.replaceAllUsesExcept(arrayOp.getResult(),
|
||||
SmallPtrSet<Operation *, 1>{arrayOp});
|
||||
|
||||
// Set array pragma attributes, default array instance is ram_1p
|
||||
// bram. Other attributes are not set here since they require more
|
||||
// analysis to be determined.
|
||||
arrayOp.setAttr("interface", builder.getBoolAttr(false));
|
||||
arrayOp.setAttr("storage", builder.getBoolAttr(false));
|
||||
arrayOp.setAttr("partition", builder.getBoolAttr(false));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (auto forOp = dyn_cast<AffineForOp>(op)) {
|
||||
if (forOp.getLoopBody().getBlocks().size() != 1)
|
||||
forOp.emitError("has zero or more than one basic blocks");
|
||||
|
||||
// Set loop pragma attributes.
|
||||
forOp.setAttr("pipeline", builder.getBoolAttr(false));
|
||||
forOp.setAttr("unroll", builder.getBoolAttr(false));
|
||||
forOp.setAttr("flatten", builder.getBoolAttr(false));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,10 +2,12 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Analysis/QoREstimation.h"
|
||||
#include "Transforms/Passes.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
|
||||
using namespace std;
|
||||
|
@ -18,7 +20,130 @@ struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> {
|
|||
};
|
||||
} // namespace
|
||||
|
||||
void ArrayPartition::runOnOperation() { return; }
|
||||
void ArrayPartition::runOnOperation() {
|
||||
auto module = getOperation();
|
||||
auto builder = OpBuilder(module);
|
||||
|
||||
// Extract all static parameters and current pragma configurations.
|
||||
HLSCppAnalyzer analyzer(builder);
|
||||
analyzer.analyzeModule(getOperation());
|
||||
|
||||
// Canonicalize the analyzed IR.
|
||||
OwningRewritePatternList patterns;
|
||||
|
||||
auto *context = &getContext();
|
||||
for (auto *op : context->getRegisteredOperations())
|
||||
op->getCanonicalizationPatterns(patterns, context);
|
||||
|
||||
Operation *op = getOperation();
|
||||
applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));
|
||||
|
||||
// Estimate performance and resource utilization.
|
||||
for (auto func : module.getOps<FuncOp>()) {
|
||||
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
|
||||
// TODO: support imperfect loop nests.
|
||||
SmallVector<mlir::AffineForOp, 4> nestedLoops;
|
||||
getPerfectlyNestedLoops(nestedLoops, forOp);
|
||||
auto innermost = nestedLoops.back();
|
||||
|
||||
// Collect memory access information.
|
||||
MemAccessDict loadDict;
|
||||
innermost.walk([&](mlir::AffineLoadOp loadOp) {
|
||||
auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
|
||||
loadDict[arrayOp].push_back(loadOp);
|
||||
});
|
||||
|
||||
MemAccessDict storeDict;
|
||||
innermost.walk([&](mlir::AffineStoreOp storeOp) {
|
||||
auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
|
||||
storeDict[arrayOp].push_back(storeOp);
|
||||
});
|
||||
|
||||
// Apply array partition pragma.
|
||||
for (auto pair : loadDict) {
|
||||
auto arrayOp = cast<ArrayOp>(pair.first);
|
||||
auto arrayType = arrayOp.getType().cast<MemRefType>();
|
||||
auto arrayAccesses = pair.second;
|
||||
|
||||
// Walk through each dimension of the targeted array.
|
||||
SmallVector<Attribute, 4> partitionFactor;
|
||||
SmallVector<StringRef, 4> partitionType;
|
||||
|
||||
for (size_t dim = 0, e = arrayType.getShape().size(); dim < e; ++dim) {
|
||||
unsigned dimSize = arrayType.getShape()[dim];
|
||||
|
||||
// Collect all array access indices of the current dimension.
|
||||
SmallVector<AffineExpr, 4> indices;
|
||||
for (auto accessOp : arrayAccesses) {
|
||||
auto concreteOp = cast<mlir::AffineLoadOp>(accessOp);
|
||||
auto index = concreteOp.getAffineMap().getResult(dim);
|
||||
// Only add unique index.
|
||||
if (std::find(indices.begin(), indices.end(), index) ==
|
||||
indices.end())
|
||||
indices.push_back(index);
|
||||
}
|
||||
auto accessNum = indices.size();
|
||||
|
||||
// Find the max array access distance in the current block.
|
||||
unsigned maxDistance = 0;
|
||||
bool failFlag = false;
|
||||
|
||||
for (unsigned i = 0; i < accessNum; ++i) {
|
||||
for (unsigned j = i + 1; j < accessNum; ++j) {
|
||||
// TODO: this expression can't be simplified.
|
||||
auto expr = indices[j] - indices[i];
|
||||
|
||||
if (auto constDistance = expr.dyn_cast<AffineConstantExpr>()) {
|
||||
unsigned distance = abs(constDistance.getValue());
|
||||
maxDistance = max(maxDistance, distance);
|
||||
} else {
|
||||
// The array partition mechanism will fail if the distance is
|
||||
// not a constant number.
|
||||
// failFlag = true;
|
||||
// break;
|
||||
}
|
||||
}
|
||||
// if (failFlag)
|
||||
// break;
|
||||
}
|
||||
|
||||
// Determine array partition strategy.
|
||||
maxDistance += 1;
|
||||
if (failFlag || maxDistance == 1) {
|
||||
// This means all accesses have the same index, and this dimension
|
||||
// should not be partitioned.
|
||||
partitionType.push_back("none");
|
||||
partitionFactor.push_back(builder.getUI32IntegerAttr(1));
|
||||
|
||||
} else if (accessNum == dimSize) {
|
||||
// Apply complete array partition.
|
||||
partitionType.push_back("complete");
|
||||
partitionFactor.push_back(builder.getUI32IntegerAttr(1));
|
||||
|
||||
} else if (accessNum >= maxDistance) {
|
||||
// This means some elements are accessed more than once or exactly
|
||||
// once, and successive elements are accessed. In most cases, apply
|
||||
// "cyclic" partition should be the best solution.
|
||||
partitionType.push_back("cyclic");
|
||||
partitionFactor.push_back(builder.getUI32IntegerAttr(maxDistance));
|
||||
|
||||
} else {
|
||||
// This means discrete elements are accessed. Typically, "block"
|
||||
// partition will be most benefit for this occasion.
|
||||
partitionType.push_back("block");
|
||||
partitionFactor.push_back(builder.getUI32IntegerAttr(accessNum));
|
||||
}
|
||||
}
|
||||
|
||||
arrayOp.setAttr("partition", builder.getBoolAttr(true));
|
||||
arrayOp.setAttr("partition_type",
|
||||
builder.getStrArrayAttr(partitionType));
|
||||
arrayOp.setAttr("partition_factor",
|
||||
builder.getArrayAttr(partitionFactor));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<mlir::Pass> scalehls::createArrayPartitionPass() {
|
||||
return std::make_unique<ArrayPartition>();
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
// CHECK-LABEL: func @test_for
|
||||
func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
|
||||
%array0 = "hlscpp.array"(%arg0) {interface = true, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array1 = "hlscpp.array"(%arg1) {interface = true, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array0 = "hlscpp.array"(%arg0) {interface=true, storage=false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
%array1 = "hlscpp.array"(%arg1) {interface=true, storage=false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
|
||||
affine.for %i = 0 to 16 {
|
||||
affine.for %j = 0 to 4 {
|
||||
affine.for %k = 0 to 4 {
|
||||
|
|
Loading…
Reference in New Issue