[ArrayPartition] impl of this pass; note that due to the insufficient AffineExpr simplification mechanism in MLIR, this pass can't handle overly complicated cases

This commit is contained in:
Hanchen Ye 2020-12-07 23:24:15 -06:00
parent 03b2695fe7
commit f9bf38240e
5 changed files with 189 additions and 64 deletions

View File

@ -21,7 +21,8 @@ def PositiveUI32ArrayAttr : TypedArrayAttrBase<PositiveUI32Attr, ""> {}
def PartitionTypeAttr : StrEnumAttr<"PartitionType", "", [ def PartitionTypeAttr : StrEnumAttr<"PartitionType", "", [
StrEnumAttrCase<"cyclic", 0>, StrEnumAttrCase<"cyclic", 0>,
StrEnumAttrCase<"block", 1>, StrEnumAttrCase<"block", 1>,
StrEnumAttrCase<"complete", 2> StrEnumAttrCase<"complete", 2>,
StrEnumAttrCase<"none", 3>
]> { ]> {
let cppNamespace = "::mlir::scalehls::hlscpp"; let cppNamespace = "::mlir::scalehls::hlscpp";
} }

View File

@ -33,11 +33,13 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
// If the current loop is annotated as unroll, all inner loops and itself are // If the current loop is annotated as unroll, all inner loops and itself are
// automatically unrolled. // automatically unrolled.
if (getBoolAttrValue(op, "unroll")) { if (getBoolAttrValue(op, "unroll")) {
op.emitRemark("this loop and all inner loops are automatically unrolled.");
op.walk([&](AffineForOp forOp) { op.walk([&](AffineForOp forOp) {
if (forOp.getLoopBody().getBlocks().size() != 1) if (forOp.getLoopBody().getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks."); op.emitError("has zero or more than one basic blocks.");
loopUnrollFull(forOp); if (failed(loopUnrollFull(forOp))) {
forOp.emitError("failed to be fully unrolled.");
return;
}
}); });
return true; return true;
} }
@ -45,12 +47,14 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) {
// If the current loop is annotated as pipeline, all inner loops are // If the current loop is annotated as pipeline, all inner loops are
// automatically unrolled. // automatically unrolled.
if (getBoolAttrValue(op, "pipeline")) { if (getBoolAttrValue(op, "pipeline")) {
op.emitRemark("all inner loops are automatically unrolled.");
op.walk([&](AffineForOp forOp) { op.walk([&](AffineForOp forOp) {
if (forOp != op) { if (forOp != op) {
if (forOp.getLoopBody().getBlocks().size() != 1) if (forOp.getLoopBody().getBlocks().size() != 1)
op.emitError("has zero or more than one basic blocks."); op.emitError("has zero or more than one basic blocks.");
loopUnrollFull(forOp); if (failed(loopUnrollFull(forOp))) {
forOp.emitError("failed to be fully unrolled.");
return;
}
} }
}); });
} }
@ -129,6 +133,7 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
string opLatencyPath) string opLatencyPath)
: HLSCppToolBase(builder) { : HLSCppToolBase(builder) {
/*
INIReader targetSpec(targetSpecPath); INIReader targetSpec(targetSpecPath);
if (targetSpec.ParseError()) if (targetSpec.ParseError())
llvm::outs() << "error: target spec file parse fail, please refer to " llvm::outs() << "error: target spec file parse fail, please refer to "
@ -143,6 +148,7 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath,
auto freq = targetSpec.Get("spec", "frequency", "200MHz"); auto freq = targetSpec.Get("spec", "frequency", "200MHz");
auto latency = opLatency.GetInteger(freq, "op", 0); auto latency = opLatency.GetInteger(freq, "op", 0);
llvm::outs() << latency << "\n"; llvm::outs() << latency << "\n";
*/
} }
/// Calculate the partition index according to the affine map of a memory access /// Calculate the partition index according to the affine map of a memory access
@ -467,8 +473,6 @@ bool HLSCppEstimator::visitOp(AffineForOp op) {
// equal to zero. So that in this case, this loop will be flattened into // equal to zero. So that in this case, this loop will be flattened into
// the inner pipelined loop. // the inner pipelined loop.
if (auto II = getUIntAttrValue(child, "init_interval")) { if (auto II = getUIntAttrValue(child, "init_interval")) {
op.emitRemark("this loop is flattened into its inner loop.");
setAttrValue(op, "init_interval", II); setAttrValue(op, "init_interval", II);
auto iterLatency = getUIntAttrValue(child, "iter_latency"); auto iterLatency = getUIntAttrValue(child, "iter_latency");

View File

@ -18,59 +18,6 @@ public:
}; };
} // namespace } // namespace
/// Walk every operation in `block` and prepare it for HLS C++ emission.
///
/// For each operand whose type is a ShapedType, an ArrayOp wrapping that value
/// is created right after the value and all other uses of the value are
/// redirected to the ArrayOp result. For each AffineForOp, the "pipeline",
/// "unroll" and "flatten" attributes are set to false and the first block of
/// the loop body is processed recursively.
///
/// Errors are reported through emitError() only; processing continues after an
/// error has been emitted. This function only recurses into AffineForOp
/// bodies; regions of other operations are not visited from here.
static void convertBlock(Block &block) {
for (auto &op : block) {
// Operations that are already ArrayOps are skipped entirely, so the value
// they wrap is not wrapped a second time on their behalf.
if (isa<ArrayOp>(op))
continue;
auto builder = OpBuilder(&op);
// ArrayOp will be inserted after each ShapedType value from declaration
// or function signature.
for (auto operand : op.getOperands()) {
if (auto arrayType = operand.getType().dyn_cast<ShapedType>()) {
bool insertArrayOp = false;
// Block arguments are always wrapped; note that no static-shape check is
// performed on this path.
if (operand.getKind() == Value::Kind::BlockArgument)
insertArrayOp = true;
// Values produced by an operation are wrapped unless that operation is
// already an ArrayOp or an AssignOp.
else if (!isa<ArrayOp>(operand.getDefiningOp()) &&
!isa<AssignOp>(operand.getDefiningOp())) {
insertArrayOp = true;
// The error is only reported; the ArrayOp is still inserted below.
if (!arrayType.hasStaticShape())
operand.getDefiningOp()->emitError(
"is unranked or has dynamic shape which is illegal.");
}
if (insertArrayOp) {
// Insert array operation and set attributes.
builder.setInsertionPointAfterValue(operand);
auto arrayOp =
builder.create<ArrayOp>(op.getLoc(), operand.getType(), operand);
// Redirect every use of the original value to the new ArrayOp result,
// except the use by the ArrayOp itself.
operand.replaceAllUsesExcept(arrayOp.getResult(),
SmallPtrSet<Operation *, 1>{arrayOp});
// Set array pragma attributes. The original author notes that the
// default array instance is ram_1p bram; no storage type attribute is
// written here, only the three boolean flags below. Other attributes
// are not set here since they require more analysis to be determined.
arrayOp.setAttr("interface", builder.getBoolAttr(false));
arrayOp.setAttr("storage", builder.getBoolAttr(false));
arrayOp.setAttr("partition", builder.getBoolAttr(false));
}
}
}
if (auto forOp = dyn_cast<AffineForOp>(op)) {
// Only reported; the attributes are still set and the first block is
// still visited below.
if (forOp.getLoopBody().getBlocks().size() != 1)
forOp.emitError("has zero or more than one basic blocks");
// Set loop pragma attributes.
forOp.setAttr("pipeline", builder.getBoolAttr(false));
forOp.setAttr("unroll", builder.getBoolAttr(false));
forOp.setAttr("flatten", builder.getBoolAttr(false));
// Recurse into the loop body so nested operations are converted as well.
convertBlock(forOp.getLoopBody().front());
}
}
}
void ConvertToHLSCpp::runOnOperation() { void ConvertToHLSCpp::runOnOperation() {
for (auto func : getOperation().getOps<FuncOp>()) { for (auto func : getOperation().getOps<FuncOp>()) {
auto b = OpBuilder(func); auto b = OpBuilder(func);
@ -101,7 +48,55 @@ void ConvertToHLSCpp::runOnOperation() {
func.emitError("doesn't have a return as terminator."); func.emitError("doesn't have a return as terminator.");
// Recursively convert every for loop body blocks. // Recursively convert every for loop body blocks.
convertBlock(func.front()); func.walk([&](Operation *op) {
auto builder = OpBuilder(op);
// ArrayOp will be inserted after each ShapedType value from declaration
// or function signature.
for (auto operand : op->getOperands()) {
if (auto arrayType = operand.getType().dyn_cast<ShapedType>()) {
bool insertArrayOp = false;
if (operand.getKind() == Value::Kind::BlockArgument)
insertArrayOp = true;
else if (!isa<ArrayOp>(operand.getDefiningOp()) &&
!isa<AssignOp>(operand.getDefiningOp())) {
insertArrayOp = true;
if (!arrayType.hasStaticShape())
operand.getDefiningOp()->emitError(
"is unranked or has dynamic shape which is illegal.");
}
if (isa<ArrayOp>(op))
insertArrayOp = false;
if (insertArrayOp) {
// Insert array operation and set attributes.
builder.setInsertionPointAfterValue(operand);
auto arrayOp = builder.create<ArrayOp>(op->getLoc(),
operand.getType(), operand);
operand.replaceAllUsesExcept(arrayOp.getResult(),
SmallPtrSet<Operation *, 1>{arrayOp});
// Set array pragma attributes, default array instance is ram_1p
// bram. Other attributes are not set here since they requires more
// analysis to be determined.
arrayOp.setAttr("interface", builder.getBoolAttr(false));
arrayOp.setAttr("storage", builder.getBoolAttr(false));
arrayOp.setAttr("partition", builder.getBoolAttr(false));
}
}
}
if (auto forOp = dyn_cast<AffineForOp>(op)) {
if (forOp.getLoopBody().getBlocks().size() != 1)
forOp.emitError("has zero or more than one basic blocks");
// Set loop pragma attributes.
forOp.setAttr("pipeline", builder.getBoolAttr(false));
forOp.setAttr("unroll", builder.getBoolAttr(false));
forOp.setAttr("flatten", builder.getBoolAttr(false));
}
});
} }
} }

View File

@ -2,10 +2,12 @@
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
#include "Analysis/QoREstimation.h"
#include "Transforms/Passes.h" #include "Transforms/Passes.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/Builders.h" #include "mlir/IR/Builders.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/LoopUtils.h"
using namespace std; using namespace std;
@ -18,7 +20,130 @@ struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> {
}; };
} // namespace } // namespace
/// Derive an array partition configuration for every array that is loaded from
/// inside the innermost loop of each top-level affine loop nest.
///
/// Steps, in order:
///   1. Run HLSCppAnalyzer over the module.
///   2. Greedily apply the canonicalization patterns of all registered ops.
///   3. For each top-level AffineForOp in each function, take the innermost
///      loop of its perfectly nested band, group the affine loads found there
///      by the ArrayOp defining their memref, and choose a partition type and
///      factor per array dimension.
///
/// The result is written onto each ArrayOp as the attributes "partition"
/// (true), "partition_type" (one string per dimension) and "partition_factor"
/// (one ui32 per dimension).
///
/// NOTE(review): the opening line below carries both a `{ return; }` stub and
/// a second signature, which looks like residue of a side-by-side diff
/// rendering -- confirm against the real file.
void ArrayPartition::runOnOperation() { return; } void ArrayPartition::runOnOperation() {
auto module = getOperation();
auto builder = OpBuilder(module);
// Extract all static parameters and current pragma configurations.
HLSCppAnalyzer analyzer(builder);
analyzer.analyzeModule(getOperation());
// Canonicalize the analyzed IR.
OwningRewritePatternList patterns;
auto *context = &getContext();
for (auto *op : context->getRegisteredOperations())
op->getCanonicalizationPatterns(patterns, context);
Operation *op = getOperation();
// The return value of the greedy driver is not inspected here.
applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns));
// Derive a partition strategy for each top-level loop nest of each function.
for (auto func : module.getOps<FuncOp>()) {
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
// TODO: support imperfect loop nests.
SmallVector<mlir::AffineForOp, 4> nestedLoops;
getPerfectlyNestedLoops(nestedLoops, forOp);
// Only accesses inside the innermost loop of the band are considered.
auto innermost = nestedLoops.back();
// Collect memory access information, keyed by the defining ArrayOp.
// NOTE(review): cast<> is applied directly to getDefiningOp(), so this
// assumes every accessed memref is the result of an ArrayOp -- confirm that
// an earlier pass guarantees this for all memrefs, including function
// arguments.
MemAccessDict loadDict;
innermost.walk([&](mlir::AffineLoadOp loadOp) {
auto arrayOp = cast<ArrayOp>(loadOp.getMemRef().getDefiningOp());
loadDict[arrayOp].push_back(loadOp);
});
// Stores are collected here but storeDict is not read anywhere later in
// this function; only loadDict drives the partitioning below.
MemAccessDict storeDict;
innermost.walk([&](mlir::AffineStoreOp storeOp) {
auto arrayOp = cast<ArrayOp>(storeOp.getMemRef().getDefiningOp());
storeDict[arrayOp].push_back(storeOp);
});
// Apply array partition pragma.
for (auto pair : loadDict) {
auto arrayOp = cast<ArrayOp>(pair.first);
auto arrayType = arrayOp.getType().cast<MemRefType>();
auto arrayAccesses = pair.second;
// Walk through each dimension of the targeted array.
SmallVector<Attribute, 4> partitionFactor;
SmallVector<StringRef, 4> partitionType;
for (size_t dim = 0, e = arrayType.getShape().size(); dim < e; ++dim) {
unsigned dimSize = arrayType.getShape()[dim];
// Collect all array access indices of the current dimension.
SmallVector<AffineExpr, 4> indices;
for (auto accessOp : arrayAccesses) {
auto concreteOp = cast<mlir::AffineLoadOp>(accessOp);
auto index = concreteOp.getAffineMap().getResult(dim);
// Only add unique index.
if (std::find(indices.begin(), indices.end(), index) ==
indices.end())
indices.push_back(index);
}
// Number of distinct index expressions used on this dimension.
auto accessNum = indices.size();
// Find the max array access distance in the current block. Only pairs
// whose difference is an AffineConstantExpr contribute to maxDistance.
unsigned maxDistance = 0;
// failFlag is never set to true: the assignments below are commented
// out, so pairs with a non-constant distance are silently ignored.
bool failFlag = false;
for (unsigned i = 0; i < accessNum; ++i) {
for (unsigned j = i + 1; j < accessNum; ++j) {
// TODO: this expression can't be simplified.
auto expr = indices[j] - indices[i];
if (auto constDistance = expr.dyn_cast<AffineConstantExpr>()) {
unsigned distance = abs(constDistance.getValue());
maxDistance = max(maxDistance, distance);
} else {
// The array partition mechanism will fail if the distance is
// not a constant number.
// failFlag = true;
// break;
}
}
// if (failFlag)
// break;
}
// Determine array partition strategy. After the increment, maxDistance
// is the largest constant distance plus one.
maxDistance += 1;
if (failFlag || maxDistance == 1) {
// This means all accesses have the same index, and this dimension
// should not be partitioned. This branch is also taken when no pair
// of indices has a constant distance.
partitionType.push_back("none");
partitionFactor.push_back(builder.getUI32IntegerAttr(1));
} else if (accessNum == dimSize) {
// Apply complete array partition: there are as many distinct index
// expressions as elements in this dimension.
partitionType.push_back("complete");
partitionFactor.push_back(builder.getUI32IntegerAttr(1));
} else if (accessNum >= maxDistance) {
// This means some elements are accessed more than once or exactly
// once, and successive elements are accessed. In most cases, applying
// "cyclic" partition should be the best solution.
partitionType.push_back("cyclic");
partitionFactor.push_back(builder.getUI32IntegerAttr(maxDistance));
} else {
// This means discrete elements are accessed. Typically, "block"
// partition will be most beneficial in this case.
partitionType.push_back("block");
partitionFactor.push_back(builder.getUI32IntegerAttr(accessNum));
}
}
// Record the per-dimension decisions on the ArrayOp.
arrayOp.setAttr("partition", builder.getBoolAttr(true));
arrayOp.setAttr("partition_type",
builder.getStrArrayAttr(partitionType));
arrayOp.setAttr("partition_factor",
builder.getArrayAttr(partitionFactor));
}
}
}
}
std::unique_ptr<mlir::Pass> scalehls::createArrayPartitionPass() { std::unique_ptr<mlir::Pass> scalehls::createArrayPartitionPass() {
return std::make_unique<ArrayPartition>(); return std::make_unique<ArrayPartition>();

View File

@ -2,8 +2,8 @@
// CHECK-LABEL: func @test_for // CHECK-LABEL: func @test_for
func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} { func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} {
%array0 = "hlscpp.array"(%arg0) {interface = true, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex> %array0 = "hlscpp.array"(%arg0) {interface=true, storage=false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
%array1 = "hlscpp.array"(%arg1) {interface = true, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex> %array1 = "hlscpp.array"(%arg1) {interface=true, storage=false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex>
affine.for %i = 0 to 16 { affine.for %i = 0 to 16 {
affine.for %j = 0 to 4 { affine.for %j = 0 to 4 {
affine.for %k = 0 to 4 { affine.for %k = 0 to 4 {