diff --git a/include/Dialect/HLSCpp/Attributes.td b/include/Dialect/HLSCpp/Attributes.td index f9b2b0e..dc1dac1 100644 --- a/include/Dialect/HLSCpp/Attributes.td +++ b/include/Dialect/HLSCpp/Attributes.td @@ -21,7 +21,8 @@ def PositiveUI32ArrayAttr : TypedArrayAttrBase {} def PartitionTypeAttr : StrEnumAttr<"PartitionType", "", [ StrEnumAttrCase<"cyclic", 0>, StrEnumAttrCase<"block", 1>, - StrEnumAttrCase<"complete", 2> + StrEnumAttrCase<"complete", 2>, + StrEnumAttrCase<"none", 3> ]> { let cppNamespace = "::mlir::scalehls::hlscpp"; } diff --git a/lib/Analysis/QoREstimation.cpp b/lib/Analysis/QoREstimation.cpp index 1461a57..268b04d 100644 --- a/lib/Analysis/QoREstimation.cpp +++ b/lib/Analysis/QoREstimation.cpp @@ -33,11 +33,13 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) { // If the current loop is annotated as unroll, all inner loops and itself are // automatically unrolled. if (getBoolAttrValue(op, "unroll")) { - op.emitRemark("this loop and all inner loops are automatically unrolled."); op.walk([&](AffineForOp forOp) { if (forOp.getLoopBody().getBlocks().size() != 1) op.emitError("has zero or more than one basic blocks."); - loopUnrollFull(forOp); + if (failed(loopUnrollFull(forOp))) { + forOp.emitError("failed to be fully unrolled."); + return; + } }); return true; } @@ -45,12 +47,14 @@ bool HLSCppAnalyzer::visitOp(AffineForOp op) { // If the current loop is annotated as pipeline, all intter loops are // automatically unrolled. 
if (getBoolAttrValue(op, "pipeline")) { - op.emitRemark("all inner loops are automatically unrolled."); op.walk([&](AffineForOp forOp) { if (forOp != op) { if (forOp.getLoopBody().getBlocks().size() != 1) op.emitError("has zero or more than one basic blocks."); - loopUnrollFull(forOp); + if (failed(loopUnrollFull(forOp))) { + forOp.emitError("failed to be fully unrolled."); + return; + } } }); } @@ -129,6 +133,7 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath, string opLatencyPath) : HLSCppToolBase(builder) { + /* INIReader targetSpec(targetSpecPath); if (targetSpec.ParseError()) llvm::outs() << "error: target spec file parse fail, please refer to " @@ -143,6 +148,7 @@ HLSCppEstimator::HLSCppEstimator(OpBuilder &builder, string targetSpecPath, auto freq = targetSpec.Get("spec", "frequency", "200MHz"); auto latency = opLatency.GetInteger(freq, "op", 0); llvm::outs() << latency << "\n"; + */ } /// Calculate the partition index according to the affine map of a memory access @@ -467,8 +473,6 @@ bool HLSCppEstimator::visitOp(AffineForOp op) { // equal to zero. So that in this case, this loop will be flattened into // the inner pipelined loop. if (auto II = getUIntAttrValue(child, "init_interval")) { - op.emitRemark("this loop is flattened into its inner loop."); - setAttrValue(op, "init_interval", II); auto iterLatency = getUIntAttrValue(child, "iter_latency"); diff --git a/lib/Conversion/ConvertToHLSCpp.cpp b/lib/Conversion/ConvertToHLSCpp.cpp index 31fb1af..50d3eec 100644 --- a/lib/Conversion/ConvertToHLSCpp.cpp +++ b/lib/Conversion/ConvertToHLSCpp.cpp @@ -18,59 +18,6 @@ public: }; } // namespace -static void convertBlock(Block &block) { - for (auto &op : block) { - if (isa(op)) - continue; - auto builder = OpBuilder(&op); - - // ArrayOp will be inserted after each ShapedType value from declaration - // or function signature. 
- for (auto operand : op.getOperands()) { - if (auto arrayType = operand.getType().dyn_cast()) { - bool insertArrayOp = false; - if (operand.getKind() == Value::Kind::BlockArgument) - insertArrayOp = true; - else if (!isa(operand.getDefiningOp()) && - !isa(operand.getDefiningOp())) { - insertArrayOp = true; - if (!arrayType.hasStaticShape()) - operand.getDefiningOp()->emitError( - "is unranked or has dynamic shape which is illegal."); - } - - if (insertArrayOp) { - // Insert array operation and set attributes. - builder.setInsertionPointAfterValue(operand); - auto arrayOp = - builder.create(op.getLoc(), operand.getType(), operand); - operand.replaceAllUsesExcept(arrayOp.getResult(), - SmallPtrSet{arrayOp}); - - // Set array pragma attributes, default array instance is ram_1p - // bram. Other attributes are not set here since they requires more - // analysis to be determined. - arrayOp.setAttr("interface", builder.getBoolAttr(false)); - arrayOp.setAttr("storage", builder.getBoolAttr(false)); - arrayOp.setAttr("partition", builder.getBoolAttr(false)); - } - } - } - - if (auto forOp = dyn_cast(op)) { - if (forOp.getLoopBody().getBlocks().size() != 1) - forOp.emitError("has zero or more than one basic blocks"); - - // Set loop pragma attributes. - forOp.setAttr("pipeline", builder.getBoolAttr(false)); - forOp.setAttr("unroll", builder.getBoolAttr(false)); - forOp.setAttr("flatten", builder.getBoolAttr(false)); - - convertBlock(forOp.getLoopBody().front()); - } - } -} - void ConvertToHLSCpp::runOnOperation() { for (auto func : getOperation().getOps()) { auto b = OpBuilder(func); @@ -101,7 +48,55 @@ void ConvertToHLSCpp::runOnOperation() { func.emitError("doesn't have a return as terminator."); // Recursively convert every for loop body blocks. - convertBlock(func.front()); + func.walk([&](Operation *op) { + auto builder = OpBuilder(op); + + // ArrayOp will be inserted after each ShapedType value from declaration + // or function signature. 
+ for (auto operand : op->getOperands()) { + if (auto arrayType = operand.getType().dyn_cast()) { + bool insertArrayOp = false; + if (operand.getKind() == Value::Kind::BlockArgument) + insertArrayOp = true; + else if (!isa(operand.getDefiningOp()) && + !isa(operand.getDefiningOp())) { + insertArrayOp = true; + if (!arrayType.hasStaticShape()) + operand.getDefiningOp()->emitError( + "is unranked or has dynamic shape which is illegal."); + } + + if (isa(op)) + insertArrayOp = false; + + if (insertArrayOp) { + // Insert array operation and set attributes. + builder.setInsertionPointAfterValue(operand); + auto arrayOp = builder.create(op->getLoc(), + operand.getType(), operand); + operand.replaceAllUsesExcept(arrayOp.getResult(), + SmallPtrSet{arrayOp}); + + // Set array pragma attributes, default array instance is ram_1p + // bram. Other attributes are not set here since they require more + // analysis to be determined. + arrayOp.setAttr("interface", builder.getBoolAttr(false)); + arrayOp.setAttr("storage", builder.getBoolAttr(false)); + arrayOp.setAttr("partition", builder.getBoolAttr(false)); + } + } + } + + if (auto forOp = dyn_cast(op)) { + if (forOp.getLoopBody().getBlocks().size() != 1) + forOp.emitError("has zero or more than one basic blocks"); + + // Set loop pragma attributes. 
+ forOp.setAttr("pipeline", builder.getBoolAttr(false)); + forOp.setAttr("unroll", builder.getBoolAttr(false)); + forOp.setAttr("flatten", builder.getBoolAttr(false)); + } + }); } } diff --git a/lib/Transforms/ArrayPartition.cpp b/lib/Transforms/ArrayPartition.cpp index 800b8e2..ad8b5b4 100644 --- a/lib/Transforms/ArrayPartition.cpp +++ b/lib/Transforms/ArrayPartition.cpp @@ -2,10 +2,12 @@ // //===----------------------------------------------------------------------===// +#include "Analysis/QoREstimation.h" #include "Transforms/Passes.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/Builders.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/LoopUtils.h" using namespace std; @@ -18,7 +20,130 @@ struct ArrayPartition : public ArrayPartitionBase { }; } // namespace -void ArrayPartition::runOnOperation() { return; } +void ArrayPartition::runOnOperation() { + auto module = getOperation(); + auto builder = OpBuilder(module); + + // Extract all static parameters and current pragma configurations. + HLSCppAnalyzer analyzer(builder); + analyzer.analyzeModule(getOperation()); + + // Canonicalize the analyzed IR. + OwningRewritePatternList patterns; + + auto *context = &getContext(); + for (auto *op : context->getRegisteredOperations()) + op->getCanonicalizationPatterns(patterns, context); + + Operation *op = getOperation(); + applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns)); + + // Estimate performance and resource utilization. + for (auto func : module.getOps()) { + for (auto forOp : func.getOps()) { + // TODO: support imperfect loop nests. + SmallVector nestedLoops; + getPerfectlyNestedLoops(nestedLoops, forOp); + auto innermost = nestedLoops.back(); + + // Collect memory access information. 
+ MemAccessDict loadDict; + innermost.walk([&](mlir::AffineLoadOp loadOp) { + auto arrayOp = cast(loadOp.getMemRef().getDefiningOp()); + loadDict[arrayOp].push_back(loadOp); + }); + + MemAccessDict storeDict; + innermost.walk([&](mlir::AffineStoreOp storeOp) { + auto arrayOp = cast(storeOp.getMemRef().getDefiningOp()); + storeDict[arrayOp].push_back(storeOp); + }); + + // Apply array partition pragma. + for (auto pair : loadDict) { + auto arrayOp = cast(pair.first); + auto arrayType = arrayOp.getType().cast(); + auto arrayAccesses = pair.second; + + // Walk through each dimension of the targeted array. + SmallVector partitionFactor; + SmallVector partitionType; + + for (size_t dim = 0, e = arrayType.getShape().size(); dim < e; ++dim) { + unsigned dimSize = arrayType.getShape()[dim]; + + // Collect all array access indices of the current dimension. + SmallVector indices; + for (auto accessOp : arrayAccesses) { + auto concreteOp = cast(accessOp); + auto index = concreteOp.getAffineMap().getResult(dim); + // Only add unique index. + if (std::find(indices.begin(), indices.end(), index) == + indices.end()) + indices.push_back(index); + } + auto accessNum = indices.size(); + + // Find the max array access distance in the current block. + unsigned maxDistance = 0; + bool failFlag = false; + + for (unsigned i = 0; i < accessNum; ++i) { + for (unsigned j = i + 1; j < accessNum; ++j) { + // TODO: this expression can't be simplified. + auto expr = indices[j] - indices[i]; + + if (auto constDistance = expr.dyn_cast()) { + unsigned distance = abs(constDistance.getValue()); + maxDistance = max(maxDistance, distance); + } else { + // The array partition mechanism will fail if the distance is + // not a constant number. + // failFlag = true; + // break; + } + } + // if (failFlag) + // break; + } + + // Determine array partition strategy. 
+ maxDistance += 1; + if (failFlag || maxDistance == 1) { + // This means all accesses have the same index, and this dimension + // should not be partitioned. + partitionType.push_back("none"); + partitionFactor.push_back(builder.getUI32IntegerAttr(1)); + + } else if (accessNum == dimSize) { + // Apply complete array partition. + partitionType.push_back("complete"); + partitionFactor.push_back(builder.getUI32IntegerAttr(1)); + + } else if (accessNum >= maxDistance) { + // This means some elements are accessed more than once or exactly + // once, and successive elements are accessed. In most cases, applying + // "cyclic" partition should be the best solution. + partitionType.push_back("cyclic"); + partitionFactor.push_back(builder.getUI32IntegerAttr(maxDistance)); + + } else { + // This means discrete elements are accessed. Typically, "block" + // partition will be the most beneficial in this case. + partitionType.push_back("block"); + partitionFactor.push_back(builder.getUI32IntegerAttr(accessNum)); + } + } + + arrayOp.setAttr("partition", builder.getBoolAttr(true)); + arrayOp.setAttr("partition_type", + builder.getStrArrayAttr(partitionType)); + arrayOp.setAttr("partition_factor", + builder.getArrayAttr(partitionFactor)); + } + } + } +} std::unique_ptr scalehls::createArrayPartitionPass() { return std::make_unique(); diff --git a/test/Analysis/QoREstimation/test_for.mlir b/test/Analysis/QoREstimation/test_for.mlir index 4851326..2c37fa8 100644 --- a/test/Analysis/QoREstimation/test_for.mlir +++ b/test/Analysis/QoREstimation/test_for.mlir @@ -2,8 +2,8 @@ // CHECK-LABEL: func @test_for func @test_for(%arg0: memref<16x4x4xindex>, %arg1: memref<16x4x4xindex>) attributes {dataflow = false} { - %array0 = "hlscpp.array"(%arg0) {interface = true, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex> - %array1 = "hlscpp.array"(%arg1) 
{interface = true, storage = false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex> + %array0 = "hlscpp.array"(%arg0) {interface=true, storage=false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex> + %array1 = "hlscpp.array"(%arg1) {interface=true, storage=false, partition=true, partition_type=["cyclic", "cyclic", "cyclic"], partition_factor=[1 : ui32, 1 : ui32, 4 : ui32], storage_type="ram_2p"} : (memref<16x4x4xindex>) -> memref<16x4x4xindex> affine.for %i = 0 to 16 { affine.for %j = 0 to 4 { affine.for %k = 0 to 4 {