[ArrayPartition] support multi-loop array partition; factor out applyArrayPartition() method (#20)

Hanchen Ye 2021-01-07 23:29:49 -06:00
parent a01b440a95
commit 07d77f7193
8 changed files with 122 additions and 155 deletions

View File

@@ -5,28 +5,6 @@
#ifndef SCALEHLS_DIALECT_HLSCPP_ATTRIBUTES_TD
#define SCALEHLS_DIALECT_HLSCPP_ATTRIBUTES_TD
//===----------------------------------------------------------------------===//
// Customized i64 Attributes
//===----------------------------------------------------------------------===//
def PositiveI64Attr : Confined<I64Attr, [IntPositive]> {}
def PositiveI64ArrayAttr : TypedArrayAttrBase<PositiveI64Attr, ""> {}
//===----------------------------------------------------------------------===//
// Pragma array_partition Attributes
//===----------------------------------------------------------------------===//
def PartitionTypeAttr : StrEnumAttr<"PartitionType", "", [
StrEnumAttrCase<"cyclic", 0>,
StrEnumAttrCase<"block", 1>,
StrEnumAttrCase<"complete", 2>,
StrEnumAttrCase<"none", 3>
]> {
let cppNamespace = "::mlir::scalehls::hlscpp";
}
def PartitionTypeArrayAttr : TypedArrayAttrBase<PartitionTypeAttr, ""> {}
//===----------------------------------------------------------------------===//
// Pragma Interface Attributes (for array ports)
//===----------------------------------------------------------------------===//

View File

@@ -25,9 +25,11 @@ enum class MemoryKind {
// URAM_S2P = 4,
// URAM_T2P = 5,
DRAM = 3,
DRAM = 3
};
enum class PartitionKind { CYCLIC = 0, BLOCK = 1, NONE = 2 };
} // namespace hlscpp
} // namespace scalehls
} // namespace mlir
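The new PartitionKind enum supersedes the TableGen PartitionTypeAttr removed in the first file. As a minimal sketch (the helper below is hypothetical, not part of this commit), a consumer such as the HLS emitter could map the kinds back to pragma keywords:

// Hypothetical helper, assuming llvm::StringRef is in scope.
static StringRef getPartitionKeyword(PartitionKind kind) {
  switch (kind) {
  case PartitionKind::CYCLIC:
    return "cyclic";
  case PartitionKind::BLOCK:
    return "block";
  case PartitionKind::NONE:
    return "none";
  }
  // Unreachable for well-formed kinds; keeps compilers happy.
  return "none";
}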

View File

@@ -18,43 +18,6 @@ def AssignOp : HLSCppOp<"assign", [SameOperandsAndResultType]> {
let results = (outs AnyType : $output);
}
// Deprecated. Will be removed in the future.
def ArrayOp : HLSCppOp<"array", [SameOperandsAndResultType]> {
let summary = "A C++ array instance";
let description = [{
This hlscpp.array operation represents an array in C++. All shaped-type values
(e.g., memref, tensor, and vector) should be passed through this operation
after being declared by an allocation operation (e.g., Alloc) or in the
signature of a function. This helps the compiler easily manage the
attributes and statistics of arrays.
}];
let arguments = (ins Type<IsShapedTypePred> : $input,
// Interface-related attributes.
DefaultValuedAttr<BoolAttr, "false"> : $interface,
DefaultValuedAttr<InterfaceModeAttr, "bram"> : $interface_mode,
// BindStorage-related attributes.
DefaultValuedAttr<BoolAttr, "false"> : $storage,
DefaultValuedAttr<StorageTypeAttr, "ram_1p_bram"> : $storage_type,
// ArrayPartition-related attributes.
DefaultValuedAttr<BoolAttr, "false"> : $partition,
DefaultValuedAttr<PositiveI64Attr, "1"> : $partition_num,
DefaultValuedAttr<PartitionTypeArrayAttr, "{}"> : $partition_type,
DefaultValuedAttr<PositiveI64ArrayAttr, "{}"> : $partition_factor
);
let results = (outs Type<IsShapedTypePred> : $output);
let extraClassDeclaration = [{
ShapedType getShapedType() {
return getType().cast<ShapedType>();
}
}];
}
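For illustration only, since the op is deprecated and deleted by this commit: a pass would have routed a shaped value through the op roughly as below. The generated builder signature is assumed, not verified against this commit.

// Hypothetical usage of the deprecated hlscpp.array op.
static Value wrapArray(OpBuilder &builder, Location loc, Value memref) {
  // SameOperandsAndResultType: the result type equals the operand type.
  return builder.create<hlscpp::ArrayOp>(loc, memref.getType(), memref);
}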
def EndOp : HLSCppOp<"end", [Terminator]> {
let summary = "Mark the end of a HLSCpp region";
let description = [{

View File

@@ -31,6 +31,8 @@ bool applyRemoveVariableBound(AffineForOp loop, OpBuilder &builder);
/// fully unrolled.
bool applyLoopPipelining(AffineForOp loop, OpBuilder &builder);
bool applyArrayPartition(FuncOp func, OpBuilder &builder);
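A hedged sketch of how this factored-out entry point can compose with the other apply* utilities declared here; the driver function itself is hypothetical:

// Hypothetical driver combining the declared utilities.
void optimizeTopFunction(FuncOp func) {
  OpBuilder builder(func);
  // applyArrayPartition() only inspects pipelined loops, so pipeline the
  // loops first, then derive array partitions from their memory accesses.
  for (auto loop : func.getOps<AffineForOp>())
    applyLoopPipelining(loop, builder);
  applyArrayPartition(func, builder);
}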
//===----------------------------------------------------------------------===//
// Optimization Pass Entries
//===----------------------------------------------------------------------===//

View File

@@ -241,7 +241,7 @@ int64_t HLSCppEstimator::getPartitionIndex(Operation *op) {
int64_t partitionIdx = 0;
int64_t accumFactor = 1;
for (auto dim = 0; dim < memrefType.getRank(); ++dim) {
for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
auto idxExpr = composeMap.getResult(dim);
if (auto constExpr = idxExpr.dyn_cast<AffineConstantExpr>())
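The loop above accumulates a flattened partition index across dimensions. A standalone sketch of the assumed arithmetic (all names hypothetical, usual LLVM ADT types in scope):

// Flatten per-dimension partition indices into one partition ID, mirroring
// the accumFactor logic of getPartitionIndex().
int64_t flattenPartitionIndex(ArrayRef<int64_t> partIndices,
                              ArrayRef<int64_t> factors) {
  int64_t partitionIdx = 0;
  int64_t accumFactor = 1;
  for (size_t dim = 0; dim < factors.size(); ++dim) {
    // Each dimension contributes its partition index scaled by the product
    // of the partition factors of all preceding dimensions.
    partitionIdx += partIndices[dim] * accumFactor;
    accumFactor *= factors[dim];
  }
  return partitionIdx;
}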

View File

@@ -177,7 +177,7 @@ int64_t scalehls::getPartitionFactors(MemRefType memrefType,
auto layoutMap = getLayoutMap(memrefType, memrefType.getContext());
int64_t accumFactor = 1;
for (unsigned dim = 0; dim < memrefType.getRank(); ++dim) {
for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
int64_t factor = 1;
if (!layoutMap.isEmpty()) {
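For reference, a usage sketch of the utility touched here; the example numbers are assumptions for illustration, not taken from the codebase:

// Query the per-dimension partition factors of a memref type; the returned
// value is presumably the accumulated overall factor.
SmallVector<int64_t, 4> factors;
int64_t overallFactor = getPartitionFactors(memrefType, &factors);
// E.g., a 16x8 memref partitioned cyclic(2) on dim 0 and block(4) on dim 1
// would plausibly yield factors = {2, 4} and overallFactor = 8.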

View File

@@ -1397,7 +1397,7 @@ void ModuleEmitter::emitArrayPragmas(Value memref) {
SmallVector<int64_t, 4> factors;
getPartitionFactors(type, &factors);
for (unsigned dim = 0; dim < type.getRank(); ++dim) {
for (int64_t dim = 0; dim < type.getRank(); ++dim) {
if (factors[dim] != 1) {
emitPragmaFlag = true;
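When emitPragmaFlag is set, each dimension whose factor is not 1 receives an array_partition directive. Roughly the kind of output this produces (variable name and numbers illustrative):

#pragma HLS array_partition variable=buf cyclic factor=2 dim=1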

View File

@@ -4,7 +4,9 @@
#include "Analysis/Utils.h"
#include "Transforms/Passes.h"
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
using namespace std;
using namespace mlir;
@@ -13,89 +15,135 @@ using namespace hlscpp;
namespace {
struct ArrayPartition : public ArrayPartitionBase<ArrayPartition> {
void runOnOperation() override;
void runOnOperation() override {
auto func = getOperation();
auto builder = OpBuilder(func);
applyArrayPartition(func, builder);
}
};
} // namespace
static mlir::AffineForOp getPipelineLoop(mlir::AffineForOp root) {
SmallVector<mlir::AffineForOp, 4> nestedLoops;
root.walk([&](mlir::AffineForOp loop) {
if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline")) {
bool scalehls::applyArrayPartition(FuncOp func, OpBuilder &builder) {
// Only memory accesses in pipelined loops will be executed in parallel.
SmallVector<AffineForOp, 4> pipelinedLoops;
func.walk([&](AffineForOp loop) {
if (auto attr = loop.getAttrOfType<BoolAttr>("pipeline"))
if (attr.getValue())
nestedLoops.push_back(loop);
}
pipelinedLoops.push_back(loop);
});
if (nestedLoops.empty())
return nullptr;
else
return nestedLoops.back();
}
template <typename OpType>
static void applyArrayPartition(MemAccessesMap &map, OpBuilder &builder) {
for (auto pair : map) {
// Storing the partition information of each memref.
using PartitionInfo = std::pair<PartitionKind, int64_t>;
DenseMap<Value, SmallVector<PartitionInfo, 4>> partitionsMap;
// Traverse all pipelined loops.
for (auto loop : pipelinedLoops) {
MemAccessesMap accessesMap;
getMemAccessesMap(loop.getLoopBody().front(), accessesMap);
for (auto pair : accessesMap) {
auto memref = pair.first;
auto memrefType = memref.getType().cast<MemRefType>();
auto loadStores = pair.second;
auto &partitions = partitionsMap[memref];
// If the partitions vector of the current memref is empty, initialize it
// with no partition and a factor of 1.
if (partitions.empty()) {
for (int64_t dim = 0; dim < memrefType.getRank(); ++dim)
partitions.push_back(PartitionInfo(PartitionKind::NONE, 1));
}
// Find the best partition solution for each dimension of the memref.
for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
// Collect all array access indices of the current dimension.
SmallVector<AffineExpr, 4> indices;
for (auto accessOp : loadStores) {
// Get memory access map.
AffineValueMap accessMap;
MemRefAccess(accessOp).getAccessMap(&accessMap);
// Get index expression.
auto index = accessMap.getResult(dim);
// Only add unique index.
if (std::find(indices.begin(), indices.end(), index) == indices.end())
indices.push_back(index);
}
auto accessNum = indices.size();
// Find the max array access distance in the current block.
unsigned maxDistance = 0;
for (unsigned i = 0; i < accessNum; ++i) {
for (unsigned j = i + 1; j < accessNum; ++j) {
// TODO: this expression can't be simplified in some cases.
auto expr = indices[j] - indices[i];
if (auto constDistance = expr.dyn_cast<AffineConstantExpr>()) {
unsigned distance = abs(constDistance.getValue());
maxDistance = max(maxDistance, distance);
}
}
}
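// Worked example (hypothetical accesses): indices {d, d + 1, d + 2} give
// pairwise constant distances 1 and 2, so accessNum = 3 and maxDistance = 2
// before the increment below.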
// Determine array partition strategy.
// TODO: take storage type into consideration.
maxDistance += 1;
if (maxDistance == 1) {
// This means all accesses have the same index, and this dimension
// should not be partitioned.
continue;
} else if (accessNum >= maxDistance) {
// This means some elements are accessed more than once or exactly
// once, and successive elements are accessed. In most cases, applying
// "cyclic" partitioning should be the best solution.
unsigned factor = maxDistance;
if (factor > partitions[dim].second)
partitions[dim] = PartitionInfo(PartitionKind::CYCLIC, factor);
} else {
// This means discrete elements are accessed. Typically, "block"
// partitioning will be the most beneficial in this case.
unsigned factor = accessNum;
if (factor > partitions[dim].second)
partitions[dim] = PartitionInfo(PartitionKind::BLOCK, factor);
}
}
}
}
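// Example of the merging across loops (assumed accesses): if one pipelined
// loop implies cyclic(2) on dim 0 of a memref and another implies cyclic(4),
// the factor comparison above keeps the larger one, so partitionsMap records
// (CYCLIC, 4) for that dimension.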
// Construct and set a new type for each partitioned MemRefType.
for (auto pair : partitionsMap) {
auto memref = pair.first;
auto memrefType = memref.getType().cast<MemRefType>();
auto loadStores = pair.second;
auto partitions = pair.second;
// Walk through each dimension of the targeted array.
// Walk through each dimension of the current memref.
SmallVector<AffineExpr, 4> partitionIndices;
SmallVector<AffineExpr, 4> addressIndices;
for (unsigned dim = 0; dim < memrefType.getRank(); ++dim) {
// Collect all array access indices of the current dimension.
SmallVector<AffineExpr, 4> indices;
for (auto accessOp : loadStores) {
auto concreteOp = cast<OpType>(accessOp);
auto index = concreteOp.getAffineMap().getResult(dim);
// Only add unique index.
if (std::find(indices.begin(), indices.end(), index) == indices.end())
indices.push_back(index);
}
auto accessNum = indices.size();
// Find the max array access distance in the current block.
unsigned maxDistance = 0;
for (unsigned i = 0; i < accessNum; ++i) {
for (unsigned j = i + 1; j < accessNum; ++j) {
// TODO: this expression can't be simplified.
auto expr = indices[j] - indices[i];
if (auto constDistance = expr.dyn_cast<AffineConstantExpr>()) {
unsigned distance = abs(constDistance.getValue());
maxDistance = max(maxDistance, distance);
}
}
}
// Determine array partition strategy.
maxDistance += 1;
if (maxDistance == 1) {
// This means all accesses have the same index, and this dimension
// should not be partitioned.
partitionIndices.push_back(builder.getAffineConstantExpr(0));
addressIndices.push_back(builder.getAffineDimExpr(dim));
} else if (accessNum >= maxDistance) {
// This means some elements are accessed more than once or exactly
// once, and successive elements are accessed. In most cases,
// apply "cyclic" partition should be the best solution.
unsigned factor = maxDistance;
for (int64_t dim = 0; dim < memrefType.getRank(); ++dim) {
auto partition = partitions[dim];
auto kind = partition.first;
auto factor = partition.second;
if (kind == PartitionKind::CYCLIC) {
partitionIndices.push_back(builder.getAffineDimExpr(dim) % factor);
addressIndices.push_back(
builder.getAffineDimExpr(dim).floorDiv(factor));
} else {
// This means discrete elements are accessed. Typically, "block"
// partitioning will be the most beneficial in this case.
unsigned factor = accessNum;
} else if (kind == PartitionKind::BLOCK) {
auto blockFactor = (memrefType.getShape()[dim] + factor - 1) / factor;
partitionIndices.push_back(
builder.getAffineDimExpr(dim).floorDiv(blockFactor));
addressIndices.push_back(builder.getAffineDimExpr(dim) % blockFactor);
} else {
partitionIndices.push_back(builder.getAffineConstantExpr(0));
addressIndices.push_back(builder.getAffineDimExpr(dim));
}
}
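// Illustrative layouts: with cyclic(2) on a 1-D memref, the expressions
// above give (d0) -> (d0 mod 2, d0 floordiv 2); with block(2) on 16
// elements, blockFactor = 8 and the result is (d0) -> (d0 floordiv 8,
// d0 mod 8).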
@@ -112,41 +160,15 @@ static void applyArrayPartition(MemAccessesMap &map, OpBuilder &builder) {
// Set new type.
memref.setType(newType);
}
}
void ArrayPartition::runOnOperation() {
auto func = getOperation();
auto builder = OpBuilder(func);
// Apply array partition.
for (auto forOp : func.getOps<mlir::AffineForOp>()) {
// TODO: support imperfect loop.
if (auto outermost = getPipelineLoop(forOp)) {
// Collect memory access information.
MemAccessesMap loadMap;
outermost.walk([&](mlir::AffineLoadOp loadOp) {
loadMap[loadOp.getMemRef()].push_back(loadOp);
});
MemAccessesMap storeMap;
outermost.walk([&](mlir::AffineStoreOp storeOp) {
storeMap[storeOp.getMemRef()].push_back(storeOp);
});
// Apply array partition pragma.
// TODO: how to decide which to pick?
applyArrayPartition<mlir::AffineLoadOp>(loadMap, builder);
applyArrayPartition<mlir::AffineStoreOp>(storeMap, builder);
// TODO: how to handle the case when different sub-functions have
// different array partition strategies selected?
}
}
// Align function type with entry block argument types.
auto resultTypes = func.front().getTerminator()->getOperandTypes();
auto inputTypes = func.front().getArgumentTypes();
func.setType(builder.getFunctionType(inputTypes, resultTypes));
// TODO: how to handle the case when different sub-functions have different
// array partition strategies selected?
return true;
}
std::unique_ptr<mlir::Pass> scalehls::createArrayPartitionPass() {