[HLSKernelToAffine] impl lowering of maxpool, relu, gemm ops

This commit is contained in:
Hanchen Ye 2020-11-05 15:48:47 -06:00
parent 6adc65da7a
commit cf4e36e626
2 changed files with 192 additions and 6 deletions

View File

@ -11,8 +11,7 @@ def ConvOp : HLSKernelOp<"conv", [HLSKernelOpInterface]> {
Convolution operation. For now, only static shaped 4-dims X (batch, channel, Convolution operation. For now, only static shaped 4-dims X (batch, channel,
row, and col), 4-dims W (input channel, output channel, kernel row, and row, and col), 4-dims W (input channel, output channel, kernel row, and
kernel col), 1-dim B (output channel), and 4-dims Y (batch, channel, row, kernel col), 1-dim B (output channel), and 4-dims Y (batch, channel, row,
and col) are supported. Meanwhile, in the current lowering, padding is not and col) are supported. Verifiers will ensure the legalness of the operation.
allowed. Verifiers will ensure the legalness of the operation.
}]; }];
let arguments = (ins let arguments = (ins

View File

@ -33,6 +33,7 @@ public:
}; };
} // namespace } // namespace
/// Padding is not supported.
bool HLSKernelVisitor::visitOp(ConvOp op) { bool HLSKernelVisitor::visitOp(ConvOp op) {
OpBuilder builder(op); OpBuilder builder(op);
@ -115,11 +116,197 @@ bool HLSKernelVisitor::visitOp(ConvOp op) {
return true; return true;
} }
/// Lower a MaxPoolOp into a nest of affine loops.
///
/// The generated nest iterates over the four output dimensions (batch,
/// channel, height, width) and, for every output element, over the kernel
/// window (kernel height, kernel width). The greatest input value seen so far
/// is kept in a freshly allocated buffer that replaces all uses of the op
/// result.
///
/// Only supported when the kernel size is equal to the stride size: the input
/// is read at (h * kernelShape[0] + r, w * kernelShape[1] + s).
///
/// Returns false, before any IR is created or modified, when the
/// "kernel_shape" attribute is missing or does not hold exactly two entries.
/// Returns true once the lowering has been emitted.
bool HLSKernelVisitor::visitOp(MaxPoolOp op) {
  // Validate the attribute first so that a malformed op is reported by the
  // caller instead of crashing while iterating a null attribute.
  auto kernelShapeAttr = op.getAttrOfType<ArrayAttr>("kernel_shape");
  if (!kernelShapeAttr || kernelShapeAttr.size() != 2)
    return false;

  SmallVector<int64_t, 2> kernelShape;
  for (auto shape : kernelShapeAttr)
    kernelShape.push_back(shape.cast<IntegerAttr>().getInt());

  OpBuilder builder(op);
  auto loc = op.getLoc();

  auto X = op.getOperand();
  auto Y = op.getResult();
  auto YType = Y.getType().cast<MemRefType>();
  auto YShape = YType.getShape();

  auto newY = builder.create<mlir::AllocOp>(loc, YType);
  Y.replaceAllUsesWith(newY);

  // Create the batch, channel, height and width loops (in this order), each
  // nested inside the previous one.
  SmallVector<Value, 4> outIdx;
  for (unsigned dim = 0; dim < 4; ++dim) {
    auto loop = builder.create<mlir::AffineForOp>(loc, 0, YShape[dim]);
    builder.setInsertionPointToStart(&loop.getLoopBody().front());
    outIdx.push_back(loop.getInductionVar());
  }

  // Seed the running maximum with the first element of the pooling window.
  // Seeding with zero would yield a wrong result (0) for any window whose
  // values are all negative.
  SmallVector<AffineExpr, 4> seedExprs;
  seedExprs.push_back(builder.getAffineDimExpr(0));
  seedExprs.push_back(builder.getAffineDimExpr(1));
  seedExprs.push_back(builder.getAffineDimExpr(2) *
                      builder.getAffineConstantExpr(kernelShape[0]));
  seedExprs.push_back(builder.getAffineDimExpr(3) *
                      builder.getAffineConstantExpr(kernelShape[1]));
  auto seed = builder.create<mlir::AffineLoadOp>(
      loc, X, AffineMap::get(4, 0, seedExprs, op.getContext()), outIdx);
  builder.create<mlir::AffineStoreOp>(loc, seed, newY, outIdx);

  // Create kernel height loop.
  auto rLoop = builder.create<mlir::AffineForOp>(loc, 0, kernelShape[0]);
  builder.setInsertionPointToStart(&rLoop.getLoopBody().front());
  auto r = rLoop.getInductionVar();

  // Create kernel width loop.
  auto sLoop = builder.create<mlir::AffineForOp>(loc, 0, kernelShape[1]);
  builder.setInsertionPointToStart(&sLoop.getLoopBody().front());
  auto s = sLoop.getInductionVar();

  // Fetch feature map. Map dims are (g, c, h, w, r, s); the spatial input
  // position is the output position scaled by the kernel size plus the offset
  // inside the window.
  SmallVector<AffineExpr, 4> idxExprs;
  idxExprs.push_back(builder.getAffineDimExpr(0));
  idxExprs.push_back(builder.getAffineDimExpr(1));
  idxExprs.push_back(builder.getAffineDimExpr(2) *
                         builder.getAffineConstantExpr(kernelShape[0]) +
                     builder.getAffineDimExpr(4));
  idxExprs.push_back(builder.getAffineDimExpr(3) *
                         builder.getAffineConstantExpr(kernelShape[1]) +
                     builder.getAffineDimExpr(5));

  SmallVector<Value, 6> windowOperands(outIdx.begin(), outIdx.end());
  windowOperands.push_back(r);
  windowOperands.push_back(s);
  auto fmap = builder.create<mlir::AffineLoadOp>(
      loc, X, AffineMap::get(6, 0, idxExprs, op.getContext()), windowOperands);

  // Fetch current greatest value and keep whichever of the two is greater.
  auto tmpGreatest = builder.create<mlir::AffineLoadOp>(loc, newY, outIdx);
  auto greaterThanTmp = builder.create<mlir::CmpFOp>(loc, CmpFPredicate::OGT,
                                                     fmap, tmpGreatest);
  auto newGreatest =
      builder.create<mlir::SelectOp>(loc, greaterThanTmp, fmap, tmpGreatest);

  // Store back the greater value.
  builder.create<mlir::AffineStoreOp>(loc, newGreatest, newY, outIdx);

  return true;
}
/// Lower a ReluOp into a nest of affine loops.
///
/// One loop is created per dimension of the result memref, so inputs of any
/// rank are handled (for a 4-D result this is the batch, channel, height and
/// width nest). Each element of X is loaded, compared against zero, and the
/// greater of the two is stored into a freshly allocated buffer that replaces
/// all uses of the op result.
///
/// NOTE(review): X is indexed with the induction variables derived from the
/// shape of Y, so X is presumed to have the same shape as Y.
///
/// Always returns true.
bool HLSKernelVisitor::visitOp(ReluOp op) {
  OpBuilder builder(op);
  auto loc = op.getLoc();

  auto X = op.getOperand();
  auto Y = op.getResult();
  auto YType = Y.getType().cast<MemRefType>();

  auto newY = builder.create<mlir::AllocOp>(loc, YType);
  Y.replaceAllUsesWith(newY);

  // Create one loop per result dimension, outermost first, and collect the
  // induction variables to be used as element indices.
  SmallVector<Value, 4> ivs;
  for (int64_t dimSize : YType.getShape()) {
    auto loop = builder.create<mlir::AffineForOp>(loc, 0, dimSize);
    builder.setInsertionPointToStart(&loop.getLoopBody().front());
    ivs.push_back(loop.getInductionVar());
  }

  // Load original value from input array.
  auto fmap = builder.create<mlir::AffineLoadOp>(loc, X, ivs);

  // Carry out activation: select the input when it is greater than zero,
  // otherwise select zero.
  auto zeroConstant = builder.create<mlir::ConstantOp>(
      loc, builder.getZeroAttr(fmap.getType()));
  auto greaterThanZero = builder.create<mlir::CmpFOp>(loc, CmpFPredicate::OGT,
                                                      fmap, zeroConstant);
  auto activ =
      builder.create<mlir::SelectOp>(loc, greaterThanZero, fmap, zeroConstant);

  // Store back the activations.
  builder.create<mlir::AffineStoreOp>(loc, activ, newY, ivs);

  return true;
}
/// Lower a GemmOp into a nest of affine loops.
///
/// Operands are read as X (operand 0), W (operand 1) and B (operand 2). The
/// emitted nest iterates over Y's first dimension (g) and W's first dimension
/// (k), initialises the output element at (g, k) with B[k], and then walks W's
/// second dimension (c) accumulating X[g, c] * W[k, c] into that element. The
/// result is written to a freshly allocated buffer that replaces all uses of
/// the op result.
///
/// Always returns true.
bool HLSKernelVisitor::visitOp(GemmOp op) {
  OpBuilder builder(op);
  auto loc = op.getLoc();

  auto input = op.getOperand(0);
  auto weights = op.getOperand(1);
  auto biases = op.getOperand(2);
  auto output = op.getResult();

  auto weightShape = weights.getType().cast<MemRefType>().getShape();
  auto outputShape = output.getType().cast<MemRefType>().getShape();

  auto newOutput =
      builder.create<mlir::AllocOp>(loc, output.getType().cast<MemRefType>());
  output.replaceAllUsesWith(newOutput);

  // Opens a loop over [0, upperBound) at the current insertion point, moves
  // the insertion point to the start of its body, and hands back the
  // induction variable.
  auto openLoop = [&](int64_t upperBound) -> Value {
    auto loop = builder.create<mlir::AffineForOp>(loc, 0, upperBound);
    builder.setInsertionPointToStart(&loop.getLoopBody().front());
    return loop.getInductionVar();
  };

  // Batch loop, then output channel loop.
  Value g = openLoop(outputShape[0]);
  Value k = openLoop(weightShape[0]);

  // Initialise the output element with the bias of its output channel.
  auto biasValue = builder.create<mlir::AffineLoadOp>(loc, biases, k);
  builder.create<mlir::AffineStoreOp>(loc, biasValue, newOutput,
                                      ArrayRef<Value>({g, k}));

  // Input channel loop.
  Value c = openLoop(weightShape[1]);

  // Multiply the feature map element with the matching weight.
  auto inputValue =
      builder.create<mlir::AffineLoadOp>(loc, input, ArrayRef<Value>({g, c}));
  auto weightValue = builder.create<mlir::AffineLoadOp>(
      loc, weights, ArrayRef<Value>({k, c}));
  auto product = builder.create<mlir::MulFOp>(loc, inputValue.getType(),
                                              inputValue, weightValue);

  // Add the product onto the partial result and write it back.
  auto runningSum = builder.create<mlir::AffineLoadOp>(
      loc, newOutput, ArrayRef<Value>({g, k}));
  auto updatedSum = builder.create<mlir::AddFOp>(loc, inputValue.getType(),
                                                 runningSum, product);
  builder.create<mlir::AffineStoreOp>(loc, updatedSum, newOutput,
                                      ArrayRef<Value>({g, k}));

  return true;
}
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// HLSkernel to Affine Lowering Pass // HLSkernel to Affine Lowering Pass
@ -140,7 +327,7 @@ void HLSKernelToAffinePass::runOnOperation() {
if (auto func = dyn_cast<FuncOp>(op)) { if (auto func = dyn_cast<FuncOp>(op)) {
func.walk([&](HLSKernelOpInterface kernelOp) { func.walk([&](HLSKernelOpInterface kernelOp) {
if (visitor.dispatchVisitor(kernelOp)) { if (visitor.dispatchVisitor(kernelOp)) {
// kernelOp.erase(); kernelOp.erase();
} else } else
kernelOp.emitError("can't be correctly lowered."); kernelOp.emitError("can't be correctly lowered.");
}); });