Disable the Loop Vectorizer in case of GEMM

Currently, for GEMM and the other pattern-matching-based optimizations, we
rely only on the SLP Vectorizer, one of the two LLVM vectorizers. Since the
Loop Vectorizer can get in the way of optimal code generation, we disable it
for the innermost loop by inserting mark nodes and emitting the corresponding
metadata.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D36928

llvm-svn: 311473
This commit is contained in:
Roman Gareev 2017-08-22 17:38:46 +00:00
parent 595b77bc0b
commit 0956a606ff
17 changed files with 107 additions and 34 deletions

View File

@ -59,8 +59,8 @@ public:
void annotate(llvm::Instruction *I);
/// Annotate the loop latch @p B wrt. @p L.
void annotateLoopLatch(llvm::BranchInst *B, llvm::Loop *L,
bool IsParallel) const;
void annotateLoopLatch(llvm::BranchInst *B, llvm::Loop *L, bool IsParallel,
bool IsLoopVectorizerDisabled) const;
/// Add alternative alias based pointers
///

View File

@ -31,29 +31,36 @@ using namespace llvm;
/// Create a scalar do/for-style loop.
///
/// @param LowerBound The starting value of the induction variable.
/// @param UpperBound The upper bound of the induction variable.
/// @param Stride The value by which the induction variable is incremented.
/// @param LowerBound The starting value of the induction variable.
/// @param UpperBound The upper bound of the induction variable.
/// @param Stride The value by which the induction variable
/// is incremented.
///
/// @param Builder The builder used to create the loop.
/// @param P A pointer to the pass that uses this function. It is used
/// to update analysis information.
/// @param LI The loop info for the current function
/// @param DT The dominator tree we need to update
/// @param ExitBlock The block the loop will exit to.
/// @param Predicate The predicate used to generate the upper loop bound.
/// @param Annotator This function can (optionally) take a ScopAnnotator which
/// annotates loops and alias information in the SCoP.
/// @param Parallel If this loop should be marked parallel in the Annotator.
/// @param UseGuard Create a guard in front of the header to check if the
/// loop is executed at least once, otherwise just assume it.
/// @param Builder The builder used to create the loop.
/// @param P A pointer to the pass that uses this function.
/// It is used to update analysis information.
/// @param LI The loop info for the current function
/// @param DT The dominator tree we need to update
/// @param ExitBlock The block the loop will exit to.
/// @param Predicate The predicate used to generate the upper loop
/// bound.
/// @param Annotator This function can (optionally) take
/// a ScopAnnotator which
/// annotates loops and alias information in the SCoP.
/// @param Parallel If this loop should be marked parallel in
/// the Annotator.
/// @param UseGuard Create a guard in front of the header to check if
/// the loop is executed at least once, otherwise just
/// assume it.
/// @param LoopVectDisabled If the Loop vectorizer should be disabled for this
/// loop.
///
/// @return Value* The newly created induction variable for this loop.
Value *createLoop(Value *LowerBound, Value *UpperBound, Value *Stride,
PollyIRBuilder &Builder, LoopInfo &LI, DominatorTree &DT,
BasicBlock *&ExitBlock, ICmpInst::Predicate Predicate,
ScopAnnotator *Annotator = NULL, bool Parallel = false,
bool UseGuard = true);
bool UseGuard = true, bool LoopVectDisabled = false);
/// The ParallelLoopGenerator allows to create parallelized loops
///

View File

@ -114,15 +114,27 @@ void ScopAnnotator::popLoop(bool IsParallel) {
ParallelLoops.pop_back();
}
void ScopAnnotator::annotateLoopLatch(BranchInst *B, Loop *L,
bool IsParallel) const {
if (!IsParallel)
return;
void ScopAnnotator::annotateLoopLatch(BranchInst *B, Loop *L, bool IsParallel,
bool IsLoopVectorizerDisabled) const {
MDNode *MData = nullptr;
assert(!ParallelLoops.empty() && "Expected a parallel loop to annotate");
MDNode *Ids = ParallelLoops.back();
MDNode *Id = cast<MDNode>(Ids->getOperand(Ids->getNumOperands() - 1));
B->setMetadata("llvm.loop", Id);
if (IsLoopVectorizerDisabled) {
SmallVector<Metadata *, 3> Args;
LLVMContext &Ctx = SE->getContext();
Args.push_back(MDString::get(Ctx, "llvm.loop.vectorize.enable"));
auto *FalseValue = ConstantInt::get(Type::getInt1Ty(Ctx), 0);
Args.push_back(ValueAsMetadata::get(FalseValue));
MData = MDNode::concatenate(MData, getID(Ctx, MDNode::get(Ctx, Args)));
}
if (IsParallel) {
assert(!ParallelLoops.empty() && "Expected a parallel loop to annotate");
MDNode *Ids = ParallelLoops.back();
MDNode *Id = cast<MDNode>(Ids->getOperand(Ids->getNumOperands() - 1));
MData = MDNode::concatenate(MData, Id);
}
B->setMetadata("llvm.loop", MData);
}
/// Get the pointer operand

View File

@ -482,6 +482,27 @@ void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For,
isl_ast_expr_free(Iterator);
}
namespace {
/// Check whether the Loop Vectorizer should be disabled for a for-node.
///
/// The Loop Vectorizer is treated as disabled if the body of @p Node is a
/// mark node whose id is named "Loop Vectorizer Disabled".
///
/// @param Node The AST node to inspect; it must be of type isl_ast_node_for.
/// @return True if the body of @p Node is such a mark node, false otherwise.
bool IsLoopVectorizerDisabled(isl::ast_node Node) {
  assert(isl_ast_node_get_type(Node.keep()) == isl_ast_node_for);
  auto Body = Node.for_get_body();
  // Only a mark node directly below the for-node can carry the request.
  if (isl_ast_node_get_type(Body.keep()) != isl_ast_node_mark)
    return false;
  auto Id = Body.mark_get_id();
  return Id.get_name() == "Loop Vectorizer Disabled";
}
} // namespace
void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For,
bool KnownParallel) {
isl_ast_node *Body;
@ -497,6 +518,9 @@ void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For,
Parallel = KnownParallel || (IslAstInfo::isParallel(For) &&
!IslAstInfo::isReductionParallel(For));
bool LoopVectorizerDisabled =
IsLoopVectorizerDisabled(isl::manage(isl_ast_node_copy(For)));
Body = isl_ast_node_for_get_body(For);
// isl_ast_node_for_is_degenerate(For)
@ -532,7 +556,8 @@ void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For,
bool UseGuardBB =
!SE.isKnownPredicate(Predicate, SE.getSCEV(ValueLB), SE.getSCEV(ValueUB));
IV = createLoop(ValueLB, ValueUB, ValueInc, Builder, LI, DT, ExitBlock,
Predicate, &Annotator, Parallel, UseGuardBB);
Predicate, &Annotator, Parallel, UseGuardBB,
LoopVectorizerDisabled);
IDToValue[IteratorID] = IV;
create(Body);

View File

@ -56,8 +56,8 @@ Value *polly::createLoop(Value *LB, Value *UB, Value *Stride,
PollyIRBuilder &Builder, LoopInfo &LI,
DominatorTree &DT, BasicBlock *&ExitBB,
ICmpInst::Predicate Predicate,
ScopAnnotator *Annotator, bool Parallel,
bool UseGuard) {
ScopAnnotator *Annotator, bool Parallel, bool UseGuard,
bool LoopVectDisabled) {
Function *F = Builder.GetInsertBlock()->getParent();
LLVMContext &Context = F->getContext();
@ -132,7 +132,7 @@ Value *polly::createLoop(Value *LB, Value *UB, Value *Stride,
// Create the loop latch and annotate it as such.
BranchInst *B = Builder.CreateCondBr(LoopCondition, HeaderBB, ExitBB);
if (Annotator)
Annotator->annotateLoopLatch(B, NewLoop, Parallel);
Annotator->annotateLoopLatch(B, NewLoop, Parallel, LoopVectDisabled);
IV->addIncoming(IncrementedIV, HeaderBB);
if (GuardBB)

View File

@ -993,7 +993,7 @@ optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
// Create a copy statement that corresponds to the memory access to the
// matrix B, the second operand of the matrix multiplication.
Node = Node.parent().parent().parent().parent().parent();
Node = Node.parent().parent().parent().parent().parent().parent();
Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
auto AccRel = getMatMulAccRel(isl::manage(MapOldIndVar.copy()), 3, 7);
unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
@ -1046,7 +1046,7 @@ optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
ExtMap = ExtMap.intersect_range(Domain);
ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
Node = createExtensionNode(Node, ExtMap);
return Node.child(0).child(0).child(0).child(0);
return Node.child(0).child(0).child(0).child(0).child(0);
}
/// Get a relation mapping induction variables produced by schedule
@ -1106,11 +1106,11 @@ isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
isl::union_set Options = IsolateOption.unite(AtomicOption);
Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
Node = Node.band_set_ast_build_options(Options);
Node = Node.parent().parent();
Node = Node.parent().parent().parent();
IsolateOption = getIsolateOptions(Prefix, 3);
Options = IsolateOption.unite(AtomicOption);
Node = Node.band_set_ast_build_options(Options);
Node = Node.child(0).child(0);
Node = Node.child(0).child(0).child(0);
return Node;
}
@ -1129,6 +1129,15 @@ static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node,
return Node.insert_mark(Id).child(0);
}
/// Wrap @p Node in a mark node named "Loop Vectorizer Disabled".
///
/// @param Node The schedule node that becomes the child of the new mark node.
/// @return The child of the newly inserted mark node.
static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
  auto MarkId =
      isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr);
  auto MarkNode = Node.insert_mark(MarkId);
  return MarkNode.child(0);
}
/// Restore the initial ordering of dimensions of the band node
///
/// In case the band node represents all the dimensions of the iteration
@ -1187,6 +1196,7 @@ isl::schedule_node ScheduleTreeOptimizer::optimizeMatMulPattern(
MacroKernelParams);
if (!MapOldIndVar)
return Node;
Node = markLoopVectorizerDisabled(Node.parent()).child(0);
Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
MacroKernelParams, MMI);

View File

@ -42,6 +42,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);

View File

@ -26,6 +26,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= 30; c3 += 1) {
; CHECK-NEXT: for (int c4 = 0; c4 <= min(47, -48 * c2 + 126); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(511, -512 * c1 + 1019); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3, 512 * c1 + c5);
@ -288,6 +289,7 @@
; CHECK-NEXT: }
; CHECK-NEXT: if (c2 == 2)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(511, -512 * c1 + 1019); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK-NEXT: for (int c7 = 0; c7 <= 31; c7 += 1)
@ -296,6 +298,7 @@
; CHECK-NEXT: }
; CHECK-NEXT: for (int c4 = 0; c4 <= min(47, -48 * c2 + 127); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(511, -512 * c1 + 1019); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: for (int c6 = 0; c6 <= min(7, -384 * c2 - 8 * c4 + 1019); c6 += 1)
; CHECK-NEXT: for (int c7 = 0; c7 <= 27; c7 += 1)

View File

@ -28,6 +28,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= min(255, -256 * c0 + 332); c3 += 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= 15; c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(306, -307 * c1 + 1999); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(80 * c2 + 5 * c4, 1536 * c0 + 6 * c3, 307 * c1 + c5);
@ -65,6 +66,7 @@
; CHECK-NEXT: if (c0 == 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= 15; c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(306, -307 * c1 + 1999); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: for (int c6 = 0; c6 <= 4; c6 += 1)
; CHECK-NEXT: for (int c7 = 0; c7 <= 1; c7 += 1)

View File

@ -12,7 +12,10 @@
; Check that we do not create different alias sets for locations represented by
; different raw pointers.
;
; Also check that we disable the Loop Vectorizer.
;
; CHECK-NOT: !76 = distinct !{!76, !5, !"second level alias metadata"}
; CHECK: !{!"llvm.loop.vectorize.enable", i1 false}
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

View File

@ -100,6 +100,7 @@
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1)
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Loop Vectorizer Disabled
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points
; EXTRACTION-OF-MACRO-KERNEL-NEXT: {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);

View File

@ -36,6 +36,7 @@
; PATTERN-MATCHING-OPTS-NEXT: for (int c3 = 0; c3 <= 127; c3 += 1)
; PATTERN-MATCHING-OPTS-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + 255); c4 += 1)
; PATTERN-MATCHING-OPTS-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) {
; PATTERN-MATCHING-OPTS-NEXT: // Loop Vectorizer Disabled
; PATTERN-MATCHING-OPTS-NEXT: // Register tiling - Points
; PATTERN-MATCHING-OPTS-NEXT: {
; PATTERN-MATCHING-OPTS-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3);

View File

@ -56,6 +56,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= min(255, -256 * c0 + nj / 8 - 1); c3 += 1) {
; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + ni / 4 - 1); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3, 256 * c1 + c5);
@ -94,6 +95,7 @@
; CHECK-NEXT: }
; CHECK-NEXT: if (96 * c2 + 95 >= ni)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: for (int c6 = 0; c6 < ni % 4; c6 += 1)
; CHECK-NEXT: for (int c7 = 0; c7 <= 7; c7 += 1)
@ -106,6 +108,7 @@
; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + (ni - 1) / 4); c4 += 1)
; CHECK-NEXT: if ((ni >= 96 * c2 + 4 && 2048 * c0 + 8 * c3 + 7 >= nj) || 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: for (int c6 = 0; c6 <= min(3, ni - 96 * c2 - 4 * c4 - 1); c6 += 1)
; CHECK-NEXT: for (int c7 = 0; c7 <= min(7, nj - 2048 * c0 - 8 * c3 - 1); c7 += 1)

View File

@ -53,6 +53,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= 126; c3 += 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + 254); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1019); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);
@ -91,6 +92,7 @@
; CHECK-NEXT: }
; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + 254); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1019); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK-NEXT: for (int c7 = 0; c7 <= 3; c7 += 1)

View File

@ -36,6 +36,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= 127; c3 += 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= 15; c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(383, -384 * c1 + 1023); c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(128 * c2 + 8 * c4, 8 * c3, 384 * c1 + c5);

View File

@ -37,6 +37,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= 127; c3 += 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + 255); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5);

View File

@ -43,6 +43,7 @@
; CHECK-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
; CHECK-NEXT: for (int c4 = 0; c4 <= min(47, -48 * c2 + 127); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= 511; c5 += 1) {
; CHECK-NEXT: // Loop Vectorizer Disabled
; CHECK-NEXT: // Register tiling - Points
; CHECK-NEXT: {
; CHECK-NEXT: Stmt_for_body6(384 * c2 + 8 * c4, 32 * c3, 512 * c1 + c5);