Annotation of SIMD loops

Use 'mark' nodes to annotate a SIMD loop during ScheduleTransformation and skip
parallelism checks.

The buildbot shows the following compile/execution time changes:

  Compile time:
    Improvements    Δ     Previous  Current  σ
    …/gesummv      -6.06% 0.2640    0.2480   0.0055
    …/gemver       -4.46% 0.4480    0.4280   0.0044
    …/covariance   -4.31% 0.8360    0.8000   0.0065
    …/adi          -3.23% 0.9920    0.9600   0.0065
    …/doitgen      -2.53% 0.9480    0.9240   0.0090
    …/3mm          -2.33% 1.0320    1.0080   0.0087

  Execution time:
    Regressions     Δ     Previous  Current  σ
    …/viterbi       1.70% 5.1840    5.2720   0.0074
    …/smallpt       1.06% 12.4920   12.6240  0.0040

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D14491

llvm-svn: 261620
This commit is contained in:
Roman Gareev 2016-02-23 09:00:13 +00:00
parent 1360f4db47
commit 11001e1534
10 changed files with 94 additions and 38 deletions

View File

@ -242,7 +242,7 @@ protected:
bool preloadInvariantEquivClass(const InvariantEquivClassTy &IAClass);
void createForVector(__isl_take isl_ast_node *For, int VectorWidth);
void createForSequential(__isl_take isl_ast_node *For);
void createForSequential(__isl_take isl_ast_node *For, bool KnownParallel);
/// Create LLVM-IR that executes a for node thread parallel.
///

View File

@ -255,11 +255,13 @@ astBuildAfterFor(__isl_take isl_ast_node *Node, __isl_keep isl_ast_build *Build,
// tested for parallelism. Test them here to ensure we check all innermost
// loops for parallelism.
if (Payload->IsInnermost && BuildInfo->InParallelFor) {
if (Payload->IsOutermostParallel)
if (Payload->IsOutermostParallel) {
Payload->IsInnermostParallel = true;
else
Payload->IsInnermostParallel =
astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload);
} else {
if (PollyVectorizerChoice == VECTORIZER_NONE)
Payload->IsInnermostParallel =
astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload);
}
}
if (Payload->IsOutermostParallel)
BuildInfo->InParallelFor = false;
@ -268,6 +270,31 @@ astBuildAfterFor(__isl_take isl_ast_node *Node, __isl_keep isl_ast_build *Build,
return Node;
}
/// Callback registered through isl_ast_build_set_before_each_mark.
///
/// If the mark is named "SIMD", raise AstBuildUserInfo::InParallelFor; the
/// matching astBuildAfterMark callback lowers the flag again.
///
/// @param MarkId The identifier of the mark node; a null id is an error.
/// @param Build  The isl AST build handed to the callback (not used here).
/// @param User   Opaque pointer that is cast to AstBuildUserInfo.
///
/// @return isl_stat_error if MarkId is null, isl_stat_ok otherwise.
static isl_stat astBuildBeforeMark(__isl_keep isl_id *MarkId,
__isl_keep isl_ast_build *Build,
void *User) {
// Reject a missing identifier before its name is inspected.
if (!MarkId)
return isl_stat_error;
AstBuildUserInfo *BuildInfo = (AstBuildUserInfo *)User;
// NOTE(review): strcmp assumes the id carries a name -- confirm that
// isl_id_get_name cannot return null for marks reaching this callback.
if (!strcmp(isl_id_get_name(MarkId), "SIMD"))
BuildInfo->InParallelFor = true;
return isl_stat_ok;
}
/// Callback registered through isl_ast_build_set_after_each_mark.
///
/// If the mark is named "SIMD", lower AstBuildUserInfo::InParallelFor, undoing
/// what astBuildBeforeMark set for the same mark.
///
/// @param Node  The generated AST node; asserted to be of type
///              isl_ast_node_mark.
/// @param Build The isl AST build handed to the callback (not used here).
/// @param User  Opaque pointer that is cast to AstBuildUserInfo.
///
/// @return The node passed in, unmodified.
static __isl_give isl_ast_node *
astBuildAfterMark(__isl_take isl_ast_node *Node,
__isl_keep isl_ast_build *Build, void *User) {
assert(isl_ast_node_get_type(Node) == isl_ast_node_mark);
AstBuildUserInfo *BuildInfo = (AstBuildUserInfo *)User;
// The id obtained here is released with isl_id_free below.
auto *Id = isl_ast_node_mark_get_id(Node);
// NOTE(review): strcmp assumes the id carries a name -- confirm that
// isl_id_get_name cannot return null for marks reaching this callback.
if (!strcmp(isl_id_get_name(Id), "SIMD"))
BuildInfo->InParallelFor = false;
isl_id_free(Id);
return Node;
}
static __isl_give isl_ast_node *AtEachDomain(__isl_take isl_ast_node *Node,
__isl_keep isl_ast_build *Build,
void *User) {
@ -383,6 +410,12 @@ void IslAst::init(const Dependences &D) {
&BuildInfo);
Build =
isl_ast_build_set_after_each_for(Build, &astBuildAfterFor, &BuildInfo);
Build = isl_ast_build_set_before_each_mark(Build, &astBuildBeforeMark,
&BuildInfo);
Build = isl_ast_build_set_after_each_mark(Build, &astBuildAfterMark,
&BuildInfo);
}
buildRunCondition(Build);

View File

@ -352,9 +352,24 @@ void IslNodeBuilder::createUserVector(__isl_take isl_ast_node *User,
}
void IslNodeBuilder::createMark(__isl_take isl_ast_node *Node) {
auto *Id = isl_ast_node_mark_get_id(Node);
auto Child = isl_ast_node_mark_get_node(Node);
create(Child);
isl_ast_node_free(Node);
// If a child node of a 'SIMD mark' is a loop that has a single iteration,
// it will be optimized away and we should skip it.
if (!strcmp(isl_id_get_name(Id), "SIMD") &&
isl_ast_node_get_type(Child) == isl_ast_node_for) {
bool Vector = PollyVectorizerChoice == VECTORIZER_POLLY;
int VectorWidth = getNumberOfIterations(Child);
if (Vector && 1 < VectorWidth && VectorWidth <= 16)
createForVector(Child, VectorWidth);
else
createForSequential(Child, true);
isl_id_free(Id);
return;
}
create(Child);
isl_id_free(Id);
}
void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For,
@ -417,7 +432,8 @@ void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For,
isl_ast_expr_free(Iterator);
}
void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For) {
void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For,
bool KnownParallel) {
isl_ast_node *Body;
isl_ast_expr *Init, *Inc, *Iterator, *UB;
isl_id *IteratorID;
@ -428,8 +444,8 @@ void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For) {
CmpInst::Predicate Predicate;
bool Parallel;
Parallel =
IslAstInfo::isParallel(For) && !IslAstInfo::isReductionParallel(For);
Parallel = KnownParallel || (IslAstInfo::isParallel(For) &&
!IslAstInfo::isReductionParallel(For));
Body = isl_ast_node_for_get_body(For);
@ -647,7 +663,7 @@ void IslNodeBuilder::createFor(__isl_take isl_ast_node *For) {
createForParallel(For);
return;
}
createForSequential(For);
createForSequential(For, false);
}
void IslNodeBuilder::createIf(__isl_take isl_ast_node *If) {

View File

@ -289,6 +289,10 @@ ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
Node = isl_schedule_node_band_sink(Node);
Node = isl_schedule_node_child(Node, 0);
if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
Node = isl_schedule_node_parent(Node);
isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
Node = isl_schedule_node_insert_mark(Node, LoopMarker);
return Node;
}

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-codegen -polly-vectorizer=polly -S -dce < %s | FileCheck %s
; RUN: opt %loadPolly -polly-opt-isl -polly-codegen -polly-vectorizer=polly -polly-prevect-width=8 -S -dce < %s | FileCheck %s
;
; void foo(long n, float A[restrict][n], float B[restrict][n],
; float C[restrict][n], float D[restrict][n]) {

View File

@ -1,23 +1,26 @@
; RUN: opt -S %loadPolly -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze < %s | FileCheck %s
; CHECK: // 1st level tiling - Tiles
; CHECK-NEXT: #pragma known-parallel
; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)
; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {
; CHECK-NEXT: // 1st level tiling - Points
; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {
; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1)
; CHECK-NEXT: #pragma simd
; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
; CHECK-NEXT: if (32 * c1 + 31 >= nj)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1)
; CHECK-NEXT: #pragma simd
; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1)
; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: #pragma known-parallel
; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)
; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {
; CHECK-NEXT: // 1st level tiling - Points
; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {
; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)
; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
; CHECK-NEXT: // SIMD
; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
; CHECK-NEXT: }
; CHECK-NEXT: if (32 * c1 + 31 >= nj)
; CHECK-NEXT: #pragma minimal dependence distance: 1
; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
; CHECK-NEXT: // SIMD
; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1)
; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: }
; Function Attrs: nounwind uwtable
define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 {

View File

@ -56,14 +56,14 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1)
; CHECK: #pragma simd
; CHECK: // SIMD
; CHECK: for (int c2 = 0; c2 <= 3; c2 += 1)
; CHECK: Stmt_for_body3(c0, 4 * c1 + c2);
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1)
; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 1)
; CHECK: #pragma simd
; CHECK: // SIMD
; CHECK: for (int c3 = 0; c3 <= 3; c3 += 1)
; CHECK: Stmt_for_body8(c0, 4 * c1 + c3, c2);

View File

@ -65,7 +65,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1)
; CHECK: for (int c2 = 0; c2 <= 31; c2 += 1)
; CHECK: for (int c3 = 0; c3 <= 7; c3 += 1)
; CHECK: #pragma simd
; CHECK: // SIMD
; CHECK: for (int c4 = 0; c4 <= 3; c4 += 1)
; CHECK: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4);
; CHECK: #pragma known-parallel
@ -75,7 +75,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1)
; CHECK: for (int c4 = 0; c4 <= 7; c4 += 1)
; CHECK: for (int c5 = 0; c5 <= 31; c5 += 1)
; CHECK: #pragma simd
; CHECK: // SIMD
; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
@ -85,7 +85,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1)
; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1)
; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1)
; VEC16: #pragma simd
; VEC16: // SIMD
; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1)
; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4);
; VEC16: #pragma known-parallel
@ -95,7 +95,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1)
; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1)
; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1)
; VEC16: #pragma simd
; VEC16: // SIMD
; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1)
; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
; VEC16: }

View File

@ -74,10 +74,10 @@
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c3 = 0; c3 <= 1; c3 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c4 = 0; c4 <= 7; c4 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c5 = 0; c5 <= 1; c5 += 1) {
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4, 16 * c1 + 8 * c3 + 4 * c5 + c8);
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4 + 1, 16 * c1 + 8 * c3 + 4 * c5 + c8);
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: }

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-dir=%S -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s
; RUN: opt %loadPolly -polly-opt-isl -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s
; #pragma known-parallel
; for (int c0 = 0; c0 <= 31; c0 += 1)