[IslNodeBuilder] Relax complexity check in invariant loads and run it early
When performing invariant load hoisting, we check that invariant load expressions are not too complex. Up to this commit, we performed this check by counting the sum of dimensions in the access range as a very simple heuristic. This heuristic is a little too conservative, as it prevents hoisting for any scop with a very large number of parameters. Hence, we update the heuristic to count only existentially quantified dimensions and set dimensions. We expect this to still detect the problematic expressions in h264 for which this check was originally introduced. For some unknown reason, this complexity check was originally committed in IslNodeBuilder. It really belongs in ScopInfo, as there is no point in optimizing a program that we could have known earlier cannot be code generated. The benefit of running the check early is that we can avoid hoisting, as invariant loads, accesses that are expensive to code generate. This can be seen in the changed tests, where we now indeed detect the scop, but just do not invariant-load hoist the complicated access. We also improve the formatting of the code, document it, and use isl++ to simplify expressions. llvm-svn: 308659
This commit is contained in:
parent
f3a778d757
commit
1eeedf4829
|
@ -94,6 +94,12 @@ static int const MaxDisjunctsInDomain = 20;
|
|||
// number of disjunct when adding non-convex sets to the context.
|
||||
static int const MaxDisjunctsInContext = 4;
|
||||
|
||||
// The maximal number of dimensions we allow during invariant load construction.
|
||||
// More complex access ranges will result in very high compile time and are also
|
||||
// unlikely to result in good code. This value is very high and should only
|
||||
// trigger for corner cases (e.g., the "dct_luma" function in h264, SPEC2006).
|
||||
static int const MaxDimensionsInAccessRange = 9;
|
||||
|
||||
static cl::opt<int>
|
||||
OptComputeOut("polly-analysis-computeout",
|
||||
cl::desc("Bound the scop analysis by a maximal amount of "
|
||||
|
@ -4024,6 +4030,35 @@ void Scop::addInvariantLoads(ScopStmt &Stmt, InvariantAccessesTy &InvMAs) {
|
|||
isl_set_free(DomainCtx);
|
||||
}
|
||||
|
||||
/// Check if an access range is too complex.
|
||||
///
|
||||
/// An access range is too complex, if it contains either many disjuncts or
|
||||
/// very complex expressions. As a simple heuristic, we assume if a set to
|
||||
/// be too complex if the sum of existentially quantified dimensions and
|
||||
/// set dimensions is larger than a threshold. This reliably detects both
|
||||
/// sets with many disjuncts as well as sets with many divisions as they
|
||||
/// arise in h264.
|
||||
///
|
||||
/// @param AccessRange The range to check for complexity.
|
||||
///
|
||||
/// @returns True if the access range is too complex.
|
||||
static bool isAccessRangeTooComplex(isl::set AccessRange) {
|
||||
unsigned NumTotalDims = 0;
|
||||
|
||||
auto CountDimensions = [&NumTotalDims](isl::basic_set BSet) -> isl::stat {
|
||||
NumTotalDims += BSet.dim(isl::dim::div);
|
||||
NumTotalDims += BSet.dim(isl::dim::set);
|
||||
return isl::stat::ok;
|
||||
};
|
||||
|
||||
AccessRange.foreach_basic_set(CountDimensions);
|
||||
|
||||
if (NumTotalDims > MaxDimensionsInAccessRange)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
isl::set Scop::getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes) {
|
||||
// TODO: Loads that are not loop carried, hence are in a statement with
|
||||
// zero iterators, are by construction invariant, though we
|
||||
|
@ -4072,6 +4107,9 @@ isl::set Scop::getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes) {
|
|||
SafeToLoad = AccessRelation.range();
|
||||
}
|
||||
|
||||
if (isAccessRangeTooComplex(AccessRelation.range()))
|
||||
return nullptr;
|
||||
|
||||
isl::union_map Written = Writes.intersect_range(SafeToLoad);
|
||||
isl::set WrittenCtx = Written.params();
|
||||
bool IsWritten = !WrittenCtx.is_empty();
|
||||
|
|
|
@ -53,12 +53,6 @@ using namespace llvm;
|
|||
|
||||
STATISTIC(VersionedScops, "Number of SCoPs that required versioning.");
|
||||
|
||||
// The maximal number of dimensions we allow during invariant load construction.
|
||||
// More complex access ranges will result in very high compile time and are also
|
||||
// unlikely to result in good code. This value is very high and should only
|
||||
// trigger for corner cases (e.g., the "dct_luma" function in h264, SPEC2006).
|
||||
static int const MaxDimensionsInAccessRange = 9;
|
||||
|
||||
static cl::opt<bool> PollyGenerateRTCPrint(
|
||||
"polly-codegen-emit-rtc-print",
|
||||
cl::desc("Emit code that prints the runtime check result dynamically."),
|
||||
|
@ -1134,26 +1128,9 @@ bool IslNodeBuilder::materializeFortranArrayOutermostDimension() {
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Add the number of dimensions in @p BS to @p U.
|
||||
static isl_stat countTotalDims(__isl_take isl_basic_set *BS, void *U) {
|
||||
unsigned *NumTotalDim = static_cast<unsigned *>(U);
|
||||
*NumTotalDim += isl_basic_set_total_dim(BS);
|
||||
isl_basic_set_free(BS);
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
Value *IslNodeBuilder::preloadUnconditionally(isl_set *AccessRange,
|
||||
isl_ast_build *Build,
|
||||
Instruction *AccInst) {
|
||||
|
||||
// TODO: This check could be performed in the ScopInfo already.
|
||||
unsigned NumTotalDim = 0;
|
||||
isl_set_foreach_basic_set(AccessRange, countTotalDims, &NumTotalDim);
|
||||
if (NumTotalDim > MaxDimensionsInAccessRange) {
|
||||
isl_set_free(AccessRange);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
isl_pw_multi_aff *PWAccRel = isl_pw_multi_aff_from_set(AccessRange);
|
||||
isl_ast_expr *Access =
|
||||
isl_ast_build_access_from_pw_multi_aff(Build, PWAccRel);
|
||||
|
|
|
@ -1,70 +0,0 @@
|
|||
; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s
|
||||
;
|
||||
; Extracted from h264 in SPEC 2006.
|
||||
;
|
||||
; TODO: We check that we do compile this benchmark in reasonable time.
|
||||
; To do so we currently bail out due to the complex access range
|
||||
; (multiple modulos) of the invariant load.
|
||||
;
|
||||
; FIXME: We should not bail with a false RTC here.
|
||||
;
|
||||
; CHECK-LABEL: polly.preload.begin:
|
||||
; CHECK-NOT: br i1
|
||||
; CHECK-NOT: br label
|
||||
; CHECK: br i1 false, label %polly.start, label %entry.split
|
||||
;
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
%struct.IP = type { i32****, i32***, %struct.P, %struct.S, %struct.m }
|
||||
%struct.P = type { i32 }
|
||||
%struct.S = type { i32 }
|
||||
%struct.D = type { i32 }
|
||||
%struct.B = type { i32 }
|
||||
%struct.E = type { i32 }
|
||||
%struct.s = type { i32 }
|
||||
%struct.M = type { i32 }
|
||||
%struct.C = type { i32 }
|
||||
%struct.T = type { i32 }
|
||||
%struct.R = type { i32 }
|
||||
%struct.m = type { i32 }
|
||||
%struct.d = type { i32 }
|
||||
|
||||
@img = external global %struct.IP*, align 8
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @dct_luma(i32 %block_x, i32 %block_y) #0 {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
%div = sdiv i32 %block_x, 4
|
||||
%div1 = sdiv i32 %block_y, 4
|
||||
%rem = srem i32 %div1, 2
|
||||
%mul4 = shl nsw i32 %rem, 1
|
||||
%rem5 = srem i32 %div, 2
|
||||
%add6 = add nsw i32 %mul4, %rem5
|
||||
%idxprom = sext i32 %add6 to i64
|
||||
%0 = load %struct.IP*, %struct.IP** @img, align 8
|
||||
%cofAC = getelementptr inbounds %struct.IP, %struct.IP* %0, i32 0, i32 0
|
||||
%1 = load i32****, i32***** %cofAC, align 8
|
||||
%arrayidx = getelementptr inbounds i32***, i32**** %1, i64 0
|
||||
%2 = load i32***, i32**** %arrayidx, align 8
|
||||
%arrayidx8 = getelementptr inbounds i32**, i32*** %2, i64 %idxprom
|
||||
%3 = load i32**, i32*** %arrayidx8, align 8
|
||||
%mb_data = getelementptr inbounds %struct.IP, %struct.IP* %0, i64 0, i32 4
|
||||
%4 = load %struct.m, %struct.m* %mb_data, align 8
|
||||
br i1 false, label %land.rhs, label %land.end
|
||||
|
||||
land.rhs: ; preds = %entry.split
|
||||
br label %land.end
|
||||
|
||||
land.end: ; preds = %land.rhs, %entry.split
|
||||
%5 = phi i1 [ false, %entry.split ], [ undef, %land.rhs ]
|
||||
br i1 %5, label %for.cond104.preheader, label %for.cond34.preheader
|
||||
|
||||
for.cond34.preheader: ; preds = %land.end
|
||||
ret void
|
||||
|
||||
for.cond104.preheader: ; preds = %land.end
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
; RUN: opt %loadPolly -S -polly-scops -analyze \
|
||||
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
%struct.IP = type { i32****, i32***, %struct.P, %struct.S, %struct.m }
|
||||
%struct.P = type { i32 }
|
||||
%struct.S = type { i32 }
|
||||
%struct.D = type { i32 }
|
||||
%struct.B = type { i32 }
|
||||
%struct.E = type { i32 }
|
||||
%struct.s = type { i32 }
|
||||
%struct.M = type { i32 }
|
||||
%struct.C = type { i32 }
|
||||
%struct.T = type { i32 }
|
||||
%struct.R = type { i32 }
|
||||
%struct.m = type { i32 }
|
||||
%struct.d = type { i32 }
|
||||
|
||||
|
||||
; Verify that we do not invariant load hoist very complex conditions.
|
||||
|
||||
; CHECK: Statements {
|
||||
; CHECK-NEXT: Stmt_entry_split
|
||||
; CHECK-NEXT: Domain :=
|
||||
; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] };
|
||||
; CHECK-NEXT: Schedule :=
|
||||
; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] -> [] };
|
||||
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] -> MemRef4[o0] : (-3 <= block_y < 0 and block_x <= -4 and -8 + block_x - 4o0 <= 8*floor((-1 + block_x)/8) <= -5 + block_x - 4o0) or (-3 <= block_y < 0 and block_x >= 0 and -3 + block_x - 4o0 <= 8*floor((block_x)/8) <= block_x - 4o0) or (block_y <= -4 and block_x <= -4 and -16 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((-1 + block_y)/4) <= 16*floor((-1 + block_y)/8) <= -13 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((-1 + block_y)/4)) or (block_y <= -4 and block_x >= 0 and -11 + block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((-1 + block_y)/4) <= 16*floor((-1 + block_y)/8) <= -8 + block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((-1 + block_y)/4)) or (block_y >= 0 and block_x <= -4 and -8 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((block_y)/4) <= 16*floor((block_y)/8) <= -5 + block_x - 4o0 - 8*floor((-1 + block_x)/8) + 8*floor((block_y)/4)) or (block_y >= 0 and block_x >= 0 and -3 + block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((block_y)/4) <= 16*floor((block_y)/8) <= block_x - 4o0 - 8*floor((block_x)/8) + 8*floor((block_y)/4)) or (4*floor((block_y)/8) = -o0 + 2*floor((block_y)/4) and block_y >= 0 and -3 <= block_x < 0 and 4*floor((block_y)/4) >= -7 + block_y + 2o0 and 4*floor((block_y)/4) <= block_y + 2o0) or (4*floor((-1 + block_y)/8) = -2 - o0 + 2*floor((-1 + block_y)/4) and block_y <= -4 and -3 <= block_x < 0 and 4*floor((-1 + block_y)/4) >= -4 + block_y + 2o0 and 4*floor((-1 + block_y)/4) <= 3 + block_y + 2o0); Stmt_entry_split[] -> MemRef4[0] : -3 <= block_y < 0 and -3 <= block_x < 0 };
|
||||
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1]
|
||||
; CHECK-NEXT: [block_y, block_x] -> { Stmt_entry_split[] -> MemRef0[] };
|
||||
; CHECK-NEXT: }
|
||||
|
||||
@img = external global %struct.IP*, align 8
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @dct_luma(i32 %block_x, i32 %block_y) #0 {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
%div = sdiv i32 %block_x, 4
|
||||
%div1 = sdiv i32 %block_y, 4
|
||||
%rem = srem i32 %div1, 2
|
||||
%mul4 = shl nsw i32 %rem, 1
|
||||
%rem5 = srem i32 %div, 2
|
||||
%add6 = add nsw i32 %mul4, %rem5
|
||||
%idxprom = sext i32 %add6 to i64
|
||||
%0 = load %struct.IP*, %struct.IP** @img, align 8
|
||||
%cofAC = getelementptr inbounds %struct.IP, %struct.IP* %0, i32 0, i32 0
|
||||
%1 = load i32****, i32***** %cofAC, align 8
|
||||
%arrayidx = getelementptr inbounds i32***, i32**** %1, i64 0
|
||||
%2 = load i32***, i32**** %arrayidx, align 8
|
||||
%arrayidx8 = getelementptr inbounds i32**, i32*** %2, i64 %idxprom
|
||||
%3 = load i32**, i32*** %arrayidx8, align 8
|
||||
%mb_data = getelementptr inbounds %struct.IP, %struct.IP* %0, i64 0, i32 4
|
||||
%4 = load %struct.m, %struct.m* %mb_data, align 8
|
||||
br i1 false, label %land.rhs, label %land.end
|
||||
|
||||
land.rhs: ; preds = %entry.split
|
||||
br label %land.end
|
||||
|
||||
land.end: ; preds = %land.rhs, %entry.split
|
||||
%5 = phi i1 [ false, %entry.split ], [ undef, %land.rhs ]
|
||||
br i1 %5, label %for.cond104.preheader, label %for.cond34.preheader
|
||||
|
||||
for.cond34.preheader: ; preds = %land.end
|
||||
ret void
|
||||
|
||||
for.cond104.preheader: ; preds = %land.end
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue