[GPGPU] Simplify PPCGSCop to reduce compile time [NFC]
Summary: Drop unused parameter dimensions to reduce the size of the sets we are working with. Especially the computed dependences tend to accumulate a lot of parameters that are present in the input memory accesses, but often not necessary to express the actual dependences. As isl represents maps and sets with dense matrices, reducing the dimensionality of isl sets commonly improves code generation performance. This reduces compile time from 17 to 11 seconds for our test case. While this is not impressive, this patch helped me to identify the previous two performance improvements and additionally also increases readability of the isl data structures we use. Reviewers: Meinersbur, bollu, singam-sanjay Reviewed By: bollu Subscribers: nemanjai, pollydev, llvm-commits, kbarton Tags: #polly Differential Revision: https://reviews.llvm.org/D36869 llvm-svn: 311161
This commit is contained in:
parent
02c9a3dfc3
commit
ec02acfb98
|
@ -2632,6 +2632,77 @@ public:
|
||||||
return Names;
|
return Names;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Remove unreferenced parameter dimensions from union_map.
|
||||||
|
isl::union_map removeUnusedParameters(isl::union_map UMap) {
|
||||||
|
auto New = isl::union_map::empty(isl::space(UMap.get_ctx(), 0, 0));
|
||||||
|
|
||||||
|
auto RemoveUnusedDims = [&New](isl::map S) -> isl::stat {
|
||||||
|
int Removed = 0;
|
||||||
|
int NumDims = S.dim(isl::dim::param);
|
||||||
|
for (long i = 0; i < NumDims; i++) {
|
||||||
|
const int Dim = i - Removed;
|
||||||
|
if (!S.involves_dims(isl::dim::param, Dim, 1)) {
|
||||||
|
S = S.remove_dims(isl::dim::param, Dim, 1);
|
||||||
|
Removed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
New = New.unite(S);
|
||||||
|
return isl::stat::ok;
|
||||||
|
};
|
||||||
|
|
||||||
|
UMap.foreach_map(RemoveUnusedDims);
|
||||||
|
return New;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remove unreferenced parameter dimensions from union_set.
|
||||||
|
isl::union_set removeUnusedParameters(isl::union_set USet) {
|
||||||
|
auto New = isl::union_set::empty(isl::space(USet.get_ctx(), 0, 0));
|
||||||
|
|
||||||
|
auto RemoveUnusedDims = [&New](isl::set S) -> isl::stat {
|
||||||
|
int Removed = 0;
|
||||||
|
int NumDims = S.dim(isl::dim::param);
|
||||||
|
for (long i = 0; i < NumDims; i++) {
|
||||||
|
const int Dim = i - Removed;
|
||||||
|
if (!S.involves_dims(isl::dim::param, Dim, 1)) {
|
||||||
|
S = S.remove_dims(isl::dim::param, Dim, 1);
|
||||||
|
Removed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
New = New.unite(S);
|
||||||
|
return isl::stat::ok;
|
||||||
|
};
|
||||||
|
|
||||||
|
USet.foreach_set(RemoveUnusedDims);
|
||||||
|
return New;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Simplify PPCG scop to improve compile time.
|
||||||
|
///
|
||||||
|
/// We drop unused parameter dimensions to reduce the size of the sets we are
|
||||||
|
/// working with. Especially the computed dependences tend to accumulate a lot
|
||||||
|
/// of parameters that are present in the input memory accesses, but often are
|
||||||
|
/// not necessary to express the actual dependences. As isl represents maps
|
||||||
|
/// and sets with dense matrices, reducing the dimensionality of isl sets
|
||||||
|
/// commonly reduces code generation performance.
|
||||||
|
void simplifyPPCGScop(ppcg_scop *PPCGScop) {
|
||||||
|
PPCGScop->domain =
|
||||||
|
removeUnusedParameters(isl::manage(PPCGScop->domain)).release();
|
||||||
|
|
||||||
|
PPCGScop->dep_forced =
|
||||||
|
removeUnusedParameters(isl::manage(PPCGScop->dep_forced)).release();
|
||||||
|
PPCGScop->dep_false =
|
||||||
|
removeUnusedParameters(isl::manage(PPCGScop->dep_false)).release();
|
||||||
|
PPCGScop->dep_flow =
|
||||||
|
removeUnusedParameters(isl::manage(PPCGScop->dep_flow)).release();
|
||||||
|
PPCGScop->tagged_dep_flow =
|
||||||
|
removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_flow))
|
||||||
|
.release();
|
||||||
|
|
||||||
|
PPCGScop->tagged_dep_order =
|
||||||
|
removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_order))
|
||||||
|
.release();
|
||||||
|
}
|
||||||
|
|
||||||
/// Create a new PPCG scop from the current scop.
|
/// Create a new PPCG scop from the current scop.
|
||||||
///
|
///
|
||||||
/// The PPCG scop is initialized with data from the current polly::Scop. From
|
/// The PPCG scop is initialized with data from the current polly::Scop. From
|
||||||
|
@ -2689,6 +2760,7 @@ public:
|
||||||
compute_tagger(PPCGScop);
|
compute_tagger(PPCGScop);
|
||||||
compute_dependences(PPCGScop);
|
compute_dependences(PPCGScop);
|
||||||
eliminate_dead_code(PPCGScop);
|
eliminate_dead_code(PPCGScop);
|
||||||
|
simplifyPPCGScop(PPCGScop);
|
||||||
|
|
||||||
return PPCGScop;
|
return PPCGScop;
|
||||||
}
|
}
|
||||||
|
@ -3130,10 +3202,14 @@ public:
|
||||||
|
|
||||||
isl_schedule *Schedule = get_schedule(PPCGGen);
|
isl_schedule *Schedule = get_schedule(PPCGGen);
|
||||||
|
|
||||||
int has_permutable = has_any_permutable_node(Schedule);
|
/// Copy to and from device functions may introduce new parameters, which
|
||||||
|
/// must be present in the schedule tree root for code generation. Hence,
|
||||||
|
/// we ensure that all possible parameters are introduced from this point.
|
||||||
|
if (!PollyManagedMemory)
|
||||||
|
Schedule =
|
||||||
|
isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
|
||||||
|
|
||||||
Schedule =
|
int has_permutable = has_any_permutable_node(Schedule);
|
||||||
isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
|
|
||||||
|
|
||||||
if (!has_permutable || has_permutable < 0) {
|
if (!has_permutable || has_permutable < 0) {
|
||||||
Schedule = isl_schedule_free(Schedule);
|
Schedule = isl_schedule_free(Schedule);
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
||||||
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
|
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
|
||||||
|
|
||||||
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B)
|
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
|
||||||
; KERNEL-NEXT: entry:
|
; KERNEL-NEXT: entry:
|
||||||
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||||
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
|
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
|
||||||
|
@ -36,7 +36,7 @@
|
||||||
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
||||||
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
|
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
|
||||||
|
|
||||||
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A)
|
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
|
||||||
; KERNEL-NEXT: entry:
|
; KERNEL-NEXT: entry:
|
||||||
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||||
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
|
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
|
||||||
|
@ -47,13 +47,13 @@
|
||||||
; KERNEL-NEXT: }
|
; KERNEL-NEXT: }
|
||||||
|
|
||||||
|
|
||||||
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
|
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
|
||||||
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||||
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
|
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
|
||||||
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||||
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
|
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
|
||||||
|
|
||||||
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
|
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
|
||||||
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
|
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
|
||||||
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
|
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
|
||||||
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
|
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
|
||||||
|
|
|
@ -12,14 +12,14 @@
|
||||||
|
|
||||||
; CODE: dim3 k0_dimBlock(8);
|
; CODE: dim3 k0_dimBlock(8);
|
||||||
; CODE-NEXT: dim3 k0_dimGrid(1);
|
; CODE-NEXT: dim3 k0_dimGrid(1);
|
||||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
|
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
|
||||||
; CODE-NEXT: cudaCheckKernel();
|
; CODE-NEXT: cudaCheckKernel();
|
||||||
; CODE-NEXT: }
|
; CODE-NEXT: }
|
||||||
|
|
||||||
; CODE: {
|
; CODE: {
|
||||||
; CODE-NEXT: dim3 k1_dimBlock(8);
|
; CODE-NEXT: dim3 k1_dimBlock(8);
|
||||||
; CODE-NEXT: dim3 k1_dimGrid(1);
|
; CODE-NEXT: dim3 k1_dimGrid(1);
|
||||||
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_B);
|
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
|
||||||
; CODE-NEXT: cudaCheckKernel();
|
; CODE-NEXT: cudaCheckKernel();
|
||||||
; CODE-NEXT: }
|
; CODE-NEXT: }
|
||||||
|
|
||||||
|
@ -27,10 +27,10 @@
|
||||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
|
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
; CODE: # kernel0
|
; CODE: # kernel0
|
||||||
; CODE-NEXT: Stmt_bb11(t0);
|
; CODE-NEXT: Stmt_bb3(t0);
|
||||||
|
|
||||||
; CODE: # kernel1
|
; CODE: # kernel1
|
||||||
; CODE-NEXT: Stmt_bb3(t0);
|
; CODE-NEXT: Stmt_bb11(t0);
|
||||||
|
|
||||||
; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
|
; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
|
||||||
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)
|
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)
|
||||||
|
|
Loading…
Reference in New Issue