From fe74a7a1f5e627abf88d754adc3df746115121fa Mon Sep 17 00:00:00 2001 From: Tobias Grosser Date: Sat, 17 Sep 2016 19:22:18 +0000 Subject: [PATCH] GPGPU: Detect read-only scalar arrays ... and pass these by value rather than by reference. llvm-svn: 281837 --- polly/include/polly/ScopInfo.h | 3 + polly/lib/Analysis/ScopInfo.cpp | 12 ++++ polly/lib/CodeGen/PPCGCodeGeneration.cpp | 46 +++++++++---- polly/test/GPGPU/double-parallel-loop.ll | 2 +- .../GPGPU/kernel-params-only-some-arrays.ll | 8 ++- polly/test/GPGPU/scalar-parameter-half.ll | 39 +++++++++++ polly/test/GPGPU/scalar-parameter.ll | 66 ++----------------- polly/test/GPGPU/scheduler-timeout.ll | 6 +- polly/test/GPGPU/shared-memory-scalar.ll | 20 ++---- 9 files changed, 107 insertions(+), 95 deletions(-) create mode 100644 polly/test/GPGPU/scalar-parameter-half.ll diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index 4fa756091ff6..d584e262b31a 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -357,6 +357,9 @@ public: /// Get the space of this array access. __isl_give isl_space *getSpace() const; + /// If the array is read only + bool isReadOnly(); + private: void addDerivedSAI(ScopArrayInfo *DerivedSAI) { DerivedSAIs.insert(DerivedSAI); diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 2b304ae3ea6f..30e248d98340 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -196,6 +196,18 @@ __isl_give isl_space *ScopArrayInfo::getSpace() const { return Space; } +bool ScopArrayInfo::isReadOnly() { + isl_union_set *WriteSet = isl_union_map_range(S.getWrites()); + isl_space *Space = getSpace(); + WriteSet = isl_union_set_intersect( + WriteSet, isl_union_set_from_set(isl_set_universe(Space))); + + bool IsReadOnly = isl_union_set_is_empty(WriteSet); + isl_union_set_free(WriteSet); + + return IsReadOnly; +} + void ScopArrayInfo::updateElementType(Type *NewElementType) { if (NewElementType == ElementType) return; diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 48444d60aa0c..3466b231f0a8 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -1085,16 +1085,23 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset)); DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); } - - Instruction *Param = new AllocaInst( - Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), - EntryBlock->getTerminator()); - Builder.CreateStore(DevArray, Param); Value *Slot = Builder.CreateGEP( Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - Value *ParamTyped = - Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); + + if (gpu_array_is_read_only_scalar(&Prog->array[i])) { + Value *ValPtr = BlockGen.getOrCreateAlloca(SAI); + Value *ValPtrCast = + Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); + Builder.CreateStore(ValPtrCast, Slot); + } else { + Instruction *Param = new AllocaInst( + Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), + EntryBlock->getTerminator()); + Builder.CreateStore(DevArray, Param); + Value *ParamTyped = + Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); + Builder.CreateStore(ParamTyped, Slot); + } Index++; } @@ -1255,7 +1262,13 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, if 
(!ppcg_kernel_requires_array_argument(Kernel, i)) continue; - Args.push_back(Builder.getInt8PtrTy()); + if (gpu_array_is_read_only_scalar(&Prog->array[i])) { + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); + Args.push_back(SAI->getElementType()); + } else { + Args.push_back(Builder.getInt8PtrTy()); + } } int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); @@ -1382,11 +1395,15 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { continue; } + Value *Val = &*Arg; + + if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { + Type *TypePtr = SAI->getElementType()->getPointerTo(); + Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); + Val = Builder.CreateLoad(TypedArgPtr); + } + Value *Alloca = BlockGen.getOrCreateAlloca(SAI); - Value *ArgPtr = &*Arg; - Type *TypePtr = SAI->getElementType()->getPointerTo(); - Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); - Value *Val = Builder.CreateLoad(TypedArgPtr); Builder.CreateStore(Val, Alloca); Arg++; @@ -1938,7 +1955,8 @@ public: PPCGArray.n_ref = 0; PPCGArray.refs = nullptr; PPCGArray.accessed = true; - PPCGArray.read_only_scalar = false; + PPCGArray.read_only_scalar = + Array->isReadOnly() && Array->getNumberOfDimensions() == 0; PPCGArray.has_compound_element = false; PPCGArray.local = false; PPCGArray.declare_local = false; diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index da4a7fc0df89..d43b971fbabb 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -100,8 +100,8 @@ ; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8* ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304) ; IR-NEXT: [[DevPtr:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[ParamSlot:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]] ; IR-NEXT: call i8* @polly_getKernel diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll index 171b5a3a9526..5ed555baff26 100644 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll @@ -47,11 +47,15 @@ ; KERNEL-NEXT: } -; IR: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) +; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] -; IR: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 +; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) +; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 +; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** 
[[SLOT]]
diff --git a/polly/test/GPGPU/scalar-parameter-half.ll b/polly/test/GPGPU/scalar-parameter-half.ll
new file mode 100644
index 000000000000..005b4394e5c5
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-half.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; REQUIRES: pollyacc
+
+; XFAIL: *
+
+; This fails today with "unexpected type" in the LLVM PTX backend.
+
+; void foo(half A[], half b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @half(half* %A, half %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i64 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds half, half* %A, i64 %i.0
+ %tmp3 = load half, half* %tmp, align 4
+ %tmp4 = fadd half %tmp3, %b
+ store half %tmp4, half* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i64 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
+
diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll
index ee2a4ebe66c1..46f38e955196 100644
--- a/polly/test/GPGPU/scalar-parameter.ll
+++ b/polly/test/GPGPU/scalar-parameter.ll
@@ -12,68 +12,19 @@

 ; REQUIRES: pollyacc

-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(half), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(half), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(half), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(half A[], half b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @half(half* %A, half %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds half, half* %A, i64 %i.0
- %tmp3 = load half, half* %tmp, align 4
- %tmp4 = fadd half %tmp3, %b
- store half %tmp4, half* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i8* %MemRef_b)
+; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, float %MemRef_b)

 ; CODE: Code
 ; CODE-NEXT: ====
 ; CODE-NEXT: # host
 ; CODE-NEXT: {
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
 ; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

@@ -114,23 +65,20 @@ bb7: ; preds = %bb1
 ret void
 }

-; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i8* %MemRef_b)
+; KERNEL: define ptx_kernel void 
@kernel_0(i8* %MemRef_A, double %MemRef_b)
 ; KERNEL-NEXT: entry:
-; KERNEL-NEXT: %b.s2a = alloca float
-; KERNEL-NEXT: %0 = bitcast i8* %MemRef_b to float*
-; KERNEL-NEXT: %1 = load float, float* %0
-; KERNEL-NEXT: store float %1, float* %b.s2a
+; KERNEL-NEXT: %b.s2a = alloca double
+; KERNEL-NEXT: store double %MemRef_b, double* %b.s2a

 ; CODE: Code
 ; CODE-NEXT: ====
 ; CODE-NEXT: # host
 ; CODE-NEXT: {
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(double), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(32);
 ; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

@@ -321,8 +269,8 @@ bb7: ; preds = %bb1

 ; IR-LABEL: @i8
 ; IR: %1 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
-; IR-NEXT: store i8* %1, i8** %polly_launch_0_param_0
 ; IR-NEXT: %2 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; IR-NEXT: store i8* %1, i8** %polly_launch_0_param_0
 ; IR-NEXT: %3 = bitcast i8** %polly_launch_0_param_0 to i8*
 ; IR-NEXT: store i8* %3, i8** %2
 ; IR-NEXT: store i8 %b, i8* %polly_launch_0_param_1
diff --git a/polly/test/GPGPU/scheduler-timeout.ll b/polly/test/GPGPU/scheduler-timeout.ll
index 727e04a49e1e..61dfd2dc8f5f 100644
--- a/polly/test/GPGPU/scheduler-timeout.ll
+++ b/polly/test/GPGPU/scheduler-timeout.ll
@@ -32,22 +32,20 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CODE-NEXT:# host
 ; CODE-NEXT: {
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_alpha, &MemRef_alpha, sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_beta, &MemRef_beta, sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
 ; CODE-NEXT: {
 ; CODE-NEXT: dim3 k0_dimBlock(16, 32);
 ; CODE-NEXT: dim3 k0_dimGrid(128, 128);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, dev_MemRef_alpha, dev_MemRef_B);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

 ; CODE: {
 ; CODE-NEXT: dim3 k1_dimBlock(16, 32);
 ; CODE-NEXT: dim3 k1_dimGrid(128, 128);
-; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, dev_MemRef_beta, dev_MemRef_C);
+; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C);
 ; CODE-NEXT: cudaCheckKernel();
 ; CODE-NEXT: }

diff --git a/polly/test/GPGPU/shared-memory-scalar.ll b/polly/test/GPGPU/shared-memory-scalar.ll
index d10a6acbd5c1..f28be873c6df 100644
--- a/polly/test/GPGPU/shared-memory-scalar.ll
+++ b/polly/test/GPGPU/shared-memory-scalar.ll
@@ -3,11 +3,6 @@
 ; RUN: -disable-output < %s | \
 ; RUN: FileCheck -check-prefix=CODE %s

-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
 ; REQUIRES: pollyacc

 ; void add(float *A, float 
alpha) {
@@ -16,22 +11,17 @@
 ; A[i] += alpha;
 ; }

-; CODE: read(t0);
-; CODE-NEXT: if (t0 == 0)
-; CODE-NEXT: read();
+; CODE: read(t0);
 ; CODE-NEXT: sync0();
 ; CODE-NEXT: for (int c3 = 0; c3 <= 9; c3 += 1)
 ; CODE-NEXT: Stmt_bb5(t0, c3);
 ; CODE-NEXT: sync1();
 ; CODE-NEXT: write(t0);
-
-; KERNEL: @shared_MemRef_alpha = internal addrspace(3) global float 0.000000e+00, align 4
-
-; KERNEL: %polly.access.cast.MemRef_alpha = bitcast i8* %MemRef_alpha to float*
-; KERNEL-NEXT: %shared.read1 = load float, float* %polly.access.cast.MemRef_alpha
-; KERNEL-NEXT: store float %shared.read1, float addrspace(3)* @shared_MemRef_alpha
-
+; This test case was originally intended to test code generation for scalars
+; stored in shared memory. However, after properly marking the scalar as
+; read-only, the scalar is no longer stored in shared memory. We keep this test
+; case as documentation in case we ever forget to mark scalars as read-only.
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"