GPGPU: Detect read-only scalar arrays and pass these by value rather than
by reference. llvm-svn: 281837
This commit is contained in:
parent
194470cd11
commit
fe74a7a1f5
|
@ -357,6 +357,9 @@ public:
|
|||
/// Get the space of this array access.
|
||||
__isl_give isl_space *getSpace() const;
|
||||
|
||||
/// Check whether the array is read-only, i.e., never written to in the SCoP.
|
||||
bool isReadOnly();
|
||||
|
||||
private:
|
||||
void addDerivedSAI(ScopArrayInfo *DerivedSAI) {
|
||||
DerivedSAIs.insert(DerivedSAI);
|
||||
|
|
|
@ -196,6 +196,18 @@ __isl_give isl_space *ScopArrayInfo::getSpace() const {
|
|||
return Space;
|
||||
}
|
||||
|
||||
bool ScopArrayInfo::isReadOnly() {
|
||||
isl_union_set *WriteSet = isl_union_map_range(S.getWrites());
|
||||
isl_space *Space = getSpace();
|
||||
WriteSet = isl_union_set_intersect(
|
||||
WriteSet, isl_union_set_from_set(isl_set_universe(Space)));
|
||||
|
||||
bool IsReadOnly = isl_union_set_is_empty(WriteSet);
|
||||
isl_union_set_free(WriteSet);
|
||||
|
||||
return IsReadOnly;
|
||||
}
|
||||
|
||||
void ScopArrayInfo::updateElementType(Type *NewElementType) {
|
||||
if (NewElementType == ElementType)
|
||||
return;
|
||||
|
|
|
@ -1085,16 +1085,23 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
|
|||
DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
|
||||
DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
|
||||
}
|
||||
|
||||
Instruction *Param = new AllocaInst(
|
||||
Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
|
||||
EntryBlock->getTerminator());
|
||||
Builder.CreateStore(DevArray, Param);
|
||||
Value *Slot = Builder.CreateGEP(
|
||||
Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
|
||||
Value *ParamTyped =
|
||||
Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
|
||||
Builder.CreateStore(ParamTyped, Slot);
|
||||
|
||||
if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
|
||||
Value *ValPtr = BlockGen.getOrCreateAlloca(SAI);
|
||||
Value *ValPtrCast =
|
||||
Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
|
||||
Builder.CreateStore(ValPtrCast, Slot);
|
||||
} else {
|
||||
Instruction *Param = new AllocaInst(
|
||||
Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
|
||||
EntryBlock->getTerminator());
|
||||
Builder.CreateStore(DevArray, Param);
|
||||
Value *ParamTyped =
|
||||
Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
|
||||
Builder.CreateStore(ParamTyped, Slot);
|
||||
}
|
||||
Index++;
|
||||
}
|
||||
|
||||
|
@ -1255,7 +1262,13 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
|
|||
if (!ppcg_kernel_requires_array_argument(Kernel, i))
|
||||
continue;
|
||||
|
||||
Args.push_back(Builder.getInt8PtrTy());
|
||||
if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
|
||||
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
|
||||
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
|
||||
Args.push_back(SAI->getElementType());
|
||||
} else {
|
||||
Args.push_back(Builder.getInt8PtrTy());
|
||||
}
|
||||
}
|
||||
|
||||
int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
|
||||
|
@ -1382,11 +1395,15 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
|
|||
continue;
|
||||
}
|
||||
|
||||
Value *Val = &*Arg;
|
||||
|
||||
if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
|
||||
Type *TypePtr = SAI->getElementType()->getPointerTo();
|
||||
Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
|
||||
Val = Builder.CreateLoad(TypedArgPtr);
|
||||
}
|
||||
|
||||
Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
|
||||
Value *ArgPtr = &*Arg;
|
||||
Type *TypePtr = SAI->getElementType()->getPointerTo();
|
||||
Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
|
||||
Value *Val = Builder.CreateLoad(TypedArgPtr);
|
||||
Builder.CreateStore(Val, Alloca);
|
||||
|
||||
Arg++;
|
||||
|
@ -1938,7 +1955,8 @@ public:
|
|||
PPCGArray.n_ref = 0;
|
||||
PPCGArray.refs = nullptr;
|
||||
PPCGArray.accessed = true;
|
||||
PPCGArray.read_only_scalar = false;
|
||||
PPCGArray.read_only_scalar =
|
||||
Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
|
||||
PPCGArray.has_compound_element = false;
|
||||
PPCGArray.local = false;
|
||||
PPCGArray.declare_local = false;
|
||||
|
|
|
@ -100,8 +100,8 @@
|
|||
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
|
||||
; IR-NEXT: [[DevPtr:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0
|
||||
; IR-NEXT: [[ParamSlot:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||
; IR-NEXT: store i8* [[DevPtr]], i8** %polly_launch_0_param_0
|
||||
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
|
||||
; IR-NEXT: call i8* @polly_getKernel
|
||||
|
|
|
@ -47,11 +47,15 @@
|
|||
; KERNEL-NEXT: }
|
||||
|
||||
|
||||
; IR: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
|
||||
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
|
||||
|
||||
; IR: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0
|
||||
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
|
||||
; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0
|
||||
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
|
||||
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
|
||||
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
|
||||
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; XFAIL: *
|
||||
|
||||
; This fails today with "unexpected type" in the LLVM PTX backend.
|
||||
|
||||
; void foo(half A[], half b) {
|
||||
; for (long i = 0; i < 1024; i++)
|
||||
; A[i] += b;
|
||||
; }
|
||||
;
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @half(half* %A, half %b) {
|
||||
bb:
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb5, %bb
|
||||
%i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
|
||||
%exitcond = icmp ne i64 %i.0, 1024
|
||||
br i1 %exitcond, label %bb2, label %bb7
|
||||
|
||||
bb2: ; preds = %bb1
|
||||
%tmp = getelementptr inbounds half, half* %A, i64 %i.0
|
||||
%tmp3 = load half, half* %tmp, align 4
|
||||
%tmp4 = fadd half %tmp3, %b
|
||||
store half %tmp4, half* %tmp, align 4
|
||||
br label %bb5
|
||||
|
||||
bb5: ; preds = %bb2
|
||||
%tmp6 = add nuw nsw i64 %i.0, 1
|
||||
br label %bb1
|
||||
|
||||
bb7: ; preds = %bb1
|
||||
ret void
|
||||
}
|
||||
|
|
@ -12,68 +12,19 @@
|
|||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; CODE: Code
|
||||
; CODE-NEXT: ====
|
||||
; CODE-NEXT: # host
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(half), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(half), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(32);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(half), cudaMemcpyDeviceToHost));
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: # kernel0
|
||||
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
|
||||
|
||||
; void foo(half A[], half b) {
|
||||
; for (long i = 0; i < 1024; i++)
|
||||
; A[i] += b;
|
||||
; }
|
||||
;
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @half(half* %A, half %b) {
|
||||
bb:
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb5, %bb
|
||||
%i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
|
||||
%exitcond = icmp ne i64 %i.0, 1024
|
||||
br i1 %exitcond, label %bb2, label %bb7
|
||||
|
||||
bb2: ; preds = %bb1
|
||||
%tmp = getelementptr inbounds half, half* %A, i64 %i.0
|
||||
%tmp3 = load half, half* %tmp, align 4
|
||||
%tmp4 = fadd half %tmp3, %b
|
||||
store half %tmp4, half* %tmp, align 4
|
||||
br label %bb5
|
||||
|
||||
bb5: ; preds = %bb2
|
||||
%tmp6 = add nuw nsw i64 %i.0, 1
|
||||
br label %bb1
|
||||
|
||||
bb7: ; preds = %bb1
|
||||
ret void
|
||||
}
|
||||
|
||||
; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i8* %MemRef_b)
|
||||
; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, float %MemRef_b)
|
||||
|
||||
; CODE: Code
|
||||
; CODE-NEXT: ====
|
||||
; CODE-NEXT: # host
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(32);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
|
@ -114,23 +65,20 @@ bb7: ; preds = %bb1
|
|||
ret void
|
||||
}
|
||||
|
||||
; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i8* %MemRef_b)
|
||||
; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, double %MemRef_b)
|
||||
; KERNEL-NEXT: entry:
|
||||
; KERNEL-NEXT: %b.s2a = alloca float
|
||||
; KERNEL-NEXT: %0 = bitcast i8* %MemRef_b to float*
|
||||
; KERNEL-NEXT: %1 = load float, float* %0
|
||||
; KERNEL-NEXT: store float %1, float* %b.s2a
|
||||
; KERNEL-NEXT: %b.s2a = alloca double
|
||||
; KERNEL-NEXT: store double %MemRef_b, double* %b.s2a
|
||||
|
||||
; CODE: Code
|
||||
; CODE-NEXT: ====
|
||||
; CODE-NEXT: # host
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(double), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(32);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
|
@ -321,8 +269,8 @@ bb7: ; preds = %bb1
|
|||
; IR-LABEL: @i8
|
||||
|
||||
; IR: %1 = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: store i8* %1, i8** %polly_launch_0_param_0
|
||||
; IR-NEXT: %2 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||
; IR-NEXT: store i8* %1, i8** %polly_launch_0_param_0
|
||||
; IR-NEXT: %3 = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||
; IR-NEXT: store i8* %3, i8** %2
|
||||
; IR-NEXT: store i8 %b, i8* %polly_launch_0_param_1
|
||||
|
|
|
@ -32,22 +32,20 @@ target triple = "x86_64-unknown-linux-gnu"
|
|||
; CODE-NEXT:# host
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_alpha, &MemRef_alpha, sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_beta, &MemRef_beta, sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(16, 32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(128, 128);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, dev_MemRef_alpha, dev_MemRef_B);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: {
|
||||
; CODE-NEXT: dim3 k1_dimBlock(16, 32);
|
||||
; CODE-NEXT: dim3 k1_dimGrid(128, 128);
|
||||
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, dev_MemRef_beta, dev_MemRef_C);
|
||||
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
|
|
|
@ -3,11 +3,6 @@
|
|||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=CODE %s
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg \
|
||||
; RUN: -polly-acc-use-shared \
|
||||
; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
|
||||
; RUN: FileCheck -check-prefix=KERNEL %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; void add(float *A, float alpha) {
|
||||
|
@ -16,22 +11,17 @@
|
|||
; A[i] += alpha;
|
||||
; }
|
||||
|
||||
; CODE: read(t0);
|
||||
; CODE-NEXT: if (t0 == 0)
|
||||
; CODE-NEXT: read();
|
||||
; CODE: read(t0);
|
||||
; CODE-NEXT: sync0();
|
||||
; CODE-NEXT: for (int c3 = 0; c3 <= 9; c3 += 1)
|
||||
; CODE-NEXT: Stmt_bb5(t0, c3);
|
||||
; CODE-NEXT: sync1();
|
||||
; CODE-NEXT: write(t0);
|
||||
|
||||
|
||||
; KERNEL: @shared_MemRef_alpha = internal addrspace(3) global float 0.000000e+00, align 4
|
||||
|
||||
; KERNEL: %polly.access.cast.MemRef_alpha = bitcast i8* %MemRef_alpha to float*
|
||||
; KERNEL-NEXT: %shared.read1 = load float, float* %polly.access.cast.MemRef_alpha
|
||||
; KERNEL-NEXT: store float %shared.read1, float addrspace(3)* @shared_MemRef_alpha
|
||||
|
||||
; This test case was intended to test code generation for scalars stored
|
||||
; in shared memory. However, after properly marking the scalar as read-only
|
||||
; the scalar is not stored any more in shared memory. We still leave this
|
||||
; test case as documentation if we ever forget to mark scalars as read-only.
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
|
|
Loading…
Reference in New Issue