GPGPU: Store back non-read-only scalars

We may generate GPU kernels that store into scalars in case we run some
sequential code on the GPU because the remaining data is expected to already be
on the GPU. For these kernels it is important to not keep the scalar values
in thread-local registers, but to store them back to the corresponding device
memory objects that back them up.

We currently only store scalars back at the end of a kernel. This is only
correct if precisely one thread is executed. In case more than one thread may
be run, we currently invalidate the scop. To support such cases correctly,
we would need to always load and store back from a corresponding global
memory slot instead of a thread-local alloca slot.

llvm-svn: 281838
This commit is contained in:
Tobias Grosser 2016-09-17 19:22:31 +00:00
parent fe74a7a1f5
commit 51dfc27589
2 changed files with 231 additions and 2 deletions

View File

@ -379,6 +379,15 @@ private:
/// @returns The Assembly string of the kernel.
std::string finalizeKernelFunction();
/// Finalize the generation of the kernel arguments.
///
/// This function ensures that not-read-only scalars used in a kernel are
/// stored back to the global memory location they are backed up with before
/// the kernel terminates.
///
/// @param Kernel The kernel to finalize kernel arguments for.
void finalizeKernelArguments(ppcg_kernel *Kernel);
/// Create code that allocates memory to store arrays on device.
void allocateDeviceArrays();
@ -1198,13 +1207,13 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
create(isl_ast_node_copy(Kernel->tree));
finalizeKernelArguments(Kernel);
Function *F = Builder.GetInsertBlock()->getParent();
addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
clearDominators(F);
clearScalarEvolution(F);
clearLoops(F);
Builder.SetInsertPoint(&HostInsertPoint);
IDToValue = HostIDs;
ValueMap = std::move(HostValueMap);
@ -1217,9 +1226,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
LocalArrays.clear();
std::string ASMString = finalizeKernelFunction();
Builder.SetInsertPoint(&HostInsertPoint);
Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
std::string ASMString = finalizeKernelFunction();
std::string Name = "kernel_" + std::to_string(Kernel->id);
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
@ -1410,6 +1420,49 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
}
}
/// Store writable scalars back through their kernel argument pointers.
///
/// While a kernel runs, scalar values live in thread-local alloca slots.
/// Before the kernel terminates, copy each non-read-only scalar back through
/// the corresponding (global memory) kernel argument, so the device buffer
/// that backs the scalar observes the final value.
void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  auto *KernelFn = Builder.GetInsertBlock()->getParent();
  auto ArgIt = KernelFn->arg_begin();
  bool AnyScalarStored = false;

  for (long Idx = 0; Idx < Kernel->n_array; Idx++) {
    // Arrays that do not materialize as kernel arguments consume no argument
    // slot, so the argument iterator must not be advanced for them.
    if (!ppcg_kernel_requires_array_argument(Kernel, Idx))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[Idx].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    // Only writable scalars need a store-back. Real arrays and read-only
    // scalars keep their argument slot, but require no action.
    if (SAI->getNumberOfDimensions() > 0 ||
        gpu_array_is_read_only_scalar(&Prog->array[Idx])) {
      ArgIt++;
      continue;
    }

    // Load the thread-local value and store it through the argument pointer,
    // cast to the scalar's element type.
    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *TypedArgPtr = Builder.CreatePointerCast(
        &*ArgIt, SAI->getElementType()->getPointerTo());
    Builder.CreateStore(Builder.CreateLoad(Alloca), TypedArgPtr);
    AnyScalarStored = true;
    ArgIt++;
  }

  // Storing scalars only at the end of the kernel is correct solely when a
  // single thread executes. If this kernel may run multiple threads, bail
  // out on the whole scop rather than emit potentially incorrect code.
  if (AnyScalarStored)
    if (Kernel->n_block != 0 || Kernel->n_grid != 0)
      BuildSuccessful = 0;
}
void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();

View File

@ -0,0 +1,176 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-IR
;
; REQUIRES: pollyacc
;
; #include <stdio.h>
;
; float foo(float A[]) {
; float sum = 0;
;
; for (long i = 0; i < 32; i++)
; A[i] = i;
;
; for (long i = 0; i < 32; i++)
; A[i] += i;
;
; for (long i = 0; i < 32; i++)
; sum += A[i];
;
; return sum;
; }
;
; int main() {
; float A[32];
; float sum = foo(A);
; printf("%f\n", sum);
; }
; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(1);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: {
; CODE-NEXT: dim3 k1_dimBlock;
; CODE-NEXT: dim3 k1_dimGrid;
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_sum_0__phi);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: for (int c0 = 0; c0 <= 32; c0 += 1) {
; CODE-NEXT: {
; CODE-NEXT: dim3 k2_dimBlock;
; CODE-NEXT: dim3 k2_dimGrid;
; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: if (c0 <= 31)
; CODE-NEXT: {
; CODE-NEXT: dim3 k3_dimBlock;
; CODE-NEXT: dim3 k3_dimGrid;
; CODE-NEXT: kernel3 <<<k3_dimGrid, k3_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: }
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
; CODE: # kernel0
; CODE-NEXT: {
; CODE-NEXT: Stmt_bb4(t0);
; CODE-NEXT: Stmt_bb10(t0);
; CODE-NEXT: }
; CODE: # kernel1
; CODE-NEXT: Stmt_bb17();
; CODE: # kernel2
; CODE-NEXT: Stmt_bb18(c0);
; CODE: # kernel3
; CODE-NEXT: Stmt_bb20(c0);
; KERNEL-IR: store float %p_tmp23, float* %sum.0.phiops
; KERNEL-IR-NEXT: [[REGA:%.+]] = bitcast i8* %MemRef_sum_0__phi to float*
; KERNEL-IR-NEXT: [[REGB:%.+]] = load float, float* %sum.0.phiops
; KERNEL-IR-NEXT: store float [[REGB]], float* [[REGA]]
; KERNEL-IR-NEXT: [[REGC:%.+]] = bitcast i8* %MemRef_sum_0 to float*
; KERNEL-IR-NEXT: [[REGD:%.+]] = load float, float* %sum.0.s2a
; KERNEL-IR-NEXT: store float [[REGD]], float* [[REGC]]
; KERNEL-IR-NEXT: ret void
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1
; float foo(float* A):
;   Loop 1 (bb3..bb6):   A[i] = (float)i   for i in [0, 32)
;   Loop 2 (bb9..bb15):  A[i] += (float)i  for i in [0, 32)
;   Loop 3 (bb18..bb20): sum += A[i]       for i in [0, 32)
; Returns the final reduction value %sum.0.lcssa.
; NOTE(review): block and value names (bb4, bb10, sum.0, ...) feed the
; FileCheck expectations above via the generated Stmt_/MemRef_ names —
; do not rename anything in this function.
define float @foo(float* %A) {
bb:
br label %bb3
; Loop 1 header: i.0 counts 0..31; exit to bb8 when i.0 == 32.
bb3: ; preds = %bb6, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
%exitcond2 = icmp ne i64 %i.0, 32
br i1 %exitcond2, label %bb4, label %bb8
; Loop 1 body: A[i] = (float)i.
bb4: ; preds = %bb3
%tmp = sitofp i64 %i.0 to float
%tmp5 = getelementptr inbounds float, float* %A, i64 %i.0
store float %tmp, float* %tmp5, align 4
br label %bb6
; Loop 1 latch: i++.
bb6: ; preds = %bb4
%tmp7 = add nuw nsw i64 %i.0, 1
br label %bb3
bb8: ; preds = %bb3
br label %bb9
; Loop 2 header: i1.0 counts 0..31; exit to bb17 when i1.0 == 32.
bb9: ; preds = %bb15, %bb8
%i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ]
%exitcond1 = icmp ne i64 %i1.0, 32
br i1 %exitcond1, label %bb10, label %bb17
; Loop 2 body: A[i] += (float)i.
bb10: ; preds = %bb9
%tmp11 = sitofp i64 %i1.0 to float
%tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
%tmp13 = load float, float* %tmp12, align 4
%tmp14 = fadd float %tmp13, %tmp11
store float %tmp14, float* %tmp12, align 4
br label %bb15
; Loop 2 latch: i++.
bb15: ; preds = %bb10
%tmp16 = add nuw nsw i64 %i1.0, 1
br label %bb9
bb17: ; preds = %bb9
br label %bb18
; Loop 3 header: scalar reduction over A; sum.0 is the running total that
; the GPU code generator must store back to device memory (MemRef_sum_0).
bb18: ; preds = %bb20, %bb17
%sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ]
%i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ]
%exitcond = icmp ne i64 %i2.0, 32
br i1 %exitcond, label %bb19, label %bb25
bb19: ; preds = %bb18
br label %bb20
; Loop 3 body: sum += A[i].
bb20: ; preds = %bb19
%tmp21 = getelementptr inbounds float, float* %A, i64 %i2.0
%tmp22 = load float, float* %tmp21, align 4
%tmp23 = fadd float %sum.0, %tmp22
%tmp24 = add nuw nsw i64 %i2.0, 1
br label %bb18
; Exit: forward the reduction result out of the loop (LCSSA form).
bb25: ; preds = %bb18
%sum.0.lcssa = phi float [ %sum.0, %bb18 ]
ret float %sum.0.lcssa
}
; int main(): allocate A[32] on the stack, compute sum = foo(A), and print
; it with the "%f\n" format string declared above.
define i32 @main() {
bb:
%A = alloca [32 x float], align 16
%tmp = getelementptr inbounds [32 x float], [32 x float]* %A, i64 0, i64 0
%tmp1 = call float @foo(float* %tmp)
; printf takes a double vararg, so extend the float result first.
%tmp2 = fpext float %tmp1 to double
%tmp3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %tmp2) #2
ret i32 0
}
declare i32 @printf(i8*, ...) #1