GPGPU: Store back non-read-only scalars

We may generate GPU kernels that store into scalars in case we run some
sequential code on the GPU because the remaining data is expected to already be
on the GPU. For these kernels it is important to not keep the scalar values
in thread-local registers, but to store them back to the corresponding device
memory objects that back them up.

We currently only store scalars back at the end of a kernel. This is only
correct if precisely one thread is executed. In case more than one thread may
be run, we currently invalidate the scop. To support such cases correctly,
we would need to always load and store back from a corresponding global
memory slot instead of a thread-local alloca slot.

llvm-svn: 281838
This commit is contained in:
Tobias Grosser 2016-09-17 19:22:31 +00:00
parent fe74a7a1f5
commit 51dfc27589
2 changed files with 231 additions and 2 deletions

View File

@ -379,6 +379,15 @@ private:
/// @returns The Assembly string of the kernel.
std::string finalizeKernelFunction();
/// Finalize the generation of the kernel arguments.
///
/// This function ensures that not-read-only scalars used in a kernel are
/// stored back to the global memory location they are backed up with before
/// the kernel terminates.
///
/// @param Kernel The kernel to finalize kernel arguments for.
void finalizeKernelArguments(ppcg_kernel *Kernel);
/// Create code that allocates memory to store arrays on device.
void allocateDeviceArrays();
@ -1198,13 +1207,13 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
create(isl_ast_node_copy(Kernel->tree));
finalizeKernelArguments(Kernel);
Function *F = Builder.GetInsertBlock()->getParent();
addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
clearDominators(F);
clearScalarEvolution(F);
clearLoops(F);
Builder.SetInsertPoint(&HostInsertPoint);
IDToValue = HostIDs;
ValueMap = std::move(HostValueMap);
@ -1217,9 +1226,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
LocalArrays.clear();
std::string ASMString = finalizeKernelFunction();
Builder.SetInsertPoint(&HostInsertPoint);
Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
std::string ASMString = finalizeKernelFunction();
std::string Name = "kernel_" + std::to_string(Kernel->id);
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
@ -1410,6 +1420,49 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
}
}
/// Store writable scalars back through their kernel argument pointers.
///
/// While a kernel runs, scalar values live in thread-local alloca slots.
/// Before the kernel terminates, copy each non-read-only scalar back through
/// the corresponding (global memory) kernel argument, so the device buffer
/// that backs the scalar observes the final value.
void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  auto *KernelFn = Builder.GetInsertBlock()->getParent();
  auto ArgIt = KernelFn->arg_begin();
  bool AnyScalarStored = false;

  for (long Idx = 0; Idx < Kernel->n_array; Idx++) {
    // Arrays that do not materialize as kernel arguments consume no argument
    // slot, so the argument iterator must not be advanced for them.
    if (!ppcg_kernel_requires_array_argument(Kernel, Idx))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[Idx].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    // Only writable scalars need a store-back. Real arrays and read-only
    // scalars keep their argument slot, but require no action.
    if (SAI->getNumberOfDimensions() > 0 ||
        gpu_array_is_read_only_scalar(&Prog->array[Idx])) {
      ArgIt++;
      continue;
    }

    // Load the thread-local value and store it through the argument pointer,
    // cast to the scalar's element type.
    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *TypedArgPtr = Builder.CreatePointerCast(
        &*ArgIt, SAI->getElementType()->getPointerTo());
    Builder.CreateStore(Builder.CreateLoad(Alloca), TypedArgPtr);
    AnyScalarStored = true;
    ArgIt++;
  }

  // Storing scalars only at the end of the kernel is correct solely when a
  // single thread executes. If this kernel may run multiple threads, bail
  // out on the whole scop rather than emit potentially incorrect code.
  if (AnyScalarStored)
    if (Kernel->n_block != 0 || Kernel->n_grid != 0)
      BuildSuccessful = 0;
}
void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();

View File

@ -0,0 +1,176 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-IR
;
; REQUIRES: pollyacc
;
; #include <stdio.h>
;
; float foo(float A[]) {
; float sum = 0;
;
; for (long i = 0; i < 32; i++)
; A[i] = i;
;
; for (long i = 0; i < 32; i++)
; A[i] += i;
;
; for (long i = 0; i < 32; i++)
; sum += A[i];
;
; return sum;
; }
;
; int main() {
; float A[32];
; float sum = foo(A);
; printf("%f\n", sum);
; }
; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(1);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: {
; CODE-NEXT: dim3 k1_dimBlock;
; CODE-NEXT: dim3 k1_dimGrid;
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_sum_0__phi);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: for (int c0 = 0; c0 <= 32; c0 += 1) {
; CODE-NEXT: {
; CODE-NEXT: dim3 k2_dimBlock;
; CODE-NEXT: dim3 k2_dimGrid;
; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: if (c0 <= 31)
; CODE-NEXT: {
; CODE-NEXT: dim3 k3_dimBlock;
; CODE-NEXT: dim3 k3_dimGrid;
; CODE-NEXT: kernel3 <<<k3_dimGrid, k3_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: }
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
; CODE: # kernel0
; CODE-NEXT: {
; CODE-NEXT: Stmt_bb4(t0);
; CODE-NEXT: Stmt_bb10(t0);
; CODE-NEXT: }
; CODE: # kernel1
; CODE-NEXT: Stmt_bb17();
; CODE: # kernel2
; CODE-NEXT: Stmt_bb18(c0);
; CODE: # kernel3
; CODE-NEXT: Stmt_bb20(c0);
; KERNEL-IR: store float %p_tmp23, float* %sum.0.phiops
; KERNEL-IR-NEXT: [[REGA:%.+]] = bitcast i8* %MemRef_sum_0__phi to float*
; KERNEL-IR-NEXT: [[REGB:%.+]] = load float, float* %sum.0.phiops
; KERNEL-IR-NEXT: store float [[REGB]], float* [[REGA]]
; KERNEL-IR-NEXT: [[REGC:%.+]] = bitcast i8* %MemRef_sum_0 to float*
; KERNEL-IR-NEXT: [[REGD:%.+]] = load float, float* %sum.0.s2a
; KERNEL-IR-NEXT: store float [[REGD]], float* [[REGC]]
; KERNEL-IR-NEXT: ret void
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1
; float foo(float* A):
;   Loop 1 (bb3..bb6):   A[i] = (float)i   for i in [0, 32)
;   Loop 2 (bb9..bb15):  A[i] += (float)i  for i in [0, 32)
;   Loop 3 (bb18..bb20): sum += A[i]       for i in [0, 32)
; Returns the final reduction value %sum.0.lcssa.
; NOTE(review): block and value names (bb4, bb10, sum.0, ...) feed the
; FileCheck expectations above via the generated Stmt_/MemRef_ names —
; do not rename anything in this function.
define float @foo(float* %A) {
bb:
br label %bb3
; Loop 1 header: i.0 counts 0..31; exit to bb8 when i.0 == 32.
bb3: ; preds = %bb6, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
%exitcond2 = icmp ne i64 %i.0, 32
br i1 %exitcond2, label %bb4, label %bb8
; Loop 1 body: A[i] = (float)i.
bb4: ; preds = %bb3
%tmp = sitofp i64 %i.0 to float
%tmp5 = getelementptr inbounds float, float* %A, i64 %i.0
store float %tmp, float* %tmp5, align 4
br label %bb6
; Loop 1 latch: i++.
bb6: ; preds = %bb4
%tmp7 = add nuw nsw i64 %i.0, 1
br label %bb3
bb8: ; preds = %bb3
br label %bb9
; Loop 2 header: i1.0 counts 0..31; exit to bb17 when i1.0 == 32.
bb9: ; preds = %bb15, %bb8
%i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ]
%exitcond1 = icmp ne i64 %i1.0, 32
br i1 %exitcond1, label %bb10, label %bb17
; Loop 2 body: A[i] += (float)i.
bb10: ; preds = %bb9
%tmp11 = sitofp i64 %i1.0 to float
%tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
%tmp13 = load float, float* %tmp12, align 4
%tmp14 = fadd float %tmp13, %tmp11
store float %tmp14, float* %tmp12, align 4
br label %bb15
; Loop 2 latch: i++.
bb15: ; preds = %bb10
%tmp16 = add nuw nsw i64 %i1.0, 1
br label %bb9
bb17: ; preds = %bb9
br label %bb18
; Loop 3 header: scalar reduction over A; sum.0 is the running total that
; the GPU code generator must store back to device memory (MemRef_sum_0).
bb18: ; preds = %bb20, %bb17
%sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ]
%i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ]
%exitcond = icmp ne i64 %i2.0, 32
br i1 %exitcond, label %bb19, label %bb25
bb19: ; preds = %bb18
br label %bb20
; Loop 3 body: sum += A[i].
bb20: ; preds = %bb19
%tmp21 = getelementptr inbounds float, float* %A, i64 %i2.0
%tmp22 = load float, float* %tmp21, align 4
%tmp23 = fadd float %sum.0, %tmp22
%tmp24 = add nuw nsw i64 %i2.0, 1
br label %bb18
; Exit: forward the reduction result out of the loop (LCSSA form).
bb25: ; preds = %bb18
%sum.0.lcssa = phi float [ %sum.0, %bb18 ]
ret float %sum.0.lcssa
}
; int main(): allocate A[32] on the stack, compute sum = foo(A), and print
; it with the "%f\n" format string declared above.
define i32 @main() {
bb:
%A = alloca [32 x float], align 16
%tmp = getelementptr inbounds [32 x float], [32 x float]* %A, i64 0, i64 0
%tmp1 = call float @foo(float* %tmp)
; printf takes a double vararg, so extend the float result first.
%tmp2 = fpext float %tmp1 to double
%tmp3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %tmp2) #2
ret i32 0
}
declare i32 @printf(i8*, ...) #1