diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 3466b231f0a8..6b33db53607a 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -379,6 +379,15 @@ private: /// @returns The Assembly string of the kernel. std::string finalizeKernelFunction(); + /// Finalize the generation of the kernel arguments. + /// + /// This function ensures that not-read-only scalars used in a kernel are + /// stored back to the global memory location they are backed up with before + /// the kernel terminates. + /// + /// @param Kernel The kernel to finalize kernel arguments for. + void finalizeKernelArguments(ppcg_kernel *Kernel); + /// Create code that allocates memory to store arrays on device. void allocateDeviceArrays(); @@ -1198,13 +1207,13 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { create(isl_ast_node_copy(Kernel->tree)); + finalizeKernelArguments(Kernel); Function *F = Builder.GetInsertBlock()->getParent(); addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); clearDominators(F); clearScalarEvolution(F); clearLoops(F); - Builder.SetInsertPoint(&HostInsertPoint); IDToValue = HostIDs; ValueMap = std::move(HostValueMap); @@ -1217,9 +1226,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); LocalArrays.clear(); + std::string ASMString = finalizeKernelFunction(); + Builder.SetInsertPoint(&HostInsertPoint); Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); - std::string ASMString = finalizeKernelFunction(); std::string Name = "kernel_" + std::to_string(Kernel->id); Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); @@ -1410,6 +1420,49 @@ void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { } } +void 
GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { + auto *FN = Builder.GetInsertBlock()->getParent(); + auto Arg = FN->arg_begin(); + + bool StoredScalar = false; + for (long i = 0; i < Kernel->n_array; i++) { + if (!ppcg_kernel_requires_array_argument(Kernel, i)) + continue; + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); + isl_id_free(Id); + + if (SAI->getNumberOfDimensions() > 0) { + Arg++; + continue; + } + + if (gpu_array_is_read_only_scalar(&Prog->array[i])) { + Arg++; + continue; + } + + Value *Alloca = BlockGen.getOrCreateAlloca(SAI); + Value *ArgPtr = &*Arg; + Type *TypePtr = SAI->getElementType()->getPointerTo(); + Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); + Value *Val = Builder.CreateLoad(Alloca); + Builder.CreateStore(Val, TypedArgPtr); + StoredScalar = true; + + Arg++; + } + + if (StoredScalar) + /// In case more than one thread contains scalar stores, the generated + /// code might be incorrect, if we only store at the end of the kernel. + /// To support this case we need to store these scalars back at each + /// memory store or at least before each kernel barrier. 
+ if (Kernel->n_block != 0 || Kernel->n_grid != 0) + BuildSuccessful = 0; +} + void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll new file mode 100644 index 000000000000..bebf021d537a --- /dev/null +++ b/polly/test/GPGPU/non-read-only-scalars.ll @@ -0,0 +1,176 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-IR +; +; REQUIRES: pollyacc +; +; #include +; +; float foo(float A[]) { +; float sum = 0; +; +; for (long i = 0; i < 32; i++) +; A[i] = i; +; +; for (long i = 0; i < 32; i++) +; A[i] += i; +; +; for (long i = 0; i < 32; i++) +; sum += A[i]; +; +; return sum; +; } +; +; int main() { +; float A[32]; +; float sum = foo(A); +; printf("%f\n", sum); +; } + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(32); +; CODE-NEXT: dim3 k0_dimGrid(1); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k1_dimBlock; +; CODE-NEXT: dim3 k1_dimGrid; +; CODE-NEXT: kernel1 <<>> (dev_MemRef_sum_0__phi); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: for (int c0 = 0; c0 <= 32; c0 += 1) { +; CODE-NEXT: { +; CODE-NEXT: dim3 k2_dimBlock; +; CODE-NEXT: dim3 k2_dimGrid; +; CODE-NEXT: kernel2 <<>> (dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: if (c0 <= 31) +; CODE-NEXT: { +; CODE-NEXT: dim3 k3_dimBlock; +; CODE-NEXT: dim3 k3_dimGrid; +; CODE-NEXT: kernel3 <<>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0, c0); +; CODE-NEXT: cudaCheckKernel(); +; 
CODE-NEXT: } + +; CODE: } +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: { +; CODE-NEXT: Stmt_bb4(t0); +; CODE-NEXT: Stmt_bb10(t0); +; CODE-NEXT: } + +; CODE: # kernel1 +; CODE-NEXT: Stmt_bb17(); + +; CODE: # kernel2 +; CODE-NEXT: Stmt_bb18(c0); + +; CODE: # kernel3 +; CODE-NEXT: Stmt_bb20(c0); + +; KERNEL-IR: store float %p_tmp23, float* %sum.0.phiops +; KERNEL-IR-NEXT: [[REGA:%.+]] = bitcast i8* %MemRef_sum_0__phi to float* +; KERNEL-IR-NEXT: [[REGB:%.+]] = load float, float* %sum.0.phiops +; KERNEL-IR-NEXT: store float [[REGB]], float* [[REGA]] +; KERNEL-IR-NEXT: [[REGC:%.+]] = bitcast i8* %MemRef_sum_0 to float* +; KERNEL-IR-NEXT: [[REGD:%.+]] = load float, float* %sum.0.s2a +; KERNEL-IR-NEXT: store float [[REGD]], float* [[REGC]] +; KERNEL-IR-NEXT: ret void + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1 + +define float @foo(float* %A) { +bb: + br label %bb3 + +bb3: ; preds = %bb6, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] + %exitcond2 = icmp ne i64 %i.0, 32 + br i1 %exitcond2, label %bb4, label %bb8 + +bb4: ; preds = %bb3 + %tmp = sitofp i64 %i.0 to float + %tmp5 = getelementptr inbounds float, float* %A, i64 %i.0 + store float %tmp, float* %tmp5, align 4 + br label %bb6 + +bb6: ; preds = %bb4 + %tmp7 = add nuw nsw i64 %i.0, 1 + br label %bb3 + +bb8: ; preds = %bb3 + br label %bb9 + +bb9: ; preds = %bb15, %bb8 + %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ] + %exitcond1 = icmp ne i64 %i1.0, 32 + br i1 %exitcond1, label %bb10, label %bb17 + +bb10: ; preds = %bb9 + %tmp11 = sitofp i64 %i1.0 to float + %tmp12 = 
getelementptr inbounds float, float* %A, i64 %i1.0 + %tmp13 = load float, float* %tmp12, align 4 + %tmp14 = fadd float %tmp13, %tmp11 + store float %tmp14, float* %tmp12, align 4 + br label %bb15 + +bb15: ; preds = %bb10 + %tmp16 = add nuw nsw i64 %i1.0, 1 + br label %bb9 + +bb17: ; preds = %bb9 + br label %bb18 + +bb18: ; preds = %bb20, %bb17 + %sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ] + %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ] + %exitcond = icmp ne i64 %i2.0, 32 + br i1 %exitcond, label %bb19, label %bb25 + +bb19: ; preds = %bb18 + br label %bb20 + +bb20: ; preds = %bb19 + %tmp21 = getelementptr inbounds float, float* %A, i64 %i2.0 + %tmp22 = load float, float* %tmp21, align 4 + %tmp23 = fadd float %sum.0, %tmp22 + %tmp24 = add nuw nsw i64 %i2.0, 1 + br label %bb18 + +bb25: ; preds = %bb18 + %sum.0.lcssa = phi float [ %sum.0, %bb18 ] + ret float %sum.0.lcssa +} + +define i32 @main() { +bb: + %A = alloca [32 x float], align 16 + %tmp = getelementptr inbounds [32 x float], [32 x float]* %A, i64 0, i64 0 + %tmp1 = call float @foo(float* %tmp) + %tmp2 = fpext float %tmp1 to double + %tmp3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %tmp2) #2 + ret i32 0 +} + +declare i32 @printf(i8*, ...) #1 +