From 37d4156b1141ac688191b3be71cfe07563ec4c65 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 23 Jul 2018 13:52:12 +0000 Subject: [PATCH] [OPNEMP, NVPTX] Fixed sychronization construct + code cleanup. Summary: 1. Fixed internal problem in `__kmpc_barrier` function: SPMD mode synchronization function should be called only in L1 parallel level. 2. Removed some extra code for synchronization inside of the code, used `__kmpc_barrier` instead. 3. Some code cleanup. Reviewers: gtbercea, grokos Subscribers: openmp-commits Differential Revision: https://reviews.llvm.org/D49564 llvm-svn: 337691 --- .../libomptarget/deviceRTLs/nvptx/src/loop.cu | 18 +++-------- .../deviceRTLs/nvptx/src/reduction.cu | 26 ++++----------- .../deviceRTLs/nvptx/src/supporti.h | 3 +- .../libomptarget/deviceRTLs/nvptx/src/sync.cu | 32 ++++++++----------- 4 files changed, 25 insertions(+), 54 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu index 60818afdaf90..f3e475d7108b 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -240,12 +240,8 @@ public: // Process schedule. if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) { - if (isSPMDMode()) - __syncthreads(); - else - __kmpc_barrier(loc, threadId); - } + if (OrderedSchedule(schedule)) + __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", (long)tnum, P64(tripCount), schedule); @@ -338,10 +334,7 @@ public: omptarget_nvptx_threadPrivateContext->Stride(tid)); } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - if (isSPMDMode()) - __syncthreads(); - else - __kmpc_barrier(loc, threadId); + __kmpc_barrier(loc, threadId); // save sched state int teamId = GetOmpTeamId(); omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; @@ -352,10 +345,7 @@ public: omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub; omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb; } - if (isSPMDMode()) - __syncthreads(); - else - __kmpc_barrier(loc, threadId); + __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 ", chunk %" PRIu64 "\n", diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu index aedb6359f9ac..b813a11d20f4 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() { if (nt != blockDim.x) return 0; unsigned tnum = __ACTIVEMASK(); - if (tnum != (~0x0)) { // assume swapSize is 32 + if (tnum != (~0x0)) // assume swapSize is 32 return 0; - } return 1; } @@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, if (numthread == 1) return 1; - else if (!__gpu_block_reduce()) + if (!__gpu_block_reduce()) return 2; - else { - if (threadIdx.x == 0) - return 1; - else - return 0; - } + if (threadIdx.x == 0) + return 1; + return 0; } EXTERN int32_t __kmpc_reduce_combined(kmp_Indent *loc) { - if (threadIdx.x == 0) { - return 2; - } else { - return 0; - } + return threadIdx.x == 0 ? 2 : 0; } EXTERN int32_t __kmpc_reduce_simd(kmp_Indent *loc) { - if (threadIdx.x % 32 == 0) { - return 1; - } else { - return 0; - } + return (threadIdx.x % 32 == 0) ? 1 : 0; } EXTERN diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h index 2b267c359d69..4de2039e42ea 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } INLINE int GetNumberOfProcsInDevice() { if (isGenericMode()) return GetNumberOfWorkersInTeam(); - else - return GetNumberOfThreadsInBlock(); + return GetNumberOfThreadsInBlock(); } INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu index 68f08a16ac49..7e55df8ca71d 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { } EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { - if (isSPMDMode()) { - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else if (isRuntimeUninitialized()) { - __kmpc_barrier_simple_generic(loc_ref, tid); + if (isRuntimeUninitialized()) { + if (isSPMDMode()) + __kmpc_barrier_simple_spmd(loc_ref, tid); + else + __kmpc_barrier_simple_generic(loc_ref, tid); } else { tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - if (!currTaskDescr->InL2OrHigherParallelRegion()) { - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 - // worker is started, so we don't need a barrier. - if (numberOfActiveOMPThreads > 1) { -#endif + int numberOfActiveOMPThreads = GetNumberOfOmpThreads( + tid, isSPMDMode(), /*isRuntimeUninitialized=*/false); + if (numberOfActiveOMPThreads > 1) { + if (isSPMDMode()) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else { // The #threads parameter must be rounded up to the WARPSIZE. int threads = WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); @@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { numberOfActiveOMPThreads, threads); // Barrier #1 is for synchronization among active threads. named_sync(L1_BARRIER, threads); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } // numberOfActiveOMPThreads > 1 -#endif - } + } + } // numberOfActiveOMPThreads > 1 PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } }