From 37d4156b1141ac688191b3be71cfe07563ec4c65 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Mon, 23 Jul 2018 13:52:12 +0000
Subject: [PATCH] [OPNEMP, NVPTX] Fixed sychronization construct + code
 cleanup.

Summary:
1. Fixed internal problem in `__kmpc_barrier` function: SPMD mode
synchronization function should be called only in L1 parallel level.
2. Removed some extra code for synchronization inside of the code, used
`__kmpc_barrier` instead.
3. Some code cleanup.

Reviewers: gtbercea, grokos

Subscribers: openmp-commits

Differential Revision: https://reviews.llvm.org/D49564

llvm-svn: 337691
---
 .../libomptarget/deviceRTLs/nvptx/src/loop.cu | 18 +++--------
 .../deviceRTLs/nvptx/src/reduction.cu         | 26 ++++-----------
 .../deviceRTLs/nvptx/src/supporti.h           |  3 +-
 .../libomptarget/deviceRTLs/nvptx/src/sync.cu | 32 ++++++++-----------
 4 files changed, 25 insertions(+), 54 deletions(-)

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 60818afdaf90..f3e475d7108b 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -240,12 +240,8 @@ public:
 
     // Process schedule.
     if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
-      if (OrderedSchedule(schedule)) {
-        if (isSPMDMode())
-          __syncthreads();
-        else
-          __kmpc_barrier(loc, threadId);
-      }
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
             (long)tnum, P64(tripCount), schedule);
@@ -338,10 +334,7 @@ public:
             omptarget_nvptx_threadPrivateContext->Stride(tid));
 
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       // save sched state
       int teamId = GetOmpTeamId();
       omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
@@ -352,10 +345,7 @@ public:
         omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
         omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
       }
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
             ", chunk %" PRIu64 "\n",
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index aedb6359f9ac..b813a11d20f4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() {
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();
-  if (tnum != (~0x0)) { // assume swapSize is 32
+  if (tnum != (~0x0)) // assume swapSize is 32
     return 0;
-  }
   return 1;
 }
 
@@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
 
   if (numthread == 1)
     return 1;
-  else if (!__gpu_block_reduce())
+  if (!__gpu_block_reduce())
     return 2;
-  else {
-    if (threadIdx.x == 0)
-      return 1;
-    else
-      return 0;
-  }
+  if (threadIdx.x == 0)
+    return 1;
+  return 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
-  if (threadIdx.x == 0) {
-    return 2;
-  } else {
-    return 0;
-  }
+  return threadIdx.x == 0 ? 2 : 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
-  if (threadIdx.x % 32 == 0) {
-    return 1;
-  } else {
-    return 0;
-  }
+  return (threadIdx.x % 32 == 0) ? 1 : 0;
 }
 
 EXTERN
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 2b267c359d69..4de2039e42ea 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 INLINE int GetNumberOfProcsInDevice() {
   if (isGenericMode())
     return GetNumberOfWorkersInTeam();
-  else
-    return GetNumberOfThreadsInBlock();
+  return GetNumberOfThreadsInBlock();
 }
 
 INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 68f08a16ac49..7e55df8ca71d 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  if (isSPMDMode()) {
-    __kmpc_barrier_simple_spmd(loc_ref, tid);
-  } else if (isRuntimeUninitialized()) {
-    __kmpc_barrier_simple_generic(loc_ref, tid);
+  if (isRuntimeUninitialized()) {
+    if (isSPMDMode())
+      __kmpc_barrier_simple_spmd(loc_ref, tid);
+    else
+      __kmpc_barrier_simple_generic(loc_ref, tid);
   } else {
     tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-    if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-      int numberOfActiveOMPThreads =
-          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      // On Volta and newer architectures we require that all lanes in
-      // a warp (at least, all present for the kernel launch) participate in the
-      // barrier.  This is enforced when launching the parallel region.  An
-      // exception is when there are < WARPSIZE workers.  In this case only 1
-      // worker is started, so we don't need a barrier.
-      if (numberOfActiveOMPThreads > 1) {
-#endif
+    int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+        tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
+    if (numberOfActiveOMPThreads > 1) {
+      if (isSPMDMode()) {
+        __kmpc_barrier_simple_spmd(loc_ref, tid);
+      } else {
         // The #threads parameter must be rounded up to the WARPSIZE.
         int threads =
             WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
@@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
               numberOfActiveOMPThreads, threads);
         // Barrier #1 is for synchronization among active threads.
         named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      } // numberOfActiveOMPThreads > 1
-#endif
-    }
+      }
+    } // numberOfActiveOMPThreads > 1
     PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
 }