[CUDA] Fix dynamic|guided scheduling.

The existing implementation of dynamic scheduling breaks the
contract established by the original OpenMP runtime and is
therefore incorrect. This patch fixes it and introduces a
correct dynamic scheduling model.

Thanks to Alexey Bataev for submitting this patch.

Differential Revision: https://reviews.llvm.org/D47333

llvm-svn: 333225
George Rokos 2018-05-24 21:12:41 +00:00
parent 93afb0598e
commit 6da6f433a0
1 changed file with 50 additions and 57 deletions

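The core of the change is visible in the hunks that touch dispatch_init and DynamicNextChunk below: the old implementation handed out chunks by counting per-thread "events" through a CounterGroup, and each thread had to discover on its own that the loop was exhausted; the new model keeps a single shared lower bound per team and lets every thread claim its next chunk with one atomicAdd. The following is a minimal, standalone sketch of that claiming scheme for a normalized loop with an inclusive upper bound; the names (nextLowerBound, claimChunk, worker) and the launch configuration are invented for illustration and are not the runtime's own.

    enum ChunkState { NOT_FINISHED, LAST_CHUNK, FINISHED };

    // One counter per team holds the lower bound of the next unclaimed chunk.
    __device__ unsigned long long nextLowerBound;

    __device__ ChunkState claimChunk(unsigned long long chunkSize,
                                     unsigned long long loopUpperBound,
                                     unsigned long long &lb,
                                     unsigned long long &ub) {
      // atomicAdd returns the previous value exactly once per caller, so
      // every thread, in any warp, receives a disjoint [lb, ub] range.
      lb = atomicAdd(&nextLowerBound, chunkSize);
      ub = lb + chunkSize - 1; // inclusive bound: Clang emits i <= ub
      if (ub < loopUpperBound)
        return NOT_FINISHED; // whole chunk lies inside the iteration space
      if (lb <= loopUpperBound)
        return LAST_CHUNK; // chunk straddles the end; trim it, then stop
      return FINISHED; // iteration space was exhausted before this claim
    }

    __global__ void worker(unsigned long long loopUpperBound,
                           unsigned long long chunkSize) {
      unsigned long long lb, ub;
      ChunkState state;
      while ((state = claimChunk(chunkSize, loopUpperBound, lb, ub)) !=
             FINISHED) {
        if (ub > loopUpperBound)
          ub = loopUpperBound; // trim the straddling last chunk
        for (unsigned long long i = lb; i <= ub; ++i) {
          // loop body runs here
        }
        if (state == LAST_CHUNK)
          break;
      }
    }

    int main() {
      unsigned long long zero = 0;
      cudaMemcpyToSymbol(nextLowerBound, &zero, sizeof(zero));
      worker<<<1, 64>>>(/*loopUpperBound=*/99, /*chunkSize=*/7);
      return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
    }

With loopUpperBound = 99 and chunkSize = 7, successive claims return lb = 0, 7, ..., 91, 98, 105, ...: lb = 91 gives ub = 97 (NOT_FINISHED), lb = 98 straddles the end (LAST_CHUNK, trimmed to 98..99), and every later claim starts past the bound (FINISHED). Note how the patch's FINISHED case parks lb at loopUpperBound + 2 and ub at loopUpperBound + 1, so the returned pair always reads as an empty range (lb > ub).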

@@ -215,7 +215,8 @@ public:
            schedule <= kmp_sched_ordered_last;
   }
 
-  INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st,
+  INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                    ST chunk) {
     int tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
@@ -282,18 +283,15 @@ public:
             "unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
     }
 
+    // save sched state
+    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
     // init schedules
     if (schedule == kmp_sched_static_chunk) {
       ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      // save ub
-      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -301,8 +299,8 @@ public:
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
@@ -310,11 +308,12 @@ public:
     } else if (schedule == kmp_sched_static_nochunk) {
       ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
-      // save sched state
-      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      // save ub
-      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -322,45 +321,50 @@ public:
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             omptarget_nvptx_threadPrivateContext->Stride(tid));
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (chunk < 1)
-        chunk = 1;
-      Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks
-      // but each thread (but one) must discover that it is last
-      eventNum += tnum;
-      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-      omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum;
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      if (GetThreadIdInBlock() == 0) {
+        if (chunk < 1)
+          chunk = 1;
+        int teamId = GetOmpTeamId();
+        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      }
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
-            "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", "
-            "events number = %llu\n",
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-            omptarget_nvptx_threadPrivateContext->Chunk(tid),
-            omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
     }
   }
 
   ////////////////////////////////////////////////////////////////////////////////
   // Support for dispatch next
 
-  INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg,
-                                     Counter priv, T &lb, T &ub,
-                                     Counter &chunkId, Counter &currentEvent,
-                                     T chunkSize, T loopUpperBound) {
-    // get next event atomically
-    Counter nextEvent = cg.Next();
-    // calculate chunk Id (priv was initialized upon entering the loop to
-    // 'start' == 'event')
-    chunkId = nextEvent - priv;
+  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+                                     Counter &loopLowerBound,
+                                     T loopUpperBound) {
     // calculate lower bound for all lanes in the warp
-    lb = chunkId * chunkSize; // this code assume normalization of LB
+    lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
     ub = lb + chunkSize - 1; // Clang uses i <= ub
     // 3 result cases:
@@ -368,9 +372,8 @@ public:
     //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
     //     NOT_FINISHED
     //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
-    currentEvent = nextEvent;
     // a.
-    if (ub <= loopUpperBound) {
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
       PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
             P64(ub), P64(loopUpperBound));
       return NOT_FINISHED;
@@ -383,7 +386,8 @@ public:
       return LAST_CHUNK;
     }
     // c. if we are here, we are in case 'c'
-    lb = loopUpperBound + 1;
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
     PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
           P64(ub), P64(loopUpperBound));
     return FINISHED;
@@ -437,29 +441,18 @@ public:
     ASSERT0(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "bad sched");
-    omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
     T myLb, myUb;
-    Counter chunkId;
-    // xxx current event is now local
-    omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup();
+    int teamId = GetOmpTeamId();
     int finished = DynamicNextChunk(
-        cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb,
-        chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid),
-        omptarget_nvptx_threadPrivateContext->Chunk(tid),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
-    if (finished == FINISHED) {
-      cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                  omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
-      cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                 omptarget_nvptx_threadPrivateContext->CurrentEvent(tid));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+    if (finished == FINISHED)
       return DISPATCH_FINISHED;
-    }
     // not finished (either not finished or last chunk)
-    *plast = (int32_t)(
-        myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+    *plast = (int32_t)(finished == LAST_CHUNK);
     *plower = myLb;
     *pupper = myUb;
     *pstride = 1;
@@ -491,7 +484,7 @@ EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
                                    int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
@@ -499,7 +492,7 @@ EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
                                     int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
@@ -507,7 +500,7 @@ EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
                                    int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
@@ -515,7 +508,7 @@ EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
                                     int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 // next
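For reference, the four wrappers above serve the dispatch contract that the commit message refers to: the compiler initializes the schedule once, then pulls chunks in a loop until dispatch_next reports completion. The sketch below shows that calling pattern; the declarations are paraphrased from this file with EXTERN expanded by hand, kmp_Indent is left opaque, and the kmp_sched_dynamic value is assumed to mirror the runtime's kmp_sched_t enum.

    #include <cstdint>

    struct kmp_Indent; // opaque source-location descriptor used by the runtime

    enum { kmp_sched_dynamic = 35 }; // assumed value from kmp_sched_t

    extern "C" {
    __device__ void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
                                           int32_t schedule, int32_t lb,
                                           int32_t ub, int32_t st,
                                           int32_t chunk);
    __device__ int32_t __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid,
                                              int32_t *plast, int32_t *plower,
                                              int32_t *pupper,
                                              int32_t *pstride);
    }

    // Roughly what the compiler emits for
    //   #pragma omp for schedule(dynamic, 4)
    //   for (int i = 0; i < 1000; ++i) { ... }
    __device__ void dispatch_loop(kmp_Indent *loc, int32_t tid) {
      // the loop is normalized to the inclusive range [0, 999]
      __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, /*lb=*/0,
                             /*ub=*/999, /*st=*/1, /*chunk=*/4);
      int32_t last, lb, ub, st;
      // each successful call hands back one claimed chunk [lb, ub]
      while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st)) {
        for (int32_t i = lb; i <= ub; i += st) {
          // loop body
        }
      }
    }

Under the new model, dispatch_init publishes the per-team bounds exactly once (thread 0 writes them between the two barriers), and dispatch_next reduces to one atomicAdd plus the three-way classification, which is also why *plast can now be derived directly from LAST_CHUNK instead of comparing the thread's upper bound against saved state.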