[CUDA] Fix dynamic|guided scheduling.

The existing implementation of dynamic scheduling breaks the
contract established by the original OpenMP runtime and is
therefore incorrect. This patch fixes it and introduces a
correct dynamic scheduling model.

Thanks to Alexey Bataev for submitting this patch.

Differential Revision: https://reviews.llvm.org/D47333

llvm-svn: 333225
George Rokos 2018-05-24 21:12:41 +00:00
parent 93afb0598e
commit 6da6f433a0
1 changed file with 50 additions and 57 deletions

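The core of the change is visible in the hunks that touch dispatch_init and DynamicNextChunk below: the old implementation handed out chunks by counting per-thread "events" through a CounterGroup, and each thread had to discover on its own that the loop was exhausted; the new model keeps a single shared lower bound per team and lets every thread claim its next chunk with one atomicAdd. The following is a minimal, standalone sketch of that claiming scheme for a normalized loop with an inclusive upper bound; the names (nextLowerBound, claimChunk, worker) and the launch configuration are invented for illustration and are not the runtime's own.

    enum ChunkState { NOT_FINISHED, LAST_CHUNK, FINISHED };

    // One counter per team holds the lower bound of the next unclaimed chunk.
    __device__ unsigned long long nextLowerBound;

    __device__ ChunkState claimChunk(unsigned long long chunkSize,
                                     unsigned long long loopUpperBound,
                                     unsigned long long &lb,
                                     unsigned long long &ub) {
      // atomicAdd returns the previous value exactly once per caller, so
      // every thread, in any warp, receives a disjoint [lb, ub] range.
      lb = atomicAdd(&nextLowerBound, chunkSize);
      ub = lb + chunkSize - 1; // inclusive bound: Clang emits i <= ub
      if (ub < loopUpperBound)
        return NOT_FINISHED; // whole chunk lies inside the iteration space
      if (lb <= loopUpperBound)
        return LAST_CHUNK; // chunk straddles the end; trim it, then stop
      return FINISHED; // iteration space was exhausted before this claim
    }

    __global__ void worker(unsigned long long loopUpperBound,
                           unsigned long long chunkSize) {
      unsigned long long lb, ub;
      ChunkState state;
      while ((state = claimChunk(chunkSize, loopUpperBound, lb, ub)) !=
             FINISHED) {
        if (ub > loopUpperBound)
          ub = loopUpperBound; // trim the straddling last chunk
        for (unsigned long long i = lb; i <= ub; ++i) {
          // loop body runs here
        }
        if (state == LAST_CHUNK)
          break;
      }
    }

    int main() {
      unsigned long long zero = 0;
      cudaMemcpyToSymbol(nextLowerBound, &zero, sizeof(zero));
      worker<<<1, 64>>>(/*loopUpperBound=*/99, /*chunkSize=*/7);
      return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
    }

With loopUpperBound = 99 and chunkSize = 7, successive claims return lb = 0, 7, ..., 91, 98, 105, ...: lb = 91 gives ub = 97 (NOT_FINISHED), lb = 98 straddles the end (LAST_CHUNK, trimmed to 98..99), and every later claim starts past the bound (FINISHED). Note how the patch's FINISHED case parks lb at loopUpperBound + 2 and ub at loopUpperBound + 1, so the returned pair always reads as an empty range (lb > ub).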

@@ -215,7 +215,8 @@ public:
            schedule <= kmp_sched_ordered_last;
   }
 
-  INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st,
+  INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                    ST chunk) {
     int tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
@@ -282,18 +283,15 @@ public:
             "unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
     }
 
+    // save sched state
+    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
     // init schedules
     if (schedule == kmp_sched_static_chunk) {
       ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      // save ub
-      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -301,8 +299,8 @@ public:
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
@@ -310,11 +308,12 @@ public:
     } else if (schedule == kmp_sched_static_nochunk) {
       ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
-      // save sched state
-      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      // save ub
-      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -322,45 +321,50 @@ public:
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             omptarget_nvptx_threadPrivateContext->Stride(tid));
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (chunk < 1)
-        chunk = 1;
-      Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks
-      // but each thread (but one) must discover that it is last
-      eventNum += tnum;
-      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-      omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum;
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      if (GetThreadIdInBlock() == 0) {
+        if (chunk < 1)
+          chunk = 1;
+        int teamId = GetOmpTeamId();
+        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      }
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
-            "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", "
-            "events number = %llu\n",
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-            omptarget_nvptx_threadPrivateContext->Chunk(tid),
-            omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
     }
   }
 
   ////////////////////////////////////////////////////////////////////////////////
   // Support for dispatch next
 
-  INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg,
-                                     Counter priv, T &lb, T &ub,
-                                     Counter &chunkId, Counter &currentEvent,
-                                     T chunkSize, T loopUpperBound) {
-    // get next event atomically
-    Counter nextEvent = cg.Next();
-    // calculate chunk Id (priv was initialized upon entering the loop to
-    // 'start' == 'event')
-    chunkId = nextEvent - priv;
+  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+                                     Counter &loopLowerBound,
+                                     T loopUpperBound) {
     // calculate lower bound for all lanes in the warp
-    lb = chunkId * chunkSize; // this code assume normalization of LB
+    lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
     ub = lb + chunkSize - 1; // Clang uses i <= ub
     // 3 result cases:
@@ -368,9 +372,8 @@ public:
     //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
     //     NOT_FINISHED
     //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
-    currentEvent = nextEvent;
     // a.
-    if (ub <= loopUpperBound) {
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
       PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
             P64(ub), P64(loopUpperBound));
       return NOT_FINISHED;
@@ -383,7 +386,8 @@ public:
       return LAST_CHUNK;
     }
     // c. if we are here, we are in case 'c'
-    lb = loopUpperBound + 1;
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
     PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
           P64(ub), P64(loopUpperBound));
     return FINISHED;
@@ -437,29 +441,18 @@ public:
     ASSERT0(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "bad sched");
-    omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
     T myLb, myUb;
-    Counter chunkId;
-    // xxx current event is now local
-    omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup();
+    int teamId = GetOmpTeamId();
     int finished = DynamicNextChunk(
-        cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb,
-        chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid),
-        omptarget_nvptx_threadPrivateContext->Chunk(tid),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
-    if (finished == FINISHED) {
-      cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                  omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
-      cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                 omptarget_nvptx_threadPrivateContext->CurrentEvent(tid));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+    if (finished == FINISHED)
       return DISPATCH_FINISHED;
-    }
     // not finished (either not finished or last chunk)
-    *plast = (int32_t)(
-        myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+    *plast = (int32_t)(finished == LAST_CHUNK);
     *plower = myLb;
     *pupper = myUb;
     *pstride = 1;
@@ -491,7 +484,7 @@ EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
                                    int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
@@ -499,7 +492,7 @@ EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
                                     int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
@@ -507,7 +500,7 @@ EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
                                    int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
@@ -515,7 +508,7 @@ EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
                                     int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 // next
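For reference, the four wrappers above serve the dispatch contract that the commit message refers to: the compiler initializes the schedule once, then pulls chunks in a loop until dispatch_next reports completion. The sketch below shows that calling pattern; the declarations are paraphrased from this file with EXTERN expanded by hand, kmp_Indent is left opaque, and the kmp_sched_dynamic value is assumed to mirror the runtime's kmp_sched_t enum.

    #include <cstdint>

    struct kmp_Indent; // opaque source-location descriptor used by the runtime

    enum { kmp_sched_dynamic = 35 }; // assumed value from kmp_sched_t

    extern "C" {
    __device__ void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
                                           int32_t schedule, int32_t lb,
                                           int32_t ub, int32_t st,
                                           int32_t chunk);
    __device__ int32_t __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid,
                                              int32_t *plast, int32_t *plower,
                                              int32_t *pupper,
                                              int32_t *pstride);
    }

    // Roughly what the compiler emits for
    //   #pragma omp for schedule(dynamic, 4)
    //   for (int i = 0; i < 1000; ++i) { ... }
    __device__ void dispatch_loop(kmp_Indent *loc, int32_t tid) {
      // the loop is normalized to the inclusive range [0, 999]
      __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, /*lb=*/0,
                             /*ub=*/999, /*st=*/1, /*chunk=*/4);
      int32_t last, lb, ub, st;
      // each successful call hands back one claimed chunk [lb, ub]
      while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st)) {
        for (int32_t i = lb; i <= ub; i += st) {
          // loop body
        }
      }
    }

Under the new model, dispatch_init publishes the per-team bounds exactly once (thread 0 writes them between the two barriers), and dispatch_next reduces to one atomicAdd plus the three-way classification, which is also why *plast can now be derived directly from LAST_CHUNK instead of comparing the thread's upper bound against saved state.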