[OPENMP, NVPTX] Fixed synchronization construct + code cleanup.

Summary:
1. Fixed an internal problem in the `__kmpc_barrier` function: the SPMD-mode
synchronization primitive may only be called at the L1 parallel level (see
the sketch after this list).
2. Removed the open-coded synchronization sequences scattered through the
runtime and called `__kmpc_barrier` instead.
3. Some code cleanup.
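In outline, the fixed dispatch works as sketched below. This is a minimal
illustrative sketch, not the runtime source: `barrier`, `spmd_block_barrier`,
and `named_barrier` are hypothetical stand-ins for `__kmpc_barrier`,
`__kmpc_barrier_simple_spmd`, and `named_sync(L1_BARRIER, ...)`, the flag
parameters stand in for the `isSPMDMode()`, `InL2OrHigherParallelRegion()`,
and `GetNumberOfOmpThreads()` queries, and the uninitialized-runtime shortcut
is omitted:

#define WARPSIZE 32

// Hypothetical stubs for the real device primitives.
static void spmd_block_barrier(void) { /* __syncthreads() on the device */ }
static void named_barrier(int id, int threads) { (void)id; (void)threads; }

// Single barrier entry point: call sites never test the execution mode
// themselves; the mode- and level-specific choice is made here.
static void barrier(int is_spmd_mode, int in_l2_or_higher,
                    int num_omp_threads) {
  if (in_l2_or_higher || num_omp_threads <= 1)
    return; // serialized nested region or a lone thread: nothing to sync
  if (is_spmd_mode) {
    // SPMD mode: every thread of the block executes the region, so a
    // full-block barrier is safe -- but only at the L1 parallel level,
    // which the guard above enforces (the bug fixed by this patch).
    spmd_block_barrier();
  } else {
    // Generic mode: only the workers participate; round their count up
    // to whole warps before entering the named barrier.
    int threads = WARPSIZE * ((num_omp_threads + WARPSIZE - 1) / WARPSIZE);
    named_barrier(/*L1_BARRIER=*/1, threads);
  }
}

With the choice centralized this way, the call sites in the diffs below drop
their `isSPMDMode()` / `__syncthreads()` branches and call `__kmpc_barrier`
unconditionally.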

Reviewers: gtbercea, grokos

Subscribers: openmp-commits

Differential Revision: https://reviews.llvm.org/D49564

llvm-svn: 337691
Author: Alexey Bataev
Date: 2018-07-23 13:52:12 +00:00
Commit: 37d4156b11 (parent 8264bb8d34)
4 changed files with 25 additions and 54 deletions

@@ -240,12 +240,8 @@ public:
     // Process schedule.
     if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
-      if (OrderedSchedule(schedule)) {
-        if (isSPMDMode())
-          __syncthreads();
-        else
-          __kmpc_barrier(loc, threadId);
-      }
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
             (long)tnum, P64(tripCount), schedule);
@@ -338,10 +334,7 @@ public:
               omptarget_nvptx_threadPrivateContext->Stride(tid));
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       // save sched state
       int teamId = GetOmpTeamId();
       omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
@@ -352,10 +345,7 @@ public:
         omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
         omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
       }
-      if (isSPMDMode())
-        __syncthreads();
-      else
-        __kmpc_barrier(loc, threadId);
+      __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
             "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
             ", chunk %" PRIu64 "\n",

@@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() {
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();
-  if (tnum != (~0x0)) { // assume swapSize is 32
+  if (tnum != (~0x0)) // assume swapSize is 32
     return 0;
-  }
   return 1;
 }
@@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
   if (numthread == 1)
     return 1;
-  else if (!__gpu_block_reduce())
+  if (!__gpu_block_reduce())
     return 2;
-  else {
-    if (threadIdx.x == 0)
-      return 1;
-    else
-      return 0;
-  }
+  if (threadIdx.x == 0)
+    return 1;
+  return 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
-  if (threadIdx.x == 0) {
-    return 2;
-  } else {
-    return 0;
-  }
+  return threadIdx.x == 0 ? 2 : 0;
 }
 
 EXTERN
 int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
-  if (threadIdx.x % 32 == 0) {
-    return 1;
-  } else {
-    return 0;
-  }
+  return (threadIdx.x % 32 == 0) ? 1 : 0;
 }
 
 EXTERN

@@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 INLINE int GetNumberOfProcsInDevice() {
   if (isGenericMode())
     return GetNumberOfWorkersInTeam();
-  else
-    return GetNumberOfThreadsInBlock();
+  return GetNumberOfThreadsInBlock();
 }
 
 INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }

@@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  if (isSPMDMode()) {
-    __kmpc_barrier_simple_spmd(loc_ref, tid);
-  } else if (isRuntimeUninitialized()) {
-    __kmpc_barrier_simple_generic(loc_ref, tid);
+  if (isRuntimeUninitialized()) {
+    if (isSPMDMode())
+      __kmpc_barrier_simple_spmd(loc_ref, tid);
+    else
+      __kmpc_barrier_simple_generic(loc_ref, tid);
   } else {
     tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
     if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-      int numberOfActiveOMPThreads =
-          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      // On Volta and newer architectures we require that all lanes in
-      // a warp (at least, all present for the kernel launch) participate in the
-      // barrier. This is enforced when launching the parallel region. An
-      // exception is when there are < WARPSIZE workers. In this case only 1
-      // worker is started, so we don't need a barrier.
-      if (numberOfActiveOMPThreads > 1) {
-#endif
+      int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+          tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
+      if (numberOfActiveOMPThreads > 1) {
+        if (isSPMDMode()) {
+          __kmpc_barrier_simple_spmd(loc_ref, tid);
+        } else {
           // The #threads parameter must be rounded up to the WARPSIZE.
           int threads =
               WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
@@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
                 numberOfActiveOMPThreads, threads);
           // Barrier #1 is for synchronization among active threads.
           named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-      } // numberOfActiveOMPThreads > 1
-#endif
+        }
+      } // numberOfActiveOMPThreads > 1
     }
     PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
 }