[OPNEMP, NVPTX] Fixed sychronization construct + code cleanup.
Summary: 1. Fixed internal problem in `__kmpc_barrier` function: SPMD mode synchronization function should be called only in L1 parallel level. 2. Removed some extra code for synchronization inside of the code, used `__kmpc_barrier` instead. 3. Some code cleanup. Reviewers: gtbercea, grokos Subscribers: openmp-commits Differential Revision: https://reviews.llvm.org/D49564 llvm-svn: 337691
This commit is contained in:
parent
8264bb8d34
commit
37d4156b11
|
@ -240,12 +240,8 @@ public:
|
|||
|
||||
// Process schedule.
|
||||
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
|
||||
if (OrderedSchedule(schedule)) {
|
||||
if (isSPMDMode())
|
||||
__syncthreads();
|
||||
else
|
||||
__kmpc_barrier(loc, threadId);
|
||||
}
|
||||
if (OrderedSchedule(schedule))
|
||||
__kmpc_barrier(loc, threadId);
|
||||
PRINT(LD_LOOP,
|
||||
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
|
||||
(long)tnum, P64(tripCount), schedule);
|
||||
|
@ -338,10 +334,7 @@ public:
|
|||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
|
||||
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
|
||||
if (isSPMDMode())
|
||||
__syncthreads();
|
||||
else
|
||||
__kmpc_barrier(loc, threadId);
|
||||
__kmpc_barrier(loc, threadId);
|
||||
// save sched state
|
||||
int teamId = GetOmpTeamId();
|
||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
|
||||
|
@ -352,10 +345,7 @@ public:
|
|||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
|
||||
}
|
||||
if (isSPMDMode())
|
||||
__syncthreads();
|
||||
else
|
||||
__kmpc_barrier(loc, threadId);
|
||||
__kmpc_barrier(loc, threadId);
|
||||
PRINT(LD_LOOP,
|
||||
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
|
||||
", chunk %" PRIu64 "\n",
|
||||
|
|
|
@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() {
|
|||
if (nt != blockDim.x)
|
||||
return 0;
|
||||
unsigned tnum = __ACTIVEMASK();
|
||||
if (tnum != (~0x0)) { // assume swapSize is 32
|
||||
if (tnum != (~0x0)) // assume swapSize is 32
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
|
|||
|
||||
if (numthread == 1)
|
||||
return 1;
|
||||
else if (!__gpu_block_reduce())
|
||||
if (!__gpu_block_reduce())
|
||||
return 2;
|
||||
else {
|
||||
if (threadIdx.x == 0)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
if (threadIdx.x == 0)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
|
||||
if (threadIdx.x == 0) {
|
||||
return 2;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return threadIdx.x == 0 ? 2 : 0;
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
|
||||
if (threadIdx.x % 32 == 0) {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return (threadIdx.x % 32 == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
EXTERN
|
||||
|
|
|
@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
|
|||
INLINE int GetNumberOfProcsInDevice() {
|
||||
if (isGenericMode())
|
||||
return GetNumberOfWorkersInTeam();
|
||||
else
|
||||
return GetNumberOfThreadsInBlock();
|
||||
return GetNumberOfThreadsInBlock();
|
||||
}
|
||||
|
||||
INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
|
||||
|
|
|
@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
|
|||
}
|
||||
|
||||
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
|
||||
if (isSPMDMode()) {
|
||||
__kmpc_barrier_simple_spmd(loc_ref, tid);
|
||||
} else if (isRuntimeUninitialized()) {
|
||||
__kmpc_barrier_simple_generic(loc_ref, tid);
|
||||
if (isRuntimeUninitialized()) {
|
||||
if (isSPMDMode())
|
||||
__kmpc_barrier_simple_spmd(loc_ref, tid);
|
||||
else
|
||||
__kmpc_barrier_simple_generic(loc_ref, tid);
|
||||
} else {
|
||||
tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
|
||||
if (!currTaskDescr->InL2OrHigherParallelRegion()) {
|
||||
int numberOfActiveOMPThreads =
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
// On Volta and newer architectures we require that all lanes in
|
||||
// a warp (at least, all present for the kernel launch) participate in the
|
||||
// barrier. This is enforced when launching the parallel region. An
|
||||
// exception is when there are < WARPSIZE workers. In this case only 1
|
||||
// worker is started, so we don't need a barrier.
|
||||
if (numberOfActiveOMPThreads > 1) {
|
||||
#endif
|
||||
int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
|
||||
tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
|
||||
if (numberOfActiveOMPThreads > 1) {
|
||||
if (isSPMDMode()) {
|
||||
__kmpc_barrier_simple_spmd(loc_ref, tid);
|
||||
} else {
|
||||
// The #threads parameter must be rounded up to the WARPSIZE.
|
||||
int threads =
|
||||
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
|
||||
|
@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
|
|||
numberOfActiveOMPThreads, threads);
|
||||
// Barrier #1 is for synchronization among active threads.
|
||||
named_sync(L1_BARRIER, threads);
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
} // numberOfActiveOMPThreads > 1
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} // numberOfActiveOMPThreads > 1
|
||||
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue