From 5724f251ebcd4fe9e650fb3e02decb39fea8649c Mon Sep 17 00:00:00 2001
From: Pietro Bonfa
Date: Wed, 7 Nov 2018 10:52:20 +0100
Subject: [PATCH] Avoid some unnecessary data copies on collective
communications among groups with one process
---
UtilXlib/mp.f90 | 253 ++++++++++++++++++++++++++++++++++++++----------
1 file changed, 202 insertions(+), 51 deletions(-)
diff --git a/UtilXlib/mp.f90 b/UtilXlib/mp.f90
index 769ecf06f..ceaff7ac1 100644
--- a/UtilXlib/mp.f90
+++ b/UtilXlib/mp.f90
@@ -3815,20 +3815,26 @@ END SUBROUTINE mp_type_free
INTEGER, msg_h
INTEGER, INTENT(IN) :: gid
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
!
msg_h = msg_d ! This syncs __MPI case
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_i1_gpu
!
!------------------------------------------------------------------------------!
@@ -3840,20 +3846,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_iv_gpu
!
!------------------------------------------------------------------------------!
@@ -3865,20 +3877,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_im_gpu
!
!------------------------------------------------------------------------------!
@@ -3890,20 +3908,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_it_gpu
!
!------------------------------------------------------------------------------!
@@ -3915,19 +3939,25 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
msg_h=msg_d ! This syncs __MPI case
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_r1_gpu
!
!------------------------------------------------------------------------------!
@@ -3939,20 +3969,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_rv_gpu
!
!------------------------------------------------------------------------------!
@@ -3964,20 +4000,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! No need for final syncronization
+ ! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_rm_gpu
!
!------------------------------------------------------------------------------!
@@ -4106,20 +4148,26 @@ END SUBROUTINE mp_type_free
REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT(IN) :: gid
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_rt_gpu
!
!------------------------------------------------------------------------------!
@@ -4131,20 +4179,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_r4d_gpu
!
!------------------------------------------------------------------------------!
@@ -4156,19 +4210,25 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
msg_h=msg_d ! This syncs __MPI case
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_c1_gpu
!
!------------------------------------------------------------------------------!
@@ -4180,20 +4240,27 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ !
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs the device after small message copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_cv_gpu
!
!------------------------------------------------------------------------------!
@@ -4204,20 +4271,26 @@ END SUBROUTINE mp_type_free
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: gid
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_cm_gpu
!
!------------------------------------------------------------------------------!
@@ -4258,20 +4331,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = SIZE(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_ct_gpu
!
!------------------------------------------------------------------------------!
@@ -4283,20 +4362,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_c4d_gpu
!
!------------------------------------------------------------------------------!
@@ -4308,20 +4393,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_c5d_gpu
!
!------------------------------------------------------------------------------!
@@ -4333,20 +4424,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_r5d_gpu
!
!------------------------------------------------------------------------------!
@@ -4358,20 +4455,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_c6d_gpu
!
!------------------------------------------------------------------------------!
@@ -4383,19 +4486,25 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
msg_h = msg_d ! This syncs __MPI case
CALL parallel_max_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_max_i_gpu
!
!------------------------------------------------------------------------------!
@@ -4407,20 +4516,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL parallel_max_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_max_iv_gpu
!
!----------------------------------------------------------------------
@@ -4432,19 +4547,25 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
msg_h = msg_d ! This syncs __MPI case
CALL parallel_max_real( msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_max_r_gpu
!
!------------------------------------------------------------------------------!
@@ -4456,20 +4577,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL parallel_max_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_max_rv_gpu
!
!------------------------------------------------------------------------------!
@@ -4481,19 +4608,25 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
msg_h = msg_d ! This syncs __MPI case
CALL parallel_min_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_min_i_gpu
!
!------------------------------------------------------------------------------!
@@ -4505,20 +4638,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = SIZE(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL parallel_min_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_min_iv_gpu
!
!------------------------------------------------------------------------------!
@@ -4530,19 +4669,25 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
msg_h = msg_d ! This syncs __MPI case
CALL parallel_min_real( msglen, msg_h, gid, -1 )
msg_d = msg_h
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_min_r_gpu
!
!------------------------------------------------------------------------------!
@@ -4554,20 +4699,26 @@ END SUBROUTINE mp_type_free
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
+ ! Avoid unnecessary communications on __MPI and syncs SERIAL
+ IF ( mp_size(gid) == 1 ) THEN
+ ierr = cudaDeviceSynchronize()
+ RETURN
+ END IF
+ !
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
- RETURN ! Sync not needed after MPI call
+ ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL parallel_min_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
+ ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
- ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_min_rv_gpu
!
!------------------------------------------------------------------------------!
@@ -5358,7 +5509,7 @@ END SUBROUTINE mp_type_free
!------------------------------------------------------------------------------!
END MODULE mp
!------------------------------------------------------------------------------!
-
+!
! Script to generate stop messages:
! # coding: utf-8
! import re