From 5724f251ebcd4fe9e650fb3e02decb39fea8649c Mon Sep 17 00:00:00 2001 From: Pietro Bonfa Date: Wed, 7 Nov 2018 10:52:20 +0100 Subject: [PATCH] Avoid some unnecessary data copies on collective communications among groups with one process --- UtilXlib/mp.f90 | 253 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 202 insertions(+), 51 deletions(-) diff --git a/UtilXlib/mp.f90 b/UtilXlib/mp.f90 index 769ecf06f..ceaff7ac1 100644 --- a/UtilXlib/mp.f90 +++ b/UtilXlib/mp.f90 @@ -3815,20 +3815,26 @@ END SUBROUTINE mp_type_free INTEGER, msg_h INTEGER, INTENT(IN) :: gid INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else ! msg_h = msg_d ! This syncs __MPI case CALL reduce_base_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_i1_gpu ! !------------------------------------------------------------------------------! @@ -3840,20 +3846,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_iv_gpu ! !------------------------------------------------------------------------------! @@ -3865,20 +3877,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_im_gpu ! !------------------------------------------------------------------------------! @@ -3890,20 +3908,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT (IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_it_gpu ! !------------------------------------------------------------------------------! @@ -3915,19 +3939,25 @@ END SUBROUTINE mp_type_free INTEGER, INTENT (IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else msg_h=msg_d ! This syncs __MPI case CALL reduce_base_real( msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_r1_gpu ! !------------------------------------------------------------------------------! @@ -3939,20 +3969,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT (IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_rv_gpu ! !------------------------------------------------------------------------------! @@ -3964,20 +4000,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT (IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! No need for final syncronization + ! No need for final syncronization #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_rm_gpu ! !------------------------------------------------------------------------------! @@ -4106,20 +4148,26 @@ END SUBROUTINE mp_type_free REAL (DP), ALLOCATABLE :: msg_h(:,:,:) INTEGER, INTENT(IN) :: gid INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_rt_gpu ! !------------------------------------------------------------------------------! @@ -4131,20 +4179,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_r4d_gpu ! !------------------------------------------------------------------------------! @@ -4156,19 +4210,25 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else msg_h=msg_d ! This syncs __MPI case CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_c1_gpu ! !------------------------------------------------------------------------------! @@ -4180,20 +4240,27 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs the device after small message copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_cv_gpu ! !------------------------------------------------------------------------------! @@ -4204,20 +4271,26 @@ END SUBROUTINE mp_type_free COMPLEX (DP), ALLOCATABLE :: msg_h(:,:) INTEGER, INTENT (IN) :: gid INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_cm_gpu ! !------------------------------------------------------------------------------! @@ -4258,20 +4331,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = SIZE(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_ct_gpu ! !------------------------------------------------------------------------------! @@ -4283,20 +4362,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_c4d_gpu ! !------------------------------------------------------------------------------! @@ -4308,20 +4393,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_c5d_gpu ! !------------------------------------------------------------------------------! @@ -4333,20 +4424,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_r5d_gpu ! !------------------------------------------------------------------------------! @@ -4358,20 +4455,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_sum_c6d_gpu ! !------------------------------------------------------------------------------! @@ -4383,19 +4486,25 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else msg_h = msg_d ! This syncs __MPI case CALL parallel_max_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_max_i_gpu ! !------------------------------------------------------------------------------! @@ -4407,20 +4516,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL parallel_max_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_max_iv_gpu ! !---------------------------------------------------------------------- @@ -4432,19 +4547,25 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else msg_h = msg_d ! This syncs __MPI case CALL parallel_max_real( msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_max_r_gpu ! !------------------------------------------------------------------------------! @@ -4456,20 +4577,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL parallel_max_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_max_rv_gpu ! !------------------------------------------------------------------------------! @@ -4481,19 +4608,25 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else msg_h = msg_d ! This syncs __MPI case CALL parallel_min_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_min_i_gpu ! !------------------------------------------------------------------------------! @@ -4505,20 +4638,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = SIZE(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL parallel_min_integer( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_min_iv_gpu ! !------------------------------------------------------------------------------! @@ -4530,19 +4669,25 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) msglen = 1 #if defined(__GPU_MPI) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else msg_h = msg_d ! This syncs __MPI case CALL parallel_min_real( msglen, msg_h, gid, -1 ) msg_d = msg_h + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_min_r_gpu ! !------------------------------------------------------------------------------! @@ -4554,20 +4699,26 @@ END SUBROUTINE mp_type_free INTEGER, INTENT(IN) :: gid ! INTEGER :: msglen, ierr + ! Avoid unnecessary communications on __MPI and syncs SERIAL + IF ( mp_size(gid) == 1 ) THEN + ierr = cudaDeviceSynchronize() + RETURN + END IF + ! #if defined(__MPI) #if defined(__GPU_MPI) msglen = size(msg_d) ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 ) - RETURN ! Sync not needed after MPI call + ! Sync not needed after MPI call #else ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case msglen = size(msg_h) CALL parallel_min_real( msglen, msg_h, gid, -1 ) msg_d = msg_h; DEALLOCATE(msg_h) + ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies #endif #endif - ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI END SUBROUTINE mp_min_rv_gpu ! !------------------------------------------------------------------------------! @@ -5358,7 +5509,7 @@ END SUBROUTINE mp_type_free !------------------------------------------------------------------------------! END MODULE mp !------------------------------------------------------------------------------! - +! ! Script to generate stop messages: ! # coding: utf-8 ! import re