mirror of https://gitlab.com/QEF/q-e.git
More tweaking, a hidden and relevant comment moved to a less invisible place.
In principle the machinery should turn task groups on when there are too many processors for FFT parallelization. In practice, I don't think it does that, otherwise it would stop on those cases that have no task group implementation (e.g. metaGGA). I don't yet know what is really happening and whether the old task groups are still present in the code, reachable and working.
This commit is contained in:
parent
978c0be323
commit
3e677a330f
|
@ -20,8 +20,12 @@ MODULE command_line_options
|
|||
!
|
||||
! ... Number of arguments in command line
|
||||
INTEGER :: nargs = 0
|
||||
! ... QE arguments read from command line
|
||||
INTEGER :: nimage_= 1, npool_= 0, ndiag_ = 0, nband_= 1, ntg_= 1, nyfft_ = 1, nmany_ = 1
|
||||
! ... QE arguments read from command line, default 1
|
||||
INTEGER :: nimage_= 1, nband_= 1, nyfft_ = 1, nmany_ = 1
|
||||
! ... As above, default 0, in order to distinguish the "not set" and the
|
||||
! ... "set to 1" cases - useful for automatically choosing those values
|
||||
INTEGER :: npool_= 0, ndiag_ = 0, ntg_= 0
|
||||
! ... Undocumented options
|
||||
LOGICAL :: pencil_decomposition_ = .false., rmm_with_paro_ = .false.
|
||||
! ... Indicate if using library init
|
||||
LOGICAL :: library_init = .FALSE.
|
||||
|
@ -101,16 +105,7 @@ CONTAINS
|
|||
ENDIF
|
||||
READ ( arg, *, ERR = 15, END = 15) npool_
|
||||
narg = narg + 1
|
||||
! FIXME: following comment should be moved to a more visible place
|
||||
! special case : task group parallelization and nyfft parallelization, both
|
||||
! introduced to improve scaling coexist and are in part interchangeable
|
||||
! if TG is available it's faster than NYFFT because it communicates larger
|
||||
! data chunks fewer times. But sometimes it is not available as for instance
|
||||
! when metagga is used or realus or for conjugate gradient. nyfft can be used.
|
||||
! -ntg and -nyfft are both allowed flags; they set the same value for both ntg and nyfft.
|
||||
! These variables are kept separated to help understanding which operation belong
|
||||
! to TG or to NYFFT. This can enable to make them different if the need arises.
|
||||
!
|
||||
|
||||
CASE ( '-nt', '-ntg', '-ntask_groups', '-nyfft')
|
||||
IF (read_string) THEN
|
||||
CALL my_getarg ( input_command_line, narg, arg )
|
||||
|
|
|
@ -52,6 +52,17 @@ CONTAINS
|
|||
! convenient to call it in serial execution as well
|
||||
! IMPORTANT NOTICE 2: most parallelization levels are initialized here
|
||||
! but at least some will be moved to a later stage
|
||||
! SPECIAL CASE: command-line options "-ntg" and "-nyfft", introduced to
|
||||
! improve scaling, coexist and are in part interchangeable.
|
||||
! If task groups are available, -ntg is faster than -nyfft
|
||||
! because it communicates larger data chunks less frequently.
|
||||
! Sometimes task groups are not available as for instance
|
||||
! when metagga is used or realus or for conjugate gradient.
|
||||
! For those cases, -nyfft can be used instead.
|
||||
! You may specify one or the other: the same value will be set
|
||||
! for both ntg and nyfft. These variables are kept separated
|
||||
! to help understand which operations belong to task groups
|
||||
! or to nyfft, allowing them to be differentiated if the need arises.
|
||||
!
|
||||
USE command_line_options, ONLY : get_command_line, &
|
||||
nimage_, npool_, nband_, ntg_, nyfft_
|
||||
|
@ -88,6 +99,8 @@ CONTAINS
|
|||
! npool_ is 0 if not specified in command line
|
||||
IF ( npool_== 0 ) npool_ = 1
|
||||
CALL mp_start_pools ( npool_, intra_image_comm )
|
||||
! ntg_ is 0 if not specified in command line
|
||||
IF ( ntg_== 0 ) ntg_ = 1
|
||||
#if defined (__CUDA_OPTIMIZED)
|
||||
CALL mp_start_bands ( 1 , ntg_, nyfft_, intra_pool_comm )
|
||||
#else
|
||||
|
|
|
@ -696,6 +696,7 @@ SUBROUTINE setup_para ( nr3, nkstot, nbnd )
|
|||
!
|
||||
LOGICAL, EXTERNAL :: check_gpu_support
|
||||
LOGICAL, SAVE :: first = .TRUE.
|
||||
INTEGER :: maxtask
|
||||
!
|
||||
! do not execute twice: unpredictable results may follow
|
||||
!
|
||||
|
@ -705,13 +706,13 @@ SUBROUTINE setup_para ( nr3, nkstot, nbnd )
|
|||
! k-point parallelization first
|
||||
!
|
||||
IF ( npool_== 0 ) THEN
|
||||
npool_ = 1
|
||||
!
|
||||
! check if too many mpi processes for this fft dimension,
|
||||
! use k-point parallelization if available
|
||||
!
|
||||
if ( nproc_image <= nr3/2 ) then
|
||||
npool_ = 1
|
||||
else
|
||||
if ( nproc_image > nr3/2 .and. nkstot > 1 ) then
|
||||
!
|
||||
! if too many mpi processes for this fft dimension,
|
||||
! use k-point parallelization if available
|
||||
!
|
||||
do npool_ = 2, nkstot
|
||||
! npool should be a divisor of the number of k-points
|
||||
if ( mod(nkstot, npool_) /= 0 ) cycle
|
||||
|
@ -735,17 +736,23 @@ SUBROUTINE setup_para ( nr3, nkstot, nbnd )
|
|||
use_gpu = check_gpu_support( )
|
||||
!
|
||||
! Set "task_groups" if still too many processors for PW parallelization
|
||||
!
|
||||
IF ( ntask_groups == 0 ) THEN
|
||||
ntask_groups = 1
|
||||
if ( nproc_bgrp > nr3 ) THEN
|
||||
maxtask = min (nbnd, 16)
|
||||
do ntask_groups = 2, maxtask
|
||||
if ( mod(nproc_bgrp,ntask_groups) == 0 .and. &
|
||||
nproc_bgrp/ntask_groups <= nr3 .or.&
|
||||
nproc_bgrp/ntask_groups <= nr3/4 .or.&
|
||||
ntask_groups == maxtask ) exit
|
||||
end do
|
||||
end if
|
||||
END IF
|
||||
!
|
||||
! Note that "task_groups" require to set pencil_decomposition to .true.
|
||||
!
|
||||
IF ( ( ntask_groups /= 1 ) ) pencil_decomposition_ = .true.
|
||||
IF ( ( ntask_groups == 1 ) .AND. ( nproc_bgrp > nr3 ) ) THEN
|
||||
pencil_decomposition_ = .true.
|
||||
do ntask_groups = 2, min(nbnd,16)
|
||||
if ( mod(nproc_bgrp,ntask_groups) == 0 .and. &
|
||||
nproc_bgrp/ntask_groups <= nr3 ) exit
|
||||
if ( ntask_groups == min(nbnd,16) ) exit
|
||||
end do
|
||||
END IF
|
||||
IF ( ntask_groups /= 1 ) pencil_decomposition_ = .true.
|
||||
!
|
||||
! printout - same as in environment.f90
|
||||
!
|
||||
|
|
Loading…
Reference in New Issue