More tweaking; a hidden but relevant comment moved to a more visible place.

In principle the machinery should turn task groups on when there are too many
processors for FFT parallelization. In practice, I don't think it does that,
otherwise it would stop on those cases that have no task group implementation
(e.g. metaGGA). I don't yet know what is really happening and whether the old
task groups are still present in the code, reachable and working.
This commit is contained in:
Paolo Giannozzi 2022-01-14 17:19:16 +01:00
parent 978c0be323
commit 3e677a330f
3 changed files with 42 additions and 27 deletions

View File

@ -20,8 +20,12 @@ MODULE command_line_options
!
! ... Number of arguments in command line
INTEGER :: nargs = 0
! ... QE arguments read from command line
INTEGER :: nimage_= 1, npool_= 0, ndiag_ = 0, nband_= 1, ntg_= 1, nyfft_ = 1, nmany_ = 1
! ... QE arguments read from command line, default 1
INTEGER :: nimage_= 1, nband_= 1, nyfft_ = 1, nmany_ = 1
! ... As above, default 0, in order to distinguish the "not set" and the
! ... "set to 1" cases - useful for automatically choosing those values
INTEGER :: npool_= 0, ndiag_ = 0, ntg_= 0
! ... Undocumented options
LOGICAL :: pencil_decomposition_ = .false., rmm_with_paro_ = .false.
! ... Indicate if using library init
LOGICAL :: library_init = .FALSE.
@ -101,16 +105,7 @@ CONTAINS
ENDIF
READ ( arg, *, ERR = 15, END = 15) npool_
narg = narg + 1
! FIXME: following comment should be moved to a more visible place
! special case : task group parallelization and nyfft parallelization, both
! introduced to improve scaling coexist and are in part interchangeable
! if TG is available it's faster than NYFFT because it communicates larger
! data chunks fewer times. But sometimes it is not available, as for instance
! when metagga is used or realus or for conjugate gradient; then nyfft can be used.
! -ntg and -nyfft are both allowed flags; either sets the same value for both ntg and nyfft.
! These variables are kept separate to help understand which operations belong
! to TG or to NYFFT. This makes it possible to make them different if the need arises.
!
CASE ( '-nt', '-ntg', '-ntask_groups', '-nyfft')
IF (read_string) THEN
CALL my_getarg ( input_command_line, narg, arg )

View File

@ -52,6 +52,17 @@ CONTAINS
! convenient to call it in serial execution as well
! IMPORTANT NOTICE 2: most parallelization levels are initialized here
! but at least some will be moved to a later stage
! SPECIAL CASE: command-line options "-ntg" and "-nyfft", introduced to
! improve scaling, coexist and are in part interchangeable.
! If task groups are available, -ntg is faster than -nyfft
! because it communicates larger data chunks less frequently.
! Sometimes task groups are not available as for instance
! when metagga is used or realus or for conjugate gradient.
! For those cases, -nyfft can be used instead.
! You may specify one or the other: the same value will be set
! for both ntg and nyfft. These variables are kept separate
! to help understand which operations belong to task groups
! or to nyfft, allowing them to be differentiated if the need arises.
!
USE command_line_options, ONLY : get_command_line, &
nimage_, npool_, nband_, ntg_, nyfft_
@ -88,6 +99,8 @@ CONTAINS
! npool_ is 0 if not specified in command line
IF ( npool_== 0 ) npool_ = 1
CALL mp_start_pools ( npool_, intra_image_comm )
! ntg_ is 0 if not specified in command line
IF ( ntg_== 0 ) ntg_ = 1
#if defined (__CUDA_OPTIMIZED)
CALL mp_start_bands ( 1 , ntg_, nyfft_, intra_pool_comm )
#else

View File

@ -696,6 +696,7 @@ SUBROUTINE setup_para ( nr3, nkstot, nbnd )
!
LOGICAL, EXTERNAL :: check_gpu_support
LOGICAL, SAVE :: first = .TRUE.
INTEGER :: maxtask
!
! do not execute twice: unpredictable results may follow
!
@ -705,13 +706,13 @@ SUBROUTINE setup_para ( nr3, nkstot, nbnd )
! k-point parallelization first
!
IF ( npool_== 0 ) THEN
npool_ = 1
!
! check if too many mpi processes for this fft dimension,
! use k-point parallelization if available
!
if ( nproc_image <= nr3/2 ) then
npool_ = 1
else
if ( nproc_image > nr3/2 .and. nkstot > 1 ) then
!
! if too many mpi processes for this fft dimension,
! use k-point parallelization if available
!
do npool_ = 2, nkstot
! npool should be a divisor of the number of k-points
if ( mod(nkstot, npool_) /= 0 ) cycle
@ -735,17 +736,23 @@ SUBROUTINE setup_para ( nr3, nkstot, nbnd )
use_gpu = check_gpu_support( )
!
! Set "task_groups" if still too many processors for PW parallelization
!
IF ( ntask_groups == 0 ) THEN
ntask_groups = 1
if ( nproc_bgrp > nr3 ) THEN
maxtask = min (nbnd, 16)
do ntask_groups = 2, maxtask
if ( mod(nproc_bgrp,ntask_groups) == 0 .and. &
nproc_bgrp/ntask_groups <= nr3 .or.&
nproc_bgrp/ntask_groups <= nr3/4 .or.&
ntask_groups == maxtask ) exit
end do
end if
END IF
!
! Note that "task_groups" require to set pencil_decomposition to .true.
!
IF ( ( ntask_groups /= 1 ) ) pencil_decomposition_ = .true.
IF ( ( ntask_groups == 1 ) .AND. ( nproc_bgrp > nr3 ) ) THEN
pencil_decomposition_ = .true.
do ntask_groups = 2, min(nbnd,16)
if ( mod(nproc_bgrp,ntask_groups) == 0 .and. &
nproc_bgrp/ntask_groups <= nr3 ) exit
if ( ntask_groups == min(nbnd,16) ) exit
end do
END IF
IF ( ntask_groups /= 1 ) pencil_decomposition_ = .true.
!
! printout - same as in environment.f90
!