mirror of https://gitlab.com/QEF/q-e.git
357 lines
10 KiB
Fortran
357 lines
10 KiB
Fortran
!
|
|
! Copyright (C) 2013 Quantum ESPRESSO group
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
MODULE mp_exx
|
|
!----------------------------------------------------------------------------
|
|
!! Band groups (processors within a pool of bands).
|
|
!! Subdivision of pool group, used for parallelization over bands.
|
|
!
|
|
USE mp, ONLY : mp_barrier, mp_bcast, mp_size, mp_rank, mp_comm_split
|
|
USE mp_bands, ONLY : nbgrp, nproc_bgrp, me_bgrp, root_bgrp, my_bgrp_id, &
|
|
inter_bgrp_comm, intra_bgrp_comm
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
SAVE
|
|
!
|
|
INTEGER :: negrp = 1
|
|
!! number of band groups
|
|
INTEGER :: nproc_egrp = 1
|
|
!! number of processors within a band group
|
|
INTEGER :: me_egrp = 0
|
|
!! index of the processor within a band group
|
|
INTEGER :: root_egrp = 0
|
|
!! index of the root processor within a band group
|
|
INTEGER :: my_egrp_id = 0
|
|
!! index of my band group
|
|
INTEGER :: inter_egrp_comm = 0
|
|
!! inter band group communicator
|
|
INTEGER :: intra_egrp_comm = 0
|
|
!! intra band group communicator
|
|
!
|
|
! ... "task" groups (for band parallelization of FFT)
|
|
!
|
|
INTEGER :: ntask_groups = 1
|
|
!! number of proc. in an orbital "task group"
|
|
!
|
|
LOGICAL :: use_old_exx = .FALSE.
|
|
!! should the old band parallelization method be used?
|
|
!
|
|
! variables for the pair parallelization
|
|
!
|
|
INTEGER :: max_pairs
|
|
!! maximum pairs per band group
|
|
INTEGER, ALLOCATABLE :: egrp_pairs(:,:,:)
|
|
!! pairs for each band group
|
|
INTEGER, ALLOCATABLE :: band_roots(:)
|
|
!! root for each band
|
|
LOGICAL, ALLOCATABLE :: contributed_bands(:,:)
|
|
!! bands for which the bgroup has a pair
|
|
INTEGER, ALLOCATABLE :: nibands(:)
|
|
!! number of bands for which the bgroup has a pair
|
|
INTEGER, ALLOCATABLE :: ibands(:,:)
|
|
!! bands for which the bgroup has a pair
|
|
INTEGER :: iexx_start = 0
|
|
!! starting band index used in bgrp parallelization
|
|
INTEGER :: iexx_end = 0
|
|
!! ending band index used in bgrp parallelization
|
|
INTEGER, ALLOCATABLE :: iexx_istart(:)
|
|
!! starting band index for the outer loop
|
|
INTEGER, ALLOCATABLE :: iexx_iend(:)
|
|
!! ending band index used in the outer loop
|
|
INTEGER, ALLOCATABLE :: all_start(:)
|
|
!! starting band inded for the inner loop
|
|
INTEGER, ALLOCATABLE :: all_end(:)
|
|
!! ending band index used in the inner loop
|
|
!
|
|
INTEGER, ALLOCATABLE :: iexx_istart_d(:)
|
|
#if defined(__CUDA)
|
|
attributes(DEVICE) :: iexx_istart_d
|
|
#endif
|
|
!
|
|
INTEGER :: max_contributors
|
|
!
|
|
INTEGER :: exx_mode = 0
|
|
!! flag for whether the exx part of the calculation is in progress
|
|
!
|
|
INTEGER :: max_ibands
|
|
!! maximum number of bands for psi
|
|
!
|
|
INTEGER :: jblock
|
|
!
|
|
CONTAINS
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE mp_start_exx( nband_, ntg_, parent_comm )
|
|
!---------------------------------------------------------------------------
|
|
!! Divide processors (of the "parent_comm" group) into \(\text{nband}\_\)
|
|
!! pools.
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER, INTENT(IN) :: nband_
|
|
!! it must have been previously read from command line argument
|
|
!! by a call to routine \(\texttt{get\_command\_line}\).
|
|
INTEGER, INTENT(IN) :: parent_comm
|
|
!! typically processors of a k-point pool (\(\text{intra\_pool\_comm}\))
|
|
INTEGER, INTENT(IN), OPTIONAL :: ntg_
|
|
!
|
|
INTEGER :: parent_nproc = 1, parent_mype = 0
|
|
!
|
|
#if defined (__MPI)
|
|
!
|
|
parent_nproc = mp_size( parent_comm )
|
|
parent_mype = mp_rank( parent_comm )
|
|
!
|
|
! ... nband_ must have been previously read from command line argument
|
|
! ... by a call to routine get_command_line
|
|
!
|
|
negrp = nband_
|
|
!
|
|
IF ( negrp < 1 .OR. negrp > parent_nproc ) CALL errore( 'mp_start_bands',&
|
|
'invalid number of band groups, out of range', 1 )
|
|
IF ( MOD( parent_nproc, negrp ) /= 0 ) CALL errore( 'mp_start_bands', &
|
|
'n. of band groups must be divisor of parent_nproc', 1 )
|
|
!
|
|
! ... Set number of processors per band group
|
|
!
|
|
nproc_egrp = parent_nproc / negrp
|
|
!
|
|
! ... set index of band group for this processor ( 0 : negrp - 1 )
|
|
!
|
|
my_egrp_id = parent_mype / nproc_egrp
|
|
!
|
|
! ... set index of processor within the image ( 0 : nproc_image - 1 )
|
|
!
|
|
me_egrp = MOD( parent_mype, nproc_egrp )
|
|
!
|
|
CALL mp_barrier( parent_comm )
|
|
!
|
|
! ... the intra_egrp_comm communicator is created
|
|
!
|
|
CALL mp_comm_split( parent_comm, my_egrp_id, parent_mype, intra_egrp_comm )
|
|
!
|
|
CALL mp_barrier( parent_comm )
|
|
!
|
|
! ... the inter_egrp_comm communicator is created
|
|
!
|
|
CALL mp_comm_split( parent_comm, me_egrp, parent_mype, inter_egrp_comm )
|
|
!
|
|
IF ( PRESENT(ntg_) ) THEN
|
|
ntask_groups = ntg_
|
|
END IF
|
|
!
|
|
! turn on the old band group parallelization method
|
|
!
|
|
IF(use_old_exx) THEN
|
|
nbgrp = negrp
|
|
nproc_bgrp = nproc_egrp
|
|
me_bgrp = me_egrp
|
|
root_bgrp = root_egrp
|
|
my_bgrp_id = my_egrp_id
|
|
inter_bgrp_comm = inter_egrp_comm
|
|
intra_bgrp_comm = intra_egrp_comm
|
|
negrp = 1
|
|
END IF
|
|
#endif
|
|
RETURN
|
|
!
|
|
END SUBROUTINE mp_start_exx
|
|
!
|
|
SUBROUTINE init_index_over_band(comm,nbnd,m)
|
|
!! Initialize indexes of band loops.
|
|
!
|
|
USE io_global, ONLY : stdout
|
|
USE control_flags, ONLY : use_gpu
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT(IN) :: comm, nbnd
|
|
INTEGER, INTENT(IN) :: m
|
|
|
|
INTEGER :: npe, myrank, rest, k, rest_i, k_i
|
|
INTEGER :: i, j, ipair, iegrp, root
|
|
INTEGER :: ibnd, npairs, ncontributing
|
|
INTEGER :: n_underloaded ! number of band groups that are under max load
|
|
INTEGER :: pair_bands(nbnd,nbnd)
|
|
|
|
jblock = 7
|
|
|
|
max_ibands = CEILING(DBLE(nbnd)/DBLE(negrp))+2
|
|
IF (negrp == 1) max_ibands = nbnd
|
|
|
|
IF (ALLOCATED(all_start)) THEN
|
|
DEALLOCATE( all_start, all_end )
|
|
DEALLOCATE( iexx_istart, iexx_iend )
|
|
IF(use_gpu) DEALLOCATE(iexx_istart_d)
|
|
END IF
|
|
ALLOCATE( all_start(negrp) )
|
|
ALLOCATE( all_end(negrp) )
|
|
ALLOCATE( iexx_istart(negrp) )
|
|
ALLOCATE( iexx_iend(negrp) )
|
|
|
|
myrank = mp_rank(comm)
|
|
npe = mp_size(comm)
|
|
|
|
!determine all_start and all_end for all of the inner loop bands
|
|
rest = mod(nbnd, npe)
|
|
k = int(nbnd/npe)
|
|
all_start = 0
|
|
all_end = 0
|
|
DO i=1, negrp
|
|
IF ( k >= 1 ) THEN
|
|
IF ( rest > i-1 ) THEN
|
|
all_start(i) = (i-1)*k + i
|
|
all_end(i) = i*k + i
|
|
ELSE
|
|
all_start(i) = (i-1)*k + rest + 1
|
|
all_end(i) = i*k + rest
|
|
END IF
|
|
ELSE
|
|
IF(i.le.m) THEN
|
|
all_start(i) = i
|
|
all_end(i) = i
|
|
ELSE
|
|
all_start(i) = 0
|
|
all_end(i) = 0
|
|
END IF
|
|
END IF
|
|
END DO
|
|
iexx_start = all_start(myrank+1)
|
|
iexx_end = all_end (myrank+1)
|
|
|
|
!determine the first and last indices for the outer loop
|
|
rest_i = mod(m, npe)
|
|
k_i = int(m/npe)
|
|
DO i=1, negrp
|
|
IF ( k_i >= 1 ) THEN
|
|
IF ( rest_i > i-1 ) THEN
|
|
iexx_istart(i) = (i-1)*k_i + i
|
|
iexx_iend(i) = i*k_i + i
|
|
ELSE
|
|
iexx_istart(i) = (i-1)*k_i + rest_i + 1
|
|
iexx_iend(i) = i*k_i + rest_i
|
|
END IF
|
|
ELSE
|
|
IF(i.le.m) THEN
|
|
iexx_istart(i) = i
|
|
iexx_iend(i) = i
|
|
ELSE
|
|
iexx_istart(i) = 0
|
|
iexx_iend(i) = 0
|
|
END IF
|
|
END IF
|
|
END DO
|
|
max_pairs = CEILING(REAL(nbnd*m)/REAL(negrp))
|
|
n_underloaded = MODULO(max_pairs*negrp-nbnd*m,negrp)
|
|
!
|
|
!allocate and upload
|
|
IF(use_gpu) ALLOCATE(iexx_istart_d, source=iexx_istart)
|
|
!
|
|
! allocate arrays
|
|
!
|
|
IF (allocated(egrp_pairs)) THEN
|
|
DEALLOCATE(egrp_pairs)
|
|
DEALLOCATE(band_roots)
|
|
DEALLOCATE(contributed_bands)
|
|
DEALLOCATE(nibands)
|
|
DEALLOCATE(ibands)
|
|
END IF
|
|
!
|
|
IF (.not.allocated(egrp_pairs)) THEN
|
|
ALLOCATE(egrp_pairs(2,max_pairs,negrp))
|
|
ALLOCATE(band_roots(m))
|
|
ALLOCATE(contributed_bands(nbnd,negrp))
|
|
ALLOCATE(nibands(negrp))
|
|
ALLOCATE(ibands(nbnd,negrp))
|
|
END IF
|
|
!
|
|
! assign the pairs for each band group
|
|
!
|
|
pair_bands = 0
|
|
egrp_pairs = 0
|
|
j = 1
|
|
DO iegrp=1, negrp
|
|
npairs = max_pairs
|
|
IF (iegrp.le.n_underloaded) npairs = npairs - 1
|
|
DO ipair=1, npairs
|
|
!
|
|
! get the first value of i for which the (i,j) pair has not been
|
|
! assigned yet
|
|
!
|
|
i = 1
|
|
DO WHILE (pair_bands(i,j).gt.0)
|
|
i = i + 1
|
|
IF(i.gt.m) exit
|
|
END DO
|
|
IF (i.le.m) THEN
|
|
pair_bands(i,j) = iegrp
|
|
END IF
|
|
egrp_pairs(1,ipair,iegrp) = i
|
|
egrp_pairs(2,ipair,iegrp) = j
|
|
|
|
j = j + 1
|
|
IF (j.gt.nbnd) j = 1
|
|
END DO
|
|
|
|
END DO
|
|
!
|
|
! determine the bands for which this band group will calculate a pair
|
|
!
|
|
contributed_bands = .FALSE.
|
|
DO iegrp=1, negrp
|
|
npairs = max_pairs
|
|
IF (iegrp.le.n_underloaded) npairs = npairs - 1
|
|
DO ipair=1, npairs
|
|
contributed_bands(egrp_pairs(1,ipair,iegrp),iegrp) = .TRUE.
|
|
END DO
|
|
END DO
|
|
nibands = 0
|
|
ibands = 0
|
|
DO iegrp=1, negrp
|
|
DO i=1, nbnd
|
|
IF (contributed_bands(i,iegrp)) THEN
|
|
nibands(iegrp) = nibands(iegrp) + 1
|
|
ibands(nibands(iegrp),iegrp) = i
|
|
END IF
|
|
END DO
|
|
END DO
|
|
!
|
|
! determine the maximum number of contributing egrps for any band
|
|
!
|
|
max_contributors = 0
|
|
DO i=1, nbnd
|
|
!
|
|
! determine the number of sending egrps
|
|
!
|
|
ncontributing = 0
|
|
DO iegrp=1, negrp
|
|
IF(contributed_bands(i,iegrp)) THEN
|
|
ncontributing = ncontributing + 1
|
|
END IF
|
|
END DO
|
|
IF(ncontributing.gt.max_contributors) THEN
|
|
max_contributors = ncontributing
|
|
END IF
|
|
END DO
|
|
!
|
|
! create a list of the roots for each band
|
|
!
|
|
DO i=1, m
|
|
DO iegrp=1, negrp
|
|
IF(iexx_iend(iegrp).ge.i) THEN
|
|
band_roots(i) = iegrp - 1
|
|
exit
|
|
END IF
|
|
END DO
|
|
END DO
|
|
!
|
|
END SUBROUTINE init_index_over_band
|
|
!
|
|
END MODULE mp_exx
|
|
!
|
|
!
|