- task group number added in fft_type_set (called by data_structure) so that, when task_groups are active, the plane distribution is more even and differs by at most a single plane between different task_groups. minor cleanup of indices in scatter_mod

- no point in not performing task_group parallelization when m < ntask_groups.
- some more timing reports if verbosity > 0



git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@12963 c92efa57-630b-4861-b058-cf58834340f0
degironc 2016-09-15 07:41:30 +00:00
parent 441f121b0c
commit f7cd893d9f
10 changed files with 72 additions and 64 deletions

View File

@ -190,12 +190,13 @@ CONTAINS
!=----------------------------------------------------------------------------=!
SUBROUTINE fft_type_set( desc, tk, lpara, nst, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw )
SUBROUTINE fft_type_set( desc, tk, lpara, ntg, nst, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw )
TYPE (fft_type_descriptor) :: desc
LOGICAL, INTENT(in) :: tk ! gamma/not-gamma logical
LOGICAL, INTENT(in) :: lpara ! set array for parallel or serial FFT drivers
INTEGER, INTENT(in) :: ntg ! number of task groups (optimal spacing for residual-plane distribution)
INTEGER, INTENT(in) :: nst ! total number of sticks
INTEGER, INTENT(in) :: ub(3), lb(3) ! upper and lower bound of real space indices
INTEGER, INTENT(in) :: idx(:) ! sorting index of the sticks
@ -209,7 +210,7 @@ CONTAINS
INTEGER, INTENT(in) :: stw( lb(1) : ub(1), lb(2) : ub(2) ) ! stick owner of a given wave stick
INTEGER :: npp( desc%nproc ), n3( desc%nproc ), nsp( desc%nproc )
INTEGER :: np, nq, i, is, iss, i1, i2, m1, m2, n1, n2, ip
INTEGER :: np, nq, i, is, iss, itg, i1, i2, m1, m2, n1, n2, ip
INTEGER :: ncpx, nppx
INTEGER :: nr1, nr2, nr3 ! size of real space grid
INTEGER :: nr1x, nr2x, nr3x ! padded size of real space grid
@ -246,15 +247,20 @@ CONTAINS
! in other words, do a slab partition along the z axis
npp = 0
IF ( desc%nproc == 1 ) THEN ! single processor: npp(1)=nr3
IF ( desc%nproc == 1 ) THEN ! single processor: npp(1)=nr3
npp(1) = nr3
ELSE
np = nr3 / desc%nproc
nq = nr3 - np * desc%nproc
DO i = 1, desc%nproc
npp(i) = np
IF ( i <= nq ) npp(i) = np + 1
ENDDO
npp(1:desc%nproc) = np ! assign a base value to all processors
reminder_loop : & ! distribute the extra planes over processors spaced by ntg
DO itg = 1, ntg
DO i = itg, desc%nproc, ntg
IF (nq==0) EXIT reminder_loop
nq = nq - 1
npp(i) = np + 1
ENDDO
ENDDO reminder_loop
END IF
!-- npp(1:nproc) is the number of planes per processor
@ -474,7 +480,7 @@ CONTAINS
END SUBROUTINE fft_type_set
!=----------------------------------------------------------------------------=!
SUBROUTINE fft_type_init( dfft, smap, pers, lgamma, lpara, comm, at, bg, gcut_in, dual_in )
SUBROUTINE fft_type_init( dfft, smap, pers, lgamma, lpara, comm, at, bg, gcut_in, dual_in, ntask_groups )
USE stick_base
@ -488,6 +494,7 @@ CONTAINS
REAL(DP), INTENT(IN) :: bg(3,3)
REAL(DP), INTENT(IN) :: at(3,3)
REAL(DP), OPTIONAL, INTENT(IN) :: dual_in
INTEGER, OPTIONAL, INTENT(IN) :: ntask_groups
!
! Potential or dual
!
@ -514,11 +521,15 @@ CONTAINS
! ... sticks length for processor ip = number of G-vectors owned by the processor ip
INTEGER :: nstw
! ... nstw local number of sticks (wave functions)
INTEGER :: ntg
! ... ntg number of task groups (assigned from input if any)
REAL(DP) :: gcut, gkcut, dual
INTEGER :: ngm, ngw
ntg = 1
IF( PRESENT( ntask_groups ) ) ntg = ntask_groups
dual = fft_dual
IF( PRESENT( dual_in ) ) dual = dual_in
@ -555,7 +566,7 @@ CONTAINS
CALL get_sticks( smap, gkcut, nstpw, sstpw, stw, nstw, ngw )
CALL get_sticks( smap, gcut, nstp, sstp, st, nst, ngm )
CALL fft_type_set( dfft, .not.smap%lgamma, lpara, nst, smap%ub, smap%lb, smap%idx, &
CALL fft_type_set( dfft, .not.smap%lgamma, lpara, ntg, nst, smap%ub, smap%lb, smap%idx, &
smap%ist(:,1), smap%ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
DEALLOCATE( st )

View File

@ -94,7 +94,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
#if defined(__MPI)
INTEGER :: dest, from, k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
INTEGER :: k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz
!
LOGICAL :: use_tg_
@ -144,7 +144,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
!
! step one: store contiguously the slices
!
offset = 1
offset = 0
DO proc = 1, nprocp
IF( use_tg_ ) THEN
@ -153,11 +153,8 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
gproc = proc
ENDIF
!
from = offset
dest = 1 + ( proc - 1 ) * sendsiz
!
kdest = dest - 1
kfrom = from - 1
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
!
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
@ -340,19 +337,17 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
!
!! f_in = 0.0_DP
!
offset = 1
offset = 0
DO proc = 1, nprocp
from = offset
IF( use_tg_ ) THEN
gproc = dtgs%nplist(proc)+1
ELSE
gproc = proc
ENDIF
dest = 1 + ( proc - 1 ) * sendsiz
!
kdest = dest - 1
kfrom = from - 1
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
!
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
@ -424,7 +419,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
#if defined(__MPI)
INTEGER :: dest, from, k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
INTEGER :: k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz, ijp
INTEGER :: sh(dfft%nproc), rh(dfft%nproc)
INTEGER :: istat( MPI_STATUS_SIZE )
@ -473,13 +468,13 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
!
! step one: store contiguously the slices
!
offset = 1
offset = 0
IF( use_tg_ ) THEN
DO proc = 1, nprocp
gproc = dtgs%nplist(proc)+1
kdest = ( proc - 1 ) * sendsiz
kfrom = offset - 1
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
f_aux ( kdest + i ) = f_in ( kfrom + i )
@ -495,7 +490,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
ELSE
DO proc = 1, nprocp
kdest = ( proc - 1 ) * sendsiz
kfrom = offset - 1
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( proc )
f_aux ( kdest + i ) = f_in ( kfrom + i )
@ -703,13 +698,13 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
call mpi_waitall( nblk, rh, MPI_STATUSES_IGNORE, ierr )
call mpi_waitall( nblk, sh, MPI_STATUSES_IGNORE, ierr )
!
offset = 1
offset = 0
IF( use_tg_ ) THEN
DO proc = 1, nprocp
gproc = dtgs%nplist(proc) + 1
kdest = ( proc - 1 ) * sendsiz
kfrom = offset - 1
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
f_in ( kfrom + i ) = f_aux ( kdest + i )
@ -722,7 +717,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
ELSE
DO proc = 1, nprocp
kdest = ( proc - 1 ) * sendsiz
kfrom = offset - 1
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( proc )
f_in ( kfrom + i ) = f_aux ( kdest + i )

View File

@ -146,6 +146,13 @@ SUBROUTINE task_groups_init( dffts, dtgs, nogrp )
dtgs%tg_npp(1) = num_planes
#endif
#ifdef __TASK_PRINTOUT
write (6,*) 'TASK GROUP stick AND plane DISTRIBUTION '
do i=1,dtgs%nproc
write (6,*) i, dtgs%tg_nsw(i), dtgs%tg_npp(i)
end do
#endif
ALLOCATE( dtgs%tg_snd( dtgs%nogrp ) )
ALLOCATE( dtgs%tg_rcv( dtgs%nogrp ) )
ALLOCATE( dtgs%tg_psdsp( dtgs%nogrp ) )

View File

@ -63,7 +63,7 @@ SUBROUTINE data_structure( gamma_only )
!
! ... set up fft descriptors, including parallel stuff: sticks, planes, etc.
!
CALL fft_type_init( dffts, smap, "wave", gamma_only, lpara, intra_bgrp_comm, at, bg, gkcut, gcutms/gkcut )
CALL fft_type_init( dffts, smap, "wave", gamma_only, lpara, intra_bgrp_comm, at, bg, gkcut, gcutms/gkcut, ntask_groups )
CALL fft_type_init( dfftp, smap, "rho", gamma_only, lpara, intra_bgrp_comm, at, bg, gcutm )
CALL task_groups_init( dffts, dtgs, ntask_groups )
CALL fft_base_info( ionode, stdout )

View File

@ -126,7 +126,7 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
IF ( dtgs%have_task_groups .AND. ( m >= dtgs%nogrp )) then
IF ( dtgs%have_task_groups ) then
incr = 2 * dtgs%nogrp
ELSE
incr = 2
@ -163,7 +163,7 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
IF ( dtgs%have_task_groups .AND. ( m >= dtgs%nogrp )) then
IF ( dtgs%have_task_groups ) then
incr = dtgs%nogrp
ELSE
incr = 1

View File

@ -34,6 +34,10 @@ SUBROUTINE print_clock_pw()
!
WRITE( stdout, '(/5x,"Called by init_run:")' )
CALL print_clock( 'wfcinit' )
IF ( iverbosity > 0 ) THEN
CALL print_clock( 'wfcinit:atomic' )
CALL print_clock( 'wfcinit:wfcrot' )
END IF
CALL print_clock( 'potinit' )
CALL print_clock( 'realus' )
IF ( iverbosity > 0 ) THEN

View File

@ -1369,7 +1369,7 @@ MODULE realus
!
CALL start_clock( 'calbec_rs' )
!
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
CALL errore( 'calbec_rs_gamma', 'task_groups not implemented', 1 )
@ -1478,7 +1478,7 @@ MODULE realus
!
CALL start_clock( 'calbec_rs' )
!
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) CALL errore( 'calbec_rs_k', 'task_groups not implemented', 1 )
IF( dtgs%have_task_groups ) CALL errore( 'calbec_rs_k', 'task_groups not implemented', 1 )
call set_xkphase(current_k)
@ -1556,7 +1556,7 @@ MODULE realus
!
CALL start_clock( 's_psir' )
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) CALL errore( 's_psir_gamma', 'task_groups not implemented', 1 )
IF( dtgs%have_task_groups ) CALL errore( 's_psir_gamma', 'task_groups not implemented', 1 )
!
fac = sqrt(omega)
@ -1641,7 +1641,7 @@ MODULE realus
CALL start_clock( 's_psir' )
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) CALL errore( 's_psir_k', 'task_groups not implemented', 1 )
IF( dtgs%have_task_groups ) CALL errore( 's_psir_k', 'task_groups not implemented', 1 )
call set_xkphase(current_k)
@ -1729,7 +1729,7 @@ MODULE realus
!
CALL start_clock( 'add_vuspsir' )
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
CALL errore( 'add_vuspsir_gamma', 'task_groups not implemented', 1 )
@ -1833,7 +1833,7 @@ MODULE realus
!
CALL start_clock( 'add_vuspsir' )
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) CALL errore( 'add_vuspsir_k', 'task_groups not implemented', 1 )
IF( dtgs%have_task_groups ) CALL errore( 'add_vuspsir_k', 'task_groups not implemented', 1 )
call set_xkphase(current_k)
!
@ -1930,10 +1930,7 @@ MODULE realus
!print *, "->Real space"
CALL start_clock( 'invfft_orbital' )
!
! The following is dirty trick to prevent usage of task groups if
! the number of bands is smaller than the number of task groups
!
IF( dtgs%have_task_groups .and. ( last >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
tg_psic = (0.d0, 0.d0)
@ -2047,7 +2044,7 @@ MODULE realus
!print *, "->fourier space"
CALL start_clock( 'fwfft_orbital' )
!New task_groups versions
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
CALL fwfft ('Wave', tg_psic, dffts, dtgs )
!
@ -2151,7 +2148,7 @@ MODULE realus
ik_ = current_k ; if (present(ik)) ik_ = ik
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
tg_psic = ( 0.D0, 0.D0 )
ioff = 0
@ -2236,7 +2233,7 @@ MODULE realus
ik_ = current_k ; if (present(ik)) ik_ = ik
IF( ( dtgs%have_task_groups ) .and. ( last >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
CALL fwfft ('Wave', tg_psic, dffts, dtgs)
!
@ -2301,7 +2298,7 @@ MODULE realus
REAL(DP), ALLOCATABLE :: tg_v(:)
CALL start_clock( 'v_loc_psir' )
IF( dtgs%have_task_groups .and. last >= dtgs%nogrp ) THEN
IF( dtgs%have_task_groups ) THEN
IF (ibnd == 1 ) THEN
CALL tg_gather( dffts, dtgs, vrs(:,current_spin), tg_v )
!if ibnd==1 this is a new calculation, and tg_v should be distributed.
@ -2350,7 +2347,7 @@ MODULE realus
REAL(DP), ALLOCATABLE :: tg_v(:)
CALL start_clock( 'v_loc_psir' )
IF( dtgs%have_task_groups .and. last >= dtgs%nogrp ) THEN
IF( dtgs%have_task_groups ) THEN
IF (ibnd == 1 ) THEN
CALL tg_gather( dffts, dtgs, vrs(:,current_spin), tg_v )
!if ibnd==1 this is a new calculation, and tg_v should be distributed.

View File

@ -254,7 +254,7 @@ SUBROUTINE sum_band()
!
incr = 2
!
IF( dtgs%have_task_groups .AND. ( this_bgrp_nbnd >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
IF( dft_is_meta() .OR. lxdm) &
CALL errore( ' sum_band ', ' task groups with meta dft, not yet implemented ', 1 )
@ -270,7 +270,7 @@ SUBROUTINE sum_band()
!
k_loop: DO ik = 1, nks
!
IF( dtgs%have_task_groups .AND. ( this_bgrp_nbnd >= dtgs%nogrp ) ) tg_rho = 0.0_DP
IF( dtgs%have_task_groups ) tg_rho = 0.0_DP
IF ( lsda ) current_spin = isk(ik)
!
npw = ngk(ik)
@ -295,7 +295,7 @@ SUBROUTINE sum_band()
!
DO ibnd = ibnd_start, ibnd_end, incr
!
IF( dtgs%have_task_groups .AND. ( this_bgrp_nbnd >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
tg_psi(:) = ( 0.D0, 0.D0 )
ioff = 0
@ -436,7 +436,7 @@ SUBROUTINE sum_band()
!
END DO
!
IF( dtgs%have_task_groups .AND. ( this_bgrp_nbnd >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
!
! reduce the group charge
!
@ -466,7 +466,7 @@ SUBROUTINE sum_band()
!
IF( okvan .AND. becp%comm /= mp_get_comm_null() ) CALL mp_sum( becsum, becp%comm )
!
IF( dtgs%have_task_groups .AND. ( this_bgrp_nbnd >= dtgs%nogrp ) ) THEN
IF( dtgs%have_task_groups ) THEN
DEALLOCATE( tg_psi )
DEALLOCATE( tg_rho )
END IF
@ -502,7 +502,6 @@ SUBROUTINE sum_band()
! ... of the wavefunctions to the charge
!
use_tg = ( dtgs%have_task_groups ) .AND. &
( this_bgrp_nbnd >= dtgs%nogrp ) .AND. &
( .NOT. (dft_is_meta() .OR. lxdm) )
!
incr = 1

View File

@ -39,10 +39,7 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi)
!
incr = 2
!
! The following is dirty trick to prevent usage of task groups if
! the number of bands is smaller than the number of task groups
!
use_tg = dtgs%have_task_groups .and. ( m >= dtgs%nogrp )
use_tg = dtgs%have_task_groups
!
IF( use_tg ) THEN
!
@ -221,10 +218,7 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi)
COMPLEX(DP), ALLOCATABLE :: tg_psic(:)
INTEGER :: v_siz, idx, ioff
!
! The following is dirty trick to prevent usage of task groups if
! the number of bands is smaller than the number of task groups
!
use_tg = dtgs%have_task_groups .and. ( m >= dtgs%nogrp )
use_tg = dtgs%have_task_groups
!
incr = 1
!
@ -380,10 +374,7 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi)
!
incr = 1
!
! The following is dirty trick to prevent usage of task groups if
! the number of bands is smaller than the number of task groups
!
use_tg = dtgs%have_task_groups .and. ( m >= dtgs%nogrp )
use_tg = dtgs%have_task_groups
!
IF( use_tg ) THEN
v_siz = dtgs%tg_nnr * dtgs%nogrp

View File

@ -246,7 +246,9 @@ SUBROUTINE init_wfc ( ik )
!
IF ( starting_wfc(1:6) == 'atomic' ) THEN
!
CALL start_clock( 'wfcinit:atomic' )
CALL atomic_wfc( ik, wfcatom )
CALL stop_clock( 'wfcinit:atomic' )
!
IF ( starting_wfc == 'atomic+random' .AND. &
n_starting_wfc == n_starting_atomic_wfc ) THEN
@ -325,8 +327,10 @@ SUBROUTINE init_wfc ( ik )
!
! ... subspace diagonalization (calls Hpsi)
!
CALL start_clock( 'wfcinit:wfcrot' )
CALL rotate_wfc ( npwx, ngk(ik), n_starting_wfc, gstart, &
nbnd, wfcatom, npol, okvan, evc, etatom )
CALL stop_clock( 'wfcinit:wfcrot' )
!
lelfield = lelfield_save
!