- cleanup, information on processor groups moved from task_group to

mp_global, in order to have a single place containing all info
  about the parallelization


git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@4567 c92efa57-630b-4861-b058-cf58834340f0
This commit is contained in:
cavazzon 2007-12-28 14:08:17 +00:00
parent 5424c6e82f
commit c465220073
6 changed files with 303 additions and 161 deletions

View File

@ -82,9 +82,8 @@ subroutine fft_scatter ( f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign, use_tg )
USE parallel_include USE parallel_include
#endif #endif
use mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, & use mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, &
my_image_id, nogrp, pgrp_comm my_image_id, nogrp, pgrp_comm, nplist
USE kinds, ONLY : DP USE kinds, ONLY : DP
USE task_groups, ONLY : nplist
implicit none implicit none
@ -478,9 +477,8 @@ subroutine fft_scatter ( f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign, use_tg )
USE parallel_include USE parallel_include
#endif #endif
use mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, & use mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, &
my_image_id, nogrp, pgrp_comm my_image_id, nogrp, pgrp_comm, nplist
USE kinds, ONLY : DP USE kinds, ONLY : DP
USE task_groups, ONLY : nplist
implicit none implicit none

View File

@ -19,7 +19,7 @@ CONTAINS
! Task groups driver ! Task groups driver
! !
!---------------------------------------------------------------------------- !----------------------------------------------------------------------------
SUBROUTINE tg_cft3s( f, dffts, sign ) SUBROUTINE tg_cft3s( f, dfft, isgn )
!---------------------------------------------------------------------------- !----------------------------------------------------------------------------
! !
!---------------------------------------------------------------------- !----------------------------------------------------------------------
@ -27,135 +27,103 @@ SUBROUTINE tg_cft3s( f, dffts, sign )
!Added: C. Bekas, Oct. 2005. Adopted from the CPMD code (A. Curioni) !Added: C. Bekas, Oct. 2005. Adopted from the CPMD code (A. Curioni)
!Revised by Carlo Cavazzoni 2007. !Revised by Carlo Cavazzoni 2007.
! !
! ... sign = +-1 : parallel 3d fft for rho and for the potential ! ... isgn = +-1 : parallel 3d fft for rho and for the potential
! NOT YET IMPLEMENTED WITH TASK GROUPS ! NOT YET IMPLEMENTED WITH TASK GROUPS
! ... sign = +-2 : parallel 3d fft for wavefunctions ! ... isgn = +-2 : parallel 3d fft for wavefunctions
! !
! ... sign = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R) ! ... isgn = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
! ... fft along z using pencils (cft_1z) ! ... fft along z using pencils (cft_1z)
! ... transpose across nodes (fft_scatter) ! ... transpose across nodes (fft_scatter)
! ... and reorder ! ... and reorder
! ... fft along y (using planes) and x (cft_2xy) ! ... fft along y (using planes) and x (cft_2xy)
! ... sign = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega ! ... isgn = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
! ... fft along x and y(using planes) (cft_2xy) ! ... fft along x and y(using planes) (cft_2xy)
! ... transpose across nodes (fft_scatter) ! ... transpose across nodes (fft_scatter)
! ... and reorder ! ... and reorder
! ... fft along z using pencils (cft_1z) ! ... fft along z using pencils (cft_1z)
! !
! ... The array "planes" signals whether a fft is needed along y : ! ... The array "dfft%iplw" signals whether a fft is needed along y :
! ... planes(i)=0 : column f(i,*,*) empty , don't do fft along y ! ... dfft%iplw(i)=0 : column f(i,*,*) empty , don't do fft along y
! ... planes(i)=1 : column f(i,*,*) filled, fft along y needed ! ... dfft%iplw(i)=1 : column f(i,*,*) filled, fft along y needed
! ... "empty" = no active components are present in f(i,*,*) ! ... "empty" = no active components are present in f(i,*,*)
! ... after (sign>0) or before (sign<0) the fft on z direction ! ... after (isgn>0) or before (isgn<0) the fft on z direction
! !
! ... Note that if sign=+/-1 (fft on rho and pot.) all fft's are needed ! ... Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed
! ... and all planes(i) are set to 1 ! ... and all planes(i) are set to 1
! !
USE fft_scalar, ONLY : cft_1z, cft_2xy USE fft_scalar, ONLY : cft_1z, cft_2xy
USE fft_base, ONLY : fft_scatter USE fft_base, ONLY : fft_scatter
USE kinds, ONLY : DP USE kinds, ONLY : DP
USE mp_global, only : me_pool, nproc_pool, ogrp_comm, npgrp, nogrp, & USE mp_global, only : me_pool, nproc_pool, ogrp_comm, npgrp, nogrp, &
intra_pool_comm intra_pool_comm, nolist, nplist
USE fft_types, ONLY : fft_dlay_descriptor USE fft_types, ONLY : fft_dlay_descriptor
USE task_groups
USE parallel_include USE parallel_include
! !
IMPLICIT NONE IMPLICIT NONE
! !
COMPLEX(DP), INTENT(INOUT) :: f( : ) COMPLEX(DP), INTENT(INOUT) :: f( : )
type (fft_dlay_descriptor), intent(in) :: dffts type (fft_dlay_descriptor), intent(in) :: dfft
INTEGER, INTENT(IN) :: sign INTEGER, INTENT(IN) :: isgn
! !
INTEGER :: mc, i, j, ii, iproc, k INTEGER :: mc, i, j, ii, iproc, k
INTEGER :: me_p INTEGER :: me_p
INTEGER :: n1, n2, n3, nx1, nx2, nx3 INTEGER :: n1, n2, n3, nx1, nx2, nx3
INTEGER :: idx, ierr LOGICAL :: tg
COMPLEX(DP), ALLOCATABLE :: yf(:), aux (:) COMPLEX(DP), ALLOCATABLE :: yf(:), aux (:)
COMPLEX(DP) :: stmp
INTEGER, DIMENSION(NOGRP) :: send_cnt, send_displ, recv_cnt, recv_displ
INTEGER, SAVE :: ib = 1
!
! !
CALL start_clock( 'cft3s' ) CALL start_clock( 'cft3s' )
! !
n1 = dffts%nr1 n1 = dfft%nr1
n2 = dffts%nr2 n2 = dfft%nr2
n3 = dffts%nr3 n3 = dfft%nr3
nx1 = dffts%nr1x nx1 = dfft%nr1x
nx2 = dffts%nr2x nx2 = dfft%nr2x
nx3 = dffts%nr3x nx3 = dfft%nr3x
! !
ALLOCATE( aux( (NOGRP+1)*strd ) ) ALLOCATE( aux( (NOGRP+1)*dfft%nnrx ) )
ALLOCATE( YF ( (NOGRP+1)*strd ) ) ALLOCATE( YF ( (NOGRP+1)*dfft%nnrx ) )
! !
me_p = me_pool + 1 me_p = me_pool + 1
! !
IF ( sign > 0 ) THEN IF ( isgn > 0 ) THEN
! !
IF ( sign /= 2 ) THEN IF ( isgn /= 2 ) THEN
! !
CALL errore( ' tg_cfft ', ' task groups are implemented only for waves ', 1 ) CALL errore( ' tg_cfft ', ' task groups are implemented only for waves ', 1 )
! !
ELSE ELSE
! !
CALL pack_group_sticks()
! !
send_cnt(1) = nx3 * dffts%nsw( me_p ) CALL cft_1z( yf, dfft%tg_nsw( me_p ), n3, nx3, isgn, aux )
IF( nx3 * dffts%nsw( me_p ) > strd ) THEN
CALL errore( ' tg_cfft ', ' inconsistent strd ', 1 )
END IF
send_displ(1) = 0
recv_cnt(1) = nx3 * dffts%nsw( nolist(1) + 1 )
recv_displ(1) = 0
DO idx = 2, nogrp
send_cnt(idx) = nx3 * dffts%nsw( me_p )
send_displ(idx) = send_displ(idx-1) + strd
recv_cnt(idx) = nx3 * dffts%nsw( nolist(idx) + 1 )
recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1)
ENDDO
IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( yf ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 1 )
END IF
IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( f ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 2 )
END IF
CALL start_clock( 'ALLTOALL' )
! !
! Collect all the sticks of the different states, !Transpose data for the 2-D FFT on the x-y plane
! in "yf" processors will have all the sticks of the OGRP
#if defined __MPI
CALL MPI_ALLTOALLV( f(1), send_cnt, send_displ, MPI_DOUBLE_COMPLEX, yf(1), recv_cnt, &
& recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR)
IF( ierr /= 0 ) THEN
CALL errore( ' tg_cfft ', ' alltoall error 1 ', ABS(ierr) )
END IF
#endif
! !
CALL cft_1z( yf, tmp_nsw( me_p ), n3, nx3, sign, aux ) !NOGRP*dfft%nnr: The length of aux and f
!nr3x: The length of each Z-stick
!aux: input - output
!f: working space
!isgn: type of scatter
!dfft%nsw(me) holds the number of Z-sticks proc. me has.
!dfft%npp: number of planes per processor
! !
CALL fft_scatter( aux, nx3, (nogrp+1)*strd, f, tmp_nsw, tmp_npp, sign, use_tg = .TRUE. ) CALL fft_scatter( aux, nx3, (nogrp+1)*dfft%nnrx, f, dfft%tg_nsw, dfft%tg_npp, isgn, dfft%use_task_groups )
! !
f(:) = ( 0.D0 , 0.D0 ) f(:) = ( 0.D0 , 0.D0 )
ii = 0 ii = 0
! !
DO iproc = 1, nproc_pool DO iproc = 1, nproc_pool
! !
DO i = 1, dffts%nsw( iproc ) DO i = 1, dfft%nsw( iproc )
! !
mc = dffts%ismap( i + dffts%iss(iproc) ) mc = dfft%ismap( i + dfft%iss(iproc) )
! !
ii = ii + 1 ii = ii + 1
! !
DO j = 1, tmp_npp (me_p) DO j = 1, dfft%tg_npp (me_p)
! !
f(mc+(j-1)*nx1*nx2) = aux(j+(ii-1)*tmp_npp( me_p ) ) f(mc+(j-1)*nx1*nx2) = aux(j+(ii-1)* dfft%tg_npp( me_p ) )
! !
END DO END DO
! !
@ -165,32 +133,31 @@ SUBROUTINE tg_cft3s( f, dffts, sign )
! !
END IF END IF
! !
CALL cft_2xy( f, tmp_npp( me_p ), n1, n2, nx1, nx2, sign, dffts%iplw ) CALL cft_2xy( f, dfft%tg_npp( me_p ), n1, n2, nx1, nx2, isgn, dfft%iplw )
! !
ELSE ELSE
! !
IF ( sign /= -2 ) THEN IF ( isgn /= -2 ) THEN
! !
CALL errore( ' tg_cfft ', ' task groups are implemented only for waves ', 1 ) CALL errore( ' tg_cfft ', ' task groups are implemented only for waves ', 1 )
! !
ELSE ELSE
! !
CALL cft_2xy( f, tmp_npp(me_p), n1, n2, nx1, nx2, sign, dffts%iplw ) CALL cft_2xy( f, dfft%tg_npp(me_p), n1, n2, nx1, nx2, isgn, dfft%iplw )
! !
ii = 0 ii = 0
! !
DO iproc = 1, nproc_pool DO iproc = 1, nproc_pool
! !
DO i = 1, dffts%nsw(iproc) DO i = 1, dfft%nsw(iproc)
! !
mc = dffts%ismap( i + dffts%iss(iproc) ) mc = dfft%ismap( i + dfft%iss(iproc) )
! !
ii = ii + 1 ii = ii + 1
! !
DO j = 1, tmp_npp(me_p) DO j = 1, dfft%tg_npp(me_p)
! !
aux(j+(ii-1)*tmp_npp(me_p)) = f( mc + (j-1)*nx1*nx2) aux(j+(ii-1)* dfft%tg_npp(me_p)) = f( mc + (j-1)*nx1*nx2)
! !
END DO END DO
! !
@ -198,42 +165,11 @@ SUBROUTINE tg_cft3s( f, dffts, sign )
! !
END DO END DO
! !
CALL fft_scatter( aux, nx3, (NOGRP+1)*strd, f, tmp_nsw, tmp_npp, sign, use_tg = .TRUE. ) CALL fft_scatter( aux, nx3, (NOGRP+1)*dfft%nnrx, f, dfft%tg_nsw, dfft%tg_npp, isgn, dfft%use_task_groups )
! !
CALL cft_1z( aux, tmp_nsw(me_p), n3, nx3, sign, yf ) CALL cft_1z( aux, dfft%tg_nsw(me_p), n3, nx3, isgn, yf )
!
! Bring pencils back to their original distribution
!
send_cnt (1) = nx3 * dffts%nsw( nolist(1) + 1 )
send_displ(1) = 0
recv_cnt (1) = nx3 * dffts%nsw( me_p )
recv_displ(1) = 0
DO idx = 2, NOGRP
send_cnt (idx) = nx3 * dffts%nsw( nolist(idx) + 1 )
send_displ(idx) = send_displ(idx-1) + send_cnt(idx-1)
recv_cnt (idx) = nx3 * dffts%nsw( me_p )
recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1)
ENDDO
IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( f ) ) THEN CALL unpack_group_sticks()
CALL errore( ' tg_cfft ', ' inconsistent size ', 3 )
END IF
IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( yf ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 4 )
END IF
CALL start_clock( 'ALLTOALL' )
#if defined __MPI
CALL MPI_Alltoallv( yf(1), &
send_cnt, send_displ, MPI_DOUBLE_COMPLEX, f(1), &
recv_cnt, recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR)
IF( ierr /= 0 ) THEN
CALL errore( ' tg_cfft ', ' alltoall error 2 ', ABS(ierr) )
END IF
#endif
CALL stop_clock( 'ALLTOALL' )
END IF END IF
! !
@ -245,6 +181,199 @@ SUBROUTINE tg_cft3s( f, dffts, sign )
CALL stop_clock( 'cft3s' ) CALL stop_clock( 'cft3s' )
! !
RETURN RETURN
!
CONTAINS
!
SUBROUTINE pack_group_sticks()
INTEGER :: idx, ierr
INTEGER, DIMENSION(nogrp+1) :: send_cnt, send_displ, recv_cnt, recv_displ
!
send_cnt(1) = nx3 * dfft%nsw( me_p )
IF( nx3 * dfft%nsw( me_p ) > dfft%nnrx ) THEN
CALL errore( ' tg_cfft ', ' inconsistent dfft%nnrx ', 1 )
END IF
send_displ(1) = 0
recv_cnt(1) = nx3 * dfft%nsw( nolist(1) + 1 )
recv_displ(1) = 0
DO idx = 2, nogrp
send_cnt(idx) = nx3 * dfft%nsw( me_p )
send_displ(idx) = send_displ(idx-1) + dfft%nnrx
recv_cnt(idx) = nx3 * dfft%nsw( nolist(idx) + 1 )
recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1)
ENDDO
IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( yf ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 1 )
END IF
IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( f ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 2 )
END IF
CALL start_clock( 'ALLTOALL' )
!
! Collect all the sticks of the different states,
! in "yf" processors will have all the sticks of the OGRP
#if defined __MPI
CALL MPI_ALLTOALLV( f(1), send_cnt, send_displ, MPI_DOUBLE_COMPLEX, yf(1), recv_cnt, &
& recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR)
IF( ierr /= 0 ) THEN
CALL errore( ' tg_cfft ', ' alltoall error 1 ', ABS(ierr) )
END IF
#endif
CALL stop_clock( 'ALLTOALL' )
!
!YF Contains all ( ~ NOGRP*dfft%nsw(me) ) Z-sticks
!
RETURN
END SUBROUTINE pack_group_sticks
SUBROUTINE unpack_group_sticks()
!
! Bring pencils back to their original distribution
!
INTEGER :: idx, ierr
INTEGER, DIMENSION(nogrp+1) :: send_cnt, send_displ, recv_cnt, recv_displ
send_cnt (1) = nx3 * dfft%nsw( nolist(1) + 1 )
send_displ(1) = 0
recv_cnt (1) = nx3 * dfft%nsw( me_p )
recv_displ(1) = 0
DO idx = 2, NOGRP
send_cnt (idx) = nx3 * dfft%nsw( nolist(idx) + 1 )
send_displ(idx) = send_displ(idx-1) + send_cnt(idx-1)
recv_cnt (idx) = nx3 * dfft%nsw( me_p )
recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1)
ENDDO
IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( f ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 3 )
END IF
IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( yf ) ) THEN
CALL errore( ' tg_cfft ', ' inconsistent size ', 4 )
END IF
CALL start_clock( 'ALLTOALL' )
#if defined __MPI
CALL MPI_Alltoallv( yf(1), &
send_cnt, send_displ, MPI_DOUBLE_COMPLEX, f(1), &
recv_cnt, recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR)
IF( ierr /= 0 ) THEN
CALL errore( ' tg_cfft ', ' alltoall error 2 ', ABS(ierr) )
END IF
#endif
CALL stop_clock( 'ALLTOALL' )
RETURN
END SUBROUTINE unpack_group_sticks
#ifdef PIPPO
SUBROUTINE fw_scatter( iopt )
!
use fft_base, only: fft_scatter
!
INTEGER, INTENT(IN) :: iopt
INTEGER :: nppx
!
!
IF( iopt == 2 ) THEN
!
if ( nproc_image == 1 ) then
nppx = dfft%nr3x
else
nppx = dfft%npp( me )
end if
call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsw, dfft%npp, iopt )
f(:) = (0.d0, 0.d0)
ii = 0
do proc = 1, nproc_image
do i = 1, dfft%nsw( proc )
mc = dfft%ismap( i + dfft%iss( proc ) )
ii = ii + 1
do j = 1, dfft%npp( me )
f( mc + ( j - 1 ) * dfft%nnp ) = aux( j + ( ii - 1 ) * nppx )
end do
end do
end do
!
ELSE IF( iopt == 1 ) THEN
!
if ( nproc_image == 1 ) then
nppx = dfft%nr3x
else
nppx = dfft%npp( me )
end if
call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsp, dfft%npp, iopt )
f(:) = (0.d0, 0.d0)
do i = 1, dfft%nst
mc = dfft%ismap( i )
do j = 1, dfft%npp( me )
f( mc + ( j - 1 ) * dfft%nnp ) = aux( j + ( i - 1 ) * nppx )
end do
end do
!
END IF
!
RETURN
END SUBROUTINE fw_scatter
SUBROUTINE bw_scatter( iopt )
!
use fft_base, only: fft_scatter
!
INTEGER, INTENT(IN) :: iopt
INTEGER :: nppx
!
!
IF( iopt == -2 ) THEN
!
if ( nproc_image == 1 ) then
nppx = dfft%nr3x
else
nppx = dfft%npp( me )
end if
ii = 0
do proc = 1, nproc_image
do i = 1, dfft%nsw( proc )
mc = dfft%ismap( i + dfft%iss( proc ) )
ii = ii + 1
do j = 1, dfft%npp( me )
aux( j + ( ii - 1 ) * nppx ) = f( mc + ( j - 1 ) * dfft%nnp )
end do
end do
end do
call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsw, dfft%npp, iopt )
!
ELSE IF( iopt == -1 ) THEN
!
if ( nproc_image == 1 ) then
nppx = dfft%nr3x
else
nppx = dfft%npp( me )
end if
do i = 1, dfft%nst
mc = dfft%ismap( i )
do j = 1, dfft%npp( me )
aux( j + ( i - 1 ) * nppx ) = f( mc + ( j - 1 ) * dfft%nnp )
end do
end do
call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsp, dfft%npp, iopt )
!
END IF
!
RETURN
END SUBROUTINE bw_scatter
#endif
! !
END SUBROUTINE tg_cft3s END SUBROUTINE tg_cft3s
! !

View File

@ -53,6 +53,13 @@ MODULE fft_types
INTEGER, POINTER :: imax3(:) ! the last local plane INTEGER, POINTER :: imax3(:) ! the last local plane
INTEGER, POINTER :: np3(:) ! number of local plane for the box fft INTEGER, POINTER :: np3(:) ! number of local plane for the box fft
! !
! task groups
!
LOGICAL :: use_task_groups
INTEGER :: nnrx ! maximum among nnr
INTEGER, POINTER :: tg_nsw(:) ! number of sticks per task group ( wave func )
INTEGER, POINTER :: tg_npp(:) ! number of "Z" planes per task group
!
END TYPE END TYPE
@ -79,6 +86,7 @@ CONTAINS
desc%nsp = 0 desc%nsp = 0
desc%nsw = 0 desc%nsw = 0
desc%ngl = 0 desc%ngl = 0
desc%nwl = 0
desc%npp = 0 desc%npp = 0
desc%ipp = 0 desc%ipp = 0
desc%iss = 0 desc%iss = 0
@ -86,8 +94,13 @@ CONTAINS
desc%ismap = 0 desc%ismap = 0
desc%iplp = 0 desc%iplp = 0
desc%iplw = 0 desc%iplw = 0
desc%id = 0 desc%id = 0
desc%use_task_groups = .FALSE.
NULLIFY( desc%tg_nsw )
NULLIFY( desc%tg_npp )
END SUBROUTINE fft_dlay_allocate END SUBROUTINE fft_dlay_allocate
@ -105,6 +118,11 @@ CONTAINS
IF ( ASSOCIATED( desc%iplp ) ) DEALLOCATE( desc%iplp ) IF ( ASSOCIATED( desc%iplp ) ) DEALLOCATE( desc%iplp )
IF ( ASSOCIATED( desc%iplw ) ) DEALLOCATE( desc%iplw ) IF ( ASSOCIATED( desc%iplw ) ) DEALLOCATE( desc%iplw )
desc%id = 0 desc%id = 0
IF( desc%use_task_groups ) THEN
IF ( ASSOCIATED( desc%tg_nsw ) ) DEALLOCATE( desc%tg_nsw )
IF ( ASSOCIATED( desc%tg_npp ) ) DEALLOCATE( desc%tg_npp )
END IF
desc%use_task_groups = .FALSE.
END SUBROUTINE fft_dlay_deallocate END SUBROUTINE fft_dlay_deallocate
!=----------------------------------------------------------------------------=! !=----------------------------------------------------------------------------=!
@ -124,6 +142,7 @@ CONTAINS
desc%npp = 0 desc%npp = 0
desc%ipp = 0 desc%ipp = 0
desc%np3 = 0 desc%np3 = 0
desc%use_task_groups = .FALSE.
END SUBROUTINE fft_box_allocate END SUBROUTINE fft_box_allocate
SUBROUTINE fft_box_deallocate( desc ) SUBROUTINE fft_box_deallocate( desc )
@ -134,6 +153,7 @@ CONTAINS
IF( ASSOCIATED( desc%npp ) ) DEALLOCATE( desc%npp ) IF( ASSOCIATED( desc%npp ) ) DEALLOCATE( desc%npp )
IF( ASSOCIATED( desc%ipp ) ) DEALLOCATE( desc%ipp ) IF( ASSOCIATED( desc%ipp ) ) DEALLOCATE( desc%ipp )
IF( ASSOCIATED( desc%np3 ) ) DEALLOCATE( desc%np3 ) IF( ASSOCIATED( desc%np3 ) ) DEALLOCATE( desc%np3 )
desc%use_task_groups = .FALSE.
END SUBROUTINE fft_box_deallocate END SUBROUTINE fft_box_deallocate
@ -181,6 +201,7 @@ CONTAINS
IF( ( SIZE( ncp ) < nproc ) .OR. ( SIZE( ngp ) < nproc ) ) & IF( ( SIZE( ncp ) < nproc ) .OR. ( SIZE( ngp ) < nproc ) ) &
CALL errore( ' fft_dlay_set ', ' wrong stick dimensions ', 4 ) CALL errore( ' fft_dlay_set ', ' wrong stick dimensions ', 4 )
desc%use_task_groups = .FALSE.
! Set the number of "xy" planes for each processor ! Set the number of "xy" planes for each processor
! in other word do a slab partition along the z axis ! in other word do a slab partition along the z axis
@ -240,11 +261,21 @@ CONTAINS
! Set fft local workspace dimension ! Set fft local workspace dimension
IF ( nproc == 1 ) THEN IF ( nproc == 1 ) THEN
desc%nnr = nr1x * nr2x * nr3x desc%nnr = nr1x * nr2x * nr3x
desc%nnrx = desc%nnr
ELSE ELSE
desc%nnr = MAX( nr3x * ncp(me), nr1x * nr2x * npp(me) ) desc%nnr = MAX( nr3x * ncp(me), nr1x * nr2x * npp(me) )
desc%nnr = MAX( 1, desc%nnr ) ! ensure that desc%nrr > 0 ( for extreme parallelism )
desc%nnrx = desc%nnr
DO i = 1, nproc
desc%nnrx = MAX( desc%nnrx, nr3x * ncp( i ) )
desc%nnrx = MAX( desc%nnrx, nr1x * nr2x * npp( i ) )
END DO
desc%nnrx = MAX( 1, desc%nnrx ) ! ensure that desc%nrr > 0 ( for extreme parallelism )
END IF END IF
desc%ngl( 1:nproc ) = ngp( 1:nproc ) desc%ngl( 1:nproc ) = ngp( 1:nproc )
desc%nwl( 1:nproc ) = ngpw( 1:nproc ) desc%nwl( 1:nproc ) = ngpw( 1:nproc )
@ -442,6 +473,7 @@ CONTAINS
END DO END DO
desc%use_task_groups = .FALSE.
END SUBROUTINE fft_box_set END SUBROUTINE fft_box_set
@ -492,6 +524,8 @@ CONTAINS
desc%nnp = nr1x * nr2x desc%nnp = nr1x * nr2x
desc%npp = nr3 desc%npp = nr3
desc%ipp = 0 desc%ipp = 0
desc%use_task_groups = .FALSE.
desc%nnrx = desc%nnr
RETURN RETURN
END SUBROUTINE fft_dlay_scalar END SUBROUTINE fft_dlay_scalar

View File

@ -57,14 +57,12 @@ fft_base.o : fft_types.o
fft_base.o : kind.o fft_base.o : kind.o
fft_base.o : mp_global.o fft_base.o : mp_global.o
fft_base.o : parallel_include.o fft_base.o : parallel_include.o
fft_base.o : task_groups.o
fft_parallel.o : fft_base.o fft_parallel.o : fft_base.o
fft_parallel.o : fft_scalar.o fft_parallel.o : fft_scalar.o
fft_parallel.o : fft_types.o fft_parallel.o : fft_types.o
fft_parallel.o : kind.o fft_parallel.o : kind.o
fft_parallel.o : mp_global.o fft_parallel.o : mp_global.o
fft_parallel.o : parallel_include.o fft_parallel.o : parallel_include.o
fft_parallel.o : task_groups.o
fft_scalar.o : kind.o fft_scalar.o : kind.o
fft_types.o : io_global.o fft_types.o : io_global.o

View File

@ -48,6 +48,8 @@ MODULE mp_global
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
INTEGER :: leg_ortho = 1 ! the distance in the father communicator INTEGER :: leg_ortho = 1 ! the distance in the father communicator
! of two neighbour processors in ortho_comm ! of two neighbour processors in ortho_comm
INTEGER, ALLOCATABLE :: nolist(:) ! list of processors in my orbital task group
INTEGER, ALLOCATABLE :: nplist(:) ! list of processors in my plane wave task group
! !
! ... communicators ! ... communicators
! !
@ -87,6 +89,10 @@ MODULE mp_global
inter_image_comm = group_i inter_image_comm = group_i
intra_image_comm = group_i intra_image_comm = group_i
ortho_comm = group_i ortho_comm = group_i
ALLOCATE( nolist( nproc_i ) )
ALLOCATE( nplist( nproc_i ) )
nolist = 0
nplist = 0
! !
RETURN RETURN
! !

View File

@ -17,27 +17,9 @@ MODULE task_groups
IMPLICIT NONE IMPLICIT NONE
SAVE SAVE
INTEGER, ALLOCATABLE :: nolist(:), nplist(:), pgroup(:)
INTEGER, ALLOCATABLE :: tmp_nsw(:), tmp_npp(:)
INTEGER :: strd
INTEGER :: nswx ! maximum number of stick per processor
CONTAINS CONTAINS
SUBROUTINE DEALLOCATE_GROUPS
IMPLICIT NONE
! ... Deallocate groups related arrays
IF (ALLOCATED(nolist)) DEALLOCATE(nolist)
IF (ALLOCATED(nplist)) DEALLOCATE(nplist)
IF (ALLOCATED(pgroup)) DEALLOCATE(pgroup)
IF (ALLOCATED(tmp_nsw)) DEALLOCATE(tmp_nsw)
IF (ALLOCATED(tmp_npp)) DEALLOCATE(tmp_npp)
END SUBROUTINE DEALLOCATE_GROUPS
!======================================================================================== !========================================================================================
! ADDED SUBROUTINEs FOR TASK GROUP PARALLIZATION ! ADDED SUBROUTINEs FOR TASK GROUP PARALLIZATION
@ -58,6 +40,7 @@ SUBROUTINE task_groups_init( dffts )
! !
USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm
USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm
USE mp_global, ONLY : nolist, nplist
USE mp, ONLY : mp_bcast USE mp, ONLY : mp_bcast
USE io_global, only : stdout USE io_global, only : stdout
USE fft_types, only : fft_dlay_descriptor USE fft_types, only : fft_dlay_descriptor
@ -68,17 +51,19 @@ SUBROUTINE task_groups_init( dffts )
IMPLICIT NONE IMPLICIT NONE
TYPE(fft_dlay_descriptor), INTENT(IN) :: dffts TYPE(fft_dlay_descriptor), INTENT(INOUT) :: dffts
!---------------------------------- !----------------------------------
!Local Variables declaration !Local Variables declaration
!---------------------------------- !----------------------------------
INTEGER :: MSGLEN, I, J, N1, IPOS, WORLD, NEWGROUP INTEGER :: MSGLEN, I, J, N1, IPOS, WORLD, NEWGROUP
INTEGER :: IERR INTEGER :: IERR
INTEGER :: itsk, ntsk, color, key INTEGER :: itsk, ntsk, color, key
INTEGER :: num_planes, num_sticks INTEGER :: num_planes, num_sticks
INTEGER, DIMENSION(:), ALLOCATABLE :: nnrsx_vec INTEGER :: nnrsx_vec ( nproc_pool )
INTEGER :: pgroup( nproc_pool )
INTEGER :: strd
! !
WRITE( stdout, 100 ) nogrp, npgrp WRITE( stdout, 100 ) nogrp, npgrp
@ -94,17 +79,11 @@ SUBROUTINE task_groups_init( dffts )
IF( MOD( nproc_pool, nogrp ) /= 0 ) & IF( MOD( nproc_pool, nogrp ) /= 0 ) &
CALL errore( " groups ", " nogrp should be a divisor of nproc_pool ", 1 ) CALL errore( " groups ", " nogrp should be a divisor of nproc_pool ", 1 )
ALLOCATE( pgroup( nproc_pool ) )
! !
DO i = 1, nproc_pool DO i = 1, nproc_pool
pgroup( i ) = i - 1 pgroup( i ) = i - 1
ENDDO ENDDO
! !
ALLOCATE( nplist( npgrp ) )
!
ALLOCATE( nolist( nogrp ) )
!
!-------------------------------------- !--------------------------------------
!LIST OF PROCESSORS IN MY ORBITAL GROUP !LIST OF PROCESSORS IN MY ORBITAL GROUP
!-------------------------------------- !--------------------------------------
@ -171,20 +150,16 @@ SUBROUTINE task_groups_init( dffts )
#endif #endif
ALLOCATE( nnrsx_vec( nproc_pool ) )
!Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space !Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space
#if defined __MPI #if defined __MPI
CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR) CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR)
strd = MAXVAL( nnrsx_vec( 1:nproc_pool ) ) strd = MAXVAL( nnrsx_vec( 1:nproc_pool ) )
nswx = MAXVAL( dffts%nsw( 1:nproc_pool ) )
#else #else
strd = dffts%nnr strd = dffts%nnr
nswx = dffts%nsw(1)
#endif #endif
DEALLOCATE( nnrsx_vec ) IF( strd /= dffts%nnrx ) CALL errore( ' task_groups_init ', ' inconsistent nnrx ', 1 )
!------------------------------------------------------------------------------------- !-------------------------------------------------------------------------------------
!C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE !C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE
@ -196,8 +171,8 @@ SUBROUTINE task_groups_init( dffts )
!we choose to do the latter one. !we choose to do the latter one.
!------------------------------------------------------------------------------------- !-------------------------------------------------------------------------------------
! !
ALLOCATE(tmp_nsw(nproc_pool)) ALLOCATE( dffts%tg_nsw(nproc_pool))
ALLOCATE(tmp_npp(nproc_pool)) ALLOCATE( dffts%tg_npp(nproc_pool))
num_sticks = 0 num_sticks = 0
num_planes = 0 num_planes = 0
@ -207,13 +182,15 @@ SUBROUTINE task_groups_init( dffts )
ENDDO ENDDO
#if defined __MPI #if defined __MPI
CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, tmp_nsw, 1, MPI_INTEGER, intra_pool_comm, IERR) CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, tmp_npp, 1, MPI_INTEGER, intra_pool_comm, IERR) CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
#else #else
tmp_nsw(1) = num_sticks dffts%tg_nsw(1) = num_sticks
tmp_npp(1) = num_planes dffts%tg_npp(1) = num_planes
#endif #endif
dffts%use_task_groups = .TRUE.
RETURN RETURN
END SUBROUTINE task_groups_init END SUBROUTINE task_groups_init