From c4652200730fdd8658ccc43c6e8d9f744e482360 Mon Sep 17 00:00:00 2001 From: cavazzon Date: Fri, 28 Dec 2007 14:08:17 +0000 Subject: [PATCH] - cleanup, information on processor groups moved from task_group to mp_global, in order to have a single place containing all info about the parallelization git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@4567 c92efa57-630b-4861-b058-cf58834340f0 --- Modules/fft_base.f90 | 6 +- Modules/fft_parallel.f90 | 355 ++++++++++++++++++++++++++------------- Modules/fft_types.f90 | 36 +++- Modules/make.depend | 2 - Modules/mp_global.f90 | 6 + Modules/task_groups.f90 | 59 ++----- 6 files changed, 303 insertions(+), 161 deletions(-) diff --git a/Modules/fft_base.f90 b/Modules/fft_base.f90 index b597f4ec4..70ab45f8c 100644 --- a/Modules/fft_base.f90 +++ b/Modules/fft_base.f90 @@ -82,9 +82,8 @@ subroutine fft_scatter ( f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign, use_tg ) USE parallel_include #endif use mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, & - my_image_id, nogrp, pgrp_comm + my_image_id, nogrp, pgrp_comm, nplist USE kinds, ONLY : DP - USE task_groups, ONLY : nplist implicit none @@ -478,9 +477,8 @@ subroutine fft_scatter ( f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign, use_tg ) USE parallel_include #endif use mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, & - my_image_id, nogrp, pgrp_comm + my_image_id, nogrp, pgrp_comm, nplist USE kinds, ONLY : DP - USE task_groups, ONLY : nplist implicit none diff --git a/Modules/fft_parallel.f90 b/Modules/fft_parallel.f90 index 3d062a7e0..bb0b37b3b 100644 --- a/Modules/fft_parallel.f90 +++ b/Modules/fft_parallel.f90 @@ -19,7 +19,7 @@ CONTAINS ! Task groups driver ! !---------------------------------------------------------------------------- -SUBROUTINE tg_cft3s( f, dffts, sign ) +SUBROUTINE tg_cft3s( f, dfft, isgn ) !---------------------------------------------------------------------------- ! !---------------------------------------------------------------------- @@ -27,135 +27,103 @@ SUBROUTINE tg_cft3s( f, dffts, sign ) !Added: C. Bekas, Oct. 2005. Adopted from the CPMD code (A. Curioni) !Revised by Carlo Cavazzoni 2007. ! - ! ... sign = +-1 : parallel 3d fft for rho and for the potential + ! ... isgn = +-1 : parallel 3d fft for rho and for the potential ! NOT YET IMPLEMENTED WITH TASK GROUPS - ! ... sign = +-2 : parallel 3d fft for wavefunctions + ! ... isgn = +-2 : parallel 3d fft for wavefunctions ! - ! ... sign = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R) + ! ... isgn = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R) ! ... fft along z using pencils (cft_1z) ! ... transpose across nodes (fft_scatter) ! ... and reorder ! ... fft along y (using planes) and x (cft_2xy) - ! ... sign = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega + ! ... isgn = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega ! ... fft along x and y(using planes) (cft_2xy) ! ... transpose across nodes (fft_scatter) ! ... and reorder ! ... fft along z using pencils (cft_1z) ! - ! ... The array "planes" signals whether a fft is needed along y : - ! ... planes(i)=0 : column f(i,*,*) empty , don't do fft along y - ! ... planes(i)=1 : column f(i,*,*) filled, fft along y needed + ! ... The array "dfft%iplw" signals whether a fft is needed along y : + ! ... dfft%iplw(i)=0 : column f(i,*,*) empty , don't do fft along y + ! ... dfft%iplw(i)=1 : column f(i,*,*) filled, fft along y needed ! ... "empty" = no active components are present in f(i,*,*) - ! ... after (sign>0) or before (sign<0) the fft on z direction + ! ... after (isgn>0) or before (isgn<0) the fft on z direction ! - ! ... Note that if sign=+/-1 (fft on rho and pot.) all fft's are needed + ! ... Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed ! ... and all planes(i) are set to 1 ! USE fft_scalar, ONLY : cft_1z, cft_2xy USE fft_base, ONLY : fft_scatter USE kinds, ONLY : DP USE mp_global, only : me_pool, nproc_pool, ogrp_comm, npgrp, nogrp, & - intra_pool_comm + intra_pool_comm, nolist, nplist USE fft_types, ONLY : fft_dlay_descriptor - USE task_groups USE parallel_include ! IMPLICIT NONE ! COMPLEX(DP), INTENT(INOUT) :: f( : ) - type (fft_dlay_descriptor), intent(in) :: dffts - INTEGER, INTENT(IN) :: sign + type (fft_dlay_descriptor), intent(in) :: dfft + INTEGER, INTENT(IN) :: isgn ! INTEGER :: mc, i, j, ii, iproc, k INTEGER :: me_p INTEGER :: n1, n2, n3, nx1, nx2, nx3 - INTEGER :: idx, ierr + LOGICAL :: tg COMPLEX(DP), ALLOCATABLE :: yf(:), aux (:) - COMPLEX(DP) :: stmp - INTEGER, DIMENSION(NOGRP) :: send_cnt, send_displ, recv_cnt, recv_displ - INTEGER, SAVE :: ib = 1 - ! ! CALL start_clock( 'cft3s' ) ! - n1 = dffts%nr1 - n2 = dffts%nr2 - n3 = dffts%nr3 - nx1 = dffts%nr1x - nx2 = dffts%nr2x - nx3 = dffts%nr3x + n1 = dfft%nr1 + n2 = dfft%nr2 + n3 = dfft%nr3 + nx1 = dfft%nr1x + nx2 = dfft%nr2x + nx3 = dfft%nr3x ! - ALLOCATE( aux( (NOGRP+1)*strd ) ) - ALLOCATE( YF ( (NOGRP+1)*strd ) ) + ALLOCATE( aux( (NOGRP+1)*dfft%nnrx ) ) + ALLOCATE( YF ( (NOGRP+1)*dfft%nnrx ) ) ! me_p = me_pool + 1 ! - IF ( sign > 0 ) THEN + IF ( isgn > 0 ) THEN ! - IF ( sign /= 2 ) THEN + IF ( isgn /= 2 ) THEN ! CALL errore( ' tg_cfft ', ' task groups are implemented only for waves ', 1 ) ! ELSE ! + CALL pack_group_sticks() ! - send_cnt(1) = nx3 * dffts%nsw( me_p ) - IF( nx3 * dffts%nsw( me_p ) > strd ) THEN - CALL errore( ' tg_cfft ', ' inconsistent strd ', 1 ) - END IF - send_displ(1) = 0 - recv_cnt(1) = nx3 * dffts%nsw( nolist(1) + 1 ) - recv_displ(1) = 0 - DO idx = 2, nogrp - send_cnt(idx) = nx3 * dffts%nsw( me_p ) - send_displ(idx) = send_displ(idx-1) + strd - recv_cnt(idx) = nx3 * dffts%nsw( nolist(idx) + 1 ) - recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1) - ENDDO - - IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( yf ) ) THEN - CALL errore( ' tg_cfft ', ' inconsistent size ', 1 ) - END IF - IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( f ) ) THEN - CALL errore( ' tg_cfft ', ' inconsistent size ', 2 ) - END IF - - CALL start_clock( 'ALLTOALL' ) + CALL cft_1z( yf, dfft%tg_nsw( me_p ), n3, nx3, isgn, aux ) ! - ! Collect all the sticks of the different states, - ! in "yf" processors will have all the sticks of the OGRP - -#if defined __MPI - - CALL MPI_ALLTOALLV( f(1), send_cnt, send_displ, MPI_DOUBLE_COMPLEX, yf(1), recv_cnt, & - & recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR) - IF( ierr /= 0 ) THEN - CALL errore( ' tg_cfft ', ' alltoall error 1 ', ABS(ierr) ) - END IF - -#endif - + !Transpose data for the 2-D FFT on the x-y plane ! - CALL cft_1z( yf, tmp_nsw( me_p ), n3, nx3, sign, aux ) - + !NOGRP*dfft%nnr: The length of aux and f + !nr3x: The length of each Z-stick + !aux: input - output + !f: working space + !isgn: type of scatter + !dfft%nsw(me) holds the number of Z-sticks proc. me has. + !dfft%npp: number of planes per processor ! - CALL fft_scatter( aux, nx3, (nogrp+1)*strd, f, tmp_nsw, tmp_npp, sign, use_tg = .TRUE. ) + CALL fft_scatter( aux, nx3, (nogrp+1)*dfft%nnrx, f, dfft%tg_nsw, dfft%tg_npp, isgn, dfft%use_task_groups ) ! f(:) = ( 0.D0 , 0.D0 ) ii = 0 ! DO iproc = 1, nproc_pool ! - DO i = 1, dffts%nsw( iproc ) + DO i = 1, dfft%nsw( iproc ) ! - mc = dffts%ismap( i + dffts%iss(iproc) ) + mc = dfft%ismap( i + dfft%iss(iproc) ) ! ii = ii + 1 ! - DO j = 1, tmp_npp (me_p) + DO j = 1, dfft%tg_npp (me_p) ! - f(mc+(j-1)*nx1*nx2) = aux(j+(ii-1)*tmp_npp( me_p ) ) + f(mc+(j-1)*nx1*nx2) = aux(j+(ii-1)* dfft%tg_npp( me_p ) ) ! END DO ! @@ -165,32 +133,31 @@ SUBROUTINE tg_cft3s( f, dffts, sign ) ! END IF ! - CALL cft_2xy( f, tmp_npp( me_p ), n1, n2, nx1, nx2, sign, dffts%iplw ) - + CALL cft_2xy( f, dfft%tg_npp( me_p ), n1, n2, nx1, nx2, isgn, dfft%iplw ) ! ELSE ! - IF ( sign /= -2 ) THEN + IF ( isgn /= -2 ) THEN ! CALL errore( ' tg_cfft ', ' task groups are implemented only for waves ', 1 ) ! ELSE ! - CALL cft_2xy( f, tmp_npp(me_p), n1, n2, nx1, nx2, sign, dffts%iplw ) + CALL cft_2xy( f, dfft%tg_npp(me_p), n1, n2, nx1, nx2, isgn, dfft%iplw ) ! ii = 0 ! DO iproc = 1, nproc_pool ! - DO i = 1, dffts%nsw(iproc) + DO i = 1, dfft%nsw(iproc) ! - mc = dffts%ismap( i + dffts%iss(iproc) ) + mc = dfft%ismap( i + dfft%iss(iproc) ) ! ii = ii + 1 ! - DO j = 1, tmp_npp(me_p) + DO j = 1, dfft%tg_npp(me_p) ! - aux(j+(ii-1)*tmp_npp(me_p)) = f( mc + (j-1)*nx1*nx2) + aux(j+(ii-1)* dfft%tg_npp(me_p)) = f( mc + (j-1)*nx1*nx2) ! END DO ! @@ -198,42 +165,11 @@ SUBROUTINE tg_cft3s( f, dffts, sign ) ! END DO ! - CALL fft_scatter( aux, nx3, (NOGRP+1)*strd, f, tmp_nsw, tmp_npp, sign, use_tg = .TRUE. ) + CALL fft_scatter( aux, nx3, (NOGRP+1)*dfft%nnrx, f, dfft%tg_nsw, dfft%tg_npp, isgn, dfft%use_task_groups ) ! - CALL cft_1z( aux, tmp_nsw(me_p), n3, nx3, sign, yf ) - ! - ! Bring pencils back to their original distribution - ! - send_cnt (1) = nx3 * dffts%nsw( nolist(1) + 1 ) - send_displ(1) = 0 - recv_cnt (1) = nx3 * dffts%nsw( me_p ) - recv_displ(1) = 0 - DO idx = 2, NOGRP - send_cnt (idx) = nx3 * dffts%nsw( nolist(idx) + 1 ) - send_displ(idx) = send_displ(idx-1) + send_cnt(idx-1) - recv_cnt (idx) = nx3 * dffts%nsw( me_p ) - recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1) - ENDDO + CALL cft_1z( aux, dfft%tg_nsw(me_p), n3, nx3, isgn, yf ) - IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( f ) ) THEN - CALL errore( ' tg_cfft ', ' inconsistent size ', 3 ) - END IF - IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( yf ) ) THEN - CALL errore( ' tg_cfft ', ' inconsistent size ', 4 ) - END IF - - CALL start_clock( 'ALLTOALL' ) - -#if defined __MPI - CALL MPI_Alltoallv( yf(1), & - send_cnt, send_displ, MPI_DOUBLE_COMPLEX, f(1), & - recv_cnt, recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR) - IF( ierr /= 0 ) THEN - CALL errore( ' tg_cfft ', ' alltoall error 2 ', ABS(ierr) ) - END IF -#endif - - CALL stop_clock( 'ALLTOALL' ) + CALL unpack_group_sticks() END IF ! @@ -245,6 +181,199 @@ SUBROUTINE tg_cft3s( f, dffts, sign ) CALL stop_clock( 'cft3s' ) ! RETURN + ! +CONTAINS + ! + + SUBROUTINE pack_group_sticks() + + INTEGER :: idx, ierr + INTEGER, DIMENSION(nogrp+1) :: send_cnt, send_displ, recv_cnt, recv_displ + ! + send_cnt(1) = nx3 * dfft%nsw( me_p ) + IF( nx3 * dfft%nsw( me_p ) > dfft%nnrx ) THEN + CALL errore( ' tg_cfft ', ' inconsistent dfft%nnrx ', 1 ) + END IF + send_displ(1) = 0 + recv_cnt(1) = nx3 * dfft%nsw( nolist(1) + 1 ) + recv_displ(1) = 0 + DO idx = 2, nogrp + send_cnt(idx) = nx3 * dfft%nsw( me_p ) + send_displ(idx) = send_displ(idx-1) + dfft%nnrx + recv_cnt(idx) = nx3 * dfft%nsw( nolist(idx) + 1 ) + recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1) + ENDDO + + IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( yf ) ) THEN + CALL errore( ' tg_cfft ', ' inconsistent size ', 1 ) + END IF + IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( f ) ) THEN + CALL errore( ' tg_cfft ', ' inconsistent size ', 2 ) + END IF + + CALL start_clock( 'ALLTOALL' ) + ! + ! Collect all the sticks of the different states, + ! in "yf" processors will have all the sticks of the OGRP + +#if defined __MPI + + CALL MPI_ALLTOALLV( f(1), send_cnt, send_displ, MPI_DOUBLE_COMPLEX, yf(1), recv_cnt, & + & recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR) + IF( ierr /= 0 ) THEN + CALL errore( ' tg_cfft ', ' alltoall error 1 ', ABS(ierr) ) + END IF + +#endif + + CALL stop_clock( 'ALLTOALL' ) + ! + !YF Contains all ( ~ NOGRP*dfft%nsw(me) ) Z-sticks + ! + RETURN + END SUBROUTINE pack_group_sticks + + SUBROUTINE unpack_group_sticks() + ! + ! Bring pencils back to their original distribution + ! + INTEGER :: idx, ierr + INTEGER, DIMENSION(nogrp+1) :: send_cnt, send_displ, recv_cnt, recv_displ + send_cnt (1) = nx3 * dfft%nsw( nolist(1) + 1 ) + send_displ(1) = 0 + recv_cnt (1) = nx3 * dfft%nsw( me_p ) + recv_displ(1) = 0 + DO idx = 2, NOGRP + send_cnt (idx) = nx3 * dfft%nsw( nolist(idx) + 1 ) + send_displ(idx) = send_displ(idx-1) + send_cnt(idx-1) + recv_cnt (idx) = nx3 * dfft%nsw( me_p ) + recv_displ(idx) = recv_displ(idx-1) + recv_cnt(idx-1) + ENDDO + + IF( recv_displ(nogrp) + recv_cnt(nogrp) > SIZE( f ) ) THEN + CALL errore( ' tg_cfft ', ' inconsistent size ', 3 ) + END IF + IF( send_displ(nogrp) + send_cnt(nogrp) > SIZE( yf ) ) THEN + CALL errore( ' tg_cfft ', ' inconsistent size ', 4 ) + END IF + + CALL start_clock( 'ALLTOALL' ) + +#if defined __MPI + CALL MPI_Alltoallv( yf(1), & + send_cnt, send_displ, MPI_DOUBLE_COMPLEX, f(1), & + recv_cnt, recv_displ, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR) + IF( ierr /= 0 ) THEN + CALL errore( ' tg_cfft ', ' alltoall error 2 ', ABS(ierr) ) + END IF +#endif + + CALL stop_clock( 'ALLTOALL' ) + + RETURN + END SUBROUTINE unpack_group_sticks + +#ifdef PIPPO + SUBROUTINE fw_scatter( iopt ) + ! + use fft_base, only: fft_scatter + ! + INTEGER, INTENT(IN) :: iopt + INTEGER :: nppx + ! + ! + IF( iopt == 2 ) THEN + ! + if ( nproc_image == 1 ) then + nppx = dfft%nr3x + else + nppx = dfft%npp( me ) + end if + call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsw, dfft%npp, iopt ) + f(:) = (0.d0, 0.d0) + ii = 0 + do proc = 1, nproc_image + do i = 1, dfft%nsw( proc ) + mc = dfft%ismap( i + dfft%iss( proc ) ) + ii = ii + 1 + do j = 1, dfft%npp( me ) + f( mc + ( j - 1 ) * dfft%nnp ) = aux( j + ( ii - 1 ) * nppx ) + end do + end do + end do + ! + ELSE IF( iopt == 1 ) THEN + ! + if ( nproc_image == 1 ) then + nppx = dfft%nr3x + else + nppx = dfft%npp( me ) + end if + call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsp, dfft%npp, iopt ) + f(:) = (0.d0, 0.d0) + do i = 1, dfft%nst + mc = dfft%ismap( i ) + do j = 1, dfft%npp( me ) + f( mc + ( j - 1 ) * dfft%nnp ) = aux( j + ( i - 1 ) * nppx ) + end do + end do + ! + END IF + ! + RETURN + END SUBROUTINE fw_scatter + + + + SUBROUTINE bw_scatter( iopt ) + ! + use fft_base, only: fft_scatter + ! + INTEGER, INTENT(IN) :: iopt + INTEGER :: nppx + ! + ! + IF( iopt == -2 ) THEN + ! + if ( nproc_image == 1 ) then + nppx = dfft%nr3x + else + nppx = dfft%npp( me ) + end if + ii = 0 + do proc = 1, nproc_image + do i = 1, dfft%nsw( proc ) + mc = dfft%ismap( i + dfft%iss( proc ) ) + ii = ii + 1 + do j = 1, dfft%npp( me ) + aux( j + ( ii - 1 ) * nppx ) = f( mc + ( j - 1 ) * dfft%nnp ) + end do + end do + end do + call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsw, dfft%npp, iopt ) + ! + ELSE IF( iopt == -1 ) THEN + ! + if ( nproc_image == 1 ) then + nppx = dfft%nr3x + else + nppx = dfft%npp( me ) + end if + do i = 1, dfft%nst + mc = dfft%ismap( i ) + do j = 1, dfft%npp( me ) + aux( j + ( i - 1 ) * nppx ) = f( mc + ( j - 1 ) * dfft%nnp ) + end do + end do + call fft_scatter( aux, nr3x, dfft%nnr, f, dfft%nsp, dfft%npp, iopt ) + ! + END IF + ! + RETURN + END SUBROUTINE bw_scatter + +#endif + ! END SUBROUTINE tg_cft3s ! diff --git a/Modules/fft_types.f90 b/Modules/fft_types.f90 index 599be8ebd..6b8b1f319 100644 --- a/Modules/fft_types.f90 +++ b/Modules/fft_types.f90 @@ -53,6 +53,13 @@ MODULE fft_types INTEGER, POINTER :: imax3(:) ! the last local plane INTEGER, POINTER :: np3(:) ! number of local plane for the box fft ! + ! task groups + ! + LOGICAL :: use_task_groups + INTEGER :: nnrx ! maximum among nnr + INTEGER, POINTER :: tg_nsw(:) ! number of sticks per task group ( wave func ) + INTEGER, POINTER :: tg_npp(:) ! number of "Z" planes per task group + ! END TYPE @@ -79,6 +86,7 @@ CONTAINS desc%nsp = 0 desc%nsw = 0 desc%ngl = 0 + desc%nwl = 0 desc%npp = 0 desc%ipp = 0 desc%iss = 0 @@ -86,8 +94,13 @@ CONTAINS desc%ismap = 0 desc%iplp = 0 desc%iplw = 0 + desc%id = 0 + desc%use_task_groups = .FALSE. + NULLIFY( desc%tg_nsw ) + NULLIFY( desc%tg_npp ) + END SUBROUTINE fft_dlay_allocate @@ -105,6 +118,11 @@ CONTAINS IF ( ASSOCIATED( desc%iplp ) ) DEALLOCATE( desc%iplp ) IF ( ASSOCIATED( desc%iplw ) ) DEALLOCATE( desc%iplw ) desc%id = 0 + IF( desc%use_task_groups ) THEN + IF ( ASSOCIATED( desc%tg_nsw ) ) DEALLOCATE( desc%tg_nsw ) + IF ( ASSOCIATED( desc%tg_npp ) ) DEALLOCATE( desc%tg_npp ) + END IF + desc%use_task_groups = .FALSE. END SUBROUTINE fft_dlay_deallocate !=----------------------------------------------------------------------------=! @@ -124,6 +142,7 @@ CONTAINS desc%npp = 0 desc%ipp = 0 desc%np3 = 0 + desc%use_task_groups = .FALSE. END SUBROUTINE fft_box_allocate SUBROUTINE fft_box_deallocate( desc ) @@ -134,6 +153,7 @@ CONTAINS IF( ASSOCIATED( desc%npp ) ) DEALLOCATE( desc%npp ) IF( ASSOCIATED( desc%ipp ) ) DEALLOCATE( desc%ipp ) IF( ASSOCIATED( desc%np3 ) ) DEALLOCATE( desc%np3 ) + desc%use_task_groups = .FALSE. END SUBROUTINE fft_box_deallocate @@ -181,6 +201,7 @@ CONTAINS IF( ( SIZE( ncp ) < nproc ) .OR. ( SIZE( ngp ) < nproc ) ) & CALL errore( ' fft_dlay_set ', ' wrong stick dimensions ', 4 ) + desc%use_task_groups = .FALSE. ! Set the number of "xy" planes for each processor ! in other word do a slab partition along the z axis @@ -240,11 +261,21 @@ CONTAINS ! Set fft local workspace dimension IF ( nproc == 1 ) THEN - desc%nnr = nr1x * nr2x * nr3x + desc%nnr = nr1x * nr2x * nr3x + desc%nnrx = desc%nnr ELSE desc%nnr = MAX( nr3x * ncp(me), nr1x * nr2x * npp(me) ) + desc%nnr = MAX( 1, desc%nnr ) ! ensure that desc%nrr > 0 ( for extreme parallelism ) + desc%nnrx = desc%nnr + DO i = 1, nproc + desc%nnrx = MAX( desc%nnrx, nr3x * ncp( i ) ) + desc%nnrx = MAX( desc%nnrx, nr1x * nr2x * npp( i ) ) + END DO + desc%nnrx = MAX( 1, desc%nnrx ) ! ensure that desc%nrr > 0 ( for extreme parallelism ) END IF + + desc%ngl( 1:nproc ) = ngp( 1:nproc ) desc%nwl( 1:nproc ) = ngpw( 1:nproc ) @@ -442,6 +473,7 @@ CONTAINS END DO + desc%use_task_groups = .FALSE. END SUBROUTINE fft_box_set @@ -492,6 +524,8 @@ CONTAINS desc%nnp = nr1x * nr2x desc%npp = nr3 desc%ipp = 0 + desc%use_task_groups = .FALSE. + desc%nnrx = desc%nnr RETURN END SUBROUTINE fft_dlay_scalar diff --git a/Modules/make.depend b/Modules/make.depend index 10ebcc427..d71587a6c 100644 --- a/Modules/make.depend +++ b/Modules/make.depend @@ -57,14 +57,12 @@ fft_base.o : fft_types.o fft_base.o : kind.o fft_base.o : mp_global.o fft_base.o : parallel_include.o -fft_base.o : task_groups.o fft_parallel.o : fft_base.o fft_parallel.o : fft_scalar.o fft_parallel.o : fft_types.o fft_parallel.o : kind.o fft_parallel.o : mp_global.o fft_parallel.o : parallel_include.o -fft_parallel.o : task_groups.o fft_scalar.o : kind.o fft_types.o : io_global.o diff --git a/Modules/mp_global.f90 b/Modules/mp_global.f90 index fee9e0445..9f041f575 100644 --- a/Modules/mp_global.f90 +++ b/Modules/mp_global.f90 @@ -48,6 +48,8 @@ MODULE mp_global INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho INTEGER :: leg_ortho = 1 ! the distance in the father communicator ! of two neighbour processors in ortho_comm + INTEGER, ALLOCATABLE :: nolist(:) ! list of processors in my orbital task group + INTEGER, ALLOCATABLE :: nplist(:) ! list of processors in my plane wave task group ! ! ... communicators ! @@ -87,6 +89,10 @@ MODULE mp_global inter_image_comm = group_i intra_image_comm = group_i ortho_comm = group_i + ALLOCATE( nolist( nproc_i ) ) + ALLOCATE( nplist( nproc_i ) ) + nolist = 0 + nplist = 0 ! RETURN ! diff --git a/Modules/task_groups.f90 b/Modules/task_groups.f90 index f70c3e5de..8fdf01870 100755 --- a/Modules/task_groups.f90 +++ b/Modules/task_groups.f90 @@ -17,27 +17,9 @@ MODULE task_groups IMPLICIT NONE SAVE - INTEGER, ALLOCATABLE :: nolist(:), nplist(:), pgroup(:) - INTEGER, ALLOCATABLE :: tmp_nsw(:), tmp_npp(:) - INTEGER :: strd - INTEGER :: nswx ! maximum number of stick per processor CONTAINS -SUBROUTINE DEALLOCATE_GROUPS - - IMPLICIT NONE - - ! ... Deallocate groups related arrays - - IF (ALLOCATED(nolist)) DEALLOCATE(nolist) - IF (ALLOCATED(nplist)) DEALLOCATE(nplist) - IF (ALLOCATED(pgroup)) DEALLOCATE(pgroup) - IF (ALLOCATED(tmp_nsw)) DEALLOCATE(tmp_nsw) - IF (ALLOCATED(tmp_npp)) DEALLOCATE(tmp_npp) - -END SUBROUTINE DEALLOCATE_GROUPS - !======================================================================================== ! ADDED SUBROUTINEs FOR TASK GROUP PARALLIZATION @@ -58,6 +40,7 @@ SUBROUTINE task_groups_init( dffts ) ! USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm + USE mp_global, ONLY : nolist, nplist USE mp, ONLY : mp_bcast USE io_global, only : stdout USE fft_types, only : fft_dlay_descriptor @@ -68,17 +51,19 @@ SUBROUTINE task_groups_init( dffts ) IMPLICIT NONE - TYPE(fft_dlay_descriptor), INTENT(IN) :: dffts + TYPE(fft_dlay_descriptor), INTENT(INOUT) :: dffts !---------------------------------- !Local Variables declaration !---------------------------------- - INTEGER :: MSGLEN, I, J, N1, IPOS, WORLD, NEWGROUP - INTEGER :: IERR - INTEGER :: itsk, ntsk, color, key - INTEGER :: num_planes, num_sticks - INTEGER, DIMENSION(:), ALLOCATABLE :: nnrsx_vec + INTEGER :: MSGLEN, I, J, N1, IPOS, WORLD, NEWGROUP + INTEGER :: IERR + INTEGER :: itsk, ntsk, color, key + INTEGER :: num_planes, num_sticks + INTEGER :: nnrsx_vec ( nproc_pool ) + INTEGER :: pgroup( nproc_pool ) + INTEGER :: strd ! WRITE( stdout, 100 ) nogrp, npgrp @@ -94,17 +79,11 @@ SUBROUTINE task_groups_init( dffts ) IF( MOD( nproc_pool, nogrp ) /= 0 ) & CALL errore( " groups ", " nogrp should be a divisor of nproc_pool ", 1 ) - - ALLOCATE( pgroup( nproc_pool ) ) ! DO i = 1, nproc_pool pgroup( i ) = i - 1 ENDDO ! - ALLOCATE( nplist( npgrp ) ) - ! - ALLOCATE( nolist( nogrp ) ) - ! !-------------------------------------- !LIST OF PROCESSORS IN MY ORBITAL GROUP !-------------------------------------- @@ -171,20 +150,16 @@ SUBROUTINE task_groups_init( dffts ) #endif - ALLOCATE( nnrsx_vec( nproc_pool ) ) - !Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space #if defined __MPI CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR) strd = MAXVAL( nnrsx_vec( 1:nproc_pool ) ) - nswx = MAXVAL( dffts%nsw( 1:nproc_pool ) ) #else strd = dffts%nnr - nswx = dffts%nsw(1) #endif - DEALLOCATE( nnrsx_vec ) + IF( strd /= dffts%nnrx ) CALL errore( ' task_groups_init ', ' inconsistent nnrx ', 1 ) !------------------------------------------------------------------------------------- !C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE @@ -196,8 +171,8 @@ SUBROUTINE task_groups_init( dffts ) !we choose to do the latter one. !------------------------------------------------------------------------------------- ! - ALLOCATE(tmp_nsw(nproc_pool)) - ALLOCATE(tmp_npp(nproc_pool)) + ALLOCATE( dffts%tg_nsw(nproc_pool)) + ALLOCATE( dffts%tg_npp(nproc_pool)) num_sticks = 0 num_planes = 0 @@ -207,13 +182,15 @@ SUBROUTINE task_groups_init( dffts ) ENDDO #if defined __MPI - CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, tmp_nsw, 1, MPI_INTEGER, intra_pool_comm, IERR) - CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, tmp_npp, 1, MPI_INTEGER, intra_pool_comm, IERR) + CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR) + CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR) #else - tmp_nsw(1) = num_sticks - tmp_npp(1) = num_planes + dffts%tg_nsw(1) = num_sticks + dffts%tg_npp(1) = num_planes #endif + dffts%use_task_groups = .TRUE. + RETURN END SUBROUTINE task_groups_init