From 591121ae4f16fc90808ce19c9c9272e06e008579 Mon Sep 17 00:00:00 2001
From: ccavazzoni
Date: Wed, 20 Jun 2012 07:58:26 +0000
Subject: [PATCH] - MPI_ALLTOALLV replaced by MPI_ALLTOALL; it should give
 better performance with most MPI implementations.

git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@9113 c92efa57-630b-4861-b058-cf58834340f0
---
 Modules/fft_base.f90 | 698 +++++--------------------------------------
 1 file changed, 79 insertions(+), 619 deletions(-)

diff --git a/Modules/fft_base.f90 b/Modules/fft_base.f90
index a97520d62..852ab7f96 100644
--- a/Modules/fft_base.f90
+++ b/Modules/fft_base.f90
@@ -59,510 +59,6 @@
 !
 !
 !
-#if defined __NONBLOCKING_FFT
-!
-!   NON BLOCKING SCATTER, should be better on switched networks
-!   like infiniband, ethernet, myrinet
-!
-!-----------------------------------------------------------------------
-SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg )
-  !-----------------------------------------------------------------------
-  !
-  ! transpose the fft grid across nodes
-  ! a) From columns to planes (isgn > 0)
-  !
-  !    "columns" (or "pencil") representation:
-  !    processor "me" has ncp_(me) contiguous columns along z
-  !    Each column has nr3x elements for a fft of order nr3
-  !    nr3x can be =nr3+1 in order to reduce memory conflicts.
-  !
-  !    The transpose takes place in two steps:
-  !    1) on each processor the columns are divided into slices along z
-  !       that are stored contiguously. On processor "me", slices for
-  !       processor "proc" are npp_(proc)*ncp_(me) big
-  !    2) all processors communicate to exchange slices
-  !       (all columns with z in the slice belonging to "me"
-  !        must be received, all the others must be sent to "proc")
-  !    Finally one gets the "planes" representation:
-  !    processor "me" has npp_(me) complete xy planes
-  !
-  ! b) From planes to columns (isgn < 0)
-  !
-  !    Quite the same in the opposite direction
-  !
-  ! The output is overwritten on f_in ; f_aux is used as work space
-  !
-  ! If the optional argument "use_tg" is true the subroutine performs
-  ! the transposition using the Task Groups distribution
-  !
-#ifdef __MPI
-  USE parallel_include
-#endif
-  USE kinds, ONLY : DP
-
-  IMPLICIT NONE
-
-  TYPE (fft_dlay_descriptor), INTENT(in) :: dfft
-
-  INTEGER, INTENT(in)           :: nr3x, nxx_, isgn, ncp_ (:), npp_ (:)
-  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
-  LOGICAL, OPTIONAL, INTENT(in) :: use_tg
-
-#ifdef __MPI
-
-  INTEGER :: dest, from, k, ip, proc, ierr, me, ipoffset, nprocp, gproc, gcomm, i, kdest, kfrom
-  INTEGER :: sendcount(dfft%nproc), sdispls(dfft%nproc), recvcount(dfft%nproc), rdispls(dfft%nproc)
-  INTEGER :: offset(dfft%nproc)
-  INTEGER :: sh(dfft%nproc), rh(dfft%nproc)
-  !
-  LOGICAL :: use_tg_ , lrcv, lsnd
-  LOGICAL :: tsts(dfft%nproc), tstr(dfft%nproc)
-  INTEGER :: istat( MPI_STATUS_SIZE )
-
-  INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff
-
-#if defined __HPM
-  ! CALL f_hpmstart( 10, 'scatter' )
-#endif
-
-  !
-  ! Task Groups
-
-  use_tg_ = .false.
-
-  IF( present( use_tg ) ) use_tg_ = use_tg
-
-  me = dfft%mype + 1
-  !
-  IF( use_tg_ ) THEN
-     ! This is the number of procs. in the plane-wave group
-     nprocp = dfft%npgrp
-     ipoffset = dfft%me_pgrp
-     gcomm = dfft%pgrp_comm
-  ELSE
-     nprocp = dfft%nproc
-     ipoffset = dfft%mype
-     gcomm = dfft%comm
-  ENDIF
-  !
-  IF ( nprocp == 1 ) RETURN
-  !
-  CALL start_clock ('fft_scatter')
-  !
-  ! sendcount(proc): amount of data processor "me" must send to processor
-  ! recvcount(proc): amount of data processor "me" must receive from
-  !
-  ! offset is used to locate the slices to be sent to proc
-  ! sdispls+1 is the beginning of data that must be sent to proc
-  ! rdispls+1 is the beginning of data that must be received from pr
-  !
-  IF( use_tg_ ) THEN
-     DO proc = 1, nprocp
-        gproc = dfft%nplist( proc ) + 1
-        sendcount (proc) = npp_ ( gproc ) * ncp_ (me)
-        recvcount (proc) = npp_ (me) * ncp_ ( gproc )
-     ENDDO
-     offset(1) = 0
-     DO proc = 2, nprocp
-        gproc = dfft%nplist( proc - 1 ) + 1
-        offset(proc) = offset(proc - 1) + npp_ ( gproc )
-     ENDDO
-  ELSE
-     DO proc = 1, nprocp
-        sendcount (proc) = npp_ (proc) * ncp_ (me)
-        recvcount (proc) = npp_ (me) * ncp_ (proc)
-     ENDDO
-     offset(1) = 0
-     DO proc = 2, nprocp
-        offset(proc) = offset(proc - 1) + npp_ (proc - 1)
-     ENDDO
-  ENDIF
-  !
-  sdispls (1) = 0
-  rdispls (1) = 0
-  DO proc = 2, nprocp
-     sdispls (proc) = sdispls (proc - 1) + sendcount (proc - 1)
-     rdispls (proc) = rdispls (proc - 1) + recvcount (proc - 1)
-  ENDDO
-  !
-  ierr = 0
-  !
-  IF ( isgn > 0 ) THEN
-     !
-     ! "forward" scatter from columns to planes
-     !
-     ! step one: store contiguously the slices and send
-     !
-     DO ip = 1, nprocp
-
-        ! the following two lines make the loop iterations different on each
-        ! proc in order to avoid that all procs send a msg at the same proc
-        ! at the same time.
-        !
-        proc = ipoffset + 1 + ip
-        IF( proc > nprocp ) proc = proc - nprocp
-
-        gproc = proc
-        IF( use_tg_ ) gproc = dfft%nplist( proc ) + 1
-        !
-        from = 1 + offset( proc )
-        dest = 1 + sdispls( proc )
-        !
-        ! optimize for large parallel execution, where npp_ ( gproc ) ~ 1
-        !
-        SELECT CASE ( npp_ ( gproc ) )
-        CASE ( 1 )
-           DO k = 1, ncp_ (me)
-              f_aux (dest + (k - 1) ) = f_in (from + (k - 1) * nr3x )
-           ENDDO
-        CASE ( 2 )
-           DO k = 1, ncp_ (me)
-              f_aux ( dest + (k - 1) * 2 - 1 + 1 ) = f_in ( from + (k - 1) * nr3x - 1 + 1 )
-              f_aux ( dest + (k - 1) * 2 - 1 + 2 ) = f_in ( from + (k - 1) * nr3x - 1 + 2 )
-           ENDDO
-        CASE ( 3 )
-           DO k = 1, ncp_ (me)
-              f_aux ( dest + (k - 1) * 3 - 1 + 1 ) = f_in ( from + (k - 1) * nr3x - 1 + 1 )
-              f_aux ( dest + (k - 1) * 3 - 1 + 2 ) = f_in ( from + (k - 1) * nr3x - 1 + 2 )
-              f_aux ( dest + (k - 1) * 3 - 1 + 3 ) = f_in ( from + (k - 1) * nr3x - 1 + 3 )
-           ENDDO
-        CASE ( 4 )
-           DO k = 1, ncp_ (me)
-              f_aux ( dest + (k - 1) * 4 - 1 + 1 ) = f_in ( from + (k - 1) * nr3x - 1 + 1 )
-              f_aux ( dest + (k - 1) * 4 - 1 + 2 ) = f_in ( from + (k - 1) * nr3x - 1 + 2 )
-              f_aux ( dest + (k - 1) * 4 - 1 + 3 ) = f_in ( from + (k - 1) * nr3x - 1 + 3 )
-              f_aux ( dest + (k - 1) * 4 - 1 + 4 ) = f_in ( from + (k - 1) * nr3x - 1 + 4 )
-           ENDDO
-        CASE DEFAULT
-           DO k = 1, ncp_ (me)
-              kdest = dest + (k - 1) * npp_ ( gproc ) - 1
-              kfrom = from + (k - 1) * nr3x - 1
-              DO i = 1, npp_ ( gproc )
-                 f_aux ( kdest + i ) = f_in ( kfrom + i )
-              ENDDO
-           ENDDO
-        END SELECT
-        !
-        ! post the non-blocking send, f_aux can't be overwritten until operation has completed
-        !
-        CALL mpi_isend( f_aux( sdispls( proc ) + 1 ), sendcount( proc ), MPI_DOUBLE_COMPLEX, &
-             proc-1, me, gcomm, sh( proc ), ierr )
-        !
-        IF( abs(ierr) /= 0 ) CALL errore ('fft_scatter', ' forward send info<>0', abs(ierr) )
-        !
-        !
-     ENDDO
-     !
-     ! step two: receive
-     !
-     DO ip = 1, nprocp
-        !
-        proc = ipoffset + 1 - ip
-        IF( proc < 1 ) proc = proc + nprocp
-        !
-        ! now post the receive
-        !
-        CALL mpi_irecv( f_in( rdispls( proc ) + 1 ), recvcount( proc ), MPI_DOUBLE_COMPLEX, &
-             proc-1, MPI_ANY_TAG, gcomm, rh( proc ), ierr )
-        !
-        IF( abs(ierr) /= 0 ) CALL errore ('fft_scatter', ' forward receive info<>0', abs(ierr) )
-        !
-        tstr( proc ) = .false.
-        tsts( proc ) = .false.
-        !
-     ENDDO
-     !
-     ! maybe useless; ensures that no garbage is present in the output
-     !
-     f_in( rdispls( nprocp ) + recvcount( nprocp ) + 1 : size( f_in ) ) = 0.0_DP
-     !
-     lrcv = .false.
-     lsnd = .false.
-     !
-     ! exit only when all tests are true: message operations have completed
-     !
-     DO WHILE ( .not. lrcv .or. .not. lsnd )
-        lrcv = .true.
-        lsnd = .true.
-        DO proc = 1, nprocp
-           !
-           IF( .not. tstr( proc ) ) THEN
-              CALL mpi_test( rh( proc ), tstr( proc ), istat, ierr )
-           ENDIF
-           !
-           IF( .not. tsts( proc ) ) THEN
-              CALL mpi_test( sh( proc ), tsts( proc ), istat, ierr )
-           ENDIF
-           !
-           lrcv = lrcv .and. tstr( proc )
-           lsnd = lsnd .and. tsts( proc )
-           !
-        ENDDO
-        !
-     ENDDO
-     !
-     IF( isgn == 1 ) THEN
-
-        me_p = dfft%mype + 1
-
-        IF ( dfft%nproc == 1 ) THEN
-           nppx = dfft%nr3x
-        ELSE
-           nppx = dfft%npp( me_p )
-        ENDIF
-
-!$omp parallel default(shared)
-!$omp do
-        DO i = 1, size(f_aux)
-           f_aux(i) = (0.d0, 0.d0)
-        ENDDO
-        !
-!$omp do private(mc,j)
-        DO i = 1, dfft%nst
-           mc = dfft%ismap( i )
-           DO j = 1, dfft%npp( me_p )
-              f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + ( i - 1 ) * nppx )
-           ENDDO
-        ENDDO
-!$omp end parallel
-
-     ELSE
-
-        me_p = dfft%mype + 1
-
-        IF( use_tg_ ) THEN
-           !
-           nppx = dfft%tg_npp( me_p )
-           npp  = dfft%tg_npp( me_p )
-           nnp  = dfft%nr1x * dfft%nr2x
-           !
-        ELSE
-           !
-           nppx = dfft%npp( me_p )
-           IF( dfft%nproc == 1 ) nppx = dfft%nr3x
-           npp  = dfft%npp( me_p )
-           nnp  = dfft%nnp
-           !
-        ENDIF
-        !
-!$omp parallel default(shared), private( ii, mc, j, i, ioff, ip, it )
-!$omp do
-        DO i = 1, size( f_aux )
-           f_aux(i) = (0.d0, 0.d0)
-        ENDDO
-        !
-        ii = 0
-        !
-        DO ip = 1, dfft%nproc
-           !
-           ioff = dfft%iss( ip )
-           !
-!$omp do
-           DO i = 1, dfft%nsw( ip )
-              !
-              mc = dfft%ismap( i + ioff )
-              !
-              it = ( ii + i - 1 ) * nppx
-              !
-              DO j = 1, npp
-                 f_aux( mc + ( j - 1 ) * nnp ) = f_in( j + it )
-              ENDDO
-              !
-           ENDDO
-           !
-           ii = ii + dfft%nsw( ip )
-           !
-        ENDDO
-!$omp end parallel
-
-     END IF
-     !
-  ELSE
-     !
-     ! "backward" scatter from planes to columns
-     !
-     IF( isgn == -1 ) THEN
-        me_p = dfft%mype + 1
-        IF ( dfft%nproc == 1 ) THEN
-           nppx = dfft%nr3x
-        ELSE
-           nppx = dfft%npp( me_p )
-        ENDIF
-!$omp parallel default(shared), private( mc, j, i )
-!$omp do
-        DO i = 1, dfft%nst
-           mc = dfft%ismap( i )
-           DO j = 1, dfft%npp( me_p )
-              f_in( j + ( i - 1 ) * nppx ) = f_aux( mc + ( j - 1 ) * dfft%nnp )
-           ENDDO
-        ENDDO
-!$omp end parallel
-
-     ELSE
-
-        me_p = dfft%mype + 1
-
-        IF( use_tg_ ) THEN
-           !
-           nppx = dfft%tg_npp( me_p )
-           npp  = dfft%tg_npp( me_p )
-           nnp  = dfft%nr1x * dfft%nr2x
-           !
-        ELSE
-           !
-           nppx = dfft%npp( me_p )
-           IF( dfft%nproc == 1 ) nppx = dfft%nr3x
-           npp  = dfft%npp( me_p )
-           nnp  = dfft%nnp
-           !
-        ENDIF
-
-!$omp parallel default(shared), private( mc, j, i, ii, ip, it )
-        ii = 0
-        DO ip = 1, dfft%nproc
-!$omp do
-           DO i = 1, dfft%nsw( ip )
-              mc = dfft%ismap( i + dfft%iss( ip ) )
-              it = (ii + i - 1)*nppx
-              DO j = 1, npp
-                 f_in( j + it ) = f_aux( mc + ( j - 1 ) * nnp )
-              ENDDO
-           ENDDO
-           ii = ii + dfft%nsw( ip )
-        ENDDO
-!$omp end parallel
-
-     END IF
-     !
-     DO ip = 1, nprocp
-
-        ! post the non-blocking send
-
-        proc = ipoffset + 1 + ip
-        IF( proc > nprocp ) proc = proc - nprocp
-
-        CALL mpi_isend( f_in( rdispls( proc ) + 1 ), recvcount( proc ), MPI_DOUBLE_COMPLEX, &
-             proc-1, me, gcomm, sh( proc ), ierr )
-        IF( abs(ierr) /= 0 ) CALL errore ('fft_scatter', ' backward send info<>0', abs(ierr) )
-
-        ! post the non-blocking receive
-
-        proc = ipoffset + 1 - ip
-        IF( proc < 1 ) proc = proc + nprocp
-
-        CALL mpi_irecv( f_aux( sdispls( proc ) + 1 ), sendcount( proc ), MPI_DOUBLE_COMPLEX, &
-             proc-1, MPI_ANY_TAG, gcomm, rh(proc), ierr )
-        IF( abs(ierr) /= 0 ) CALL errore ('fft_scatter', ' backward receive info<>0', abs(ierr) )
-
-        tstr( ip ) = .false.
-        tsts( ip ) = .false.
-
-     ENDDO
-     !
-     lrcv = .false.
-     lsnd = .false.
-     !
-     ! exit only when all tests are true: messages have been sent and received
-     !
-     DO WHILE ( .not. lsnd )
-        !
-        lsnd = .true.
-        !
-        DO proc = 1, nprocp
-           !
-           IF( .not. tsts( proc ) ) THEN
-              CALL mpi_test( sh( proc ), tsts( proc ), istat, ierr )
-           ENDIF
-
-           lsnd = lsnd .and. tsts( proc )
-
-        ENDDO
-
-     ENDDO
-     !
-     lrcv = .false.
-     !
-     DO WHILE ( .not. lrcv )
-        !
-        lrcv = .true.
-        !
-        DO proc = 1, nprocp
-
-           gproc = proc
-           IF( use_tg_ ) gproc = dfft%nplist(proc)+1
-
-           IF( .not. tstr( proc ) ) THEN
-
-              CALL mpi_test( rh( proc ), tstr( proc ), istat, ierr )
-
-              IF( tstr( proc ) ) THEN
-
-                 from = 1 + sdispls( proc )
-                 dest = 1 + offset( proc )
-                 !
-                 ! optimize for large parallel execution, where npp_ ( gproc ) ~ 1
-                 !
-                 SELECT CASE ( npp_ ( gproc ) )
-                 CASE ( 1 )
-                    DO k = 1, ncp_ (me)
-                       f_in ( dest + (k - 1) * nr3x ) = f_aux ( from + k - 1 )
-                    ENDDO
-                 CASE ( 2 )
-                    DO k = 1, ncp_ ( me )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 1 ) = f_aux( from + (k - 1) * 2 - 1 + 1 )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 2 ) = f_aux( from + (k - 1) * 2 - 1 + 2 )
-                    ENDDO
-                 CASE ( 3 )
-                    DO k = 1, ncp_ ( me )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 1 ) = f_aux( from + (k - 1) * 3 - 1 + 1 )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 2 ) = f_aux( from + (k - 1) * 3 - 1 + 2 )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 3 ) = f_aux( from + (k - 1) * 3 - 1 + 3 )
-                    ENDDO
-                 CASE ( 4 )
-                    DO k = 1, ncp_ ( me )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 1 ) = f_aux( from + (k - 1) * 4 - 1 + 1 )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 2 ) = f_aux( from + (k - 1) * 4 - 1 + 2 )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 3 ) = f_aux( from + (k - 1) * 4 - 1 + 3 )
-                       f_in ( dest + (k - 1) * nr3x - 1 + 4 ) = f_aux( from + (k - 1) * 4 - 1 + 4 )
-                    ENDDO
-                 CASE DEFAULT
-                    DO k = 1, ncp_ ( me )
-                       kdest = dest + (k - 1) * nr3x - 1
-                       kfrom = from + (k - 1) * npp_ ( gproc ) - 1
-                       DO i = 1, npp_ ( gproc )
-                          f_in ( kdest + i ) = f_aux( kfrom + i )
-                       ENDDO
-                    ENDDO
-                 END SELECT
-
-              ENDIF
-
-           ENDIF
-
-           lrcv = lrcv .and. tstr( proc )
-
-        ENDDO
-
-     ENDDO
-
-  ENDIF
-
-  CALL stop_clock ('fft_scatter')
-
-#endif
-
-#if defined __HPM
-  ! CALL f_hpmstop( 10 )
-#endif
-
-  RETURN
-
-END SUBROUTINE fft_scatter
-!
-!
-!
-#else
 !
 ! ALLTOALL based SCATTER, should be better on networks
 ! with a defined topology, like on bluegene and cray machines
 !
@@ -612,8 +108,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg
 #ifdef __MPI
 
   INTEGER :: dest, from, k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
-  INTEGER :: sendcount (dfft%nproc), sdispls (dfft%nproc), recvcount (dfft%nproc), rdispls (dfft%nproc)
-  INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff
+  INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx
   !
   LOGICAL :: use_tg_
@@ -639,38 +134,30 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg
   !
   CALL start_clock ('fft_scatter')
   !
-  ! sendcount(proc): amount of data processor "me" must send to processor
-  ! recvcount(proc): amount of data processor "me" must receive from
-  ! offset1(proc) is used to locate the slices to be sent to proc
-  ! sdispls(proc)+1 is the beginning of data that must be sent to proc
-  ! rdispls(proc)+1 is the beginning of data that must be received from pr
-  !
-  !
+  ncpx = 0
+  nppx = 0
   IF( use_tg_ ) THEN
      DO proc = 1, nprocp
         gproc = dfft%nplist( proc ) + 1
-        sendcount (proc) = npp_ ( gproc ) * ncp_ (me)
-        recvcount (proc) = npp_ (me) * ncp_ ( gproc )
+        ncpx = max( ncpx, ncp_ ( gproc ) )
+        nppx = max( nppx, npp_ ( gproc ) )
      ENDDO
   ELSE
      DO proc = 1, nprocp
-        sendcount (proc) = npp_ (proc) * ncp_ (me)
-        recvcount (proc) = npp_ (me) * ncp_ (proc)
+        ncpx = max( ncpx, ncp_ ( proc ) )
+        nppx = max( nppx, npp_ ( proc ) )
      ENDDO
+     IF ( dfft%nproc == 1 ) THEN
+        nppx = dfft%nr3x
+     END IF
   ENDIF
-  !
-  sdispls (1) = 0
-  rdispls (1) = 0
-  DO proc = 2, nprocp
-     sdispls (proc) = sdispls (proc - 1) + sendcount (proc - 1)
-     rdispls (proc) = rdispls (proc - 1) + recvcount (proc - 1)
-  ENDDO
+  sendsiz = ncpx * nppx
   !
   ierr = 0
   IF (isgn.gt.0) THEN
 
-     IF( nprocp < 2 ) GO TO 10
+     IF (nprocp==1) GO TO 10
      !
      ! "forward" scatter from columns to planes
      !
@@ -680,15 +167,15 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg
 
      DO proc = 1, nprocp
        from = offset
-        dest = 1 + sdispls (proc)
        IF( use_tg_ ) THEN
           gproc = dfft%nplist(proc)+1
        ELSE
           gproc = proc
        ENDIF
+        dest = 1 + ( proc - 1 ) * sendsiz
        !
        DO k = 1, ncp_ (me)
-           kdest = dest + (k - 1) * npp_ ( gproc ) - 1
+           kdest = dest + (k - 1) * nppx - 1
           kfrom = from + (k - 1) * nr3x - 1
           DO i = 1, npp_ ( gproc )
              f_aux ( kdest + i ) = f_in ( kfrom + i )
@@ -712,87 +199,64 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg
 
      CALL mpi_barrier (gcomm, ierr)  ! why barrier? for buggy openmpi over ib
 
-     CALL mpi_alltoallv (f_aux(1), sendcount, sdispls, MPI_DOUBLE_COMPLEX, f_in(1), &
-          recvcount, rdispls, MPI_DOUBLE_COMPLEX, gcomm, ierr)
+     CALL mpi_alltoall (f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr)
 
      IF( abs(ierr) /= 0 ) CALL errore ('fft_scatter', 'info<>0', abs(ierr) )
      !
 10   CONTINUE
-
+  !
+  f_aux = (0.d0, 0.d0)
+  !
   IF( isgn == 1 ) THEN
 
-     me_p = dfft%mype + 1
-
-     IF ( dfft%nproc == 1 ) THEN
-        nppx = dfft%nr3x
-     ELSE
-        nppx = dfft%npp( me_p )
-     ENDIF
-
-!$omp parallel default(shared)
-!$omp do
-     DO i = 1, size(f_aux)
-        f_aux(i) = (0.d0, 0.d0)
-     ENDDO
-     !
-!$omp do private(mc,j)
-     DO i = 1, dfft%nst
-        mc = dfft%ismap( i )
-        DO j = 1, dfft%npp( me_p )
-           f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + ( i - 1 ) * nppx )
+     DO ip = 1, dfft%nproc
+        ioff = dfft%iss( ip )
+        DO i = 1, dfft%nsp( ip )
+           mc = dfft%ismap( i + ioff )
+           it = ( i - 1 ) * nppx + ( ip - 1 ) * sendsiz
+           DO j = 1, dfft%npp( me )
+              f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + it )
+           ENDDO
         ENDDO
      ENDDO
-!$omp end parallel
 
   ELSE
 
-     me_p = dfft%mype + 1
-
      IF( use_tg_ ) THEN
-        !
-        nppx = dfft%tg_npp( me_p )
-        npp  = dfft%tg_npp( me_p )
+        npp = dfft%tg_npp( me )
        nnp = dfft%nr1x * dfft%nr2x
-        !
     ELSE
-        !
-        nppx = dfft%npp( me_p )
-        IF( dfft%nproc == 1 ) nppx = dfft%nr3x
-        npp  = dfft%npp( me_p )
+        npp = dfft%npp( me )
        nnp = dfft%nnp
-        !
     ENDIF
     !
-!$omp parallel default(shared), private( ii, mc, j, i, ioff, ip, it )
-!$omp do
-     DO i = 1, size( f_aux )
-        f_aux(i) = (0.d0, 0.d0)
-     ENDDO
-     !
-     ii = 0
-     !
     DO ip = 1, dfft%nproc
+
+        IF( use_tg_ ) THEN
+           gproc = ( ip - 1 ) / dfft%nogrp + 1
+           IF( MOD( ip - 1, dfft%nogrp ) == 0 ) ii = 0
+        ELSE
+           gproc = ip
+           ii = 0
+        ENDIF
        !
        ioff = dfft%iss( ip )
        !
-!$omp do
        DO i = 1, dfft%nsw( ip )
           !
           mc = dfft%ismap( i + ioff )
           !
-           it = ( ii + i - 1 ) * nppx
+           it = ii * nppx + ( gproc - 1 ) * sendsiz
           !
DO j = 1, npp f_aux( mc + ( j - 1 ) * nnp ) = f_in( j + it ) ENDDO ! + ii = ii + 1 + ! ENDDO ! - ii = ii + dfft%nsw( ip ) - ! ENDDO -!$omp end parallel - END IF @@ -801,59 +265,59 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg ! "backward" scatter from planes to columns ! IF( isgn == -1 ) THEN - me_p = dfft%mype + 1 - IF ( dfft%nproc == 1 ) THEN - nppx = dfft%nr3x - ELSE - nppx = dfft%npp( me_p ) - ENDIF -!$omp parallel default(shared), private( mc, j, i ) -!$omp do - DO i = 1, dfft%nst - mc = dfft%ismap( i ) - DO j = 1, dfft%npp( me_p ) - f_in( j + ( i - 1 ) * nppx ) = f_aux( mc + ( j - 1 ) * dfft%nnp ) + + DO ip = 1, dfft%nproc + ioff = dfft%iss( ip ) + DO i = 1, dfft%nsp( ip ) + mc = dfft%ismap( i + ioff ) + it = ( i - 1 ) * nppx + ( ip - 1 ) * sendsiz + DO j = 1, dfft%npp( me ) + f_in( j + it ) = f_aux( mc + ( j - 1 ) * dfft%nnp ) + ENDDO ENDDO ENDDO -!$omp end parallel ELSE - me_p = dfft%mype + 1 - IF( use_tg_ ) THEN - ! - nppx = dfft%tg_npp( me_p ) - npp = dfft%tg_npp( me_p ) + npp = dfft%tg_npp( me ) nnp = dfft%nr1x * dfft%nr2x - ! ELSE - ! - nppx = dfft%npp( me_p ) - IF( dfft%nproc == 1 ) nppx = dfft%nr3x - npp = dfft%npp( me_p ) + npp = dfft%npp( me ) nnp = dfft%nnp - ! ENDIF -!$omp parallel default(shared), private( mc, j, i, ii, ip, it ) - ii = 0 DO ip = 1, dfft%nproc -!$omp do + + IF( use_tg_ ) THEN + gproc = ( ip - 1 ) / dfft%nogrp + 1 + IF( MOD( ip - 1, dfft%nogrp ) == 0 ) ii = 0 + ELSE + gproc = ip + ii = 0 + ENDIF + ! + ioff = dfft%iss( ip ) + ! DO i = 1, dfft%nsw( ip ) - mc = dfft%ismap( i + dfft%iss( ip ) ) - it = (ii + i - 1)*nppx + ! + mc = dfft%ismap( i + ioff ) + ! + it = ii * nppx + ( gproc - 1 ) * sendsiz + ! DO j = 1, npp f_in( j + it ) = f_aux( mc + ( j - 1 ) * nnp ) ENDDO + ! + ii = ii + 1 + ! ENDDO - ii = ii + dfft%nsw( ip ) + ENDDO -!$omp end parallel END IF - IF( nprocp < 2 ) GO TO 20 + IF( nprocp == 1 ) GO TO 20 ! ! step two: communication ! @@ -865,8 +329,7 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg CALL mpi_barrier (gcomm, ierr) ! why barrier? for buggy openmpi over ib - CALL mpi_alltoallv (f_in(1), recvcount, rdispls, MPI_DOUBLE_COMPLEX, f_aux(1), & - sendcount, sdispls, MPI_DOUBLE_COMPLEX, gcomm, ierr) + CALL mpi_alltoall (f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr) IF( abs(ierr) /= 0 ) CALL errore ('fft_scatter', 'info<>0', abs(ierr) ) ! @@ -875,26 +338,24 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg f_in = 0.0_DP ! offset = 1 - ! + DO proc = 1, nprocp - from = 1 + sdispls (proc) - dest = offset + from = offset IF( use_tg_ ) THEN gproc = dfft%nplist(proc)+1 ELSE gproc = proc ENDIF + dest = 1 + ( proc - 1 ) * sendsiz ! DO k = 1, ncp_ (me) - kdest = dest + (k - 1) * nr3x - 1 - kfrom = from + (k - 1) * npp_ ( gproc ) - 1 - DO i = 1, npp_ ( gproc ) - f_in ( kdest + i ) = f_aux( kfrom + i ) + kdest = dest + (k - 1) * nppx - 1 + kfrom = from + (k - 1) * nr3x - 1 + DO i = 1, npp_ ( gproc ) + f_in ( kfrom + i ) = f_aux ( kdest + i ) ENDDO ENDDO - ! offset = offset + npp_ ( gproc ) - ! ENDDO 20 CONTINUE @@ -913,7 +374,6 @@ SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, use_tg END SUBROUTINE fft_scatter -#endif !---------------------------------------------------------------------------- SUBROUTINE grid_gather( f_in, f_out )
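
Illustrative sketch (not part of the patch): the change above drops the
per-process sendcount/sdispls/recvcount/rdispls arrays and instead pads every
exchange to a uniform block of sendsiz = ncpx*nppx elements, where ncpx and
nppx are the maxima of ncp_ and npp_ over all processes. Every pair of ranks
then exchanges exactly the same amount of data, so the simpler and usually
better optimized MPI_ALLTOALL can replace MPI_ALLTOALLV at the cost of a
little padding. The minimal, self-contained Fortran program below sketches
this packing scheme under a toy data distribution; the program name, the
distribution and all local variable names are hypothetical, and only the
roles of sendsiz, ncpx and nppx mirror the patch.

PROGRAM alltoall_padding_sketch
   USE mpi
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = KIND(1.0d0)
   INTEGER :: nproc, me, ierr, proc, k, i, ncpx, nppx, sendsiz
   INTEGER,     ALLOCATABLE :: ncp(:), npp(:)   ! columns / planes per process
   COMPLEX(DP), ALLOCATABLE :: sndbuf(:), rcvbuf(:)
   !
   CALL mpi_init( ierr )
   CALL mpi_comm_size( MPI_COMM_WORLD, nproc, ierr )
   CALL mpi_comm_rank( MPI_COMM_WORLD, me, ierr )
   !
   ! toy, deliberately unbalanced distribution: rank p-1 owns p z-columns
   ! and p xy-planes (these play the roles of ncp_ and npp_ in the patch)
   ALLOCATE( ncp( nproc ), npp( nproc ) )
   DO proc = 1, nproc
      ncp( proc ) = proc
      npp( proc ) = proc
   ENDDO
   !
   ! uniform block size, exactly as in the patch: maxima, not exact sizes
   ncpx = MAXVAL( ncp )
   nppx = MAXVAL( npp )
   sendsiz = ncpx * nppx
   !
   ! one padded block per destination rank; the unused tail of each block
   ! stays zero, which is why the patch zeroes f_aux/f_in before unpacking
   ALLOCATE( sndbuf( nproc*sendsiz ), rcvbuf( nproc*sendsiz ) )
   sndbuf = ( 0.0_DP, 0.0_DP )
   DO proc = 1, nproc               ! destination rank is proc-1
      DO k = 1, ncp( me+1 )         ! my columns
         DO i = 1, npp( proc )      ! z-planes owned by the destination
            ! same layout as kdest = dest + (k - 1) * nppx - 1 in the patch:
            ! each column is padded to nppx elements inside its block
            sndbuf( (proc-1)*sendsiz + (k-1)*nppx + i ) = CMPLX( me, k, KIND=DP )
         ENDDO
      ENDDO
   ENDDO
   !
   ! every pair exchanges exactly sendsiz elements: no counts/displs arrays
   CALL mpi_alltoall( sndbuf, sendsiz, MPI_DOUBLE_COMPLEX, &
                      rcvbuf, sendsiz, MPI_DOUBLE_COMPLEX, &
                      MPI_COMM_WORLD, ierr )
   !
   CALL mpi_finalize( ierr )
END PROGRAM alltoall_padding_sketch

The padding wastes at most ncpx*nppx - ncp*npp elements per block; with the
reasonably balanced distributions produced by fft_dlay_descriptor this
overhead is small, while the fixed block size lets the MPI library pick its
tuned MPI_ALLTOALL algorithm instead of the generic MPI_ALLTOALLV path.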