! quantum-espresso/FFTXlib/scatter_mod.f90
!
! Copyright (C) Quantum ESPRESSO group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------
! FFT base Module.
! Written by Carlo Cavazzoni, modified by Paolo Giannozzi
!----------------------------------------------------------------------
!
!=----------------------------------------------------------------------=!
MODULE scatter_mod
!=----------------------------------------------------------------------=!
USE fft_types, ONLY: fft_type_descriptor
USE task_groups, ONLY: task_groups_descriptor
IMPLICIT NONE
INTERFACE gather_grid
MODULE PROCEDURE gather_real_grid, gather_complex_grid
END INTERFACE
INTERFACE scatter_grid
MODULE PROCEDURE scatter_real_grid, scatter_complex_grid
END INTERFACE
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
SAVE
PRIVATE
PUBLIC :: fft_type_descriptor
PUBLIC :: fft_scatter, gather_grid, scatter_grid
PUBLIC :: cgather_sym, cgather_sym_many, cscatter_sym_many
PUBLIC :: maps_sticks_to_3d
!=----------------------------------------------------------------------=!
CONTAINS
!=----------------------------------------------------------------------=!
!
!
#if ! defined __NON_BLOCKING_SCATTER
!
! ALLTOALL-based SCATTER: should perform better on networks
! with a defined topology, such as BlueGene and Cray machines
!
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
!-----------------------------------------------------------------------
!
! transpose the fft grid across nodes
! a) From columns to planes (isgn > 0)
!
! "columns" (or "pencil") representation:
! processor "me" has ncp_(me) contiguous columns along z
! Each column has nr3x elements for an FFT of order nr3
! nr3x can be =nr3+1 in order to reduce memory conflicts.
!
! The transpose takes place in two steps:
! 1) on each processor the columns are divided into slices along z
! that are stored contiguously. On processor "me", slices for
! processor "proc" are npp_(proc)*ncp_(me) big
! 2) all processors communicate to exchange slices
! (all columns with z in the slice belonging to "me"
! must be received, all the others must be sent to "proc")
! Finally one gets the "planes" representation:
! processor "me" has npp_(me) complete xy planes
! f_in contains input columns, is destroyed on output
! f_aux contains output planes
!
! b) From planes to columns (isgn < 0)
!
! Quite the same in the opposite direction
! f_aux contains input planes, is destroyed on output
! f_in contains output columns
!
! If the optional argument "dtgs" is present, the subroutine performs
! the transposition using the Task Groups distribution
!
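! A minimal calling sketch (illustrative only: psic and aux are
! hypothetical caller arrays of size dfft%nnr):
!
!    CALL fft_scatter( dfft, psic, dfft%nr3x, dfft%nnr, aux, &
!                      dfft%nsw, dfft%npp, 2 )
!
! transposes the wave-function sticks from columns to planes; calling
! it again with isgn = -2 after the planar FFTs restores the column
! distribution.
!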
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
TYPE (fft_type_descriptor), INTENT(in) :: dfft
INTEGER, INTENT(in) :: nr3x, nxx_, isgn, ncp_ (:), npp_ (:)
COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_aux (nxx_)
TYPE (task_groups_descriptor), OPTIONAL, INTENT(in) :: dtgs
#if defined(__MPI)
INTEGER :: k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz
!
LOGICAL :: use_tg_
!
! Task Groups
use_tg_ = .false.
IF( present( dtgs ) ) use_tg_ = .true.
me = dfft%mype + 1
!
IF( use_tg_ ) THEN
! This is the number of procs. in the plane-wave group
nprocp = dtgs%npgrp
ELSE
nprocp = dfft%nproc
ENDIF
!
CALL start_clock ('fft_scatter')
!
ncpx = 0
nppx = 0
IF( use_tg_ ) THEN
ncpx = dtgs%tg_ncpx
nppx = dtgs%tg_nppx
gcomm = dtgs%pgrp_comm
ELSE
DO proc = 1, nprocp
ncpx = max( ncpx, ncp_ ( proc ) )
nppx = max( nppx, npp_ ( proc ) )
ENDDO
IF ( dfft%nproc == 1 ) THEN
nppx = dfft%nr3x
END IF
ENDIF
sendsiz = ncpx * nppx
!
ierr = 0
IF (isgn.gt.0) THEN
IF (nprocp==1) GO TO 10
!
! "forward" scatter from columns to planes
!
! step one: store contiguously the slices
!
offset = 0
DO proc = 1, nprocp
IF( use_tg_ ) THEN
gproc = dtgs%nplist(proc)+1
ELSE
gproc = proc
ENDIF
!
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
!
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
f_aux ( kdest + i ) = f_in ( kfrom + i )
ENDDO
kdest = kdest + nppx
kfrom = kfrom + nr3x
ENDDO
offset = offset + npp_ ( gproc )
ENDDO
!
! maybe useless; ensures that no garbage is present in the output
!
!! f_in = 0.0_DP
!
! step two: communication
!
IF( use_tg_ ) THEN
gcomm = dtgs%pgrp_comm
ELSE
gcomm = dfft%comm
ENDIF
CALL mpi_alltoall (f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr)
IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
!
10 CONTINUE
!
f_aux = (0.d0, 0.d0)
!
IF( isgn == 1 ) THEN
DO ip = 1, dfft%nproc
ioff = dfft%iss( ip )
it = ( ip - 1 ) * sendsiz
DO i = 1, dfft%nsp( ip )
mc = dfft%ismap( i + ioff )
DO j = 1, dfft%npp( me )
f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + it )
ENDDO
it = it + nppx
ENDDO
ENDDO
ELSE
IF( use_tg_ ) THEN
npp = dtgs%tg_npp( me )
nnp = dfft%nr1x * dfft%nr2x
ELSE
npp = dfft%npp( me )
nnp = dfft%nnp
ENDIF
IF( use_tg_ ) THEN
nblk = dfft%nproc / dtgs%nogrp
nsiz = dtgs%nogrp
ELSE
nblk = dfft%nproc
nsiz = 1
END IF
!
ip = 1
!
DO gproc = 1, nblk
!
ii = 0
!
DO ipp = 1, nsiz
!
ioff = dfft%iss( ip )
!
DO i = 1, dfft%nsw( ip )
!
mc = dfft%ismap( i + ioff )
!
it = ii * nppx + ( gproc - 1 ) * sendsiz
!
DO j = 1, npp
f_aux( mc + ( j - 1 ) * nnp ) = f_in( j + it )
ENDDO
!
ii = ii + 1
!
ENDDO
!
ip = ip + 1
!
ENDDO
!
ENDDO
END IF
ELSE
!
! "backward" scatter from planes to columns
!
IF( isgn == -1 ) THEN
DO ip = 1, dfft%nproc
ioff = dfft%iss( ip )
it = ( ip - 1 ) * sendsiz
DO i = 1, dfft%nsp( ip )
mc = dfft%ismap( i + ioff )
DO j = 1, dfft%npp( me )
f_in( j + it ) = f_aux( mc + ( j - 1 ) * dfft%nnp )
ENDDO
it = it + nppx
ENDDO
ENDDO
ELSE
IF( use_tg_ ) THEN
npp = dtgs%tg_npp( me )
nnp = dfft%nr1x * dfft%nr2x
ELSE
npp = dfft%npp( me )
nnp = dfft%nnp
ENDIF
IF( use_tg_ ) THEN
nblk = dtgs%nproc / dtgs%nogrp
nsiz = dtgs%nogrp
ELSE
nblk = dfft%nproc
nsiz = 1
END IF
!
ip = 1
!
DO gproc = 1, nblk
!
ii = 0
!
DO ipp = 1, nsiz
!
ioff = dfft%iss( ip )
!
DO i = 1, dfft%nsw( ip )
!
mc = dfft%ismap( i + ioff )
!
it = ii * nppx + ( gproc - 1 ) * sendsiz
!
DO j = 1, npp
f_in( j + it ) = f_aux( mc + ( j - 1 ) * nnp )
ENDDO
!
ii = ii + 1
!
ENDDO
!
ip = ip + 1
!
ENDDO
!
ENDDO
END IF
IF( nprocp == 1 ) GO TO 20
!
! step two: communication
!
IF( use_tg_ ) THEN
gcomm = dtgs%pgrp_comm
ELSE
gcomm = dfft%comm
ENDIF
CALL mpi_alltoall (f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr)
IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
!
! step one: store contiguously the columns
!
!! f_in = 0.0_DP
!
offset = 0
DO proc = 1, nprocp
IF( use_tg_ ) THEN
gproc = dtgs%nplist(proc)+1
ELSE
gproc = proc
ENDIF
!
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
!
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
f_in ( kfrom + i ) = f_aux ( kdest + i )
ENDDO
kdest = kdest + nppx
kfrom = kfrom + nr3x
ENDDO
offset = offset + npp_ ( gproc )
ENDDO
20 CONTINUE
ENDIF
CALL stop_clock ('fft_scatter')
#endif
RETURN
END SUBROUTINE fft_scatter
!
#else
!
! NON-BLOCKING SCATTER: should perform better on switched networks
! such as InfiniBand, Ethernet, Myrinet
!
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
!-----------------------------------------------------------------------
!
! transpose the fft grid across nodes
! a) From columns to planes (isgn > 0)
!
! "columns" (or "pencil") representation:
! processor "me" has ncp_(me) contiguous columns along z
! Each column has nr3x elements for an FFT of order nr3
! nr3x can be =nr3+1 in order to reduce memory conflicts.
!
! The transpose takes place in two steps:
! 1) on each processor the columns are divided into slices along z
! that are stored contiguously. On processor "me", slices for
! processor "proc" are npp_(proc)*ncp_(me) big
! 2) all processors communicate to exchange slices
! (all columns with z in the slice belonging to "me"
! must be received, all the others must be sent to "proc")
! Finally one gets the "planes" representation:
! processor "me" has npp_(me) complete xy planes
!
! b) From planes to columns (isgn < 0)
!
! Quite the same in the opposite direction
!
! The output is overwritten on f_in ; f_aux is used as work space
!
! If the optional argument "dtgs" is present, the subroutine performs
! the transposition using the Task Groups distribution
!
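! Schematic of the exchange pattern used below (a sketch mirroring the
! forward branch of the actual code, not meant to be called as written):
!
!    DO proc = 1, nprocp
!       ! pack the slice destined to "proc" into f_aux, then post
!       CALL mpi_isend( f_aux( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, &
!                       proc-1, me, gcomm, sh(proc), ierr )
!       CALL mpi_irecv( f_in( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, &
!                       proc-1, MPI_ANY_TAG, gcomm, rh(proc), ierr )
!    ENDDO
!    CALL mpi_waitall( nprocp, sh, MPI_STATUSES_IGNORE, ierr )
!    CALL mpi_waitall( nprocp, rh, MPI_STATUSES_IGNORE, ierr )
!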
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
TYPE (fft_type_descriptor), INTENT(in) :: dfft
INTEGER, INTENT(in) :: nr3x, nxx_, isgn, ncp_ (:), npp_ (:)
COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_aux (nxx_)
TYPE (task_groups_descriptor), OPTIONAL, INTENT(in) :: dtgs
#if defined(__MPI)
INTEGER :: k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz, ijp
INTEGER :: sh(dfft%nproc), rh(dfft%nproc)
INTEGER :: istat( MPI_STATUS_SIZE )
!
INTEGER, SAVE, ALLOCATABLE :: indmap(:,:)
INTEGER, SAVE, ALLOCATABLE :: indmap_bw(:)
INTEGER, SAVE :: nijp
INTEGER, SAVE :: dimref(4) = 0
INTEGER, SAVE :: dimref_bw(4) = 0
!
LOGICAL :: use_tg_
!
CALL start_clock ('fft_scatter')
use_tg_ = .false.
IF( present( dtgs ) ) use_tg_ = .true.
me = dfft%mype + 1
!
ncpx = 0
nppx = 0
IF( use_tg_ ) THEN
! This is the number of procs. in the plane-wave group
nprocp = dtgs%npgrp
ncpx = dtgs%tg_ncpx
nppx = dtgs%tg_nppx
gcomm = dtgs%pgrp_comm
ELSE
nprocp = dfft%nproc
DO proc = 1, nprocp
ncpx = max( ncpx, ncp_ ( proc ) )
nppx = max( nppx, npp_ ( proc ) )
ENDDO
IF ( dfft%nproc == 1 ) THEN
nppx = dfft%nr3x
END IF
gcomm = dfft%comm
ENDIF
!
sendsiz = ncpx * nppx
!
IF ( isgn .gt. 0 ) THEN
!
! "forward" scatter from columns to planes
!
! step one: store contiguously the slices
!
offset = 0
IF( use_tg_ ) THEN
DO proc = 1, nprocp
gproc = dtgs%nplist(proc)+1
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
f_aux ( kdest + i ) = f_in ( kfrom + i )
ENDDO
kdest = kdest + nppx
kfrom = kfrom + nr3x
ENDDO
offset = offset + npp_ ( gproc )
! post the non-blocking send; f_aux can't be overwritten until the operation has completed
CALL mpi_isend( f_aux( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, proc-1, me, gcomm, sh( proc ), ierr )
! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward send info<>0', abs(ierr) )
ENDDO
ELSE
DO proc = 1, nprocp
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( proc )
f_aux ( kdest + i ) = f_in ( kfrom + i )
ENDDO
kdest = kdest + nppx
kfrom = kfrom + nr3x
ENDDO
offset = offset + npp_ ( proc )
! post the non-blocking send; f_aux can't be overwritten until the operation has completed
CALL mpi_isend( f_aux( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, proc-1, me, gcomm, sh( proc ), ierr )
! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward send info<>0', abs(ierr) )
ENDDO
ENDIF
!
! step two: receive
!
DO proc = 1, nprocp
!
! now post the receive
!
CALL mpi_irecv( f_in( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, proc-1, MPI_ANY_TAG, gcomm, rh( proc ), ierr )
!IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward receive info<>0', abs(ierr) )
!
!
ENDDO
!
! maybe useless; ensures that no garbage is present in the output
!
!f_in( nprocp*sendsiz + 1 : size( f_in ) ) = 0.0_DP
!
call mpi_waitall( nprocp, sh, MPI_STATUSES_IGNORE, ierr )
!
f_aux = (0.d0, 0.d0)
!
call mpi_waitall( nprocp, rh, MPI_STATUSES_IGNORE, ierr )
!
IF( isgn == 1 ) THEN
DO ip = 1, dfft%nproc
ioff = dfft%iss( ip )
it = ( ip - 1 ) * sendsiz
DO i = 1, dfft%nsp( ip )
mc = dfft%ismap( i + ioff )
DO j = 1, dfft%npp( me )
f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + it )
ENDDO
it = it + nppx
ENDDO
ENDDO
ELSE
IF( use_tg_ ) THEN
npp = dtgs%tg_npp( me )
nnp = dfft%nr1x * dfft%nr2x
nblk = dtgs%nproc / dtgs%nogrp
nsiz = dtgs%nogrp
ELSE
npp = dfft%npp( me )
nnp = dfft%nnp
nblk = dfft%nproc
nsiz = 1
ENDIF
!
IF( ( dimref(1) .ne. npp ) .or. ( dimref(2) .ne. nnp ) .or. &
( dimref(3) .ne. nblk ) .or. ( dimref(4) .ne. nsiz ) ) THEN
!
IF( ALLOCATED( indmap ) ) &
DEALLOCATE( indmap )
ALLOCATE( indmap(2,SIZE(f_aux)) )
!
ijp = 0
!
DO gproc = 1, nblk
ii = 0
DO ipp = 1, nsiz
ioff = dfft%iss( (gproc-1)*nsiz + ipp )
DO i = 1, dfft%nsw( (gproc-1)*nsiz + ipp )
mc = dfft%ismap( i + ioff )
it = ii * nppx + (gproc-1) * sendsiz
DO j = 1, npp
ijp = ijp + 1
indmap(1,ijp) = mc + ( j - 1 ) * nnp
indmap(2,ijp) = j + it
ENDDO
ii = ii + 1
ENDDO
ENDDO
ENDDO
!
nijp = ijp
CALL fftsort( nijp, indmap )
dimref(1) = npp
dimref(2) = nnp
dimref(3) = nblk
dimref(4) = nsiz
!
END IF
!
DO ijp = 1, nijp
f_aux( indmap(1,ijp) ) = f_in( indmap(2,ijp) )
END DO
!
END IF
ELSE
!
! "backward" scatter from planes to columns
!
IF( isgn == -1 ) THEN
nblk = dfft%nproc
DO ip = 1, dfft%nproc
ioff = dfft%iss( ip )
it = ( ip - 1 ) * sendsiz
DO i = 1, dfft%nsp( ip )
mc = dfft%ismap( i + ioff )
DO j = 1, dfft%npp( me )
f_in( j + it ) = f_aux( mc + ( j - 1 ) * dfft%nnp )
ENDDO
it = it + nppx
ENDDO
CALL mpi_isend( f_in( (ip-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, ip-1, me, gcomm, sh( ip ), ierr )
! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward send info<>0', abs(ierr) )
ENDDO
DO ip = 1, dfft%nproc
CALL mpi_irecv( f_aux( (ip-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, ip-1, MPI_ANY_TAG, gcomm, rh(ip), ierr )
! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
ENDDO
ELSE
IF( use_tg_ ) THEN
npp = dtgs%tg_npp( me )
nnp = dfft%nr1x * dfft%nr2x
nblk = dtgs%nproc / dtgs%nogrp
nsiz = dtgs%nogrp
ELSE
npp = dfft%npp( me )
nnp = dfft%nnp
nblk = dfft%nproc
nsiz = 1
ENDIF
!
IF( ( dimref_bw(1) .ne. npp ) .or. ( dimref_bw(2) .ne. nnp ) .or. &
( dimref_bw(3) .ne. nblk ) .or. ( dimref_bw(4) .ne. nsiz ) ) THEN
!
IF( ALLOCATED( indmap_bw ) ) &
DEALLOCATE( indmap_bw )
!
ALLOCATE( indmap_bw(SIZE(f_aux)) )
!
ijp = 0
DO gproc = 1, nblk
ii = 0
DO ipp = 1, nsiz
ioff = dfft%iss( (gproc-1)*nsiz + ipp )
DO i = 1, dfft%nsw( (gproc-1)*nsiz + ipp )
mc = dfft%ismap( i + ioff )
it = ii * nppx + ( gproc - 1 ) * sendsiz
DO j = 1, npp
indmap_bw( j + ijp ) = mc + ( j - 1 ) * nnp
ENDDO
ijp = ijp + npp
ii = ii + 1
ENDDO
ENDDO
ENDDO
dimref_bw(1) = npp
dimref_bw(2) = nnp
dimref_bw(3) = nblk
dimref_bw(4) = nsiz
END IF
ijp = 0
DO gproc = 0, nblk-1
ii = gproc * sendsiz
DO ipp = gproc*nsiz+1, gproc*nsiz+nsiz
DO i = 1, dfft%nsw( ipp )
DO j = 1, npp
f_in( j + ii ) = f_aux( indmap_bw( j + ijp ) )
ENDDO
ijp = ijp + npp
ii = ii + nppx
ENDDO
ENDDO
CALL mpi_isend( f_in( gproc*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, gproc, me, gcomm, sh( gproc+1 ), ierr )
! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward send info<>0', abs(ierr) )
ENDDO
DO gproc = 1, nblk
CALL mpi_irecv( f_aux( (gproc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, gproc-1, MPI_ANY_TAG, gcomm, rh(gproc), ierr )
! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
ENDDO
END IF
!
call mpi_waitall( nblk, rh, MPI_STATUSES_IGNORE, ierr )
call mpi_waitall( nblk, sh, MPI_STATUSES_IGNORE, ierr )
!
offset = 0
IF( use_tg_ ) THEN
DO proc = 1, nprocp
gproc = dtgs%nplist(proc) + 1
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
f_in ( kfrom + i ) = f_aux ( kdest + i )
ENDDO
kdest = kdest + nppx
kfrom = kfrom + nr3x
ENDDO
offset = offset + npp_ ( gproc )
ENDDO
ELSE
DO proc = 1, nprocp
kdest = ( proc - 1 ) * sendsiz
kfrom = offset
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( proc )
f_in ( kfrom + i ) = f_aux ( kdest + i )
ENDDO
kdest = kdest + nppx
kfrom = kfrom + nr3x
ENDDO
offset = offset + npp_ ( proc )
ENDDO
ENDIF
ENDIF
CALL stop_clock ('fft_scatter')
#endif
RETURN
END SUBROUTINE fft_scatter
!
#endif
!
!
SUBROUTINE maps_sticks_to_3d( dffts, dtgs, f_in, nxx_, f_aux, isgn )
!
! this subroutine copies sticks stored in a 1D array into the 3D array
! to be used with a 3D FFT.
! It is meant for the use of the 3D scalar FFT in a parallel build,
! once the data have been "rotated" so that a single band resides in a single task
!
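! A minimal calling sketch (f_sticks and f_3d are hypothetical arrays,
! both of size nxx; only isgn = 2 is implemented):
!
!    CALL maps_sticks_to_3d( dffts, dtgs, f_sticks, nxx, f_3d, 2 )
!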
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
TYPE (fft_type_descriptor), INTENT(in) :: dffts
TYPE (task_groups_descriptor), INTENT(in) :: dtgs
INTEGER, INTENT(in) :: nxx_, isgn
COMPLEX (DP), INTENT(in) :: f_in (nxx_)
COMPLEX (DP), INTENT(out) :: f_aux (nxx_)
INTEGER :: ijp, ii, i, j, it, ioff, ipp, mc, jj, ip, gproc, nr12x
!
f_aux = 0.0d0
!
IF( isgn == 2 ) THEN
ip = 1
nr12x = dffts%nr1x * dffts%nr2x
DO gproc = 1, dtgs%nproc / dtgs%nogrp
ii = 0
DO ipp = 1, dtgs%nogrp
ioff = dffts%iss( ip )
DO i = 1, dffts%nsw( ip )
mc = dffts%ismap( i + ioff )
it = ( ii + ( gproc - 1 ) * dtgs%tg_ncpx ) * dtgs%tg_nppx
DO j = 1, dtgs%tg_npp( dffts%mype + 1 )
f_aux( mc + ( j - 1 ) * nr12x ) = f_in( j + it )
ENDDO
ii = ii + 1
ENDDO
ip = ip + 1
ENDDO
ENDDO
ELSE
CALL fftx_error__ (' maps_sticks_to_3d ', ' isgn .ne. 2 not implemented ', 999 )
END IF
RETURN
END SUBROUTINE maps_sticks_to_3d
!
!
!----------------------------------------------------------------------------
SUBROUTINE gather_real_grid ( dfft, f_in, f_out )
!----------------------------------------------------------------------------
!
! ... gathers a distributed real-space FFT grid to dfft%root, that is,
! ... the first processor of input descriptor dfft - version for real arrays
!
! ... REAL*8 f_in = distributed variable (dfft%nnr)
! ... REAL*8 f_out = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
!
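! ... A minimal usage sketch (rho_d and rho_g are hypothetical arrays):
! ...
! ...    REAL(DP) :: rho_d( dfft%nnr )
! ...    REAL(DP) :: rho_g( dfft%nr1x * dfft%nr2x * dfft%nr3x )
! ...    CALL gather_grid( dfft, rho_d, rho_g )
! ...
! ... on exit only processor dfft%root holds the complete grid in rho_g
!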
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
REAL(DP), INTENT(in) :: f_in (:)
REAL(DP), INTENT(inout):: f_out(:)
TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
!
#if defined(__MPI)
!
INTEGER :: proc, info
! ... the following are automatic arrays
INTEGER :: displs(0:dfft%nproc-1), recvcount(0:dfft%nproc-1)
!
IF( size( f_in ) < dfft%nnr ) &
CALL fftx_error__( ' gather_grid ', ' f_in too small ', dfft%nnr-size( f_in ) )
!
CALL start_clock( 'gather_grid' )
!
DO proc = 0, ( dfft%nproc - 1 )
!
recvcount(proc) = dfft%nnp * dfft%npp(proc+1)
IF ( proc == 0 ) THEN
displs(proc) = 0
ELSE
displs(proc) = displs(proc-1) + recvcount(proc-1)
ENDIF
!
ENDDO
!
! ... the following check should be performed only on processor dfft%root
! ... otherwise f_out must be allocated on all processors even if not used
!
info = size( f_out ) - displs( dfft%nproc-1 ) - recvcount( dfft%nproc-1 )
IF( info < 0 ) &
CALL fftx_error__( ' gather_grid ', ' f_out too small ', -info )
!
info = 0
!
CALL MPI_GATHERV( f_in, recvcount(dfft%mype), MPI_DOUBLE_PRECISION, f_out, &
recvcount, displs, MPI_DOUBLE_PRECISION, dfft%root, &
dfft%comm, info )
!
CALL fftx_error__( 'gather_grid', 'info<>0', info )
!
CALL stop_clock( 'gather_grid' )
!
#else
CALL fftx_error__('gather_grid', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE gather_real_grid
!----------------------------------------------------------------------------
SUBROUTINE gather_complex_grid ( dfft, f_in, f_out )
!----------------------------------------------------------------------------
!
! ... gathers a distributed real-space FFT grid to dfft%root, that is,
! ... the first processor of input descriptor dfft - complex arrays
!
! ... COMPLEX*16 f_in = distributed variable (dfft%nnr)
! ... COMPLEX*16 f_out = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
!
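! ... Note: each COMPLEX(DP) value is transferred as two MPI_DOUBLE_PRECISION
! ... numbers, hence the factor 2 in the counts below; e.g. with a
! ... (hypothetical) dfft%nnp = 100 and dfft%npp(1) = 8, the root receives
! ... 2*100*8 = 1600 double-precision words from the first processor.
!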
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
COMPLEX(DP), INTENT(in) :: f_in (:)
COMPLEX(DP), INTENT(inout):: f_out(:)
TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
!
#if defined(__MPI)
!
INTEGER :: proc, info
! ... the following are automatic arrays
INTEGER :: displs(0:dfft%nproc-1), recvcount(0:dfft%nproc-1)
!
IF( 2*size( f_in ) < dfft%nnr ) &
CALL fftx_error__( ' gather_grid ', ' f_in too small ', dfft%nnr-size( f_in ) )
!
CALL start_clock( 'gather_grid' )
!
DO proc = 0, ( dfft%nproc - 1 )
!
recvcount(proc) = 2*dfft%nnp * dfft%npp(proc+1)
IF ( proc == 0 ) THEN
displs(proc) = 0
ELSE
displs(proc) = displs(proc-1) + recvcount(proc-1)
ENDIF
!
ENDDO
!
! ... the following check should be performed only on processor dfft%root
! ... otherwise f_out must be allocated on all processors even if not used
!
info = 2*size( f_out ) - displs( dfft%nproc - 1 ) - recvcount( dfft%nproc-1 )
IF( info < 0 ) &
CALL fftx_error__( ' gather_grid ', ' f_out too small ', -info )
!
info = 0
!
CALL MPI_GATHERV( f_in, recvcount(dfft%mype), MPI_DOUBLE_PRECISION, f_out, &
recvcount, displs, MPI_DOUBLE_PRECISION, dfft%root, &
dfft%comm, info )
!
CALL fftx_error__( 'gather_grid', 'info<>0', info )
!
CALL stop_clock( 'gather_grid' )
!
#else
CALL fftx_error__('gather_grid', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE gather_complex_grid
!----------------------------------------------------------------------------
SUBROUTINE scatter_real_grid ( dfft, f_in, f_out )
!----------------------------------------------------------------------------
!
! ... scatters a real-space FFT grid from dfft%root, first processor of
! ... input descriptor dfft, to all others - opposite of "gather_grid"
!
! ... REAL*8 f_in = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
! ... REAL*8 f_out = distributed variable (dfft%nnr)
!
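! ... A minimal usage sketch (rho_g and rho_d are hypothetical arrays):
! ...
! ...    REAL(DP) :: rho_g( dfft%nr1x * dfft%nr2x * dfft%nr3x )
! ...    REAL(DP) :: rho_d( dfft%nnr )
! ...    CALL scatter_grid( dfft, rho_g, rho_d )
! ...
! ... only rho_g on processor dfft%root needs to contain valid data on entry
!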
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
REAL(DP), INTENT(in) :: f_in (:)
REAL(DP), INTENT(inout):: f_out(:)
TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
!
#if defined(__MPI)
!
INTEGER :: proc, info
! ... the following are automatic arrays
INTEGER :: displs(0:dfft%nproc-1), sendcount(0:dfft%nproc-1)
!
IF( size( f_out ) < dfft%nnr ) &
CALL fftx_error__( ' scatter_grid ', ' f_out too small ', dfft%nnr-size( f_out ) )
!
CALL start_clock( 'scatter_grid' )
!
DO proc = 0, ( dfft%nproc - 1 )
!
sendcount(proc) = dfft%nnp * dfft%npp(proc+1)
IF ( proc == 0 ) THEN
displs(proc) = 0
ELSE
displs(proc) = displs(proc-1) + sendcount(proc-1)
ENDIF
!
ENDDO
!
! ... the following check should be performed only on processor dfft%root
! ... otherwise f_in must be allocated on all processors even if not used
!
info = size( f_in ) - displs( dfft%nproc - 1 ) - sendcount( dfft%nproc - 1 )
IF( info < 0 ) &
CALL fftx_error__( ' scatter_grid ', ' f_in too small ', -info )
!
info = 0
!
CALL MPI_SCATTERV( f_in, sendcount, displs, MPI_DOUBLE_PRECISION, &
f_out, sendcount(dfft%mype), MPI_DOUBLE_PRECISION, &
dfft%root, dfft%comm, info )
!
CALL fftx_error__( 'scatter_grid', 'info<>0', info )
!
IF ( sendcount(dfft%mype) /= dfft%nnr ) &
f_out(sendcount(dfft%mype)+1:dfft%nnr) = 0.D0
!
CALL stop_clock( 'scatter_grid' )
!
#else
CALL fftx_error__('scatter_grid', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE scatter_real_grid
!----------------------------------------------------------------------------
SUBROUTINE scatter_complex_grid ( dfft, f_in, f_out )
!----------------------------------------------------------------------------
!
! ... scatters a real-space FFT grid from dfft%root, first processor of
! ... input descriptor dfft, to all others - opposite of "gather_grid"
!
! ... COMPLEX*16 f_in = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
! ... COMPLEX*16 f_out = distributed variable (dfft%nnr)
!
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
COMPLEX(DP), INTENT(in) :: f_in (:)
COMPLEX(DP), INTENT(inout):: f_out(:)
TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
!
#if defined(__MPI)
!
INTEGER :: proc, info
! ... the following are automatic arrays
INTEGER :: displs(0:dfft%nproc-1), sendcount(0:dfft%nproc-1)
!
IF( 2*size( f_out ) < dfft%nnr ) &
CALL fftx_error__( ' scatter_grid ', ' f_out too small ', dfft%nnr-2*size( f_out ) )
!
CALL start_clock( 'scatter_grid' )
!
DO proc = 0, ( dfft%nproc - 1 )
!
sendcount(proc) = 2*dfft%nnp * dfft%npp(proc+1)
IF ( proc == 0 ) THEN
displs(proc) = 0
ELSE
displs(proc) = displs(proc-1) + sendcount(proc-1)
ENDIF
!
ENDDO
!
! ... the following check should be performed only on processor dfft%root
! ... otherwise f_in must be allocated on all processors even if not used
!
info = 2*size( f_in ) - displs( dfft%nproc - 1 ) - sendcount( dfft%nproc - 1 )
IF( info < 0 ) &
CALL fftx_error__( ' scatter_grid ', ' f_in too small ', -info )
!
info = 0
!
CALL MPI_SCATTERV( f_in, sendcount, displs, MPI_DOUBLE_PRECISION, &
f_out, sendcount(dfft%mype), MPI_DOUBLE_PRECISION, &
dfft%root, dfft%comm, info )
!
CALL fftx_error__( 'scatter_grid', 'info<>0', info )
!
IF ( sendcount(dfft%mype) /= dfft%nnr ) &
f_out(sendcount(dfft%mype)+1:dfft%nnr) = 0.D0
!
CALL stop_clock( 'scatter_grid' )
!
#else
CALL fftx_error__('scatter_grid', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE scatter_complex_grid
!
! ... "gather"-like subroutines
!
!-----------------------------------------------------------------------
SUBROUTINE cgather_sym( dfftp, f_in, f_out )
!-----------------------------------------------------------------------
!
! ... gather complex data for symmetrization (used in phonon code)
! ... Differs from gather_grid because mpi_allgatherv is used instead
! ... of mpi_gatherv - all data is gathered on ALL processors
! ... COMPLEX*16 f_in = distributed variable (nrxx)
! ... COMPLEX*16 f_out = gathered variable (nr1x*nr2x*nr3x)
!
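! ... A minimal usage sketch (drho_d and drho_g are hypothetical arrays):
! ...
! ...    COMPLEX(DP) :: drho_d( dfftp%nnr )
! ...    COMPLEX(DP) :: drho_g( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x )
! ...    CALL cgather_sym( dfftp, drho_d, drho_g )
! ...
! ... on exit every processor holds the complete grid in drho_g
!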
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
TYPE (fft_type_descriptor), INTENT(in) :: dfftp
COMPLEX(DP) :: f_in( : ), f_out(:)
!
#if defined(__MPI)
!
INTEGER :: proc, info
INTEGER :: displs(0:dfftp%nproc-1), recvcount(0:dfftp%nproc-1)
!
!
CALL start_clock( 'cgather' )
!
DO proc = 0, ( dfftp%nproc - 1 )
!
recvcount(proc) = 2 * dfftp%nnp * dfftp%npp(proc+1)
IF ( proc == 0 ) THEN
displs(proc) = 0
ELSE
displs(proc) = displs(proc-1) + recvcount(proc-1)
ENDIF
!
ENDDO
!
CALL MPI_BARRIER( dfftp%comm, info )
!
CALL MPI_ALLGATHERV( f_in, recvcount(dfftp%mype), MPI_DOUBLE_PRECISION, &
f_out, recvcount, displs, MPI_DOUBLE_PRECISION, &
dfftp%comm, info )
!
CALL fftx_error__( 'cgather_sym', 'info<>0', info )
!
CALL stop_clock( 'cgather' )
!
#else
CALL fftx_error__('cgather_sym', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE cgather_sym
!
!
!-----------------------------------------------------------------------
SUBROUTINE cgather_sym_many( dfftp, f_in, f_out, nbnd, nbnd_proc, start_nbnd_proc )
!-----------------------------------------------------------------------
!
! ... Written by A. Dal Corso
!
! ... This routine generalizes cgather_sym, receiving nbnd complex
! ... distributed functions and collecting nbnd_proc(dfftp%mype+1)
! ... functions on each processor.
! ... start_nbnd_proc(dfftp%mype+1) says where the data for each processor
! ... start in the distributed variable
! ... COMPLEX*16 f_in = distributed variable (nrxx,nbnd)
! ... COMPLEX*16 f_out = gathered variable (nr1x*nr2x*nr3x,
! nbnd_proc(dfftp%mype+1))
!
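! ... A minimal calling sketch (psi_d and psi_g are hypothetical arrays,
! ... shaped as the dummy arguments below):
! ...
! ...    COMPLEX(DP) :: psi_d( dfftp%nnr, nbnd )
! ...    COMPLEX(DP) :: psi_g( dfftp%nnp*dfftp%nr3x, nbnd_proc(dfftp%mype+1) )
! ...    CALL cgather_sym_many( dfftp, psi_d, psi_g, nbnd, nbnd_proc, start_nbnd_proc )
!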
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
TYPE (fft_type_descriptor), INTENT(in) :: dfftp
INTEGER :: nbnd, nbnd_proc(dfftp%nproc), start_nbnd_proc(dfftp%nproc)
COMPLEX(DP) :: f_in(dfftp%nnr,nbnd)
COMPLEX(DP) :: f_out(dfftp%nnp*dfftp%nr3x,nbnd_proc(dfftp%mype+1))
!
#if defined(__MPI)
!
INTEGER :: proc, info
INTEGER :: ibnd, jbnd
INTEGER :: displs(0:dfftp%nproc-1), recvcount(0:dfftp%nproc-1)
!
!
CALL start_clock( 'cgather' )
!
DO proc = 0, ( dfftp%nproc - 1 )
!
recvcount(proc) = 2 * dfftp%nnp * dfftp%npp(proc+1)
!
IF ( proc == 0 ) THEN
!
displs(proc) = 0
!
ELSE
!
displs(proc) = displs(proc-1) + recvcount(proc-1)
!
ENDIF
!
ENDDO
!
CALL MPI_BARRIER( dfftp%comm, info )
!
DO proc = 0, dfftp%nproc - 1
DO ibnd = 1, nbnd_proc(proc+1)
jbnd = start_nbnd_proc(proc+1) + ibnd - 1
CALL MPI_GATHERV( f_in(1,jbnd), recvcount(dfftp%mype), &
MPI_DOUBLE_PRECISION, f_out(1,ibnd), recvcount, &
displs, MPI_DOUBLE_PRECISION, proc, dfftp%comm, info )
END DO
END DO
!
CALL fftx_error__( 'cgather_sym_many', 'info<>0', info )
!
! CALL mp_barrier( dfftp%comm )
!
CALL stop_clock( 'cgather' )
!
#else
CALL fftx_error__('cgather_sym_many', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE cgather_sym_many
!
!----------------------------------------------------------------------------
SUBROUTINE cscatter_sym_many( dfftp, f_in, f_out, target_ibnd, nbnd, nbnd_proc, &
start_nbnd_proc )
!----------------------------------------------------------------------------
!
! ... Written by A. Dal Corso
!
! ... generalizes cscatter_sym. It assumes that each processor has
! ... a certain number of bands (nbnd_proc(dfftp%mype+1)). The processor
! ... that has target_ibnd scatters it to all the other processors
! ... that receive a distributed part of the target function.
! ... start_nbnd_proc(dfftp%mype+1) is used to identify the processor
! ... that has the required band
!
! ... COMPLEX*16 f_in = gathered variable (nr1x*nr2x*nr3x,
! nbnd_proc(dfftp%mype+1) )
! ... COMPLEX*16 f_out = distributed variable (nrxx)
!
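! ... A minimal calling sketch (psi_g and psi_d are hypothetical arrays):
! ... to obtain the distributed form of band target_ibnd on every processor,
! ...
! ...    CALL cscatter_sym_many( dfftp, psi_g, psi_d, target_ibnd, nbnd, &
! ...                            nbnd_proc, start_nbnd_proc )
! ...
! ... where psi_d has size dfftp%nnr
!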
IMPLICIT NONE
#if defined(__MPI)
INCLUDE 'mpif.h'
#endif
!
TYPE (fft_type_descriptor), INTENT(in) :: dfftp
INTEGER :: nbnd, nbnd_proc(dfftp%nproc), start_nbnd_proc(dfftp%nproc)
COMPLEX(DP) :: f_in(dfftp%nnp*dfftp%nr3x,nbnd_proc(dfftp%mype+1))
COMPLEX(DP) :: f_out(dfftp%nnr)
INTEGER :: target_ibnd
!
#if defined(__MPI)
!
INTEGER :: proc, info
INTEGER :: displs(0:dfftp%nproc-1), sendcount(0:dfftp%nproc-1)
INTEGER :: ibnd, jbnd
!
!
CALL start_clock( 'cscatter_sym' )
!
DO proc = 0, ( dfftp%nproc - 1 )
!
sendcount(proc) = 2 * dfftp%nnp * dfftp%npp(proc+1)
!
IF ( proc == 0 ) THEN
!
displs(proc) = 0
!
ELSE
!
displs(proc) = displs(proc-1) + sendcount(proc-1)
!
ENDIF
!
ENDDO
!
f_out = (0.0_DP, 0.0_DP)
!
CALL MPI_BARRIER( dfftp%comm, info )
!
DO proc = 0, dfftp%nproc - 1
DO ibnd = 1, nbnd_proc(proc+1)
jbnd = start_nbnd_proc(proc+1) + ibnd - 1
IF (jbnd==target_ibnd) &
CALL MPI_SCATTERV( f_in(1,ibnd), sendcount, displs, &
MPI_DOUBLE_PRECISION, f_out, sendcount(dfftp%mype), &
MPI_DOUBLE_PRECISION, proc, dfftp%comm, info )
ENDDO
ENDDO
!
CALL fftx_error__( 'cscatter_sym_many', 'info<>0', info )
!
CALL stop_clock( 'cscatter_sym' )
!
#else
CALL fftx_error__('cscatter_sym_many', 'do not use in serial execution', 1)
#endif
!
RETURN
!
END SUBROUTINE cscatter_sym_many
!=----------------------------------------------------------------------=!
END MODULE scatter_mod
!=----------------------------------------------------------------------=!
!
!
!---------------------------------------------------------------------
subroutine fftsort (n, ia)
!---------------------------------------------------------------------
! sort the integer array ia(1:2,1:n) into ascending order of its first
! row ia(1,:) using the heapsort algorithm; the second row ia(2,:) is
! carried along so that each (key,value) pair stays together.
! n is input, ia is replaced on output by its sorted rearrangement.
!
! no work space needed !
! free us from machine-dependent sorting-routines !
!
! adapted from Numerical Recipes pg. 329 (new edition)
!
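! example: for n = 3 and the pairs (first row = key, second row = payload)
!    ia = reshape( (/ 5, 10,  2, 20,  9, 30 /), (/ 2, 3 /) )
! i.e. (5,10), (2,20), (9,30), the routine returns them ordered by key:
! (2,20), (5,10), (9,30)
!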
implicit none
!-input/output variables
integer :: n
integer :: ia (2,n)
!-local variables
integer :: i, ir, j, l
integer :: iia(2)
! nothing to order
if (n.lt.2) return
! initialize indices for hiring and retirement-promotion phase
l = n / 2 + 1
ir = n
10 continue
! still in hiring phase
if (l.gt.1) then
l = l - 1
iia(:) = ia (:,l)
! in retirement-promotion phase.
else
! clear a space at the end of the array
iia(:) = ia (:,ir)
!
! retire the top of the heap into it
ia (:,ir) = ia (:,1)
!
! decrease the size of the corporation
ir = ir - 1
! done with the last promotion
if (ir.eq.1) then
! the least competent worker of all !
ia (:,1) = iia(:)
!
return
endif
endif
! whether in hiring or promotion phase, we
i = l
! set up to place iia in its proper level
j = l + l
!
do while (j.le.ir)
if (j.lt.ir) then
if (ia (1,j) .lt. ia (1,j + 1) ) then
j = j + 1
endif
endif
! demote iia
if (iia(1).lt.ia (1,j) ) then
ia (:,i) = ia (:,j)
i = j
j = j + j
else
! set j to terminate do-while loop
j = ir + 1
endif
enddo
ia (:,i) = iia(:)
goto 10
!
end subroutine fftsort