mirror of https://gitlab.com/QEF/q-e.git
991 lines
26 KiB
Fortran
991 lines
26 KiB
Fortran
!
|
|
! Copyright (C) 2001-2004 PWSCF group
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
#include "f_defs.h"
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
MODULE para_const
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
SAVE
|
|
!
|
|
INTEGER, PARAMETER :: &
|
|
maxproc = 128 ! maximum number of processors
|
|
END MODULE para_const
|
|
!
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
MODULE pfft
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... parallel fft information for the dense grid
|
|
!
|
|
USE para_const
|
|
!
|
|
SAVE
|
|
!
|
|
INTEGER :: &
|
|
npp(maxproc), &! number of plane per processor
|
|
ncp(maxproc), &! number of (density) columns per proc
|
|
ncp0(maxproc), &! starting column for each processor
|
|
ncplane, &! number of columns in a plane
|
|
nct, &! total number of non-zero columns
|
|
nxx ! local fft data dim
|
|
!
|
|
END MODULE pfft
|
|
!
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
MODULE pffts
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... parallel fft information for the smooth grid
|
|
!
|
|
USE para_const
|
|
!
|
|
SAVE
|
|
!
|
|
INTEGER :: &
|
|
nkcp(maxproc) ! number of (wfs) columns per processor
|
|
INTEGER :: &
|
|
npps(maxproc), &! number of plane per processor
|
|
ncps(maxproc), &! number of (density) columns per proc
|
|
ncp0s(maxproc), &! starting column for each processor
|
|
ncplanes, &! number of columns in a plane
|
|
ncts, &! total number of non-zero columns
|
|
nxxs ! local fft data dim
|
|
!
|
|
END MODULE pffts
|
|
!
|
|
! ... here are all parallel subroutines (wrappers to MPI calls) used
|
|
! ... by the PWscf code
|
|
!
|
|
! ... "reduce"-like subroutines
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE reduce( dim, ps )
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... sums a distributed variable ps(dim) over the processors.
|
|
! ... This version uses a fixed-length buffer of appropriate (?) dim
|
|
! ... uses SHMEM for the T3D/T3E, MPI otherwhise
|
|
!
|
|
USE mp_global, ONLY : intra_pool_comm, my_pool_id, nproc_pool, npool
|
|
USE mp, ONLY : mp_barrier
|
|
USE kinds, ONLY : DP
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: dim
|
|
REAL (KIND=DP) :: ps(dim)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: info, n, nbuf
|
|
INTEGER, PARAMETER :: maxb = 10000
|
|
!
|
|
# if (defined __SHMEM && defined __ALTIX) || (defined __SHMEM && defined __ORIGIN)
|
|
INTEGER :: sym_len
|
|
LOGICAL :: first
|
|
REAL (KIND=DP) :: buff(*), snd_buff(*)
|
|
POINTER (buff_p, buff), (snd_buff_p, snd_buff)
|
|
COMMON /sym_heap1/ buff_p, snd_buff_p, sym_len, first
|
|
# else
|
|
REAL (KIND=DP) :: buff(maxb)
|
|
# endif
|
|
!
|
|
# if defined (__SHMEM)
|
|
!
|
|
! ... SHMEM specific
|
|
!
|
|
INCLUDE 'mpp/shmem.fh'
|
|
# if defined (__ALTIX) || defined (__ORIGIN)
|
|
INTEGER :: pWrkSync(SHMEM_REDUCE_SYNC_SIZE), &
|
|
& pWrkData(1024*1024), start
|
|
DATA pWrkSync /SHMEM_REDUCE_SYNC_SIZE*SHMEM_SYNC_VALUE/
|
|
DATA pWrkData / 1048576 * 0 /
|
|
# else
|
|
INTEGER :: pWrkSync, pWrkData, start
|
|
COMMON / SH_SYNC / pWrkSync(SHMEM_BARRIER_SYNC_dim)
|
|
COMMON / SH_DATA / pWrkData(1024*1024)
|
|
DATA pWrkData / 1048576 * 0 /
|
|
DATA pWrkSync / SHMEM_BARRIER_SYNC_dim * SHMEM_SYNC_VALUE /
|
|
!DIR$ CACHE_ALIGN /SH_SYNC/
|
|
!DIR$ CACHE_ALIGN /SH_DATA/
|
|
# endif
|
|
!
|
|
# endif
|
|
!
|
|
!
|
|
IF ( dim <= 0 .OR. nproc_pool <= 1 ) RETURN
|
|
!
|
|
CALL start_clock( 'reduce' )
|
|
!
|
|
! ... syncronize processes - maybe unneeded on T3D but necessary on T3E !!!
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
nbuf = dim / maxb
|
|
!
|
|
# if defined (__SHMEM)
|
|
# if defined (__ALTIX) || defined (__ORIGIN)
|
|
IF (dim .GT. sym_len) THEN
|
|
IF (sym_len .NE. 0) THEN
|
|
CALL shpdeallc( snd_buff_p, info, -1 )
|
|
END IF
|
|
sym_len = dim
|
|
CALL shpalloc( snd_buff_p, 2*sym_len, info, -1 )
|
|
END IF
|
|
IF (first .NE. .TRUE.) THEN
|
|
CALL shpalloc( buff_p, 2*maxb, info, -1 )
|
|
first = .TRUE.
|
|
END IF
|
|
snd_buff(1:dim) = ps(1:dim)
|
|
# endif
|
|
!
|
|
start = my_pool_id * nproc_pool
|
|
!
|
|
# endif
|
|
!
|
|
DO n = 1, nbuf
|
|
!
|
|
# if defined (__SHMEM)
|
|
!
|
|
# if defined (__ALTIX) || defined (__ORIGIN)
|
|
CALL SHMEM_REAL8_SUM_TO_ALL( buff, snd_buff(1+(n-1)*maxb), maxb, &
|
|
start, 0, nproc_pool, pWrkData, pWrkSync )
|
|
# else
|
|
CALL SHMEM_REAL8_SUM_TO_ALL( buff, ps(1+(n-1)*maxb), maxb, &
|
|
start, 0, nproc_pool, pWrkData, pWrkSync )
|
|
#endif
|
|
!
|
|
# else
|
|
!
|
|
CALL MPI_ALLREDUCE( ps(1+(n-1)*maxb), buff, maxb, MPI_REAL8, &
|
|
MPI_SUM, intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'reduce', 'error in allreduce1', info )
|
|
!
|
|
# endif
|
|
!
|
|
ps((1+(n-1)*maxb):(n*maxb)) = buff(1:maxb)
|
|
!
|
|
END DO
|
|
!
|
|
! ... possible remaining elements < maxb
|
|
!
|
|
IF ( ( dim - nbuf * maxb ) > 0 ) THEN
|
|
!
|
|
# if defined (__SHMEM)
|
|
!
|
|
# if defined (__ALTIX) || defined (__ORIGIN)
|
|
CALL SHMEM_REAL8_SUM_TO_ALL( buff, snd_buff(1+nbuf*maxb), &
|
|
& (dim-nbuf*maxb), start, 0, nproc_pool,&
|
|
& pWrkData, pWrkSync )
|
|
# else
|
|
CALL SHMEM_REAL8_SUM_TO_ALL( buff, ps(1+nbuf*maxb), (dim-nbuf*maxb), &
|
|
start, 0, nproc_pool, pWrkData, pWrkSync )
|
|
# endif
|
|
!
|
|
# else
|
|
!
|
|
CALL MPI_ALLREDUCE( ps(1+nbuf*maxb), buff, (dim-nbuf*maxb), MPI_REAL8, &
|
|
MPI_SUM, intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'reduce', 'error in allreduce2', info )
|
|
!
|
|
# endif
|
|
!
|
|
ps((1+nbuf*maxb):dim) = buff(1:(dim-nbuf*maxb))
|
|
!
|
|
END IF
|
|
!
|
|
CALL stop_clock( 'reduce' )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE reduce
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE ireduce( dim, is )
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... sums a distributed variable is(dim) over the processors.
|
|
!
|
|
USE mp_global, ONLY : intra_pool_comm, nproc_pool, npool
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: dim, is(dim)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: info, n, m, nbuf
|
|
INTEGER, PARAMETER :: maxi = 500
|
|
INTEGER :: buff(maxi)
|
|
!
|
|
!
|
|
IF ( dim <= 0 .OR. nproc_pool <= 1 ) RETURN
|
|
!
|
|
! ... syncronize processes
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
nbuf = dim / maxi
|
|
!
|
|
DO n = 1, nbuf
|
|
!
|
|
CALL MPI_ALLREDUCE( is(1+(n-1)*maxi), buff, maxi, MPI_INTEGER, &
|
|
MPI_SUM, intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'ireduce', 'error in allreduce 1', info )
|
|
!
|
|
is((1+(n-1)*maxi):(n*maxi)) = buff(1:maxi)
|
|
!
|
|
END DO
|
|
!
|
|
! ... possible remaining elements < maxi
|
|
!
|
|
IF ( ( dim - nbuf * maxi ) > 0 ) THEN
|
|
!
|
|
CALL MPI_ALLREDUCE( is(1+nbuf*maxi), buff, (dim-nbuf*maxi), MPI_INTEGER, &
|
|
MPI_SUM, intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'reduce', 'error in allreduce 2', info )
|
|
!
|
|
is((1+nbuf*maxi):dim) = buff(1:(dim-nbuf*maxi))
|
|
!
|
|
END IF
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE ireduce
|
|
!
|
|
!------------------------------------------------------------------------
|
|
SUBROUTINE poolreduce( dim, ps )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! ... Sums a distributed variable ps(dim) over the pools.
|
|
! ... This MPI-only version uses a fixed-length buffer
|
|
!
|
|
USE mp_global, ONLY : inter_pool_comm, intra_image_comm, &
|
|
my_pool_id, nproc_pool, npool
|
|
USE mp, ONLY : mp_barrier
|
|
USE kinds, ONLY : DP
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: dim
|
|
REAL (KIND=DP) :: ps(dim)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER, PARAMETER :: maxb = 10000
|
|
REAL (KIND=DP) :: buff(maxb)
|
|
INTEGER :: info, nbuf, n
|
|
!
|
|
!
|
|
IF ( dim <= 0 .OR. npool <= 1 ) RETURN
|
|
!
|
|
CALL start_clock( 'poolreduce' )
|
|
!
|
|
! ... MPI syncronize processes
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
!
|
|
nbuf = dim / maxb
|
|
!
|
|
DO n = 1, nbuf
|
|
!
|
|
CALL MPI_ALLREDUCE( ps(1+(n-1)*maxb), buff, maxb, MPI_REAL8, &
|
|
MPI_SUM, inter_pool_comm, info )
|
|
!
|
|
CALL errore( 'poolreduce', 'info<>0 at allreduce1', info )
|
|
!
|
|
ps((1+(n-1)*maxb):(n*maxb)) = buff(1:maxb)
|
|
!
|
|
END DO
|
|
!
|
|
IF ( ( dim - nbuf * maxb ) > 0 ) THEN
|
|
!
|
|
CALL MPI_ALLREDUCE( ps(1+nbuf*maxb), buff, (dim-nbuf*maxb), MPI_REAL8, &
|
|
MPI_SUM, inter_pool_comm, info )
|
|
!
|
|
CALL errore( 'poolreduce', 'info<>0 at allreduce2', info )
|
|
!
|
|
ps((1+nbuf*maxb):dim) = buff(1:(dim-nbuf*maxb))
|
|
!
|
|
END IF
|
|
!
|
|
CALL stop_clock( 'poolreduce' )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE poolreduce
|
|
!
|
|
! ... "gather"-like subroutines
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE gather( f_in, f_out )
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... gathers nproc_pool distributed data on the first processor of every pool
|
|
!
|
|
! ... REAL*8 f_in = distributed variable (nxx)
|
|
! ... REAL*8 f_out = gathered variable (nrx1*nrx2*nrx3)
|
|
!
|
|
USE pfft, ONLY : ncplane, npp, nxx
|
|
USE mp_global, ONLY : intra_pool_comm, nproc_pool, me_pool, root_pool
|
|
USE mp, ONLY : mp_barrier
|
|
USE kinds, ONLY : DP
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
REAL (KIND=DP) :: f_in(nxx), f_out(*)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: proc, info
|
|
INTEGER :: displs(0:nproc_pool-1), recvcount(0:nproc_pool-1)
|
|
!
|
|
!
|
|
CALL start_clock( 'gather' )
|
|
!
|
|
DO proc = 0, ( nproc_pool - 1 )
|
|
!
|
|
recvcount(proc) = ncplane * npp(proc+1)
|
|
!
|
|
IF ( proc == 0 ) THEN
|
|
!
|
|
displs(proc) = 0
|
|
!
|
|
ELSE
|
|
!
|
|
displs(proc) = displs(proc-1) + recvcount(proc-1)
|
|
!
|
|
END IF
|
|
!
|
|
END DO
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
CALL MPI_GATHERV( f_in, recvcount(me_pool), MPI_REAL8, f_out, &
|
|
recvcount, displs, MPI_REAL8, root_pool, &
|
|
intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'gather', 'info<>0', info )
|
|
!
|
|
CALL stop_clock( 'gather' )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE gather
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE cgather_sym( f_in, f_out )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! ... gather complex data for symmetrization (in phonon code)
|
|
! ... COMPLEX*16 f_in = distributed variable (nrxx)
|
|
! ... COMPLEX*16 f_out = gathered variable (nrx1*nrx2*nrx3)
|
|
!
|
|
USE pfft, ONLY : ncplane, npp, nxx
|
|
USE mp_global, ONLY : intra_pool_comm, intra_image_comm, &
|
|
nproc_pool, me_pool
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
COMPLEX(KIND=DP) :: f_in(nxx), f_out(*)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: proc, info
|
|
INTEGER :: displs(0:nproc_pool-1), recvcount(0:nproc_pool-1)
|
|
!
|
|
!
|
|
CALL start_clock( 'cgather' )
|
|
!
|
|
DO proc = 0, ( nproc_pool - 1 )
|
|
!
|
|
recvcount(proc) = 2 * ncplane * npp(proc+1)
|
|
!
|
|
IF ( proc == 0 ) THEN
|
|
!
|
|
displs(proc) = 0
|
|
!
|
|
ELSE
|
|
!
|
|
displs(proc) = displs(proc-1) + recvcount(proc-1)
|
|
!
|
|
END IF
|
|
!
|
|
END DO
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
CALL MPI_ALLGATHERV( f_in, recvcount(me_pool), MPI_REAL8, &
|
|
f_out, recvcount, displs, MPI_REAL8, &
|
|
intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'cgather_sym', 'info<>0', info )
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
!
|
|
CALL stop_clock( 'cgather' )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE cgather_sym
|
|
!
|
|
! ... "scatter"-like subroutines
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE scatter( f_in, f_out )
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... scatters data from the first processor of every pool
|
|
!
|
|
! ... REAL*8 f_in = gathered variable (nrx1*nrx2*nrx3)
|
|
! ... REAL*8 f_out = distributed variable (nxx)
|
|
!
|
|
USE pfft, ONLY : ncplane, npp, nxx
|
|
USE mp_global, ONLY : intra_pool_comm, nproc_pool, &
|
|
me_pool, root_pool
|
|
USE mp, ONLY : mp_barrier
|
|
USE kinds, ONLY : DP
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
REAL (KIND=DP) :: f_in(*), f_out(nxx)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: proc, info
|
|
INTEGER :: displs(0:nproc_pool-1), sendcount(0:nproc_pool-1)
|
|
!
|
|
!
|
|
CALL start_clock( 'scatter' )
|
|
!
|
|
DO proc = 0, ( nproc_pool - 1 )
|
|
!
|
|
sendcount(proc) = ncplane * npp(proc+1)
|
|
!
|
|
IF ( proc == 0 ) THEN
|
|
!
|
|
displs(proc) = 0
|
|
!
|
|
ELSE
|
|
!
|
|
displs(proc) = displs(proc-1) + sendcount(proc-1)
|
|
!
|
|
END IF
|
|
!
|
|
END DO
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
CALL MPI_SCATTERV( f_in, sendcount, displs, MPI_REAL8, &
|
|
f_out, sendcount(me_pool), MPI_REAL8, &
|
|
root_pool, intra_pool_comm, info )
|
|
!
|
|
CALL errore( 'scatter', 'info<>0', info )
|
|
!
|
|
IF ( sendcount(me_pool) /= nxx ) f_out(sendcount(me_pool)+1:nxx) = 0.D0
|
|
!
|
|
CALL stop_clock( 'scatter' )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE scatter
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE poolscatter( nsize, nkstot, f_in, nks, f_out )
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... This routine scatters a quantity ( typically the eigenvalues )
|
|
! ... among the pools.
|
|
! ... On input, f_in is required only on the first node of the first pool.
|
|
! ... f_in and f_out may coincide.
|
|
! ... Not a smart implementation!
|
|
!
|
|
USE kinds, ONLY : DP
|
|
USE mp_global, ONLY : intra_pool_comm, inter_pool_comm, &
|
|
my_pool_id, npool, me_pool, root_pool, kunit
|
|
USE mp, ONLY : mp_bcast
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: nsize, nkstot, nks
|
|
! first dimension of vectors f_in and f_out
|
|
! number of k-points per pool
|
|
! total number of k-points
|
|
REAL (KIND=DP) :: f_in(nsize,nkstot), f_out(nsize,nks)
|
|
! input ( contains values for all k-point )
|
|
! output ( only for k-points of mypool )
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: rest, nbase
|
|
! the rest of the integer division nkstot / npo
|
|
! the position in the original list
|
|
!
|
|
!
|
|
! ... copy from the first node of the first pool
|
|
! ... to the first node of all the other pools
|
|
!
|
|
IF ( me_pool == root_pool ) &
|
|
CALL mp_bcast( f_in, root_pool, inter_pool_comm )
|
|
!
|
|
! ... distribute the vector on the first node of each pool
|
|
!
|
|
rest = nkstot / kunit - ( nkstot / kunit / npool ) * npool
|
|
!
|
|
nbase = nks * my_pool_id
|
|
!
|
|
IF ( ( my_pool_id + 1 ) > rest ) nbase = nbase + rest * kunit
|
|
!
|
|
f_out(:,1:nks) = f_in(:,(nbase+1):(nbase+nks))
|
|
!
|
|
! ... copy from the first node of every pool
|
|
! ... to the other nodes of every pool
|
|
!
|
|
CALL mp_bcast( f_out, root_pool, intra_pool_comm )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE poolscatter
|
|
!
|
|
!----------------------------------------------------------------------------
|
|
SUBROUTINE fft_scatter1( f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign )
|
|
!----------------------------------------------------------------------------
|
|
!
|
|
! ... transpose the fft grid across nodes
|
|
!
|
|
! ... a) From columns to planes (sign > 0)
|
|
!
|
|
! ... "columns" (or "pencil") representation:
|
|
!
|
|
! ... processor "me" has ncp_(me) contiguous columns along z
|
|
! ... Each column has nrx3 elements for a fft of order nr3
|
|
! ... nrx3 can be = nr3 + 1 in order to reduce memory conflicts.
|
|
!
|
|
! ... The transpose take places in two steps:
|
|
!
|
|
! ... 1) on each processor the columns are divided into slices along z
|
|
! ... that are stored contiguously. On processor "me", slices for
|
|
! ... processor "proc" are npp_(proc)*ncp_(me) big
|
|
! ... 2) all processors communicate to exchange slices
|
|
! ... (all columns with z in the slice belonging to "me"
|
|
! ... must be received, all the others must be sent to "proc")
|
|
!
|
|
! ... Finally one gets the "planes" representation:
|
|
!
|
|
! ... processor "me" has npp_(me) complete xy planes
|
|
!
|
|
! ... b) From planes to columns (sign < 0)
|
|
!
|
|
! ... Quite the same in the opposite direction
|
|
!
|
|
! ... The output is overwritten on f_in ; f_aux is used as work space
|
|
!
|
|
USE para_const, ONLY : maxproc
|
|
USE mp_global, ONLY : intra_pool_comm, nproc_pool, me_pool
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: nrx3, nxx_, sign, ncp_(maxproc), npp_(maxproc)
|
|
REAL (KIND=DP) :: f_in(2*nxx_), f_aux(2*nxx_)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: dest, from, k, proc, ierr
|
|
INTEGER :: offset1(0:maxproc-1), sendcount(0:maxproc-1), &
|
|
sdispls(0:maxproc-1), recvcount(0:maxproc-1), &
|
|
rdispls(0:maxproc-1)
|
|
!
|
|
!
|
|
IF ( nproc_pool == 1 ) RETURN
|
|
!
|
|
CALL start_clock( 'fft_scatter' )
|
|
!
|
|
! ... sendcount(proc): amount of data processor "me" must send to processor
|
|
! ... recvcount(proc): amount of data processor "me" must receive from
|
|
!
|
|
DO proc = 0, ( nproc_pool - 1 )
|
|
!
|
|
sendcount(proc) = 2 * npp_(proc+1) * ncp_(me_pool+1)
|
|
recvcount(proc) = 2 * npp_(me_pool+1) * ncp_(proc+1)
|
|
!
|
|
END DO
|
|
!
|
|
! ... offset1(proc) is used to locate the slices to be sent to proc
|
|
! ... sdispls(proc)+1 is the beginning of data that must be sent to proc
|
|
! ... rdispls(proc)+1 is the beginning of data that must be received from proc
|
|
!
|
|
offset1(0) = 1
|
|
sdispls(0) = 0
|
|
rdispls(0) = 0
|
|
!
|
|
DO proc = 1, ( nproc_pool - 1 )
|
|
!
|
|
offset1(proc) = offset1(proc-1) + 2 * npp_(proc)
|
|
sdispls(proc) = sdispls(proc-1) + sendcount(proc-1)
|
|
rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
|
|
!
|
|
END DO
|
|
!
|
|
ierr = 0
|
|
!
|
|
IF ( sign > 0 ) THEN
|
|
!
|
|
! ... "forward" scatter from columns to planes
|
|
!
|
|
! ... step one: store contiguously the slices
|
|
!
|
|
DO proc = 0, ( nproc_pool - 1 )
|
|
!
|
|
from = offset1(proc)
|
|
dest = 1 + sdispls(proc)
|
|
!
|
|
DO k = 1, ncp_(me_pool+1)
|
|
!
|
|
CALL DCOPY( 2*npp_(proc+1), f_in(from+2*(k-1)*nrx3), &
|
|
1, f_aux(dest+2*(k-1)*npp_(proc+1)), 1 )
|
|
!
|
|
END DO
|
|
!
|
|
END DO
|
|
!
|
|
! ... maybe useless; ensures that no garbage is present in the output
|
|
!
|
|
f_in(:) = ( 0.D0, 0.D0 )
|
|
!
|
|
! ... step two: communication
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
CALL MPI_ALLTOALLV( f_aux, sendcount, sdispls, MPI_REAL8, &
|
|
f_in, recvcount, rdispls, MPI_REAL8, &
|
|
intra_pool_comm, ierr )
|
|
!
|
|
CALL errore( 'fft_scatter', 'info<>0', ierr )
|
|
!
|
|
ELSE
|
|
!
|
|
! ... "backward" scatter from planes to columns
|
|
!
|
|
! ... step two: communication
|
|
!
|
|
CALL mp_barrier( intra_pool_comm )
|
|
!
|
|
CALL MPI_ALLTOALLV( f_in, recvcount, rdispls, MPI_REAL8, &
|
|
f_aux, sendcount, sdispls, MPI_REAL8, &
|
|
intra_pool_comm, ierr )
|
|
!
|
|
CALL errore( 'fft_scatter', 'info<>0', ierr )
|
|
!
|
|
! ... step one: store contiguously the columns
|
|
!
|
|
f_in(:) = ( 0.D0, 0.D0 )
|
|
!
|
|
DO proc = 0,( nproc_pool - 1 )
|
|
!
|
|
from = 1 + sdispls(proc)
|
|
!
|
|
dest = offset1(proc)
|
|
!
|
|
DO k = 1, ncp_(me_pool+1)
|
|
!
|
|
CALL DCOPY( 2*npp_(proc+1), f_aux(from+2*(k-1)*npp_(proc+1)), &
|
|
1, f_in (dest+2*(k-1)*nrx3), 1 )
|
|
!
|
|
END DO
|
|
!
|
|
END DO
|
|
!
|
|
END IF
|
|
!
|
|
CALL stop_clock( 'fft_scatter' )
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE fft_scatter1
|
|
!
|
|
! ... other parallel subroutines
|
|
!
|
|
!------------------------------------------------------------------------
|
|
SUBROUTINE poolextreme( ps, iflag )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! ... Finds the maximum (iflag.gt.0) or the minimum (iflag.le.0) value o
|
|
! ... a real variable among the values distributed across the pools and
|
|
! ... returns this value to all pools.
|
|
!
|
|
USE mp_global, ONLY : inter_pool_comm, intra_image_comm, npool
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: iflag
|
|
REAL (KIND=DP) :: ps
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: info
|
|
REAL (KIND=DP) :: psr
|
|
!
|
|
!
|
|
IF ( npool <= 1 ) RETURN
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
!
|
|
IF ( iflag > 0 ) THEN
|
|
!
|
|
CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MAX, &
|
|
inter_pool_comm, info )
|
|
!
|
|
CALL errore( 'poolextreme', 'info<>0 in allreduce1', info )
|
|
!
|
|
ELSE
|
|
!
|
|
CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MIN, &
|
|
inter_pool_comm, info )
|
|
!
|
|
CALL errore( 'poolextreme', 'info<>0 in allreduce2', info )
|
|
!
|
|
END IF
|
|
!
|
|
ps = psr
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE poolextreme
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE poolrecover( vec, length, nkstot, nks )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! ... recovers on the first processor of the first pool a
|
|
! ... distributed vector
|
|
!
|
|
USE mp_global, ONLY : inter_pool_comm, intra_image_comm, &
|
|
npool, me_pool, root_pool, my_pool_id, kunit
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: length, nks, nkstot
|
|
REAL (KIND=DP) :: vec(length,nkstot)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: status(MPI_STATUS_SIZE)
|
|
INTEGER :: i, nks1, rest, fine, nbase, info
|
|
!
|
|
!
|
|
IF ( npool <= 1 ) RETURN
|
|
!
|
|
IF ( MOD( nkstot, kunit ) /= 0 ) &
|
|
CALL errore( 'poolrecover', 'nkstot/kunit is not an integer', nkstot )
|
|
!
|
|
nks1 = kunit * ( nkstot / kunit / npool )
|
|
!
|
|
rest = ( nkstot - nks1 * npool ) / kunit
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
!
|
|
IF ( me_pool == root_pool .AND. my_pool_id > 0 ) THEN
|
|
!
|
|
CALL MPI_SEND( vec, (length*nks), MPI_REAL8, 0, 17, &
|
|
inter_pool_comm, info )
|
|
!
|
|
CALL errore( 'poolrecover', 'info<>0 in send', info )
|
|
!
|
|
END IF
|
|
!
|
|
DO i = 2, npool
|
|
!
|
|
IF ( i <= rest ) THEN
|
|
!
|
|
fine = nks1 + kunit
|
|
!
|
|
nbase = ( nks1 + kunit ) * ( i - 1 )
|
|
!
|
|
ELSE
|
|
!
|
|
fine = nks1
|
|
!
|
|
nbase = rest * (nks1 + kunit) + (i - 1 - rest) * nks1
|
|
!
|
|
END IF
|
|
!
|
|
IF ( me_pool == root_pool .AND. my_pool_id == 0 ) THEN
|
|
!
|
|
CALL MPI_RECV( vec(1,nbase+1), (length*fine), MPI_REAL8, &
|
|
(i-1), 17, inter_pool_comm, status, info )
|
|
!
|
|
CALL errore( 'poolrecover', 'info<>0 in recv', info )
|
|
!
|
|
END IF
|
|
!
|
|
END DO
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE poolrecover
|
|
!
|
|
!------------------------------------------------------------------------
|
|
SUBROUTINE ipoolrecover( ivec, length, nkstot, nks )
|
|
!------------------------------------------------------------------------
|
|
!
|
|
! ... as above, for an integer vector
|
|
!
|
|
USE mp_global, ONLY : inter_pool_comm, intra_image_comm, &
|
|
npool, me_pool, root_pool, my_pool_id, kunit
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: length, nks, nkstot
|
|
INTEGER :: ivec(length,nkstot)
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
INTEGER :: status(MPI_STATUS_SIZE)
|
|
INTEGER :: i, nks1, rest, fine, nbase, info
|
|
!
|
|
!
|
|
IF ( npool <= 1 ) RETURN
|
|
!
|
|
IF ( MOD( nkstot, kunit ) /= 0 ) &
|
|
CALL errore( 'poolrecover', 'nkstot/kunit is not an integer', nkstot )
|
|
!
|
|
nks1 = kunit * ( nkstot / kunit / npool )
|
|
!
|
|
rest = ( nkstot - nks1 * npool ) / kunit
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
!
|
|
IF ( me_pool == root_pool .AND. my_pool_id > 0 ) THEN
|
|
!
|
|
CALL MPI_SEND( ivec, (length*nks), MPI_INTEGER, 0, 17, &
|
|
inter_pool_comm, info )
|
|
!
|
|
CALL errore( 'ipoolrecover', 'info<>0 in send', info )
|
|
!
|
|
END IF
|
|
!
|
|
DO i = 2, npool
|
|
!
|
|
IF ( i <= rest ) THEN
|
|
!
|
|
fine = nks1 + kunit
|
|
!
|
|
nbase = ( nks1 + kunit ) * ( i - 1 )
|
|
!
|
|
ELSE
|
|
!
|
|
fine = nks1
|
|
!
|
|
nbase = rest * ( nks1 + kunit ) + ( i - 1 - rest ) * nks1
|
|
!
|
|
END IF
|
|
!
|
|
IF ( me_pool == root_pool .AND. my_pool_id == 0 ) THEN
|
|
!
|
|
CALL MPI_RECV( ivec(1,nbase+1), (length*fine), MPI_INTEGER, &
|
|
(i-1), 17, inter_pool_comm, status, info )
|
|
!
|
|
CALL errore( 'ipoolrecover', 'info<>0 in recv', info )
|
|
!
|
|
END IF
|
|
!
|
|
END DO
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE ipoolrecover
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE extreme( ps, iflag )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! ... Finds the maximum (iflag.gt.0) or the minimum (iflag.le.0) value
|
|
! ... of a real variable among the values distributed on a given pool
|
|
!
|
|
USE mp_global, ONLY : intra_image_comm
|
|
USE mp, ONLY : mp_barrier
|
|
USE parallel_include
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
REAL (KIND=DP) :: ps
|
|
INTEGER :: iflag
|
|
!
|
|
#if defined (__PARA)
|
|
!
|
|
REAL (KIND=DP) :: psr
|
|
INTEGER :: info
|
|
!
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
!
|
|
IF ( iflag > 0 ) THEN
|
|
!
|
|
CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MAX, &
|
|
intra_image_comm, info )
|
|
!
|
|
ELSE
|
|
!
|
|
CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MIN, &
|
|
intra_image_comm, info )
|
|
!
|
|
END IF
|
|
!
|
|
ps = psr
|
|
!
|
|
#endif
|
|
!
|
|
RETURN
|
|
!
|
|
END SUBROUTINE extreme
|