! quantum-espresso/CPV/para.f90
!
! Copyright (C) 2002 CP90 group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#include "f_defs.h"
module para_mod
USE fft_types, ONLY: fft_dlay_descriptor, fft_dlay_allocate, &
fft_dlay_deallocate, fft_dlay_set
USE fft_base, ONLY: dfftp, dffts
integer maxproc, ncplanex
parameter (maxproc=64, ncplanex=37000)
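! maxproc: hard limit on the number of processors;
! ncplanex: upper bound (presumably nr1x*nr2x) on the number of
! columns in a single xy plane of the dense grid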
character(len=3) :: node
! node: node number, useful for opening files
integer nproc, me, mygroup
! nproc: number of processors
! me: number of this processor
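! mygroup: group handle returned by mp_env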
!
! parallel fft information for the dense grid
!
! npp: number of planes per processor
! n3: n3(me)+1 = first plane on proc. me
! ncp: number of (density) columns per proc
! ncp0: starting column for each processor
! ncplane: number of columns in a plane
! nct: total number of non-zero columns
! nnr_: local fft data size
! ipc: index saying which proc owns columns in a plane
! icpl: index relating columns and pos. in the plane
!
! n3 -> dfftp%ipp
! ncplane -> dfftp%nnp
! ncp -> dfftp%nsp
! ncp0 -> dfftp%iss
! npp -> dfftp%npp
! ipc -> dfftp%isind
! icpl -> dfftp%ismap
! nnr_ -> dfftp%nnr
!
! integer npp(maxproc), n3(maxproc), ncp(maxproc), ncp0(maxproc), &
! ncplane, nct, nnr_, ipc(ncplanex), icpl(ncplanex)
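!
! example of the slab decomposition implied by the mapping above
! (a sketch, assuming the standard fft_dlay_descriptor layout):
!    first z-plane on proc. me   : dfftp%ipp(me)+1
!    z-planes on proc. me        : dfftp%npp(me)
!    local real-space data size  : dfftp%nnp * dfftp%npp(me)  (<= dfftp%nnr)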
!
! parallel fft information for the smooth mesh
!
! npps: number of planes per processor
! ncps: number of (density) columns per proc
! ncpw: number of (wfs) columns per processor
! ncps0: starting column for each processor
! ncplanes: number of columns in a plane (smooth)
! ncts: total number of non-zero columns
! nnrs_: local fft data size
! ipcs: index saying which proc owns columns in a plane
! icpls: index relating columns and pos. in the plane
!
! ncpw -> dffts%nsw
! n3s -> dffts%ipp
! ncplanes -> dffts%nnp
! ncps -> dffts%nsp
! ncps0 -> dffts%iss
! npps -> dffts%npp
! ipcs -> dffts%isind
! icpls -> dffts%ismap
! nnrs_ -> dffts%nnr
!
contains
subroutine deallocate_para_mod
use stick_base, only: sticks_deallocate
call fft_dlay_deallocate( dfftp )
call fft_dlay_deallocate( dffts )
call sticks_deallocate()
end subroutine
end module para_mod
!
!-----------------------------------------------------------------------
subroutine startup
!-----------------------------------------------------------------------
!
! This subroutine initializes the Message Passing environment.
! The number of processors "nproc" returned by this routine is
! determined by the way the process was started.
! This is configuration- and machine-dependent. For interactive
! execution it is determined by the environment variable MP_PROCS
!
use para_mod
use mp, only: mp_start, mp_env
use io_global, only: stdout
use global_version
!
implicit none
integer ierr
!
call mp_start()
call mp_env( nproc, me, mygroup )
me = me + 1
!
!
! the parent process (source) has me=1; child processes have me=2,...,nproc
! (for historical reasons; MPI ranks run from 0 to nproc-1 instead)
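!
! roughly what mp_start / mp_env do, in raw MPI terms (a sketch,
! not the actual implementation of the mp module):
!    call mpi_init( ierr )
!    call mpi_comm_size( MPI_COMM_WORLD, nproc, ierr )
!    call mpi_comm_rank( MPI_COMM_WORLD, me, ierr )   ! me = 0,...,nproc-1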
!
if ( nproc > maxproc) &
& call errore('startup',' too many processors ',nproc)
!
if (me < 10) then
write(node,'(i1,2x)') me
else if (me < 100) then
write(node,'(i2,1x)') me
else if (me < 1000) then
write(node,'(i3)') me
else
call errore('startup','more than 999 nodes are not supported',me)
end if
!
! only the first processor writes
!
if ( me == 1 ) then
WRITE( stdout,'(72("*"))')
WRITE( stdout,'(4("*"),64x,4("*"))')
WRITE( stdout,'(4("*")," CPV: variable-cell Car-Parrinello ", &
& "molecular dynamics ",4("*"))')
WRITE( stdout,'(4("*")," using ultrasoft Vanderbilt ", &
& "pseudopotentials - v.",a6,8x,4("*"))') version_number
WRITE( stdout,'(4("*"),64x,4("*"))')
WRITE( stdout,'(72("*"))')
WRITE( stdout,'(/5x,''Parallel version (MPI)'')')
WRITE( stdout,'(5x,''Number of processors in use: '',i4)') nproc
else
open(6,file='/dev/null',status='unknown')
!
! useful for debugging purposes
! open(6,file='out.'//node,status='unknown')
end if
!
return
end
!
!----------------------------------------------------------------------
subroutine read_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! read from file rhor(nnr,nspin) on first node and distribute to other nodes
!
use para_mod
use parallel_include
use grid_dimensions, only: nr1x, nr2x, nr3x, nnr => nnrx
implicit none
integer unit, nspin
real(kind=8) rhor(nnr,nspin)
!
integer ir, is
integer root, proc, ierr, displs(nproc), sendcount(nproc)
real(kind=8), allocatable:: rhodist(:)
!
!
if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
root = 0
do proc=1,nproc
sendcount(proc) = dfftp%nnp * ( dfftp%npp(proc) )
if (proc.eq.1) then
displs(proc)=0
else
displs(proc)=displs(proc-1) + sendcount(proc-1)
end if
end do
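!
! each processor owns dfftp%npp(proc) consecutive z-planes of
! dfftp%nnp points each, so the slabs to be scattered are contiguous
! in the file-ordered array rhodist: proc 1 gets the first
! sendcount(1) elements, proc 2 the next sendcount(2), and so on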
do is=1,nspin
!
! read the charge density from unit "unit" on first node only
!
if (me.eq.1) read(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
!
! distribute the charge density to the other nodes
!
#if defined __PARA
call mpi_barrier ( MPI_COMM_WORLD, ierr)
call mpi_scatterv(rhodist, sendcount, displs, MPI_DOUBLE_PRECISION, &
& rhor(1,is),sendcount(me), MPI_DOUBLE_PRECISION, &
& root, MPI_COMM_WORLD, ierr)
if (ierr.ne.0) call errore('mpi_scatterv','ierr<>0',ierr)
#endif
!
! just in case: set to zero unread elements (if any)
!
do ir=sendcount(me)+1,nnr
rhor(ir,is)=0.d0
end do
end do
if (me.eq.1) deallocate(rhodist)
!
return
end subroutine read_rho
!
!----------------------------------------------------------------------
subroutine write_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! collect rhor(nnr,nspin) on first node and write to file
!
use para_mod
use parallel_include
use grid_dimensions, only: nr1x, nr2x, nr3x, nnr => nnrx
implicit none
integer unit, nspin
real(kind=8) rhor(nnr,nspin)
!
integer ir, is
integer root, proc, ierr, displs(nproc), recvcount(nproc)
real(kind=8), allocatable:: rhodist(:)
!
!
if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
!
root = 0
do proc=1,nproc
recvcount(proc) = dfftp%nnp * ( dfftp%npp(proc) )
if (proc.eq.1) then
displs(proc)=0
else
displs(proc)=displs(proc-1) + recvcount(proc-1)
end if
end do
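!
! recvcount/displs mirror the scatter in read_rho: each processor
! contributes a contiguous slab of dfftp%nnp*dfftp%npp(proc) points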
!
do is=1,nspin
!
! gather the charge density on the first node
!
#if defined __PARA
call mpi_barrier ( MPI_COMM_WORLD, ierr)
call mpi_gatherv (rhor(1,is), recvcount(me), MPI_DOUBLE_PRECISION, &
& rhodist,recvcount, displs, MPI_DOUBLE_PRECISION, &
& root, MPI_COMM_WORLD, ierr)
if (ierr.ne.0) call errore('mpi_gatherv','ierr<>0',ierr)
#endif
!
! write the charge density to unit "unit" from first node only
!
if (me.eq.1) write(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
! if (me.eq.1) write(unit,'(f12.7)') (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
end do
if (me.eq.1) deallocate(rhodist)
!
return
end subroutine write_rho
!
!
!----------------------------------------------------------------------
subroutine cfftpb(f,nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,irb3,sign)
!----------------------------------------------------------------------
!
! not-so-parallel 3d fft for box grid, implemented only for sign=1
! G-space to R-space, output = \sum_G f(G)exp(+iG*R)
! The array f (overwritten on output) is NOT distributed:
! a copy is present on each processor.
! The fft along z is done on the entire grid.
! The fft along xy is done only on planes that have components on the
! dense grid for each processor. Note that the final array will no
! longer be the same on all processors.
!
use para_mod
use grid_dimensions, only: nr3
use fft_scalar, only: cft_b
!
implicit none
integer nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,irb3,sign
complex(kind=8) f(nr1bx*nr2bx*nr3bx)
!
integer ir3, ibig3, imin3, imax3, np3
!
call parabox(nr3b,irb3,nr3,imin3,imax3)
np3=imax3-imin3+1
! np3 is the number of planes to be transformed
if (np3.le.0) return
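! if none of the box planes overlaps the dense-grid slab of this
! processor the transform is skipped altogether; otherwise cft_b does
! the z fft on the whole box and the xy fft on planes imin3..imax3 only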
call cft_b(f,nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,imin3,imax3,sign)
!
return
end
!
!----------------------------------------------------------------------
subroutine parabox(nr3b,irb3,nr3,imin3,imax3)
!----------------------------------------------------------------------
!
! find if box grid planes in the z direction have components on the dense
! grid of this processor and, if so, in which range imin3-imax3 they fall
!
use para_mod
implicit none
! input
integer nr3b,irb3,nr3
! output
integer imin3,imax3
! local
integer ir3, ibig3
!
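! start from an inverted (empty) range: if no box plane falls on this
! processor's slab, imin3 > imax3 is returned and the caller skips the
! transform. ibig3 = 1+mod(irb3+ir3-2,nr3) maps box plane ir3 to the
! dense-grid plane, wrapping around the periodic boundary; it is then
! referred to the first plane owned by this processor (dfftp%ipp(me))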
imin3=nr3b
imax3=1
do ir3=1,nr3b
ibig3=1+mod(irb3+ir3-2,nr3)
if(ibig3.lt.1.or.ibig3.gt.nr3) &
& call errore('cfftpb','ibig3 wrong',ibig3)
ibig3=ibig3-dfftp%ipp(me)
if (ibig3.gt.0.and.ibig3.le.dfftp%npp(me)) then
imin3=min(imin3,ir3)
imax3=max(imax3,ir3)
end if
end do
!
return
end
!
!-----------------------------------------------------------------------
subroutine reduce(size,ps)
!-----------------------------------------------------------------------
!
! sums a distributed variable ps(size) over the processors.
! This version uses a fixed-size scratch buffer and reduces the data
! in chunks of at most MAXB elements
!
use para_mod
use parallel_include
!
implicit none
integer size
real(kind=8) ps(size)
!
integer ierr, n, nbuf
integer, parameter:: MAXB=10000
real(kind=8) buff(MAXB)
!
if (nproc.le.1) return
if (size.le.0) return
call start_clock( 'reduce' )
!
! synchronize processes
!
#if defined __PARA
call mpi_barrier(MPI_COMM_WORLD,ierr)
if (ierr.ne.0) call errore('reduce','error in barrier',ierr)
!
nbuf=size/MAXB
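!
! nbuf complete chunks of MAXB elements; the remainder is done below.
! the allreduce goes through the scratch buffer buff and is copied
! back with DCOPY, since in-place reduction is not used here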
!
do n=1,nbuf
call mpi_allreduce (ps(1+(n-1)*MAXB), buff, MAXB, &
& MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
if (ierr.ne.0) &
& call errore('reduce','error in allreduce1',ierr)
call DCOPY(MAXB,buff,1,ps(1+(n-1)*MAXB),1)
end do
!
! remaining size-nbuf*MAXB elements, if any (fewer than MAXB)
!
if (size-nbuf*MAXB.gt.0) then
call mpi_allreduce (ps(1+nbuf*MAXB), buff, size-nbuf*MAXB, &
& MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
if (ierr.ne.0) &
& call errore('reduce','error in allreduce2',ierr)
call DCOPY(size-nbuf*MAXB,buff,1,ps(1+nbuf*MAXB),1)
endif
#endif
call stop_clock( 'reduce' )
!
return
end
!
!----------------------------------------------------------------------
subroutine nrbounds(ngw,nr1s,nr2s,nr3s,mill,nmin,nmax)
!----------------------------------------------------------------------
!
! find the bounds for (i,j,k) indexes of all wavefunction G-vectors
! The (i,j,k) indexes are defined as: G=i*g(1)+j*g(2)+k*g(3)
! where g(1), g(2), g(3) are basis vectors of the reciprocal lattice
!
use parallel_include
use mp, only: mp_min, mp_max
implicit none
! input
integer ngw,nr1s,nr2s,nr3s,mill(3,*)
! output
integer nmin(3), nmax(3)
! local
integer nmin0(3), nmax0(3), ig
!
!
nmin0(1)= nr1s
nmax0(1)= -nr1s
nmin0(2)= nr2s
nmax0(2)= -nr2s
nmin0(3)= nr3s
nmax0(3)= -nr3s
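!
! bounds start inverted (min at +nr, max at -nr) so that the first
! G-vector encountered initializes both of them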
!
do ig=1,ngw
nmin0(1) = min(nmin0(1),mill(1,ig))
nmin0(2) = min(nmin0(2),mill(2,ig))
nmin0(3) = min(nmin0(3),mill(3,ig))
nmax0(1) = max(nmax0(1),mill(1,ig))
nmax0(2) = max(nmax0(2),mill(2,ig))
nmax0(3) = max(nmax0(3),mill(3,ig))
end do
!
! find minima and maxima for the FFT box across all nodes
!
CALL mp_min( nmin0 )
CALL mp_max( nmax0 )
nmin = nmin0
nmax = nmax0
return
end subroutine nrbounds
!----------------------------------------------------------------------
subroutine write_pot(unit,rhos2)
! - to write the Hartree potential
! M.S.
!----------------------------------------------------------------------
!
! collect rhos2(nnrs) on first node and write to file
!
use para_mod
use smooth_grid_dimensions , nnrs => nnrsx
use parallel_include
implicit none
integer unit, nspin
real(kind=8) rhos2(nnrs)
!
integer ir, is
integer root, proc, ierr, displs(nproc), recvcount(nproc)
real(kind=8), allocatable:: rhodist(:)
!
!
if (me.eq.1) allocate(rhodist(nr1sx*nr2sx*nr3sx))
!
root = 0
do proc=1,nproc
recvcount(proc) = dffts%nnp * dffts%npp(proc)
if (proc.eq.1) then
displs(proc)=0
else
displs(proc)=displs(proc-1) + recvcount(proc-1)
end if
end do
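!
! same slab layout as in write_rho, but on the smooth grid:
! dffts%nnp points per plane, dffts%npp(proc) planes per processor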
!
! do is=1,nspin
!
! gather the potential on the first node
#if defined __PARA
call mpi_barrier ( MPI_COMM_WORLD, ierr)
call mpi_gatherv (rhos2, recvcount(me), MPI_DOUBLE_PRECISION, &
& rhodist,recvcount, displs, MPI_DOUBLE_PRECISION, &
& root, MPI_COMM_WORLD, ierr)
if (ierr.ne.0) call errore('mpi_gatherv','ierr<>0',ierr)
#endif
!
! write the potential to unit "unit" from first node only
!
if (me.eq.1) write(unit,'(f12.6)') (rhodist(ir),ir=1,nr1sx*nr2sx*nr3sx)
! end do
if (me.eq.1) deallocate(rhodist)
!
return
end subroutine write_pot