!
! Copyright (C) 2002 CP90 group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
2003-02-16 23:16:33 +08:00
|
|
|
#include "../include/machine.h"
|
|
|
|
|
2003-01-20 05:58:50 +08:00
|
|
|
module para_mod
   !
   ! Global data describing how the FFT grids are distributed
   ! over processors (CP90 parallel version).
   !
   ! maxproc : maximum number of processors supported
   ! ncplanex: maximum number of columns in an xy plane
   integer maxproc, ncplanex
   parameter (maxproc=64, ncplanex=37000)
   !
   character*3 node
   ! node: node number (as text), useful for opening per-process files
   integer nproc, me, mygroup
   ! nproc:   number of processors
   ! me:      number of this processor (1-based, NOT the 0-based MPI rank)
   ! mygroup: communicator handle for this group of processes
   !
   ! parallel fft information for the dense grid
   !
   ! npp:     number of planes per processor
   ! n3:      n3(me)+1 = first plane on proc. me
   ! ncp:     number of (density) columns per proc
   ! ncp0:    starting column for each processor
   ! ncplane: number of columns in a plane
   ! nct:     total number of non-zero columns
   ! nnr_:    local fft data size
   ! ipc:     index saying which proc owns columns in a plane
   ! icpl:    index relating columns and pos. in the plane
   !
   integer npp(maxproc), n3(maxproc), ncp(maxproc), ncp0(maxproc), &
        ncplane, nct, nnr_, ipc(ncplanex), icpl(ncplanex)
   !
   ! parallel fft information for the smooth mesh
   !
   ! npps:    number of planes per processor
   ! ncps:    number of (density) columns per proc
   ! ncpw:    number of (wfs) columns per processor
   ! ncp0s:   starting column for each processor
   ! ncplanes:number of columns in a plane (smooth)
   ! ncts:    total number of non-zero columns
   ! nnrs_:   local fft data size
   ! ipcs:    saying which proc owns columns in a plane
   ! icpls:   index relating columns and pos. in the plane
   !
   integer npps(maxproc), ncps(maxproc), ncpw(maxproc), ncp0s(maxproc), &
        ncplanes, ncts, nnrs_, ipcs(ncplanex), icpls(ncplanex)
end module para_mod
|
|
|
|
!
|
|
|
|
!-----------------------------------------------------------------------
|
|
|
|
subroutine startup
!-----------------------------------------------------------------------
!
! This subroutine initializes MPI processes. The number of processors
! "nproc" returned by this routine is determined by the way the process
! was started. This is configuration- and machine-dependent. For inter-
! active execution it is determined by environment variable MP_PROCS
!
! Side effects: sets nproc, me (1-based), mygroup and node in para_mod;
! redirects unit 6 to /dev/null on every process except the first.
!
   use para_mod
   use parallel_include
   !
   implicit none
   integer ierr
   !
#if defined __MPI
   call mpi_init(ierr)
   if (ierr.ne.0) call error('startup','mpi_init',ierr)
   call mpi_comm_size(MPI_COMM_WORLD,nproc,ierr)
   if (ierr.ne.0) call error('startup','mpi_comm_size',ierr)
   call mpi_comm_rank(MPI_COMM_WORLD, me,ierr)
   if (ierr.ne.0) call error('startup','mpi_comm_rank',ierr)
   mygroup = MPI_COMM_WORLD
   ! convert the 0-based MPI rank to the 1-based convention used here
   me=me+1
#else
   ! serial build: a single process
   nproc = 1
   me = 1
   mygroup = 0
#endif
   !
   ! parent process (source) will have me=1 - child process me=2,...,NPROC
   ! (for historical reasons: MPI uses 0,...,NPROC-1 instead )
   !
   if (nproc.gt.maxproc) &
   &   call error('startup',' too many processors ',nproc)
   !
   ! encode the (1-based) process number into the 3-character string
   ! "node", left-justified and blank-padded
   if (me.lt.10) then
      write(node,'(i1,2x)') me
   else if (me.lt.100) then
      write(node,'(i2,1x)') me
   else if (me.lt.1000) then
      write(node,'(i3)') me
   else
      call error('startup','wow, >1000 nodes !!',nproc)
   end if
   !
   ! only the first processor writes
   !
   if (me.eq.1) then
      write(6,'(/5x,''Parallel version (MPI)'')')
      write(6,'(5x,''Number of processors in use: '',i4)') nproc
   else
      open(6,file='/dev/null',status='unknown')
      !
      ! useful for debugging purposes
      ! open(6,file='out.'//node,status='unknown')
   end if
   !
   return
end
|
|
|
|
|
|
|
|
#if defined __PARA
|
|
|
|
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
|
|
|
|
subroutine read_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! read from file rhor(nnr,nspin) on first node and distribute to other nodes
!
! unit  : fortran unit to read from (significant on the first node only)
! nspin : number of spin components
! rhor  : on output, the slab of the charge density local to this process
!
   use para_mod
   use parm
   implicit none
   include 'mpif.h'
   integer unit, nspin
   real(kind=8) rhor(nnr,nspin)
   !
   integer ir, is
   integer root, proc, ierr, n, displs(nproc), sendcount(nproc)
   real(kind=8), allocatable:: rhodist(:)
   !
   ! the full (undistributed) grid is assembled only on the first node
   if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
   root = 0
   ! sendcount(proc): number of grid points (whole xy planes) owned by proc
   ! displs(proc)   : offset of proc's planes in the global array
   do proc=1,nproc
      sendcount(proc) = ncplane*npp(proc)
      if (proc.eq.1) then
         displs(proc)=0
      else
         displs(proc)=displs(proc-1) + sendcount(proc-1)
      end if
   end do
   do is=1,nspin
      !
      ! read the charge density from unit "unit" on first node only
      !
      if (me.eq.1) read(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
      !
      ! distribute the charge density to the other nodes
      !
      call mpi_barrier ( MPI_COMM_WORLD, ierr)
      call mpi_scatterv(rhodist, sendcount, displs, MPI_REAL8, &
      &                 rhor(1,is),sendcount(me), MPI_REAL8, &
      &                 root, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('mpi_scatterv','ierr<>0',ierr)
      !
      ! just in case: set to zero unread elements (if any)
      !
      do ir=sendcount(me)+1,nnr
         rhor(ir,is)=0.d0
      end do
   end do
   if (me.eq.1) deallocate(rhodist)
   !
   return
end subroutine read_rho
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
|
|
|
|
subroutine write_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! collect rhor(nnr,nspin) on first node and write to file
!
! unit  : fortran unit to write to (significant on the first node only)
! nspin : number of spin components
! rhor  : the slab of the charge density local to this process
!
   use para_mod
   use parm
   implicit none
   include 'mpif.h'
   integer unit, nspin
   real(kind=8) rhor(nnr,nspin)
   !
   integer ir, is
   integer root, proc, ierr, displs(nproc), recvcount(nproc)
   real(kind=8), allocatable:: rhodist(:)
   !
   ! the full (undistributed) grid is assembled only on the first node
   if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
   !
   root = 0
   ! recvcount(proc): number of grid points (whole xy planes) owned by proc
   ! displs(proc)   : offset of proc's planes in the global array
   do proc=1,nproc
      recvcount(proc) = ncplane*npp(proc)
      if (proc.eq.1) then
         displs(proc)=0
      else
         displs(proc)=displs(proc-1) + recvcount(proc-1)
      end if
   end do
   !
   do is=1,nspin
      !
      ! gather the charge density on the first node
      !
      call mpi_barrier ( MPI_COMM_WORLD, ierr)
      call mpi_gatherv (rhor(1,is), recvcount(me), MPI_REAL8, &
      &                 rhodist,recvcount, displs, MPI_REAL8, &
      &                 root, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('mpi_gatherv','ierr<>0',ierr)
      !
      ! write the charge density to unit "unit" from first node only
      !
      if (me.eq.1) write(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
   end do
   if (me.eq.1) deallocate(rhodist)
   !
   return
end subroutine write_rho
|
|
|
|
!
|
|
|
|
!
|
|
|
|
!-----------------------------------------------------------------------
|
|
|
|
subroutine set_fft_para( b1, b2, b3, gcut, gcuts, gcutw, &
&                  nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr, &
&                  nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs )
!-----------------------------------------------------------------------
!
! distribute columns to processes for parallel fft
! based on code written by Stefano de Gironcoli for PWSCF
! columns are sets of g-vectors along z: g(k) = i1*b1+i2*b2+i3*b3 ,
! with g^2<gcut and (i1,i2) running over the (xy) plane.
! Columns are "active" for a given (i1,i2) if they contain a nonzero
! number of wavevectors
!
! b1,b2,b3            : reciprocal lattice basis vectors
! gcut, gcuts, gcutw  : cutoffs for dense grid, smooth grid, wavefunctions
! nr1..nr3, nr1s..nr3s: fft grid dimensions (dense and smooth)
! nnr, nnrs           : output, local fft data sizes
! nr1x.. , nr1sx..    : output, padded fft grid dimensions
!
! Fills the distribution arrays in para_mod (npp, n3, ncp, ncp0,
! ipc, icpl and their smooth-grid counterparts).
!
   use para_mod
   !
   implicit none
   real(kind=8) b1(3), b2(3), b3(3), gcut, gcuts, gcutw
   integer nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr, &
   &       nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs
   !
   integer ngc(ncplanex),&! number of g-vectors per column (dense grid)
   &       ngcs(ncplanex),&! number of g-vectors per column (smooth grid)
   &       ngcw(ncplanex),&! number of wavefct plane waves per column
   &       in1(ncplanex),&! index i for column (i1,i2)
   &       in2(ncplanex),&! index j for column (i1,i2)
   &       ic, &! fft index for this column (dense grid)
   &       ics, &! as above for the smooth grid
   &       icm, &! fft index for column (-i1,-i2) (dense grid)
   &       icms, &! as above for the smooth grid
   &       index(ncplanex),&! used to order column
   &       ncp_(maxproc),&! number of column per processor (work space)
   &       ngp(maxproc), &! number of g-vectors per proc (dense grid)
   &       ngps(maxproc),&! number of g-vectors per proc (smooth grid)
   &       ngpw(maxproc) ! number of wavefct plane waves per proc
   !
   integer np, nps1, &! counters on planes
   &       nq, nqs, &! counters on planes
   &       max1,min1,max2,min2, &! aux. variables
   &       m1, m2, n1, n2, i, mc,&! generic counter
   &       idum, nct_, &! check variables
   &       j,jj, &! counters on processors
   &       n1m1,n2m1,n3m1, &! nr1-1 and so on
   &       i1, i2, i3, &! counters on G space
   &       good_fft_dimension ! a function with obvious meaning
   real(kind=8) &
   &       aux(ncplanex), &! used to order columns
   &       amod ! modulus of G vectors
   !
   call tictac(27,0)
   !
   ! set the dimensions of fft arrays
   !
   nr1x =good_fft_dimension(nr1 )
   nr2x = nr2
   nr3x =good_fft_dimension(nr3 )
   !
   nr1sx=good_fft_dimension(nr1s)
   nr2sx=nr2s
   nr3sx=good_fft_dimension(nr3s)
   !
   ! compute number of columns for each processor
   !
   ncplane = nr1x*nr2x
   ncplanes= nr1sx*nr2sx
   if (ncplane.gt.ncplanex .or. ncplanes.gt.ncplanex) &
   &   call error('set_fft_para','ncplanex too small',ncplane)
   !
   ! set the number of plane per process
   !
   if (nr3.lt.nproc) call error('set_fft_para', &
   &   'some processors have no planes ',-1)
   if (nr3s.lt.nproc) call error('set_fft_para', &
   &   'some processors have no smooth planes ',-1)
   !
   ! distribute the nr3 (nr3s) planes as evenly as possible:
   ! the first nq (nqs) processors get one extra plane
   if (nproc.eq.1) then
      npp(1) = nr3
      npps(1)= nr3s
   else
      np = nr3/nproc
      nq = nr3 - np*nproc
      nps1 = nr3s/nproc
      nqs = nr3s - nps1*nproc
      do i = 1, nproc
         npp(i) = np
         if (i.le.nq) npp(i) = np + 1
         npps(i) = nps1
         if (i.le.nqs) npps(i) = nps1 + 1
      end do
   end if
   ! n3(i)+1 is the first z-plane owned by processor i
   n3(1)= 0
   do i = 2, nproc
      n3(i)=n3(i-1)+npp(i-1)
   end do
   !
   ! Now compute for each point of the big plane how many column have
   ! non zero vectors on the smooth and dense grid
   !
   do mc = 1,ncplane
      ipc(mc) = 0
   end do
   do mc = 1,ncplanes
      ipcs(mc)= 0
   end do
   !
   nct = 0
   ncts= 0
   !
   ! NOTA BENE: the exact limits for a correctly sized FFT grid are:
   ! -nr/2,..,+nr/2 for nr even; -(nr-1)/2,..,+(nr-1)/2 for nr odd.
   ! If the following limits are increased, a slightly undersized fft
   ! grid, with some degree of G-vector refolding, can be used
   ! (at your own risk - a check is done in ggen).
   !
   n1m1=nr1/2
   n2m1=nr2/2
   n3m1=nr3/2
   !
   do i1=-n1m1,n1m1
      do i2=-n2m1,n2m1
         !
         ! nct counts columns containing G-vectors for the dense grid
         !
         nct=nct+1
         if (nct.gt.ncplane) &
         &   call error('set_fft_para','too many columns',1)
         ngc (nct) = 0
         ngcs(nct) = 0
         ngcw(nct) = 0
         !
         ! scan the z direction, counting G-vectors inside each cutoff
         do i3 = -n3m1,n3m1
            amod = (b1(1)*i1 + b2(1)*i2 + b3(1)*i3)**2 + &
            &      (b1(2)*i1 + b2(2)*i2 + b3(2)*i3)**2 + &
            &      (b1(3)*i1 + b2(3)*i2 + b3(3)*i3)**2
            if (amod.le.gcut ) &
            &   ngc (nct)= ngc (nct)+ 1
            if (amod.le.gcuts) &
            &   ngcs(nct)= ngcs(nct)+ 1
            if (amod.le.gcutw) &
            &   ngcw(nct)= ngcw(nct)+ 1
         enddo
         if (ngc(nct).gt.0) then
            !
            ! this column contains G-vectors
            !
            in1(nct) = i1
            in2(nct) = i2
            if (ngcs(nct).gt.0) then
               !
               ! ncts counts columns contaning G-vectors for the smooth grid
               !
               ncts=ncts+1
               if (ncts.gt.ncplanes) &
               &   call error('set_fft_para','too many columns',2)
            end if
         else
            !
            ! this column has no G-vectors: reset the counter
            !
            nct=nct-1
         end if
      enddo
   end do
   !
   if(nct .eq.0) call error('set_fft_para','number of column 0', 1)
   if(ncts.eq.0) call error('set_fft_para', &
   &   'number smooth column 0', 1)
   !
   ! Sort the columns. First the column with the largest number of G
   ! vectors on the wavefunction sphere (active columns),
   ! then on the smooth sphere, then on the big sphere. Dirty trick:
   ! the three counts are packed into a single (negated) sort key.
   !
   do mc = 1,nct
      aux(mc)=-(ngcw(mc)*nr3x**2 + ngcs(mc)*nr3x + ngc(mc))
   end do
   call kb07ad_cp90(aux,nct,index)
   !
   ! assign columns to processes
   !
   do j=1,nproc
      ncp (j) = 0
      ncps(j) = 0
      ncpw(j) = 0
      ngp (j) = 0
      ngps(j) = 0
      ngpw(j) = 0
   end do
   !
   do mc=1, nct
      i = index(mc)
      !
      ! index contains the desired ordering of columns (see above)
      !
      i1=in1(i)
      i2=in2(i)
      !
      if ( i1.lt.0.or.(i1.eq.0.and.i2.lt.0) ) go to 30
      !
      ! only half of the columns, plus column (0,0), are scanned:
      ! column (-i1,-i2) must be assigned to the same proc as column (i1,i2)
      !
      ! ic  : position, in fft notation, in dense grid, of column ( i1, i2)
      ! icm : "    "       "     "       "    "         "        (-i1,-i2)
      ! ics : "    "       "   smooth    "    "         "        ( i1, i2)
      ! icms: "    "       "   smooth    "    "         "        (-i1,-i2)
      !
      m1 = i1 + 1
      if (m1.lt.1) m1 = m1 + nr1
      m2 = i2 + 1
      if (m2.lt.1) m2 = m2 + nr2
      ic = m1 + (m2-1)*nr1x
      !
      n1 = -i1 + 1
      if (n1.lt.1) n1 = n1 + nr1
      n2 = -i2 + 1
      if (n2.lt.1) n2 = n2 + nr2
      icm = n1 + (n2-1)*nr1x
      !
      m1 = i1 + 1
      if (m1.lt.1) m1 = m1 + nr1s
      m2 = i2 + 1
      if (m2.lt.1) m2 = m2 + nr2s
      ics = m1 + (m2-1)*nr1sx
      !
      n1 =-i1 + 1
      if (n1.lt.1) n1 = n1 + nr1s
      n2 =-i2 + 1
      if (n2.lt.1) n2 = n2 + nr2s
      icms = n1 + (n2-1)*nr1sx
      !
      jj=1
      if (ngcw(i).gt.0) then
         !
         ! this is an active column: find which processor has currently
         ! the smallest number of plane waves
         !
         do j=1,nproc
            if (ngpw(j).lt.ngpw(jj)) jj = j
         end do
      else
         !
         ! this is an inactive column: find which processor has currently
         ! the smallest number of G-vectors
         !
         do j=1,nproc
            if (ngp(j).lt.ngp(jj)) jj = j
         end do
      end if
      !
      ! jj is the processor to which this column is assigned
      ! use -jj for inactive columns, jj for active columns
      !
      ipc(ic) = -jj
      ncp(jj) = ncp(jj) + 1
      ngp(jj) = ngp(jj) + ngc(i)
      if (ngcs(i).gt.0) then
         ncps(jj)=ncps(jj)+1
         ngps(jj)=ngps(jj)+ngcs(i)
         ipcs(ics)=-jj
      endif
      if (ngcw(i).gt.0) then
         ipcs(ics)=jj
         ipc(ic) = jj
         ngpw(jj)= ngpw(jj) + ngcw(i)
         ncpw(jj)= ncpw(jj) + 1
      endif
      !
      ! now assign the (-i1,-i2) column to the same processor
      !
      if (i1.eq.0.and.i2.eq.0) go to 30
      !
      ! do not count twice column (0,0) !
      !
      ipc(icm) = -jj
      ncp(jj) = ncp(jj) + 1
      ngp(jj) = ngp(jj) + ngc(i)
      if (ngcs(i).gt.0) then
         ncps(jj)=ncps(jj)+1
         ngps(jj)=ngps(jj)+ngcs(i)
         ipcs(icms)=-jj
      endif
      if (ngcw(i).gt.0) then
         ipcs(icms)=jj
         ipc(icm) = jj
         ngpw(jj)= ngpw(jj) + ngcw(i)
         ncpw(jj)= ncpw(jj) + 1
      endif
30    continue
   enddo
   !
   ! ipc  is the processor for this column in the dense grid
   ! ipcs is the same, for the smooth grid
   !
   write(6,'( &
   &   '' Proc  planes cols    G    planes cols    G      columns  G''/ &
   &   ''       (dense grid)       (smooth grid)      (wavefct grid)'')')
   do i=1,nproc
      write(6,'(i3,2x,3(i5,2i7))') i, npp(i),ncp(i),ngp(i), &
      &   npps(i),ncps(i),ngps(i), ncpw(i), ngpw(i)
   end do
   !
   ! nnr_ and nnrs_ are copies of nnr and nnrs, the local fft data size,
   ! to be stored in "parallel" commons. Not a very elegant solution.
   !
   if (nproc.eq.1) then
      nnr =nr1x*nr2x*nr3x
      nnrs=nr1sx*nr2sx*nr3sx
   else
      ! enough room for both the "columns" and the "planes" layout
      nnr = max(nr3x*ncp(me), nr1x*nr2x*npp(me))
      nnrs= max(nr3sx*ncps(me), nr1sx*nr2sx*npps(me))
   end if
   nnr_= nnr
   nnrs_= nnrs
   !
   ! computing the starting column for each processor
   !
   do i=1,nproc
      if(ngpw(i).eq.0) &
      &   call error('set_fft_para', &
      &   'some processors have no pencils, not yet implemented',1)
      if (i.eq.1) then
         ncp0(i) = 0
         ncp0s(i)= 0
      else
         ncp0(i) = ncp0 (i-1) + ncp (i-1)
         ncp0s(i)= ncp0s(i-1) + ncps(i-1)
      endif
   enddo
   !
   ! Now compute the arrays ipc and icpl (dense grid):
   ! ipc contain the number of the column for that processor.
   ! zero if the column do not belong to the processor.
   ! Note that input ipc is used and overwritten.
   ! icpl contains the point in the plane for each column
   !
   !- active columns first........
   !
   do j=1,nproc
      ncp_(j) = 0
   end do
   do mc =1,ncplane
      if (ipc(mc).gt.0) then
         j = ipc(mc)
         ncp_(j) = ncp_(j) + 1
         icpl(ncp_(j) + ncp0(j)) = mc
         if (j.eq.me) then
            ipc(mc) = ncp_(j)
         else
            ipc(mc) = 0
         end if
      end if
   end do
   !
   !-..... ( intermediate check ) ....
   !
   do j=1,nproc
      if (ncp_(j).ne.ncpw(j)) &
      &   call error('set_fft_para','ncp_(j).ne.ncpw(j)',j)
   end do
   !
   !- ........then the remaining columns
   !
   do mc =1,ncplane
      if (ipc(mc).lt.0) then
         j = -ipc(mc)
         ncp_(j) = ncp_(j) + 1
         icpl(ncp_(j) + ncp0(j)) = mc
         if (j.eq.me) then
            ipc(mc) = ncp_(j)
         else
            ipc(mc) = 0
         end if
      end if
   end do
   !
   !-... ( final check )
   !
   nct_ = 0
   do j=1,nproc
      if (ncp_(j).ne.ncp(j)) &
      &   call error('set_fft_para','ncp_(j).ne.ncp(j)',j)
      nct_ = nct_ + ncp_(j)
   end do
   if (nct_.ne.nct) &
   &   call error('set_fft_para','nct_.ne.nct',1)
   !
   ! now compute the arrays ipcs and icpls
   ! (as ipc and icpl, for the smooth grid)
   !
   ! active columns first...
   !
   do j=1,nproc
      ncp_(j) = 0
   end do
   do mc =1,ncplanes
      if (ipcs(mc).gt.0) then
         j = ipcs(mc)
         ncp_(j)=ncp_(j) + 1
         icpls(ncp_(j) + ncp0s(j)) = mc
         if (j.eq.me) then
            ipcs(mc) = ncp_(j)
         else
            ipcs(mc) = 0
         endif
      endif
   enddo
   !
   !-..... ( intermediate check ) ....
   !
   do j=1,nproc
      if (ncp_(j).ne.ncpw(j)) &
      &   call error('set_fft_para','ncp_(j).ne.ncpw(j)',j)
   end do
   !
   ! and then all the others
   !
   do mc =1,ncplanes
      if (ipcs(mc).lt.0) then
         j = -ipcs(mc)
         ncp_(j) = ncp_(j) + 1
         icpls(ncp_(j) + ncp0s(j)) = mc
         if (j.eq.me) then
            ipcs(mc) = ncp_(j)
         else
            ipcs(mc) = 0
         end if
      end if
   end do
   !
   !-... ( final check )
   !
   nct_ = 0
   do j=1,nproc
      if (ncp_(j).ne.ncps(j)) &
      &   call error('set_fft_para','ncp_(j).ne.ncps(j)',j)
      nct_ = nct_ + ncp_(j)
   end do
   if (nct_.ne.ncts) &
   &   call error('set_fft_para','nct_.ne.ncts',1)
   call tictac(27,1)
   !
   return
end
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
|
|
|
|
subroutine cfftp(f,nr1,nr2,nr3,nr1x,nr2x,nr3x,sign)
!----------------------------------------------------------------------
!
! sign = +-1 : parallel 3d fft for rho and for the potential
!
! sign = +1 : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!             fft along z using pencils     (cft_1)
!             transpose across nodes        (fft_scatter)
!                and reorder
!             fft along y and x             (cft_2)
! sign = -1 : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
!             fft along x and y             (cft_2)
!             transpose across nodes        (fft_scatter)
!                and reorder
!             fft along z using pencils     (cft_1)
!
! f : distributed fft data, overwritten on output
!
! based on code written by Stefano de Gironcoli for PWSCF
!
   use para_mod
   use work_fft
   !
   implicit none
   integer nr1,nr2,nr3,nr1x,nr2x,nr3x, sign, nppx
   complex(kind=8) f(nnr_)
   integer mc, i, j
   !
   ! the following is needed if the fft is distributed over only one processor
   ! for the special case nx3.ne.n3. Not an elegant solution, but simple, fast,
   ! and better than the preceding one that did not work in some cases. Note
   ! that fft_scatter does nothing if nproc=1. PG
   !
   if (nproc.eq.1) then
      nppx=nr3x
   else
      nppx=npp(me)
   end if
   if (sign.eq.1) then
      ! G -> R: z-fft on pencils, transpose, unpack columns into planes
      call cft_1(f,ncp(me),nr3,nr3x,sign,aux)
      call fft_scatter(nproc,me,aux,nr3x,nnr_,f,ncp,npp,sign)
      call zero(2*nnr_,f)
      do i=1,nct
         mc = icpl(i)
         do j=1,npp(me)
            f(mc+(j-1)*ncplane) = aux(j + (i-1)*nppx)
         end do
      end do
      !
      call cft_2(f,npp(me),nr1,nr2,nr1x,nr2x,sign)
      !
   else if (sign.eq.-1) then
      !
      ! R -> G: xy-fft on planes, pack planes into columns, transpose, z-fft
      call cft_2(f,npp(me),nr1,nr2,nr1x,nr2x,sign)
      !
      do i=1,nct
         mc = icpl(i)
         do j=1,npp(me)
            aux(j + (i-1)*nppx) = f(mc+(j-1)*ncplane)
         end do
      end do
      call fft_scatter(nproc,me,aux,nr3x,nnr_,f,ncp,npp,sign)
      call cft_1(aux,ncp(me),nr3,nr3x,sign,f)
   else
      ! fixed: error label used to read 'cftp' (wrong routine name)
      call error('cfftp','not allowed',abs(sign))
   end if
   !
   return
end
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
|
|
|
|
subroutine cfftps (f,nr1,nr2,nr3,nr1x,nr2x,nr3x,sign)
!----------------------------------------------------------------------
!
! sign = +-1 : parallel 3d fft for rho and for the potential
! sign = +-2 : parallel 3d fft for wavefunctions
!
! sign = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!            fft along z using pencils       (cft_1)
!            transpose across nodes          (fft_scatter)
!               and reorder
!            fft along y (using planes) and x (cft_2)
! sign = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
!            fft along x and y(using planes) (cft_2)
!            transpose across nodes          (fft_scatter)
!               and reorder
!            fft along z using pencils       (cft_1)
!
! The array "planes" signals whether a fft is needed along y :
!   planes(i)=0 : column f(i,*,*) empty , don't do fft along y
!   planes(i)=1 : column f(i,*,*) filled, fft along y needed
! "empty" = no active components are present in f(i,*,*)
!           after (sign>0) or before (sign<0) the fft on z direction
!
! Note that if sign=+/-1 (fft on rho and pot.) all fft's are needed
! and all planes(i) are set to 1
!
! based on code written by Stefano de Gironcoli for PWSCF
!
   use para_mod
   use work_fft
   !
   implicit none
   integer nr1,nr2,nr3,nr1x,nr2x,nr3x,sign
   complex(kind=8) f(nnrs_)
   integer mc, i, j, ii, proc, k, nppx
   integer planes(nr1x)
   !
   ! see comments in cfftp for the logic (or lack of it) of the following
   !
   if (nproc.eq.1) then
      nppx=nr3x
   else
      nppx=npps(me)
   end if
   if (sign.gt.0) then
      if (sign.ne.2) then
         ! G -> R, full smooth grid (sign=+1): all columns are used
         call cft_1s(f,ncps(me),nr3,nr3x,sign,aux)
         call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncps,npps,sign)
         call zero(2*nnrs_,f)
         do i=1,ncts
            mc = icpls(i)
            do j=1,npps(me)
               f(mc+(j-1)*ncplanes) = aux(j + (i-1)*nppx)
            end do
         end do
         ! every x column may be filled: fft along y everywhere
         do i=1,nr1x
            planes(i) = 1
         enddo
      else
         ! G -> R, wavefunctions (sign=+2): only active (wfs) columns
         call cft_1s(f,ncpw(me),nr3,nr3x,sign,aux)
         call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncpw,npps,sign)
         call zero(2*nnrs_,f)
         ii = 0
         do i=1,nr1x
            planes(i) = 0
         enddo
         ! unpack only the active columns (in processor order),
         ! flagging the x values that receive data
         do proc=1,nproc
            do i=1,ncpw(proc)
               mc = icpls(i+ncp0s(proc))
               ii = ii + 1
               k=mod(mc-1,nr1x)+1
               planes(k) = 1
               do j=1,npps(me)
                  f(mc+(j-1)*ncplanes) = aux(j + (ii-1)*nppx)
               end do
            end do
         end do
      end if
      !
      call cft_2s(f,npps(me),nr1,nr2,nr1x,nr2x,sign,planes)
      !
   else
      !
      if (sign.ne.-2) then
         ! R -> G, full smooth grid: fft along y everywhere
         do i=1,nr1x
            planes(i) = 1
         end do
      else
         ! R -> G, wavefunctions: only x values with active columns
         do i=1,nr1x
            planes(i) = 0
         end do
         do proc=1,nproc
            do i=1,ncpw(proc)
               mc = icpls(i+ncp0s(proc))
               k=mod(mc-1,nr1x)+1
               planes(k) = 1
            enddo
         enddo
      endif
      !
      call cft_2s(f,npps(me),nr1,nr2,nr1x,nr2x,sign,planes)
      !
      if (sign.ne.-2) then
         ! pack all smooth columns, transpose, z-fft
         do i=1,ncts
            mc = icpls(i)
            do j=1,npps(me)
               aux(j + (i-1)*nppx) = f(mc+(j-1)*ncplanes)
            end do
         end do
         call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncps,npps,sign)
         call cft_1s(aux,ncps(me),nr3,nr3x,sign,f)
      else
         ! pack only the active (wfs) columns, transpose, z-fft
         ii = 0
         do proc=1,nproc
            do i=1,ncpw(proc)
               mc = icpls(i+ncp0s(proc))
               ii = ii + 1
               do j=1,npps(me)
                  aux(j + (ii-1)*nppx) = f(mc+(j-1)*ncplanes)
               end do
            end do
         end do
         call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncpw,npps,sign)
         call cft_1s(aux,ncpw(me),nr3,nr3x,sign,f)
      end if
   end if
   !
   return
end
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
|
|
|
|
subroutine cfftpb(f,nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,irb3,sign)
!----------------------------------------------------------------------
!
! not-so-parallel 3d fft for box grid, implemented only for sign=1
! G-space to R-space, output = \sum_G f(G)exp(+iG*R)
! The array f (overwritten on output) is NOT distributed:
! a copy is present on each processor.
! The fft along z is done on the entire grid.
! The fft along xy is done only on planes that have components on the
! dense grid for each processor. Note that the final array will no
! longer be the same on all processors.
!
! irb3 : z origin of the box grid inside the dense grid
!
   use para_mod
   use parm, only: nr3
   !
   implicit none
   integer nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,irb3,sign
   complex(kind=8) f(nr1bx,nr2bx,nr3bx)
   !
   ! (unused locals ir3, ibig3 removed - the plane search lives in parabox)
   integer imin3, imax3, np3
   !
   ! find the range of box z-planes that fall on this processor's slab
   call parabox(nr3b,irb3,nr3,imin3,imax3)
   np3=imax3-imin3+1
   ! np3 is the number of planes to be transformed
   if (np3.le.0) return
   call cft_b(f,nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,imin3,imax3,sign)
   !
   return
end
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
|
|
|
|
subroutine parabox(nr3b,irb3,nr3,imin3,imax3)
!----------------------------------------------------------------------
!
! find if box grid planes in the z direction have component on the dense
! grid on this processor, and if so, in which range imin3-imax3
!
! nr3b : number of box-grid planes along z
! irb3 : z origin of the box grid inside the dense grid
! nr3  : number of dense-grid planes along z
! imin3, imax3 : output, first/last box plane on this processor
!                (imin3 > imax3 means: no planes here)
!
   use para_mod, only: me, n3, npp
   ! implicit none added: all locals were already declared explicitly
   implicit none
   ! input
   integer nr3b,irb3,nr3
   ! output
   integer imin3,imax3
   ! local
   integer ir3, ibig3
   !
   imin3=nr3b
   imax3=1
   do ir3=1,nr3b
      ! ibig3: dense-grid plane corresponding to box plane ir3 (periodic)
      ibig3=1+mod(irb3+ir3-2,nr3)
      if(ibig3.lt.1.or.ibig3.gt.nr3) &
      &   call error('cfftpb','ibig3 wrong',ibig3)
      ! shift to this processor's local slab: 1..npp(me) means "mine"
      ibig3=ibig3-n3(me)
      if (ibig3.gt.0.and.ibig3.le.npp(me)) then
         imin3=min(imin3,ir3)
         imax3=max(imax3,ir3)
      end if
   end do
   !
   return
end
|
|
|
|
!
|
|
|
|
!------------------------------------------------------------------------
|
|
|
|
subroutine fft_scatter &
&   (nproc, me, f_in, nr3x, nnr_, f_aux, ncp_, npp_, sign)
!------------------------------------------------------------------------
!
! transpose the fft grid across nodes
! a) From columns to planes (sign > 0)
!
!    "columns" (or "pencil") representation:
!    processor "me" has ncp_(me) contiguous columns along z
!    Each column is nr3x=nr3+1 elements for a fft of order nr3
!    (the additional element is added to reduce memory conflicts)
!
!    The transpose take places in two steps:
!    1) on each processor the columns are divided into slices along z
!       that are stored contiguously. On processor "me", slices for
!       processor "proc" are npp_(proc)*ncp_(me) big
!    2) all processors communicate to exchange slices
!       (all columns with z in the slice belonging to "me"
!       must be received, all the others must be sent to "proc")
!    Finally one gets the "planes" representation:
!    processor "me" has npp_(me) complete xy planes
!
! b) From planes to columns (sign < 0)
!
!    Quite the same in the opposite direction
!
! The output is overwritten on f_in ; f_aux is used as work space
!
! based on code written by Stefano de Gironcoli for PWSCF
!
   use para_mod, only: maxproc
   implicit none
   !
   include 'mpif.h'
   integer nproc, me, nr3x, nnr_, sign, ncp_(nproc), npp_(nproc)
   ! data are viewed as real arrays: 2 reals per complex element
   real(kind=8) f_in(2*nnr_), f_aux(2*nnr_)
   !
   integer dest, from, k, offset1(maxproc), &
   &       sendcount(maxproc), sdispls(maxproc), &
   &       recvcount(maxproc), rdispls(maxproc), &
   &       proc, ierr
   !
   ! nothing to transpose on a single processor
   if (nproc.eq.1) return
   call tictac(28,0)
   !
   ! sendcount(proc): amount of data processor "me" must send to processor proc
   ! recvcount(proc): amount of data processor "me" must receive from proc
   !
   do proc = 1, nproc
      sendcount(proc) = 2*npp_(proc)*ncp_(me)
      recvcount(proc) = 2*npp_(me)*ncp_(proc)
   end do
   !
   ! offset1(proc) is used to locate the slices to be sent to proc
   ! sdispls(proc)+1 is the beginning of data that must be sent to proc
   ! rdispls(proc)+1 is the beginning of data that must be received from proc
   !
   offset1(1) = 1
   sdispls(1)=0
   rdispls(1)=0
   do proc = 2, nproc
      offset1(proc) = offset1(proc-1) + 2 * npp_(proc-1)
      sdispls(proc) = sdispls(proc-1) + sendcount(proc-1)
      rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
   end do
   !
   if(sign.gt.0) then
      !
      ! "forward" scatter from columns to planes
      !
      ! step one: store contiguously the slices
      !
      do proc = 1, nproc
         from = offset1(proc)
         dest = 1 + sdispls(proc)
         do k = 1, ncp_(me)
            call DCOPY ( 2 * npp_(proc), &
            &   f_in ( from + 2*(k-1)*nr3x) , 1, &
            &   f_aux( dest + 2*(k-1)*npp_(proc) ) , 1 )
         end do
      end do
      !
      ! maybe useless; ensures that no garbage is present in the output
      !
      call zero (2*nnr_,f_in)
      !
      ! step two: communication
      !
      call mpi_barrier( MPI_COMM_WORLD, ierr )
      call mpi_alltoallv(f_aux,sendcount,sdispls,MPI_REAL8, &
      &                  f_in ,recvcount,rdispls,MPI_REAL8, &
      &                  MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('fft_scatter','ierr<>0',ierr)
      !
   else
      !
      ! "backward" scatter from planes to columns
      !
      ! step two: communication (send/recv roles are swapped)
      !
      call mpi_barrier( MPI_COMM_WORLD, ierr )
      call mpi_alltoallv(f_in ,recvcount,rdispls,MPI_REAL8, &
      &                  f_aux,sendcount,sdispls,MPI_REAL8, &
      &                  MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('fft_scatter','ierr<>0',ierr)
      !
      ! step one: store contiguously the columns
      !
      call zero (2*nnr_,f_in)
      !
      do proc = 1, nproc
         from = 1 + sdispls(proc)
         dest = offset1(proc)
         do k = 1, ncp_(me)
            call DCOPY ( 2 * npp_(proc), &
            &   f_aux( from + 2*(k-1)*npp_(proc) ) , 1 , &
            &   f_in ( dest + 2*(k-1)*nr3x) , 1 )
         end do
      end do
      !
   end if
   call tictac(28,1)
   !
   return
end
|
|
|
|
!
|
|
|
|
!-----------------------------------------------------------------------
      subroutine reduce(size,ps)
!-----------------------------------------------------------------------
!
! Sum the distributed array ps(size) over all processors, in place.
! The reduction is carried out in fixed-size chunks through a local
! buffer, so no work space proportional to "size" is required.
!
      use para_mod
!
      implicit none
      integer size
      real(kind=8) ps(size)
!
      include 'mpif.h'
      integer, parameter:: MAXB=10000
      real(kind=8) buff(MAXB)
      integer ierr, ib, nfull, nrest
!
! nothing to do on a single processor or for an empty array
!
      if (nproc.le.1) return
      if (size.le.0) return
      call tictac(29,0)
!
! synchronize processes before the collective operations
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0) call error('reduce','error in barrier',ierr)
!
! nfull = number of complete MAXB-sized chunks
!
      nfull=size/MAXB
!
      do ib=1,nfull
         call mpi_allreduce (ps(1+(ib-1)*MAXB), buff, MAXB, &
     &        MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) &
     &        call error('reduce','error in allreduce1',ierr)
! copy the reduced chunk back over the local data
         call DCOPY(MAXB,buff,1,ps(1+(ib-1)*MAXB),1)
      end do
!
! remaining elements (fewer than MAXB), if any
!
      nrest=size-nfull*MAXB
      if (nrest.gt.0) then
         call mpi_allreduce (ps(1+nfull*MAXB), buff, nrest, &
     &        MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) &
     &        call error('reduce','error in allreduce2',ierr)
         call DCOPY(nrest,buff,1,ps(1+nfull*MAXB),1)
      endif
      call tictac(29,1)
!
      return
      end
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
      subroutine print_para_times
!----------------------------------------------------------------------
!
! Print a per-routine timing summary: calls and cpu time seen on
! node 0, the minimum and maximum cpu time across all nodes, and the
! elapsed (wall) time measured on node 0.
!
      use para_mod
      use timex_mod
!
      implicit none
      include 'mpif.h'
      integer iclock, ierr
      real(kind=8) mincpu(maxclock), maxcpu(maxclock)
!
! syncronize processes
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0) &
     &     call error('print_all_times','error in barrier',ierr)
!
! collect the extrema of the per-routine cpu times over all nodes
!
      call mpi_allreduce (cputime, mincpu, maxclock, &
     &     MPI_REAL8, MPI_MIN, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) &
     &     call error('print_para_times','error in minimum',ierr)
      call mpi_allreduce (cputime, maxcpu, maxclock, &
     &     MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) &
     &     call error('print_para_times','error in maximum',ierr)
!
      write(6,*)
      write(6,*) '  routine     calls       cpu time        elapsed'
      write(6,*) '              node0    node0, min, max      node0'
      write(6,*)
      do iclock=1, maxclock
! only report clocks that were actually started at least once
         if (ntimes(iclock).gt.0) write(6,'(a10,i7,4f8.1)') &
     &        routine(iclock), ntimes(iclock), cputime(iclock), &
     &        mincpu(iclock), maxcpu(iclock), elapsed(iclock)
      end do
!
      return
      end
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
      subroutine write_wfc(unit,c)
!----------------------------------------------------------------------
!
! collect wavefunctions on first node and write to file
!
! unit : fortran unit of the (unformatted) output file, already open
! c    : distributed wavefunctions c(ngw,nx); the first n bands
!        are gathered on node 1 and written one record per band
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
!
      implicit none
      include 'mpif.h'
      integer unit
      complex(kind=8) c(ngw,nx)
      complex(kind=8), pointer:: psis(:)
!
      integer i, ii, ig, proc, ierr, ntot, ncol, mc
      integer nmin(3), nmax(3), n1,n2,nzx,nz,nz_
      integer root, displs(nproc), recvcount(nproc)
      complex(kind=8), allocatable:: psitot(:), psiwr(:,:,:)
!
! nmin, nmax are the bounds on (i,j,k) indexes of wavefunction G-vectors
!
      call nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!
! nzx is the maximum length of a column along z
!
      nzx=nmax(3)-nmin(3)+1
!
      root = 0
! root is the first node
      ntot = 0
      do proc=1,nproc
         recvcount(proc) = ncpw(proc)*nzx
!
! recvcount(proc) = size of data received from processor proc
!                   (number of columns times length of each column)
!
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + recvcount(proc-1)
         end if
!
! displs(proc) is the position of data received from processor proc
!
         ntot = ntot + recvcount(proc)
!
! ntot = total size of gathered data
!
      end do
!
! allocate the needed work spaces
!
      psis=> wrk1
      call zero(2*nnrs,psis)
      if (me.eq.1) then
         allocate(psitot(ntot))
         allocate(psiwr(nmin(3):nmax(3),nmin(1):nmax(1),nmin(2):nmax(2)))
! header record: number of bands and the bounds of the G-vector box
         write(unit) n, nmin, nmax
      end if
!
! fill array psis with c_i(G) (as packed columns along z)
! (one band at a time; positions not touched below keep the zeros
! written by the call to zero above)
!
      do i=1,n
         do ig=1,ngw
!
! ncol+1 is the index of the column
!
            ncol=(nps(ig)-1)/nr3sx
!
! nz_ is the z component in FFT style (refolded between 1 and nr3s)
!
            nz_ =nps(ig)-ncol*nr3sx
!
! nz is the z component in "natural" style (between nmin(3) and nmax(3))
!
            nz =nz_-1
            if (nz.ge.nr3s/2) nz=nz-nr3s
!
! ncpw(me) columns along z are stored in contiguous order on each node
!
            psis(nz-nmin(3)+1+ncol*nzx)=c(ig,i)
         end do
!
! gather all psis arrays on the first node, in psitot
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_wfc','mpi_barrier 2',ierr)
         call mpi_gatherv (psis, recvcount(me), MPI_DOUBLE_COMPLEX, &
     &                     psitot,recvcount, displs,MPI_DOUBLE_COMPLEX, &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_wfc','mpi_gatherv',ierr)
!
! now order the columns with a node-number-independent ordering
! We use n1,n2,nz for ordering, where G=n1*g(1)+n2*g(2)+nz*g(3)
! and g(1), g(2), g(3) are basis vectors of the reciprocal lattice.
! Note that the entire column is written, even if there are no
! wavefunction components associated to it. This is a waste of
! disk space but it is simpler to implement than other schemes.
! NOTE(review): psiwr entries belonging to columns skipped by the
! test below are never assigned and are written as-is - presumably
! harmless since read_wfc skips the same columns; confirm if the
! file format is ever consumed elsewhere.
!
         if (me.eq.1) then
            ncol=0
            do proc=1,nproc
               do ii=1,ncpw(proc)
                  ncol=ncol+1
                  mc=icpls(ii+ncp0s(proc))
!
! mc is the position in the xy plane of this column in FFT style
! we need to calculate "natural" indexes n1,n2, centered on the
! origin, for the smallest possible box.
!
                  n2=(mc-1)/nr1sx
                  n1= mc-1-n2*nr1sx
                  if (n2.ge.nr2s/2) n2=n2-nr2s
                  if (n1.le.nmax(1).and.(n1.ne.0.or.n2.ge.0)) then
!
! NB: n1.gt.nmax(1) correspond to negative indexes that should be
! refolded into negative n1. However these are absent because
! only half of the G sphere is considered. The other condition
! excludes the case n1=0, n2<0 that is also not present
!
                     do nz=nmin(3),nmax(3)
                        psiwr(nz,n1,n2)=psitot(nz-nmin(3)+1+(ncol-1)*nzx)
                     end do
                  end if
               end do
            end do
!
! write the node-number-independent array
!
            write(unit) psiwr
!
         end if
      end do
!
      if (me.eq.1) then
         deallocate(psiwr)
         deallocate(psitot)
      end if
!
      return
!
      end subroutine write_wfc
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
      subroutine write_wfc2(unit,c)
!----------------------------------------------------------------------
!
! collect wavefunctions on first node and write to file
! This subroutine differs from write_wfc, since it writes
! to the disks only the wavefunctions components less than
! the cut-off, and following a given order, independent
! from the number of processors.
! Each wave function is collected (with the call to mergewf) on the
! master node, according to the array ig_l2g, and then written to
! the disk
!
! unit : fortran unit of the (unformatted) output file, already open
! c    : distributed wavefunctions, c(ngw,nx)
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
      use mp_wave, ONLY: mergewf
      use mp, ONLY: mp_sum
      use parallel_include
!
      implicit none
      integer unit
      complex(kind=8) c(ngw,nx)
      complex(kind=8), allocatable :: ctot(:)
!
      integer i, ig, mpime, ngwt, root
!
! ngwt = total (global) number of plane waves: sum of local ngw
! mpime is the 0-based processor index expected by mergewf
!
      ngwt  = ngw
      root  = 0
      mpime = me - 1
      CALL mp_sum( ngwt )
!
      ALLOCATE( ctot( ngwt ) )
      DO i = 1, nx
!
! collect the i-th band on node root, ordered according to ig_l2g,
! then write it as one record
!
        CALL mergewf(c(:,i), ctot(:), ngw, ig_l2g, mpime, nproc, root)
        IF( mpime == root ) THEN
          WRITE(unit) ( ctot(ig), ig = 1, ngwt )
        END IF
      END DO
! release the collection buffer explicitly (was previously leaked)
      DEALLOCATE( ctot )
      return
      end subroutine write_wfc2
|
|
|
|
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
      subroutine read_wfc2(unit,c)
!----------------------------------------------------------------------
!
! read wavefunctions from file (written by write_wfc2) on the first
! node and distribute them to all nodes with splitwf, according to
! the global index array ig_l2g
!
! unit : fortran unit of the (unformatted) input file, already open
! c    : distributed wavefunctions, c(ngw,nx), filled on output
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
      use mp_wave, ONLY: splitwf
      use mp, ONLY: mp_sum
      use parallel_include
!
      implicit none
      integer unit
      complex(kind=8) c(ngw,nx)
      complex(kind=8), allocatable :: ctot(:)
!
      integer i, ig, mpime, ngwt, root
!
! ngwt = total (global) number of plane waves: sum of local ngw
! mpime is the 0-based processor index expected by splitwf
!
      ngwt  = ngw
      root  = 0
      mpime = me - 1
      CALL mp_sum( ngwt )
!
      ALLOCATE( ctot( ngwt ) )
      DO i = 1, nx
!
! the root node reads one band per record, then it is scattered
! to all nodes according to ig_l2g
!
        IF( mpime == root ) THEN
          READ(unit) ( ctot(ig), ig = 1, ngwt )
        END IF
        CALL splitwf(c(:,i), ctot(:), ngw, ig_l2g, mpime, nproc, root)
      END DO
! release the collection buffer explicitly (was previously leaked)
      DEALLOCATE( ctot )
      return
      end subroutine read_wfc2
|
|
|
|
|
|
|
|
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
      subroutine read_wfc(unit,c)
!----------------------------------------------------------------------
!
! read wavefunctions from file and distribute to all nodes
! (inverse of write_wfc: the first node reads the node-number-
! independent arrays and scatters the packed z-columns to all nodes)
!
! unit : fortran unit of the (unformatted) input file, already open
! c    : distributed wavefunctions, c(ngw,nx), filled on output
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
!
      implicit none
      include 'mpif.h'
      integer unit
      complex(kind=8) c(ngw,nx)
      complex(kind=8), pointer:: psis(:)
!
      integer i, ii, ig, proc, ierr, ntot, ncol, mc, nr
      integer nmin0(3), nmax0(3), nmin(3), nmax(3), n1,n2,nzx,nz,nz_
      integer root, displs(nproc), sendcount(nproc)
      complex(kind=8), allocatable:: psitot(:), psird(:,:,:)
!
! nmin, nmax are the bounds on (i,j,k) indexes of wavefunction G-vectors
!
      call nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!
! nzx is the maximum length of a column along z
!
      nzx=nmax(3)-nmin(3)+1
!
      root = 0
! root is the first node
      ntot = 0
      do proc=1,nproc
         sendcount(proc) = ncpw(proc)*nzx
!
! sendcount(proc) = size of data send to processor proc
!                   (number of columns times length of each column)
!
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + sendcount(proc-1)
         end if
!
! displs(proc) is the position of data sent to processor proc
!
         ntot = ntot + sendcount(proc)
!
! ntot = total size of gathered data
!
      end do
!
! allocate the needed work spaces
!
      psis=> wrk1
      call zero(2*nnrs,psis)
      if (me.eq.1) then
         allocate(psitot(ntot))
!
! read the smallest FFT box that contains all wavefunction G-vectors
!
         read(unit,end=10,err=10) nr, nmin0, nmax0
! check that the box on file matches the current calculation
         if (nmin(1).ne.nmin0(1) .or. nmin(2).ne.nmin0(2) .or. &
     &       nmin(3).ne.nmin0(3) .or. nmax(1).ne.nmax0(1) .or. &
     &       nmax(2).ne.nmax0(2) .or. nmax(3).ne.nmax0(3) ) then
            write(6,*) 'read nmin, nmax =',nmin, nmax
            write(6,*) 'found nmin, nmax =',nmin0, nmax0
            call error('read_wfc','wavefunction mismatch',1)
         end if
         if (nr.lt.n) call error('read_wfc','not enough wavefcts',nr)
         allocate(psird(nmin(3):nmax(3),nmin(1):nmax(1),nmin(2):nmax(2)))
      end if
!
      do i=1,n
!
! read the node-number-independent array
!
         if (me.eq.1) then
            read(unit,end=20,err=20) psird
!
! reorder as contiguously stored columns along z. See comments in
! write_wfc for the logic (or lack thereof) of the storage
!
            ncol=0
            do proc=1,nproc
               do ii=1,ncpw(proc)
                  ncol=ncol+1
                  mc=icpls(ii+ncp0s(proc))
                  n2=(mc-1)/nr1sx
                  n1= mc-1-n2*nr1sx
                  if (n2.ge.nr2s/2) n2=n2-nr2s
                  if (n1.le.nmax(1).and.(n1.ne.0.or.n2.ge.0)) then
                     do nz=nmin(3),nmax(3)
                        psitot(nz-nmin(3)+1+(ncol-1)*nzx) = psird(nz,n1,n2)
                     end do
                  end if
               end do
            end do
         end if
!
! distribute the array psitot on all nodes (in psis)
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
! bug fix: error messages used to report 'write_wfc' in this routine
         if (ierr.ne.0) call error('read_wfc','mpi_barrier 2',ierr)
         call mpi_scatterv(psitot,sendcount,displs,MPI_DOUBLE_COMPLEX, &
     &                     psis ,sendcount(me), MPI_DOUBLE_COMPLEX, &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('read_wfc','mpi_scatter',ierr)
!
! fill c_i(G) (see write_wfc for the logic-or lack thereof-of ordering)
!
         do ig=1,ngw
            ncol=(nps(ig)-1)/nr3sx
            nz_ =nps(ig)-ncol*nr3sx
            nz =nz_-1
            if (nz.ge.nr3s/2) nz=nz-nr3s
            c(ig,i)=psis(nz-nmin(3)+1+ncol*nzx)
         end do
      end do
!
      if (me.eq.1) then
         deallocate(psird)
         deallocate(psitot)
      end if
!
      return
 10   call error('read_wfc','file missing or wrong',ierr)
 20   call error('read_wfc','wavefunction missing or wrong',ierr)
!
      end subroutine read_wfc
|
|
|
|
!
|
|
|
|
!-----------------------------------------------------------------------
      subroutine readpfile &
     &     ( flag,ndr,h,hold,nfi,c0,cm,tau0,taum,vel,velm,acc, &
     &       lambda,lambdam,xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,ekincm, &
     &       xnhh0,xnhhm,vnhh,velh)
!-----------------------------------------------------------------------
!
! read from file and distribute data calculated in preceding iterations
!   flag=-1 : read only h and hold (for use in variable-cell)
!   flag= 0 : read h, hold, c0
!   flag=+1 : read everything
! ndr is the fortran unit (and the restart-file identifier);
! everything is read on node 1 and broadcast to the other nodes.
!
      use para_mod
      use mp
      use elct, only: n, nx, ngw, ng0
      use ions_module, only: nsp, na, nax
      use parameters, only: nacx
!
      implicit none
      integer flag, ndr, nfi
      real(kind=8) h(3,3), hold(3,3)
      complex(kind=8) c0(ngw,n), cm(ngw,n)
      real(kind=8) taum(3,nax,nsp),tau0(3,nax,nsp)
      real(kind=8) vel(3,nax,nsp), velm(3,nax,nsp)
      real(kind=8) acc(nacx),lambda(nx,nx), lambdam(nx,nx)
      real(kind=8) xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,ekincm
      real(kind=8) xnhh0(3,3),xnhhm(3,3),vnhh(3,3),velh(3,3)
!
      integer i, ia, is, j, root, ierr
!
! Only the first node reads
!
      if (flag.eq.-1) then
         write(6,'((a,i3,a))') ' ### reading from file ',ndr,' only h ##'
      else if (flag.eq.0) then
         write(6,'((a,i3,a))') ' ### reading from file ',ndr,' only c0 ##'
      else
         write(6,'((a,i3))') ' ### reading from file ',ndr
      end if
!
      if (me.eq.1) then
         open (unit=ndr,status='old',form='unformatted')
         read(ndr,end=10,err=10) h, hold
      end if
!
! h and hold are needed here by variable-cell calculation
!
      root = 0
! NOTE(review): MPI_COMM_WORLD is referenced here without an explicit
! include 'mpif.h' - presumably it is made available through "use mp"
! or "use para_mod"; confirm when modifying this routine
      call mpi_barrier( MPI_COMM_WORLD, ierr)
      call mp_bcast(h, root)
      call mp_bcast(hold,root)
      if (flag.eq.-1) then
         if (me.eq.1) close (unit=ndr)
         return
      end if
!
! now read and distribute the wave functions
!
!     call read_wfc(ndr,c0)
      call read_wfc2(ndr,c0)
!
      if (flag.eq.0) then
         if (me.eq.1) close (unit=ndr)
         return
      end if
!
!     call read_wfc(ndr,cm)
      call read_wfc2(ndr,cm)
!
! arrays whose dimensions exceed what is actually read are explicitely set to zero
! in order to prevent potential problems with uninitialized trailing elements
!
      tau0=0.d0
      taum=0.d0
      vel =0.d0
      velm=0.d0
      acc =0.d0
      lambda =0.d0
      lambdam=0.d0
!
! read all other variables (on node 1 only; broadcast follows)
!
      if (me.eq.1) then
         read(ndr,end=10,err=10) nfi
         read(ndr,end=10,err=10) (((tau0(i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         read(ndr,end=10,err=10) (((taum(i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         read(ndr,end=10,err=10) (((vel (i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         read(ndr,end=10,err=10) (((velm(i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         read(ndr,end=10,err=10) acc
         read(ndr,end=10,err=10) ((lambda(i,j),i=1,n),j=1,n)
         read(ndr,end=10,err=10) ((lambdam(i,j),i=1,n),j=1,n)
         read(ndr,end=10,err=10) xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,ekincm
         read(ndr,end=10,err=10) xnhh0,xnhhm,vnhh,velh
         close (unit=ndr)
      end if
!
! broadcast variables to the other nodes
! (the order and grouping must match the reads above and the writes
! in writepfile)
!
      call mpi_barrier( MPI_COMM_WORLD, ierr)
      call mp_bcast( nfi, root)
      call mp_bcast(tau0, root)
      call mp_bcast(taum, root)
      call mp_bcast(vel , root)
      call mp_bcast(velm, root)
      call mp_bcast(acc , root)
      call mp_bcast(lambda, root)
      call mp_bcast(lambdam,root)
      call mp_bcast(xnhe0 , root)
      call mp_bcast(xnhem , root)
      call mp_bcast(vnhe  , root)
      call mp_bcast(xnhp0 , root)
      call mp_bcast(xnhpm , root)
      call mp_bcast(vnhp  , root)
      call mp_bcast(ekincm, root)
      call mp_bcast(xnhh0, root)
      call mp_bcast(xnhhm, root)
      call mp_bcast(vnhh , root)
      call mp_bcast(velh , root)
!
      return
 10   call error('readpfile','end of file detected',1)
      end
|
|
|
|
!-----------------------------------------------------------------------
      subroutine writepfile &
     &     ( ndw,h,hold,nfi,c0,cm,tau0,taum,vel,velm,acc, &
     &       lambda,lambdam,xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,ekincm, &
     &       xnhh0,xnhhm,vnhh,velh)
!-----------------------------------------------------------------------
!
! write to file the data needed to restart the calculation
! (inverse of readpfile: record order and grouping must match)
! ndw is the fortran unit (and the restart-file identifier)
!
      use para_mod
      use elct, only: n, nx, ngw, ng0
      use ions_module, only: nsp, na, nax
      use parameters, only: nacx
!
      implicit none
      integer ndw, nfi
      real(kind=8) h(3,3), hold(3,3)
      complex(kind=8) c0(ngw,n), cm(ngw,n)
      real(kind=8) taum(3,nax,nsp),tau0(3,nax,nsp)
      real(kind=8) vel(3,nax,nsp), velm(3,nax,nsp)
      real(kind=8) acc(nacx),lambda(nx,nx), lambdam(nx,nx)
      real(kind=8) xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,ekincm
      real(kind=8) xnhh0(3,3),xnhhm(3,3),vnhh(3,3),velh(3,3)
!
      include 'mpif.h'
      integer i, ia, is, j
!
! Only the first node writes
!
      if (me.eq.1) then
         open (unit=ndw,status='unknown',form='unformatted')
!
! h and hold are needed here by variable-cell calculation
!
         write(ndw) h, hold
      end if
!
! now write the wave functions (collective: all nodes participate)
!
!     call write_wfc(ndw,c0)
!     call write_wfc(ndw,cm)
      call write_wfc2(ndw,c0)
      call write_wfc2(ndw,cm)
!
! write all other variables (node 1 only)
!
      if (me.eq.1) then
         write(ndw) nfi
         write(ndw) (((tau0(i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         write(ndw) (((taum(i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         write(ndw) (((vel (i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         write(ndw) (((velm(i,ia,is),i=1,3),ia=1,na(is)),is=1,nsp)
         write(ndw) acc
         write(ndw) ((lambda(i,j),i=1,n),j=1,n)
         write(ndw) ((lambdam(i,j),i=1,n),j=1,n)
         write(ndw) xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,ekincm
         write(ndw) xnhh0,xnhhm,vnhh,velh
         close (unit=ndw)
      end if
      return
      end
|
|
|
|
|
|
|
|
!
|
|
|
|
!----------------------------------------------------------------------
      subroutine nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!----------------------------------------------------------------------
!
! Find the bounds of the (i,j,k) indexes of all wavefunction
! G-vectors across all nodes. The (i,j,k) indexes are defined by
! G = i*g(1) + j*g(2) + k*g(3), where g(1), g(2), g(3) are the basis
! vectors of the reciprocal lattice.
!
      implicit none
! input
      integer ngw,nr1s,nr2s,nr3s,in1p(ngw),in2p(ngw),in3p(ngw)
! output
      integer nmin(3), nmax(3)
! local
      include 'mpif.h'
      integer nmin0(3), nmax0(3), idx(3), ig, ierr
!
! node-local bounds; the starting values are replaced by any
! index actually present
!
      nmin0 = (/  nr1s,  nr2s,  nr3s /)
      nmax0 = (/ -nr1s, -nr2s, -nr3s /)
!
      do ig=1,ngw
         idx = (/ in1p(ig), in2p(ig), in3p(ig) /)
         nmin0 = min(nmin0,idx)
         nmax0 = max(nmax0,idx)
      end do
!
! find minima and maxima for the FFT box across all nodes
!
      call mpi_barrier( MPI_COMM_WORLD, ierr )
      if (ierr.ne.0) call error('nrbounds','mpi_barrier 1',ierr)
      call mpi_allreduce (nmin0, nmin, 3, MPI_INTEGER, MPI_MIN, &
     &     MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('nrbounds','mpi_allreduce min',ierr)
      call mpi_allreduce (nmax0, nmax, 3, MPI_INTEGER, MPI_MAX, &
     &     MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('nrbounds','mpi_allreduce max',ierr)
!
      return
      end subroutine nrbounds
|
|
|
|
|
|
|
|
#endif
|