quantum-espresso/CPV/para.f90

!
! Copyright (C) 2002 CP90 group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!

#include "../include/machine.h"

module para_mod

  USE fft_types, ONLY: fft_dlay_descriptor, fft_dlay_allocate, &
     fft_dlay_deallocate, fft_dlay_set
  USE fft_base, ONLY: dfftp, dffts

  integer maxproc, ncplanex
  parameter (maxproc=64, ncplanex=37000)

  character(len=3) :: node

! node:    node number, useful for opening files

  integer nproc, me, mygroup

! nproc:   number of processors
! me:      number of this processor
!

! parallel fft information for the dense grid
!
! npp:     number of plane per processor
! n3:      n3(me)+1 = first  plane on proc. me
! ncp:     number of (density) columns per proc
! ncp0:    starting column for each processor
! ncplane: number of columns in a plane
! nct:     total number of non-zero columns
! nnr_:    local fft data size
! ipc:     index saying which proc owns columns in a plane
! icpl:    index relating columns and pos. in the plane
!
! n3 -> dfftp%ipp
! ncplane -> dfftp%nnp
! ncp  -> dfftp%nsp
! ncp0 -> dfftp%iss
! npp  -> dfftp%npp
! ipc  -> dfftp%isind
! icpl -> dfftp%ismap
! nnr_ -> dfftp%nnr
!
!  integer  npp(maxproc), n3(maxproc), ncp(maxproc), ncp0(maxproc), &
!           ncplane, nct, nnr_, ipc(ncplanex), icpl(ncplanex)
!
! parallel fft information for the smooth mesh
!
! npps:    number of plane per processor
! ncps:    number of (density) columns per proc
! ncpw:    number of (wfs) columns per processor
! ncps0:   starting column for each processor
! ncplanes:number of columns in a plane (smooth)
! ncts:    total number of non-zero columns
! nnrs_:   local fft data size
! ipcs:    saying which proc owns columns in a plane
! icpls:   index relating columns and pos. in the plane
!
! ncpw -> dffts%ncpw
! n3s -> dffts%ipp
! ncplanes -> dffts%nnp
! ncps  -> dffts%nsp
! ncps0 -> dffts%iss
! npps  -> dffts%npp
! ipcs  -> dffts%isind
! icpls -> dffts%ismap
! nnrs_ -> dffts%nnr
!

contains

  subroutine deallocate_para_mod
    use stick_base, only: sticks_deallocate
    call fft_dlay_deallocate( dfftp )
    call fft_dlay_deallocate( dffts )
    call sticks_deallocate()
  end subroutine

end module para_mod
!
!-----------------------------------------------------------------------
      subroutine startup
!-----------------------------------------------------------------------
!
!  This subroutine initializes the Message Passing environment.
!  The number of processors "nproc" returned by this routine is
!  determined by the way the process was started.
!  This is configuration- and machine-dependent. For inter-
!  active execution it is determined by environment variable MP_PROCS
!
      use para_mod
      use mp, only: mp_start, mp_env
      use io_global, only: stdout
      use global_version
!
      implicit none

      integer ierr
!
      call mp_start()
      call mp_env( nproc, me, mygroup )
      me = me + 1
!
!
! parent process (source) will have me=1 - child process me=2,...,NPROC
! (for historical reasons: MPI uses 0,...,NPROC-1 instead )
!
      if ( nproc > maxproc) &
     &   call errore('startup',' too many processors ',nproc)
!
      if (me < 10) then
         write(node,'(i1,2x)') me
      else if (me < 100) then
         write(node,'(i2,1x)') me
      else if (me < 1000) then
         write(node,'(i3)') me
      else
         call errore('startup','wow, >1000 nodes !!',nproc)
      end if
!
! only the first processor writes
!
      if ( me == 1 ) then
         WRITE( stdout,'(72("*"))')
         WRITE( stdout,'(4("*"),64x,4("*"))')
         WRITE( stdout,'(4("*"),"  CPV: variable-cell Car-Parrinello ", &
              &  "molecular dynamics          ",4("*"))')
         WRITE( stdout,'(4("*"),"  using ultrasoft Vanderbilt ", &
              &  "pseudopotentials - v.",a6,8x,4("*"))') version_number
         WRITE( stdout,'(4("*"),64x,4("*"))')
         WRITE( stdout,'(72("*"))')
         WRITE( stdout,'(/5x,''Parallel version (MPI)'')')
         WRITE( stdout,'(5x,''Number of processors in use:   '',i4)') nproc
      else
         open(6,file='/dev/null',status='unknown')
!
! useful for debugging purposes
!         open(6,file='out.'//node,status='unknown')
      end if
!
      return
      end

#if defined __PARA

!
!----------------------------------------------------------------------
      subroutine read_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! read from file rhor(nnr,nspin) on first node and distribute to other nodes
!
      use para_mod
      use grid_dimensions, only: nr1x, nr2x, nr3x, nnr => nnrx
      implicit none
      include 'mpif.h'
      integer unit, nspin
      real(kind=8) rhor(nnr,nspin)
!
      integer ir, is
      integer root, proc, ierr, n, displs(nproc), sendcount(nproc)
      real(kind=8), allocatable:: rhodist(:)
!
!
      if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
      root = 0
      do proc=1,nproc
         sendcount(proc) =  dfftp%nnp * ( dfftp%npp(proc) )
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + sendcount(proc-1)
         end if
      end do
      do is=1,nspin
!
! read the charge density from unit "unit" on first node only
!
         if (me.eq.1) read(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
!
! distribute the charge density to the other nodes
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         call mpi_scatterv(rhodist, sendcount, displs, MPI_REAL8,       &
     &                     rhor(1,is),sendcount(me),   MPI_REAL8,       &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call errore('mpi_scatterv','ierr<>0',ierr)
!
! just in case: set to zero unread elements (if any)
!
         do ir=sendcount(me)+1,nnr
            rhor(ir,is)=0.d0
         end do
      end do
      if (me.eq.1) deallocate(rhodist)
!
      return
      end subroutine read_rho
!
!----------------------------------------------------------------------
      subroutine write_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! collect rhor(nnr,nspin) on first node and write to file
!
      use para_mod
      use grid_dimensions, only: nr1x, nr2x, nr3x, nnr => nnrx
      implicit none
      include 'mpif.h'
      integer unit, nspin
      real(kind=8) rhor(nnr,nspin)
!
      integer ir, is
      integer root, proc, ierr, displs(nproc), recvcount(nproc)
      real(kind=8), allocatable:: rhodist(:)
!
!
      if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
!
      root = 0
      do proc=1,nproc
         recvcount(proc) =  dfftp%nnp  * ( dfftp%npp(proc) )
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + recvcount(proc-1)
         end if
      end do
!
      do is=1,nspin
!
! gather the charge density on the first node
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         call mpi_gatherv (rhor(1,is), recvcount(me), MPI_REAL8,        &
     &                     rhodist,recvcount, displs, MPI_REAL8,        &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call errore('mpi_gatherv','ierr<>0',ierr)
!
! write the charge density to unit "unit" from first node only
!
         if (me.eq.1) write(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
      end do
      if (me.eq.1) deallocate(rhodist)
!
      return
      end subroutine write_rho
!
!
!-----------------------------------------------------------------------
      subroutine set_fft_para( b1, b2, b3, gcut, gcuts, gcutw,          &
     &                        nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr,     &
     &                        nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs )
!-----------------------------------------------------------------------
!
! distribute columns to processes for parallel fft
! based on code written by Stefano de Gironcoli for PWSCF
! columns are sets of g-vectors along z: g(k) = i1*b1+i2*b2+i3*b3 ,
! with g^2<gcut and (i1,i2) running over the (xy) plane.
! Columns are "active" for a given (i1,i2) if they contain a nonzero
! number of wavevectors
!
      use para_mod
      use stick_base
      use fft_scalar, only: good_fft_dimension
      use io_global, only: stdout
!
      implicit none
      real(kind=8) b1(3), b2(3), b3(3), gcut, gcuts, gcutw
      integer nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr,                     &
     &        nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs
!
      integer ngc(ncplanex),&! number of g-vectors per column (dense grid)
     &        ngcs(ncplanex),&! number of g-vectors per column (smooth grid
     &        ngcw(ncplanex),&! number of wavefct plane waves per colum
     &        in1(ncplanex),&! index i for column (i1,i2)
     &        in2(ncplanex),&! index j for column (i1,i2)
     &        index(ncplanex),&! used to order column
     &        ngp(maxproc), &! number of g-vectors per proc (dense grid)
     &        ngps(maxproc),&! number of g-vectors per proc (smooth grid)
     &        ngpw(maxproc)  ! number of wavefct plane waves per proc
!
      integer np, nps1,            &! counters on planes
     &        nq, nqs,             &! counters on planes
     &        n1m1,n2m1,n3m1        ! nr1-1 and so on
!
      integer  npp(maxproc), npps(maxproc)
      integer  ncp(maxproc), ncps(maxproc)
      integer  ncpw(maxproc)
      integer  ncplane, ncplanes
      integer  nct, ncts
      integer  i
!
      integer, allocatable :: st(:,:), stw(:,:), sts(:,:) ! sticks maps
      integer :: ub(3), lb(3)  ! upper and lower bounds for maps
      integer :: nctw
      logical :: tk = .FALSE.
!
      call start_clock( 'setfftpara' )
!
! set the dimensions of fft arrays
!
      nr1x  = good_fft_dimension(nr1 )
      nr2x  = nr2
      nr3x  = good_fft_dimension(nr3 )
!
      nr1sx = good_fft_dimension(nr1s)
      nr2sx = nr2s
      nr3sx = good_fft_dimension(nr3s)
!
!     compute number of columns for each processor
!
      ncplane  = nr1x  * nr2x
      ncplanes = nr1sx * nr2sx

      if ( ncplane > ncplanex .or. ncplanes > ncplanex )                &
     &     call errore('set_fft_para','ncplanex too small',ncplane)
!
! set the number of plane per process
!
      if (nr3.lt.nproc) call errore('set_fft_para',                      &
     &                'some processors have no planes ',-1)
      if (nr3s.lt.nproc) call errore('set_fft_para',                     &
     &                'some processors have no smooth planes ',-1)
!
      if (nproc.eq.1) then
         npp(1) = nr3
         npps(1)= nr3s
      else
         np = nr3/nproc
         nq = nr3 - np*nproc
         nps1 = nr3s/nproc
         nqs = nr3s - nps1*nproc
         do i = 1, nproc
            npp(i) = np
            if (i.le.nq) npp(i) = np + 1
            npps(i) = nps1
            if (i.le.nqs) npps(i) = nps1 + 1
         end do
      end if
!
      nct = 0
      ncts= 0
!
! NOTA BENE: the exact limits for a correctly sized FFT grid are:
! -nr/2,..,+nr/2  for nr even; -(nr-1)/2,..,+(nr-1)/2  for nr odd.
! If the following limits are increased, a slightly undersized fft
! grid, with some degree of G-vector refolding, can be used
! (at your own risk - a check is done in ggen).
!
      n1m1=nr1/2
      n2m1=nr2/2
      n3m1=nr3/2

      lb(1) = -n1m1
      lb(2) = -n2m1
      lb(3) = -n3m1
      ub(1) =  n1m1
      ub(2) =  n2m1
      ub(3) =  n3m1
!
      ALLOCATE( stw ( lb(1):ub(1), lb(2):ub(2) ) )
      ALLOCATE( st  ( lb(1):ub(1), lb(2):ub(2) ) )
      ALLOCATE( sts ( lb(1):ub(1), lb(2):ub(2) ) )

!
! ...     Fill in the stick maps, for given g-space base (b1,b2,b3)
! ...     and cut-offs
! ...     The value of the element (i,j) of the map ( st ) is equal to the
! ...     number of G-vector belonging to the (i,j) stick.
!

      CALL sticks_maps( tk, ub, lb, b1, b2, b3, gcut, gcutw, gcuts, st, stw, sts )

      nct  = COUNT( st  > 0 )
      nctw = COUNT( stw > 0 )
      ncts = COUNT( sts > 0 )

      if ( nct > ncplane )    &
     &    call errore('set_fft_para','too many sticks',1)

      if ( ncts > ncplanes )  &
     &    call errore('set_fft_para','too many sticks',2)

      if ( nct == 0 ) &
     &    call errore('set_fft_para','number of sticks 0', 1)

      if ( ncts == 0 ) &
     &    call errore('set_fft_para','number smooth sticks 0', 1)

!
! ...     initialize the sticks indexes array ist
! ...     nct counts columns containing G-vectors for the dense grid
! ...     ncts counts columns contaning G-vectors for the smooth grid
!

      CALL sticks_countg( tk, ub, lb, st, stw, sts, in1, in2, ngc, ngcw, ngcs )

!
!   Sort the columns. First the column with the largest number of G
!   vectors on the wavefunction sphere (active columns),
!   then on the smooth sphere, then on the big sphere. Dirty trick:
!

      CALL sticks_sort( ngc, ngcw, ngcs, nct, index )

!
!   assign columns to processes
!

      CALL sticks_dist( tk, ub, lb, index, in1, in2, ngc, ngcw, ngcs, nct, &
                ncp, ncpw, ncps, ngp, ngpw, ngps, st, stw, sts )

      CALL sticks_pairup( tk, ub, lb, index, in1, in2, ngc, ngcw, ngcs, nct, &
                ncp, ncpw, ncps, ngp, ngpw, ngps, st, stw, sts )

      CALL fft_dlay_allocate( dfftp, nproc, nr1x, nr2x )
      CALL fft_dlay_allocate( dffts, nproc, nr1sx, nr2sx )

      CALL fft_dlay_set( dfftp, tk, nct, nr1, nr2, nr3, nr1x, nr2x, nr3x, me, &
                nproc, ub, lb, index, in1, in2, ncp, ncpw, ngp, ngpw, st, stw)
      CALL fft_dlay_set( dffts, tk, ncts, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, me, &
                nproc, ub, lb, index, in1, in2, ncps, ncpw, ngps, ngpw, sts, stw)

      DEALLOCATE( st, stw, sts )

      WRITE( stdout,"(                                                      &
       & ' Proc  planes cols    G   planes cols    G    columns  G',/,  &
       & '       (dense grid)     (smooth grid)   (wavefct grid)' )" )
      do i=1,nproc
         WRITE( stdout,'(i3,2x,3(i5,2i7))') i, npp(i),ncp(i),ngp(i),          &
     &        npps(i),ncps(i),ngps(i), ncpw(i), ngpw(i)
      end do
      WRITE( stdout,'(i3,2x,3(i5,2i7))') 0, SUM(npp(1:nproc)), SUM(ncp(1:nproc)), &
        SUM(ngp(1:nproc)), SUM(npps(1:nproc)), SUM(ncps(1:nproc)), &
        SUM(ngps(1:nproc)), SUM(ncpw(1:nproc)), SUM(ngpw(1:nproc))
!
      if ( nproc == 1 ) then
         nnr  = nr1x*nr2x*nr3x
         nnrs = nr1sx*nr2sx*nr3sx
      else
         nnr  = max(nr3x*ncp(me), nr1x*nr2x*npp(me))
         nnrs = max(nr3sx*ncps(me), nr1sx*nr2sx*npps(me))
      end if

      call stop_clock( 'setfftpara' )
!
      return
      end
!
!
!----------------------------------------------------------------------
      subroutine cfftpb(f,nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,irb3,sign)
!----------------------------------------------------------------------
!
!   not-so-parallel 3d fft for box grid, implemented only for sign=1
!   G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!   The array f (overwritten on output) is NOT distributed:
!   a copy is present on each processor.
!   The fft along z  is done on the entire grid.
!   The fft along xy is done only on planes that have components on the
!   dense grid for each processor. Note that the final array will no
!   longer be the same on all processors.
!
      use para_mod
      use grid_dimensions, only: nr3
      use fft_scalar, only: cft_b
!
      implicit none
      integer nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,irb3,sign
      complex(kind=8) f(nr1bx*nr2bx*nr3bx)
!
      integer ir3, ibig3, imin3, imax3, np3
!
      call parabox(nr3b,irb3,nr3,imin3,imax3)
      np3=imax3-imin3+1
! np3 is the number of planes to be transformed
      if (np3.le.0) return
      call cft_b(f,nr1b,nr2b,nr3b,nr1bx,nr2bx,nr3bx,imin3,imax3,sign)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine parabox(nr3b,irb3,nr3,imin3,imax3)
!----------------------------------------------------------------------
!
! find if box grid planes in the z direction have component on the dense
! grid on this processor, and if, which range imin3-imax3
!
      use para_mod
! input
      integer nr3b,irb3,nr3
! output
      integer imin3,imax3
! local
      integer ir3, ibig3
!
      imin3=nr3b
      imax3=1
      do ir3=1,nr3b
         ibig3=1+mod(irb3+ir3-2,nr3)
         if(ibig3.lt.1.or.ibig3.gt.nr3)                                 &
     &        call errore('cfftpb','ibig3 wrong',ibig3)
         ibig3=ibig3-dfftp%ipp(me)
         if (ibig3.gt.0.and.ibig3.le.dfftp%npp(me)) then
            imin3=min(imin3,ir3)
            imax3=max(imax3,ir3)
         end if
      end do
!
      return
      end
!
!-----------------------------------------------------------------------
      subroutine reduce(size,ps)
!-----------------------------------------------------------------------
!
!     sums a distributed variable s(size) over the processors.
!     This version uses a fixed-length buffer of appropriate (?) size
!
      use para_mod
!
      implicit none
      integer size
      real(kind=8)  ps(size)
!
      include 'mpif.h'
      integer ierr, n, nbuf
      integer, parameter:: MAXB=10000
      real(kind=8) buff(MAXB)
!
      if (nproc.le.1) return
      if (size.le.0) return
      call start_clock( 'reduce' )
!
!  syncronize processes
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0) call errore('reduce','error in barrier',ierr)
!
      nbuf=size/MAXB
!
      do n=1,nbuf
         call mpi_allreduce (ps(1+(n-1)*MAXB), buff, MAXB,              &
     &        MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0)                                                 &
     &        call errore('reduce','error in allreduce1',ierr)
         call DCOPY(MAXB,buff,1,ps(1+(n-1)*MAXB),1)
      end do
!
!    possible remaining elements < maxb
!
      if (size-nbuf*MAXB.gt.0) then
          call mpi_allreduce (ps(1+nbuf*MAXB), buff, size-nbuf*MAXB,    &
     &          MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
          if (ierr.ne.0)                                                &
     &         call errore('reduce','error in allreduce2',ierr)
          call DCOPY(size-nbuf*MAXB,buff,1,ps(1+nbuf*MAXB),1)
      endif
      call stop_clock( 'reduce' )
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!----------------------------------------------------------------------
!
! find the bounds for (i,j,k) indexes of all wavefunction G-vectors
! The (i,j,k) indexes are defined as: G=i*g(1)+j*g(2)+k*g(3)
! where g(1), g(2), g(3) are basis vectors of the reciprocal lattice
!
      implicit none
! input
      integer ngw,nr1s,nr2s,nr3s,in1p(ngw),in2p(ngw),in3p(ngw)
! output
      integer nmin(3), nmax(3)
! local
      include 'mpif.h'
      integer nmin0(3), nmax0(3), ig, ierr
!
!
      nmin0(1)=  nr1s
      nmax0(1)= -nr1s
      nmin0(2)=  nr2s
      nmax0(2)= -nr2s
      nmin0(3)=  nr3s
      nmax0(3)= -nr3s
!
      do ig=1,ngw
         nmin0(1) = min(nmin0(1),in1p(ig))
         nmin0(2) = min(nmin0(2),in2p(ig))
         nmin0(3) = min(nmin0(3),in3p(ig))
         nmax0(1) = max(nmax0(1),in1p(ig))
         nmax0(2) = max(nmax0(2),in2p(ig))
         nmax0(3) = max(nmax0(3),in3p(ig))
      end do
!
! find minima and maxima for the FFT box across all nodes
!
      call mpi_barrier( MPI_COMM_WORLD, ierr )
      if (ierr.ne.0) call errore('nrbounds','mpi_barrier 1',ierr)
      call mpi_allreduce (nmin0, nmin, 3, MPI_INTEGER, MPI_MIN,         &
     &           MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call errore('nrbounds','mpi_allreduce min',ierr)
      call mpi_allreduce (nmax0, nmax, 3, MPI_INTEGER, MPI_MAX,         &
     &           MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call errore('nrbounds','mpi_allreduce max',ierr)

      return
      end subroutine nrbounds

#endif