quantum-espresso/LR_Modules/ccgsolve_all.f90

!
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------
subroutine ccgsolve_all (ch_psi, ccg_psi, e, d0psi, dpsi, h_diag, &
     ndmx, ndim, ethr, ik, kter, conv_root, anorm, nbnd, npol, freq_c)
  !----------------------------------------------------------------------
  !
  !     iterative solution of the linear systems (i=1,nbnd):
  !
  !                 ( H - e_i + Q ) * dpsi_i = d0psi_i                      (1)
  !
  !     where H is a complex hermitean matrix, e_i is a real scalar, Q is a
  !     projector on occupied states, dpsi_i and d0psi_ are complex vectors
  !
  !     on input:
  !                 ch_psi   EXTERNAL  name of a subroutine:
  !                          Calculates  (H-e+Q)*psi products.
  !                          Vectors psi and psip should be dimensioned
  !                          (ndmx,nbnd)
  !
  !                 cg_psi   EXTERNAL  name of a subroutine:
  !                          which calculates (h-e)^-1 * psi, with
  !                          some approximation, e.g. (diag(h)-e)
  !
  !                 e        real     unperturbed eigenvalue.
  !
  !                 dpsi     contains an estimate of the solution
  !                          vector.
  !
  !                 d0psi    contains the right hand side vector
  !                          of the system.
  !
  !                 ndmx     integer row dimension of dpsi, ecc.
  !
  !                 ndim     integer actual row dimension of dpsi
  !
  !                 ethr     real     convergence threshold. solution
  !                          improvement is stopped when the error in
  !                          eq (1), defined as l.h.s. - r.h.s., becomes
  !                          less than ethr in norm.
  !
  !     on output:  dpsi     contains the refined estimate of the
  !                          solution vector.
  !
  !                 d0psi    is corrupted on exit
  !
  !   revised (extensively)       6 Apr 1997 by A. Dal Corso & F. Mauri
  !   revised (to reduce memory) 29 May 2004 by S. de Gironcoli
  !
  USE kinds,          ONLY : DP
  USE mp_bands,       ONLY : intra_bgrp_comm, inter_bgrp_comm, use_bgrp_in_hpsi
  USE mp,             ONLY : mp_sum, mp_barrier
  USE control_flags,  ONLY : gamma_only
  USE gvect,          ONLY : gstart
  USE eqv,            ONLY : evq

  implicit none
  !
  !   first the I/O variables
  !
  integer :: ndmx, & ! input: the maximum dimension of the vectors
             ndim, & ! input: the actual dimension of the vectors
             kter, & ! output: counter on iterations
             nbnd, & ! input: the number of bands
             npol, & ! input: number of components of the wavefunctions
             ik      ! input: the k point

  real(DP) :: &
             e(nbnd), & ! input: the actual eigenvalue
             anorm,   & ! output: the norm of the error in the solution
             ethr       ! input: the required precision

  complex(DP) :: &
             dpsi (ndmx*npol, nbnd), & ! output: the solution of the linear syst
             h_diag(ndmx*npol,nbnd), & ! input: an estimate of ( H - \epsilon -w)
                                       ! w is complex
             freq_c                , & ! complex frequency
             d0psi (ndmx*npol, nbnd)   ! input: the known term

  logical :: conv_root ! output: if true the root is converged
  external ch_psi      ! input: the routine computing ch_psi
  external ccg_psi      ! input: the routine computing cg_psi
  !
  !  here the local variables
  !
  integer, parameter :: maxter = 1000
  ! the maximum number of iterations
  integer :: iter, ibnd, ibnd_, lbnd
  ! counters on iteration, bands
  integer , allocatable :: conv (:)
  ! if 1 the root is converged

  complex(DP), allocatable :: g (:,:),  t (:,:),  h (:,:),  hold (:,:)
  COMPLEX(DP), ALLOCATABLE :: gs (:,:), ts (:,:), hs (:,:), hsold (:,:)
  !  the gradient of psi
  !  the preconditioned gradient
  !  the delta gradient
  !  the conjugate gradient
  !  work space
  complex(DP) ::  dcgamma, dcgamma1, dclambda, dclambda1
  !  the ratio between rho
  !  step length
!  REAL(kind=dp), EXTERNAL :: ddot
  REAL(kind=dp), EXTERNAL :: myddot
  REAL(kind=dp), EXTERNAL :: myddotv2
  !  the scalar product
  complex(DP), allocatable :: rho (:), rhoold (:), euc (:), a(:), c(:)
  ! the residue
  ! auxiliary for h_diag
  real(DP) :: kter_eff
  ! account the number of iterations with b
  ! coefficient of quadratic form
  integer, allocatable :: indb(:)

  ! bgrp parallelization auxiliary variables
  INTEGER :: n_start, n_end, my_nbnd
  logical :: lsave_use_bgrp_in_hpsi
  !auxiliary for ddot
  real(DP) :: ddotval, addot, cddot
  !
  call start_clock ('ccgsolve')

  call divide (inter_bgrp_comm,nbnd,n_start,n_end)
  my_nbnd = n_end - n_start + 1

  ! allocate workspace (bgrp distributed)
  allocate ( conv(nbnd) )
  allocate ( g(ndmx*npol,my_nbnd), t(ndmx*npol,my_nbnd), h(ndmx*npol,my_nbnd), &
             hold(ndmx*npol,my_nbnd) )
  allocate ( gs(ndmx*npol,my_nbnd), ts(ndmx*npol,my_nbnd), hs(ndmx*npol,my_nbnd), &
             hsold(ndmx*npol,my_nbnd) )
  allocate ( a(my_nbnd), c(my_nbnd) )
  allocate ( rho(my_nbnd), rhoold(my_nbnd) )
  allocate ( euc(my_nbnd) )
  allocate ( indb(my_nbnd) )
  !      WRITE( stdout,*) g,t,h,hold

  kter_eff = 0.d0 ; conv (1:nbnd) = 0

  ! bgrp parallelization is done outside h_psi/s_psi. set use_bgrp_in_hpsi temporarily to false
  lsave_use_bgrp_in_hpsi = use_bgrp_in_hpsi ; use_bgrp_in_hpsi = .false.
  !$acc enter data create(rho(1:My_nbnd),a(1:my_nbnd),c(1:my_nbnd),euc(1:my_nbnd),t(1:ndmx*npol,1:my_nbnd),ts(1:ndmx*npol,1:my_nbnd),g(1:ndmx*npol,1:my_nbnd),gs(1:ndmx*npol,1:my_nbnd),h(1:ndmx*npol,1:my_nbnd),hs(1:ndmx*npol,1:my_nbnd),hold(1:ndmx*npol,1:my_nbnd),hsold(1:ndmx*npol,1:my_nbnd)) copyin(dpsi(1:ndmx*npol,1:nbnd),evq,h_diag(1:ndmx*npol,1:nbnd),d0psi(1:ndmx*npol,1:nbnd))
  !$acc kernels present(g,t,h,hold,gs,ts,hs,hsold)
  g=(0.d0,0.d0)
  t=(0.d0,0.d0)
  h=(0.d0,0.d0)
  hold=(0.d0,0.d0)
  gs=(0.d0,0.d0)
  ts=(0.d0,0.d0)
  hs=(0.d0,0.d0)
  hsold=(0.d0,0.d0)
  !$acc end kernels
  do ibnd = 1, nbnd
     indb(ibnd) = ibnd
  enddo

  do iter = 1, maxter
     !
     !    compute the gradient. can reuse information from previous step
     !
     if (iter == 1) then
        do ibnd = n_start, n_end
           euc(ibnd) = CMPLX(e(indb(ibnd))+DREAL(freq_c), DIMAG(freq_c))
        ENDDO
        !$acc update device(euc)
        call ch_psi (ndim, dpsi, g, euc, ik, my_nbnd)
        !$acc parallel loop
        do ibnd = n_start, n_end
           ibnd_ = ibnd - n_start + 1
           g(:,ibnd_) = g(:,ibnd_) - d0psi(:,ibnd)
        enddo
        !$acc kernels present(g,gs)
        gs(:,:) = CONJG(g(:,:))
        !$acc end kernels
     endif
     !
     !    compute preconditioned residual vector and convergence check
     !
     lbnd = 0
!!!!     CALL start_clock('loop1')
     do ibnd = n_start, n_end ;  ibnd_ = ibnd - n_start + 1
        if (conv (ibnd) .eq.0) then
           lbnd = lbnd+1
           !$acc kernels
           h (:,ibnd_)= g (:,ibnd_)
           hs(:,ibnd_)= gs(:,ibnd_)
           !$acc end kernels

           call ccg_psi(ndmx, ndim, 1, h(1,ibnd_), h_diag(1,ibnd), 1 )
           call ccg_psi(ndmx, ndim, 1, hs(1,ibnd_), h_diag(1,ibnd), -1 )

           IF (gamma_only) THEN
              !$acc host_data use_device(g,h,rho)
              CALL MYDDOTV3(2*ndmx*npol,h(1,ibnd_),1,g(1,ibnd_),1, rho(lbnd))
              !$acc end host_data
!              rho(lbnd)=2.0d0*ddot(2*ndmx*npol,h(1,ibnd_),1,g(1,ibnd_),1)
              !$acc serial
              rho(lbnd) = rho(lbnd)*2.0d0
              !$acc end serial
              IF(gstart==2) THEN
                 !$acc serial
                 rho(lbnd)=rho(lbnd)-DBLE(h(1,ibnd_))*DBLE(g(1,ibnd_))
                 !$acc end serial
              ENDIF
           ELSE
              !$acc kernels present(hs, g)
              rho(lbnd) = dot_product (hs(:,ibnd_), g(:,ibnd_))
              !$acc end kernels
           ENDIF

        endif
     enddo
!!!     CALL stop_clock('loop1')
     kter_eff = kter_eff + DBLE (lbnd) / DBLE (nbnd)
     !$acc host_data use_device(rho)
     call mp_sum( rho(1:lbnd), intra_bgrp_comm )
     !$acc end host_data
     !$acc update host(rho)
     do ibnd = n_end, n_start, -1 ; ibnd_ = ibnd - n_start + 1
        if (conv(ibnd).eq.0) then
           rho(ibnd_)=rho(lbnd)
           lbnd = lbnd -1
           anorm = sqrt ( abs (rho (ibnd_)) )
           if (anorm.lt.ethr) conv (ibnd) = 1
        endif
     enddo
     !
     conv_root = .true.
     do ibnd = n_start, n_end
        conv_root = conv_root.and. (conv (ibnd) .eq.1)
     enddo
     if (conv_root) goto 100
     !
     !        compute the step direction h. Conjugate it to previous step
     !
     lbnd = 0
!!!     CALL start_clock('loop2')
     do ibnd = n_start, n_end ; ibnd_ = ibnd - n_start + 1
        if (conv (ibnd) .eq.0) then
           !
           ! change sign to h and hs
           !
           !$acc kernels
           h (:,ibnd_)=-h (:,ibnd_)
           hs(:,ibnd_)=-hs(:,ibnd_)
           !$acc end kernels
           if (iter.ne.1) then
              dcgamma = rho (ibnd_) / rhoold (ibnd_)
              dcgamma1 = CONJG(dcgamma)
              !$acc kernels
              h (:,ibnd_) = h (:,ibnd_) + dcgamma * hold(:,ibnd_)
              hs(:,ibnd_) = hs(:,ibnd_) + dcgamma1*hsold(:,ibnd_)
              !$acc end kernels
           endif
           !
           ! here hold is used as auxiliary vector in order to efficiently compute t = A*h
           ! it is later set to the current (becoming old) value of h
           !
           lbnd = lbnd+1
           !$acc kernels
           hold (:,lbnd) = h (:,ibnd_)
           hsold(:,lbnd) = hs(:,ibnd_)
           !$acc end kernels
           indb (lbnd) = ibnd
        endif
     enddo
!!!     CALL stop_clock('loop2')
     !
     !        compute t = A*h and  ts= A^+ * h
     !
     DO ibnd=1,lbnd
        euc(ibnd) = CMPLX(e(indb(ibnd))+DREAL(freq_c), DIMAG(freq_c))
     ENDDO
     !$acc update device(euc)
     !
     !$acc data present(t, ts, hold, hsold, euc)
     call ch_psi (ndim, hold, t, euc, ik, lbnd)
     call ch_psi (ndim, hsold, ts, conjg(euc), ik, lbnd)
     !$acc end data
     !
     !        compute the coefficients a and c for the line minimization
     !        compute step length lambda
     !
     lbnd=0
!!!     CALL start_clock('loop3')
     do ibnd = n_start, n_end ; ibnd_ = ibnd - n_start + 1
        if (conv (ibnd) .eq.0) then
           lbnd=lbnd+1
           IF (gamma_only) THEN
              CALL MYDDOTV3(2*ndmx*npol,hs(1,ibnd_),1,g(1,ibnd_),1,a(lbnd))
              CALL MYDDOTV3(2*ndmx*npol,hs(1,ibnd_),1,t(1,lbnd),1,c(lbnd))
              !$acc kernels
              a(lbnd) = a(lbnd)*2.0d0
              c(lbnd) = c(lbnd)*2.0d0
              !$acc end kernels
!              a(lbnd) = 2.0d0*ddot(2*ndmx*npol,hs(1,ibnd_),1,g(1,ibnd_),1)
!              c(lbnd) = 2.0d0*ddot(2*ndmx*npol,hs(1,ibnd_),1,t(1,lbnd),1)
              IF (gstart == 2) THEN
                 !$acc kernels
                 a(lbnd)=a(lbnd)-DBLE(hs(1,ibnd_))*DBLE(g(1,ibnd_))
                 c(lbnd)=c(lbnd)-DBLE(hs(1,ibnd_))*DBLE(t(1,lbnd))
                 !$acc end kernels
              ENDIF
           ELSE
              !$acc kernels present(hs,g,t)
              a(lbnd) = dot_product (hs(:,ibnd_), g(:,ibnd_))
              c(lbnd) = dot_product (hs(:,ibnd_), t(:,lbnd))
              !$acc end kernels
           ENDIF
        end if
     end do
!!!!     CALL stop_clock('loop3')
     !
     !$acc host_data use_device(a,c)
     call mp_sum(  a(1:lbnd), intra_bgrp_comm )
     call mp_sum(  c(1:lbnd), intra_bgrp_comm )
     !$acc end host_data
     !$acc update host(a,c)
     lbnd=0
!!     CALL start_clock('loop4')
     do ibnd = n_start, n_end ; ibnd_ = ibnd - n_start + 1
        if (conv (ibnd) .eq.0) then
           lbnd=lbnd+1
           dclambda = - a(lbnd) / c(lbnd)
           dclambda1 = CONJG(dclambda)
           !
           !    move to new position
           !
           !$acc kernels
           dpsi(:,ibnd) = dpsi(:,ibnd) + dclambda*h(:,ibnd_)
           !$acc end kernels
           !
           !    update to get the gradient
           !
           !g=g+lam
           !$acc data present(t,g,ts,gs)
           !$acc kernels
           g (:,ibnd_) = g (:,ibnd_) + dclambda *t (:,lbnd)
           gs(:,ibnd_) = gs(:,ibnd_) + dclambda1*ts(:,lbnd)
           !$acc end kernels
           !$acc end data
           !
           !    save current (now old) h and rho for later use
           !
           !$acc kernels
           hold (:,ibnd_) = h (:,ibnd_)
           hsold(:,ibnd_) = hs(:,ibnd_)
           !$acc end kernels
           rhoold (ibnd_) = rho (ibnd_)
        endif
     enddo
!     CALL stop_clock('loop4')
  enddo

100 continue
  !
  !$acc exit data delete(rho,evq,a,c,g,gs,h,hs,h_diag,d0psi,hold,hsold,t,ts,euc) copyout(dpsi)
  ! deallocate workspace not needed anymore
  deallocate (rho, rhoold) ; deallocate (a,c) ; deallocate (g, t, h, hold)
  deallocate (gs, ts, hs, hsold, indb)
  deallocate (euc)

  ! wait for all bgrp to complete their task
  CALL mp_barrier( inter_bgrp_comm )

  ! check if all root converged across all bgrp
  call mp_sum( conv, inter_bgrp_comm )
  conv_root = .true.
  do ibnd = 1, nbnd
     conv_root = conv_root.and. (conv (ibnd) .eq.1)
  enddo
  deallocate (conv)

  ! collect the result
  if (n_start > 1 ) dpsi(:, 1:n_start-1) = (0.d0,0.d0) ; if (n_end < nbnd) dpsi(:, n_end+1:nbnd) = (0.d0,0.d0)
  call mp_sum( dpsi, inter_bgrp_comm )

  call mp_sum( kter_eff, inter_bgrp_comm )
  kter = kter_eff

  ! restore the value of use_bgrp_in_hpsi to its saved value
  use_bgrp_in_hpsi = lsave_use_bgrp_in_hpsi


  call stop_clock ('ccgsolve')
  return
end subroutine ccgsolve_all