quantum-espresso/upflib/uspp.f90

!
! Copyright (C) 2004-2021 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!

#if defined(__CUDA)
#define PINMEM ,PINNED
#else
#define PINMEM
#endif
!
!
MODULE uspp
  !
  ! Ultrasoft PPs:
  ! - Clebsch-Gordan coefficients "ap", auxiliary variables "lpx", "lpl"
  ! - beta and q functions of the solid
  !
  USE upf_kinds,   ONLY: DP
  USE upf_params,  ONLY: lmaxx, lqmax
  USE upf_spinorb, ONLY: is_spinorbit, fcoef, fcoef_d
  IMPLICIT NONE
  PRIVATE
  SAVE
  !
  PUBLIC :: nlx, lpx, lpl, ap, aainit, indv, nhtol, nhtolm, ofsbeta, &
            nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, &
            becsum, ebecsum
  PUBLIC :: lpx_d, lpl_d, ap_d, indv_d, nhtol_d, nhtolm_d, ofsbeta_d, &
            dvan_d, deeq_d, qq_at_d, qq_nt_d, nhtoj_d, ijtoh_d, &
            becsum_d, ebecsum_d
  PUBLIC :: okvan, nlcc_any
  PUBLIC :: qq_so,   dvan_so,   deeq_nc,   fcoef
  PUBLIC :: qq_so_d, dvan_so_d, deeq_nc_d, fcoef_d
  PUBLIC :: dbeta
  !
  PUBLIC :: allocate_uspp, deallocate_uspp
  !
  ! Vars
  !
  INTEGER, PARAMETER :: &
       nlx  = (lmaxx+1)**2, &! maximum number of combined angular momentum
       mx   = 2*lqmax-1      ! maximum magnetic angular momentum of Q
  !
  INTEGER ::             &! for each pair of combined momenta lm(1),lm(2):
       lpx(nlx,nlx),     &! maximum combined angular momentum LM
       lpl(nlx,nlx,mx)    ! list of combined angular momenta  LM
  REAL(DP) :: ap(lqmax*lqmax,nlx,nlx)
                          ! Clebsch-Gordan coefficients for spherical harmonics
  ! GPU vars
  INTEGER, ALLOCATABLE ::  & ! for each pair of combined momenta lm(1),lm(2):
       lpx_d(:,:),         & ! maximum combined angular momentum LM
       lpl_d(:,:,:)          ! list of combined angular momenta  LM
  REAL(DP), ALLOCATABLE :: ap_d(:,:,:)
#if defined (__CUDA)
  attributes(DEVICE) :: lpx_d, lpl_d, ap_d
#endif
  !
  !
  INTEGER :: nkb,        &! total number of beta functions, with struct.fact.
             nkbus        ! as above, for US-PP only
  !
  INTEGER, ALLOCATABLE PINMEM ::&
       indv(:,:),        &! indes linking  atomic beta's to beta's in the solid
       nhtol(:,:),       &! correspondence n <-> angular momentum l
       nhtolm(:,:),      &! correspondence n <-> combined lm index for (l,m)
       ijtoh(:,:,:),     &! correspondence beta indexes ih,jh -> composite index ijh
       ofsbeta(:)      ! first beta (index in the solid) for each atom
  !
  ! GPU vars
  !
  INTEGER, ALLOCATABLE :: indv_d(:,:)
  INTEGER, ALLOCATABLE :: nhtol_d(:,:)
  INTEGER, ALLOCATABLE :: nhtolm_d(:,:)
  INTEGER, ALLOCATABLE :: ijtoh_d(:,:,:)
  INTEGER, ALLOCATABLE :: ofsbeta_d(:)
#if defined (__CUDA)
  attributes(DEVICE) :: indv_d, nhtol_d, nhtolm_d, ijtoh_d, ofsbeta_d
#endif

  LOGICAL :: &
       okvan = .FALSE.,&  ! if .TRUE. at least one pseudo is Vanderbilt
       nlcc_any=.FALSE.   ! if .TRUE. at least one pseudo has core corrections
  !
  !FIXME use !$acc declare create(vkb) to create and delete it automatically in the device
  !           be carefull cp still uses  vkb_d for device
  COMPLEX(DP), ALLOCATABLE, TARGET PINMEM :: &
       vkb(:,:)                ! all beta functions in reciprocal space
  REAL(DP), ALLOCATABLE :: &
       becsum(:,:,:)           ! \sum_i f(i) <psi(i)|beta_l><beta_m|psi(i)>
  REAL(DP), ALLOCATABLE :: &
       ebecsum(:,:,:)          ! \sum_i f(i) et(i) <psi(i)|beta_l><beta_m|psi(i)>
  REAL(DP), ALLOCATABLE PINMEM :: &
       dvan(:,:,:),           &! the D functions of the solid
       deeq(:,:,:,:),         &! the integral of V_eff and Q_{nm}
       qq_nt(:,:,:),          &! the integral of q functions in the solid (ONE PER NTYP) used to be the qq array
       qq_at(:,:,:),          &! the integral of q functions in the solid (ONE PER ATOM !!!!)
       nhtoj(:,:)              ! correspondence n <-> total angular momentum
  !
  COMPLEX(DP), ALLOCATABLE :: & ! variables for spin-orbit/noncolinear case:
       qq_so(:,:,:,:),           &! Q_{nm}
       dvan_so(:,:,:,:),         &! D_{nm}
       deeq_nc(:,:,:,:)           ! \int V_{eff}(r) Q_{nm}(r) dr
  !
  ! GPU vars
  !
  REAL(DP),    ALLOCATABLE :: becsum_d(:,:,:)
  REAL(DP),    ALLOCATABLE :: ebecsum_d(:,:,:)
  REAL(DP),    ALLOCATABLE :: dvan_d(:,:,:)
  REAL(DP),    ALLOCATABLE :: deeq_d(:,:,:,:)
  REAL(DP),    ALLOCATABLE :: qq_nt_d(:,:,:)
  REAL(DP),    ALLOCATABLE :: qq_at_d(:,:,:)
  REAL(DP),    ALLOCATABLE :: nhtoj_d(:,:)
  COMPLEX(DP), ALLOCATABLE :: qq_so_d(:,:,:,:)
  COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:)
  COMPLEX(DP), ALLOCATABLE :: deeq_nc_d(:,:,:,:)
#if defined(__CUDA)
  attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, deeq_d, qq_nt_d, &
                         qq_at_d, nhtoj_d, qq_so_d, dvan_so_d, deeq_nc_d
#endif

  !
  ! spin-orbit coupling: qq and dvan are complex, qq has additional spin index
  ! noncolinear magnetism: deeq is complex (even in absence of spin-orbit)
  !
  REAL(DP), ALLOCATABLE PINMEM :: &
       beta(:,:,:)           ! beta functions for CP (without struct.factor)
  REAL(DP), ALLOCATABLE PINMEM :: &
       dbeta(:,:,:,:,:)      ! derivative of beta functions w.r.t. cell for CP (without struct.factor)
  !
CONTAINS
  !
  !-----------------------------------------------------------------------
  subroutine aainit(lli)
    !-----------------------------------------------------------------------
    !
    ! this routine computes the coefficients of the expansion of the product
    ! of two real spherical harmonics into real spherical harmonics.
    !
    !     Y_limi(r) * Y_ljmj(r) = \sum_LM  ap(LM,limi,ljmj)  Y_LM(r)
    !
    ! On output:
    ! ap     the expansion coefficients
    ! lpx    for each input limi,ljmj is the number of LM in the sum
    ! lpl    for each input limi,ljmj points to the allowed LM
    !
    ! The indices limi,ljmj and LM assume the order for real spherical
    ! harmonics given in routine ylmr2
    !
    USE upf_invmat
    implicit none
    !
    ! input: the maximum li considered
    !
    integer :: lli
    !
    ! local variables
    !
    integer :: llx, l, li, lj
    real(DP) , allocatable :: r(:,:), rr(:), ylm(:,:), mly(:,:)
    ! an array of random vectors: r(3,llx)
    ! the norm of r: rr(llx)
    ! the real spherical harmonics for array r: ylm(llx,llx)
    ! the inverse of ylm considered as a matrix: mly(llx,llx)
    !
    if (lli < 0) call upf_error('aainit','lli not allowed',lli)

    if (lli*lli > nlx) call upf_error('aainit','nlx is too small ',lli*lli)

    llx = (2*lli-1)**2
    if (2*lli-1 > lqmax) &
         call upf_error('aainit','ap leading dimension is too small',llx)

    allocate (r( 3, llx ))
    allocate (rr( llx ))
    allocate (ylm( llx, llx ))
    allocate (mly( llx, llx ))

    r(:,:)   = 0.0_DP
    ylm(:,:) = 0.0_DP
    mly(:,:) = 0.0_DP
    ap(:,:,:)= 0.0_DP

    ! - generate an array of random vectors (uniform deviate on unitary sphere)

    call gen_rndm_r(llx,r,rr)

    ! - generate the real spherical harmonics for the array: ylm(ir,lm)

    call ylmr2(llx,llx,r,rr,ylm)

    !-  store the inverse of ylm(ir,lm) in mly(lm,ir)

    call invmat(llx, ylm, mly)

    !-  for each li,lj compute ap(l,li,lj) and the indices, lpx and lpl
    do li = 1, lli*lli
       do lj = 1, lli*lli
          lpx(li,lj)=0
          do l = 1, llx
             ap(l,li,lj) = compute_ap(l,li,lj,llx,ylm,mly)
             if (abs(ap(l,li,lj)) > 1.d-3) then
                lpx(li,lj) = lpx(li,lj) + 1
                if (lpx(li,lj) > mx) &
                     call upf_error('aainit','mx dimension too small', lpx(li,lj))
                lpl(li,lj,lpx(li,lj)) = l
             end if
          end do
       end do
    end do
    !
    deallocate(mly)
    deallocate(ylm)
    deallocate(rr)
    deallocate(r)
    !
#if defined (__CUDA)
    IF (ALLOCATED(ap_d)) DEALLOCATE(ap_d)
    ALLOCATE(ap_d, SOURCE=ap)
    IF (ALLOCATED(lpx_d)) DEALLOCATE(lpx_d)
    ALLOCATE(lpx_d, SOURCE=lpx)
    IF (ALLOCATED(lpl_d)) DEALLOCATE(lpl_d)
    ALLOCATE(lpl_d, SOURCE=lpl)
#endif
    return
  end subroutine aainit
  !
  !-----------------------------------------------------------------------
  subroutine gen_rndm_r(llx,r,rr)
    !-----------------------------------------------------------------------
    ! - generate an array of random vectors (uniform deviate on unitary sphere)
    !
    USE upf_const,  ONLY: tpi
    implicit none
    !
    ! first the I/O variables
    !
    integer :: llx   ! input: the dimension of r and rr
    real(DP) :: &
         r(3,llx),  &! output: an array of random vectors
         rr(llx)     ! output: the norm of r
    !
    ! here the local variables
    !
    integer  :: ir
    real(DP) :: costheta, sintheta, phi

    do ir = 1, llx
       costheta = 2.0_DP * randy() - 1.0_DP
       sintheta = SQRT ( 1.0_DP - costheta*costheta)
       phi = tpi * randy()
       r (1,ir) = sintheta * cos(phi)
       r (2,ir) = sintheta * sin(phi)
       r (3,ir) = costheta
       rr(ir)   = 1.0_DP
    end do

    return
  end subroutine gen_rndm_r
!
!------------------------------------------------------------------------
   FUNCTION randy ( irand )
      !------------------------------------------------------------------------
      !
      ! x=randy(n): reseed with initial seed idum=n ( 0 <= n <= ic, see below)
      !             if randy is not explicitly initialized, it will be
      !             initialized with seed idum=0 the first time it is called
      ! x=randy() : generate uniform real(DP) numbers x in [0,1]
      !
      use upf_kinds, only : DP
      implicit none
      !
      REAL(DP) :: randy
      INTEGER, optional    :: irand
      !
      INTEGER , PARAMETER  :: m    = 714025, &
                              ia   = 1366, &
                              ic   = 150889, &
                              ntab = 97
      REAL(DP), PARAMETER  :: rm = 1.0_DP / m
      INTEGER              :: j
      INTEGER, SAVE        :: ir(ntab), iy, idum=0
      LOGICAL, SAVE        :: first=.true.
      !
      IF ( present(irand) ) THEN
         idum = MIN( ABS(irand), ic)
         first=.true.
      END IF

      IF ( first ) THEN
         !
         first = .false.
         idum = MOD( ic - idum, m )
         !
         DO j=1,ntab
            idum=mod(ia*idum+ic,m)
            ir(j)=idum
         END DO
         idum=mod(ia*idum+ic,m)
         iy=idum
      END IF
      j=1+(ntab*iy)/m
      IF( j > ntab .OR. j <  1 ) call upf_error('randy','j out of range',ABS(j)+1)
      iy=ir(j)
      randy=iy*rm
      idum=mod(ia*idum+ic,m)
      ir(j)=idum
      !
      RETURN
      !
   END FUNCTION randy
  !
  !-----------------------------------------------------------------------
  function compute_ap(l,li,lj,llx,ylm,mly)
    !-----------------------------------------------------------------------
    !-  given an l and a li,lj pair compute ap(l,li,lj)
    implicit none
    !
    ! first the I/O variables
    !
    integer :: &
         llx,         &! the dimension of ylm and mly
         l,li,lj       ! the arguments of the array ap

    real(DP) :: &
         compute_ap,  &! this function
         ylm(llx,llx),&! the real spherical harmonics for array r
         mly(llx,llx)  ! the inverse of ylm considered as a matrix
    !
    ! here the local variables
    !
    integer :: ir

    compute_ap = 0.0_DP
    do ir = 1,llx
       compute_ap = compute_ap + mly(l,ir)*ylm(ir,li)*ylm(ir,lj)
    end do

    return
  end function compute_ap
  !
  !-----------------------------------------------------------------------
  subroutine allocate_uspp(use_gpu,noncolin,lspinorb,tqr,nhm,nsp,nat,nspin)
    !-----------------------------------------------------------------------
    implicit none
    logical, intent(in) :: use_gpu
    logical, intent(in) :: noncolin,lspinorb,tqr
    integer, intent(in) :: nhm,nsp,nat,nspin
    !
    allocate( indv(nhm,nsp)   )
    allocate( nhtol(nhm,nsp)  )
    allocate( nhtolm(nhm,nsp) )
    allocate( nhtoj(nhm,nsp)  )
    allocate( ijtoh(nhm,nhm,nsp) )
    allocate( deeq(nhm,nhm,nat,nspin) )
    if ( noncolin ) then
       allocate( deeq_nc(nhm,nhm,nat,nspin) )
    endif
    allocate( qq_at(nhm,nhm,nat) )
    allocate( qq_nt(nhm,nhm,nsp) )
    ! set the internal spin-orbit flag
    is_spinorbit = lspinorb
    if ( lspinorb ) then
       allocate( qq_so(nhm,nhm,4,nsp) )
       allocate( dvan_so(nhm,nhm,nspin,nsp) )
       allocate( fcoef(nhm,nhm,2,2,nsp) )
    else
       allocate( dvan(nhm,nhm,nsp) )
    endif
    allocate(becsum( nhm*(nhm+1)/2, nat, nspin))
    if (tqr) then
       allocate(ebecsum( nhm*(nhm+1)/2, nat, nspin))
    endif
    allocate( ofsbeta(nat) )
    !
    ! GPU-vars (protecting zero-size allocations)
    !
    if (use_gpu) then
      !
      if (nhm>0) then
        allocate( indv_d(nhm,nsp)   )
        allocate( nhtol_d(nhm,nsp)  )
        allocate( nhtolm_d(nhm,nsp) )
        allocate( nhtoj_d(nhm,nsp)  )
        allocate( ijtoh_d(nhm,nhm,nsp) )
        allocate( deeq_d(nhm,nhm,nat,nspin) )
        if ( noncolin ) then
           allocate( deeq_nc_d(nhm,nhm,nat,nspin) )
        endif
        allocate( qq_at_d(nhm,nhm,nat) )
        allocate( qq_nt_d(nhm,nhm,nsp) )
        if ( lspinorb ) then
           allocate( qq_so_d(nhm,nhm,4,nsp) )
           allocate( dvan_so_d(nhm,nhm,nspin,nsp) )
           allocate( fcoef_d(nhm,nhm,2,2,nsp) )
        else
           allocate( dvan_d(nhm,nhm,nsp) )
        endif
        allocate(becsum_d( nhm*(nhm+1)/2, nat, nspin))
        if (tqr) then
           allocate(ebecsum_d( nhm*(nhm+1)/2, nat, nspin))
        endif
        !
      endif
      allocate( ofsbeta_d(nat) )
      !
    endif
    !
  end subroutine allocate_uspp
  !
  !-----------------------------------------------------------------------
  SUBROUTINE deallocate_uspp()
    !-----------------------------------------------------------------------
    IMPLICIT NONE
    IF( ALLOCATED( nhtol ) )      DEALLOCATE( nhtol )
    IF( ALLOCATED( indv ) )       DEALLOCATE( indv )
    IF( ALLOCATED( nhtolm ) )     DEALLOCATE( nhtolm )
    IF( ALLOCATED( nhtoj ) )      DEALLOCATE( nhtoj )
    IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta )
    IF( ALLOCATED( ijtoh ) )      DEALLOCATE( ijtoh )
!FIXME in order to be created and deleted automatically by using !$acc declare create(vkb) in
    IF( ALLOCATED( vkb ) ) THEN
!$acc exit data delete(vkb )
        DEALLOCATE( vkb )
    END IF
    IF( ALLOCATED( becsum ) )     DEALLOCATE( becsum )
    IF( ALLOCATED( ebecsum ) )    DEALLOCATE( ebecsum )
    IF( ALLOCATED( qq_at ) )      DEALLOCATE( qq_at )
    IF( ALLOCATED( qq_nt ) )      DEALLOCATE( qq_nt )
    IF( ALLOCATED( dvan ) )       DEALLOCATE( dvan )
    IF( ALLOCATED( deeq ) )       DEALLOCATE( deeq )
    IF( ALLOCATED( qq_so ) )      DEALLOCATE( qq_so )
    IF( ALLOCATED( dvan_so ) )    DEALLOCATE( dvan_so )
    IF( ALLOCATED( deeq_nc ) )    DEALLOCATE( deeq_nc )
    IF( ALLOCATED( fcoef ) )      DEALLOCATE( fcoef )
    IF( ALLOCATED( beta ) )       DEALLOCATE( beta )
    IF( ALLOCATED( dbeta ) )      DEALLOCATE( dbeta )
    !
    ! GPU variables
    IF( ALLOCATED( ap_d ) )       DEALLOCATE( ap_d )
    IF( ALLOCATED( lpx_d ) )      DEALLOCATE( lpx_d )
    IF( ALLOCATED( lpl_d ) )      DEALLOCATE( lpl_d )
    IF( ALLOCATED( indv_d ) )     DEALLOCATE( indv_d )
    IF( ALLOCATED( nhtol_d ) )    DEALLOCATE( nhtol_d )
    IF( ALLOCATED( nhtolm_d ) )   DEALLOCATE( nhtolm_d )
    IF( ALLOCATED( ijtoh_d ) )    DEALLOCATE( ijtoh_d )
    IF( ALLOCATED( ofsbeta_d)) DEALLOCATE( ofsbeta_d )
    IF( ALLOCATED( becsum_d ) )   DEALLOCATE( becsum_d )
    IF( ALLOCATED( ebecsum_d ) )  DEALLOCATE( ebecsum_d )
    IF( ALLOCATED( dvan_d ) )     DEALLOCATE( dvan_d )
    IF( ALLOCATED( deeq_d ) )     DEALLOCATE( deeq_d )
    IF( ALLOCATED( qq_nt_d ) )    DEALLOCATE( qq_nt_d )
    IF( ALLOCATED( qq_at_d ) )    DEALLOCATE( qq_at_d )
    IF( ALLOCATED( nhtoj_d ) )    DEALLOCATE( nhtoj_d )
    IF( ALLOCATED( qq_so_d ) )    DEALLOCATE( qq_so_d )
    IF( ALLOCATED( dvan_so_d ) )  DEALLOCATE( dvan_so_d )
    IF( ALLOCATED( deeq_nc_d ) )  DEALLOCATE( deeq_nc_d )
    IF( ALLOCATED( fcoef_d ) )    DEALLOCATE( fcoef_d )
    !
  END SUBROUTINE deallocate_uspp
  !
  ! In the following, allocations are not checked and assume
  ! to be dimensioned correctly.
  !
  ! intento is used to specify what the variable will  be used for :
  !  0 -> in , the variable needs to be synchronized but won't be changed
  !  1 -> inout , the variable needs to be synchronized AND will be changed
  !  2 -> out , NO NEED to synchronize the variable, everything will be overwritten
  !
END MODULE uspp