[skip-CI] dylmr2_gpu and dylmr2 merged

Note that ylmr2 now calls ylmr2_gpu if __CUDA
This commit is contained in:
Paolo Giannozzi 2024-01-11 14:06:18 +01:00
parent 7001de08d6
commit c91a55416b
7 changed files with 96 additions and 203 deletions

View File

@ -138,12 +138,7 @@ SUBROUTINE addusstress_g( sigmanlc )
!
DO ipol = 1, 3
!
#if defined(__CUDA)
CALL dylmr2_acc( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
#else
CALL dylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
!$acc update device(dylmk0)
#endif
!
DO nt = 1, ntyp
!

View File

@ -52,8 +52,7 @@ set(src_upflib
ylmr2.f90
dylmr2.f90
# GPU
ylmr2_gpu.f90
dylmr2_gpu.f90)
ylmr2_gpu.f90)
qe_enable_cuda_fortran("${src_upflib}")
qe_add_library(qe_xml xmltools.f90 dom.f90 wxml.f90)

View File

@ -68,10 +68,9 @@ ylmr2.o
# GPU versions of routines
OBJS_GPU= \
dylmr2_gpu.o \
ylmr2_gpu.o
OBJS_NODEP+= dom.o wxml.o
OBJS_NODEP+= $(OBJS_GPU) dom.o wxml.o
all : libupf.a virtual_v2.x upfconv.x casino2upf.x

View File

@ -1,96 +1,106 @@
!
! Copyright (C) 2001 PWSCF group
! Copyright (C) 2024 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------------------------------------
subroutine dylmr2 (nylm, ngy, g, gg, dylm, ipol)
SUBROUTINE dylmr2( nylm, ngy, g, gg, dylm, ipol )
!-----------------------------------------------------------------------
!! Compute \partial Y_lm(G) \over \partial (G)_ipol
!! using simple numerical derivation (SdG)
!! The spherical harmonics are calculated in ylmr2
!! using simple numerical derivation (SdG).
!! The spherical harmonics are calculated in ylmr2.
!! ACC-enabled version
!
USE upf_kinds, ONLY : DP
implicit none
!
integer :: nylm
!! input: number of spherical harmonics
integer :: ngy
!! input: the number of g vectors to compute
integer :: ipol
!! input: desired polarization
real(DP) :: g(3,ngy)
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nylm
! input: number of spherical harmonics
INTEGER, INTENT(IN) :: ngy
! input: the number of g vectors to compute
INTEGER, INTENT(IN) :: ipol
! input: desired polarization
REAL(DP), INTENT(IN) :: g(3,ngy)
!! input: the coordinates of g vectors
real(DP) :: gg (ngy)
REAL(DP), INTENT(IN) :: gg(ngy)
!! input: the moduli of g vectors
real(DP) :: dylm (ngy, nylm)
REAL(DP), INTENT(OUT) :: dylm(ngy,nylm)
!! output: the spherical harmonics derivatives
!
! ... local variables
!
integer :: ig, lm
INTEGER :: ig, lm, i
! counter on g vectors
! counter on l,m component
real(DP), parameter :: delta = 1.d-6
real(DP), allocatable :: dg (:), dgi (:), gx (:,:), ggx (:), ylmaux (:,:)
!
INTEGER :: apol, bpol
!
REAL(DP), PARAMETER :: delta = 1.e-6_dp, eps = 1.e-9_dp
REAL(DP), ALLOCATABLE :: dg(:), gx(:,:), ggx(:), ylmaux(:,:)
! dg is the finite increment for numerical derivation:
! dg = delta |G| = delta * sqrt(gg)
! dgi= 1 /(delta * sqrt(gg))
! dg = delta |G| = delta * sqrt(gg) (later overwritten by 1/dg)
! gx = g +/- dg
! ggx = gx^2
!
allocate ( gx(3,ngy), ggx(ngy), dg(ngy), dgi(ngy), ylmaux(ngy,nylm) )
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
do ig = 1, ngy
dg (ig) = delta * sqrt (gg (ig) )
if (gg (ig) .gt. 1.d-9) then
dgi (ig) = 1.d0 / dg (ig)
else
dgi (ig) = 0.d0
endif
enddo
!$OMP END PARALLEL DO
call dcopy (3 * ngy, g, 1, gx, 1)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
do ig = 1, ngy
gx (ipol, ig) = g (ipol, ig) + dg (ig)
ggx (ig) = gx (1, ig) * gx (1, ig) + &
gx (2, ig) * gx (2, ig) + &
gx (3, ig) * gx (3, ig)
enddo
!$OMP END PARALLEL DO
call ylmr2 (nylm, ngy, gx, ggx, dylm)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
do ig = 1, ngy
gx (ipol, ig) = g (ipol, ig) - dg (ig)
ggx (ig) = gx (1, ig) * gx (1, ig) + &
gx (2, ig) * gx (2, ig) + &
gx (3, ig) * gx (3, ig)
enddo
!$OMP END PARALLEL DO
call ylmr2 (nylm, ngy, gx, ggx, ylmaux)
call daxpy (ngy * nylm, - 1.d0, ylmaux, 1, dylm, 1)
do lm = 1, nylm
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
do ig = 1, ngy
dylm (ig, lm) = dylm (ig, lm) * 0.5d0 * dgi (ig)
enddo
!$OMP END PARALLEL DO
enddo
deallocate ( gx, ggx, dg, dgi, ylmaux )
return
end subroutine dylmr2
IF ( ipol==1 ) THEN
apol = 2 ; bpol = 3
ELSEIF ( ipol==2 ) THEN
apol = 1 ; bpol = 3
ELSEIF ( ipol==3 ) THEN
apol = 1 ; bpol = 2
ENDIF
!
ALLOCATE( gx(3,ngy), ggx(ngy), dg(ngy), ylmaux(ngy,nylm) )
!$acc data create(gx, ggx, dg, ylmaux) &
!$acc present_or_copyin(g, gg) present_or_copyout(dylm)
!
!$acc parallel loop
DO ig = 1, ngy
dg(ig) = delta * SQRT(gg(ig) )
ENDDO
!$acc parallel loop
DO ig = 1, ngy
gx(apol,ig) = g(apol,ig)
gx(bpol,ig) = g(bpol,ig)
gx(ipol,ig) = g(ipol,ig) + dg(ig)
ggx(ig) = gx(1,ig) * gx(1,ig) + &
gx(2,ig) * gx(2,ig) + &
gx(3,ig) * gx(3,ig)
ENDDO
!
CALL ylmr2( nylm, ngy, gx, ggx, dylm )
!
!$acc parallel loop
DO ig = 1, ngy
gx(ipol,ig) = g(ipol,ig) - dg(ig)
ggx(ig) = gx(1,ig) * gx(1,ig) + &
gx(2,ig) * gx(2,ig) + &
gx(3,ig) * gx(3,ig)
ENDDO
!
CALL ylmr2( nylm, ngy, gx, ggx, ylmaux )
!
!$acc parallel loop
DO ig = 1, ngy
IF (gg(ig) > eps) THEN
dg(ig) = 1.0_dp / dg(ig)
ELSE
dg(ig) = 0.0_dp
ENDIF
ENDDO
!$acc parallel loop collapse(2)
DO lm = 1, nylm
DO ig = 1, ngy
dylm(ig,lm) = (dylm(ig,lm)-ylmaux(ig,lm)) * 0.5_dp * dg(ig)
ENDDO
ENDDO
!
!$acc end data
DEALLOCATE( gx, ggx, dg, ylmaux )
!
RETURN
!
END SUBROUTINE dylmr2

View File

@ -1,111 +0,0 @@
!
! Copyright (C) 2024 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------------------------------------
SUBROUTINE dylmr2_acc( nylm, ngy, g, gg, dylm, ipol )
!-----------------------------------------------------------------------
!! Compute \partial Y_lm(G) \over \partial (G)_ipol
!! using simple numerical derivation (SdG).
!! The spherical harmonics are calculated in ylmr2.
!! ACC-enabled version
!
USE upf_kinds, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nylm
! input: number of spherical harmonics
INTEGER, INTENT(IN) :: ngy
! input: the number of g vectors to compute
INTEGER, INTENT(IN) :: ipol
! input: desired polarization
REAL(DP), INTENT(IN) :: g(3,ngy)
!! input: the coordinates of g vectors
REAL(DP), INTENT(IN) :: gg(ngy)
!! input: the moduli of g vectors
REAL(DP), INTENT(OUT) :: dylm(ngy,nylm)
!! output: the spherical harmonics derivatives
!
! ... local variables
!
INTEGER :: ig, lm, i
! counter on g vectors
! counter on l,m component
!
INTEGER :: apol, bpol
!
REAL(DP), PARAMETER :: delta = 1.e-6_dp, eps = 1.e-9_dp
REAL(DP), ALLOCATABLE :: dg(:), gx(:,:), ggx(:), ylmaux(:,:)
! dg is the finite increment for numerical derivation:
! dg = delta |G| = delta * sqrt(gg) (later overwritten by 1/dg)
! gx = g +/- dg
! ggx = gx^2
!
IF ( ipol==1 ) THEN
apol = 2 ; bpol = 3
ELSEIF ( ipol==2 ) THEN
apol = 1 ; bpol = 3
ELSEIF ( ipol==3 ) THEN
apol = 1 ; bpol = 2
ENDIF
!
ALLOCATE( gx(3,ngy), ggx(ngy), dg(ngy), ylmaux(ngy,nylm) )
!$acc data create(gx, ggx, dg, ylmaux) &
!$acc present_or_copyin(g, gg) present_or_copyout(dylm)
!
!$acc parallel loop
DO ig = 1, ngy
dg(ig) = delta * SQRT(gg(ig) )
ENDDO
!$acc parallel loop
DO ig = 1, ngy
gx(apol,ig) = g(apol,ig)
gx(bpol,ig) = g(bpol,ig)
gx(ipol,ig) = g(ipol,ig) + dg(ig)
ggx(ig) = gx(1,ig) * gx(1,ig) + &
gx(2,ig) * gx(2,ig) + &
gx(3,ig) * gx(3,ig)
ENDDO
!
!$acc host_data use_device (gx,ggx,dylm)
CALL ylmr2_gpu( nylm, ngy, gx, ggx, dylm )
!$acc end host_data
!
!$acc parallel loop
DO ig = 1, ngy
gx(ipol,ig) = g(ipol,ig) - dg(ig)
ggx(ig) = gx(1,ig) * gx(1,ig) + &
gx(2,ig) * gx(2,ig) + &
gx(3,ig) * gx(3,ig)
ENDDO
!
!$acc host_data use_device (gx,ggx,ylmaux)
CALL ylmr2_gpu( nylm, ngy, gx, ggx, ylmaux )
!$acc end host_data
!
!$acc parallel loop
DO ig = 1, ngy
IF (gg(ig) > eps) THEN
dg(ig) = 1.0_dp / dg(ig)
ELSE
dg(ig) = 0.0_dp
ENDIF
ENDDO
!$acc parallel loop collapse(2)
DO lm = 1, nylm
DO ig = 1, ngy
dylm(ig,lm) = (dylm(ig,lm)-ylmaux(ig,lm)) * 0.5_dp * dg(ig)
ENDDO
ENDDO
!
!$acc end data
DEALLOCATE( gx, ggx, dg, ylmaux )
!
RETURN
!
END SUBROUTINE dylmr2_acc

View File

@ -111,17 +111,9 @@ SUBROUTINE gen_us_dy_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
ALLOCATE( dylm(npw,(lmaxkb+1)**2,3) )
!$acc data create( dylm )
!
#if defined(__CUDA)
DO ipol = 1, 3
CALL dylmr2_acc( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
ENDDO
#else
!$acc update self( gk, q )
DO ipol = 1, 3
CALL dylmr2( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
ENDDO
!$acc update device( dylm )
#endif
!
u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3)
!

View File

@ -1,5 +1,5 @@
!
! Copyright (C) 2001-2021 Quantum ESPRESSO group
! Copyright (C) 2001-2024 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@ -14,7 +14,7 @@ subroutine ylmr2 (lmax2, ng, g, gg, ylm)
! Numerical recursive algorithm based on the one given in Numerical
! Recipes but avoiding the calculation of factorials that generate
! overflow for lmax > 11
! Last modified May 2nd, 2021, by PG
! Last modified Jan. 2024, by PG: calls CUF version if __CUDA
!
USE upf_kinds, ONLY : DP
USE upf_const, ONLY : pi, fpi
@ -39,10 +39,18 @@ subroutine ylmr2 (lmax2, ng, g, gg, ylm)
if (ng < 1 .or. lmax2 < 1) return
do lmax = 0, 25
if ((lmax+1)**2 == lmax2) go to 10
if ((lmax+1)**2 > lmax2) exit
end do
call upf_error (' ylmr', 'l > 25 or wrong number of Ylm required',lmax2)
call upf_error (' ylmr2', 'wrong number of Ylm required',lmax2)
10 continue
!
#if defined(__CUDA)
!$acc data present_or_copyout(ylm) present_or_copyin(g, gg)
!$acc host_data use_device(g,gg,ylm)
call ylmr2_gpu(lmax2, ng, g, gg, ylm)
!$acc end host_data
!$acc end data
#else
if (lmax == 0) then
ylm(:,1) = sqrt (1.d0 / fpi)
return
@ -124,6 +132,7 @@ subroutine ylmr2 (lmax2, ng, g, gg, ylm)
end do
enddo
!$omp end parallel do
#endif
!
return
end subroutine ylmr2