mirror of https://gitlab.com/QEF/q-e.git
[skip-CI] dylmr2_gpu and dylmr2 merged
Note that ylmr2 now calls ylmr2_gpu if __CUDA
This commit is contained in:
parent
7001de08d6
commit
c91a55416b
|
@ -138,12 +138,7 @@ SUBROUTINE addusstress_g( sigmanlc )
|
|||
!
|
||||
DO ipol = 1, 3
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL dylmr2_acc( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
|
||||
#else
|
||||
CALL dylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
|
||||
!$acc update device(dylmk0)
|
||||
#endif
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
|
|
|
@ -52,8 +52,7 @@ set(src_upflib
|
|||
ylmr2.f90
|
||||
dylmr2.f90
|
||||
# GPU
|
||||
ylmr2_gpu.f90
|
||||
dylmr2_gpu.f90)
|
||||
ylmr2_gpu.f90)
|
||||
qe_enable_cuda_fortran("${src_upflib}")
|
||||
|
||||
qe_add_library(qe_xml xmltools.f90 dom.f90 wxml.f90)
|
||||
|
|
|
@ -68,10 +68,9 @@ ylmr2.o
|
|||
|
||||
# GPU versions of routines
|
||||
OBJS_GPU= \
|
||||
dylmr2_gpu.o \
|
||||
ylmr2_gpu.o
|
||||
|
||||
OBJS_NODEP+= dom.o wxml.o
|
||||
OBJS_NODEP+= $(OBJS_GPU) dom.o wxml.o
|
||||
|
||||
all : libupf.a virtual_v2.x upfconv.x casino2upf.x
|
||||
|
||||
|
|
|
@ -1,96 +1,106 @@
|
|||
!
|
||||
! Copyright (C) 2001 PWSCF group
|
||||
! Copyright (C) 2024 Quantum ESPRESSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine dylmr2 (nylm, ngy, g, gg, dylm, ipol)
|
||||
SUBROUTINE dylmr2( nylm, ngy, g, gg, dylm, ipol )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Compute \partial Y_lm(G) \over \partial (G)_ipol
|
||||
!! using simple numerical derivation (SdG)
|
||||
!! The spherical harmonics are calculated in ylmr2
|
||||
!! using simple numerical derivation (SdG).
|
||||
!! The spherical harmonics are calculated in ylmr2.
|
||||
!! ACC-enabled version
|
||||
!
|
||||
USE upf_kinds, ONLY : DP
|
||||
implicit none
|
||||
!
|
||||
integer :: nylm
|
||||
!! input: number of spherical harmonics
|
||||
integer :: ngy
|
||||
!! input: the number of g vectors to compute
|
||||
integer :: ipol
|
||||
!! input: desired polarization
|
||||
real(DP) :: g(3,ngy)
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: nylm
|
||||
! input: number of spherical harmonics
|
||||
INTEGER, INTENT(IN) :: ngy
|
||||
! input: the number of g vectors to compute
|
||||
INTEGER, INTENT(IN) :: ipol
|
||||
! input: desired polarization
|
||||
REAL(DP), INTENT(IN) :: g(3,ngy)
|
||||
!! input: the coordinates of g vectors
|
||||
real(DP) :: gg (ngy)
|
||||
REAL(DP), INTENT(IN) :: gg(ngy)
|
||||
!! input: the moduli of g vectors
|
||||
real(DP) :: dylm (ngy, nylm)
|
||||
REAL(DP), INTENT(OUT) :: dylm(ngy,nylm)
|
||||
!! output: the spherical harmonics derivatives
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
integer :: ig, lm
|
||||
INTEGER :: ig, lm, i
|
||||
! counter on g vectors
|
||||
! counter on l,m component
|
||||
|
||||
real(DP), parameter :: delta = 1.d-6
|
||||
real(DP), allocatable :: dg (:), dgi (:), gx (:,:), ggx (:), ylmaux (:,:)
|
||||
!
|
||||
INTEGER :: apol, bpol
|
||||
!
|
||||
REAL(DP), PARAMETER :: delta = 1.e-6_dp, eps = 1.e-9_dp
|
||||
REAL(DP), ALLOCATABLE :: dg(:), gx(:,:), ggx(:), ylmaux(:,:)
|
||||
! dg is the finite increment for numerical derivation:
|
||||
! dg = delta |G| = delta * sqrt(gg)
|
||||
! dgi= 1 /(delta * sqrt(gg))
|
||||
! dg = delta |G| = delta * sqrt(gg) (later overwritten by 1/dg)
|
||||
! gx = g +/- dg
|
||||
! ggx = gx^2
|
||||
!
|
||||
allocate ( gx(3,ngy), ggx(ngy), dg(ngy), dgi(ngy), ylmaux(ngy,nylm) )
|
||||
|
||||
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
|
||||
do ig = 1, ngy
|
||||
dg (ig) = delta * sqrt (gg (ig) )
|
||||
if (gg (ig) .gt. 1.d-9) then
|
||||
dgi (ig) = 1.d0 / dg (ig)
|
||||
else
|
||||
dgi (ig) = 0.d0
|
||||
endif
|
||||
enddo
|
||||
!$OMP END PARALLEL DO
|
||||
|
||||
call dcopy (3 * ngy, g, 1, gx, 1)
|
||||
|
||||
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
|
||||
do ig = 1, ngy
|
||||
gx (ipol, ig) = g (ipol, ig) + dg (ig)
|
||||
ggx (ig) = gx (1, ig) * gx (1, ig) + &
|
||||
gx (2, ig) * gx (2, ig) + &
|
||||
gx (3, ig) * gx (3, ig)
|
||||
enddo
|
||||
!$OMP END PARALLEL DO
|
||||
|
||||
call ylmr2 (nylm, ngy, gx, ggx, dylm)
|
||||
|
||||
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
|
||||
do ig = 1, ngy
|
||||
gx (ipol, ig) = g (ipol, ig) - dg (ig)
|
||||
ggx (ig) = gx (1, ig) * gx (1, ig) + &
|
||||
gx (2, ig) * gx (2, ig) + &
|
||||
gx (3, ig) * gx (3, ig)
|
||||
enddo
|
||||
!$OMP END PARALLEL DO
|
||||
|
||||
call ylmr2 (nylm, ngy, gx, ggx, ylmaux)
|
||||
|
||||
call daxpy (ngy * nylm, - 1.d0, ylmaux, 1, dylm, 1)
|
||||
|
||||
do lm = 1, nylm
|
||||
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ig)
|
||||
do ig = 1, ngy
|
||||
dylm (ig, lm) = dylm (ig, lm) * 0.5d0 * dgi (ig)
|
||||
enddo
|
||||
!$OMP END PARALLEL DO
|
||||
enddo
|
||||
|
||||
deallocate ( gx, ggx, dg, dgi, ylmaux )
|
||||
|
||||
return
|
||||
end subroutine dylmr2
|
||||
|
||||
IF ( ipol==1 ) THEN
|
||||
apol = 2 ; bpol = 3
|
||||
ELSEIF ( ipol==2 ) THEN
|
||||
apol = 1 ; bpol = 3
|
||||
ELSEIF ( ipol==3 ) THEN
|
||||
apol = 1 ; bpol = 2
|
||||
ENDIF
|
||||
!
|
||||
ALLOCATE( gx(3,ngy), ggx(ngy), dg(ngy), ylmaux(ngy,nylm) )
|
||||
!$acc data create(gx, ggx, dg, ylmaux) &
|
||||
!$acc present_or_copyin(g, gg) present_or_copyout(dylm)
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
dg(ig) = delta * SQRT(gg(ig) )
|
||||
ENDDO
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
gx(apol,ig) = g(apol,ig)
|
||||
gx(bpol,ig) = g(bpol,ig)
|
||||
gx(ipol,ig) = g(ipol,ig) + dg(ig)
|
||||
ggx(ig) = gx(1,ig) * gx(1,ig) + &
|
||||
gx(2,ig) * gx(2,ig) + &
|
||||
gx(3,ig) * gx(3,ig)
|
||||
ENDDO
|
||||
!
|
||||
CALL ylmr2( nylm, ngy, gx, ggx, dylm )
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
gx(ipol,ig) = g(ipol,ig) - dg(ig)
|
||||
ggx(ig) = gx(1,ig) * gx(1,ig) + &
|
||||
gx(2,ig) * gx(2,ig) + &
|
||||
gx(3,ig) * gx(3,ig)
|
||||
ENDDO
|
||||
!
|
||||
CALL ylmr2( nylm, ngy, gx, ggx, ylmaux )
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
IF (gg(ig) > eps) THEN
|
||||
dg(ig) = 1.0_dp / dg(ig)
|
||||
ELSE
|
||||
dg(ig) = 0.0_dp
|
||||
ENDIF
|
||||
ENDDO
|
||||
!$acc parallel loop collapse(2)
|
||||
DO lm = 1, nylm
|
||||
DO ig = 1, ngy
|
||||
dylm(ig,lm) = (dylm(ig,lm)-ylmaux(ig,lm)) * 0.5_dp * dg(ig)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( gx, ggx, dg, ylmaux )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dylmr2
|
||||
|
|
|
@ -1,111 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2024 Quantum ESPRESSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE dylmr2_acc( nylm, ngy, g, gg, dylm, ipol )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Compute \partial Y_lm(G) \over \partial (G)_ipol
|
||||
!! using simple numerical derivation (SdG).
|
||||
!! The spherical harmonics are calculated in ylmr2.
|
||||
!! ACC-enabled version
|
||||
!
|
||||
USE upf_kinds, ONLY : DP
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: nylm
|
||||
! input: number of spherical harmonics
|
||||
INTEGER, INTENT(IN) :: ngy
|
||||
! input: the number of g vectors to compute
|
||||
INTEGER, INTENT(IN) :: ipol
|
||||
! input: desired polarization
|
||||
REAL(DP), INTENT(IN) :: g(3,ngy)
|
||||
!! input: the coordinates of g vectors
|
||||
REAL(DP), INTENT(IN) :: gg(ngy)
|
||||
!! input: the moduli of g vectors
|
||||
REAL(DP), INTENT(OUT) :: dylm(ngy,nylm)
|
||||
!! output: the spherical harmonics derivatives
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ig, lm, i
|
||||
! counter on g vectors
|
||||
! counter on l,m component
|
||||
!
|
||||
INTEGER :: apol, bpol
|
||||
!
|
||||
REAL(DP), PARAMETER :: delta = 1.e-6_dp, eps = 1.e-9_dp
|
||||
REAL(DP), ALLOCATABLE :: dg(:), gx(:,:), ggx(:), ylmaux(:,:)
|
||||
! dg is the finite increment for numerical derivation:
|
||||
! dg = delta |G| = delta * sqrt(gg) (later overwritten by 1/dg)
|
||||
! gx = g +/- dg
|
||||
! ggx = gx^2
|
||||
!
|
||||
IF ( ipol==1 ) THEN
|
||||
apol = 2 ; bpol = 3
|
||||
ELSEIF ( ipol==2 ) THEN
|
||||
apol = 1 ; bpol = 3
|
||||
ELSEIF ( ipol==3 ) THEN
|
||||
apol = 1 ; bpol = 2
|
||||
ENDIF
|
||||
!
|
||||
ALLOCATE( gx(3,ngy), ggx(ngy), dg(ngy), ylmaux(ngy,nylm) )
|
||||
!$acc data create(gx, ggx, dg, ylmaux) &
|
||||
!$acc present_or_copyin(g, gg) present_or_copyout(dylm)
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
dg(ig) = delta * SQRT(gg(ig) )
|
||||
ENDDO
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
gx(apol,ig) = g(apol,ig)
|
||||
gx(bpol,ig) = g(bpol,ig)
|
||||
gx(ipol,ig) = g(ipol,ig) + dg(ig)
|
||||
ggx(ig) = gx(1,ig) * gx(1,ig) + &
|
||||
gx(2,ig) * gx(2,ig) + &
|
||||
gx(3,ig) * gx(3,ig)
|
||||
ENDDO
|
||||
!
|
||||
!$acc host_data use_device (gx,ggx,dylm)
|
||||
CALL ylmr2_gpu( nylm, ngy, gx, ggx, dylm )
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
gx(ipol,ig) = g(ipol,ig) - dg(ig)
|
||||
ggx(ig) = gx(1,ig) * gx(1,ig) + &
|
||||
gx(2,ig) * gx(2,ig) + &
|
||||
gx(3,ig) * gx(3,ig)
|
||||
ENDDO
|
||||
!
|
||||
!$acc host_data use_device (gx,ggx,ylmaux)
|
||||
CALL ylmr2_gpu( nylm, ngy, gx, ggx, ylmaux )
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngy
|
||||
IF (gg(ig) > eps) THEN
|
||||
dg(ig) = 1.0_dp / dg(ig)
|
||||
ELSE
|
||||
dg(ig) = 0.0_dp
|
||||
ENDIF
|
||||
ENDDO
|
||||
!$acc parallel loop collapse(2)
|
||||
DO lm = 1, nylm
|
||||
DO ig = 1, ngy
|
||||
dylm(ig,lm) = (dylm(ig,lm)-ylmaux(ig,lm)) * 0.5_dp * dg(ig)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( gx, ggx, dg, ylmaux )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dylmr2_acc
|
||||
|
|
@ -111,17 +111,9 @@ SUBROUTINE gen_us_dy_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
|
|||
ALLOCATE( dylm(npw,(lmaxkb+1)**2,3) )
|
||||
!$acc data create( dylm )
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
DO ipol = 1, 3
|
||||
CALL dylmr2_acc( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
|
||||
ENDDO
|
||||
#else
|
||||
!$acc update self( gk, q )
|
||||
DO ipol = 1, 3
|
||||
CALL dylmr2( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
|
||||
ENDDO
|
||||
!$acc update device( dylm )
|
||||
#endif
|
||||
!
|
||||
u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3)
|
||||
!
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
!
|
||||
! Copyright (C) 2001-2021 Quantum ESPRESSO group
|
||||
! Copyright (C) 2001-2024 Quantum ESPRESSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
|
@ -14,7 +14,7 @@ subroutine ylmr2 (lmax2, ng, g, gg, ylm)
|
|||
! Numerical recursive algorithm based on the one given in Numerical
|
||||
! Recipes but avoiding the calculation of factorials that generate
|
||||
! overflow for lmax > 11
|
||||
! Last modified May 2nd, 2021, by PG
|
||||
! Last modified Jan. 2024, by PG: calls CUF version if __CUDA
|
||||
!
|
||||
USE upf_kinds, ONLY : DP
|
||||
USE upf_const, ONLY : pi, fpi
|
||||
|
@ -39,10 +39,18 @@ subroutine ylmr2 (lmax2, ng, g, gg, ylm)
|
|||
if (ng < 1 .or. lmax2 < 1) return
|
||||
do lmax = 0, 25
|
||||
if ((lmax+1)**2 == lmax2) go to 10
|
||||
if ((lmax+1)**2 > lmax2) exit
|
||||
end do
|
||||
call upf_error (' ylmr', 'l > 25 or wrong number of Ylm required',lmax2)
|
||||
call upf_error (' ylmr2', 'wrong number of Ylm required',lmax2)
|
||||
10 continue
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!$acc data present_or_copyout(ylm) present_or_copyin(g, gg)
|
||||
!$acc host_data use_device(g,gg,ylm)
|
||||
call ylmr2_gpu(lmax2, ng, g, gg, ylm)
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
#else
|
||||
if (lmax == 0) then
|
||||
ylm(:,1) = sqrt (1.d0 / fpi)
|
||||
return
|
||||
|
@ -124,6 +132,7 @@ subroutine ylmr2 (lmax2, ng, g, gg, ylm)
|
|||
end do
|
||||
enddo
|
||||
!$omp end parallel do
|
||||
#endif
|
||||
!
|
||||
return
|
||||
end subroutine ylmr2
|
||||
|
|
Loading…
Reference in New Issue