Merge branch 'stress_acc' into 'develop'

Stress Acc

See merge request QEF/q-e!1945
giannozz 2022-08-17 14:33:32 +00:00
commit 6338722054
60 changed files with 2890 additions and 5508 deletions

View File

@@ -184,9 +184,9 @@
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
!
END IF
!NOTE: g and mill are already allocated on the device; they are initialized below.
!$acc data present(g, mill)
!$acc update device(g,mill)
!NOTE: g, gg and mill are already allocated on the device; they are initialized below.
!$acc data present(g,gg,mill)
!$acc update device(g,gg,mill)
!$acc end data
!
CALL gshells (.TRUE.)
@@ -429,13 +429,13 @@
! re-calculate G-vectors and kinetic energy
!
dfftp_ngm = dfftp%ngm
!$acc parallel loop present(g, mill) copyin(bg) copyout(gg)
!$acc parallel loop present(g,gg,mill) copyin(bg)
do ig = 1, dfftp_ngm
g(:,ig)= mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3)
gg(ig)=g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
enddo
!$acc end parallel loop
!$acc update host(g)
!$acc end parallel loop
!$acc update host(g,gg)
!
call g2kin_init ( gg, tpiba2 )
!
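
Note: taken together, the two hunks above follow a single OpenACC life cycle: g, gg and mill are created on the device once, refreshed with "update device" after host-side initialization, recomputed on the GPU inside a present(...) region, and copied back with "update host" only where CPU code still needs them. A minimal, self-contained sketch of that life cycle (hypothetical sizes; a unit reciprocal lattice stands in for bg):

    PROGRAM acc_update_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: n = 8
      INTEGER  :: mill(3,n), ig
      REAL(KIND(1.0D0)) :: g(3,n), gg(n), bg(3,3)
      !
      bg = 0.0D0 ; bg(1,1) = 1.0D0 ; bg(2,2) = 1.0D0 ; bg(3,3) = 1.0D0
      DO ig = 1, n
         mill(:,ig) = (/ ig, 0, 0 /)
      ENDDO
      !$acc enter data create(g, gg, mill)   ! device copies exist from here on
      !$acc update device(mill)              ! push the host-initialized indices
      !$acc parallel loop present(g, gg, mill) copyin(bg)
      DO ig = 1, n
         g(:,ig) = mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3)
         gg(ig)  = g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
      ENDDO
      !$acc update host(g, gg)               ! only because CPU code still reads them
      !$acc exit data delete(g, gg, mill)
      PRINT *, gg(n)
    END PROGRAM acc_update_demo
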

View File

@@ -357,7 +357,7 @@
!
DEV_ACC data present(rhoeg, rhops, mill,g ) copy(fion) create(rp(1:s_ngm_)) copyin(sfac, screen_coul, gg, vps, ityp,ei1, ei2, ei3)
DEV_ACC data present(rhoeg, rhops, mill,g,gg ) copy(fion) create(rp(1:s_ngm_)) copyin(sfac, screen_coul, vps, ityp,ei1, ei2, ei3)
!
DEV_OMP parallel default(none) &
DEV_OMP shared(gstart, dffts,sfac, rhops, screen_coul, rhoeg, nsp, gg, tpiba2, tpiba, mill, g, &

View File

@@ -212,7 +212,8 @@ DEV_ACC enter data create(drhot(1:p_ngm_, 1:6))
!
zpseu = 0.0_DP
!
DEV_ACC data copyin(rhog,drhog,ht,sfac,vps,gg,rhops) copyout(vtemp)
DEV_ACC update device(gg)
DEV_ACC data copyin(rhog,drhog,ht,sfac,vps,rhops) copyout(vtemp)
DEV_OMP parallel default(shared), private(ig,is,ij,i,j,k)
!
DEV_OMP do
@@ -337,7 +338,7 @@ DEV_ACC loop vector reduction(+:x_tmp)
DEV_ACC end parallel
!
DEV_OMP do
DEV_ACC parallel loop present(rhotmp)
DEV_ACC parallel loop present(rhotmp,gg)
DO ig = gstart, p_ngm_
vtemp(ig) = CONJG( rhotmp( ig ) ) * rhotmp( ig ) / gg( ig )
END DO
@@ -402,7 +403,7 @@ DEV_ACC kernels
DEV_ACC end kernels
!
DEV_ACC parallel loop present(rhotmp)
DEV_ACC parallel loop present(rhotmp,gg)
!
DEV_OMP parallel default(shared), private(ig,is)
DEV_OMP do

View File

@@ -120,7 +120,7 @@
ALLOCATE( igtongl_d(ngm) )
ALLOCATE( gl_d(ngm) )
ENDIF
!$acc enter data create( mill(1:3, 1:ngm), g(1:3, 1:ngm) )
!$acc enter data create( mill(1:3,1:ngm), g(1:3,1:ngm), gg(1:ngm) )
!
RETURN
!
@@ -142,7 +142,10 @@
END IF
!
!
IF( ALLOCATED( gg ) ) DEALLOCATE( gg )
IF( ALLOCATED( gg ) ) THEN
!$acc exit data delete(gg)
DEALLOCATE( gg )
END IF
IF( ALLOCATED( g ) ) THEN
!$acc exit data delete(g)
DEALLOCATE( g )

View File

@@ -29,11 +29,7 @@ subroutine drhodv (nu_i0, nper, drhoscf)
USE cell_base, ONLY : tpiba
USE lsda_mod, ONLY : current_spin, lsda, isk, nspin
USE wvfct, ONLY : npwx, nbnd
#if defined(__CUDA)
USE uspp, ONLY : nkb, vkb, deeq_nc, deeq_nc_d, okvan
#else
USE uspp, ONLY : nkb, vkb, deeq_nc, okvan
#endif
USE becmod, ONLY : calbec, bec_type, becscal, allocate_bec_type, &
deallocate_bec_type
USE fft_base, ONLY : dfftp
@@ -142,18 +138,14 @@ subroutine drhodv (nu_i0, nper, drhoscf)
ELSE
IF (okvan) THEN
deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,2)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
int1_nc(:,:,:,:,:)=int1_nc_save(:,:,:,:,:,2)
ENDIF
call drhodvnl (ik, ikk, nper, nu_i0, dynwrk, becpt, alphapt, &
dbecq, dalpq)
IF (okvan) THEN
deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,1)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
int1_nc(:,:,:,:,:)=int1_nc_save(:,:,:,:,:,1)
ENDIF
ENDIF
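
Note: this hunk (and the matching ones in phq_setup and solve_linter that follow) replaces the CUDA-Fortran mirror array deeq_nc_d, kept coherent by hand under #if defined(__CUDA), with a single directive on the unified deeq_nc. A minimal sketch of the idiom, with hypothetical dimensions:

    PROGRAM deeq_update_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: nhm = 4, nat = 2, nspin = 4
      COMPLEX(KIND(1.0D0)) :: deeq_nc(nhm,nhm,nat,nspin)
      COMPLEX(KIND(1.0D0)) :: deeq_nc_save(nhm,nhm,nat,nspin,2)
      !
      deeq_nc_save = (0.0D0, 0.0D0)
      deeq_nc = deeq_nc_save(:,:,:,:,1)
      !$acc enter data copyin(deeq_nc)
      !
      ! ... host code swaps in the second set of coefficients ...
      deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
      !$acc update device(deeq_nc)   ! replaces deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
      !
      !$acc exit data delete(deeq_nc)
    END PROGRAM deeq_update_demo
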

View File

@@ -65,7 +65,7 @@ subroutine phq_setup
USE symm_base, ONLY : nrot, nsym, s, irt, t_rev, time_reversal, &
sr, invs, inverse_s, d1, d2, d3, check_grid_sym
USE uspp_param, ONLY : upf
USE uspp, ONLY : nlcc_any, deeq_nc, deeq_nc_d, okvan
USE uspp, ONLY : nlcc_any, deeq_nc, okvan
USE noncollin_module, ONLY : noncolin, domag, m_loc, angle1, angle2, ux
USE nlcc_ph, ONLY : drc
USE control_ph, ONLY : rec_code, lgamma_gamma, search_sym, start_irr, &
@@ -183,9 +183,7 @@ subroutine phq_setup
v%of_r(:,2:4)=-v%of_r(:,2:4)
deeq_nc_save(:,:,:,:,2)=deeq_nc(:,:,:,:)
deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,1)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
ENDIF
ENDIF
!

View File

@@ -47,10 +47,8 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
USE scf, ONLY : rho, vrs
#if defined(__CUDA)
USE scf_gpum, ONLY : vrs_d
USE uspp, ONLY : okvan, vkb, deeq_nc, deeq_nc_d
#else
USE uspp, ONLY : okvan, vkb, deeq_nc
#endif
USE uspp, ONLY : okvan, vkb, deeq_nc
USE uspp_param, ONLY : nhm
USE noncollin_module, ONLY : noncolin, domag, npol, nspin_mag
USE paw_variables, ONLY : okpaw
@@ -282,22 +280,18 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
! Hubbard potential dvbare_hub_q * psi_kpoint
! is calculated and added to dvpsi.
!
IF (lda_plus_u) CALL dvqhub_barepsi_us(ik, u(1, mode))
IF (lda_plus_u) CALL dvqhub_barepsi_us(ik, u(1,mode))
!
ELSE
IF (okvan) THEN
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
int1_nc(:,:,:,:,:) = int1_nc_save(:,:,:,:,:,2)
ENDIF
CALL dvqpsi_us(ik, u(1, mode), .FALSE., becpt, alphapt)
IF (okvan) THEN
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
int1_nc(:,:,:,:,:) = int1_nc_save(:,:,:,:,:,1)
ENDIF
ENDIF
@@ -342,9 +336,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
#endif
IF (okvan) THEN
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
ENDIF
ENDIF
!
@@ -375,9 +367,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
#endif
IF (okvan) THEN
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1)
#if defined(__CUDA)
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
#endif
!$acc update device(deeq_nc)
ENDIF
ENDIF
!

View File

@@ -260,32 +260,22 @@ set(src_pw
src/force_corr_gpu.f90
src/orthoatwfc_gpu.f90
src/wfcinit_gpu.f90
src/stres_loc_gpu.f90
src/add_paw_to_deeq_gpu.f90
src/addusdens_gpu.f90
src/addusforce_gpu.f90
src/deriv_drhoc_gpu.f90
src/s_psi_gpu.f90
src/rotate_wfc_gpu.f90
src/usnldiag_gpu.f90
src/add_vuspsi_gpu.f90
src/hs_1psi_gpu.f90
src/g_psi_gpu.f90
src/stres_mgga_gpu.f90
src/atomic_wfc_gpu.f90
src/dvloc_of_g_gpu.f90
src/stres_ewa_gpu.f90
src/stres_knl_gpu.f90
src/compute_deff_gpu.f90
src/add_vhub_to_deeq_gpu.f90
src/s_1psi_gpu.f90
src/h_psi_gpu.f90
src/stres_us_gpu.f90
src/stres_cc_gpu.f90
src/utils_gpu.f90
src/vhpsi_gpu.f90
src/drhoc_gpu.f90
src/stres_har_gpu.f90
src/vloc_psi_gpu.f90
src/hs_psi_gpu.f90)
qe_enable_cuda_fortran("${src_pw}")

View File

@@ -352,7 +352,7 @@ END SUBROUTINE cutoff_force_lc
!
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc )
SUBROUTINE cutoff_stres_evloc( rho_G, strf, evloc )
!----------------------------------------------------------------------
!! This subroutine adds the contribution from the cutoff long-range part
!! of the local part of the ionic potential to \(\text{evloc}\).
@@ -363,14 +363,14 @@ SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc )
!! to re-compute it here for the stress.
!
USE kinds
USE ions_base, ONLY : ntyp => nsp
USE gvect, ONLY : ngm, gstart
USE io_global, ONLY : stdout
USE fft_base, ONLY : dfftp
USE ions_base, ONLY: ntyp => nsp
USE gvect, ONLY: ngm, gstart
USE io_global, ONLY: stdout
USE fft_base, ONLY: dfftp
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr)
COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr)
!! charge density in G space
COMPLEX(DP), INTENT(IN) :: strf(ngm,ntyp)
!! the structure factor
@@ -381,71 +381,28 @@ SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc )
!
INTEGER :: ng, nt
!
! If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0
! So we start at gstart.
!$acc data present_or_copyin(rho_G,strf)
!
! ... If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0
! So we start at gstart.
!
!$acc parallel loop collapse(2) reduction(+:evloc) copyin(lr_Vloc)
DO nt = 1, ntyp
DO ng = gstart, ngm
evloc = evloc + DBLE( CONJG(psic_G(ng)) * strf(ng,nt) ) &
* lr_Vloc(ng,nt)
ENDDO
DO ng = gstart, ngm
evloc = evloc + DBLE( CONJG(rho_G(ng)) * strf(ng,nt) ) &
* lr_Vloc(ng,nt)
ENDDO
ENDDO
!
!$acc end data
!
RETURN
!
END SUBROUTINE cutoff_stres_evloc
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_evloc_gpu( psicG_d, strf_d, evloc )
!----------------------------------------------------------------------
!! cutoff_stres_evloc - gpu version
!
USE kinds
USE ions_base, ONLY : ntyp => nsp
!USE vlocal, ONLY : strf
USE gvect, ONLY : ngm, gstart
USE io_global, ONLY : stdout
USE fft_base, ONLY : dfftp
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr)
!! charge density in G space
COMPLEX(DP), INTENT(IN) :: strf_d(ngm,ntyp)
REAL(DP), INTENT(INOUT) :: evloc
!! the energy of the electrons in the local ionic potential
!
! ... local variables
!
REAL(DP), ALLOCATABLE :: lrVloc_d(:,:)
INTEGER :: ng, nt
!
#if defined(__CUDA)
attributes(DEVICE) :: psicG_d, strf_d, lrVloc_d
#endif
!
ALLOCATE( lrVloc_d(ngm,ntyp) )
lrVloc_d = lr_Vloc
!
! If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0
! So we start at gstart.
!
!$cuf kernel do (2)
DO nt = 1, ntyp
DO ng = gstart, ngm
evloc = evloc + DBLE( CONJG(psicG_d(ng)) * strf_d(ng,nt) ) &
* lrVloc_d(ng,nt)
ENDDO
ENDDO
!
DEALLOCATE( lrVloc_d )
!
RETURN
!
END SUBROUTINE cutoff_stres_evloc_gpu
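
Note: the directive version of cutoff_stres_evloc above makes the dedicated _gpu copy redundant; its core idiom is the collapse(2) scalar reduction. A runnable sketch of that idiom (hypothetical sizes and data; rho_G, strf and lr_Vloc mimic the arrays above):

    PROGRAM evloc_reduction_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: ntyp = 2, ngm = 1024, gstart = 2
      COMPLEX(KIND(1.0D0)) :: rho_G(ngm), strf(ngm,ntyp)
      REAL(KIND(1.0D0))    :: lr_Vloc(ngm,ntyp), evloc
      INTEGER :: nt, ng
      !
      rho_G = (1.0D-3, 0.0D0) ; strf = (1.0D0, 0.0D0) ; lr_Vloc = 2.0D0
      evloc = 0.0D0
      !$acc parallel loop collapse(2) reduction(+:evloc) copyin(rho_G, strf, lr_Vloc)
      DO nt = 1, ntyp
         DO ng = gstart, ngm      ! G=0 (ng=1) carries no long-range contribution
            evloc = evloc + DBLE( CONJG(rho_G(ng))*strf(ng,nt) )*lr_Vloc(ng,nt)
         ENDDO
      ENDDO
      PRINT *, evloc
    END PROGRAM evloc_reduction_demo
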
!
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_sigmaloc( psic_G, strf, sigmaloc )
SUBROUTINE cutoff_stres_sigmaloc( rho_G, strf, sigmaloc )
!----------------------------------------------------------------------
!! This subroutine adds the contribution from the cutoff long-range part
!! of the local part of the ionic potential to the rest of the
@@ -454,141 +411,67 @@ SUBROUTINE cutoff_stres_sigmaloc( psic_G, strf, sigmaloc )
USE kinds
USE ions_base, ONLY : ntyp => nsp
USE constants, ONLY : eps8
USE gvect, ONLY : ngm, g, gg, gstart
USE gvect, ONLY : ngm, gstart, g, gg
USE cell_base, ONLY : tpiba, tpiba2, alat, omega
USE io_global, ONLY : stdout
USE fft_base, ONLY : dfftp
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr)
COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr)
!! charge density in G space
COMPLEX(DP), INTENT(IN) :: strf(ngm,ntyp)
!! the structure factor
REAL(DP), INTENT(INOUT) :: sigmaloc(3,3)
!! stress contribution for the local ionic potential
!
! ... local variables
!
INTEGER :: ng, nt, l, m
REAL(DP) :: Gp, G2lzo2Gp, beta, dlr_Vloc
!
! no G=0 contribution
DO nt = 1, ntyp
DO ng = gstart, ngm
!
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
! below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448
IF (Gp < eps8) THEN
! G^2*lz/2|Gp|
G2lzo2Gp = 0.0d0
beta = 0.0d0
ELSE
G2lzo2Gp = gg(ng)*tpiba2*lz/2.0d0/Gp
beta = G2lzo2Gp*(1.0d0-cutoff_2D(ng))/cutoff_2D(ng)
ENDIF
! dlr_vloc corresponds to the derivative of the long-range local ionic potential
! with respect to G
DO l = 1, 3
IF (l == 3) THEN
dlr_Vloc = - 1.0d0/(gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
* (1.0d0+ gg(ng)*tpiba2/4.0d0)
ELSE
dlr_Vloc = - 1.0d0/ (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
* (1.0d0- beta + gg(ng)*tpiba2/4.0d0)
ENDIF
!
DO m = 1, l
sigmaloc(l,m) = sigmaloc(l,m) + DBLE( CONJG( psic_G(ng) ) &
* strf(ng,nt) ) * 2.0d0 * dlr_Vloc &
* tpiba2 * g(l,ng) * g(m,ng)
ENDDO
ENDDO
!
ENDDO
ENDDO
!
RETURN
!
END SUBROUTINE cutoff_stres_sigmaloc
!
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_sigmaloc_gpu( psicG_d, strf_d, sigmaloc )
!----------------------------------------------------------------------
!! This subroutine adds the contribution from the cutoff long-range part
!! of the local part of the ionic potential to the rest of the
!! \(\text{sigmaloc}\). That is, the rest of Eq. (63) of PRB 96, 075448.
!
USE kinds
USE ions_base, ONLY : ntyp => nsp
USE vlocal, ONLY : strf
USE constants, ONLY : eps8
USE gvect, ONLY : ngm, gstart, g_d, gg_d
USE cell_base, ONLY : tpiba, tpiba2, alat, omega
USE io_global, ONLY : stdout
USE fft_base, ONLY : dfftp
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr)
!! charge density in G space
COMPLEX(DP), INTENT(IN) :: strf_d(ngm,ntyp)
REAL(DP), INTENT(INOUT) :: sigmaloc(3,3)
!! stress contribution for the local ionic potential
!
! ... local variables
!
INTEGER :: ng, nt, l, m
REAL(DP), ALLOCATABLE :: lrVloc_d(:,:), cutoff2D_d(:)
INTEGER :: ng, nt, l, m
REAL(DP) :: Gp, G2lzo2Gp, beta, dlr_Vloc1, dlr_Vloc2, dlr_Vloc3, &
no_lm_dep
REAL(DP) :: sigmaloc11, sigmaloc31, sigmaloc21, sigmaloc32, &
sigmaloc22, sigmaloc33
#if defined(__CUDA)
attributes(DEVICE) :: psicG_d, strf_d, lrVloc_d, cutoff2D_d
#endif
!
ALLOCATE( lrVloc_d(ngm,ntyp), cutoff2D_d(ngm) )
lrVloc_d = lr_Vloc
cutoff2D_d = cutoff_2D
!
!$acc data present_or_copyin(rho_G,strf)
!
sigmaloc11 = 0._DP ; sigmaloc31 = 0._DP
sigmaloc21 = 0._DP ; sigmaloc32 = 0._DP
sigmaloc22 = 0._DP ; sigmaloc33 = 0._DP
!
! no G=0 contribution
! ... no G=0 contribution
!
!$cuf kernel do (2) <<<*,*>>>
!$acc parallel loop collapse(2) copyin(lr_Vloc,cutoff_2D) &
!$acc reduction(+:sigmaloc11,sigmaloc21,sigmaloc22,sigmaloc31,&
!$acc sigmaloc32,sigmaloc33)
DO nt = 1, ntyp
DO ng = gstart, ngm
!
Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba
! below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
! ... below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448
IF (Gp < eps8) THEN
! G^2*lz/2|Gp|
! ... G^2*lz/2|Gp|
G2lzo2Gp = 0._DP
beta = 0._DP
ELSE
G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
ENDIF
! dlrVloc corresponds to the derivative of the long-range local ionic potential
! with respect to G
dlr_Vloc1 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) &
* (1._DP- beta + gg_d(ng)*tpiba2/4._DP)
dlr_Vloc2 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) &
* (1._DP- beta + gg_d(ng)*tpiba2/4._DP)
dlr_Vloc3 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) &
* (1._DP+ gg_d(ng)*tpiba2/4._DP)
no_lm_dep = DBLE( CONJG( psicG_d(ng) ) &
* strf_d(ng,nt) ) * 2._DP * tpiba2
sigmaloc11 = sigmaloc11 + no_lm_dep * dlr_Vloc1 * g_d(1,ng) * g_d(1,ng)
sigmaloc21 = sigmaloc21 + no_lm_dep * dlr_Vloc2 * g_d(2,ng) * g_d(1,ng)
sigmaloc22 = sigmaloc22 + no_lm_dep * dlr_Vloc2 * g_d(2,ng) * g_d(2,ng)
sigmaloc31 = sigmaloc31 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(1,ng)
sigmaloc32 = sigmaloc32 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(2,ng)
sigmaloc33 = sigmaloc33 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(3,ng)
! ... dlrVloc corresponds to the derivative of the long-range local ionic potential
! with respect to G
dlr_Vloc1 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
* (1._DP- beta + gg(ng)*tpiba2/4._DP)
dlr_Vloc2 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
* (1._DP- beta + gg(ng)*tpiba2/4._DP)
dlr_Vloc3 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
* (1._DP+ gg(ng)*tpiba2/4._DP)
no_lm_dep = DBLE( CONJG( rho_G(ng) ) &
* strf(ng,nt) ) * 2._DP * tpiba2
sigmaloc11 = sigmaloc11 + no_lm_dep * dlr_Vloc1 * g(1,ng) * g(1,ng)
sigmaloc21 = sigmaloc21 + no_lm_dep * dlr_Vloc2 * g(2,ng) * g(1,ng)
sigmaloc22 = sigmaloc22 + no_lm_dep * dlr_Vloc2 * g(2,ng) * g(2,ng)
sigmaloc31 = sigmaloc31 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(1,ng)
sigmaloc32 = sigmaloc32 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(2,ng)
sigmaloc33 = sigmaloc33 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(3,ng)
!
ENDDO
ENDDO
@@ -600,70 +483,15 @@ SUBROUTINE cutoff_stres_sigmaloc_gpu( psicG_d, strf_d, sigmaloc )
sigmaloc(3,2) = sigmaloc(3,2) + sigmaloc32
sigmaloc(3,3) = sigmaloc(3,3) + sigmaloc33
!
DEALLOCATE( lrVloc_d, cutoff2D_d )
!$acc end data
!
RETURN
!
END SUBROUTINE cutoff_stres_sigmaloc_gpu
END SUBROUTINE cutoff_stres_sigmaloc
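
For reference, what the loop above computes, transcribed directly from the code (not quoted from the paper), with G^2 = gg(ng)*tpiba2, G_p the in-plane modulus and f(G) = cutoff_2D(ng):

\[
\beta = \frac{G^2 l_z}{2 G_p}\,\frac{1-f(G)}{f(G)} \quad (G_p>0;\ \beta=0\ \text{otherwise}),
\]
\[
\left.\frac{\partial V^{\mathrm{LR}}_{nt}}{\partial G^2}\right|_{l}
= -\frac{V^{\mathrm{LR}}_{nt}(G)}{G^2}\left(1+\frac{G^2}{4}-\beta\,(1-\delta_{l,3})\right),
\]
\[
\sigma^{\mathrm{loc}}_{lm} \;\leftarrow\; \sigma^{\mathrm{loc}}_{lm}
+ 2\,\mathrm{Re}\!\left[\rho^{*}(\mathbf{G})\,S_{nt}(\mathbf{G})\right]
\left.\frac{\partial V^{\mathrm{LR}}_{nt}}{\partial G^2}\right|_{l} G_l\, G_m .
\]
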
!
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_sigmahar( psic_G, sigmahar )
!----------------------------------------------------------------------
!! This subroutine cuts off the Hartree part of the stress.
!! See Eq. (62) of PRB 96, 075448.
!
USE kinds
USE gvect, ONLY : ngm, g, gg, gstart
USE constants, ONLY : eps8
USE cell_base, ONLY : tpiba2, alat, tpiba
USE io_global, ONLY : stdout
USE fft_base, ONLY : dfftp
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr)
!! charge density in G-space
REAL(DP), INTENT(INOUT) :: sigmahar(3,3)
!! hartree contribution to stress
!
! ... local variables
!
INTEGER :: ng, nt, l, m
REAL(DP) :: Gp, G2lzo2Gp, beta, shart, g2, fact
!
DO ng = gstart, ngm
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
IF (Gp < eps8) THEN
G2lzo2Gp = 0.0d0
beta = 0.0d0
ELSE
G2lzo2Gp = gg(ng)*tpiba2*lz/2.0d0/Gp
beta = G2lzo2Gp*(1.0d0-cutoff_2D(ng))/cutoff_2D(ng)
ENDIF
g2 = gg (ng) * tpiba2
shart = psic_G(ng) * CONJG(psic_G(ng)) / g2 * cutoff_2D(ng)
DO l = 1, 3
IF (l == 3) THEN
fact = 1.0d0
ELSE
fact = 1.0d0 - beta
ENDIF
DO m = 1, l
sigmahar(l,m) = sigmahar(l,m) + shart * tpiba2 * 2 * &
g(l,ng) * g(m,ng) / g2 * fact
ENDDO
ENDDO
ENDDO
!sigma is multiplied by 0.5*fpi*e2 after
!
RETURN
!
END SUBROUTINE cutoff_stres_sigmahar
!
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
SUBROUTINE cutoff_stres_sigmahar( rho_G, sigmahar )
!----------------------------------------------------------------------
!! This subroutine cuts off the Hartree part of the stress.
!! See Eq. (62) of PRB 96, 075448.
@@ -674,14 +502,14 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
USE cell_base, ONLY: tpiba2, alat, tpiba
USE io_global, ONLY: stdout
USE fft_base, ONLY: dfftp
USE gvect, ONLY: g_d, gg_d
USE gvect, ONLY: g, gg
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr)
COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr)
!! charge density in G-space
REAL(DP), INTENT(INOUT) :: sigmahar(3,3)
!! hartree contribution to stress
!! Hartree contribution to stress
!
! ... local variables
!
@@ -689,50 +517,45 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
REAL(DP) :: Gp, G2lzo2Gp, beta, shart, g2, fact
REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, &
sigmahar32, sigmahar22, sigmahar33
REAL(DP), ALLOCATABLE :: cutoff2D_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: psicG_d, cutoff2D_d
#endif
!
ALLOCATE( cutoff2D_d(ngm) )
cutoff2D_d = cutoff_2D
!$acc data present_or_copyin(rho_G)
!
sigmahar11 = 0._DP ; sigmahar31 = 0._DP
sigmahar21 = 0._DP ; sigmahar32 = 0._DP
sigmahar22 = 0._DP ; sigmahar33 = 0._DP
!
!$cuf kernel do (1) <<<*,*>>>
!$acc parallel loop copyin(cutoff_2D) &
!$acc reduction(+:sigmahar11,sigmahar21,sigmahar22,sigmahar31,&
!$acc sigmahar32,sigmahar33)
DO ng = gstart, ngm
Gp = SQRT(g_d(1,ng)**2 + g_d(2,ng)**2)*tpiba
Gp = SQRT(g(1,ng)**2 + g(2,ng)**2)*tpiba
IF (Gp < eps8) THEN
G2lzo2Gp = 0._DP
beta = 0._DP
ELSE
G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
ENDIF
!
g2 = gg_d(ng) * tpiba2
g2 = gg(ng) * tpiba2
!
shart = DBLE(psicG_d(ng)*CONJG(psicG_d(ng))) /&
g2 * cutoff2D_d(ng)
shart = DBLE(rho_G(ng)*CONJG(rho_G(ng))) / &
g2 * cutoff_2D(ng)
!
fact = 1._DP - beta
!
sigmahar11 = sigmahar11 + shart *tpiba2*2._DP * &
g_d(1,ng) * g_d(1,ng) / g2 * fact
g(1,ng) * g(1,ng) / g2 * fact
sigmahar21 = sigmahar21 + shart *tpiba2*2._DP * &
g_d(2,ng) * g_d(1,ng) / g2 * fact
g(2,ng) * g(1,ng) / g2 * fact
sigmahar22 = sigmahar22 + shart *tpiba2*2._DP * &
g_d(2,ng) * g_d(2,ng) / g2 * fact
g(2,ng) * g(2,ng) / g2 * fact
sigmahar31 = sigmahar31 + shart *tpiba2*2._DP * &
g_d(3,ng) * g_d(1,ng) / g2
g(3,ng) * g(1,ng) / g2
sigmahar32 = sigmahar32 + shart *tpiba2*2._DP * &
g_d(3,ng) * g_d(2,ng) / g2
g(3,ng) * g(2,ng) / g2
sigmahar33 = sigmahar33 + shart *tpiba2*2._DP * &
g_d(3,ng) * g_d(3,ng) / g2
!
g(3,ng) * g(3,ng) / g2
ENDDO
!
sigmahar(1,1) = sigmahar(1,1) + sigmahar11
@@ -743,11 +566,11 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
sigmahar(3,3) = sigmahar(3,3) + sigmahar33
!sigma is multiplied by 0.5*fpi*e2 after
!
DEALLOCATE( cutoff2D_d )
!$acc end data
!
RETURN
!
END SUBROUTINE cutoff_stres_sigmahar_gpu
END SUBROUTINE cutoff_stres_sigmahar
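
Transcribed from the loop above (same notation as before), the cutoff Hartree term accumulated per G-vector is

\[
\sigma^{\mathrm{H}}_{lm} \;\leftarrow\; \sigma^{\mathrm{H}}_{lm}
+ 2\,\frac{|\rho(\mathbf{G})|^2}{G^2}\,f(G)\,
\frac{G_l\, G_m}{G^2}\,\bigl(1-\beta\,(1-\delta_{l,3})\bigr),
\]

with the common prefactor 0.5*fpi*e2 applied by the caller, as the trailing comment notes.
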
!
!
!----------------------------------------------------------------------
@@ -759,87 +582,7 @@ SUBROUTINE cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa )
USE kinds
USE ions_base, ONLY : nat, zv, tau, ityp
USE constants, ONLY : e2, eps8
USE gvect, ONLY : ngm, g, gg, gstart
USE cell_base, ONLY : tpiba2, alat, omega, tpiba
USE io_global, ONLY : stdout
!
IMPLICIT NONE
!
REAL(DP), INTENT(IN) :: alpha
!! tuning param for LR/SR separation
REAL(DP), INTENT(INOUT) :: sigmaewa(3,3)
!! ewald contribution to stress
REAL(DP), INTENT(INOUT) :: sdewald
!! constant and diagonal terms
!
! ... local variables
!
INTEGER :: ng, na, l, m
REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact
COMPLEX(DP) :: rhostar
!
! g(1) is a problem if it's G=0, because we divide by G^2.
! So start at gstart.
! fact=1.0d0, gamma_only not implemented
! G=0 component of the long-range part of the local part of the
! pseudopotential minus the Hartree potential is set to 0.
! in other words, sdewald=0.
! sdewald is the last term in equation B1 of PRB 32 3792.
! See also similar comment for ewaldg in cutoff_ewald routine
!
sdewald = 0._DP
DO ng = gstart, ngm
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
IF (Gp < eps8) THEN
G2lzo2Gp = 0._DP
beta = 0._DP
ELSE
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
ENDIF
g2 = gg(ng) * tpiba2
g2a = g2 / 4._DP / alpha
rhostar = (0._DP,0._DP)
DO na = 1, nat
arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
g(3,ng) * tau(3,na) ) * tpi
rhostar = rhostar + zv (ityp(na) ) * CMPLX(COS(arg), SIN(arg), KIND=DP)
ENDDO
rhostar = rhostar / omega
sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2
! ... sewald is another diagonal term that is similar to the diagonal terms
! in the other stress contributions. It basically gives a term proportional
! to the Ewald energy
sdewald = sdewald-sewald
DO l = 1, 3
IF (l == 3) THEN
fact = (g2a + 1.0d0)
ELSE
fact = (1.0d0+g2a-beta)
ENDIF
!
DO m = 1, l
sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * &
g(l,ng) * g(m,ng) / g2 * fact
ENDDO
ENDDO
!
ENDDO
!
RETURN
!
END SUBROUTINE cutoff_stres_sigmaewa
!
!----------------------------------------------------------------------
SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
!----------------------------------------------------------------------
!! This subroutine cuts off the Ewald part of the stress.
!! See Eq. (64) in PRB 96 075448
!
USE kinds
USE ions_base, ONLY : nat, zv, tau, ityp
USE constants, ONLY : e2, eps8
USE gvect, ONLY : ngm, gstart, g_d, gg_d
USE gvect, ONLY : ngm, gstart, g, gg
USE cell_base, ONLY : tpiba2, alat, omega, tpiba
USE io_global, ONLY : stdout
!
@@ -858,72 +601,62 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact
REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
COMPLEX(DP) :: rhostar
!
INTEGER , ALLOCATABLE :: ityp_d(:)
REAL(DP), ALLOCATABLE :: cutoff2D_d(:), tau_d(:,:), zv_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: cutoff2D_d, tau_d, zv_d, ityp_d
#endif
!
ntyp = SIZE(zv)
ALLOCATE( cutoff2D_d(ngm), tau_d(3,nat), zv_d(ntyp) )
ALLOCATE( ityp_d(nat) )
cutoff2D_d = cutoff_2D
tau_d = tau
zv_d = zv
ityp_d = ityp
! g(1) is a problem if it's G=0, because we divide by G^2.
! So start at gstart.
! fact=1.0d0, gamma_only not implemented
! G=0 component of the long-range part of the local part of the
! pseudopotential minus the Hartree potential is set to 0.
! in other words, sdewald=0.
! sdewald is the last term in equation B1 of PRB 32 3792.
! See also similar comment for ewaldg in cutoff_ewald routine
!
! ... g(1) is a problem if it's G=0, because we divide by G^2.
! So start at gstart.
! fact=1.0d0, gamma_only not implemented
! G=0 component of the long-range part of the local part of the
! pseudopotential minus the Hartree potential is set to 0.
! in other words, sdewald=0.
! sdewald is the last term in equation B1 of PRB 32 3792.
! See also similar comment for ewaldg in cutoff_ewald routine
!
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
!
sdewald = 0._DP
!
!$cuf kernel do (1) <<<*,*>>>
!$acc parallel loop copyin(cutoff_2D,tau,zv,ityp) &
!$acc& reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32, &
!$acc& sigma33)
DO ng = gstart, ngm
Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
IF (Gp < eps8) THEN
G2lzo2Gp = 0._DP
beta = 0._DP
ELSE
G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
ENDIF
g2 = gg_d(ng) * tpiba2
g2 = gg(ng) * tpiba2
g2a = g2 / 4._DP / alpha
rhostar = (0._DP,0._DP)
DO na = 1, nat
arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + &
g_d(3,ng) * tau_d(3,na) ) * tpi
rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP)
arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
g(3,ng) * tau(3,na) ) * tpi
rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP)
ENDDO
rhostar = rhostar / CMPLX(omega)
sewald = tpi * e2 * EXP(-g2a) / g2* cutoff2D_d(ng) * ABS(rhostar)**2
sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2
! ... sewald is another diagonal term that is similar to the diagonal terms
! in the other stress contributions. It basically gives a term proportional
! to the Ewald energy
!
sdewald = sdewald - sewald
sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
g_d(1,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta)
g(1,ng) * g(1,ng) / g2 * (1._DP+g2a-beta)
sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
g_d(2,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta)
g(2,ng) * g(1,ng) / g2 * (1._DP+g2a-beta)
sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
g_d(2,ng) * g_d(2,ng) / g2 * (1._DP+g2a-beta)
g(2,ng) * g(2,ng) / g2 * (1._DP+g2a-beta)
sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
g_d(3,ng) * g_d(1,ng) / g2 * (g2a+1._DP)
g(3,ng) * g(1,ng) / g2 * (g2a+1._DP)
sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
g_d(3,ng) * g_d(2,ng) / g2 * (g2a+1._DP)
g(3,ng) * g(2,ng) / g2 * (g2a+1._DP)
sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
g_d(3,ng) * g_d(3,ng) / g2 * (g2a+1._DP)
g(3,ng) * g(3,ng) / g2 * (g2a+1._DP)
!
ENDDO
!
@@ -934,11 +667,9 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
sigmaewa(3,2) = sigmaewa(3,2) + sigma32
sigmaewa(3,3) = sigmaewa(3,3) + sigma33
!
DEALLOCATE( cutoff2D_d, tau_d, zv_d )
DEALLOCATE( ityp_d )
!
RETURN
!
END SUBROUTINE cutoff_stres_sigmaewa_gpu
END SUBROUTINE cutoff_stres_sigmaewa
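
Again transcribing the loop above: with the Ewald separation parameter alpha,

\[
\rho^{\star}(\mathbf{G}) = \frac{1}{\Omega}\sum_{a} Z_a\, e^{\,i\mathbf{G}\cdot\boldsymbol{\tau}_a},
\qquad
s_{\mathrm{ew}} = \frac{2\pi e^2}{G^2}\, e^{-G^2/4\alpha}\, f(G)\,\bigl|\rho^{\star}(\mathbf{G})\bigr|^2,
\]
\[
\sigma^{\mathrm{ew}}_{lm} \;\leftarrow\; \sigma^{\mathrm{ew}}_{lm}
+ 2\, s_{\mathrm{ew}}\,\frac{G_l\, G_m}{G^2}
\left(1+\frac{G^2}{4\alpha}-\beta\,(1-\delta_{l,3})\right),
\qquad
s_{\mathrm{dew}} \;\leftarrow\; s_{\mathrm{dew}} - s_{\mathrm{ew}} .
\]
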
!
!
END MODULE Coul_cut_2D

View File

@@ -290,16 +290,6 @@ PWLIBS += \
drhoc_gpu.o \
addusforce_gpu.o \
force_corr_gpu.o \
stres_loc_gpu.o \
dvloc_of_g_gpu.o \
stres_har_gpu.o \
stres_mgga_gpu.o \
stres_cc_gpu.o \
deriv_drhoc_gpu.o \
stres_ewa_gpu.o \
stres_knl_gpu.o \
stres_us_gpu.o \
compute_deff_gpu.o \
orthoatwfc_gpu.o \
atomic_wfc_gpu.o

View File

@@ -19,7 +19,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d)
!
IMPLICIT NONE
!
REAL(KIND=DP), INTENT(INOUT) :: deeq_d( nhm, nhm, nat, nspin )
REAL(DP), INTENT(INOUT) :: deeq_d(nhm,nhm,nat,nspin)
!! integral of the perturbed potential
!
! ... local variables
@@ -27,7 +27,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d)
INTEGER :: na, nb, nab, nt, ih, jh, ijh, nhnt, is
REAL(DP), ALLOCATABLE :: ddd_paw_d(:,:,:)
#if defined(__CUDA)
attributes(DEVICE) :: deeq_d, ddd_paw_d
attributes(DEVICE) :: ddd_paw_d, deeq_d
#endif
! OPTIMIZE HERE: squeeze loop on atoms having PAW pseudo
@@ -39,7 +39,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d)
nt = ityp(na)
IF (.not.upf(nt)%tpawp) CYCLE
nhnt = nh(nt)
!$cuf kernel do(3)
!$acc parallel loop collapse(3)
DO is=1,nspin
DO ih=1,nhnt
DO jh=1,nhnt

View File

@@ -7,8 +7,8 @@
!
SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
!-----------------------------------------------------------------
!! Add Hubbard contributions to the integral of V_eff and Q_{nm} when
!! Hubbard_projectors is pseudo.
!! Add Hubbard contributions to the integral of \(V_\text{eff}\)
!! and \(Q_{nm}\) when Hubbard_projectors is pseudo.
!
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp
@@ -21,27 +21,23 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
!
IMPLICIT NONE
!
REAL(KIND=DP), INTENT(INOUT) :: deeq_d( nhm, nhm, nat, nspin )
!! integral of V_eff and Q_{nm}
!
#if defined(__CUDA)
attributes(DEVICE) :: deeq_d
#endif
REAL(KIND=DP), INTENT(INOUT) :: deeq_d(nhm,nhm,nat,nspin)
!! integral of \(V_\text{eff}\) and \(Q_{nm}\)
!
! ... local variables
!
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_h( :, :, : )
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_d( :, :, : )
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_h(:,:,:)
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_d(:,:,:)
#if defined(__CUDA)
attributes(DEVICE) :: deeq_aux_d
attributes(DEVICE) :: deeq_aux_d, deeq_d
#endif
INTEGER :: na, nt, ih, jh, ijh, m1, m2, ow1, ow2, is, nhnt
!
!
! (maybe) OPTIMIZE here ... reorder the loop ?
!
ALLOCATE(deeq_aux_h( nhm, nhm, nspin ))
ALLOCATE(deeq_aux_d( nhm, nhm, nspin ))
ALLOCATE( deeq_aux_h(nhm,nhm,nspin) )
ALLOCATE( deeq_aux_d(nhm,nhm,nspin) )
!
DO na = 1, nat
!
nt = ityp(na)
@@ -68,11 +64,12 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
!
ENDDO
ENDDO
CALL dev_memcpy(deeq_aux_d, deeq_aux_h)
!
CALL dev_memcpy( deeq_aux_d, deeq_aux_h )
! HERE USE devXlib
nhnt = nh(nt)
!$cuf kernel do(3)
DO is=1, nspin
!$acc parallel loop collapse(3)
DO is = 1, nspin
DO ih = 1, nhnt
DO jh = 1, nhnt
deeq_d(jh,ih,na,is) = deeq_d(jh,ih,na,is) + deeq_aux_d(jh,ih,is)
@@ -81,7 +78,7 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
END DO
!
ENDDO
!
DEALLOCATE(deeq_aux_h, deeq_aux_d)
!
END SUBROUTINE add_vhub_to_deeq_gpu

View File

@@ -19,7 +19,7 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
USE lsda_mod, ONLY: current_spin
USE control_flags, ONLY: gamma_only
USE noncollin_module
USE uspp, ONLY: ofsbeta, nkb, vkb, deeq_d, deeq_nc_d
USE uspp, ONLY: ofsbeta, nkb, vkb, deeq, deeq_nc
USE uspp_param, ONLY: nh, nhm
USE becmod_gpum, ONLY: bec_type_d, becp_d, using_becp_r_d, &
using_becp_k_d, using_becp_nc_d
@@ -133,10 +133,12 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
!
IF ( m_loc > 0 ) THEN
!$acc host_data use_device(deeq)
CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
deeq_d(1,1,na,current_spin), nhm, &
deeq(1,1,na,current_spin), nhm, &
becp_d%r_d(ofsbeta(na)+1,1), nkb, 0.0_dp, &
ps_d(ofsbeta(na)+1,1), nkb )
!$acc end host_data
END IF
!
END IF
@@ -240,12 +242,12 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
! deeq is real: copy it into a complex variable to perform
! a zgemm - simple but sub-optimal solution
!
!deeaux_d(:,:) = CMPLX(deeq_d(1:nh(nt),1:nh(nt),na,current_spin), 0.0_dp, KIND=dp )
!deeaux_d(:,:) = CMPLX(deeq(1:nh(nt),1:nh(nt),na,current_spin), 0.0_dp, KIND=dp )
!
!$cuf kernel do(2) <<<*,*>>>
!$acc parallel loop collapse(2) present(deeq)
DO j = 1, nhnt
DO k = 1, nhnt
deeaux_d(k,j) = CMPLX(deeq_d(k,j,na,current_spin), 0.0_dp, KIND=DP )
deeaux_d(k,j) = CMPLX(deeq(k,j,na,current_spin), 0.0_dp, KIND=DP )
END DO
END DO
!
@@ -313,29 +315,31 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
!
IF ( ityp(na) == nt ) THEN
!
!$acc host_data use_device(deeq_nc)
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
deeq_nc_d(1,1,na,1), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
deeq_nc(1,1,na,1), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
(0.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,1,1), 2*nkb )
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
deeq_nc_d(1,1,na,2), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
deeq_nc(1,1,na,2), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
(1.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,1,1), 2*nkb )
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
deeq_nc_d(1,1,na,3), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
deeq_nc(1,1,na,3), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
(0.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,2,1), 2*nkb )
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
deeq_nc_d(1,1,na,4), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
deeq_nc(1,1,na,4), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
(1.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,2,1), 2*nkb )
!$acc end host_data
!
! DO ibnd = 1, m
! !
! DO jh = 1, nh(nt)
! !
!!$cuf kernel do(1) <<<*,*>>>
!!$acc parallel loop present(deeq_nc)
! DO ih = 1, nh(nt)
! !
! ikb = ijkb0 + ih
@@ -344,11 +348,11 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
! becpdn_jkb = becp_nc_d(jkb,2,ibnd)
! !
! ps_d(ikb,1,ibnd) = ps_d(ikb,1,ibnd) + &
! deeq_nc_d(ih,jh,na,1)*becpup_jkb + &
! deeq_nc_d(ih,jh,na,2)*becpdn_jkb
! deeq_nc(ih,jh,na,1)*becpup_jkb + &
! deeq_nc(ih,jh,na,2)*becpdn_jkb
! ps_d(ikb,2,ibnd) = ps_d(ikb,2,ibnd) + &
! deeq_nc_d(ih,jh,na,3)*becpup_jkb + &
! deeq_nc_d(ih,jh,na,4)*becpdn_jkb
! deeq_nc(ih,jh,na,3)*becpup_jkb + &
! deeq_nc(ih,jh,na,4)*becpdn_jkb
! !
! END DO
! !
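
Note: the host_data use_device regions above pass the device addresses of OpenACC-managed arrays straight to GPU-capable BLAS calls. A minimal sketch of the idiom (generic names; assumes a device-capable DGEMM is linked, which is what QE's MYDGEMM wrapper arranges):

    PROGRAM host_data_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: n = 64
      REAL(KIND(1.0D0)) :: a(n,n), b(n,n), c(n,n)
      !
      a = 1.0D0 ; b = 2.0D0 ; c = 0.0D0
      !$acc data copyin(a, b) copyout(c)
      !$acc host_data use_device(a, b, c)
      CALL DGEMM( 'N', 'N', n, n, n, 1.0D0, a, n, b, n, 0.0D0, c, n )
      !$acc end host_data
      !$acc end data
      PRINT *, c(1,1)   ! expect 128.0
    END PROGRAM host_data_demo
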

View File

@@ -23,7 +23,7 @@ SUBROUTINE addusstress( sigmanlc )
! ... local variables
!
REAL(DP) :: sigma_r(3,3), sigma_g(3,3)
INTEGER :: na,ijh, ipol,jpol
INTEGER :: na, ijh, ipol, jpol
!
IF ( tqr ) THEN
sigma_r(:,:) = 0.d0
@@ -56,7 +56,7 @@ SUBROUTINE addusstress_g( sigmanlc )
!! routine).
!
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE ions_base, ONLY : nat, ntyp=>nsp, ityp
USE cell_base, ONLY : omega, tpiba
USE fft_base, ONLY : dfftp
USE fft_rho, ONLY : rho_r2g
@@ -71,7 +71,7 @@ SUBROUTINE addusstress_g( sigmanlc )
!
IMPLICIT NONE
!
REAL(DP), INTENT(INOUT) :: sigmanlc(3, 3)
REAL(DP), INTENT(INOUT) :: sigmanlc(3,3)
!! the nonlocal stress
!
! ... local variables
@@ -82,18 +82,18 @@ SUBROUTINE addusstress_g( sigmanlc )
! counters
COMPLEX(DP), ALLOCATABLE :: aux1(:,:), aux2(:,:), vg(:,:), qgm(:,:)
! work space (complex)
COMPLEX(DP) :: cfac
REAL(DP) :: fac(3,nspin), sus(3,3)
COMPLEX(DP) :: cfac
REAL(DP) :: fac(3,nspin), sus(3,3)
! auxiliary variables
REAL(DP) , ALLOCATABLE :: qmod(:), ylmk0(:,:), dylmk0(:,:), tbecsum(:,:)
! work space (real)
!
!
sus(:,:) = 0.d0
!
! Fourier transform of the total effective potential
! ... Fourier transform of the total effective potential
!
ALLOCATE( vg(ngm,nspin) )
ALLOCATE( vg(ngm,nspin) )
!$acc data create( vg )
!
DO is = 1, nspin
IF ( nspin == 4 .and. is /= 1 ) THEN
@@ -103,10 +103,10 @@ SUBROUTINE addusstress_g( sigmanlc )
ENDIF
ENDDO
!
! With k-point parallelization, distribute G-vectors across processors
! ngm_s = index of first G-vector for this processor
! ngm_e = index of last G-vector for this processor
! ngm_l = local number of G-vectors
! ... With k-point parallelization, distribute G-vectors across processors:
! ngm_s = index of first G-vector for this processor
! ngm_e = index of last G-vector for this processor
! ngm_l = local number of G-vectors
!
CALL divide( inter_pool_comm, ngm, ngm_s, ngm_e )
ngm_l = ngm_e-ngm_s+1
@@ -115,80 +115,132 @@ SUBROUTINE addusstress_g( sigmanlc )
!
ALLOCATE( aux1(ngm_l,3), aux2(ngm_l,nspin), qmod(ngm_l) )
ALLOCATE( ylmk0(ngm_l,lmaxq*lmaxq), dylmk0(ngm_l,lmaxq*lmaxq) )
!$acc data create(aux1,aux2,qmod)
!$acc data create(ylmk0,dylmk0)
!
CALL ylmr2( lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 )
#if defined(__CUDA)
!$acc host_data use_device(g,gg,ylmk0)
CALL ylmr2_gpu( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 )
!$acc end host_data
#else
CALL ylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 )
!$acc update device(ylmk0)
#endif
!
!$acc parallel loop
DO ig = 1, ngm_l
qmod(ig) = SQRT( gg(ngm_s+ig-1) ) * tpiba
ENDDO
!
! here we compute the integral Q*V for each atom,
! ... here we compute the integral Q*V for each atom,
! I = sum_G i G_a exp(-iR.G) Q_nm v^*
! (no contribution from G=0)
! (no contribution from G=0)
!
DO ipol = 1, 3
CALL dylmr2( lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
!
#if defined(__CUDA)
!$acc host_data use_device(g,gg,dylmk0)
CALL dylmr2_gpu( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
!$acc end host_data
#else
CALL dylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
!$acc update device(dylmk0)
#endif
!
DO nt = 1, ntyp
!
IF ( upf(nt)%tvanp ) THEN
nij = nh(nt)*(nh(nt)+1)/2
ALLOCATE( qgm(ngm_l,nij), tbecsum(nij,nspin) )
!$acc data create(qgm,tbecsum)
ijh = 0
DO ih = 1, nh(nt)
DO jh = ih, nh(nt)
ijh = ijh + 1
CALL dqvan2( ih, jh, nt, ipol, ngm_l, g(1,ngm_s), tpiba, &
qmod, ylmk0, dylmk0, qgm(1,ijh) )
qmod, ylmk0, dylmk0, qgm(1,ijh) )
ENDDO
ENDDO
!
DO na = 1, nat
IF (ityp(na) == nt) THEN
!
tbecsum(:,:) = becsum( 1:nij, na, 1:nspin )
!$acc kernels
tbecsum(:,:) = becsum(1:nij,na,1:nspin)
!$acc end kernels
!
CALL dgemm( 'N', 'N', 2*ngm_l, nspin, nij, 1.0_dp, &
qgm, 2*ngm_l, tbecsum, nij, 0.0_dp, aux2, 2*ngm_l )
!$acc host_data use_device(qgm,tbecsum,aux2)
CALL MYDGEMM( 'N', 'N', 2*ngm_l, nspin, nij, 1.0_dp, &
qgm, 2*ngm_l, tbecsum, nij, 0.0_dp, aux2, 2*ngm_l )
!$acc end host_data
!
#if defined(_OPENACC)
!$acc parallel loop collapse(2)
#else
!$omp parallel do default(shared) private(is, ig)
#endif
DO is = 1, nspin
DO ig = 1, ngm_l
aux2(ig,is) = aux2(ig,is) * CONJG(vg (ngm_s+ig-1, is))
END DO
END DO
ENDDO
ENDDO
#if defined(_OPENACC)
!$acc parallel loop
#else
!$omp end parallel do
!$omp parallel do default(shared) private(ig, cfac)
#endif
DO ig = 1, ngm_l
cfac = CONJG( eigts1(mill (1,ngm_s+ig-1), na) * &
eigts2(mill (2,ngm_s+ig-1), na) * &
eigts3(mill (3,ngm_s+ig-1), na) ) * tpiba
aux1(ig,1) = cfac * g(1,ngm_s+ig-1)
aux1(ig,2) = cfac * g(2,ngm_s+ig-1)
aux1(ig,3) = cfac * g(3,ngm_s+ig-1)
ENDDO
cfac = CONJG( eigts1(mill(1,ngm_s+ig-1),na) * &
eigts2(mill(2,ngm_s+ig-1),na) * &
eigts3(mill(3,ngm_s+ig-1),na) ) * tpiba
aux1(ig,1) = cfac * g(1,ngm_s+ig-1)
aux1(ig,2) = cfac * g(2,ngm_s+ig-1)
aux1(ig,3) = cfac * g(3,ngm_s+ig-1)
ENDDO
#if !defined(_OPENACC)
!$omp end parallel do
CALL DGEMM('T','N', 3, nspin, 2*ngm_l, 1.0_dp, aux1, 2*ngm_l, &
aux2, 2*ngm_l, 0.0_dp, fac, 3 )
DO is = 1, nspin
DO jpol = 1, 3
sus(ipol, jpol) = sus(ipol, jpol) - omega * fac(jpol, is)
ENDDO
ENDDO
#endif
!
!$acc data copyout( fac )
!$acc host_data use_device(aux1,aux2,fac)
CALL MYDGEMM( 'T','N', 3, nspin, 2*ngm_l, 1.0_dp, aux1, 2*ngm_l, &
aux2, 2*ngm_l, 0.0_dp, fac, 3 )
!$acc end host_data
!$acc end data
!
DO is = 1, nspin
DO jpol = 1, 3
sus(ipol,jpol) = sus(ipol,jpol) - omega * fac(jpol,is)
ENDDO
ENDDO
!
ENDIF
ENDDO
!$acc end data
DEALLOCATE( tbecsum, qgm )
ENDIF
ENDDO
!
ENDDO
!
!$acc end data
!$acc end data
DEALLOCATE( ylmk0, dylmk0 )
DEALLOCATE( aux1, aux2, qmod )
!
10 CONTINUE
!
CALL mp_sum( sus, inter_pool_comm )
!
IF (gamma_only) THEN
sigmanlc(:,:) = sigmanlc(:,:) + 2.0_dp*sus(:,:)
ELSE
sigmanlc(:,:) = sigmanlc(:,:) + sus(:,:)
ENDIF
DEALLOCATE( ylmk0, dylmk0 )
DEALLOCATE( aux1, aux2, vg, qmod )
!
!$acc end data
DEALLOCATE( vg )
!
RETURN
!

View File

@@ -5,16 +5,17 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-------------------------------------------------------------------------
SUBROUTINE compute_deff( deff, et )
!-----------------------------------------------------------------------
!! This routine computes the effective value of the D-eS coefficients
!! which appear often in many expressions in the US or PAW case.
!! This routine is for the collinear case.
!! This routine computes the effective value of the D-eS coefficients
!! which appear often in many expressions in the US or PAW case.
!! This routine is for the collinear case.
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nsp, nat, ityp
USE uspp, ONLY: deeq, qq_at, okvan
USE ions_base, ONLY: nat
USE uspp, ONLY: okvan, deeq, qq_at
USE uspp_param, ONLY: nhm
USE lsda_mod, ONLY: current_spin
!
@@ -27,24 +28,21 @@ SUBROUTINE compute_deff( deff, et )
!
! ... local variables
!
INTEGER :: nt, na, is
INTEGER :: nt, na, is, i, j
!
deff(:,:,:) = deeq(:,:,:,current_spin)
!$acc kernels present_or_copyout(deff)
!
IF (okvan) THEN
IF (.NOT. okvan) THEN
!
DO nt = 1, nsp
DO na = 1, nat
!
IF ( ityp(na) == nt ) THEN
deff(:,:,na) = deff(:,:,na) - et*qq_at(:,:,na)
ENDIF
!
ENDDO
ENDDO
deff(:,:,:) = deeq(:,:,:,current_spin)
!
ELSE
!
deff(:,:,:) = deeq(:,:,:,current_spin) - et*qq_at(:,:,:)
!
ENDIF
!
!$acc end kernels
!
RETURN
!
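
In other words, the single kernels region above evaluates, for each atom a at the current spin sigma,

\[
D^{\mathrm{eff}}_{ij,a} \;=\; D^{(\sigma)}_{ij,a} \;-\; \varepsilon\, q_{ij,a},
\]

with the \(-\varepsilon q\) term dropped in the norm-conserving case (okvan false).
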
@@ -52,7 +50,7 @@ END SUBROUTINE compute_deff
!
!
!---------------------------------------------------------------------------
SUBROUTINE compute_deff_nc( deff, et )
SUBROUTINE compute_deff_nc( deff_nc, et )
!-------------------------------------------------------------------------
!! This routine computes the effective value of the D-eS coefficients
!! which appears often in many expressions. This routine is for the
@@ -60,8 +58,8 @@ SUBROUTINE compute_deff_nc( deff, et )
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nsp, nat, ityp
USE noncollin_module, ONLY: npol, lspinorb
USE uspp, ONLY: deeq_nc, qq_at, qq_so, okvan
USE noncollin_module, ONLY: noncolin, npol, lspinorb
USE uspp, ONLY: okvan, deeq_nc, qq_so, qq_at
USE uspp_param, ONLY: nhm
USE lsda_mod, ONLY: nspin
!
@@ -69,42 +67,71 @@ SUBROUTINE compute_deff_nc( deff, et )
!
REAL(DP), INTENT(IN) :: et
!! The eigenvalues of the hamiltonian
COMPLEX(DP), INTENT(OUT) :: deff(nhm,nhm,nat,nspin)
COMPLEX(DP), INTENT(OUT) :: deff_nc(nhm,nhm,nat,nspin)
!! Effective values of the D-eS coefficients
!
! ... local variables
!
INTEGER :: nt, na, is, js, ijs
INTEGER :: nt, na, is, ijs, i, j, ias
INTEGER :: nt_v(nat), na_v(nat)
!
deff=deeq_nc
IF (okvan) THEN
!
DO nt = 1, nsp
DO na = 1, nat
!
IF ( ityp(na) == nt ) THEN
IF (lspinorb) THEN
deff(:,:,na,:) = deff(:,:,na,:) - et * qq_so(:,:,:,nt)
ELSE
ijs=0
!
DO is=1,npol
DO js=1,npol
!
ijs=ijs+1
IF (is==js) deff(:,:,na,ijs)=deff(:,:,na,ijs)-et*qq_at(:,:,na)
!
ENDDO
ENDDO
!
ENDIF
ENDIF
!
!$acc data present_or_copyout( deff_nc )
!
!$acc kernels
deff_nc(:,:,:,:) = deeq_nc(:,:,:,:)
!$acc end kernels
!
IF (okvan) THEN
!
! ... set up index arrays to fill 'deff_nc' on the GPU
i = 0
DO nt = 1, nsp
DO na = 1, nat
IF ( ityp(na)/=nt ) CYCLE
i = i + 1
nt_v(i) = nt
na_v(i) = na
ENDDO
ENDDO
!
!$acc data copyin( nt_v, na_v )
!
IF (lspinorb) THEN
!
!$acc parallel loop collapse(3)
DO ias = 1, nat
DO i = 1, nhm
DO j = 1, nhm
na = na_v(ias)
nt = nt_v(ias)
deff_nc(i,j,na,:) = deeq_nc(i,j,na,:) - CMPLX(et)*qq_so(i,j,:,nt)
ENDDO
ENDDO
ENDDO
!
ENDIF
ENDDO
!
ELSE
!
!$acc parallel loop collapse(3)
DO ias = 1, nat
DO i = 1, nhm
DO j = 1, nhm
na = na_v(ias)
!$acc loop seq
DO is = 1, npol
ijs = (is-1)*npol + is
deff_nc(i,j,na,ijs) = deeq_nc(i,j,na,ijs) - CMPLX(et*qq_at(i,j,na))
ENDDO
ENDDO
ENDDO
ENDDO
!
ENDIF
!
!$acc end data
!
ENDIF
!
!$acc end data
!
RETURN
!
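
Note: the index-array setup above exists because a doubly nested (nt, na) loop filtered by ityp(na)==nt cannot be collapsed on the GPU; precomputing the matching pairs turns it into one flat, perfectly parallel loop. A small runnable sketch of the trick (hypothetical data):

    PROGRAM index_flatten_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: nsp = 2, nat = 4
      INTEGER :: ityp(nat) = (/ 1, 2, 2, 1 /)
      INTEGER :: nt_v(nat), na_v(nat)
      REAL(KIND(1.0D0)) :: w(nat)
      INTEGER :: nt, na, i, ias
      !
      i = 0
      DO nt = 1, nsp               ! irregular, filtered double loop on the host
         DO na = 1, nat
            IF ( ityp(na) /= nt ) CYCLE
            i = i + 1
            nt_v(i) = nt ; na_v(i) = na
         ENDDO
      ENDDO
      !$acc parallel loop copyin(nt_v, na_v) copyout(w)
      DO ias = 1, nat              ! now a single, perfectly parallel loop
         w(na_v(ias)) = REAL(nt_v(ias))
      ENDDO
      PRINT *, w
    END PROGRAM index_flatten_demo
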

View File

@@ -1,155 +0,0 @@
!
! Copyright (C) 2009-2010 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------
SUBROUTINE compute_deff_gpu( deff_d, et )
!-----------------------------------------------------------------------
!! This routine computes the effective value of the D-eS coefficients
!! which appear often in many expressions in the US or PAW case.
!! This routine is for the collinear case.
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nat
USE uspp, ONLY: okvan, deeq_d, qq_at_d
USE uspp_param, ONLY: nhm
USE lsda_mod, ONLY: current_spin
!
IMPLICIT NONE
!
REAL(DP), INTENT(IN) :: et
!! The eigenvalues of the hamiltonian
COMPLEX(DP), INTENT(OUT) :: deff_d(nhm,nhm,nat)
!! Effective values of the D-eS coefficients
!
! ... local variables
!
INTEGER :: nt, na, is, i, j
!
#if defined(__CUDA)
attributes(DEVICE) :: deff_d
#endif
!
IF (.NOT. okvan) THEN
!
!$cuf kernel do (3) <<<*,*>>>
DO na = 1, nat
DO i = 1, nhm
DO j = 1,nhm
deff_d(i,j,na) = CMPLX(deeq_d(i,j,na,current_spin))
ENDDO
ENDDO
ENDDO
!
ELSE
!
!$cuf kernel do (3) <<<*,*>>>
DO na = 1, nat
DO i = 1, nhm
DO j = 1,nhm
deff_d(i,j,na) = CMPLX(deeq_d(i,j,na,current_spin) - et*qq_at_d(i,j,na))
ENDDO
ENDDO
ENDDO
!
ENDIF
!
!
RETURN
!
END SUBROUTINE compute_deff_gpu
!
!
!---------------------------------------------------------------------------
SUBROUTINE compute_deff_nc_gpu( deff_d, et )
!-------------------------------------------------------------------------
!! This routine computes the effective value of the D-eS coefficients
!! which appears often in many expressions. This routine is for the
!! noncollinear case.
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nsp, nat, ityp
USE noncollin_module, ONLY: noncolin, npol, lspinorb
USE uspp, ONLY: okvan, deeq_nc_d, qq_so_d, qq_at_d
USE uspp_param, ONLY: nhm
USE lsda_mod, ONLY: nspin
USE device_memcpy_m, ONLY: dev_memcpy
!
IMPLICIT NONE
!
REAL(DP), INTENT(IN) :: et
!! The eigenvalues of the hamiltonian
COMPLEX(DP), INTENT(OUT) :: deff_d(nhm,nhm,nat,nspin)
!! Effective values of the D-eS coefficients
!
! ... local variables
!
INTEGER :: nt, na, is, js, ijs, i, j, ias
INTEGER :: na_v(nat), nt_v(nat)
INTEGER, ALLOCATABLE :: na_d(:), nt_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: deff_d, na_d, nt_d
#endif
!
CALL dev_memcpy( deff_d, deeq_nc_d )
!
IF ( okvan ) THEN
!
ALLOCATE( nt_d(nat), na_d(nat) )
!
i = 0
DO nt = 1, nsp
DO na = 1, nat
IF ( ityp(na)/=nt ) CYCLE
i = i + 1
nt_v(i) = nt
na_v(i) = na
ENDDO
ENDDO
!
nt_d = nt_v ; na_d = na_v
!
IF (lspinorb) THEN
!
!$cuf kernel do (3) <<<*,*>>>
DO ias = 1, nat
DO i = 1, nhm
DO j = 1, nhm
na = na_d(ias)
nt = nt_d(ias)
deff_d(i,j,na,:) = deeq_nc_d(i,j,na,:) - et*qq_so_d(i,j,:,nt)
ENDDO
ENDDO
ENDDO
!
ELSE
!
!$cuf kernel do (3) <<<*,*>>>
DO ias = 1, nat
DO i = 1, nhm
DO j = 1, nhm
na = na_d(ias)
nt = nt_d(ias)
DO is = 1, npol
ijs = (is-1)*npol+is
deff_d(i,j,na,ijs) = deeq_nc_d(i,j,na,ijs) - &
et*qq_at_d(i,j,na)
ENDDO
ENDDO
ENDDO
ENDDO
!
ENDIF
!
DEALLOCATE( nt_d, na_d )
!
ENDIF
!
!
RETURN
!
END SUBROUTINE compute_deff_nc_gpu

View File

@@ -11,7 +11,8 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
!! Calculates the Fourier transform of \(d\text{Rho}_c/dG\).
!
USE kinds
USE constants, ONLY : pi, fpi
USE constants, ONLY : pi, fpi
USE upf_acc_interfaces, ONLY : simpson
!
IMPLICIT NONE
!
@@ -46,10 +47,14 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
! counter on g shells
! lower limit for loop on ngl
!
!$acc data present_or_copyin(gl,r,rab,rhoc) present_or_copyout(drhocg)
!
! G=0 term
!
IF (gl(1) < 1.0d-8) THEN
!$acc kernels
drhocg(1) = 0.0d0
!$acc end kernels
igl0 = 2
ELSE
igl0 = 1
@@ -57,12 +62,19 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
!
! G <> 0 term
!
!$omp parallel private(aux, gx, rhocg1)
#if !defined(_OPENACC)
!$omp parallel private(aux,gx,rhocg1)
#endif
!
ALLOCATE( aux(mesh) )
#if defined(_OPENACC)
!$acc parallel loop gang private(aux)
#else
!$omp do
#endif
DO igl = igl0, ngl
gx = SQRT( gl(igl) * tpiba2 )
!$acc loop vector
DO ir = 1, mesh
aux(ir) = r(ir)*rhoc(ir)*( r(ir) * COS(gx*r(ir)) / &
gx - SIN(gx*r(ir)) / gx**2 )
@@ -70,10 +82,16 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
CALL simpson( mesh, aux, rab, rhocg1 )
drhocg(igl) = fpi / omega * rhocg1
ENDDO
#if !defined(_OPENACC)
!$omp end do nowait
DEALLOCATE( aux )
#endif
!
DEALLOCATE( aux )
!$acc end data
!
#if !defined(_OPENACC)
!$omp end parallel
#endif
!
RETURN
!
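
Note: the accelerated loop above relies on a gang-private scratch array: each gang owns its own copy of aux, fills it with a vector loop, and then reduces it (the simpson from upf_acc_interfaces plays that role there). A self-contained sketch of the pattern with an explicit reduction in place of the integrator:

    PROGRAM gang_private_demo
      IMPLICIT NONE
      INTEGER, PARAMETER :: ngl = 128, mesh = 512
      REAL(KIND(1.0D0)) :: aux(mesh), out(ngl), s
      INTEGER :: igl, ir
      !
      !$acc parallel loop gang private(aux, s) copyout(out)
      DO igl = 1, ngl
         !$acc loop vector
         DO ir = 1, mesh
            aux(ir) = EXP( -REAL(ir)/REAL(igl) )   ! per-gang scratch, never on the host
         ENDDO
         s = 0.0D0
         !$acc loop vector reduction(+:s)
         DO ir = 1, mesh
            s = s + aux(ir)
         ENDDO
         out(igl) = s
      ENDDO
      PRINT *, out(1), out(ngl)
    END PROGRAM gang_private_demo
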

View File

@@ -1,229 +0,0 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
MODULE compute_drhocg_gpu_m
#if defined(__CUDA)
!
USE cudafor
!
IMPLICIT NONE
TYPE(dim3) :: drhocg_threads = dim3(32,8,1)
PUBLIC :: drhocg_threads
CONTAINS
ATTRIBUTES(global) SUBROUTINE compute_drhocg_gpu( n, tpiba2, omega, gl, r, rhoc, rab, mesh, drhocg )
!
USE cudafor
USE kinds, ONLY: DP
USE constants, ONLY: pi, fpi, eps14
!
IMPLICIT NONE
!
INTEGER, VALUE :: n, mesh
REAL(DP),VALUE :: tpiba2, omega
REAL(DP),DEVICE,INTENT(IN) :: gl(n), r(mesh), rhoc(mesh), rab(mesh)
REAL(DP),DEVICE,INTENT(OUT) :: drhocg(n)
!
INTEGER :: tx, ty, igl, ir
REAL(DP) :: mysum, val, gx, x
!
tx = threadIdx%x
ty = threadIdx%y
!
igl = (blockIdx%x - 1) * blockDim%y + ty
!
IF (igl > n ) RETURN
!
gx = SQRT(gl(igl) * tpiba2)
mysum = 0_dp
!
DO ir = tx, mesh, blockDim%x
!
val = r(ir)*rhoc(ir)*( r(ir) * COS(gx*r(ir)) / &
gx - SIN(gx*r(ir)) / gx**2 ) * rab(ir)
!
IF (ir == 1 .OR. ir == mesh) THEN
mysum = mysum + val
ELSEIF (MOD(ir,2)) THEN
mysum = mysum + 2._dp*val
ELSE
mysum = mysum + 4._dp*val
ENDIF
ENDDO
!
! Reduce by warp
val = __shfl_down(mysum,1)
mysum = mysum + val
val = __shfl_down(mysum,2)
mysum = mysum + val
val = __shfl_down(mysum,4)
mysum = mysum + val
val = __shfl_down(mysum,8)
mysum = mysum + val
val = __shfl_down(mysum,16)
mysum = mysum + val
!
IF (tx == 1) THEN
drhocg(igl) = fpi * mysum / (3._dp * omega)
ENDIF
!
END SUBROUTINE compute_drhocg_gpu
!
#endif
END MODULE compute_drhocg_gpu_m
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, mesh, r_d, rab_d, rhoc_d, &
drhocg_d )
!--------------------------------------------------------------------------
!! Calculates the Fourier transform of \(d\text{Rho}_c/dG\).
!---cuda kernel version
!
USE cudafor
USE kinds
USE constants, ONLY : pi, fpi
USE compute_drhocg_gpu_m
!
IMPLICIT NONE
!
INTEGER :: ngl
!! input: the number of g shell
INTEGER :: mesh
!! input: the number of radial mesh points
REAL(DP), INTENT(IN), DEVICE :: gl_d(ngl)
!! input: the number of G shells
REAL(DP), INTENT(IN), DEVICE :: r_d(mesh)
!! input: the radial mesh
REAL(DP), INTENT(IN), DEVICE :: rab_d(mesh)
!! input: the derivative of the radial mesh
REAL(DP), INTENT(IN), DEVICE :: rhoc_d(mesh)
!! input: the radial core charge
REAL(DP), INTENT(IN) :: omega
!! input: the volume of the unit cell
REAL(DP), INTENT(IN) :: tpiba2
!! input: 2 times pi / alat
REAL(DP), INTENT(OUT), DEVICE :: drhocg_d(ngl)
!! Fourier transform of d Rho_c/dG
!
! ... local variables
!
REAL(DP) :: gx, gl1
! the modulus of g for a given shell
INTEGER :: igl, blocks, igl0
TYPE(dim3) :: threads
! counter on radial mesh points
! counter on g shells
! lower limit for loop on ngl
!
! G=0 term
!
gl1 = gl_d(1)
IF (gl1 < 1.0d-8) THEN
drhocg_d(1) = 0.0_DP
igl0 = 2
ELSE
igl0 = 1
ENDIF
!
! G <> 0 term
!
threads = dim3(32,8,1)
blocks = CEILING(REAL(ngl-igl0+1)/8)
CALL compute_drhocg_gpu<<<blocks, threads>>>( ngl-igl0+1, tpiba2, omega, gl_d(igl0), r_d, &
rhoc_d, rab_d, mesh, drhocg_d(igl0) )
!
RETURN
!
END SUBROUTINE deriv_drhoc_gpu
!
!
#else
!
!
SUBROUTINE deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, mesh, r_d, rab_d, rhoc_d, &
drhocg_d )
!--------------------------------------------------------------------------
!! Calculates the Fourier transform of \(d\text{Rho}_c/dG\).
!---cuf kernel loop version---
!
USE kinds
USE constants, ONLY: pi, fpi
!
#if defined(__CUDA)
USE cudafor
#endif
USE simpsn_gpum, ONLY: simpsn_gpu_dev
USE device_fbuff_m, ONLY: dev_buf
!
IMPLICIT NONE
!
INTEGER :: ngl
!! input: the number of g shell
INTEGER :: mesh
!! input: the number of radial mesh points
REAL(DP), INTENT(IN) :: gl_d(ngl)
!! input: the number of G shells
REAL(DP), INTENT(IN) :: r_d(mesh)
!! input: the radial mesh
REAL(DP), INTENT(IN) :: rab_d(mesh)
!! input: the derivative of the radial mesh
REAL(DP), INTENT(IN) :: rhoc_d(mesh)
!! input: the radial core charge
REAL(DP), INTENT(IN) :: omega
!! input: the volume of the unit cell
REAL(DP), INTENT(IN) :: tpiba2
!! input: 2 times pi / alat
REAL(DP), INTENT(OUT) :: drhocg_d(ngl)
!! Fourier transform of d Rho_c/dG
!
! ... local variables
!
REAL(DP) :: gx, gl1
! the modulus of g for a given shell
REAL(DP), ALLOCATABLE :: aux_d(:,:)
! auxiliary memory for integration
INTEGER :: ir, igl, igl0
! counter on radial mesh points
! counter on g shells
!
#if defined(__CUDA)
attributes(DEVICE) :: gl_d, r_d, rab_d, rhoc_d, drhocg_d, aux_d
#endif
!
! G=0 term
!
gl1 = gl_d(1)
IF (gl1 < 1.0d-8) THEN
drhocg_d(1) = 0.0_DP
igl0 = 2
ELSE
igl0 = 1
ENDIF
!
! G <> 0 term
!
ALLOCATE( aux_d(mesh,ngl) )
!
!$cuf kernel do (1)
DO igl = igl0, ngl
gx = SQRT( gl_d(igl) * tpiba2 )
DO ir = 1, mesh
aux_d(ir,igl) = r_d(ir)*rhoc_d(ir)*( r_d(ir) * COS(gx*r_d(ir)) / &
gx - SIN(gx*r_d(ir)) / gx**2 )
ENDDO
CALL simpsn_gpu_dev( mesh, aux_d(:,igl), rab_d, drhocg_d(igl) )
drhocg_d(igl) = fpi / omega * drhocg_d(igl)
ENDDO
!
DEALLOCATE( aux_d )
!
RETURN
!
END SUBROUTINE deriv_drhoc_gpu
!
#endif

View File

@@ -12,7 +12,8 @@ SUBROUTINE drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, rhocg )
!! Calculates the Fourier transform of the core charge.
!
USE kinds
USE constants, ONLY: pi, fpi
USE constants, ONLY: pi, fpi
USE upf_acc_interfaces, ONLY: simpson, sph_bes_acc
!
IMPLICIT NONE
!
@@ -40,51 +41,83 @@ SUBROUTINE drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, rhocg )
REAL(DP) :: gx, rhocg1
! the modulus of g for a given shell
! the Fourier transform
REAL(DP), ALLOCATABLE :: aux(:)
REAL(DP), ALLOCATABLE :: aux(:), gxv(:)
! auxiliary memory for integration
INTEGER :: ir, igl, igl0
! counter on radial mesh points
! counter on g shells
! lower limit for loop on ngl
!
!--------PROVISIONAL------ gl def needs to be adjusted---------
ALLOCATE( gxv(ngl) )
DO igl = 1, ngl
gxv(igl) = SQRT(gl(igl) * tpiba2)
ENDDO
!--------------------------------------------------------------
!
!$omp parallel private(aux, gx, rhocg1)
#if !defined(_OPENACC)
!$omp parallel private(aux,gx,rhocg1)
#endif
!
ALLOCATE( aux(mesh) )
#if defined(_OPENACC)
!$acc data present_or_copyin(gl,r,rab,rhoc) present_or_copyout(rhocg)
!$acc data create(aux)
#endif
!
! G=0 term
!
#if !defined(_OPENACC)
!$omp single
#endif
IF ( gl(1) < 1.0d-8 ) THEN
!$acc parallel loop
DO ir = 1, mesh
aux(ir) = r(ir)**2 * rhoc(ir)
ENDDO
!$acc kernels
CALL simpson( mesh, aux, rab, rhocg1 )
rhocg(1) = fpi * rhocg1 / omega
!$acc end kernels
igl0 = 2
ELSE
igl0 = 1
ENDIF
#if !defined(_OPENACC)
!$omp end single
#endif
!$acc end data
!
! G <> 0 term
!
#if defined(_OPENACC)
!$acc parallel loop gang private(aux) copyin(gxv)
#else
!$omp do
#endif
DO igl = igl0, ngl
gx = SQRT(gl(igl) * tpiba2)
CALL sph_bes( mesh, r, gx, 0, aux )
gx = gxv(igl)
CALL sph_bes_acc( mesh, r, gx, 0, aux )
!$acc loop vector
DO ir = 1, mesh
aux(ir) = r(ir)**2 * rhoc(ir) * aux(ir)
aux(ir) = r(ir)**2 * rhoc(ir) * aux(ir)
ENDDO
CALL simpson( mesh, aux, rab, rhocg1 )
rhocg(igl) = fpi * rhocg1 / omega
ENDDO
#if !defined(_OPENACC)
!$omp end do nowait
DEALLOCATE( aux )
#endif
!
!$omp end parallel
!$acc end data
!
DEALLOCATE( aux )
#if !defined(_OPENACC)
!$omp end parallel
#endif
!
DEALLOCATE( gxv )
!
RETURN
!
END SUBROUTINE drhoc

View File

@@ -7,142 +7,192 @@
!
!
!----------------------------------------------------------------------
subroutine dvloc_of_g (mesh, msh, rab, r, vloc_at, zp, tpiba2, ngl, gl, &
omega, dvloc)
SUBROUTINE dvloc_of_g( mesh, msh, rab, r, vloc_at, zp, tpiba2, ngl, gl, &
omega, dvloc )
!----------------------------------------------------------------------
!
! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
!! This routine gives:
!! \[ \text{dvloc} = D\text{Vloc}(g^2)/Dg^2 = (1/2g)\ D\text{Vloc}(g)/Dg
!! \]
!
USE kinds
USE constants , ONLY : pi, fpi, e2, eps8
USE constants, ONLY: pi, fpi, e2, eps8
USE Coul_cut_2D, ONLY: do_cutoff_2D
USE esm, ONLY : do_comp_esm, esm_bc
implicit none
USE esm, ONLY: do_comp_esm, esm_bc
!
! first the dummy variables
IMPLICIT NONE
!
integer, intent(in) :: ngl, mesh, msh
! the number of shell of G vectors
! max number of mesh points
! number of mesh points for radial integration
real(DP), intent(in) :: zp, rab (mesh), r (mesh), vloc_at (mesh), &
tpiba2, omega, gl (ngl)
! valence pseudocharge
! the derivative of the radial grid
! the radial grid
! the pseudo on the radial grid
! 2 pi / alat
! the volume of the unit cell
! the moduli of g vectors for each s
INTEGER, INTENT(IN) :: ngl
!! the number of shell of G vectors
INTEGER, INTENT(IN) :: mesh
!! max number of mesh points
INTEGER, INTENT(IN) :: msh
!! number of mesh points for radial integration
REAL(DP), INTENT(IN) :: zp
!! valence pseudocharge
REAL(DP), INTENT(IN) :: rab(mesh)
!! the derivative of the radial grid
REAL(DP), INTENT(IN) :: r(mesh)
!! the radial grid
REAL(DP), INTENT(IN) :: vloc_at(mesh)
!! the pseudo on the radial grid
REAL(DP), INTENT(IN) :: tpiba2
!! 2 pi / alat
REAL(DP), INTENT(IN) :: omega
!! the volume of the unit cell
REAL(DP), INTENT(IN) :: gl(ngl)
!! the moduli of g vectors for each s
REAL(DP), INTENT(OUT) :: dvloc(ngl)
!! the Fourier transform dVloc/dG
!
real(DP), intent(out) :: dvloc (ngl)
! the fourier transform dVloc/dG
! ... local variables
!
real(DP) :: vlcp, g2a, gx
real(DP), allocatable :: aux (:), aux1 (:)
integer :: i, igl, igl0
REAL(DP) :: vlcp, g2a, gx, vlcp_0, vlcp_1
REAL(DP), ALLOCATABLE :: aux(:,:), aux1(:)
INTEGER :: i, igl, igl0
! counter on erf functions or gaussians
! counter on g shells vectors
! first shell with g != 0
REAL(DP), PARAMETER :: r12=1.0d0/3.0d0
!
!$acc data present( dvloc, gl )
!
! the G=0 component is not computed
if (gl (1) < eps8) then
dvloc (1) = 0.0d0
IF (gl(1) < eps8) THEN
!$acc kernels
dvloc(1) = 0.0d0
!$acc end kernels
igl0 = 2
else
ELSE
igl0 = 1
endif
ENDIF
!
! Pseudopotentials in numerical form (Vloc contains the local part)
! In order to perform the Fourier transform, a term erf(r)/r is
! subtracted in real space and added again in G space
allocate (aux1( mesh))
!
! This is the part of the integrand function
! independent of |G| in real space
ALLOCATE( aux1(mesh) )
!
do i = 1, msh
aux1 (i) = r (i) * vloc_at (i) + zp * e2 * erf (r (i) )
enddo
! This is the part of the integrand function
! independent of |G| in real space
!
!$omp parallel private(aux, gx, vlcp, g2a)
ALLOCATE( aux(mesh,ngl) )
!
allocate (aux( mesh))
!$acc data copyin(r,rab) create(aux1,aux)
!
!$acc parallel loop copyin(vloc_at)
DO i = 1, msh
aux1(i) = r(i)*vloc_at(i) + zp*e2*ERF(r(i))
ENDDO
!
#if defined(_OPENACC)
!$acc parallel loop gang present(aux,aux1,rab,r,dvloc)
#else
!$omp parallel private( gx, vlcp, vlcp_1, vlcp_0, g2a )
!$omp do
do igl = igl0, ngl
gx = sqrt (gl (igl) * tpiba2)
#endif
DO igl = igl0, ngl
!
gx = SQRT(gl(igl)*tpiba2)
!
! and here we perform the integral, after multiplying for the |G|
! dependent part
!
! DV(g)/Dg = Integral of r (Dj_0(gr)/Dg) V(r) dr
do i = 1, msh
aux (i) = aux1 (i) * (r (i) * cos (gx * r (i) ) / gx - sin (gx &
* r (i) ) / gx**2)
enddo
call simpson (msh, aux, rab, vlcp)
!
!$acc loop seq
DO i = 1, msh
aux(i,igl) = aux1(i)*(r(i)*COS(gx*r(i))/gx - SIN(gx*r(i))/gx**2)
ENDDO
!
!----Simpson int.---
vlcp_0 = 0.0d0
!$acc loop seq reduction(+:vlcp_0)
DO i = 2, msh-1, 2
vlcp_0 = vlcp_0 + ( aux(i-1,igl)*rab(i-1) + 4.0d0*aux(i,igl)*rab(i) + &
aux(i+1,igl)*rab(i+1) )*r12
ENDDO
!------
vlcp_1 = vlcp_0 * fpi / omega / 2.0d0 / gx
!
! DV(g^2)/Dg^2 = (DV(g)/Dg)/2g
vlcp = fpi / omega / 2.0d0 / gx * vlcp
! for ESM stress
!vlcp = fpi / omega / 2.0d0 / gx * vlcp
!
! for ESM stress
! In ESM, vloc and dvloc have only short term.
IF ( ( .not. do_comp_esm ) .or. ( esm_bc .eq. 'pbc' ) ) THEN
IF ( (( .NOT. do_comp_esm ) .OR. ( esm_bc .EQ. 'pbc' )) .AND. &
.NOT.do_cutoff_2D ) THEN
! subtract the long-range term
IF (.not.do_cutoff_2D) then ! 2D cutoff: do not re-add LR part here (re-added later in stres_loc)
g2a = gl (igl) * tpiba2 / 4.d0
vlcp = vlcp + fpi / omega * zp * e2 * exp ( - g2a) * (g2a + &
1.d0) / (gl (igl) * tpiba2) **2
ENDIF
END IF
dvloc (igl) = vlcp
enddo
! 2D cutoff: do not re-add LR part here (re-added later in stres_loc)
g2a = gl(igl) * tpiba2 / 4.d0
vlcp = vlcp_1 + fpi / omega * zp * e2 * EXP(-g2a) * (g2a + 1.d0) / &
(gl(igl)*tpiba2)**2
ELSE
vlcp = vlcp_1
ENDIF
dvloc(igl) = vlcp
ENDDO
#if !defined(_OPENACC)
!$omp end do nowait
deallocate (aux)
!$omp end parallel
#else
!$acc end data
!$acc end data
#endif
!
deallocate (aux1)
return
end subroutine dvloc_of_g
DEALLOCATE( aux )
DEALLOCATE( aux1 )
!
RETURN
!
END SUBROUTINE dvloc_of_g
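
The "Simpson int." block above open-codes QE's composite Simpson rule so the quadrature can run inside the gang loop: each pair of consecutive intervals contributes weights 1-4-1, the grid metric enters through rab, and r12 = 1/3 is the global prefactor. A standalone sketch of the same quadrature, assuming (as for the radial grids used here) an odd number of mesh points:

      FUNCTION simpson_sum( msh, f, rab ) RESULT( s )
        IMPLICIT NONE
        !$acc routine seq
        INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(14,200)
        INTEGER,  INTENT(IN) :: msh
        REAL(DP), INTENT(IN) :: f(msh), rab(msh)
        REAL(DP) :: s
        REAL(DP), PARAMETER :: r12 = 1.0_DP/3.0_DP
        INTEGER :: i
        s = 0.0_DP
        ! composite Simpson rule: weights 1-4-1 over interval pairs
        DO i = 2, msh-1, 2
           s = s + ( f(i-1)*rab(i-1) + 4.0_DP*f(i)*rab(i) + f(i+1)*rab(i+1) )*r12
        ENDDO
      END FUNCTION simpson_sum

Inlining the sum (rather than calling the host library routine simpson) is what lets it execute as a sequential loop nested inside the !$acc parallel loop gang region above.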
!
!----------------------------------------------------------------------
subroutine dvloc_coul (zp, tpiba2, ngl, gl, omega, dvloc)
SUBROUTINE dvloc_coul( zp, tpiba2, ngl, gl, omega, dvloc )
!----------------------------------------------------------------------
!
! Fourier transform of the Coulomb potential - For all-electron
! calculations, in specific cases only, for testing purposes
!! Fourier transform of the Coulomb potential - For all-electron
!! calculations, in specific cases only, for testing purposes.
!
USE kinds
USE constants , ONLY : fpi, e2, eps8
implicit none
USE constants, ONLY : fpi, e2, eps8
!
integer, intent(in) :: ngl
! the number of shell of G vectors
real(DP), intent(in) :: zp, tpiba2, omega, gl (ngl)
! valence pseudocharge
! 2 pi / alat
! the volume of the unit cell
! the moduli of g vectors for each s
real(DP), intent(out) :: dvloc (ngl)
! fourier transform: dvloc = D Vloc (g^2) / D g^2 = 4pi zp e^2/omega /G^4
IMPLICIT NONE
!
integer :: igl0
INTEGER, INTENT(IN) :: ngl
!! the number of shells of G vectors
REAL(DP), INTENT(IN) :: zp
!! valence pseudocharge
REAL(DP), INTENT(IN) :: tpiba2
!! 2 pi / alat
REAL(DP), INTENT(IN) :: omega
!! the volume of the unit cell
REAL(DP), INTENT(IN) :: gl(ngl)
!! the moduli of g vectors for each shell
REAL(DP), INTENT(OUT) :: dvloc(ngl)
!! Fourier transform:
!! dvloc = D Vloc (g^2) / D g^2 = 4pi zp e^2/omega /G^4
!
INTEGER :: igl0
! first shell with g != 0
!
!$acc data present( dvloc, gl )
!
! the G=0 component is 0
if (gl (1) < eps8) then
dvloc (1) = 0.0d0
IF (gl(1) < eps8) THEN
!$acc kernels
dvloc(1) = 0.0d0
!$acc end kernels
igl0 = 2
else
ELSE
igl0 = 1
endif
dvloc (igl0:ngl) = fpi * zp * e2 / omega / ( tpiba2 * gl (igl0:ngl) ) ** 2
return
end subroutine dvloc_coul
ENDIF
!
!$acc kernels
dvloc(igl0:ngl) = fpi*zp*e2 / omega / (tpiba2*gl(igl0:ngl))**2
!$acc end kernels
!
!$acc end data
!
RETURN
!
END SUBROUTINE dvloc_coul
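
The closed form in the kernels region above follows directly from the bare Coulomb potential; with G^2 = tpiba2*gl(igl) the array expression is exactly this derivative. As a worked check:

\[
V_{\rm loc}(G) = -\frac{4\pi\, z_p\, e^2}{\Omega\, G^{2}}
\quad\Longrightarrow\quad
\frac{\partial V_{\rm loc}}{\partial (G^{2})} = \frac{4\pi\, z_p\, e^2}{\Omega\, G^{4}} .
\]
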

View File

@ -1,252 +0,0 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
MODULE compute_dvloc_gpum
#if defined(__CUDA)
USE cudafor
IMPLICIT NONE
TYPE(dim3) :: dvloc_threads = dim3(32,8,1)
PUBLIC :: dvloc_threads
CONTAINS
ATTRIBUTES( global ) SUBROUTINE compute_dvloc_gpu( mesh, n, tpiba2, omega, zp, gl, aux1, r, rab ,dvloc, do_cutoff_2D, do_comp_esm, esm_bc_l )
!
!
USE cudafor
USE kinds
USE constants, ONLY : pi, fpi, e2, eps8
!
IMPLICIT NONE
!
INTEGER, VALUE :: n, mesh
REAL(DP), VALUE :: zp, tpiba2, omega
LOGICAL, VALUE :: do_cutoff_2D, do_comp_esm, esm_bc_l
REAL(DP), DEVICE, INTENT(IN) :: rab(mesh), r(mesh), gl(n)
REAL(DP), DEVICE, INTENT(IN) :: aux1(mesh)
!
REAL(DP), DEVICE, INTENT(OUT) :: dvloc(n)
! the Fourier transform dVloc/dG
!
REAL(DP) :: val, mysum
REAL(DP) :: g2a, gx
!
INTEGER :: tx, ty, ir, igl
!
tx = threadIdx%x
ty = threadIdx%y
!
igl = (blockIdx%x - 1) * blockDim%y + ty
!
IF (igl > n ) RETURN
!
gx = SQRT(gl(igl) * tpiba2)
mysum = 0.d0
!
DO ir = tx, mesh, blockDim%x
!
val = aux1(ir) * ( r(ir) * COS(gx*r(ir)) / gx - SIN(gx*r(ir)) / gx**2 )
val = val * rab(ir)
!
IF (ir == 1 .OR. ir == mesh) THEN
mysum = mysum + val
ELSEIF ( MOD(ir,2) ) THEN
mysum = mysum + 2.d0*val
ELSE
mysum = mysum + 4.d0*val
ENDIF
!
ENDDO
!
!
! Reduce by warp
val = __shfl_down(mysum,1)
mysum = mysum + val
val = __shfl_down(mysum,2)
mysum = mysum + val
val = __shfl_down(mysum,4)
mysum = mysum + val
val = __shfl_down(mysum,8)
mysum = mysum + val
val = __shfl_down(mysum,16)
mysum = mysum + val
!
IF (tx == 1) THEN
!
mysum = mysum * fpi / omega / 2.0d0 / gx / 3.d0
!
IF ( (.NOT. do_comp_esm) .OR. esm_bc_l ) THEN
IF (.NOT. do_cutoff_2D) THEN
g2a = gl(igl) * tpiba2 / 4.d0
mysum = mysum + fpi / omega * zp * e2 * EXP(-g2a) * (g2a + &
1.d0) / (gl(igl)*tpiba2)**2
ENDIF
ENDIF
!
dvloc(igl) = mysum
!
ENDIF
!
!
RETURN
!
END SUBROUTINE compute_dvloc_gpu
!
#endif
END MODULE compute_dvloc_gpum
!----------------------------------------------------------------------
SUBROUTINE dvloc_of_g_gpu( mesh, msh, rab_d, r_d, vloc_at_d, zp, tpiba2, &
ngl, gl_d, omega, dvloc_d ) !, igl0 )
!----------------------------------------------------------------------
!
! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
!
#if defined(__CUDA)
USE cudafor
#endif
USE kinds
USE constants, ONLY : pi, fpi, e2, eps8
USE Coul_cut_2D, ONLY : do_cutoff_2D
USE esm, ONLY : do_comp_esm, esm_bc
#if defined(__CUDA)
USE compute_dvloc_gpum, ONLY : compute_dvloc_gpu
#endif
!
IMPLICIT NONE
!
! first the dummy variables
!
INTEGER, INTENT(IN) :: mesh, msh, ngl !, igl0
! the number of shell of G vectors
! max number of mesh points
! number of mesh points for radial integration
!
REAL(DP), INTENT(IN) :: rab_d(mesh), r_d(mesh), vloc_at_d(mesh), gl_d(ngl)
REAL(DP), INTENT(IN) :: zp, tpiba2, omega
!
REAL(DP), INTENT(OUT) :: dvloc_d(ngl)
! the Fourier transform dVloc/dG
!
#if defined(__CUDA)
attributes(DEVICE) :: dvloc_d, rab_d, r_d , gl_d, vloc_at_d
#endif
!
! valence pseudocharge
! the derivative of the radial grid
! the radial grid
! the pseudo on the radial grid
! 2 pi / alat
! the volume of the unit cell
! the moduli of g vectors for each s
!
LOGICAL :: esm_bc_lg
!
REAL(DP), ALLOCATABLE :: aux1_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: aux1_d
!
TYPE(dim3) :: threads
#endif
!
REAL(DP) :: gl1
INTEGER :: i, igl, blocks, igl0
!
! counter on erf functions or gaussians
! counter on g shells vectors
! first shell with g != 0
!
! the G=0 component is not computed
gl1 = gl_d(1)
IF (gl1 < eps8) THEN
dvloc_d(1) = 0.0_DP
igl0 = 2
ELSE
igl0 = 1
ENDIF
!
! Pseudopotentials in numerical form (Vloc contains the local part)
! In order to perform the Fourier transform, a term erf(r)/r is
! subtracted in real space and added again in G space
!
!
ALLOCATE( aux1_d(mesh) )
!
! This is the part of the integrand function
! indipendent of |G| in real space
!
#if defined(__CUDA)
!$cuf kernel do(1)<<<*,*>>>
DO i = 1, msh
aux1_d(i) = r_d(i) * vloc_at_d(i) + zp * e2 * erf(r_d(i))
ENDDO
!
esm_bc_lg = ( esm_bc == 'pbe' )
!
threads = dim3(32,8,1)
blocks = CEILING(REAL(ngl-igl0+1)/8)
CALL compute_dvloc_gpu<<<blocks, threads>>>( msh, ngl-igl0+1, tpiba2, omega, zp, gl_d(igl0), &
aux1_d, r_d, rab_d, dvloc_d(igl0), do_cutoff_2D, &
do_comp_esm, esm_bc_lg )
!
#else
CALL errore( 'dvloc_of_g_gpu' , 'GPU version of dvloc_of_g called but not compiled.', 0 )
#endif
DEALLOCATE( aux1_d )
!
!
RETURN
!
END SUBROUTINE dvloc_of_g_gpu
!
!
!----------------------------------------------------------------------
SUBROUTINE dvloc_coul_gpu( zp, tpiba2, ngl, gl_d, omega, dvloc_d )
!----------------------------------------------------------------------
!! Fourier transform of the Coulomb potential - For all-electron
!! calculations, in specific cases only, for testing purposes.
!
USE kinds
USE constants, ONLY : fpi, e2, eps8
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ngl
! the number of shell of G vectors
! first shell with g != 0
REAL(DP), INTENT(IN) :: zp, tpiba2, omega
! valence pseudocharge
! 2 pi / alat
! the volume of the unit cell
REAL(DP), INTENT(IN) :: gl_d(ngl)
! the moduli of g vectors for each s
REAL(DP), INTENT(OUT) :: dvloc_d(ngl)
! fourier transform: dvloc = D Vloc (g^2) / D g^2 = 4pi zp e^2/omega /G^4
INTEGER :: j, igl0
REAL(DP) :: gl1
!
#if defined(__CUDA)
attributes(DEVICE) :: dvloc_d, gl_d
#endif
!
! the G=0 component is 0
gl1 = gl_d(1)
IF (gl1 < eps8) then
dvloc_d(1) = 0.0_DP
igl0 = 2
ELSE
igl0 = 1
ENDIF
!
!$cuf kernel do(1) <<<*,*>>>
DO j = igl0, ngl
dvloc_d(j) = fpi * zp * e2 / omega / ( tpiba2 * gl_d(j) )**2
ENDDO
!
RETURN
!
END SUBROUTINE dvloc_coul_gpu
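
The deleted module above reduced the radial-mesh sum by hand, with an explicit __shfl_down tree across a 32-thread warp; the OpenACC path that replaces it expresses the same per-shell sum as a gang loop with a vector-level reduction and leaves warp scheduling to the compiler. A minimal equivalence sketch (reduce_shells and the weight array w are illustrative names, not from the tree):

      SUBROUTINE reduce_shells( n, npt, w, dvloc )
        IMPLICIT NONE
        INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(14,200)
        INTEGER,  INTENT(IN)  :: n, npt
        REAL(DP), INTENT(IN)  :: w(npt,n)
        REAL(DP), INTENT(OUT) :: dvloc(n)
        REAL(DP) :: mysum
        INTEGER  :: igl, ir
        ! one gang per G shell; the vector reduction replaces the
        ! explicit warp-shuffle tree of the deleted CUDA kernel
        !$acc parallel loop gang copyin(w) copyout(dvloc)
        DO igl = 1, n
           mysum = 0.0_DP
           !$acc loop vector reduction(+:mysum)
           DO ir = 1, npt
              mysum = mysum + w(ir,igl)
           ENDDO
           dvloc(igl) = mysum
        ENDDO
      END SUBROUTINE reduce_shells
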

View File

@ -1134,7 +1134,7 @@ SUBROUTINE dprojdtau_k_gpu( spsi_d, alpha, na, ijkb0, ipol, ik, nb_s, nb_e, myke
offsetU_back1, ldim_u, backall, lda_plus_u_kind, &
Hubbard_projectors, oatwfc
USE wvfct, ONLY : nbnd, npwx, wg
USE uspp, ONLY : okvan, nkb, qq_at_d
USE uspp, ONLY : okvan, nkb
USE uspp_param, ONLY : nh
USE becmod_subs_gpum, ONLY : calbec_gpu
USE mp_bands, ONLY : intra_bgrp_comm
@ -1530,7 +1530,7 @@ SUBROUTINE matrix_element_of_dSdtau_gpu (alpha, ipol, ik, ijkb0, lA, A, lB, B, A
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE cell_base, ONLY : tpiba
USE wvfct, ONLY : npwx, wg
USE uspp, ONLY : nkb, okvan, vkb, qq_at_d
USE uspp, ONLY : nkb, okvan, vkb, qq_at
USE uspp_param, ONLY : nh
USE klist, ONLY : igk_k_d, ngk
USE becmod_subs_gpum, ONLY : calbec_gpu
@ -1585,10 +1585,10 @@ SUBROUTINE matrix_element_of_dSdtau_gpu (alpha, ipol, ik, ijkb0, lA, A, lB, B, A
CALL dev_buf%lock_buffer(qq , [nh(nt),nh(nt)], ierr) ! ALLOCATE ( qq(nh(nt),nh(nt)) )
IF ( ierr /= 0 ) CALL errore('matrix_element_of_dSdtau_gpu','Buffers allocation failed',ierr)
!
!$cuf kernel do(2)
!$acc parallel loop collapse(2) present(qq_at)
DO jh=1,nh_nt
DO ih=1,nh_nt
qq(ih,jh) = CMPLX(qq_at_d(ih,jh,alpha), 0.0d0, kind=DP)
qq(ih,jh) = CMPLX(qq_at(ih,jh,alpha), 0.0d0, kind=DP)
ENDDO
ENDDO
!
@ -1703,7 +1703,7 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, &
offsetU_back, offsetU_back1, ldim_u, backall, &
Hubbard_projectors
USE wvfct, ONLY : nbnd, npwx, wg
USE uspp, ONLY : nkb, vkb, qq_at_d
USE uspp, ONLY : nkb, vkb, qq_at
USE uspp_param, ONLY : nh
USE wavefunctions, ONLY : evc
USE becmod_gpum, ONLY : bec_type_d, becp_d
@ -1900,12 +1900,12 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, &
CALL dev_memset ( betapsi_d, 0.0_dp )
!
! here starts band parallelization
!$cuf kernel do(2)
!$acc parallel loop collapse(2) present(qq_at)
DO ih = 1, nh_nt
DO ibnd = nb_s, nb_e
DO jh = 1, nh_nt
betapsi_d(ih,ibnd) = betapsi_d(ih,ibnd) + &
qq_at_d(ih,jh,alpha) * dbetapsi_d(jh,ibnd)
qq_at(ih,jh,alpha) * dbetapsi_d(jh,ibnd)
ENDDO
ENDDO
ENDDO
@ -1918,12 +1918,12 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, &
CALL dev_memset ( betapsi_d, 0.0_dp )
!
becpr_d => becp_d%r_d
!$cuf kernel do(2)
!$acc parallel loop collapse(2) present(qq_at)
DO ih = 1, nh_nt
DO ibnd = nb_s, nb_e
DO jh = 1, nh_nt
betapsi_d(ih,ibnd) = betapsi_d(ih,ibnd) + &
qq_at_d(ih,jh,alpha) * betapsi0_d(jh,ibnd)
qq_at(ih,jh,alpha) * betapsi0_d(jh,ibnd)
ENDDO
ENDDO
ENDDO

View File

@ -182,7 +182,7 @@ SUBROUTINE force_us_gpu( forcenl )
#if defined(__CUDA)
USE cublas
#endif
USE uspp, ONLY : qq_at_d, deeq_d
USE uspp, ONLY : qq_at, deeq
USE wvfct_gpum, ONLY : wg_d, using_wg_d, et_d, using_et_d
!
IMPLICIT NONE
@ -230,9 +230,11 @@ SUBROUTINE force_us_gpu( forcenl )
IF ( ityp(na) == nt ) THEN
ijkb0 = ofsbeta(na)
! this is \sum_j q_{ij} <beta_j|psi>
!$acc host_data use_device(qq_at)
CALL DGEMM ('N','N', nh(nt), becp_d%nbnd_loc, nh(nt), &
1.0_dp, qq_at_d(1,1,na), nhm, becp_d%r_d(ijkb0+1,1),&
1.0_dp, qq_at(1,1,na), nhm, becp_d%r_d(ijkb0+1,1),&
nkb, 0.0_dp, aux_d, nh(nt) )
!$acc end host_data
! multiply by -\epsilon_n
!$cuf kernel do(2)
DO ih = 1, nh_nt
@ -243,10 +245,12 @@ SUBROUTINE force_us_gpu( forcenl )
END DO
! add \sum_j d_{ij} <beta_j|psi>
!$acc host_data use_device(deeq)
CALL DGEMM ('N','N', nh(nt), becp_d%nbnd_loc, nh(nt), &
1.0_dp, deeq_d(1,1,na,current_spin), nhm, &
1.0_dp, deeq(1,1,na,current_spin), nhm, &
becp_d%r_d(ijkb0+1,1), nkb, 1.0_dp, aux_d, nh(nt) )
!$acc end host_data
!
! Auxiliary variable to perform the reduction with cuf kernels
forcenl_ipol = 0.0_dp
!$cuf kernel do(2)

View File

@ -98,7 +98,7 @@ SUBROUTINE init_run()
gg_d = gg
END IF
#endif
!$acc update device(mill, g)
!$acc update device(mill, g, gg)
!
IF (do_comp_esm) CALL esm_init(.NOT. lrism)
!

View File

@ -9,13 +9,13 @@ MODULE uspp_init
!
PRIVATE
PUBLIC :: init_us_2, gen_us_dj, gen_us_dy
PUBLIC :: gen_us_dj_gpu, gen_us_dy_gpu
!
CONTAINS
!
!----------------------------------------------------------------------
SUBROUTINE init_us_2( npw_, igk_, q_, vkb_, run_on_gpu_)
SUBROUTINE init_us_2( npw_, igk_, q_, vkb_, run_on_gpu_ )
!----------------------------------------------------------------------
!! wrapper to call init_us_2_base
!! Wrapper to call init_us_2_base.
!! Calculates beta functions (Kleinman-Bylander projectors), with
!! structure factor, for all atoms, in reciprocal space.
!
@ -40,38 +40,42 @@ CONTAINS
!! beta functions (npw_ <= npwx)
LOGICAL, OPTIONAL, INTENT(IN) :: run_on_gpu_
!! whether you wish to run on gpu in case use_gpu is true
!!
!
LOGICAL :: run_on_gpu
!
!
run_on_gpu = .false.
if(present(run_on_gpu_)) run_on_gpu = run_on_gpu_
run_on_gpu = .FALSE.
IF (PRESENT(run_on_gpu_)) run_on_gpu = run_on_gpu_
!
CALL start_clock( 'init_us_2' )
if(use_gpu.and.run_on_gpu) then
!
IF (use_gpu .AND. run_on_gpu) THEN
!
!$acc data present(igk_(1:npw_), mill(:,:), g(:,:), vkb_(1:npwx,1:nkb), eigts1(:,:), eigts2(:,:), eigts3(:,:))
!$acc host_data use_device(eigts1, eigts2, eigts3, mill, g, igk_, vkb_)
CALL init_us_2_base_gpu(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega,&
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,&
vkb_ )
!$acc data present( igk_(1:npw_), mill(:,:), g(:,:), vkb_(1:npwx,1:nkb), &
!$acc eigts1(:,:), eigts2(:,:), eigts3(:,:) )
!$acc host_data use_device( eigts1, eigts2, eigts3, mill, g, igk_, vkb_ )
CALL init_us_2_base_gpu( npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, &
eigts3, mill, g, vkb_ )
!$acc end host_data
!$acc end data
!
else
CALL init_us_2_base(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,&
vkb_ )
end if
ELSE
!
CALL init_us_2_base( npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, &
eigts3, mill, g, vkb_ )
ENDIF
!
CALL stop_clock( 'init_us_2' )
!
!
END SUBROUTINE init_us_2
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dj ( ik, dvkb )
SUBROUTINE gen_us_dj( ik, dvkb )
!----------------------------------------------------------------------
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
!! Wrapper to call gen_us_dj: same as init_us_2, but with the derivative
!! of the Bessel functions dj_l/dq instead of j_l(qr) in the integral
!
USE kinds, ONLY : DP
@ -80,7 +84,7 @@ CONTAINS
USE gvect, ONLY : eigts1, eigts2, eigts3, mill, g
USE wvfct, ONLY : npwx
USE uspp, ONLY : nkb
USE fft_base , ONLY : dfftp
USE fft_base, ONLY : dfftp
USE klist, ONLY : xk, ngk, igk_k
!
IMPLICIT NONE
@ -92,16 +96,18 @@ CONTAINS
!
! CALL start_clock( 'gen_us_dj' )
!
CALL gen_us_dj_base (ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
eigts1, eigts2, eigts3, mill, g, dvkb )
!$acc data present_or_copyout( dvkb )
CALL gen_us_dj_base( ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, &
dfftp%nr3, eigts1, eigts2, eigts3, mill, g, dvkb )
!$acc end data
!
! CALL stop_clock( 'gen_us_dj' )
!
END SUBROUTINE gen_us_dj
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dy ( ik, u, dvkb )
SUBROUTINE gen_us_dy( ik, u, dvkb )
!----------------------------------------------------------------------
!! wrapper to call gen_us_dy: same as init_us_2, but with the derivative
!! of the spherical harmonics dY_lm/dq instead of Y_lm(q) in the integral
@ -126,86 +132,14 @@ CONTAINS
!
! CALL start_clock( 'gen_us_dy' )
!
CALL gen_us_dy_base (ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
eigts1, eigts2, eigts3, mill, g, u, dvkb )
!$acc data present_or_copyout( dvkb )
CALL gen_us_dy_base( ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, &
dfftp%nr3, eigts1, eigts2, eigts3, mill, g, u, dvkb )
!$acc end data
!
! CALL stop_clock( 'gen_us_dy' )
!
END SUBROUTINE gen_us_dy
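
Both wrappers now bracket the base call with !$acc data present_or_copyout(dvkb): a caller that already mapped dvkb gets the result written in place on the device, while an unported caller gets a fresh device copy that is copied back when the data region closes. The idiom in isolation (fill_dvkb is an illustrative filler routine, not one from the tree):

      SUBROUTINE fill_dvkb( n, dvkb )
        IMPLICIT NONE
        INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(14,200)
        INTEGER,     INTENT(IN)  :: n
        COMPLEX(DP), INTENT(OUT) :: dvkb(n)
        INTEGER :: i
        ! attaches to an existing device copy if the caller mapped dvkb,
        ! otherwise creates one and copies it out at the end of the region
        !$acc data present_or_copyout(dvkb)
        !$acc parallel loop
        DO i = 1, n
           dvkb(i) = CMPLX( i, 0, KIND=DP )   ! placeholder fill
        ENDDO
        !$acc end data
      END SUBROUTINE fill_dvkb
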
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dj_gpu ( ik, dvkb )
!----------------------------------------------------------------------
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
!! of the Bessel functions dj_l/dq instead of j_l(qr) in the integral
!! (GPU version, FIXME: to be merged)
!
USE kinds, ONLY : dp
USE ions_base, ONLY : nat, ntyp=>nsp, ityp, tau
USE cell_base, ONLY : tpiba, omega
USE gvect, ONLY : eigts1_d, eigts2_d, eigts3_d, mill_d, g_d
USE wvfct, ONLY : npwx
USE uspp, ONLY : nkb
USE fft_base , ONLY : dfftp
USE klist, ONLY : xk, ngk, igk_k_d
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ik
!! k-point index
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
!! beta functions computed with dj_l/dq
#if defined(__CUDA)
attributes(DEVICE) :: dvkb
#endif
!
! CALL start_clock( 'gen_us_dj' )
!
CALL gen_us_dj_gpu_ (ngk(ik), npwx, igk_k_d(1,ik), xk(1,ik), nat, tau, &
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, dvkb )
!
! CALL stop_clock( 'gen_us_dj' )
!
END SUBROUTINE gen_us_dj_gpu
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dy_gpu ( ik, u, dvkb )
!----------------------------------------------------------------------
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
!! of the spherical harmonics dY_lm/dq instead of Y_lm(q) in the integral
!! (GPU version, FIXME: to be merged)
!
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp=>nsp, ityp, tau
USE cell_base, ONLY : tpiba, omega
USE gvect, ONLY : eigts1_d, eigts2_d, eigts3_d, mill_d, g_d
USE wvfct, ONLY : npwx
USE uspp, ONLY : nkb
USE fft_base , ONLY : dfftp
USE klist, ONLY : xk, ngk, igk_k_d
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ik
!! k-point index
REAL(dp), INTENT(IN) :: u(3)
!! k-point index
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
!! beta functions computed with dY_lm/dq
#if defined(__CUDA)
attributes(DEVICE) :: dvkb
#endif
!
! CALL start_clock( 'gen_us_dy' )
!
CALL gen_us_dy_gpu_ (ngk(ik), npwx, igk_k_d(1,ik), xk(1,ik), nat, tau, &
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, u, dvkb )
!
! CALL stop_clock( 'gen_us_dy' )
!
END SUBROUTINE gen_us_dy_gpu
!
END MODULE

View File

@ -184,8 +184,8 @@ SUBROUTINE newq( vr, deeq, skip_vltot )
ENDDO
!
DEALLOCATE( qmod, ylmk0, vaux )
CALL mp_sum( deeq( :, :, :, 1:nspin_mag ), inter_pool_comm )
CALL mp_sum( deeq( :, :, :, 1:nspin_mag ), intra_bgrp_comm )
CALL mp_sum( deeq(:,:,:,1:nspin_mag ), inter_pool_comm )
CALL mp_sum( deeq(:,:,:,1:nspin_mag ), intra_bgrp_comm )
!
END SUBROUTINE newq
!
@ -199,7 +199,7 @@ SUBROUTINE newd( )
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE lsda_mod, ONLY : nspin
USE uspp, ONLY : deeq, deeq_d, dvan, deeq_nc, deeq_nc_d, dvan_so, okvan
USE uspp, ONLY : deeq, dvan, deeq_nc, dvan_so, okvan
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb
USE uspp, ONLY : nhtol, nhtolm
@ -248,13 +248,15 @@ SUBROUTINE newd( )
!
ENDDO
!
#if defined __CUDA
if (noncolin) then
if (nhm>0) deeq_nc_d=deeq_nc
else
if (nhm>0) deeq_d=deeq
endif
#endif
IF (noncolin) THEN
IF (nhm>0) THEN
!$acc update device(deeq_nc)
ENDIF
ELSE
IF (nhm>0) THEN
!$acc update device(deeq)
ENDIF
ENDIF
!
! ... early return
!
@ -311,13 +313,15 @@ SUBROUTINE newd( )
IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) CALL add_vhub_to_deeq( deeq )
!
! sync with GPUs
#if defined __CUDA
if (noncolin) then
if (nhm>0) deeq_nc_d=deeq_nc
else
if (nhm>0) deeq_d=deeq
endif
#endif
IF (noncolin) THEN
IF (nhm>0) THEN
!$acc update device(deeq_nc)
ENDIF
ELSE
IF (nhm>0) THEN
!$acc update device(deeq)
ENDIF
ENDIF
!
CALL stop_clock( 'newd' )
!
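
Throughout this file the CUDA mirror arrays (deeq_d, deeq_nc_d) and the #if __CUDA copy blocks disappear in favour of a single host array whose device copy is kept in sync with update directives. A minimal sketch of that life cycle, under the assumption that the array was mapped with enter data where it is allocated (d and nh are illustrative names):

      PROGRAM mirror_demo
        IMPLICIT NONE
        INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(14,200)
        INTEGER, PARAMETER :: nh = 8
        REAL(DP), ALLOCATABLE :: d(:,:)
        INTEGER :: i, j
        ALLOCATE( d(nh,nh) )
        !$acc enter data create(d)      ! device copy exists from here on
        d = 0.0_DP                      ! host write ...
        !$acc update device(d)          ! ... made visible to device kernels
        !$acc parallel loop collapse(2) present(d)
        DO j = 1, nh
           DO i = 1, nh
              d(i,j) = d(i,j) + 1.0_DP  ! device write ...
           ENDDO
        ENDDO
        !$acc update self(d)            ! ... copied back before host reads
        PRINT *, d(1,1)
        !$acc exit data delete(d)
        DEALLOCATE( d )
      END PROGRAM mirror_demo
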

View File

@ -50,9 +50,6 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
! Input: potential , output: contribution to integral
REAL(kind=dp), intent(in) :: vr(dfftp%nnr,nspin)
REAL(kind=dp), intent(out) :: deeq_d( nhm, nhm, nat, nspin )
#if defined(__CUDA)
attributes(DEVICE) :: deeq_d
#endif
LOGICAL, intent(in) :: skip_vltot !If .false. vltot is added to vr when necessary
! INTERNAL
INTEGER :: ngm_s, ngm_e, ngm_l
@ -70,7 +67,7 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
! workaround for cuf kernel limitations
!
#if defined(__CUDA)
attributes(DEVICE) :: vaux_d, aux_d, qgm_d, ylmk0_d, qmod_d, deeaux_d, dfftp_nl_d
attributes(DEVICE) :: deeq_d, vaux_d, aux_d, qgm_d, ylmk0_d, qmod_d, deeaux_d, dfftp_nl_d
#endif
! variable to map index of atoms of the same type
INTEGER, ALLOCATABLE :: na_to_nab_h(:)
@ -87,7 +84,9 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
fact = 1.0_dp
ENDIF
!
!$acc kernels
deeq_d(:,:,:,:) = 0.D0
!$acc end kernels
!
! With k-point parallelization, distribute G-vectors across processors
! ngm_s = index of first G-vector for this processor
@ -237,7 +236,7 @@ SUBROUTINE newd_gpu( )
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE lsda_mod, ONLY : nspin
USE uspp, ONLY : deeq, okvan, deeq, deeq_d, deeq_nc, deeq_nc_d, dvan_d, dvan_so_d
USE uspp, ONLY : okvan, deeq, deeq_nc, dvan_d, dvan_so_d
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb
USE uspp, ONLY : nhtol, nhtolm
@ -273,12 +272,12 @@ SUBROUTINE newd_gpu( )
!
IF ( lspinorb ) THEN
!
!$cuf kernel do(4)
!$acc parallel loop collapse(4)
DO is = 1, nspin
DO na = 1, nat
DO jh = 1, nht
DO ih = 1, nht
IF ( ityp_d(na) == nt ) deeq_nc_d(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
END DO
END DO
END DO
@ -286,15 +285,15 @@ SUBROUTINE newd_gpu( )
!
ELSE IF ( noncolin ) THEN
!
!$cuf kernel do(3)
!$acc parallel loop collapse(3)
DO na = 1, nat
DO jh = 1, nht
DO ih = 1, nht
IF ( ityp_d(na) == nt ) THEN
deeq_nc_d(ih,jh,na,1) = dvan_d(ih,jh,nt)
deeq_nc_d(ih,jh,na,2) = ( 0.D0, 0.D0 )
deeq_nc_d(ih,jh,na,3) = ( 0.D0, 0.D0 )
deeq_nc_d(ih,jh,na,4) = dvan_d(ih,jh,nt)
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt)
deeq_nc(ih,jh,na,2) = ( 0.D0, 0.D0 )
deeq_nc(ih,jh,na,3) = ( 0.D0, 0.D0 )
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt)
END IF
END DO
END DO
@ -303,13 +302,13 @@ SUBROUTINE newd_gpu( )
ELSE
!
if ( nht > 0 ) THEN
!$cuf kernel do(4)
!$acc parallel loop collapse(4)
DO is = 1, nspin
DO na = 1, nat
DO jh = 1, nht
DO ih = 1, nht
!
IF ( ityp_d(na) == nt ) deeq_d(ih,jh,na,is) = dvan_d(ih,jh,nt)
IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan_d(ih,jh,nt)
!
END DO
END DO
@ -328,9 +327,9 @@ SUBROUTINE newd_gpu( )
!
! ... sync with CPU
if (noncolin) then
deeq_nc=deeq_nc_d
!$acc update self(deeq_nc)
else
deeq=deeq_d
!$acc update self(deeq)
endif
!
RETURN
@ -346,12 +345,18 @@ SUBROUTINE newd_gpu( )
!
IF (tqr) THEN
CALL newq_r(v%of_r,deeq,.false.)
deeq_d=deeq
!$acc update device(deeq)
ELSE
CALL newq_gpu(v%of_r,deeq_d,.false.)
!$acc host_data use_device(deeq)
CALL newq_gpu(v%of_r,deeq,.false.)
!$acc end host_data
END IF
!
IF (noncolin) call add_paw_to_deeq_gpu(deeq_d)
IF (noncolin) THEN
!$acc host_data use_device(deeq)
call add_paw_to_deeq_gpu(deeq)
!$acc end host_data
ENDIF
!
types : &
DO nt = 1, ntyp
@ -372,13 +377,13 @@ SUBROUTINE newd_gpu( )
ELSE if_noncolin
!
nht = nh(nt)
!$cuf kernel do(4)
!$acc parallel loop collapse(4)
DO is = 1, nspin
DO na = 1, nat
DO ih = 1, nht
DO jh = 1, nht
IF ( ityp_d(na) == nt ) THEN
deeq_d(ih,jh,na,is) = deeq_d(ih,jh,na,is) + dvan_d(ih,jh,nt)
deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan_d(ih,jh,nt)
END IF
END DO
END DO
@ -389,17 +394,25 @@ SUBROUTINE newd_gpu( )
!
END DO types
!
IF (.NOT.noncolin) CALL add_paw_to_deeq_gpu(deeq_d)
IF (.NOT.noncolin) THEN
!$acc host_data use_device(deeq)
CALL add_paw_to_deeq_gpu(deeq)
!$acc end host_data
ENDIF
!
IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) CALL add_vhub_to_deeq_gpu(deeq_d)
IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) THEN
!$acc host_data use_device(deeq)
CALL add_vhub_to_deeq_gpu(deeq)
!$acc end host_data
ENDIF
!
CALL buffer%release_buffer(ityp_d, ierr)
CALL stop_clock_gpu( 'newd' )
!
if (noncolin) then
deeq_nc=deeq_nc_d
!$acc update self(deeq_nc)
else
deeq=deeq_d
!$acc update self(deeq)
endif
!
RETURN
@ -429,7 +442,7 @@ SUBROUTINE newd_gpu( )
ijs = ijs + 1
!
IF (domag) THEN
!$cuf kernel do(3)
!$acc parallel loop collapse(3) present(deeq_nc,deeq)
DO na = 1, nat
!
DO ih = 1, nhnt
@ -438,24 +451,24 @@ SUBROUTINE newd_gpu( )
!
IF ( ityp_d(na) == nt ) THEN
!
deeq_nc_d(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
!
DO kh = 1, nhnt
!
DO lh = 1, nhnt
!
deeq_nc_d(ih,jh,na,ijs) = deeq_nc_d(ih,jh,na,ijs) + &
deeq_d (kh,lh,na,1)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
deeq_nc(ih,jh,na,ijs) = deeq_nc(ih,jh,na,ijs) + &
deeq(kh,lh,na,1)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt)) + &
deeq_d (kh,lh,na,2)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) + &
deeq(kh,lh,na,2)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) + &
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,1,is2,nt)) + &
(0.D0,-1.D0)*deeq_d (kh,lh,na,3)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) - &
(0.D0,-1.D0)*deeq(kh,lh,na,3)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) - &
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,1,is2,nt)) + &
deeq_d (kh,lh,na,4)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) - &
deeq(kh,lh,na,4)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) - &
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt))
!
END DO
@ -471,7 +484,7 @@ SUBROUTINE newd_gpu( )
!
ELSE
!
!$cuf kernel do(3) <<<*,*>>>
!$acc parallel loop collapse(3) present(deeq_nc,deeq)
DO na = 1, nat
!
DO ih = 1, nhnt
@ -480,15 +493,15 @@ SUBROUTINE newd_gpu( )
!
IF ( ityp_d(na) == nt ) THEN
!
deeq_nc_d(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
!
DO kh = 1, nhnt
!
DO lh = 1, nhnt
!
deeq_nc_d(ih,jh,na,ijs) = deeq_nc_d(ih,jh,na,ijs) + &
deeq_d (kh,lh,na,1)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
deeq_nc(ih,jh,na,ijs) = deeq_nc(ih,jh,na,ijs) + &
deeq(kh,lh,na,1)* &
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt) )
!
END DO
@ -526,42 +539,38 @@ SUBROUTINE newd_gpu( )
!
nhnt = nh(nt)
!
!$cuf kernel do(3)
!$acc parallel loop collapse(3) present(deeq_nc,deeq)
DO na = 1, nat
!
DO ih = 1, nhnt
!
DO jh = 1, nhnt
!
IF ( ityp_d(na) == nt ) THEN
!
IF (lspinorb) THEN
deeq_nc_d(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
deeq_d(ih,jh,na,1) + deeq_d(ih,jh,na,4)
!
deeq_nc_d(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
deeq_d(ih,jh,na,1) - deeq_d(ih,jh,na,4)
deeq_nc(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
!
deeq_nc(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
!
ELSE
deeq_nc_d(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
deeq_d(ih,jh,na,1) + deeq_d(ih,jh,na,4)
!
deeq_nc_d(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
deeq_d(ih,jh,na,1) - deeq_d(ih,jh,na,4)
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
!
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
!
END IF
deeq_nc_d(ih,jh,na,2) = deeq_d(ih,jh,na,2) - &
( 0.D0, 1.D0 ) * deeq_d(ih,jh,na,3)
!
deeq_nc_d(ih,jh,na,3) = deeq_d(ih,jh,na,2) + &
( 0.D0, 1.D0 ) * deeq_d(ih,jh,na,3)
deeq_nc(ih,jh,na,2) = deeq(ih,jh,na,2) - &
( 0.D0, 1.D0 ) * deeq(ih,jh,na,3)
!
deeq_nc(ih,jh,na,3) = deeq(ih,jh,na,2) + &
( 0.D0, 1.D0 ) * deeq(ih,jh,na,3)
!
END IF
!
END DO
!
END DO
!
END DO
!
RETURN

View File

@ -230,10 +230,10 @@ SUBROUTINE post_xml_init ( )
g_d = g
gg_d = gg
#endif
!$acc update device(mill, g)
!$acc update device(mill, g, gg)
!
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
CALL gshells ( lmovecell )
!
IF (do_comp_esm) CALL esm_init()
IF (do_cutoff_2D) CALL cutoff_fact()

View File

@ -228,7 +228,7 @@ MODULE realus
USE constants, ONLY : pi, fpi, eps16, eps6
USE ions_base, ONLY : nat, nsp, ityp, tau
USE cell_base, ONLY : at, bg, alat
USE uspp, ONLY : okvan, qq_at, qq_at_d, qq_nt, nhtol
USE uspp, ONLY : okvan, qq_at, qq_nt, nhtol
USE uspp_param, ONLY : upf, nh
USE atom, ONLY : rgrid
USE fft_types, ONLY : fft_type_descriptor
@ -404,9 +404,8 @@ MODULE realus
! and sync on GPUs
!
CALL mp_sum( qq_at, intra_bgrp_comm )
#if defined __CUDA
qq_at_d=qq_at
#endif
!
!$acc update device(qq_at)
!
! and test that they don't differ too much
! from the result computed on the atomic grid
@ -455,7 +454,7 @@ MODULE realus
!! Sync with GPU memory is performed outside
!
USE constants, ONLY : eps16, eps6
USE uspp, ONLY : indv, nhtolm, ap, qq_at, qq_at_d
USE uspp, ONLY : indv, nhtolm, ap, qq_at
USE uspp_param, ONLY : upf, lmaxq, nh
USE atom, ONLY : rgrid
USE splinelib, ONLY : spline, splint

View File

@ -216,7 +216,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
!
USE mp, ONLY : mp_get_comm_null, mp_circular_shift_left
USE device_fbuff_m, ONLY : dev_buf
USE uspp, ONLY : qq_at_d
USE uspp, ONLY : qq_at
!
IMPLICIT NONE
!
@ -273,9 +273,12 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
!
IF ( m_loc > 0 ) THEN
!$acc host_data use_device(qq_at)
CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
qq_at_d(1,1,na), nhm, becp_d%r_d(ofsbeta(na)+1,1),&
qq_at(1,1,na), nhm, becp_d%r_d(ofsbeta(na)+1,1),&
nkb, 0.0_dp, ps_d(ofsbeta(na)+1,1), nkb )
!$acc end host_data
END IF
END IF
END DO
@ -340,7 +343,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
!! k-points version of \(\textrm{s_psi}\) routine.
!
USE device_fbuff_m, ONLY : dev_buf
USE uspp, ONLY : qq_at_d
USE uspp, ONLY : qq_at
!
IMPLICIT NONE
!
@ -367,17 +370,17 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
! qq is real: copy it into a complex variable to perform
! a zgemm - simple but sub-optimal solution
!
! here we need to use qq_at_d instead of qq_nt_d otherwise real space augmentation breaks!
! here we need to use qq_at (device) instead of qq_nt_d otherwise real space augmentation breaks!
! qq_nt_d would be much faster and works for calculations without real space augmentation
CALL dev_buf%lock_buffer( qqc_d, (/ nhm, nhm, nat/), ierr )
IF( ierr /= 0 ) &
CALL errore( ' s_psi_k_gpu ', ' cannot allocate buffer (qqc_d) ', ABS(ierr) )
!$cuf kernel do(3) <<<*,*>>>
!$acc parallel loop collapse(3) present(qq_at)
DO na = 1, nat
DO jh = 1, nhm
DO ih = 1, nhm
qqc_d(ih,jh, na) = CMPLX ( qq_at_d(ih,jh, na), 0.0_dp, KIND=dp )
qqc_d(ih,jh, na) = CMPLX ( qq_at(ih,jh, na), 0.0_dp, KIND=dp )
END DO
END DO
END DO
@ -427,7 +430,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
!! k-points noncolinear/spinorbit version of \(\textrm{s_psi}\) routine.
!
USE device_fbuff_m, ONLY : dev_buf
USE uspp, ONLY : qq_at_d, qq_so_d
USE uspp, ONLY : qq_at, qq_so
!
IMPLICIT NONE
!
@ -455,11 +458,11 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
IF( ierr /= 0 .and. ierr /= -1 ) &
CALL errore( ' s_psi_nc_gpu ', ' cannot allocate buffer (qqc_d) ', ABS(ierr) )
! Possibly convert only what's needed??
!$cuf kernel do(3) <<<*,*>>>
!$acc parallel loop collapse(3) present(qq_at)
DO na = 1, nat
DO jh = 1, nhm
DO ih = 1, nhm
qqc_d(ih, jh, na) = CMPLX ( qq_at_d(ih,jh, na), 0.0_dp, KIND=dp )
qqc_d(ih, jh, na) = CMPLX ( qq_at(ih,jh, na), 0.0_dp, KIND=dp )
END DO
END DO
END DO
@ -482,19 +485,21 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
ELSE
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
!$acc host_data use_device(qq_so)
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
qq_so_d(1,1,1,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
qq_so(1,1,1,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
(0.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,1,1), nkb*npol )
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
qq_so_d(1,1,2,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
qq_so(1,1,2,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
(1.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,1,1), nkb*npol )
!
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
qq_so_d(1,1,3,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
qq_so(1,1,3,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
(0.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,2,1), nkb*npol )
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
qq_so_d(1,1,4,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
qq_so(1,1,4,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
(1.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,2,1), nkb*npol )
!$acc end host_data
END IF
END DO
END IF
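
The DGEMM/ZGEMM calls above rely on !$acc host_data use_device(...): inside that construct the host symbol resolves to its device address, so a device-capable BLAS (e.g. cuBLAS behind the NVHPC BLAS interface) receives device pointers without any _d shadow arrays. The idiom reduced to its essentials, as a fragment (a, b, c, n and the DP kind are assumed declared; a GPU BLAS is assumed linked):

      !$acc data copyin(a,b) copyout(c)
      !$acc host_data use_device(a,b,c)
      ! DGEMM sees device addresses and runs on the GPU
      CALL dgemm( 'N', 'N', n, n, n, 1.0_DP, a, n, b, n, 0.0_DP, c, n )
      !$acc end host_data
      !$acc end data
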

View File

@ -7,8 +7,9 @@
!
!
!-----------------------------------------------------------------------
subroutine stres_cc( sigmaxcc )
SUBROUTINE stres_cc( sigmaxcc )
!-----------------------------------------------------------------------
!! Core correction term of the stress.
!
USE kinds, ONLY : DP
USE atom, ONLY : rgrid, msh
@ -17,7 +18,7 @@ subroutine stres_cc( sigmaxcc )
USE cell_base, ONLY : alat, omega, tpiba, tpiba2
USE fft_base, ONLY : dfftp
USE fft_rho, ONLY : rho_r2g
USE gvect, ONLY : ngm, gstart, g, gg, ngl, gl, igtongl
USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, g, gg
USE ener, ONLY : etxc, vtxc
USE lsda_mod, ONLY : nspin
USE scf, ONLY : rho, rho_core, rhog_core
@ -26,81 +27,135 @@ subroutine stres_cc( sigmaxcc )
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
!
implicit none
! output
real(DP) :: sigmaxcc(3,3)
! local variables
IMPLICIT NONE
!
integer :: nt, ng, l, m, ir
! counters
real(DP) :: fact, sigmadiag
real(DP), allocatable:: rhocg(:), vxc(:,:)
complex(DP), allocatable :: vaux(:,:)
REAL(DP), INTENT(OUT) :: sigmaxcc(3,3)
!
sigmaxcc(:,:) = 0.d0
if ( ANY (upf(1:ntyp)%nlcc) ) goto 15
! ... local variables
!
return
INTEGER :: nt, ng, l, m, ir
REAL(DP) :: fact
REAL(DP), ALLOCATABLE :: rhocg(:), vxc(:,:)
COMPLEX(DP), ALLOCATABLE :: vaux(:,:)
!
15 continue
REAL(DP) :: rhocg1, sigma_rid, sigmadiag
REAL(DP) :: sigma1, sigma2, sigma3, &
sigma4, sigma5, sigma6
!
! recalculate the exchange-correlation potential
sigmaxcc(:,:) = 0._DP
!
allocate( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) )
IF ( .NOT. ANY( upf(1:ntyp)%nlcc ) ) RETURN
!
call v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc )
! ... recalculate the exchange-correlation potential
!
if ( nspin==2 ) then
do ir = 1, dfftp%nnr
ALLOCATE( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) )
!
CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc )
!
!$acc data create( vaux )
!$acc data copyin( vxc )
IF ( nspin==2 ) THEN
!$acc parallel loop
DO ir = 1, dfftp%nnr
vxc(ir,1) = 0.5d0 * ( vxc(ir,1) + vxc(ir,2) )
enddo
endif
ENDDO
ENDIF
!
call rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) )
CALL rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) )
!
! vaux contains now Vxc(G)
!$acc end data
DEALLOCATE( vxc )
!
allocate(rhocg(ngl))
sigmadiag = 0.0d0
if (gamma_only) then
fact = 2.d0
else
fact = 1.d0
end if
do nt = 1, ntyp
if ( upf(nt)%nlcc ) then
call drhoc( ngl, gl, omega, tpiba2, msh(nt), rgrid(nt)%r, &
rgrid(nt)%rab, upf(nt)%rho_atc, rhocg )
! diagonal term
if (gstart==2) then
sigmadiag = sigmadiag + CONJG(vaux(1,1)) * strf(1,nt) * rhocg(igtongl(1))
endif
do ng = gstart, ngm
sigmadiag = sigmadiag + CONJG(vaux(ng,1)) * &
strf(ng,nt) * rhocg(igtongl(ng)) * fact
enddo
! ... vaux contains now Vxc(G)
!
sigmadiag = 0._DP
!
fact = 1._DP
IF (gamma_only) fact = 2._DP
!
!$acc data copyin( gl, strf, igtongl )
!
ALLOCATE( rhocg(ngl) )
!$acc data create( rhocg )
!
sigma1 = 0._DP ; sigma4 = 0._DP
sigma2 = 0._DP ; sigma5 = 0._DP
sigma3 = 0._DP ; sigma6 = 0._DP
!
!
DO nt = 1, ntyp
IF ( upf(nt)%nlcc ) THEN
!
call deriv_drhoc( ngl, gl, omega, tpiba2, msh(nt), &
rgrid(nt)%r, rgrid(nt)%rab, upf(nt)%rho_atc, rhocg )
! non diagonal term (g=0 contribution missing)
do ng = gstart, ngm
do l = 1, 3
do m = 1, 3
sigmaxcc(l,m) = sigmaxcc(l,m) + CONJG(vaux(ng,1)) * strf(ng,nt) * &
rhocg(igtongl(ng)) * tpiba * g(l,ng) * g(m,ng) / &
sqrt(gg(ng)) * fact
enddo
enddo
enddo
endif
enddo
!$acc data copyin(rgrid(nt:nt),upf(nt:nt))
!$acc data copyin(rgrid(nt)%r,rgrid(nt)%rab,upf(nt)%rho_atc)
CALL drhoc( ngl, gl, omega, tpiba2, msh(nt), rgrid(nt)%r, &
rgrid(nt)%rab, upf(nt)%rho_atc, rhocg )
!
! ... diagonal term
IF (gstart==2) THEN
!$acc kernels
rhocg1 = rhocg(igtongl(1))
sigmadiag = sigmadiag + DBLE(CONJG(vaux(1,1)) * &
strf(1,nt)) * rhocg1
!$acc end kernels
ENDIF
!
!$acc parallel loop reduction(+:sigmadiag)
DO ng = gstart, ngm
sigmadiag = sigmadiag + DBLE(CONJG(vaux(ng,1)) * &
strf(ng,nt)) * rhocg(igtongl(ng)) * fact
ENDDO
!
CALL deriv_drhoc( ngl, gl, omega, tpiba2, msh(nt), &
rgrid(nt)%r, rgrid(nt)%rab, upf(nt)%rho_atc, &
rhocg )
!
! ... non diagonal term (g=0 contribution missing)
!
!$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6)
DO ng = gstart, ngm
!
sigma_rid = DBLE(CONJG(vaux(ng,1)) * strf(ng,nt)) * &
rhocg(igtongl(ng)) * tpiba / SQRT(gg(ng)) * fact
!
sigma1 = sigma1 + sigma_rid * g(1,ng)*g(1,ng)
sigma2 = sigma2 + sigma_rid * g(1,ng)*g(2,ng)
sigma3 = sigma3 + sigma_rid * g(1,ng)*g(3,ng)
sigma4 = sigma4 + sigma_rid * g(2,ng)*g(2,ng)
sigma5 = sigma5 + sigma_rid * g(3,ng)*g(2,ng)
sigma6 = sigma6 + sigma_rid * g(3,ng)*g(3,ng)
!
ENDDO
!
!$acc end data
!$acc end data
!
ENDIF
!
ENDDO
!
do l = 1, 3
!$acc end data
!
sigmaxcc(1,1) = sigma1 ; sigmaxcc(2,3) = sigma5
sigmaxcc(1,2) = sigma2 ; sigmaxcc(3,1) = sigma3
sigmaxcc(1,3) = sigma3 ; sigmaxcc(3,2) = sigma5
sigmaxcc(2,1) = sigma2 ; sigmaxcc(3,3) = sigma6
sigmaxcc(2,2) = sigma4
!
DO l = 1, 3
sigmaxcc(l,l) = sigmaxcc(l,l) + sigmadiag
enddo
call mp_sum( sigmaxcc, intra_bgrp_comm )
deallocate( rhocg )
deallocate( vaux, vxc )
return
end subroutine stres_cc
ENDDO
!
CALL mp_sum( sigmaxcc, intra_bgrp_comm )
!
!$acc end data
!$acc end data
!
DEALLOCATE( rhocg )
DEALLOCATE( vaux )
!
RETURN
!
END SUBROUTINE stres_cc
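
The six scalars sigma1..sigma6 exist because OpenACC reduction clauses are most portable with scalar variables: the independent components of the symmetric 3x3 stress tensor are accumulated separately on the device and the matrix is reassembled on the host afterwards. The bare pattern, as a sketch with illustrative names (w(ng) standing for the G-dependent weight):

      s1 = 0._DP; s2 = 0._DP; s3 = 0._DP
      s4 = 0._DP; s5 = 0._DP; s6 = 0._DP
      !$acc parallel loop copyin(w,g) reduction(+:s1,s2,s3,s4,s5,s6)
      DO ng = 1, ngm
         s1 = s1 + w(ng)*g(1,ng)*g(1,ng)   ! (1,1)
         s2 = s2 + w(ng)*g(1,ng)*g(2,ng)   ! (1,2) = (2,1)
         s3 = s3 + w(ng)*g(1,ng)*g(3,ng)   ! (1,3) = (3,1)
         s4 = s4 + w(ng)*g(2,ng)*g(2,ng)   ! (2,2)
         s5 = s5 + w(ng)*g(2,ng)*g(3,ng)   ! (2,3) = (3,2)
         s6 = s6 + w(ng)*g(3,ng)*g(3,ng)   ! (3,3)
      ENDDO
      sigma(1,1) = s1; sigma(2,2) = s4; sigma(3,3) = s6
      sigma(1,2) = s2; sigma(2,1) = s2
      sigma(1,3) = s3; sigma(3,1) = s3
      sigma(2,3) = s5; sigma(3,2) = s5
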

View File

@ -1,178 +0,0 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE stres_cc_gpu( sigmaxcc )
!-----------------------------------------------------------------------
!
USE kinds, ONLY : DP
USE atom, ONLY : rgrid, msh
USE uspp_param, ONLY : upf
USE ions_base, ONLY : ntyp => nsp
USE cell_base, ONLY : alat, omega, tpiba, tpiba2
USE fft_base, ONLY : dfftp
USE fft_rho, ONLY : rho_r2g
USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, igtongl_d
USE ener, ONLY : etxc, vtxc
USE lsda_mod, ONLY : nspin
USE scf, ONLY : rho, rho_core, rhog_core
USE vlocal, ONLY : strf
USE control_flags, ONLY : gamma_only
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
!
USE gvect, ONLY : g_d, gg_d
#if defined(__CUDA)
USE device_fbuff_m, ONLY : dev_buf
USE device_memcpy_m, ONLY : dev_memcpy
#endif
!
IMPLICIT NONE
!
! output
REAL(DP) :: sigmaxcc(3,3)
! local variables
!
INTEGER :: nt, ng, l, m, ir
! counters
REAL(DP) :: fact
REAL(DP), ALLOCATABLE :: vxc(:,:)
COMPLEX(DP), ALLOCATABLE :: vaux(:,:)
!
REAL(DP), POINTER :: rhocg_d(:), r_d(:), rab_d(:), rhoc_d(:), gl_d(:)
COMPLEX(DP), POINTER :: strf_d(:)
!
INTEGER :: maxmesh, ierrs(6)
REAL(DP) :: sigma_rid, sigmadiag
REAL(DP) :: sigma1, sigma2, sigma3, &
sigma4, sigma5, sigma6
!
#if defined(__CUDA)
attributes(DEVICE) :: rhocg_d, r_d, rab_d, rhoc_d, gl_d, strf_d
!
sigmaxcc(:,:) = 0._DP
IF ( ANY( upf(1:ntyp)%nlcc ) ) GOTO 15
!
RETURN
!
15 CONTINUE
!
! recalculate the exchange-correlation potential
!
ALLOCATE( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) )
!
CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc )
!
!$acc data copyin(vxc) create(vaux)
!
IF ( nspin==2 ) THEN
!$acc parallel loop
DO ir = 1, dfftp%nnr
vxc(ir,1) = 0.5d0 * ( vxc(ir,1) + vxc(ir,2) )
ENDDO
ENDIF
!
CALL rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) )
!
! vaux contains now Vxc(G)
!
sigmadiag = 0._DP
!
fact = 1._DP
IF (gamma_only) fact = 2._DP
!
maxmesh = MAXVAL(msh(1:ntyp))
CALL dev_buf%lock_buffer( gl_d, ngl, ierrs(1) )
CALL dev_memcpy( gl_d, gl, (/ 1, ngl /) )
CALL dev_buf%lock_buffer( rhocg_d, ngl, ierrs(2) )
CALL dev_buf%lock_buffer( r_d, maxmesh, ierrs(3) )
CALL dev_buf%lock_buffer( rab_d, maxmesh, ierrs(4) )
CALL dev_buf%lock_buffer( rhoc_d,maxmesh, ierrs(5) )
CALL dev_buf%lock_buffer( strf_d, ngm, ierrs(6) )
IF (ANY(ierrs /= 0)) CALL errore( 'stres_cc_gpu', 'cannot allocate buffers', -1 )
!
sigma1 = 0._DP ; sigma4 = 0._DP
sigma2 = 0._DP ; sigma5 = 0._DP
sigma3 = 0._DP ; sigma6 = 0._DP
!
DO nt = 1, ntyp
IF ( upf(nt)%nlcc ) THEN
!
CALL dev_memcpy( strf_d, strf(:,nt), (/1, ngm/) )
CALL dev_memcpy( r_d, rgrid(nt)%r, (/1, msh(nt)/) )
CALL dev_memcpy( rab_d, rgrid(nt)%rab, (/1, msh(nt)/) )
CALL dev_memcpy( rhoc_d, upf(nt)%rho_atc, (/1, msh(nt)/) )
!
CALL drhoc_gpu( ngl, gl_d, omega, tpiba2, msh(nt), r_d, &
rab_d, rhoc_d, rhocg_d )
!
! diagonal term
IF (gstart==2) THEN
!$acc kernels
sigmadiag = sigmadiag + DBLE(CONJG(vaux(1,1))*strf_d(1)) * &
rhocg_d(igtongl_d(1))
!$acc end kernels
ENDIF
!
!$acc parallel loop
DO ng = gstart, ngm
sigmadiag = sigmadiag + DBLE(CONJG(vaux(ng,1)) * strf_d(ng)) * &
rhocg_d(igtongl_d(ng)) * fact
ENDDO
!
CALL deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, msh(nt), &
r_d, rab_d, rhoc_d, rhocg_d )
!
! non diagonal term (g=0 contribution missing)
!
!$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6)
DO ng = gstart, ngm
!
sigma_rid = DBLE(CONJG(vaux(ng,1)) &
* strf_d(ng)) * rhocg_d(igtongl_d(ng)) * tpiba &
/ SQRT(gg_d(ng)) * fact
!
sigma1 = sigma1 + sigma_rid * g_d(1,ng)*g_d(1,ng)
sigma2 = sigma2 + sigma_rid * g_d(1,ng)*g_d(2,ng)
sigma3 = sigma3 + sigma_rid * g_d(1,ng)*g_d(3,ng)
sigma4 = sigma4 + sigma_rid * g_d(2,ng)*g_d(2,ng)
sigma5 = sigma5 + sigma_rid * g_d(3,ng)*g_d(2,ng)
sigma6 = sigma6 + sigma_rid * g_d(3,ng)*g_d(3,ng)
!
ENDDO
!
ENDIF
!
ENDDO
!
sigmaxcc(1,1) = sigma1 ; sigmaxcc(2,3) = sigma5
sigmaxcc(1,2) = sigma2 ; sigmaxcc(3,1) = sigma3
sigmaxcc(1,3) = sigma3 ; sigmaxcc(3,2) = sigma5
sigmaxcc(2,1) = sigma2 ; sigmaxcc(3,3) = sigma6
sigmaxcc(2,2) = sigma4
!
DO l = 1, 3
sigmaxcc(l,l) = sigmaxcc(l,l) + sigmadiag
ENDDO
!
CALL mp_sum( sigmaxcc, intra_bgrp_comm )
!
!$acc end data
DEALLOCATE( vxc, vaux )
CALL dev_buf%release_buffer( gl_d, ierrs(1) )
CALL dev_buf%release_buffer( rhocg_d,ierrs(2) )
CALL dev_buf%release_buffer( r_d, ierrs(3) )
CALL dev_buf%release_buffer( rab_d, ierrs(4) )
CALL dev_buf%release_buffer( rhoc_d, ierrs(5) )
CALL dev_buf%release_buffer( strf_d, ierrs(6) )
#endif
!
RETURN
!
END SUBROUTINE stres_cc_gpu

View File

@ -53,7 +53,7 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
REAL(DP), INTENT(IN) :: gcutm
!! input: cut-off of g vectors
REAL(DP), INTENT(OUT) :: sigmaewa(3,3)
! output: the ewald stress
!! output: the ewald stress
!
! ... local variables
!
@ -85,6 +85,9 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
! diagonal term
! nondiagonal term
COMPLEX(DP) :: rhostar
REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
!
!$acc data present( g, gg )
!
tpiba2 = (tpi / alat)**2
sigmaewa(:,:) = 0.d0
@ -94,85 +97,113 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
charge = charge + zv(ityp(na))
ENDDO
!
! choose alpha in order to have convergence in the sum over G
! upperbound is a safe upper bound for the error ON THE ENERGY
! ... choose alpha in order to have convergence in the sum over G
! upperbound is a safe upper bound for the error ON THE ENERGY
!
alpha = 2.9d0
12 alpha = alpha - 0.1d0
!
IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * &
erfc ( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) )
ERFC( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) )
!
IF (upperbound > 1d-7) GOTO 12
!
! G-space sum here
!
! Determine if this processor contains G=0 and set the constant term
!
! ... Determine if this processor contains G=0 and set the constant term
! sdewald is the diagonal term
IF (gstart == 2) THEN
sdewald = tpi * e2 / 4.d0 / alpha * (charge / omega)**2
ELSE
sdewald = 0.d0
ENDIF
!
! sdewald is the diagonal term
IF (gamma_only) THEN
fact = 2.d0
fact = 2.d0
ELSE
fact = 1.d0
ENDIF
!
! ... G-space sum here below
!
IF (do_cutoff_2D) THEN
!
CALL cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa )
!
ELSE
!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau, zv, ityp, nat, omega, fact)&
!$omp &private(g2, g2a, rhostar, na, arg, l, m, sewald)&
!$omp &reduction(+:sigmaewa,sdewald)
!
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
!
#if !defined(_OPENACC)
!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau,&
!$omp nat, zv, ityp, omega, fact) private(g2,g2a, rhostar, na, arg,&
!$omp sewald) reduction(+:sdewald,sigma11,sigma21,sigma22,sigma31,&
!$omp sigma32,sigma33)
#else
!$acc parallel loop copyin(tau,zv,ityp) reduction(+:sigma11,sigma21,sigma22,&
!$acc sigma31,sigma32,sigma33) reduction(-:sdewald)
#endif
DO ng = gstart, ngm
g2 = gg (ng) * tpiba2
g2a = g2 / 4.d0 / alpha
rhostar = (0.d0, 0.d0)
g2 = gg(ng) * tpiba2
g2a = g2 / 4._DP / alpha
rhostar = (0._DP,0._DP)
DO na = 1, nat
arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
g(3,ng) * tau(3,na) ) * tpi
rhostar = rhostar + zv(ityp(na)) * CMPLX(COS(arg), SIN(arg), KIND=DP)
rhostar = rhostar + CMPLX(zv(ityp(na)), KIND=DP) * CMPLX(COS(arg), SIN(arg), KIND=DP)
ENDDO
rhostar = rhostar / omega
rhostar = rhostar / CMPLX(omega, KIND=DP)
sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2
sdewald = sdewald - sewald
DO l = 1, 3
DO m = 1, l
sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * &
g(l,ng) * g(m,ng) / g2 * (g2a + 1)
ENDDO
ENDDO
!
sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
g(1,ng) * g(1,ng) / g2 * (g2a + 1)
sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
g(2,ng) * g(1,ng) / g2 * (g2a + 1)
sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
g(2,ng) * g(2,ng) / g2 * (g2a + 1)
sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
g(3,ng) * g(1,ng) / g2 * (g2a + 1)
sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
g(3,ng) * g(2,ng) / g2 * (g2a + 1)
sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
g(3,ng) * g(3,ng) / g2 * (g2a + 1)
ENDDO
#if !defined(_OPENACC)
!$omp end parallel do
#endif
!
sigmaewa(1,1) = sigmaewa(1,1) + sigma11
sigmaewa(2,1) = sigmaewa(2,1) + sigma21
sigmaewa(2,2) = sigmaewa(2,2) + sigma22
sigmaewa(3,1) = sigmaewa(3,1) + sigma31
sigmaewa(3,2) = sigmaewa(3,2) + sigma32
sigmaewa(3,3) = sigmaewa(3,3) + sigma33
!
ENDIF
!
!$acc end data
!
DO l = 1, 3
sigmaewa(l,l) = sigmaewa(l,l) + sdewald
ENDDO
!
! R-space sum here (see ewald.f90 for details on parallelization)
! ... R-space sum here (see ewald.f90 for details on parallelization)
!
CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
!
IF ( mykey == 0 ) THEN
rmax = 4.0d0 / SQRT(alpha) / alat
!
! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-12)
! ... with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-12)
!
!$omp parallel do default(none) shared(na_s, na_e, nat, tau, rmax, at, bg, alat, ityp, alpha, omega, zv)&
!$omp &private(nb, dtau, r, r2, nrm, nr, rr, fac, l, m)&
!$omp &reduction(+:sigmaewa)
!$omp parallel do default(none) shared(na_s,na_e,nat,tau,rmax,at,bg,alat,ityp,alpha,omega,zv)&
!$omp &private(nb,dtau,r,r2,nrm,nr,rr,fac,l,m) reduction(+:sigmaewa)
DO na = na_s, na_e
DO nb = 1, nat
dtau(:) = tau(:,na) - tau(:,nb)
!
! generates nearest-neighbors shells r(i)=R(i)-dtau(i)
! ... generates nearest-neighbors shells r(i)=R(i)-dtau(i)
!
CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm )
!
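
The "12 alpha = alpha - 0.1" loop above picks the Ewald splitting parameter: alpha starts at 2.9 and is lowered in steps of 0.1 until the analytic bound e2*charge**2*SQRT(2*alpha/tpi)*ERFC(SQRT(tpiba2*gcutm/(4*alpha))) on the neglected reciprocal-space energy falls below 1d-7. The same criterion without the GOTO, as a fragment (charge, tpiba2, gcutm, alpha, upperbound assumed declared; e2, tpi from the constants module):

      alpha = 2.9_DP
      DO
         alpha = alpha - 0.1_DP
         IF ( alpha <= 0.0_DP ) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
         upperbound = e2 * charge**2 * SQRT( 2.0_DP*alpha/tpi ) * &
                      ERFC( SQRT( tpiba2*gcutm / 4.0_DP / alpha ) )
         IF ( upperbound <= 1.0E-7_DP ) EXIT
      ENDDO
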

View File

@ -1,246 +0,0 @@
!
! Copyright (C) 2001-2009 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg, tau, &
omega, g_d, gg_d, ngm, gstart, gamma_only, &
gcutm, sigmaewa )
!---------------------------------------------------------------------
!! Ewald contribution. Both real- and reciprocal-space terms are
!! present.
!
USE kinds
USE constants, ONLY : tpi, e2, eps6
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
USE mp, ONLY : mp_sum
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_sigmaewa_gpu
!
IMPLICIT NONE
!
INTEGER :: nat
!! input: number of atoms in the unit cell
INTEGER :: ntyp
!! input: number of different types of atoms
INTEGER :: ityp(nat)
!! input: the type of each atom
INTEGER :: ngm
!! input: number of plane waves for G sum
INTEGER :: gstart
!! input: first nonzero g vector
LOGICAL, INTENT(IN) :: gamma_only
!! gamma point only
REAL(DP), INTENT(IN) :: tau(3,nat)
!! input: the positions of the atoms in the cell
REAL(DP), INTENT(IN) :: g_d(3,ngm)
!! input: the coordinates of G vectors
REAL(DP), INTENT(IN) :: gg_d(ngm)
!! input: the square moduli of G vectors
REAL(DP), INTENT(IN) :: zv(ntyp)
!! input: the charge of each type of atoms
REAL(DP), INTENT(IN) :: at(3,3)
!! input: the direct lattice vectors
REAL(DP), INTENT(IN) :: bg(3,3)
!! input: the reciprocal lattice vectors
REAL(DP), INTENT(IN) :: omega
!! input: the volume of the unit cell
REAL(DP), INTENT(IN) :: alat
!! input: measure of length
REAL(DP), INTENT(IN) :: gcutm
!! input: cut-off of g vectors
REAL(DP), INTENT(OUT) :: sigmaewa(3,3)
! output: the ewald stress
!
! ... local variables
!
INTEGER, PARAMETER :: mxr = 50
! the maximum number of R vectors included in r sum
INTEGER :: ng, nr, na, nb, l, m, nrm
! counter over reciprocal G vectors
! counter over direct vectors
! counter on atoms
! counter on atoms
! counter on atoms
! number of R vectors included in r sum
INTEGER :: na_s, na_e, mykey
!
REAL(DP) :: charge, arg, tpiba2, dtau(3), alpha, r(3,mxr), &
r2(mxr), rmax, rr, upperbound, fact, fac, g2, g2a, &
sdewald, sewald
! total ionic charge in the cell
! the argument of the phase
! length in reciprocal space
! the difference tau_s - tau_s'
! alpha term in ewald sum
! input of the rgen routine ( not used here )
! the square modulus of R_j-tau_s-tau_s'
! the maximum radius to consider real space sum
! buffer variable
! used to optimize alpha
! auxiliary variables
! diagonal term
! nondiagonal term
!
INTEGER :: ierr(2)
REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
COMPLEX(DP) :: rhostar
!
INTEGER, ALLOCATABLE :: ityp_d(:)
REAL(DP), ALLOCATABLE :: zv_d(:), tau_d(:,:)
!
#if defined(__CUDA)
attributes(DEVICE) :: g_d, gg_d, zv_d, ityp_d, tau_d
#endif
!
tpiba2 = (tpi / alat)**2
sigmaewa(:,:) = 0._DP
charge = 0._DP
!
ALLOCATE( zv_d(ntyp), tau_d(3,nat) )
zv_d = zv
tau_d = tau
ALLOCATE( ityp_d(nat) )
ityp_d = ityp
!
DO na = 1, nat
charge = charge + zv(ityp(na))
ENDDO
!
! choose alpha in order to have convergence in the sum over G
! upperbound is a safe upper bound for the error ON THE ENERGY
!
alpha = 2.9_DP
12 alpha = alpha - 0.1_DP
!
IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * &
erfc ( SQRT(tpiba2 * gcutm / 4._DP / alpha) )
!
IF (upperbound > 1d-7) GOTO 12
!
! G-space sum here
!
! Determine if this processor contains G=0 and set the constant term
!
IF (gstart == 2) THEN
sdewald = tpi * e2 / 4._DP / alpha * (charge / omega)**2
ELSE
sdewald = 0._DP
ENDIF
!
! sdewald is the diagonal term
IF ( gamma_only ) THEN
fact = 2._DP
ELSE
fact = 1._DP
ENDIF
!
IF ( do_cutoff_2D ) THEN
!
CALL cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
!
ELSE
!
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
!
!$cuf kernel do (1) <<<*,*>>>
DO ng = gstart, ngm
g2 = gg_d(ng) * tpiba2
g2a = g2 / 4._DP / alpha
rhostar = (0._DP,0._DP)
DO na = 1, nat
arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + &
g_d(3,ng) * tau_d(3,na) ) * tpi
rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP)
ENDDO
rhostar = rhostar / CMPLX(omega)
sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2
sdewald = sdewald - sewald
!
sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
g_d(1,ng) * g_d(1,ng) / g2 * (g2a + 1)
sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
g_d(2,ng) * g_d(1,ng) / g2 * (g2a + 1)
sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
g_d(2,ng) * g_d(2,ng) / g2 * (g2a + 1)
sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
g_d(3,ng) * g_d(1,ng) / g2 * (g2a + 1)
sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
g_d(3,ng) * g_d(2,ng) / g2 * (g2a + 1)
sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
g_d(3,ng) * g_d(3,ng) / g2 * (g2a + 1)
!
ENDDO
!
sigmaewa(1,1) = sigmaewa(1,1) + sigma11
sigmaewa(2,1) = sigmaewa(2,1) + sigma21
sigmaewa(2,2) = sigmaewa(2,2) + sigma22
sigmaewa(3,1) = sigmaewa(3,1) + sigma31
sigmaewa(3,2) = sigmaewa(3,2) + sigma32
sigmaewa(3,3) = sigmaewa(3,3) + sigma33
!
ENDIF
!
DO l = 1, 3
sigmaewa(l,l) = sigmaewa(l,l) + sdewald
ENDDO
!
! R-space sum here (see ewald.f90 for details on parallelization)
!
CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
!
IF ( mykey == 0 ) THEN
rmax = 4.0d0 / SQRT(alpha) / alat
!
! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-12)
!
DO na = na_s, na_e
DO nb = 1, nat
dtau(:) = tau(:,na) - tau(:,nb)
!
! generates nearest-neighbors shells r(i)=R(i)-dtau(i)
!
CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm )
!
DO nr = 1, nrm
rr = SQRT(r2 (nr) ) * alat
fac = - e2 / 2.0_DP/ omega * alat**2 * zv(ityp(na)) * &
zv(ityp(nb)) / rr**3 * (erfc(SQRT(alpha) * rr) + &
rr * SQRT(8.0_DP * alpha / tpi) * EXP( - alpha * rr**2) )
DO l = 1, 3
DO m = 1, l
sigmaewa(l,m) = sigmaewa(l,m) + fac * r(l,nr) * r(m,nr)
ENDDO
ENDDO
ENDDO
!
ENDDO
ENDDO
ENDIF
!
DO l = 1, 3
DO m = 1, l - 1
sigmaewa(m,l) = sigmaewa(l,m)
ENDDO
ENDDO
!
DO l = 1, 3
DO m = 1, 3
sigmaewa(l,m) = - sigmaewa(l,m)
ENDDO
ENDDO
!
DEALLOCATE( zv_d, tau_d )
DEALLOCATE( ityp_d )
!
CALL mp_sum( sigmaewa, intra_bgrp_comm )
!
RETURN
!
END SUBROUTINE stres_ewa_gpu

View File

@ -7,8 +7,8 @@
!
!
!----------------------------------------------------------------------------
SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
dfft, g, alat, omega, sigmaxc )
SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
dfft, g, alat, omega, sigmaxc, kedtau )
!----------------------------------------------------------------------------
!
USE kinds, ONLY: DP
@ -24,9 +24,10 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
TYPE(fft_type_descriptor), INTENT(IN):: dfft
INTEGER, INTENT(IN) :: nspin
REAL(DP), INTENT(IN) :: rho(dfft%nnr,nspin), rho_core(dfft%nnr)
REAL(DP), INTENT(IN) :: g(3,dfft%ngm), alat, omega
REAL(DP), INTENT(INOUT) :: kedtau(dfft%nnr, nspin) ! FIXME: should be INTENT(IN)
COMPLEX(DP), INTENT(IN) :: rhog(dfft%ngm, nspin)
REAL(DP), INTENT(IN) :: g(3,dfft%ngm)
REAL(DP), INTENT(IN) :: alat, omega
REAL(DP), INTENT(IN), OPTIONAL :: kedtau(dfft%nnr,nspin)
COMPLEX(DP), INTENT(IN) :: rhog(dfft%ngm,nspin)
COMPLEX(DP), INTENT(IN) :: rhog_core(dfft%ngm)
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
!
@ -34,30 +35,36 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
!
INTEGER :: k, l, m, ipol, ir, ig, is, nspin0, np
INTEGER :: nr1, nr2, nr3, nrxx, ngm
REAL(DP), ALLOCATABLE :: grho(:,:,:), grho2(:,:), rhoaux(:,:), segni(:)
REAL(DP), ALLOCATABLE :: grho(:,:,:), grho2(:,:), rhoaux(:,:), &
segni(:), kedtaue2(:,:)
COMPLEX(DP), ALLOCATABLE :: rhogaux(:,:)
!
REAL(DP), ALLOCATABLE :: sx(:), sc(:)
REAL(DP), ALLOCATABLE :: v1x(:,:), v2x(:,:), v3x(:,:)
REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:,:), v3c(:,:), v2c_ud(:)
REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:), v3c(:,:), v2c_ud(:), v2cm(:,:,:)
!
REAL(DP), PARAMETER :: epsr = 1.0d-6, epsg = 1.0d-10, e2 = 2.d0
REAL(DP) :: sigma_gradcorr(3, 3)
!
REAL(DP) :: v2xc, v2xc_uu, v2xc_dd
REAL(DP) :: sigma_gc11, sigma_gc31, sigma_gc21, &
sigma_gc32, sigma_gc22, sigma_gc33
REAL(DP) :: sigma_gradcorr(3,3)
!
IF ( .NOT. xclib_dft_is('gradient') .AND. .NOT. xclib_dft_is('meta') ) RETURN
!
IF ( xclib_dft_is('meta') .and. nspin>1 ) CALL errore('stres_gradcorr', &
'Meta-GGA stress does not work with spin polarization',1)
IF ( xclib_dft_is('meta') .AND. nspin>1 ) CALL errore( 'stres_gradcorr', &
'Meta-GGA stress does not work with spin polarization', 1 )
!
!$acc data present_or_copyin( rho, rho_core, rhog, rhog_core, g )
!
np = 1
IF ( nspin==2 .AND. xclib_dft_is('meta') ) np=3
IF ( nspin==2 .AND. xclib_dft_is('meta') ) np = 3
!
nspin0 = nspin
IF (nspin==4) nspin0 = 1
IF (nspin==4.and.domag) nspin0 = 2
IF (nspin==4.AND.domag) nspin0 = 2
!
sigma_gradcorr(:,:) = 0.0_DP
sigma_gc11=0.0_DP ; sigma_gc21=0.0_DP ; sigma_gc22=0.0_DP
sigma_gc31=0.0_DP ; sigma_gc32=0.0_DP ; sigma_gc33=0.0_DP
!
nr1 = dfft%nr1
nr2 = dfft%nr2
@ -66,44 +73,71 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
ngm = dfft%ngm
!
ALLOCATE( grho(3,nrxx,nspin0) )
ALLOCATE (rhoaux(nrxx,nspin0))
ALLOCATE( rhoaux(nrxx,nspin0) )
ALLOCATE( rhogaux(ngm,nspin0) )
IF (xclib_dft_is('meta')) ALLOCATE( kedtaue2(dfft%nnr,nspin) )
!$acc data create( grho, rhoaux )
!$acc data create( rhogaux )
!
! calculate the gradient of rho+rhocore in real space
! For convenience rhoaux is in (up,down) format
!
IF ( nspin0 == 1 ) THEN
!
rhoaux(:,1) = rho(:,1) + rho_core(:)
rhogaux(:,1) = rhog(:,1) + rhog_core(:)
!$acc parallel loop
DO k = 1, nrxx
rhoaux(k,1) = rho(k,1) + rho_core(k)
ENDDO
!$acc parallel loop
DO k = 1, ngm
rhogaux(k,1) = rhog(k,1) + rhog_core(k)
ENDDO
!
ELSE IF ( nspin0 == 2 ) THEN
ELSEIF ( nspin0 == 2 ) THEN
!
IF ( nspin == 4 .AND. domag ) THEN
!
ALLOCATE( segni( nrxx ) )
!
!$acc data copyout( segni )
CALL compute_rho( rho, rhoaux, segni, nrxx )
DEALLOCATE( segni )
rhoaux(:,1) = rhoaux(:,1) + rho_core(:) / 2.0_DP
rhoaux(:,2) = rhoaux(:,2) + rho_core(:) / 2.0_DP
CALL rho_r2g ( dfft, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
!$acc parallel loop
DO k = 1, nrxx
rhoaux(k,1) = rhoaux(k,1) + rho_core(k) / 2.0_DP
rhoaux(k,2) = rhoaux(k,2) + rho_core(k) / 2.0_DP
ENDDO
CALL rho_r2g( dfft, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
!$acc end data
ELSE
rhoaux(:,1) = ( rho(:,1) + rho(:,2) + rho_core(:) ) / 2.0_DP
rhoaux(:,2) = ( rho(:,1) - rho(:,2) + rho_core(:) ) / 2.0_DP
rhogaux(:,1) = ( rhog(:,1) + rhog(:,2) + rhog_core(:) ) / 2.0_DP
rhogaux(:,2) = ( rhog(:,1) - rhog(:,2) + rhog_core(:) ) / 2.0_DP
END IF
!$acc parallel loop
DO k = 1, nrxx
rhoaux(k,1) = ( rho(k,1) + rho(k,2) + rho_core(k) ) / 2.0_DP
rhoaux(k,2) = ( rho(k,1) - rho(k,2) + rho_core(k) ) / 2.0_DP
ENDDO
!$acc parallel loop
DO k = 1, ngm
rhogaux(k,1) = ( rhog(k,1) + rhog(k,2) + rhog_core(k) ) / 2.0_DP
rhogaux(k,2) = ( rhog(k,1) - rhog(k,2) + rhog_core(k) ) / 2.0_DP
ENDDO
ENDIF
ENDIF
!
DO is = 1, nspin0
CALL fft_gradient_g2r( dfft, rhogaux(1,is), g, grho(1,1,is) )
CALL fft_gradient_g2r( dfft, rhogaux(:,is), g, grho(:,:,is) )
ENDDO
DEALLOCATE (rhogaux)
!
!$acc end data
DEALLOCATE( rhogaux )
!
ALLOCATE( grho2(nrxx,nspin0) )
ALLOCATE( v1x(nrxx,nspin0), v2x(nrxx,nspin0), v3x(nrxx,nspin0) )
ALLOCATE( v1c(nrxx,nspin0), v2c(np,nrxx,nspin0), v3c(nrxx,nspin0) )
ALLOCATE( v1x(nrxx,nspin0), v2x(nrxx,nspin0) )
ALLOCATE( v1c(nrxx,nspin0), v2c(nrxx,nspin0) )
ALLOCATE( sx(nrxx), sc(nrxx) )
!
IF ( xclib_dft_is('meta') ) &
ALLOCATE( v2cm(np,nrxx,nspin0), v3x(nrxx,nspin0), v3c(nrxx,nspin0) )
!$acc data create( grho2, sx, sc, v1x, v2x, v1c, v2c )
!
IF (nspin0==1) THEN
!
! Spin-unpolarized case
@ -113,73 +147,128 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
!
! routine computing v1x_v and v2x_v is different for GGA and meta-GGA
!
grho2(:,1) = grho(1,:,1)**2 + grho(2,:,1)**2 + grho(3,:,1)**2
!$acc parallel loop
DO k = 1, nrxx
grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
ENDDO
!
IF ( xclib_dft_is('meta') .AND. xclib_get_id('MGGA','EXCH') /= 4 ) THEN
kedtau(:,1) = kedtau(:,1) / e2
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtau, sx, sc, &
v1x, v2x, v3x, v1c, v2c, v3c )
kedtau(:,1) = kedtau(:,1) * e2
!$acc data present_or_copyin(kedtau) create( kedtaue2, v2cm, v3x, v3c )
!$acc parallel loop
DO k = 1, nrxx
kedtaue2(k,1) = kedtau(k,1) / e2
ENDDO
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtaue2, sx, sc, &
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
!$acc parallel loop
DO k = 1, nrxx
v2c(k,1) = v2cm(1,k,1)
ENDDO
!$acc end data
ELSE
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c(1,:,:) )
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, gpu_args_=.TRUE. )
ENDIF
!
DO l = 1, 3
DO m = 1, l
sigma_gradcorr(l,m) = sigma_gradcorr(l,m) + SUM( grho(l,:,1)*grho(m,:,1)* &
e2 * (v2x(:,1) + v2c(1,:,1)) )
ENDDO
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
DO k = 1, nrxx
v2xc = e2 * (v2x(k,1) + v2c(k,1))
sigma_gc11 = sigma_gc11 + grho(1,k,1)*grho(1,k,1) * v2xc
sigma_gc21 = sigma_gc21 + grho(2,k,1)*grho(1,k,1) * v2xc
sigma_gc22 = sigma_gc22 + grho(2,k,1)*grho(2,k,1) * v2xc
sigma_gc31 = sigma_gc31 + grho(3,k,1)*grho(1,k,1) * v2xc
sigma_gc32 = sigma_gc32 + grho(3,k,1)*grho(2,k,1) * v2xc
sigma_gc33 = sigma_gc33 + grho(3,k,1)*grho(3,k,1) * v2xc
ENDDO
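!
! the six accumulators build the symmetric GGA stress
! sigma_lm = sum_r e2*(v2x+v2c)(r) * grad(rho)_l * grad(rho)_m,
! lower triangle only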
!
ELSEIF (nspin0 == 2) THEN
!
! Spin-polarized case
!
grho2(:,:) = grho(1,:,:)**2 + grho(2,:,:)**2 + grho(3,:,:)**2
!$acc parallel loop
DO k = 1, nrxx
grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
grho2(k,2) = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2
ENDDO
!
IF ( xclib_dft_is('meta') ) THEN
!
kedtau(:,1:nspin0) = kedtau(:,1:nspin0) / e2
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtau, sx, sc, &
v1x, v2x, v3x, v1c, v2c, v3c )
kedtau(:,1:nspin0) = kedtau(:,1:nspin0) * e2
!$acc data present(kedtau) create(kedtaue2, v2cm, v3x, v3c)
!$acc parallel loop
DO k = 1, nrxx
kedtaue2(k,1:nspin0) = kedtau(k,1:nspin0) / e2
ENDDO
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtaue2, sx, sc, &
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
!$acc parallel loop
DO k = 1, nrxx
v2c(k,:) = v2cm(1,k,:)
ENDDO
!$acc end data
! FIXME : what are we supposed to do now?
!
ELSE
!
ALLOCATE( v2c_ud(nrxx) )
!$acc data create( v2c_ud )
!
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c(1,:,:), v2c_ud )
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=.TRUE. )
!
DO l = 1, 3
DO m = 1, l
!
! ... exchange
sigma_gradcorr(l,m) = &
SUM( grho(l,:,1) * grho(m,:,1) * e2 * v2x(:,1) + &
grho(l,:,2) * grho(m,:,2) * e2 * v2x(:,2) )
!
! ... correlation
sigma_gradcorr(l,m) = sigma_gradcorr(l,m) + &
SUM( grho(l,:,1) * grho(m,:,1) * v2c(1,:,1) + &
grho(l,:,2) * grho(m,:,2) * v2c(1,:,2) + &
(grho(l,:,1) * grho(m,:,2) + &
grho(l,:,2) * grho(m,:,1)) * v2c_ud(:) ) * e2
ENDDO
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
DO k = 1, nrxx
!
v2xc_uu = e2 * (v2x(k,1)+v2c(k,1))
v2xc_dd = e2 * (v2x(k,2)+v2c(k,2))
!
sigma_gc11 = sigma_gc11 + grho(1,k,1)*grho(1,k,1) * v2xc_uu + &
grho(1,k,2)*grho(1,k,2) * v2xc_dd + &
(grho(1,k,1)*grho(1,k,2) + &
grho(1,k,2)*grho(1,k,1)) * v2c_ud(k) * e2
sigma_gc21 = sigma_gc21 + grho(2,k,1)*grho(1,k,1) * v2xc_uu + &
grho(2,k,2)*grho(1,k,2) * v2xc_dd + &
(grho(2,k,1)*grho(1,k,2) + &
grho(1,k,2)*grho(2,k,1)) * v2c_ud(k) * e2
sigma_gc22 = sigma_gc22 + grho(2,k,1)*grho(2,k,1) * v2xc_uu + &
grho(2,k,2)*grho(2,k,2) * v2xc_dd + &
(grho(2,k,1)*grho(2,k,2) + &
grho(2,k,2)*grho(2,k,1)) * v2c_ud(k) * e2
sigma_gc31 = sigma_gc31 + grho(3,k,1)*grho(1,k,1) * v2xc_uu + &
grho(3,k,2)*grho(1,k,2) * v2xc_dd + &
(grho(3,k,1)*grho(1,k,2) + &
grho(1,k,2)*grho(3,k,1)) * v2c_ud(k) * e2
sigma_gc32 = sigma_gc32 + grho(3,k,1)*grho(2,k,1) * v2xc_uu + &
grho(3,k,2)*grho(2,k,2) * v2xc_dd + &
(grho(3,k,1)*grho(2,k,2) + &
grho(2,k,2)*grho(3,k,1)) * v2c_ud(k) * e2
sigma_gc33 = sigma_gc33 + grho(3,k,1)*grho(3,k,1) * v2xc_uu + &
grho(3,k,2)*grho(3,k,2) * v2xc_dd + &
(grho(3,k,1)*grho(3,k,2) + &
grho(3,k,2)*grho(3,k,1)) * v2c_ud(k) * e2
ENDDO
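!
! same contraction as in the unpolarized case, with separate up/up and
! down/down terms plus the cross up-down correlation kernel v2c_ud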
!
!$acc end data
DEALLOCATE( v2c_ud )
!
ENDIF
!
ENDIF
!
sigma_gradcorr(1,1) = sigma_gc11
sigma_gradcorr(2,1) = sigma_gc21
sigma_gradcorr(2,2) = sigma_gc22
sigma_gradcorr(3,1) = sigma_gc31
sigma_gradcorr(3,2) = sigma_gc32
sigma_gradcorr(3,3) = sigma_gc33
!
!$acc end data
!$acc end data
DEALLOCATE( sc, sx )
DEALLOCATE( v1c, v2c, v3c )
DEALLOCATE( v1x, v2x, v3x )
DEALLOCATE( grho2 )
DEALLOCATE( v1c, v2c )
DEALLOCATE( v1x, v2x )
DEALLOCATE( grho, grho2 )
DEALLOCATE( rhoaux )
DEALLOCATE( grho )
IF (xclib_dft_is('meta')) DEALLOCATE( kedtaue2, v2cm, v3x, v3c )
!
DO l = 1, 3
DO m = 1, l - 1
@ -191,6 +280,8 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
!
sigmaxc(:,:) = sigmaxc(:,:) + sigma_gradcorr(:,:) / (nr1 * nr2 * nr3)
!
!$acc end data
!
RETURN
!
END SUBROUTINE stres_gradcorr

View File

@ -11,18 +11,18 @@ SUBROUTINE stres_har( sigmahar )
!--------------------------------------------------------------------
!! Calculates the Hartree contribution to the stress
!
USE kinds, ONLY: DP
USE constants, ONLY: e2, fpi
USE cell_base, ONLY: omega, tpiba2
USE ener, ONLY: ehart
USE fft_base, ONLY: dfftp
USE fft_rho, ONLY: rho_r2g
USE gvect, ONLY: ngm, gstart, g, gg
USE scf, ONLY: rho
USE control_flags, ONLY: gamma_only
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_sum
USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar
USE kinds, ONLY: DP
USE constants, ONLY: e2, fpi
USE cell_base, ONLY: omega, tpiba2
USE ener, ONLY: ehart
USE fft_base, ONLY: dfftp
USE fft_rho, ONLY: rho_r2g
USE gvect, ONLY: ngm, gstart, g, gg
USE scf, ONLY: rho
USE control_flags, ONLY: gamma_only
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_sum
USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar
!
IMPLICIT NONE
!
@ -31,58 +31,86 @@ SUBROUTINE stres_har( sigmahar )
!
! ... local variables
!
INTEGER :: ig, l, m
REAL(DP) :: shart, g2
REAL(DP), PARAMETER :: eps = 1.d-8
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
!
sigmahar(:,:) = 0.0_DP
REAL(DP), PARAMETER :: eps = 1.E-8_DP
INTEGER :: ig, l, m
REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, &
sigmahar32, sigmahar22, sigmahar33
!
ALLOCATE( rhog(dfftp%nnr,1) )
!$acc data create(rhog)
!
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
!
! rhog contains now the charge density in G space
! the G=0 component is not computed
IF (do_cutoff_2D) THEN
CALL cutoff_stres_sigmahar( rhog, sigmahar )
ELSE
DO ig = gstart, ngm
g2 = gg(ig) * tpiba2
shart = rhog(ig,1) * CONJG(rhog(ig,1)) / g2
DO l = 1, 3
DO m = 1, l
sigmahar(l,m) = sigmahar(l,m) + shart * tpiba2 * 2 * &
g(l,ig) * g(m,ig) / g2
ENDDO
ENDDO
ENDDO
ENDIF
! ... the G=0 component is not computed
!
sigmahar(:,:) = 0.0_DP
!
IF (do_cutoff_2D) THEN
!
CALL cutoff_stres_sigmahar( rhog(:,1), sigmahar )
!
ELSE
!
sigmahar11 = 0._DP ; sigmahar31 = 0._DP
sigmahar21 = 0._DP ; sigmahar32 = 0._DP
sigmahar22 = 0._DP ; sigmahar33 = 0._DP
!
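! ... the loop below accumulates 2*|rho(G)|^2*G_l*G_m/G^4 over the lower
! ... triangle; the prefactor fpi*e2 (halved for a full-sphere sum) and
! ... the diagonal -ehart/omega term are applied after the mp_sum
!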
!$acc parallel loop reduction(+:sigmahar11,sigmahar21,sigmahar22,&
!$acc& sigmahar31,sigmahar32,sigmahar33)
DO ig = gstart, ngm
!
g2 = gg(ig)
!
shart = DBLE(rhog(ig,1)*CONJG(rhog(ig,1))) / g2
!
sigmahar11 = sigmahar11 + shart *2._DP * &
g(1,ig) * g(1,ig) / g2
sigmahar21 = sigmahar21 + shart *2._DP * &
g(2,ig) * g(1,ig) / g2
sigmahar22 = sigmahar22 + shart *2._DP * &
g(2,ig) * g(2,ig) / g2
sigmahar31 = sigmahar31 + shart *2._DP * &
g(3,ig) * g(1,ig) / g2
sigmahar32 = sigmahar32 + shart *2._DP * &
g(3,ig) * g(2,ig) / g2
sigmahar33 = sigmahar33 + shart *2._DP * &
g(3,ig) * g(3,ig) / g2
ENDDO
!
sigmahar(1,1) = sigmahar(1,1) + sigmahar11 / tpiba2
sigmahar(2,1) = sigmahar(2,1) + sigmahar21 / tpiba2
sigmahar(2,2) = sigmahar(2,2) + sigmahar22 / tpiba2
sigmahar(3,1) = sigmahar(3,1) + sigmahar31 / tpiba2
sigmahar(3,2) = sigmahar(3,2) + sigmahar32 / tpiba2
sigmahar(3,3) = sigmahar(3,3) + sigmahar33 / tpiba2
!
ENDIF
!
!$acc end data
DEALLOCATE( rhog )
!
CALL mp_sum( sigmahar, intra_bgrp_comm )
!
IF (gamma_only) THEN
sigmahar(:,:) = fpi * e2 * sigmahar(:,:)
sigmahar(:,:) = fpi * e2 * sigmahar(:,:)
ELSE
sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP
sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP
ENDIF
!
DO l = 1, 3
sigmahar(l,l) = sigmahar(l,l) - ehart / omega
sigmahar(l,l) = sigmahar(l,l) - ehart / omega
ENDDO
!
DO l = 1, 3
DO m = 1, l-1
sigmahar(m,l) = sigmahar(l,m)
ENDDO
DO m = 1, l-1
sigmahar(m,l) = sigmahar(l,m)
ENDDO
ENDDO
!
sigmahar(:,:) = -sigmahar(:,:)
!
!
RETURN
!
END SUBROUTINE stres_har

View File

@ -1,122 +0,0 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------
SUBROUTINE stres_har_gpu( sigmahar )
!--------------------------------------------------------------------
!! Calculates the Hartree contribution to the stress
!
USE kinds, ONLY: DP
USE constants, ONLY: e2, fpi
USE cell_base, ONLY: omega, tpiba2
USE ener, ONLY: ehart
USE fft_base, ONLY: dfftp
USE fft_rho, ONLY: rho_r2g
USE gvect, ONLY: ngm, gstart
USE scf, ONLY: rho
USE control_flags, ONLY: gamma_only
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_sum
USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar_gpu
!
USE gvect, ONLY: g_d, gg_d
!
IMPLICIT NONE
!
REAL(DP) :: sigmahar(3,3)
!! Hartree term of the stress tensor
!
! ... local variables
!
REAL(DP) :: shart, g2
REAL(DP), PARAMETER :: eps = 1.E-8_DP
INTEGER :: ig, l, m
!
REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, &
sigmahar32, sigmahar22, sigmahar33
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
!
sigmahar(:,:) = 0.0_DP
!
ALLOCATE( rhog(dfftp%nnr,1) )
!$acc data create(rhog)
!
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
!
! rhog contains now the charge density in G space
! the G=0 component is not computed
!
IF (do_cutoff_2D) THEN
!
CALL cutoff_stres_sigmahar_gpu( rhog, sigmahar )
!
ELSE
!
sigmahar11 = 0._DP ; sigmahar31 = 0._DP
sigmahar21 = 0._DP ; sigmahar32 = 0._DP
sigmahar22 = 0._DP ; sigmahar33 = 0._DP
!
!$acc parallel loop reduction(+:sigmahar11,sigmahar21,sigmahar22, &
!$acc sigmahar31,sigmahar32,sigmahar33)
DO ig = gstart, ngm
!
g2 = gg_d(ig)
!
shart = DBLE(rhog(ig,1)*CONJG(rhog(ig,1))) / g2
!
sigmahar11 = sigmahar11 + shart *2._DP * &
g_d(1,ig) * g_d(1,ig) / g2
sigmahar21 = sigmahar21 + shart *2._DP * &
g_d(2,ig) * g_d(1,ig) / g2
sigmahar22 = sigmahar22 + shart *2._DP * &
g_d(2,ig) * g_d(2,ig) / g2
sigmahar31 = sigmahar31 + shart *2._DP * &
g_d(3,ig) * g_d(1,ig) / g2
sigmahar32 = sigmahar32 + shart *2._DP * &
g_d(3,ig) * g_d(2,ig) / g2
sigmahar33 = sigmahar33 + shart *2._DP * &
g_d(3,ig) * g_d(3,ig) / g2
ENDDO
!
sigmahar(1,1) = sigmahar(1,1) + sigmahar11 / tpiba2
sigmahar(2,1) = sigmahar(2,1) + sigmahar21 / tpiba2
sigmahar(2,2) = sigmahar(2,2) + sigmahar22 / tpiba2
sigmahar(3,1) = sigmahar(3,1) + sigmahar31 / tpiba2
sigmahar(3,2) = sigmahar(3,2) + sigmahar32 / tpiba2
sigmahar(3,3) = sigmahar(3,3) + sigmahar33 / tpiba2
!
ENDIF
!
!$acc end data
DEALLOCATE( rhog )
!
CALL mp_sum( sigmahar, intra_bgrp_comm )
!
IF (gamma_only) THEN
sigmahar(:,:) = fpi * e2 * sigmahar(:,:)
ELSE
sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP
ENDIF
!
DO l = 1, 3
sigmahar(l,l) = sigmahar(l,l) - ehart / omega
ENDDO
!
DO l = 1, 3
DO m = 1, l-1
sigmahar(m,l) = sigmahar(l,m)
ENDDO
ENDDO
!
sigmahar(:,:) = -sigmahar(:,:)
!
!
RETURN
!
END SUBROUTINE stres_har_gpu

View File

@ -9,11 +9,11 @@
!-----------------------------------------------------------------------
SUBROUTINE stres_knl( sigmanlc, sigmakin )
!-----------------------------------------------------------------------
!! Computes the kinetic + nonlocal contribution to the stress
!! Computes the kinetic + nonlocal contribution to the stress.
!
USE kinds, ONLY: DP
USE constants, ONLY: pi, e2
USE cell_base, ONLY: omega, alat, at, bg, tpiba
USE cell_base, ONLY: omega, tpiba
USE gvect, ONLY: g
USE gvecw, ONLY: qcutz, ecfixed, q2sigma
USE klist, ONLY: nks, xk, ngk, igk_k
@ -27,76 +27,114 @@ SUBROUTINE stres_knl( sigmanlc, sigmakin )
USE mp_pools, ONLY: inter_pool_comm
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_sum
USE wavefunctions_gpum, ONLY: using_evc
#if defined(__CUDA)
USE wavefunctions_gpum, ONLY: using_evc, using_evc_d
#endif
!
IMPLICIT NONE
!
REAL(DP) :: sigmanlc (3, 3)
REAL(DP) :: sigmanlc(3,3)
!! non-local contribution to stress
REAL(DP) :: sigmakin (3, 3)
REAL(DP) :: sigmakin(3,3)
!! kinetic contribution to stress
!
! ... local variables
!
REAL(DP), ALLOCATABLE :: gk (:,:), kfac (:)
REAL(DP) :: twobysqrtpi, gk2, arg
INTEGER :: npw, ik, l, m, i, ibnd, is
REAL(DP), ALLOCATABLE :: gk(:,:), kfac(:)
REAL(DP) :: twobysqrtpi, gk2, arg, s11, s21, s31, s22, s32, s33, &
xk1, xk2, xk3, tmpf, wg_nk
INTEGER :: npw, ik, l, m, i, ibnd
!
#if defined(__CUDA)
CALL using_evc(0)
ALLOCATE( gk(npwx,3) )
ALLOCATE( kfac(npwx) )
CALL using_evc_d(0)
#endif
!
sigmanlc(:,:) = 0.d0
sigmakin(:,:) = 0.d0
twobysqrtpi = 2.d0/SQRT(pi)
!$acc enter data create( evc )
!
kfac(:) = 1.d0
ALLOCATE( gk(npwx,3), kfac(npwx) )
!$acc data create( gk, kfac )
!$acc data copyin( wg )
!
sigmanlc(:,:) = 0._DP
sigmakin(:,:) = 0._DP
twobysqrtpi = 2._DP/SQRT(pi)
!
!$acc kernels
kfac(:) = 1._DP
!$acc end kernels
!
s11 = 0._DP ; s22 = 0._DP
s21 = 0._DP ; s32 = 0._DP
s31 = 0._DP ; s33 = 0._DP
!
DO ik = 1, nks
IF ( nks > 1 ) CALL get_buffer( evc, nwordwfc, iunwfc, ik )
if ( nks > 1 ) CALL using_evc(2)
IF ( nks > 1 ) THEN
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
#if defined(__CUDA)
CALL using_evc(2)
CALL using_evc_d(0)
#endif
ENDIF
!
npw = ngk(ik)
!
xk1 = xk(1,ik)
xk2 = xk(2,ik)
xk3 = xk(3,ik)
!
!$acc parallel loop
DO i = 1, npw
gk(i,1) = ( xk(1,ik) + g(1,igk_k(i,ik)) ) * tpiba
gk(i,2) = ( xk(2,ik) + g(2,igk_k(i,ik)) ) * tpiba
gk(i,3) = ( xk(3,ik) + g(3,igk_k(i,ik)) ) * tpiba
IF (qcutz > 0.d0) THEN
gk(i,1) = ( xk1 + g(1,igk_k(i,ik)) ) * tpiba
gk(i,2) = ( xk2 + g(2,igk_k(i,ik)) ) * tpiba
gk(i,3) = ( xk3 + g(3,igk_k(i,ik)) ) * tpiba
IF (qcutz > 0._DP) THEN
gk2 = gk(i,1)**2 + gk(i,2)**2 + gk(i,3)**2
arg = ( (gk2-ecfixed)/q2sigma )**2
kfac(i) = 1.d0 + qcutz / q2sigma * twobysqrtpi * EXP(-arg)
kfac(i) = 1._DP + qcutz / q2sigma * twobysqrtpi * EXP(-arg)
ENDIF
ENDDO
!
! ... kinetic contribution
!
DO l = 1, 3
DO m = 1, l
DO ibnd = 1, nbnd
DO i = 1, npw
IF (noncolin) THEN
sigmakin(l,m) = sigmakin(l,m) + wg(ibnd,ik) * &
gk(i,l) * gk(i, m) * kfac(i) * &
( DBLE (CONJG(evc( i ,ibnd))*evc( i ,ibnd)) + &
DBLE (CONJG(evc(i+npwx,ibnd))*evc(i+npwx,ibnd)))
ELSE
sigmakin(l,m) = sigmakin(l,m) + wg(ibnd,ik) * &
gk(i,l) * gk(i, m) * kfac(i) * &
DBLE (CONJG(evc(i, ibnd) ) * evc(i, ibnd) )
ENDIF
ENDDO
ENDDO
!$acc update device(evc)
!
!$acc parallel loop collapse(2) reduction(+:s11,s21,s31,s22,s32,s33)
DO ibnd = 1, nbnd
DO i = 1, npw
wg_nk = wg(ibnd,ik)
IF (noncolin) THEN
tmpf = wg_nk * kfac(i) * &
( DBLE(CONJG(evc( i ,ibnd))*evc( i ,ibnd)) + &
DBLE(CONJG(evc(i+npwx,ibnd))*evc(i+npwx,ibnd)) )
ELSE
tmpf = wg_nk * kfac(i) * &
DBLE( CONJG(evc(i,ibnd) ) * evc(i,ibnd) )
ENDIF
s11 = s11 + tmpf * gk(i,1)*gk(i,1)
s21 = s21 + tmpf * gk(i,2)*gk(i,1)
s31 = s31 + tmpf * gk(i,3)*gk(i,1)
s22 = s22 + tmpf * gk(i,2)*gk(i,2)
s32 = s32 + tmpf * gk(i,3)*gk(i,2)
s33 = s33 + tmpf * gk(i,3)*gk(i,3)
ENDDO
!
ENDDO
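!
! ... s11..s33 now include this k point:
! ... sigma_kin(l,m) = sum_{n,G} wg(n,ik)*kfac(G)*(k+G)_l*(k+G)_m*|evc(G,n)|^2,
! ... lower triangle only, symmetrized after the k-point loop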
!
! ... contribution from the nonlocal part
! ... contribution from the nonlocal part
!
CALL stres_us( ik, gk, sigmanlc )
!
ENDDO
!
DEALLOCATE( kfac )
DEALLOCATE( gk )
sigmakin(:,1) = [s11, s21, s31]
sigmakin(:,2) = [0._DP,s22, s32]
sigmakin(:,3) = [0._DP,0._DP,s33]
!
!$acc end data
!$acc end data
DEALLOCATE( gk, kfac )
!
!$acc exit data delete(evc)
!
! ... the kinetic term must be summed over PW's and over k-points
!
@ -121,11 +159,11 @@ SUBROUTINE stres_knl( sigmanlc, sigmakin )
ENDDO
!
IF ( gamma_only ) THEN
sigmakin(:,:) = 2.d0 * e2 / omega * sigmakin(:,:)
sigmakin(:,:) = 2._DP * e2 / omega * sigmakin(:,:)
ELSE
sigmakin(:,:) = e2 / omega * sigmakin(:,:)
ENDIF
sigmanlc(:,:) = -1.d0 / omega * sigmanlc(:,:)
sigmanlc(:,:) = -1._DP / omega * sigmanlc(:,:)
!
! ... symmetrize stress
!

View File

@ -1,168 +0,0 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE stres_knl_gpu( sigmanlc, sigmakin )
!-----------------------------------------------------------------------
!! Computes the kinetic + nonlocal contribution to the stress
!
USE kinds, ONLY: DP
USE constants, ONLY: pi, e2
USE cell_base, ONLY: omega, alat, at, bg, tpiba
USE gvect, ONLY: g, g_d
USE gvecw, ONLY: qcutz, ecfixed, q2sigma
USE klist, ONLY: nks, xk, ngk, igk_k_d
USE io_files, ONLY: iunwfc, nwordwfc
USE buffers, ONLY: get_buffer
USE symme, ONLY: symmatrix
USE wvfct, ONLY: npwx, nbnd, wg
USE control_flags, ONLY: gamma_only, use_gpu
USE noncollin_module, ONLY: noncolin, npol
USE wavefunctions, ONLY: evc
USE mp_pools, ONLY: inter_pool_comm
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_sum
#if defined(__CUDA)
USE wavefunctions_gpum, ONLY: using_evc, using_evc_d, evc_d
USE device_fbuff_m, ONLY : dev_buf
#endif
!
IMPLICIT NONE
!
REAL(DP) :: sigmanlc(3,3)
!! non-local contribution to stress
REAL(DP) :: sigmakin(3,3)
!! kinetic contribution to stress
!
! ... local variables
!
REAL(DP), POINTER :: gk_d(:,:), kfac_d (:)
REAL(DP) :: twobysqrtpi, gk2, arg, s11, s21, s31, s22, s32, s33, &
xk1, xk2, xk3, tmpf, wg_nk
INTEGER :: npw, ik, l, m, i, ibnd, is
INTEGER :: ierr(2)
!
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: gk_d, kfac_d
!
CALL using_evc(0)
CALL using_evc_d(0)
!
CALL dev_buf%lock_buffer( gk_d, [npwx,3], ierr(1) )
CALL dev_buf%lock_buffer( kfac_d, npwx, ierr(2) )
!
sigmanlc(:,:) = 0._DP
sigmakin(:,:) = 0._DP
twobysqrtpi = 2._DP/SQRT(pi)
!
kfac_d(:) = 1._DP
!
s11 = 0._DP ; s22 = 0._DP
s21 = 0._DP ; s32 = 0._DP
s31 = 0._DP ; s33 = 0._DP
!
DO ik = 1, nks
IF ( nks > 1 ) THEN
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
CALL using_evc(2)
CALL using_evc_d(0)
ENDIF
!
npw = ngk(ik)
!
xk1 = xk(1,ik)
xk2 = xk(2,ik)
xk3 = xk(3,ik)
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npw
gk_d(i,1) = ( xk1 + g_d(1,igk_k_d(i,ik)) ) * tpiba
gk_d(i,2) = ( xk2 + g_d(2,igk_k_d(i,ik)) ) * tpiba
gk_d(i,3) = ( xk3 + g_d(3,igk_k_d(i,ik)) ) * tpiba
IF (qcutz > 0._DP) THEN
gk2 = gk_d(i,1)**2 + gk_d(i,2)**2 + gk_d(i,3)**2
arg = ( (gk2-ecfixed)/q2sigma )**2
kfac_d(i) = 1._DP + qcutz / q2sigma * twobysqrtpi * EXP(-arg)
ENDIF
ENDDO
!
! ... kinetic contribution
!
DO ibnd = 1, nbnd
wg_nk = wg(ibnd,ik)
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npw
IF (noncolin) THEN
tmpf = wg_nk * kfac_d(i) * &
( DBLE(CONJG(evc_d( i ,ibnd))*evc_d( i ,ibnd)) + &
DBLE(CONJG(evc_d(i+npwx,ibnd))*evc_d(i+npwx,ibnd)) )
ELSE
tmpf = wg_nk * kfac_d(i) * &
DBLE( CONJG(evc_d(i, ibnd) ) * evc_d(i, ibnd) )
ENDIF
s11 = s11 + tmpf * gk_d(i,1)*gk_d(i,1)
s21 = s21 + tmpf * gk_d(i,2)*gk_d(i,1)
s31 = s31 + tmpf * gk_d(i,3)*gk_d(i,1)
s22 = s22 + tmpf * gk_d(i,2)*gk_d(i,2)
s32 = s32 + tmpf * gk_d(i,3)*gk_d(i,2)
s33 = s33 + tmpf * gk_d(i,3)*gk_d(i,3)
ENDDO
ENDDO
!
! ... contribution from the nonlocal part
!
CALL stres_us_gpu( ik, gk_d, sigmanlc )
!
ENDDO
!
sigmakin(:,1) = [s11, s21, s31]
sigmakin(:,2) = [0._DP,s22, s32]
sigmakin(:,3) = [0._DP,0._DP,s33]
!
CALL dev_buf%release_buffer( kfac_d, ierr(2) )
CALL dev_buf%release_buffer( gk_d, ierr(1) )
!
! ... the kinetic term must be summed over PW's and over k-points
!
CALL mp_sum( sigmakin, intra_bgrp_comm )
CALL mp_sum( sigmakin, inter_pool_comm )
!
! ... the nonlocal term is summed here only over k-points, because we add
! ... to it the US term from augmentation charge derivatives
!
CALL mp_sum( sigmanlc, inter_pool_comm )
!
! ... add US term from augmentation charge derivatives, sum result over PW's
!
CALL addusstress( sigmanlc )
CALL mp_sum( sigmanlc, intra_bgrp_comm )
!
DO l = 1, 3
DO m = 1, l-1
sigmanlc(m,l) = sigmanlc(l,m)
sigmakin(m,l) = sigmakin(l,m)
ENDDO
ENDDO
!
IF ( gamma_only ) THEN
sigmakin(:,:) = 2._DP * e2 / omega * sigmakin(:,:)
ELSE
sigmakin(:,:) = e2 / omega * sigmakin(:,:)
ENDIF
sigmanlc(:,:) = -1._DP / omega * sigmanlc(:,:)
!
! ... symmetrize stress
!
CALL symmatrix( sigmakin )
CALL symmatrix( sigmanlc )
#endif
!
RETURN
!
END SUBROUTINE stres_knl_gpu

View File

@ -1,4 +1,3 @@
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
@ -7,8 +6,9 @@
!
!
!----------------------------------------------------------------------
subroutine stres_loc( sigmaloc )
SUBROUTINE stres_loc( sigmaloc )
!----------------------------------------------------------------------
!! Calculate the local term of the stress.
!
USE kinds, ONLY : DP
USE atom, ONLY : msh, rgrid
@ -24,88 +24,125 @@ subroutine stres_loc( sigmaloc )
USE uspp_param, ONLY : upf
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc, cutoff_stres_sigmaloc
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc, &
cutoff_stres_sigmaloc
!
implicit none
IMPLICIT NONE
!
real(DP) :: sigmaloc(3,3)
real(DP), allocatable :: dvloc(:)
complex(DP), allocatable :: rhog(:,:)
real(DP) :: evloc, fact
integer :: ng, nt, l, m
REAL(DP) :: sigmaloc(3,3)
REAL(DP), ALLOCATABLE :: dvloc(:)
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
REAL(DP) :: evloc, fact
INTEGER :: ng, nt, l, m
! counter on g vectors
! counter on atomic type
! counter on angular momentum
! counter on spin components
REAL(DP) :: sigma11, sigma21, sigma22, spart, &
sigma31, sigma32, sigma33
!
allocate( dvloc(ngl), rhog(dfftp%nnr,1) )
ALLOCATE( dvloc(ngl), rhog(dfftp%nnr,1) )
sigmaloc(:,:) = 0.d0
!
call rho_r2g( dfftp, rho%of_r(:,1), rhog )
!$acc data create(rhog)
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
!
! rhog contains the charge density in G space
if (gamma_only) then
!$acc data copyin(vloc,strf,gl,igtongl) create(dvloc)
!
IF (gamma_only) THEN
fact = 2.d0
else
ELSE
fact = 1.d0
endif
ENDIF
!
evloc = 0.0d0
do nt = 1, ntyp
if (gstart==2) evloc = evloc + rhog(1,1) * strf(1,nt) * vloc(igtongl(1),nt)
do ng = gstart, ngm
evloc = evloc + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt) ) &
* vloc(igtongl(ng),nt) * fact
enddo
enddo
! 2D: add contribution from cutoff long-range part of Vloc
IF (do_cutoff_2D) call cutoff_stres_evloc( rhog(:,1), strf, evloc )
!
IF (gstart==2) THEN
!$acc parallel loop reduction(+:evloc)
DO nt = 1, ntyp
evloc = evloc + rhog(1,1)*strf(1,nt)*vloc(igtongl(1),nt)
ENDDO
ENDIF
!
!$acc parallel loop collapse(2) reduction(+:evloc)
DO nt = 1, ntyp
DO ng = gstart, ngm
evloc = evloc + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt)) &
* vloc(igtongl(ng),nt) * fact
ENDDO
ENDDO
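!
! ... evloc = sum_G Re( rho*(G)*strf(G,nt)*vloc(G,nt) ), doubled by fact
! ... in the gamma-only case: the local pseudopotential energy per cell,
! ... added to the diagonal of sigmaloc below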
!
! ... 2D: add contribution from cutoff long-range part of Vloc
IF (do_cutoff_2D) CALL cutoff_stres_evloc( rhog(:,1), strf, evloc )
!
! WRITE( 6,*) ' evloc ', evloc, evloc*omega ! DEBUG
!
do nt = 1, ntyp
DO nt = 1, ntyp
!
IF ( upf(nt)%is_gth ) THEN
!
! special case: GTH pseudopotential
!
call dvloc_gth( nt, upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
CALL dvloc_gth( nt, upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
!
ELSEIF ( upf(nt)%tcoulombp ) THEN
!
! special case: pseudopotential is coulomb 1/r potential
!
call dvloc_coul( upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
CALL dvloc_coul( upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
!
ELSE
!
! normal case: dvloc contains dV_loc(G)/dG
!
call dvloc_of_g( rgrid(nt)%mesh, msh(nt), rgrid(nt)%rab, rgrid(nt)%r, &
upf(nt)%vloc(:), upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
CALL dvloc_of_g( rgrid(nt)%mesh, msh(nt), rgrid(nt)%rab, rgrid(nt)%r, &
upf(nt)%vloc(:), upf(nt)%zp, tpiba2, ngl, gl, omega, &
dvloc )
!
ENDIF
! no G=0 contribution
do ng = 1, ngm
do l = 1, 3
do m = 1, l
sigmaloc(l,m) = sigmaloc(l,m) + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt))* &
2.0d0 * dvloc(igtongl(ng) ) * tpiba2 * &
g(l,ng) * g(m,ng) * fact
enddo
enddo
enddo
enddo
IF (do_cutoff_2D) call cutoff_stres_sigmaloc( rhog(:,1), strf, sigmaloc ) ! 2D: re-add LR Vloc to sigma here
! ... no G=0 contribution
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
!
!$acc parallel loop reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32,&
!$acc sigma33)
DO ng = 1, ngm
spart = DBLE(CONJG(rhog(ng,1)) * strf(ng,nt)) * 2.0_DP * &
dvloc(igtongl(ng))
sigma11 = sigma11 + spart * g(1,ng) * g(1,ng)
sigma21 = sigma21 + spart * g(2,ng) * g(1,ng)
sigma22 = sigma22 + spart * g(2,ng) * g(2,ng)
sigma31 = sigma31 + spart * g(3,ng) * g(1,ng)
sigma32 = sigma32 + spart * g(3,ng) * g(2,ng)
sigma33 = sigma33 + spart * g(3,ng) * g(3,ng)
ENDDO
!
sigmaloc(1,1) = sigmaloc(1,1) + sigma11 * fact * tpiba2
sigmaloc(2,1) = sigmaloc(2,1) + sigma21 * fact * tpiba2
sigmaloc(2,2) = sigmaloc(2,2) + sigma22 * fact * tpiba2
sigmaloc(3,1) = sigmaloc(3,1) + sigma31 * fact * tpiba2
sigmaloc(3,2) = sigmaloc(3,2) + sigma32 * fact * tpiba2
sigmaloc(3,3) = sigmaloc(3,3) + sigma33 * fact * tpiba2
!
ENDDO
!
do l = 1, 3
! ... 2D: re-add LR Vloc to sigma here
IF (do_cutoff_2D) CALL cutoff_stres_sigmaloc( rhog(:,1), strf, sigmaloc )
!
!$acc end data
!$acc end data
!
DO l = 1, 3
sigmaloc(l,l) = sigmaloc(l,l) + evloc
do m = 1, l-1
DO m = 1, l-1
sigmaloc(m,l) = sigmaloc(l,m)
enddo
enddo
ENDDO
ENDDO
!
call mp_sum( sigmaloc, intra_bgrp_comm )
CALL mp_sum( sigmaloc, intra_bgrp_comm )
!
deallocate( dvloc, rhog )
return
end subroutine stres_loc
DEALLOCATE( dvloc, rhog )
!
RETURN
!
END SUBROUTINE stres_loc

View File

@ -1,199 +0,0 @@
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------
SUBROUTINE stres_loc_gpu( sigmaloc )
!----------------------------------------------------------------------
!
USE kinds, ONLY : DP
USE atom, ONLY : msh, rgrid
USE m_gth, ONLY : dvloc_gth_gpu
USE ions_base, ONLY : ntyp => nsp
USE cell_base, ONLY : omega, tpiba2
USE fft_base, ONLY : dfftp
USE fft_rho, ONLY : rho_r2g
USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, &
g_d, gl_d, igtongl_d
USE scf, ONLY : rho
USE vlocal, ONLY : strf, vloc
USE control_flags, ONLY : gamma_only
USE uspp_param, ONLY : upf
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc_gpu, cutoff_stres_sigmaloc_gpu
!
#if defined(__CUDA)
USE device_fbuff_m, ONLY : dev_buf
USE device_memcpy_m, ONLY : dev_memcpy
#endif
!
implicit none
!
REAL(DP) :: sigmaloc(3,3)
REAL(DP) :: evloc, fact
INTEGER :: ng, nt, l, m
! counter on g vectors
! counter on atomic type
! counter on angular momentum
! counter on spin components
!
INTEGER :: ierrs(5)
REAL(DP) :: evloc_d, zp_d
REAL(DP) :: spart, sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
!
INTEGER :: mshd
!
INTEGER, POINTER :: nl_d(:)
REAL(DP), POINTER :: rab_d(:), r_d(:), vloc_d(:,:), dvloc_d(:), &
upfvloc_d(:)
COMPLEX(DP), POINTER :: strf_d(:,:)
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
!
#if defined(__CUDA)
attributes(DEVICE) :: vloc_d, strf_d, nl_d, rab_d, r_d, dvloc_d, &
upfvloc_d
!
nl_d => dfftp%nl_d
!
sigmaloc(:,:) = 0._DP
!
ALLOCATE( rhog(dfftp%nnr,1) )
!$acc data create( rhog )
!
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
!
gl_d = gl ; igtongl_d = igtongl
!
! rhog contains now the charge density in G space
fact = 1._DP
IF (gamma_only) fact = 2._DP
!
evloc = 0.0_DP
!
CALL dev_buf%lock_buffer( strf_d, (/ ngm, ntyp /), ierrs(1) )
CALL dev_memcpy( strf_d, strf )
!
CALL dev_buf%lock_buffer( vloc_d, (/ ngl, ntyp /), ierrs(2) )
CALL dev_memcpy( vloc_d, vloc )
!
!
DO nt = 1, ntyp
IF (gstart==2) THEN
!$acc kernels
evloc = evloc + rhog(1,1) * strf_d(1,nt) * vloc_d(igtongl_d(1),nt)
!$acc end kernels
ENDIF
!
evloc_d = 0._DP
!
!$acc parallel loop
DO ng = gstart, ngm
evloc_d = evloc_d + DBLE(CONJG(rhog(ng,1)) * strf_d(ng,nt)) &
* vloc_d(igtongl_d(ng),nt)
ENDDO
!
evloc = evloc + evloc_d * fact
!
ENDDO
!
!
CALL dev_buf%release_buffer( vloc_d, ierrs(2) )
!
CALL dev_buf%lock_buffer( dvloc_d, ngl, ierrs(2) )
mshd = MAXVAL(rgrid(1:ntyp)%mesh)
CALL dev_buf%lock_buffer( rab_d, mshd, ierrs(3) )
CALL dev_buf%lock_buffer( r_d, mshd, ierrs(4) )
CALL dev_buf%lock_buffer( upfvloc_d, mshd, ierrs(5) )
!
!
! 2D: add contribution from cutoff long-range part of Vloc
!$acc host_data use_device(rhog)
IF (do_cutoff_2D) CALL cutoff_stres_evloc_gpu( rhog(:,1), strf_d, evloc )
!$acc end host_data
!
! WRITE( 6,*) ' evloc ', evloc, evloc*omega ! DEBUG
!
DO nt = 1, ntyp
IF ( upf(nt)%is_gth ) THEN
!
! special case: GTH pseudopotential
!
CALL dvloc_gth_gpu( nt, upf(nt)%zp, tpiba2, ngl, gl_d, omega, dvloc_d )
!
ELSEIF ( upf(nt)%tcoulombp ) THEN
!
! special case: pseudopotential is coulomb 1/r potential
!
CALL dvloc_coul_gpu( upf(nt)%zp, tpiba2, ngl, gl_d, omega, dvloc_d )
!
ELSE
!
! normal case: dvloc contains dV_loc(G)/dG
!
! the G=0 component is not computed
rab_d(1:msh(nt)) = rgrid(nt)%rab(1:msh(nt))
r_d(1:msh(nt)) = rgrid(nt)%r(1:msh(nt))
upfvloc_d(1:msh(nt)) = upf(nt)%vloc(1:msh(nt))
zp_d = upf(nt)%zp
!
CALL dvloc_of_g_gpu( rgrid(nt)%mesh, msh(nt), rab_d(1:rgrid(nt)%mesh), &
r_d(1:rgrid(nt)%mesh), upfvloc_d(1:rgrid(nt)%mesh), &
zp_d, tpiba2, ngl, gl_d, omega, dvloc_d )
!
ENDIF
!
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
!
!$acc parallel loop reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32,sigma33)
DO ng = 1, ngm
spart = DBLE(CONJG(rhog(ng,1)) * strf_d(ng,nt)) * 2.0_DP *&
dvloc_d(igtongl_d(ng))
sigma11 = sigma11 + spart * g_d(1,ng) * g_d(1,ng)
sigma21 = sigma21 + spart * g_d(2,ng) * g_d(1,ng)
sigma22 = sigma22 + spart * g_d(2,ng) * g_d(2,ng)
sigma31 = sigma31 + spart * g_d(3,ng) * g_d(1,ng)
sigma32 = sigma32 + spart * g_d(3,ng) * g_d(2,ng)
sigma33 = sigma33 + spart * g_d(3,ng) * g_d(3,ng)
ENDDO
!
sigmaloc(1,1) = sigmaloc(1,1) + sigma11 * fact * tpiba2
sigmaloc(2,1) = sigmaloc(2,1) + sigma21 * fact * tpiba2
sigmaloc(2,2) = sigmaloc(2,2) + sigma22 * fact * tpiba2
sigmaloc(3,1) = sigmaloc(3,1) + sigma31 * fact * tpiba2
sigmaloc(3,2) = sigmaloc(3,2) + sigma32 * fact * tpiba2
sigmaloc(3,3) = sigmaloc(3,3) + sigma33 * fact * tpiba2
!
ENDDO
!
!$acc host_data use_device(rhog)
IF (do_cutoff_2D) CALL cutoff_stres_sigmaloc_gpu( rhog(:,1), strf_d, sigmaloc ) ! 2D: re-add LR Vloc to sigma here
!$acc end host_data
!
DO l = 1, 3
sigmaloc(l,l) = sigmaloc(l,l) + evloc
DO m = 1, l-1
sigmaloc(m,l) = sigmaloc(l,m)
ENDDO
ENDDO
!
CALL mp_sum( sigmaloc, intra_bgrp_comm )
!
!$acc end data
DEALLOCATE( rhog )
CALL dev_buf%release_buffer( strf_d, ierrs(1) )
CALL dev_buf%release_buffer( dvloc_d, ierrs(2) )
CALL dev_buf%release_buffer( rab_d, ierrs(3) )
CALL dev_buf%release_buffer( r_d, ierrs(4) )
CALL dev_buf%release_buffer( upfvloc_d, ierrs(5) )
!
#endif
RETURN
!
END SUBROUTINE stres_loc_gpu

View File

@ -9,16 +9,16 @@
!----------------------------------------------------------------------------
SUBROUTINE stres_mgga( sigmaxc )
!----------------------------------------------------------------------------
!
! Analytic stress tensor contribution from metagga is added to sigmaxc
!! Analytic stress tensor contribution from metagga is added to sigmaxc.
!
USE kinds, ONLY : DP
USE control_flags, ONLY : gamma_only
USE noncollin_module, ONLY : noncolin
USE cell_base, ONLY : alat, at, bg, omega, tpiba
USE cell_base, ONLY : omega
USE gvect, ONLY : g
USE scf, ONLY : rho, v
USE wavefunctions, ONLY : evc
USE wavefunctions_gpum, ONLY : using_evc
USE xc_lib, ONLY : xclib_dft_is
USE klist, ONLY : nks, xk, ngk
USE buffers, ONLY : get_buffer
@ -26,74 +26,75 @@ SUBROUTINE stres_mgga( sigmaxc )
USE wvfct, ONLY : nbnd, npwx, wg
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
USE fft_interfaces, ONLY : fwfft, invfft
USE fft_base, ONLY : dfftp, dffts
USE fft_base, ONLY : dffts
USE mp, ONLY : mp_sum
USE mp_pools, ONLY : inter_pool_comm
USE mp_bands, ONLY : intra_bgrp_comm
USE wavefunctions_gpum, ONLY : using_evc
!
IMPLICIT NONE
!
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
!
! Internal variables
! ... local variables
!
INTEGER :: ix, iy, ir, iss, ipol, incr, ibnd, ik, npw
INTEGER :: ipol2xy(3,3)
!! ipol2xy(i,j) = ipol2x(j,i) is a collapsed symmetric index
DATA ipol2xy / 1, 2, 3, 2, 4, 5, 3, 5, 6/
REAL(DP), PARAMETER :: epsr = 1.0d-6, epsg = 1.0d-10, e2 = 2.d0
COMPLEX(DP), ALLOCATABLE :: gradwfc (:,:), crosstaus(:,:,:)
REAL(DP) :: w1, w2, delta, sigma_mgga(3,3)
INTEGER :: ix, iy, K, ir, ipol, iss, incr, ibnd, ik, npw
!
if ( .not. xclib_dft_is('meta') ) return
REAL(DP), PARAMETER :: epsr = 1.E-6_DP, epsg = 1.E-10_DP, e2 = 2._DP
!
current_spin=1
COMPLEX(DP), ALLOCATABLE :: gradwfc(:,:), crosstaus(:,:,:)
REAL(DP), ALLOCATABLE :: vkin(:), rhokin(:)
REAL(DP) :: sigma1, sigma2, sigma3, &
sigma4, sigma5, sigma6
!
REAL(DP) :: w1, w2, delta, sigma_mgga(3,3)
!
!
! Stop if something is not yet implemented
IF ( .NOT. xclib_dft_is('meta') ) RETURN
!
if (noncolin) call errore('stres_mgga', &
'noncollinear stress + meta-GGA not implemented',1)
CALL using_evc(1)
!
! Initialization of a set of variables
current_spin = 1
!
allocate (gradwfc( dffts%nnr, 3))
allocate (crosstaus( dffts%nnr,6,nspin))
IF ( noncolin ) CALL errore( 'stres_mgga', 'noncollinear stress + meta-GGA &
&not implemented', 1 )
!
! For gamma_only efficiency
ALLOCATE( gradwfc(dffts%nnr,3) )
ALLOCATE( crosstaus(dffts%nnr,6,nspin) )
!$acc data create(crosstaus)
!$acc data create(gradwfc)
!
incr=1
IF ( gamma_only ) incr=2
!$acc kernels
crosstaus = 0.d0
!$acc end kernels
!
crosstaus(:,:,:) = 0.d0
gradwfc(:,:) = 0.d0
! ... For gamma_only efficiency
!
! Loop over the k points
incr = 1
IF ( gamma_only ) incr = 2
!
! ... Loop over the k points
!
k_loop: DO ik = 1, nks
!
!
IF ( lsda ) current_spin = isk(ik)
!
npw = ngk(ik)
!
! Read the wavefunctions
! ... Read the wavefunctions
!
IF ( nks > 1 ) THEN
!
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
CALL using_evc(2)
!
END IF
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
!$acc update device(evc)
ENDIF
!
do ibnd = 1, nbnd, incr
!
DO ibnd = 1, nbnd, incr
!
! w1, w2: weights for each k point and band
! ... w1, w2: weights for each k point and band
!
w1 = wg(ibnd,ik) / omega
!
IF ( (ibnd < nbnd) .and. (gamma_only) ) THEN
IF ( (ibnd < nbnd) .AND. (gamma_only) ) THEN
!
! ... two ffts at the same time
!
@ -103,165 +104,223 @@ SUBROUTINE stres_mgga( sigmaxc )
!
w2 = w1
!
END IF
ENDIF
!
! ... Gradient of the wavefunction in real space
!
! Gradient of the wavefunction in real space
!
CALL wfc_gradient( ibnd, ik, npw, gradwfc )
!
! Cross terms of kinetic energy density
! ... Cross terms of kinetic energy density
!
do ix=1,3
!
do iy=1,ix
!
ipol = ipol2xy(iy,ix)
!
do ir=1,dffts%nnr
!
crosstaus(ir,ipol,current_spin) = crosstaus(ir,ipol,current_spin) +&
2.0_DP*w1*DBLE(gradwfc(ir,ix))*DBLE(gradwfc(ir,iy)) +&
2.0_DP*w2*AIMAG(gradwfc(ir,ix))*AIMAG(gradwfc(ir,iy))
!
end do
!
end do
!
end do
!$acc parallel loop collapse(2)
DO ir = 1, dffts%nnr
DO ipol = 1, 6
!
! ... explanation here: https://stackoverflow.com/a/244550
!
! M*(M+1)/ 2
K = (3.0*4.0)/2.0 - 1 - (ipol - 1)
K = FLOOR((SQRT(FLOAT(8*K+1))-1)/2)
ix = (ipol-1) - (3.0*4.0)/2.0 + (K+1)*(K+2)/2.0 + 1 + (2-K)
iy = 3 - K
!
crosstaus(ir,ipol,current_spin) = crosstaus(ir,ipol,current_spin) + &
2.0_DP*w1*DBLE(gradwfc(ir,ix))*DBLE(gradwfc(ir,iy)) + &
2.0_DP*w2*AIMAG(gradwfc(ir,ix))*AIMAG(gradwfc(ir,iy))
ENDDO
ENDDO
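!
! ... the integer gymnastics above decode the packed index ipol = 1..6
! ... into the symmetric pairs (ix,iy) = (1,1),(2,1),(3,1),(2,2),(3,2),(3,3),
! ... i.e. the lower triangle of the gradient outer product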
!
end do
ENDDO !ibnd
!
END DO k_loop
ENDDO k_loop
!
call mp_sum( crosstaus, inter_pool_comm )
!$acc end data
DEALLOCATE( gradwfc )
!
! gradwfc not used anymore
!$acc host_data use_device(crosstaus)
CALL mp_sum( crosstaus, inter_pool_comm )
!$acc end host_data
!
deallocate (gradwfc)
ALLOCATE( vkin(dffts%nnr) )
ALLOCATE( rhokin(dffts%nnr) )
!$acc data create(vkin,rhokin)
!
sigma_mgga(:,:) = 0.D0
! ... metagga contribution to the stress tensor
sigma_mgga(:,:) = 0._DP
!
! metagga contribution to the stress tensor
sigma1 = 0.d0 ; sigma4 = 0.d0
sigma2 = 0.d0 ; sigma5 = 0.d0
sigma3 = 0.d0 ; sigma6 = 0.d0
!
do iss=1,nspin
DO iss = 1, nspin
!
do ix=1,3
!
do iy=1,3
!
delta=0.
if (ix==iy) delta=1.
!
do ir=1,dffts%nnr
!
sigma_mgga(ix,iy) = sigma_mgga(ix,iy) + v%kin_r(ir,iss) &
* ( rho%kin_r(ir,iss) * delta &
+ crosstaus(ir,ipol2xy(ix,iy),iss) )
!
end do
!
!
end do
!
end do
vkin = v%kin_r(:,iss)
rhokin = rho%kin_r(:,iss)
!$acc update device(vkin,rhokin)
!
end do
deallocate( crosstaus )
!$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6)
DO ir = 1, dffts%nnr
!
sigma1 = sigma1 + vkin(ir) * ( rhokin(ir) &
+ DBLE(crosstaus(ir,1,iss)) )
sigma2 = sigma2 + vkin(ir) * DBLE(crosstaus(ir,2,iss))
sigma3 = sigma3 + vkin(ir) * DBLE(crosstaus(ir,3,iss))
sigma4 = sigma4 + vkin(ir) * ( rhokin(ir) &
+ DBLE(crosstaus(ir,4,iss)) )
sigma5 = sigma5 + vkin(ir) * DBLE(crosstaus(ir,5,iss))
sigma6 = sigma6 + vkin(ir) * ( rhokin(ir) &
+ DBLE(crosstaus(ir,6,iss)) )
!
ENDDO
!
ENDDO
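!
! ... sigma1..sigma6 are the packed components (1,1),(2,1),(3,1),(2,2),
! ... (3,2),(3,3); the diagonal ones carry the extra rho%kin_r term,
! ... i.e. the delta_lm*tau part of the meta-GGA stress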
!
sigma_mgga(1,1) = sigma1 ; sigma_mgga(2,3) = sigma5
sigma_mgga(1,2) = sigma2 ; sigma_mgga(3,1) = sigma3
sigma_mgga(1,3) = sigma3 ; sigma_mgga(3,2) = sigma5
sigma_mgga(2,1) = sigma2 ; sigma_mgga(3,3) = sigma6
sigma_mgga(2,2) = sigma4
!
!$acc end data
!$acc end data
DEALLOCATE( vkin, rhokin )
DEALLOCATE( crosstaus )
!
CALL mp_sum( sigma_mgga, intra_bgrp_comm )
!
call mp_sum( sigma_mgga, intra_bgrp_comm )
sigmaxc(:,:) = sigmaxc(:,:) + sigma_mgga(:,:) / &
(dffts%nr1 * dffts%nr2 * dffts%nr3)
!
return
!
RETURN
!
END SUBROUTINE stres_mgga
SUBROUTINE wfc_gradient ( ibnd, ik, npw, gradpsi )
!
!
!----------------------------------------------------------
SUBROUTINE wfc_gradient( ibnd, ik, npw, gradpsi )
!----------------------------------------------------------
!! Returns the gradient of the wavefunction in real space.
!! Slightly adapted from sum_bands.f90
!
! Returns the gradient of the wavefunction in real space
! Slightly adapted from sum_bands.f90
!
USE kinds, ONLY : DP
USE control_flags, ONLY : gamma_only
USE wavefunctions, ONLY : psic, evc
USE wvfct, ONLY : npwx, nbnd
USE cell_base, ONLY : omega, tpiba
USE klist, ONLY : xk, igk_k
USE gvect, ONLY : g
USE fft_base, ONLY : dffts
USE fft_interfaces, ONLY : invfft
USE wavefunctions_gpum, ONLY : using_evc
USE kinds, ONLY: DP
USE control_flags, ONLY: gamma_only
USE wvfct, ONLY: npwx, nbnd
USE cell_base, ONLY: omega, tpiba
USE klist, ONLY: xk, igk_k
USE wavefunctions, ONLY: psic, evc
USE fft_base, ONLY: dffts
USE fft_interfaces, ONLY: invfft
USE gvect, ONLY: ngm, g
!
IMPLICIT NONE
!
INTEGER :: ibnd, ik, npw
COMPLEX(DP) :: gradpsi(dffts%nnr,3)
INTEGER, INTENT(IN) :: ibnd, ik, npw
COMPLEX(DP), INTENT(OUT) :: gradpsi(dffts%nnr,3)
!
! Internal variables
! ... local variables
!
REAL(DP) :: kplusg(npwx)
INTEGER :: ipol
INTEGER :: ipol, j
REAL(DP) :: kplusg
INTEGER, ALLOCATABLE :: nld(:), nlmd(:)
REAL(DP) :: xki(3)
!
CALL using_evc(0)
!$acc data present(gradpsi) copyin(evc) create(psic)
!
! Compute the gradient of the wavefunction in reciprocal space
ALLOCATE( nld(npw) )
nld = dffts%nl
!
xki(1:3) = xk(1:3,ik)
!
! ... Compute the gradient of the wavefunction in reciprocal space
!
IF ( gamma_only ) THEN
!
DO ipol=1,3
ALLOCATE( nlmd(npw) )
nlmd = dffts%nlm
!$acc data copyin(xki,nld,nlmd)
!
DO ipol = 1, 3
!
psic(:) = ( 0.D0, 0.D0 )
!
kplusg (1:npw) = (xk(ipol,ik)+g(ipol,igk_k(1:npw,ik))) * tpiba
!$acc kernels
psic(:) = (0._DP,0._DP)
!$acc end kernels
!
IF ( ibnd < nbnd ) THEN
!
! ... two ffts at the same time
!
psic(dffts%nl(1:npw)) = CMPLX(0d0, kplusg(1:npw),kind=DP)* &
( evc(1:npw,ibnd) + &
( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) )
!
psic(dffts%nlm(1:npw)) = CMPLX(0d0,-kplusg(1:npw),kind=DP) * &
CONJG( evc(1:npw,ibnd) - &
( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) )
!$acc parallel loop
DO j = 1, npw
kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba
!
psic(nld(j)) = CMPLX(0._DP, kplusg, kind=DP) * &
( evc(j,ibnd) + (0._DP,1._DP) * evc(j,ibnd+1) )
!
psic(nlmd(j)) = CMPLX(0._DP,-kplusg, kind=DP) * &
CONJG( evc(j,ibnd) - (0._DP,1._DP) * evc(j,ibnd+1) )
ENDDO
!
ELSE
!
psic(dffts%nl(1:npw)) = CMPLX(0d0, kplusg(1:npw),kind=DP)* &
evc(1:npw,ibnd)
!$acc parallel loop
DO j = 1, npw
kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba
!
psic(nld(j)) = CMPLX(0._DP, kplusg, kind=DP) * evc(j,ibnd)
!
psic(nlmd(j)) = CMPLX(0._DP,-kplusg,kind=DP) * CONJG(evc(j,ibnd))
ENDDO
!
psic(dffts%nlm(1:npw)) = CMPLX(0d0,-kplusg(1:npw),kind=DP) * &
CONJG( evc(1:npw,ibnd) )
!
END IF
ENDIF
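!
! ... gamma-only trick: two bands are packed into one complex array,
! ... psic(G) = i*(k+G)*( evc(j,ibnd) + i*evc(j,ibnd+1) ) and psic(-G)
! ... set to the matching conjugate, so a single inverse FFT returns
! ... grad(psi_ibnd) in DBLE(psic) and grad(psi_ibnd+1) in AIMAG(psic)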
!
! Gradient of the wavefunction in real space
! ... Gradient of the wavefunction in real space
!
CALL invfft ('Wave', psic, dffts)
!$acc host_data use_device(psic)
CALL invfft( 'Wave', psic, dffts )
!$acc end host_data
!
!$acc kernels
gradpsi(:,ipol) = psic
!$acc end kernels
!
END DO
ENDDO
!$acc end data
DEALLOCATE( nlmd )
!
ELSE
!
DO ipol=1,3
!
psic(:) = ( 0.D0, 0.D0 )
!
kplusg (1:npw) = (xk(ipol,ik)+g(ipol,igk_k(1:npw,ik))) * tpiba
psic(dffts%nl(igk_k(1:npw,ik))) = CMPLX(0d0, kplusg(1:npw),kind=DP)* &
evc(1:npw,ibnd)
!
! Gradient of the wavefunction in real space
!
CALL invfft ('Wave', psic, dffts)
!
gradpsi(:,ipol) = psic
!
END DO
!
END IF
!$acc data copyin(xki,nld)
DO ipol = 1, 3
!
!$acc kernels
psic(:) = (0._DP,0._DP)
!$acc end kernels
!
!$acc parallel loop
DO j = 1, npw
kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba
!
psic(nld(j)) = CMPLX(0._DP,kplusg,kind=DP)*evc(j,ibnd)
ENDDO
!
! ... Gradient of the wavefunction in real space
!
!$acc host_data use_device(psic)
CALL invfft( 'Wave', psic, dffts )
!$acc end host_data
!
!$acc kernels
gradpsi(:,ipol) = psic
!$acc end kernels
!
ENDDO
!$acc end data
ENDIF
!
DEALLOCATE( nld )
!
!$acc end data
!
RETURN
!
END SUBROUTINE wfc_gradient

View File

@ -1,335 +0,0 @@
!
! Copyright (C) 2017 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------------
SUBROUTINE stres_mgga_gpu( sigmaxc )
!----------------------------------------------------------------------------
!! Analytic stress tensor contribution from metagga is added to sigmaxc
!
USE kinds, ONLY : DP
USE control_flags, ONLY : gamma_only
USE noncollin_module, ONLY : noncolin
USE cell_base, ONLY : alat, at, bg, omega, tpiba
USE gvect, ONLY : g
USE scf, ONLY : rho, v
USE wavefunctions, ONLY : evc
USE xc_lib, ONLY : xclib_dft_is
USE klist, ONLY : nks, xk, ngk
USE buffers, ONLY : get_buffer
USE io_files, ONLY : iunwfc, nwordwfc
USE wvfct, ONLY : nbnd, npwx, wg
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
USE fft_interfaces, ONLY : fwfft, invfft
USE fft_base, ONLY : dfftp, dffts
USE mp, ONLY : mp_sum
USE mp_pools, ONLY : inter_pool_comm
USE mp_bands, ONLY : intra_bgrp_comm
USE wavefunctions_gpum, ONLY : using_evc
!
#if defined(__CUDA)
USE device_fbuff_m , ONLY : dev_buf
USE device_memcpy_m, ONLY : dev_memcpy, dev_memset
#endif
!
IMPLICIT NONE
!
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
!
! Internal variables
!
INTEGER :: ix, iy, K, ir, ipol, iss, incr, ibnd, ik, npw
!
INTEGER :: ierrs(4)
!
REAL(DP), PARAMETER :: epsr = 1.E-6_DP, epsg = 1.E-10_DP, e2 = 2._DP
!
COMPLEX(DP), POINTER :: gradwfc_d(:,:), crosstaus_d(:,:,:)
INTEGER :: ix_d(6), iy_d(6)
REAL(DP), POINTER :: vkin_d(:), rhokin_d(:)
REAL(DP) :: sigma1_d, sigma2_d, sigma3_d, &
sigma4_d, sigma5_d, sigma6_d
!
REAL(DP) :: w1, w2, delta, sigma_mgga(3,3)
!
#if defined(__CUDA)
attributes(DEVICE) :: gradwfc_d, crosstaus_d, vkin_d, rhokin_d, ix_d, iy_d
!
if ( .not. xclib_dft_is('meta') ) return
!
current_spin = 1
!
! Stop if something is not yet implemented
!
IF ( noncolin ) CALL errore( 'stres_mgga', &
'noncollinear stress + meta-GGA not implemented', 1 )
!
! Initialization of a set of variables
!
CALL dev_buf%lock_buffer( gradwfc_d, (/ dffts%nnr, 3 /), ierrs(1) )
CALL dev_buf%lock_buffer( crosstaus_d, (/ dffts%nnr, 6, nspin /), ierrs(2) )
IF (ANY(ierrs(1:2) /= 0)) CALL errore( 'stres_mgga_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs(1:2))) )
!
CALL dev_memset(crosstaus_d , (0._DP,0._DP) )
!
! For gamma_only efficiency
!
incr = 1
IF ( gamma_only ) incr = 2
!
!Polarization indexes
! ix_d(1) = 1 ; iy_d(1) = 1
! ix_d(2) = 2 ; iy_d(2) = 1
! ix_d(3) = 3 ; iy_d(3) = 1
! ix_d(4) = 2 ; iy_d(4) = 2
! ix_d(5) = 3 ; iy_d(5) = 2
! ix_d(6) = 3 ; iy_d(6) = 3
!
! Loop over the k points
!
k_loop: DO ik = 1, nks
!
IF ( lsda ) current_spin = isk(ik)
!
npw = ngk(ik)
!
! Read the wavefunctions
!
IF ( nks > 1 ) THEN
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
CALL using_evc(2)
ENDIF
!
DO ibnd = 1, nbnd, incr
!
! w1, w2: weights for each k point and band
!
w1 = wg(ibnd,ik) / omega
!
IF ( (ibnd < nbnd) .AND. (gamma_only) ) THEN
!
! ... two ffts at the same time
!
w2 = wg(ibnd+1,ik) / omega
!
ELSE
!
w2 = w1
!
ENDIF
!
! Gradient of the wavefunction in real space
!
CALL wfc_gradient_gpu( ibnd, ik, npw, gradwfc_d )
!
! Cross terms of kinetic energy density
!
! FIX ME: PGI complains here if I set do(2)
!$cuf kernel do (1) <<<*,*>>>
DO ir = 1, dffts%nnr
DO ipol = 1, 6
!
! explanation here: https://stackoverflow.com/a/244550
!
! M*(M+1)/ 2
K = (3.0*4.0)/2.0 - 1 - (ipol - 1)
K = floor((SQRT(FLOAT(8*K+1))-1)/2)
ix = (ipol-1) - (3.0*4.0)/2.0 + (K+1)*(K+2)/2.0 + 1 + (2-K)
iy = 3 - K
!
crosstaus_d(ir,ipol,current_spin) = crosstaus_d(ir,ipol,current_spin) + &
2.0_DP*w1*DBLE(gradwfc_d(ir,ix))*DBLE(gradwfc_d(ir,iy)) + &
2.0_DP*w2*AIMAG(gradwfc_d(ir,ix))*AIMAG(gradwfc_d(ir,iy))
ENDDO
ENDDO
!
!crosstaus(:,:,current_spin) = crosstaus(:,:,current_spin) + &
! crosstaus_d(:,:,current_spin)
!
ENDDO !ibnd
!
ENDDO k_loop
!
!
CALL dev_buf%release_buffer( gradwfc_d, ierrs(1) )
!
CALL mp_sum( crosstaus_d, inter_pool_comm )
!
CALL dev_buf%lock_buffer( vkin_d, dffts%nnr, ierrs(3) )
CALL dev_buf%lock_buffer( rhokin_d, dffts%nnr, ierrs(4) )
IF (ANY(ierrs(3:4) /= 0)) CALL errore( 'stres_mgga_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs(3:4))) )
!
! metagga contribution to the stress tensor
sigma_mgga(:,:) = 0._DP
!
sigma1_d = 0.d0 ; sigma4_d = 0.d0
sigma2_d = 0.d0 ; sigma5_d = 0.d0
sigma3_d = 0.d0 ; sigma6_d = 0.d0
!
!
DO iss = 1, nspin
!
CALL dev_memcpy( vkin_d, v%kin_r(:,iss) )
CALL dev_memcpy( rhokin_d, rho%kin_r(:,iss) )
!
!$cuf kernel do (1) <<<*,*>>>
DO ir = 1, dffts%nnr
!
sigma1_d = sigma1_d + vkin_d(ir) * (rhokin_d(ir) &
+ DBLE(crosstaus_d(ir,1,iss)) )
sigma2_d = sigma2_d + vkin_d(ir) * DBLE(crosstaus_d(ir,2,iss))
sigma3_d = sigma3_d + vkin_d(ir) * DBLE(crosstaus_d(ir,3,iss))
sigma4_d = sigma4_d + vkin_d(ir) * (rhokin_d(ir) &
+ DBLE(crosstaus_d(ir,4,iss)) )
sigma5_d = sigma5_d + vkin_d(ir) * DBLE(crosstaus_d(ir,5,iss))
sigma6_d = sigma6_d + vkin_d(ir) * (rhokin_d(ir) &
+ DBLE(crosstaus_d(ir,6,iss)) )
!
ENDDO
!
ENDDO
!
sigma_mgga(1,1) = sigma1_d ; sigma_mgga(2,3) = sigma5_d
sigma_mgga(1,2) = sigma2_d ; sigma_mgga(3,1) = sigma3_d
sigma_mgga(1,3) = sigma3_d ; sigma_mgga(3,2) = sigma5_d
sigma_mgga(2,1) = sigma2_d ; sigma_mgga(3,3) = sigma6_d
sigma_mgga(2,2) = sigma4_d
!
CALL dev_buf%release_buffer( vkin_d, ierrs(3) )
CALL dev_buf%release_buffer( rhokin_d, ierrs(4) )
CALL dev_buf%release_buffer( crosstaus_d, ierrs(2) )
!
CALL mp_sum( sigma_mgga, intra_bgrp_comm )
!
sigmaxc(:,:) = sigmaxc(:,:) + sigma_mgga(:,:) / &
(dffts%nr1 * dffts%nr2 * dffts%nr3)
!
#endif
RETURN
!
END SUBROUTINE stres_mgga_gpu
!
!
!----------------------------------------------------------
SUBROUTINE wfc_gradient_gpu( ibnd, ik, npw, gradpsi_d )
!----------------------------------------------------------
!! Returns the gradient of the wavefunction in real space.
!! Slightly adapted from sum_bands.f90
!
USE kinds, ONLY: DP
USE control_flags, ONLY: gamma_only
USE wvfct, ONLY: npwx, nbnd
USE cell_base, ONLY: omega, tpiba
USE klist, ONLY: xk, igk_k_d
USE fft_base, ONLY: dffts
USE fft_interfaces, ONLY: invfft
!
USE gvect, ONLY: g_d
USE wavefunctions_gpum, ONLY: using_evc, using_evc_d, evc_d, &
using_psic, using_psic_d, psic_d
#if defined(__CUDA)
USE device_fbuff_m, ONLY: dev_buf
USE device_memcpy_m, ONLY: dev_memcpy
#endif
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd, ik, npw
COMPLEX(DP), INTENT(OUT) :: gradpsi_d(dffts%nnr,3)
!
! ... local variables
!
INTEGER :: ipol, j
REAL(DP) :: kplusg
INTEGER, POINTER :: nl_d(:), nlm_d(:)
REAL(DP) :: xk_d(3)
!
#if defined(__CUDA)
attributes(DEVICE) :: gradpsi_d, nl_d, nlm_d, xk_d
!
CALL using_evc_d(0)
CALL using_psic_d(2)
!
nl_d => dffts%nl_d
nlm_d => dffts%nlm_d
!
xk_d(1:3) = xk(1:3,ik)
!
! Compute the gradient of the wavefunction in reciprocal space
!
IF ( gamma_only ) THEN
!
DO ipol = 1, 3
!
psic_d(:) = (0._DP,0._DP)
!
IF ( ibnd < nbnd ) THEN
!
! ... two ffts at the same time
!
!$cuf kernel do (1) <<<*,*>>>
DO j = 1, npw
kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba
!
psic_d(nl_d(j)) = CMPLX(0._DP, kplusg, kind=DP) * &
( evc_d(j,ibnd) + &
( 0._DP, 1._DP ) * evc_d(j,ibnd+1) )
!
psic_d(nlm_d(j)) = CMPLX(0._DP,-kplusg, kind=DP) * &
CONJG( evc_d(j,ibnd) - &
( 0._DP, 1._DP ) * evc_d(j,ibnd+1) )
ENDDO
!
ELSE
!
!$cuf kernel do (1) <<<*,*>>>
DO j = 1, npw
kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba
!
psic_d(nl_d(j)) = CMPLX(0._DP, kplusg, kind=DP) * &
evc_d(j,ibnd)
!
psic_d(nlm_d(j)) = CMPLX(0._DP,-kplusg,kind=DP) * &
CONJG( evc_d(j,ibnd) )
ENDDO
!
ENDIF
!
! Gradient of the wavefunction in real space
!
CALL invfft( 'Wave', psic_d, dffts )
!
CALL dev_memcpy( gradpsi_d(:,ipol), psic_d )
!
ENDDO
!
ELSE
!
DO ipol = 1, 3
!
psic_d(:) = (0._DP,0._DP)
!
!$cuf kernel do(1) <<<*,*>>>
DO j = 1, npw
kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba
!
psic_d(nl_d(j)) = CMPLX(0._DP,kplusg,kind=DP)*evc_d(j,ibnd)
ENDDO
!
! Gradient of the wavefunction in real space
!
CALL invfft( 'Wave', psic_d, dffts )
!
CALL dev_memcpy( gradpsi_d(:,ipol), psic_d )
!
ENDDO
!
ENDIF
#endif
!
END SUBROUTINE wfc_gradient_gpu

File diff suppressed because it is too large

View File

@ -1,865 +0,0 @@
!
! Copyright (C) 2001-2021 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------------
SUBROUTINE stres_us_gpu( ik, gk_d, sigmanlc )
!----------------------------------------------------------------------------
!! nonlocal (separable pseudopotential) contribution to the stress
!! NOTICE: sum of partial results over procs is performed in calling routine
!
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE constants, ONLY : eps8
USE klist, ONLY : nks, xk, ngk, igk_k
USE lsda_mod, ONLY : current_spin, lsda, isk
USE wvfct, ONLY : npwx, nbnd, wg, et
USE control_flags, ONLY : gamma_only
USE uspp_param, ONLY : upf, lmaxkb, nh, nhm
USE uspp, ONLY : nkb, vkb, deeq_d
USE lsda_mod, ONLY : nspin
USE noncollin_module, ONLY : noncolin, npol, lspinorb
USE mp_pools, ONLY : me_pool, root_pool
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, root_bgrp
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, &
bec_type, becp, calbec
USE mp, ONLY : mp_sum, mp_get_comm_null, &
mp_circular_shift_left
USE wavefunctions_gpum, ONLY : using_evc, using_evc_d, evc_d
USE wvfct_gpum, ONLY : using_et
USE becmod_gpum, ONLY : becp_d, bec_type_d
USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, &
calbec_gpu
USE uspp_init, ONLY : init_us_2, gen_us_dj_gpu, gen_us_dy_gpu
#if defined(__CUDA)
USE device_fbuff_m, ONLY : dev_buf
#endif
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ik
!! k-point index
  REAL(DP), INTENT(IN) :: gk_d(npwx,3)
  !! k+G vector components for the given k-point
REAL(DP), INTENT(INOUT) :: sigmanlc(3,3)
!! stress tensor, non-local contribution
!
! ... local variables
!
REAL(DP), POINTER :: qm1_d(:)
REAL(DP) :: q
  INTEGER :: npw, iu, np, ierr
!
INTEGER :: na1, np1, nh_np1, ijkb01, itot
LOGICAL :: ismulti_np
INTEGER, ALLOCATABLE :: shift(:)
INTEGER, ALLOCATABLE :: ix_d(:,:), ityp_d(:), nh_d(:), shift_d(:)
LOGICAL, ALLOCATABLE :: is_multinp_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: gk_d, qm1_d, is_multinp_d, ix_d, shift_d, &
ityp_d, nh_d
!
CALL using_evc_d(0)
CALL using_evc(0)
!
IF ( nkb == 0 ) RETURN
!
IF ( lsda ) current_spin = isk(ik)
npw = ngk(ik)
IF ( nks > 1 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. )
!
CALL allocate_bec_type( nkb, nbnd, becp, intra_bgrp_comm )
CALL using_becp_auto(2)
!
CALL using_evc_d(0)
CALL using_becp_d_auto(2)
!
!$acc data present(vkb(:,:))
!$acc host_data use_device(vkb)
CALL calbec_gpu( npw, vkb, evc_d, becp_d )
!$acc end host_data
!$acc end data
!
CALL dev_buf%lock_buffer( qm1_d, npwx, ierr )
!$cuf kernel do (1) <<<*,*>>>
DO iu = 1, npw
q = SQRT( gk_d(iu,1)**2 + gk_d(iu,2)**2 + gk_d(iu,3)**2 )
IF ( q > eps8 ) THEN
qm1_d(iu) = 1._DP / q
ELSE
qm1_d(iu) = 0._DP
ENDIF
ENDDO
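  !
  ! qm1_d holds 1/|k+G| with the |k+G| = 0 term zeroed, so the kernels
  ! below can multiply by it without guarding against division by zero.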
!
!----------define index arrays (type, atom, etc.) for cuf kernel loops------
ALLOCATE( is_multinp_d(nat*nhm) )
ALLOCATE( ix_d(nat*nhm,4), ityp_d(nat), nh_d(ntyp) )
ityp_d = ityp ; nh_d = nh
!
ALLOCATE( shift(nat) )
ijkb01 = 0
DO iu = 1, ntyp
DO na1 = 1, nat
IF (ityp(na1) == iu ) THEN
shift(na1) = ijkb01
ijkb01 = ijkb01 + nh(iu)
ENDIF
ENDDO
ENDDO
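  !
  ! shift(na) is the offset of atom na's beta projectors in the global
  ! (nkb-long) projector list, assuming atoms are ordered by type as in
  ! the loop above.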
!
ALLOCATE( shift_d(nat) )
shift_d = shift
!
ijkb01 = 0
itot=0
DO np = 1, ntyp
DO na1 = 1, nat
np1 = ityp(na1)
IF (np /= np1) CYCLE
ijkb01 = shift(na1)
nh_np1 = nh(np1)
ismulti_np = upf(np1)%tvanp .OR. upf(np1)%is_multiproj
IF ( .NOT. ismulti_np .AND. noncolin .AND. lspinorb ) &
CALL errore('stres_us','wrong case',1)
!$cuf kernel do (1) <<<*,*>>>
DO iu = itot+1, itot+nh_np1
ix_d(iu,1) = na1 !na
ix_d(iu,2) = iu-itot !ih
ix_d(iu,3) = nh_np1 !nh(np)
ix_d(iu,4) = ijkb01 !ishift
is_multinp_d(iu) = ismulti_np
ENDDO
itot = itot + nh_np1
ENDDO
ENDDO
!--------------
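  !
  ! ix_d flattens the (type, atom, projector) loops into a single index
  ! 1..itot so the cuf kernels can iterate over one dimension;
  ! is_multinp_d flags projectors that need the full off-diagonal D_ij sum.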
!
!
IF ( gamma_only ) THEN
!
CALL stres_us_gamma_gpu()
!
ELSE
!
CALL stres_us_k_gpu()
!
ENDIF
!
CALL dev_buf%release_buffer( qm1_d, ierr )
!
DEALLOCATE( is_multinp_d )
DEALLOCATE( ix_d, ityp_d, nh_d, shift_d )
DEALLOCATE( shift )
!
CALL deallocate_bec_type( becp )
CALL using_becp_auto(2)
!
RETURN
!
CONTAINS
!
!-----------------------------------------------------------------------
SUBROUTINE stres_us_gamma_gpu()
!-----------------------------------------------------------------------
!! nonlocal contribution to the stress - gamma version
!
IMPLICIT NONE
!
! ... local variables
!
INTEGER :: na, np, nt, ibnd, ipol, jpol, l, i, ikb, &
jkb, ih, jh, ibnd_loc,ijkb0,nh_np, nproc, &
nbnd_loc, nbnd_begin, icyc, ishift, nhmx
REAL(DP) :: dot11, dot21, dot31, dot22, dot32, dot33, &
qm1i, gk1, gk2, gk3, wg_nk, fac, evps, aux
COMPLEX(DP) :: worksum, cv, wsum1, wsum2, wsum3, ps, evci
!
COMPLEX(DP), POINTER :: ps_d(:), deff_d(:,:,:), dvkb_d(:,:,:)
REAL(DP), POINTER :: becpr_d(:,:)
!
REAL(DP) :: xyz(3,3)
INTEGER :: ierrs(3)
!
! xyz are the three unit vectors in the x,y,z directions
DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, 0._DP, &
1._DP /
#if defined(__CUDA)
attributes(DEVICE) :: deff_d, becpr_d, ps_d, dvkb_d
#endif
!
CALL using_becp_auto(0)
!
IF( becp%comm /= mp_get_comm_null() ) THEN
nproc = becp%nproc
nbnd_loc = becp%nbnd_loc
nbnd_begin = becp%ibnd_begin
IF( ( nbnd_begin + nbnd_loc - 1 ) > nbnd ) nbnd_loc = nbnd - nbnd_begin + 1
ELSE
nproc = 1
nbnd_loc = nbnd
nbnd_begin = 1
ENDIF
!
CALL dev_buf%lock_buffer( deff_d, (/ nhm,nhm,nat /), ierrs(1) )
!
    ! ... diagonal contribution - if the results from "calbec" are not
    ! ... distributed, they must be calculated on a single processor
!
    ! ... for the moment, when running on GPU, becp is always fully present on all processors
!
CALL using_et(0) ! compute_deff : intent(in)
!
CALL dev_buf%lock_buffer( ps_d, nkb, ierrs(2) )
IF (ANY(ierrs(1:2) /= 0)) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs)) )
!
becpr_d => becp_d%r_d
!
evps = 0._DP
!
compute_evps: IF ( .NOT. (nproc == 1 .AND. me_pool /= root_pool) ) THEN
!
DO ibnd_loc = 1, nbnd_loc
ibnd = ibnd_loc + becp%ibnd_begin - 1
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
wg_nk = wg(ibnd,ik)
!
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, itot
!
ih = ix_d(i,2) ; na = ix_d(i,1)
ishift = ix_d(i,4) ; ikb = ishift + ih
!
IF (.NOT. is_multinp_d(i)) THEN
aux = wg_nk * DBLE(deff_d(ih,ih,na)) * &
ABS(becpr_d(ikb,ibnd_loc))**2
ELSE
nh_np = ix_d(i,3)
!
aux = wg_nk * DBLE(deff_d(ih,ih,na)) &
* ABS(becpr_d(ikb,ibnd_loc))**2 &
+ becpr_d(ikb,ibnd_loc)* wg_nk * 2._DP &
* SUM( DBLE(deff_d(ih,ih+1:nh_np,na)) &
* becpr_d(ishift+ih+1:ishift+nh_np,ibnd_loc))
ENDIF
evps = evps + aux
!
ENDDO
!
ENDDO
!
ENDIF compute_evps
!
!
    ! ... non diagonal contribution - derivative of the Bessel functions
!------------------------------------
CALL dev_buf%lock_buffer( dvkb_d, (/ npwx,nkb,4 /), ierrs(3) )
IF (ierrs(3) /= 0) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(ierrs(3)))
!
CALL gen_us_dj_gpu( ik, dvkb_d(:,:,4) )
IF ( lmaxkb > 0 ) THEN
DO ipol = 1, 3
CALL gen_us_dy_gpu( ik, xyz(1,ipol), dvkb_d(:,:,ipol))
ENDDO
ENDIF
!
!
DO icyc = 0, nproc -1
!
DO ibnd_loc = 1, nbnd_loc
!
ibnd = ibnd_loc + becp%ibnd_begin - 1
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
!
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, itot
!
ih = ix_d(i,2) ; na = ix_d(i,1)
ishift = ix_d(i,4) ; ikb = ishift + ih
!
IF (.NOT. is_multinp_d(i)) THEN
ps_d(ikb) = deff_d(ih,ih,na) * CMPLX(becpr_d(ikb,ibnd_loc))
ELSE
nh_np = ix_d(i,3)
!
ps_d(ikb) = CMPLX( SUM( becpr_d(ishift+1:ishift+nh_np,ibnd_loc) &
* DBLE(deff_d(ih,1:nh_np,na))))
ENDIF
!
ENDDO
!
dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP
dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP
!
!$cuf kernel do(2) <<<*,*>>>
DO na =1, nat
DO i = 1, npw
worksum = (0._DP,0._DP)
np = ityp_d(na)
ijkb0 = shift_d(na)
nh_np = nh_d(np)
DO ih = 1, nh_np
ikb = ijkb0 + ih
worksum = worksum + ps_d(ikb) * dvkb_d(i,ikb,4)
ENDDO
evci = evc_d(i,ibnd)
gk1 = gk_d(i,1)
gk2 = gk_d(i,2)
gk3 = gk_d(i,3)
qm1i = qm1_d(i)
!
cv = evci * CMPLX(gk1 * gk1 * qm1i)
dot11 = dot11 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * CMPLX(gk2 * gk1 * qm1i )
dot21 = dot21 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * CMPLX(gk3 * gk1 * qm1i)
dot31 = dot31 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * CMPLX(gk2 * gk2 * qm1i)
dot22 = dot22 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * CMPLX(gk3 * gk2 * qm1i)
dot32 = dot32 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * CMPLX(gk3 * gk3 * qm1i)
dot33 = dot33 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
ENDDO
ENDDO
! ... a factor 2 accounts for the other half of the G-vector sphere
sigmanlc(:,1) = sigmanlc(:,1) - 4._DP * wg(ibnd,ik) * [dot11, dot21, dot31]
sigmanlc(:,2) = sigmanlc(:,2) - 4._DP * wg(ibnd,ik) * [0._DP, dot22, dot32]
sigmanlc(:,3) = sigmanlc(:,3) - 4._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33]
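          !
          ! (a sketch of the prefactor: 4 = 2 for the half of the G-vector
          !  sphere not stored in the gamma-only representation, times 2
          !  from differentiating the squared projection |<beta|psi>|^2;
          !  only the lower triangle is accumulated here and symmetrized
          !  by the caller.)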
!
! ... non diagonal contribution - derivative of the spherical harmonics
! ... (no contribution from l=0)
!
IF ( lmaxkb == 0 ) CYCLE
!
!------------------------------------
!
dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP
dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP
!
!$cuf kernel do(2) <<<*,*>>>
DO ikb = 1, nkb
DO i = 1, npw
!
wsum1 = ps_d(ikb)*dvkb_d(i,ikb,1)
wsum2 = ps_d(ikb)*dvkb_d(i,ikb,2)
wsum3 = ps_d(ikb)*dvkb_d(i,ikb,3)
!
evci = evc_d(i,ibnd)
gk1 = gk_d(i,1)
gk2 = gk_d(i,2)
gk3 = gk_d(i,3)
!
cv = evci * CMPLX(gk1 )
dot11 = dot11 + DBLE( wsum1)* DBLE(cv) + DIMAG(wsum1)*DIMAG(cv)
dot21 = dot21 + DBLE( wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv)
dot31 = dot31 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv)
!
cv = evci * CMPLX( gk2)
dot22 = dot22 + DBLE( wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv)
dot32 = dot32 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv)
!
cv = evci * CMPLX( gk3 )
dot33 = dot33 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv)
ENDDO
ENDDO
!
! ... a factor 2 accounts for the other half of the G-vector sphere
!
sigmanlc(:,1) = sigmanlc(:,1) -4._DP * wg(ibnd, ik) * [dot11, dot21, dot31]
sigmanlc(:,2) = sigmanlc(:,2) -4._DP * wg(ibnd, ik) * [0._DP, dot22, dot32]
sigmanlc(:,3) = sigmanlc(:,3) -4._DP * wg(ibnd, ik) * [0._DP, 0._DP, dot33]
IF ( nproc > 1 ) THEN
            CALL errore ('stres_us_gamma_gpu line 303', &
                 'unexpected error: nproc must be 1 with GPU acceleration', 100)
!CALL mp_circular_shift_left(becp%r, icyc, becp%comm)
!CALL mp_circular_shift_left(becp%ibnd_begin, icyc, becp%comm)
!CALL mp_circular_shift_left(nbnd_loc, icyc, becp%comm)
ENDIF
ENDDO
ENDDO
!
10 CONTINUE
!
DO l = 1, 3
sigmanlc(l,l) = sigmanlc(l,l) - evps
ENDDO
!
!
CALL dev_buf%release_buffer( deff_d, ierrs(1) )
CALL dev_buf%release_buffer( ps_d, ierrs(2) )
CALL dev_buf%release_buffer( dvkb_d, ierrs(3) )
!
!
RETURN
!
END SUBROUTINE stres_us_gamma_gpu
!
!
!----------------------------------------------------------------------
SUBROUTINE stres_us_k_gpu()
!----------------------------------------------------------------------
!! nonlocal contribution to the stress - k-points version
!
IMPLICIT NONE
!
! ... local variables
!
INTEGER :: na, np, ibnd, ipol, jpol, l, i, nt, &
ikb, jkb, ih, jh, is, js, ijs, ishift, nh_np
REAL(DP) :: fac, evps, dot11, dot21, dot31, dot22, dot32, dot33, aux
COMPLEX(DP) :: qm1i, gk1, gk2, gk3, ps, ps_nc(2)
COMPLEX(DP) :: cv, cv1, cv2, worksum, worksum1, worksum2, evci, evc1i, &
evc2i, ps1, ps2, ps1d1, ps1d2, ps1d3, ps2d1, ps2d2, &
ps2d3, psd1, psd2, psd3
!
COMPLEX(DP), POINTER :: dvkb_d(:,:,:)
COMPLEX(DP), POINTER :: deff_d(:,:,:), deff_nc_d(:,:,:,:)
COMPLEX(DP), POINTER :: ps_d(:), ps_nc_d(:,:)
COMPLEX(DP), POINTER :: becpk_d(:,:), becpnc_d(:,:,:)
INTEGER :: nhmx, ierrs(3)
!
REAL(DP) :: xyz(3,3)
! xyz are the three unit vectors in the x,y,z directions
DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, &
0._DP, 1._DP /
!
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: ps_d, ps_nc_d, becpnc_d, becpk_d, dvkb_d, &
deff_d, deff_nc_d
#endif
!
!
!
evps = 0._DP
! ... diagonal contribution
!
CALL dev_buf%lock_buffer( dvkb_d, (/ npwx,nkb,4 /), ierrs(1) )
!
CALL gen_us_dj_gpu( ik, dvkb_d(:,:,4) )
IF ( lmaxkb > 0 ) THEN
DO ipol = 1, 3
CALL gen_us_dy_gpu( ik, xyz(1,ipol), dvkb_d(:,:,ipol) )
ENDDO
ENDIF
!
IF (noncolin) THEN
CALL dev_buf%lock_buffer( ps_nc_d, (/ nkb,npol /), ierrs(2) )
CALL dev_buf%lock_buffer( deff_nc_d, (/ nhm,nhm,nat,nspin /), ierrs(3) )
becpnc_d => becp_d%nc_d
ELSE
CALL dev_buf%lock_buffer ( ps_d, nkb, ierrs(2) )
CALL dev_buf%lock_buffer( deff_d, (/ nhm,nhm,nat /), ierrs(3) )
becpk_d => becp_d%k_d
ENDIF
IF (ANY(ierrs /= 0)) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs)) )
!
CALL using_et(0)
!
! ... the contribution is calculated only on one processor because
! ... partial results are later summed over all processors
!
IF ( me_bgrp /= root_bgrp ) GO TO 100
!
!
DO ibnd = 1, nbnd
fac = wg(ibnd,ik)
IF (ABS(fac) < 1.d-9) CYCLE
IF (noncolin) THEN
CALL compute_deff_nc_gpu( deff_nc_d , et(ibnd,ik) )
ELSE
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
ENDIF
!
!
IF (noncolin) THEN
!
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, itot
!
ih = ix_d(i,2) ; na = ix_d(i,1)
ishift = ix_d(i,4) ; ikb = ishift + ih
aux = 0.d0
!
IF (.NOT. is_multinp_d(i)) THEN
!
ijs = 0
DO is = 1, npol
DO js = 1, npol
ijs = ijs + 1
aux = aux + fac * DBLE(deff_nc_d(ih,ih,na,ijs) &
* CONJG(becpnc_d(ikb,is,ibnd)) * &
becpnc_d(ikb,js,ibnd))
ENDDO
ENDDO
!
ELSE
!
nh_np = ix_d(i,3)
!
ijs = 0
DO is = 1, npol
DO js = 1, npol
ijs = ijs + 1
aux = aux + fac * DBLE(deff_nc_d(ih,ih,na,ijs) &
* CONJG(becpnc_d(ikb,is,ibnd))* &
becpnc_d(ikb,js,ibnd))
ENDDO
ENDDO
!
DO jh = ih+1, nh_np
jkb = ishift + jh
ijs = 0
DO is = 1, npol
DO js = 1, npol
ijs = ijs + 1
aux = aux + 2._DP*fac * &
DBLE( deff_nc_d(ih,jh,na,ijs) * &
(CONJG(becpnc_d(ikb,is,ibnd)) * &
becpnc_d(jkb,js,ibnd)) )
ENDDO
ENDDO
ENDDO
!
ENDIF
evps = evps + aux
ENDDO
!
ELSE
!
aux = 0.d0
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, itot
!
ih = ix_d(i,2) ; na = ix_d(i,1)
ishift = ix_d(i,4) ; ikb = ishift + ih
!
IF (.NOT. is_multinp_d(i)) THEN
!
aux = fac * deff_d(ih,ih,na) * &
ABS(becpk_d(ikb,ibnd) )**2
!
ELSE
!
nh_np = ix_d(i,3)
!
aux = fac * DBLE(deff_d(ih,ih,na) * &
ABS(becpk_d(ikb,ibnd) )**2) + &
DBLE(SUM( deff_d(ih,ih+1:nh_np,na) * &
fac * 2._DP*DBLE( CONJG(becpk_d(ikb,ibnd)) &
* becpk_d(ishift+ih+1:ishift+nh_np,ibnd) ) ))
!
ENDIF
evps = evps + aux
ENDDO
!
ENDIF
!
ENDDO
!
DO l = 1, 3
sigmanlc(l,l) = sigmanlc(l,l) - evps
ENDDO
!
100 CONTINUE
!
    ! ... non diagonal contribution - derivative of the Bessel functions
!
DO ibnd = 1, nbnd
!
IF ( noncolin ) THEN
!
CALL compute_deff_nc_gpu( deff_nc_d, et(ibnd,ik) )
!
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, itot
!
ih = ix_d(i,2) ; na = ix_d(i,1)
ishift = ix_d(i,4) ; ikb = ishift + ih
!
IF (.NOT. is_multinp_d(i)) THEN
!
DO is = 1, npol
ijs = (is-1)*npol
ps_nc_d(ikb,is) = SUM( becpnc_d(ikb,1:npol,ibnd) * &
deff_nc_d(ih,ih,na,ijs+1:ijs+npol) )
ENDDO
!
ELSE
!
nh_np = ix_d(i,3)
!
DO is = 1, npol
ijs = (is-1)*npol
ps_nc_d(ikb,is) = SUM( becpnc_d(ishift+1:ishift+nh_np,1:npol,ibnd) * &
deff_nc_d(ih,1:nh_np,na,ijs+1:ijs+npol) )
ENDDO
!
ENDIF
ENDDO
!
ELSE
!
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
!
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, itot
!
ih = ix_d(i,2) ; na = ix_d(i,1)
ishift = ix_d(i,4) ; ikb = ishift + ih
!
IF (.NOT. is_multinp_d(i)) THEN
ps_d(ikb) = CMPLX(deeq_d(ih,ih,na,current_spin)) * &
becpk_d(ikb,ibnd)
ELSE
nh_np = ix_d(i,3)
!
ps_d(ikb) = SUM( becpk_d(ishift+1:ishift+nh_np,ibnd) * &
deff_d(ih,1:nh_np,na) )
ENDIF
!
ENDDO
!
ENDIF
!
dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP
dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP
!
!
IF (noncolin) THEN
!$cuf kernel do(2) <<<*,*>>>
DO ikb =1, nkb
DO i = 1, npw
evc1i = evc_d(i, ibnd)
evc2i = evc_d(i+npwx,ibnd)
qm1i = CMPLX(qm1_d(i))
gk1 = CMPLX(gk_d(i,1))
gk2 = CMPLX(gk_d(i,2))
gk3 = CMPLX(gk_d(i,3))
worksum1 = ps_nc_d(ikb,1) * dvkb_d(i,ikb,4)
worksum2 = ps_nc_d(ikb,2) * dvkb_d(i,ikb,4)
!
cv1 = evc1i * gk1 * gk1 * qm1i
cv2 = evc2i * gk1 * gk1 * qm1i
dot11 = dot11 + DBLE(worksum1)*DBLE(cv1) + &
DIMAG(worksum1)*DIMAG(cv1) + &
DBLE(worksum2)*DBLE(cv2) + &
DIMAG(worksum2)*DIMAG(cv2)
!
cv1 = evc1i * gk2 * gk1 * qm1i
cv2 = evc2i * gk2 * gk1 * qm1i
dot21 = dot21 + DBLE(worksum1)*DBLE(cv1) + &
DIMAG(worksum1)*DIMAG(cv1) + &
DBLE(worksum2)*DBLE(cv2) + &
DIMAG(worksum2)*DIMAG(cv2)
!
cv1 = evc1i * gk3 * gk1 * qm1i
cv2 = evc2i * gk3 * gk1 * qm1i
dot31 = dot31 + DBLE(worksum1)*DBLE(cv1) + &
DIMAG(worksum1)*DIMAG(cv1) + &
DBLE(worksum2)*DBLE(cv2) + &
DIMAG(worksum2)*DIMAG(cv2)
!
cv1 = evc1i * gk2 * gk2 * qm1i
cv2 = evc2i * gk2 * gk2 * qm1i
dot22 = dot22 + DBLE(worksum1)*DBLE(cv1) + &
DIMAG(worksum1)*DIMAG(cv1) + &
DBLE(worksum2)*DBLE(cv2) + &
DIMAG(worksum2)*DIMAG(cv2)
!
cv1 = evc1i * gk3 * gk2 * qm1i
cv2 = evc2i * gk3 * gk2 * qm1i
dot32 = dot32 + DBLE(worksum1)*DBLE(cv1) + &
DIMAG(worksum1)*DIMAG(cv1) + &
DBLE(worksum2)*DBLE(cv2) + &
DIMAG(worksum2)*DIMAG(cv2)
!
cv1 = evc1i * gk3 * gk3 * qm1i
cv2 = evc2i * gk3 * gk3 * qm1i
dot33 = dot33 + DBLE(worksum1)*DBLE(cv1) + &
DIMAG(worksum1)*DIMAG(cv1) + &
DBLE(worksum2)*DBLE(cv2) + &
DIMAG(worksum2)*DIMAG(cv2)
ENDDO
ENDDO
!
ELSE
!
!
!$cuf kernel do(2) <<<*,*>>>
DO ikb = 1, nkb
DO i = 1, npw
!
worksum = ps_d(ikb) *dvkb_d(i,ikb,4)
!
evci = evc_d(i,ibnd)
qm1i = CMPLX(qm1_d(i))
gk1 = CMPLX(gk_d(i,1))
gk2 = CMPLX(gk_d(i,2))
gk3 = CMPLX(gk_d(i,3))
!
cv = evci * gk1 * gk1 * qm1i
dot11 = dot11 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * gk2 * gk1 * qm1i
dot21 = dot21 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * gk3 * gk1 * qm1i
dot31 = dot31 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * gk2 * gk2 * qm1i
dot22 = dot22 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * gk3 * gk2 * qm1i
dot32 = dot32 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
cv = evci * gk3 * gk3 * qm1i
dot33 = dot33 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
!
ENDDO
ENDDO
!
!IF ( me_bgrp /= root_bgrp ) stop
!
ENDIF
!
sigmanlc(:,1) = sigmanlc(:,1) - 2._DP * wg(ibnd,ik) * [dot11, dot21, dot31]
sigmanlc(:,2) = sigmanlc(:,2) - 2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32]
sigmanlc(:,3) = sigmanlc(:,3) - 2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33]
!
! ... non diagonal contribution - derivative of the spherical harmonics
! ... (no contribution from l=0)
!
IF ( lmaxkb == 0 ) CYCLE
!
dot11 = 0._DP ; dot21 = 0._DP
dot31 = 0._DP ; dot22 = 0._DP
dot32 = 0._DP ; dot33 = 0._DP
!
IF (noncolin) THEN
!
!$cuf kernel do(2) <<<*,*>>>
DO ikb =1, nkb
DO i = 1, npw
!
gk1 = CMPLX(gk_d(i,1))
gk2 = CMPLX(gk_d(i,2))
gk3 = CMPLX(gk_d(i,3))
!
ps1 = ps_nc_d(ikb,1)
ps2 = ps_nc_d(ikb,2)
!
ps1d1 = ps1 * dvkb_d(i,ikb,1)
ps1d2 = ps1 * dvkb_d(i,ikb,2)
ps1d3 = ps1 * dvkb_d(i,ikb,3)
!
ps2d1 = ps2 * dvkb_d(i,ikb,1)
ps2d2 = ps2 * dvkb_d(i,ikb,2)
ps2d3 = ps2 * dvkb_d(i,ikb,3)
!
evc1i = evc_d(i,ibnd)
evc2i = evc_d(i+npwx,ibnd)
!
cv1 = evc1i * gk1
cv2 = evc2i * gk1
dot11 = dot11 + DBLE(ps1d1)*DBLE(cv1) + &
DIMAG(ps1d1)*DIMAG(cv1) + &
DBLE(ps2d1)*DBLE(cv2) + &
DIMAG(ps2d1)*DIMAG(cv2)
!
dot21 = dot21 + DBLE(ps1d2)*DBLE(cv1) + &
DIMAG(ps1d2)*DIMAG(cv1) + &
DBLE(ps2d2)*DBLE(cv2) + &
DIMAG(ps2d2)*DIMAG(cv2)
!
dot31 = dot31 + DBLE(ps1d3)*DBLE(cv1) + &
DIMAG(ps1d3)*DIMAG(cv1) + &
DBLE(ps2d3)*DBLE(cv2) + &
DIMAG(ps2d3)*DIMAG(cv2)
!
cv1 = evc1i * gk2
cv2 = evc2i * gk2
dot22 = dot22 + DBLE(ps1d2)*DBLE(cv1) + &
DIMAG(ps1d2)*DIMAG(cv1) + &
DBLE(ps2d2)*DBLE(cv2) + &
DIMAG(ps2d2)*DIMAG(cv2)
!
dot32 = dot32 + DBLE(ps1d3)*DBLE(cv1) + &
DIMAG(ps1d3)*DIMAG(cv1) + &
DBLE(ps2d3)*DBLE(cv2) + &
DIMAG(ps2d3)*DIMAG(cv2)
!
cv1 = evc1i * gk3
cv2 = evc2i * gk3
dot33 = dot33 + DBLE(ps1d3)*DBLE(cv1) + &
DIMAG(ps1d3)*DIMAG(cv1) + &
DBLE(ps2d3)*DBLE(cv2) + &
DIMAG(ps2d3)*DIMAG(cv2)
!
ENDDO
ENDDO
!
ELSE
!
!$cuf kernel do(2) <<<*,*>>>
DO ikb = 1, nkb
DO i = 1, npw
ps = ps_d(ikb)
psd1 = ps*dvkb_d(i,ikb,1)
psd2 = ps*dvkb_d(i,ikb,2)
psd3 = ps*dvkb_d(i,ikb,3)
evci = evc_d(i,ibnd)
gk1 = CMPLX(gk_d(i,1))
gk2 = CMPLX(gk_d(i,2))
gk3 = CMPLX(gk_d(i,3))
!
cv = evci * gk1
dot11 = dot11 + DBLE(psd1)*DBLE(cv) + DIMAG(psd1)*DIMAG(cv)
dot21 = dot21 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv)
dot31 = dot31 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv)
!
cv = evci * gk2
dot22 = dot22 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv)
dot32 = dot32 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv)
!
cv = evci * gk3
dot33 = dot33 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv)
ENDDO
ENDDO
!
ENDIF
!
sigmanlc(:,1) = sigmanlc(:,1) -2._DP * wg(ibnd,ik) * [dot11, dot21, dot31]
sigmanlc(:,2) = sigmanlc(:,2) -2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32]
sigmanlc(:,3) = sigmanlc(:,3) -2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33]
!
ENDDO
!
10 CONTINUE
!
CALL dev_buf%release_buffer( dvkb_d, ierrs(1) )
!
IF (noncolin) THEN
CALL dev_buf%release_buffer( ps_nc_d, ierrs(2) )
CALL dev_buf%release_buffer( deff_nc_d, ierrs(3) )
ELSE
CALL dev_buf%release_buffer( ps_d, ierrs(2) )
CALL dev_buf%release_buffer( deff_d, ierrs(3) )
ENDIF
!
RETURN
!
END SUBROUTINE stres_us_k_gpu
#endif
!
END SUBROUTINE stres_us_gpu

View File

@ -17,12 +17,13 @@ SUBROUTINE stress( sigma )
USE ions_base, ONLY : nat, ntyp => nsp, ityp, tau, zv, atm
USE constants, ONLY : ry_kbar
USE ener, ONLY : etxc, vtxc
USE gvect, ONLY : ngm, gstart, g, gg, gcutm
USE gvect, ONLY : ngm, gstart, g, gg, gcutm, gl, gl_d
USE fft_base, ONLY : dfftp
USE ldaU, ONLY : lda_plus_u, Hubbard_projectors
USE lsda_mod, ONLY : nspin
USE scf, ONLY : rho, rho_core, rhog_core
USE control_flags, ONLY : iverbosity, gamma_only, llondon, ldftd3, lxdm, ts_vdw, mbd_vdw, use_gpu
USE control_flags, ONLY : iverbosity, gamma_only, llondon, ldftd3, lxdm, &
ts_vdw, mbd_vdw
USE xc_lib, ONLY : xclib_dft_is
USE symme, ONLY : symmatrix
USE bp, ONLY : lelfield
@ -36,8 +37,7 @@ SUBROUTINE stress( sigma )
USE libmbd_interface, ONLY : HmbdvdW
USE rism_module, ONLY : lrism, stres_rism
USE esm, ONLY : do_comp_esm, esm_bc ! for ESM stress
USE esm, ONLY : esm_stres_har, esm_stres_ewa, esm_stres_loclong ! for ESM stress
USE gvect, ONLY : g_d, gg_d
USE esm, ONLY : esm_stres_har, esm_stres_ewa, esm_stres_loclong
!
IMPLICIT NONE
!
@ -55,7 +55,7 @@ SUBROUTINE stress( sigma )
REAL(DP) :: sigmasol(3,3) ! for RISM stress
INTEGER :: l, m
!
! Auxiliary variables for Grimme-D3
! ... Auxiliary variables for Grimme-D3
!
INTEGER :: atnum(1:nat)
REAL(DP) :: latvecs(3,3)
@ -70,10 +70,14 @@ SUBROUTINE stress( sigma )
!
CALL start_clock( 'stress' )
!
! contribution from local potential
!$acc update device( g, gg )
#if defined(__CUDA)
gl_d = gl
#endif
!
IF (.NOT. use_gpu) CALL stres_loc( sigmaloc )
IF ( use_gpu) CALL stres_loc_gpu( sigmaloc )
! ... contribution from local potential
!
CALL stres_loc( sigmaloc )
!
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN
! In ESM, sigmaloc has only short-range term: add long-range term
@ -81,51 +85,50 @@ SUBROUTINE stress( sigma )
sigmaloc(:,:) = sigmaloc(:,:) + sigmaloclong(:,:)
END IF
!
! hartree contribution
! ... Hartree contribution
!
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
CALL esm_stres_har( sigmahar, rho%of_g(:,1) )
ELSE
IF (.NOT. use_gpu) CALL stres_har( sigmahar )
IF ( use_gpu) CALL stres_har_gpu( sigmahar )
END IF
CALL stres_har( sigmahar )
ENDIF
!
! xc contribution (diagonal)
! ... XC contribution (diagonal)
!
sigmaxc(:,:) = 0.d0
DO l = 1, 3
sigmaxc (l, l) = - (etxc - vtxc) / omega
sigmaxc(l,l) = - (etxc - vtxc) / omega
ENDDO
!
! xc contribution: add gradient corrections (non diagonal)
! ... XC contribution: add gradient corrections (non diagonal)
!
CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, rho%kin_r, &
nspin, dfftp, g, alat, omega, sigmaxc )
IF (.NOT.xclib_dft_is('meta')) THEN
CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, &
nspin, dfftp, g, alat, omega, sigmaxc )
ELSE
CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, &
nspin, dfftp, g, alat, omega, sigmaxc, rho%kin_r )
ENDIF
!
! meta-GGA contribution
! ... meta-GGA contribution
!
IF (.NOT. use_gpu) CALL stres_mgga( sigmaxc )
IF ( use_gpu) CALL stres_mgga_gpu( sigmaxc )
CALL stres_mgga( sigmaxc )
!
! core correction contribution
! ... core correction contribution
!
IF (.NOT. use_gpu) CALL stres_cc( sigmaxcc )
IF ( use_gpu) CALL stres_cc_gpu( sigmaxcc )
CALL stres_cc( sigmaxcc )
!
! ewald contribution
! ... Ewald contribution
!
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
CALL esm_stres_ewa( sigmaewa )
ELSE
IF (.NOT. use_gpu) CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, &
bg, tau, omega, g, gg, ngm, gstart, &
gamma_only, gcutm, sigmaewa )
IF ( use_gpu) CALL stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg,&
tau, omega, g_d,gg_d, ngm, gstart,&
gamma_only, gcutm, sigmaewa )
END IF
CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, &
tau, omega, g, gg, ngm, gstart, &
gamma_only, gcutm, sigmaewa )
ENDIF
!
! semi-empirical dispersion contribution: Grimme-D2 and D3
! ... semi-empirical dispersion contribution: Grimme-D2 and D3
!
sigmad23( : , : ) = 0.d0
IF ( llondon ) THEN
@ -145,10 +148,9 @@ SUBROUTINE stress( sigma )
CALL stop_clock('stres_dftd3')
END IF
!
! kinetic + nonlocal contribution
! ... kinetic + nonlocal contribution
!
IF (.NOT. use_gpu) CALL stres_knl( sigmanlc, sigmakin )
IF ( use_gpu) CALL stres_knl_gpu( sigmanlc, sigmakin )
CALL stres_knl( sigmanlc, sigmakin )
!
DO l = 1, 3
DO m = 1, 3
@ -156,13 +158,13 @@ SUBROUTINE stress( sigma )
ENDDO
ENDDO
!
! Hubbard contribution
! (included by stres_knl if using beta as local projectors)
! ... Hubbard contribution
! (included by stres_knl if using beta as local projectors)
!
sigmah(:,:) = 0.d0
IF ( lda_plus_u .AND. Hubbard_projectors /= 'pseudo' ) CALL stres_hub( sigmah )
!
! Electric field contribution
! ... Electric field contribution
!
sigmael(:,:)=0.d0
sigmaion(:,:)=0.d0
@ -170,15 +172,15 @@ SUBROUTINE stress( sigma )
! call stress_bp_efield (sigmael )
! call stress_ion_efield (sigmaion )
!
! vdW dispersion contribution: xdm
! ... vdW dispersion contribution: xdm
!
sigmaxdm = 0._dp
IF (lxdm) sigmaxdm = stress_xdm()
!
! vdW dispersion contribution: Tkatchenko-Scheffler
! ... vdW dispersion contribution: Tkatchenko-Scheffler
!
sigma_ts = 0.0_DP
! vdW dispersion contribution: Many-Body Dispersion
! ... vdW dispersion contribution: Many-Body Dispersion
!
sigma_mbd = 0.0_DP
IF ( mbd_vdw ) THEN
@ -187,17 +189,17 @@ SUBROUTINE stress( sigma )
sigma_ts = -2.0_DP*alat*MATMUL( HtsvdW, TRANSPOSE(at) )/omega
ENDIF
!
! DFT-non_local contribution
! ... DFT-non_local contribution
!
sigma_nonloc_dft(:,:) = 0.d0
CALL stres_nonloc_dft( rho%of_r, rho_core, nspin, sigma_nonloc_dft )
!
! The solvation contribution (3D-RISM)
! ... The solvation contribution (3D-RISM)
!
sigmasol(:,:) = 0.d0
IF (lrism) CALL stres_rism(sigmasol)
!
! SUM
! ... Sum all terms
!
sigma(:,:) = sigmakin(:,:) + sigmaloc(:,:) + sigmahar(:,:) + &
sigmaxc(:,:) + sigmaxcc(:,:) + sigmaewa(:,:) + &
@ -213,12 +215,12 @@ SUBROUTINE stress( sigma )
ELSE
sigmaexx = 0.d0
ENDIF
! Resymmetrize the total stress. This should not be strictly necessary,
! but prevents loss of symmetry in long vc-bfgs runs
! ... Resymmetrize the total stress. This should not be strictly necessary,
! but prevents loss of symmetry in long vc-bfgs runs
CALL symmatrix( sigma )
!
! write results in Ry/(a.u.)^3 and in kbar
! ... write results in Ry/(a.u.)^3 and in kbar
!
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
WRITE( stdout, 9000) (sigma(1,1) + sigma(2,2)) * ry_kbar/3d0, &

View File

@ -7,19 +7,18 @@
!
!
!-----------------------------------------------------------------------
SUBROUTINE usnldiag_gpu (npw, h_diag_d, s_diag_d)
SUBROUTINE usnldiag_gpu( npw, h_diag_d, s_diag_d )
!-----------------------------------------------------------------------
!! Add nonlocal pseudopotential term to diagonal part of Hamiltonian.
!! Compute the diagonal part of the S matrix.
!
! add nonlocal pseudopotential term to diagonal part of Hamiltonian
! compute the diagonal part of the S matrix.
!
! Routine splitted for improving performance
! Routine split to improve performance
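!
! A sketch of the accumulated quantities (assuming the usual ultrasoft
! form): h_diag(ig) += sum_{ij} Re[ deeq(i,j) vkb(ig,i) conjg(vkb(ig,j)) ]
! and s_diag(ig) += sum_{ij} Re[ qq(i,j) vkb(ig,i) conjg(vkb(ig,j)) ],
! with i,j running over the beta projectors of each atom.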
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nat, ityp, ntyp => nsp
USE wvfct, ONLY: npwx
USE uspp, ONLY: ofsbeta, deeq_d, qq_at_d, qq_so_d, &
deeq_nc_d
USE uspp, ONLY: ofsbeta, deeq, qq_at, qq_so, &
deeq_nc
USE uspp_param, ONLY: upf, nh
USE noncollin_module, ONLY: noncolin, npol, lspinorb
USE device_memcpy_m, ONLY: dev_memset
@ -54,7 +53,7 @@ CONTAINS
SUBROUTINE usnldiag_collinear()
USE lsda_mod, ONLY: current_spin
USE uspp, ONLY: deeq_d, qq_at_d, vkb
USE uspp, ONLY: deeq, qq_at, vkb
IMPLICIT NONE
!
@ -72,7 +71,7 @@ CONTAINS
IF (ityp (na) == nt) THEN
ijkb_start = ofsbeta(na)
nh_ = nh(nt)
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc data present(vkb(:,:),deeq) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc parallel vector_length(32)
!$acc loop gang reduction(+:sum_h,sum_s)
DO ig = 1, npw
@ -82,11 +81,11 @@ CONTAINS
DO ih = 1, nh_
DO jh = 1, nh_
ikb = ijkb_start + ih
cv = vkb (ig, ikb)
cv = vkb(ig,ikb)
jkb = ijkb_start + jh
ar = cv*conjg(vkb (ig, jkb))
sum_h = sum_h + dble(deeq_d (ih, jh, na, current_spin) * ar)
sum_s = sum_s + dble(qq_at_d (ih, jh, na) * ar)
ar = cv*conjg(vkb(ig,jkb))
sum_h = sum_h + dble(deeq(ih,jh,na,current_spin) * ar)
sum_s = sum_s + dble(qq_at(ih,jh,na) * ar)
END DO
END DO
!$acc atomic update
@ -105,7 +104,7 @@ CONTAINS
IF (ityp (na) == nt) THEN
ijkb_start = ofsbeta(na)
nh_ = nh(nt)
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc data present(vkb(:,:),deeq) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc parallel vector_length(32)
!$acc loop gang reduction(+:sum_h,sum_s)
DO ig = 1, npw
@ -114,9 +113,9 @@ CONTAINS
!$acc loop vector private(ikb,ar) reduction(+:sum_h,sum_s)
DO ih = 1, nh_
ikb = ijkb_start + ih
ar = vkb (ig, ikb)*conjg(vkb (ig, ikb))
sum_h = sum_h + dble(deeq_d (ih, ih, na, current_spin) * ar)
sum_s = sum_s + dble(qq_at_d (ih, ih, na) * ar)
ar = vkb(ig,ikb)*conjg(vkb(ig,ikb))
sum_h = sum_h + dble(deeq(ih,ih,na,current_spin) * ar)
sum_s = sum_s + dble(qq_at(ih,ih,na) * ar)
END DO
!$acc atomic update
h_diag_d (ig,1) = h_diag_d (ig,1) + sum_h
@ -135,7 +134,7 @@ CONTAINS
!
SUBROUTINE usnldiag_noncollinear()
USE lsda_mod, ONLY: current_spin
USE uspp, ONLY: vkb, qq_at_d, qq_so_d, deeq_nc_d
USE uspp, ONLY: vkb, qq_at, qq_so, deeq_nc
IMPLICIT NONE
!
@ -153,7 +152,7 @@ CONTAINS
IF (ityp (na) == nt) THEN
ijkb_start = ofsbeta(na)
nh_ = nh(nt)
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc parallel vector_length(32)
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s)
DO ig = 1, npw ! change this to 2*npw ?
@ -166,10 +165,10 @@ CONTAINS
ikb = ijkb_start + ih
cv = vkb (ig, ikb)
jkb = ijkb_start + jh
ar = cv*conjg(vkb (ig, jkb))
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, jh, na, 1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, jh, na, 4) * ar)
sum_s = sum_s + dble(qq_at_d (ih, jh, na) * ar)
ar = cv*conjg(vkb(ig,jkb))
sum_h1 = sum_h1 + dble(deeq_nc(ih,jh,na,1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc(ih,jh,na,4) * ar)
sum_s = sum_s + dble(qq_at(ih,jh,na) * ar)
END DO
END DO
!
@ -197,7 +196,7 @@ CONTAINS
IF (ityp (na) == nt) THEN
ijkb_start = ofsbeta(na)
nh_ = nh(nt)
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc parallel vector_length(32)
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s)
DO ig = 1, npw
@ -207,10 +206,10 @@ CONTAINS
!$acc loop vector private(ikb,ar) reduction(+:sum_h1,sum_h4,sum_s)
DO ih = 1, nh_
ikb = ijkb_start + ih
ar = vkb (ig, ikb)*conjg(vkb (ig, ikb))
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, ih, na, 1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, ih, na, 4) * ar)
sum_s = sum_s + dble(qq_at_d (ih, ih, na) * ar)
ar = vkb (ig, ikb)*conjg(vkb(ig,ikb))
sum_h1 = sum_h1 + dble(deeq_nc(ih,ih,na,1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc(ih,ih,na,4) * ar)
sum_s = sum_s + dble(qq_at(ih,ih,na) * ar)
END DO
!
! OPTIMIZE HERE : this scattered assign is bad!
@ -238,7 +237,7 @@ CONTAINS
!
SUBROUTINE usnldiag_spinorb()
USE lsda_mod, ONLY: current_spin
USE uspp, ONLY: vkb, qq_at_d, qq_so_d, deeq_nc_d
USE uspp, ONLY: vkb, qq_at, qq_so, deeq_nc
IMPLICIT NONE
!
@ -256,7 +255,7 @@ CONTAINS
IF (ityp (na) == nt) THEN
ijkb_start = ofsbeta(na)
nh_ = nh(nt)
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc parallel vector_length(32)
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s1,sum_s4)
DO ig = 1, npw ! change this to 2*npw ?
@ -268,13 +267,13 @@ CONTAINS
DO ih = 1, nh_
DO jh = 1, nh_
ikb = ijkb_start + ih
cv = vkb (ig, ikb)
cv = vkb(ig,ikb)
jkb = ijkb_start + jh
ar = cv*conjg(vkb (ig, jkb))
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, jh, na, 1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, jh, na, 4) * ar)
sum_s1 = sum_s1 + dble(qq_so_d (ih, jh, 1, nt) * ar)
sum_s4 = sum_s4 + dble(qq_so_d (ih, jh, 4, nt) * ar)
ar = cv*conjg(vkb(ig,jkb))
sum_h1 = sum_h1 + dble(deeq_nc(ih,jh,na,1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc(ih,jh,na,4) * ar)
sum_s1 = sum_s1 + dble(qq_so(ih,jh,1,nt) * ar)
sum_s4 = sum_s4 + dble(qq_so(ih,jh,4,nt) * ar)
END DO
END DO
!
@ -302,7 +301,7 @@ CONTAINS
IF (ityp (na) == nt) THEN
ijkb_start = ofsbeta(na)
nh_ = nh(nt)
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
!$acc parallel vector_length(32)
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s1,sum_s4)
DO ig = 1, npw
@ -314,10 +313,10 @@ CONTAINS
DO ih = 1, nh_
ikb = ijkb_start + ih
ar = vkb (ig, ikb)*conjg(vkb (ig, ikb))
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, ih, na, 1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, ih, na, 4) * ar)
sum_s1 = sum_s1 + dble(qq_so_d (ih, ih, 1, nt) * ar)
sum_s4 = sum_s4 + dble(qq_so_d (ih, ih, 4, nt) * ar)
sum_h1 = sum_h1 + dble(deeq_nc(ih,ih,na,1) * ar)
sum_h4 = sum_h4 + dble(deeq_nc(ih,ih,na,4) * ar)
sum_s1 = sum_s1 + dble(qq_so(ih,ih,1,nt) * ar)
sum_s4 = sum_s4 + dble(qq_so(ih,ih,4,nt) * ar)
END DO
!
! OPTIMIZE HERE : this scattered assign is bad!

View File

@ -12,8 +12,6 @@ set(src_upflib
init_tab_qrad.f90
interp_atwfc.f90
init_us_2_base.f90
gen_us_dj_gpu.f90
gen_us_dy_gpu.f90
gth.f90
paw_variables.f90
pseudo_types.f90
@ -56,7 +54,8 @@ set(src_upflib
sph_bes_gpu.f90
ylmr2_gpu.f90
dylmr2_gpu.f90
simpsn_gpu.f90)
simpsn_gpu.f90
upf_acc_interfaces.f90)
qe_enable_cuda_fortran("${src_upflib}")
qe_add_library(qe_xml xmltools.f90 dom.f90 wxml.f90)

View File

@ -65,14 +65,13 @@ ylmr2.o
# GPU versions of routines
OBJS_GPU= \
dylmr2_gpu.o \
gen_us_dj_gpu.o \
gen_us_dy_gpu.o \
init_us_2_base_gpu.o \
interp_atwfc_gpu.o \
qvan2_gpu.o \
simpsn_gpu.o \
sph_bes_gpu.o \
ylmr2_gpu.o
ylmr2_gpu.o \
upf_acc_interfaces.o
OBJS_NODEP+= dom.o wxml.o

View File

@ -9,12 +9,13 @@
SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
!-----------------------------------------------------------------------
!! This routine computes the derivatives of the Fourier transform of
!! the Q function needed in stress assuming that the radial fourier
!! transform is already computed and stored in table qrad.
!! The formula implemented here is:
!! dq(g,i,j) = sum_lm (-i)^l ap(lm,i,j) *
!! ( yr_lm(g^) dqrad(g,l,i,j) + dyr_lm(g^) qrad(g,l,i,j))
!! the Q function needed in stress assuming that the radial Fourier
!! transform is already computed and stored in table \(\text{qrad}\).
!! The implemented formula:
!
!! \[ \text{dq}(g,i,j) = \sum_{lm} (-i)^l \text{ap}(lm,i,j)
!!      ( \text{yr}_{lm}(\hat{g})\, \text{dqrad}(g,l,i,j) +
!!        \text{dyr}_{lm}(\hat{g})\, \text{qrad}(g,l,i,j) ) \]
!
USE upf_kinds, ONLY: DP
USE uspp_data, ONLY: dq, qrad
@ -48,7 +49,7 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
!
! ... local variables
!
COMPLEX(DP) :: sig
COMPLEX(DP) :: sig, dqg_bgr
! (-i)^L
INTEGER :: nb, mb, ijv, ivl, jvl, ig, lp, l, lm, i0, i1, i2, i3
! the atomic index corresponding to ih
@ -71,7 +72,9 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
! auxiliary variable
! auxiliary variable
!
! compute the indices which correspond to ih,jh
! ... compute the indices which correspond to ih,jh
!
!$acc data present_or_copyin(g,qmod,ylmk0,dylmk0) present_or_copyout(dqg)
!
sixth = 1.d0 / 6.d0
dqi = 1 / dq
@ -91,14 +94,16 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
IF (ivl > nlx .OR. jvl > nlx) &
CALL upf_error (' dqvan2 ', ' wrong dimensions (2)', MAX(ivl,jvl))
!
!$acc kernels
dqg(:) = (0.d0,0.d0)
!$acc end kernels
!
! and make the sum over the non zero LM
! ... and make the sum over the non zero LM
!
DO lm = 1, lpx(ivl, jvl)
lp = lpl(ivl, jvl, lm)
DO lm = 1, lpx(ivl,jvl)
lp = lpl(ivl,jvl,lm)
!
! extraction of angular momentum l from lp:
! ... extraction of angular momentum l from lp:
!
IF (lp==1) THEN
l = 1
@ -119,16 +124,20 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
ENDIF
!
sig = (0.d0, -1.d0)**(l - 1)
sig = sig * ap(lp, ivl, jvl)
sig = sig * ap(lp,ivl,jvl)
!
qm1 = -1.0_dp ! any number smaller than qmod(1)
!
#if !defined(_OPENACC)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(qm,px,ux,vx,wx,i0,i1,i2,i3,uvx,pwx,work,work1)
#else
!$acc parallel loop
#endif
DO ig = 1, ngy
!
! calculate quantities depending on the modulus of G only when needed
! ... calculate quantities depending on the modulus of G only when needed
!
#if !defined(_OPENMP)
#if !defined(_OPENMP) && !defined(_OPENACC)
IF ( ABS( qmod(ig) - qm1 ) > 1.0D-6 ) THEN
#endif
qm = qmod (ig) * dqi
@ -141,33 +150,40 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
i2 = qm + 3
i3 = qm + 4
uvx = ux * vx * sixth
!
pwx = px * wx * 0.5d0
!
work = qrad(i0, ijv, l, np) * uvx * wx + &
qrad(i1, ijv, l, np) * pwx * vx - &
qrad(i2, ijv, l, np) * pwx * ux + &
qrad(i3, ijv, l, np) * px * uvx
work1 = - qrad(i0, ijv, l, np) * (ux*vx + vx*wx + ux*wx) * sixth &
+ qrad(i1, ijv, l, np) * (wx*vx - px*wx - px*vx) * 0.5d0 &
- qrad(i2, ijv, l, np) * (wx*ux - px*wx - px*ux) * 0.5d0 &
+ qrad(i3, ijv, l, np) * (ux*vx - px*ux - px*vx) * sixth
work1 = work1 * dqi
#if !defined(_OPENMP)
work1 = (- qrad(i0, ijv, l, np) * (ux*vx + vx*wx + ux*wx) * sixth &
+ qrad(i1, ijv, l, np) * (wx*vx - px*wx - px*vx) * 0.5d0 &
- qrad(i2, ijv, l, np) * (wx*ux - px*wx - px*ux) * 0.5d0 &
+ qrad(i3, ijv, l, np) * (ux*vx - px*ux - px*vx) * sixth) * dqi
!
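! (a note on the scheme above: work is the 4-point Lagrange interpolation
!  of qrad on the uniform q-grid of spacing dq, and work1 is the analytic
!  q-derivative of the same cubic; i0..i3 are the four surrounding grid
!  nodes.)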
#if !defined(_OPENMP) && !defined(_OPENACC)
qm1 = qmod(ig)
ENDIF
#endif
dqg(ig) = dqg(ig) + sig * dylmk0(ig, lp) * work / tpiba
IF (qmod(ig) > 1.d-9) dqg(ig) = dqg(ig) + &
sig * ylmk0(ig, lp) * work1 * tpiba * g(ipol, ig) / qmod(ig)
!
IF (qmod(ig) > 1.d-9) THEN
dqg_bgr = sig * ylmk0(ig,lp) * work1 * tpiba * g(ipol,ig) / qmod(ig)
ELSE
dqg_bgr = (0.d0,0.d0)
ENDIF
!
dqg(ig) = dqg(ig) + sig * dylmk0(ig,lp) * work / tpiba + dqg_bgr
!
ENDDO
#if !defined(_OPENACC)
!$OMP END PARALLEL DO
!
#endif
!
ENDDO
!
!$acc end data
!
RETURN
!
END SUBROUTINE dqvan2

View File

@ -7,9 +7,9 @@
!
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dj_base &
( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, mill, g, dvkb )
SUBROUTINE gen_us_dj_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, &
mill, g, dvkb )
!----------------------------------------------------------------------
!! Calculates the beta function pseudopotentials with
!! the derivative of the Bessel functions.
@ -18,7 +18,7 @@ SUBROUTINE gen_us_dj_base &
USE upf_const, ONLY: tpi
USE uspp, ONLY: nkb, indv, nhtol, nhtolm
USE uspp_data, ONLY: nqx, tab, dq
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
!
IMPLICIT NONE
!
@ -42,7 +42,7 @@ SUBROUTINE gen_us_dj_base &
!! rec.lattice units 2pi/a
REAL(DP), INTENT(IN) :: omega
!! cell volume
INTEGER, INTENT(IN) :: nr1,nr2,nr3
INTEGER, INTENT(IN) :: nr1, nr2, nr3
!! fft dims (dense grid)
COMPLEX(DP), INTENT(IN) :: eigts1(-nr1:nr1,nat)
!! structure factor 1
@ -54,7 +54,7 @@ SUBROUTINE gen_us_dj_base &
!! miller index map
REAL(DP), INTENT(IN) :: g(3,*)
!! g vectors (2pi/a units)
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx, nkb)
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
!! the beta function pseudopotential
!
! ... local variables
@ -67,46 +67,54 @@ SUBROUTINE gen_us_dj_base &
! index of the first nonzero point in the r
! counter on atomic type
!
REAL(DP) :: arg, px, ux, vx, wx
INTEGER :: ina, na, l, iig, lm, ikb_t, nht
REAL(DP) :: arg, px, ux, vx, wx, qt
! argument of the atomic phase factor
!
COMPLEX(DP) :: phase, pref
! atomic phase factor
COMPLEX(DP) :: pref
! prefactor
!
INTEGER :: na, l, iig, lm, iq
REAL(DP), ALLOCATABLE :: djl(:,:,:), ylm(:,:), q(:), gk(:,:)
REAL(DP) :: qt
COMPLEX(DP), ALLOCATABLE :: sk(:)
INTEGER, ALLOCATABLE :: nas(:), ihv(:), nav(:)
REAL(DP), ALLOCATABLE :: djl(:,:,:), ylm(:,:), q(:), gk(:,:)
COMPLEX(DP), ALLOCATABLE :: sk(:,:), phase(:)
! phase: atomic phase factor
!
IF (nkb == 0) RETURN
!
CALL start_clock( 'stres_us31' )
!$acc data present_or_copyin(igk,eigts1,eigts2,eigts3,mill,g) &
!$acc present_or_copyout(dvkb) copyin(ityp,tau,xk)
!
ALLOCATE( djl(npw,nbetam,ntyp) )
ALLOCATE( ylm(npw,(lmaxkb+1)**2) )
ALLOCATE( gk(3,npw) )
ALLOCATE( q(npw) )
!CALL start_clock( 'stres_us31' )
!
ALLOCATE( djl(npw,nbetam,ntyp), ylm(npw,(lmaxkb+1)**2) )
ALLOCATE( gk(3,npw), q(npw) )
!$acc data create( djl, ylm )
!$acc data create( gk, q )
!
!$acc parallel loop
DO ig = 1, npw
iig = igk(ig)
gk(1, ig) = xk(1) + g(1, iig)
gk(2, ig) = xk(2) + g(2, iig)
gk(3, ig) = xk(3) + g(3, iig)
q(ig) = gk(1, ig)**2 + gk(2, ig)**2 + gk(3, ig)**2
gk(1,ig) = xk(1) + g(1,iig)
gk(2,ig) = xk(2) + g(2,iig)
gk(3,ig) = xk(3) + g(3,iig)
q(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2
ENDDO
!
CALL stop_clock( 'stres_us31' )
CALL start_clock( 'stres_us32' )
#if defined(__CUDA)
!$acc host_data use_device(gk,q,ylm)
CALL ylmr2_gpu( (lmaxkb+1)**2, npw, gk, q, ylm )
!$acc end host_data
#else
!$acc update self(gk,q)
CALL ylmr2( (lmaxkb+1)**2, npw, gk, q, ylm )
CALL stop_clock( 'stres_us32' )
CALL start_clock( 'stres_us33' )
!$acc update device(ylm)
#endif
!
!$acc data copyin( tab )
DO nt = 1, ntyp
!$acc parallel loop collapse(2)
DO nb = 1, upf(nt)%nbeta
!
DO ig = 1, npw
qt = SQRT(q (ig)) * tpiba
qt = SQRT(q(ig)) * tpiba
px = qt / dq - INT(qt/dq)
ux = 1.d0 - px
vx = 2.d0 - px
@ -115,62 +123,99 @@ SUBROUTINE gen_us_dj_base &
i1 = i0 + 1
i2 = i0 + 2
i3 = i0 + 3
djl(ig,nb,nt) = ( tab(i0, nb, nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + &
tab(i1, nb, nt) * (+vx*wx-px*wx-px*vx)/2.d0 - &
tab(i2, nb, nt) * (+ux*wx-px*wx-px*ux)/2.d0 + &
tab(i3, nb, nt) * (+ux*vx-px*vx-px*ux)/6.d0 )/dq
djl(ig,nb,nt) = ( tab(i0,nb,nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + &
tab(i1,nb,nt) * (+vx*wx-px*wx-px*vx)/2.d0 - &
tab(i2,nb,nt) * (+ux*wx-px*wx-px*ux)/2.d0 + &
tab(i3,nb,nt) * (+ux*vx-px*vx-px*ux)/6.d0 ) / dq
ENDDO
!
ENDDO
ENDDO
!$acc end data
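!
! (djl now holds the q-derivative of each beta projector, obtained by
!  differentiating the 4-point interpolation of the tab table; the
!  weights above are the derivatives of the Lagrange basis functions.)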
!
CALL stop_clock( 'stres_us33' )
CALL start_clock( 'stres_us34' )
!CALL stop_clock( 'stres_us33' )
!CALL start_clock( 'stres_us34' )
!
DEALLOCATE ( q )
DEALLOCATE ( gk )
!$acc end data
DEALLOCATE( q, gk )
!
ALLOCATE ( sk(npw) )
ikb = 0
!
ALLOCATE( phase(nat), sk(npw,nat) )
ALLOCATE( nas(nat), ihv(nat*nhm), nav(nat*nhm) )
!
ina = 0
DO nt = 1, ntyp
DO na = 1, nat
!
IF (ityp(na) == nt) THEN
arg = ( xk(1) * tau(1,na) + &
xk(2) * tau(2,na) + &
xk(3) * tau(3,na) ) * tpi
phase = CMPLX( COS(arg), -SIN(arg) ,KIND=DP )
DO ig = 1, npw
iig = igk(ig)
sk (ig) = eigts1(mill (1,iig), na) * &
eigts2(mill (2,iig), na) * &
eigts3(mill (3,iig), na) * phase
ENDDO
DO ih = 1, nh(nt)
nb = indv(ih, nt)
l = nhtol(ih, nt)
lm= nhtolm(ih, nt)
ikb = ikb + 1
pref = (0.d0, -1.d0) **l
!
DO ig = 1, npw
dvkb(ig, ikb) = djl(ig, nb, nt) * sk(ig) * ylm(ig, lm) &
* pref
ENDDO
ENDDO
ENDIF
!
ENDDO
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
ina = ina + 1
nas(ina) = na
ENDIF
ENDDO
ENDDO
!
CALL stop_clock('stres_us34')
!$acc data create( phase, sk, ihv, nav ) copyin( nas )
!
IF (ikb /= nkb) CALL upf_error('gen_us_dj', 'unexpected error', 1)
DEALLOCATE( sk )
DEALLOCATE( ylm )
DEALLOCATE( djl )
!$acc parallel loop
DO ina = 1, nat
na = nas(ina)
arg = (xk(1) * tau(1,na) + xk(2) * tau(2,na) &
+ xk(3) * tau(3,na) ) * tpi
phase(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
ENDDO
!
!$acc parallel loop collapse(2)
DO ina = 1, nat
DO ig = 1, npw
!
na = nas(ina)
iig = igk(ig)
sk(ig,na) = eigts1(mill(1,iig),na) * &
eigts2(mill(2,iig),na) * &
eigts3(mill(3,iig),na) * phase(na)
ENDDO
ENDDO
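!
! (sk(ig,na) is the structure factor exp(-i(k+G).tau(na)), assembled from
!  the three 1D phase factors eigts1..3, indexed by the Miller indices,
!  and the Bloch phase computed above.)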
!
ikb_t = 0
DO ina = 1, nat
na = nas(ina)
nht = nh(ityp(na))
!$acc kernels
DO ih = 1, nht
ihv(ikb_t+ih) = ih
nav(ikb_t+ih) = na
ENDDO
!$acc end kernels
ikb_t = ikb_t + nht
ENDDO
!
!
!$acc parallel loop collapse(2) copyin(indv,nhtol,nhtolm)
DO ikb = 1, ikb_t
DO ig = 1, npw
ih = ihv(ikb)
na = nav(ikb)
nt = ityp(na)
nb = indv(ih,nt)
l = nhtol(ih,nt)
lm = nhtolm(ih,nt)
pref = (0._DP,-1._DP)**l
!
dvkb(ig,ikb) = CMPLX(djl(ig,nb,nt)) * sk(ig,na) * &
CMPLX(ylm(ig,lm)) * pref
ENDDO
ENDDO
!
!CALL stop_clock('stres_us34')
!
!$acc end data
!$acc end data
DEALLOCATE( phase, sk )
DEALLOCATE( nas, ihv, nav )
!
IF (ikb_t /= nkb) CALL upf_error('gen_us_dj', 'unexpected error', 1)
!
!$acc end data
DEALLOCATE( djl, ylm )
!
RETURN
!
END SUBROUTINE gen_us_dj_base

View File

@ -1,219 +0,0 @@
!
! Copyright (C) 2021 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dj_gpu_ &
( npw, npwx, igk_d, xk, nat, tau, ityp, ntyp, tpiba, &
omega, nr1, nr2, nr3, eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, dvkb_d )
!----------------------------------------------------------------------
!! Calculates the beta function pseudopotentials with
!! the derivative of the Bessel functions - GPU version
!
! AF: more gpu-resident variables can be used, avoiding local GPU-alloc
! and host2dev transfers
!
USE upf_kinds, ONLY: dp
USE upf_const, ONLY: tpi
USE uspp, ONLY: nkb, indv_d, nhtol_d, nhtolm_d
USE uspp_data, ONLY: nqx, tab, tab_d, dq
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
USE device_fbuff_m, ONLY: dev_buf
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: npw
!! number of plane waves
INTEGER, INTENT(IN) :: npwx
!! max number of plane waves across k-points
INTEGER, INTENT(IN) :: igk_d(npw)
!! indices of plane waves k+G
REAL(dp), INTENT(IN) :: xk(3)
!! k-point
INTEGER, INTENT(IN) :: nat
!! number of atoms
INTEGER, INTENT(IN) :: ityp(nat)
!! index of type per atom
INTEGER, INTENT(IN) :: ntyp
!! number of atomic types
REAL(DP), INTENT(IN) :: tau(3,nat)
!! atomic positions (cc alat units)
REAL(DP), INTENT(IN) :: tpiba
!! rec.lattice units 2pi/a
REAL(DP), INTENT(IN) :: omega
!! cell volume
INTEGER, INTENT(IN) :: nr1,nr2,nr3
!! fft dims (dense grid)
COMPLEX(DP), INTENT(IN) :: eigts1_d(-nr1:nr1,nat)
!! structure factor 1
COMPLEX(DP), INTENT(IN) :: eigts2_d(-nr2:nr2,nat)
!! structure factor 2
COMPLEX(DP), INTENT(IN) :: eigts3_d(-nr3:nr3,nat)
!! structure factor 3
INTEGER, INTENT(IN) :: mill_d(3,*)
!! miller index map
REAL(DP), INTENT(IN) :: g_d(3,*)
!! g vectors (2pi/a units)
COMPLEX(DP), INTENT(OUT) :: dvkb_d(npwx, nkb)
!! the beta function pseudopotential
!
! ... local variables
!
INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, i0, i1, i2, &
i3, ig, nbm, iq, mil1, mil2, mil3, &
ikb_t, nht, ina, nas(nat), ierr(3)
REAL(DP) :: px, ux, vx, wx, arg, u_ipol, xk1, xk2, xk3, qt
COMPLEX(DP) :: pref
INTEGER, ALLOCATABLE :: ityp_d(:), ih_d(:), na_d(:), nas_d(:)
!
REAL(DP), POINTER :: gk_d(:,:), djl_d(:,:,:), ylm_d(:,:)
REAL(DP), ALLOCATABLE :: q_d(:), tau_d(:,:)
COMPLEX(DP), ALLOCATABLE :: phase_d(:), sk_d(:,:)
!
#if defined(__CUDA)
attributes(DEVICE) :: igk_d, mill_d, eigts1_d, eigts2_d, eigts3_d, g_d
attributes(DEVICE) :: gk_d, q_d, sk_d, djl_d, ylm_d, &
ityp_d, phase_d, ih_d, na_d, tau_d, nas_d
attributes(DEVICE) :: dvkb_d
!
IF (nkb == 0) RETURN
!
CALL dev_buf%lock_buffer( ylm_d, (/ npw,(lmaxkb+1)**2 /), ierr(1) )
CALL dev_buf%lock_buffer( djl_d, (/ npw,nbetam,ntyp /), ierr(2) )
CALL dev_buf%lock_buffer( gk_d, (/ 3,npw /), ierr(3) )
ALLOCATE( q_d(npw) )
!
xk1 = xk(1)
xk2 = xk(2)
xk3 = xk(3)
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, npw
iig = igk_d(ig)
gk_d(1,ig) = xk1 + g_d(1,iig)
gk_d(2,ig) = xk2 + g_d(2,iig)
gk_d(3,ig) = xk3 + g_d(3,iig)
q_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2
ENDDO
!
CALL ylmr2_gpu( (lmaxkb+1)**2, npw, gk_d, q_d, ylm_d )
!
DO nt = 1, ntyp
nbm = upf(nt)%nbeta
!$cuf kernel do (2) <<<*,*>>>
DO nb = 1, nbm
DO ig = 1, npw
qt = SQRT(q_d(ig)) * tpiba
px = qt/dq - DBLE(INT(qt/dq))
ux = 1._DP - px
vx = 2._DP - px
wx = 3._DP - px
i0 = INT(qt/dq) + 1
i1 = i0 + 1
i2 = i0 + 2
i3 = i0 + 3
djl_d(ig,nb,nt) = (tab_d(i0,nb,nt) * (-vx*wx-ux*wx-ux*vx)/6._DP + &
tab_d(i1,nb,nt) * (+vx*wx-px*wx-px*vx)/2._DP - &
tab_d(i2,nb,nt) * (+ux*wx-px*wx-px*ux)/2._DP + &
tab_d(i3,nb,nt) * (+ux*vx-px*vx-px*ux)/6._DP)/dq
ENDDO
ENDDO
ENDDO
!
DEALLOCATE( q_d )
!
ALLOCATE( ityp_d(nat), nas_d(nat) )
ityp_d = ityp
ALLOCATE( tau_d(3,nat) )
tau_d = tau
ALLOCATE( phase_d(nat) )
!
ina = 0
DO nt = 1, ntyp
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
ina = ina + 1
nas(ina) = na
ENDIF
ENDDO
ENDDO
nas_d = nas
!
!$cuf kernel do (1) <<<*,*>>>
DO ina = 1, nat
na = nas_d(ina)
arg = (xk1 * tau_d(1,na) + xk2 * tau_d(2,na) &
+ xk3 * tau_d(3,na) ) * tpi
phase_d(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
ENDDO
!
DEALLOCATE( tau_d )
!
ALLOCATE( sk_d(npw,nat) )
!
!$cuf kernel do (2) <<<*,*>>>
DO ina = 1, nat
DO ig = 1, npw
!
na = nas_d(ina)
iig = igk_d(ig)
mil1 = mill_d(1,iig)
mil2 = mill_d(2,iig)
mil3 = mill_d(3,iig)
sk_d(ig,na) = eigts1_d(mil1,na) * &
eigts2_d(mil2,na) * &
eigts3_d(mil3,na) * phase_d(na)
ENDDO
ENDDO
!
!
DEALLOCATE( phase_d )
!
ALLOCATE( ih_d(nat*nhm), na_d(nat*nhm) )
!
ikb_t = 0
DO ina = 1, nat
na = nas(ina)
nht = nh(ityp(na))
!$cuf kernel do (1) <<<*,*>>>
DO ih = 1, nht
ih_d(ikb_t+ih) = ih
na_d(ikb_t+ih) = na
ENDDO
ikb_t = ikb_t + nht
ENDDO
!
!$cuf kernel do (2) <<<*,*>>>
DO ikb = 1, ikb_t
DO ig = 1, npw
ih = ih_d(ikb)
na = na_d(ikb)
nt = ityp_d(na)
nb = indv_d(ih,nt)
l = nhtol_d(ih,nt)
lm = nhtolm_d(ih,nt)
pref = (0._DP,-1._DP)**l
!
dvkb_d(ig,ikb) = CMPLX(djl_d(ig,nb,nt)) * sk_d(ig,na) * &
CMPLX(ylm_d(ig,lm)) * pref
ENDDO
ENDDO
!
DEALLOCATE( sk_d )
!
IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dj', 'unexpected error', 1 )
!
CALL dev_buf%release_buffer( ylm_d, ierr(1) )
CALL dev_buf%release_buffer( djl_d, ierr(2) )
CALL dev_buf%release_buffer( gk_d, ierr(3) )
!
DEALLOCATE( ih_d, na_d, nas_d )
#endif
!
RETURN
!
END SUBROUTINE gen_us_dj_gpu_

View File

@ -1,5 +1,5 @@
!
! Copyright (C) 2001-2021 Quantum ESPRESSO Foundation
! Copyright (C) 2021 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@ -7,18 +7,20 @@
!
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dy_base &
( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, mill, g, u, dvkb )
SUBROUTINE gen_us_dy_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, &
mill, g, u, dvkb )
!----------------------------------------------------------------------
!! Calculates the beta functions of the pseudopotential with the
!! derivative of the spherical harmonics projected on vector u
!! Calculates the Kleinman-Bylander pseudopotentials with the
!! derivative of the spherical harmonics projected on vector u.
!
! AF: more extensive use of GPU-resident vars possible
!
USE upf_kinds, ONLY: dp
USE upf_const, ONLY: tpi
USE uspp, ONLY: nkb, indv, nhtol, nhtolm
USE uspp_data, ONLY: nqx, tab, dq
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
!
IMPLICIT NONE
!
@ -55,112 +57,205 @@ SUBROUTINE gen_us_dy_base &
REAL(DP), INTENT(IN) :: g(3,*)
!! g vectors (2pi/a units)
REAL(DP), INTENT(IN) :: u(3)
!! input: projection vector
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
!! output: kleinman-bylander pseudopotential
!! projection vector
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx, nkb)
!! the beta function pseudopotential
!
! ... local variables
!
INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, &
i3, ig
REAL(DP), ALLOCATABLE :: gk(:,:), q(:)
REAL(DP) :: px, ux, vx, wx, arg
i3, ig, nbm, iq, mil1, mil2, mil3, ikb_t, &
nht, ina, lmx2
!
REAL(DP), ALLOCATABLE :: vkb0(:,:,:), dylm(:,:), dylm_u(:,:)
INTEGER, ALLOCATABLE :: nas(:), ihv(:), nav(:)
!
REAL(DP), ALLOCATABLE :: dylm(:,:,:), dylm_u(:,:)
REAL(DP), ALLOCATABLE :: q(:), gk(:,:), vkb0(:,:,:)
! dylm = d Y_lm/dr_i in cartesian axes
! dylm_u as above projected on u
COMPLEX(DP), ALLOCATABLE :: phase(:), sk(:,:)
!
COMPLEX(DP), ALLOCATABLE :: sk(:)
COMPLEX(DP) :: phase, pref
REAL(DP) :: px, ux, vx, wx, arg, u_ipol1, u_ipol2, u_ipol3, xk1, xk2, xk3
COMPLEX(DP) :: pref
!
INTEGER :: iq
!$acc kernels present_or_copyout(dvkb)
dvkb = (0._DP,0._DP)
!$acc end kernels
!
dvkb(:,:) = (0.d0, 0.d0)
IF (lmaxkb <= 0) RETURN
!
ALLOCATE( vkb0(npw,nbetam,ntyp), dylm_u(npw,(lmaxkb+1)**2), gk(3,npw) )
ALLOCATE( q(npw) )
!$acc data present_or_copyin(igk,eigts1,eigts2,eigts3,mill,g) present(dvkb)
!
lmx2 = (lmaxkb+1)**2
!
ALLOCATE( gk(3,npw) )
ALLOCATE( dylm_u(npw,lmx2) )
ALLOCATE( vkb0(npw,nbetam,ntyp) )
ALLOCATE( q(npw) )
!$acc data create( dylm_u, vkb0 )
!$acc data create( q, gk )
!
xk1 = xk(1)
xk2 = xk(2)
xk3 = xk(3)
!
!$acc parallel loop
DO ig = 1, npw
iig = igk(ig)
gk(1, ig) = xk(1) + g(1, iig)
gk(2, ig) = xk(2) + g(2, iig)
gk(3, ig) = xk(3) + g(3, iig)
q(ig) = gk(1, ig)**2 + gk(2, ig)**2 + gk(3, ig)**2
gk(1,ig) = xk1 + g(1,iig)
gk(2,ig) = xk2 + g(2,iig)
gk(3,ig) = xk3 + g(3,iig)
q(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2
ENDDO
!
ALLOCATE( dylm(npw,(lmaxkb+1)**2) )
dylm_u(:,:) = 0.d0
ALLOCATE( dylm(npw,(lmaxkb+1)**2,3) )
!$acc data create( dylm )
!
#if defined(__CUDA)
!$acc host_data use_device( gk, q, dylm )
DO ipol = 1, 3
CALL dylmr2( (lmaxkb+1)**2, npw, gk, q, dylm, ipol )
CALL daxpy( npw * (lmaxkb + 1) **2, u(ipol), dylm, 1, dylm_u, 1 )
CALL dylmr2_gpu( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
ENDDO
!$acc end host_data
#else
!$acc update self( gk, q )
DO ipol = 1, 3
CALL dylmr2( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
ENDDO
!$acc update device( dylm )
#endif
!
u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3)
!
!$acc parallel loop collapse(2)
DO lm = 1, lmx2
DO ig = 1, npw
dylm_u(ig,lm) = u_ipol1*dylm(ig,lm,1) + &
u_ipol2*dylm(ig,lm,2) + &
u_ipol3*dylm(ig,lm,3)
ENDDO
ENDDO
!$acc end data
DEALLOCATE( dylm )
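!
! (dylm_u(ig,lm) is the directional derivative of Y_lm along the input
!  vector u, built from the three Cartesian derivative components
!  returned by dylmr2.)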
!
DO ig = 1, npw
q(ig) = SQRT(q(ig)) * tpiba
ENDDO
!$acc kernels
q(:) = SQRT(q(:)) * tpiba
!$acc end kernels
!
!$acc data copyin( tab )
DO nt = 1, ntyp
! calculate beta in G-space using an interpolation table
DO nb = 1, upf(nt)%nbeta
nbm = upf(nt)%nbeta
!$acc parallel loop collapse(2)
DO nb = 1, nbm
DO ig = 1, npw
px = q(ig)/dq - INT(q(ig)/dq)
ux = 1.d0 - px
vx = 2.d0 - px
wx = 3.d0 - px
i0 = q(ig)/dq + 1
px = q(ig)/dq - DBLE(INT(q(ig)/dq))
ux = 1._DP - px
vx = 2._DP - px
wx = 3._DP - px
i0 = INT(q(ig)/dq) + 1
i1 = i0 + 1
i2 = i0 + 2
i3 = i0 + 3
vkb0(ig, nb, nt) = tab(i0, nb, nt) * ux * vx * wx / 6.d0 + &
tab(i1, nb, nt) * px * vx * wx / 2.d0 - &
tab(i2, nb, nt) * px * ux * wx / 2.d0 + &
tab(i3, nb, nt) * px * ux * vx / 6.d0
ENDDO
ENDDO
vkb0(ig,nb,nt) = tab(i0,nb,nt) * ux * vx * wx / 6._DP + &
tab(i1,nb,nt) * px * vx * wx / 2._DP - &
tab(i2,nb,nt) * px * ux * wx / 2._DP + &
tab(i3,nb,nt) * px * ux * vx / 6._DP
ENDDO
ENDDO
ENDDO
!$acc end data
!
DEALLOCATE( q )
!$acc end data
DEALLOCATE( gk, q )
!
ALLOCATE( sk(npw) )
ALLOCATE( nas(nat), phase(nat) )
!
ikb = 0
ina = 0
DO nt = 1, ntyp
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
arg = (xk(1) * tau(1, na) + xk(2) * tau(2, na) &
+ xk(3) * tau(3, na) ) * tpi
phase = CMPLX( COS(arg), -SIN(arg), KIND=DP )
DO ig = 1, npw
iig = igk(ig)
sk(ig) = eigts1(mill (1,iig), na) * &
eigts2(mill (2,iig), na) * &
eigts3(mill (3,iig), na) * phase
ENDDO
!
DO ih = 1, nh(nt)
nb = indv(ih, nt)
l = nhtol(ih, nt)
lm = nhtolm(ih, nt)
ikb = ikb + 1
pref = (0.d0, -1.d0)**l
!
DO ig = 1, npw
dvkb(ig, ikb) = vkb0(ig, nb, nt) * sk(ig) * dylm_u(ig, lm) &
* pref / tpiba
ENDDO
ENDDO
ENDIF
ENDDO
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
ina = ina + 1
nas(ina) = na
ENDIF
ENDDO
ENDDO
!
IF (ikb /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 )
ALLOCATE( sk(npw,nat) )
!$acc data create( sk )
!
!$acc data create( phase ) copyin( nas )
!
!$acc parallel loop copyin( tau )
DO ina = 1, nat
na = nas(ina)
arg = ( xk1 * tau(1,na) + xk2 * tau(2,na) &
+ xk3 * tau(3,na) ) * tpi
phase(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
ENDDO
!
!$acc parallel loop collapse(2)
DO ina = 1, nat
DO ig = 1, npw
!
na = nas(ina)
iig = igk(ig)
mil1 = mill(1,iig)
mil2 = mill(2,iig)
mil3 = mill(3,iig)
sk(ig,na) = eigts1(mil1,na) * &
eigts2(mil2,na) * &
eigts3(mil3,na) * phase(na)
ENDDO
ENDDO
!
!$acc end data
!
ALLOCATE( ihv(nat*nhm), nav(nat*nhm) )
!$acc data create( ihv, nav )
!
ikb_t = 0
DO ina = 1, nat
na = nas(ina)
nht = nh(ityp(na))
!$acc kernels
DO ih = 1, nht
ihv(ikb_t+ih) = ih
nav(ikb_t+ih) = na
ENDDO
!$acc end kernels
ikb_t = ikb_t + nht
ENDDO
!
!$acc parallel loop collapse(2) copyin(ityp,indv,nhtol,nhtolm)
DO ikb = 1, ikb_t
DO ig = 1, npw
ih = ihv(ikb)
na = nav(ikb)
nt = ityp(na)
nb = indv(ih,nt)
l = nhtol(ih,nt)
lm = nhtolm(ih,nt)
pref = (0._DP,-1._DP)**l
!
dvkb(ig,ikb) = CMPLX(vkb0(ig,nb,nt)) * sk(ig,na) * &
CMPLX(dylm_u(ig,lm)) * pref / CMPLX(tpiba)
ENDDO
ENDDO
!
!$acc end data
!$acc end data
DEALLOCATE( ihv, nav )
DEALLOCATE( phase, nas )
DEALLOCATE( sk )
DEALLOCATE( vkb0, dylm_u, gk )
!
IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 )
!
!$acc end data
DEALLOCATE( dylm_u, vkb0 )
!
!$acc end data
!
RETURN
!
END SUBROUTINE gen_us_dy_base
!

View File

@ -1,253 +0,0 @@
!
! Copyright (C) 2021 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------
SUBROUTINE gen_us_dy_gpu_ ( npw, npwx, igk_d, xk, nat, tau, ityp, ntyp, &
tpiba, omega, nr1, nr2, nr3, eigts1_d, eigts2_d, eigts3_d, &
mill_d, g_d, u, dvkb_d )
!----------------------------------------------------------------------
!! Calculates the Kleinman-Bylander pseudopotentials with the
!! derivative of the spherical harmonics projected on vector u.
!
! AF: more extensive use of GPU-resident vars possible
!
USE upf_kinds, ONLY: dp
USE upf_const, ONLY: tpi
USE uspp, ONLY: nkb, indv_d, nhtol_d, nhtolm_d
USE uspp_data, ONLY: nqx, tab, tab_d, dq
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
USE device_fbuff_m, ONLY: dev_buf
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: npw
!! number of plane waves
INTEGER, INTENT(IN) :: npwx
!! max number of plane waves across k-points
INTEGER, INTENT(IN) :: igk_d(npw)
!! indices of plane waves k+G
REAL(dp), INTENT(IN) :: xk(3)
!! k-point
INTEGER, INTENT(IN) :: nat
!! number of atoms
INTEGER, INTENT(IN) :: ityp(nat)
!! index of type per atom
INTEGER, INTENT(IN) :: ntyp
!! number of atomic types
REAL(DP), INTENT(IN) :: tau(3,nat)
!! atomic positions (cc alat units)
REAL(DP), INTENT(IN) :: tpiba
!! rec.lattice units 2pi/a
REAL(DP), INTENT(IN) :: omega
!! cell volume
INTEGER, INTENT(IN) :: nr1,nr2,nr3
!! fft dims (dense grid)
COMPLEX(DP), INTENT(IN) :: eigts1_d(-nr1:nr1,nat)
!! structure factor 1
COMPLEX(DP), INTENT(IN) :: eigts2_d(-nr2:nr2,nat)
!! structure factor 2
COMPLEX(DP), INTENT(IN) :: eigts3_d(-nr3:nr3,nat)
!! structure factor 3
INTEGER, INTENT(IN) :: mill_d(3,*)
!! miller index map
REAL(DP), INTENT(IN) :: g_d(3,*)
!! g vectors (2pi/a units)
REAL(DP), INTENT(IN) :: u(3)
!! input: projection vector
COMPLEX(DP), INTENT(OUT) :: dvkb_d(npwx, nkb)
!! the beta function pseudopotential
!
! ... local variables
!
INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, &
i3, ig, nbm, iq, mil1, mil2, mil3, ikb_t, &
nht, ina, lmx2
INTEGER :: nas(nat), ierr(4)
!
INTEGER, ALLOCATABLE :: ityp_d(:), ih_d(:), na_d(:), nas_d(:)
!
REAL(DP), ALLOCATABLE :: q(:), dylm(:,:)
!
REAL(DP), POINTER :: gk_d(:,:)
REAL(DP), POINTER :: vkb0_d(:,:,:), dylm_u_d(:,:), dylm_d(:,:,:)
REAL(DP), ALLOCATABLE :: q_d(:), tau_d(:,:)
! dylm = d Y_lm/dr_i in cartesian axes
! dylm_u as above projected on u
COMPLEX(DP), ALLOCATABLE :: phase_d(:), sk_d(:,:)
!
REAL(DP) :: px, ux, vx, wx, arg, u_ipol1, u_ipol2, u_ipol3, xk1, xk2, xk3
COMPLEX(DP) :: pref
!
#if defined(__CUDA)
attributes(DEVICE) :: igk_d, mill_d, eigts1_d, eigts2_d, eigts3_d, g_d
attributes(DEVICE) :: gk_d, q_d, sk_d, vkb0_d, dylm_u_d, dylm_d, &
ityp_d, phase_d, ih_d, na_d, tau_d, nas_d
attributes(DEVICE) :: dvkb_d
!
dvkb_d = (0._DP,0._DP)
!
IF (lmaxkb <= 0) RETURN
!
lmx2 = (lmaxkb+1)**2
!
CALL dev_buf%lock_buffer( dylm_u_d, (/ npw,lmx2 /), ierr(1) )
CALL dev_buf%lock_buffer( vkb0_d, (/ npw,nbetam,ntyp /), ierr(2) )
CALL dev_buf%lock_buffer( gk_d, (/ 3,npw /), ierr(3) )
IF (ANY(ierr /= 0)) CALL upf_error( 'gen_us_dy_gpu', 'cannot allocate buffers', ABS(ierr) )
ALLOCATE( q_d(npw) )
!
xk1 = xk(1)
xk2 = xk(2)
xk3 = xk(3)
!
!$cuf kernel do <<<*,*>>>
DO ig = 1, npw
iig = igk_d(ig)
gk_d(1,ig) = xk1 + g_d(1,iig)
gk_d(2,ig) = xk2 + g_d(2,iig)
gk_d(3,ig) = xk3 + g_d(3,iig)
q_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2
ENDDO
!
CALL dev_buf%lock_buffer( dylm_d, (/npw,lmx2,3/), ierr(4) )
DO ipol = 1, 3
CALL dylmr2_gpu( lmx2, npw, gk_d, q_d, dylm_d(:,:,ipol), ipol )
ENDDO
!
u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3)
!
!$cuf kernel do (2) <<<*,*>>>
DO lm = 1, lmx2
DO ig = 1, npw
dylm_u_d(ig,lm) = u_ipol1*dylm_d(ig,lm,1) + &
u_ipol2*dylm_d(ig,lm,2) + &
u_ipol3*dylm_d(ig,lm,3)
ENDDO
ENDDO
CALL dev_buf%release_buffer( dylm_d, ierr(4) )
!
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, npw
q_d(ig) = SQRT(q_d(ig)) * tpiba
ENDDO
!
!
DO nt = 1, ntyp
nbm = upf(nt)%nbeta
!$cuf kernel do (2) <<<*,*>>>
DO nb = 1, nbm
DO ig = 1, npw
px = q_d(ig)/dq - DBLE(INT(q_d(ig)/dq))
ux = 1._DP - px
vx = 2._DP - px
wx = 3._DP - px
i0 = INT(q_d(ig)/dq) + 1
i1 = i0 + 1
i2 = i0 + 2
i3 = i0 + 3
vkb0_d(ig,nb,nt) = tab_d(i0,nb,nt) * ux * vx * wx / 6._DP + &
tab_d(i1,nb,nt) * px * vx * wx / 2._DP - &
tab_d(i2,nb,nt) * px * ux * wx / 2._DP + &
tab_d(i3,nb,nt) * px * ux * vx / 6._DP
ENDDO
ENDDO
ENDDO
!
DEALLOCATE( q_d )
!
ALLOCATE( ityp_d(nat), nas_d(nat) )
ityp_d = ityp
ALLOCATE( tau_d(3,nat) )
tau_d = tau
ALLOCATE( phase_d(nat) )
!
ina = 0
DO nt = 1, ntyp
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
ina = ina + 1
nas(ina) = na
ENDIF
ENDDO
ENDDO
nas_d = nas
!
!$cuf kernel do (1) <<<*,*>>>
DO ina = 1, nat
na = nas_d(ina)
arg = (xk1 * tau_d(1,na) + xk2 * tau_d(2,na) &
+ xk3 * tau_d(3,na) ) * tpi
phase_d(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
ENDDO
!
DEALLOCATE( tau_d )
!
ALLOCATE( sk_d(npw,nat) )
!
!$cuf kernel do (2) <<<*,*>>>
DO ina = 1, nat
DO ig = 1, npw
!
na = nas_d(ina)
iig = igk_d(ig)
mil1 = mill_d(1,iig)
mil2 = mill_d(2,iig)
mil3 = mill_d(3,iig)
sk_d(ig,na) = eigts1_d(mil1,na) * &
eigts2_d(mil2,na) * &
eigts3_d(mil3,na) * phase_d(na)
ENDDO
ENDDO
!
DEALLOCATE( phase_d )
!
ALLOCATE( ih_d(nat*nhm), na_d(nat*nhm) )
!
ikb_t = 0
DO ina = 1, nat
na = nas(ina)
nht = nh(ityp(na))
!$cuf kernel do (1) <<<*,*>>>
DO ih = 1, nht
ih_d(ikb_t+ih) = ih
na_d(ikb_t+ih) = na
ENDDO
ikb_t = ikb_t + nht
ENDDO
!
!$cuf kernel do (2) <<<*,*>>>
DO ikb = 1, ikb_t
DO ig = 1, npw
ih = ih_d(ikb)
na = na_d(ikb)
nt = ityp_d(na)
nb = indv_d(ih,nt)
l = nhtol_d(ih,nt)
lm = nhtolm_d(ih,nt)
pref = (0._DP, -1._DP)**l
!
dvkb_d(ig,ikb) = CMPLX(vkb0_d(ig,nb,nt)) * sk_d(ig,na) * &
CMPLX(dylm_u_d(ig,lm)) * pref / CMPLX(tpiba)
ENDDO
ENDDO
!
DEALLOCATE( sk_d )
!
IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 )
!
CALL dev_buf%release_buffer( dylm_u_d, ierr(1) )
CALL dev_buf%release_buffer( vkb0_d, ierr(2) )
CALL dev_buf%release_buffer( gk_d, ierr(3) )
!
DEALLOCATE( ih_d, na_d, nas_d )
#endif
!
RETURN
!
END SUBROUTINE gen_us_dy_gpu_
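The file removed above was a CUDA-Fortran duplicate of gen_us_dy; after this merge the single OpenACC version in gen_us_dy_base serves both CPU and GPU builds. The recurring porting pattern, sketched here with an illustrative array a(n) rather than code from the patch:

   ! before: CUDA Fortran, separate device mirror required
   REAL(DP), DEVICE :: a_d(n)
   !$cuf kernel do (1) <<<*,*>>>
   DO i = 1, n
      a_d(i) = 2._DP*a_d(i)
   ENDDO

   ! after: portable OpenACC, one array mapped to the device
   !$acc parallel loop present(a)
   DO i = 1, n
      a(i) = 2._DP*a(i)
   ENDDO

On a CPU-only build the !$acc directives reduce to plain loops, which is why the separate *_gpu_ routine, its *_d mirrors, and the #if defined(__CUDA) guards can all be dropped.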

View File

@ -14,7 +14,7 @@ module m_gth
!
private
public :: gth_parameters, readgth, vloc_gth, dvloc_gth, &
dvloc_gth_gpu, setlocq_gth, mk_ffnl_gth, mk_dffnl_gth, deallocate_gth
setlocq_gth, mk_ffnl_gth, mk_dffnl_gth, deallocate_gth
!
type gth_parameters
integer :: itype, lloc, lmax
@ -340,94 +340,9 @@ subroutine vloc_gth(itype, zion, tpiba2, ngl, gl, omega, vloc)
vloc (:) = vloc(:) * fact
!
end subroutine vloc_gth
!
!-----------------------------------------------------------------------
subroutine dvloc_gth(itype, zion, tpiba2, ngl, gl, omega, dvloc)
!-----------------------------------------------------------------------
!
! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
!
USE upf_kinds, ONLY: dp
USE upf_const, ONLY: pi, tpi, e2, eps8
implicit none
!
! I/O
integer, intent(in) :: itype, ngl
real(dp), intent(in) :: zion, tpiba2, omega, gl (ngl)
real(dp), intent(out) :: dvloc (ngl)
!
! Local variables
integer :: ii, my_gth, igl, igl0
real(dp) :: cc1, cc2, cc3, cc4, rloc, &
gx, gx2, gx3, rl2, rl3, rq2, r2q, r4g3, r6g5, e_rq2h, fact
!
! IF ( do_comp_esm ) call upf_error('vloc_gth', 'ESM not implemented', itype)
!
! Find the GTH parameter set for type itype
my_gth=0
do ii=1,size(gth_p)
if (gth_p(ii)%itype==itype) then
my_gth=ii
exit
endif
enddo
if (my_gth==0) call upf_error('dvloc_gth', 'cannot map itype in some gtp param. set', itype)
rloc=gth_p(my_gth)%rloc
cc1=gth_p(my_gth)%cc(1)
cc2=gth_p(my_gth)%cc(2)
cc3=gth_p(my_gth)%cc(3)
cc4=gth_p(my_gth)%cc(4)
! Compute vloc(q)
if (gl (1) < eps8) then
!
! first the G=0 term
!
dvloc (1) = 0._dp
igl0 = 2
else
igl0 = 1
endif
!
! here the G<>0 terms, we first compute the part of the integrand
! function independent of |G| in real space
!
do igl = igl0, ngl
gx = sqrt (gl (igl) * tpiba2)
gx2 = gx**2
gx3 = gx*gx2
rl2 = rloc**2
rl3 = rloc*rl2
rq2 = gx2*rl2
r2q = gx*rl2
r4g3 = rl2*rl2*gx3
r6g5 = r4g3*rl2*gx2
e_rq2h = exp(-0.5_dp*rq2)
dvloc (igl) = &
e_rq2h*(zion*(rq2+2._dp)/gx3 + sqrt(pi/2._dp)*rl3* &
( &
( &
- 2._dp*r2q* (cc2+10._dp*cc3+105._dp*cc4) &
+ 4._dp*r4g3*(cc3+21._dp*cc4) &
- 6._dp*r6g5* cc4 &
) - r2q*( &
cc1 + &
cc2*(3._dp-rq2) + &
cc3*(15._dp-10._dp*rq2+rq2**2) + &
cc4*(105._dp-rq2*(105._dp-rq2*(21._dp-rq2))) &
) &
) &
)/gx
enddo
!
fact = tpi * e2 / omega
dvloc (:) = dvloc(:) * fact
!
end subroutine dvloc_gth
!
!
!-------------------------------------------------------------------------------
SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
subroutine dvloc_gth( itype, zion, tpiba2, ngl, gl, omega, dvloc )
!------------------------------------------------------------------------------
!! GPU version of 'dvloc_gth' from 'Modules/gth.f90'
!! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
@ -435,34 +350,31 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
USE upf_kinds, ONLY : DP
USE upf_const, ONLY : pi, tpi, e2, eps8
!
IMPLICIT NONE
implicit none
!
! I/O
INTEGER, INTENT(IN) :: itype, ngl
REAL(DP), INTENT(IN) :: zion, tpiba2, omega
!
REAL(DP), INTENT(IN) :: gl_d(ngl)
REAL(DP), INTENT(OUT) :: dvloc_d(ngl)
integer, intent(in) :: itype, ngl
real(dp), intent(in) :: zion, tpiba2, omega
real(dp), intent(in) :: gl(ngl)
real(dp), intent(out) :: dvloc(ngl)
!
! Local variables
INTEGER :: ii, my_gth, igl, igl0
REAL(DP) :: cc1, cc2, cc3, cc4, rloc, gl1, &
integer :: ii, my_gth, igl, igl0
real(dp) :: cc1, cc2, cc3, cc4, rloc, gl1, &
gx, gx2, gx3, rl2, rl3, rq2, r2q, r4g3, r6g5, e_rq2h, fact
!
#if defined(__CUDA)
attributes(DEVICE) :: dvloc_d, gl_d
#endif
!$acc data present( dvloc, gl )
!
! IF ( do_comp_esm ) call upf_error('vloc_gth', 'ESM not implemented', itype)
!
! Find the GTH parameter set for type itype
my_gth = 0
DO ii = 1, SIZE(gth_p)
IF (gth_p(ii)%itype==itype) THEN
do ii = 1, SIZE(gth_p)
if (gth_p(ii)%itype==itype) then
my_gth = ii
EXIT
ENDIF
ENDDO
exit
endif
enddo
!
IF ( my_gth==0 ) CALL upf_error( 'dvloc_gth', 'cannot map itype in some gtp param. set', itype )
rloc = gth_p(my_gth)%rloc
@ -472,25 +384,27 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
cc4 = gth_p(my_gth)%cc(4)
!
! Compute vloc(q)
gl1 = gl_d(1)
IF (gl1 < eps8) THEN
gl1 = gl(1)
if (gl1 < eps8) then
!
! first the G=0 term
!
dvloc_d(1) = 0.0_DP
!$acc kernels
dvloc(1) = 0.0_DP
!$acc end kernels
igl0 = 2
ELSE
else
igl0 = 1
ENDIF
endif
!
! here the G<>0 terms, we first compute the part of the integrand
! function independent of |G| in real space
!
fact = tpi * e2 / omega
!
!$cuf kernel do (1) <<<*,*>>>
DO igl = igl0, ngl
gx = SQRT(gl_d(igl) * tpiba2)
!$acc parallel loop
do igl = igl0, ngl
gx = SQRT(gl(igl) * tpiba2)
gx2 = gx**2
gx3 = gx*gx2
rl2 = rloc**2
@ -500,7 +414,7 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
r4g3 = rl2*rl2*gx3
r6g5 = r4g3*rl2*gx2
e_rq2h = EXP(-0.5_DP*rq2)
dvloc_d(igl) = fact * &
dvloc(igl) = fact * &
e_rq2h*(zion*(rq2+2._DP)/gx3 + SQRT(pi/2._DP)*rl3* &
( &
( &
@ -515,11 +429,11 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
) &
) &
)/gx
ENDDO
enddo
!
!$acc end data
!
END SUBROUTINE dvloc_gth_gpu
!
end subroutine dvloc_gth
!
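For reference, the quantity returned by dvloc_gth is (reconstructed here from the coefficients visible in the loop; the analytic form is the Goedecker-Teter-Hutter local potential):

   dvloc(g^2) = \frac{\partial V_{\rm loc}}{\partial g^2}
              = \frac{1}{2g}\,\frac{\partial V_{\rm loc}}{\partial g},

with, in reciprocal space,

   V_{\rm loc}(g) = e^{-g^2 r_{\rm loc}^2/2}\Big[ -\frac{4\pi e^2 Z_{\rm ion}}{\Omega g^2}
                  + \sqrt{8\pi^3}\,\frac{r_{\rm loc}^3}{\Omega}\big( C_1
                  + C_2\,(3 - g^2 r_{\rm loc}^2)
                  + C_3\,(15 - 10\,g^2 r_{\rm loc}^2 + g^4 r_{\rm loc}^4)
                  + C_4\,(105 - 105\,g^2 r_{\rm loc}^2 + 21\,g^4 r_{\rm loc}^4 - g^6 r_{\rm loc}^6)\big)\Big].

The only functional change in this hunk is that the 2*pi*e2/omega prefactor (fact) is now applied inside the OpenACC loop instead of in a second pass over dvloc.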
!-----------------------------------------------------------------------
subroutine setlocq_gth(itype, xq, zion, tpiba2, ngm, g, omega, vloc)

View File

@ -30,8 +30,8 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm )
USE atom, ONLY : rgrid
USE uspp, ONLY : nhtol, nhtoj, nhtolm, ijtoh, dvan, qq_at, qq_nt, indv, &
ap, aainit, qq_so, dvan_so, okvan, ofsbeta, &
nhtol_d, nhtoj_d, nhtolm_d, ijtoh_d, dvan_d, qq_at_d, &
qq_nt_d, indv_d, qq_so_d, dvan_so_d, ofsbeta_d
nhtol_d, nhtoj_d, nhtolm_d, ijtoh_d, dvan_d, &
qq_nt_d, indv_d, dvan_so_d, ofsbeta_d
USE uspp_param, ONLY : upf, lmaxq, nh, nhm, lmaxkb, nsp
USE upf_spinorb, ONLY : is_spinorbit, rot_ylm, fcoef, fcoef_d, lmaxx
USE paw_variables,ONLY : okpaw
@ -320,10 +320,8 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm )
nhtolm_d=nhtolm
nhtoj_d=nhtoj
ijtoh_d=ijtoh
qq_at_d=qq_at
qq_nt_d=qq_nt
if (is_spinorbit) then
qq_so_d=qq_so
dvan_so_d=dvan_so
fcoef_d=fcoef
else
@ -333,6 +331,13 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm )
ofsbeta_d=ofsbeta
!
#endif
!
if (nhm>0) then
!$acc update device(qq_at)
if (is_spinorbit) then
!$acc update device(qq_so)
endif
endif
!
call stop_clock ('init_us_1')
return
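The hunk above replaces assignments to CUDA mirror arrays (qq_at_d = qq_at, qq_so_d = qq_so) with updates of OpenACC device images that are created once, at allocation time, in allocate_uspp. The nhm>0 guard presumably protects against the zero-sized case, where there is nothing mapped to update. A schematic sketch of the update semantics (shapes illustrative):

   REAL(DP), ALLOCATABLE :: qq(:,:)
   ALLOCATE( qq(nhm,nhm) )
   !$acc enter data create(qq)   ! device image allocated, contents undefined
   qq = 0._DP                    ! computed on the host as before
   !$acc update device(qq)       ! host -> device copy; no reallocation

An update directive only copies data; the mapping itself lives from enter data create to exit data delete, so no per-call allocation cost is paid here.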

View File

@ -7,6 +7,7 @@
!
!-----------------------------------------------------------------------
SUBROUTINE simpson(mesh, func, rab, asum)
!$acc routine vector
!-----------------------------------------------------------------------
!
! Simpson's rule integration. On input:
@ -22,27 +23,30 @@ SUBROUTINE simpson(mesh, func, rab, asum)
!
USE upf_kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(in) :: mesh
real(DP), INTENT(in) :: rab (mesh), func (mesh)
INTEGER, INTENT(in) :: mesh
real(DP), INTENT(in) :: rab(mesh), func(mesh)
real(DP), INTENT(out):: asum
!
real(DP) :: f1, f2, f3, r12
real(DP) :: f1, f2, f3, r12, fct
INTEGER :: i
!
asum = 0.0d0
r12 = 1.0d0 / 3.0d0
f3 = func (1) * rab (1) * r12
DO i = 2, mesh - 1, 2
f1 = f3
f2 = func (i) * rab (i) * r12
f3 = func (i + 1) * rab (i + 1) * r12
asum = asum + f1 + 4.0d0 * f2 + f3
!
!$acc loop vector reduction(+:asum)
DO i = 2, mesh-1
fct = DBLE(ABS(MOD(i,2)-2)*2)
asum = asum + fct * func(i) * rab(i)
ENDDO
IF (MOD(mesh,2)==1) THEN
asum = (asum + func(1)*rab(1) + func(mesh)*rab(mesh)) * r12
ELSE
asum = (asum + func(1)*rab(1) - func(mesh-1)*rab(mesh-1)) * r12
ENDIF
!
! if mesh is not odd, use open formula instead:
! ... 2/3*f(n-5) + 4/3*f(n-4) + 13/12*f(n-3) + 0*f(n-2) + 27/12*f(n-1)
!!! Under testing
!** Under testing
!
!IF ( MOD(mesh,2) == 0 ) THEN
! print *, 'mesh even: correction:', f1*5.d0/4.d0-4.d0*f2+23.d0*f3/4.d0, &

View File

@ -8,11 +8,12 @@
!
!--------------------------------------------------------------------
subroutine sph_bes (msh, r, q, l, jl)
!$acc routine vector
!--------------------------------------------------------------------
!! Spherical Bessel function.
!
USE upf_kinds, only: DP
USE upf_const, ONLY : eps14
USE upf_const, only: eps14
!
implicit none
!
@ -29,8 +30,8 @@ subroutine sph_bes (msh, r, q, l, jl)
!
! xseries = convergence radius of the series for small x of j_l(x)
real(DP) :: x, xl, xseries = 0.05_dp
integer :: ir, ir0
integer, external:: semifact
integer :: i, ir, ir0
integer :: semifact
!
#if defined (__MASS)
real(DP) :: qr(msh), sin_qr(msh), cos_qr(msh)
@ -40,11 +41,17 @@ subroutine sph_bes (msh, r, q, l, jl)
if (abs (q) < eps14) then
if (l == -1) then
call upf_error ('sph_bes', 'j_{-1}(0) ?!?', 1)
stop !call upf_error ('sph_bes', 'j_{-1}(0) ?!?', 1)
elseif (l == 0) then
jl(:) = 1.d0
!$acc loop vector
do ir = 1, msh
jl(ir) = 1.d0
enddo
else
jl(:) = 0.d0
!$acc loop vector
do ir = 1, msh
jl(ir) = 0.d0
enddo
endif
return
end if
@ -52,7 +59,7 @@ subroutine sph_bes (msh, r, q, l, jl)
! case l=-1
if (l == - 1) then
if (abs (q * r (1) ) < eps14) call upf_error ('sph_bes', 'j_{-1}(0) ?!?',1)
if (abs (q * r (1) ) < eps14) stop !call upf_error ('sph_bes', 'j_{-1}(0) ?!?',1)
#if defined (__MASS)
@ -61,9 +68,10 @@ subroutine sph_bes (msh, r, q, l, jl)
jl = cos_qr / qr
#else
jl (:) = cos (q * r (:) ) / (q * r (:) )
!$acc loop vector
do ir = 1, msh
jl (ir) = cos (q * r (ir) ) / (q * r (ir) )
enddo
#endif
return
@ -75,6 +83,7 @@ subroutine sph_bes (msh, r, q, l, jl)
! notice that for small q it may happen that q*r(msh) < xseries !
ir0 = msh+1
!$acc loop vector
do ir = 1, msh
if ( abs (q * r (ir) ) > xseries ) then
ir0 = ir
@ -82,6 +91,7 @@ subroutine sph_bes (msh, r, q, l, jl)
end if
end do
!$acc loop vector
do ir = 1, ir0 - 1
x = q * r (ir)
if ( l == 0 ) then
@ -89,11 +99,18 @@ subroutine sph_bes (msh, r, q, l, jl)
else
xl = x**l
end if
jl (ir) = xl/semifact(2*l+1) * &
( 1.0_dp - x**2/1.0_dp/2.0_dp/(2.0_dp*l+3) * &
( 1.0_dp - x**2/2.0_dp/2.0_dp/(2.0_dp*l+5) * &
( 1.0_dp - x**2/3.0_dp/2.0_dp/(2.0_dp*l+7) * &
( 1.0_dp - x**2/4.0_dp/2.0_dp/(2.0_dp*l+9) ) ) ) )
!--
semifact = 1
!$acc loop seq reduction(*:semifact)
do i = 2*l+1, 1, -2
semifact = i*semifact
enddo
!---
jl (ir) = xl/DBLE(semifact) * &
( 1.0_dp - x**2/1.0_dp/2.0_dp/DBLE(2*l+3) * &
( 1.0_dp - x**2/2.0_dp/2.0_dp/DBLE(2*l+5) * &
( 1.0_dp - x**2/3.0_dp/2.0_dp/DBLE(2*l+7) * &
( 1.0_dp - x**2/4.0_dp/2.0_dp/DBLE(2*l+9) ) ) ) )
end do
! the following shouldn't be needed but do you trust compilers
@ -110,9 +127,10 @@ subroutine sph_bes (msh, r, q, l, jl)
jl (ir0:) = sin_qr(ir0:) / (q * r (ir0:) )
#else
jl (ir0:) = sin (q * r (ir0:) ) / (q * r (ir0:) )
!$acc loop vector
do ir = ir0, msh
jl (ir) = sin (q * r (ir) ) / (q * r (ir) )
enddo
#endif
elseif (l == 1) then
@ -126,10 +144,11 @@ subroutine sph_bes (msh, r, q, l, jl)
cos_qr(ir0:) ) / (q * r (ir0:) )
#else
jl (ir0:) = (sin (q * r (ir0:) ) / (q * r (ir0:) ) - &
cos (q * r (ir0:) ) ) / (q * r (ir0:) )
!$acc loop vector
do ir = ir0, msh
jl (ir) = (sin (q * r (ir) ) / (q * r (ir) ) - &
cos (q * r (ir) ) ) / (q * r (ir) )
enddo
#endif
elseif (l == 2) then
@ -143,10 +162,11 @@ subroutine sph_bes (msh, r, q, l, jl)
3.d0 * cos_qr(ir0:) ) / (q*r(ir0:))**2
#else
jl (ir0:) = ( (3.d0 / (q*r(ir0:)) - (q*r(ir0:)) ) * sin (q*r(ir0:)) - &
3.d0 * cos (q*r(ir0:)) ) / (q*r(ir0:))**2
!$acc loop vector
do ir = ir0, msh
jl (ir) = ( (3.d0 / (q*r(ir)) - (q*r(ir)) ) * sin (q*r(ir)) - &
3.d0 * cos (q*r(ir)) ) / (q*r(ir))**2
enddo
#endif
elseif (l == 3) then
@ -162,12 +182,13 @@ subroutine sph_bes (msh, r, q, l, jl)
(q*r(ir0:))**3
#else
jl (ir0:) = (sin (q*r(ir0:)) * &
(15.d0 / (q*r(ir0:)) - 6.d0 * (q*r(ir0:)) ) + &
cos (q*r(ir0:)) * ( (q*r(ir0:))**2 - 15.d0) ) / &
(q*r(ir0:)) **3
!$acc loop vector
do ir = ir0, msh
jl (ir) = (sin (q*r(ir)) * &
(15.d0 / (q*r(ir)) - 6.d0 * (q*r(ir)) ) + &
cos (q*r(ir)) * ( (q*r(ir))**2 - 15.d0) ) / &
(q*r(ir)) **3
enddo
#endif
elseif (l == 4) then
@ -184,12 +205,14 @@ subroutine sph_bes (msh, r, q, l, jl)
(q*r(ir0:))**5
#else
jl (ir0:) = (sin (q*r(ir0:)) * &
(105.d0 - 45.d0 * (q*r(ir0:))**2 + (q*r(ir0:))**4) + &
cos (q*r(ir0:)) * &
(10.d0 * (q*r(ir0:))**3 - 105.d0 * (q*r(ir0:))) ) / &
(q*r(ir0:))**5
!$acc loop vector
do ir = ir0, msh
jl (ir) = (sin (q*r(ir)) * &
(105.d0 - 45.d0 * (q*r(ir))**2 + (q*r(ir))**4) + &
cos (q*r(ir)) * &
(10.d0 * (q*r(ir))**3 - 105.d0 * (q*r(ir))) ) / &
(q*r(ir))**5
enddo
#endif
elseif (l == 5) then
@ -205,12 +228,15 @@ subroutine sph_bes (msh, r, q, l, jl)
(420.d0*sin_qr(ir0:)) / (q*r(ir0:)) ** 3 + &
( 15.d0*sin_qr(ir0:)) / (q*r(ir0:)) ) / (q*r(ir0:))
#else
jl (ir0:) = (-cos(q*r(ir0:)) - &
(945.d0*cos(q*r(ir0:))) / (q*r(ir0:)) ** 4 + &
(105.d0*cos(q*r(ir0:))) / (q*r(ir0:)) ** 2 + &
(945.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ** 5 - &
(420.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ** 3 + &
( 15.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ) / (q*r(ir0:))
!$acc loop vector
do ir = ir0, msh
jl (ir) = (-cos(q*r(ir)) - &
(945.d0*cos(q*r(ir))) / (q*r(ir)) ** 4 + &
(105.d0*cos(q*r(ir))) / (q*r(ir)) ** 2 + &
(945.d0*sin(q*r(ir))) / (q*r(ir)) ** 5 - &
(420.d0*sin(q*r(ir))) / (q*r(ir)) ** 3 + &
( 15.d0*sin(q*r(ir))) / (q*r(ir)) ) / (q*r(ir))
enddo
#endif
elseif (l == 6) then
@ -228,19 +254,21 @@ subroutine sph_bes (msh, r, q, l, jl)
( 4725.d0*sin_qr(ir0:)) / (q*r(ir0:))**4 + &
( 210.d0*sin_qr(ir0:)) / (q*r(ir0:))**2 ) / (q*r(ir0:))
#else
jl (ir0:) = ((-10395.d0*cos(q*r(ir0:))) / (q*r(ir0:))**5 + &
( 1260.d0*cos(q*r(ir0:))) / (q*r(ir0:))**3 - &
( 21.d0*cos(q*r(ir0:))) / (q*r(ir0:)) - &
sin(q*r(ir0:)) + &
( 10395.d0*sin(q*r(ir0:))) / (q*r(ir0:))**6 - &
( 4725.d0*sin(q*r(ir0:))) / (q*r(ir0:))**4 + &
( 210.d0*sin(q*r(ir0:))) / (q*r(ir0:))**2 ) / (q*r(ir0:))
!$acc loop vector
do ir = ir0, msh
jl (ir) = ((-10395.d0*cos(q*r(ir))) / (q*r(ir))**5 + &
( 1260.d0*cos(q*r(ir))) / (q*r(ir))**3 - &
( 21.d0*cos(q*r(ir))) / (q*r(ir)) - &
sin(q*r(ir)) + &
( 10395.d0*sin(q*r(ir))) / (q*r(ir))**6 - &
( 4725.d0*sin(q*r(ir))) / (q*r(ir))**4 + &
( 210.d0*sin(q*r(ir))) / (q*r(ir))**2 ) / (q*r(ir))
enddo
#endif
else
call upf_error ('sph_bes', 'not implemented', abs(l))
stop !call upf_error ('sph_bes', 'not implemented', abs(l))
endif
!
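Two device-portability changes run through this hunk: array-syntax assignments become explicit !$acc loop vector loops, and the small-x series now builds the semifactorial (2l+1)!! = 1*3*5*...*(2l+1) inline with a seq multiply-reduction, avoiding a call to the external semifact function from inside an !$acc routine. A standalone equivalent of that inlined product (name illustrative):

   INTEGER FUNCTION semifact_inline( n ) RESULT( s )
      !$acc routine seq
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: n
      INTEGER :: i
      s = 1
      DO i = n, 1, -2   ! n, n-2, ..., down to 1 (or 2)
         s = i*s
      ENDDO
   END FUNCTION semifact_inline

The stop statements replacing upf_error calls are a blunt but device-safe fallback, since an I/O-heavy error handler cannot run inside device code.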

View File

@ -0,0 +1,30 @@
!
! Copyright (C) 2001-2022 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
MODULE upf_acc_interfaces
!
INTERFACE sph_bes_acc
SUBROUTINE sph_bes( msh, r, q, l, jl )
!$acc routine vector
IMPLICIT NONE
INTEGER :: msh, l
REAL(8) :: r(msh), q, jl(msh)
END SUBROUTINE
END INTERFACE
!
INTERFACE
SUBROUTINE simpson( mesh, func, rab, asum )
!$acc routine vector
IMPLICIT NONE
INTEGER, INTENT(IN) :: mesh
REAL(8), INTENT(IN) :: rab(mesh), func(mesh)
REAL(8), INTENT(OUT):: asum
END SUBROUTINE
END INTERFACE
!
END MODULE
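This new module gives callers in other files an explicit interface carrying the !$acc routine vector directive; without it the compiler cannot generate device calls to sph_bes and simpson. A hedged usage sketch (routine name, q grid and integrand are illustrative, not from the patch):

   SUBROUTINE fill_tab_example( nq, msh, l, r, rab, tab1 )
      USE upf_acc_interfaces, ONLY: sph_bes_acc, simpson
      IMPLICIT NONE
      INTEGER, INTENT(IN)  :: nq, msh, l
      REAL(8), INTENT(IN)  :: r(msh), rab(msh)
      REAL(8), INTENT(OUT) :: tab1(nq)
      REAL(8) :: jl(msh), aux(msh), q
      INTEGER :: iq, ir
      !$acc data copyin(r, rab) copyout(tab1)
      !$acc parallel loop private(jl, aux, q)
      DO iq = 1, nq
         q = 0.01d0*DBLE(iq)                    ! illustrative q grid
         CALL sph_bes_acc( msh, r, q, l, jl )   ! device-callable Bessel
         !$acc loop vector
         DO ir = 1, msh
            aux(ir) = jl(ir)*r(ir)**2           ! illustrative integrand
         ENDDO
         CALL simpson( msh, aux, rab, tab1(iq) )  ! device-callable quadrature
      ENDDO
      !$acc end data
   END SUBROUTINE fill_tab_example

Note the generic name sph_bes_acc: it lets !$acc-annotated call sites coexist with the existing implicit-interface calls to sph_bes elsewhere in the code base.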

View File

@ -30,11 +30,10 @@ MODULE uspp
nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, &
becsum, ebecsum
PUBLIC :: lpx_d, lpl_d, ap_d, indv_d, nhtol_d, nhtolm_d, ofsbeta_d, &
dvan_d, deeq_d, qq_at_d, qq_nt_d, nhtoj_d, ijtoh_d, &
becsum_d, ebecsum_d
dvan_d, qq_nt_d, nhtoj_d, ijtoh_d, becsum_d, ebecsum_d
PUBLIC :: okvan, nlcc_any
PUBLIC :: qq_so, dvan_so, deeq_nc, fcoef
PUBLIC :: qq_so_d, dvan_so_d, deeq_nc_d, fcoef_d
PUBLIC :: dvan_so_d, fcoef_d
PUBLIC :: dbeta
!
PUBLIC :: allocate_uspp, deallocate_uspp
@ -110,16 +109,12 @@ MODULE uspp
REAL(DP), ALLOCATABLE :: becsum_d(:,:,:)
REAL(DP), ALLOCATABLE :: ebecsum_d(:,:,:)
REAL(DP), ALLOCATABLE :: dvan_d(:,:,:)
REAL(DP), ALLOCATABLE :: deeq_d(:,:,:,:)
REAL(DP), ALLOCATABLE :: qq_nt_d(:,:,:)
REAL(DP), ALLOCATABLE :: qq_at_d(:,:,:)
REAL(DP), ALLOCATABLE :: nhtoj_d(:,:)
COMPLEX(DP), ALLOCATABLE :: qq_so_d(:,:,:,:)
COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:)
COMPLEX(DP), ALLOCATABLE :: deeq_nc_d(:,:,:,:)
#if defined(__CUDA)
attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, deeq_d, qq_nt_d, &
qq_at_d, nhtoj_d, qq_so_d, dvan_so_d, deeq_nc_d
attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, qq_nt_d, &
nhtoj_d, dvan_so_d
#endif
!
@ -356,15 +351,19 @@ CONTAINS
allocate( nhtoj(nhm,nsp) )
allocate( ijtoh(nhm,nhm,nsp) )
allocate( deeq(nhm,nhm,nat,nspin) )
!$acc enter data create(deeq)
if ( noncolin ) then
allocate( deeq_nc(nhm,nhm,nat,nspin) )
!$acc enter data create(deeq_nc)
endif
allocate( qq_at(nhm,nhm,nat) )
!$acc enter data create(qq_at)
allocate( qq_nt(nhm,nhm,nsp) )
! set the internal spin-orbit flag
is_spinorbit = lspinorb
if ( lspinorb ) then
allocate( qq_so(nhm,nhm,4,nsp) )
!$acc enter data create(qq_so)
allocate( dvan_so(nhm,nhm,nspin,nsp) )
allocate( fcoef(nhm,nhm,2,2,nsp) )
else
@ -386,14 +385,8 @@ CONTAINS
allocate( nhtolm_d(nhm,nsp) )
allocate( nhtoj_d(nhm,nsp) )
allocate( ijtoh_d(nhm,nhm,nsp) )
allocate( deeq_d(nhm,nhm,nat,nspin) )
if ( noncolin ) then
allocate( deeq_nc_d(nhm,nhm,nat,nspin) )
endif
allocate( qq_at_d(nhm,nhm,nat) )
allocate( qq_nt_d(nhm,nhm,nsp) )
if ( lspinorb ) then
allocate( qq_so_d(nhm,nhm,4,nsp) )
allocate( dvan_so_d(nhm,nhm,nspin,nsp) )
allocate( fcoef_d(nhm,nhm,2,2,nsp) )
else
@ -419,22 +412,34 @@ CONTAINS
IF( ALLOCATED( indv ) ) DEALLOCATE( indv )
IF( ALLOCATED( nhtolm ) ) DEALLOCATE( nhtolm )
IF( ALLOCATED( nhtoj ) ) DEALLOCATE( nhtoj )
IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta )
IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta )
IF( ALLOCATED( ijtoh ) ) DEALLOCATE( ijtoh )
!FIXME: vkb could be created and deleted automatically by using !$acc declare create(vkb) in the module declaration
IF( ALLOCATED( vkb ) ) THEN
!$acc exit data delete(vkb )
!$acc exit data delete(vkb )
DEALLOCATE( vkb )
END IF
IF( ALLOCATED( becsum ) ) DEALLOCATE( becsum )
IF( ALLOCATED( ebecsum ) ) DEALLOCATE( ebecsum )
IF( ALLOCATED( qq_at ) ) DEALLOCATE( qq_at )
IF( ALLOCATED( qq_at ) ) THEN
!$acc exit data delete( qq_at )
DEALLOCATE( qq_at )
ENDIF
IF( ALLOCATED( qq_nt ) ) DEALLOCATE( qq_nt )
IF( ALLOCATED( dvan ) ) DEALLOCATE( dvan )
IF( ALLOCATED( deeq ) ) DEALLOCATE( deeq )
IF( ALLOCATED( qq_so ) ) DEALLOCATE( qq_so )
IF( ALLOCATED( deeq ) ) THEN
!$acc exit data delete( deeq )
DEALLOCATE( deeq )
ENDIF
IF( ALLOCATED( qq_so ) ) THEN
!$acc exit data delete( qq_so )
DEALLOCATE( qq_so )
ENDIF
IF( ALLOCATED( dvan_so ) ) DEALLOCATE( dvan_so )
IF( ALLOCATED( deeq_nc ) ) DEALLOCATE( deeq_nc )
IF( ALLOCATED( deeq_nc ) ) THEN
!$acc exit data delete( deeq_nc )
DEALLOCATE( deeq_nc )
ENDIF
IF( ALLOCATED( fcoef ) ) DEALLOCATE( fcoef )
IF( ALLOCATED( beta ) ) DEALLOCATE( beta )
IF( ALLOCATED( dbeta ) ) DEALLOCATE( dbeta )
@ -451,13 +456,9 @@ CONTAINS
IF( ALLOCATED( becsum_d ) ) DEALLOCATE( becsum_d )
IF( ALLOCATED( ebecsum_d ) ) DEALLOCATE( ebecsum_d )
IF( ALLOCATED( dvan_d ) ) DEALLOCATE( dvan_d )
IF( ALLOCATED( deeq_d ) ) DEALLOCATE( deeq_d )
IF( ALLOCATED( qq_nt_d ) ) DEALLOCATE( qq_nt_d )
IF( ALLOCATED( qq_at_d ) ) DEALLOCATE( qq_at_d )
IF( ALLOCATED( nhtoj_d ) ) DEALLOCATE( nhtoj_d )
IF( ALLOCATED( qq_so_d ) ) DEALLOCATE( qq_so_d )
IF( ALLOCATED( dvan_so_d ) ) DEALLOCATE( dvan_so_d )
IF( ALLOCATED( deeq_nc_d ) ) DEALLOCATE( deeq_nc_d )
IF( ALLOCATED( fcoef_d ) ) DEALLOCATE( fcoef_d )
!
END SUBROUTINE deallocate_uspp
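The allocation and deallocation changes above all follow one lifecycle for the arrays that lost their *_d mirrors (deeq, deeq_nc, qq_at, qq_so): the device image is created immediately after the host ALLOCATE and must be unmapped before the host DEALLOCATE, otherwise the OpenACC present table keeps a dangling entry. Condensed, using deeq as the example:

   ALLOCATE( deeq(nhm,nhm,nat,nspin) )
   !$acc enter data create(deeq)     ! device image for the whole lifetime
   ...
   !$acc update device(deeq)         ! push host values whenever they change
   ...
   IF ( ALLOCATED( deeq ) ) THEN
      !$acc exit data delete(deeq)   ! unmap first
      DEALLOCATE( deeq )             ! then free the host array
   ENDIF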