mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'stress_acc' into 'develop'
Stress Acc See merge request QEF/q-e!1945
This commit is contained in:
commit
6338722054
|
@ -184,9 +184,9 @@
|
|||
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
|
||||
!
|
||||
END IF
|
||||
!NOTE g and mill already allocate in the device they are initialized below.
|
||||
!$acc data present(g, mill)
|
||||
!$acc update device(g,mill)
|
||||
!NOTE g, gg and mill already allocate in the device they are initialized below.
|
||||
!$acc data present(g,gg,mill)
|
||||
!$acc update device(g,gg,mill)
|
||||
!$acc end data
|
||||
!
|
||||
CALL gshells (.TRUE.)
|
||||
|
@ -429,13 +429,13 @@
|
|||
! re-calculate G-vectors and kinetic energy
|
||||
!
|
||||
dfftp_ngm = dfftp%ngm
|
||||
!$acc parallel loop present(g, mill) copyin(bg) copyout(gg)
|
||||
!$acc parallel loop present(g,gg,mill) copyin(bg)
|
||||
do ig = 1, dfftp_ngm
|
||||
g(:,ig)= mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3)
|
||||
gg(ig)=g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
|
||||
enddo
|
||||
!$acc end parallel loop
|
||||
!$acc update host(g)
|
||||
!$acc end parallel loop
|
||||
!$acc update host(g,gg)
|
||||
!
|
||||
call g2kin_init ( gg, tpiba2 )
|
||||
!
|
||||
|
|
|
@ -357,7 +357,7 @@
|
|||
|
||||
!
|
||||
|
||||
DEV_ACC data present(rhoeg, rhops, mill,g ) copy(fion) create(rp(1:s_ngm_)) copyin(sfac, screen_coul, gg, vps, ityp,ei1, ei2, ei3)
|
||||
DEV_ACC data present(rhoeg, rhops, mill,g,gg ) copy(fion) create(rp(1:s_ngm_)) copyin(sfac, screen_coul, vps, ityp,ei1, ei2, ei3)
|
||||
!
|
||||
DEV_OMP parallel default(none) &
|
||||
DEV_OMP shared(gstart, dffts,sfac, rhops, screen_coul, rhoeg, nsp, gg, tpiba2, tpiba, mill, g, &
|
||||
|
|
|
@ -212,7 +212,8 @@ DEV_ACC enter data create(drhot(1:p_ngm_, 1:6))
|
|||
!
|
||||
zpseu = 0.0_DP
|
||||
!
|
||||
DEV_ACC data copyin(rhog,drhog,ht,sfac,vps,gg,rhops) copyout(vtemp)
|
||||
DEV_ACC update device(gg)
|
||||
DEV_ACC data copyin(rhog,drhog,ht,sfac,vps,rhops) copyout(vtemp)
|
||||
DEV_OMP parallel default(shared), private(ig,is,ij,i,j,k)
|
||||
!
|
||||
DEV_OMP do
|
||||
|
@ -337,7 +338,7 @@ DEV_ACC loop vector reduction(+:x_tmp)
|
|||
DEV_ACC end parallel
|
||||
!
|
||||
DEV_OMP do
|
||||
DEV_ACC parallel loop present(rhotmp)
|
||||
DEV_ACC parallel loop present(rhotmp,gg)
|
||||
DO ig = gstart, p_ngm_
|
||||
vtemp(ig) = CONJG( rhotmp( ig ) ) * rhotmp( ig ) / gg( ig )
|
||||
END DO
|
||||
|
@ -402,7 +403,7 @@ DEV_ACC kernels
|
|||
DEV_ACC end kernels
|
||||
|
||||
!
|
||||
DEV_ACC parallel loop present(rhotmp)
|
||||
DEV_ACC parallel loop present(rhotmp,gg)
|
||||
!
|
||||
DEV_OMP parallel default(shared), private(ig,is)
|
||||
DEV_OMP do
|
||||
|
|
|
@ -120,7 +120,7 @@
|
|||
ALLOCATE( igtongl_d(ngm) )
|
||||
ALLOCATE( gl_d(ngm) )
|
||||
ENDIF
|
||||
!$acc enter data create( mill(1:3, 1:ngm), g(1:3, 1:ngm) )
|
||||
!$acc enter data create( mill(1:3,1:ngm), g(1:3,1:ngm), gg(1:ngm) )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
@ -142,7 +142,10 @@
|
|||
END IF
|
||||
!
|
||||
!
|
||||
IF( ALLOCATED( gg ) ) DEALLOCATE( gg )
|
||||
IF( ALLOCATED( gg ) ) THEN
|
||||
!$acc exit data delete(gg)
|
||||
DEALLOCATE( gg )
|
||||
END IF
|
||||
IF( ALLOCATED( g ) ) THEN
|
||||
!$acc exit data delete(g)
|
||||
DEALLOCATE( g )
|
||||
|
|
|
@ -29,11 +29,7 @@ subroutine drhodv (nu_i0, nper, drhoscf)
|
|||
USE cell_base, ONLY : tpiba
|
||||
USE lsda_mod, ONLY : current_spin, lsda, isk, nspin
|
||||
USE wvfct, ONLY : npwx, nbnd
|
||||
#if defined(__CUDA)
|
||||
USE uspp, ONLY : nkb, vkb, deeq_nc, deeq_nc_d, okvan
|
||||
#else
|
||||
USE uspp, ONLY : nkb, vkb, deeq_nc, okvan
|
||||
#endif
|
||||
USE becmod, ONLY : calbec, bec_type, becscal, allocate_bec_type, &
|
||||
deallocate_bec_type
|
||||
USE fft_base, ONLY : dfftp
|
||||
|
@ -142,18 +138,14 @@ subroutine drhodv (nu_i0, nper, drhoscf)
|
|||
ELSE
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,2)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
int1_nc(:,:,:,:,:)=int1_nc_save(:,:,:,:,:,2)
|
||||
ENDIF
|
||||
call drhodvnl (ik, ikk, nper, nu_i0, dynwrk, becpt, alphapt, &
|
||||
dbecq, dalpq)
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,1)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
int1_nc(:,:,:,:,:)=int1_nc_save(:,:,:,:,:,1)
|
||||
ENDIF
|
||||
ENDIF
|
||||
|
|
|
@ -65,7 +65,7 @@ subroutine phq_setup
|
|||
USE symm_base, ONLY : nrot, nsym, s, irt, t_rev, time_reversal, &
|
||||
sr, invs, inverse_s, d1, d2, d3, check_grid_sym
|
||||
USE uspp_param, ONLY : upf
|
||||
USE uspp, ONLY : nlcc_any, deeq_nc, deeq_nc_d, okvan
|
||||
USE uspp, ONLY : nlcc_any, deeq_nc, okvan
|
||||
USE noncollin_module, ONLY : noncolin, domag, m_loc, angle1, angle2, ux
|
||||
USE nlcc_ph, ONLY : drc
|
||||
USE control_ph, ONLY : rec_code, lgamma_gamma, search_sym, start_irr, &
|
||||
|
@ -183,9 +183,7 @@ subroutine phq_setup
|
|||
v%of_r(:,2:4)=-v%of_r(:,2:4)
|
||||
deeq_nc_save(:,:,:,:,2)=deeq_nc(:,:,:,:)
|
||||
deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,1)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
|
|
|
@ -47,10 +47,8 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
USE scf, ONLY : rho, vrs
|
||||
#if defined(__CUDA)
|
||||
USE scf_gpum, ONLY : vrs_d
|
||||
USE uspp, ONLY : okvan, vkb, deeq_nc, deeq_nc_d
|
||||
#else
|
||||
USE uspp, ONLY : okvan, vkb, deeq_nc
|
||||
#endif
|
||||
USE uspp, ONLY : okvan, vkb, deeq_nc
|
||||
USE uspp_param, ONLY : nhm
|
||||
USE noncollin_module, ONLY : noncolin, domag, npol, nspin_mag
|
||||
USE paw_variables, ONLY : okpaw
|
||||
|
@ -282,22 +280,18 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
! Hubbard potential dvbare_hub_q * psi_kpoint
|
||||
! is calculated and added to dvpsi.
|
||||
!
|
||||
IF (lda_plus_u) CALL dvqhub_barepsi_us(ik, u(1, mode))
|
||||
IF (lda_plus_u) CALL dvqhub_barepsi_us(ik, u(1,mode))
|
||||
!
|
||||
ELSE
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
int1_nc(:,:,:,:,:) = int1_nc_save(:,:,:,:,:,2)
|
||||
ENDIF
|
||||
CALL dvqpsi_us(ik, u(1, mode), .FAlSE., becpt, alphapt)
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
int1_nc(:,:,:,:,:) = int1_nc_save(:,:,:,:,:,1)
|
||||
ENDIF
|
||||
ENDIF
|
||||
|
@ -342,9 +336,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
#endif
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
|
@ -375,9 +367,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
#endif
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1)
|
||||
#if defined(__CUDA)
|
||||
deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:)
|
||||
#endif
|
||||
!$acc update device(deeq_nc)
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
|
|
|
@ -260,32 +260,22 @@ set(src_pw
|
|||
src/force_corr_gpu.f90
|
||||
src/orthoatwfc_gpu.f90
|
||||
src/wfcinit_gpu.f90
|
||||
src/stres_loc_gpu.f90
|
||||
src/add_paw_to_deeq_gpu.f90
|
||||
src/addusdens_gpu.f90
|
||||
src/addusforce_gpu.f90
|
||||
src/deriv_drhoc_gpu.f90
|
||||
src/s_psi_gpu.f90
|
||||
src/rotate_wfc_gpu.f90
|
||||
src/usnldiag_gpu.f90
|
||||
src/add_vuspsi_gpu.f90
|
||||
src/hs_1psi_gpu.f90
|
||||
src/g_psi_gpu.f90
|
||||
src/stres_mgga_gpu.f90
|
||||
src/atomic_wfc_gpu.f90
|
||||
src/dvloc_of_g_gpu.f90
|
||||
src/stres_ewa_gpu.f90
|
||||
src/stres_knl_gpu.f90
|
||||
src/compute_deff_gpu.f90
|
||||
src/add_vhub_to_deeq_gpu.f90
|
||||
src/s_1psi_gpu.f90
|
||||
src/h_psi_gpu.f90
|
||||
src/stres_us_gpu.f90
|
||||
src/stres_cc_gpu.f90
|
||||
src/utils_gpu.f90
|
||||
src/vhpsi_gpu.f90
|
||||
src/drhoc_gpu.f90
|
||||
src/stres_har_gpu.f90
|
||||
src/vloc_psi_gpu.f90
|
||||
src/hs_psi_gpu.f90)
|
||||
qe_enable_cuda_fortran("${src_pw}")
|
||||
|
|
|
@ -352,7 +352,7 @@ END SUBROUTINE cutoff_force_lc
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc )
|
||||
SUBROUTINE cutoff_stres_evloc( rho_G, strf, evloc )
|
||||
!----------------------------------------------------------------------
|
||||
!! This subroutine adds the contribution from the cutoff long-range part
|
||||
!! of the local part of the ionic potential to \(\text{evloc}\).
|
||||
|
@ -363,14 +363,14 @@ SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc )
|
|||
!! to re-compute it here for the stress.
|
||||
!
|
||||
USE kinds
|
||||
USE ions_base, ONLY : ntyp => nsp
|
||||
USE gvect, ONLY : ngm, gstart
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE ions_base, ONLY: ntyp => nsp
|
||||
USE gvect, ONLY: ngm, gstart
|
||||
USE io_global, ONLY: stdout
|
||||
USE fft_base, ONLY: dfftp
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr)
|
||||
COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr)
|
||||
!! charge density in G space
|
||||
COMPLEX(DP), INTENT(IN) :: strf(ngm,ntyp)
|
||||
!! the structure factor
|
||||
|
@ -381,71 +381,28 @@ SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc )
|
|||
!
|
||||
INTEGER :: ng, nt
|
||||
!
|
||||
! If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0
|
||||
! So we start at gstart.
|
||||
!$acc data present_or_copyin(rho_G,strf)
|
||||
!
|
||||
! ... If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0
|
||||
! So we start at gstart.
|
||||
!
|
||||
!$acc parallel loop collapse(2) reduction(+:evloc) copyin(lr_Vloc)
|
||||
DO nt = 1, ntyp
|
||||
DO ng = gstart, ngm
|
||||
evloc = evloc + DBLE( CONJG(psic_G(ng)) * strf(ng,nt) ) &
|
||||
* lr_Vloc(ng,nt)
|
||||
ENDDO
|
||||
DO ng = gstart, ngm
|
||||
evloc = evloc + DBLE( CONJG(rho_G(ng)) * strf(ng,nt) ) &
|
||||
* lr_Vloc(ng,nt)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_evloc
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_evloc_gpu( psicG_d, strf_d, evloc )
|
||||
!----------------------------------------------------------------------
|
||||
!! cutoff_stres_evloc - gpu version
|
||||
!
|
||||
USE kinds
|
||||
USE ions_base, ONLY : ntyp => nsp
|
||||
!USE vlocal, ONLY : strf
|
||||
USE gvect, ONLY : ngm, gstart
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_base, ONLY : dfftp
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr)
|
||||
!! charge density in G space
|
||||
COMPLEX(DP), INTENT(IN) :: strf_d(ngm,ntyp)
|
||||
REAL(DP), INTENT(INOUT) :: evloc
|
||||
!! the energy of the electrons in the local ionic potential
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: lrVloc_d(:,:)
|
||||
INTEGER :: ng, nt
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psicG_d, strf_d, lrVloc_d
|
||||
#endif
|
||||
!
|
||||
ALLOCATE( lrVloc_d(ngm,ntyp) )
|
||||
lrVloc_d = lr_Vloc
|
||||
!
|
||||
! If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0
|
||||
! So we start at gstart.
|
||||
!
|
||||
!$cuf kernel do (2)
|
||||
DO nt = 1, ntyp
|
||||
DO ng = gstart, ngm
|
||||
evloc = evloc + DBLE( CONJG(psicG_d(ng)) * strf_d(ng,nt) ) &
|
||||
* lrVloc_d(ng,nt)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( lrVloc_d )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_evloc_gpu
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_sigmaloc( psic_G, strf, sigmaloc )
|
||||
SUBROUTINE cutoff_stres_sigmaloc( rho_G, strf, sigmaloc )
|
||||
!----------------------------------------------------------------------
|
||||
!! This subroutine adds the contribution from the cutoff long-range part
|
||||
!! of the local part of the ionic potential to the rest of the
|
||||
|
@ -454,141 +411,67 @@ SUBROUTINE cutoff_stres_sigmaloc( psic_G, strf, sigmaloc )
|
|||
USE kinds
|
||||
USE ions_base, ONLY : ntyp => nsp
|
||||
USE constants, ONLY : eps8
|
||||
USE gvect, ONLY : ngm, g, gg, gstart
|
||||
USE gvect, ONLY : ngm, gstart, g, gg
|
||||
USE cell_base, ONLY : tpiba, tpiba2, alat, omega
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_base, ONLY : dfftp
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr)
|
||||
COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr)
|
||||
!! charge density in G space
|
||||
COMPLEX(DP), INTENT(IN) :: strf(ngm,ntyp)
|
||||
!! the structure factor
|
||||
REAL(DP), INTENT(INOUT) :: sigmaloc(3,3)
|
||||
!! stress contribution for the local ionic potential
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ng, nt, l, m
|
||||
REAL(DP) :: Gp, G2lzo2Gp, beta, dlr_Vloc
|
||||
!
|
||||
! no G=0 contribution
|
||||
DO nt = 1, ntyp
|
||||
DO ng = gstart, ngm
|
||||
!
|
||||
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
|
||||
! below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448
|
||||
IF (Gp < eps8) THEN
|
||||
! G^2*lz/2|Gp|
|
||||
G2lzo2Gp = 0.0d0
|
||||
beta = 0.0d0
|
||||
ELSE
|
||||
G2lzo2Gp = gg(ng)*tpiba2*lz/2.0d0/Gp
|
||||
beta = G2lzo2Gp*(1.0d0-cutoff_2D(ng))/cutoff_2D(ng)
|
||||
ENDIF
|
||||
! dlr_vloc corresponds to the derivative of the long-range local ionic potential
|
||||
! with respect to G
|
||||
DO l = 1, 3
|
||||
IF (l == 3) THEN
|
||||
dlr_Vloc = - 1.0d0/(gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
|
||||
* (1.0d0+ gg(ng)*tpiba2/4.0d0)
|
||||
ELSE
|
||||
dlr_Vloc = - 1.0d0/ (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
|
||||
* (1.0d0- beta + gg(ng)*tpiba2/4.0d0)
|
||||
ENDIF
|
||||
!
|
||||
DO m = 1, l
|
||||
sigmaloc(l,m) = sigmaloc(l,m) + DBLE( CONJG( psic_G(ng) ) &
|
||||
* strf(ng,nt) ) * 2.0d0 * dlr_Vloc &
|
||||
* tpiba2 * g(l,ng) * g(m,ng)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_sigmaloc
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_sigmaloc_gpu( psicG_d, strf_d, sigmaloc )
|
||||
!----------------------------------------------------------------------
|
||||
!! This subroutine adds the contribution from the cutoff long-range part
|
||||
!! of the local part of the ionic potential to the rest of the
|
||||
!! \(\text{sigmaloc}\). That is, the rest of Eq. (63) of PRB 96, 075448.
|
||||
!
|
||||
USE kinds
|
||||
USE ions_base, ONLY : ntyp => nsp
|
||||
USE vlocal, ONLY : strf
|
||||
USE constants, ONLY : eps8
|
||||
USE gvect, ONLY : ngm, gstart, g_d, gg_d
|
||||
USE cell_base, ONLY : tpiba, tpiba2, alat, omega
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_base, ONLY : dfftp
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr)
|
||||
!! charge density in G space
|
||||
COMPLEX(DP), INTENT(IN) :: strf_d(ngm,ntyp)
|
||||
REAL(DP), INTENT(INOUT) :: sigmaloc(3,3)
|
||||
!! stress contribution for the local ionic potential
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ng, nt, l, m
|
||||
REAL(DP), ALLOCATABLE :: lrVloc_d(:,:), cutoff2D_d(:)
|
||||
INTEGER :: ng, nt, l, m
|
||||
REAL(DP) :: Gp, G2lzo2Gp, beta, dlr_Vloc1, dlr_Vloc2, dlr_Vloc3, &
|
||||
no_lm_dep
|
||||
REAL(DP) :: sigmaloc11, sigmaloc31, sigmaloc21, sigmaloc32, &
|
||||
sigmaloc22, sigmaloc33
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psicG_d, strf_d, lrVloc_d, cutoff2D_d
|
||||
#endif
|
||||
!
|
||||
ALLOCATE( lrVloc_d(ngm,ntyp), cutoff2D_d(ngm) )
|
||||
lrVloc_d = lr_Vloc
|
||||
cutoff2D_d = cutoff_2D
|
||||
!
|
||||
!$acc data present_or_copyin(rho_G,strf)
|
||||
!
|
||||
sigmaloc11 = 0._DP ; sigmaloc31 = 0._DP
|
||||
sigmaloc21 = 0._DP ; sigmaloc32 = 0._DP
|
||||
sigmaloc22 = 0._DP ; sigmaloc33 = 0._DP
|
||||
!
|
||||
! no G=0 contribution
|
||||
! ... no G=0 contribution
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
!$acc parallel loop collapse(2) copyin(lr_Vloc,cutoff_2D) &
|
||||
!$acc reduction(+:sigmaloc11,sigmaloc21,sigmaloc22,sigmaloc31,&
|
||||
!$acc sigmaloc32,sigmaloc33)
|
||||
DO nt = 1, ntyp
|
||||
DO ng = gstart, ngm
|
||||
!
|
||||
Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba
|
||||
! below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448
|
||||
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
|
||||
! ... below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448
|
||||
IF (Gp < eps8) THEN
|
||||
! G^2*lz/2|Gp|
|
||||
! ... G^2*lz/2|Gp|
|
||||
G2lzo2Gp = 0._DP
|
||||
beta = 0._DP
|
||||
ELSE
|
||||
G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
|
||||
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
|
||||
ENDIF
|
||||
! dlrVloc corresponds to the derivative of the long-range local ionic potential
|
||||
! with respect to G
|
||||
dlr_Vloc1 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) &
|
||||
* (1._DP- beta + gg_d(ng)*tpiba2/4._DP)
|
||||
dlr_Vloc2 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) &
|
||||
* (1._DP- beta + gg_d(ng)*tpiba2/4._DP)
|
||||
dlr_Vloc3 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) &
|
||||
* (1._DP+ gg_d(ng)*tpiba2/4._DP)
|
||||
no_lm_dep = DBLE( CONJG( psicG_d(ng) ) &
|
||||
* strf_d(ng,nt) ) * 2._DP * tpiba2
|
||||
sigmaloc11 = sigmaloc11 + no_lm_dep * dlr_Vloc1 * g_d(1,ng) * g_d(1,ng)
|
||||
sigmaloc21 = sigmaloc21 + no_lm_dep * dlr_Vloc2 * g_d(2,ng) * g_d(1,ng)
|
||||
sigmaloc22 = sigmaloc22 + no_lm_dep * dlr_Vloc2 * g_d(2,ng) * g_d(2,ng)
|
||||
sigmaloc31 = sigmaloc31 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(1,ng)
|
||||
sigmaloc32 = sigmaloc32 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(2,ng)
|
||||
sigmaloc33 = sigmaloc33 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(3,ng)
|
||||
! ... dlrVloc corresponds to the derivative of the long-range local ionic potential
|
||||
! with respect to G
|
||||
dlr_Vloc1 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
|
||||
* (1._DP- beta + gg(ng)*tpiba2/4._DP)
|
||||
dlr_Vloc2 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
|
||||
* (1._DP- beta + gg(ng)*tpiba2/4._DP)
|
||||
dlr_Vloc3 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) &
|
||||
* (1._DP+ gg(ng)*tpiba2/4._DP)
|
||||
no_lm_dep = DBLE( CONJG( rho_G(ng) ) &
|
||||
* strf(ng,nt) ) * 2._DP * tpiba2
|
||||
sigmaloc11 = sigmaloc11 + no_lm_dep * dlr_Vloc1 * g(1,ng) * g(1,ng)
|
||||
sigmaloc21 = sigmaloc21 + no_lm_dep * dlr_Vloc2 * g(2,ng) * g(1,ng)
|
||||
sigmaloc22 = sigmaloc22 + no_lm_dep * dlr_Vloc2 * g(2,ng) * g(2,ng)
|
||||
sigmaloc31 = sigmaloc31 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(1,ng)
|
||||
sigmaloc32 = sigmaloc32 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(2,ng)
|
||||
sigmaloc33 = sigmaloc33 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(3,ng)
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
|
@ -600,70 +483,15 @@ SUBROUTINE cutoff_stres_sigmaloc_gpu( psicG_d, strf_d, sigmaloc )
|
|||
sigmaloc(3,2) = sigmaloc(3,2) + sigmaloc32
|
||||
sigmaloc(3,3) = sigmaloc(3,3) + sigmaloc33
|
||||
!
|
||||
DEALLOCATE( lrVloc_d, cutoff2D_d )
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_sigmaloc_gpu
|
||||
END SUBROUTINE cutoff_stres_sigmaloc
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_sigmahar( psic_G, sigmahar )
|
||||
!----------------------------------------------------------------------
|
||||
!! This subroutine cuts off the Hartree part of the stress.
|
||||
!! See Eq. (62) of PRB 96, 075448.
|
||||
!
|
||||
USE kinds
|
||||
USE gvect, ONLY : ngm, g, gg, gstart
|
||||
USE constants, ONLY : eps8
|
||||
USE cell_base, ONLY : tpiba2, alat, tpiba
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_base, ONLY : dfftp
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr)
|
||||
!! charge density in G-space
|
||||
REAL(DP), INTENT(INOUT) :: sigmahar(3,3)
|
||||
!! hartree contribution to stress
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ng, nt, l, m
|
||||
REAL(DP) :: Gp, G2lzo2Gp, beta, shart, g2, fact
|
||||
!
|
||||
DO ng = gstart, ngm
|
||||
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
|
||||
IF (Gp < eps8) THEN
|
||||
G2lzo2Gp = 0.0d0
|
||||
beta = 0.0d0
|
||||
ELSE
|
||||
G2lzo2Gp = gg(ng)*tpiba2*lz/2.0d0/Gp
|
||||
beta = G2lzo2Gp*(1.0d0-cutoff_2D(ng))/cutoff_2D(ng)
|
||||
ENDIF
|
||||
g2 = gg (ng) * tpiba2
|
||||
shart = psic_G(ng) * CONJG(psic_G(ng)) / g2 * cutoff_2D(ng)
|
||||
DO l = 1, 3
|
||||
IF (l == 3) THEN
|
||||
fact = 1.0d0
|
||||
ELSE
|
||||
fact = 1.0d0 - beta
|
||||
ENDIF
|
||||
DO m = 1, l
|
||||
sigmahar(l,m) = sigmahar(l,m) + shart * tpiba2 * 2 * &
|
||||
g(l,ng) * g(m,ng) / g2 * fact
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!sigma is multiplied by 0.5*fpi*e2 after
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_sigmahar
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
|
||||
SUBROUTINE cutoff_stres_sigmahar( rho_G, sigmahar )
|
||||
!----------------------------------------------------------------------
|
||||
!! This subroutine cuts off the Hartree part of the stress.
|
||||
!! See Eq. (62) of PRB 96, 075448.
|
||||
|
@ -674,14 +502,14 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
|
|||
USE cell_base, ONLY: tpiba2, alat, tpiba
|
||||
USE io_global, ONLY: stdout
|
||||
USE fft_base, ONLY: dfftp
|
||||
USE gvect, ONLY: g_d, gg_d
|
||||
USE gvect, ONLY: g, gg
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr)
|
||||
COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr)
|
||||
!! charge density in G-space
|
||||
REAL(DP), INTENT(INOUT) :: sigmahar(3,3)
|
||||
!! hartree contribution to stress
|
||||
!! Hartree contribution to stress
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
|
@ -689,50 +517,45 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
|
|||
REAL(DP) :: Gp, G2lzo2Gp, beta, shart, g2, fact
|
||||
REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, &
|
||||
sigmahar32, sigmahar22, sigmahar33
|
||||
REAL(DP), ALLOCATABLE :: cutoff2D_d(:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psicG_d, cutoff2D_d
|
||||
#endif
|
||||
!
|
||||
ALLOCATE( cutoff2D_d(ngm) )
|
||||
cutoff2D_d = cutoff_2D
|
||||
!$acc data present_or_copyin(rho_G)
|
||||
!
|
||||
sigmahar11 = 0._DP ; sigmahar31 = 0._DP
|
||||
sigmahar21 = 0._DP ; sigmahar32 = 0._DP
|
||||
sigmahar22 = 0._DP ; sigmahar33 = 0._DP
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
!$acc parallel loop copyin(cutoff_2D) &
|
||||
!$acc reduction(+:sigmahar11,sigmahar21,sigmahar22,sigmahar31,&
|
||||
!$acc sigmahar32,sigmahar33)
|
||||
DO ng = gstart, ngm
|
||||
Gp = SQRT(g_d(1,ng)**2 + g_d(2,ng)**2)*tpiba
|
||||
Gp = SQRT(g(1,ng)**2 + g(2,ng)**2)*tpiba
|
||||
IF (Gp < eps8) THEN
|
||||
G2lzo2Gp = 0._DP
|
||||
beta = 0._DP
|
||||
ELSE
|
||||
G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
|
||||
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
|
||||
ENDIF
|
||||
!
|
||||
g2 = gg_d(ng) * tpiba2
|
||||
g2 = gg(ng) * tpiba2
|
||||
!
|
||||
shart = DBLE(psicG_d(ng)*CONJG(psicG_d(ng))) /&
|
||||
g2 * cutoff2D_d(ng)
|
||||
shart = DBLE(rho_G(ng)*CONJG(rho_G(ng))) / &
|
||||
g2 * cutoff_2D(ng)
|
||||
!
|
||||
fact = 1._DP - beta
|
||||
!
|
||||
sigmahar11 = sigmahar11 + shart *tpiba2*2._DP * &
|
||||
g_d(1,ng) * g_d(1,ng) / g2 * fact
|
||||
g(1,ng) * g(1,ng) / g2 * fact
|
||||
sigmahar21 = sigmahar21 + shart *tpiba2*2._DP * &
|
||||
g_d(2,ng) * g_d(1,ng) / g2 * fact
|
||||
g(2,ng) * g(1,ng) / g2 * fact
|
||||
sigmahar22 = sigmahar22 + shart *tpiba2*2._DP * &
|
||||
g_d(2,ng) * g_d(2,ng) / g2 * fact
|
||||
g(2,ng) * g(2,ng) / g2 * fact
|
||||
sigmahar31 = sigmahar31 + shart *tpiba2*2._DP * &
|
||||
g_d(3,ng) * g_d(1,ng) / g2
|
||||
g(3,ng) * g(1,ng) / g2
|
||||
sigmahar32 = sigmahar32 + shart *tpiba2*2._DP * &
|
||||
g_d(3,ng) * g_d(2,ng) / g2
|
||||
g(3,ng) * g(2,ng) / g2
|
||||
sigmahar33 = sigmahar33 + shart *tpiba2*2._DP * &
|
||||
g_d(3,ng) * g_d(3,ng) / g2
|
||||
!
|
||||
g(3,ng) * g(3,ng) / g2
|
||||
ENDDO
|
||||
!
|
||||
sigmahar(1,1) = sigmahar(1,1) + sigmahar11
|
||||
|
@ -743,11 +566,11 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar )
|
|||
sigmahar(3,3) = sigmahar(3,3) + sigmahar33
|
||||
!sigma is multiplied by 0.5*fpi*e2 after
|
||||
!
|
||||
DEALLOCATE( cutoff2D_d )
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_sigmahar_gpu
|
||||
END SUBROUTINE cutoff_stres_sigmahar
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
|
@ -759,87 +582,7 @@ SUBROUTINE cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa )
|
|||
USE kinds
|
||||
USE ions_base, ONLY : nat, zv, tau, ityp
|
||||
USE constants, ONLY : e2, eps8
|
||||
USE gvect, ONLY : ngm, g, gg, gstart
|
||||
USE cell_base, ONLY : tpiba2, alat, omega, tpiba
|
||||
USE io_global, ONLY : stdout
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(IN) :: alpha
|
||||
!! tuning param for LR/SR separation
|
||||
REAL(DP), INTENT(INOUT) :: sigmaewa(3,3)
|
||||
!! ewald contribution to stress
|
||||
REAL(DP), INTENT(INOUT) :: sdewald
|
||||
!! constant and diagonal terms
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ng, na, l, m
|
||||
REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact
|
||||
COMPLEX(DP) :: rhostar
|
||||
!
|
||||
! g(1) is a problem if it's G=0, because we divide by G^2.
|
||||
! So start at gstart.
|
||||
! fact=1.0d0, gamma_only not implemented
|
||||
! G=0 componenent of the long-range part of the local part of the
|
||||
! pseudopotminus the Hartree potential is set to 0.
|
||||
! in other words, sdewald=0.
|
||||
! sdewald is the last term in equation B1 of PRB 32 3792.
|
||||
! See also similar comment for ewaldg in cutoff_ewald routine
|
||||
!
|
||||
sdewald = 0._DP
|
||||
DO ng = gstart, ngm
|
||||
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
|
||||
IF (Gp < eps8) THEN
|
||||
G2lzo2Gp = 0._DP
|
||||
beta = 0._DP
|
||||
ELSE
|
||||
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
|
||||
ENDIF
|
||||
g2 = gg(ng) * tpiba2
|
||||
g2a = g2 / 4._DP / alpha
|
||||
rhostar = (0._DP,0._DP)
|
||||
DO na = 1, nat
|
||||
arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
|
||||
g(3,ng) * tau(3,na) ) * tpi
|
||||
rhostar = rhostar + zv (ityp(na) ) * CMPLX(COS(arg), SIN(arg), KIND=DP)
|
||||
ENDDO
|
||||
rhostar = rhostar / omega
|
||||
sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2
|
||||
! ... sewald is an other diagonal term that is similar to the diagonal terms
|
||||
! in the other stress contributions. It basically gives a term prop to
|
||||
! the ewald energy
|
||||
sdewald = sdewald-sewald
|
||||
DO l = 1, 3
|
||||
IF (l == 3) THEN
|
||||
fact = (g2a + 1.0d0)
|
||||
ELSE
|
||||
fact = (1.0d0+g2a-beta)
|
||||
ENDIF
|
||||
!
|
||||
DO m = 1, l
|
||||
sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * &
|
||||
g(l,ng) * g(m,ng) / g2 * fact
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_sigmaewa
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
|
||||
!----------------------------------------------------------------------
|
||||
!! This subroutine cuts off the Ewald part of the stress.
|
||||
!! See Eq. (64) in PRB 96 075448
|
||||
!
|
||||
USE kinds
|
||||
USE ions_base, ONLY : nat, zv, tau, ityp
|
||||
USE constants, ONLY : e2, eps8
|
||||
USE gvect, ONLY : ngm, gstart, g_d, gg_d
|
||||
USE gvect, ONLY : ngm, gstart, g, gg
|
||||
USE cell_base, ONLY : tpiba2, alat, omega, tpiba
|
||||
USE io_global, ONLY : stdout
|
||||
!
|
||||
|
@ -858,72 +601,62 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
|
|||
REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact
|
||||
REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
|
||||
COMPLEX(DP) :: rhostar
|
||||
!
|
||||
INTEGER , ALLOCATABLE :: ityp_d(:)
|
||||
REAL(DP), ALLOCATABLE :: cutoff2D_d(:), tau_d(:,:), zv_d(:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: cutoff2D_d, tau_d, zv_d, ityp_d
|
||||
#endif
|
||||
!
|
||||
ntyp = SIZE(zv)
|
||||
ALLOCATE( cutoff2D_d(ngm), tau_d(3,nat), zv_d(ntyp) )
|
||||
ALLOCATE( ityp_d(nat) )
|
||||
cutoff2D_d = cutoff_2D
|
||||
tau_d = tau
|
||||
zv_d = zv
|
||||
ityp_d = ityp
|
||||
! g(1) is a problem if it's G=0, because we divide by G^2.
|
||||
! So start at gstart.
|
||||
! fact=1.0d0, gamma_only not implemented
|
||||
! G=0 componenent of the long-range part of the local part of the
|
||||
! pseudopotminus the Hartree potential is set to 0.
|
||||
! in other words, sdewald=0.
|
||||
! sdewald is the last term in equation B1 of PRB 32 3792.
|
||||
! See also similar comment for ewaldg in cutoff_ewald routine
|
||||
!
|
||||
! ... g(1) is a problem if it's G=0, because we divide by G^2.
|
||||
! So start at gstart.
|
||||
! fact=1.0d0, gamma_only not implemented
|
||||
! G=0 componenent of the long-range part of the local part of the
|
||||
! pseudopotminus the Hartree potential is set to 0.
|
||||
! in other words, sdewald=0.
|
||||
! sdewald is the last term in equation B1 of PRB 32 3792.
|
||||
! See also similar comment for ewaldg in cutoff_ewald routine
|
||||
!
|
||||
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
|
||||
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
|
||||
!
|
||||
sdewald = 0._DP
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
!$acc parallel loop copyin(cutoff_2D,tau,zv,ityp) &
|
||||
!$acc& reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32, &
|
||||
!$acc& sigma33)
|
||||
DO ng = gstart, ngm
|
||||
Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba
|
||||
Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
|
||||
IF (Gp < eps8) THEN
|
||||
G2lzo2Gp = 0._DP
|
||||
beta = 0._DP
|
||||
ELSE
|
||||
G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
|
||||
G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
|
||||
beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
|
||||
ENDIF
|
||||
g2 = gg_d(ng) * tpiba2
|
||||
g2 = gg(ng) * tpiba2
|
||||
g2a = g2 / 4._DP / alpha
|
||||
rhostar = (0._DP,0._DP)
|
||||
DO na = 1, nat
|
||||
arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + &
|
||||
g_d(3,ng) * tau_d(3,na) ) * tpi
|
||||
rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP)
|
||||
arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
|
||||
g(3,ng) * tau(3,na) ) * tpi
|
||||
rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP)
|
||||
ENDDO
|
||||
rhostar = rhostar / CMPLX(omega)
|
||||
sewald = tpi * e2 * EXP(-g2a) / g2* cutoff2D_d(ng) * ABS(rhostar)**2
|
||||
sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2
|
||||
! ... sewald is an other diagonal term that is similar to the diagonal terms
|
||||
! in the other stress contributions. It basically gives a term prop to
|
||||
! the ewald energy
|
||||
! in the other stress contributions. It basically gives a term prop to
|
||||
! the ewald energy
|
||||
!
|
||||
sdewald = sdewald - sewald
|
||||
sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(1,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta)
|
||||
g(1,ng) * g(1,ng) / g2 * (1._DP+g2a-beta)
|
||||
sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(2,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta)
|
||||
g(2,ng) * g(1,ng) / g2 * (1._DP+g2a-beta)
|
||||
sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(2,ng) * g_d(2,ng) / g2 * (1._DP+g2a-beta)
|
||||
g(2,ng) * g(2,ng) / g2 * (1._DP+g2a-beta)
|
||||
sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(3,ng) * g_d(1,ng) / g2 * (g2a+1._DP)
|
||||
g(3,ng) * g(1,ng) / g2 * (g2a+1._DP)
|
||||
sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(3,ng) * g_d(2,ng) / g2 * (g2a+1._DP)
|
||||
g(3,ng) * g(2,ng) / g2 * (g2a+1._DP)
|
||||
sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(3,ng) * g_d(3,ng) / g2 * (g2a+1._DP)
|
||||
g(3,ng) * g(3,ng) / g2 * (g2a+1._DP)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
|
@ -934,11 +667,9 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
|
|||
sigmaewa(3,2) = sigmaewa(3,2) + sigma32
|
||||
sigmaewa(3,3) = sigmaewa(3,3) + sigma33
|
||||
!
|
||||
DEALLOCATE( cutoff2D_d, tau_d, zv_d )
|
||||
DEALLOCATE( ityp_d )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cutoff_stres_sigmaewa_gpu
|
||||
END SUBROUTINE cutoff_stres_sigmaewa
|
||||
!
|
||||
!
|
||||
END MODULE Coul_cut_2D
|
||||
|
|
|
@ -290,16 +290,6 @@ PWLIBS += \
|
|||
drhoc_gpu.o \
|
||||
addusforce_gpu.o \
|
||||
force_corr_gpu.o \
|
||||
stres_loc_gpu.o \
|
||||
dvloc_of_g_gpu.o \
|
||||
stres_har_gpu.o \
|
||||
stres_mgga_gpu.o \
|
||||
stres_cc_gpu.o \
|
||||
deriv_drhoc_gpu.o \
|
||||
stres_ewa_gpu.o \
|
||||
stres_knl_gpu.o \
|
||||
stres_us_gpu.o \
|
||||
compute_deff_gpu.o \
|
||||
orthoatwfc_gpu.o \
|
||||
atomic_wfc_gpu.o
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d)
|
|||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(KIND=DP), INTENT(INOUT) :: deeq_d( nhm, nhm, nat, nspin )
|
||||
REAL(DP), INTENT(INOUT) :: deeq_d(nhm,nhm,nat,nspin)
|
||||
!! integral of the perturbed potential
|
||||
!
|
||||
! ... local variables
|
||||
|
@ -27,7 +27,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d)
|
|||
INTEGER :: na, nb, nab, nt, ih, jh, ijh, nhnt, is
|
||||
REAL(DP), ALLOCATABLE :: ddd_paw_d(:,:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deeq_d, ddd_paw_d
|
||||
attributes(DEVICE) :: ddd_paw_d, deeq_d
|
||||
#endif
|
||||
|
||||
! OPTIMIZE HERE: squeeze loop on atoms having PAW pseudo
|
||||
|
@ -39,7 +39,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d)
|
|||
nt = ityp(na)
|
||||
IF (.not.upf(nt)%tpawp) CYCLE
|
||||
nhnt = nh(nt)
|
||||
!$cuf kernel do(3)
|
||||
!$acc parallel loop collapse(3)
|
||||
DO is=1,nspin
|
||||
DO ih=1,nhnt
|
||||
DO jh=1,nhnt
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
!
|
||||
SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
|
||||
!-----------------------------------------------------------------
|
||||
!! Add Hubbard contributions to the integral of V_eff and Q_{nm} when
|
||||
!! Hubbard_projectors is pseudo.
|
||||
!! Add Hubbard contributions to the integral of \(V_\text{eff}\)
|
||||
!! and \(Q_{nm}\) when Hubbard_projectors is pseudo.
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
|
@ -21,27 +21,23 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
|
|||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(KIND=DP), INTENT(INOUT) :: deeq_d( nhm, nhm, nat, nspin )
|
||||
!! integral of V_eff and Q_{nm}
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deeq_d
|
||||
#endif
|
||||
REAL(KIND=DP), INTENT(INOUT) :: deeq_d(nhm,nhm,nat,nspin)
|
||||
!! integral of \(V_\text{eff}\) and \(Q_{nm}\)
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_h( :, :, : )
|
||||
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_d( :, :, : )
|
||||
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_h(:,:,:)
|
||||
REAL(KIND=DP), ALLOCATABLE :: deeq_aux_d(:,:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deeq_aux_d
|
||||
attributes(DEVICE) :: deeq_aux_d, deeq_d
|
||||
#endif
|
||||
INTEGER :: na, nt, ih, jh, ijh, m1, m2, ow1, ow2, is, nhnt
|
||||
!
|
||||
!
|
||||
! (maybe) OPTIMIZE here ... reorder the loop ?
|
||||
!
|
||||
ALLOCATE(deeq_aux_h( nhm, nhm, nspin ))
|
||||
ALLOCATE(deeq_aux_d( nhm, nhm, nspin ))
|
||||
ALLOCATE( deeq_aux_h(nhm,nhm,nspin) )
|
||||
ALLOCATE( deeq_aux_d(nhm,nhm,nspin) )
|
||||
!
|
||||
DO na = 1, nat
|
||||
!
|
||||
nt = ityp(na)
|
||||
|
@ -68,11 +64,12 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
|
|||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
CALL dev_memcpy(deeq_aux_d, deeq_aux_h)
|
||||
!
|
||||
CALL dev_memcpy( deeq_aux_d, deeq_aux_h )
|
||||
! HERE USE devXlib
|
||||
nhnt = nh(nt)
|
||||
!$cuf kernel do(3)
|
||||
DO is=1, nspin
|
||||
!$acc parallel loop collapse(3)
|
||||
DO is = 1, nspin
|
||||
DO ih = 1, nhnt
|
||||
DO jh = 1, nhnt
|
||||
deeq_d(jh,ih,na,is) = deeq_d(jh,ih,na,is) + deeq_aux_d(jh,ih,is)
|
||||
|
@ -81,7 +78,7 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d )
|
|||
END DO
|
||||
!
|
||||
ENDDO
|
||||
|
||||
!
|
||||
DEALLOCATE(deeq_aux_h, deeq_aux_d)
|
||||
!
|
||||
END SUBROUTINE add_vhub_to_deeq_gpu
|
||||
|
|
|
@ -19,7 +19,7 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
|
|||
USE lsda_mod, ONLY: current_spin
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE noncollin_module
|
||||
USE uspp, ONLY: ofsbeta, nkb, vkb, deeq_d, deeq_nc_d
|
||||
USE uspp, ONLY: ofsbeta, nkb, vkb, deeq, deeq_nc
|
||||
USE uspp_param, ONLY: nh, nhm
|
||||
USE becmod_gpum, ONLY: bec_type_d, becp_d, using_becp_r_d, &
|
||||
using_becp_k_d, using_becp_nc_d
|
||||
|
@ -133,10 +133,12 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
|
|||
! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
|
||||
!
|
||||
IF ( m_loc > 0 ) THEN
|
||||
!$acc host_data use_device(deeq)
|
||||
CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
|
||||
deeq_d(1,1,na,current_spin), nhm, &
|
||||
deeq(1,1,na,current_spin), nhm, &
|
||||
becp_d%r_d(ofsbeta(na)+1,1), nkb, 0.0_dp, &
|
||||
ps_d(ofsbeta(na)+1,1), nkb )
|
||||
!$acc end host_data
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
|
@ -240,12 +242,12 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
|
|||
! deeq is real: copy it into a complex variable to perform
|
||||
! a zgemm - simple but sub-optimal solution
|
||||
!
|
||||
!deeaux_d(:,:) = CMPLX(deeq_d(1:nh(nt),1:nh(nt),na,current_spin), 0.0_dp, KIND=dp )
|
||||
!deeaux_d(:,:) = CMPLX(deeq(1:nh(nt),1:nh(nt),na,current_spin), 0.0_dp, KIND=dp )
|
||||
!
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
!$acc parallel loop collapse(2) present(deeq)
|
||||
DO j = 1, nhnt
|
||||
DO k = 1, nhnt
|
||||
deeaux_d(k,j) = CMPLX(deeq_d(k,j,na,current_spin), 0.0_dp, KIND=DP )
|
||||
deeaux_d(k,j) = CMPLX(deeq(k,j,na,current_spin), 0.0_dp, KIND=DP )
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
|
@ -313,29 +315,31 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
|
|||
!
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
!
|
||||
!$acc host_data use_device(deeq_nc)
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
deeq_nc_d(1,1,na,1), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
|
||||
deeq_nc(1,1,na,1), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
|
||||
(0.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,1,1), 2*nkb )
|
||||
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
deeq_nc_d(1,1,na,2), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
|
||||
deeq_nc(1,1,na,2), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
|
||||
(1.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,1,1), 2*nkb )
|
||||
|
||||
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
deeq_nc_d(1,1,na,3), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
|
||||
deeq_nc(1,1,na,3), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, &
|
||||
(0.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,2,1), 2*nkb )
|
||||
|
||||
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
deeq_nc_d(1,1,na,4), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
|
||||
deeq_nc(1,1,na,4), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, &
|
||||
(1.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,2,1), 2*nkb )
|
||||
|
||||
!$acc end host_data
|
||||
!
|
||||
! DO ibnd = 1, m
|
||||
! !
|
||||
! DO jh = 1, nh(nt)
|
||||
! !
|
||||
!!$cuf kernel do(1) <<<*,*>>>
|
||||
!!$acc parallel loop present(deeq_nc)
|
||||
! DO ih = 1, nh(nt)
|
||||
! !
|
||||
! ikb = ijkb0 + ih
|
||||
|
@ -344,11 +348,11 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d )
|
|||
! becpdn_jkb = becp_nc_d(jkb,2,ibnd)
|
||||
! !
|
||||
! ps_d(ikb,1,ibnd) = ps_d(ikb,1,ibnd) + &
|
||||
! deeq_nc_d(ih,jh,na,1)*becpup_jkb + &
|
||||
! deeq_nc_d(ih,jh,na,2)*becpdn_jkb
|
||||
! deeq_nc(ih,jh,na,1)*becpup_jkb + &
|
||||
! deeq_nc(ih,jh,na,2)*becpdn_jkb
|
||||
! ps_d(ikb,2,ibnd) = ps_d(ikb,2,ibnd) + &
|
||||
! deeq_nc_d(ih,jh,na,3)*becpup_jkb + &
|
||||
! deeq_nc_d(ih,jh,na,4)*becpdn_jkb
|
||||
! deeq_nc(ih,jh,na,3)*becpup_jkb + &
|
||||
! deeq_nc(ih,jh,na,4)*becpdn_jkb
|
||||
! !
|
||||
! END DO
|
||||
! !
|
||||
|
|
|
@ -23,7 +23,7 @@ SUBROUTINE addusstress( sigmanlc )
|
|||
! ... local variables
|
||||
!
|
||||
REAL(DP) :: sigma_r(3,3), sigma_g(3,3)
|
||||
INTEGER :: na,ijh, ipol,jpol
|
||||
INTEGER :: na, ijh, ipol, jpol
|
||||
!
|
||||
IF ( tqr ) THEN
|
||||
sigma_r(:,:) = 0.d0
|
||||
|
@ -56,7 +56,7 @@ SUBROUTINE addusstress_g( sigmanlc )
|
|||
!! routine).
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
USE ions_base, ONLY : nat, ntyp=>nsp, ityp
|
||||
USE cell_base, ONLY : omega, tpiba
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE fft_rho, ONLY : rho_r2g
|
||||
|
@ -71,7 +71,7 @@ SUBROUTINE addusstress_g( sigmanlc )
|
|||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(INOUT) :: sigmanlc(3, 3)
|
||||
REAL(DP), INTENT(INOUT) :: sigmanlc(3,3)
|
||||
!! the nonlocal stress
|
||||
!
|
||||
! ... local variables
|
||||
|
@ -82,18 +82,18 @@ SUBROUTINE addusstress_g( sigmanlc )
|
|||
! counters
|
||||
COMPLEX(DP), ALLOCATABLE :: aux1(:,:), aux2(:,:), vg(:,:), qgm(:,:)
|
||||
! work space (complex)
|
||||
COMPLEX(DP) :: cfac
|
||||
REAL(DP) :: fac(3,nspin), sus(3,3)
|
||||
COMPLEX(DP) :: cfac
|
||||
REAL(DP) :: fac(3,nspin), sus(3,3)
|
||||
! auxiliary variables
|
||||
REAL(DP) , ALLOCATABLE :: qmod(:), ylmk0(:,:), dylmk0(:,:), tbecsum(:,:)
|
||||
! work space (real)
|
||||
!
|
||||
!
|
||||
sus(:,:) = 0.d0
|
||||
!
|
||||
! Fourier transform of the total effective potential
|
||||
! ... Fourier transform of the total effective potential
|
||||
!
|
||||
ALLOCATE( vg(ngm,nspin) )
|
||||
ALLOCATE( vg(ngm,nspin) )
|
||||
!$acc data create( vg )
|
||||
!
|
||||
DO is = 1, nspin
|
||||
IF ( nspin == 4 .and. is /= 1 ) THEN
|
||||
|
@ -103,10 +103,10 @@ SUBROUTINE addusstress_g( sigmanlc )
|
|||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
! With k-point parallelization, distribute G-vectors across processors
|
||||
! ngm_s = index of first G-vector for this processor
|
||||
! ngm_e = index of last G-vector for this processor
|
||||
! ngm_l = local number of G-vectors
|
||||
! ... With k-point parallelization, distribute G-vectors across processors:
|
||||
! ngm_s = index of first G-vector for this processor
|
||||
! ngm_e = index of last G-vector for this processor
|
||||
! ngm_l = local number of G-vectors
|
||||
!
|
||||
CALL divide( inter_pool_comm, ngm, ngm_s, ngm_e )
|
||||
ngm_l = ngm_e-ngm_s+1
|
||||
|
@ -115,80 +115,132 @@ SUBROUTINE addusstress_g( sigmanlc )
|
|||
!
|
||||
ALLOCATE( aux1(ngm_l,3), aux2(ngm_l,nspin), qmod(ngm_l) )
|
||||
ALLOCATE( ylmk0(ngm_l,lmaxq*lmaxq), dylmk0(ngm_l,lmaxq*lmaxq) )
|
||||
!$acc data create(aux1,aux2,qmod)
|
||||
!$acc data create(ylmk0,dylmk0)
|
||||
!
|
||||
CALL ylmr2( lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 )
|
||||
#if defined(__CUDA)
|
||||
!$acc host_data use_device(g,gg,ylmk0)
|
||||
CALL ylmr2_gpu( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 )
|
||||
!$acc end host_data
|
||||
#else
|
||||
CALL ylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 )
|
||||
!$acc update device(ylmk0)
|
||||
#endif
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, ngm_l
|
||||
qmod(ig) = SQRT( gg(ngm_s+ig-1) ) * tpiba
|
||||
ENDDO
|
||||
!
|
||||
! here we compute the integral Q*V for each atom,
|
||||
! ... here we compute the integral Q*V for each atom,
|
||||
! I = sum_G i G_a exp(-iR.G) Q_nm v^*
|
||||
! (no contribution from G=0)
|
||||
! (no contribution from G=0)
|
||||
!
|
||||
DO ipol = 1, 3
|
||||
CALL dylmr2( lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!$acc host_data use_device(g,gg,dylmk0)
|
||||
CALL dylmr2_gpu( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
|
||||
!$acc end host_data
|
||||
#else
|
||||
CALL dylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol )
|
||||
!$acc update device(dylmk0)
|
||||
#endif
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
IF ( upf(nt)%tvanp ) THEN
|
||||
nij = nh(nt)*(nh(nt)+1)/2
|
||||
ALLOCATE( qgm(ngm_l,nij), tbecsum(nij,nspin) )
|
||||
!$acc data create(qgm,tbecsum)
|
||||
ijh = 0
|
||||
DO ih = 1, nh(nt)
|
||||
DO jh = ih, nh(nt)
|
||||
ijh = ijh + 1
|
||||
CALL dqvan2( ih, jh, nt, ipol, ngm_l, g(1,ngm_s), tpiba, &
|
||||
qmod, ylmk0, dylmk0, qgm(1,ijh) )
|
||||
qmod, ylmk0, dylmk0, qgm(1,ijh) )
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DO na = 1, nat
|
||||
IF (ityp(na) == nt) THEN
|
||||
!
|
||||
tbecsum(:,:) = becsum( 1:nij, na, 1:nspin )
|
||||
!$acc kernels
|
||||
tbecsum(:,:) = becsum(1:nij,na,1:nspin)
|
||||
!$acc end kernels
|
||||
!
|
||||
CALL dgemm( 'N', 'N', 2*ngm_l, nspin, nij, 1.0_dp, &
|
||||
qgm, 2*ngm_l, tbecsum, nij, 0.0_dp, aux2, 2*ngm_l )
|
||||
!$acc host_data use_device(qgm,tbecsum,aux2)
|
||||
CALL MYDGEMM( 'N', 'N', 2*ngm_l, nspin, nij, 1.0_dp, &
|
||||
qgm, 2*ngm_l, tbecsum, nij, 0.0_dp, aux2, 2*ngm_l )
|
||||
!$acc end host_data
|
||||
!
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop collapse(2)
|
||||
#else
|
||||
!$omp parallel do default(shared) private(is, ig)
|
||||
#endif
|
||||
DO is = 1, nspin
|
||||
DO ig = 1, ngm_l
|
||||
aux2(ig,is) = aux2(ig,is) * CONJG(vg (ngm_s+ig-1, is))
|
||||
END DO
|
||||
END DO
|
||||
ENDDO
|
||||
ENDDO
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop
|
||||
#else
|
||||
!$omp end parallel do
|
||||
!$omp parallel do default(shared) private(ig, cfac)
|
||||
#endif
|
||||
DO ig = 1, ngm_l
|
||||
cfac = CONJG( eigts1(mill (1,ngm_s+ig-1), na) * &
|
||||
eigts2(mill (2,ngm_s+ig-1), na) * &
|
||||
eigts3(mill (3,ngm_s+ig-1), na) ) * tpiba
|
||||
aux1(ig,1) = cfac * g(1,ngm_s+ig-1)
|
||||
aux1(ig,2) = cfac * g(2,ngm_s+ig-1)
|
||||
aux1(ig,3) = cfac * g(3,ngm_s+ig-1)
|
||||
ENDDO
|
||||
cfac = CONJG( eigts1(mill(1,ngm_s+ig-1),na) * &
|
||||
eigts2(mill(2,ngm_s+ig-1),na) * &
|
||||
eigts3(mill(3,ngm_s+ig-1),na) ) * tpiba
|
||||
aux1(ig,1) = cfac * g(1,ngm_s+ig-1)
|
||||
aux1(ig,2) = cfac * g(2,ngm_s+ig-1)
|
||||
aux1(ig,3) = cfac * g(3,ngm_s+ig-1)
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel do
|
||||
CALL DGEMM('T','N', 3, nspin, 2*ngm_l, 1.0_dp, aux1, 2*ngm_l, &
|
||||
aux2, 2*ngm_l, 0.0_dp, fac, 3 )
|
||||
DO is = 1, nspin
|
||||
DO jpol = 1, 3
|
||||
sus(ipol, jpol) = sus(ipol, jpol) - omega * fac(jpol, is)
|
||||
ENDDO
|
||||
ENDDO
|
||||
#endif
|
||||
!
|
||||
!$acc data copyout( fac )
|
||||
!$acc host_data use_device(aux1,aux2,fac)
|
||||
CALL MYDGEMM( 'T','N', 3, nspin, 2*ngm_l, 1.0_dp, aux1, 2*ngm_l, &
|
||||
aux2, 2*ngm_l, 0.0_dp, fac, 3 )
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
!
|
||||
DO is = 1, nspin
|
||||
DO jpol = 1, 3
|
||||
sus(ipol,jpol) = sus(ipol,jpol) - omega * fac(jpol,is)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE( tbecsum, qgm )
|
||||
ENDIF
|
||||
ENDDO
|
||||
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE( ylmk0, dylmk0 )
|
||||
DEALLOCATE( aux1, aux2, qmod )
|
||||
!
|
||||
10 CONTINUE
|
||||
!
|
||||
CALL mp_sum( sus, inter_pool_comm )
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
sigmanlc(:,:) = sigmanlc(:,:) + 2.0_dp*sus(:,:)
|
||||
ELSE
|
||||
sigmanlc(:,:) = sigmanlc(:,:) + sus(:,:)
|
||||
ENDIF
|
||||
DEALLOCATE( ylmk0, dylmk0 )
|
||||
DEALLOCATE( aux1, aux2, vg, qmod )
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( vg )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
|
|
@ -5,16 +5,17 @@
|
|||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!-------------------------------------------------------------------------
|
||||
SUBROUTINE compute_deff( deff, et )
|
||||
!-----------------------------------------------------------------------
|
||||
!! This routine computes the effective value of the D-eS coefficients
|
||||
!! which appear often in many expressions in the US or PAW case.
|
||||
!! This routine is for the collinear case.
|
||||
!! This routine computes the effective value of the D-eS coefficients
|
||||
!! which appear often in many expressions in the US or PAW case.
|
||||
!! This routine is for the collinear case.
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: nsp, nat, ityp
|
||||
USE uspp, ONLY: deeq, qq_at, okvan
|
||||
USE ions_base, ONLY: nat
|
||||
USE uspp, ONLY: okvan, deeq, qq_at
|
||||
USE uspp_param, ONLY: nhm
|
||||
USE lsda_mod, ONLY: current_spin
|
||||
!
|
||||
|
@ -27,24 +28,21 @@ SUBROUTINE compute_deff( deff, et )
|
|||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: nt, na, is
|
||||
INTEGER :: nt, na, is, i, j
|
||||
!
|
||||
deff(:,:,:) = deeq(:,:,:,current_spin)
|
||||
!$acc kernels present_or_copyout(deff)
|
||||
!
|
||||
IF (okvan) THEN
|
||||
IF (.NOT. okvan) THEN
|
||||
!
|
||||
DO nt = 1, nsp
|
||||
DO na = 1, nat
|
||||
!
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
deff(:,:,na) = deff(:,:,na) - et*qq_at(:,:,na)
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
deff(:,:,:) = deeq(:,:,:,current_spin)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
deff(:,:,:) = deeq(:,:,:,current_spin) - et*qq_at(:,:,:)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end kernels
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
@ -52,7 +50,7 @@ END SUBROUTINE compute_deff
|
|||
!
|
||||
!
|
||||
!---------------------------------------------------------------------------
|
||||
SUBROUTINE compute_deff_nc( deff, et )
|
||||
SUBROUTINE compute_deff_nc( deff_nc, et )
|
||||
!-------------------------------------------------------------------------
|
||||
!! This routine computes the effective value of the D-eS coefficients
|
||||
!! which appears often in many expressions. This routine is for the
|
||||
|
@ -60,8 +58,8 @@ SUBROUTINE compute_deff_nc( deff, et )
|
|||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: nsp, nat, ityp
|
||||
USE noncollin_module, ONLY: npol, lspinorb
|
||||
USE uspp, ONLY: deeq_nc, qq_at, qq_so, okvan
|
||||
USE noncollin_module, ONLY: noncolin, npol, lspinorb
|
||||
USE uspp, ONLY: okvan, deeq_nc, qq_so, qq_at
|
||||
USE uspp_param, ONLY: nhm
|
||||
USE lsda_mod, ONLY: nspin
|
||||
!
|
||||
|
@ -69,42 +67,71 @@ SUBROUTINE compute_deff_nc( deff, et )
|
|||
!
|
||||
REAL(DP), INTENT(IN) :: et
|
||||
!! The eigenvalues of the hamiltonian
|
||||
COMPLEX(DP), INTENT(OUT) :: deff(nhm,nhm,nat,nspin)
|
||||
COMPLEX(DP), INTENT(OUT) :: deff_nc(nhm,nhm,nat,nspin)
|
||||
!! Effective values of the D-eS coefficients
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: nt, na, is, js, ijs
|
||||
INTEGER :: nt, na, is, ijs, i, j, ias
|
||||
INTEGER :: nt_v(nat), na_v(nat)
|
||||
!
|
||||
deff=deeq_nc
|
||||
IF (okvan) THEN
|
||||
!
|
||||
DO nt = 1, nsp
|
||||
DO na = 1, nat
|
||||
!
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
IF (lspinorb) THEN
|
||||
deff(:,:,na,:) = deff(:,:,na,:) - et * qq_so(:,:,:,nt)
|
||||
ELSE
|
||||
ijs=0
|
||||
!
|
||||
DO is=1,npol
|
||||
DO js=1,npol
|
||||
!
|
||||
ijs=ijs+1
|
||||
IF (is==js) deff(:,:,na,ijs)=deff(:,:,na,ijs)-et*qq_at(:,:,na)
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
!$acc data present_or_copyout( deff_nc )
|
||||
!
|
||||
!$acc kernels
|
||||
deff_nc(:,:,:,:) = deeq_nc(:,:,:,:)
|
||||
!$acc end kernels
|
||||
!
|
||||
IF (okvan) then
|
||||
!
|
||||
! ... set up index arrays to fill 'deff' in on gpu
|
||||
i = 0
|
||||
DO nt = 1, nsp
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na)/=nt ) CYCLE
|
||||
i = i + 1
|
||||
nt_v(i) = nt
|
||||
na_v(i) = na
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc data copyin( nt_v, na_v )
|
||||
!
|
||||
IF (lspinorb) THEN
|
||||
!
|
||||
!$acc parallel loop collapse(3)
|
||||
DO ias = 1, nat
|
||||
DO i = 1, nhm
|
||||
DO j = 1, nhm
|
||||
na = na_v(ias)
|
||||
nt = nt_v(ias)
|
||||
deff_nc(i,j,na,:) = deeq_nc(i,j,na,:) - CMPLX(et)*qq_so(i,j,:,nt)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!$acc parallel loop collapse(3)
|
||||
DO ias = 1, nat
|
||||
DO i = 1, nhm
|
||||
DO j = 1, nhm
|
||||
na = na_v(ias)
|
||||
!$acc loop seq
|
||||
DO is = 1, npol
|
||||
ijs = (is-1)*npol + is
|
||||
deff_nc(i,j,na,ijs) = deeq_nc(i,j,na,ijs) - CMPLX(et*qq_at(i,j,na))
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
|
|
@ -1,155 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2009-2010 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-------------------------------------------------------------------------
|
||||
SUBROUTINE compute_deff_gpu( deff_d, et )
|
||||
!-----------------------------------------------------------------------
|
||||
!! This routine computes the effective value of the D-eS coefficients
|
||||
!! which appear often in many expressions in the US or PAW case.
|
||||
!! This routine is for the collinear case.
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: nat
|
||||
USE uspp, ONLY: okvan, deeq_d, qq_at_d
|
||||
USE uspp_param, ONLY: nhm
|
||||
USE lsda_mod, ONLY: current_spin
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(IN) :: et
|
||||
!! The eigenvalues of the hamiltonian
|
||||
COMPLEX(DP), INTENT(OUT) :: deff_d(nhm,nhm,nat)
|
||||
!! Effective values of the D-eS coefficients
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: nt, na, is, i, j
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deff_d
|
||||
#endif
|
||||
!
|
||||
IF (.NOT. okvan) THEN
|
||||
!
|
||||
!$cuf kernel do (3) <<<*,*>>>
|
||||
DO na = 1, nat
|
||||
DO i = 1, nhm
|
||||
DO j = 1,nhm
|
||||
deff_d(i,j,na) = CMPLX(deeq_d(i,j,na,current_spin))
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!$cuf kernel do (3) <<<*,*>>>
|
||||
DO na = 1, nat
|
||||
DO i = 1, nhm
|
||||
DO j = 1,nhm
|
||||
deff_d(i,j,na) = CMPLX(deeq_d(i,j,na,current_spin) - et*qq_at_d(i,j,na))
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE compute_deff_gpu
|
||||
!
|
||||
!
|
||||
!---------------------------------------------------------------------------
|
||||
SUBROUTINE compute_deff_nc_gpu( deff_d, et )
|
||||
!-------------------------------------------------------------------------
|
||||
!! This routine computes the effective value of the D-eS coefficients
|
||||
!! which appears often in many expressions. This routine is for the
|
||||
!! noncollinear case.
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: nsp, nat, ityp
|
||||
USE noncollin_module, ONLY: noncolin, npol, lspinorb
|
||||
USE uspp, ONLY: okvan, deeq_nc_d, qq_so_d, qq_at_d
|
||||
USE uspp_param, ONLY: nhm
|
||||
USE lsda_mod, ONLY: nspin
|
||||
USE device_memcpy_m, ONLY: dev_memcpy
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(IN) :: et
|
||||
!! The eigenvalues of the hamiltonian
|
||||
COMPLEX(DP), INTENT(OUT) :: deff_d(nhm,nhm,nat,nspin)
|
||||
!! Effective values of the D-eS coefficients
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: nt, na, is, js, ijs, i, j, ias
|
||||
INTEGER :: na_v(nat), nt_v(nat)
|
||||
INTEGER, ALLOCATABLE :: na_d(:), nt_d(:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deff_d, na_d, nt_d
|
||||
#endif
|
||||
!
|
||||
CALL dev_memcpy( deff_d, deeq_nc_d )
|
||||
!
|
||||
IF ( okvan ) THEN
|
||||
!
|
||||
ALLOCATE( nt_d(nat), na_d(nat) )
|
||||
!
|
||||
i = 0
|
||||
DO nt = 1, nsp
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na)/=nt ) CYCLE
|
||||
i = i + 1
|
||||
nt_v(i) = nt
|
||||
na_v(i) = na
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
nt_d = nt_v ; na_d = na_v
|
||||
!
|
||||
IF (lspinorb) THEN
|
||||
!
|
||||
!$cuf kernel do (3) <<<*,*>>>
|
||||
DO ias = 1, nat
|
||||
DO i = 1, nhm
|
||||
DO j = 1, nhm
|
||||
na = na_d(ias)
|
||||
nt = nt_d(ias)
|
||||
deff_d(i,j,na,:) = deeq_nc_d(i,j,na,:) - et*qq_so_d(i,j,:,nt)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!$cuf kernel do (3) <<<*,*>>>
|
||||
DO ias = 1, nat
|
||||
DO i = 1, nhm
|
||||
DO j = 1, nhm
|
||||
na = na_d(ias)
|
||||
nt = nt_d(ias)
|
||||
DO is = 1, npol
|
||||
ijs = (is-1)*npol+is
|
||||
deff_d(i,j,na,ijs) = deeq_nc_d(i,j,na,ijs) - &
|
||||
et*qq_at_d(i,j,na)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
DEALLOCATE( nt_d, na_d )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE compute_deff_nc_gpu
|
|
@ -11,7 +11,8 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
|
|||
!! Calculates the Fourier transform of \(d\text{Rho}_c/dG\).
|
||||
!
|
||||
USE kinds
|
||||
USE constants, ONLY : pi, fpi
|
||||
USE constants, ONLY : pi, fpi
|
||||
USE upf_acc_interfaces, ONLY : simpson
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -46,10 +47,14 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
|
|||
! counter on g shells
|
||||
! lower limit for loop on ngl
|
||||
!
|
||||
!$acc data present_or_copyin(gl,r,rab,rhoc) present_or_copyout(drhocg)
|
||||
!
|
||||
! G=0 term
|
||||
!
|
||||
IF (gl(1) < 1.0d-8) THEN
|
||||
!$acc kernels
|
||||
drhocg(1) = 0.0d0
|
||||
!$acc end kernels
|
||||
igl0 = 2
|
||||
ELSE
|
||||
igl0 = 1
|
||||
|
@ -57,12 +62,19 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
|
|||
!
|
||||
! G <> 0 term
|
||||
!
|
||||
!$omp parallel private(aux, gx, rhocg1)
|
||||
#if !defined(_OPENACC)
|
||||
!$omp parallel private(aux,gx,rhocg1)
|
||||
#endif
|
||||
!
|
||||
ALLOCATE( aux(mesh) )
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop gang private(aux)
|
||||
#else
|
||||
!$omp do
|
||||
#endif
|
||||
DO igl = igl0, ngl
|
||||
gx = SQRT( gl(igl) * tpiba2 )
|
||||
!$acc loop vector
|
||||
DO ir = 1, mesh
|
||||
aux(ir) = r(ir)*rhoc(ir)*( r(ir) * COS(gx*r(ir)) / &
|
||||
gx - SIN(gx*r(ir)) / gx**2 )
|
||||
|
@ -70,10 +82,16 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg )
|
|||
CALL simpson( mesh, aux, rab, rhocg1 )
|
||||
drhocg(igl) = fpi / omega * rhocg1
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end do nowait
|
||||
DEALLOCATE( aux )
|
||||
#endif
|
||||
!
|
||||
DEALLOCATE( aux )
|
||||
!$acc end data
|
||||
!
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel
|
||||
#endif
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
|
|
@ -1,229 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
MODULE compute_drhocg_gpu_m
|
||||
#if defined(__CUDA)
|
||||
!
|
||||
USE cudafor
|
||||
!
|
||||
IMPLICIT NONE
|
||||
TYPE(dim3) :: drhocg_threads = dim3(32,8,1)
|
||||
PUBLIC :: drhocg_threads
|
||||
CONTAINS
|
||||
ATTRIBUTES(global) SUBROUTINE compute_drhocg_gpu( n, tpiba2, omega, gl, r, rhoc, rab, mesh, drhocg )
|
||||
!
|
||||
USE cudafor
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: pi, fpi, eps14
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, VALUE :: n, mesh
|
||||
REAL(DP),VALUE :: tpiba2, omega
|
||||
REAL(DP),DEVICE,INTENT(IN) :: gl(n), r(mesh), rhoc(mesh), rab(mesh)
|
||||
REAL(DP),DEVICE,INTENT(OUT) :: drhocg(n)
|
||||
!
|
||||
INTEGER :: tx, ty, igl, ir
|
||||
REAL(DP) :: mysum, val, gx, x
|
||||
!
|
||||
tx = threadIdx%x
|
||||
ty = threadIdx%y
|
||||
!
|
||||
igl = (blockIdx%x - 1) * blockDim%y + ty
|
||||
!
|
||||
IF (igl > n ) RETURN
|
||||
!
|
||||
gx = SQRT(gl(igl) * tpiba2)
|
||||
mysum = 0_dp
|
||||
!
|
||||
DO ir = tx, mesh, blockDim%x
|
||||
!
|
||||
val = r(ir)*rhoc(ir)*( r(ir) * COS(gx*r(ir)) / &
|
||||
gx - SIN(gx*r(ir)) / gx**2 ) * rab(ir)
|
||||
!
|
||||
IF (ir == 1 .OR. ir == mesh) THEN
|
||||
mysum = mysum + val
|
||||
ELSEIF (MOD(ir,2)) THEN
|
||||
mysum = mysum + 2._dp*val
|
||||
ELSE
|
||||
mysum = mysum + 4._dp*val
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
! Reduce by warp
|
||||
val = __shfl_down(mysum,1)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,2)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,4)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,8)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,16)
|
||||
mysum = mysum + val
|
||||
!
|
||||
IF (tx == 1) THEN
|
||||
drhocg(igl) = fpi * mysum / (3._dp * omega)
|
||||
ENDIF
|
||||
!
|
||||
END SUBROUTINE compute_drhocg_gpu
|
||||
!
|
||||
#endif
|
||||
END MODULE compute_drhocg_gpu_m
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, mesh, r_d, rab_d, rhoc_d, &
|
||||
drhocg_d )
|
||||
!--------------------------------------------------------------------------
|
||||
!! Calculates the Fourier transform of \(d\text{Rho}_c/dG\).
|
||||
!---cuda kernel version
|
||||
!
|
||||
USE cudafor
|
||||
USE kinds
|
||||
USE constants, ONLY : pi, fpi
|
||||
USE compute_drhocg_gpu_m
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ngl
|
||||
!! input: the number of g shell
|
||||
INTEGER :: mesh
|
||||
!! input: the number of radial mesh points
|
||||
REAL(DP), INTENT(IN), DEVICE :: gl_d(ngl)
|
||||
!! input: the number of G shells
|
||||
REAL(DP), INTENT(IN), DEVICE :: r_d(mesh)
|
||||
!! input: the radial mesh
|
||||
REAL(DP), INTENT(IN), DEVICE :: rab_d(mesh)
|
||||
!! input: the derivative of the radial mesh
|
||||
REAL(DP), INTENT(IN), DEVICE :: rhoc_d(mesh)
|
||||
!! input: the radial core charge
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! input: the volume of the unit cell
|
||||
REAL(DP), INTENT(IN) :: tpiba2
|
||||
!! input: 2 times pi / alat
|
||||
REAL(DP), INTENT(OUT), DEVICE :: drhocg_d(ngl)
|
||||
!! Fourier transform of d Rho_c/dG
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP) :: gx, gl1
|
||||
! the modulus of g for a given shell
|
||||
INTEGER :: igl, blocks, igl0
|
||||
TYPE(dim3) :: threads
|
||||
! counter on radial mesh points
|
||||
! counter on g shells
|
||||
! lower limit for loop on ngl
|
||||
!
|
||||
! G=0 term
|
||||
!
|
||||
gl1 = gl_d(1)
|
||||
IF (gl1 < 1.0d-8) THEN
|
||||
drhocg_d(1) = 0.0_DP
|
||||
igl0 = 2
|
||||
ELSE
|
||||
igl0 = 1
|
||||
ENDIF
|
||||
!
|
||||
! G <> 0 term
|
||||
!
|
||||
threads = dim3(32,8,1)
|
||||
blocks = CEILING(REAL(ngl-igl0+1)/8)
|
||||
CALL compute_drhocg_gpu<<<blocks, threads>>>( ngl-igl0+1, tpiba2, omega, gl_d(igl0), r_d, &
|
||||
rhoc_d, rab_d, mesh, drhocg_d(igl0) )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE deriv_drhoc_gpu
|
||||
!
|
||||
!
|
||||
#else
|
||||
!
|
||||
!
|
||||
SUBROUTINE deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, mesh, r_d, rab_d, rhoc_d, &
|
||||
drhocg_d )
|
||||
!--------------------------------------------------------------------------
|
||||
!! Calculates the Fourier transform of \(d\text{Rho}_c/dG\).
|
||||
!---cuf kernel loop version---
|
||||
!
|
||||
USE kinds
|
||||
USE constants, ONLY: pi, fpi
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE simpsn_gpum, ONLY: simpsn_gpu_dev
|
||||
USE device_fbuff_m, ONLY: dev_buf
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ngl
|
||||
!! input: the number of g shell
|
||||
INTEGER :: mesh
|
||||
!! input: the number of radial mesh points
|
||||
REAL(DP), INTENT(IN) :: gl_d(ngl)
|
||||
!! input: the number of G shells
|
||||
REAL(DP), INTENT(IN) :: r_d(mesh)
|
||||
!! input: the radial mesh
|
||||
REAL(DP), INTENT(IN) :: rab_d(mesh)
|
||||
!! input: the derivative of the radial mesh
|
||||
REAL(DP), INTENT(IN) :: rhoc_d(mesh)
|
||||
!! input: the radial core charge
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! input: the volume of the unit cell
|
||||
REAL(DP), INTENT(IN) :: tpiba2
|
||||
!! input: 2 times pi / alat
|
||||
REAL(DP), INTENT(OUT) :: drhocg_d(ngl)
|
||||
!! Fourier transform of d Rho_c/dG
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP) :: gx, gl1
|
||||
! the modulus of g for a given shell
|
||||
REAL(DP), ALLOCATABLE :: aux_d(:,:)
|
||||
! auxiliary memory for integration
|
||||
INTEGER :: ir, igl, igl0
|
||||
! counter on radial mesh points
|
||||
! counter on g shells
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: gl_d, r_d, rab_d, rhoc_d, drhocg_d, aux_d
|
||||
#endif
|
||||
!
|
||||
! G=0 term
|
||||
!
|
||||
gl1 = gl_d(1)
|
||||
IF (gl1 < 1.0d-8) THEN
|
||||
drhocg_d(1) = 0.0_DP
|
||||
igl0 = 2
|
||||
ELSE
|
||||
igl0 = 1
|
||||
ENDIF
|
||||
!
|
||||
! G <> 0 term
|
||||
!
|
||||
ALLOCATE( aux_d(mesh,ngl) )
|
||||
!
|
||||
!$cuf kernel do (1)
|
||||
DO igl = igl0, ngl
|
||||
gx = SQRT( gl_d(igl) * tpiba2 )
|
||||
DO ir = 1, mesh
|
||||
aux_d(ir,igl) = r_d(ir)*rhoc_d(ir)*( r_d(ir) * COS(gx*r_d(ir)) / &
|
||||
gx - SIN(gx*r_d(ir)) / gx**2 )
|
||||
ENDDO
|
||||
CALL simpsn_gpu_dev( mesh, aux_d(:,igl), rab_d, drhocg_d(igl) )
|
||||
drhocg_d(igl) = fpi / omega * drhocg_d(igl)
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( aux_d )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE deriv_drhoc_gpu
|
||||
!
|
||||
#endif
|
|
@ -12,7 +12,8 @@ SUBROUTINE drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, rhocg )
|
|||
!! Calculates the Fourier transform of the core charge.
|
||||
!
|
||||
USE kinds
|
||||
USE constants, ONLY: pi, fpi
|
||||
USE constants, ONLY: pi, fpi
|
||||
USE upf_acc_interfaces, ONLY: simpson, sph_bes_acc
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -40,51 +41,83 @@ SUBROUTINE drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, rhocg )
|
|||
REAL(DP) :: gx, rhocg1
|
||||
! the modulus of g for a given shell
|
||||
! the Fourier transform
|
||||
REAL(DP), ALLOCATABLE :: aux(:)
|
||||
REAL(DP), ALLOCATABLE :: aux(:), gxv(:)
|
||||
! auxiliary memory for integration
|
||||
INTEGER :: ir, igl, igl0
|
||||
! counter on radial mesh points
|
||||
! counter on g shells
|
||||
! lower limit for loop on ngl
|
||||
!
|
||||
!--------PROVISIONAL------ gl def needs to be adjusted---------
|
||||
ALLOCATE( gxv(ngl) )
|
||||
DO igl = 1, ngl
|
||||
gxv(igl) = SQRT(gl(igl) * tpiba2)
|
||||
ENDDO
|
||||
!--------------------------------------------------------------
|
||||
!
|
||||
!$omp parallel private(aux, gx, rhocg1)
|
||||
#if !defined(_OPENACC)
|
||||
!$omp parallel private(aux,gx,rhocg1)
|
||||
#endif
|
||||
!
|
||||
ALLOCATE( aux(mesh) )
|
||||
#if defined(_OPENACC)
|
||||
!$acc data present_or_copyin(gl,r,rab,rhoc) present_or_copyout(rhocg)
|
||||
!$acc data create(aux)
|
||||
#endif
|
||||
!
|
||||
! G=0 term
|
||||
!
|
||||
#if !defined(_OPENACC)
|
||||
!$omp single
|
||||
#endif
|
||||
IF ( gl(1) < 1.0d-8 ) THEN
|
||||
!$acc parallel loop
|
||||
DO ir = 1, mesh
|
||||
aux(ir) = r(ir)**2 * rhoc(ir)
|
||||
ENDDO
|
||||
!$acc kernels
|
||||
CALL simpson( mesh, aux, rab, rhocg1 )
|
||||
rhocg(1) = fpi * rhocg1 / omega
|
||||
!$acc end kernels
|
||||
igl0 = 2
|
||||
ELSE
|
||||
igl0 = 1
|
||||
ENDIF
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end single
|
||||
#endif
|
||||
!$acc end data
|
||||
!
|
||||
! G <> 0 term
|
||||
!
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop gang private(aux) copyin(gxv)
|
||||
#else
|
||||
!$omp do
|
||||
#endif
|
||||
DO igl = igl0, ngl
|
||||
gx = SQRT(gl(igl) * tpiba2)
|
||||
CALL sph_bes( mesh, r, gx, 0, aux )
|
||||
gx = gxv(igl)
|
||||
CALL sph_bes_acc( mesh, r, gx, 0, aux )
|
||||
!$acc loop vector
|
||||
DO ir = 1, mesh
|
||||
aux(ir) = r(ir)**2 * rhoc(ir) * aux(ir)
|
||||
aux(ir) = r(ir)**2 * rhoc(ir) * aux(ir)
|
||||
ENDDO
|
||||
CALL simpson( mesh, aux, rab, rhocg1 )
|
||||
rhocg(igl) = fpi * rhocg1 / omega
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end do nowait
|
||||
DEALLOCATE( aux )
|
||||
#endif
|
||||
!
|
||||
!$omp end parallel
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE( aux )
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel
|
||||
#endif
|
||||
!
|
||||
DEALLOCATE( gxv )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE drhoc
|
||||
|
||||
|
|
|
@ -7,142 +7,192 @@
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
subroutine dvloc_of_g (mesh, msh, rab, r, vloc_at, zp, tpiba2, ngl, gl, &
|
||||
omega, dvloc)
|
||||
SUBROUTINE dvloc_of_g( mesh, msh, rab, r, vloc_at, zp, tpiba2, ngl, gl, &
|
||||
omega, dvloc )
|
||||
!----------------------------------------------------------------------
|
||||
!
|
||||
! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
|
||||
!! This routine gives:
|
||||
!! \[ \text{dvloc} = D\text{Vloc}(g^2)/Dg^2 = (1/2g)\ D\text{Vloc}(g)/Dg
|
||||
!! \]
|
||||
!
|
||||
USE kinds
|
||||
USE constants , ONLY : pi, fpi, e2, eps8
|
||||
USE constants, ONLY: pi, fpi, e2, eps8
|
||||
USE Coul_cut_2D, ONLY: do_cutoff_2D
|
||||
USE esm, ONLY : do_comp_esm, esm_bc
|
||||
implicit none
|
||||
USE esm, ONLY: do_comp_esm, esm_bc
|
||||
!
|
||||
! first the dummy variables
|
||||
IMPLICIT NONE
|
||||
!
|
||||
integer, intent(in) :: ngl, mesh, msh
|
||||
! the number of shell of G vectors
|
||||
! max number of mesh points
|
||||
! number of mesh points for radial integration
|
||||
|
||||
real(DP), intent(in) :: zp, rab (mesh), r (mesh), vloc_at (mesh), &
|
||||
tpiba2, omega, gl (ngl)
|
||||
! valence pseudocharge
|
||||
! the derivative of the radial grid
|
||||
! the radial grid
|
||||
! the pseudo on the radial grid
|
||||
! 2 pi / alat
|
||||
! the volume of the unit cell
|
||||
! the moduli of g vectors for each s
|
||||
INTEGER, INTENT(IN) :: ngl
|
||||
!! the number of shell of G vectors
|
||||
INTEGER, INTENT(IN) :: mesh
|
||||
!! max number of mesh points
|
||||
INTEGER, INTENT(IN) :: msh
|
||||
!! number of mesh points for radial integration
|
||||
REAL(DP), INTENT(IN) :: zp
|
||||
!! valence pseudocharge
|
||||
REAL(DP), INTENT(IN) :: rab(mesh)
|
||||
!! the derivative of the radial grid
|
||||
REAL(DP), INTENT(IN) :: r(mesh)
|
||||
!! the radial grid
|
||||
REAL(DP), INTENT(IN) :: vloc_at(mesh)
|
||||
!! the pseudo on the radial grid
|
||||
REAL(DP), INTENT(IN) :: tpiba2
|
||||
!! 2 pi / alat
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! the volume of the unit cell
|
||||
REAL(DP), INTENT(IN) :: gl(ngl)
|
||||
!! the moduli of g vectors for each s
|
||||
REAL(DP), INTENT(OUT) :: dvloc(ngl)
|
||||
!! the Fourier transform dVloc/dG
|
||||
!
|
||||
real(DP), intent(out) :: dvloc (ngl)
|
||||
! the fourier transform dVloc/dG
|
||||
! ... local variables
|
||||
!
|
||||
real(DP) :: vlcp, g2a, gx
|
||||
real(DP), allocatable :: aux (:), aux1 (:)
|
||||
|
||||
integer :: i, igl, igl0
|
||||
REAL(DP) :: vlcp, g2a, gx, vlcp_0, vlcp_1
|
||||
REAL(DP), ALLOCATABLE :: aux(:,:), aux1(:)
|
||||
INTEGER :: i, igl, igl0
|
||||
! counter on erf functions or gaussians
|
||||
! counter on g shells vectors
|
||||
! first shell with g != 0
|
||||
|
||||
REAL(DP), PARAMETER :: r12=1.0d0/3.0d0
|
||||
!
|
||||
!$acc data present( dvloc, gl )
|
||||
!
|
||||
! the G=0 component is not computed
|
||||
if (gl (1) < eps8) then
|
||||
dvloc (1) = 0.0d0
|
||||
IF (gl(1) < eps8) THEN
|
||||
!$acc kernels
|
||||
dvloc(1) = 0.0d0
|
||||
!$acc end kernels
|
||||
igl0 = 2
|
||||
else
|
||||
ELSE
|
||||
igl0 = 1
|
||||
endif
|
||||
|
||||
ENDIF
|
||||
!
|
||||
! Pseudopotentials in numerical form (Vloc contains the local part)
|
||||
! In order to perform the Fourier transform, a term erf(r)/r is
|
||||
! subtracted in real space and added again in G space
|
||||
|
||||
allocate (aux1( mesh))
|
||||
!
|
||||
! This is the part of the integrand function
|
||||
! indipendent of |G| in real space
|
||||
ALLOCATE( aux1(mesh) )
|
||||
!
|
||||
do i = 1, msh
|
||||
aux1 (i) = r (i) * vloc_at (i) + zp * e2 * erf (r (i) )
|
||||
enddo
|
||||
! This is the part of the integrand function
|
||||
! indipendent of |G| in real space
|
||||
!
|
||||
!$omp parallel private(aux, gx, vlcp, g2a)
|
||||
ALLOCATE( aux(mesh,ngl) )
|
||||
!
|
||||
allocate (aux( mesh))
|
||||
!$acc data copyin(r,rab) create(aux1,aux)
|
||||
!
|
||||
!$acc parallel loop copyin(vloc_at)
|
||||
DO i = 1, msh
|
||||
aux1(i) = r(i)*vloc_at(i) + zp*e2*ERF(r(i))
|
||||
ENDDO
|
||||
!
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop gang present(aux,aux1,rab,r,dvloc)
|
||||
#else
|
||||
!$omp parallel private( gx, vlcp, vlcp_1, vlcp_0, g2a )
|
||||
!$omp do
|
||||
do igl = igl0, ngl
|
||||
gx = sqrt (gl (igl) * tpiba2)
|
||||
#endif
|
||||
DO igl = igl0, ngl
|
||||
!
|
||||
gx = SQRT(gl(igl)*tpiba2)
|
||||
!
|
||||
! and here we perform the integral, after multiplying for the |G|
|
||||
! dependent part
|
||||
!
|
||||
! DV(g)/Dg = Integral of r (Dj_0(gr)/Dg) V(r) dr
|
||||
do i = 1, msh
|
||||
aux (i) = aux1 (i) * (r (i) * cos (gx * r (i) ) / gx - sin (gx &
|
||||
* r (i) ) / gx**2)
|
||||
enddo
|
||||
call simpson (msh, aux, rab, vlcp)
|
||||
!
|
||||
!$acc loop seq
|
||||
DO i = 1, msh
|
||||
aux(i,igl) = aux1(i)*(r(i)*COS(gx*r(i))/gx - SIN(gx*r(i))/gx**2)
|
||||
ENDDO
|
||||
!
|
||||
!----Simpson int.---
|
||||
vlcp_0 = 0.0d0
|
||||
!$acc loop seq reduction(+:vlcp_0)
|
||||
DO i = 2, msh-1, 2
|
||||
vlcp_0 = vlcp_0 + ( aux(i-1,igl)*rab(i-1) + 4.0d0*aux(i,igl)*rab(i) + &
|
||||
aux(i+1,igl)*rab(i+1) )*r12
|
||||
ENDDO
|
||||
!------
|
||||
vlcp_1 = vlcp_0 * fpi / omega / 2.0d0 / gx
|
||||
!
|
||||
! DV(g^2)/Dg^2 = (DV(g)/Dg)/2g
|
||||
vlcp = fpi / omega / 2.0d0 / gx * vlcp
|
||||
|
||||
! for ESM stress
|
||||
!vlcp = fpi / omega / 2.0d0 / gx * vlcp
|
||||
!
|
||||
! for ESM stress
|
||||
! In ESM, vloc and dvloc have only short term.
|
||||
IF ( ( .not. do_comp_esm ) .or. ( esm_bc .eq. 'pbc' ) ) THEN
|
||||
IF ( (( .NOT. do_comp_esm ) .OR. ( esm_bc .EQ. 'pbc' )) .AND. &
|
||||
.NOT.do_cutoff_2D ) THEN
|
||||
! subtract the long-range term
|
||||
IF (.not.do_cutoff_2D) then ! 2D cutoff: do not re-add LR part here (re-added later in stres_loc)
|
||||
g2a = gl (igl) * tpiba2 / 4.d0
|
||||
vlcp = vlcp + fpi / omega * zp * e2 * exp ( - g2a) * (g2a + &
|
||||
1.d0) / (gl (igl) * tpiba2) **2
|
||||
ENDIF
|
||||
END IF
|
||||
dvloc (igl) = vlcp
|
||||
enddo
|
||||
! 2D cutoff: do not re-add LR part here re-added later in stres_loc)
|
||||
g2a = gl(igl) * tpiba2 / 4.d0
|
||||
vlcp = vlcp_1 + fpi / omega * zp * e2 * EXP(-g2a) * (g2a + 1.d0) / &
|
||||
(gl(igl)*tpiba2)**2
|
||||
ELSE
|
||||
vlcp = vlcp_1
|
||||
ENDIF
|
||||
dvloc(igl) = vlcp
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end do nowait
|
||||
deallocate (aux)
|
||||
!$omp end parallel
|
||||
#else
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
#endif
|
||||
!
|
||||
deallocate (aux1)
|
||||
|
||||
return
|
||||
end subroutine dvloc_of_g
|
||||
DEALLOCATE( aux )
|
||||
DEALLOCATE( aux1 )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dvloc_of_g
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
subroutine dvloc_coul (zp, tpiba2, ngl, gl, omega, dvloc)
|
||||
SUBROUTINE dvloc_coul( zp, tpiba2, ngl, gl, omega, dvloc )
|
||||
!----------------------------------------------------------------------
|
||||
!
|
||||
! Fourier transform of the Coulomb potential - For all-electron
|
||||
! calculations, in specific cases only, for testing purposes
|
||||
!! Fourier transform of the Coulomb potential - For all-electron
|
||||
!! calculations, in specific cases only, for testing purposes.
|
||||
!
|
||||
USE kinds
|
||||
USE constants , ONLY : fpi, e2, eps8
|
||||
implicit none
|
||||
USE constants, ONLY : fpi, e2, eps8
|
||||
!
|
||||
integer, intent(in) :: ngl
|
||||
! the number of shell of G vectors
|
||||
real(DP), intent(in) :: zp, tpiba2, omega, gl (ngl)
|
||||
! valence pseudocharge
|
||||
! 2 pi / alat
|
||||
! the volume of the unit cell
|
||||
! the moduli of g vectors for each s
|
||||
real(DP), intent(out) :: dvloc (ngl)
|
||||
! fourier transform: dvloc = D Vloc (g^2) / D g^2 = 4pi e^2/omegai /G^4
|
||||
IMPLICIT NONE
|
||||
!
|
||||
integer :: igl0
|
||||
INTEGER, INTENT(IN) :: ngl
|
||||
!! the number of shell of G vectors
|
||||
REAL(DP), INTENT(IN) :: zp
|
||||
!! valence pseudocharge
|
||||
REAL(DP), INTENT(IN) :: tpiba2
|
||||
!! 2 pi / alat
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! the volume of the unit cell
|
||||
REAL(DP), INTENT(IN) :: gl(ngl)
|
||||
!! the moduli of g vectors for each s
|
||||
REAL(DP), INTENT(OUT) :: dvloc(ngl)
|
||||
!! Fourier transform:
|
||||
!! dvloc = D Vloc (g^2) / D g^2 = 4pi e^2/omegai /G^4
|
||||
!
|
||||
INTEGER :: igl0
|
||||
! first shell with g != 0
|
||||
|
||||
!
|
||||
!$acc data present( dvloc, gl )
|
||||
!
|
||||
! the G=0 component is 0
|
||||
if (gl (1) < eps8) then
|
||||
dvloc (1) = 0.0d0
|
||||
IF (gl(1) < eps8) THEN
|
||||
!$acc kernels
|
||||
dvloc(1) = 0.0d0
|
||||
!$acc end kernels
|
||||
igl0 = 2
|
||||
else
|
||||
ELSE
|
||||
igl0 = 1
|
||||
endif
|
||||
|
||||
dvloc (igl0:ngl) = fpi * zp * e2 / omega / ( tpiba2 * gl (igl0:ngl) ) ** 2
|
||||
|
||||
return
|
||||
end subroutine dvloc_coul
|
||||
ENDIF
|
||||
!
|
||||
!$acc kernels
|
||||
dvloc(igl0:ngl) = fpi*zp*e2 / omega / (tpiba2*gl(igl0:ngl))**2
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dvloc_coul
|
||||
|
||||
|
|
|
@ -1,252 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
MODULE compute_dvloc_gpum
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
IMPLICIT NONE
|
||||
TYPE(dim3) :: dvloc_threads = dim3(32,8,1)
|
||||
PUBLIC :: dvloc_threads
|
||||
CONTAINS
|
||||
ATTRIBUTES( global ) SUBROUTINE compute_dvloc_gpu( mesh, n, tpiba2, omega, zp, gl, aux1, r, rab ,dvloc, do_cutoff_2D, do_comp_esm, esm_bc_l )
|
||||
!
|
||||
!
|
||||
USE cudafor
|
||||
USE kinds
|
||||
USE constants, ONLY : pi, fpi, e2, eps8
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, VALUE :: n, mesh
|
||||
REAL(DP), VALUE :: zp, tpiba2, omega
|
||||
LOGICAL, VALUE :: do_cutoff_2D, do_comp_esm, esm_bc_l
|
||||
REAL(DP), DEVICE, INTENT(IN) :: rab(mesh), r(mesh), gl(n)
|
||||
REAL(DP), DEVICE, INTENT(IN) :: aux1(mesh)
|
||||
!
|
||||
REAL(DP), DEVICE, INTENT(OUT) :: dvloc(n)
|
||||
! the Fourier transform dVloc/dG
|
||||
!
|
||||
REAL(DP) :: val, mysum
|
||||
REAL(DP) :: g2a, gx
|
||||
!
|
||||
INTEGER :: tx, ty, ir, igl
|
||||
!
|
||||
tx = threadIdx%x
|
||||
ty = threadIdx%y
|
||||
!
|
||||
igl = (blockIdx%x - 1) * blockDim%y + ty
|
||||
!
|
||||
IF (igl > n ) RETURN
|
||||
!
|
||||
gx = SQRT(gl(igl) * tpiba2)
|
||||
mysum = 0.d0
|
||||
!
|
||||
DO ir = tx, mesh, blockDim%x
|
||||
!
|
||||
val = aux1(ir) * ( r(ir) * COS(gx*r(ir)) / gx - SIN(gx*r(ir)) / gx**2 )
|
||||
val = val * rab(ir)
|
||||
!
|
||||
IF (ir == 1 .OR. ir == mesh) THEN
|
||||
mysum = mysum + val
|
||||
ELSEIF ( MOD(ir,2) ) THEN
|
||||
mysum = mysum + 2.d0*val
|
||||
ELSE
|
||||
mysum = mysum + 4.d0*val
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
! Reduce by warp
|
||||
val = __shfl_down(mysum,1)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,2)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,4)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,8)
|
||||
mysum = mysum + val
|
||||
val = __shfl_down(mysum,16)
|
||||
mysum = mysum + val
|
||||
!
|
||||
IF (tx == 1) THEN
|
||||
!
|
||||
mysum = mysum * fpi / omega / 2.0d0 / gx / 3.d0
|
||||
!
|
||||
IF ( (.NOT. do_comp_esm) .OR. esm_bc_l ) THEN
|
||||
IF (.NOT. do_cutoff_2D) THEN
|
||||
g2a = gl(igl) * tpiba2 / 4.d0
|
||||
mysum = mysum + fpi / omega * zp * e2 * EXP(-g2a) * (g2a + &
|
||||
1.d0) / (gl(igl)*tpiba2)**2
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
dvloc(igl) = mysum
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE compute_dvloc_gpu
|
||||
!
|
||||
#endif
|
||||
END MODULE compute_dvloc_gpum
|
||||
|
||||
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE dvloc_of_g_gpu( mesh, msh, rab_d, r_d, vloc_at_d, zp, tpiba2, &
|
||||
ngl, gl_d, omega, dvloc_d ) !, igl0 )
|
||||
!----------------------------------------------------------------------
|
||||
!
|
||||
! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE kinds
|
||||
USE constants, ONLY : pi, fpi, e2, eps8
|
||||
USE Coul_cut_2D, ONLY : do_cutoff_2D
|
||||
USE esm, ONLY : do_comp_esm, esm_bc
|
||||
#if defined(__CUDA)
|
||||
USE compute_dvloc_gpum, ONLY : compute_dvloc_gpu
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! first the dummy variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: mesh, msh, ngl !, igl0
|
||||
! the number of shell of G vectors
|
||||
! max number of mesh points
|
||||
! number of mesh points for radial integration
|
||||
!
|
||||
REAL(DP), INTENT(IN) :: rab_d(mesh), r_d(mesh), vloc_at_d(mesh), gl_d(ngl)
|
||||
REAL(DP), INTENT(IN) :: zp, tpiba2, omega
|
||||
!
|
||||
REAL(DP), INTENT(OUT) :: dvloc_d(ngl)
|
||||
! the Fourier transform dVloc/dG
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: dvloc_d, rab_d, r_d , gl_d, vloc_at_d
|
||||
#endif
|
||||
!
|
||||
! valence pseudocharge
|
||||
! the derivative of the radial grid
|
||||
! the radial grid
|
||||
! the pseudo on the radial grid
|
||||
! 2 pi / alat
|
||||
! the volume of the unit cell
|
||||
! the moduli of g vectors for each s
|
||||
!
|
||||
LOGICAL :: esm_bc_lg
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: aux1_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: aux1_d
|
||||
!
|
||||
TYPE(dim3) :: threads
|
||||
#endif
|
||||
!
|
||||
REAL(DP) :: gl1
|
||||
INTEGER :: i, igl, blocks, igl0
|
||||
!
|
||||
! counter on erf functions or gaussians
|
||||
! counter on g shells vectors
|
||||
! first shell with g != 0
|
||||
!
|
||||
! the G=0 component is not computed
|
||||
gl1 = gl_d(1)
|
||||
IF (gl1 < eps8) THEN
|
||||
dvloc_d(1) = 0.0_DP
|
||||
igl0 = 2
|
||||
ELSE
|
||||
igl0 = 1
|
||||
ENDIF
|
||||
!
|
||||
! Pseudopotentials in numerical form (Vloc contains the local part)
|
||||
! In order to perform the Fourier transform, a term erf(r)/r is
|
||||
! subtracted in real space and added again in G space
|
||||
!
|
||||
!
|
||||
ALLOCATE( aux1_d(mesh) )
|
||||
!
|
||||
! This is the part of the integrand function
|
||||
! indipendent of |G| in real space
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!$cuf kernel do(1)<<<*,*>>>
|
||||
DO i = 1, msh
|
||||
aux1_d(i) = r_d(i) * vloc_at_d(i) + zp * e2 * erf(r_d(i))
|
||||
ENDDO
|
||||
!
|
||||
esm_bc_lg = ( esm_bc == 'pbe' )
|
||||
!
|
||||
threads = dim3(32,8,1)
|
||||
blocks = CEILING(REAL(ngl-igl0+1)/8)
|
||||
CALL compute_dvloc_gpu<<<blocks, threads>>>( msh, ngl-igl0+1, tpiba2, omega, zp, gl_d(igl0), &
|
||||
aux1_d, r_d, rab_d, dvloc_d(igl0), do_cutoff_2D, &
|
||||
do_comp_esm, esm_bc_lg )
|
||||
!
|
||||
#else
|
||||
CALL errore( 'dvloc_of_g_gpu' , 'GPU version of dvloc_of_g called but not compiled.', 0 )
|
||||
#endif
|
||||
DEALLOCATE( aux1_d )
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dvloc_of_g_gpu
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE dvloc_coul_gpu( zp, tpiba2, ngl, gl_d, omega, dvloc_d )
|
||||
!----------------------------------------------------------------------
|
||||
!! Fourier transform of the Coulomb potential - For all-electron
|
||||
!! calculations, in specific cases only, for testing purposes.
|
||||
!
|
||||
USE kinds
|
||||
USE constants, ONLY : fpi, e2, eps8
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ngl
|
||||
! the number of shell of G vectors
|
||||
! first shell with g != 0
|
||||
REAL(DP), INTENT(IN) :: zp, tpiba2, omega
|
||||
! valence pseudocharge
|
||||
! 2 pi / alat
|
||||
! the volume of the unit cell
|
||||
REAL(DP), INTENT(IN) :: gl_d(ngl)
|
||||
! the moduli of g vectors for each s
|
||||
REAL(DP), INTENT(OUT) :: dvloc_d(ngl)
|
||||
! fourier transform: dvloc = D Vloc (g^2) / D g^2 = 4pi e^2/omegai /G^4
|
||||
INTEGER :: j, igl0
|
||||
REAL(DP) :: gl1
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: dvloc_d, gl_d
|
||||
#endif
|
||||
!
|
||||
! the G=0 component is 0
|
||||
gl1 = gl_d(1)
|
||||
IF (gl1 < eps8) then
|
||||
dvloc_d(1) = 0.0_DP
|
||||
igl0 = 2
|
||||
ELSE
|
||||
igl0 = 1
|
||||
ENDIF
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO j = igl0, ngl
|
||||
dvloc_d(j) = fpi * zp * e2 / omega / ( tpiba2 * gl_d(j) )**2
|
||||
ENDDO
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dvloc_coul_gpu
|
|
@ -1134,7 +1134,7 @@ SUBROUTINE dprojdtau_k_gpu( spsi_d, alpha, na, ijkb0, ipol, ik, nb_s, nb_e, myke
|
|||
offsetU_back1, ldim_u, backall, lda_plus_u_kind, &
|
||||
Hubbard_projectors, oatwfc
|
||||
USE wvfct, ONLY : nbnd, npwx, wg
|
||||
USE uspp, ONLY : okvan, nkb, qq_at_d
|
||||
USE uspp, ONLY : okvan, nkb
|
||||
USE uspp_param, ONLY : nh
|
||||
USE becmod_subs_gpum, ONLY : calbec_gpu
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
|
@ -1530,7 +1530,7 @@ SUBROUTINE matrix_element_of_dSdtau_gpu (alpha, ipol, ik, ijkb0, lA, A, lB, B, A
|
|||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
USE cell_base, ONLY : tpiba
|
||||
USE wvfct, ONLY : npwx, wg
|
||||
USE uspp, ONLY : nkb, okvan, vkb, qq_at_d
|
||||
USE uspp, ONLY : nkb, okvan, vkb, qq_at
|
||||
USE uspp_param, ONLY : nh
|
||||
USE klist, ONLY : igk_k_d, ngk
|
||||
USE becmod_subs_gpum, ONLY : calbec_gpu
|
||||
|
@ -1585,10 +1585,10 @@ SUBROUTINE matrix_element_of_dSdtau_gpu (alpha, ipol, ik, ijkb0, lA, A, lB, B, A
|
|||
CALL dev_buf%lock_buffer(qq , [nh(nt),nh(nt)], ierr) ! ALLOCATE ( qq(nh(nt),nh(nt)) )
|
||||
IF ( ierr /= 0 ) CALL errore('matrix_element_of_dSdtau_gpu','Buffers allocation failed',ierr)
|
||||
!
|
||||
!$cuf kernel do(2)
|
||||
!$acc parallel loop collapse(2) present(qq_at)
|
||||
DO jh=1,nh_nt
|
||||
DO ih=1,nh_nt
|
||||
qq(ih,jh) = CMPLX(qq_at_d(ih,jh,alpha), 0.0d0, kind=DP)
|
||||
qq(ih,jh) = CMPLX(qq_at(ih,jh,alpha), 0.0d0, kind=DP)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
|
@ -1703,7 +1703,7 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, &
|
|||
offsetU_back, offsetU_back1, ldim_u, backall, &
|
||||
Hubbard_projectors
|
||||
USE wvfct, ONLY : nbnd, npwx, wg
|
||||
USE uspp, ONLY : nkb, vkb, qq_at_d
|
||||
USE uspp, ONLY : nkb, vkb, qq_at
|
||||
USE uspp_param, ONLY : nh
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE becmod_gpum, ONLY : bec_type_d, becp_d
|
||||
|
@ -1900,12 +1900,12 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, &
|
|||
CALL dev_memset ( betapsi_d, 0.0_dp )
|
||||
!
|
||||
! here starts band parallelization
|
||||
!$cuf kernel do(2)
|
||||
!$acc parallel loop collapse(2) present(qq_at)
|
||||
DO ih = 1, nh_nt
|
||||
DO ibnd = nb_s, nb_e
|
||||
DO jh = 1, nh_nt
|
||||
betapsi_d(ih,ibnd) = betapsi_d(ih,ibnd) + &
|
||||
qq_at_d(ih,jh,alpha) * dbetapsi_d(jh,ibnd)
|
||||
qq_at(ih,jh,alpha) * dbetapsi_d(jh,ibnd)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
|
@ -1918,12 +1918,12 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, &
|
|||
CALL dev_memset ( betapsi_d, 0.0_dp )
|
||||
!
|
||||
becpr_d => becp_d%r_d
|
||||
!$cuf kernel do(2)
|
||||
!$acc parallel loop collapse(2) present(qq_at)
|
||||
DO ih = 1, nh_nt
|
||||
DO ibnd = nb_s, nb_e
|
||||
DO jh = 1, nh_nt
|
||||
betapsi_d(ih,ibnd) = betapsi_d(ih,ibnd) + &
|
||||
qq_at_d(ih,jh,alpha) * betapsi0_d(jh,ibnd)
|
||||
qq_at(ih,jh,alpha) * betapsi0_d(jh,ibnd)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
|
|
|
@ -182,7 +182,7 @@ SUBROUTINE force_us_gpu( forcenl )
|
|||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
USE uspp, ONLY : qq_at_d, deeq_d
|
||||
USE uspp, ONLY : qq_at, deeq
|
||||
USE wvfct_gpum, ONLY : wg_d, using_wg_d, et_d, using_et_d
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -230,9 +230,11 @@ SUBROUTINE force_us_gpu( forcenl )
|
|||
IF ( ityp(na) == nt ) THEN
|
||||
ijkb0 = ofsbeta(na)
|
||||
! this is \sum_j q_{ij} <beta_j|psi>
|
||||
!$acc host_data use_device(qq_at)
|
||||
CALL DGEMM ('N','N', nh(nt), becp_d%nbnd_loc, nh(nt), &
|
||||
1.0_dp, qq_at_d(1,1,na), nhm, becp_d%r_d(ijkb0+1,1),&
|
||||
1.0_dp, qq_at(1,1,na), nhm, becp_d%r_d(ijkb0+1,1),&
|
||||
nkb, 0.0_dp, aux_d, nh(nt) )
|
||||
!$acc end host_data
|
||||
! multiply by -\epsilon_n
|
||||
!$cuf kernel do(2)
|
||||
DO ih = 1, nh_nt
|
||||
|
@ -243,10 +245,12 @@ SUBROUTINE force_us_gpu( forcenl )
|
|||
END DO
|
||||
|
||||
! add \sum_j d_{ij} <beta_j|psi>
|
||||
!$acc host_data use_device(deeq)
|
||||
CALL DGEMM ('N','N', nh(nt), becp_d%nbnd_loc, nh(nt), &
|
||||
1.0_dp, deeq_d(1,1,na,current_spin), nhm, &
|
||||
1.0_dp, deeq(1,1,na,current_spin), nhm, &
|
||||
becp_d%r_d(ijkb0+1,1), nkb, 1.0_dp, aux_d, nh(nt) )
|
||||
|
||||
!$acc end host_data
|
||||
!
|
||||
! Auxiliary variable to perform the reduction with cuf kernels
|
||||
forcenl_ipol = 0.0_dp
|
||||
!$cuf kernel do(2)
|
||||
|
|
|
@ -98,7 +98,7 @@ SUBROUTINE init_run()
|
|||
gg_d = gg
|
||||
END IF
|
||||
#endif
|
||||
!$acc update device(mill, g)
|
||||
!$acc update device(mill, g, gg)
|
||||
!
|
||||
IF (do_comp_esm) CALL esm_init(.NOT. lrism)
|
||||
!
|
||||
|
|
|
@ -9,13 +9,13 @@ MODULE uspp_init
|
|||
!
|
||||
PRIVATE
|
||||
PUBLIC :: init_us_2, gen_us_dj, gen_us_dy
|
||||
PUBLIC :: gen_us_dj_gpu, gen_us_dy_gpu
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE init_us_2( npw_, igk_, q_, vkb_, run_on_gpu_)
|
||||
SUBROUTINE init_us_2( npw_, igk_, q_, vkb_, run_on_gpu_ )
|
||||
!----------------------------------------------------------------------
|
||||
!! wrapper to call init_us_2_base
|
||||
!! Wrapper to call init_us_2_base.
|
||||
!! Calculates beta functions (Kleinman-Bylander projectors), with
|
||||
!! structure factor, for all atoms, in reciprocal space.
|
||||
!
|
||||
|
@ -40,38 +40,42 @@ CONTAINS
|
|||
!! beta functions (npw_ <= npwx)
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: run_on_gpu_
|
||||
!! whether you wish to run on gpu in case use_gpu is true
|
||||
!!
|
||||
!
|
||||
LOGICAL :: run_on_gpu
|
||||
!
|
||||
!
|
||||
run_on_gpu = .false.
|
||||
if(present(run_on_gpu_)) run_on_gpu = run_on_gpu_
|
||||
run_on_gpu = .FALSE.
|
||||
IF (PRESENT(run_on_gpu_)) run_on_gpu = run_on_gpu_
|
||||
!
|
||||
CALL start_clock( 'init_us_2' )
|
||||
if(use_gpu.and.run_on_gpu) then
|
||||
!
|
||||
IF (use_gpu .AND. run_on_gpu) THEN
|
||||
!
|
||||
!$acc data present(igk_(1:npw_), mill(:,:), g(:,:), vkb_(1:npwx,1:nkb), eigts1(:,:), eigts2(:,:), eigts3(:,:))
|
||||
!$acc host_data use_device(eigts1, eigts2, eigts3, mill, g, igk_, vkb_)
|
||||
CALL init_us_2_base_gpu(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega,&
|
||||
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,&
|
||||
vkb_ )
|
||||
!$acc data present( igk_(1:npw_), mill(:,:), g(:,:), vkb_(1:npwx,1:nkb), &
|
||||
!$acc eigts1(:,:), eigts2(:,:), eigts3(:,:) )
|
||||
!$acc host_data use_device( eigts1, eigts2, eigts3, mill, g, igk_, vkb_ )
|
||||
CALL init_us_2_base_gpu( npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
|
||||
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, &
|
||||
eigts3, mill, g, vkb_ )
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
!
|
||||
else
|
||||
CALL init_us_2_base(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
|
||||
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,&
|
||||
vkb_ )
|
||||
end if
|
||||
ELSE
|
||||
!
|
||||
CALL init_us_2_base( npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
|
||||
dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, &
|
||||
eigts3, mill, g, vkb_ )
|
||||
ENDIF
|
||||
!
|
||||
CALL stop_clock( 'init_us_2' )
|
||||
!
|
||||
!
|
||||
END SUBROUTINE init_us_2
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dj ( ik, dvkb )
|
||||
SUBROUTINE gen_us_dj( ik, dvkb )
|
||||
!----------------------------------------------------------------------
|
||||
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
|
||||
!! Wrapper to call gen_us_dj: same as init_us_2, but with the derivative
|
||||
!! of the Bessel functions dj_l/dq instead of j_l(qr) in the integral
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
|
@ -80,7 +84,7 @@ CONTAINS
|
|||
USE gvect, ONLY : eigts1, eigts2, eigts3, mill, g
|
||||
USE wvfct, ONLY : npwx
|
||||
USE uspp, ONLY : nkb
|
||||
USE fft_base , ONLY : dfftp
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE klist, ONLY : xk, ngk, igk_k
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -92,16 +96,18 @@ CONTAINS
|
|||
!
|
||||
! CALL start_clock( 'gen_us_dj' )
|
||||
!
|
||||
CALL gen_us_dj_base (ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
|
||||
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
||||
eigts1, eigts2, eigts3, mill, g, dvkb )
|
||||
!$acc data present_or_copyout( dvkb )
|
||||
CALL gen_us_dj_base( ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
|
||||
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, &
|
||||
dfftp%nr3, eigts1, eigts2, eigts3, mill, g, dvkb )
|
||||
!$acc end data
|
||||
!
|
||||
! CALL stop_clock( 'gen_us_dj' )
|
||||
!
|
||||
END SUBROUTINE gen_us_dj
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dy ( ik, u, dvkb )
|
||||
SUBROUTINE gen_us_dy( ik, u, dvkb )
|
||||
!----------------------------------------------------------------------
|
||||
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
|
||||
!! of the spherical harmonics dY_lm/dq instead of Y_lm(q) in the integral
|
||||
|
@ -126,86 +132,14 @@ CONTAINS
|
|||
!
|
||||
! CALL start_clock( 'gen_us_dy' )
|
||||
!
|
||||
CALL gen_us_dy_base (ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
|
||||
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
||||
eigts1, eigts2, eigts3, mill, g, u, dvkb )
|
||||
!$acc data present_or_copyout( dvkb )
|
||||
CALL gen_us_dy_base( ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, &
|
||||
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, &
|
||||
dfftp%nr3, eigts1, eigts2, eigts3, mill, g, u, dvkb )
|
||||
!$acc end data
|
||||
!
|
||||
! CALL stop_clock( 'gen_us_d' )
|
||||
!
|
||||
END SUBROUTINE gen_us_dy
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dj_gpu ( ik, dvkb )
|
||||
!----------------------------------------------------------------------
|
||||
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
|
||||
!! of the Bessel functions dj_l/dq instead of j_l(qr) in the integral
|
||||
!! (GPU version, FIXME: to be merged)
|
||||
!
|
||||
USE kinds, ONLY : dp
|
||||
USE ions_base, ONLY : nat, ntyp=>nsp, ityp, tau
|
||||
USE cell_base, ONLY : tpiba, omega
|
||||
USE gvect, ONLY : eigts1_d, eigts2_d, eigts3_d, mill_d, g_d
|
||||
USE wvfct, ONLY : npwx
|
||||
USE uspp, ONLY : nkb
|
||||
USE fft_base , ONLY : dfftp
|
||||
USE klist, ONLY : xk, ngk, igk_k_d
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ik
|
||||
!! k-point index
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
|
||||
!! beta functions computed with dj_l/dq
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: dvkb
|
||||
#endif
|
||||
!
|
||||
! CALL start_clock( 'gen_us_dj' )
|
||||
!
|
||||
CALL gen_us_dj_gpu_ (ngk(ik), npwx, igk_k_d(1,ik), xk(1,ik), nat, tau, &
|
||||
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
||||
eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, dvkb )
|
||||
!
|
||||
! CALL stop_clock( 'gen_us_dj' )
|
||||
!
|
||||
END SUBROUTINE gen_us_dj_gpu
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dy_gpu ( ik, u, dvkb )
|
||||
!----------------------------------------------------------------------
|
||||
!! wrapper to call gen_us_dj: same as init_us_2, but with the derivative
|
||||
!! of the spherical harmonics dY_lm/dq instead of Y_lm(q) in the integral
|
||||
!! (GPU version, FIXME: to be merged)
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp=>nsp, ityp, tau
|
||||
USE cell_base, ONLY : tpiba, omega
|
||||
USE gvect, ONLY : eigts1_d, eigts2_d, eigts3_d, mill_d, g_d
|
||||
USE wvfct, ONLY : npwx
|
||||
USE uspp, ONLY : nkb
|
||||
USE fft_base , ONLY : dfftp
|
||||
USE klist, ONLY : xk, ngk, igk_k_d
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ik
|
||||
!! k-point index
|
||||
REAL(dp), INTENT(IN) :: u(3)
|
||||
!! k-point index
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
|
||||
!! beta functions computed with dY_lm/dq
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: dvkb
|
||||
#endif
|
||||
!
|
||||
! CALL start_clock( 'gen_us_dy' )
|
||||
!
|
||||
CALL gen_us_dy_gpu_ (ngk(ik), npwx, igk_k_d(1,ik), xk(1,ik), nat, tau, &
|
||||
ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
||||
eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, u, dvkb )
|
||||
!
|
||||
! CALL stop_clock( 'gen_us_dy' )
|
||||
!
|
||||
END SUBROUTINE gen_us_dy_gpu
|
||||
!
|
||||
END MODULE
|
||||
|
|
|
@ -184,8 +184,8 @@ SUBROUTINE newq( vr, deeq, skip_vltot )
|
|||
ENDDO
|
||||
!
|
||||
DEALLOCATE( qmod, ylmk0, vaux )
|
||||
CALL mp_sum( deeq( :, :, :, 1:nspin_mag ), inter_pool_comm )
|
||||
CALL mp_sum( deeq( :, :, :, 1:nspin_mag ), intra_bgrp_comm )
|
||||
CALL mp_sum( deeq(:,:,:,1:nspin_mag ), inter_pool_comm )
|
||||
CALL mp_sum( deeq(:,:,:,1:nspin_mag ), intra_bgrp_comm )
|
||||
!
|
||||
END SUBROUTINE newq
|
||||
!
|
||||
|
@ -199,7 +199,7 @@ SUBROUTINE newd( )
|
|||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE uspp, ONLY : deeq, deeq_d, dvan, deeq_nc, deeq_nc_d, dvan_so, okvan
|
||||
USE uspp, ONLY : deeq, dvan, deeq_nc, dvan_so, okvan
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
|
||||
USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb
|
||||
USE uspp, ONLY : nhtol, nhtolm
|
||||
|
@ -248,13 +248,15 @@ SUBROUTINE newd( )
|
|||
!
|
||||
ENDDO
|
||||
!
|
||||
#if defined __CUDA
|
||||
if (noncolin) then
|
||||
if (nhm>0) deeq_nc_d=deeq_nc
|
||||
else
|
||||
if (nhm>0) deeq_d=deeq
|
||||
endif
|
||||
#endif
|
||||
IF (noncolin) THEN
|
||||
IF (nhm>0) THEN
|
||||
!$acc update device(deeq_nc)
|
||||
ENDIF
|
||||
ELSE
|
||||
IF (nhm>0) THEN
|
||||
!$acc update device(deeq)
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
! ... early return
|
||||
!
|
||||
|
@ -311,13 +313,15 @@ SUBROUTINE newd( )
|
|||
IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) CALL add_vhub_to_deeq( deeq )
|
||||
!
|
||||
! sync with GPUs
|
||||
#if defined __CUDA
|
||||
if (noncolin) then
|
||||
if (nhm>0) deeq_nc_d=deeq_nc
|
||||
else
|
||||
if (nhm>0) deeq_d=deeq
|
||||
endif
|
||||
#endif
|
||||
IF (noncolin) THEN
|
||||
IF (nhm>0) THEN
|
||||
!$acc update device(deeq_nc)
|
||||
ENDIF
|
||||
ELSE
|
||||
IF (nhm>0) THEN
|
||||
!$acc update device(deeq)
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
CALL stop_clock( 'newd' )
|
||||
!
|
||||
|
|
|
@ -50,9 +50,6 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
|
|||
! Input: potential , output: contribution to integral
|
||||
REAL(kind=dp), intent(in) :: vr(dfftp%nnr,nspin)
|
||||
REAL(kind=dp), intent(out) :: deeq_d( nhm, nhm, nat, nspin )
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deeq_d
|
||||
#endif
|
||||
LOGICAL, intent(in) :: skip_vltot !If .false. vltot is added to vr when necessary
|
||||
! INTERNAL
|
||||
INTEGER :: ngm_s, ngm_e, ngm_l
|
||||
|
@ -70,7 +67,7 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
|
|||
! workaround for cuf kernel limitations
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: vaux_d, aux_d, qgm_d, ylmk0_d, qmod_d, deeaux_d, dfftp_nl_d
|
||||
attributes(DEVICE) :: deeq_d, vaux_d, aux_d, qgm_d, ylmk0_d, qmod_d, deeaux_d, dfftp_nl_d
|
||||
#endif
|
||||
! variable to map index of atoms of the same type
|
||||
INTEGER, ALLOCATABLE :: na_to_nab_h(:)
|
||||
|
@ -87,7 +84,9 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
|
|||
fact = 1.0_dp
|
||||
ENDIF
|
||||
!
|
||||
!$acc kernels
|
||||
deeq_d(:,:,:,:) = 0.D0
|
||||
!$acc end kernels
|
||||
!
|
||||
! With k-point parallelization, distribute G-vectors across processors
|
||||
! ngm_s = index of first G-vector for this processor
|
||||
|
@ -237,7 +236,7 @@ SUBROUTINE newd_gpu( )
|
|||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE uspp, ONLY : deeq, okvan, deeq, deeq_d, deeq_nc, deeq_nc_d, dvan_d, dvan_so_d
|
||||
USE uspp, ONLY : okvan, deeq, deeq_nc, dvan_d, dvan_so_d
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
|
||||
USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb
|
||||
USE uspp, ONLY : nhtol, nhtolm
|
||||
|
@ -273,12 +272,12 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
IF ( lspinorb ) THEN
|
||||
!
|
||||
!$cuf kernel do(4)
|
||||
!$acc parallel loop collapse(4)
|
||||
DO is = 1, nspin
|
||||
DO na = 1, nat
|
||||
DO jh = 1, nht
|
||||
DO ih = 1, nht
|
||||
IF ( ityp_d(na) == nt ) deeq_nc_d(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
|
||||
IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
|
@ -286,15 +285,15 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
ELSE IF ( noncolin ) THEN
|
||||
!
|
||||
!$cuf kernel do(3)
|
||||
!$acc parallel loop collapse(3)
|
||||
DO na = 1, nat
|
||||
DO jh = 1, nht
|
||||
DO ih = 1, nht
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
deeq_nc_d(ih,jh,na,1) = dvan_d(ih,jh,nt)
|
||||
deeq_nc_d(ih,jh,na,2) = ( 0.D0, 0.D0 )
|
||||
deeq_nc_d(ih,jh,na,3) = ( 0.D0, 0.D0 )
|
||||
deeq_nc_d(ih,jh,na,4) = dvan_d(ih,jh,nt)
|
||||
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt)
|
||||
deeq_nc(ih,jh,na,2) = ( 0.D0, 0.D0 )
|
||||
deeq_nc(ih,jh,na,3) = ( 0.D0, 0.D0 )
|
||||
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
|
@ -303,13 +302,13 @@ SUBROUTINE newd_gpu( )
|
|||
ELSE
|
||||
!
|
||||
if ( nht > 0 ) THEN
|
||||
!$cuf kernel do(4)
|
||||
!$acc parallel loop collapse(4)
|
||||
DO is = 1, nspin
|
||||
DO na = 1, nat
|
||||
DO jh = 1, nht
|
||||
DO ih = 1, nht
|
||||
!
|
||||
IF ( ityp_d(na) == nt ) deeq_d(ih,jh,na,is) = dvan_d(ih,jh,nt)
|
||||
IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan_d(ih,jh,nt)
|
||||
!
|
||||
END DO
|
||||
END DO
|
||||
|
@ -328,9 +327,9 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
! ... sync with CPU
|
||||
if (noncolin) then
|
||||
deeq_nc=deeq_nc_d
|
||||
!$acc update self(deeq_nc)
|
||||
else
|
||||
deeq=deeq_d
|
||||
!$acc update self(deeq)
|
||||
endif
|
||||
!
|
||||
RETURN
|
||||
|
@ -346,12 +345,18 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
IF (tqr) THEN
|
||||
CALL newq_r(v%of_r,deeq,.false.)
|
||||
deeq_d=deeq
|
||||
!$acc update device(deeq)
|
||||
ELSE
|
||||
CALL newq_gpu(v%of_r,deeq_d,.false.)
|
||||
!$acc host_data use_device(deeq)
|
||||
CALL newq_gpu(v%of_r,deeq,.false.)
|
||||
!$acc end host_data
|
||||
END IF
|
||||
!
|
||||
IF (noncolin) call add_paw_to_deeq_gpu(deeq_d)
|
||||
IF (noncolin) THEN
|
||||
!$acc host_data use_device(deeq)
|
||||
call add_paw_to_deeq_gpu(deeq)
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
!
|
||||
types : &
|
||||
DO nt = 1, ntyp
|
||||
|
@ -372,13 +377,13 @@ SUBROUTINE newd_gpu( )
|
|||
ELSE if_noncolin
|
||||
!
|
||||
nht = nh(nt)
|
||||
!$cuf kernel do(4)
|
||||
!$acc parallel loop collapse(4)
|
||||
DO is = 1, nspin
|
||||
DO na = 1, nat
|
||||
DO ih = 1, nht
|
||||
DO jh = 1, nht
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
deeq_d(ih,jh,na,is) = deeq_d(ih,jh,na,is) + dvan_d(ih,jh,nt)
|
||||
deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan_d(ih,jh,nt)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
|
@ -389,17 +394,25 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
END DO types
|
||||
!
|
||||
IF (.NOT.noncolin) CALL add_paw_to_deeq_gpu(deeq_d)
|
||||
IF (.NOT.noncolin) THEN
|
||||
!$acc host_data use_device(deeq)
|
||||
CALL add_paw_to_deeq_gpu(deeq)
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
!
|
||||
IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) CALL add_vhub_to_deeq_gpu(deeq_d)
|
||||
IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) THEN
|
||||
!$acc host_data use_device(deeq)
|
||||
CALL add_vhub_to_deeq_gpu(deeq)
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
!
|
||||
CALL buffer%release_buffer(ityp_d, ierr)
|
||||
CALL stop_clock_gpu( 'newd' )
|
||||
!
|
||||
if (noncolin) then
|
||||
deeq_nc=deeq_nc_d
|
||||
!$acc update self(deeq_nc)
|
||||
else
|
||||
deeq=deeq_d
|
||||
!$acc update self(deeq)
|
||||
endif
|
||||
!
|
||||
RETURN
|
||||
|
@ -429,7 +442,7 @@ SUBROUTINE newd_gpu( )
|
|||
ijs = ijs + 1
|
||||
!
|
||||
IF (domag) THEN
|
||||
!$cuf kernel do(3)
|
||||
!$acc parallel loop collapse(3) present(deeq_nc,deeq)
|
||||
DO na = 1, nat
|
||||
!
|
||||
DO ih = 1, nhnt
|
||||
|
@ -438,24 +451,24 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
|
||||
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
|
||||
!
|
||||
DO kh = 1, nhnt
|
||||
!
|
||||
DO lh = 1, nhnt
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,ijs) = deeq_nc_d(ih,jh,na,ijs) + &
|
||||
deeq_d (kh,lh,na,1)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
|
||||
deeq_nc(ih,jh,na,ijs) = deeq_nc(ih,jh,na,ijs) + &
|
||||
deeq(kh,lh,na,1)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
|
||||
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt)) + &
|
||||
deeq_d (kh,lh,na,2)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) + &
|
||||
deeq(kh,lh,na,2)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) + &
|
||||
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,1,is2,nt)) + &
|
||||
(0.D0,-1.D0)*deeq_d (kh,lh,na,3)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) - &
|
||||
(0.D0,-1.D0)*deeq(kh,lh,na,3)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) - &
|
||||
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,1,is2,nt)) + &
|
||||
deeq_d (kh,lh,na,4)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) - &
|
||||
deeq(kh,lh,na,4)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) - &
|
||||
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt))
|
||||
!
|
||||
END DO
|
||||
|
@ -471,7 +484,7 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
!$cuf kernel do(3) <<<*,*>>>
|
||||
!$acc parallel loop collapse(3) present(deeq_nc,deeq)
|
||||
DO na = 1, nat
|
||||
!
|
||||
DO ih = 1, nhnt
|
||||
|
@ -480,15 +493,15 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
|
||||
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
|
||||
!
|
||||
DO kh = 1, nhnt
|
||||
!
|
||||
DO lh = 1, nhnt
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,ijs) = deeq_nc_d(ih,jh,na,ijs) + &
|
||||
deeq_d (kh,lh,na,1)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
|
||||
deeq_nc(ih,jh,na,ijs) = deeq_nc(ih,jh,na,ijs) + &
|
||||
deeq(kh,lh,na,1)* &
|
||||
(fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + &
|
||||
fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt) )
|
||||
!
|
||||
END DO
|
||||
|
@ -526,42 +539,38 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
nhnt = nh(nt)
|
||||
!
|
||||
!$cuf kernel do(3)
|
||||
!$acc parallel loop collapse(3) present(deeq_nc,deeq)
|
||||
DO na = 1, nat
|
||||
!
|
||||
DO ih = 1, nhnt
|
||||
!
|
||||
DO jh = 1, nhnt
|
||||
!
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
!
|
||||
IF (lspinorb) THEN
|
||||
deeq_nc_d(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
|
||||
deeq_d(ih,jh,na,1) + deeq_d(ih,jh,na,4)
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
|
||||
deeq_d(ih,jh,na,1) - deeq_d(ih,jh,na,4)
|
||||
deeq_nc(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
|
||||
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
|
||||
!
|
||||
deeq_nc(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
|
||||
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
|
||||
!
|
||||
ELSE
|
||||
deeq_nc_d(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
|
||||
deeq_d(ih,jh,na,1) + deeq_d(ih,jh,na,4)
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
|
||||
deeq_d(ih,jh,na,1) - deeq_d(ih,jh,na,4)
|
||||
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
|
||||
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
|
||||
!
|
||||
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
|
||||
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
|
||||
!
|
||||
END IF
|
||||
deeq_nc_d(ih,jh,na,2) = deeq_d(ih,jh,na,2) - &
|
||||
( 0.D0, 1.D0 ) * deeq_d(ih,jh,na,3)
|
||||
!
|
||||
deeq_nc_d(ih,jh,na,3) = deeq_d(ih,jh,na,2) + &
|
||||
( 0.D0, 1.D0 ) * deeq_d(ih,jh,na,3)
|
||||
deeq_nc(ih,jh,na,2) = deeq(ih,jh,na,2) - &
|
||||
( 0.D0, 1.D0 ) * deeq(ih,jh,na,3)
|
||||
!
|
||||
deeq_nc(ih,jh,na,3) = deeq(ih,jh,na,2) + &
|
||||
( 0.D0, 1.D0 ) * deeq(ih,jh,na,3)
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
!
|
||||
RETURN
|
||||
|
|
|
@ -230,10 +230,10 @@ SUBROUTINE post_xml_init ( )
|
|||
g_d = g
|
||||
gg_d = gg
|
||||
#endif
|
||||
!$acc update device(mill, g)
|
||||
!$acc update device(mill, g, gg)
|
||||
!
|
||||
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
|
||||
CALL gshells ( lmovecell )
|
||||
CALL gshells ( lmovecell )
|
||||
!
|
||||
IF (do_comp_esm) CALL esm_init()
|
||||
IF (do_cutoff_2D) CALL cutoff_fact()
|
||||
|
|
|
@ -228,7 +228,7 @@ MODULE realus
|
|||
USE constants, ONLY : pi, fpi, eps16, eps6
|
||||
USE ions_base, ONLY : nat, nsp, ityp, tau
|
||||
USE cell_base, ONLY : at, bg, alat
|
||||
USE uspp, ONLY : okvan, qq_at, qq_at_d, qq_nt, nhtol
|
||||
USE uspp, ONLY : okvan, qq_at, qq_at, qq_nt, nhtol
|
||||
USE uspp_param, ONLY : upf, nh
|
||||
USE atom, ONLY : rgrid
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
|
@ -404,9 +404,8 @@ MODULE realus
|
|||
! and sync on GPUs
|
||||
!
|
||||
CALL mp_sum( qq_at, intra_bgrp_comm )
|
||||
#if defined __CUDA
|
||||
qq_at_d=qq_at
|
||||
#endif
|
||||
!
|
||||
!$acc update device(qq_at)
|
||||
!
|
||||
! and test that they don't differ too much
|
||||
! from the result computed on the atomic grid
|
||||
|
@ -455,7 +454,7 @@ MODULE realus
|
|||
!! Sync with GPU memory is performed outside
|
||||
!
|
||||
USE constants, ONLY : eps16, eps6
|
||||
USE uspp, ONLY : indv, nhtolm, ap, qq_at, qq_at_d
|
||||
USE uspp, ONLY : indv, nhtolm, ap, qq_at
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh
|
||||
USE atom, ONLY : rgrid
|
||||
USE splinelib, ONLY : spline, splint
|
||||
|
|
|
@ -216,7 +216,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
!
|
||||
USE mp, ONLY : mp_get_comm_null, mp_circular_shift_left
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
USE uspp, ONLY : qq_at_d
|
||||
USE uspp, ONLY : qq_at
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -273,9 +273,12 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
|
||||
!
|
||||
IF ( m_loc > 0 ) THEN
|
||||
!$acc host_data use_device(qq_at)
|
||||
CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
|
||||
qq_at_d(1,1,na), nhm, becp_d%r_d(ofsbeta(na)+1,1),&
|
||||
qq_at(1,1,na), nhm, becp_d%r_d(ofsbeta(na)+1,1),&
|
||||
nkb, 0.0_dp, ps_d(ofsbeta(na)+1,1), nkb )
|
||||
!$acc end host_data
|
||||
|
||||
END IF
|
||||
END IF
|
||||
END DO
|
||||
|
@ -340,7 +343,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
!! k-points version of \(\textrm{s_psi}\) routine.
|
||||
!
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
USE uspp, ONLY : qq_at_d
|
||||
USE uspp, ONLY : qq_at
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -367,17 +370,17 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
! qq is real: copy it into a complex variable to perform
|
||||
! a zgemm - simple but sub-optimal solution
|
||||
!
|
||||
! here we need to use qq_at_d instead of qq_nt_d otherwise real space augmentation brakes!
|
||||
! here we need to use qq_at (device) instead of qq_nt_d otherwise real space augmentation brakes!
|
||||
! qq_nt_d would be much faster and works for calculations without real space augmentation
|
||||
CALL dev_buf%lock_buffer( qqc_d, (/ nhm, nhm, nat/), ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' s_psi_k_gpu ', ' cannot allocate buffer (qqc_d) ', ABS(ierr) )
|
||||
|
||||
!$cuf kernel do(3) <<<*,*>>>
|
||||
!$acc parallel loop collapse(3) present(qq_at)
|
||||
DO na = 1, nat
|
||||
DO jh = 1, nhm
|
||||
DO ih = 1, nhm
|
||||
qqc_d(ih,jh, na) = CMPLX ( qq_at_d(ih,jh, na), 0.0_dp, KIND=dp )
|
||||
qqc_d(ih,jh, na) = CMPLX ( qq_at(ih,jh, na), 0.0_dp, KIND=dp )
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
|
@ -427,7 +430,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
!! k-points noncolinear/spinorbit version of \(\textrm{s_psi}\) routine.
|
||||
!
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
USE uspp, ONLY : qq_at_d, qq_so_d
|
||||
USE uspp, ONLY : qq_at, qq_so
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -455,11 +458,11 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
IF( ierr /= 0 .and. ierr /= -1 ) &
|
||||
CALL errore( ' s_psi_nc_gpu ', ' cannot allocate buffer (qqc_d) ', ABS(ierr) )
|
||||
! Possibly convert only what's needed??
|
||||
!$cuf kernel do(3) <<<*,*>>>
|
||||
!$acc parallel loop collapse(3) present(qq_at)
|
||||
DO na = 1, nat
|
||||
DO jh = 1, nhm
|
||||
DO ih = 1, nhm
|
||||
qqc_d(ih, jh, na) = CMPLX ( qq_at_d(ih,jh, na), 0.0_dp, KIND=dp )
|
||||
qqc_d(ih, jh, na) = CMPLX ( qq_at(ih,jh, na), 0.0_dp, KIND=dp )
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
|
@ -482,19 +485,21 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
|||
ELSE
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
!$acc host_data use_device(qq_so)
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
qq_so_d(1,1,1,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
|
||||
qq_so(1,1,1,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
|
||||
(0.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,1,1), nkb*npol )
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
qq_so_d(1,1,2,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
|
||||
qq_so(1,1,2,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
|
||||
(1.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,1,1), nkb*npol )
|
||||
!
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
qq_so_d(1,1,3,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
|
||||
qq_so(1,1,3,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, &
|
||||
(0.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,2,1), nkb*npol )
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
qq_so_d(1,1,4,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
|
||||
qq_so(1,1,4,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, &
|
||||
(1.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,2,1), nkb*npol )
|
||||
!$acc end host_data
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
|
|
|
@ -7,8 +7,9 @@
|
|||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine stres_cc( sigmaxcc )
|
||||
SUBROUTINE stres_cc( sigmaxcc )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Core correction term of the stress.
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE atom, ONLY : rgrid, msh
|
||||
|
@ -17,7 +18,7 @@ subroutine stres_cc( sigmaxcc )
|
|||
USE cell_base, ONLY : alat, omega, tpiba, tpiba2
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE fft_rho, ONLY : rho_r2g
|
||||
USE gvect, ONLY : ngm, gstart, g, gg, ngl, gl, igtongl
|
||||
USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, g, gg
|
||||
USE ener, ONLY : etxc, vtxc
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE scf, ONLY : rho, rho_core, rhog_core
|
||||
|
@ -26,81 +27,135 @@ subroutine stres_cc( sigmaxcc )
|
|||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
implicit none
|
||||
! output
|
||||
real(DP) :: sigmaxcc(3,3)
|
||||
! local variables
|
||||
IMPLICIT NONE
|
||||
!
|
||||
integer :: nt, ng, l, m, ir
|
||||
! counters
|
||||
real(DP) :: fact, sigmadiag
|
||||
real(DP), allocatable:: rhocg(:), vxc(:,:)
|
||||
complex(DP), allocatable :: vaux(:,:)
|
||||
REAL(DP), INTENT(OUT) :: sigmaxcc(3,3)
|
||||
!
|
||||
sigmaxcc(:,:) = 0.d0
|
||||
if ( ANY (upf(1:ntyp)%nlcc) ) goto 15
|
||||
! ... local variables
|
||||
!
|
||||
return
|
||||
INTEGER :: nt, ng, l, m, ir
|
||||
REAL(DP) :: fact
|
||||
REAL(DP), ALLOCATABLE :: rhocg(:), vxc(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: vaux(:,:)
|
||||
!
|
||||
15 continue
|
||||
REAL(DP) :: rhocg1, sigma_rid, sigmadiag
|
||||
REAL(DP) :: sigma1, sigma2, sigma3, &
|
||||
sigma4, sigma5, sigma6
|
||||
!
|
||||
! recalculate the exchange-correlation potential
|
||||
sigmaxcc(:,:) = 0._DP
|
||||
!
|
||||
allocate( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) )
|
||||
IF ( .NOT. ANY( upf(1:ntyp)%nlcc ) ) RETURN
|
||||
!
|
||||
call v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc )
|
||||
! ... recalculate the exchange-correlation potential
|
||||
!
|
||||
if ( nspin==2 ) then
|
||||
do ir = 1, dfftp%nnr
|
||||
ALLOCATE( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) )
|
||||
!
|
||||
CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc )
|
||||
!
|
||||
!$acc data create( vaux )
|
||||
!$acc data copyin( vxc )
|
||||
IF ( nspin==2 ) then
|
||||
!$acc parallel loop
|
||||
DO ir = 1, dfftp%nnr
|
||||
vxc(ir,1) = 0.5d0 * ( vxc(ir,1) + vxc(ir,2) )
|
||||
enddo
|
||||
endif
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
call rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) )
|
||||
CALL rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) )
|
||||
!
|
||||
! vaux contains now Vxc(G)
|
||||
!$acc end data
|
||||
DEALLOCATE( vxc )
|
||||
!
|
||||
allocate(rhocg(ngl))
|
||||
sigmadiag = 0.0d0
|
||||
if (gamma_only) then
|
||||
fact = 2.d0
|
||||
else
|
||||
fact = 1.d0
|
||||
end if
|
||||
do nt = 1, ntyp
|
||||
if ( upf(nt)%nlcc ) then
|
||||
call drhoc( ngl, gl, omega, tpiba2, msh(nt), rgrid(nt)%r, &
|
||||
rgrid(nt)%rab, upf(nt)%rho_atc, rhocg )
|
||||
! diagonal term
|
||||
if (gstart==2) then
|
||||
sigmadiag = sigmadiag + CONJG(vaux(1,1)) * strf(1,nt) * rhocg(igtongl(1))
|
||||
endif
|
||||
do ng = gstart, ngm
|
||||
sigmadiag = sigmadiag + CONJG(vaux(ng,1)) * &
|
||||
strf(ng,nt) * rhocg(igtongl(ng)) * fact
|
||||
enddo
|
||||
! ... vaux contains now Vxc(G)
|
||||
!
|
||||
sigmadiag = 0._DP
|
||||
!
|
||||
fact = 1._DP
|
||||
IF (gamma_only) fact = 2._DP
|
||||
!
|
||||
!$acc data copyin( gl, strf, igtongl )
|
||||
!
|
||||
ALLOCATE( rhocg(ngl) )
|
||||
!$acc data create( rhocg )
|
||||
!
|
||||
sigma1 = 0._DP ; sigma4 = 0._DP
|
||||
sigma2 = 0._DP ; sigma5 = 0._DP
|
||||
sigma3 = 0._DP ; sigma6 = 0._DP
|
||||
!
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
IF ( upf(nt)%nlcc ) THEN
|
||||
!
|
||||
call deriv_drhoc( ngl, gl, omega, tpiba2, msh(nt), &
|
||||
rgrid(nt)%r, rgrid(nt)%rab, upf(nt)%rho_atc, rhocg )
|
||||
! non diagonal term (g=0 contribution missing)
|
||||
do ng = gstart, ngm
|
||||
do l = 1, 3
|
||||
do m = 1, 3
|
||||
sigmaxcc(l,m) = sigmaxcc(l,m) + CONJG(vaux(ng,1)) * strf(ng,nt) * &
|
||||
rhocg(igtongl(ng)) * tpiba * g(l,ng) * g(m,ng) / &
|
||||
sqrt(gg(ng)) * fact
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
endif
|
||||
enddo
|
||||
!$acc data copyin(rgrid(nt:nt),upf(nt:nt))
|
||||
!$acc data copyin(rgrid(nt)%r,rgrid(nt)%rab,upf(nt)%rho_atc)
|
||||
|
||||
CALL drhoc( ngl, gl, omega, tpiba2, msh(nt), rgrid(nt)%r, &
|
||||
rgrid(nt)%rab, upf(nt)%rho_atc, rhocg )
|
||||
!
|
||||
! ... diagonal term
|
||||
IF (gstart==2) THEN
|
||||
!$acc kernels
|
||||
rhocg1 = rhocg(igtongl(1))
|
||||
sigmadiag = sigmadiag + DBLE(CONJG(vaux(1,1)) * &
|
||||
strf(1,nt)) * rhocg1
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigmadiag)
|
||||
DO ng = gstart, ngm
|
||||
sigmadiag = sigmadiag + DBLE(CONJG(vaux(ng,1)) * &
|
||||
strf(ng,nt)) * rhocg(igtongl(ng)) * fact
|
||||
ENDDO
|
||||
!
|
||||
CALL deriv_drhoc( ngl, gl, omega, tpiba2, msh(nt), &
|
||||
rgrid(nt)%r, rgrid(nt)%rab, upf(nt)%rho_atc, &
|
||||
rhocg )
|
||||
!
|
||||
! ... non diagonal term (g=0 contribution missing)
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6)
|
||||
DO ng = gstart, ngm
|
||||
!
|
||||
sigma_rid = DBLE(CONJG(vaux(ng,1)) * strf(ng,nt)) * &
|
||||
rhocg(igtongl(ng)) * tpiba / SQRT(gg(ng)) * fact
|
||||
!
|
||||
sigma1 = sigma1 + sigma_rid * g(1,ng)*g(1,ng)
|
||||
sigma2 = sigma2 + sigma_rid * g(1,ng)*g(2,ng)
|
||||
sigma3 = sigma3 + sigma_rid * g(1,ng)*g(3,ng)
|
||||
sigma4 = sigma4 + sigma_rid * g(2,ng)*g(2,ng)
|
||||
sigma5 = sigma5 + sigma_rid * g(3,ng)*g(2,ng)
|
||||
sigma6 = sigma6 + sigma_rid * g(3,ng)*g(3,ng)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
do l = 1, 3
|
||||
!$acc end data
|
||||
!
|
||||
sigmaxcc(1,1) = sigma1 ; sigmaxcc(2,3) = sigma5
|
||||
sigmaxcc(1,2) = sigma2 ; sigmaxcc(3,1) = sigma3
|
||||
sigmaxcc(1,3) = sigma3 ; sigmaxcc(3,2) = sigma5
|
||||
sigmaxcc(2,1) = sigma2 ; sigmaxcc(3,3) = sigma6
|
||||
sigmaxcc(2,2) = sigma4
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmaxcc(l,l) = sigmaxcc(l,l) + sigmadiag
|
||||
enddo
|
||||
call mp_sum( sigmaxcc, intra_bgrp_comm )
|
||||
deallocate( rhocg )
|
||||
deallocate( vaux, vxc )
|
||||
return
|
||||
end subroutine stres_cc
|
||||
ENDDO
|
||||
!
|
||||
CALL mp_sum( sigmaxcc, intra_bgrp_comm )
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE( rhocg )
|
||||
DEALLOCATE( vaux )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_cc
|
||||
|
||||
|
|
|
@ -1,178 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE stres_cc_gpu( sigmaxcc )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE atom, ONLY : rgrid, msh
|
||||
USE uspp_param, ONLY : upf
|
||||
USE ions_base, ONLY : ntyp => nsp
|
||||
USE cell_base, ONLY : alat, omega, tpiba, tpiba2
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE fft_rho, ONLY : rho_r2g
|
||||
USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, igtongl_d
|
||||
USE ener, ONLY : etxc, vtxc
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE scf, ONLY : rho, rho_core, rhog_core
|
||||
USE vlocal, ONLY : strf
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
USE gvect, ONLY : g_d, gg_d
|
||||
#if defined(__CUDA)
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! output
|
||||
REAL(DP) :: sigmaxcc(3,3)
|
||||
! local variables
|
||||
!
|
||||
INTEGER :: nt, ng, l, m, ir
|
||||
! counters
|
||||
REAL(DP) :: fact
|
||||
REAL(DP), ALLOCATABLE :: vxc(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: vaux(:,:)
|
||||
!
|
||||
REAL(DP), POINTER :: rhocg_d(:), r_d(:), rab_d(:), rhoc_d(:), gl_d(:)
|
||||
COMPLEX(DP), POINTER :: strf_d(:)
|
||||
!
|
||||
INTEGER :: maxmesh, ierrs(6)
|
||||
REAL(DP) :: sigma_rid, sigmadiag
|
||||
REAL(DP) :: sigma1, sigma2, sigma3, &
|
||||
sigma4, sigma5, sigma6
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: rhocg_d, r_d, rab_d, rhoc_d, gl_d, strf_d
|
||||
!
|
||||
sigmaxcc(:,:) = 0._DP
|
||||
IF ( ANY( upf(1:ntyp)%nlcc ) ) GOTO 15
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
15 CONTINUE
|
||||
!
|
||||
! recalculate the exchange-correlation potential
|
||||
!
|
||||
ALLOCATE( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) )
|
||||
!
|
||||
CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc )
|
||||
!
|
||||
!$acc data copyin(vxc) create(vaux)
|
||||
!
|
||||
IF ( nspin==2 ) THEN
|
||||
!$acc parallel loop
|
||||
DO ir = 1, dfftp%nnr
|
||||
vxc(ir,1) = 0.5d0 * ( vxc(ir,1) + vxc(ir,2) )
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
CALL rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) )
|
||||
!
|
||||
! vaux contains now Vxc(G)
|
||||
!
|
||||
sigmadiag = 0._DP
|
||||
!
|
||||
fact = 1._DP
|
||||
IF (gamma_only) fact = 2._DP
|
||||
!
|
||||
maxmesh = MAXVAL(msh(1:ntyp))
|
||||
CALL dev_buf%lock_buffer( gl_d, ngl, ierrs(1) )
|
||||
CALL dev_memcpy( gl_d, gl, (/ 1, ngl /) )
|
||||
CALL dev_buf%lock_buffer( rhocg_d, ngl, ierrs(2) )
|
||||
CALL dev_buf%lock_buffer( r_d, maxmesh, ierrs(3) )
|
||||
CALL dev_buf%lock_buffer( rab_d, maxmesh, ierrs(4) )
|
||||
CALL dev_buf%lock_buffer( rhoc_d,maxmesh, ierrs(5) )
|
||||
CALL dev_buf%lock_buffer( strf_d, ngm, ierrs(6) )
|
||||
IF (ANY(ierrs /= 0)) CALL errore( 'stres_cc_gpu', 'cannot allocate buffers', -1 )
|
||||
!
|
||||
sigma1 = 0._DP ; sigma4 = 0._DP
|
||||
sigma2 = 0._DP ; sigma5 = 0._DP
|
||||
sigma3 = 0._DP ; sigma6 = 0._DP
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
IF ( upf(nt)%nlcc ) THEN
|
||||
!
|
||||
CALL dev_memcpy( strf_d, strf(:,nt), (/1, ngm/) )
|
||||
CALL dev_memcpy( r_d, rgrid(nt)%r, (/1, msh(nt)/) )
|
||||
CALL dev_memcpy( rab_d, rgrid(nt)%rab, (/1, msh(nt)/) )
|
||||
CALL dev_memcpy( rhoc_d, upf(nt)%rho_atc, (/1, msh(nt)/) )
|
||||
!
|
||||
CALL drhoc_gpu( ngl, gl_d, omega, tpiba2, msh(nt), r_d, &
|
||||
rab_d, rhoc_d, rhocg_d )
|
||||
!
|
||||
! diagonal term
|
||||
IF (gstart==2) THEN
|
||||
!$acc kernels
|
||||
sigmadiag = sigmadiag + DBLE(CONJG(vaux(1,1))*strf_d(1)) * &
|
||||
rhocg_d(igtongl_d(1))
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ng = gstart, ngm
|
||||
sigmadiag = sigmadiag + DBLE(CONJG(vaux(ng,1)) * strf_d(ng)) * &
|
||||
rhocg_d(igtongl_d(ng)) * fact
|
||||
ENDDO
|
||||
!
|
||||
CALL deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, msh(nt), &
|
||||
r_d, rab_d, rhoc_d, rhocg_d )
|
||||
!
|
||||
! non diagonal term (g=0 contribution missing)
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6)
|
||||
DO ng = gstart, ngm
|
||||
!
|
||||
sigma_rid = DBLE(CONJG(vaux(ng,1)) &
|
||||
* strf_d(ng)) * rhocg_d(igtongl_d(ng)) * tpiba &
|
||||
/ SQRT(gg_d(ng)) * fact
|
||||
!
|
||||
sigma1 = sigma1 + sigma_rid * g_d(1,ng)*g_d(1,ng)
|
||||
sigma2 = sigma2 + sigma_rid * g_d(1,ng)*g_d(2,ng)
|
||||
sigma3 = sigma3 + sigma_rid * g_d(1,ng)*g_d(3,ng)
|
||||
sigma4 = sigma4 + sigma_rid * g_d(2,ng)*g_d(2,ng)
|
||||
sigma5 = sigma5 + sigma_rid * g_d(3,ng)*g_d(2,ng)
|
||||
sigma6 = sigma6 + sigma_rid * g_d(3,ng)*g_d(3,ng)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
sigmaxcc(1,1) = sigma1 ; sigmaxcc(2,3) = sigma5
|
||||
sigmaxcc(1,2) = sigma2 ; sigmaxcc(3,1) = sigma3
|
||||
sigmaxcc(1,3) = sigma3 ; sigmaxcc(3,2) = sigma5
|
||||
sigmaxcc(2,1) = sigma2 ; sigmaxcc(3,3) = sigma6
|
||||
sigmaxcc(2,2) = sigma4
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmaxcc(l,l) = sigmaxcc(l,l) + sigmadiag
|
||||
ENDDO
|
||||
!
|
||||
CALL mp_sum( sigmaxcc, intra_bgrp_comm )
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( vxc, vaux )
|
||||
CALL dev_buf%release_buffer( gl_d, ierrs(1) )
|
||||
CALL dev_buf%release_buffer( rhocg_d,ierrs(2) )
|
||||
CALL dev_buf%release_buffer( r_d, ierrs(3) )
|
||||
CALL dev_buf%release_buffer( rab_d, ierrs(4) )
|
||||
CALL dev_buf%release_buffer( rhoc_d, ierrs(5) )
|
||||
CALL dev_buf%release_buffer( strf_d, ierrs(6) )
|
||||
#endif
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_cc_gpu
|
||||
|
|
@ -53,7 +53,7 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
|
|||
REAL(DP), INTENT(IN) :: gcutm
|
||||
!! input: cut-off of g vectors
|
||||
REAL(DP), INTENT(OUT) :: sigmaewa(3,3)
|
||||
! output: the ewald stress
|
||||
!! output: the ewald stress
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
|
@ -85,6 +85,9 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
|
|||
! diagonal term
|
||||
! nondiagonal term
|
||||
COMPLEX(DP) :: rhostar
|
||||
REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
|
||||
!
|
||||
!$acc data present( g, gg )
|
||||
!
|
||||
tpiba2 = (tpi / alat)**2
|
||||
sigmaewa(:,:) = 0.d0
|
||||
|
@ -94,85 +97,113 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
|
|||
charge = charge + zv(ityp(na))
|
||||
ENDDO
|
||||
!
|
||||
! choose alpha in order to have convergence in the sum over G
|
||||
! upperbound is a safe upper bound for the error ON THE ENERGY
|
||||
! ... choose alpha in order to have convergence in the sum over G
|
||||
! upperbound is a safe upper bound for the error ON THE ENERGY
|
||||
!
|
||||
alpha = 2.9d0
|
||||
12 alpha = alpha - 0.1d0
|
||||
!
|
||||
IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
|
||||
upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * &
|
||||
erfc ( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) )
|
||||
ERFC( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) )
|
||||
!
|
||||
IF (upperbound > 1d-7) GOTO 12
|
||||
!
|
||||
! G-space sum here
|
||||
!
|
||||
! Determine if this processor contains G=0 and set the constant term
|
||||
!
|
||||
! ... Determine if this processor contains G=0 and set the constant term
|
||||
! sdewald is the diagonal term
|
||||
IF (gstart == 2) THEN
|
||||
sdewald = tpi * e2 / 4.d0 / alpha * (charge / omega)**2
|
||||
ELSE
|
||||
sdewald = 0.d0
|
||||
ENDIF
|
||||
!
|
||||
! sdewald is the diagonal term
|
||||
IF (gamma_only) THEN
|
||||
fact = 2.d0
|
||||
fact = 2.d0
|
||||
ELSE
|
||||
fact = 1.d0
|
||||
ENDIF
|
||||
!
|
||||
! ... G-space sum here below
|
||||
!
|
||||
IF (do_cutoff_2D) THEN
|
||||
!
|
||||
CALL cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa )
|
||||
!
|
||||
ELSE
|
||||
!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau, zv, ityp, nat, omega, fact)&
|
||||
!$omp &private(g2, g2a, rhostar, na, arg, l, m, sewald)&
|
||||
!$omp &reduction(+:sigmaewa,sdewald)
|
||||
!
|
||||
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
|
||||
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
|
||||
!
|
||||
#if !defined(_OPENACC)
|
||||
!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau,&
|
||||
!$omp nat, zv, ityp, omega, fact) private(g2,g2a, rhostar, na, arg,&
|
||||
!$omp sewald) reduction(+:sdewald,sigma11,sigma21,sigma22,sigma31,&
|
||||
!$omp sigma32,sigma33)
|
||||
#else
|
||||
!$acc parallel loop copyin(tau,zv,ityp) reduction(+:sigma11,sigma21,sigma22,&
|
||||
!$acc sigma31,sigma32,sigma33) reduction(-:sdewald)
|
||||
#endif
|
||||
DO ng = gstart, ngm
|
||||
g2 = gg (ng) * tpiba2
|
||||
g2a = g2 / 4.d0 / alpha
|
||||
rhostar = (0.d0, 0.d0)
|
||||
g2 = gg(ng) * tpiba2
|
||||
g2a = g2 / 4._DP / alpha
|
||||
rhostar = (0._DP,0._DP)
|
||||
DO na = 1, nat
|
||||
arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
|
||||
g(3,ng) * tau(3,na) ) * tpi
|
||||
rhostar = rhostar + zv(ityp(na)) * CMPLX(COS(arg), SIN(arg), KIND=DP)
|
||||
rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP)
|
||||
ENDDO
|
||||
rhostar = rhostar / omega
|
||||
rhostar = rhostar / CMPLX(omega)
|
||||
sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2
|
||||
sdewald = sdewald - sewald
|
||||
DO l = 1, 3
|
||||
DO m = 1, l
|
||||
sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * &
|
||||
g(l,ng) * g(m,ng) / g2 * (g2a + 1)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
|
||||
g(1,ng) * g(1,ng) / g2 * (g2a + 1)
|
||||
sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
|
||||
g(2,ng) * g(1,ng) / g2 * (g2a + 1)
|
||||
sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
|
||||
g(2,ng) * g(2,ng) / g2 * (g2a + 1)
|
||||
sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
|
||||
g(3,ng) * g(1,ng) / g2 * (g2a + 1)
|
||||
sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
|
||||
g(3,ng) * g(2,ng) / g2 * (g2a + 1)
|
||||
sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
|
||||
g(3,ng) * g(3,ng) / g2 * (g2a + 1)
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel do
|
||||
#endif
|
||||
!
|
||||
sigmaewa(1,1) = sigmaewa(1,1) + sigma11
|
||||
sigmaewa(2,1) = sigmaewa(2,1) + sigma21
|
||||
sigmaewa(2,2) = sigmaewa(2,2) + sigma22
|
||||
sigmaewa(3,1) = sigmaewa(3,1) + sigma31
|
||||
sigmaewa(3,2) = sigmaewa(3,2) + sigma32
|
||||
sigmaewa(3,3) = sigmaewa(3,3) + sigma33
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmaewa(l,l) = sigmaewa(l,l) + sdewald
|
||||
ENDDO
|
||||
!
|
||||
! R-space sum here (see ewald.f90 for details on parallelization)
|
||||
! ... R-space sum here (see ewald.f90 for details on parallelization)
|
||||
!
|
||||
CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
|
||||
!
|
||||
IF ( mykey == 0 ) THEN
|
||||
rmax = 4.0d0 / SQRT(alpha) / alat
|
||||
!
|
||||
! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1
|
||||
! ... with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1
|
||||
!
|
||||
!$omp parallel do default(none) shared(na_s, na_e, nat, tau, rmax, at, bg, alat, ityp, alpha, omega, zv)&
|
||||
!$omp &private(nb, dtau, r, r2, nrm, nr, rr, fac, l, m)&
|
||||
!$omp &reduction(+:sigmaewa)
|
||||
!$omp parallel do default(none) shared(na_s,na_e,nat,tau,rmax,at,bg,alat,ityp,alpha,omega,zv)&
|
||||
!$omp &private(nb,dtau,r,r2,nrm,nr,rr,fac,l,m) reduction(+:sigmaewa)
|
||||
DO na = na_s, na_e
|
||||
DO nb = 1, nat
|
||||
dtau(:) = tau(:,na) - tau(:,nb)
|
||||
!
|
||||
! generates nearest-neighbors shells r(i)=R(i)-dtau(i)
|
||||
! ... generates nearest-neighbors shells r(i)=R(i)-dtau(i)
|
||||
!
|
||||
CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm )
|
||||
!
|
||||
|
|
|
@ -1,246 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2009 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg, tau, &
|
||||
omega, g_d, gg_d, ngm, gstart, gamma_only, &
|
||||
gcutm, sigmaewa )
|
||||
!---------------------------------------------------------------------
|
||||
!! Ewald contribution. Both real- and reciprocal-space terms are
|
||||
!! present.
|
||||
!
|
||||
USE kinds
|
||||
USE constants, ONLY : tpi, e2, eps6
|
||||
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
|
||||
USE mp, ONLY : mp_sum
|
||||
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_sigmaewa_gpu
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: nat
|
||||
!! input: number of atoms in the unit cell
|
||||
INTEGER :: ntyp
|
||||
!! input: number of different types of atoms
|
||||
INTEGER :: ityp(nat)
|
||||
!! input: the type of each atom
|
||||
INTEGER :: ngm
|
||||
!! input: number of plane waves for G sum
|
||||
INTEGER :: gstart
|
||||
!! input: first nonzero g vector
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
!! gamma point only
|
||||
REAL(DP), INTENT(IN) :: tau(3,nat)
|
||||
!! input: the positions of the atoms in the cell
|
||||
REAL(DP), INTENT(IN) :: g_d(3,ngm)
|
||||
!! input: the coordinates of G vectors
|
||||
REAL(DP), INTENT(IN) :: gg_d(ngm)
|
||||
!! input: the square moduli of G vectors
|
||||
REAL(DP), INTENT(IN) :: zv(ntyp)
|
||||
!! input: the charge of each type of atoms
|
||||
REAL(DP), INTENT(IN) :: at(3,3)
|
||||
!! input: the direct lattice vectors
|
||||
REAL(DP), INTENT(IN) :: bg(3,3)
|
||||
!! input: the reciprocal lattice vectors
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! input: the volume of the unit cell
|
||||
REAL(DP), INTENT(IN) :: alat
|
||||
!! input: measure of length
|
||||
REAL(DP), INTENT(IN) :: gcutm
|
||||
!! input: cut-off of g vectors
|
||||
REAL(DP), INTENT(OUT) :: sigmaewa(3,3)
|
||||
! output: the ewald stress
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER, PARAMETER :: mxr = 50
|
||||
! the maximum number of R vectors included in r sum
|
||||
INTEGER :: ng, nr, na, nb, l, m, nrm
|
||||
! counter over reciprocal G vectors
|
||||
! counter over direct vectors
|
||||
! counter on atoms
|
||||
! counter on atoms
|
||||
! counter on atoms
|
||||
! number of R vectors included in r sum
|
||||
INTEGER :: na_s, na_e, mykey
|
||||
!
|
||||
REAL(DP) :: charge, arg, tpiba2, dtau(3), alpha, r(3,mxr), &
|
||||
r2(mxr), rmax, rr, upperbound, fact, fac, g2, g2a, &
|
||||
sdewald, sewald
|
||||
! total ionic charge in the cell
|
||||
! the argument of the phase
|
||||
! length in reciprocal space
|
||||
! the difference tau_s - tau_s'
|
||||
! alpha term in ewald sum
|
||||
! input of the rgen routine ( not used here )
|
||||
! the square modulus of R_j-tau_s-tau_s'
|
||||
! the maximum radius to consider real space sum
|
||||
! buffer variable
|
||||
! used to optimize alpha
|
||||
! auxiliary variables
|
||||
! diagonal term
|
||||
! nondiagonal term
|
||||
!
|
||||
INTEGER :: ierr(2)
|
||||
REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
|
||||
COMPLEX(DP) :: rhostar
|
||||
!
|
||||
INTEGER, ALLOCATABLE :: ityp_d(:)
|
||||
REAL(DP), ALLOCATABLE :: zv_d(:), tau_d(:,:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: g_d, gg_d, zv_d, ityp_d, tau_d
|
||||
#endif
|
||||
!
|
||||
tpiba2 = (tpi / alat)**2
|
||||
sigmaewa(:,:) = 0._DP
|
||||
charge = 0._DP
|
||||
!
|
||||
ALLOCATE( zv_d(ntyp), tau_d(3,nat) )
|
||||
zv_d = zv
|
||||
tau_d = tau
|
||||
ALLOCATE( ityp_d(nat) )
|
||||
ityp_d = ityp
|
||||
!
|
||||
DO na = 1, nat
|
||||
charge = charge + zv(ityp(na))
|
||||
ENDDO
|
||||
!
|
||||
! choose alpha in order to have convergence in the sum over G
|
||||
! upperbound is a safe upper bound for the error ON THE ENERGY
|
||||
!
|
||||
alpha = 2.9_DP
|
||||
12 alpha = alpha - 0.1_DP
|
||||
!
|
||||
IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
|
||||
upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * &
|
||||
erfc ( SQRT(tpiba2 * gcutm / 4._DP / alpha) )
|
||||
!
|
||||
IF (upperbound > 1d-7) GOTO 12
|
||||
!
|
||||
! G-space sum here
|
||||
!
|
||||
! Determine if this processor contains G=0 and set the constant term
|
||||
!
|
||||
IF (gstart == 2) THEN
|
||||
sdewald = tpi * e2 / 4._DP / alpha * (charge / omega)**2
|
||||
ELSE
|
||||
sdewald = 0._DP
|
||||
ENDIF
|
||||
!
|
||||
! sdewald is the diagonal term
|
||||
IF ( gamma_only ) THEN
|
||||
fact = 2._DP
|
||||
ELSE
|
||||
fact = 1._DP
|
||||
ENDIF
|
||||
!
|
||||
IF ( do_cutoff_2D ) THEN
|
||||
!
|
||||
CALL cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
|
||||
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ng = gstart, ngm
|
||||
g2 = gg_d(ng) * tpiba2
|
||||
g2a = g2 / 4._DP / alpha
|
||||
rhostar = (0._DP,0._DP)
|
||||
DO na = 1, nat
|
||||
arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + &
|
||||
g_d(3,ng) * tau_d(3,na) ) * tpi
|
||||
rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP)
|
||||
ENDDO
|
||||
rhostar = rhostar / CMPLX(omega)
|
||||
sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2
|
||||
sdewald = sdewald - sewald
|
||||
!
|
||||
sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(1,ng) * g_d(1,ng) / g2 * (g2a + 1)
|
||||
sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(2,ng) * g_d(1,ng) / g2 * (g2a + 1)
|
||||
sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(2,ng) * g_d(2,ng) / g2 * (g2a + 1)
|
||||
sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(3,ng) * g_d(1,ng) / g2 * (g2a + 1)
|
||||
sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(3,ng) * g_d(2,ng) / g2 * (g2a + 1)
|
||||
sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
|
||||
g_d(3,ng) * g_d(3,ng) / g2 * (g2a + 1)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
sigmaewa(1,1) = sigmaewa(1,1) + sigma11
|
||||
sigmaewa(2,1) = sigmaewa(2,1) + sigma21
|
||||
sigmaewa(2,2) = sigmaewa(2,2) + sigma22
|
||||
sigmaewa(3,1) = sigmaewa(3,1) + sigma31
|
||||
sigmaewa(3,2) = sigmaewa(3,2) + sigma32
|
||||
sigmaewa(3,3) = sigmaewa(3,3) + sigma33
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmaewa(l,l) = sigmaewa(l,l) + sdewald
|
||||
ENDDO
|
||||
!
|
||||
! R-space sum here (see ewald.f90 for details on parallelization)
|
||||
!
|
||||
CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
|
||||
!
|
||||
IF ( mykey == 0 ) THEN
|
||||
rmax = 4.0d0 / SQRT(alpha) / alat
|
||||
!
|
||||
! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1
|
||||
!
|
||||
DO na = na_s, na_e
|
||||
DO nb = 1, nat
|
||||
dtau(:) = tau(:,na) - tau(:,nb)
|
||||
!
|
||||
! generates nearest-neighbors shells r(i)=R(i)-dtau(i)
|
||||
!
|
||||
CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm )
|
||||
!
|
||||
DO nr = 1, nrm
|
||||
rr = SQRT(r2 (nr) ) * alat
|
||||
fac = - e2 / 2.0_DP/ omega * alat**2 * zv(ityp(na)) * &
|
||||
zv(ityp(nb)) / rr**3 * (erfc(SQRT(alpha) * rr) + &
|
||||
rr * SQRT(8.0_DP * alpha / tpi) * EXP( - alpha * rr**2) )
|
||||
DO l = 1, 3
|
||||
DO m = 1, l
|
||||
sigmaewa(l,m) = sigmaewa(l,m) + fac * r(l,nr) * r(m,nr)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l - 1
|
||||
sigmaewa(m,l) = sigmaewa(l,m)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, 3
|
||||
sigmaewa(l,m) = - sigmaewa(l,m)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( zv_d, tau_d )
|
||||
DEALLOCATE( ityp_d )
|
||||
!
|
||||
CALL mp_sum( sigmaewa, intra_bgrp_comm )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_ewa_gpu
|
||||
|
|
@ -7,8 +7,8 @@
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
|
||||
dfft, g, alat, omega, sigmaxc )
|
||||
SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
|
||||
dfft, g, alat, omega, sigmaxc, kedtau )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
|
@ -24,9 +24,10 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
|
|||
TYPE(fft_type_descriptor), INTENT(IN):: dfft
|
||||
INTEGER, INTENT(IN) :: nspin
|
||||
REAL(DP), INTENT(IN) :: rho(dfft%nnr,nspin), rho_core(dfft%nnr)
|
||||
REAL(DP), INTENT(IN) :: g(3,dfft%ngm), alat, omega
|
||||
REAL(DP), INTENT(INOUT) :: kedtau(dfft%nnr, nspin) ! FIXME: should be INTENT(IN)
|
||||
COMPLEX(DP), INTENT(IN) :: rhog(dfft%ngm, nspin)
|
||||
REAL(DP), INTENT(IN) :: g(3,dfft%ngm)
|
||||
REAL(DP), INTENT(IN) :: alat, omega
|
||||
REAL(DP), INTENT(IN), OPTIONAL :: kedtau(dfft%nnr,nspin)
|
||||
COMPLEX(DP), INTENT(IN) :: rhog(dfft%ngm,nspin)
|
||||
COMPLEX(DP), INTENT(IN) :: rhog_core(dfft%ngm)
|
||||
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
|
||||
!
|
||||
|
@ -34,30 +35,36 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
|
|||
!
|
||||
INTEGER :: k, l, m, ipol, ir, ig, is, nspin0, np
|
||||
INTEGER :: nr1, nr2, nr3, nrxx, ngm
|
||||
REAL(DP), ALLOCATABLE :: grho(:,:,:), grho2(:,:), rhoaux(:,:), segni(:)
|
||||
REAL(DP), ALLOCATABLE :: grho(:,:,:), grho2(:,:), rhoaux(:,:), &
|
||||
segni(:), kedtaue2(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: rhogaux(:,:)
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: sx(:), sc(:)
|
||||
REAL(DP), ALLOCATABLE :: v1x(:,:), v2x(:,:), v3x(:,:)
|
||||
REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:,:), v3c(:,:), v2c_ud(:)
|
||||
REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:), v3c(:,:), v2c_ud(:), v2cm(:,:,:)
|
||||
!
|
||||
REAL(DP), PARAMETER :: epsr = 1.0d-6, epsg = 1.0d-10, e2 = 2.d0
|
||||
REAL(DP) :: sigma_gradcorr(3, 3)
|
||||
!
|
||||
REAL(DP) :: v2xc, v2xc_uu, v2xc_dd
|
||||
REAL(DP) :: sigma_gc11, sigma_gc31, sigma_gc21, &
|
||||
sigma_gc32, sigma_gc22, sigma_gc33
|
||||
REAL(DP) :: sigma_gradcorr(3,3)
|
||||
!
|
||||
IF ( .NOT. xclib_dft_is('gradient') .AND. .NOT. xclib_dft_is('meta') ) RETURN
|
||||
!
|
||||
IF ( xclib_dft_is('meta') .and. nspin>1 ) CALL errore('stres_gradcorr', &
|
||||
'Meta-GGA stress does not work with spin polarization',1)
|
||||
IF ( xclib_dft_is('meta') .AND. nspin>1 ) CALL errore( 'stres_gradcorr', &
|
||||
'Meta-GGA stress does not work with spin polarization', 1 )
|
||||
!
|
||||
!$acc data present_or_copyin( rho, rho_core, rhog, rhog_core, g )
|
||||
!
|
||||
np = 1
|
||||
IF ( nspin==2 .AND. xclib_dft_is('meta') ) np=3
|
||||
IF ( nspin==2 .AND. xclib_dft_is('meta') ) np = 3
|
||||
!
|
||||
nspin0 = nspin
|
||||
IF (nspin==4) nspin0 = 1
|
||||
IF (nspin==4.and.domag) nspin0 = 2
|
||||
IF (nspin==4.AND.domag) nspin0 = 2
|
||||
!
|
||||
sigma_gradcorr(:,:) = 0.0_DP
|
||||
sigma_gc11=0.0_DP ; sigma_gc21=0.0_DP ; sigma_gc22=0.0_DP
|
||||
sigma_gc31=0.0_DP ; sigma_gc32=0.0_DP ; sigma_gc33=0.0_DP
|
||||
!
|
||||
nr1 = dfft%nr1
|
||||
nr2 = dfft%nr2
|
||||
|
@ -66,44 +73,71 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
|
|||
ngm = dfft%ngm
|
||||
!
|
||||
ALLOCATE( grho(3,nrxx,nspin0) )
|
||||
ALLOCATE (rhoaux(nrxx,nspin0))
|
||||
ALLOCATE( rhoaux(nrxx,nspin0) )
|
||||
ALLOCATE( rhogaux(ngm,nspin0) )
|
||||
IF (xclib_dft_is('meta')) ALLOCATE( kedtaue2(dfft%nnr,nspin) )
|
||||
!$acc data create( grho, rhoaux )
|
||||
!$acc data create( rhogaux )
|
||||
!
|
||||
! calculate the gradient of rho+rhocore in real space
|
||||
! For convenience rhoaux is in (up,down) format
|
||||
!
|
||||
IF ( nspin0 == 1 ) THEN
|
||||
!
|
||||
rhoaux(:,1) = rho(:,1) + rho_core(:)
|
||||
rhogaux(:,1) = rhog(:,1) + rhog_core(:)
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
rhoaux(k,1) = rho(k,1) + rho_core(k)
|
||||
ENDDO
|
||||
!$acc parallel loop
|
||||
DO k = 1, ngm
|
||||
rhogaux(k,1) = rhog(k,1) + rhog_core(k)
|
||||
ENDDO
|
||||
!
|
||||
ELSE IF ( nspin0 == 2 ) THEN
|
||||
ELSEIF ( nspin0 == 2 ) THEN
|
||||
!
|
||||
IF ( nspin == 4 .AND. domag ) THEN
|
||||
!
|
||||
ALLOCATE( segni( nrxx ) )
|
||||
!
|
||||
!$acc data copyout( segni )
|
||||
CALL compute_rho( rho, rhoaux, segni, nrxx )
|
||||
DEALLOCATE( segni )
|
||||
rhoaux(:,1) = rhoaux(:,1) + rho_core(:) / 2.0_DP
|
||||
rhoaux(:,2) = rhoaux(:,2) + rho_core(:) / 2.0_DP
|
||||
CALL rho_r2g ( dfft, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
rhoaux(k,1) = rhoaux(k,1) + rho_core(k) / 2.0_DP
|
||||
rhoaux(k,2) = rhoaux(k,2) + rho_core(k) / 2.0_DP
|
||||
ENDDO
|
||||
CALL rho_r2g( dfft, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
|
||||
!$acc end data
|
||||
ELSE
|
||||
rhoaux(:,1) = ( rho(:,1) + rho(:,2) + rho_core(:) ) / 2.0_DP
|
||||
rhoaux(:,2) = ( rho(:,1) - rho(:,2) + rho_core(:) ) / 2.0_DP
|
||||
rhogaux(:,1) = ( rhog(:,1) + rhog(:,2) + rhog_core(:) ) / 2.0_DP
|
||||
rhogaux(:,2) = ( rhog(:,1) - rhog(:,2) + rhog_core(:) ) / 2.0_DP
|
||||
END IF
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
rhoaux(k,1) = ( rho(k,1) + rho(k,2) + rho_core(k) ) / 2.0_DP
|
||||
rhoaux(k,2) = ( rho(k,1) - rho(k,2) + rho_core(k) ) / 2.0_DP
|
||||
ENDDO
|
||||
!$acc parallel loop
|
||||
DO k = 1, ngm
|
||||
rhogaux(k,1) = ( rhog(k,1) + rhog(k,2) + rhog_core(k) ) / 2.0_DP
|
||||
rhogaux(k,2) = ( rhog(k,1) - rhog(k,2) + rhog_core(k) ) / 2.0_DP
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
DO is = 1, nspin0
|
||||
CALL fft_gradient_g2r( dfft, rhogaux(1,is), g, grho(1,1,is) )
|
||||
CALL fft_gradient_g2r( dfft, rhogaux(:,is), g, grho(:,:,is) )
|
||||
ENDDO
|
||||
DEALLOCATE (rhogaux)
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( rhogaux )
|
||||
!
|
||||
ALLOCATE( grho2(nrxx,nspin0) )
|
||||
ALLOCATE( v1x(nrxx,nspin0), v2x(nrxx,nspin0), v3x(nrxx,nspin0) )
|
||||
ALLOCATE( v1c(nrxx,nspin0), v2c(np,nrxx,nspin0), v3c(nrxx,nspin0) )
|
||||
ALLOCATE( v1x(nrxx,nspin0), v2x(nrxx,nspin0) )
|
||||
ALLOCATE( v1c(nrxx,nspin0), v2c(nrxx,nspin0) )
|
||||
ALLOCATE( sx(nrxx), sc(nrxx) )
|
||||
!
|
||||
IF ( xclib_dft_is('meta') ) &
|
||||
ALLOCATE( v2cm(np,nrxx,nspin0), v3x(nrxx,nspin0), v3c(nrxx,nspin0) )
|
||||
!$acc data create( grho2, sx, sc, v1x, v2x, v1c, v2c )
|
||||
!
|
||||
IF (nspin0==1) THEN
|
||||
!
|
||||
! Spin-unpolarized case
|
||||
|
@ -113,73 +147,128 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
|
|||
!
|
||||
! routine computing v1x_v and v2x_v is different for GGA and meta-GGA
|
||||
!
|
||||
grho2(:,1) = grho(1,:,1)**2 + grho(2,:,1)**2 + grho(3,:,1)**2
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
|
||||
ENDDO
|
||||
!
|
||||
IF ( xclib_dft_is('meta') .AND. xclib_get_id('MGGA','EXCH') /= 4 ) THEN
|
||||
kedtau(:,1) = kedtau(:,1) / e2
|
||||
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtau, sx, sc, &
|
||||
v1x, v2x, v3x, v1c, v2c, v3c )
|
||||
kedtau(:,1) = kedtau(:,1) * e2
|
||||
!$acc data present_or_copyin(kedtau) create( kedtaue2, v2cm, v3x, v3c )
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
kedtaue2(k,1) = kedtau(k,1) / e2
|
||||
ENDDO
|
||||
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtaue2, sx, sc, &
|
||||
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
v2c(k,1) = v2cm(1,k,1)
|
||||
ENDDO
|
||||
!$acc end data
|
||||
ELSE
|
||||
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c(1,:,:) )
|
||||
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, gpu_args_=.TRUE. )
|
||||
ENDIF
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l
|
||||
sigma_gradcorr(l,m) = sigma_gradcorr(l,m) + SUM( grho(l,:,1)*grho(m,:,1)* &
|
||||
e2 * (v2x(:,1) + v2c(1,:,1)) )
|
||||
ENDDO
|
||||
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
|
||||
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
|
||||
DO k = 1, nrxx
|
||||
v2xc = e2 * (v2x(k,1) + v2c(k,1))
|
||||
sigma_gc11 = sigma_gc11 + grho(1,k,1)*grho(1,k,1) * v2xc
|
||||
sigma_gc21 = sigma_gc21 + grho(2,k,1)*grho(1,k,1) * v2xc
|
||||
sigma_gc22 = sigma_gc22 + grho(2,k,1)*grho(2,k,1) * v2xc
|
||||
sigma_gc31 = sigma_gc31 + grho(3,k,1)*grho(1,k,1) * v2xc
|
||||
sigma_gc32 = sigma_gc32 + grho(3,k,1)*grho(2,k,1) * v2xc
|
||||
sigma_gc33 = sigma_gc33 + grho(3,k,1)*grho(3,k,1) * v2xc
|
||||
ENDDO
|
||||
!
|
||||
ELSEIF (nspin0 == 2) THEN
|
||||
!
|
||||
! Spin-polarized case
|
||||
!
|
||||
grho2(:,:) = grho(1,:,:)**2 + grho(2,:,:)**2 + grho(3,:,:)**2
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
|
||||
grho2(k,2) = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2
|
||||
ENDDO
|
||||
!
|
||||
IF ( xclib_dft_is('meta') ) THEN
|
||||
!
|
||||
kedtau(:,1:nspin0) = kedtau(:,1:nspin0) / e2
|
||||
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtau, sx, sc, &
|
||||
v1x, v2x, v3x, v1c, v2c, v3c )
|
||||
kedtau(:,1:nspin0) = kedtau(:,1:nspin0) * e2
|
||||
!$acc data present(kedtau) create(kedtaue2, v2cm, v3x, v3c)
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
kedtaue2(k,1:nspin0) = kedtau(k,1:nspin0) / e2
|
||||
ENDDO
|
||||
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtaue2, sx, sc, &
|
||||
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
v2c(k,:) = v2cm(1,k,:)
|
||||
ENDDO
|
||||
!$acc end data
|
||||
! FIXME : what are we supposed to do now?
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
ALLOCATE( v2c_ud(nrxx) )
|
||||
!$acc data create( v2c_ud )
|
||||
!
|
||||
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c(1,:,:), v2c_ud )
|
||||
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=.TRUE. )
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l
|
||||
!
|
||||
! ... exchange
|
||||
sigma_gradcorr(l,m) = &
|
||||
SUM( grho(l,:,1) * grho(m,:,1) * e2 * v2x(:,1) + &
|
||||
grho(l,:,2) * grho(m,:,2) * e2 * v2x(:,2) )
|
||||
!
|
||||
! ... correlation
|
||||
sigma_gradcorr(l,m) = sigma_gradcorr(l,m) + &
|
||||
SUM( grho(l,:,1) * grho(m,:,1) * v2c(1,:,1) + &
|
||||
grho(l,:,2) * grho(m,:,2) * v2c(1,:,2) + &
|
||||
(grho(l,:,1) * grho(m,:,2) + &
|
||||
grho(l,:,2) * grho(m,:,1)) * v2c_ud(:) ) * e2
|
||||
ENDDO
|
||||
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
|
||||
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
|
||||
DO k = 1, nrxx
|
||||
!
|
||||
v2xc_uu = e2 * (v2x(k,1)+v2c(k,1))
|
||||
v2xc_dd = e2 * (v2x(k,2)+v2c(k,2))
|
||||
!
|
||||
sigma_gc11 = sigma_gc11 + grho(1,k,1)*grho(1,k,1) * v2xc_uu + &
|
||||
grho(1,k,2)*grho(1,k,2) * v2xc_dd + &
|
||||
(grho(1,k,1)*grho(1,k,2) + &
|
||||
grho(1,k,2)*grho(1,k,1)) * v2c_ud(k) * e2
|
||||
sigma_gc21 = sigma_gc21 + grho(2,k,1)*grho(1,k,1) * v2xc_uu + &
|
||||
grho(2,k,2)*grho(1,k,2) * v2xc_dd + &
|
||||
(grho(2,k,1)*grho(1,k,2) + &
|
||||
grho(1,k,2)*grho(2,k,1)) * v2c_ud(k) * e2
|
||||
sigma_gc22 = sigma_gc22 + grho(2,k,1)*grho(2,k,1) * v2xc_uu + &
|
||||
grho(2,k,2)*grho(2,k,2) * v2xc_dd + &
|
||||
(grho(2,k,1)*grho(2,k,2) + &
|
||||
grho(2,k,2)*grho(2,k,1)) * v2c_ud(k) * e2
|
||||
sigma_gc31 = sigma_gc31 + grho(3,k,1)*grho(1,k,1) * v2xc_uu + &
|
||||
grho(3,k,2)*grho(1,k,2) * v2xc_dd + &
|
||||
(grho(3,k,1)*grho(1,k,2) + &
|
||||
grho(1,k,2)*grho(3,k,1)) * v2c_ud(k) * e2
|
||||
sigma_gc32 = sigma_gc32 + grho(3,k,1)*grho(2,k,1) * v2xc_uu + &
|
||||
grho(3,k,2)*grho(2,k,2) * v2xc_dd + &
|
||||
(grho(3,k,1)*grho(2,k,2) + &
|
||||
grho(2,k,2)*grho(3,k,1)) * v2c_ud(k) * e2
|
||||
sigma_gc33 = sigma_gc33 + grho(3,k,1)*grho(3,k,1) * v2xc_uu + &
|
||||
grho(3,k,2)*grho(3,k,2) * v2xc_dd + &
|
||||
(grho(3,k,1)*grho(3,k,2) + &
|
||||
grho(3,k,2)*grho(3,k,1)) * v2c_ud(k) * e2
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( v2c_ud )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
sigma_gradcorr(1,1) = sigma_gc11
|
||||
sigma_gradcorr(2,1) = sigma_gc21
|
||||
sigma_gradcorr(2,2) = sigma_gc22
|
||||
sigma_gradcorr(3,1) = sigma_gc31
|
||||
sigma_gradcorr(3,2) = sigma_gc32
|
||||
sigma_gradcorr(3,3) = sigma_gc33
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE( sc, sx )
|
||||
DEALLOCATE( v1c, v2c, v3c )
|
||||
DEALLOCATE( v1x, v2x, v3x )
|
||||
DEALLOCATE( grho2 )
|
||||
DEALLOCATE( v1c, v2c )
|
||||
DEALLOCATE( v1x, v2x )
|
||||
DEALLOCATE( grho, grho2 )
|
||||
DEALLOCATE( rhoaux )
|
||||
DEALLOCATE( grho )
|
||||
IF (xclib_dft_is('meta')) DEALLOCATE( kedtaue2, v2cm, v3x, v3c )
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l - 1
|
||||
|
@ -191,6 +280,8 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
|
|||
!
|
||||
sigmaxc(:,:) = sigmaxc(:,:) + sigma_gradcorr(:,:) / (nr1 * nr2 * nr3)
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_gradcorr
|
||||
|
|
|
@ -11,18 +11,18 @@ SUBROUTINE stres_har( sigmahar )
|
|||
!--------------------------------------------------------------------
|
||||
!! Calculates the Hartree contribution to the stress
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: e2, fpi
|
||||
USE cell_base, ONLY: omega, tpiba2
|
||||
USE ener, ONLY: ehart
|
||||
USE fft_base, ONLY: dfftp
|
||||
USE fft_rho, ONLY: rho_r2g
|
||||
USE gvect, ONLY: ngm, gstart, g, gg
|
||||
USE scf, ONLY: rho
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
USE mp, ONLY: mp_sum
|
||||
USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: e2, fpi
|
||||
USE cell_base, ONLY: omega, tpiba2
|
||||
USE ener, ONLY: ehart
|
||||
USE fft_base, ONLY: dfftp
|
||||
USE fft_rho, ONLY: rho_r2g
|
||||
USE gvect, ONLY: ngm, gstart, g, gg
|
||||
USE scf, ONLY: rho
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
USE mp, ONLY: mp_sum
|
||||
USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -31,58 +31,86 @@ SUBROUTINE stres_har( sigmahar )
|
|||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ig, l, m
|
||||
REAL(DP) :: shart, g2
|
||||
REAL(DP), PARAMETER :: eps = 1.d-8
|
||||
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
|
||||
!
|
||||
sigmahar(:,:) = 0.0_DP
|
||||
REAL(DP), PARAMETER :: eps = 1.E-8_DP
|
||||
INTEGER :: ig, l, m
|
||||
REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, &
|
||||
sigmahar32, sigmahar22, sigmahar33
|
||||
!
|
||||
ALLOCATE( rhog(dfftp%nnr,1) )
|
||||
!$acc data create(rhog)
|
||||
!
|
||||
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
|
||||
!
|
||||
! rhog contains now the charge density in G space
|
||||
! the G=0 component is not computed
|
||||
IF (do_cutoff_2D) THEN
|
||||
CALL cutoff_stres_sigmahar( rhog, sigmahar )
|
||||
ELSE
|
||||
DO ig = gstart, ngm
|
||||
g2 = gg(ig) * tpiba2
|
||||
shart = rhog(ig,1) * CONJG(rhog(ig,1)) / g2
|
||||
DO l = 1, 3
|
||||
DO m = 1, l
|
||||
sigmahar(l,m) = sigmahar(l,m) + shart * tpiba2 * 2 * &
|
||||
g(l,ig) * g(m,ig) / g2
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
! ... the G=0 component is not computed
|
||||
!
|
||||
sigmahar(:,:) = 0.0_DP
|
||||
!
|
||||
IF (do_cutoff_2D) THEN
|
||||
!
|
||||
CALL cutoff_stres_sigmahar( rhog(:,1), sigmahar )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
sigmahar11 = 0._DP ; sigmahar31 = 0._DP
|
||||
sigmahar21 = 0._DP ; sigmahar32 = 0._DP
|
||||
sigmahar22 = 0._DP ; sigmahar33 = 0._DP
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigmahar11,sigmahar21,sigmahar22,&
|
||||
!$acc& sigmahar31,sigmahar32,sigmahar33)
|
||||
DO ig = gstart, ngm
|
||||
!
|
||||
g2 = gg(ig)
|
||||
!
|
||||
shart = DBLE(rhog(ig,1)*CONJG(rhog(ig,1))) / g2
|
||||
!
|
||||
sigmahar11 = sigmahar11 + shart *2._DP * &
|
||||
g(1,ig) * g(1,ig) / g2
|
||||
sigmahar21 = sigmahar21 + shart *2._DP * &
|
||||
g(2,ig) * g(1,ig) / g2
|
||||
sigmahar22 = sigmahar22 + shart *2._DP * &
|
||||
g(2,ig) * g(2,ig) / g2
|
||||
sigmahar31 = sigmahar31 + shart *2._DP * &
|
||||
g(3,ig) * g(1,ig) / g2
|
||||
sigmahar32 = sigmahar32 + shart *2._DP * &
|
||||
g(3,ig) * g(2,ig) / g2
|
||||
sigmahar33 = sigmahar33 + shart *2._DP * &
|
||||
g(3,ig) * g(3,ig) / g2
|
||||
ENDDO
|
||||
!
|
||||
sigmahar(1,1) = sigmahar(1,1) + sigmahar11 / tpiba2
|
||||
sigmahar(2,1) = sigmahar(2,1) + sigmahar21 / tpiba2
|
||||
sigmahar(2,2) = sigmahar(2,2) + sigmahar22 / tpiba2
|
||||
sigmahar(3,1) = sigmahar(3,1) + sigmahar31 / tpiba2
|
||||
sigmahar(3,2) = sigmahar(3,2) + sigmahar32 / tpiba2
|
||||
sigmahar(3,3) = sigmahar(3,3) + sigmahar33 / tpiba2
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( rhog )
|
||||
!
|
||||
CALL mp_sum( sigmahar, intra_bgrp_comm )
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
sigmahar(:,:) = fpi * e2 * sigmahar(:,:)
|
||||
sigmahar(:,:) = fpi * e2 * sigmahar(:,:)
|
||||
ELSE
|
||||
sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP
|
||||
sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP
|
||||
ENDIF
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmahar(l,l) = sigmahar(l,l) - ehart / omega
|
||||
sigmahar(l,l) = sigmahar(l,l) - ehart / omega
|
||||
ENDDO
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l-1
|
||||
sigmahar(m,l) = sigmahar(l,m)
|
||||
ENDDO
|
||||
DO m = 1, l-1
|
||||
sigmahar(m,l) = sigmahar(l,m)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
sigmahar(:,:) = -sigmahar(:,:)
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_har
|
||||
|
||||
|
|
|
@ -1,122 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE stres_har_gpu( sigmahar )
|
||||
!--------------------------------------------------------------------
|
||||
!! Calculates the Hartree contribution to the stress
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: e2, fpi
|
||||
USE cell_base, ONLY: omega, tpiba2
|
||||
USE ener, ONLY: ehart
|
||||
USE fft_base, ONLY: dfftp
|
||||
USE fft_rho, ONLY: rho_r2g
|
||||
USE gvect, ONLY: ngm, gstart
|
||||
USE scf, ONLY: rho
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
USE mp, ONLY: mp_sum
|
||||
USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar_gpu
|
||||
!
|
||||
USE gvect, ONLY: g_d, gg_d
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP) :: sigmahar(3,3)
|
||||
!! Hartree term of the stress tensor
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP) :: shart, g2
|
||||
REAL(DP), PARAMETER :: eps = 1.E-8_DP
|
||||
INTEGER :: ig, l, m
|
||||
!
|
||||
REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, &
|
||||
sigmahar32, sigmahar22, sigmahar33
|
||||
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
|
||||
!
|
||||
sigmahar(:,:) = 0.0_DP
|
||||
!
|
||||
ALLOCATE( rhog(dfftp%nnr,1) )
|
||||
!$acc data create(rhog)
|
||||
!
|
||||
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
|
||||
!
|
||||
! rhog contains now the charge density in G space
|
||||
! the G=0 component is not computed
|
||||
!
|
||||
IF (do_cutoff_2D) THEN
|
||||
!
|
||||
CALL cutoff_stres_sigmahar_gpu( rhog, sigmahar )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
sigmahar11 = 0._DP ; sigmahar31 = 0._DP
|
||||
sigmahar21 = 0._DP ; sigmahar32 = 0._DP
|
||||
sigmahar22 = 0._DP ; sigmahar33 = 0._DP
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigmahar11,sigmahar21,sigmahar22, &
|
||||
!$acc sigmahar31,sigmahar32,sigmahar33)
|
||||
DO ig = gstart, ngm
|
||||
!
|
||||
g2 = gg_d(ig)
|
||||
!
|
||||
shart = DBLE(rhog(ig,1)*CONJG(rhog(ig,1))) / g2
|
||||
!
|
||||
sigmahar11 = sigmahar11 + shart *2._DP * &
|
||||
g_d(1,ig) * g_d(1,ig) / g2
|
||||
sigmahar21 = sigmahar21 + shart *2._DP * &
|
||||
g_d(2,ig) * g_d(1,ig) / g2
|
||||
sigmahar22 = sigmahar22 + shart *2._DP * &
|
||||
g_d(2,ig) * g_d(2,ig) / g2
|
||||
sigmahar31 = sigmahar31 + shart *2._DP * &
|
||||
g_d(3,ig) * g_d(1,ig) / g2
|
||||
sigmahar32 = sigmahar32 + shart *2._DP * &
|
||||
g_d(3,ig) * g_d(2,ig) / g2
|
||||
sigmahar33 = sigmahar33 + shart *2._DP * &
|
||||
g_d(3,ig) * g_d(3,ig) / g2
|
||||
ENDDO
|
||||
!
|
||||
sigmahar(1,1) = sigmahar(1,1) + sigmahar11 / tpiba2
|
||||
sigmahar(2,1) = sigmahar(2,1) + sigmahar21 / tpiba2
|
||||
sigmahar(2,2) = sigmahar(2,2) + sigmahar22 / tpiba2
|
||||
sigmahar(3,1) = sigmahar(3,1) + sigmahar31 / tpiba2
|
||||
sigmahar(3,2) = sigmahar(3,2) + sigmahar32 / tpiba2
|
||||
sigmahar(3,3) = sigmahar(3,3) + sigmahar33 / tpiba2
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( rhog )
|
||||
!
|
||||
CALL mp_sum( sigmahar, intra_bgrp_comm )
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
sigmahar(:,:) = fpi * e2 * sigmahar(:,:)
|
||||
ELSE
|
||||
sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP
|
||||
ENDIF
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmahar(l,l) = sigmahar(l,l) - ehart / omega
|
||||
ENDDO
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l-1
|
||||
sigmahar(m,l) = sigmahar(l,m)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
sigmahar(:,:) = -sigmahar(:,:)
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_har_gpu
|
||||
|
|
@ -9,11 +9,11 @@
|
|||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE stres_knl( sigmanlc, sigmakin )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Computes the kinetic + nonlocal contribuition to the stress
|
||||
!! Computes the kinetic + nonlocal contribuition to the stress.
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: pi, e2
|
||||
USE cell_base, ONLY: omega, alat, at, bg, tpiba
|
||||
USE cell_base, ONLY: omega, tpiba
|
||||
USE gvect, ONLY: g
|
||||
USE gvecw, ONLY: qcutz, ecfixed, q2sigma
|
||||
USE klist, ONLY: nks, xk, ngk, igk_k
|
||||
|
@ -27,76 +27,114 @@ SUBROUTINE stres_knl( sigmanlc, sigmakin )
|
|||
USE mp_pools, ONLY: inter_pool_comm
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
USE mp, ONLY: mp_sum
|
||||
USE wavefunctions_gpum, ONLY: using_evc
|
||||
#if defined(__CUDA)
|
||||
USE wavefunctions_gpum, ONLY: using_evc, using_evc_d
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP) :: sigmanlc (3, 3)
|
||||
REAL(DP) :: sigmanlc(3,3)
|
||||
!! non-local contribution to stress
|
||||
REAL(DP) :: sigmakin (3, 3)
|
||||
REAL(DP) :: sigmakin(3,3)
|
||||
!! kinetic contribution to stress
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: gk (:,:), kfac (:)
|
||||
REAL(DP) :: twobysqrtpi, gk2, arg
|
||||
INTEGER :: npw, ik, l, m, i, ibnd, is
|
||||
REAL(DP), ALLOCATABLE :: gk(:,:), kfac(:)
|
||||
REAL(DP) :: twobysqrtpi, gk2, arg, s11, s21, s31, s22, s32, s33, &
|
||||
xk1, xk2, xk3, tmpf, wg_nk
|
||||
INTEGER :: npw, ik, l, m, i, ibnd
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL using_evc(0)
|
||||
ALLOCATE( gk(npwx,3) )
|
||||
ALLOCATE( kfac(npwx) )
|
||||
CALL using_evc_d(0)
|
||||
#endif
|
||||
!
|
||||
sigmanlc(:,:) = 0.d0
|
||||
sigmakin(:,:) = 0.d0
|
||||
twobysqrtpi = 2.d0/SQRT(pi)
|
||||
!$acc enter data create( evc )
|
||||
!
|
||||
kfac(:) = 1.d0
|
||||
ALLOCATE( gk(npwx,3), kfac(npwx) )
|
||||
!$acc data create( gk, kfac )
|
||||
!$acc data copyin( wg )
|
||||
!
|
||||
sigmanlc(:,:) = 0._DP
|
||||
sigmakin(:,:) = 0._DP
|
||||
twobysqrtpi = 2._DP/SQRT(pi)
|
||||
!
|
||||
!$acc kernels
|
||||
kfac(:) = 1._DP
|
||||
!$acc end kernels
|
||||
!
|
||||
s11 = 0._DP ; s22 = 0._DP
|
||||
s21 = 0._DP ; s32 = 0._DP
|
||||
s31 = 0._DP ; s33 = 0._DP
|
||||
!
|
||||
DO ik = 1, nks
|
||||
IF ( nks > 1 ) CALL get_buffer( evc, nwordwfc, iunwfc, ik )
|
||||
if ( nks > 1 ) CALL using_evc(2)
|
||||
IF ( nks > 1 ) THEN
|
||||
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
|
||||
#if defined(__CUDA)
|
||||
CALL using_evc(2)
|
||||
CALL using_evc_d(0)
|
||||
#endif
|
||||
ENDIF
|
||||
!
|
||||
npw = ngk(ik)
|
||||
!
|
||||
xk1 = xk(1,ik)
|
||||
xk2 = xk(2,ik)
|
||||
xk3 = xk(3,ik)
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO i = 1, npw
|
||||
gk(i,1) = ( xk(1,ik) + g(1,igk_k(i,ik)) ) * tpiba
|
||||
gk(i,2) = ( xk(2,ik) + g(2,igk_k(i,ik)) ) * tpiba
|
||||
gk(i,3) = ( xk(3,ik) + g(3,igk_k(i,ik)) ) * tpiba
|
||||
IF (qcutz > 0.d0) THEN
|
||||
gk(i,1) = ( xk1 + g(1,igk_k(i,ik)) ) * tpiba
|
||||
gk(i,2) = ( xk2 + g(2,igk_k(i,ik)) ) * tpiba
|
||||
gk(i,3) = ( xk3 + g(3,igk_k(i,ik)) ) * tpiba
|
||||
IF (qcutz > 0._DP) THEN
|
||||
gk2 = gk(i,1)**2 + gk(i,2)**2 + gk(i,3)**2
|
||||
arg = ( (gk2-ecfixed)/q2sigma )**2
|
||||
kfac(i) = 1.d0 + qcutz / q2sigma * twobysqrtpi * EXP(-arg)
|
||||
kfac(i) = 1._DP + qcutz / q2sigma * twobysqrtpi * EXP(-arg)
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
! ... kinetic contribution
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l
|
||||
DO ibnd = 1, nbnd
|
||||
DO i = 1, npw
|
||||
IF (noncolin) THEN
|
||||
sigmakin(l,m) = sigmakin(l,m) + wg(ibnd,ik) * &
|
||||
gk(i,l) * gk(i, m) * kfac(i) * &
|
||||
( DBLE (CONJG(evc( i ,ibnd))*evc( i ,ibnd)) + &
|
||||
DBLE (CONJG(evc(i+npwx,ibnd))*evc(i+npwx,ibnd)))
|
||||
ELSE
|
||||
sigmakin(l,m) = sigmakin(l,m) + wg(ibnd,ik) * &
|
||||
gk(i,l) * gk(i, m) * kfac(i) * &
|
||||
DBLE (CONJG(evc(i, ibnd) ) * evc(i, ibnd) )
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc update device(evc)
|
||||
!
|
||||
!$acc parallel loop collapse(2) reduction(+:s11,s21,s31,s22,s32,s33)
|
||||
DO ibnd = 1, nbnd
|
||||
DO i = 1, npw
|
||||
wg_nk = wg(ibnd,ik)
|
||||
IF (noncolin) THEN
|
||||
tmpf = wg_nk * kfac(i) * &
|
||||
( DBLE(CONJG(evc( i ,ibnd))*evc( i ,ibnd)) + &
|
||||
DBLE(CONJG(evc(i+npwx,ibnd))*evc(i+npwx,ibnd)) )
|
||||
ELSE
|
||||
tmpf = wg_nk * kfac(i) * &
|
||||
DBLE( CONJG(evc(i,ibnd) ) * evc(i,ibnd) )
|
||||
ENDIF
|
||||
s11 = s11 + tmpf * gk(i,1)*gk(i,1)
|
||||
s21 = s21 + tmpf * gk(i,2)*gk(i,1)
|
||||
s31 = s31 + tmpf * gk(i,3)*gk(i,1)
|
||||
s22 = s22 + tmpf * gk(i,2)*gk(i,2)
|
||||
s32 = s32 + tmpf * gk(i,3)*gk(i,2)
|
||||
s33 = s33 + tmpf * gk(i,3)*gk(i,3)
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
! ... contribution from the nonlocal part
|
||||
! ... contribution from the nonlocal part
|
||||
!
|
||||
CALL stres_us( ik, gk, sigmanlc )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( kfac )
|
||||
DEALLOCATE( gk )
|
||||
sigmakin(:,1) = [s11, s21, s31]
|
||||
sigmakin(:,2) = [0._DP,s22, s32]
|
||||
sigmakin(:,3) = [0._DP,0._DP,s33]
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE( gk, kfac )
|
||||
!
|
||||
!$acc exit data delete(evc)
|
||||
!
|
||||
! ... the kinetic term must be summed over PW's and over k-points
|
||||
!
|
||||
|
@ -121,11 +159,11 @@ SUBROUTINE stres_knl( sigmanlc, sigmakin )
|
|||
ENDDO
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
sigmakin(:,:) = 2.d0 * e2 / omega * sigmakin(:,:)
|
||||
sigmakin(:,:) = 2._DP * e2 / omega * sigmakin(:,:)
|
||||
ELSE
|
||||
sigmakin(:,:) = e2 / omega * sigmakin(:,:)
|
||||
ENDIF
|
||||
sigmanlc(:,:) = -1.d0 / omega * sigmanlc(:,:)
|
||||
sigmanlc(:,:) = -1._DP / omega * sigmanlc(:,:)
|
||||
!
|
||||
! ... symmetrize stress
|
||||
!
|
||||
|
|
|
@ -1,168 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE stres_knl_gpu( sigmanlc, sigmakin )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Computes the kinetic + nonlocal contribuition to the stress
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: pi, e2
|
||||
USE cell_base, ONLY: omega, alat, at, bg, tpiba
|
||||
USE gvect, ONLY: g, g_d
|
||||
USE gvecw, ONLY: qcutz, ecfixed, q2sigma
|
||||
USE klist, ONLY: nks, xk, ngk, igk_k_d
|
||||
USE io_files, ONLY: iunwfc, nwordwfc
|
||||
USE buffers, ONLY: get_buffer
|
||||
USE symme, ONLY: symmatrix
|
||||
USE wvfct, ONLY: npwx, nbnd, wg
|
||||
USE control_flags, ONLY: gamma_only, use_gpu
|
||||
USE noncollin_module, ONLY: noncolin, npol
|
||||
USE wavefunctions, ONLY: evc
|
||||
USE mp_pools, ONLY: inter_pool_comm
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
USE mp, ONLY: mp_sum
|
||||
#if defined(__CUDA)
|
||||
USE wavefunctions_gpum, ONLY: using_evc, using_evc_d, evc_d
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP) :: sigmanlc(3,3)
|
||||
!! non-local contribution to stress
|
||||
REAL(DP) :: sigmakin(3,3)
|
||||
!! kinetic contribution to stress
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP), POINTER :: gk_d(:,:), kfac_d (:)
|
||||
REAL(DP) :: twobysqrtpi, gk2, arg, s11, s21, s31, s22, s32, s33, &
|
||||
xk1, xk2, xk3, tmpf, wg_nk
|
||||
INTEGER :: npw, ik, l, m, i, ibnd, is
|
||||
INTEGER :: ierr(2)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
ATTRIBUTES(DEVICE) :: gk_d, kfac_d
|
||||
!
|
||||
CALL using_evc(0)
|
||||
CALL using_evc_d(0)
|
||||
!
|
||||
CALL dev_buf%lock_buffer( gk_d, [npwx,3], ierr(1) )
|
||||
CALL dev_buf%lock_buffer( kfac_d, npwx, ierr(2) )
|
||||
!
|
||||
sigmanlc(:,:) = 0._DP
|
||||
sigmakin(:,:) = 0._DP
|
||||
twobysqrtpi = 2._DP/SQRT(pi)
|
||||
!
|
||||
kfac_d(:) = 1._DP
|
||||
!
|
||||
s11 = 0._DP ; s22 = 0._DP
|
||||
s21 = 0._DP ; s32 = 0._DP
|
||||
s31 = 0._DP ; s33 = 0._DP
|
||||
!
|
||||
DO ik = 1, nks
|
||||
IF ( nks > 1 ) THEN
|
||||
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
|
||||
CALL using_evc(2)
|
||||
CALL using_evc_d(0)
|
||||
ENDIF
|
||||
!
|
||||
npw = ngk(ik)
|
||||
!
|
||||
xk1 = xk(1,ik)
|
||||
xk2 = xk(2,ik)
|
||||
xk3 = xk(3,ik)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, npw
|
||||
gk_d(i,1) = ( xk1 + g_d(1,igk_k_d(i,ik)) ) * tpiba
|
||||
gk_d(i,2) = ( xk2 + g_d(2,igk_k_d(i,ik)) ) * tpiba
|
||||
gk_d(i,3) = ( xk3 + g_d(3,igk_k_d(i,ik)) ) * tpiba
|
||||
IF (qcutz > 0._DP) THEN
|
||||
gk2 = gk_d(i,1)**2 + gk_d(i,2)**2 + gk_d(i,3)**2
|
||||
arg = ( (gk2-ecfixed)/q2sigma )**2
|
||||
kfac_d(i) = 1._DP + qcutz / q2sigma * twobysqrtpi * EXP(-arg)
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
! ... kinetic contribution
|
||||
!
|
||||
DO ibnd = 1, nbnd
|
||||
wg_nk = wg(ibnd,ik)
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, npw
|
||||
IF (noncolin) THEN
|
||||
tmpf = wg_nk * kfac_d(i) * &
|
||||
( DBLE(CONJG(evc_d( i ,ibnd))*evc_d( i ,ibnd)) + &
|
||||
DBLE(CONJG(evc_d(i+npwx,ibnd))*evc_d(i+npwx,ibnd)) )
|
||||
ELSE
|
||||
tmpf = wg_nk * kfac_d(i) * &
|
||||
DBLE( CONJG(evc_d(i, ibnd) ) * evc_d(i, ibnd) )
|
||||
ENDIF
|
||||
s11 = s11 + tmpf * gk_d(i,1)*gk_d(i,1)
|
||||
s21 = s21 + tmpf * gk_d(i,2)*gk_d(i,1)
|
||||
s31 = s31 + tmpf * gk_d(i,3)*gk_d(i,1)
|
||||
s22 = s22 + tmpf * gk_d(i,2)*gk_d(i,2)
|
||||
s32 = s32 + tmpf * gk_d(i,3)*gk_d(i,2)
|
||||
s33 = s33 + tmpf * gk_d(i,3)*gk_d(i,3)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
! ... contribution from the nonlocal part
|
||||
!
|
||||
CALL stres_us_gpu( ik, gk_d, sigmanlc )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
sigmakin(:,1) = [s11, s21, s31]
|
||||
sigmakin(:,2) = [0._DP,s22, s32]
|
||||
sigmakin(:,3) = [0._DP,0._DP,s33]
|
||||
!
|
||||
CALL dev_buf%release_buffer( kfac_d, ierr(2) )
|
||||
CALL dev_buf%release_buffer( gk_d, ierr(1) )
|
||||
!
|
||||
! ... the kinetic term must be summed over PW's and over k-points
|
||||
!
|
||||
CALL mp_sum( sigmakin, intra_bgrp_comm )
|
||||
CALL mp_sum( sigmakin, inter_pool_comm )
|
||||
!
|
||||
! ... the nonlocal term is summed here only over k-points, because we add
|
||||
! ... to it the US term from augmentation charge derivatives
|
||||
!
|
||||
CALL mp_sum( sigmanlc, inter_pool_comm )
|
||||
!
|
||||
! ... add US term from augmentation charge derivatives, sum result over PW's
|
||||
!
|
||||
CALL addusstress( sigmanlc )
|
||||
CALL mp_sum( sigmanlc, intra_bgrp_comm )
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, l-1
|
||||
sigmanlc(m,l) = sigmanlc(l,m)
|
||||
sigmakin(m,l) = sigmakin(l,m)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
sigmakin(:,:) = 2._DP * e2 / omega * sigmakin(:,:)
|
||||
ELSE
|
||||
sigmakin(:,:) = e2 / omega * sigmakin(:,:)
|
||||
ENDIF
|
||||
sigmanlc(:,:) = -1._DP / omega * sigmanlc(:,:)
|
||||
!
|
||||
! ... symmetrize stress
|
||||
!
|
||||
CALL symmatrix( sigmakin )
|
||||
CALL symmatrix( sigmanlc )
|
||||
#endif
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_knl_gpu
|
||||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
|
@ -7,8 +6,9 @@
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
subroutine stres_loc( sigmaloc )
|
||||
SUBROUTINE stres_loc( sigmaloc )
|
||||
!----------------------------------------------------------------------
|
||||
!! Calculate the local term of the stress.
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE atom, ONLY : msh, rgrid
|
||||
|
@ -24,88 +24,125 @@ subroutine stres_loc( sigmaloc )
|
|||
USE uspp_param, ONLY : upf
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc, cutoff_stres_sigmaloc
|
||||
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc, &
|
||||
cutoff_stres_sigmaloc
|
||||
!
|
||||
implicit none
|
||||
IMPLICIT NONE
|
||||
!
|
||||
real(DP) :: sigmaloc(3,3)
|
||||
real(DP), allocatable :: dvloc(:)
|
||||
complex(DP), allocatable :: rhog(:,:)
|
||||
real(DP) :: evloc, fact
|
||||
integer :: ng, nt, l, m
|
||||
REAL(DP) :: sigmaloc(3,3)
|
||||
REAL(DP), ALLOCATABLE :: dvloc(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
|
||||
REAL(DP) :: evloc, fact
|
||||
INTEGER :: ng, nt, l, m
|
||||
! counter on g vectors
|
||||
! counter on atomic type
|
||||
! counter on angular momentum
|
||||
! counter on spin components
|
||||
REAL(DP) :: sigma11, sigma21, sigma22, spart, &
|
||||
sigma31, sigma32, sigma33
|
||||
!
|
||||
allocate( dvloc(ngl), rhog(dfftp%nnr,1) )
|
||||
ALLOCATE( dvloc(ngl), rhog(dfftp%nnr,1) )
|
||||
sigmaloc(:,:) = 0.d0
|
||||
!
|
||||
call rho_r2g( dfftp, rho%of_r(:,1), rhog )
|
||||
!$acc data create(rhog)
|
||||
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
|
||||
!
|
||||
! rhog contains the charge density in G space
|
||||
if (gamma_only) then
|
||||
!$acc data copyin(vloc,strf,gl,igtongl) create(dvloc)
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
fact = 2.d0
|
||||
else
|
||||
ELSE
|
||||
fact = 1.d0
|
||||
endif
|
||||
ENDIF
|
||||
!
|
||||
evloc = 0.0d0
|
||||
do nt = 1, ntyp
|
||||
if (gstart==2) evloc = evloc + rhog(1,1) * strf(1,nt) * vloc(igtongl(1),nt)
|
||||
do ng = gstart, ngm
|
||||
evloc = evloc + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt) ) &
|
||||
* vloc(igtongl(ng),nt) * fact
|
||||
enddo
|
||||
enddo
|
||||
! 2D: add contribution from cutoff long-range part of Vloc
|
||||
IF (do_cutoff_2D) call cutoff_stres_evloc( rhog(:,1), strf, evloc )
|
||||
!
|
||||
IF (gstart==2) THEN
|
||||
!$acc parallel loop reduction(+:evloc)
|
||||
DO nt = 1, ntyp
|
||||
evloc = evloc + rhog(1,1)*strf(1,nt)*vloc(igtongl(1),nt)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
!$acc parallel loop collapse(2) reduction(+:evloc)
|
||||
DO nt = 1, ntyp
|
||||
DO ng = gstart, ngm
|
||||
evloc = evloc + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt)) &
|
||||
* vloc(igtongl(ng),nt) * fact
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
! ... 2D: add contribution from cutoff long-range part of Vloc
|
||||
IF (do_cutoff_2D) CALL cutoff_stres_evloc( rhog(:,1), strf, evloc )
|
||||
!
|
||||
! WRITE( 6,*) ' evloc ', evloc, evloc*omega ! DEBUG
|
||||
!
|
||||
do nt = 1, ntyp
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
IF ( upf(nt)%is_gth ) THEN
|
||||
!
|
||||
! special case: GTH pseudopotential
|
||||
!
|
||||
call dvloc_gth( nt, upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
|
||||
CALL dvloc_gth( nt, upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
|
||||
!
|
||||
ELSEIF ( upf(nt)%tcoulombp ) THEN
|
||||
!
|
||||
! special case: pseudopotential is coulomb 1/r potential
|
||||
!
|
||||
call dvloc_coul( upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
|
||||
CALL dvloc_coul( upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! normal case: dvloc contains dV_loc(G)/dG
|
||||
!
|
||||
call dvloc_of_g( rgrid(nt)%mesh, msh(nt), rgrid(nt)%rab, rgrid(nt)%r, &
|
||||
upf(nt)%vloc(:), upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc )
|
||||
CALL dvloc_of_g( rgrid(nt)%mesh, msh(nt), rgrid(nt)%rab, rgrid(nt)%r, &
|
||||
upf(nt)%vloc(:), upf(nt)%zp, tpiba2, ngl, gl, omega, &
|
||||
dvloc )
|
||||
!
|
||||
ENDIF
|
||||
! no G=0 contribution
|
||||
do ng = 1, ngm
|
||||
do l = 1, 3
|
||||
do m = 1, l
|
||||
sigmaloc(l,m) = sigmaloc(l,m) + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt))* &
|
||||
2.0d0 * dvloc(igtongl(ng) ) * tpiba2 * &
|
||||
g(l,ng) * g(m,ng) * fact
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
IF (do_cutoff_2D) call cutoff_stres_sigmaloc( rhog(:,1), strf, sigmaloc ) ! 2D: re-add LR Vloc to sigma here
|
||||
! ... no G=0 contribution
|
||||
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
|
||||
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32,&
|
||||
!$acc sigma33)
|
||||
DO ng = 1, ngm
|
||||
spart = DBLE(CONJG(rhog(ng,1)) * strf(ng,nt)) * 2.0_DP * &
|
||||
dvloc(igtongl(ng))
|
||||
sigma11 = sigma11 + spart * g(1,ng) * g(1,ng)
|
||||
sigma21 = sigma21 + spart * g(2,ng) * g(1,ng)
|
||||
sigma22 = sigma22 + spart * g(2,ng) * g(2,ng)
|
||||
sigma31 = sigma31 + spart * g(3,ng) * g(1,ng)
|
||||
sigma32 = sigma32 + spart * g(3,ng) * g(2,ng)
|
||||
sigma33 = sigma33 + spart * g(3,ng) * g(3,ng)
|
||||
ENDDO
|
||||
!
|
||||
sigmaloc(1,1) = sigmaloc(1,1) + sigma11 * fact * tpiba2
|
||||
sigmaloc(2,1) = sigmaloc(2,1) + sigma21 * fact * tpiba2
|
||||
sigmaloc(2,2) = sigmaloc(2,2) + sigma22 * fact * tpiba2
|
||||
sigmaloc(3,1) = sigmaloc(3,1) + sigma31 * fact * tpiba2
|
||||
sigmaloc(3,2) = sigmaloc(3,2) + sigma32 * fact * tpiba2
|
||||
sigmaloc(3,3) = sigmaloc(3,3) + sigma33 * fact * tpiba2
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
do l = 1, 3
|
||||
! ... 2D: re-add LR Vloc to sigma here
|
||||
IF (do_cutoff_2D) CALL cutoff_stres_sigmaloc( rhog(:,1), strf, sigmaloc )
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmaloc(l,l) = sigmaloc(l,l) + evloc
|
||||
do m = 1, l-1
|
||||
DO m = 1, l-1
|
||||
sigmaloc(m,l) = sigmaloc(l,m)
|
||||
enddo
|
||||
enddo
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
call mp_sum( sigmaloc, intra_bgrp_comm )
|
||||
CALL mp_sum( sigmaloc, intra_bgrp_comm )
|
||||
!
|
||||
deallocate( dvloc, rhog )
|
||||
return
|
||||
end subroutine stres_loc
|
||||
|
||||
DEALLOCATE( dvloc, rhog )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_loc
|
||||
|
|
|
@ -1,199 +0,0 @@
|
|||
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE stres_loc_gpu( sigmaloc )
|
||||
!----------------------------------------------------------------------
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE atom, ONLY : msh, rgrid
|
||||
USE m_gth, ONLY : dvloc_gth_gpu
|
||||
USE ions_base, ONLY : ntyp => nsp
|
||||
USE cell_base, ONLY : omega, tpiba2
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE fft_rho, ONLY : rho_r2g
|
||||
USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, &
|
||||
g_d, gl_d, igtongl_d
|
||||
USE scf, ONLY : rho
|
||||
USE vlocal, ONLY : strf, vloc
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE uspp_param, ONLY : upf
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc_gpu, cutoff_stres_sigmaloc_gpu
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
#endif
|
||||
!
|
||||
implicit none
|
||||
!
|
||||
REAL(DP) :: sigmaloc(3,3)
|
||||
REAL(DP) :: evloc, fact
|
||||
INTEGER :: ng, nt, l, m
|
||||
! counter on g vectors
|
||||
! counter on atomic type
|
||||
! counter on angular momentum
|
||||
! counter on spin components
|
||||
!
|
||||
INTEGER :: ierrs(5)
|
||||
REAL(DP) :: evloc_d, zp_d
|
||||
REAL(DP) :: spart, sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
|
||||
!
|
||||
INTEGER :: mshd
|
||||
!
|
||||
INTEGER, POINTER :: nl_d(:)
|
||||
REAL(DP), POINTER :: rab_d(:), r_d(:), vloc_d(:,:), dvloc_d(:), &
|
||||
upfvloc_d(:)
|
||||
COMPLEX(DP), POINTER :: strf_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: rhog(:,:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: vloc_d, strf_d, nl_d, rab_d, r_d, dvloc_d, &
|
||||
upfvloc_d
|
||||
!
|
||||
nl_d => dfftp%nl_d
|
||||
!
|
||||
sigmaloc(:,:) = 0._DP
|
||||
!
|
||||
ALLOCATE( rhog(dfftp%nnr,1) )
|
||||
!$acc data create( rhog )
|
||||
!
|
||||
CALL rho_r2g( dfftp, rho%of_r(:,1), rhog )
|
||||
!
|
||||
gl_d = gl ; igtongl_d = igtongl
|
||||
!
|
||||
! rhog contains now the charge density in G space
|
||||
fact = 1._DP
|
||||
IF (gamma_only) fact = 2._DP
|
||||
!
|
||||
evloc = 0.0_DP
|
||||
!
|
||||
CALL dev_buf%lock_buffer( strf_d, (/ ngm, ntyp /), ierrs(1) )
|
||||
CALL dev_memcpy( strf_d, strf )
|
||||
!
|
||||
CALL dev_buf%lock_buffer( vloc_d, (/ ngl, ntyp /), ierrs(2) )
|
||||
CALL dev_memcpy( vloc_d, vloc )
|
||||
!
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
IF (gstart==2) THEN
|
||||
!$acc kernels
|
||||
evloc = evloc + rhog(1,1) * strf_d(1,nt) * vloc_d(igtongl_d(1),nt)
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!
|
||||
evloc_d = 0._DP
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ng = gstart, ngm
|
||||
evloc_d = evloc_d + DBLE(CONJG(rhog(ng,1)) * strf_d(ng,nt)) &
|
||||
* vloc_d(igtongl_d(ng),nt)
|
||||
ENDDO
|
||||
!
|
||||
evloc = evloc + evloc_d * fact
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
CALL dev_buf%release_buffer( vloc_d, ierrs(2) )
|
||||
!
|
||||
CALL dev_buf%lock_buffer( dvloc_d, ngl, ierrs(2) )
|
||||
mshd = MAXVAL(rgrid(1:ntyp)%mesh)
|
||||
CALL dev_buf%lock_buffer( rab_d, mshd, ierrs(3) )
|
||||
CALL dev_buf%lock_buffer( r_d, mshd, ierrs(4) )
|
||||
CALL dev_buf%lock_buffer( upfvloc_d, mshd, ierrs(5) )
|
||||
!
|
||||
!
|
||||
! 2D: add contribution from cutoff long-range part of Vloc
|
||||
!$acc host_data use_device(rhog)
|
||||
IF (do_cutoff_2D) CALL cutoff_stres_evloc_gpu( rhog(:,1), strf_d, evloc )
|
||||
!$acc end host_data
|
||||
!
|
||||
! WRITE( 6,*) ' evloc ', evloc, evloc*omega ! DEBUG
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
IF ( upf(nt)%is_gth ) THEN
|
||||
!
|
||||
! special case: GTH pseudopotential
|
||||
!
|
||||
CALL dvloc_gth_gpu( nt, upf(nt)%zp, tpiba2, ngl, gl_d, omega, dvloc_d )
|
||||
!
|
||||
ELSEIF ( upf(nt)%tcoulombp ) THEN
|
||||
!
|
||||
! special case: pseudopotential is coulomb 1/r potential
|
||||
!
|
||||
CALL dvloc_coul_gpu( upf(nt)%zp, tpiba2, ngl, gl_d, omega, dvloc_d )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! normal case: dvloc contains dV_loc(G)/dG
|
||||
!
|
||||
! the G=0 component is not computed
|
||||
rab_d(1:msh(nt)) = rgrid(nt)%rab(1:msh(nt))
|
||||
r_d(1:msh(nt)) = rgrid(nt)%r(1:msh(nt))
|
||||
upfvloc_d(1:msh(nt)) = upf(nt)%vloc(1:msh(nt))
|
||||
zp_d = upf(nt)%zp
|
||||
!
|
||||
CALL dvloc_of_g_gpu( rgrid(nt)%mesh, msh(nt), rab_d(1:rgrid(nt)%mesh), &
|
||||
r_d(1:rgrid(nt)%mesh), upfvloc_d(1:rgrid(nt)%mesh), &
|
||||
zp_d, tpiba2, ngl, gl_d, omega, dvloc_d )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
|
||||
sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32,sigma33)
|
||||
DO ng = 1, ngm
|
||||
spart = DBLE(CONJG(rhog(ng,1)) * strf_d(ng,nt)) * 2.0_DP *&
|
||||
dvloc_d(igtongl_d(ng))
|
||||
sigma11 = sigma11 + spart * g_d(1,ng) * g_d(1,ng)
|
||||
sigma21 = sigma21 + spart * g_d(2,ng) * g_d(1,ng)
|
||||
sigma22 = sigma22 + spart * g_d(2,ng) * g_d(2,ng)
|
||||
sigma31 = sigma31 + spart * g_d(3,ng) * g_d(1,ng)
|
||||
sigma32 = sigma32 + spart * g_d(3,ng) * g_d(2,ng)
|
||||
sigma33 = sigma33 + spart * g_d(3,ng) * g_d(3,ng)
|
||||
ENDDO
|
||||
!
|
||||
sigmaloc(1,1) = sigmaloc(1,1) + sigma11 * fact * tpiba2
|
||||
sigmaloc(2,1) = sigmaloc(2,1) + sigma21 * fact * tpiba2
|
||||
sigmaloc(2,2) = sigmaloc(2,2) + sigma22 * fact * tpiba2
|
||||
sigmaloc(3,1) = sigmaloc(3,1) + sigma31 * fact * tpiba2
|
||||
sigmaloc(3,2) = sigmaloc(3,2) + sigma32 * fact * tpiba2
|
||||
sigmaloc(3,3) = sigmaloc(3,3) + sigma33 * fact * tpiba2
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc host_data use_device(rhog)
|
||||
IF (do_cutoff_2D) CALL cutoff_stres_sigmaloc_gpu( rhog(:,1), strf_d, sigmaloc ) ! 2D: re-add LR Vloc to sigma here
|
||||
!$acc end host_data
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmaloc(l,l) = sigmaloc(l,l) + evloc
|
||||
DO m = 1, l-1
|
||||
sigmaloc(m,l) = sigmaloc(l,m)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
CALL mp_sum( sigmaloc, intra_bgrp_comm )
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( rhog )
|
||||
CALL dev_buf%release_buffer( strf_d, ierrs(1) )
|
||||
CALL dev_buf%release_buffer( dvloc_d, ierrs(2) )
|
||||
CALL dev_buf%release_buffer( rab_d, ierrs(3) )
|
||||
CALL dev_buf%release_buffer( r_d, ierrs(4) )
|
||||
CALL dev_buf%release_buffer( upfvloc_d, ierrs(5) )
|
||||
!
|
||||
#endif
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_loc_gpu
|
||||
|
|
@ -9,16 +9,16 @@
|
|||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE stres_mgga( sigmaxc )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! Analytic stress tensor contribution from metagga is added to sigmaxc
|
||||
!! Analytic stress tensor contribution from metagga is added to sigmaxc.
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE noncollin_module, ONLY : noncolin
|
||||
USE cell_base, ONLY : alat, at, bg, omega, tpiba
|
||||
USE cell_base, ONLY : omega
|
||||
USE gvect, ONLY : g
|
||||
USE scf, ONLY : rho, v
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
USE klist, ONLY : nks, xk, ngk
|
||||
USE buffers, ONLY : get_buffer
|
||||
|
@ -26,74 +26,75 @@ SUBROUTINE stres_mgga( sigmaxc )
|
|||
USE wvfct, ONLY : nbnd, npwx, wg
|
||||
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
|
||||
USE fft_interfaces, ONLY : fwfft, invfft
|
||||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE fft_base, ONLY : dffts
|
||||
USE mp, ONLY : mp_sum
|
||||
USE mp_pools, ONLY : inter_pool_comm
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
|
||||
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
|
||||
!
|
||||
! Internal variables
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ix, iy, ir, iss, ipol, incr, ibnd, ik, npw
|
||||
INTEGER :: ipol2xy(3,3)
|
||||
!! ipol2xy(i,j) = ipol2x(j,i) is a collapsed symmetric index
|
||||
DATA ipol2xy / 1, 2, 3, 2, 4, 5, 3, 5, 6/
|
||||
REAL(DP), PARAMETER :: epsr = 1.0d-6, epsg = 1.0d-10, e2 = 2.d0
|
||||
COMPLEX(DP), ALLOCATABLE :: gradwfc (:,:), crosstaus(:,:,:)
|
||||
REAL(DP) :: w1, w2, delta, sigma_mgga(3,3)
|
||||
INTEGER :: ix, iy, K, ir, ipol, iss, incr, ibnd, ik, npw
|
||||
!
|
||||
if ( .not. xclib_dft_is('meta') ) return
|
||||
REAL(DP), PARAMETER :: epsr = 1.E-6_DP, epsg = 1.E-10_DP, e2 = 2._DP
|
||||
!
|
||||
current_spin=1
|
||||
COMPLEX(DP), ALLOCATABLE :: gradwfc(:,:), crosstaus(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: vkin(:), rhokin(:)
|
||||
REAL(DP) :: sigma1, sigma2, sigma3, &
|
||||
sigma4, sigma5, sigma6
|
||||
!
|
||||
REAL(DP) :: w1, w2, delta, sigma_mgga(3,3)
|
||||
!
|
||||
!
|
||||
! Stop if something is not yet implemented
|
||||
IF ( .NOT. xclib_dft_is('meta') ) RETURN
|
||||
!
|
||||
if (noncolin) call errore('stres_mgga', &
|
||||
'noncollinear stress + meta-GGA not implemented',1)
|
||||
CALL using_evc(1)
|
||||
!
|
||||
! Initialization of a set of variables
|
||||
current_spin = 1
|
||||
!
|
||||
allocate (gradwfc( dffts%nnr, 3))
|
||||
allocate (crosstaus( dffts%nnr,6,nspin))
|
||||
IF ( noncolin ) CALL errore( 'stres_mgga', 'noncollinear stress + meta-GGA &
|
||||
¬ implemented', 1 )
|
||||
!
|
||||
! For gamma_only efficiency
|
||||
ALLOCATE( gradwfc(dffts%nnr,3) )
|
||||
ALLOCATE( crosstaus(dffts%nnr,6,nspin) )
|
||||
!$acc data create(crosstaus)
|
||||
!$acc data create(gradwfc)
|
||||
!
|
||||
incr=1
|
||||
IF ( gamma_only ) incr=2
|
||||
!$acc kernels
|
||||
crosstaus = 0.d0
|
||||
!$acc end kernels
|
||||
!
|
||||
crosstaus(:,:,:) = 0.d0
|
||||
gradwfc(:,:) = 0.d0
|
||||
! ... For gamma_only efficiency
|
||||
!
|
||||
! Loop over the k points
|
||||
incr = 1
|
||||
IF ( gamma_only ) incr = 2
|
||||
!
|
||||
! ... Loop over the k points
|
||||
!
|
||||
k_loop: DO ik = 1, nks
|
||||
!
|
||||
!
|
||||
IF ( lsda ) current_spin = isk(ik)
|
||||
!
|
||||
npw = ngk(ik)
|
||||
!
|
||||
! Read the wavefunctions
|
||||
! ... Read the wavefunctions
|
||||
!
|
||||
IF ( nks > 1 ) THEN
|
||||
!
|
||||
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
|
||||
CALL using_evc(2)
|
||||
!
|
||||
END IF
|
||||
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
|
||||
!$acc update device(evc)
|
||||
ENDIF
|
||||
!
|
||||
do ibnd = 1, nbnd, incr
|
||||
!
|
||||
DO ibnd = 1, nbnd, incr
|
||||
!
|
||||
! w1, w2: weights for each k point and band
|
||||
! ... w1, w2: weights for each k point and band
|
||||
!
|
||||
w1 = wg(ibnd,ik) / omega
|
||||
!
|
||||
IF ( (ibnd < nbnd) .and. (gamma_only) ) THEN
|
||||
IF ( (ibnd < nbnd) .AND. (gamma_only) ) THEN
|
||||
!
|
||||
! ... two ffts at the same time
|
||||
!
|
||||
|
@ -103,165 +104,223 @@ SUBROUTINE stres_mgga( sigmaxc )
|
|||
!
|
||||
w2 = w1
|
||||
!
|
||||
END IF
|
||||
ENDIF
|
||||
!
|
||||
! ... Gradient of the wavefunction in real space
|
||||
!
|
||||
! Gradient of the wavefunction in real space
|
||||
!
|
||||
CALL wfc_gradient( ibnd, ik, npw, gradwfc )
|
||||
!
|
||||
! Cross terms of kinetic energy density
|
||||
! ... Cross terms of kinetic energy density
|
||||
!
|
||||
do ix=1,3
|
||||
!
|
||||
do iy=1,ix
|
||||
!
|
||||
ipol = ipol2xy(iy,ix)
|
||||
!
|
||||
do ir=1,dffts%nnr
|
||||
!
|
||||
crosstaus(ir,ipol,current_spin) = crosstaus(ir,ipol,current_spin) +&
|
||||
2.0_DP*w1*DBLE(gradwfc(ir,ix))*DBLE(gradwfc(ir,iy)) +&
|
||||
2.0_DP*w2*AIMAG(gradwfc(ir,ix))*AIMAG(gradwfc(ir,iy))
|
||||
!
|
||||
end do
|
||||
!
|
||||
end do
|
||||
!
|
||||
end do
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ir = 1, dffts%nnr
|
||||
DO ipol = 1, 6
|
||||
!
|
||||
! ... explanation here: https://stackoverflow.com/a/244550
|
||||
!
|
||||
! M*(M+1)/ 2
|
||||
K = (3.0*4.0)/2.0 - 1 - (ipol - 1)
|
||||
K = FLOOR((SQRT(FLOAT(8*K+1))-1)/2)
|
||||
ix = (ipol-1) - (3.0*4.0)/2.0 + (K+1)*(K+2)/2.0 + 1 + (2-K)
|
||||
iy = 3 - K
|
||||
!
|
||||
crosstaus(ir,ipol,current_spin) = crosstaus(ir,ipol,current_spin) + &
|
||||
2.0_DP*w1*DBLE(gradwfc(ir,ix))*DBLE(gradwfc(ir,iy)) + &
|
||||
2.0_DP*w2*AIMAG(gradwfc(ir,ix))*AIMAG(gradwfc(ir,iy))
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
end do
|
||||
ENDDO !ibnd
|
||||
!
|
||||
END DO k_loop
|
||||
ENDDO k_loop
|
||||
!
|
||||
call mp_sum( crosstaus, inter_pool_comm )
|
||||
!$acc end data
|
||||
DEALLOCATE( gradwfc )
|
||||
!
|
||||
! gradwfc not used anymore
|
||||
!$acc host_data use_device(crosstaus)
|
||||
CALL mp_sum( crosstaus, inter_pool_comm )
|
||||
!$acc end host_data
|
||||
!
|
||||
deallocate (gradwfc)
|
||||
ALLOCATE( vkin(dffts%nnr) )
|
||||
ALLOCATE( rhokin(dffts%nnr) )
|
||||
!$acc data create(vkin,rhokin)
|
||||
!
|
||||
sigma_mgga(:,:) = 0.D0
|
||||
! ... metagga contribution to the stress tensor
|
||||
sigma_mgga(:,:) = 0._DP
|
||||
!
|
||||
! metagga contribution to the stress tensor
|
||||
sigma1 = 0.d0 ; sigma4 = 0.d0
|
||||
sigma2 = 0.d0 ; sigma5 = 0.d0
|
||||
sigma3 = 0.d0 ; sigma6 = 0.d0
|
||||
!
|
||||
do iss=1,nspin
|
||||
DO iss = 1, nspin
|
||||
!
|
||||
do ix=1,3
|
||||
!
|
||||
do iy=1,3
|
||||
!
|
||||
delta=0.
|
||||
if (ix==iy) delta=1.
|
||||
!
|
||||
do ir=1,dffts%nnr
|
||||
!
|
||||
sigma_mgga(ix,iy) = sigma_mgga(ix,iy) + v%kin_r(ir,iss) &
|
||||
* ( rho%kin_r(ir,iss) * delta &
|
||||
+ crosstaus(ir,ipol2xy(ix,iy),iss) )
|
||||
!
|
||||
end do
|
||||
!
|
||||
!
|
||||
end do
|
||||
!
|
||||
end do
|
||||
vkin = v%kin_r(:,iss)
|
||||
rhokin = rho%kin_r(:,iss)
|
||||
!$acc update device(vkin,rhokin)
|
||||
!
|
||||
end do
|
||||
deallocate( crosstaus )
|
||||
!$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6)
|
||||
DO ir = 1, dffts%nnr
|
||||
!
|
||||
sigma1 = sigma1 + vkin(ir) * ( rhokin(ir) &
|
||||
+ DBLE(crosstaus(ir,1,iss)) )
|
||||
sigma2 = sigma2 + vkin(ir) * DBLE(crosstaus(ir,2,iss))
|
||||
sigma3 = sigma3 + vkin(ir) * DBLE(crosstaus(ir,3,iss))
|
||||
sigma4 = sigma4 + vkin(ir) * ( rhokin(ir) &
|
||||
+ DBLE(crosstaus(ir,4,iss)) )
|
||||
sigma5 = sigma5 + vkin(ir) * DBLE(crosstaus(ir,5,iss))
|
||||
sigma6 = sigma6 + vkin(ir) * ( rhokin(ir) &
|
||||
+ DBLE(crosstaus(ir,6,iss)) )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
sigma_mgga(1,1) = sigma1 ; sigma_mgga(2,3) = sigma5
|
||||
sigma_mgga(1,2) = sigma2 ; sigma_mgga(3,1) = sigma3
|
||||
sigma_mgga(1,3) = sigma3 ; sigma_mgga(3,2) = sigma5
|
||||
sigma_mgga(2,1) = sigma2 ; sigma_mgga(3,3) = sigma6
|
||||
sigma_mgga(2,2) = sigma4
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE( vkin, rhokin )
|
||||
DEALLOCATE( crosstaus )
|
||||
!
|
||||
CALL mp_sum( sigma_mgga, intra_bgrp_comm )
|
||||
!
|
||||
call mp_sum( sigma_mgga, intra_bgrp_comm )
|
||||
sigmaxc(:,:) = sigmaxc(:,:) + sigma_mgga(:,:) / &
|
||||
(dffts%nr1 * dffts%nr2 * dffts%nr3)
|
||||
!
|
||||
return
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_mgga
|
||||
|
||||
SUBROUTINE wfc_gradient ( ibnd, ik, npw, gradpsi )
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------
|
||||
SUBROUTINE wfc_gradient( ibnd, ik, npw, gradpsi )
|
||||
!----------------------------------------------------------
|
||||
!! Returns the gradient of the wavefunction in real space.
|
||||
!! Slightly adapted from sum_bands.f90
|
||||
!
|
||||
! Returns the gradient of the wavefunction in real space
|
||||
! Slightly adapted from sum_bands.f90
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE wavefunctions, ONLY : psic, evc
|
||||
USE wvfct, ONLY : npwx, nbnd
|
||||
USE cell_base, ONLY : omega, tpiba
|
||||
USE klist, ONLY : xk, igk_k
|
||||
USE gvect, ONLY : g
|
||||
USE fft_base, ONLY : dffts
|
||||
USE fft_interfaces, ONLY : invfft
|
||||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
USE kinds, ONLY: DP
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE wvfct, ONLY: npwx, nbnd
|
||||
USE cell_base, ONLY: omega, tpiba
|
||||
USE klist, ONLY: xk, igk_k
|
||||
USE wavefunctions, ONLY: psic, evc
|
||||
USE fft_base, ONLY: dffts
|
||||
USE fft_interfaces, ONLY: invfft
|
||||
USE gvect, ONLY: ngm, g
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ibnd, ik, npw
|
||||
COMPLEX(DP) :: gradpsi(dffts%nnr,3)
|
||||
INTEGER, INTENT(IN) :: ibnd, ik, npw
|
||||
COMPLEX(DP), INTENT(OUT) :: gradpsi(dffts%nnr,3)
|
||||
!
|
||||
! Internal variables
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP) :: kplusg(npwx)
|
||||
INTEGER :: ipol
|
||||
INTEGER :: ipol, j
|
||||
REAL(DP) :: kplusg
|
||||
INTEGER, ALLOCATABLE :: nld(:), nlmd(:)
|
||||
REAL(DP) :: xki(3)
|
||||
!
|
||||
CALL using_evc(0)
|
||||
!$acc data present(gradpsi) copyin(evc) create(psic)
|
||||
!
|
||||
! Compute the gradient of the wavefunction in reciprocal space
|
||||
ALLOCATE( nld(npw) )
|
||||
nld = dffts%nl
|
||||
!
|
||||
xki(1:3) = xk(1:3,ik)
|
||||
!
|
||||
! ... Compute the gradient of the wavefunction in reciprocal space
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
DO ipol=1,3
|
||||
ALLOCATE( nlmd(npw) )
|
||||
nlmd = dffts%nlm
|
||||
!$acc data copyin(xki,nld,nlmd)
|
||||
!
|
||||
DO ipol = 1, 3
|
||||
!
|
||||
psic(:) = ( 0.D0, 0.D0 )
|
||||
!
|
||||
kplusg (1:npw) = (xk(ipol,ik)+g(ipol,igk_k(1:npw,ik))) * tpiba
|
||||
!$acc kernels
|
||||
psic(:) = (0._DP,0._DP)
|
||||
!$acc end kernels
|
||||
!
|
||||
IF ( ibnd < nbnd ) THEN
|
||||
!
|
||||
! ... two ffts at the same time
|
||||
!
|
||||
psic(dffts%nl(1:npw)) = CMPLX(0d0, kplusg(1:npw),kind=DP)* &
|
||||
( evc(1:npw,ibnd) + &
|
||||
( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) )
|
||||
!
|
||||
psic(dffts%nlm(1:npw)) = CMPLX(0d0,-kplusg(1:npw),kind=DP) * &
|
||||
CONJG( evc(1:npw,ibnd) - &
|
||||
( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) )
|
||||
!$acc parallel loop
|
||||
DO j = 1, npw
|
||||
kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba
|
||||
!
|
||||
psic(nld(j)) = CMPLX(0._DP, kplusg, kind=DP) * &
|
||||
( evc(j,ibnd) + (0._DP,1._DP) * evc(j,ibnd+1) )
|
||||
!
|
||||
psic(nlmd(j)) = CMPLX(0._DP,-kplusg, kind=DP) * &
|
||||
CONJG( evc(j,ibnd) - (0._DP,1._DP) * evc(j,ibnd+1) )
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
psic(dffts%nl(1:npw)) = CMPLX(0d0, kplusg(1:npw),kind=DP)* &
|
||||
evc(1:npw,ibnd)
|
||||
!$acc parallel loop
|
||||
DO j = 1, npw
|
||||
kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba
|
||||
!
|
||||
psic(nld(j)) = CMPLX(0._DP, kplusg, kind=DP) * evc(j,ibnd)
|
||||
!
|
||||
psic(nlmd(j)) = CMPLX(0._DP,-kplusg,kind=DP) * CONJG(evc(j,ibnd))
|
||||
ENDDO
|
||||
!
|
||||
psic(dffts%nlm(1:npw)) = CMPLX(0d0,-kplusg(1:npw),kind=DP) * &
|
||||
CONJG( evc(1:npw,ibnd) )
|
||||
!
|
||||
END IF
|
||||
ENDIF
|
||||
!
|
||||
! Gradient of the wavefunction in real space
|
||||
! ... Gradient of the wavefunction in real space
|
||||
!
|
||||
CALL invfft ('Wave', psic, dffts)
|
||||
!$acc host_data use_device(psic)
|
||||
CALL invfft( 'Wave', psic, dffts )
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc kernels
|
||||
gradpsi(:,ipol) = psic
|
||||
!$acc end kernels
|
||||
!
|
||||
END DO
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE( nlmd )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
DO ipol=1,3
|
||||
!
|
||||
psic(:) = ( 0.D0, 0.D0 )
|
||||
!
|
||||
kplusg (1:npw) = (xk(ipol,ik)+g(ipol,igk_k(1:npw,ik))) * tpiba
|
||||
psic(dffts%nl(igk_k(1:npw,ik))) = CMPLX(0d0, kplusg(1:npw),kind=DP)* &
|
||||
evc(1:npw,ibnd)
|
||||
!
|
||||
! Gradient of the wavefunction in real space
|
||||
!
|
||||
CALL invfft ('Wave', psic, dffts)
|
||||
!
|
||||
gradpsi(:,ipol) = psic
|
||||
!
|
||||
END DO
|
||||
!
|
||||
END IF
|
||||
!$acc data copyin(xki,nld)
|
||||
DO ipol = 1, 3
|
||||
!
|
||||
!$acc kernels
|
||||
psic(:) = (0._DP,0._DP)
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO j = 1, npw
|
||||
kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba
|
||||
!
|
||||
psic(nld(j)) = CMPLX(0._DP,kplusg,kind=DP)*evc(j,ibnd)
|
||||
ENDDO
|
||||
!
|
||||
! ... Gradient of the wavefunction in real space
|
||||
!
|
||||
!$acc host_data use_device(psic)
|
||||
CALL invfft( 'Wave', psic, dffts )
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc kernels
|
||||
gradpsi(:,ipol) = psic
|
||||
!$acc end kernels
|
||||
!
|
||||
ENDDO
|
||||
!$acc end data
|
||||
ENDIF
|
||||
!
|
||||
DEALLOCATE( nld )
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE wfc_gradient
|
||||
|
|
|
@ -1,335 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2017 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE stres_mgga_gpu( sigmaxc )
|
||||
!----------------------------------------------------------------------------
|
||||
!! Analytic stress tensor contribution from metagga is added to sigmaxc
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE noncollin_module, ONLY : noncolin
|
||||
USE cell_base, ONLY : alat, at, bg, omega, tpiba
|
||||
USE gvect, ONLY : g
|
||||
USE scf, ONLY : rho, v
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
USE klist, ONLY : nks, xk, ngk
|
||||
USE buffers, ONLY : get_buffer
|
||||
USE io_files, ONLY : iunwfc, nwordwfc
|
||||
USE wvfct, ONLY : nbnd, npwx, wg
|
||||
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
|
||||
USE fft_interfaces, ONLY : fwfft, invfft
|
||||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE mp, ONLY : mp_sum
|
||||
USE mp_pools, ONLY : inter_pool_comm
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE device_fbuff_m , ONLY : dev_buf
|
||||
USE device_memcpy_m, ONLY : dev_memcpy, dev_memset
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(INOUT) :: sigmaxc(3,3)
|
||||
!
|
||||
! Internal variables
|
||||
!
|
||||
INTEGER :: ix, iy, K, ir, ipol, iss, incr, ibnd, ik, npw
|
||||
!
|
||||
INTEGER :: ierrs(4)
|
||||
!
|
||||
REAL(DP), PARAMETER :: epsr = 1.E-6_DP, epsg = 1.E-10_DP, e2 = 2._DP
|
||||
!
|
||||
COMPLEX(DP), POINTER :: gradwfc_d(:,:), crosstaus_d(:,:,:)
|
||||
INTEGER :: ix_d(6), iy_d(6)
|
||||
REAL(DP), POINTER :: vkin_d(:), rhokin_d(:)
|
||||
REAL(DP) :: sigma1_d, sigma2_d, sigma3_d, &
|
||||
sigma4_d, sigma5_d, sigma6_d
|
||||
!
|
||||
REAL(DP) :: w1, w2, delta, sigma_mgga(3,3)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: gradwfc_d, crosstaus_d, vkin_d, rhokin_d, ix_d, iy_d
|
||||
!
|
||||
if ( .not. xclib_dft_is('meta') ) return
|
||||
!
|
||||
current_spin = 1
|
||||
!
|
||||
! Stop if something is not yet implemented
|
||||
!
|
||||
IF ( noncolin ) CALL errore( 'stres_mgga', &
|
||||
'noncollinear stress + meta-GGA not implemented', 1 )
|
||||
!
|
||||
! Initialization of a set of variables
|
||||
!
|
||||
CALL dev_buf%lock_buffer( gradwfc_d, (/ dffts%nnr, 3 /), ierrs(1) )
|
||||
CALL dev_buf%lock_buffer( crosstaus_d, (/ dffts%nnr, 6, nspin /), ierrs(2) )
|
||||
IF (ANY(ierrs(1:2) /= 0)) CALL errore( 'stres_mgga_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs(1:2))) )
|
||||
!
|
||||
CALL dev_memset(crosstaus_d , (0._DP,0._DP) )
|
||||
!
|
||||
! For gamma_only efficiency
|
||||
!
|
||||
incr = 1
|
||||
IF ( gamma_only ) incr = 2
|
||||
!
|
||||
!Polarization indexes
|
||||
! ix_d(1) = 1 ; iy_d(1) = 1
|
||||
! ix_d(2) = 2 ; iy_d(2) = 1
|
||||
! ix_d(3) = 3 ; iy_d(3) = 1
|
||||
! ix_d(4) = 2 ; iy_d(4) = 2
|
||||
! ix_d(5) = 3 ; iy_d(5) = 2
|
||||
! ix_d(6) = 3 ; iy_d(6) = 3
|
||||
!
|
||||
! Loop over the k points
|
||||
!
|
||||
k_loop: DO ik = 1, nks
|
||||
!
|
||||
IF ( lsda ) current_spin = isk(ik)
|
||||
!
|
||||
npw = ngk(ik)
|
||||
!
|
||||
! Read the wavefunctions
|
||||
!
|
||||
IF ( nks > 1 ) THEN
|
||||
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
|
||||
CALL using_evc(2)
|
||||
ENDIF
|
||||
!
|
||||
DO ibnd = 1, nbnd, incr
|
||||
!
|
||||
! w1, w2: weights for each k point and band
|
||||
!
|
||||
w1 = wg(ibnd,ik) / omega
|
||||
!
|
||||
IF ( (ibnd < nbnd) .AND. (gamma_only) ) THEN
|
||||
!
|
||||
! ... two ffts at the same time
|
||||
!
|
||||
w2 = wg(ibnd+1,ik) / omega
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
w2 = w1
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Gradient of the wavefunction in real space
|
||||
!
|
||||
CALL wfc_gradient_gpu( ibnd, ik, npw, gradwfc_d )
|
||||
!
|
||||
! Cross terms of kinetic energy density
|
||||
!
|
||||
! FIX ME: PGI complains here if I set do(2)
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ir = 1, dffts%nnr
|
||||
DO ipol = 1, 6
|
||||
!
|
||||
! explenation here: https://stackoverflow.com/a/244550
|
||||
!
|
||||
! M*(M+1)/ 2
|
||||
K = (3.0*4.0)/2.0 - 1 - (ipol - 1)
|
||||
K = floor((SQRT(FLOAT(8*K+1))-1)/2)
|
||||
ix = (ipol-1) - (3.0*4.0)/2.0 + (K+1)*(K+2)/2.0 + 1 + (2-K)
|
||||
iy = 3 - K
|
||||
!
|
||||
crosstaus_d(ir,ipol,current_spin) = crosstaus_d(ir,ipol,current_spin) + &
|
||||
2.0_DP*w1*DBLE(gradwfc_d(ir,ix))*DBLE(gradwfc_d(ir,iy)) + &
|
||||
2.0_DP*w2*AIMAG(gradwfc_d(ir,ix))*AIMAG(gradwfc_d(ir,iy))
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!crosstaus(:,:,current_spin) = crosstaus(:,:,current_spin) + &
|
||||
! crosstaus_d(:,:,current_spin)
|
||||
!
|
||||
ENDDO !ibnd
|
||||
!
|
||||
ENDDO k_loop
|
||||
!
|
||||
!
|
||||
CALL dev_buf%release_buffer( gradwfc_d, ierrs(1) )
|
||||
!
|
||||
CALL mp_sum( crosstaus_d, inter_pool_comm )
|
||||
!
|
||||
CALL dev_buf%lock_buffer( vkin_d, dffts%nnr, ierrs(3) )
|
||||
CALL dev_buf%lock_buffer( rhokin_d, dffts%nnr, ierrs(4) )
|
||||
IF (ANY(ierrs(3:4) /= 0)) CALL errore( 'stres_mgga_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs(3:4))) )
|
||||
!
|
||||
! metagga contribution to the stress tensor
|
||||
sigma_mgga(:,:) = 0._DP
|
||||
!
|
||||
sigma1_d = 0.d0 ; sigma4_d = 0.d0
|
||||
sigma2_d = 0.d0 ; sigma5_d = 0.d0
|
||||
sigma3_d = 0.d0 ; sigma6_d = 0.d0
|
||||
!
|
||||
!
|
||||
DO iss = 1, nspin
|
||||
!
|
||||
CALL dev_memcpy( vkin_d, v%kin_r(:,iss) )
|
||||
CALL dev_memcpy( rhokin_d, rho%kin_r(:,iss) )
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ir = 1, dffts%nnr
|
||||
!
|
||||
sigma1_d = sigma1_d + vkin_d(ir) * (rhokin_d(ir) &
|
||||
+ DBLE(crosstaus_d(ir,1,iss)) )
|
||||
sigma2_d = sigma2_d + vkin_d(ir) * DBLE(crosstaus_d(ir,2,iss))
|
||||
sigma3_d = sigma3_d + vkin_d(ir) * DBLE(crosstaus_d(ir,3,iss))
|
||||
sigma4_d = sigma4_d + vkin_d(ir) * (rhokin_d(ir) &
|
||||
+ DBLE(crosstaus_d(ir,4,iss)) )
|
||||
sigma5_d = sigma5_d + vkin_d(ir) * DBLE(crosstaus_d(ir,5,iss))
|
||||
sigma6_d = sigma6_d + vkin_d(ir) * (rhokin_d(ir) &
|
||||
+ DBLE(crosstaus_d(ir,6,iss)) )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
sigma_mgga(1,1) = sigma1_d ; sigma_mgga(2,3) = sigma5_d
|
||||
sigma_mgga(1,2) = sigma2_d ; sigma_mgga(3,1) = sigma3_d
|
||||
sigma_mgga(1,3) = sigma3_d ; sigma_mgga(3,2) = sigma5_d
|
||||
sigma_mgga(2,1) = sigma2_d ; sigma_mgga(3,3) = sigma6_d
|
||||
sigma_mgga(2,2) = sigma4_d
|
||||
!
|
||||
CALL dev_buf%release_buffer( vkin_d, ierrs(3) )
|
||||
CALL dev_buf%release_buffer( rhokin_d, ierrs(4) )
|
||||
CALL dev_buf%release_buffer( crosstaus_d, ierrs(2) )
|
||||
!
|
||||
CALL mp_sum( sigma_mgga, intra_bgrp_comm )
|
||||
!
|
||||
sigmaxc(:,:) = sigmaxc(:,:) + sigma_mgga(:,:) / &
|
||||
(dffts%nr1 * dffts%nr2 * dffts%nr3)
|
||||
!
|
||||
#endif
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_mgga_gpu
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------
|
||||
SUBROUTINE wfc_gradient_gpu( ibnd, ik, npw, gradpsi_d )
|
||||
!----------------------------------------------------------
|
||||
!! Returns the gradient of the wavefunction in real space.
|
||||
!! Slightly adapted from sum_bands.f90
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE wvfct, ONLY: npwx, nbnd
|
||||
USE cell_base, ONLY: omega, tpiba
|
||||
USE klist, ONLY: xk, igk_k_d
|
||||
|
||||
USE fft_base, ONLY: dffts
|
||||
USE fft_interfaces, ONLY: invfft
|
||||
!
|
||||
USE gvect, ONLY: g_d
|
||||
USE wavefunctions_gpum, ONLY: using_evc, using_evc_d, evc_d, &
|
||||
using_psic, using_psic_d, psic_d
|
||||
#if defined(__CUDA)
|
||||
USE device_fbuff_m, ONLY: dev_buf
|
||||
USE device_memcpy_m, ONLY: dev_memcpy
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ibnd, ik, npw
|
||||
COMPLEX(DP), INTENT(OUT) :: gradpsi_d(dffts%nnr,3)
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ipol, j
|
||||
REAL(DP) :: kplusg
|
||||
INTEGER, POINTER :: nl_d(:), nlm_d(:)
|
||||
REAL(DP) :: xk_d(3)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: gradpsi_d, nl_d, nlm_d, xk_d
|
||||
!
|
||||
CALL using_evc_d(0)
|
||||
CALL using_psic_d(2)
|
||||
!
|
||||
nl_d => dffts%nl_d
|
||||
nlm_d => dffts%nlm_d
|
||||
!
|
||||
xk_d(1:3) = xk(1:3,ik)
|
||||
!
|
||||
! Compute the gradient of the wavefunction in reciprocal space
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
DO ipol = 1, 3
|
||||
!
|
||||
psic_d(:) = (0._DP,0._DP)
|
||||
!
|
||||
IF ( ibnd < nbnd ) THEN
|
||||
!
|
||||
! ... two ffts at the same time
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO j = 1, npw
|
||||
kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba
|
||||
!
|
||||
psic_d(nl_d(j)) = CMPLX(0._DP, kplusg, kind=DP) * &
|
||||
( evc_d(j,ibnd) + &
|
||||
( 0._DP, 1._DP ) * evc_d(j,ibnd+1) )
|
||||
!
|
||||
psic_d(nlm_d(j)) = CMPLX(0._DP,-kplusg, kind=DP) * &
|
||||
CONJG( evc_d(j,ibnd) - &
|
||||
( 0._DP, 1._DP ) * evc_d(j,ibnd+1) )
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO j = 1, npw
|
||||
kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba
|
||||
!
|
||||
psic_d(nl_d(j)) = CMPLX(0._DP, kplusg, kind=DP) * &
|
||||
evc_d(j,ibnd)
|
||||
!
|
||||
psic_d(nlm_d(j)) = CMPLX(0._DP,-kplusg,kind=DP) * &
|
||||
CONJG( evc_d(j,ibnd) )
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Gradient of the wavefunction in real space
|
||||
!
|
||||
CALL invfft( 'Wave', psic_d, dffts )
|
||||
!
|
||||
CALL dev_memcpy( gradpsi_d(:,ipol), psic_d )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
DO ipol = 1, 3
|
||||
!
|
||||
psic_d(:) = (0._DP,0._DP)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO j = 1, npw
|
||||
kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba
|
||||
!
|
||||
psic_d(nl_d(j)) = CMPLX(0._DP,kplusg,kind=DP)*evc_d(j,ibnd)
|
||||
ENDDO
|
||||
!
|
||||
! Gradient of the wavefunction in real space
|
||||
!
|
||||
CALL invfft( 'Wave', psic_d, dffts )
|
||||
!
|
||||
CALL dev_memcpy( gradpsi_d(:,ipol), psic_d )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
#endif
|
||||
!
|
||||
END SUBROUTINE wfc_gradient_gpu
|
1262
PW/src/stres_us.f90
1262
PW/src/stres_us.f90
File diff suppressed because it is too large
Load Diff
|
@ -1,865 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2021 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE stres_us_gpu( ik, gk_d, sigmanlc )
|
||||
!----------------------------------------------------------------------------
|
||||
!! nonlocal (separable pseudopotential) contribution to the stress
|
||||
!! NOTICE: sum of partial results over procs is performed in calling routine
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
USE constants, ONLY : eps8
|
||||
USE klist, ONLY : nks, xk, ngk, igk_k
|
||||
USE lsda_mod, ONLY : current_spin, lsda, isk
|
||||
USE wvfct, ONLY : npwx, nbnd, wg, et
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE uspp_param, ONLY : upf, lmaxkb, nh, nhm
|
||||
USE uspp, ONLY : nkb, vkb, deeq_d
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE noncollin_module, ONLY : noncolin, npol, lspinorb
|
||||
USE mp_pools, ONLY : me_pool, root_pool
|
||||
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, root_bgrp
|
||||
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, &
|
||||
bec_type, becp, calbec
|
||||
USE mp, ONLY : mp_sum, mp_get_comm_null, &
|
||||
mp_circular_shift_left
|
||||
USE wavefunctions_gpum, ONLY : using_evc, using_evc_d, evc_d
|
||||
USE wvfct_gpum, ONLY : using_et
|
||||
USE becmod_gpum, ONLY : becp_d, bec_type_d
|
||||
USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, &
|
||||
calbec_gpu
|
||||
USE uspp_init, ONLY : init_us_2, gen_us_dj_gpu, gen_us_dy_gpu
|
||||
#if defined(__CUDA)
|
||||
USE device_fbuff_m, ONLY : dev_buf
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ik
|
||||
!! k-point index
|
||||
REAL(DP), INTENT(IN) :: gk_d(npwx,3)
|
||||
!! wave function components for fixed k-point
|
||||
REAL(DP), INTENT(INOUT) :: sigmanlc(3,3)
|
||||
!! stress tensor, non-local contribution
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP), POINTER :: qm1_d(:)
|
||||
REAL(DP) :: q
|
||||
INTEGER :: npw , iu, np, ierr
|
||||
!
|
||||
INTEGER :: na1, np1, nh_np1, ijkb01, itot
|
||||
LOGICAL :: ismulti_np
|
||||
INTEGER, ALLOCATABLE :: shift(:)
|
||||
INTEGER, ALLOCATABLE :: ix_d(:,:), ityp_d(:), nh_d(:), shift_d(:)
|
||||
LOGICAL, ALLOCATABLE :: is_multinp_d(:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: gk_d, qm1_d, is_multinp_d, ix_d, shift_d, &
|
||||
ityp_d, nh_d
|
||||
!
|
||||
CALL using_evc_d(0)
|
||||
CALL using_evc(0)
|
||||
!
|
||||
IF ( nkb == 0 ) RETURN
|
||||
!
|
||||
IF ( lsda ) current_spin = isk(ik)
|
||||
npw = ngk(ik)
|
||||
IF ( nks > 1 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. )
|
||||
!
|
||||
CALL allocate_bec_type( nkb, nbnd, becp, intra_bgrp_comm )
|
||||
CALL using_becp_auto(2)
|
||||
!
|
||||
CALL using_evc_d(0)
|
||||
CALL using_becp_d_auto(2)
|
||||
!
|
||||
!$acc data present(vkb(:,:))
|
||||
!$acc host_data use_device(vkb)
|
||||
CALL calbec_gpu( npw, vkb, evc_d, becp_d )
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
!
|
||||
CALL dev_buf%lock_buffer( qm1_d, npwx, ierr )
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO iu = 1, npw
|
||||
q = SQRT( gk_d(iu,1)**2 + gk_d(iu,2)**2 + gk_d(iu,3)**2 )
|
||||
IF ( q > eps8 ) THEN
|
||||
qm1_d(iu) = 1._DP / q
|
||||
ELSE
|
||||
qm1_d(iu) = 0._DP
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
!----------define index arrays (type, atom, etc.) for cuf kernel loops------
|
||||
ALLOCATE( is_multinp_d(nat*nhm) )
|
||||
ALLOCATE( ix_d(nat*nhm,4), ityp_d(nat), nh_d(ntyp) )
|
||||
ityp_d = ityp ; nh_d = nh
|
||||
!
|
||||
ALLOCATE( shift(nat) )
|
||||
ijkb01 = 0
|
||||
DO iu = 1, ntyp
|
||||
DO na1 = 1, nat
|
||||
IF (ityp(na1) == iu ) THEN
|
||||
shift(na1) = ijkb01
|
||||
ijkb01 = ijkb01 + nh(iu)
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ALLOCATE( shift_d(nat) )
|
||||
shift_d = shift
|
||||
!
|
||||
ijkb01 = 0
|
||||
itot=0
|
||||
DO np = 1, ntyp
|
||||
DO na1 = 1, nat
|
||||
np1 = ityp(na1)
|
||||
IF (np /= np1) CYCLE
|
||||
ijkb01 = shift(na1)
|
||||
nh_np1 = nh(np1)
|
||||
ismulti_np = upf(np1)%tvanp .OR. upf(np1)%is_multiproj
|
||||
IF ( .NOT. ismulti_np .AND. noncolin .AND. lspinorb ) &
|
||||
CALL errore('stres_us','wrong case',1)
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO iu = itot+1, itot+nh_np1
|
||||
ix_d(iu,1) = na1 !na
|
||||
ix_d(iu,2) = iu-itot !ih
|
||||
ix_d(iu,3) = nh_np1 !nh(np)
|
||||
ix_d(iu,4) = ijkb01 !ishift
|
||||
is_multinp_d(iu) = ismulti_np
|
||||
ENDDO
|
||||
itot = itot + nh_np1
|
||||
ENDDO
|
||||
ENDDO
|
||||
!--------------
|
||||
!
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
CALL stres_us_gamma_gpu()
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL stres_us_k_gpu()
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
CALL dev_buf%release_buffer( qm1_d, ierr )
|
||||
!
|
||||
DEALLOCATE( is_multinp_d )
|
||||
DEALLOCATE( ix_d, ityp_d, nh_d, shift_d )
|
||||
DEALLOCATE( shift )
|
||||
!
|
||||
CALL deallocate_bec_type( becp )
|
||||
CALL using_becp_auto(2)
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE stres_us_gamma_gpu()
|
||||
!-----------------------------------------------------------------------
|
||||
!! nonlocal contribution to the stress - gamma version
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: na, np, nt, ibnd, ipol, jpol, l, i, ikb, &
|
||||
jkb, ih, jh, ibnd_loc,ijkb0,nh_np, nproc, &
|
||||
nbnd_loc, nbnd_begin, icyc, ishift, nhmx
|
||||
REAL(DP) :: dot11, dot21, dot31, dot22, dot32, dot33, &
|
||||
qm1i, gk1, gk2, gk3, wg_nk, fac, evps, aux
|
||||
COMPLEX(DP) :: worksum, cv, wsum1, wsum2, wsum3, ps, evci
|
||||
!
|
||||
COMPLEX(DP), POINTER :: ps_d(:), deff_d(:,:,:), dvkb_d(:,:,:)
|
||||
REAL(DP), POINTER :: becpr_d(:,:)
|
||||
!
|
||||
REAL(DP) :: xyz(3,3)
|
||||
INTEGER :: ierrs(3)
|
||||
!
|
||||
! xyz are the three unit vectors in the x,y,z directions
|
||||
DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, 0._DP, &
|
||||
1._DP /
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: deff_d, becpr_d, ps_d, dvkb_d
|
||||
#endif
|
||||
!
|
||||
CALL using_becp_auto(0)
|
||||
!
|
||||
IF( becp%comm /= mp_get_comm_null() ) THEN
|
||||
nproc = becp%nproc
|
||||
nbnd_loc = becp%nbnd_loc
|
||||
nbnd_begin = becp%ibnd_begin
|
||||
IF( ( nbnd_begin + nbnd_loc - 1 ) > nbnd ) nbnd_loc = nbnd - nbnd_begin + 1
|
||||
ELSE
|
||||
nproc = 1
|
||||
nbnd_loc = nbnd
|
||||
nbnd_begin = 1
|
||||
ENDIF
|
||||
!
|
||||
CALL dev_buf%lock_buffer( deff_d, (/ nhm,nhm,nat /), ierrs(1) )
|
||||
!
|
||||
! ... diagonal contribution - if the result from "calbec" are not
|
||||
! ... distributed, must be calculated on a single processor
|
||||
!
|
||||
! ... for the moment when using_gpu is true becp is always fully present in all processors
|
||||
!
|
||||
CALL using_et(0) ! compute_deff : intent(in)
|
||||
!
|
||||
CALL dev_buf%lock_buffer( ps_d, nkb, ierrs(2) )
|
||||
IF (ANY(ierrs(1:2) /= 0)) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs)) )
|
||||
!
|
||||
becpr_d => becp_d%r_d
|
||||
!
|
||||
evps = 0._DP
|
||||
!
|
||||
compute_evps: IF ( .NOT. (nproc == 1 .AND. me_pool /= root_pool) ) THEN
|
||||
!
|
||||
DO ibnd_loc = 1, nbnd_loc
|
||||
ibnd = ibnd_loc + becp%ibnd_begin - 1
|
||||
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
|
||||
wg_nk = wg(ibnd,ik)
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO i = 1, itot
|
||||
!
|
||||
ih = ix_d(i,2) ; na = ix_d(i,1)
|
||||
ishift = ix_d(i,4) ; ikb = ishift + ih
|
||||
!
|
||||
IF (.NOT. is_multinp_d(i)) THEN
|
||||
aux = wg_nk * DBLE(deff_d(ih,ih,na)) * &
|
||||
ABS(becpr_d(ikb,ibnd_loc))**2
|
||||
ELSE
|
||||
nh_np = ix_d(i,3)
|
||||
!
|
||||
aux = wg_nk * DBLE(deff_d(ih,ih,na)) &
|
||||
* ABS(becpr_d(ikb,ibnd_loc))**2 &
|
||||
+ becpr_d(ikb,ibnd_loc)* wg_nk * 2._DP &
|
||||
* SUM( DBLE(deff_d(ih,ih+1:nh_np,na)) &
|
||||
* becpr_d(ishift+ih+1:ishift+nh_np,ibnd_loc))
|
||||
ENDIF
|
||||
evps = evps + aux
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF compute_evps
|
||||
!
|
||||
!
|
||||
! ... non diagonal contribution - derivative of the bessel function
|
||||
!------------------------------------
|
||||
CALL dev_buf%lock_buffer( dvkb_d, (/ npwx,nkb,4 /), ierrs(3) )
|
||||
IF (ierrs(3) /= 0) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(ierrs(3)))
|
||||
!
|
||||
CALL gen_us_dj_gpu( ik, dvkb_d(:,:,4) )
|
||||
IF ( lmaxkb > 0 ) THEN
|
||||
DO ipol = 1, 3
|
||||
CALL gen_us_dy_gpu( ik, xyz(1,ipol), dvkb_d(:,:,ipol))
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
!
|
||||
DO icyc = 0, nproc -1
|
||||
!
|
||||
DO ibnd_loc = 1, nbnd_loc
|
||||
!
|
||||
ibnd = ibnd_loc + becp%ibnd_begin - 1
|
||||
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO i = 1, itot
|
||||
!
|
||||
ih = ix_d(i,2) ; na = ix_d(i,1)
|
||||
ishift = ix_d(i,4) ; ikb = ishift + ih
|
||||
!
|
||||
IF (.NOT. is_multinp_d(i)) THEN
|
||||
ps_d(ikb) = deff_d(ih,ih,na) * CMPLX(becpr_d(ikb,ibnd_loc))
|
||||
ELSE
|
||||
nh_np = ix_d(i,3)
|
||||
!
|
||||
ps_d(ikb) = CMPLX( SUM( becpr_d(ishift+1:ishift+nh_np,ibnd_loc) &
|
||||
* DBLE(deff_d(ih,1:nh_np,na))))
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP
|
||||
dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP
|
||||
!
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO na =1, nat
|
||||
DO i = 1, npw
|
||||
worksum = (0._DP,0._DP)
|
||||
np = ityp_d(na)
|
||||
ijkb0 = shift_d(na)
|
||||
nh_np = nh_d(np)
|
||||
DO ih = 1, nh_np
|
||||
ikb = ijkb0 + ih
|
||||
worksum = worksum + ps_d(ikb) * dvkb_d(i,ikb,4)
|
||||
ENDDO
|
||||
evci = evc_d(i,ibnd)
|
||||
gk1 = gk_d(i,1)
|
||||
gk2 = gk_d(i,2)
|
||||
gk3 = gk_d(i,3)
|
||||
qm1i = qm1_d(i)
|
||||
!
|
||||
cv = evci * CMPLX(gk1 * gk1 * qm1i)
|
||||
dot11 = dot11 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX(gk2 * gk1 * qm1i )
|
||||
dot21 = dot21 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX(gk3 * gk1 * qm1i)
|
||||
dot31 = dot31 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX(gk2 * gk2 * qm1i)
|
||||
dot22 = dot22 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX(gk3 * gk2 * qm1i)
|
||||
dot32 = dot32 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX(gk3 * gk3 * qm1i)
|
||||
dot33 = dot33 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
ENDDO
|
||||
ENDDO
|
||||
! ... a factor 2 accounts for the other half of the G-vector sphere
|
||||
sigmanlc(:,1) = sigmanlc(:,1) - 4._DP * wg(ibnd,ik) * [dot11, dot21, dot31]
|
||||
sigmanlc(:,2) = sigmanlc(:,2) - 4._DP * wg(ibnd,ik) * [0._DP, dot22, dot32]
|
||||
sigmanlc(:,3) = sigmanlc(:,3) - 4._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33]
|
||||
|
||||
!
|
||||
! ... non diagonal contribution - derivative of the spherical harmonics
|
||||
! ... (no contribution from l=0)
|
||||
!
|
||||
IF ( lmaxkb == 0 ) CYCLE
|
||||
!
|
||||
!------------------------------------
|
||||
!
|
||||
dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP
|
||||
dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP
|
||||
!
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO ikb = 1, nkb
|
||||
DO i = 1, npw
|
||||
!
|
||||
wsum1 = ps_d(ikb)*dvkb_d(i,ikb,1)
|
||||
wsum2 = ps_d(ikb)*dvkb_d(i,ikb,2)
|
||||
wsum3 = ps_d(ikb)*dvkb_d(i,ikb,3)
|
||||
!
|
||||
evci = evc_d(i,ibnd)
|
||||
gk1 = gk_d(i,1)
|
||||
gk2 = gk_d(i,2)
|
||||
gk3 = gk_d(i,3)
|
||||
!
|
||||
cv = evci * CMPLX(gk1 )
|
||||
dot11 = dot11 + DBLE( wsum1)* DBLE(cv) + DIMAG(wsum1)*DIMAG(cv)
|
||||
dot21 = dot21 + DBLE( wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv)
|
||||
dot31 = dot31 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX( gk2)
|
||||
dot22 = dot22 + DBLE( wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv)
|
||||
dot32 = dot32 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * CMPLX( gk3 )
|
||||
dot33 = dot33 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
! ... a factor 2 accounts for the other half of the G-vector sphere
|
||||
!
|
||||
sigmanlc(:,1) = sigmanlc(:,1) -4._DP * wg(ibnd, ik) * [dot11, dot21, dot31]
|
||||
sigmanlc(:,2) = sigmanlc(:,2) -4._DP * wg(ibnd, ik) * [0._DP, dot22, dot32]
|
||||
sigmanlc(:,3) = sigmanlc(:,3) -4._DP * wg(ibnd, ik) * [0._DP, 0._DP, dot33]
|
||||
IF ( nproc > 1 ) THEN
|
||||
CALL errore ('stres_us_gamma_gpu line 303', &
|
||||
'unexpected error nproc be 1 with GPU acceleration', 100)
|
||||
!CALL mp_circular_shift_left(becp%r, icyc, becp%comm)
|
||||
!CALL mp_circular_shift_left(becp%ibnd_begin, icyc, becp%comm)
|
||||
!CALL mp_circular_shift_left(nbnd_loc, icyc, becp%comm)
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
10 CONTINUE
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmanlc(l,l) = sigmanlc(l,l) - evps
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
CALL dev_buf%release_buffer( deff_d, ierrs(1) )
|
||||
CALL dev_buf%release_buffer( ps_d, ierrs(2) )
|
||||
CALL dev_buf%release_buffer( dvkb_d, ierrs(3) )
|
||||
!
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_us_gamma_gpu
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE stres_us_k_gpu()
|
||||
!----------------------------------------------------------------------
|
||||
!! nonlocal contribution to the stress - k-points version
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: na, np, ibnd, ipol, jpol, l, i, nt, &
|
||||
ikb, jkb, ih, jh, is, js, ijs, ishift, nh_np
|
||||
REAL(DP) :: fac, evps, dot11, dot21, dot31, dot22, dot32, dot33, aux
|
||||
COMPLEX(DP) :: qm1i, gk1, gk2, gk3, ps, ps_nc(2)
|
||||
COMPLEX(DP) :: cv, cv1, cv2, worksum, worksum1, worksum2, evci, evc1i, &
|
||||
evc2i, ps1, ps2, ps1d1, ps1d2, ps1d3, ps2d1, ps2d2, &
|
||||
ps2d3, psd1, psd2, psd3
|
||||
!
|
||||
COMPLEX(DP), POINTER :: dvkb_d(:,:,:)
|
||||
COMPLEX(DP), POINTER :: deff_d(:,:,:), deff_nc_d(:,:,:,:)
|
||||
COMPLEX(DP), POINTER :: ps_d(:), ps_nc_d(:,:)
|
||||
COMPLEX(DP), POINTER :: becpk_d(:,:), becpnc_d(:,:,:)
|
||||
INTEGER :: nhmx, ierrs(3)
|
||||
!
|
||||
REAL(DP) :: xyz(3,3)
|
||||
! xyz are the three unit vectors in the x,y,z directions
|
||||
DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, &
|
||||
0._DP, 1._DP /
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
ATTRIBUTES(DEVICE) :: ps_d, ps_nc_d, becpnc_d, becpk_d, dvkb_d, &
|
||||
deff_d, deff_nc_d
|
||||
#endif
|
||||
!
|
||||
!
|
||||
!
|
||||
evps = 0._DP
|
||||
! ... diagonal contribution
|
||||
!
|
||||
CALL dev_buf%lock_buffer( dvkb_d, (/ npwx,nkb,4 /), ierrs(1) )
|
||||
!
|
||||
CALL gen_us_dj_gpu( ik, dvkb_d(:,:,4) )
|
||||
IF ( lmaxkb > 0 ) THEN
|
||||
DO ipol = 1, 3
|
||||
CALL gen_us_dy_gpu( ik, xyz(1,ipol), dvkb_d(:,:,ipol) )
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
CALL dev_buf%lock_buffer( ps_nc_d, (/ nkb,npol /), ierrs(2) )
|
||||
CALL dev_buf%lock_buffer( deff_nc_d, (/ nhm,nhm,nat,nspin /), ierrs(3) )
|
||||
becpnc_d => becp_d%nc_d
|
||||
ELSE
|
||||
CALL dev_buf%lock_buffer ( ps_d, nkb, ierrs(2) )
|
||||
CALL dev_buf%lock_buffer( deff_d, (/ nhm,nhm,nat /), ierrs(3) )
|
||||
becpk_d => becp_d%k_d
|
||||
ENDIF
|
||||
IF (ANY(ierrs /= 0)) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs)) )
|
||||
!
|
||||
CALL using_et(0)
|
||||
!
|
||||
! ... the contribution is calculated only on one processor because
|
||||
! ... partial results are later summed over all processors
|
||||
!
|
||||
IF ( me_bgrp /= root_bgrp ) GO TO 100
|
||||
!
|
||||
!
|
||||
DO ibnd = 1, nbnd
|
||||
fac = wg(ibnd,ik)
|
||||
IF (ABS(fac) < 1.d-9) CYCLE
|
||||
IF (noncolin) THEN
|
||||
CALL compute_deff_nc_gpu( deff_nc_d , et(ibnd,ik) )
|
||||
ELSE
|
||||
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
|
||||
ENDIF
|
||||
!
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO i = 1, itot
|
||||
!
|
||||
ih = ix_d(i,2) ; na = ix_d(i,1)
|
||||
ishift = ix_d(i,4) ; ikb = ishift + ih
|
||||
aux = 0.d0
|
||||
!
|
||||
IF (.NOT. is_multinp_d(i)) THEN
|
||||
!
|
||||
ijs = 0
|
||||
DO is = 1, npol
|
||||
DO js = 1, npol
|
||||
ijs = ijs + 1
|
||||
aux = aux + fac * DBLE(deff_nc_d(ih,ih,na,ijs) &
|
||||
* CONJG(becpnc_d(ikb,is,ibnd)) * &
|
||||
becpnc_d(ikb,js,ibnd))
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
nh_np = ix_d(i,3)
|
||||
!
|
||||
ijs = 0
|
||||
DO is = 1, npol
|
||||
DO js = 1, npol
|
||||
ijs = ijs + 1
|
||||
aux = aux + fac * DBLE(deff_nc_d(ih,ih,na,ijs) &
|
||||
* CONJG(becpnc_d(ikb,is,ibnd))* &
|
||||
becpnc_d(ikb,js,ibnd))
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DO jh = ih+1, nh_np
|
||||
jkb = ishift + jh
|
||||
ijs = 0
|
||||
DO is = 1, npol
|
||||
DO js = 1, npol
|
||||
ijs = ijs + 1
|
||||
aux = aux + 2._DP*fac * &
|
||||
DBLE( deff_nc_d(ih,jh,na,ijs) * &
|
||||
(CONJG(becpnc_d(ikb,is,ibnd)) * &
|
||||
becpnc_d(jkb,js,ibnd)) )
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
evps = evps + aux
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
aux = 0.d0
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO i = 1, itot
|
||||
!
|
||||
ih = ix_d(i,2) ; na = ix_d(i,1)
|
||||
ishift = ix_d(i,4) ; ikb = ishift + ih
|
||||
!
|
||||
IF (.NOT. is_multinp_d(i)) THEN
|
||||
!
|
||||
aux = fac * deff_d(ih,ih,na) * &
|
||||
ABS(becpk_d(ikb,ibnd) )**2
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
nh_np = ix_d(i,3)
|
||||
!
|
||||
aux = fac * DBLE(deff_d(ih,ih,na) * &
|
||||
ABS(becpk_d(ikb,ibnd) )**2) + &
|
||||
DBLE(SUM( deff_d(ih,ih+1:nh_np,na) * &
|
||||
fac * 2._DP*DBLE( CONJG(becpk_d(ikb,ibnd)) &
|
||||
* becpk_d(ishift+ih+1:ishift+nh_np,ibnd) ) ))
|
||||
!
|
||||
ENDIF
|
||||
evps = evps + aux
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
DO l = 1, 3
|
||||
sigmanlc(l,l) = sigmanlc(l,l) - evps
|
||||
ENDDO
|
||||
!
|
||||
100 CONTINUE
|
||||
!
|
||||
! ... non diagonal contribution - derivative of the bessel function
|
||||
!
|
||||
DO ibnd = 1, nbnd
|
||||
!
|
||||
IF ( noncolin ) THEN
|
||||
!
|
||||
CALL compute_deff_nc_gpu( deff_nc_d, et(ibnd,ik) )
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO i = 1, itot
|
||||
!
|
||||
ih = ix_d(i,2) ; na = ix_d(i,1)
|
||||
ishift = ix_d(i,4) ; ikb = ishift + ih
|
||||
!
|
||||
IF (.NOT. is_multinp_d(i)) THEN
|
||||
!
|
||||
DO is = 1, npol
|
||||
ijs = (is-1)*npol
|
||||
ps_nc_d(ikb,is) = SUM( becpnc_d(ikb,1:npol,ibnd) * &
|
||||
deff_nc_d(ih,ih,na,ijs+1:ijs+npol) )
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
nh_np = ix_d(i,3)
|
||||
!
|
||||
DO is = 1, npol
|
||||
ijs = (is-1)*npol
|
||||
ps_nc_d(ikb,is) = SUM( becpnc_d(ishift+1:ishift+nh_np,1:npol,ibnd) * &
|
||||
deff_nc_d(ih,1:nh_np,na,ijs+1:ijs+npol) )
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL compute_deff_gpu( deff_d, et(ibnd,ik) )
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO i = 1, itot
|
||||
!
|
||||
ih = ix_d(i,2) ; na = ix_d(i,1)
|
||||
ishift = ix_d(i,4) ; ikb = ishift + ih
|
||||
!
|
||||
IF (.NOT. is_multinp_d(i)) THEN
|
||||
ps_d(ikb) = CMPLX(deeq_d(ih,ih,na,current_spin)) * &
|
||||
becpk_d(ikb,ibnd)
|
||||
ELSE
|
||||
nh_np = ix_d(i,3)
|
||||
!
|
||||
ps_d(ikb) = SUM( becpk_d(ishift+1:ishift+nh_np,ibnd) * &
|
||||
deff_d(ih,1:nh_np,na) )
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP
|
||||
dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP
|
||||
!
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO ikb =1, nkb
|
||||
DO i = 1, npw
|
||||
evc1i = evc_d(i, ibnd)
|
||||
evc2i = evc_d(i+npwx,ibnd)
|
||||
qm1i = CMPLX(qm1_d(i))
|
||||
gk1 = CMPLX(gk_d(i,1))
|
||||
gk2 = CMPLX(gk_d(i,2))
|
||||
gk3 = CMPLX(gk_d(i,3))
|
||||
worksum1 = ps_nc_d(ikb,1) * dvkb_d(i,ikb,4)
|
||||
worksum2 = ps_nc_d(ikb,2) * dvkb_d(i,ikb,4)
|
||||
!
|
||||
cv1 = evc1i * gk1 * gk1 * qm1i
|
||||
cv2 = evc2i * gk1 * gk1 * qm1i
|
||||
dot11 = dot11 + DBLE(worksum1)*DBLE(cv1) + &
|
||||
DIMAG(worksum1)*DIMAG(cv1) + &
|
||||
DBLE(worksum2)*DBLE(cv2) + &
|
||||
DIMAG(worksum2)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk2 * gk1 * qm1i
|
||||
cv2 = evc2i * gk2 * gk1 * qm1i
|
||||
dot21 = dot21 + DBLE(worksum1)*DBLE(cv1) + &
|
||||
DIMAG(worksum1)*DIMAG(cv1) + &
|
||||
DBLE(worksum2)*DBLE(cv2) + &
|
||||
DIMAG(worksum2)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk3 * gk1 * qm1i
|
||||
cv2 = evc2i * gk3 * gk1 * qm1i
|
||||
dot31 = dot31 + DBLE(worksum1)*DBLE(cv1) + &
|
||||
DIMAG(worksum1)*DIMAG(cv1) + &
|
||||
DBLE(worksum2)*DBLE(cv2) + &
|
||||
DIMAG(worksum2)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk2 * gk2 * qm1i
|
||||
cv2 = evc2i * gk2 * gk2 * qm1i
|
||||
dot22 = dot22 + DBLE(worksum1)*DBLE(cv1) + &
|
||||
DIMAG(worksum1)*DIMAG(cv1) + &
|
||||
DBLE(worksum2)*DBLE(cv2) + &
|
||||
DIMAG(worksum2)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk3 * gk2 * qm1i
|
||||
cv2 = evc2i * gk3 * gk2 * qm1i
|
||||
dot32 = dot32 + DBLE(worksum1)*DBLE(cv1) + &
|
||||
DIMAG(worksum1)*DIMAG(cv1) + &
|
||||
DBLE(worksum2)*DBLE(cv2) + &
|
||||
DIMAG(worksum2)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk3 * gk3 * qm1i
|
||||
cv2 = evc2i * gk3 * gk3 * qm1i
|
||||
dot33 = dot33 + DBLE(worksum1)*DBLE(cv1) + &
|
||||
DIMAG(worksum1)*DIMAG(cv1) + &
|
||||
DBLE(worksum2)*DBLE(cv2) + &
|
||||
DIMAG(worksum2)*DIMAG(cv2)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO ikb = 1, nkb
|
||||
DO i = 1, npw
|
||||
!
|
||||
worksum = ps_d(ikb) *dvkb_d(i,ikb,4)
|
||||
!
|
||||
evci = evc_d(i,ibnd)
|
||||
qm1i = CMPLX(qm1_d(i))
|
||||
gk1 = CMPLX(gk_d(i,1))
|
||||
gk2 = CMPLX(gk_d(i,2))
|
||||
gk3 = CMPLX(gk_d(i,3))
|
||||
!
|
||||
cv = evci * gk1 * gk1 * qm1i
|
||||
dot11 = dot11 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk2 * gk1 * qm1i
|
||||
dot21 = dot21 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk3 * gk1 * qm1i
|
||||
dot31 = dot31 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk2 * gk2 * qm1i
|
||||
dot22 = dot22 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk3 * gk2 * qm1i
|
||||
dot32 = dot32 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk3 * gk3 * qm1i
|
||||
dot33 = dot33 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv)
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!IF ( me_bgrp /= root_bgrp ) stop
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
sigmanlc(:,1) = sigmanlc(:,1) - 2._DP * wg(ibnd,ik) * [dot11, dot21, dot31]
|
||||
sigmanlc(:,2) = sigmanlc(:,2) - 2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32]
|
||||
sigmanlc(:,3) = sigmanlc(:,3) - 2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33]
|
||||
!
|
||||
! ... non diagonal contribution - derivative of the spherical harmonics
|
||||
! ... (no contribution from l=0)
|
||||
!
|
||||
IF ( lmaxkb == 0 ) CYCLE
|
||||
!
|
||||
dot11 = 0._DP ; dot21 = 0._DP
|
||||
dot31 = 0._DP ; dot22 = 0._DP
|
||||
dot32 = 0._DP ; dot33 = 0._DP
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
!
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO ikb =1, nkb
|
||||
DO i = 1, npw
|
||||
!
|
||||
gk1 = CMPLX(gk_d(i,1))
|
||||
gk2 = CMPLX(gk_d(i,2))
|
||||
gk3 = CMPLX(gk_d(i,3))
|
||||
!
|
||||
ps1 = ps_nc_d(ikb,1)
|
||||
ps2 = ps_nc_d(ikb,2)
|
||||
!
|
||||
ps1d1 = ps1 * dvkb_d(i,ikb,1)
|
||||
ps1d2 = ps1 * dvkb_d(i,ikb,2)
|
||||
ps1d3 = ps1 * dvkb_d(i,ikb,3)
|
||||
!
|
||||
ps2d1 = ps2 * dvkb_d(i,ikb,1)
|
||||
ps2d2 = ps2 * dvkb_d(i,ikb,2)
|
||||
ps2d3 = ps2 * dvkb_d(i,ikb,3)
|
||||
!
|
||||
evc1i = evc_d(i,ibnd)
|
||||
evc2i = evc_d(i+npwx,ibnd)
|
||||
!
|
||||
cv1 = evc1i * gk1
|
||||
cv2 = evc2i * gk1
|
||||
dot11 = dot11 + DBLE(ps1d1)*DBLE(cv1) + &
|
||||
DIMAG(ps1d1)*DIMAG(cv1) + &
|
||||
DBLE(ps2d1)*DBLE(cv2) + &
|
||||
DIMAG(ps2d1)*DIMAG(cv2)
|
||||
!
|
||||
dot21 = dot21 + DBLE(ps1d2)*DBLE(cv1) + &
|
||||
DIMAG(ps1d2)*DIMAG(cv1) + &
|
||||
DBLE(ps2d2)*DBLE(cv2) + &
|
||||
DIMAG(ps2d2)*DIMAG(cv2)
|
||||
!
|
||||
dot31 = dot31 + DBLE(ps1d3)*DBLE(cv1) + &
|
||||
DIMAG(ps1d3)*DIMAG(cv1) + &
|
||||
DBLE(ps2d3)*DBLE(cv2) + &
|
||||
DIMAG(ps2d3)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk2
|
||||
cv2 = evc2i * gk2
|
||||
dot22 = dot22 + DBLE(ps1d2)*DBLE(cv1) + &
|
||||
DIMAG(ps1d2)*DIMAG(cv1) + &
|
||||
DBLE(ps2d2)*DBLE(cv2) + &
|
||||
DIMAG(ps2d2)*DIMAG(cv2)
|
||||
!
|
||||
dot32 = dot32 + DBLE(ps1d3)*DBLE(cv1) + &
|
||||
DIMAG(ps1d3)*DIMAG(cv1) + &
|
||||
DBLE(ps2d3)*DBLE(cv2) + &
|
||||
DIMAG(ps2d3)*DIMAG(cv2)
|
||||
!
|
||||
cv1 = evc1i * gk3
|
||||
cv2 = evc2i * gk3
|
||||
dot33 = dot33 + DBLE(ps1d3)*DBLE(cv1) + &
|
||||
DIMAG(ps1d3)*DIMAG(cv1) + &
|
||||
DBLE(ps2d3)*DBLE(cv2) + &
|
||||
DIMAG(ps2d3)*DIMAG(cv2)
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO ikb = 1, nkb
|
||||
DO i = 1, npw
|
||||
ps = ps_d(ikb)
|
||||
psd1 = ps*dvkb_d(i,ikb,1)
|
||||
psd2 = ps*dvkb_d(i,ikb,2)
|
||||
psd3 = ps*dvkb_d(i,ikb,3)
|
||||
evci = evc_d(i,ibnd)
|
||||
gk1 = CMPLX(gk_d(i,1))
|
||||
gk2 = CMPLX(gk_d(i,2))
|
||||
gk3 = CMPLX(gk_d(i,3))
|
||||
!
|
||||
cv = evci * gk1
|
||||
dot11 = dot11 + DBLE(psd1)*DBLE(cv) + DIMAG(psd1)*DIMAG(cv)
|
||||
dot21 = dot21 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv)
|
||||
dot31 = dot31 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk2
|
||||
dot22 = dot22 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv)
|
||||
dot32 = dot32 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv)
|
||||
!
|
||||
cv = evci * gk3
|
||||
dot33 = dot33 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
sigmanlc(:,1) = sigmanlc(:,1) -2._DP * wg(ibnd,ik) * [dot11, dot21, dot31]
|
||||
sigmanlc(:,2) = sigmanlc(:,2) -2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32]
|
||||
sigmanlc(:,3) = sigmanlc(:,3) -2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33]
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
10 CONTINUE
|
||||
!
|
||||
CALL dev_buf%release_buffer( dvkb_d, ierrs(1) )
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
CALL dev_buf%release_buffer( ps_nc_d, ierrs(2) )
|
||||
CALL dev_buf%release_buffer( deff_nc_d, ierrs(3) )
|
||||
ELSE
|
||||
CALL dev_buf%release_buffer( ps_d, ierrs(2) )
|
||||
CALL dev_buf%release_buffer( deff_d, ierrs(3) )
|
||||
ENDIF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE stres_us_k_gpu
|
||||
#endif
|
||||
!
|
||||
END SUBROUTINE stres_us_gpu
|
|
@ -17,12 +17,13 @@ SUBROUTINE stress( sigma )
|
|||
USE ions_base, ONLY : nat, ntyp => nsp, ityp, tau, zv, atm
|
||||
USE constants, ONLY : ry_kbar
|
||||
USE ener, ONLY : etxc, vtxc
|
||||
USE gvect, ONLY : ngm, gstart, g, gg, gcutm
|
||||
USE gvect, ONLY : ngm, gstart, g, gg, gcutm, gl, gl_d
|
||||
USE fft_base, ONLY : dfftp
|
||||
USE ldaU, ONLY : lda_plus_u, Hubbard_projectors
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE scf, ONLY : rho, rho_core, rhog_core
|
||||
USE control_flags, ONLY : iverbosity, gamma_only, llondon, ldftd3, lxdm, ts_vdw, mbd_vdw, use_gpu
|
||||
USE control_flags, ONLY : iverbosity, gamma_only, llondon, ldftd3, lxdm, &
|
||||
ts_vdw, mbd_vdw
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
USE symme, ONLY : symmatrix
|
||||
USE bp, ONLY : lelfield
|
||||
|
@ -36,8 +37,7 @@ SUBROUTINE stress( sigma )
|
|||
USE libmbd_interface, ONLY : HmbdvdW
|
||||
USE rism_module, ONLY : lrism, stres_rism
|
||||
USE esm, ONLY : do_comp_esm, esm_bc ! for ESM stress
|
||||
USE esm, ONLY : esm_stres_har, esm_stres_ewa, esm_stres_loclong ! for ESM stress
|
||||
USE gvect, ONLY : g_d, gg_d
|
||||
USE esm, ONLY : esm_stres_har, esm_stres_ewa, esm_stres_loclong
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -55,7 +55,7 @@ SUBROUTINE stress( sigma )
|
|||
REAL(DP) :: sigmasol(3,3) ! for RISM stress
|
||||
INTEGER :: l, m
|
||||
!
|
||||
! Auxiliary variables for Grimme-D3
|
||||
! ... Auxiliary variables for Grimme-D3
|
||||
!
|
||||
INTEGER :: atnum(1:nat)
|
||||
REAL(DP) :: latvecs(3,3)
|
||||
|
@ -70,10 +70,14 @@ SUBROUTINE stress( sigma )
|
|||
!
|
||||
CALL start_clock( 'stress' )
|
||||
!
|
||||
! contribution from local potential
|
||||
!$acc update device( g, gg )
|
||||
#if defined(__CUDA)
|
||||
gl_d = gl
|
||||
#endif
|
||||
!
|
||||
IF (.NOT. use_gpu) CALL stres_loc( sigmaloc )
|
||||
IF ( use_gpu) CALL stres_loc_gpu( sigmaloc )
|
||||
! ... contribution from local potential
|
||||
!
|
||||
CALL stres_loc( sigmaloc )
|
||||
!
|
||||
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN
|
||||
! In ESM, sigmaloc has only short-range term: add long-range term
|
||||
|
@ -81,51 +85,50 @@ SUBROUTINE stress( sigma )
|
|||
sigmaloc(:,:) = sigmaloc(:,:) + sigmaloclong(:,:)
|
||||
END IF
|
||||
!
|
||||
! hartree contribution
|
||||
! ... Hartree contribution
|
||||
!
|
||||
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
|
||||
CALL esm_stres_har( sigmahar, rho%of_g(:,1) )
|
||||
ELSE
|
||||
IF (.NOT. use_gpu) CALL stres_har( sigmahar )
|
||||
IF ( use_gpu) CALL stres_har_gpu( sigmahar )
|
||||
END IF
|
||||
CALL stres_har( sigmahar )
|
||||
ENDIF
|
||||
!
|
||||
! xc contribution (diagonal)
|
||||
! ... XC contribution (diagonal)
|
||||
!
|
||||
sigmaxc(:,:) = 0.d0
|
||||
DO l = 1, 3
|
||||
sigmaxc (l, l) = - (etxc - vtxc) / omega
|
||||
sigmaxc(l,l) = - (etxc - vtxc) / omega
|
||||
ENDDO
|
||||
!
|
||||
! xc contribution: add gradient corrections (non diagonal)
|
||||
! ... XC contribution: add gradient corrections (non diagonal)
|
||||
!
|
||||
CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, rho%kin_r, &
|
||||
nspin, dfftp, g, alat, omega, sigmaxc )
|
||||
IF (.NOT.xclib_dft_is('meta')) THEN
|
||||
CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, &
|
||||
nspin, dfftp, g, alat, omega, sigmaxc )
|
||||
ELSE
|
||||
CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, &
|
||||
nspin, dfftp, g, alat, omega, sigmaxc, rho%kin_r )
|
||||
ENDIF
|
||||
!
|
||||
! meta-GGA contribution
|
||||
! ... meta-GGA contribution
|
||||
!
|
||||
IF (.NOT. use_gpu) CALL stres_mgga( sigmaxc )
|
||||
IF ( use_gpu) CALL stres_mgga_gpu( sigmaxc )
|
||||
CALL stres_mgga( sigmaxc )
|
||||
!
|
||||
! core correction contribution
|
||||
! ... core correction contribution
|
||||
!
|
||||
IF (.NOT. use_gpu) CALL stres_cc( sigmaxcc )
|
||||
IF ( use_gpu) CALL stres_cc_gpu( sigmaxcc )
|
||||
CALL stres_cc( sigmaxcc )
|
||||
!
|
||||
! ewald contribution
|
||||
! ... Ewald contribution
|
||||
!
|
||||
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
|
||||
CALL esm_stres_ewa( sigmaewa )
|
||||
ELSE
|
||||
IF (.NOT. use_gpu) CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, &
|
||||
bg, tau, omega, g, gg, ngm, gstart, &
|
||||
gamma_only, gcutm, sigmaewa )
|
||||
IF ( use_gpu) CALL stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg,&
|
||||
tau, omega, g_d,gg_d, ngm, gstart,&
|
||||
gamma_only, gcutm, sigmaewa )
|
||||
END IF
|
||||
CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, &
|
||||
tau, omega, g, gg, ngm, gstart, &
|
||||
gamma_only, gcutm, sigmaewa )
|
||||
ENDIF
|
||||
!
|
||||
! semi-empirical dispersion contribution: Grimme-D2 and D3
|
||||
! ... semi-empirical dispersion contribution: Grimme-D2 and D3
|
||||
!
|
||||
sigmad23( : , : ) = 0.d0
|
||||
IF ( llondon ) THEN
|
||||
|
@ -145,10 +148,9 @@ SUBROUTINE stress( sigma )
|
|||
CALL stop_clock('stres_dftd3')
|
||||
END IF
|
||||
!
|
||||
! kinetic + nonlocal contribuition
|
||||
! ... kinetic + nonlocal contribuition
|
||||
!
|
||||
IF (.NOT. use_gpu) CALL stres_knl( sigmanlc, sigmakin )
|
||||
IF ( use_gpu) CALL stres_knl_gpu( sigmanlc, sigmakin )
|
||||
CALL stres_knl( sigmanlc, sigmakin )
|
||||
!
|
||||
DO l = 1, 3
|
||||
DO m = 1, 3
|
||||
|
@ -156,13 +158,13 @@ SUBROUTINE stress( sigma )
|
|||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
! Hubbard contribution
|
||||
! (included by stres_knl if using beta as local projectors)
|
||||
! ... Hubbard contribution
|
||||
! (included by stres_knl if using beta as local projectors)
|
||||
!
|
||||
sigmah(:,:) = 0.d0
|
||||
IF ( lda_plus_u .AND. Hubbard_projectors /= 'pseudo' ) CALL stres_hub( sigmah )
|
||||
!
|
||||
! Electric field contribution
|
||||
! ... Electric field contribution
|
||||
!
|
||||
sigmael(:,:)=0.d0
|
||||
sigmaion(:,:)=0.d0
|
||||
|
@ -170,15 +172,15 @@ SUBROUTINE stress( sigma )
|
|||
! call stress_bp_efield (sigmael )
|
||||
! call stress_ion_efield (sigmaion )
|
||||
!
|
||||
! vdW dispersion contribution: xdm
|
||||
! ... vdW dispersion contribution: xdm
|
||||
!
|
||||
sigmaxdm = 0._dp
|
||||
IF (lxdm) sigmaxdm = stress_xdm()
|
||||
!
|
||||
! vdW dispersion contribution: Tkatchenko-Scheffler
|
||||
! ... vdW dispersion contribution: Tkatchenko-Scheffler
|
||||
!
|
||||
sigma_ts = 0.0_DP
|
||||
! vdW dispersion contribution: Many-Body Dispersion
|
||||
! ... vdW dispersion contribution: Many-Body Dispersion
|
||||
!
|
||||
sigma_mbd = 0.0_DP
|
||||
IF ( mbd_vdw ) THEN
|
||||
|
@ -187,17 +189,17 @@ SUBROUTINE stress( sigma )
|
|||
sigma_ts = -2.0_DP*alat*MATMUL( HtsvdW, TRANSPOSE(at) )/omega
|
||||
ENDIF
|
||||
!
|
||||
! DFT-non_local contribution
|
||||
! ... DFT-non_local contribution
|
||||
!
|
||||
sigma_nonloc_dft(:,:) = 0.d0
|
||||
CALL stres_nonloc_dft( rho%of_r, rho_core, nspin, sigma_nonloc_dft )
|
||||
!
|
||||
! The solvation contribution (3D-RISM)
|
||||
! ... The solvation contribution (3D-RISM)
|
||||
!
|
||||
sigmasol(:,:) = 0.d0
|
||||
IF (lrism) CALL stres_rism(sigmasol)
|
||||
!
|
||||
! SUM
|
||||
! ... Sum all terms
|
||||
!
|
||||
sigma(:,:) = sigmakin(:,:) + sigmaloc(:,:) + sigmahar(:,:) + &
|
||||
sigmaxc(:,:) + sigmaxcc(:,:) + sigmaewa(:,:) + &
|
||||
|
@ -213,12 +215,12 @@ SUBROUTINE stress( sigma )
|
|||
ELSE
|
||||
sigmaexx = 0.d0
|
||||
ENDIF
|
||||
! Resymmetrize the total stress. This should not be strictly necessary,
|
||||
! but prevents loss of symmetry in long vc-bfgs runs
|
||||
! ... Resymmetrize the total stress. This should not be strictly necessary,
|
||||
! but prevents loss of symmetry in long vc-bfgs runs
|
||||
|
||||
CALL symmatrix( sigma )
|
||||
!
|
||||
! write results in Ry/(a.u.)^3 and in kbar
|
||||
! ... write results in Ry/(a.u.)^3 and in kbar
|
||||
!
|
||||
IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
|
||||
WRITE( stdout, 9000) (sigma(1,1) + sigma(2,2)) * ry_kbar/3d0, &
|
||||
|
|
|
@ -7,19 +7,18 @@
|
|||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE usnldiag_gpu (npw, h_diag_d, s_diag_d)
|
||||
SUBROUTINE usnldiag_gpu( npw, h_diag_d, s_diag_d )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Add nonlocal pseudopotential term to diagonal part of Hamiltonian.
|
||||
!! Compute the diagonal part of the S matrix.
|
||||
!
|
||||
! add nonlocal pseudopotential term to diagonal part of Hamiltonian
|
||||
! compute the diagonal part of the S matrix.
|
||||
!
|
||||
! Routine splitted for improving performance
|
||||
! Routine splitted to improve performance
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: nat, ityp, ntyp => nsp
|
||||
USE wvfct, ONLY: npwx
|
||||
USE uspp, ONLY: ofsbeta, deeq_d, qq_at_d, qq_so_d, &
|
||||
deeq_nc_d
|
||||
USE uspp, ONLY: ofsbeta, deeq, qq_at, qq_so, &
|
||||
deeq_nc
|
||||
USE uspp_param, ONLY: upf, nh
|
||||
USE noncollin_module, ONLY: noncolin, npol, lspinorb
|
||||
USE device_memcpy_m, ONLY: dev_memset
|
||||
|
@ -54,7 +53,7 @@ CONTAINS
|
|||
|
||||
SUBROUTINE usnldiag_collinear()
|
||||
USE lsda_mod, ONLY: current_spin
|
||||
USE uspp, ONLY: deeq_d, qq_at_d, vkb
|
||||
USE uspp, ONLY: deeq, qq_at, vkb
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -72,7 +71,7 @@ CONTAINS
|
|||
IF (ityp (na) == nt) THEN
|
||||
ijkb_start = ofsbeta(na)
|
||||
nh_ = nh(nt)
|
||||
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc data present(vkb(:,:),deeq) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc parallel vector_length(32)
|
||||
!$acc loop gang reduction(+:sum_h,sum_s)
|
||||
DO ig = 1, npw
|
||||
|
@ -82,11 +81,11 @@ CONTAINS
|
|||
DO ih = 1, nh_
|
||||
DO jh = 1, nh_
|
||||
ikb = ijkb_start + ih
|
||||
cv = vkb (ig, ikb)
|
||||
cv = vkb(ig,ikb)
|
||||
jkb = ijkb_start + jh
|
||||
ar = cv*conjg(vkb (ig, jkb))
|
||||
sum_h = sum_h + dble(deeq_d (ih, jh, na, current_spin) * ar)
|
||||
sum_s = sum_s + dble(qq_at_d (ih, jh, na) * ar)
|
||||
ar = cv*conjg(vkb(ig,jkb))
|
||||
sum_h = sum_h + dble(deeq(ih,jh,na,current_spin) * ar)
|
||||
sum_s = sum_s + dble(qq_at(ih,jh,na) * ar)
|
||||
END DO
|
||||
END DO
|
||||
!$acc atomic update
|
||||
|
@ -105,7 +104,7 @@ CONTAINS
|
|||
IF (ityp (na) == nt) THEN
|
||||
ijkb_start = ofsbeta(na)
|
||||
nh_ = nh(nt)
|
||||
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc data present(vkb(:,:),deeq) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc parallel vector_length(32)
|
||||
!$acc loop gang reduction(+:sum_h,sum_s)
|
||||
DO ig = 1, npw
|
||||
|
@ -114,9 +113,9 @@ CONTAINS
|
|||
!$acc loop vector private(ikb,ar) reduction(+:sum_h,sum_s)
|
||||
DO ih = 1, nh_
|
||||
ikb = ijkb_start + ih
|
||||
ar = vkb (ig, ikb)*conjg(vkb (ig, ikb))
|
||||
sum_h = sum_h + dble(deeq_d (ih, ih, na, current_spin) * ar)
|
||||
sum_s = sum_s + dble(qq_at_d (ih, ih, na) * ar)
|
||||
ar = vkb(ig,ikb)*conjg(vkb(ig,ikb))
|
||||
sum_h = sum_h + dble(deeq(ih,ih,na,current_spin) * ar)
|
||||
sum_s = sum_s + dble(qq_at(ih,ih,na) * ar)
|
||||
END DO
|
||||
!$acc atomic update
|
||||
h_diag_d (ig,1) = h_diag_d (ig,1) + sum_h
|
||||
|
@ -135,7 +134,7 @@ CONTAINS
|
|||
!
|
||||
SUBROUTINE usnldiag_noncollinear()
|
||||
USE lsda_mod, ONLY: current_spin
|
||||
USE uspp, ONLY: vkb, qq_at_d, qq_so_d, deeq_nc_d
|
||||
USE uspp, ONLY: vkb, qq_at, qq_so, deeq_nc
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -153,7 +152,7 @@ CONTAINS
|
|||
IF (ityp (na) == nt) THEN
|
||||
ijkb_start = ofsbeta(na)
|
||||
nh_ = nh(nt)
|
||||
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc parallel vector_length(32)
|
||||
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s)
|
||||
DO ig = 1, npw ! change this to 2*npw ?
|
||||
|
@ -166,10 +165,10 @@ CONTAINS
|
|||
ikb = ijkb_start + ih
|
||||
cv = vkb (ig, ikb)
|
||||
jkb = ijkb_start + jh
|
||||
ar = cv*conjg(vkb (ig, jkb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, jh, na, 1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, jh, na, 4) * ar)
|
||||
sum_s = sum_s + dble(qq_at_d (ih, jh, na) * ar)
|
||||
ar = cv*conjg(vkb(ig,jkb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc(ih,jh,na,1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc(ih,jh,na,4) * ar)
|
||||
sum_s = sum_s + dble(qq_at(ih,jh,na) * ar)
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
|
@ -197,7 +196,7 @@ CONTAINS
|
|||
IF (ityp (na) == nt) THEN
|
||||
ijkb_start = ofsbeta(na)
|
||||
nh_ = nh(nt)
|
||||
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc parallel vector_length(32)
|
||||
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s)
|
||||
DO ig = 1, npw
|
||||
|
@ -207,10 +206,10 @@ CONTAINS
|
|||
!$acc loop vector private(ikb,ar) reduction(+:sum_h1,sum_h4,sum_s)
|
||||
DO ih = 1, nh_
|
||||
ikb = ijkb_start + ih
|
||||
ar = vkb (ig, ikb)*conjg(vkb (ig, ikb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, ih, na, 1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, ih, na, 4) * ar)
|
||||
sum_s = sum_s + dble(qq_at_d (ih, ih, na) * ar)
|
||||
ar = vkb (ig, ikb)*conjg(vkb(ig,ikb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc(ih,ih,na,1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc(ih,ih,na,4) * ar)
|
||||
sum_s = sum_s + dble(qq_at(ih,ih,na) * ar)
|
||||
END DO
|
||||
!
|
||||
! OPTIMIZE HERE : this scattered assign is bad!
|
||||
|
@ -238,7 +237,7 @@ CONTAINS
|
|||
!
|
||||
SUBROUTINE usnldiag_spinorb()
|
||||
USE lsda_mod, ONLY: current_spin
|
||||
USE uspp, ONLY: vkb, qq_at_d, qq_so_d, deeq_nc_d
|
||||
USE uspp, ONLY: vkb, qq_at, qq_so, deeq_nc
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -256,7 +255,7 @@ CONTAINS
|
|||
IF (ityp (na) == nt) THEN
|
||||
ijkb_start = ofsbeta(na)
|
||||
nh_ = nh(nt)
|
||||
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc parallel vector_length(32)
|
||||
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s1,sum_s4)
|
||||
DO ig = 1, npw ! change this to 2*npw ?
|
||||
|
@ -268,13 +267,13 @@ CONTAINS
|
|||
DO ih = 1, nh_
|
||||
DO jh = 1, nh_
|
||||
ikb = ijkb_start + ih
|
||||
cv = vkb (ig, ikb)
|
||||
cv = vkb(ig,ikb)
|
||||
jkb = ijkb_start + jh
|
||||
ar = cv*conjg(vkb (ig, jkb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, jh, na, 1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, jh, na, 4) * ar)
|
||||
sum_s1 = sum_s1 + dble(qq_so_d (ih, jh, 1, nt) * ar)
|
||||
sum_s4 = sum_s4 + dble(qq_so_d (ih, jh, 4, nt) * ar)
|
||||
ar = cv*conjg(vkb(ig,jkb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc(ih,jh,na,1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc(ih,jh,na,4) * ar)
|
||||
sum_s1 = sum_s1 + dble(qq_so(ih,jh,1,nt) * ar)
|
||||
sum_s4 = sum_s4 + dble(qq_so(ih,jh,4,nt) * ar)
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
|
@ -302,7 +301,7 @@ CONTAINS
|
|||
IF (ityp (na) == nt) THEN
|
||||
ijkb_start = ofsbeta(na)
|
||||
nh_ = nh(nt)
|
||||
!$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:))
|
||||
!$acc parallel vector_length(32)
|
||||
!$acc loop gang reduction(+:sum_h1,sum_h4,sum_s1,sum_s4)
|
||||
DO ig = 1, npw
|
||||
|
@ -314,10 +313,10 @@ CONTAINS
|
|||
DO ih = 1, nh_
|
||||
ikb = ijkb_start + ih
|
||||
ar = vkb (ig, ikb)*conjg(vkb (ig, ikb))
|
||||
sum_h1 = sum_h1 + dble(deeq_nc_d (ih, ih, na, 1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc_d (ih, ih, na, 4) * ar)
|
||||
sum_s1 = sum_s1 + dble(qq_so_d (ih, ih, 1, nt) * ar)
|
||||
sum_s4 = sum_s4 + dble(qq_so_d (ih, ih, 4, nt) * ar)
|
||||
sum_h1 = sum_h1 + dble(deeq_nc(ih,ih,na,1) * ar)
|
||||
sum_h4 = sum_h4 + dble(deeq_nc(ih,ih,na,4) * ar)
|
||||
sum_s1 = sum_s1 + dble(qq_so(ih,ih,1,nt) * ar)
|
||||
sum_s4 = sum_s4 + dble(qq_so(ih,ih,4,nt) * ar)
|
||||
END DO
|
||||
!
|
||||
! OPTIMIZE HERE : this scattered assign is bad!
|
||||
|
|
|
@ -12,8 +12,6 @@ set(src_upflib
|
|||
init_tab_qrad.f90
|
||||
interp_atwfc.f90
|
||||
init_us_2_base.f90
|
||||
gen_us_dj_gpu.f90
|
||||
gen_us_dy_gpu.f90
|
||||
gth.f90
|
||||
paw_variables.f90
|
||||
pseudo_types.f90
|
||||
|
@ -56,7 +54,8 @@ set(src_upflib
|
|||
sph_bes_gpu.f90
|
||||
ylmr2_gpu.f90
|
||||
dylmr2_gpu.f90
|
||||
simpsn_gpu.f90)
|
||||
simpsn_gpu.f90
|
||||
upf_acc_interfaces.f90)
|
||||
qe_enable_cuda_fortran("${src_upflib}")
|
||||
|
||||
qe_add_library(qe_xml xmltools.f90 dom.f90 wxml.f90)
|
||||
|
|
|
@ -65,14 +65,13 @@ ylmr2.o
|
|||
# GPU versions of routines
|
||||
OBJS_GPU= \
|
||||
dylmr2_gpu.o \
|
||||
gen_us_dj_gpu.o \
|
||||
gen_us_dy_gpu.o \
|
||||
init_us_2_base_gpu.o \
|
||||
interp_atwfc_gpu.o \
|
||||
qvan2_gpu.o \
|
||||
simpsn_gpu.o \
|
||||
sph_bes_gpu.o \
|
||||
ylmr2_gpu.o
|
||||
ylmr2_gpu.o \
|
||||
upf_acc_interfaces.o
|
||||
|
||||
OBJS_NODEP+= dom.o wxml.o
|
||||
|
||||
|
|
|
@ -9,12 +9,13 @@
|
|||
SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
|
||||
!-----------------------------------------------------------------------
|
||||
!! This routine computes the derivatives of the Fourier transform of
|
||||
!! the Q function needed in stress assuming that the radial fourier
|
||||
!! transform is already computed and stored in table qrad.
|
||||
!! The formula implemented here is:
|
||||
|
||||
!! dq(g,i,j) = sum_lm (-i)^l ap(lm,i,j) *
|
||||
!! ( yr_lm(g^) dqrad(g,l,i,j) + dyr_lm(g^) qrad(g,l,i,j))
|
||||
!! the Q function needed in stress assuming that the radial Fourier
|
||||
!! transform is already computed and stored in table \(\text{qrad}\).
|
||||
!! The implemented formula:
|
||||
!
|
||||
!! \[ \text{dq}(g,i,j) = \sum_lm (-i)^l \text{ap}(lm,i,j) *
|
||||
!! ( \text{yr}_{lm}(g^) \text{dqrad}(g,l,i,j) +
|
||||
!! \text{dyr}_{lm}(g') \text{qrad}(g,l,i,j)) \]
|
||||
!
|
||||
USE upf_kinds, ONLY: DP
|
||||
USE uspp_data, ONLY: dq, qrad
|
||||
|
@ -48,7 +49,7 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
|
|||
!
|
||||
! ... local variables
|
||||
!
|
||||
COMPLEX(DP) :: sig
|
||||
COMPLEX(DP) :: sig, dqg_bgr
|
||||
! (-i)^L
|
||||
INTEGER :: nb, mb, ijv, ivl, jvl, ig, lp, l, lm, i0, i1, i2, i3
|
||||
! the atomic index corresponding to ih
|
||||
|
@ -71,7 +72,9 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
|
|||
! auxiliary variable
|
||||
! auxiliary variable
|
||||
!
|
||||
! compute the indices which correspond to ih,jh
|
||||
! ... compute the indices which correspond to ih,jh
|
||||
!
|
||||
!$acc data present_or_copyin(g,qmod,ylmk0,dylmk0) present_or_copyout(dqg)
|
||||
!
|
||||
sixth = 1.d0 / 6.d0
|
||||
dqi = 1 / dq
|
||||
|
@ -91,14 +94,16 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
|
|||
IF (ivl > nlx .OR. jvl > nlx) &
|
||||
CALL upf_error (' dqvan2 ', ' wrong dimensions (2)', MAX(ivl,jvl))
|
||||
!
|
||||
!$acc kernels
|
||||
dqg(:) = (0.d0,0.d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
! and make the sum over the non zero LM
|
||||
! ... and make the sum over the non zero LM
|
||||
!
|
||||
DO lm = 1, lpx(ivl, jvl)
|
||||
lp = lpl(ivl, jvl, lm)
|
||||
DO lm = 1, lpx(ivl,jvl)
|
||||
lp = lpl(ivl,jvl,lm)
|
||||
!
|
||||
! extraction of angular momentum l from lp:
|
||||
! ... extraction of angular momentum l from lp:
|
||||
!
|
||||
IF (lp==1) THEN
|
||||
l = 1
|
||||
|
@ -119,16 +124,20 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
|
|||
ENDIF
|
||||
!
|
||||
sig = (0.d0, -1.d0)**(l - 1)
|
||||
sig = sig * ap(lp, ivl, jvl)
|
||||
sig = sig * ap(lp,ivl,jvl)
|
||||
!
|
||||
qm1 = -1.0_dp ! any number smaller than qmod(1)
|
||||
!
|
||||
#if !defined(_OPENACC)
|
||||
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(qm,px,ux,vx,wx,i0,i1,i2,i3,uvx,pwx,work,work1)
|
||||
#else
|
||||
!$acc parallel loop
|
||||
#endif
|
||||
DO ig = 1, ngy
|
||||
!
|
||||
! calculate quantites depending on the module of G only when needed
|
||||
! ... calculate quantites depending on the module of G only when needed
|
||||
!
|
||||
#if !defined(_OPENMP)
|
||||
#if !defined(_OPENMP) && !defined(_OPENACC)
|
||||
IF ( ABS( qmod(ig) - qm1 ) > 1.0D-6 ) THEN
|
||||
#endif
|
||||
qm = qmod (ig) * dqi
|
||||
|
@ -141,33 +150,40 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg )
|
|||
i2 = qm + 3
|
||||
i3 = qm + 4
|
||||
uvx = ux * vx * sixth
|
||||
|
||||
!
|
||||
pwx = px * wx * 0.5d0
|
||||
|
||||
!
|
||||
work = qrad(i0, ijv, l, np) * uvx * wx + &
|
||||
qrad(i1, ijv, l, np) * pwx * vx - &
|
||||
qrad(i2, ijv, l, np) * pwx * ux + &
|
||||
qrad(i3, ijv, l, np) * px * uvx
|
||||
work1 = - qrad(i0, ijv, l, np) * (ux*vx + vx*wx + ux*wx) * sixth &
|
||||
+ qrad(i1, ijv, l, np) * (wx*vx - px*wx - px*vx) * 0.5d0 &
|
||||
- qrad(i2, ijv, l, np) * (wx*ux - px*wx - px*ux) * 0.5d0 &
|
||||
+ qrad(i3, ijv, l, np) * (ux*vx - px*ux - px*vx) * sixth
|
||||
|
||||
work1 = work1 * dqi
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
work1 = (- qrad(i0, ijv, l, np) * (ux*vx + vx*wx + ux*wx) * sixth &
|
||||
+ qrad(i1, ijv, l, np) * (wx*vx - px*wx - px*vx) * 0.5d0 &
|
||||
- qrad(i2, ijv, l, np) * (wx*ux - px*wx - px*ux) * 0.5d0 &
|
||||
+ qrad(i3, ijv, l, np) * (ux*vx - px*ux - px*vx) * sixth) * dqi
|
||||
!
|
||||
#if !defined(_OPENMP) && !defined(_OPENACC)
|
||||
qm1 = qmod(ig)
|
||||
ENDIF
|
||||
#endif
|
||||
|
||||
dqg(ig) = dqg(ig) + sig * dylmk0(ig, lp) * work / tpiba
|
||||
IF (qmod(ig) > 1.d-9) dqg(ig) = dqg(ig) + &
|
||||
sig * ylmk0(ig, lp) * work1 * tpiba * g(ipol, ig) / qmod(ig)
|
||||
!
|
||||
IF (qmod(ig) > 1.d-9) THEN
|
||||
dqg_bgr = sig * ylmk0(ig,lp) * work1 * tpiba * g(ipol,ig) / qmod(ig)
|
||||
ELSE
|
||||
dqg_bgr = (0.d0,0.d0)
|
||||
ENDIF
|
||||
!
|
||||
dqg(ig) = dqg(ig) + sig * dylmk0(ig,lp) * work / tpiba + dqg_bgr
|
||||
!
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$OMP END PARALLEL DO
|
||||
!
|
||||
#endif
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dqvan2
|
||||
|
|
|
@ -7,9 +7,9 @@
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dj_base &
|
||||
( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
|
||||
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, mill, g, dvkb )
|
||||
SUBROUTINE gen_us_dj_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
|
||||
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, &
|
||||
mill, g, dvkb )
|
||||
!----------------------------------------------------------------------
|
||||
!! Calculates the beta function pseudopotentials with
|
||||
!! the derivative of the Bessel functions.
|
||||
|
@ -18,7 +18,7 @@ SUBROUTINE gen_us_dj_base &
|
|||
USE upf_const, ONLY: tpi
|
||||
USE uspp, ONLY: nkb, indv, nhtol, nhtolm
|
||||
USE uspp_data, ONLY: nqx, tab, dq
|
||||
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh
|
||||
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -42,7 +42,7 @@ SUBROUTINE gen_us_dj_base &
|
|||
!! rec.lattice units 2pi/a
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! cell volume
|
||||
INTEGER, INTENT(IN) :: nr1,nr2,nr3
|
||||
INTEGER, INTENT(IN) :: nr1, nr2, nr3
|
||||
!! fft dims (dense grid)
|
||||
COMPLEX(DP), INTENT(IN) :: eigts1(-nr1:nr1,nat)
|
||||
!! structure factor 1
|
||||
|
@ -54,7 +54,7 @@ SUBROUTINE gen_us_dj_base &
|
|||
!! miller index map
|
||||
REAL(DP), INTENT(IN) :: g(3,*)
|
||||
!! g vectors (2pi/a units)
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx, nkb)
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
|
||||
!! the beta function pseudopotential
|
||||
!
|
||||
! ... local variables
|
||||
|
@ -67,46 +67,54 @@ SUBROUTINE gen_us_dj_base &
|
|||
! index of the first nonzero point in the r
|
||||
! counter on atomic type
|
||||
!
|
||||
REAL(DP) :: arg, px, ux, vx, wx
|
||||
INTEGER :: ina, na, l, iig, lm, ikb_t, nht
|
||||
REAL(DP) :: arg, px, ux, vx, wx, qt
|
||||
! argument of the atomic phase factor
|
||||
!
|
||||
COMPLEX(DP) :: phase, pref
|
||||
! atomic phase factor
|
||||
COMPLEX(DP) :: pref
|
||||
! prefactor
|
||||
!
|
||||
INTEGER :: na, l, iig, lm, iq
|
||||
REAL(DP), ALLOCATABLE :: djl(:,:,:), ylm(:,:), q(:), gk(:,:)
|
||||
REAL(DP) :: qt
|
||||
COMPLEX(DP), ALLOCATABLE :: sk(:)
|
||||
INTEGER, ALLOCATABLE :: nas(:), ihv(:), nav(:)
|
||||
REAL(DP), ALLOCATABLE :: djl(:,:,:), ylm(:,:), q(:), gk(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: sk(:,:), phase(:)
|
||||
! phase: atomic phase factor
|
||||
!
|
||||
IF (nkb == 0) RETURN
|
||||
!
|
||||
CALL start_clock( 'stres_us31' )
|
||||
!$acc data present_or_copyin(igk,eigts1,eigts2,eigts3,mill,g) &
|
||||
!$acc present_or_copyout(dvkb) copyin(ityp,tau,xk)
|
||||
!
|
||||
ALLOCATE( djl(npw,nbetam,ntyp) )
|
||||
ALLOCATE( ylm(npw,(lmaxkb+1)**2) )
|
||||
ALLOCATE( gk(3,npw) )
|
||||
ALLOCATE( q(npw) )
|
||||
!CALL start_clock( 'stres_us31' )
|
||||
!
|
||||
ALLOCATE( djl(npw,nbetam,ntyp), ylm(npw,(lmaxkb+1)**2) )
|
||||
ALLOCATE( gk(3,npw), q(npw) )
|
||||
!$acc data create( djl, ylm )
|
||||
!$acc data create( gk, q )
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, npw
|
||||
iig = igk(ig)
|
||||
gk(1, ig) = xk(1) + g(1, iig)
|
||||
gk(2, ig) = xk(2) + g(2, iig)
|
||||
gk(3, ig) = xk(3) + g(3, iig)
|
||||
q(ig) = gk(1, ig)**2 + gk(2, ig)**2 + gk(3, ig)**2
|
||||
gk(1,ig) = xk(1) + g(1,iig)
|
||||
gk(2,ig) = xk(2) + g(2,iig)
|
||||
gk(3,ig) = xk(3) + g(3,iig)
|
||||
q(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2
|
||||
ENDDO
|
||||
!
|
||||
CALL stop_clock( 'stres_us31' )
|
||||
CALL start_clock( 'stres_us32' )
|
||||
#if defined(__CUDA)
|
||||
!$acc host_data use_device(gk,q,ylm)
|
||||
CALL ylmr2_gpu( (lmaxkb+1)**2, npw, gk, q, ylm )
|
||||
!$acc end host_data
|
||||
#else
|
||||
!$acc update self(gk,q)
|
||||
CALL ylmr2( (lmaxkb+1)**2, npw, gk, q, ylm )
|
||||
CALL stop_clock( 'stres_us32' )
|
||||
CALL start_clock( 'stres_us33' )
|
||||
!$acc update device(ylm)
|
||||
#endif
|
||||
!
|
||||
!$acc data copyin( tab )
|
||||
DO nt = 1, ntyp
|
||||
!$acc parallel loop collapse(2)
|
||||
DO nb = 1, upf(nt)%nbeta
|
||||
!
|
||||
DO ig = 1, npw
|
||||
qt = SQRT(q (ig)) * tpiba
|
||||
qt = SQRT(q(ig)) * tpiba
|
||||
px = qt / dq - INT(qt/dq)
|
||||
ux = 1.d0 - px
|
||||
vx = 2.d0 - px
|
||||
|
@ -115,62 +123,99 @@ SUBROUTINE gen_us_dj_base &
|
|||
i1 = i0 + 1
|
||||
i2 = i0 + 2
|
||||
i3 = i0 + 3
|
||||
djl(ig,nb,nt) = ( tab(i0, nb, nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + &
|
||||
tab(i1, nb, nt) * (+vx*wx-px*wx-px*vx)/2.d0 - &
|
||||
tab(i2, nb, nt) * (+ux*wx-px*wx-px*ux)/2.d0 + &
|
||||
tab(i3, nb, nt) * (+ux*vx-px*vx-px*ux)/6.d0 )/dq
|
||||
djl(ig,nb,nt) = ( tab(i0,nb,nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + &
|
||||
tab(i1,nb,nt) * (+vx*wx-px*wx-px*vx)/2.d0 - &
|
||||
tab(i2,nb,nt) * (+ux*wx-px*wx-px*ux)/2.d0 + &
|
||||
tab(i3,nb,nt) * (+ux*vx-px*vx-px*ux)/6.d0 ) / dq
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
CALL stop_clock( 'stres_us33' )
|
||||
CALL start_clock( 'stres_us34' )
|
||||
!CALL stop_clock( 'stres_us33' )
|
||||
!CALL start_clock( 'stres_us34' )
|
||||
!
|
||||
DEALLOCATE ( q )
|
||||
DEALLOCATE ( gk )
|
||||
!$acc end data
|
||||
DEALLOCATE( q, gk )
|
||||
!
|
||||
ALLOCATE ( sk(npw) )
|
||||
ikb = 0
|
||||
!
|
||||
ALLOCATE( phase(nat), sk(npw,nat) )
|
||||
ALLOCATE( nas(nat), ihv(nat*nhm), nav(nat*nhm) )
|
||||
!
|
||||
ina = 0
|
||||
DO nt = 1, ntyp
|
||||
DO na = 1, nat
|
||||
!
|
||||
IF (ityp(na) == nt) THEN
|
||||
arg = ( xk(1) * tau(1,na) + &
|
||||
xk(2) * tau(2,na) + &
|
||||
xk(3) * tau(3,na) ) * tpi
|
||||
phase = CMPLX( COS(arg), -SIN(arg) ,KIND=DP )
|
||||
DO ig = 1, npw
|
||||
iig = igk(ig)
|
||||
sk (ig) = eigts1(mill (1,iig), na) * &
|
||||
eigts2(mill (2,iig), na) * &
|
||||
eigts3(mill (3,iig), na) * phase
|
||||
ENDDO
|
||||
DO ih = 1, nh(nt)
|
||||
nb = indv(ih, nt)
|
||||
l = nhtol(ih, nt)
|
||||
lm= nhtolm(ih, nt)
|
||||
ikb = ikb + 1
|
||||
pref = (0.d0, -1.d0) **l
|
||||
!
|
||||
DO ig = 1, npw
|
||||
dvkb(ig, ikb) = djl(ig, nb, nt) * sk(ig) * ylm(ig, lm) &
|
||||
* pref
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
ina = ina + 1
|
||||
nas(ina) = na
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
CALL stop_clock('stres_us34')
|
||||
!$acc data create( phase, sk, ihv, nav ) copyin( nas )
|
||||
!
|
||||
IF (ikb /= nkb) CALL upf_error('gen_us_dj', 'unexpected error', 1)
|
||||
DEALLOCATE( sk )
|
||||
DEALLOCATE( ylm )
|
||||
DEALLOCATE( djl )
|
||||
!$acc parallel loop
|
||||
DO ina = 1, nat
|
||||
na = nas(ina)
|
||||
arg = (xk(1) * tau(1,na) + xk(2) * tau(2,na) &
|
||||
+ xk(3) * tau(3,na) ) * tpi
|
||||
phase(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
|
||||
ENDDO
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ina = 1, nat
|
||||
DO ig = 1, npw
|
||||
!
|
||||
na = nas(ina)
|
||||
iig = igk(ig)
|
||||
sk(ig,na) = eigts1(mill(1,iig),na) * &
|
||||
eigts2(mill(2,iig),na) * &
|
||||
eigts3(mill(3,iig),na) * phase(na)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ikb_t = 0
|
||||
DO ina = 1, nat
|
||||
na = nas(ina)
|
||||
nht = nh(ityp(na))
|
||||
!$acc kernels
|
||||
DO ih = 1, nht
|
||||
ihv(ikb_t+ih) = ih
|
||||
nav(ikb_t+ih) = na
|
||||
ENDDO
|
||||
!$acc end kernels
|
||||
ikb_t = ikb_t + nht
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
!$acc parallel loop collapse(2) copyin(indv,nhtol,nhtolm)
|
||||
DO ikb = 1, ikb_t
|
||||
DO ig = 1, npw
|
||||
ih = ihv(ikb)
|
||||
na = nav(ikb)
|
||||
nt = ityp(na)
|
||||
nb = indv(ih,nt)
|
||||
l = nhtol(ih,nt)
|
||||
lm = nhtolm(ih,nt)
|
||||
pref = (0._DP,-1._DP)**l
|
||||
!
|
||||
dvkb(ig,ikb) = CMPLX(djl(ig,nb,nt)) * sk(ig,na) * &
|
||||
CMPLX(ylm(ig,lm)) * pref
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!ALL stop_clock('stres_us34')
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE( phase, sk )
|
||||
DEALLOCATE( nas, ihv, nav )
|
||||
!
|
||||
IF (ikb_t /= nkb) CALL upf_error('gen_us_dj', 'unexpected error', 1)
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( djl, ylm )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE gen_us_dj_base
|
||||
|
||||
|
|
|
@ -1,219 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2021 Quantum ESPRESSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dj_gpu_ &
|
||||
( npw, npwx, igk_d, xk, nat, tau, ityp, ntyp, tpiba, &
|
||||
omega, nr1, nr2, nr3, eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, dvkb_d )
|
||||
!----------------------------------------------------------------------
|
||||
!! Calculates the beta function pseudopotentials with
|
||||
!! the derivative of the Bessel functions - GFPU version
|
||||
!
|
||||
! AF: more gpu-resident variables can be used, avoiding local GPU-alloc
|
||||
! and host2dev transfers
|
||||
!
|
||||
USE upf_kinds, ONLY: dp
|
||||
USE upf_const, ONLY: tpi
|
||||
USE uspp, ONLY: nkb, indv_d, nhtol_d, nhtolm_d
|
||||
USE uspp_data, ONLY: nqx, tab, tab_d, dq
|
||||
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
|
||||
USE device_fbuff_m, ONLY: dev_buf
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw
|
||||
!! number ok plane waves
|
||||
INTEGER, INTENT(IN) :: npwx
|
||||
!! max number ok plane waves across k-points
|
||||
INTEGER, INTENT(IN) :: igk_d(npw)
|
||||
!! indices of plane waves k+G
|
||||
REAL(dp), INTENT(IN) :: xk(3)
|
||||
!! k-point
|
||||
INTEGER, INTENT(IN) :: nat
|
||||
!! number of atoms
|
||||
INTEGER, INTENT(IN) :: ityp(nat)
|
||||
!! index of type per atom
|
||||
INTEGER, INTENT(IN) :: ntyp
|
||||
!! number of atomic types
|
||||
REAL(DP), INTENT(IN) :: tau(3,nat)
|
||||
!! atomic positions (cc alat units)
|
||||
REAL(DP), INTENT(IN) :: tpiba
|
||||
!! rec.lattice units 2pi/a
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! cell volume
|
||||
INTEGER, INTENT(IN) :: nr1,nr2,nr3
|
||||
!! fft dims (dense grid)
|
||||
COMPLEX(DP), INTENT(IN) :: eigts1_d(-nr1:nr1,nat)
|
||||
!! structure factor 1
|
||||
COMPLEX(DP), INTENT(IN) :: eigts2_d(-nr2:nr2,nat)
|
||||
!! structure factor 2
|
||||
COMPLEX(DP), INTENT(IN) :: eigts3_d(-nr3:nr3,nat)
|
||||
!! structure factor 3
|
||||
INTEGER, INTENT(IN) :: mill_d(3,*)
|
||||
!! miller index map
|
||||
REAL(DP), INTENT(IN) :: g_d(3,*)
|
||||
!! g vectors (2pi/a units)
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb_d(npwx, nkb)
|
||||
!! the beta function pseudopotential
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, i0, i1, i2, &
|
||||
i3, ig, nbm, iq, mil1, mil2, mil3, &
|
||||
ikb_t, nht, ina, nas(nat), ierr(3)
|
||||
REAL(DP) :: px, ux, vx, wx, arg, u_ipol, xk1, xk2, xk3, qt
|
||||
COMPLEX(DP) :: pref
|
||||
INTEGER, ALLOCATABLE :: ityp_d(:), ih_d(:), na_d(:), nas_d(:)
|
||||
!
|
||||
REAL(DP), POINTER :: gk_d(:,:), djl_d(:,:,:), ylm_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: q_d(:), tau_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: phase_d(:), sk_d(:,:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: igk_d, mill_d, eigts1_d, eigts2_d, eigts3_d, g_d
|
||||
attributes(DEVICE) :: gk_d, q_d, sk_d, djl_d, ylm_d, &
|
||||
ityp_d, phase_d, ih_d, na_d, tau_d, nas_d
|
||||
attributes(DEVICE) :: dvkb_d
|
||||
!
|
||||
IF (nkb == 0) RETURN
|
||||
!
|
||||
CALL dev_buf%lock_buffer( ylm_d, (/ npw,(lmaxkb+1)**2 /), ierr(1) )
|
||||
CALL dev_buf%lock_buffer( djl_d, (/ npw,nbetam,ntyp /), ierr(2) )
|
||||
CALL dev_buf%lock_buffer( gk_d, (/ 3,npw /), ierr(3) )
|
||||
ALLOCATE( q_d(npw) )
|
||||
!
|
||||
xk1 = xk(1)
|
||||
xk2 = xk(2)
|
||||
xk3 = xk(3)
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ig = 1, npw
|
||||
iig = igk_d(ig)
|
||||
gk_d(1,ig) = xk1 + g_d(1,iig)
|
||||
gk_d(2,ig) = xk2 + g_d(2,iig)
|
||||
gk_d(3,ig) = xk3 + g_d(3,iig)
|
||||
q_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2
|
||||
ENDDO
|
||||
!
|
||||
CALL ylmr2_gpu( (lmaxkb+1)**2, npw, gk_d, q_d, ylm_d )
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
nbm = upf(nt)%nbeta
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO nb = 1, nbm
|
||||
DO ig = 1, npw
|
||||
qt = SQRT(q_d(ig)) * tpiba
|
||||
px = qt/dq - DBLE(INT(qt/dq))
|
||||
ux = 1._DP - px
|
||||
vx = 2._DP - px
|
||||
wx = 3._DP - px
|
||||
i0 = INT(qt/dq) + 1
|
||||
i1 = i0 + 1
|
||||
i2 = i0 + 2
|
||||
i3 = i0 + 3
|
||||
djl_d(ig,nb,nt) = (tab_d(i0,nb,nt) * (-vx*wx-ux*wx-ux*vx)/6._DP + &
|
||||
tab_d(i1,nb,nt) * (+vx*wx-px*wx-px*vx)/2._DP - &
|
||||
tab_d(i2,nb,nt) * (+ux*wx-px*wx-px*ux)/2._DP + &
|
||||
tab_d(i3,nb,nt) * (+ux*vx-px*vx-px*ux)/6._DP)/dq
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( q_d )
|
||||
!
|
||||
ALLOCATE( ityp_d(nat), nas_d(nat) )
|
||||
ityp_d = ityp
|
||||
ALLOCATE( tau_d(3,nat) )
|
||||
tau_d = tau
|
||||
ALLOCATE( phase_d(nat) )
|
||||
!
|
||||
ina = 0
|
||||
DO nt = 1, ntyp
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
ina = ina + 1
|
||||
nas(ina) = na
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
nas_d = nas
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ina = 1, nat
|
||||
na = nas_d(ina)
|
||||
arg = (xk1 * tau_d(1,na) + xk2 * tau_d(2,na) &
|
||||
+ xk3 * tau_d(3,na) ) * tpi
|
||||
phase_d(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( tau_d )
|
||||
!
|
||||
ALLOCATE( sk_d(npw,nat) )
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO ina = 1, nat
|
||||
DO ig = 1, npw
|
||||
!
|
||||
na = nas_d(ina)
|
||||
iig = igk_d(ig)
|
||||
mil1 = mill_d(1,iig)
|
||||
mil2 = mill_d(2,iig)
|
||||
mil3 = mill_d(3,iig)
|
||||
sk_d(ig,na) = eigts1_d(mil1,na) * &
|
||||
eigts2_d(mil2,na) * &
|
||||
eigts3_d(mil3,na) * phase_d(na)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
DEALLOCATE( phase_d )
|
||||
!
|
||||
ALLOCATE( ih_d(nat*nhm), na_d(nat*nhm) )
|
||||
!
|
||||
ikb_t = 0
|
||||
DO ina = 1, nat
|
||||
na = nas(ina)
|
||||
nht = nh(ityp(na))
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ih = 1, nht
|
||||
ih_d(ikb_t+ih) = ih
|
||||
na_d(ikb_t+ih) = na
|
||||
ENDDO
|
||||
ikb_t = ikb_t + nht
|
||||
ENDDO
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO ikb = 1, ikb_t
|
||||
DO ig = 1, npw
|
||||
ih = ih_d(ikb)
|
||||
na = na_d(ikb)
|
||||
nt = ityp_d(na)
|
||||
nb = indv_d(ih,nt)
|
||||
l = nhtol_d(ih,nt)
|
||||
lm = nhtolm_d(ih,nt)
|
||||
pref = (0._DP,-1._DP)**l
|
||||
!
|
||||
dvkb_d(ig,ikb) = CMPLX(djl_d(ig,nb,nt)) * sk_d(ig,na) * &
|
||||
CMPLX(ylm_d(ig,lm)) * pref
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( sk_d )
|
||||
!
|
||||
IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dj', 'unexpected error', 1 )
|
||||
!
|
||||
CALL dev_buf%release_buffer( ylm_d, ierr(1) )
|
||||
CALL dev_buf%release_buffer( djl_d, ierr(2) )
|
||||
CALL dev_buf%release_buffer( gk_d, ierr(3) )
|
||||
!
|
||||
DEALLOCATE( ih_d, na_d, nas_d )
|
||||
#endif
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE gen_us_dj_gpu_
|
|
@ -1,5 +1,5 @@
|
|||
!
|
||||
! Copyright (C) 2001-2021 Quantum ESPRESSO Foundation
|
||||
! Copyright (C) 2021 Quantum ESPRESSSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
|
@ -7,18 +7,20 @@
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dy_base &
|
||||
( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
|
||||
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, mill, g, u, dvkb )
|
||||
SUBROUTINE gen_us_dy_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, &
|
||||
omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, &
|
||||
mill, g, u, dvkb )
|
||||
!----------------------------------------------------------------------
|
||||
!! Calculates the beta functions of the pseudopotential with the
|
||||
!! derivative of the spherical harmonics projected on vector u
|
||||
!! Calculates the Kleinman-Bylander pseudopotentials with the
|
||||
!! derivative of the spherical harmonics projected on vector u.
|
||||
!
|
||||
! AF: more extensive use of GPU-resident vars possible
|
||||
!
|
||||
USE upf_kinds, ONLY: dp
|
||||
USE upf_const, ONLY: tpi
|
||||
USE uspp, ONLY: nkb, indv, nhtol, nhtolm
|
||||
USE uspp_data, ONLY: nqx, tab, dq
|
||||
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh
|
||||
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -55,112 +57,205 @@ SUBROUTINE gen_us_dy_base &
|
|||
REAL(DP), INTENT(IN) :: g(3,*)
|
||||
!! g vectors (2pi/a units)
|
||||
REAL(DP), INTENT(IN) :: u(3)
|
||||
!! input: projection vector
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb)
|
||||
!! output: kleinman-bylander pseudopotential
|
||||
!! projection vector
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb(npwx, nkb)
|
||||
!! the beta function pseudopotential
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, &
|
||||
i3, ig
|
||||
REAL(DP), ALLOCATABLE :: gk(:,:), q(:)
|
||||
REAL(DP) :: px, ux, vx, wx, arg
|
||||
i3, ig, nbm, iq, mil1, mil2, mil3, ikb_t, &
|
||||
nht, ina, lmx2
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: vkb0(:,:,:), dylm(:,:), dylm_u(:,:)
|
||||
INTEGER, ALLOCATABLE :: nas(:), ihv(:), nav(:)
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: dylm(:,:,:), dylm_u(:,:)
|
||||
REAL(DP), ALLOCATABLE :: q(:), gk(:,:), vkb0(:,:,:)
|
||||
! dylm = d Y_lm/dr_i in cartesian axes
|
||||
! dylm_u as above projected on u
|
||||
COMPLEX(DP), ALLOCATABLE :: phase(:), sk(:,:)
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: sk(:)
|
||||
COMPLEX(DP) :: phase, pref
|
||||
REAL(DP) :: px, ux, vx, wx, arg, u_ipol1, u_ipol2, u_ipol3, xk1, xk2, xk3
|
||||
COMPLEX(DP) :: pref
|
||||
!
|
||||
INTEGER :: iq
|
||||
!$acc kernels present_or_copyout(dvkb)
|
||||
dvkb = (0._DP,0._DP)
|
||||
!$acc end kernels
|
||||
!
|
||||
dvkb(:,:) = (0.d0, 0.d0)
|
||||
IF (lmaxkb <= 0) RETURN
|
||||
!
|
||||
ALLOCATE( vkb0(npw,nbetam,ntyp), dylm_u(npw,(lmaxkb+1)**2), gk(3,npw) )
|
||||
ALLOCATE( q(npw) )
|
||||
!$acc data present_or_copyin(igk,eigts1,eigts2,eigts3,mill,g) present(dvkb)
|
||||
!
|
||||
lmx2 = (lmaxkb+1)**2
|
||||
!
|
||||
ALLOCATE( gk(3,npw) )
|
||||
ALLOCATE( dylm_u(npw,lmx2) )
|
||||
ALLOCATE( vkb0(npw,nbetam,ntyp) )
|
||||
ALLOCATE( q(npw) )
|
||||
!$acc data create( dylm_u, vkb0 )
|
||||
!$acc data create( q, gk )
|
||||
!
|
||||
xk1 = xk(1)
|
||||
xk2 = xk(2)
|
||||
xk3 = xk(3)
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO ig = 1, npw
|
||||
iig = igk(ig)
|
||||
gk(1, ig) = xk(1) + g(1, iig)
|
||||
gk(2, ig) = xk(2) + g(2, iig)
|
||||
gk(3, ig) = xk(3) + g(3, iig)
|
||||
q(ig) = gk(1, ig)**2 + gk(2, ig)**2 + gk(3, ig)**2
|
||||
gk(1,ig) = xk1 + g(1,iig)
|
||||
gk(2,ig) = xk2 + g(2,iig)
|
||||
gk(3,ig) = xk3 + g(3,iig)
|
||||
q(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2
|
||||
ENDDO
|
||||
!
|
||||
ALLOCATE( dylm(npw,(lmaxkb+1)**2) )
|
||||
dylm_u(:,:) = 0.d0
|
||||
ALLOCATE( dylm(npw,(lmaxkb+1)**2,3) )
|
||||
!$acc data create( dylm )
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!$acc host_data use_device( gk, q, dylm )
|
||||
DO ipol = 1, 3
|
||||
CALL dylmr2( (lmaxkb+1)**2, npw, gk, q, dylm, ipol )
|
||||
CALL daxpy( npw * (lmaxkb + 1) **2, u(ipol), dylm, 1, dylm_u, 1 )
|
||||
CALL dylmr2_gpu( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
|
||||
ENDDO
|
||||
!$acc end host_data
|
||||
#else
|
||||
!$acc update self( gk, q )
|
||||
DO ipol = 1, 3
|
||||
CALL dylmr2( lmx2, npw, gk, q, dylm(:,:,ipol), ipol )
|
||||
ENDDO
|
||||
!$acc update device( dylm )
|
||||
#endif
|
||||
!
|
||||
u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3)
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
DO lm = 1, lmx2
|
||||
DO ig = 1, npw
|
||||
dylm_u(ig,lm) = u_ipol1*dylm(ig,lm,1) + &
|
||||
u_ipol2*dylm(ig,lm,2) + &
|
||||
u_ipol3*dylm(ig,lm,3)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE( dylm )
|
||||
!
|
||||
DO ig = 1, npw
|
||||
q(ig) = SQRT(q(ig)) * tpiba
|
||||
ENDDO
|
||||
!$acc kernels
|
||||
q(:) = SQRT(q(:)) * tpiba
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc data copyin( tab )
|
||||
DO nt = 1, ntyp
|
||||
! calculate beta in G-space using an interpolation table
|
||||
DO nb = 1, upf(nt)%nbeta
|
||||
nbm = upf(nt)%nbeta
|
||||
!$acc parallel loop collapse(2)
|
||||
DO nb = 1, nbm
|
||||
DO ig = 1, npw
|
||||
px = q(ig)/dq - INT(q(ig)/dq)
|
||||
ux = 1.d0 - px
|
||||
vx = 2.d0 - px
|
||||
wx = 3.d0 - px
|
||||
i0 = q(ig)/dq + 1
|
||||
px = q(ig)/dq - DBLE(INT(q(ig)/dq))
|
||||
ux = 1._DP - px
|
||||
vx = 2._DP - px
|
||||
wx = 3._DP - px
|
||||
i0 = INT(q(ig)/dq) + 1
|
||||
i1 = i0 + 1
|
||||
i2 = i0 + 2
|
||||
i3 = i0 + 3
|
||||
vkb0(ig, nb, nt) = tab(i0, nb, nt) * ux * vx * wx / 6.d0 + &
|
||||
tab(i1, nb, nt) * px * vx * wx / 2.d0 - &
|
||||
tab(i2, nb, nt) * px * ux * wx / 2.d0 + &
|
||||
tab(i3, nb, nt) * px * ux * vx / 6.d0
|
||||
ENDDO
|
||||
ENDDO
|
||||
vkb0(ig,nb,nt) = tab(i0,nb,nt) * ux * vx * wx / 6._DP + &
|
||||
tab(i1,nb,nt) * px * vx * wx / 2._DP - &
|
||||
tab(i2,nb,nt) * px * ux * wx / 2._DP + &
|
||||
tab(i3,nb,nt) * px * ux * vx / 6._DP
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE( q )
|
||||
!$acc end data
|
||||
DEALLOCATE( gk, q )
|
||||
!
|
||||
ALLOCATE( sk(npw) )
|
||||
ALLOCATE( nas(nat), phase(nat) )
|
||||
!
|
||||
ikb = 0
|
||||
ina = 0
|
||||
DO nt = 1, ntyp
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
arg = (xk(1) * tau(1, na) + xk(2) * tau(2, na) &
|
||||
+ xk(3) * tau(3, na) ) * tpi
|
||||
phase = CMPLX( COS(arg), -SIN(arg), KIND=DP )
|
||||
DO ig = 1, npw
|
||||
iig = igk(ig)
|
||||
sk(ig) = eigts1(mill (1,iig), na) * &
|
||||
eigts2(mill (2,iig), na) * &
|
||||
eigts3(mill (3,iig), na) * phase
|
||||
ENDDO
|
||||
!
|
||||
DO ih = 1, nh(nt)
|
||||
nb = indv(ih, nt)
|
||||
l = nhtol(ih, nt)
|
||||
lm = nhtolm(ih, nt)
|
||||
ikb = ikb + 1
|
||||
pref = (0.d0, -1.d0)**l
|
||||
!
|
||||
DO ig = 1, npw
|
||||
dvkb(ig, ikb) = vkb0(ig, nb, nt) * sk(ig) * dylm_u(ig, lm) &
|
||||
* pref / tpiba
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
ina = ina + 1
|
||||
nas(ina) = na
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
IF (ikb /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 )
|
||||
|
||||
ALLOCATE( sk(npw,nat) )
|
||||
!$acc data create( sk )
|
||||
!
|
||||
!$acc data create( phase ) copyin( nas )
|
||||
!
|
||||
!$acc parallel loop copyin( tau )
|
||||
DO ina = 1, nat
|
||||
na = nas(ina)
|
||||
arg = ( xk1 * tau(1,na) + xk2 * tau(2,na) &
|
||||
+ xk3 * tau(3,na) ) * tpi
|
||||
phase(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
|
||||
ENDDO
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ina = 1, nat
|
||||
DO ig = 1, npw
|
||||
!
|
||||
na = nas(ina)
|
||||
iig = igk(ig)
|
||||
mil1 = mill(1,iig)
|
||||
mil2 = mill(2,iig)
|
||||
mil3 = mill(3,iig)
|
||||
sk(ig,na) = eigts1(mil1,na) * &
|
||||
eigts2(mil2,na) * &
|
||||
eigts3(mil3,na) * phase(na)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
ALLOCATE( ihv(nat*nhm), nav(nat*nhm) )
|
||||
!$acc data create( ihv, nav )
|
||||
!
|
||||
ikb_t = 0
|
||||
DO ina = 1, nat
|
||||
na = nas(ina)
|
||||
nht = nh(ityp(na))
|
||||
!$acc kernels
|
||||
DO ih = 1, nht
|
||||
ihv(ikb_t+ih) = ih
|
||||
nav(ikb_t+ih) = na
|
||||
ENDDO
|
||||
!$acc end kernels
|
||||
ikb_t = ikb_t + nht
|
||||
ENDDO
|
||||
!
|
||||
!$acc parallel loop collapse(2) copyin(ityp,indv,nhtol,nhtolm)
|
||||
DO ikb = 1, ikb_t
|
||||
DO ig = 1, npw
|
||||
ih = ihv(ikb)
|
||||
na = nav(ikb)
|
||||
nt = ityp(na)
|
||||
nb = indv(ih,nt)
|
||||
l = nhtol(ih,nt)
|
||||
lm = nhtolm(ih,nt)
|
||||
pref = (0._DP,-1._DP)**l
|
||||
!
|
||||
dvkb(ig,ikb) = CMPLX(vkb0(ig,nb,nt)) * sk(ig,na) * &
|
||||
CMPLX(dylm_u(ig,lm)) * pref / CMPLX(tpiba)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE( ihv, nav )
|
||||
DEALLOCATE( phase, nas )
|
||||
DEALLOCATE( sk )
|
||||
DEALLOCATE( vkb0, dylm_u, gk )
|
||||
!
|
||||
IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 )
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( dylm_u, vkb0 )
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE gen_us_dy_base
|
||||
!
|
||||
|
|
|
@ -1,253 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2021 Quantum ESPRESSSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE gen_us_dy_gpu_ ( npw, npwx, igk_d, xk, nat, tau, ityp, ntyp, &
|
||||
tpiba, omega, nr1, nr2, nr3, eigts1_d, eigts2_d, eigts3_d, &
|
||||
mill_d, g_d, u, dvkb_d )
|
||||
!----------------------------------------------------------------------
|
||||
!! Calculates the kleinman-bylander pseudopotentials with the
|
||||
!! derivative of the spherical harmonics projected on vector u
|
||||
!
|
||||
! AF: more extensive use of GPU-resident vars possible
|
||||
!
|
||||
USE upf_kinds, ONLY: dp
|
||||
USE upf_const, ONLY: tpi
|
||||
USE uspp, ONLY: nkb, indv_d, nhtol_d, nhtolm_d
|
||||
USE uspp_data, ONLY: nqx, tab, tab_d, dq
|
||||
USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm
|
||||
USE device_fbuff_m, ONLY: dev_buf
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw
|
||||
!! number ok plane waves
|
||||
INTEGER, INTENT(IN) :: npwx
|
||||
!! max number ok plane waves across k-points
|
||||
INTEGER, INTENT(IN) :: igk_d(npw)
|
||||
!! indices of plane waves k+G
|
||||
REAL(dp), INTENT(IN) :: xk(3)
|
||||
!! k-point
|
||||
INTEGER, INTENT(IN) :: nat
|
||||
!! number of atoms
|
||||
INTEGER, INTENT(IN) :: ityp(nat)
|
||||
!! index of type per atom
|
||||
INTEGER, INTENT(IN) :: ntyp
|
||||
!! number of atomic types
|
||||
REAL(DP), INTENT(IN) :: tau(3,nat)
|
||||
!! atomic positions (cc alat units)
|
||||
REAL(DP), INTENT(IN) :: tpiba
|
||||
!! rec.lattice units 2pi/a
|
||||
REAL(DP), INTENT(IN) :: omega
|
||||
!! cell volume
|
||||
INTEGER, INTENT(IN) :: nr1,nr2,nr3
|
||||
!! fft dims (dense grid)
|
||||
COMPLEX(DP), INTENT(IN) :: eigts1_d(-nr1:nr1,nat)
|
||||
!! structure factor 1
|
||||
COMPLEX(DP), INTENT(IN) :: eigts2_d(-nr2:nr2,nat)
|
||||
!! structure factor 2
|
||||
COMPLEX(DP), INTENT(IN) :: eigts3_d(-nr3:nr3,nat)
|
||||
!! structure factor 3
|
||||
INTEGER, INTENT(IN) :: mill_d(3,*)
|
||||
!! miller index map
|
||||
REAL(DP), INTENT(IN) :: g_d(3,*)
|
||||
!! g vectors (2pi/a units)
|
||||
REAL(DP), INTENT(IN) :: u(3)
|
||||
!! input: projection vector
|
||||
COMPLEX(DP), INTENT(OUT) :: dvkb_d(npwx, nkb)
|
||||
!! the beta function pseudopotential
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, &
|
||||
i3, ig, nbm, iq, mil1, mil2, mil3, ikb_t, &
|
||||
nht, ina, lmx2
|
||||
INTEGER :: nas(nat), ierr(4)
|
||||
!
|
||||
INTEGER, ALLOCATABLE :: ityp_d(:), ih_d(:), na_d(:), nas_d(:)
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: q(:), dylm(:,:)
|
||||
!
|
||||
REAL(DP), POINTER :: gk_d(:,:)
|
||||
REAL(DP), POINTER :: vkb0_d(:,:,:), dylm_u_d(:,:), dylm_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: q_d(:), tau_d(:,:)
|
||||
! dylm = d Y_lm/dr_i in cartesian axes
|
||||
! dylm_u as above projected on u
|
||||
COMPLEX(DP), ALLOCATABLE :: phase_d(:), sk_d(:,:)
|
||||
!
|
||||
REAL(DP) :: px, ux, vx, wx, arg, u_ipol1, u_ipol2, u_ipol3, xk1, xk2, xk3
|
||||
COMPLEX(DP) :: pref
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: igk_d, mill_d, eigts1_d, eigts2_d, eigts3_d, g_d
|
||||
attributes(DEVICE) :: gk_d, q_d, sk_d, vkb0_d, dylm_u_d, dylm_d, &
|
||||
ityp_d, phase_d, ih_d, na_d, tau_d, nas_d
|
||||
attributes(DEVICE) :: dvkb_d
|
||||
!
|
||||
dvkb_d = (0._DP,0._DP)
|
||||
!
|
||||
IF (lmaxkb <= 0) RETURN
|
||||
!
|
||||
lmx2 = (lmaxkb+1)**2
|
||||
!
|
||||
CALL dev_buf%lock_buffer( dylm_u_d, (/ npw,lmx2 /), ierr(1) )
|
||||
CALL dev_buf%lock_buffer( vkb0_d, (/ npw,nbetam,ntyp /), ierr(2) )
|
||||
CALL dev_buf%lock_buffer( gk_d, (/ 3,npw /), ierr(3) )
|
||||
IF (ANY(ierr /= 0)) CALL upf_error( 'gen_us_dy_gpu', 'cannot allocate buffers', ABS(ierr) )
|
||||
ALLOCATE( q_d(npw) )
|
||||
!
|
||||
xk1 = xk(1)
|
||||
xk2 = xk(2)
|
||||
xk3 = xk(3)
|
||||
!
|
||||
!$cuf kernel do <<<*,*>>>
|
||||
DO ig = 1, npw
|
||||
iig = igk_d(ig)
|
||||
gk_d(1,ig) = xk1 + g_d(1,iig)
|
||||
gk_d(2,ig) = xk2 + g_d(2,iig)
|
||||
gk_d(3,ig) = xk3 + g_d(3,iig)
|
||||
q_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2
|
||||
ENDDO
|
||||
!
|
||||
CALL dev_buf%lock_buffer( dylm_d, (/npw,lmx2,3/), ierr(4) )
|
||||
DO ipol = 1, 3
|
||||
CALL dylmr2_gpu( lmx2, npw, gk_d, q_d, dylm_d(:,:,ipol), ipol )
|
||||
ENDDO
|
||||
!
|
||||
u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3)
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO lm = 1, lmx2
|
||||
DO ig = 1, npw
|
||||
dylm_u_d(ig,lm) = u_ipol1*dylm_d(ig,lm,1) + &
|
||||
u_ipol2*dylm_d(ig,lm,2) + &
|
||||
u_ipol3*dylm_d(ig,lm,3)
|
||||
ENDDO
|
||||
ENDDO
|
||||
CALL dev_buf%release_buffer( dylm_d, ierr(4) )
|
||||
!
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ig = 1, npw
|
||||
q_d(ig) = SQRT(q_d(ig)) * tpiba
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
nbm = upf(nt)%nbeta
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO nb = 1, nbm
|
||||
DO ig = 1, npw
|
||||
px = q_d(ig)/dq - DBLE(INT(q_d(ig)/dq))
|
||||
ux = 1._DP - px
|
||||
vx = 2._DP - px
|
||||
wx = 3._DP - px
|
||||
i0 = INT(q_d(ig)/dq) + 1
|
||||
i1 = i0 + 1
|
||||
i2 = i0 + 2
|
||||
i3 = i0 + 3
|
||||
vkb0_d(ig,nb,nt) = tab_d(i0,nb,nt) * ux * vx * wx / 6._DP + &
|
||||
tab_d(i1,nb,nt) * px * vx * wx / 2._DP - &
|
||||
tab_d(i2,nb,nt) * px * ux * wx / 2._DP + &
|
||||
tab_d(i3,nb,nt) * px * ux * vx / 6._DP
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( q_d )
|
||||
!
|
||||
ALLOCATE( ityp_d(nat), nas_d(nat) )
|
||||
ityp_d = ityp
|
||||
ALLOCATE( tau_d(3,nat) )
|
||||
tau_d = tau
|
||||
ALLOCATE( phase_d(nat) )
|
||||
!
|
||||
ina = 0
|
||||
DO nt = 1, ntyp
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
ina = ina + 1
|
||||
nas(ina) = na
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
nas_d = nas
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ina = 1, nat
|
||||
na = nas_d(ina)
|
||||
arg = (xk1 * tau_d(1,na) + xk2 * tau_d(2,na) &
|
||||
+ xk3 * tau_d(3,na) ) * tpi
|
||||
phase_d(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP )
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( tau_d )
|
||||
!
|
||||
ALLOCATE( sk_d(npw,nat) )
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO ina = 1, nat
|
||||
DO ig = 1, npw
|
||||
!
|
||||
na = nas_d(ina)
|
||||
iig = igk_d(ig)
|
||||
mil1 = mill_d(1,iig)
|
||||
mil2 = mill_d(2,iig)
|
||||
mil3 = mill_d(3,iig)
|
||||
sk_d(ig,na) = eigts1_d(mil1,na) * &
|
||||
eigts2_d(mil2,na) * &
|
||||
eigts3_d(mil3,na) * phase_d(na)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( phase_d )
|
||||
!
|
||||
ALLOCATE( ih_d(nat*nhm), na_d(nat*nhm) )
|
||||
!
|
||||
ikb_t = 0
|
||||
DO ina = 1, nat
|
||||
na = nas(ina)
|
||||
nht = nh(ityp(na))
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ih = 1, nht
|
||||
ih_d(ikb_t+ih) = ih
|
||||
na_d(ikb_t+ih) = na
|
||||
ENDDO
|
||||
ikb_t = ikb_t + nht
|
||||
ENDDO
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO ikb = 1, ikb_t
|
||||
DO ig = 1, npw
|
||||
ih = ih_d(ikb)
|
||||
na = na_d(ikb)
|
||||
nt = ityp_d(na)
|
||||
nb = indv_d(ih,nt)
|
||||
l = nhtol_d(ih,nt)
|
||||
lm = nhtolm_d(ih,nt)
|
||||
pref = (0._DP, -1._DP)**l
|
||||
!
|
||||
dvkb_d(ig,ikb) = CMPLX(vkb0_d(ig,nb,nt)) * sk_d(ig,na) * &
|
||||
CMPLX(dylm_u_d(ig,lm)) * pref / CMPLX(tpiba)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( sk_d )
|
||||
!
|
||||
IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 )
|
||||
!
|
||||
CALL dev_buf%release_buffer( dylm_u_d, ierr(1) )
|
||||
CALL dev_buf%release_buffer( vkb0_d, ierr(2) )
|
||||
CALL dev_buf%release_buffer( gk_d, ierr(3) )
|
||||
!
|
||||
DEALLOCATE( ih_d, na_d, nas_d )
|
||||
#endif
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE gen_us_dy_gpu_
|
146
upflib/gth.f90
146
upflib/gth.f90
|
@ -14,7 +14,7 @@ module m_gth
|
|||
!
|
||||
private
|
||||
public :: gth_parameters, readgth, vloc_gth, dvloc_gth, &
|
||||
dvloc_gth_gpu, setlocq_gth, mk_ffnl_gth, mk_dffnl_gth, deallocate_gth
|
||||
setlocq_gth, mk_ffnl_gth, mk_dffnl_gth, deallocate_gth
|
||||
!
|
||||
type gth_parameters
|
||||
integer :: itype, lloc, lmax
|
||||
|
@ -340,94 +340,9 @@ subroutine vloc_gth(itype, zion, tpiba2, ngl, gl, omega, vloc)
|
|||
vloc (:) = vloc(:) * fact
|
||||
!
|
||||
end subroutine vloc_gth
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine dvloc_gth(itype, zion, tpiba2, ngl, gl, omega, dvloc)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
|
||||
!
|
||||
USE upf_kinds, ONLY: dp
|
||||
USE upf_const, ONLY: pi, tpi, e2, eps8
|
||||
|
||||
implicit none
|
||||
!
|
||||
! I/O
|
||||
integer, intent(in) :: itype, ngl
|
||||
real(dp), intent(in) :: zion, tpiba2, omega, gl (ngl)
|
||||
real(dp), intent(out) :: dvloc (ngl)
|
||||
!
|
||||
! Local variables
|
||||
integer :: ii, my_gth, igl, igl0
|
||||
real(dp) :: cc1, cc2, cc3, cc4, rloc, &
|
||||
gx, gx2, gx3, rl2, rl3, rq2, r2q, r4g3, r6g5, e_rq2h, fact
|
||||
!
|
||||
! IF ( do_comp_esm ) call upf_error('vloc_gth', 'ESM not implemented', itype)
|
||||
!
|
||||
! Find gtp param. set for type itype
|
||||
my_gth=0
|
||||
do ii=1,size(gth_p)
|
||||
if (gth_p(ii)%itype==itype) then
|
||||
my_gth=ii
|
||||
exit
|
||||
endif
|
||||
enddo
|
||||
if (my_gth==0) call upf_error('dvloc_gth', 'cannot map itype in some gtp param. set', itype)
|
||||
rloc=gth_p(my_gth)%rloc
|
||||
cc1=gth_p(my_gth)%cc(1)
|
||||
cc2=gth_p(my_gth)%cc(2)
|
||||
cc3=gth_p(my_gth)%cc(3)
|
||||
cc4=gth_p(my_gth)%cc(4)
|
||||
|
||||
! Compute vloc(q)
|
||||
if (gl (1) < eps8) then
|
||||
!
|
||||
! first the G=0 term
|
||||
!
|
||||
dvloc (1) = 0._dp
|
||||
igl0 = 2
|
||||
else
|
||||
igl0 = 1
|
||||
endif
|
||||
!
|
||||
! here the G<>0 terms, we first compute the part of the integrand
|
||||
! function independent of |G| in real space
|
||||
!
|
||||
do igl = igl0, ngl
|
||||
gx = sqrt (gl (igl) * tpiba2)
|
||||
gx2 = gx**2
|
||||
gx3 = gx*gx2
|
||||
rl2 = rloc**2
|
||||
rl3 = rloc*rl2
|
||||
rq2 = gx2*rl2
|
||||
r2q = gx*rl2
|
||||
r4g3 = rl2*rl2*gx3
|
||||
r6g5 = r4g3*rl2*gx2
|
||||
e_rq2h = exp(-0.5_dp*rq2)
|
||||
dvloc (igl) = &
|
||||
e_rq2h*(zion*(rq2+2._dp)/gx3 + sqrt(pi/2._dp)*rl3* &
|
||||
( &
|
||||
( &
|
||||
- 2._dp*r2q* (cc2+10._dp*cc3+105._dp*cc4) &
|
||||
+ 4._dp*r4g3*(cc3+21._dp*cc4) &
|
||||
- 6._dp*r6g5* cc4 &
|
||||
) - r2q*( &
|
||||
cc1 + &
|
||||
cc2*(3._dp-rq2) + &
|
||||
cc3*(15._dp-10._dp*rq2+rq2**2) + &
|
||||
cc4*(105._dp-rq2*(105._dp-rq2*(21._dp-rq2))) &
|
||||
) &
|
||||
) &
|
||||
)/gx
|
||||
enddo
|
||||
!
|
||||
fact = tpi * e2 / omega
|
||||
dvloc (:) = dvloc(:) * fact
|
||||
!
|
||||
end subroutine dvloc_gth
|
||||
!
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
|
||||
subroutine dvloc_gth( itype, zion, tpiba2, ngl, gl, omega, dvloc )
|
||||
!------------------------------------------------------------------------------
|
||||
!! GPU version of 'dvloc_gth' from 'Modules/gth.f90'
|
||||
!! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g
|
||||
|
@ -435,34 +350,31 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
|
|||
USE upf_kinds, ONLY : DP
|
||||
USE upf_const, ONLY : pi, tpi, e2, eps8
|
||||
!
|
||||
IMPLICIT NONE
|
||||
implicit none
|
||||
!
|
||||
! I/O
|
||||
INTEGER, INTENT(IN) :: itype, ngl
|
||||
REAL(DP), INTENT(IN) :: zion, tpiba2, omega
|
||||
!
|
||||
REAL(DP), INTENT(IN) :: gl_d(ngl)
|
||||
REAL(DP), INTENT(OUT) :: dvloc_d(ngl)
|
||||
integer, intent(in) :: itype, ngl
|
||||
real(dp), intent(in) :: zion, tpiba2, omega
|
||||
real(dp), intent(in) :: gl(ngl)
|
||||
real(dp), intent(out) :: dvloc(ngl)
|
||||
!
|
||||
! Local variables
|
||||
INTEGER :: ii, my_gth, igl, igl0
|
||||
REAL(DP) :: cc1, cc2, cc3, cc4, rloc, gl1, &
|
||||
integer :: ii, my_gth, igl, igl0
|
||||
real(dp) :: cc1, cc2, cc3, cc4, rloc, gl1, &
|
||||
gx, gx2, gx3, rl2, rl3, rq2, r2q, r4g3, r6g5, e_rq2h, fact
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: dvloc_d, gl_d
|
||||
#endif
|
||||
!$acc data present( dvloc, gl )
|
||||
!
|
||||
! IF ( do_comp_esm ) call upf_error('vloc_gth', 'ESM not implemented', itype)
|
||||
!
|
||||
! Find gtp param. set for type itype
|
||||
my_gth = 0
|
||||
DO ii = 1, SIZE(gth_p)
|
||||
IF (gth_p(ii)%itype==itype) THEN
|
||||
do ii = 1, SIZE(gth_p)
|
||||
if (gth_p(ii)%itype==itype) then
|
||||
my_gth = ii
|
||||
EXIT
|
||||
ENDIF
|
||||
ENDDO
|
||||
exit
|
||||
endif
|
||||
enddo
|
||||
!
|
||||
IF ( my_gth==0 ) CALL upf_error( 'dvloc_gth', 'cannot map itype in some gtp param. set', itype )
|
||||
rloc = gth_p(my_gth)%rloc
|
||||
|
@ -472,25 +384,27 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
|
|||
cc4 = gth_p(my_gth)%cc(4)
|
||||
!
|
||||
! Compute vloc(q)
|
||||
gl1 = gl_d(1)
|
||||
IF (gl1 < eps8) THEN
|
||||
gl1 = gl(1)
|
||||
if (gl1 < eps8) then
|
||||
!
|
||||
! first the G=0 term
|
||||
!
|
||||
dvloc_d(1) = 0.0_DP
|
||||
!$acc kernels
|
||||
dvloc(1) = 0.0_DP
|
||||
!$acc end kernels
|
||||
igl0 = 2
|
||||
ELSE
|
||||
else
|
||||
igl0 = 1
|
||||
ENDIF
|
||||
endif
|
||||
!
|
||||
! here the G<>0 terms, we first compute the part of the integrand
|
||||
! function independent of |G| in real space
|
||||
!
|
||||
fact = tpi * e2 / omega
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO igl = igl0, ngl
|
||||
gx = SQRT(gl_d(igl) * tpiba2)
|
||||
!$acc parallel loop
|
||||
do igl = igl0, ngl
|
||||
gx = SQRT(gl(igl) * tpiba2)
|
||||
gx2 = gx**2
|
||||
gx3 = gx*gx2
|
||||
rl2 = rloc**2
|
||||
|
@ -500,7 +414,7 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
|
|||
r4g3 = rl2*rl2*gx3
|
||||
r6g5 = r4g3*rl2*gx2
|
||||
e_rq2h = EXP(-0.5_DP*rq2)
|
||||
dvloc_d(igl) = fact * &
|
||||
dvloc(igl) = fact * &
|
||||
e_rq2h*(zion*(rq2+2._DP)/gx3 + SQRT(pi/2._DP)*rl3* &
|
||||
( &
|
||||
( &
|
||||
|
@ -515,11 +429,11 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d )
|
|||
) &
|
||||
) &
|
||||
)/gx
|
||||
ENDDO
|
||||
enddo
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
END SUBROUTINE dvloc_gth_gpu
|
||||
!
|
||||
end subroutine dvloc_gth
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine setlocq_gth(itype, xq, zion, tpiba2, ngm, g, omega, vloc)
|
||||
|
|
|
@ -30,8 +30,8 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm )
|
|||
USE atom, ONLY : rgrid
|
||||
USE uspp, ONLY : nhtol, nhtoj, nhtolm, ijtoh, dvan, qq_at, qq_nt, indv, &
|
||||
ap, aainit, qq_so, dvan_so, okvan, ofsbeta, &
|
||||
nhtol_d, nhtoj_d, nhtolm_d, ijtoh_d, dvan_d, qq_at_d, &
|
||||
qq_nt_d, indv_d, qq_so_d, dvan_so_d, ofsbeta_d
|
||||
nhtol_d, nhtoj_d, nhtolm_d, ijtoh_d, dvan_d, &
|
||||
qq_nt_d, indv_d, dvan_so_d, ofsbeta_d
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh, nhm, lmaxkb, nsp
|
||||
USE upf_spinorb, ONLY : is_spinorbit, rot_ylm, fcoef, fcoef_d, lmaxx
|
||||
USE paw_variables,ONLY : okpaw
|
||||
|
@ -320,10 +320,8 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm )
|
|||
nhtolm_d=nhtolm
|
||||
nhtoj_d=nhtoj
|
||||
ijtoh_d=ijtoh
|
||||
qq_at_d=qq_at
|
||||
qq_nt_d=qq_nt
|
||||
if (is_spinorbit) then
|
||||
qq_so_d=qq_so
|
||||
dvan_so_d=dvan_so
|
||||
fcoef_d=fcoef
|
||||
else
|
||||
|
@ -333,6 +331,13 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm )
|
|||
ofsbeta_d=ofsbeta
|
||||
!
|
||||
#endif
|
||||
!
|
||||
if (nhm>0) then
|
||||
!$acc update device(qq_at)
|
||||
if (is_spinorbit) then
|
||||
!$acc update device(qq_so)
|
||||
endif
|
||||
endif
|
||||
!
|
||||
call stop_clock ('init_us_1')
|
||||
return
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE simpson(mesh, func, rab, asum)
|
||||
!$acc routine vector
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! simpson's rule integration. On input:
|
||||
|
@ -22,27 +23,30 @@ SUBROUTINE simpson(mesh, func, rab, asum)
|
|||
!
|
||||
USE upf_kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(in) :: mesh
|
||||
real(DP), INTENT(in) :: rab (mesh), func (mesh)
|
||||
INTEGER, INTENT(in) :: mesh
|
||||
real(DP), INTENT(in) :: rab(mesh), func(mesh)
|
||||
real(DP), INTENT(out):: asum
|
||||
!
|
||||
real(DP) :: f1, f2, f3, r12
|
||||
real(DP) :: f1, f2, f3, r12, fct
|
||||
INTEGER :: i
|
||||
!
|
||||
asum = 0.0d0
|
||||
r12 = 1.0d0 / 3.0d0
|
||||
f3 = func (1) * rab (1) * r12
|
||||
|
||||
DO i = 2, mesh - 1, 2
|
||||
f1 = f3
|
||||
f2 = func (i) * rab (i) * r12
|
||||
f3 = func (i + 1) * rab (i + 1) * r12
|
||||
asum = asum + f1 + 4.0d0 * f2 + f3
|
||||
!
|
||||
!$acc loop vector reduction(+:asum)
|
||||
DO i = 2, mesh-1
|
||||
fct = DBLE(ABS(MOD(i,2)-2)*2)
|
||||
asum = asum + fct * func(i) * rab(i)
|
||||
ENDDO
|
||||
IF (MOD(mesh,2)==1) THEN
|
||||
asum = (asum + func(1)*rab(1) + func(mesh)*rab(mesh)) * r12
|
||||
ELSE
|
||||
asum = (asum + func(1)*rab(1) - func(mesh-1)*rab(mesh-1)) * r12
|
||||
ENDIF
|
||||
!
|
||||
! if mesh is not odd, use open formula instead:
|
||||
! ... 2/3*f(n-5) + 4/3*f(n-4) + 13/12*f(n-3) + 0*f(n-2) + 27/12*f(n-1)
|
||||
!!! Under testing
|
||||
!** Under testing
|
||||
!
|
||||
!IF ( MOD(mesh,2) == 0 ) THEN
|
||||
! print *, 'mesh even: correction:', f1*5.d0/4.d0-4.d0*f2+23.d0*f3/4.d0, &
|
||||
|
|
|
@ -8,11 +8,12 @@
|
|||
!
|
||||
!--------------------------------------------------------------------
|
||||
subroutine sph_bes (msh, r, q, l, jl)
|
||||
!$acc routine vector
|
||||
!--------------------------------------------------------------------
|
||||
!! Spherical Bessel function.
|
||||
!
|
||||
USE upf_kinds, only: DP
|
||||
USE upf_const, ONLY : eps14
|
||||
USE upf_const, only: eps14
|
||||
!
|
||||
implicit none
|
||||
!
|
||||
|
@ -29,8 +30,8 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
!
|
||||
! xseries = convergence radius of the series for small x of j_l(x)
|
||||
real(DP) :: x, xl, xseries = 0.05_dp
|
||||
integer :: ir, ir0
|
||||
integer, external:: semifact
|
||||
integer :: i, ir, ir0
|
||||
integer :: semifact
|
||||
!
|
||||
#if defined (__MASS)
|
||||
real(DP) :: qr(msh), sin_qr(msh), cos_qr(msh)
|
||||
|
@ -40,11 +41,17 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
|
||||
if (abs (q) < eps14) then
|
||||
if (l == -1) then
|
||||
call upf_error ('sph_bes', 'j_{-1}(0) ?!?', 1)
|
||||
stop !call upf_error ('sph_bes', 'j_{-1}(0) ?!?', 1)
|
||||
elseif (l == 0) then
|
||||
jl(:) = 1.d0
|
||||
!$acc loop vector
|
||||
do ir = 1, msh
|
||||
jl(ir) = 1.d0
|
||||
enddo
|
||||
else
|
||||
jl(:) = 0.d0
|
||||
!$acc loop vector
|
||||
do ir = 1, msh
|
||||
jl(ir) = 0.d0
|
||||
enddo
|
||||
endif
|
||||
return
|
||||
end if
|
||||
|
@ -52,7 +59,7 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
! case l=-1
|
||||
|
||||
if (l == - 1) then
|
||||
if (abs (q * r (1) ) < eps14) call upf_error ('sph_bes', 'j_{-1}(0) ?!?',1)
|
||||
if (abs (q * r (1) ) < eps14) stop !call upf_error ('sph_bes', 'j_{-1}(0) ?!?',1)
|
||||
|
||||
#if defined (__MASS)
|
||||
|
||||
|
@ -61,9 +68,10 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
jl = cos_qr / qr
|
||||
|
||||
#else
|
||||
|
||||
jl (:) = cos (q * r (:) ) / (q * r (:) )
|
||||
|
||||
!$acc loop vector
|
||||
do ir = 1, msh
|
||||
jl (ir) = cos (q * r (ir) ) / (q * r (ir) )
|
||||
enddo
|
||||
#endif
|
||||
|
||||
return
|
||||
|
@ -75,6 +83,7 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
! notice that for small q it may happen that q*r(msh) < xseries !
|
||||
|
||||
ir0 = msh+1
|
||||
!$acc loop vector
|
||||
do ir = 1, msh
|
||||
if ( abs (q * r (ir) ) > xseries ) then
|
||||
ir0 = ir
|
||||
|
@ -82,6 +91,7 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
end if
|
||||
end do
|
||||
|
||||
!$acc loop vector
|
||||
do ir = 1, ir0 - 1
|
||||
x = q * r (ir)
|
||||
if ( l == 0 ) then
|
||||
|
@ -89,11 +99,18 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
else
|
||||
xl = x**l
|
||||
end if
|
||||
jl (ir) = xl/semifact(2*l+1) * &
|
||||
( 1.0_dp - x**2/1.0_dp/2.0_dp/(2.0_dp*l+3) * &
|
||||
( 1.0_dp - x**2/2.0_dp/2.0_dp/(2.0_dp*l+5) * &
|
||||
( 1.0_dp - x**2/3.0_dp/2.0_dp/(2.0_dp*l+7) * &
|
||||
( 1.0_dp - x**2/4.0_dp/2.0_dp/(2.0_dp*l+9) ) ) ) )
|
||||
!--
|
||||
semifact = 1
|
||||
!$acc loop seq reduction(*:semifact)
|
||||
do i = 2*l+1, 1, -2
|
||||
semifact = i*semifact
|
||||
enddo
|
||||
!---
|
||||
jl (ir) = xl/DBLE(semifact) * &
|
||||
( 1.0_dp - x**2/1.0_dp/2.0_dp/DBLE(2*l+3) * &
|
||||
( 1.0_dp - x**2/2.0_dp/2.0_dp/DBLE(2*l+5) * &
|
||||
( 1.0_dp - x**2/3.0_dp/2.0_dp/DBLE(2*l+7) * &
|
||||
( 1.0_dp - x**2/4.0_dp/2.0_dp/DBLE(2*l+9) ) ) ) )
|
||||
end do
|
||||
|
||||
! the following shouldn't be needed but do you trust compilers
|
||||
|
@ -110,9 +127,10 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
jl (ir0:) = sin_qr(ir0:) / (q * r (ir0:) )
|
||||
|
||||
#else
|
||||
|
||||
jl (ir0:) = sin (q * r (ir0:) ) / (q * r (ir0:) )
|
||||
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = sin (q * r (ir) ) / (q * r (ir) )
|
||||
enddo
|
||||
#endif
|
||||
|
||||
elseif (l == 1) then
|
||||
|
@ -126,10 +144,11 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
cos_qr(ir0:) ) / (q * r (ir0:) )
|
||||
|
||||
#else
|
||||
|
||||
jl (ir0:) = (sin (q * r (ir0:) ) / (q * r (ir0:) ) - &
|
||||
cos (q * r (ir0:) ) ) / (q * r (ir0:) )
|
||||
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = (sin (q * r (ir) ) / (q * r (ir) ) - &
|
||||
cos (q * r (ir) ) ) / (q * r (ir) )
|
||||
enddo
|
||||
#endif
|
||||
|
||||
elseif (l == 2) then
|
||||
|
@ -143,10 +162,11 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
3.d0 * cos_qr(ir0:) ) / (q*r(ir0:))**2
|
||||
|
||||
#else
|
||||
|
||||
jl (ir0:) = ( (3.d0 / (q*r(ir0:)) - (q*r(ir0:)) ) * sin (q*r(ir0:)) - &
|
||||
3.d0 * cos (q*r(ir0:)) ) / (q*r(ir0:))**2
|
||||
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = ( (3.d0 / (q*r(ir)) - (q*r(ir)) ) * sin (q*r(ir)) - &
|
||||
3.d0 * cos (q*r(ir)) ) / (q*r(ir))**2
|
||||
enddo
|
||||
#endif
|
||||
|
||||
elseif (l == 3) then
|
||||
|
@ -162,12 +182,13 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
(q*r(ir0:))**3
|
||||
|
||||
#else
|
||||
|
||||
jl (ir0:) = (sin (q*r(ir0:)) * &
|
||||
(15.d0 / (q*r(ir0:)) - 6.d0 * (q*r(ir0:)) ) + &
|
||||
cos (q*r(ir0:)) * ( (q*r(ir0:))**2 - 15.d0) ) / &
|
||||
(q*r(ir0:)) **3
|
||||
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = (sin (q*r(ir)) * &
|
||||
(15.d0 / (q*r(ir)) - 6.d0 * (q*r(ir)) ) + &
|
||||
cos (q*r(ir)) * ( (q*r(ir))**2 - 15.d0) ) / &
|
||||
(q*r(ir)) **3
|
||||
enddo
|
||||
#endif
|
||||
|
||||
elseif (l == 4) then
|
||||
|
@ -184,12 +205,14 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
(q*r(ir0:))**5
|
||||
|
||||
#else
|
||||
|
||||
jl (ir0:) = (sin (q*r(ir0:)) * &
|
||||
(105.d0 - 45.d0 * (q*r(ir0:))**2 + (q*r(ir0:))**4) + &
|
||||
cos (q*r(ir0:)) * &
|
||||
(10.d0 * (q*r(ir0:))**3 - 105.d0 * (q*r(ir0:))) ) / &
|
||||
(q*r(ir0:))**5
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = (sin (q*r(ir)) * &
|
||||
(105.d0 - 45.d0 * (q*r(ir))**2 + (q*r(ir))**4) + &
|
||||
cos (q*r(ir)) * &
|
||||
(10.d0 * (q*r(ir))**3 - 105.d0 * (q*r(ir))) ) / &
|
||||
(q*r(ir))**5
|
||||
enddo
|
||||
#endif
|
||||
|
||||
elseif (l == 5) then
|
||||
|
@ -205,12 +228,15 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
(420.d0*sin_qr(ir0:)) / (q*r(ir0:)) ** 3 + &
|
||||
( 15.d0*sin_qr(ir0:)) / (q*r(ir0:)) ) / (q*r(ir0:))
|
||||
#else
|
||||
jl (ir0:) = (-cos(q*r(ir0:)) - &
|
||||
(945.d0*cos(q*r(ir0:))) / (q*r(ir0:)) ** 4 + &
|
||||
(105.d0*cos(q*r(ir0:))) / (q*r(ir0:)) ** 2 + &
|
||||
(945.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ** 5 - &
|
||||
(420.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ** 3 + &
|
||||
( 15.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ) / (q*r(ir0:))
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = (-cos(q*r(ir)) - &
|
||||
(945.d0*cos(q*r(ir))) / (q*r(ir)) ** 4 + &
|
||||
(105.d0*cos(q*r(ir))) / (q*r(ir)) ** 2 + &
|
||||
(945.d0*sin(q*r(ir))) / (q*r(ir)) ** 5 - &
|
||||
(420.d0*sin(q*r(ir))) / (q*r(ir)) ** 3 + &
|
||||
( 15.d0*sin(q*r(ir))) / (q*r(ir)) ) / (q*r(ir))
|
||||
enddo
|
||||
#endif
|
||||
|
||||
elseif (l == 6) then
|
||||
|
@ -228,19 +254,21 @@ subroutine sph_bes (msh, r, q, l, jl)
|
|||
( 4725.d0*sin_qr(ir0:)) / (q*r(ir0:))**4 + &
|
||||
( 210.d0*sin_qr(ir0:)) / (q*r(ir0:))**2 ) / (q*r(ir0:))
|
||||
#else
|
||||
|
||||
jl (ir0:) = ((-10395.d0*cos(q*r(ir0:))) / (q*r(ir0:))**5 + &
|
||||
( 1260.d0*cos(q*r(ir0:))) / (q*r(ir0:))**3 - &
|
||||
( 21.d0*cos(q*r(ir0:))) / (q*r(ir0:)) - &
|
||||
sin(q*r(ir0:)) + &
|
||||
( 10395.d0*sin(q*r(ir0:))) / (q*r(ir0:))**6 - &
|
||||
( 4725.d0*sin(q*r(ir0:))) / (q*r(ir0:))**4 + &
|
||||
( 210.d0*sin(q*r(ir0:))) / (q*r(ir0:))**2 ) / (q*r(ir0:))
|
||||
!$acc loop vector
|
||||
do ir = ir0, msh
|
||||
jl (ir) = ((-10395.d0*cos(q*r(ir))) / (q*r(ir))**5 + &
|
||||
( 1260.d0*cos(q*r(ir))) / (q*r(ir))**3 - &
|
||||
( 21.d0*cos(q*r(ir))) / (q*r(ir)) - &
|
||||
sin(q*r(ir)) + &
|
||||
( 10395.d0*sin(q*r(ir))) / (q*r(ir))**6 - &
|
||||
( 4725.d0*sin(q*r(ir))) / (q*r(ir))**4 + &
|
||||
( 210.d0*sin(q*r(ir))) / (q*r(ir))**2 ) / (q*r(ir))
|
||||
enddo
|
||||
#endif
|
||||
|
||||
else
|
||||
|
||||
call upf_error ('sph_bes', 'not implemented', abs(l))
|
||||
stop !call upf_error ('sph_bes', 'not implemented', abs(l))
|
||||
|
||||
endif
|
||||
!
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
!
|
||||
! Copyright (C) 2001-2022 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
|
||||
MODULE upf_acc_interfaces
|
||||
!
|
||||
INTERFACE sph_bes_acc
|
||||
SUBROUTINE sph_bes( msh, r, q, l, jl )
|
||||
!$acc routine vector
|
||||
IMPLICIT NONE
|
||||
INTEGER :: msh, l
|
||||
REAL(8) :: r(msh), q, jl(msh)
|
||||
END SUBROUTINE
|
||||
END INTERFACE
|
||||
!
|
||||
INTERFACE
|
||||
SUBROUTINE simpson( mesh, func, rab, asum )
|
||||
!$acc routine vector
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: mesh
|
||||
REAL(8), INTENT(IN) :: rab(mesh), func(mesh)
|
||||
REAL(8), INTENT(OUT):: asum
|
||||
END SUBROUTINE
|
||||
END INTERFACE
|
||||
!
|
||||
END MODULE
|
|
@ -30,11 +30,10 @@ MODULE uspp
|
|||
nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, &
|
||||
becsum, ebecsum
|
||||
PUBLIC :: lpx_d, lpl_d, ap_d, indv_d, nhtol_d, nhtolm_d, ofsbeta_d, &
|
||||
dvan_d, deeq_d, qq_at_d, qq_nt_d, nhtoj_d, ijtoh_d, &
|
||||
becsum_d, ebecsum_d
|
||||
dvan_d, qq_nt_d, nhtoj_d, ijtoh_d, becsum_d, ebecsum_d
|
||||
PUBLIC :: okvan, nlcc_any
|
||||
PUBLIC :: qq_so, dvan_so, deeq_nc, fcoef
|
||||
PUBLIC :: qq_so_d, dvan_so_d, deeq_nc_d, fcoef_d
|
||||
PUBLIC :: dvan_so_d, fcoef_d
|
||||
PUBLIC :: dbeta
|
||||
!
|
||||
PUBLIC :: allocate_uspp, deallocate_uspp
|
||||
|
@ -110,16 +109,12 @@ MODULE uspp
|
|||
REAL(DP), ALLOCATABLE :: becsum_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: ebecsum_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: dvan_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: deeq_d(:,:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: qq_nt_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: qq_at_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: nhtoj_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: qq_so_d(:,:,:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: deeq_nc_d(:,:,:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, deeq_d, qq_nt_d, &
|
||||
qq_at_d, nhtoj_d, qq_so_d, dvan_so_d, deeq_nc_d
|
||||
attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, qq_nt_d, &
|
||||
nhtoj_d, dvan_so_d
|
||||
#endif
|
||||
|
||||
!
|
||||
|
@ -356,15 +351,19 @@ CONTAINS
|
|||
allocate( nhtoj(nhm,nsp) )
|
||||
allocate( ijtoh(nhm,nhm,nsp) )
|
||||
allocate( deeq(nhm,nhm,nat,nspin) )
|
||||
!$acc enter data create(deeq)
|
||||
if ( noncolin ) then
|
||||
allocate( deeq_nc(nhm,nhm,nat,nspin) )
|
||||
!$acc enter data create(deeq_nc)
|
||||
endif
|
||||
allocate( qq_at(nhm,nhm,nat) )
|
||||
!$acc enter data create(qq_at)
|
||||
allocate( qq_nt(nhm,nhm,nsp) )
|
||||
! set the internal spin-orbit flag
|
||||
is_spinorbit = lspinorb
|
||||
if ( lspinorb ) then
|
||||
allocate( qq_so(nhm,nhm,4,nsp) )
|
||||
!$acc enter data create(qq_so)
|
||||
allocate( dvan_so(nhm,nhm,nspin,nsp) )
|
||||
allocate( fcoef(nhm,nhm,2,2,nsp) )
|
||||
else
|
||||
|
@ -386,14 +385,8 @@ CONTAINS
|
|||
allocate( nhtolm_d(nhm,nsp) )
|
||||
allocate( nhtoj_d(nhm,nsp) )
|
||||
allocate( ijtoh_d(nhm,nhm,nsp) )
|
||||
allocate( deeq_d(nhm,nhm,nat,nspin) )
|
||||
if ( noncolin ) then
|
||||
allocate( deeq_nc_d(nhm,nhm,nat,nspin) )
|
||||
endif
|
||||
allocate( qq_at_d(nhm,nhm,nat) )
|
||||
allocate( qq_nt_d(nhm,nhm,nsp) )
|
||||
if ( lspinorb ) then
|
||||
allocate( qq_so_d(nhm,nhm,4,nsp) )
|
||||
allocate( dvan_so_d(nhm,nhm,nspin,nsp) )
|
||||
allocate( fcoef_d(nhm,nhm,2,2,nsp) )
|
||||
else
|
||||
|
@ -419,22 +412,34 @@ CONTAINS
|
|||
IF( ALLOCATED( indv ) ) DEALLOCATE( indv )
|
||||
IF( ALLOCATED( nhtolm ) ) DEALLOCATE( nhtolm )
|
||||
IF( ALLOCATED( nhtoj ) ) DEALLOCATE( nhtoj )
|
||||
IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta )
|
||||
IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta )
|
||||
IF( ALLOCATED( ijtoh ) ) DEALLOCATE( ijtoh )
|
||||
!FIXME in order to be created and deleted automatically by using !$acc declare create(vkb) in
|
||||
IF( ALLOCATED( vkb ) ) THEN
|
||||
!$acc exit data delete(vkb )
|
||||
!$acc exit data delete(vkb )
|
||||
DEALLOCATE( vkb )
|
||||
END IF
|
||||
IF( ALLOCATED( becsum ) ) DEALLOCATE( becsum )
|
||||
IF( ALLOCATED( ebecsum ) ) DEALLOCATE( ebecsum )
|
||||
IF( ALLOCATED( qq_at ) ) DEALLOCATE( qq_at )
|
||||
IF( ALLOCATED( qq_at ) ) THEN
|
||||
!$acc exit data delete( qq_at )
|
||||
DEALLOCATE( qq_at )
|
||||
ENDIF
|
||||
IF( ALLOCATED( qq_nt ) ) DEALLOCATE( qq_nt )
|
||||
IF( ALLOCATED( dvan ) ) DEALLOCATE( dvan )
|
||||
IF( ALLOCATED( deeq ) ) DEALLOCATE( deeq )
|
||||
IF( ALLOCATED( qq_so ) ) DEALLOCATE( qq_so )
|
||||
IF( ALLOCATED( deeq ) ) THEN
|
||||
!$acc exit data delete( deeq )
|
||||
DEALLOCATE( deeq )
|
||||
ENDIF
|
||||
IF( ALLOCATED( qq_so ) ) THEN
|
||||
!$acc exit data delete( qq_so )
|
||||
DEALLOCATE( qq_so )
|
||||
ENDIF
|
||||
IF( ALLOCATED( dvan_so ) ) DEALLOCATE( dvan_so )
|
||||
IF( ALLOCATED( deeq_nc ) ) DEALLOCATE( deeq_nc )
|
||||
IF( ALLOCATED( deeq_nc ) ) THEN
|
||||
!$acc exit data delete( deeq_nc )
|
||||
DEALLOCATE( deeq_nc )
|
||||
ENDIF
|
||||
IF( ALLOCATED( fcoef ) ) DEALLOCATE( fcoef )
|
||||
IF( ALLOCATED( beta ) ) DEALLOCATE( beta )
|
||||
IF( ALLOCATED( dbeta ) ) DEALLOCATE( dbeta )
|
||||
|
@ -451,13 +456,9 @@ CONTAINS
|
|||
IF( ALLOCATED( becsum_d ) ) DEALLOCATE( becsum_d )
|
||||
IF( ALLOCATED( ebecsum_d ) ) DEALLOCATE( ebecsum_d )
|
||||
IF( ALLOCATED( dvan_d ) ) DEALLOCATE( dvan_d )
|
||||
IF( ALLOCATED( deeq_d ) ) DEALLOCATE( deeq_d )
|
||||
IF( ALLOCATED( qq_nt_d ) ) DEALLOCATE( qq_nt_d )
|
||||
IF( ALLOCATED( qq_at_d ) ) DEALLOCATE( qq_at_d )
|
||||
IF( ALLOCATED( nhtoj_d ) ) DEALLOCATE( nhtoj_d )
|
||||
IF( ALLOCATED( qq_so_d ) ) DEALLOCATE( qq_so_d )
|
||||
IF( ALLOCATED( dvan_so_d ) ) DEALLOCATE( dvan_so_d )
|
||||
IF( ALLOCATED( deeq_nc_d ) ) DEALLOCATE( deeq_nc_d )
|
||||
IF( ALLOCATED( fcoef_d ) ) DEALLOCATE( fcoef_d )
|
||||
!
|
||||
END SUBROUTINE deallocate_uspp
|
||||
|
|
Loading…
Reference in New Issue