Update stres_hub with the new calbec

This commit is contained in:
Ivan Carnimeo 2023-10-18 12:50:11 +02:00
parent 672aefe53f
commit 599186508c
1 changed files with 30 additions and 124 deletions

View File

@ -23,7 +23,7 @@ SUBROUTINE stres_hub ( sigmah )
ldim_back, ldmx_b, nsg, v_nsg, max_num_neighbors, &
ldim_u, Hubbard_V, at_sc, neighood, ldmx_tot, &
wfcU, nwfcU, copy_U_wfc, Hubbard_J
USE becmod, ONLY : bec_type, becp, calbec, allocate_bec_type, deallocate_bec_type
USE becmod, ONLY : becp, calbec, allocate_bec_type_acc, deallocate_bec_type_acc
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
USE uspp, ONLY : nkb, vkb, okvan
USE klist, ONLY : nks, xk, ngk, igk_k
@ -35,18 +35,13 @@ SUBROUTINE stres_hub ( sigmah )
USE io_global, ONLY : stdout
USE mp_pools, ONLY : inter_pool_comm, me_pool, nproc_pool
USE mp, ONLY : mp_sum
USE control_flags, ONLY : gamma_only
USE control_flags, ONLY : gamma_only, offload_type
USE mp_bands, ONLY : use_bgrp_in_hpsi
USE noncollin_module, ONLY : noncolin
USE force_mod, ONLY : eigenval, eigenvect, overlap_inv, at_dy, at_dj, &
us_dy, us_dj
USE wavefunctions_gpum, ONLY : using_evc
USE uspp_init, ONLY : init_us_2, gen_us_dj, gen_us_dy
USE constants, ONLY : eps16
USE becmod_gpum, ONLY : bec_type_d, becp_d
USE becmod_subs_gpum, ONLY : calbec_gpu, using_becp_auto, using_becp_d_auto, &
allocate_bec_type_gpu, deallocate_bec_type_gpu
USE wavefunctions_gpum, ONLY : evc_d, using_evc, using_evc_d
!
IMPLICIT NONE
!
@ -63,12 +58,6 @@ SUBROUTINE stres_hub ( sigmah )
!! the derivative of the atomic occupations
COMPLEX(DP), ALLOCATABLE :: spsi(:,:)
#if defined(__CUDA)
TYPE(bec_type_d) :: proj ! proj(nwfcU,nbnd)
#else
TYPE(bec_type) :: proj
#endif
LOGICAL :: lhubb
LOGICAL :: save_flag
!
@ -103,14 +92,6 @@ SUBROUTINE stres_hub ( sigmah )
!
!$acc data create(spsi) copyin(wfcU)
!
#if defined(__CUDA)
CALL allocate_bec_type_gpu( nwfcU, nbnd, proj )
CALL using_evc_d(0)
#else
CALL allocate_bec_type( nwfcU, nbnd, proj )
CALL using_evc(0)
#endif
!
IF (gamma_only) THEN
ALLOCATE( projrd(nwfcU,nbnd))
ELSE
@ -171,30 +152,22 @@ SUBROUTINE stres_hub ( sigmah )
npw = ngk(ik)
!
IF (nks > 1) CALL get_buffer (evc, nwordwfc, iunwfc, ik)
IF (nks > 1) CALL using_evc(2)
!
CALL init_us_2 (npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE.)
!
!$acc update self(vkb)
!
! Compute spsi = S * psi
CALL allocate_bec_type ( nkb, nbnd, becp)
CALL using_becp_auto(2)
CALL allocate_bec_type_acc ( nkb, nbnd, becp)
!
#if defined(__CUDA)
CALL using_evc_d(0)
CALL using_becp_d_auto(2)
!$acc host_data use_device(vkb,spsi)
CALL calbec_gpu( npw, vkb, evc_d, becp_d )
CALL s_psi_gpu( npwx, npw, nbnd, evc_d, spsi )
!$acc data present_or_copyin(evc)
CALL calbec( offload_type, npw, vkb, evc, becp )
!$acc host_data use_device(evc, spsi)
CALL s_psi_acc( npwx, npw, nbnd, evc, spsi )
!$acc end host_data
#else
CALL calbec( npw, vkb, evc, becp )
CALL s_psi( npwx, npw, nbnd, evc, spsi )
#endif
!$acc end data
!
CALL deallocate_bec_type (becp)
CALL using_becp_auto(2)
CALL deallocate_bec_type_acc (becp)
!
! Set up various quantities, in particular wfcU which
! contains Hubbard-U (ortho-)atomic wavefunctions (without ultrasoft S)
@ -202,26 +175,13 @@ SUBROUTINE stres_hub ( sigmah )
!$acc update device(wfcU)
!
! proj=<wfcU|S|evc>
#if defined(__CUDA)
CALL using_becp_d_auto(2)
!$acc host_data use_device( spsi, wfcU )
CALL calbec_gpu( npw, wfcU, spsi, proj )
!$acc end host_data
IF (gamma_only) THEN
projrd = proj%r_d
CALL calbec( offload_type, npw, wfcU, spsi, projrd )
!$acc update self(projrd)
ELSE
projkd = proj%k_d
CALL calbec( offload_type, npw, wfcU, spsi, projkd )
!$acc update self(projkd)
ENDIF
#else
!
CALL calbec( npw, wfcU, spsi, proj )
IF (gamma_only) THEN
projrd = proj%r
ELSE
projkd = proj%k
ENDIF
!
#endif
!
! Compute derivatives of spherical harmonics and spherical Bessel functions
!
@ -379,11 +339,6 @@ SUBROUTINE stres_hub ( sigmah )
ELSE
DEALLOCATE( projkd)
ENDIF
#if defined(__CUDA)
CALL deallocate_bec_type_gpu( proj )
#else
CALL deallocate_bec_type( proj )
#endif
!
IF (ALLOCATED(dns)) DEALLOCATE (dns)
IF (ALLOCATED(dnsb)) DEALLOCATE (dnsb)
@ -428,7 +383,6 @@ SUBROUTINE dndepsilon_k ( ipol,jpol,ldim,proj,spsi,ik,nb_s,nb_e,mykey,lpuk,dns )
USE ldaU, ONLY : nwfcU, offsetU, Hubbard_l, is_hubbard, &
ldim_back, offsetU_back, offsetU_back1, &
is_hubbard_back, Hubbard_l2, backall
USE wavefunctions_gpum,ONLY : using_evc
!
IMPLICIT NONE
!
@ -472,8 +426,6 @@ SUBROUTINE dndepsilon_k ( ipol,jpol,ldim,proj,spsi,ik,nb_s,nb_e,mykey,lpuk,dns )
!
CALL allocate_bec_type ( nwfcU,nbnd, dproj )
!
CALL using_evc(0)
!
! D_Sl for l=1 and l=2 are already initialized, for l=0 D_S0 is 1
!
! Offset of atomic wavefunctions initialized in setup and stored in offsetU
@ -1033,12 +985,11 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
USE uspp, ONLY : nkb, vkb, okvan
USE wavefunctions, ONLY : evc
USE becmod, ONLY : becp, calbec
USE control_flags, ONLY : offload_type
USE basis, ONLY : natomwfc, wfcatom, swfcatom
USE force_mod, ONLY : eigenval, eigenvect, overlap_inv, at_dy, at_dj
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
USE wavefunctions_gpum, ONLY : using_evc
USE becmod_subs_gpum, ONLY : calbec_gpu
!
IMPLICIT NONE
!
@ -1080,7 +1031,6 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
a1_temp(:), a2_temp(:)
COMPLEX (DP) :: temp
!
CALL using_evc(0)
CALL start_clock_gpu('dprojdepsilon')
!
!$acc data present_or_copyin(spsi) present_or_copyout(dproj)
@ -1277,15 +1227,7 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
ENDIF
!
! Compute dproj = <dwfc|S|psi> = <dwfc|spsi>
#if defined(__CUDA)
!$acc host_data use_device( spsi, dwfc, dproj )
CALL calbec_gpu( npw, dwfc, spsi, dproj )
!$acc end host_data
#else
!
CALL calbec( npw, dwfc, spsi, dproj )
!
#endif
CALL calbec( offload_type, npw, dwfc, spsi, dproj )
!
!$acc end data
!$acc end data
@ -1333,7 +1275,7 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
USE uspp_param, ONLY : nh
USE wavefunctions, ONLY : evc
USE becmod, ONLY : calbec
USE becmod_subs_gpum, ONLY : calbec_gpu
USE control_flags, ONLY : offload_type
USE klist, ONLY : xk, igk_k, ngk
USE force_mod, ONLY : us_dy, us_dj
!
@ -1439,19 +1381,11 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
ENDDO
ENDIF
!
#if defined(__CUDA)
! Calculate dbetaB = <dbeta|B>
!$acc host_data use_device(A,B,aux,dbetaB,Adbeta)
CALL calbec_gpu(npw, aux, B, dbetaB )
CALL calbec_gpu(npw, A, aux, Adbeta )
!$acc end host_data
#else
! Calculate dbetaB = <dbeta|B>
CALL calbec(npw, aux, B, dbetaB )
CALL calbec(offload_type, npw, aux, B, dbetaB )
!
! Calculate Adbeta = <A|dbeta>
CALL calbec(npw, A, aux, Adbeta )
#endif
CALL calbec(offload_type, npw, A, aux, Adbeta )
!
! aux is now used as a work space to store vkb
!$acc parallel loop collapse(2)
@ -1461,19 +1395,11 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
ENDDO
ENDDO
!
#if defined(__CUDA)
! Calculate dbetaB = <dbeta|B>
!$acc host_data use_device(A,B,aux,betaB,Abeta)
CALL calbec_gpu(npw, A, aux, Abeta )
CALL calbec_gpu(npw, aux, B, betaB )
!$acc end host_data
#else
! Calculate Abeta = <A|beta>
CALL calbec( npw, A, aux, Abeta )
CALL calbec( offload_type, npw, A, aux, Abeta )
!
! Calculate betaB = <beta|B>
CALL calbec( npw, aux, B, betaB )
#endif
CALL calbec( offload_type, npw, aux, B, betaB )
!
!$acc end data
DEALLOCATE ( aux )
@ -1567,9 +1493,8 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
USE uspp_param, ONLY : nh
USE wavefunctions, ONLY : evc
USE becmod, ONLY : becp, calbec
USE control_flags, ONLY : offload_type
USE force_mod, ONLY : at_dy, at_dj, us_dy, us_dj
USE wavefunctions_gpum, ONLY : evc_d, using_evc, using_evc_d
USE becmod_subs_gpum, ONLY : calbec_gpu
!
IMPLICIT NONE
!
@ -1615,7 +1540,6 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
! qm1(npwx)
REAL (DP) :: temp
!
CALL using_evc(0)
CALL start_clock_gpu('dprojdepsilon')
!
! See the implementation in dprojdepsilon_k
@ -1692,13 +1616,7 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
!$acc end kernels
ENDIF
!
#if defined(__CUDA)
!$acc host_data use_device(dwfc,spsi,dproj)
CALL calbec_gpu ( npw, dwfc, spsi, dproj )
!$acc end host_data
#else
CALL calbec ( npw, dwfc, spsi, dproj )
#endif
CALL calbec ( offload_type, npw, dwfc, spsi, dproj )
!
!$acc end data
!
@ -1740,16 +1658,10 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
ENDDO
ENDIF
!
#if defined(__CUDA)
CALL using_evc_d(0)
!$acc host_data use_device(dbeta,dbetapsi,wfcU,wfatdbeta)
CALL calbec_gpu(npw, dbeta, evc_d, dbetapsi )
CALL calbec_gpu(npw, wfcU, dbeta, wfatdbeta )
!$acc end host_data
#else
CALL calbec(npw, dbeta, evc, dbetapsi )
CALL calbec(npw, wfcU, dbeta, wfatdbeta )
#endif
!$acc data present_or_copyin(evc)
CALL calbec(offload_type, npw, dbeta, evc, dbetapsi )
!$acc end data
CALL calbec(offload_type, npw, wfcU, dbeta, wfatdbeta )
!
! dbeta is now used as work space to store vkb
!$acc parallel loop collapse(2)
@ -1759,16 +1671,10 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
ENDDO
ENDDO
!
#if defined(__CUDA)
CALL using_evc_d(0)
!$acc host_data use_device(dbeta,betapsi0,wfcU,wfatbeta)
CALL calbec_gpu(npw, wfcU, dbeta, wfatbeta )
CALL calbec_gpu(npw, dbeta, evc_d, betapsi0 )
!$acc end host_data
#else
CALL calbec(npw, wfcU, dbeta, wfatbeta )
CALL calbec(npw, dbeta, evc, betapsi0 )
#endif
CALL calbec(offload_type, npw, wfcU, dbeta, wfatbeta )
!$acc data present_or_copyin(evc)
CALL calbec(offload_type, npw, dbeta, evc, betapsi0 )
!$acc end data
!
! here starts band parallelization
! beta is here used as work space to calculate dbetapsi