mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'stress_acc' into 'develop'
Stress hubbard openACC See merge request QEF/q-e!2150
This commit is contained in:
commit
c125310efa
|
@ -43,6 +43,10 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
USE uspp_init, ONLY : init_us_2, gen_us_dj, gen_us_dy
|
||||
USE constants, ONLY : eps16
|
||||
USE becmod_gpum, ONLY : bec_type_d, becp_d
|
||||
USE becmod_subs_gpum, ONLY : calbec_gpu, using_becp_auto, using_becp_d_auto, &
|
||||
allocate_bec_type_gpu, deallocate_bec_type_gpu
|
||||
USE wavefunctions_gpum, ONLY : evc_d, using_evc, using_evc_d
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -58,11 +62,20 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
COMPLEX(DP), ALLOCATABLE :: dnsg(:,:,:,:,:)
|
||||
!! the derivative of the atomic occupations
|
||||
COMPLEX(DP), ALLOCATABLE :: spsi(:,:)
|
||||
TYPE (bec_type) :: proj
|
||||
|
||||
#if defined(__CUDA)
|
||||
TYPE(bec_type_d) :: proj ! proj(nwfcU,nbnd)
|
||||
#else
|
||||
TYPE(bec_type) :: proj
|
||||
#endif
|
||||
|
||||
LOGICAL :: lhubb
|
||||
LOGICAL :: save_flag
|
||||
!
|
||||
CALL start_clock( 'stres_hub' )
|
||||
REAL(DP), ALLOCATABLE :: projrd(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: projkd(:,:)
|
||||
!
|
||||
CALL start_clock_gpu( 'stres_hub' )
|
||||
!
|
||||
save_flag = use_bgrp_in_hpsi ; use_bgrp_in_hpsi = .false.
|
||||
!
|
||||
|
@ -88,7 +101,21 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
ALLOCATE (overlap_inv(natomwfc,natomwfc))
|
||||
ENDIF
|
||||
!
|
||||
!$acc data create(spsi) copyin(wfcU)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL allocate_bec_type_gpu( nwfcU, nbnd, proj )
|
||||
CALL using_evc_d(0)
|
||||
#else
|
||||
CALL allocate_bec_type( nwfcU, nbnd, proj )
|
||||
CALL using_evc(0)
|
||||
#endif
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
ALLOCATE( projrd(nwfcU,nbnd))
|
||||
ELSE
|
||||
ALLOCATE( projkd(nwfcU,nbnd))
|
||||
ENDIF
|
||||
!
|
||||
! poor-man parallelization over bands
|
||||
! - if nproc_pool=1 : nb_s=1, nb_e=nbnd, mykey=0
|
||||
|
@ -146,20 +173,55 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
IF (nks > 1) CALL get_buffer (evc, nwordwfc, iunwfc, ik)
|
||||
IF (nks > 1) CALL using_evc(2)
|
||||
!
|
||||
CALL init_us_2 (npw, igk_k(1,ik), xk(1,ik), vkb)
|
||||
CALL init_us_2 (npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE.)
|
||||
!
|
||||
!$acc update self(vkb)
|
||||
!
|
||||
! Compute spsi = S * psi
|
||||
CALL allocate_bec_type ( nkb, nbnd, becp)
|
||||
CALL calbec (npw, vkb, evc, becp)
|
||||
CALL s_psi (npwx, npw, nbnd, evc, spsi)
|
||||
CALL using_becp_auto(2)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL using_evc_d(0)
|
||||
CALL using_becp_d_auto(2)
|
||||
!$acc host_data use_device(vkb,spsi)
|
||||
CALL calbec_gpu( npw, vkb, evc_d, becp_d )
|
||||
CALL s_psi_gpu( npwx, npw, nbnd, evc_d, spsi )
|
||||
!$acc end host_data
|
||||
#else
|
||||
CALL calbec( npw, vkb, evc, becp )
|
||||
CALL s_psi( npwx, npw, nbnd, evc, spsi )
|
||||
#endif
|
||||
!
|
||||
CALL deallocate_bec_type (becp)
|
||||
CALL using_becp_auto(2)
|
||||
!
|
||||
! Set up various quantities, in particular wfcU which
|
||||
! contains Hubbard-U (ortho-)atomic wavefunctions (without ultrasoft S)
|
||||
CALL orthoUwfc_k (ik, .TRUE.)
|
||||
!$acc update device(wfcU)
|
||||
!
|
||||
! proj=<wfcU|S|evc>
|
||||
CALL calbec ( npw, wfcU, spsi, proj)
|
||||
#if defined(__CUDA)
|
||||
CALL using_becp_d_auto(2)
|
||||
!$acc host_data use_device( spsi, wfcU )
|
||||
CALL calbec_gpu( npw, wfcU, spsi, proj )
|
||||
!$acc end host_data
|
||||
IF (gamma_only) THEN
|
||||
projrd = proj%r_d
|
||||
ELSE
|
||||
projkd = proj%k_d
|
||||
ENDIF
|
||||
#else
|
||||
!
|
||||
CALL calbec( npw, wfcU, spsi, proj )
|
||||
IF (gamma_only) THEN
|
||||
projrd = proj%r
|
||||
ELSE
|
||||
projkd = proj%k
|
||||
ENDIF
|
||||
!
|
||||
#endif
|
||||
!
|
||||
! Compute derivatives of spherical harmonics and spherical Bessel functions
|
||||
!
|
||||
|
@ -195,9 +257,9 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
! Compute the derivative of the occupation matrix w.r.t epsilon
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
CALL dndepsilon_gamma(ipol,jpol,ldim,proj%r,spsi,ik,nb_s,nb_e,mykey,1,dns)
|
||||
CALL dndepsilon_gamma(ipol,jpol,ldim,projrd,spsi,ik,nb_s,nb_e,mykey,1,dns)
|
||||
ELSE
|
||||
CALL dndepsilon_k(ipol,jpol,ldim,proj%k,spsi,ik,nb_s,nb_e,mykey,1,dns)
|
||||
CALL dndepsilon_k(ipol,jpol,ldim,projkd,spsi,ik,nb_s,nb_e,mykey,1,dns)
|
||||
ENDIF
|
||||
!
|
||||
DO na = 1, nat
|
||||
|
@ -221,9 +283,9 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
! Compute the derivative of the occupation matrix w.r.t epsilon
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
CALL dndepsilon_gamma(ipol,jpol,ldimb,proj%r,spsi,ik,nb_s,nb_e,mykey,2,dnsb)
|
||||
CALL dndepsilon_gamma(ipol,jpol,ldimb,projrd,spsi,ik,nb_s,nb_e,mykey,2,dnsb)
|
||||
ELSE
|
||||
CALL dndepsilon_k(ipol,jpol,ldimb,proj%k,spsi,ik,nb_s,nb_e,mykey,2,dnsb)
|
||||
CALL dndepsilon_k(ipol,jpol,ldimb,projkd,spsi,ik,nb_s,nb_e,mykey,2,dnsb)
|
||||
ENDIF
|
||||
!
|
||||
DO na = 1, nat
|
||||
|
@ -249,9 +311,9 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
! Compute the derivative of the occupation matrix w.r.t epsilon
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
CALL dngdepsilon_gamma(ipol,jpol,ldim,proj%r,spsi,ik,nb_s,nb_e,mykey,dnsg)
|
||||
CALL dngdepsilon_gamma(ipol,jpol,ldim,projrd,spsi,ik,nb_s,nb_e,mykey,dnsg)
|
||||
ELSE
|
||||
CALL dngdepsilon_k(ipol,jpol,ldim,proj%k,spsi,ik,nb_s,nb_e,mykey,dnsg)
|
||||
CALL dngdepsilon_k(ipol,jpol,ldim,projkd,spsi,ik,nb_s,nb_e,mykey,dnsg)
|
||||
ENDIF
|
||||
!
|
||||
DO is = 1, nspin
|
||||
|
@ -312,10 +374,23 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
CALL deallocate_bec_type ( proj )
|
||||
IF (gamma_only) THEN
|
||||
DEALLOCATE( projrd)
|
||||
ELSE
|
||||
DEALLOCATE( projkd)
|
||||
ENDIF
|
||||
#if defined(__CUDA)
|
||||
CALL deallocate_bec_type_gpu( proj )
|
||||
#else
|
||||
CALL deallocate_bec_type( proj )
|
||||
#endif
|
||||
!
|
||||
IF (ALLOCATED(dns)) DEALLOCATE (dns)
|
||||
IF (ALLOCATED(dnsb)) DEALLOCATE (dnsb)
|
||||
IF (ALLOCATED(dnsg)) DEALLOCATE (dnsg)
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE (spsi)
|
||||
DEALLOCATE (wfcatom)
|
||||
DEALLOCATE (at_dy, at_dj)
|
||||
|
@ -329,7 +404,7 @@ SUBROUTINE stres_hub ( sigmah )
|
|||
!
|
||||
use_bgrp_in_hpsi = save_flag
|
||||
!
|
||||
CALL stop_clock( 'stres_hub' )
|
||||
CALL stop_clock_gpu( 'stres_hub' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
@ -965,6 +1040,7 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
|
|||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
USE becmod_subs_gpum, ONLY : calbec_gpu
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -1002,22 +1078,32 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
|
|||
doverlap_inv(:,:) ! derivative of (O^{-1/2})_JI (note the transposition)
|
||||
REAL (DP), ALLOCATABLE :: &
|
||||
gk(:,:), & ! k+G
|
||||
qm1(:) ! 1/|k+G|
|
||||
qm1(:), & ! 1/|k+G|
|
||||
a1_temp(:), a2_temp(:)
|
||||
COMPLEX (DP) :: temp
|
||||
!
|
||||
CALL using_evc(0)
|
||||
CALL start_clock('dprojdepsilon')
|
||||
CALL start_clock_gpu('dprojdepsilon')
|
||||
!
|
||||
!$acc data present_or_copyin(spsi) present_or_copyout(dproj)
|
||||
!
|
||||
! Number of plane waves at the k point with the index ik
|
||||
npw = ngk(ik)
|
||||
!
|
||||
!$acc kernels
|
||||
dproj(:,:) = (0.d0, 0.d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
! At first the derivatives of the atomic wfcs: we compute the term
|
||||
! <d\fi^{at}_{I,m1}/d\epsilon(ipol,jpol)|S|\psi_{k,v,s}>
|
||||
!
|
||||
ALLOCATE ( qm1(npwx), gk(3,npwx) )
|
||||
ALLOCATE ( dwfc(npwx,nwfcU) )
|
||||
ALLOCATE (a1_temp(npw), a2_temp(npw))
|
||||
!$acc data create(dwfc) copyin(overlap_inv, wfcU)
|
||||
!$acc kernels
|
||||
dwfc(:,:) = (0.d0, 0.d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
! 1. Derivative of the atomic wavefunctions
|
||||
! (and then multiplied by (O^{-1/2})_JI in the ortho-atomic case)
|
||||
|
@ -1036,45 +1122,58 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
|
|||
ENDIF
|
||||
!
|
||||
! - (k+G)_jpol
|
||||
a1 = -gk(jpol,ig)
|
||||
a1_temp(ig) = -gk(jpol,ig)
|
||||
!
|
||||
! - (k+G)_ipol * (k+G)_jpol / |k+G|
|
||||
a2 = -gk(ipol,ig) * gk(jpol,ig) * qm1(ig)
|
||||
!
|
||||
DO na = 1, nat
|
||||
nt = ityp(na)
|
||||
ldim_std = 2*Hubbard_l(nt)+1
|
||||
IF (is_hubbard(nt) .OR. is_hubbard_back(nt)) THEN
|
||||
DO m1 = 1, ldim_u(nt)
|
||||
IF (m1.LE.ldim_std) THEN
|
||||
offpmU = offsetU(na)
|
||||
offpm = oatwfc(na)
|
||||
ELSE
|
||||
offpmU = offsetU_back(na) - ldim_std
|
||||
offpm = oatwfc_back(na) - ldim_std
|
||||
IF (backall(nt) .AND. m1.GT.ldim_std+2*Hubbard_l2(nt)+1) THEN
|
||||
offpmU = offsetU_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
offpm = oatwfc_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
ENDIF
|
||||
ENDIF
|
||||
IF (Hubbard_projectors.EQ."atomic") THEN
|
||||
dwfc(ig,offpmU+m1) = at_dy(ig,offpm+m1) * a1 + at_dj(ig,offpm+m1) * a2
|
||||
ELSEIF (Hubbard_projectors.EQ."ortho-atomic") THEN
|
||||
IF (m1>ldim_std) CALL errore("dprojdtau_k", &
|
||||
" Stress with background and ortho-atomic is not supported",1)
|
||||
DO m2 = 1, natomwfc
|
||||
dwfc(ig,offpmU+m1) = dwfc(ig,offpmU+m1) + &
|
||||
overlap_inv(offpm+m1,m2) * ( at_dy(ig,m2) * a1 + at_dj(ig,m2) * a2 )
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
a2_temp(ig) = -gk(ipol,ig) * gk(jpol,ig) * qm1(ig)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc data copyin(a1_temp, a2_temp, at_dj)
|
||||
!
|
||||
DO na = 1, nat
|
||||
nt = ityp(na)
|
||||
ldim_std = 2*Hubbard_l(nt)+1
|
||||
IF (is_hubbard(nt) .OR. is_hubbard_back(nt)) THEN
|
||||
DO m1 = 1, ldim_u(nt)
|
||||
IF (m1.LE.ldim_std) THEN
|
||||
offpmU = offsetU(na)
|
||||
offpm = oatwfc(na)
|
||||
ELSE
|
||||
offpmU = offsetU_back(na) - ldim_std
|
||||
offpm = oatwfc_back(na) - ldim_std
|
||||
IF (backall(nt) .AND. m1.GT.ldim_std+2*Hubbard_l2(nt)+1) THEN
|
||||
offpmU = offsetU_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
offpm = oatwfc_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
ENDIF
|
||||
ENDIF
|
||||
IF (Hubbard_projectors.EQ."atomic") THEN
|
||||
!$acc parallel loop
|
||||
DO ig = 1, npw
|
||||
dwfc(ig,offpmU+m1) = at_dy(ig,offpm+m1) * a1_temp(ig) + at_dj(ig,offpm+m1) * a2_temp(ig)
|
||||
ENDDO
|
||||
ELSEIF (Hubbard_projectors.EQ."ortho-atomic") THEN
|
||||
IF (m1>ldim_std) CALL errore("dprojdtau_k", &
|
||||
" Stress with background and ortho-atomic is not supported",1)
|
||||
!$acc parallel loop
|
||||
DO ig = 1, npw
|
||||
temp = (0.0d0, 0.0d0)
|
||||
DO m2 = 1, natomwfc
|
||||
temp = temp + overlap_inv(offpm+m1,m2) * ( at_dy(ig,m2) * a1_temp(ig) + at_dj(ig,m2) * a2_temp(ig) )
|
||||
ENDDO
|
||||
dwfc(ig,offpmU+m1) = dwfc(ig,offpmU+m1) + temp
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
! The diagonal term
|
||||
IF (ipol.EQ.jpol) dwfc(1:npw,:) = dwfc(1:npw,:) - wfcU(1:npw,:)*0.5d0
|
||||
IF (ipol.EQ.jpol) THEN
|
||||
!$acc kernels
|
||||
dwfc(1:npw,:) = dwfc(1:npw,:) - wfcU(1:npw,:)*0.5d0
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!
|
||||
! 2. Contribution due to the derivative of (O^{-1/2})_JI which
|
||||
! is multiplied by atomic wavefunctions (only for ortho-atomic case)
|
||||
|
@ -1085,61 +1184,70 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
|
|||
!
|
||||
ALLOCATE (doverlap(natomwfc,natomwfc))
|
||||
ALLOCATE (doverlap_inv(natomwfc,natomwfc))
|
||||
!$acc data create(doverlap_inv) copyin(swfcatom, wfcatom)
|
||||
doverlap(:,:) = (0.0d0, 0.0d0)
|
||||
!$acc kernels
|
||||
doverlap_inv(:,:) = (0.0d0, 0.0d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
! Calculate:
|
||||
! doverlap = < dphi_I/d\epsilon(ipol,jpol) | S | phi_J >
|
||||
! + < phi_I | S | dphi_J/d\epsilon(ipol,jpol) >
|
||||
!
|
||||
DO ig = 1, npw
|
||||
!
|
||||
! - (k+G)_jpol
|
||||
a1 = -gk(jpol,ig)
|
||||
!
|
||||
! - (k+G)_ipol * (k+G)_jpol / |k+G|
|
||||
a2 = -gk(ipol,ig) * gk(jpol,ig) * qm1(ig)
|
||||
!
|
||||
DO m2 = 1, natomwfc
|
||||
DO m1 = 1, natomwfc
|
||||
DO m2 = 1, natomwfc
|
||||
doverlap(m1,m2) = doverlap(m1,m2) &
|
||||
+ CONJG((at_dy(ig,m1)*a1 + at_dj(ig,m1)*a2)) * swfcatom(ig,m2) &
|
||||
+ CONJG(swfcatom(ig,m1)) * (at_dy(ig,m2)*a1 + at_dj(ig,m2)*a2)
|
||||
IF (ipol.EQ.jpol) THEN
|
||||
doverlap(m1,m2) = doverlap(m1,m2) &
|
||||
+ CONJG((-wfcatom(ig,m1)*0.5d0)) * swfcatom(ig,m2) &
|
||||
+ CONJG(swfcatom(ig,m1)) * (-wfcatom(ig,m2)*0.5d0)
|
||||
ENDIF
|
||||
temp = (0.0d0,0.0d0)
|
||||
!$acc parallel loop reduction(+:temp)
|
||||
DO ig = 1, npw
|
||||
temp = temp + CONJG((at_dy(ig,m1)*a1_temp(ig) + at_dj(ig,m1)*a2_temp(ig))) * swfcatom(ig,m2) &
|
||||
+ CONJG(swfcatom(ig,m1)) * (at_dy(ig,m2)*a1_temp(ig) + at_dj(ig,m2)*a2_temp(ig))
|
||||
ENDDO
|
||||
doverlap(m1,m2) = doverlap(m1,m2) + temp
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
IF (ipol.EQ.jpol) THEN
|
||||
DO m2 = 1, natomwfc
|
||||
DO m1 = 1, natomwfc
|
||||
temp = (0.0d0,0.0d0)
|
||||
!$acc parallel loop reduction(+:temp)
|
||||
DO ig = 1, npw
|
||||
temp = temp + CONJG((-wfcatom(ig,m1)*0.5d0)) * swfcatom(ig,m2) &
|
||||
+ CONJG(swfcatom(ig,m1)) * (-wfcatom(ig,m2)*0.5d0)
|
||||
ENDDO
|
||||
doverlap(m1,m2) = doverlap(m1,m2) + temp
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
! Sum over G vectors
|
||||
ENDIF
|
||||
!
|
||||
! Sum over G vectorso
|
||||
CALL mp_sum( doverlap, intra_bgrp_comm )
|
||||
!
|
||||
!$acc data copyin(doverlap)
|
||||
! USPP term in dO_IJ/d\epsilon(ipol,jpol)
|
||||
!
|
||||
IF (okvan) THEN
|
||||
! Calculate doverlap_us = < phi_I | dS/d\epsilon(ipol,jpol) | phi_J >
|
||||
ALLOCATE (doverlap_us(natomwfc,natomwfc))
|
||||
!$acc data create(doverlap_us) ! copyin(wfcatom)
|
||||
CALL matrix_element_of_dSdepsilon (ik, ipol, jpol, &
|
||||
natomwfc, wfcatom, natomwfc, wfcatom, doverlap_us, 1, natomwfc, 0)
|
||||
! Sum up the "normal" and ultrasoft terms
|
||||
DO m1 = 1, natomwfc
|
||||
DO m2 = 1, natomwfc
|
||||
!$acc parallel loop collapse(2)
|
||||
DO m2 = 1, natomwfc
|
||||
DO m1 = 1, natomwfc
|
||||
doverlap(m1,m2) = doverlap(m1,m2) + doverlap_us(m1,m2)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE (doverlap_us)
|
||||
ENDIF
|
||||
!
|
||||
! Now compute dO^{-1/2}_JI/d\epsilon(ipol,jpol) using dO_IJ/d\epsilon(ipol,jpol)
|
||||
! Note the transposition!
|
||||
!
|
||||
!$acc data copyin(doverlap) copyout(doverlap_inv)
|
||||
CALL calculate_doverlap_inv (natomwfc, eigenval, eigenvect, &
|
||||
doverlap, doverlap_inv)
|
||||
!$acc end data
|
||||
!
|
||||
! Now compute \sum_J dO^{-1/2}_JI/d\epsilon(ipol,jpol) \phi_J
|
||||
! and add it to another term (see above).
|
||||
|
@ -1154,37 +1262,58 @@ SUBROUTINE dprojdepsilon_k ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj )
|
|||
IF (is_hubbard(nt) .OR. is_hubbard_back(nt)) THEN
|
||||
offpmU = offsetU(na)
|
||||
offpm = oatwfc(na)
|
||||
CALL ZGEMM('N','N', npw, ldim_u(nt), natomwfc, (1.d0,0.d0), &
|
||||
!$acc host_data use_device(wfcatom, doverlap_inv, dwfc)
|
||||
CALL MYZGEMM('N','N', npw, ldim_u(nt), natomwfc, (1.d0,0.d0), &
|
||||
wfcatom, npwx, doverlap_inv(:,offpm+1:offpm+ldim_u(nt)), &
|
||||
natomwfc, (1.d0,0.d0), dwfc(:,offpmU+1:offpmU+ldim_u(nt)), npwx)
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE (doverlap)
|
||||
DEALLOCATE (doverlap_inv)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Compute dproj = <dwfc|S|psi> = <dwfc|spsi>
|
||||
CALL calbec ( npw, dwfc, spsi, dproj )
|
||||
#if defined(__CUDA)
|
||||
!$acc host_data use_device( spsi, dwfc, dproj )
|
||||
CALL calbec_gpu( npw, dwfc, spsi, dproj )
|
||||
!$acc end host_data
|
||||
#else
|
||||
!
|
||||
CALL calbec( npw, dwfc, spsi, dproj )
|
||||
!
|
||||
#endif
|
||||
!
|
||||
!$acc end data
|
||||
!$acc end data
|
||||
DEALLOCATE ( dwfc, qm1, gk)
|
||||
DEALLOCATE(a1_temp, a2_temp)
|
||||
!
|
||||
! Now the derivatives of the beta functions: we compute the term
|
||||
! <\phi^{at}_{I,m1}|dS/d\epsilon(ipol,jpol)|\psi_{k,v,s}>
|
||||
!
|
||||
IF (okvan) THEN
|
||||
ALLOCATE(dproj_us(nwfcU,nb_s:nb_e))
|
||||
!$acc data create(dproj_us) copyin(evc)
|
||||
CALL matrix_element_of_dSdepsilon (ik, ipol, jpol, &
|
||||
nwfcU, wfcU, nbnd, evc, dproj_us, nb_s, nb_e, mykey)
|
||||
! dproj + dproj_us
|
||||
!$acc parallel loop
|
||||
DO m1 = 1, nwfcU
|
||||
dproj(m1,nb_s:nb_e) = dproj(m1,nb_s:nb_e) + dproj_us(m1,:)
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE(dproj_us)
|
||||
ENDIF
|
||||
!
|
||||
CALL stop_clock('dprojdepsilon')
|
||||
!$acc end data
|
||||
!
|
||||
CALL stop_clock_gpu('dprojdepsilon')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
@ -1206,6 +1335,7 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
|
|||
USE uspp_param, ONLY : nh
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE becmod, ONLY : calbec
|
||||
USE becmod_subs_gpum, ONLY : calbec_gpu
|
||||
USE klist, ONLY : xk, igk_k, ngk
|
||||
USE force_mod, ONLY : us_dy, us_dj
|
||||
!
|
||||
|
@ -1233,14 +1363,20 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
|
|||
REAL (DP) :: q, a1, a2
|
||||
REAL (DP), PARAMETER :: eps = 1.0d-8
|
||||
REAL (DP), ALLOCATABLE :: gk(:,:), qm1(:)
|
||||
!
|
||||
A_dS_B(:,:) = (0.0d0, 0.0d0)
|
||||
INTEGER :: nh_nt
|
||||
REAL (DP), ALLOCATABLE :: a1_temp(:), a2_temp(:)
|
||||
!
|
||||
IF (.NOT.okvan) RETURN
|
||||
!
|
||||
!$acc data present_or_copyin(A,B) present_or_copyout(A_ds_B)
|
||||
!
|
||||
!$acc kernels
|
||||
A_dS_B(:,:) = (0.0d0, 0.0d0)
|
||||
!$acc end kernels
|
||||
npw = ngk(ik)
|
||||
!
|
||||
ALLOCATE ( qm1(npwx), gk(3,npwx) )
|
||||
ALLOCATE (a1_temp(npw), a2_temp(npw))
|
||||
!
|
||||
! Compute k+G and 1/|k+G|
|
||||
DO ig = 1, npw
|
||||
|
@ -1253,10 +1389,13 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
|
|||
ELSE
|
||||
qm1(ig)=0.d0
|
||||
ENDIF
|
||||
a1_temp(ig) = -gk(jpol,ig)
|
||||
a2_temp(ig) = -gk(ipol,ig)*gk(jpol,ig)*qm1(ig)
|
||||
ENDDO
|
||||
!
|
||||
ijkb0 = 0
|
||||
!
|
||||
!$acc data copyin(us_dj, qq_at, a1_temp, a2_temp)
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
ALLOCATE ( Adbeta(lA,nh(nt)) )
|
||||
|
@ -1265,67 +1404,106 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
|
|||
ALLOCATE ( betaB(nh(nt),lB) )
|
||||
ALLOCATE ( qq(nh(nt),nh(nt)) )
|
||||
!
|
||||
nh_nt = nh(nt)
|
||||
!$acc data create(Adbeta,Abeta,dbetaB,betaB,qq)
|
||||
!
|
||||
DO na = 1, nat
|
||||
!
|
||||
IF ( ityp(na).EQ.nt ) THEN
|
||||
!
|
||||
qq(:,:) = CMPLX(qq_at(1:nh(nt),1:nh(nt),na), 0.0d0, kind=DP)
|
||||
!$acc parallel loop collapse(2)
|
||||
DO jh = 1, nh_nt
|
||||
DO ih = 1, nh_nt
|
||||
qq(ih,jh) = CMPLX(qq_at(ih,jh,na), 0.0d0, kind=DP)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
! aux is used as a workspace
|
||||
ALLOCATE ( aux(npwx,nh(nt)) )
|
||||
!$acc data create(aux)
|
||||
!
|
||||
DO ih = 1, nh(nt)
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ih = 1, nh_nt !nh(nt)
|
||||
! now we compute the true dbeta function
|
||||
DO ig = 1, npw
|
||||
!
|
||||
! - (k+G)_jpol
|
||||
a1 = -gk(jpol,ig)
|
||||
!
|
||||
! - (k+G)_ipol * (k+G)_jpol / |k+G|
|
||||
a2 = -gk(ipol,ig)*gk(jpol,ig)*qm1(ig)
|
||||
!
|
||||
aux(ig,ih) = us_dy(ig,ijkb0+ih) * a1 + us_dj(ig,ijkb0+ih) * a2
|
||||
!
|
||||
IF (ipol.EQ.jpol) aux(ig,ih) = aux(ig,ih) - vkb(ig,ijkb0+ih)*0.5d0
|
||||
aux(ig,ih) = us_dy(ig,ijkb0+ih) * a1_temp(ig) + us_dj(ig,ijkb0+ih) * a2_temp(ig)
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
IF (ipol.EQ.jpol) THEN
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ih = 1, nh_nt !nh(nt)
|
||||
DO ig = 1, npw
|
||||
aux(ig,ih) = aux(ig,ih) - vkb(ig,ijkb0+ih)*0.5d0
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
! Calculate dbetaB = <dbeta|B>
|
||||
!$acc host_data use_device(A,B,aux,dbetaB,Adbeta)
|
||||
CALL calbec_gpu(npw, aux, B, dbetaB )
|
||||
CALL calbec_gpu(npw, A, aux, Adbeta )
|
||||
!$acc end host_data
|
||||
#else
|
||||
! Calculate dbetaB = <dbeta|B>
|
||||
CALL calbec(npw, aux, B, dbetaB )
|
||||
!
|
||||
! Calculate Adbeta = <A|dbeta>
|
||||
CALL calbec(npw, A, aux, Adbeta )
|
||||
#endif
|
||||
!
|
||||
! aux is now used as a work space to store vkb
|
||||
DO ih = 1, nh(nt)
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ih = 1, nh_nt !nh(nt)
|
||||
DO ig = 1, npw
|
||||
aux(ig,ih) = vkb(ig,ijkb0+ih)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
! Calculate dbetaB = <dbeta|B>
|
||||
!$acc host_data use_device(A,B,aux,betaB,Abeta)
|
||||
CALL calbec_gpu(npw, A, aux, Abeta )
|
||||
CALL calbec_gpu(npw, aux, B, betaB )
|
||||
!$acc end host_data
|
||||
#else
|
||||
! Calculate Abeta = <A|beta>
|
||||
CALL calbec(npw, A, aux, Abeta )
|
||||
CALL calbec( npw, A, aux, Abeta )
|
||||
!
|
||||
! Calculate betaB = <beta|B>
|
||||
CALL calbec( npw, aux, B, betaB )
|
||||
#endif
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE ( aux )
|
||||
!
|
||||
ALLOCATE ( aux(nh(nt), lB) )
|
||||
!$acc data create(aux)
|
||||
!
|
||||
! Calculate \sum_jh qq(ih,jh) * dbetaB(jh)
|
||||
CALL ZGEMM('N', 'N', nh(nt), lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
!$acc host_data use_device(qq,dbetaB,aux)
|
||||
CALL MYZGEMM('N', 'N', nh(nt), lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
qq, nh(nt), dbetaB(1,lB_s), nh(nt), (0.0d0,0.0d0), &
|
||||
aux(1,lB_s), nh(nt))
|
||||
!$acc end host_data
|
||||
!$acc kernels
|
||||
dbetaB(:,:) = aux(:,:)
|
||||
!$acc end kernels
|
||||
!
|
||||
! Calculate \sum_jh qq(ih,jh) * betaB(jh)
|
||||
CALL ZGEMM('N', 'N', nh(nt), lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
!$acc host_data use_device(qq,betaB,aux)
|
||||
CALL MYZGEMM('N', 'N', nh(nt), lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
qq, nh(nt), betaB(1,lB_s), nh(nt), (0.0d0,0.0d0), &
|
||||
aux(1,lB_s), nh(nt))
|
||||
!$acc end host_data
|
||||
!$acc kernels
|
||||
betaB(:,:) = aux(:,:)
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE ( aux )
|
||||
!
|
||||
ijkb0 = ijkb0 + nh(nt)
|
||||
|
@ -1335,24 +1513,31 @@ SUBROUTINE matrix_element_of_dSdepsilon (ik, ipol, jpol, lA, A, lB, B, A_dS_B, l
|
|||
! Only A_dS_B(:,lB_s:lB_e) are calculated
|
||||
!
|
||||
IF ( mykey == 0 ) THEN
|
||||
CALL ZGEMM('N', 'N', lA, lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
!$acc host_data use_device(Adbeta,betaB,Abeta,dbetaB,A_dS_B)
|
||||
CALL MYZGEMM('N', 'N', lA, lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
Adbeta, lA, betaB(1,lB_s), nh(nt), (1.0d0,0.0d0), &
|
||||
A_dS_B(1,lB_s), lA)
|
||||
CALL ZGEMM('N', 'N', lA, lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
CALL MYZGEMM('N', 'N', lA, lB_e-lB_s+1, nh(nt), (1.0d0,0.0d0), &
|
||||
Abeta, lA, dbetaB(1,lB_s), nh(nt), (1.0d0,0.0d0), &
|
||||
A_dS_B(1,lB_s), lA)
|
||||
!$acc end host_data
|
||||
!
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE (dbetaB, betaB, Abeta, Adbeta, qq)
|
||||
!
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE (a1_temp, a2_temp)
|
||||
DEALLOCATE ( qm1, gk )
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE matrix_element_of_dSdepsilon
|
||||
|
@ -1385,7 +1570,8 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
|
|||
USE wavefunctions, ONLY : evc
|
||||
USE becmod, ONLY : becp, calbec
|
||||
USE force_mod, ONLY : at_dy, at_dj, us_dy, us_dj
|
||||
USE wavefunctions_gpum, ONLY : using_evc
|
||||
USE wavefunctions_gpum, ONLY : evc_d, using_evc, using_evc_d
|
||||
USE becmod_subs_gpum, ONLY : calbec_gpu
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -1411,7 +1597,7 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
|
|||
! ... local variables
|
||||
!
|
||||
INTEGER :: i, ig, ijkb0, na, ibnd, iwf, nt, ih, jh, npw, &
|
||||
offpm, offpmU, m1, ldim_std
|
||||
offpm, offpmU, m1, ldim_std, nh_nt
|
||||
REAL (DP) :: q, a1, a2
|
||||
REAL (DP), PARAMETER :: eps=1.0d-8
|
||||
COMPLEX (DP), ALLOCATABLE :: &
|
||||
|
@ -1426,26 +1612,33 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
|
|||
! wfatbeta(nwfcU,nhm),! <wfc|beta>
|
||||
! wfatdbeta(nwfcU,nhm)! <wfc|dbeta>
|
||||
|
||||
REAL (DP), ALLOCATABLE :: gk(:,:), qm1(:)
|
||||
REAL (DP), ALLOCATABLE :: gk(:,:), qm1(:), a1_temp(:), a2_temp(:)
|
||||
! gk(3,npwx),
|
||||
! qm1(npwx)
|
||||
REAL (DP) :: temp
|
||||
!
|
||||
CALL using_evc(0)
|
||||
CALL start_clock_gpu('dprojdepsilon')
|
||||
!
|
||||
! See the implementation in dprojdepsilon_k
|
||||
IF (Hubbard_projectors.EQ."ortho-atomic") CALL errore("dprojdtau_gamma", &
|
||||
" Forces with gamma-only and ortho-atomic are not supported",1)
|
||||
!
|
||||
!$acc data present_or_copyin(spsi) present_or_copyout(dproj)
|
||||
|
||||
! Number of plane waves at the k point with the index ik
|
||||
npw = ngk(ik)
|
||||
!
|
||||
!$acc kernels
|
||||
dproj(:,:) = 0.d0
|
||||
!$acc end kernels
|
||||
!
|
||||
! At first the derivatives of the atomic wfcs: we compute the term
|
||||
! <d\fi^{at}_{I,m1}/d\epsilon(ipol,jpol)|S|\psi_{k,v,s}>
|
||||
!
|
||||
ALLOCATE ( qm1(npwx), gk(3,npwx) )
|
||||
ALLOCATE ( dwfc(npwx,nwfcU) )
|
||||
ALLOCATE ( a1_temp(npw), a2_temp(npw) )
|
||||
!
|
||||
DO ig = 1, npw
|
||||
!
|
||||
|
@ -1461,37 +1654,55 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
|
|||
ENDIF
|
||||
!
|
||||
! - (k+G)_jpol
|
||||
a1 = -gk(jpol,ig)
|
||||
a1_temp(ig) = -gk(jpol,ig)
|
||||
!
|
||||
! - (k+G)_ipol * (k+G)_jpol / |k+G|
|
||||
a2 = -gk(ipol,ig)*gk(jpol,ig)*qm1(ig)
|
||||
!
|
||||
DO na = 1, nat
|
||||
nt = ityp(na)
|
||||
ldim_std = 2*Hubbard_l(nt)+1
|
||||
IF (is_hubbard(nt) .OR. is_hubbard_back(nt)) THEN
|
||||
DO m1 = 1, ldim_u(nt)
|
||||
IF (m1.LE.ldim_std) THEN
|
||||
offpmU = offsetU(na)
|
||||
offpm = oatwfc(na)
|
||||
ELSE
|
||||
offpmU = offsetU_back(na) - ldim_std
|
||||
offpm = oatwfc_back(na) - ldim_std
|
||||
IF (backall(nt) .AND. m1.GT.ldim_std+2*Hubbard_l2(nt)+1) THEN
|
||||
offpmU = offsetU_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
offpm = oatwfc_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
ENDIF
|
||||
ENDIF
|
||||
dwfc(ig,offpmU+m1) = at_dy(ig,offpm+m1) * a1 + at_dj(ig,offpm+m1) * a2
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
a2_temp(ig) = -gk(ipol,ig)*gk(jpol,ig)*qm1(ig)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
IF (ipol.EQ.jpol) dwfc(1:npw,:) = dwfc(1:npw,:) - wfcU(1:npw,:)*0.5d0
|
||||
!$acc data copyin(a1_temp, a2_temp, at_dy, at_dj, us_dy, us_dj, qq_at, wfcU)
|
||||
!$acc data create(dwfc)
|
||||
!
|
||||
DO na = 1, nat
|
||||
nt = ityp(na)
|
||||
ldim_std = 2*Hubbard_l(nt)+1
|
||||
IF (is_hubbard(nt) .OR. is_hubbard_back(nt)) THEN
|
||||
DO m1 = 1, ldim_u(nt)
|
||||
IF (m1.LE.ldim_std) THEN
|
||||
offpmU = offsetU(na)
|
||||
offpm = oatwfc(na)
|
||||
ELSE
|
||||
offpmU = offsetU_back(na) - ldim_std
|
||||
offpm = oatwfc_back(na) - ldim_std
|
||||
IF (backall(nt) .AND. m1.GT.ldim_std+2*Hubbard_l2(nt)+1) THEN
|
||||
offpmU = offsetU_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
offpm = oatwfc_back1(na) - ldim_std - 2*Hubbard_l2(nt) - 1
|
||||
ENDIF
|
||||
ENDIF
|
||||
!$acc parallel loop
|
||||
DO ig = 1, npw
|
||||
dwfc(ig,offpmU+m1) = at_dy(ig,offpm+m1) * a1_temp(ig) + at_dj(ig,offpm+m1) * a2_temp(ig)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
IF (ipol.EQ.jpol) THEN
|
||||
!$acc kernels
|
||||
dwfc(1:npw,:) = dwfc(1:npw,:) - wfcU(1:npw,:)*0.5d0
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!$acc host_data use_device(dwfc,spsi,dproj)
|
||||
CALL calbec_gpu ( npw, dwfc, spsi, dproj )
|
||||
!$acc end host_data
|
||||
#else
|
||||
CALL calbec ( npw, dwfc, spsi, dproj )
|
||||
#endif
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE (dwfc)
|
||||
!
|
||||
|
@ -1499,88 +1710,132 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
|
|||
! <\fi^{at}_{I,m1}|dS/d\epsilon(ipol,jpol)|\psi_{k,v,s}>
|
||||
!
|
||||
IF (okvan) THEN
|
||||
!
|
||||
ijkb0 = 0
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
ALLOCATE (dbeta(npwx,nh(nt)), dbetapsi(nh(nt),nbnd), betapsi(nh(nt),nbnd), &
|
||||
wfatbeta(nwfcU,nh(nt)), wfatdbeta(nwfcU,nh(nt)), betapsi0(nh(nt),nbnd) )
|
||||
ijkb0 = 0
|
||||
!
|
||||
DO na = 1, nat
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
IF ( ityp(na).EQ.nt ) THEN
|
||||
ALLOCATE (dbeta(npwx,nh(nt)), dbetapsi(nh(nt),nbnd), betapsi(nh(nt),nbnd), &
|
||||
wfatbeta(nwfcU,nh(nt)), wfatdbeta(nwfcU,nh(nt)), betapsi0(nh(nt),nbnd) )
|
||||
nh_nt = nh(nt)
|
||||
!$acc data create(dbeta, dbetapsi, betapsi, wfatbeta, wfatdbeta, betapsi0)
|
||||
!
|
||||
DO na = 1, nat
|
||||
!
|
||||
DO ih = 1, nh(nt)
|
||||
! now we compute the true dbeta function
|
||||
DO ig = 1, npw
|
||||
dbeta(ig,ih) = - us_dy(ig,ijkb0+ih)*gk(jpol,ig) - &
|
||||
us_dj(ig,ijkb0+ih) * gk(ipol,ig) * gk(jpol,ig) * qm1(ig)
|
||||
IF (ipol.EQ.jpol) &
|
||||
dbeta(ig,ih) = dbeta(ig,ih) - vkb(ig,ijkb0+ih)*0.5d0
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
CALL calbec(npw, dbeta, evc, dbetapsi )
|
||||
CALL calbec(npw, wfcU, dbeta, wfatdbeta )
|
||||
!
|
||||
! dbeta is now used as work space to store vkb
|
||||
DO ih = 1, nh(nt)
|
||||
DO ig = 1, npw
|
||||
dbeta(ig,ih) = vkb(ig,ijkb0+ih)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
CALL calbec(npw, wfcU, dbeta, wfatbeta )
|
||||
CALL calbec(npw, dbeta, evc, betapsi0 )
|
||||
!
|
||||
! here starts band parallelization
|
||||
! beta is here used as work space to calculate dbetapsi
|
||||
|
||||
betapsi(:,:) = 0.0_dp
|
||||
DO ih = 1, nh(nt)
|
||||
DO ibnd = nb_s, nb_e
|
||||
DO jh = 1, nh(nt)
|
||||
betapsi(ih,ibnd) = betapsi(ih,ibnd) + &
|
||||
qq_at(ih,jh,na) * dbetapsi(jh,ibnd)
|
||||
IF ( ityp(na).EQ.nt ) THEN
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ih = 1, nh_nt
|
||||
! now we compute the true dbeta function
|
||||
DO ig = 1, npw
|
||||
dbeta(ig,ih) = us_dy(ig,ijkb0+ih)*a1_temp(ig) + us_dj(ig,ijkb0+ih) * a2_temp(ig)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
dbetapsi(:,:) = betapsi(:,:)
|
||||
!
|
||||
DO ih = 1, nh(nt)
|
||||
DO ibnd = nb_s, nb_e
|
||||
betapsi(ih,ibnd) = 0.0_dp
|
||||
DO jh = 1, nh(nt)
|
||||
betapsi(ih,ibnd) = betapsi(ih,ibnd) + &
|
||||
qq_at(ih,jh,na) * betapsi0(jh,ibnd)
|
||||
!
|
||||
IF (ipol.EQ.jpol) THEN
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ih = 1, nh_nt
|
||||
DO ig = 1, npw
|
||||
dbeta(ig,ih) = dbeta(ig,ih) - vkb(ig,ijkb0+ih)*0.5d0
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL using_evc_d(0)
|
||||
!$acc host_data use_device(dbeta,dbetapsi,wfcU,wfatdbeta)
|
||||
CALL calbec_gpu(npw, dbeta, evc_d, dbetapsi )
|
||||
CALL calbec_gpu(npw, wfcU, dbeta, wfatdbeta )
|
||||
!$acc end host_data
|
||||
#else
|
||||
CALL calbec(npw, dbeta, evc, dbetapsi )
|
||||
CALL calbec(npw, wfcU, dbeta, wfatdbeta )
|
||||
#endif
|
||||
!
|
||||
! dbeta is now used as work space to store vkb
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ih = 1, nh_nt
|
||||
DO ig = 1, npw
|
||||
dbeta(ig,ih) = vkb(ig,ijkb0+ih)
|
||||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ijkb0 = ijkb0 + nh(nt)
|
||||
!
|
||||
! dproj(iwf,ibnd) = \sum_ih wfatdbeta(iwf,ih)*betapsi(ih,ibnd) +
|
||||
! wfatbeta(iwf,ih)*dbetapsi(ih,ibnd)
|
||||
!
|
||||
IF ( mykey == 0 .AND. nh(nt) > 0 ) THEN
|
||||
CALL DGEMM('N','N',nwfcU, nb_e-nb_s+1, nh(nt), 1.0_dp, &
|
||||
wfatdbeta, nwfcU, betapsi(1,nb_s), nh(nt), 1.0_dp,&
|
||||
dproj(1,nb_s), nwfcU)
|
||||
CALL DGEMM('N','N',nwfcU, nb_e-nb_s+1, nh(nt), 1.0_dp, &
|
||||
wfatbeta, nwfcU, dbetapsi(1,nb_s), nh(nt), 1.0_dp,&
|
||||
dproj(1,nb_s), nwfcU)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL using_evc_d(0)
|
||||
!$acc host_data use_device(dbeta,betapsi0,wfcU,wfatbeta)
|
||||
CALL calbec_gpu(npw, wfcU, dbeta, wfatbeta )
|
||||
CALL calbec_gpu(npw, dbeta, evc_d, betapsi0 )
|
||||
!$acc end host_data
|
||||
#else
|
||||
CALL calbec(npw, wfcU, dbeta, wfatbeta )
|
||||
CALL calbec(npw, dbeta, evc, betapsi0 )
|
||||
#endif
|
||||
!
|
||||
! here starts band parallelization
|
||||
! beta is here used as work space to calculate dbetapsi
|
||||
!
|
||||
!$acc kernels
|
||||
betapsi(:,:) = 0.0_dp
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ibnd = nb_s, nb_e
|
||||
DO ih = 1, nh_nt
|
||||
temp = 0.0d0
|
||||
DO jh = 1, nh_nt
|
||||
temp = temp + qq_at(ih,jh,na) * dbetapsi(jh,ibnd)
|
||||
ENDDO
|
||||
betapsi(ih,ibnd) = betapsi(ih,ibnd) + temp
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc kernels
|
||||
dbetapsi(:,:) = betapsi(:,:)
|
||||
betapsi(:,:) = 0.0_DP
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
DO ibnd = nb_s, nb_e
|
||||
DO ih = 1, nh_nt
|
||||
temp = 0.0d0
|
||||
DO jh = 1, nh_nt
|
||||
temp = temp + qq_at(ih,jh,na) * betapsi0(jh,ibnd)
|
||||
ENDDO
|
||||
betapsi(ih,ibnd) = betapsi(ih,ibnd) + temp
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ijkb0 = ijkb0 + nh(nt)
|
||||
!
|
||||
! dproj(iwf,ibnd) = \sum_ih wfatdbeta(iwf,ih)*betapsi(ih,ibnd) +
|
||||
! wfatbeta(iwf,ih)*dbetapsi(ih,ibnd)
|
||||
!
|
||||
IF ( mykey == 0 .AND. nh(nt) > 0 ) THEN
|
||||
!$acc host_data use_device(wfatdbeta,betapsi,dproj,wfatbeta,dbetapsi)
|
||||
CALL MYDGEMM('N','N',nwfcU, nb_e-nb_s+1, nh(nt), 1.0_dp, &
|
||||
wfatdbeta, nwfcU, betapsi(1,nb_s), nh(nt), 1.0_dp,&
|
||||
dproj(1,nb_s), nwfcU)
|
||||
CALL MYDGEMM('N','N',nwfcU, nb_e-nb_s+1, nh(nt), 1.0_dp, &
|
||||
wfatbeta, nwfcU, dbetapsi(1,nb_s), nh(nt), 1.0_dp,&
|
||||
dproj(1,nb_s), nwfcU)
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
! end band parallelization - only dproj(1,nb_s:nb_e) are calculated
|
||||
ENDIF
|
||||
! end band parallelization - only dproj(1,nb_s:nb_e) are calculated
|
||||
ENDIF
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE (dbeta, dbetapsi, betapsi, wfatbeta, wfatdbeta, betapsi0)
|
||||
ENDDO
|
||||
DEALLOCATE (dbeta, dbetapsi, betapsi, wfatbeta, wfatdbeta, betapsi0)
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE ( qm1, gk )
|
||||
DEALLOCATE ( a1_temp, a2_temp )
|
||||
!
|
||||
!$acc end data
|
||||
CALL stop_clock_gpu('dprojdepsilon')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
|
Loading…
Reference in New Issue