mirror of https://gitlab.com/QEF/q-e.git
fft_wave_wrap - evc and psic at the top (sum_band)
This commit is contained in:
parent
d77a134d5d
commit
7e93eb1f0b
|
@ -27,7 +27,7 @@ CONTAINS
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
SUBROUTINE wave_g2r( f_in, f_out, dfft, dim2, igk, howmany_set )
|
||||
SUBROUTINE wave_g2r( f_in, f_out, dfft, igk, howmany_set )
|
||||
!--------------------------------------------------------------------
|
||||
!
|
||||
USE fft_helper_subroutines, ONLY: c2psi_gamma, c2psi_k
|
||||
|
@ -35,19 +35,19 @@ CONTAINS
|
|||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(fft_type_descriptor), INTENT(IN) :: dfft
|
||||
INTEGER, INTENT(IN) :: dim2
|
||||
COMPLEX(DP) :: f_in(:,:)
|
||||
COMPLEX(DP) :: f_out(:)
|
||||
INTEGER, OPTIONAL, INTENT(IN) :: igk(:)
|
||||
INTEGER, OPTIONAL, INTENT(IN) :: howmany_set(3)
|
||||
!
|
||||
INTEGER :: i2, npw, numblock
|
||||
INTEGER :: j, idx, ioff, ntgrp, right_nnr
|
||||
INTEGER :: j, idx, ioff, ntgrp, right_nnr, dim2
|
||||
INTEGER, PARAMETER :: blocksize = 256
|
||||
!
|
||||
!$acc data present_or_copyin(f_in) present_or_copyout(f_out)
|
||||
!
|
||||
npw = SIZE(f_in(:,1))
|
||||
dim2 = SIZE(f_in(1,:))
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
IF ( dim2/=2 ) CALL c2psi_gamma( dfft, f_out, f_in(:,1) )
|
||||
|
|
|
@ -284,7 +284,7 @@ SUBROUTINE sum_band()
|
|||
REAL(DP), ALLOCATABLE :: tg_rho(:)
|
||||
!
|
||||
LOGICAL :: use_tg
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ebnd
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ebnd, brange
|
||||
REAL(DP) :: kplusgi
|
||||
!
|
||||
CALL using_evc(0); CALL using_et(0)
|
||||
|
@ -410,7 +410,7 @@ SUBROUTINE sum_band()
|
|||
ebnd = ibnd
|
||||
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
|
||||
!
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts, ebnd-ibnd+1 )
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts )
|
||||
!
|
||||
w1 = wg(ibnd,ik) / omega
|
||||
!
|
||||
|
@ -445,8 +445,9 @@ SUBROUTINE sum_band()
|
|||
!
|
||||
ebnd = ibnd
|
||||
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
|
||||
brange = ebnd-ibnd+1
|
||||
!
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:ebnd-ibnd+1), psic, dffts, ebnd-ibnd+1 )
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:brange), psic, dffts )
|
||||
!
|
||||
! ... increment the kinetic energy density ...
|
||||
!
|
||||
|
@ -576,7 +577,8 @@ SUBROUTINE sum_band()
|
|||
IF ( dmft .AND. .NOT. dmft_updated) THEN
|
||||
!
|
||||
DO j = 1, npw
|
||||
CALL ZGEMM('C', 'N', nbnd, 1, nbnd, (1.d0,0.d0), v_dmft(:,:,ik), nbnd, evc(j,:), nbnd, (0.d0,0.d0), evc(j,:), nbnd)
|
||||
CALL ZGEMM( 'C', 'N', nbnd, 1, nbnd, (1.d0,0.d0), v_dmft(:,:,ik), &
|
||||
nbnd, evc(j,:), nbnd, (0.d0,0.d0), evc(j,:), nbnd )
|
||||
ENDDO
|
||||
!
|
||||
IF ( nks > 1 ) &
|
||||
|
@ -590,7 +592,8 @@ SUBROUTINE sum_band()
|
|||
!
|
||||
IF( use_tg ) THEN
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
IF( idx + ibnd - 1 <= ibnd_end ) eband = eband + et( idx + ibnd - 1, ik ) * wg( idx + ibnd - 1, ik )
|
||||
IF( idx+ibnd-1 <= ibnd_end ) eband = eband + et(idx+ibnd-1,ik) * &
|
||||
wg(idx+ibnd-1,ik)
|
||||
END DO
|
||||
ELSE
|
||||
eband = eband + et(ibnd,ik) * wg(ibnd,ik)
|
||||
|
@ -653,19 +656,21 @@ SUBROUTINE sum_band()
|
|||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
CALL get_rho(tg_rho_nc(:,1), dffts%nr1x * dffts%nr2x* right_nr3, w1, tg_psi_nc(:,ipol))
|
||||
CALL get_rho( tg_rho_nc(:,1), dffts%nr1x*dffts%nr2x* &
|
||||
right_nr3, w1, tg_psi_nc(:,ipol) )
|
||||
ENDDO
|
||||
!
|
||||
IF (domag) CALL get_rho_domag(tg_rho_nc(:,:), dffts%nr1x*dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:))
|
||||
IF (domag) CALL get_rho_domag( tg_rho_nc(:,:), dffts%nr1x* &
|
||||
dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:) )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! Noncollinear case without task groups
|
||||
!
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), dffts, &
|
||||
1, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), &
|
||||
dffts, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), &
|
||||
psic_nc(:,2), dffts, 1, igk=igk_k(:,ik) )
|
||||
psic_nc(:,2), dffts, igk=igk_k(:,ik) )
|
||||
!
|
||||
! increment the charge density ...
|
||||
!
|
||||
|
@ -737,7 +742,7 @@ SUBROUTINE sum_band()
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, 1, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, igk=igk_k(:,ik) )
|
||||
!
|
||||
! ... increment the charge density ...
|
||||
!
|
||||
|
@ -752,7 +757,7 @@ SUBROUTINE sum_band()
|
|||
kplusg_evc(i,1) = CMPLX(0.D0,kplusgi,kind=DP) * evc(i,ibnd)
|
||||
ENDDO
|
||||
!
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, 1, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, igk=igk_k(:,ik) )
|
||||
!
|
||||
! ... increment the kinetic energy density ...
|
||||
!
|
||||
|
|
|
@ -36,7 +36,7 @@ SUBROUTINE sum_band_gpu()
|
|||
USE uspp, ONLY : nkb, vkb, becsum, ebecsum, nhtol, nhtoj, indv, okvan, &
|
||||
becsum_d, ebecsum_d
|
||||
USE uspp_param, ONLY : upf, nh, nhm
|
||||
USE wavefunctions, ONLY : evc, psic
|
||||
USE wavefunctions, ONLY : evc, psic, psic_nc
|
||||
USE noncollin_module, ONLY : noncolin, npol, nspin_mag, domag
|
||||
USE wvfct, ONLY : nbnd, npwx, wg, et, btype
|
||||
USE mp_pools, ONLY : inter_pool_comm
|
||||
|
@ -65,7 +65,7 @@ SUBROUTINE sum_band_gpu()
|
|||
npol_,&! auxiliary dimension for noncolin case
|
||||
ibnd_start, ibnd_end, this_bgrp_nbnd ! first, last and number of band in this bgrp
|
||||
REAL(DP), ALLOCATABLE :: kplusg(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: kplusg_evc(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psicd(:), kplusg_evc(:,:)
|
||||
!
|
||||
!
|
||||
CALL start_clock_gpu( 'sum_band' )
|
||||
|
@ -137,9 +137,7 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
IF ( okvan ) CALL allocate_bec_type( nkb, this_bgrp_nbnd, becp, intra_bgrp_comm )
|
||||
IF ( okvan ) CALL using_becp_auto( 2 )
|
||||
IF (xclib_dft_is('meta') .OR. lxdm) THEN
|
||||
ALLOCATE( kplusg(npwx), kplusg_evc(npwx,2) )
|
||||
ENDIF
|
||||
IF (xclib_dft_is('meta') .OR. lxdm) ALLOCATE( kplusg(npwx), kplusg_evc(npwx,2) )
|
||||
!
|
||||
! ... specialized routines are called to sum at Gamma or for each k point
|
||||
! ... the contribution of the wavefunctions to the charge
|
||||
|
@ -148,15 +146,23 @@ SUBROUTINE sum_band_gpu()
|
|||
eband = 0.D0
|
||||
!
|
||||
CALL start_clock_gpu( 'sum_band:loop' )
|
||||
!
|
||||
!$acc data create(evc)
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
CALL sum_band_gamma_gpu()
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
!$acc data create(psic_nc)
|
||||
CALL sum_band_k_gpu()
|
||||
!
|
||||
!$acc end data
|
||||
ELSE
|
||||
!$acc data create(psic)
|
||||
CALL sum_band_k_gpu()
|
||||
!$acc end data
|
||||
ENDIF
|
||||
ENDIF
|
||||
!$acc end data
|
||||
!
|
||||
CALL stop_clock_gpu( 'sum_band:loop' )
|
||||
CALL mp_sum( eband, inter_pool_comm )
|
||||
CALL mp_sum( eband, inter_bgrp_comm )
|
||||
|
@ -275,21 +281,19 @@ SUBROUTINE sum_band_gpu()
|
|||
! weights
|
||||
INTEGER :: npw, idx, ioff, ioff_tg, nxyp, incr, v_siz, j
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psi_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psic_d(:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho_d(:), tg_rho_h(:)
|
||||
REAL(DP), ALLOCATABLE :: rho_d(:,:)
|
||||
INTEGER, POINTER :: dffts_nl_d(:), dffts_nlm_d(:)
|
||||
LOGICAL :: use_tg
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr, ebnd, i
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr, ebnd, i, brange
|
||||
REAL(DP) :: kplusgi
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: psic_d, tg_psi_d, tg_rho_d, rho_d
|
||||
attributes(device) :: tg_psi_d, tg_rho_d, rho_d
|
||||
attributes(device) :: dffts_nl_d, dffts_nlm_d
|
||||
attributes(pinned) :: tg_rho_h
|
||||
#endif
|
||||
!
|
||||
! **TEMPORARY**
|
||||
!$acc data create(evc,psic)
|
||||
!$acc data create(psic)
|
||||
!
|
||||
CALL using_evc_d(0); CALL using_et(0)
|
||||
dffts_nl_d => dffts%nl_d
|
||||
|
@ -301,7 +305,7 @@ SUBROUTINE sum_band_gpu()
|
|||
use_tg = ( dffts%has_task_groups ) .AND. ( .NOT. (xclib_dft_is('meta') .OR. lxdm) )
|
||||
!
|
||||
incr = 2
|
||||
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
v_siz = dffts%nnr_tg
|
||||
|
@ -314,7 +318,6 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
ELSE
|
||||
ALLOCATE( rho_d, MOLD=rho%of_r ) ! OPTIMIZE HERE, use buffers (and batched FFT)
|
||||
ALLOCATE(psic_d(dfftp%nnr))
|
||||
rho_d = 0.0_DP
|
||||
ENDIF
|
||||
!
|
||||
|
@ -330,12 +333,13 @@ SUBROUTINE sum_band_gpu()
|
|||
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
|
||||
IF ( nks > 1 ) CALL using_evc(2) ! get_buffer(evc, ...) evc is updated (intent out)
|
||||
IF ( nks > 1 ) CALL using_evc_d(0) ! sync on the GPU
|
||||
!$acc update device(evc)
|
||||
!
|
||||
CALL stop_clock_gpu( 'sum_band:buffer' )
|
||||
!
|
||||
CALL start_clock_gpu( 'sum_band:init_us_2' )
|
||||
!
|
||||
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. )
|
||||
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
|
||||
!
|
||||
CALL stop_clock_gpu( 'sum_band:init_us_2' )
|
||||
!
|
||||
|
@ -380,9 +384,9 @@ SUBROUTINE sum_band_gpu()
|
|||
tg_psi_d(dffts_nlm_d(j)+ioff)=CONJG( evc_d(j,idx+ibnd-1) )
|
||||
ENDDO
|
||||
ENDIF
|
||||
|
||||
!
|
||||
ioff = ioff + right_nnr
|
||||
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psi_d, dffts )
|
||||
|
@ -416,15 +420,15 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
CALL get_rho_gamma_gpu(tg_rho_d, dffts%nr1x*dffts%nr2x*right_nr3, w1, w2, tg_psi_d)
|
||||
CALL get_rho_gamma_gpu( tg_rho_d, dffts%nr1x*dffts%nr2x*right_nr3, &
|
||||
w1, w2, tg_psi_d )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
ebnd = ibnd
|
||||
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
|
||||
!
|
||||
!$acc update device(evc)
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts, ebnd-ibnd+1 )
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts )
|
||||
!
|
||||
w1 = wg(ibnd,ik) / omega
|
||||
!
|
||||
|
@ -460,8 +464,9 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
ebnd = ibnd
|
||||
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
|
||||
brange = ebnd-ibnd+1
|
||||
!
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:ebnd-ibnd+1), psic, dffts, ebnd-ibnd+1 )
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:brange), psic, dffts )
|
||||
!$acc update self(psic)
|
||||
!
|
||||
! ... increment the kinetic energy density ...
|
||||
|
@ -485,11 +490,12 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
! ... If we have a US pseudopotential we compute here the becsum term
|
||||
!
|
||||
IF ( okvan ) CALL sum_bec_gpu ( ik, current_spin, ibnd_start,ibnd_end,this_bgrp_nbnd )
|
||||
IF ( okvan ) CALL sum_bec_gpu( ik, current_spin, ibnd_start, ibnd_end, &
|
||||
this_bgrp_nbnd )
|
||||
!
|
||||
ENDDO k_loop
|
||||
!
|
||||
IF( .not. use_tg ) THEN
|
||||
IF( .NOT. use_tg ) THEN
|
||||
rho%of_r = rho_d
|
||||
ENDIF
|
||||
!
|
||||
|
@ -506,17 +512,16 @@ SUBROUTINE sum_band_gpu()
|
|||
ebecsum_d=ebecsum
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_psi_d )
|
||||
DEALLOCATE( tg_rho_d )
|
||||
DEALLOCATE( tg_rho_h )
|
||||
ELSE
|
||||
DEALLOCATE(rho_d)
|
||||
DEALLOCATE(psic_d)
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE sum_band_gamma_gpu
|
||||
|
@ -527,12 +532,11 @@ SUBROUTINE sum_band_gpu()
|
|||
!-----------------------------------------------------------------------
|
||||
!! \(\texttt{sum_band}\) - part for k-points version
|
||||
!
|
||||
USE wavefunctions_gpum, ONLY : psic_nc_d
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE mp, ONLY : mp_sum, mp_get_comm_null
|
||||
USE control_flags, ONLY : many_fft
|
||||
USE fft_helper_subroutines
|
||||
USE uspp_init, ONLY : init_us_2
|
||||
USE control_flags, ONLY : many_fft
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -543,11 +547,11 @@ SUBROUTINE sum_band_gpu()
|
|||
INTEGER :: npw, ipol, na, np
|
||||
!
|
||||
INTEGER :: idx, ioff, ioff_tg, nxyp, incr, v_siz
|
||||
COMPLEX(DP), ALLOCATABLE :: psicd(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psi_d(:), tg_psi_nc_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho_d(:), tg_rho_nc_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho_h(:), tg_rho_nc_h(:,:)
|
||||
REAL(DP), ALLOCATABLE :: rho_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psic_d(:), psicd(:), psicncd(:,:)
|
||||
INTEGER, POINTER :: dffts_nl_d(:)
|
||||
LOGICAL :: use_tg
|
||||
INTEGER :: nnr, right_nnr, right_nr3, right_inc, ntgrp, ierr
|
||||
|
@ -555,7 +559,7 @@ SUBROUTINE sum_band_gpu()
|
|||
REAL(DP) :: kplusgi
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: psic_d, tg_psi_d, tg_rho_d, tg_psi_nc_d, tg_rho_nc_d
|
||||
attributes(device) :: tg_psi_d, tg_rho_d, tg_psi_nc_d, tg_rho_nc_d
|
||||
attributes(device) :: rho_d, dffts_nl_d
|
||||
attributes(pinned) :: tg_rho_h, tg_rho_nc_h
|
||||
#endif
|
||||
|
@ -563,9 +567,6 @@ SUBROUTINE sum_band_gpu()
|
|||
CALL using_evc(0); CALL using_evc_d(0); CALL using_et(0)
|
||||
dffts_nl_d => dffts%nl_d
|
||||
!
|
||||
! **TEMPORARY**
|
||||
!$acc data create(evc)
|
||||
!
|
||||
! ... here we sum for each k point the contribution
|
||||
! ... of the wavefunctions to the charge
|
||||
!
|
||||
|
@ -592,16 +593,17 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
ELSE
|
||||
ALLOCATE( rho_d, MOLD=rho%of_r ) ! OPTIMIZE HERE, use buffers!
|
||||
IF (noncolin .or. (xclib_dft_is('meta') .OR. lxdm)) THEN
|
||||
ALLOCATE(psic_d(dffts%nnr))
|
||||
IF (noncolin .OR. (xclib_dft_is('meta') .OR. lxdm)) THEN
|
||||
ALLOCATE( psicd(dffts%nnr) )
|
||||
incr = 1
|
||||
ELSE
|
||||
ALLOCATE(psic_d(dffts%nnr * many_fft))
|
||||
ALLOCATE( psicd(dffts%nnr*many_fft) )
|
||||
incr = many_fft
|
||||
ENDIF
|
||||
! This is used as reduction variable on the device
|
||||
rho_d = 0.0_DP
|
||||
ENDIF
|
||||
!$acc data create(psicd)
|
||||
!
|
||||
k_loop: DO ik = 1, nks
|
||||
!
|
||||
|
@ -612,20 +614,22 @@ SUBROUTINE sum_band_gpu()
|
|||
tg_rho_d = 0.0_DP
|
||||
ENDIF
|
||||
ENDIF
|
||||
|
||||
!
|
||||
IF ( lsda ) current_spin = isk(ik)
|
||||
npw = ngk (ik)
|
||||
!
|
||||
CALL start_clock_gpu( 'sum_band:buffer' )
|
||||
IF ( nks > 1 ) &
|
||||
IF ( nks > 1 ) THEN
|
||||
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
|
||||
IF ( nks > 1 ) CALL using_evc(2)
|
||||
IF ( nks > 1 ) CALL using_evc_d(0) ! sync evc on GPU, OPTIMIZE (use async here)
|
||||
CALL using_evc(2)
|
||||
CALL using_evc_d(0) ! sync evc on GPU, OPTIMIZE (use async here)
|
||||
ENDIF
|
||||
!$acc update device(evc)
|
||||
CALL stop_clock_gpu( 'sum_band:buffer' )
|
||||
!
|
||||
CALL start_clock_gpu( 'sum_band:init_us_2' )
|
||||
!
|
||||
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. )
|
||||
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
|
||||
!
|
||||
CALL stop_clock_gpu( 'sum_band:init_us_2' )
|
||||
!
|
||||
|
@ -635,7 +639,8 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
!IF( use_tg ) THEN
|
||||
DO idx = 1, incr
|
||||
IF( idx + ibnd - 1 <= ibnd_end ) eband = eband + et( idx + ibnd - 1, ik ) * wg( idx + ibnd - 1, ik )
|
||||
IF( idx+ibnd-1 <= ibnd_end ) eband = eband + et(idx+ibnd-1,ik) * &
|
||||
wg(idx+ibnd-1,ik)
|
||||
ENDDO
|
||||
!ELSE
|
||||
! eband = eband + et( ibnd, ik ) * wg( ibnd, ik )
|
||||
|
@ -669,9 +674,9 @@ SUBROUTINE sum_band_gpu()
|
|||
evc_d( j+npwx, idx+ibnd-1 )
|
||||
END DO
|
||||
END IF
|
||||
|
||||
!
|
||||
ioff = ioff + right_nnr
|
||||
|
||||
!
|
||||
END DO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psi_nc_d(:,1), dffts)
|
||||
|
@ -700,30 +705,27 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
! OPTIMIZE HERE : this is a sum of all densities in first spin channel
|
||||
DO ipol = 1, npol
|
||||
CALL get_rho_gpu(tg_rho_nc_d(:,1), dffts%nr1x*dffts%nr2x*right_nr3, w1, tg_psi_nc_d(:,ipol))
|
||||
CALL get_rho_gpu( tg_rho_nc_d(:,1), dffts%nr1x*dffts%nr2x* &
|
||||
right_nr3, w1, tg_psi_nc_d(:,ipol) )
|
||||
ENDDO
|
||||
!
|
||||
IF (domag) CALL get_rho_domag_gpu(tg_rho_nc_d(:,:), dffts%nr1x*dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc_d(:,:))
|
||||
IF (domag) CALL get_rho_domag_gpu( tg_rho_nc_d(:,:), dffts%nr1x* &
|
||||
dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc_d(:,:) )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! Noncollinear case without task groups
|
||||
!
|
||||
!....temporary---------------------------------------------
|
||||
!$acc update device(evc)
|
||||
ALLOCATE( psicncd(dffts%nnr,npol) )
|
||||
!$acc data create(psicncd)
|
||||
!-----------------------------------------------------------
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psicncd(:,1), &
|
||||
dffts, 1, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), psicncd(:,2), &
|
||||
dffts, 1, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), &
|
||||
dffts, igk=igk_k(:,ik) )
|
||||
CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), psic_nc(:,2), &
|
||||
dffts, igk=igk_k(:,ik) )
|
||||
!
|
||||
! Increment the charge density ...
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
!$acc host_data use_device(psicncd)
|
||||
CALL get_rho_gpu( rho_d(:,1), dffts%nnr, w1, psicncd(:,ipol) )
|
||||
!$acc host_data use_device(psic_nc)
|
||||
CALL get_rho_gpu( rho_d(:,1), dffts%nnr, w1, psic_nc(:,ipol) )
|
||||
!$acc end host_data
|
||||
ENDDO
|
||||
!
|
||||
|
@ -731,18 +733,13 @@ SUBROUTINE sum_band_gpu()
|
|||
! components of the magnetization (stored in rho%of_r(ir,2-4))
|
||||
!
|
||||
IF (domag) THEN
|
||||
!$acc host_data use_device(psicncd)
|
||||
CALL get_rho_domag_gpu( rho_d(1:,1:), dffts%nnr, w1, psicncd(1:,1:) )
|
||||
!$acc host_data use_device(psic_nc)
|
||||
CALL get_rho_domag_gpu( rho_d(1:,1:), dffts%nnr, w1, psic_nc(1:,1:) )
|
||||
!$acc end host_data
|
||||
ELSE
|
||||
rho_d(:,2:4) = 0.0_DP ! OPTIMIZE HERE: this memset can be avoided
|
||||
ENDIF
|
||||
!
|
||||
!---------------------------------
|
||||
!$acc end data
|
||||
DEALLOCATE( psicncd )
|
||||
!-----------------------------------
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ELSE
|
||||
|
@ -769,9 +766,9 @@ SUBROUTINE sum_band_gpu()
|
|||
tg_psi_d( dffts_nl_d(igk_k_d(j,ik))+ioff ) = evc_d(j,idx+ibnd-1)
|
||||
END DO
|
||||
END IF
|
||||
|
||||
!
|
||||
ioff = ioff + right_nnr
|
||||
|
||||
!
|
||||
END DO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psi_d, dffts)
|
||||
|
@ -799,18 +796,12 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
CALL get_rho_gpu(tg_rho_d, dffts%nr1x * dffts%nr2x * right_nr3, w1, tg_psi_d)
|
||||
!
|
||||
ELSE IF (many_fft > 1 .and. (.not. (xclib_dft_is('meta') .OR. lxdm))) THEN
|
||||
ELSEIF (many_fft > 1 .AND. (.NOT. (xclib_dft_is('meta') .OR. lxdm))) THEN
|
||||
!
|
||||
group_size = MIN(many_fft,ibnd_end-(ibnd-1))
|
||||
hm_vec(1)=group_size ; hm_vec(2)=ibnd ; hm_vec(3)=npw
|
||||
!
|
||||
!$acc update device(evc)
|
||||
|
||||
!....temporary---------------------------------------------
|
||||
ALLOCATE( psicd(dffts%nnr*many_fft) )
|
||||
!$acc data create(psicd)
|
||||
!-----------------------------------------------------------
|
||||
CALL wave_g2r( evc, psicd, dffts, 1, igk=igk_k(:,ik), &
|
||||
CALL wave_g2r( evc, psicd, dffts, igk=igk_k(:,ik), &
|
||||
howmany_set=hm_vec )
|
||||
!
|
||||
! ... increment the charge density ...
|
||||
|
@ -821,14 +812,10 @@ SUBROUTINE sum_band_gpu()
|
|||
CALL get_rho_gpu( rho_d(:,current_spin), nnr, w1, psicd(i*nnr+1:) )
|
||||
!$acc end host_data
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE( psicd )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
!$acc update device(evc)
|
||||
!$acc data create(psic)
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, 1, &
|
||||
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, &
|
||||
igk=igk_k(:,ik) )
|
||||
!
|
||||
! ... increment the charge density ...
|
||||
|
@ -836,7 +823,6 @@ SUBROUTINE sum_band_gpu()
|
|||
!$acc host_data use_device(psic)
|
||||
CALL get_rho_gpu( rho_d(:,current_spin), dffts%nnr, w1, psic )
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
|
@ -847,8 +833,9 @@ SUBROUTINE sum_band_gpu()
|
|||
kplusg_evc(i,1) = CMPLX(0.D0,kplusgi,kind=DP) * evc(i,ibnd)
|
||||
ENDDO
|
||||
!
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, 1, &
|
||||
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, &
|
||||
igk=igk_k(:,ik) )
|
||||
!$acc update self(psic)
|
||||
!
|
||||
! ... increment the kinetic energy density ...
|
||||
!
|
||||
|
@ -876,6 +863,8 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
END DO k_loop
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
IF (.not. use_tg ) THEN
|
||||
rho%of_r = rho_d
|
||||
END IF
|
||||
|
@ -905,11 +894,9 @@ SUBROUTINE sum_band_gpu()
|
|||
END IF
|
||||
ELSE
|
||||
DEALLOCATE( rho_d ) ! OPTIMIZE HERE, use buffers!
|
||||
DEALLOCATE(psic_d) ! OPTIMIZE HERE, use buffers!
|
||||
DEALLOCATE( psicd )
|
||||
END IF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE sum_band_k_gpu
|
||||
|
|
Loading…
Reference in New Issue