fft_wave_wrap - evc and psic at the top (sum_band)

This commit is contained in:
fabrizio22 2022-08-11 14:44:20 +02:00
parent d77a134d5d
commit 7e93eb1f0b
3 changed files with 139 additions and 147 deletions

View File

@ -27,7 +27,7 @@ CONTAINS
!
!
!----------------------------------------------------------------------
SUBROUTINE wave_g2r( f_in, f_out, dfft, dim2, igk, howmany_set )
SUBROUTINE wave_g2r( f_in, f_out, dfft, igk, howmany_set )
!--------------------------------------------------------------------
!
USE fft_helper_subroutines, ONLY: c2psi_gamma, c2psi_k
@ -35,19 +35,19 @@ CONTAINS
IMPLICIT NONE
!
TYPE(fft_type_descriptor), INTENT(IN) :: dfft
INTEGER, INTENT(IN) :: dim2
COMPLEX(DP) :: f_in(:,:)
COMPLEX(DP) :: f_out(:)
INTEGER, OPTIONAL, INTENT(IN) :: igk(:)
INTEGER, OPTIONAL, INTENT(IN) :: howmany_set(3)
!
INTEGER :: i2, npw, numblock
INTEGER :: j, idx, ioff, ntgrp, right_nnr
INTEGER :: j, idx, ioff, ntgrp, right_nnr, dim2
INTEGER, PARAMETER :: blocksize = 256
!
!$acc data present_or_copyin(f_in) present_or_copyout(f_out)
!
npw = SIZE(f_in(:,1))
npw = SIZE(f_in(:,1))
dim2 = SIZE(f_in(1,:))
!
IF (gamma_only) THEN
IF ( dim2/=2 ) CALL c2psi_gamma( dfft, f_out, f_in(:,1) )

View File

@ -284,7 +284,7 @@ SUBROUTINE sum_band()
REAL(DP), ALLOCATABLE :: tg_rho(:)
!
LOGICAL :: use_tg
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ebnd
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ebnd, brange
REAL(DP) :: kplusgi
!
CALL using_evc(0); CALL using_et(0)
@ -410,7 +410,7 @@ SUBROUTINE sum_band()
ebnd = ibnd
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
!
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts, ebnd-ibnd+1 )
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts )
!
w1 = wg(ibnd,ik) / omega
!
@ -445,8 +445,9 @@ SUBROUTINE sum_band()
!
ebnd = ibnd
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
brange = ebnd-ibnd+1
!
CALL wave_g2r( kplusg_evc(1:npw,1:ebnd-ibnd+1), psic, dffts, ebnd-ibnd+1 )
CALL wave_g2r( kplusg_evc(1:npw,1:brange), psic, dffts )
!
! ... increment the kinetic energy density ...
!
@ -576,7 +577,8 @@ SUBROUTINE sum_band()
IF ( dmft .AND. .NOT. dmft_updated) THEN
!
DO j = 1, npw
CALL ZGEMM('C', 'N', nbnd, 1, nbnd, (1.d0,0.d0), v_dmft(:,:,ik), nbnd, evc(j,:), nbnd, (0.d0,0.d0), evc(j,:), nbnd)
CALL ZGEMM( 'C', 'N', nbnd, 1, nbnd, (1.d0,0.d0), v_dmft(:,:,ik), &
nbnd, evc(j,:), nbnd, (0.d0,0.d0), evc(j,:), nbnd )
ENDDO
!
IF ( nks > 1 ) &
@ -590,10 +592,11 @@ SUBROUTINE sum_band()
!
IF( use_tg ) THEN
DO idx = 1, fftx_ntgrp(dffts)
IF( idx + ibnd - 1 <= ibnd_end ) eband = eband + et( idx + ibnd - 1, ik ) * wg( idx + ibnd - 1, ik )
IF( idx+ibnd-1 <= ibnd_end ) eband = eband + et(idx+ibnd-1,ik) * &
wg(idx+ibnd-1,ik)
END DO
ELSE
eband = eband + et( ibnd, ik ) * wg( ibnd, ik )
eband = eband + et(ibnd,ik) * wg(ibnd,ik)
END IF
!
! ... the sum of eband and demet is the integral for e < ef of
@ -652,37 +655,39 @@ SUBROUTINE sum_band()
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
DO ipol=1,npol
CALL get_rho(tg_rho_nc(:,1), dffts%nr1x * dffts%nr2x* right_nr3, w1, tg_psi_nc(:,ipol))
DO ipol = 1, npol
CALL get_rho( tg_rho_nc(:,1), dffts%nr1x*dffts%nr2x* &
right_nr3, w1, tg_psi_nc(:,ipol) )
ENDDO
!
IF (domag) CALL get_rho_domag(tg_rho_nc(:,:), dffts%nr1x*dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:))
IF (domag) CALL get_rho_domag( tg_rho_nc(:,:), dffts%nr1x* &
dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:) )
!
ELSE
!
! Noncollinear case without task groups
!
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), dffts, &
1, igk=igk_k(:,ik) )
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), &
dffts, igk=igk_k(:,ik) )
CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), &
psic_nc(:,2), dffts, 1, igk=igk_k(:,ik) )
psic_nc(:,2), dffts, igk=igk_k(:,ik) )
!
! increment the charge density ...
!
DO ipol=1,npol
CALL get_rho(rho%of_r(:,1), dffts%nnr, w1, psic_nc(:,ipol))
END DO
DO ipol = 1, npol
CALL get_rho( rho%of_r(:,1), dffts%nnr, w1, psic_nc(:,ipol) )
ENDDO
!
! In this case, calculate also the three
! components of the magnetization (stored in rho%of_r(ir,2-4))
!
IF (domag) THEN
CALL get_rho_domag(rho%of_r(:,:), dffts%nnr, w1, psic_nc(:,:))
CALL get_rho_domag( rho%of_r(:,:), dffts%nnr, w1, psic_nc(:,:) )
ELSE
rho%of_r(:,2:4)=0.0_DP
END IF
rho%of_r(:,2:4) = 0.0_DP
ENDIF
!
END IF
ENDIF
!
ELSE
!
@ -737,7 +742,7 @@ SUBROUTINE sum_band()
!
ELSE
!
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, 1, igk=igk_k(:,ik) )
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, igk=igk_k(:,ik) )
!
! ... increment the charge density ...
!
@ -752,7 +757,7 @@ SUBROUTINE sum_band()
kplusg_evc(i,1) = CMPLX(0.D0,kplusgi,kind=DP) * evc(i,ibnd)
ENDDO
!
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, 1, igk=igk_k(:,ik) )
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, igk=igk_k(:,ik) )
!
! ... increment the kinetic energy density ...
!

View File

@ -36,7 +36,7 @@ SUBROUTINE sum_band_gpu()
USE uspp, ONLY : nkb, vkb, becsum, ebecsum, nhtol, nhtoj, indv, okvan, &
becsum_d, ebecsum_d
USE uspp_param, ONLY : upf, nh, nhm
USE wavefunctions, ONLY : evc, psic
USE wavefunctions, ONLY : evc, psic, psic_nc
USE noncollin_module, ONLY : noncolin, npol, nspin_mag, domag
USE wvfct, ONLY : nbnd, npwx, wg, et, btype
USE mp_pools, ONLY : inter_pool_comm
@ -65,7 +65,7 @@ SUBROUTINE sum_band_gpu()
npol_,&! auxiliary dimension for noncolin case
ibnd_start, ibnd_end, this_bgrp_nbnd ! first, last and number of band in this bgrp
REAL(DP), ALLOCATABLE :: kplusg(:)
COMPLEX(DP), ALLOCATABLE :: kplusg_evc(:,:)
COMPLEX(DP), ALLOCATABLE :: psicd(:), kplusg_evc(:,:)
!
!
CALL start_clock_gpu( 'sum_band' )
@ -135,28 +135,34 @@ SUBROUTINE sum_band_gpu()
!
! ... Allocate (and later deallocate) arrays needed in specific cases
!
IF ( okvan ) CALL allocate_bec_type (nkb, this_bgrp_nbnd, becp, intra_bgrp_comm)
IF ( okvan ) CALL using_becp_auto(2)
IF (xclib_dft_is('meta') .OR. lxdm) THEN
ALLOCATE( kplusg(npwx), kplusg_evc(npwx,2) )
ENDIF
IF ( okvan ) CALL allocate_bec_type( nkb, this_bgrp_nbnd, becp, intra_bgrp_comm )
IF ( okvan ) CALL using_becp_auto( 2 )
IF (xclib_dft_is('meta') .OR. lxdm) ALLOCATE( kplusg(npwx), kplusg_evc(npwx,2) )
!
! ... specialized routines are called to sum at Gamma or for each k point
! ... the contribution of the wavefunctions to the charge
! ... The band energy contribution eband is computed together with the charge
!
eband = 0.D0
eband = 0.D0
!
CALL start_clock_gpu( 'sum_band:loop' )
!
!$acc data create(evc)
IF ( gamma_only ) THEN
!
CALL sum_band_gamma_gpu()
!
ELSE
!
CALL sum_band_k_gpu()
!
IF (noncolin) THEN
!$acc data create(psic_nc)
CALL sum_band_k_gpu()
!$acc end data
ELSE
!$acc data create(psic)
CALL sum_band_k_gpu()
!$acc end data
ENDIF
ENDIF
!$acc end data
!
CALL stop_clock_gpu( 'sum_band:loop' )
CALL mp_sum( eband, inter_pool_comm )
CALL mp_sum( eband, inter_bgrp_comm )
@ -261,11 +267,11 @@ SUBROUTINE sum_band_gpu()
!-----------------------------------------------------------------------
!! \(\texttt{sum_band}\) - part for gamma version.
!
USE becmod, ONLY : becp
USE mp_bands, ONLY : me_bgrp
USE mp, ONLY : mp_sum, mp_get_comm_null
USE becmod, ONLY : becp
USE mp_bands, ONLY : me_bgrp
USE mp, ONLY : mp_sum, mp_get_comm_null
USE fft_helper_subroutines
USE uspp_init, ONLY : init_us_2
USE uspp_init, ONLY : init_us_2
!
IMPLICIT NONE
!
@ -275,21 +281,19 @@ SUBROUTINE sum_band_gpu()
! weights
INTEGER :: npw, idx, ioff, ioff_tg, nxyp, incr, v_siz, j
COMPLEX(DP), ALLOCATABLE :: tg_psi_d(:)
COMPLEX(DP), ALLOCATABLE :: psic_d(:)
REAL(DP), ALLOCATABLE :: tg_rho_d(:), tg_rho_h(:)
REAL(DP), ALLOCATABLE :: rho_d(:,:)
INTEGER, POINTER :: dffts_nl_d(:), dffts_nlm_d(:)
LOGICAL :: use_tg
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr, ebnd, i
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr, ebnd, i, brange
REAL(DP) :: kplusgi
#if defined(__CUDA)
attributes(device) :: psic_d, tg_psi_d, tg_rho_d, rho_d
attributes(device) :: tg_psi_d, tg_rho_d, rho_d
attributes(device) :: dffts_nl_d, dffts_nlm_d
attributes(pinned) :: tg_rho_h
#endif
!
! **TEMPORARY**
!$acc data create(evc,psic)
!$acc data create(psic)
!
CALL using_evc_d(0); CALL using_et(0)
dffts_nl_d => dffts%nl_d
@ -301,7 +305,7 @@ SUBROUTINE sum_band_gpu()
use_tg = ( dffts%has_task_groups ) .AND. ( .NOT. (xclib_dft_is('meta') .OR. lxdm) )
!
incr = 2
!
IF( use_tg ) THEN
!
v_siz = dffts%nnr_tg
@ -310,13 +314,12 @@ SUBROUTINE sum_band_gpu()
ALLOCATE( tg_rho_d( v_siz ) )
ALLOCATE( tg_rho_h( v_siz ) )
!
incr = 2 * fftx_ntgrp(dffts)
incr = 2 * fftx_ntgrp(dffts)
!
ELSE
ALLOCATE( rho_d, MOLD=rho%of_r ) ! OPTIMIZE HERE, use buffers (and batched FFT)
ALLOCATE(psic_d(dfftp%nnr))
rho_d = 0.0_DP
END IF
ENDIF
!
k_loop: DO ik = 1, nks
!
@ -330,12 +333,13 @@ SUBROUTINE sum_band_gpu()
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
IF ( nks > 1 ) CALL using_evc(2) ! get_buffer(evc, ...) evc is updated (intent out)
IF ( nks > 1 ) CALL using_evc_d(0) ! sync on the GPU
!$acc update device(evc)
!
CALL stop_clock_gpu( 'sum_band:buffer' )
!
CALL start_clock_gpu( 'sum_band:init_us_2' )
!
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. )
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
!
CALL stop_clock_gpu( 'sum_band:init_us_2' )
!
@ -349,7 +353,7 @@ SUBROUTINE sum_band_gpu()
!
eband = eband + et(ibnd,ik) * wg(ibnd,ik)
!
END DO
ENDDO
!
DO ibnd = ibnd_start, ibnd_end, incr
!
@ -372,18 +376,18 @@ SUBROUTINE sum_band_gpu()
(0.0d0,1.d0) * evc_d(j,idx+ibnd)
tg_psi_d(dffts_nlm_d(j)+ioff)=CONJG(evc_d(j,idx+ibnd-1) -&
(0.0d0,1.d0) * evc_d(j,idx+ibnd) )
END DO
ELSE IF( idx+ibnd-1 == ibnd_end ) THEN
ENDDO
ELSEIF( idx+ibnd-1 == ibnd_end ) THEN
!$cuf kernel do(1) <<<*,*>>>
DO j = 1, npw
tg_psi_d(dffts_nl_d (j)+ioff)= evc_d(j,idx+ibnd-1)
tg_psi_d(dffts_nlm_d(j)+ioff)=CONJG( evc_d(j,idx+ibnd-1) )
END DO
END IF
ENDDO
ENDIF
!
ioff = ioff + right_nnr
END DO
!
ENDDO
!
CALL invfft ('tgWave', tg_psi_d, dffts )
!
@ -406,25 +410,25 @@ SUBROUTINE sum_band_gpu()
IF( idx + ibnd - 1 < ibnd_end ) THEN
w1 = wg( idx + ibnd - 1, ik) / omega
w2 = wg( idx + ibnd , ik) / omega
ELSE IF( idx + ibnd - 1 == ibnd_end ) THEN
ELSEIF( idx + ibnd - 1 == ibnd_end ) THEN
w1 = wg( idx + ibnd - 1, ik) / omega
w2 = w1
ELSE
w1 = 0.0d0
w2 = w1
END IF
ENDIF
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
CALL get_rho_gamma_gpu(tg_rho_d, dffts%nr1x*dffts%nr2x*right_nr3, w1, w2, tg_psi_d)
CALL get_rho_gamma_gpu( tg_rho_d, dffts%nr1x*dffts%nr2x*right_nr3, &
w1, w2, tg_psi_d )
!
ELSE
!
ebnd = ibnd
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
!
!$acc update device(evc)
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts, ebnd-ibnd+1 )
CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts )
!
w1 = wg(ibnd,ik) / omega
!
@ -460,8 +464,9 @@ SUBROUTINE sum_band_gpu()
!
ebnd = ibnd
IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
brange = ebnd-ibnd+1
!
CALL wave_g2r( kplusg_evc(1:npw,1:ebnd-ibnd+1), psic, dffts, ebnd-ibnd+1 )
CALL wave_g2r( kplusg_evc(1:npw,1:brange), psic, dffts )
!$acc update self(psic)
!
! ... increment the kinetic energy density ...
@ -485,13 +490,14 @@ SUBROUTINE sum_band_gpu()
!
! ... If we have a US pseudopotential we compute here the becsum term
!
IF ( okvan ) CALL sum_bec_gpu ( ik, current_spin, ibnd_start,ibnd_end,this_bgrp_nbnd )
IF ( okvan ) CALL sum_bec_gpu( ik, current_spin, ibnd_start, ibnd_end, &
this_bgrp_nbnd )
!
END DO k_loop
ENDDO k_loop
!
IF( .not. use_tg ) THEN
IF( .NOT. use_tg ) THEN
rho%of_r = rho_d
END IF
ENDIF
!
! ... with distributed <beta|psi>, sum over bands
!
@ -506,16 +512,15 @@ SUBROUTINE sum_band_gpu()
ebecsum_d=ebecsum
ENDIF
!
!$acc end data
!
IF( use_tg ) THEN
DEALLOCATE( tg_psi_d )
DEALLOCATE( tg_rho_d )
DEALLOCATE( tg_rho_h )
ELSE
DEALLOCATE(rho_d)
DEALLOCATE(psic_d)
END IF
ENDIF
!
!$acc end data
!
RETURN
!
@ -527,12 +532,11 @@ SUBROUTINE sum_band_gpu()
!-----------------------------------------------------------------------
!! \(\texttt{sum_band}\) - part for k-points version
!
USE wavefunctions_gpum, ONLY : psic_nc_d
USE mp_bands, ONLY : me_bgrp
USE mp, ONLY : mp_sum, mp_get_comm_null
USE control_flags, ONLY : many_fft
USE mp_bands, ONLY : me_bgrp
USE mp, ONLY : mp_sum, mp_get_comm_null
USE fft_helper_subroutines
USE uspp_init, ONLY : init_us_2
USE uspp_init, ONLY : init_us_2
USE control_flags, ONLY : many_fft
!
IMPLICIT NONE
!
@ -543,11 +547,11 @@ SUBROUTINE sum_band_gpu()
INTEGER :: npw, ipol, na, np
!
INTEGER :: idx, ioff, ioff_tg, nxyp, incr, v_siz
COMPLEX(DP), ALLOCATABLE :: psicd(:)
COMPLEX(DP), ALLOCATABLE :: tg_psi_d(:), tg_psi_nc_d(:,:)
REAL(DP), ALLOCATABLE :: tg_rho_d(:), tg_rho_nc_d(:,:)
REAL(DP), ALLOCATABLE :: tg_rho_h(:), tg_rho_nc_h(:,:)
REAL(DP), ALLOCATABLE :: rho_d(:,:)
COMPLEX(DP), ALLOCATABLE :: psic_d(:), psicd(:), psicncd(:,:)
INTEGER, POINTER :: dffts_nl_d(:)
LOGICAL :: use_tg
INTEGER :: nnr, right_nnr, right_nr3, right_inc, ntgrp, ierr
@ -555,7 +559,7 @@ SUBROUTINE sum_band_gpu()
REAL(DP) :: kplusgi
!
#if defined(__CUDA)
attributes(device) :: psic_d, tg_psi_d, tg_rho_d, tg_psi_nc_d, tg_rho_nc_d
attributes(device) :: tg_psi_d, tg_rho_d, tg_psi_nc_d, tg_rho_nc_d
attributes(device) :: rho_d, dffts_nl_d
attributes(pinned) :: tg_rho_h, tg_rho_nc_h
#endif
@ -563,9 +567,6 @@ SUBROUTINE sum_band_gpu()
CALL using_evc(0); CALL using_evc_d(0); CALL using_et(0)
dffts_nl_d => dffts%nl_d
!
! **TEMPORARY**
!$acc data create(evc)
!
! ... here we sum for each k point the contribution
! ... of the wavefunctions to the charge
!
@ -588,20 +589,21 @@ SUBROUTINE sum_band_gpu()
ALLOCATE( tg_rho_h( v_siz ) )
ENDIF
!
incr = fftx_ntgrp(dffts)
incr = fftx_ntgrp(dffts)
!
ELSE
ALLOCATE(rho_d, MOLD=rho%of_r) ! OPTIMIZE HERE, use buffers!
IF (noncolin .or. (xclib_dft_is('meta') .OR. lxdm)) THEN
ALLOCATE(psic_d(dffts%nnr))
incr = 1
ALLOCATE( rho_d, MOLD=rho%of_r ) ! OPTIMIZE HERE, use buffers!
IF (noncolin .OR. (xclib_dft_is('meta') .OR. lxdm)) THEN
ALLOCATE( psicd(dffts%nnr) )
incr = 1
ELSE
ALLOCATE(psic_d(dffts%nnr * many_fft))
incr = many_fft
END IF
ALLOCATE( psicd(dffts%nnr*many_fft) )
incr = many_fft
ENDIF
! This is used as reduction variable on the device
rho_d = 0.0_DP
END IF
ENDIF
!$acc data create(psicd)
!
k_loop: DO ik = 1, nks
!
@ -612,20 +614,22 @@ SUBROUTINE sum_band_gpu()
tg_rho_d = 0.0_DP
ENDIF
ENDIF
!
IF ( lsda ) current_spin = isk(ik)
npw = ngk (ik)
!
CALL start_clock_gpu( 'sum_band:buffer' )
IF ( nks > 1 ) &
IF ( nks > 1 ) THEN
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
IF ( nks > 1 ) CALL using_evc(2)
IF ( nks > 1 ) CALL using_evc_d(0) ! sync evc on GPU, OPTIMIZE (use async here)
CALL using_evc(2)
CALL using_evc_d(0) ! sync evc on GPU, OPTIMIZE (use async here)
ENDIF
!$acc update device(evc)
CALL stop_clock_gpu( 'sum_band:buffer' )
!
CALL start_clock_gpu( 'sum_band:init_us_2' )
!
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. )
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
!
CALL stop_clock_gpu( 'sum_band:init_us_2' )
!
@ -635,8 +639,9 @@ SUBROUTINE sum_band_gpu()
!
!IF( use_tg ) THEN
DO idx = 1, incr
IF( idx + ibnd - 1 <= ibnd_end ) eband = eband + et( idx + ibnd - 1, ik ) * wg( idx + ibnd - 1, ik )
END DO
IF( idx+ibnd-1 <= ibnd_end ) eband = eband + et(idx+ibnd-1,ik) * &
wg(idx+ibnd-1,ik)
ENDDO
!ELSE
! eband = eband + et( ibnd, ik ) * wg( ibnd, ik )
!END IF
@ -669,9 +674,9 @@ SUBROUTINE sum_band_gpu()
evc_d( j+npwx, idx+ibnd-1 )
END DO
END IF
!
ioff = ioff + right_nnr
!
END DO
!
CALL invfft ('tgWave', tg_psi_nc_d(:,1), dffts)
@ -694,36 +699,33 @@ SUBROUTINE sum_band_gpu()
w1 = wg( idx + ibnd - 1, ik) / omega
ELSE
w1 = 0.0d0
END IF
ENDIF
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
! OPTIMIZE HERE : this is a sum of all densities in first spin channel
DO ipol=1,npol
CALL get_rho_gpu(tg_rho_nc_d(:,1), dffts%nr1x*dffts%nr2x*right_nr3, w1, tg_psi_nc_d(:,ipol))
DO ipol = 1, npol
CALL get_rho_gpu( tg_rho_nc_d(:,1), dffts%nr1x*dffts%nr2x* &
right_nr3, w1, tg_psi_nc_d(:,ipol) )
ENDDO
!
IF (domag) CALL get_rho_domag_gpu(tg_rho_nc_d(:,:), dffts%nr1x*dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc_d(:,:))
IF (domag) CALL get_rho_domag_gpu( tg_rho_nc_d(:,:), dffts%nr1x* &
dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc_d(:,:) )
!
ELSE
!
! Noncollinear case without task groups
!
!....temporary---------------------------------------------
!$acc update device(evc)
ALLOCATE( psicncd(dffts%nnr,npol) )
!$acc data create(psicncd)
!-----------------------------------------------------------
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psicncd(:,1), &
dffts, 1, igk=igk_k(:,ik) )
CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), psicncd(:,2), &
dffts, 1, igk=igk_k(:,ik) )
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), &
dffts, igk=igk_k(:,ik) )
CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), psic_nc(:,2), &
dffts, igk=igk_k(:,ik) )
!
! Increment the charge density ...
!
DO ipol = 1, npol
!$acc host_data use_device(psicncd)
CALL get_rho_gpu( rho_d(:,1), dffts%nnr, w1, psicncd(:,ipol) )
!$acc host_data use_device(psic_nc)
CALL get_rho_gpu( rho_d(:,1), dffts%nnr, w1, psic_nc(:,ipol) )
!$acc end host_data
ENDDO
!
@ -731,18 +733,13 @@ SUBROUTINE sum_band_gpu()
! components of the magnetization (stored in rho%of_r(ir,2-4))
!
IF (domag) THEN
!$acc host_data use_device(psicncd)
CALL get_rho_domag_gpu( rho_d(1:,1:), dffts%nnr, w1, psicncd(1:,1:) )
!$acc host_data use_device(psic_nc)
CALL get_rho_domag_gpu( rho_d(1:,1:), dffts%nnr, w1, psic_nc(1:,1:) )
!$acc end host_data
ELSE
rho_d(:,2:4)=0.0_DP ! OPTIMIZE HERE: this memset can be avoided
rho_d(:,2:4) = 0.0_DP ! OPTIMIZE HERE: this memset can be avoided
ENDIF
!
!---------------------------------
!$acc end data
DEALLOCATE( psicncd )
!-----------------------------------
!
ENDIF
!
ELSE
@ -769,9 +766,9 @@ SUBROUTINE sum_band_gpu()
tg_psi_d( dffts_nl_d(igk_k_d(j,ik))+ioff ) = evc_d(j,idx+ibnd-1)
END DO
END IF
!
ioff = ioff + right_nnr
!
END DO
!
CALL invfft ('tgWave', tg_psi_d, dffts)
@ -799,18 +796,12 @@ SUBROUTINE sum_band_gpu()
!
CALL get_rho_gpu(tg_rho_d, dffts%nr1x * dffts%nr2x * right_nr3, w1, tg_psi_d)
!
ELSE IF (many_fft > 1 .and. (.not. (xclib_dft_is('meta') .OR. lxdm))) THEN
ELSEIF (many_fft > 1 .AND. (.NOT. (xclib_dft_is('meta') .OR. lxdm))) THEN
!
group_size = MIN(many_fft,ibnd_end-(ibnd-1))
hm_vec(1)=group_size ; hm_vec(2)=ibnd ; hm_vec(3)=npw
!
!$acc update device(evc)
!....temporary---------------------------------------------
ALLOCATE( psicd(dffts%nnr*many_fft) )
!$acc data create(psicd)
!-----------------------------------------------------------
CALL wave_g2r( evc, psicd, dffts, 1, igk=igk_k(:,ik), &
CALL wave_g2r( evc, psicd, dffts, igk=igk_k(:,ik), &
howmany_set=hm_vec )
!
! ... increment the charge density ...
@ -818,17 +809,13 @@ SUBROUTINE sum_band_gpu()
DO i = 0, group_size-1
w1 = wg(ibnd+i,ik) / omega
!$acc host_data use_device(psicd)
CALL get_rho_gpu(rho_d(:,current_spin), nnr, w1, psicd(i*nnr+1:))
CALL get_rho_gpu( rho_d(:,current_spin), nnr, w1, psicd(i*nnr+1:) )
!$acc end host_data
ENDDO
!$acc end data
DEALLOCATE( psicd )
!
ELSE
!
!$acc update device(evc)
!$acc data create(psic)
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, 1, &
CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic, dffts, &
igk=igk_k(:,ik) )
!
! ... increment the charge density ...
@ -836,7 +823,6 @@ SUBROUTINE sum_band_gpu()
!$acc host_data use_device(psic)
CALL get_rho_gpu( rho_d(:,current_spin), dffts%nnr, w1, psic )
!$acc end host_data
!$acc end data
!
ENDIF
!
@ -847,8 +833,9 @@ SUBROUTINE sum_band_gpu()
kplusg_evc(i,1) = CMPLX(0.D0,kplusgi,kind=DP) * evc(i,ibnd)
ENDDO
!
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, 1, &
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, &
igk=igk_k(:,ik) )
!$acc update self(psic)
!
! ... increment the kinetic energy density ...
!
@ -876,6 +863,8 @@ SUBROUTINE sum_band_gpu()
!
END DO k_loop
!
!$acc end data
!
IF (.not. use_tg ) THEN
rho%of_r = rho_d
END IF
@ -904,12 +893,10 @@ SUBROUTINE sum_band_gpu()
DEALLOCATE( tg_rho_h )
END IF
ELSE
DEALLOCATE(rho_d) ! OPTIMIZE HERE, use buffers!
DEALLOCATE(psic_d) ! OPTIMIZE HERE, use buffers!
DEALLOCATE( rho_d ) ! OPTIMIZE HERE, use buffers!
DEALLOCATE( psicd )
END IF
!
!$acc end data
!
RETURN
!
END SUBROUTINE sum_band_k_gpu