From 1d066441d89f2661342cfb4b7ad5174b18420e1f Mon Sep 17 00:00:00 2001
From: Paolo Giannozzi
Date: Wed, 10 Jul 2024 14:30:01 +0200
Subject: [PATCH] In charge density calculation, separate task-group code (to
be deleted sooner or later) from the rest. Makes the code much more readable.
Misc cleanup.
---
PW/src/sum_band.f90 | 732 ++++++++++++++++++++++++--------------------
1 file changed, 406 insertions(+), 326 deletions(-)
diff --git a/PW/src/sum_band.f90 b/PW/src/sum_band.f90
index a454b4256..fee3983c2 100644
--- a/PW/src/sum_band.f90
+++ b/PW/src/sum_band.f90
@@ -19,7 +19,7 @@ SUBROUTINE sum_band()
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE fft_base, ONLY : dfftp, dffts
USE fft_rho, ONLY : rho_g2r, rho_r2g
- USE fft_wave, ONLY : wave_g2r, tgwave_g2r
+ USE fft_wave, ONLY : wave_g2r
USE gvect, ONLY : ngm, g
USE gvecs, ONLY : doublegrid
USE klist, ONLY : nks, nkstot, wk, xk, ngk, igk_k
@@ -37,7 +37,7 @@ SUBROUTINE sum_band()
USE wvfct, ONLY : nbnd, npwx, wg, et, btype
USE mp_pools, ONLY : inter_pool_comm
USE mp_bands, ONLY : inter_bgrp_comm, intra_bgrp_comm, nbgrp
- USE mp, ONLY : mp_sum
+ USE mp, ONLY : mp_sum, mp_get_comm_null
USE xc_lib, ONLY : xclib_dft_is
USE paw_symmetry, ONLY : PAW_symmetrize
USE paw_variables, ONLY : okpaw
@@ -186,15 +186,30 @@ SUBROUTINE sum_band()
!$acc enter data create(psic)
ENDIF
!
- IF ( gamma_only ) THEN
- !
- CALL sum_band_gamma()
- !
+ IF ( ( dffts%has_task_groups ) .AND. .NOT. &
+ ( xclib_dft_is('meta') .OR. lxdm .OR. dmft .OR. sic ) ) THEN
+ ! Task groups: no GPU, no special cases, etc.
+ IF ( gamma_only ) THEN
+ !
+ CALL sum_band_gamma_tg ()
+ !
+ ELSE
+ !
+ CALL sum_band_k_tg()
+ !
+ ENDIF
ELSE
- !
- CALL sum_band_k()
- !
- ENDIF
+ ! General case, but no task groups
+ IF ( gamma_only ) THEN
+ !
+ CALL sum_band_gamma()
+ !
+ ELSE
+ !
+ CALL sum_band_k()
+ !
+ ENDIF
+ END IF
!
IF (noncolin) THEN
!$acc exit data delete(psic_nc)
@@ -210,7 +225,6 @@ SUBROUTINE sum_band()
DEALLOCATE( kplusg )
DEALLOCATE( kplusg_evc )
ENDIF
- IF ( okvan ) CALL deallocate_bec_type_acc ( becp )
!
! ... sum charge density over pools (distributed k-points) and bands
!
@@ -230,12 +244,19 @@ SUBROUTINE sum_band()
IF(sic) CALL rho_r2g( dffts, rho%pol_r, rho%pol_g )
!
IF( okvan ) THEN
+ !
+ ! ... with distributed , sum over bands
+ ! ... (use host copy to do the comunication)
+ !
+ IF ( okvan .AND. becp%comm /= mp_get_comm_null() .AND. nhm > 0 ) THEN
+ CALL mp_sum( becsum, becp%comm )
+ IF ( tqr ) CALL mp_sum( ebecsum, becp%comm )
+ ENDIF
+ CALL deallocate_bec_type_acc ( becp )
!
! ... becsum is summed over bands (if bgrp_parallelization is done)
! ... and over k-points (but it is not symmetrized)
!
- ! ... use host copy to do the comunication.
- !
CALL mp_sum(becsum, inter_bgrp_comm )
CALL mp_sum(becsum, inter_pool_comm )
!$acc update device(becsum)
@@ -316,11 +337,6 @@ SUBROUTINE sum_band()
!-----------------------------------------------------------------------
!! \(\texttt{sum_band}\) - part for gamma version.
!
- USE becmod, ONLY : becp
- USE mp_bands, ONLY : me_bgrp
- USE mp, ONLY : mp_sum, mp_get_comm_null
- USE fft_helper_subroutines, ONLY : fftx_ntgrp, fftx_tgpe, &
- tg_reduce_rho, tg_get_group_nr3
USE uspp_init, ONLY : init_us_2
!
IMPLICIT NONE
@@ -329,39 +345,17 @@ SUBROUTINE sum_band()
!
REAL(DP) :: w1, w2
! weights
- INTEGER :: npw, idx, ioff, ioff_tg, nxyp, incr, v_siz, j
- COMPLEX(DP), ALLOCATABLE :: tg_psi(:)
- REAL(DP), ALLOCATABLE :: tg_rho(:)
- LOGICAL :: use_tg
- INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr, ebnd, i, brange
+ INTEGER :: npw, incr, j
+ INTEGER :: ierr, ebnd, i, brange
REAL(DP) :: kplusgi
!
! ... here we sum for each k point the contribution
! ... of the wavefunctions to the charge
!
- use_tg = ( dffts%has_task_groups ) .AND. ( .NOT. (xclib_dft_is('meta') .OR. lxdm) )
- !
incr = 2
!
- IF( use_tg ) THEN
- !
- v_siz = dffts%nnr_tg
- !
- ALLOCATE( tg_psi( v_siz ) )
- ALLOCATE( tg_rho( v_siz ) )
- !$acc enter data create(tg_psi, tg_rho)
- !
- incr = 2 * fftx_ntgrp(dffts)
- !
- ENDIF
- !
k_loop: DO ik = 1, nks
!
- IF ( use_tg ) THEN
- !$acc kernels
- tg_rho = 0.0_DP
- !$acc end kernels
- END IF
IF ( lsda ) current_spin = isk(ik)
!
npw = ngk(ik)
@@ -393,75 +387,36 @@ SUBROUTINE sum_band()
!
DO ibnd = ibnd_start, ibnd_end, incr
!
- IF( use_tg ) THEN
+ ebnd = ibnd
+ IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
+ !
+ CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts )
+ !
+ w1 = wg(ibnd,ik) / omega
+ !
+ ! ... increment the charge density ...
+ !
+ IF ( ibnd < ibnd_end ) THEN
!
- CALL tgwave_g2r( evc(1:npw,ibnd:ibnd_end), tg_psi, dffts, npw )
+ ! ... two ffts at the same time
!
- ! ... Now the first proc of the group holds the first two bands
- ! ... of the 2*ntgrp bands that we are processing at the same time,
- ! ... the second proc. holds the third and fourth band and so on.
- !
- ! ... Compute the proper factor for each band
- !
- idx = fftx_tgpe(dffts) + 1
- !
- ! ... Remember two bands are packed in a single array :
- ! - proc 0 has bands ibnd and ibnd+1
- ! - proc 1 has bands ibnd+2 and ibnd+3
- ! - ....
- !
- idx = 2*idx - 1
- !
- IF( idx + ibnd - 1 < ibnd_end ) THEN
- w1 = wg( idx + ibnd - 1, ik) / omega
- w2 = wg( idx + ibnd , ik) / omega
- ELSEIF( idx + ibnd - 1 == ibnd_end ) THEN
- w1 = wg( idx + ibnd - 1, ik) / omega
- w2 = w1
- ELSE
- w1 = 0.0d0
- w2 = w1
- ENDIF
- !
- CALL tg_get_group_nr3( dffts, right_nr3 )
- !
- CALL get_rho_gamma( tg_rho, dffts%nr1x*dffts%nr2x*right_nr3, &
- w1, w2, tg_psi )
+ w2 = wg(ibnd+1,ik) / omega
!
ELSE
!
- ebnd = ibnd
- IF ( ibnd < ibnd_end ) ebnd = ebnd + 1
- !
- CALL wave_g2r( evc(1:npw,ibnd:ebnd), psic, dffts )
- !
- w1 = wg(ibnd,ik) / omega
- !
- ! ... increment the charge density ...
- !
- IF ( ibnd < ibnd_end ) THEN
- !
- ! ... two ffts at the same time
- !
- w2 = wg(ibnd+1,ik) / omega
- !
- ELSE
- !
- w2 = w1
- !
- ENDIF
- !
- CALL get_rho_gamma( rho%of_r(:,current_spin), dffts%nnr, w1, w2, psic )
+ w2 = w1
!
ENDIF
!
+ CALL get_rho_gamma( rho%of_r(:,current_spin), dffts%nnr, w1, w2, psic )
+ !
IF (xclib_dft_is('meta') .OR. lxdm) THEN
!
DO j = 1, 3
DO i = 1, npw
- kplusgi = (xk(j,ik)+g(j,i)) * tpiba
- kplusg_evc(i,1) = CMPLX(0._DP,kplusgi,KIND=DP) * evc(i,ibnd)
- IF ( ibnd < ibnd_end ) kplusg_evc(i,2) = CMPLX(0._DP,kplusgi,KIND=DP) * evc(i,ibnd+1)
+ kplusgi = (xk(j,ik)+g(j,i)) * tpiba
+ kplusg_evc(i,1) = CMPLX(0._DP,kplusgi,KIND=DP) * evc(i,ibnd)
+ IF ( ibnd < ibnd_end ) kplusg_evc(i,2) = CMPLX(0._DP,kplusgi,KIND=DP) * evc(i,ibnd+1)
ENDDO
!
ebnd = ibnd
@@ -475,8 +430,8 @@ SUBROUTINE sum_band()
!
DO ir = 1, dffts%nnr
rho%kin_r(ir,current_spin) = rho%kin_r(ir,current_spin) + &
- w1 * DBLE( psic(ir) )**2 + &
- w2 * AIMAG( psic(ir) )**2
+ w1 * DBLE( psic(ir) )**2 + &
+ w2 * AIMAG( psic(ir) )**2
ENDDO
ENDDO
!
@@ -484,49 +439,23 @@ SUBROUTINE sum_band()
!
ENDDO
!
- IF( use_tg ) THEN
- !$acc update host(tg_rho)
- CALL tg_reduce_rho( rho%of_r, tg_rho, current_spin, dffts )
- ENDIF
- !
! ... If we have a US pseudopotential we compute here the becsum term
!
IF ( okvan ) CALL sum_bec( ik, current_spin, ibnd_start, ibnd_end, &
- this_bgrp_nbnd )
+ this_bgrp_nbnd )
!
ENDDO k_loop
!
- ! ... with distributed , sum over bands
- !
- IF ( okvan .AND. becp%comm /= mp_get_comm_null() .AND. nhm>0) THEN
- CALL mp_sum( becsum, becp%comm )
- !$acc update device(becsum)
- ENDIF
- IF ( okvan .AND. becp%comm /= mp_get_comm_null() .AND. tqr .AND. nhm>0) THEN
- CALL mp_sum( ebecsum, becp%comm )
- !$acc update device(ebecsum)
- ENDIF
- !
- IF( use_tg ) THEN
- !$acc exit data delete(tg_psi, tg_rho)
- DEALLOCATE( tg_psi )
- DEALLOCATE( tg_rho )
- ENDIF
!
RETURN
!
END SUBROUTINE sum_band_gamma
!
- !
!-----------------------------------------------------------------------
SUBROUTINE sum_band_k()
!-----------------------------------------------------------------------
!! \(\texttt{sum_band}\) - part for k-points version
!
- USE mp_bands, ONLY : me_bgrp
- USE mp, ONLY : mp_sum, mp_get_comm_null
- USE fft_helper_subroutines, ONLY : fftx_ntgrp, fftx_tgpe, &
- tg_reduce_rho, tg_get_group_nr3
USE uspp_init, ONLY : init_us_2
USE klist, ONLY : nelec
USE control_flags, ONLY : many_fft
@@ -539,12 +468,9 @@ SUBROUTINE sum_band()
! weights
INTEGER :: npw, ipol, na, np
!
- INTEGER :: idx, ioff, ioff_tg, nxyp, incr, v_siz
+ INTEGER :: idx, incr
COMPLEX(DP), ALLOCATABLE :: psicd(:)
- COMPLEX(DP), ALLOCATABLE :: tg_psi(:), tg_psi_nc(:,:)
- REAL(DP), ALLOCATABLE :: tg_rho(:), tg_rho_nc(:,:)
- LOGICAL :: use_tg
- INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr
+ INTEGER :: ierr
INTEGER :: i, j, group_size, hm_vec(3)
REAL(DP) :: kplusgi
! polaron calculation
@@ -557,8 +483,6 @@ SUBROUTINE sum_band()
! ... here we sum for each k point the contribution
! ... of the wavefunctions to the charge
!
- use_tg = ( dffts%has_task_groups ) .AND. ( .NOT. (xclib_dft_is('meta') .OR. lxdm) )
- !
incr = 1
!
IF(sic) THEN
@@ -572,47 +496,17 @@ SUBROUTINE sum_band()
ibnd_p = nelec/2+1
END IF
!
- IF( use_tg ) THEN
- !
- v_siz = dffts%nnr_tg
- !
- IF (noncolin) THEN
- ALLOCATE( tg_psi_nc( v_siz, npol ) )
- ALLOCATE( tg_rho_nc( v_siz, nspin_mag ) )
- !$acc enter data create(tg_psi_nc, tg_rho_nc)
- ELSE
- ALLOCATE( tg_psi( v_siz ) )
- ALLOCATE( tg_rho( v_siz ) )
- !$acc enter data create(tg_psi, tg_rho)
- ENDIF
- !
- incr = fftx_ntgrp(dffts)
- !
+ IF (noncolin .OR. (xclib_dft_is('meta') .OR. lxdm)) THEN
+ ALLOCATE( psicd(dffts%nnr) )
+ incr = 1
ELSE
- IF (noncolin .OR. (xclib_dft_is('meta') .OR. lxdm)) THEN
- ALLOCATE( psicd(dffts%nnr) )
- incr = 1
- ELSE
- ALLOCATE( psicd(dffts%nnr*many_fft) )
- incr = many_fft
- ENDIF
- !$acc enter data create(psicd)
- ! ... This is used as reduction variable on the device
+ ALLOCATE( psicd(dffts%nnr*many_fft) )
+ incr = many_fft
ENDIF
+ !$acc enter data create(psicd)
+ ! ... This is used as reduction variable on the device
!
k_loop: DO ik = 1, nks
- !
- IF( use_tg ) THEN
- IF (noncolin) THEN
- !$acc kernels
- tg_rho_nc = 0.0_DP
- !$acc end kernels
- ELSE
- !$acc kernels
- tg_rho = 0.0_DP
- !$acc end kernels
- ENDIF
- ENDIF
!
IF ( lsda ) current_spin = isk(ik)
npw = ngk (ik)
@@ -675,128 +569,54 @@ SUBROUTINE sum_band()
w1 = wg(ibnd,ik) / omega
!
IF (noncolin) THEN
- IF( use_tg ) THEN
- !
- CALL tgwave_g2r( evc(1:npw,ibnd:ibnd_end), tg_psi_nc(:,1), dffts, &
- npw, igk_k(:,ik) )
- CALL tgwave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd_end), tg_psi_nc(:,2), &
- dffts, npw, igk_k(:,ik) )
- !
- ! ... Now the first proc of the group holds the first band
- ! ... of the ntgrp bands that we are processing at the same time,
- ! ... the second proc. holds the second and so on
- !
- ! ... Compute the proper factor for each band
- !
- idx = fftx_tgpe(dffts) + 1
- !
- ! ... Remember
- ! ... proc 0 has bands ibnd
- ! ... proc 1 has bands ibnd+1
- ! ... ....
- !
- IF( idx + ibnd - 1 <= ibnd_end ) THEN
- w1 = wg( idx + ibnd - 1, ik) / omega
- ELSE
- w1 = 0.0d0
- ENDIF
- !
- CALL tg_get_group_nr3( dffts, right_nr3 )
- !
- ! OPTIMIZE HERE : this is a sum of all densities in first spin channel
- !
- DO ipol = 1, npol
- CALL get_rho_gpu( tg_rho_nc(:,1), dffts%nr1x*dffts%nr2x* &
- right_nr3, w1, tg_psi_nc(:,ipol) )
- ENDDO
- !
- IF (domag) CALL get_rho_domag( tg_rho_nc(:,:), dffts%nr1x* &
- dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:) )
- !
- ELSE
- !
- ! ... Noncollinear case without task groups
- !
- CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), &
- dffts, igk=igk_k(:,ik) )
- CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), psic_nc(:,2), &
- dffts, igk=igk_k(:,ik) )
- !
- ! ... Increment the charge density
- !
- DO ipol = 1, npol
- CALL get_rho_gpu( rho%of_r(:,1), dffts%nnr, w1, psic_nc(:,ipol) )
- ENDDO
- !
- ! ... In this case, calculate also the three
- ! ... components of the magnetization (stored in rho%of_r(ir,2-4))
- !
- IF (domag) THEN
- CALL get_rho_domag( rho%of_r(:,:), dffts%nnr, w1, psic_nc(1:,1:) )
- ENDIF
- !
- ENDIF
+ !
+ ! ... Noncollinear case without task groups
+ !
+ CALL wave_g2r( evc(1:npw,ibnd:ibnd), psic_nc(:,1), &
+ dffts, igk=igk_k(:,ik) )
+ CALL wave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd), psic_nc(:,2), &
+ dffts, igk=igk_k(:,ik) )
+ !
+ ! ... Increment the charge density
+ !
+ DO ipol = 1, npol
+ CALL get_rho_gpu( rho%of_r(:,1), dffts%nnr, w1, psic_nc(:,ipol) )
+ ENDDO
+ !
+ ! ... In this case, calculate also the three
+ ! ... components of the magnetization (stored in rho%of_r(ir,2-4))
+ !
+ IF (domag) &
+ CALL get_rho_domag( rho%of_r(:,:), dffts%nnr, w1, psic_nc(1:,1:) )
+ !
+ ELSE IF (many_fft > 1 .AND. (.NOT. (xclib_dft_is('meta') .OR. lxdm))) THEN
+ !
+ group_size = MIN(many_fft,ibnd_end-(ibnd-1))
+ hm_vec(1)=group_size ; hm_vec(2)=npw ; hm_vec(3)=group_size
+ !
+ CALL wave_g2r( evc(:,ibnd:ibnd+group_size-1), psicd, &
+ dffts, igk=igk_k(:,ik), howmany_set=hm_vec )
+ !
+ ! ... increment the charge density ...
+ !
+ DO i = 0, group_size-1
+ w1 = wg(ibnd+i,ik) / omega
+ CALL get_rho_gpu( rho%of_r(:,current_spin), dffts%nnr, w1, psicd(i*dffts%nnr+1:) )
+ ENDDO
!
ELSE
!
- IF( use_tg ) THEN
- !
- CALL tgwave_g2r( evc(1:npw,ibnd:ibnd_end), tg_psi, dffts, npw, &
- igk_k(:,ik) )
- !
- ! Now the first proc of the group holds the first band
- ! of the ntgrp bands that we are processing at the same time,
- ! the second proc. holds the second and so on
- !
- ! Compute the proper factor for each band
- !
- idx = fftx_tgpe(dffts) + 1
- !
- ! Remember
- ! proc 0 has bands ibnd
- ! proc 1 has bands ibnd+1
- ! ....
- !
- IF( idx+ibnd-1 <= ibnd_end ) THEN
- w1 = wg(idx+ibnd-1,ik) / omega
- ELSE
- w1 = 0.0d0
- ENDIF
- !
- CALL tg_get_group_nr3( dffts, right_nr3 )
- !
- CALL get_rho_gpu( tg_rho, dffts%nr1x*dffts%nr2x*right_nr3, w1, tg_psi )
- !
- ELSEIF (many_fft > 1 .AND. (.NOT. (xclib_dft_is('meta') .OR. lxdm))) THEN
- !
- group_size = MIN(many_fft,ibnd_end-(ibnd-1))
- hm_vec(1)=group_size ; hm_vec(2)=npw ; hm_vec(3)=group_size
- !
- CALL wave_g2r( evc(:,ibnd:ibnd+group_size-1), psicd, &
- dffts, igk=igk_k(:,ik), howmany_set=hm_vec )
- !
- ! ... increment the charge density ...
- !
- DO i = 0, group_size-1
- w1 = wg(ibnd+i,ik) / omega
- CALL get_rho_gpu( rho%of_r(:,current_spin), dffts%nnr, w1, psicd(i*dffts%nnr+1:) )
- ENDDO
- !
- ELSE
- !
- CALL wave_g2r( evc(1:npw,ibnd:ibnd), psicd, dffts, igk=igk_k(:,ik) )
- !
- ! ... increment the charge density ...
- !
- CALL get_rho_gpu( rho%of_r(:,current_spin), dffts%nnr, w1, psicd )
- !
- ENDIF
+ CALL wave_g2r( evc(1:npw,ibnd:ibnd), psicd, dffts, igk=igk_k(:,ik) )
+ !
+ ! ... increment the charge density ...
+ !
+ CALL get_rho_gpu( rho%of_r(:,current_spin), dffts%nnr, w1, psicd )
!
IF (xclib_dft_is('meta') .OR. lxdm) THEN
DO j=1,3
DO i = 1, npw
- kplusgi = (xk(j,ik)+g(j,igk_k(i,ik))) * tpiba
- kplusg_evc(i,1) = CMPLX(0._DP,kplusgi,KIND=DP) * evc(i,ibnd)
+ kplusgi = (xk(j,ik)+g(j,igk_k(i,ik))) * tpiba
+ kplusg_evc(i,1) = CMPLX(0._DP,kplusgi,KIND=DP) * evc(i,ibnd)
ENDDO
!
CALL wave_g2r( kplusg_evc(1:npw,1:1), psic, dffts, igk=igk_k(:,ik) )
@@ -812,37 +632,12 @@ SUBROUTINE sum_band()
!
ENDDO
!
- IF ( use_tg ) THEN
- !
- ! ... reduce the charge across task group
- !
- IF (noncolin) THEN
- !$acc update host(tg_rho_nc)
- ELSE
- !$acc update host(tg_rho)
- CALL tg_reduce_rho( rho%of_r, tg_rho_nc, tg_rho, current_spin, &
- noncolin, domag, dffts )
- END IF
- !
- END IF
- !
! ... If we have a US pseudopotential we compute here the becsum term
!
IF ( okvan ) CALL sum_bec ( ik, current_spin, ibnd_start,ibnd_end,this_bgrp_nbnd )
!
END DO k_loop
!
- ! ... with distributed , sum over bands
- !
- IF ( okvan .AND. becp%comm /= mp_get_comm_null() .AND. nhm>0 ) THEN
- CALL mp_sum( becsum, becp%comm )
- !$acc update device(becsum)
- ENDIF
- IF ( okvan .AND. becp%comm /= mp_get_comm_null() .AND. tqr .AND. nhm> 0) THEN
- CALL mp_sum( ebecsum, becp%comm )
- !$acc update device(ebecsum)
- ENDIF
- !
IF(sic .and. pol_type == 'h') THEN
wg_p = 1.0 - wg_p
DEALLOCATE(psic_p)
@@ -850,20 +645,8 @@ SUBROUTINE sum_band()
DEALLOCATE(rho_p)
END IF
!
- IF( use_tg ) THEN
- IF (noncolin) THEN
- !$acc exit data delete(tg_psi_nc,tg_rho_nc)
- DEALLOCATE( tg_psi_nc )
- DEALLOCATE( tg_rho_nc )
- ELSE
- !$acc exit data delete(tg_psi, tg_rho)
- DEALLOCATE( tg_psi )
- DEALLOCATE( tg_rho )
- END IF
- ELSE
- !$acc exit data delete(psicd)
- DEALLOCATE( psicd )
- END IF
+ !$acc exit data delete(psicd)
+ DEALLOCATE( psicd )
!
RETURN
!
@@ -973,6 +756,303 @@ SUBROUTINE sum_band()
END SUBROUTINE get_rho_domag
!
+ !-----------------------------------------------------------------------
+ SUBROUTINE sum_band_gamma_tg()
+ !-----------------------------------------------------------------------
+ !! \(\texttt{sum_band}\) - part for gamma version.
+ !
+ USE fft_helper_subroutines, ONLY : fftx_ntgrp, fftx_tgpe, &
+ tg_reduce_rho, tg_get_group_nr3
+ USE fft_wave, ONLY : tgwave_g2r
+ USE uspp_init, ONLY : init_us_2
+ !
+ IMPLICIT NONE
+ !
+ ! ... local variables
+ !
+ REAL(DP) :: w1, w2
+ ! weights
+ INTEGER :: npw, idx, incr, v_siz, j
+ COMPLEX(DP), ALLOCATABLE :: tg_psi(:)
+ REAL(DP), ALLOCATABLE :: tg_rho(:)
+ INTEGER :: right_nr3, ntgrp, ierr, i
+ !
+ ! ... here we sum for each k point the contribution
+ ! ... of the wavefunctions to the charge
+ !
+ incr = 2
+ !
+ v_siz = dffts%nnr_tg
+ !
+ ALLOCATE( tg_psi( v_siz ) )
+ ALLOCATE( tg_rho( v_siz ) )
+ !
+ incr = 2 * fftx_ntgrp(dffts)
+ !
+ k_loop: DO ik = 1, nks
+ !
+ tg_rho = 0.0_DP
+ IF ( lsda ) current_spin = isk(ik)
+ !
+ npw = ngk(ik)
+ !
+ CALL start_clock( 'sum_band:buffer' )
+ IF ( nks > 1 ) &
+ CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
+ !
+ CALL stop_clock( 'sum_band:buffer' )
+ !
+ CALL start_clock( 'sum_band:init_us_2' )
+ !
+ IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
+ !
+ CALL stop_clock( 'sum_band:init_us_2' )
+ !
+ ! ... here we compute the band energy: the sum of the eigenvalues
+ !
+ DO ibnd = ibnd_start, ibnd_end
+ !
+ ! ... the sum of eband and demet is the integral for
+ ! ... e < ef of e n(e) which reduces for degauss=0 to the sum of
+ ! ... the eigenvalues.
+ !
+ eband = eband + et(ibnd,ik) * wg(ibnd,ik)
+ !
+ ENDDO
+ !
+ DO ibnd = ibnd_start, ibnd_end, incr
+ !
+ CALL tgwave_g2r( evc(1:npw,ibnd:ibnd_end), tg_psi, dffts, npw )
+ !
+ ! ... Now the first proc of the group holds the first two bands
+ ! ... of the 2*ntgrp bands that we are processing at the same time,
+ ! ... the second proc. holds the third and fourth band and so on.
+ !
+ ! ... Compute the proper factor for each band
+ !
+ idx = fftx_tgpe(dffts) + 1
+ !
+ ! ... Remember two bands are packed in a single array :
+ ! - proc 0 has bands ibnd and ibnd+1
+ ! - proc 1 has bands ibnd+2 and ibnd+3
+ ! - ....
+ !
+ idx = 2*idx - 1
+ !
+ IF( idx + ibnd - 1 < ibnd_end ) THEN
+ w1 = wg( idx + ibnd - 1, ik) / omega
+ w2 = wg( idx + ibnd , ik) / omega
+ ELSEIF( idx + ibnd - 1 == ibnd_end ) THEN
+ w1 = wg( idx + ibnd - 1, ik) / omega
+ w2 = w1
+ ELSE
+ w1 = 0.0d0
+ w2 = w1
+ ENDIF
+ !
+ CALL tg_get_group_nr3( dffts, right_nr3 )
+ !
+ CALL get_rho_gamma( tg_rho, dffts%nr1x*dffts%nr2x*right_nr3, &
+ w1, w2, tg_psi )
+ !
+ ENDDO
+ !
+ CALL tg_reduce_rho( rho%of_r, tg_rho, current_spin, dffts )
+ !
+ ! ... If we have a US pseudopotential we compute here the becsum term
+ !
+ IF ( okvan ) CALL sum_bec( ik, current_spin, ibnd_start, ibnd_end, &
+ this_bgrp_nbnd )
+ !
+ ENDDO k_loop
+ !
+ !
+ DEALLOCATE( tg_psi )
+ DEALLOCATE( tg_rho )
+ !
+ RETURN
+ !
+ END SUBROUTINE sum_band_gamma_tg
+ !
+ !
+ !-----------------------------------------------------------------------
+ SUBROUTINE sum_band_k_tg()
+ !-----------------------------------------------------------------------
+ !! \(\texttt{sum_band}\) - part for k-points version
+ !
+ USE fft_helper_subroutines, ONLY : fftx_ntgrp, fftx_tgpe, &
+ tg_reduce_rho, tg_get_group_nr3
+ USE fft_wave, ONLY : tgwave_g2r
+ USE uspp_init, ONLY : init_us_2
+ USE klist, ONLY : nelec
+ !
+ IMPLICIT NONE
+ !
+ ! ... local variables
+ !
+ REAL(DP) :: w1
+ ! weights
+ INTEGER :: npw, ipol, na, np
+ !
+ INTEGER :: idx, incr, v_siz
+ COMPLEX(DP), ALLOCATABLE :: tg_psi(:), tg_psi_nc(:,:)
+ REAL(DP), ALLOCATABLE :: tg_rho(:), tg_rho_nc(:,:)
+ !
+ INTEGER :: right_nr3, ntgrp, ierr
+ INTEGER :: i, j, group_size, hm_vec(3)
+ !
+ ! ... here we sum for each k point the contribution
+ ! ... of the wavefunctions to the charge
+ !
+ incr = 1
+ !
+ v_siz = dffts%nnr_tg
+ !
+ IF (noncolin) THEN
+ ALLOCATE( tg_psi_nc( v_siz, npol ) )
+ ALLOCATE( tg_rho_nc( v_siz, nspin_mag ) )
+ ELSE
+ ALLOCATE( tg_psi( v_siz ) )
+ ALLOCATE( tg_rho( v_siz ) )
+ ENDIF
+ !
+ incr = fftx_ntgrp(dffts)
+ !
+ k_loop: DO ik = 1, nks
+ !
+ IF (noncolin) THEN
+ tg_rho_nc = 0.0_DP
+ ELSE
+ tg_rho = 0.0_DP
+ ENDIF
+ !
+ IF ( lsda ) current_spin = isk(ik)
+ npw = ngk (ik)
+ !
+ CALL start_clock( 'sum_band:buffer' )
+ IF ( nks > 1 ) THEN
+ CALL get_buffer( evc, nwordwfc, iunwfc, ik )
+ ENDIF
+ CALL stop_clock( 'sum_band:buffer' )
+ !
+ CALL start_clock( 'sum_band:init_us_2' )
+ !
+ IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
+ !
+ CALL stop_clock( 'sum_band:init_us_2' )
+ !
+ ! ... for DMFT the eigenvectors are updated using v_dmft from add_dmft_occ.f90
+ !
+ DO ibnd = ibnd_start, ibnd_end, incr
+ !
+ ! ... here we compute the band energy: the sum of the eigenvalues
+ !
+ DO idx = 1, incr
+ IF( idx+ibnd-1 <= ibnd_end ) eband = eband + et(idx+ibnd-1,ik) * &
+ wg(idx+ibnd-1,ik)
+ ENDDO
+ !
+ ! ... the sum of eband and demet is the integral for e < ef of
+ ! ... e n(e) which reduces for degauss=0 to the sum of the
+ ! ... eigenvalues
+ !
+ w1 = wg(ibnd,ik) / omega
+ !
+ IF (noncolin) THEN
+ !
+ CALL tgwave_g2r( evc(1:npw,ibnd:ibnd_end), tg_psi_nc(:,1), dffts, &
+ npw, igk_k(:,ik) )
+ CALL tgwave_g2r( evc(npwx+1:npwx+npw,ibnd:ibnd_end), tg_psi_nc(:,2), &
+ dffts, npw, igk_k(:,ik) )
+ !
+ ! ... Now the first proc of the group holds the first band
+ ! ... of the ntgrp bands that we are processing at the same time,
+ ! ... the second proc. holds the second and so on
+ !
+ ! ... Compute the proper factor for each band
+ !
+ idx = fftx_tgpe(dffts) + 1
+ !
+ ! ... Remember
+ ! ... proc 0 has bands ibnd
+ ! ... proc 1 has bands ibnd+1
+ ! ... ....
+ !
+ IF( idx + ibnd - 1 <= ibnd_end ) THEN
+ w1 = wg( idx + ibnd - 1, ik) / omega
+ ELSE
+ w1 = 0.0d0
+ ENDIF
+ !
+ CALL tg_get_group_nr3( dffts, right_nr3 )
+ !
+ ! OPTIMIZE HERE : this is a sum of all densities in first spin channel
+ !
+ DO ipol = 1, npol
+ CALL get_rho_gpu( tg_rho_nc(:,1), dffts%nr1x*dffts%nr2x* &
+ right_nr3, w1, tg_psi_nc(:,ipol) )
+ ENDDO
+ !
+ IF (domag) CALL get_rho_domag( tg_rho_nc(:,:), dffts%nr1x* &
+ dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:) )
+ !
+ !
+ ELSE
+ !
+ CALL tgwave_g2r( evc(1:npw,ibnd:ibnd_end), tg_psi, dffts, npw, &
+ igk_k(:,ik) )
+ !
+ ! Now the first proc of the group holds the first band
+ ! of the ntgrp bands that we are processing at the same time,
+ ! the second proc. holds the second and so on
+ !
+ ! Compute the proper factor for each band
+ !
+ idx = fftx_tgpe(dffts) + 1
+ !
+ ! Remember
+ ! proc 0 has bands ibnd
+ ! proc 1 has bands ibnd+1
+ ! ....
+ !
+ IF( idx+ibnd-1 <= ibnd_end ) THEN
+ w1 = wg(idx+ibnd-1,ik) / omega
+ ELSE
+ w1 = 0.0d0
+ ENDIF
+ !
+ CALL tg_get_group_nr3( dffts, right_nr3 )
+ !
+ CALL get_rho_gpu( tg_rho, dffts%nr1x*dffts%nr2x*right_nr3, w1, tg_psi )
+ END IF
+ !
+ ENDDO
+ !
+ ! ... reduce the charge across task group
+ !
+ IF (.not.noncolin) THEN
+ CALL tg_reduce_rho( rho%of_r, tg_rho_nc, tg_rho, current_spin, &
+ noncolin, domag, dffts )
+ END IF
+ !
+ ! ... If we have a US pseudopotential we compute here the becsum term
+ !
+ IF ( okvan ) CALL sum_bec ( ik, current_spin, ibnd_start,ibnd_end,this_bgrp_nbnd )
+ !
+ END DO k_loop
+ !
+ IF (noncolin) THEN
+ DEALLOCATE( tg_psi_nc )
+ DEALLOCATE( tg_rho_nc )
+ ELSE
+ DEALLOCATE( tg_psi )
+ DEALLOCATE( tg_rho )
+ END IF
+ !
+ RETURN
+ !
+ END SUBROUTINE sum_band_k_tg
+ !
END SUBROUTINE sum_band
!----------------------------------------------------------------------------