mirror of https://gitlab.com/QEF/q-e.git
becsum and ebecsum moved to standard duplicated data. Initial port of sum_band. Needs btached FFT and improved data transfers
This commit is contained in:
parent
29ca3ded61
commit
306a51c87a
|
@ -253,4 +253,4 @@
|
|||
END SUBROUTINE deallocate_wavefunctions_module_gpu
|
||||
!=----------------------------------------------------------------------------=!
|
||||
END MODULE wavefunctions_module_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
||||
!=----------------------------------------------------------------------------=!
|
||||
|
|
|
@ -53,6 +53,8 @@ SUBROUTINE addusdens_g(rho)
|
|||
USE mp_pools, ONLY : inter_pool_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(kind=dp), INTENT(inout) :: rho(dfftp%ngm,nspin_mag)
|
||||
|
@ -92,6 +94,8 @@ SUBROUTINE addusdens_g(rho)
|
|||
!
|
||||
ALLOCATE (qmod(ngm_l), qgm(ngm_l) )
|
||||
ALLOCATE (ylmk0(ngm_l, lmaxq * lmaxq) )
|
||||
!
|
||||
CALL using_becsum(0)
|
||||
|
||||
CALL ylmr2 (lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0)
|
||||
DO ig = 1, ngm_l
|
||||
|
|
|
@ -52,12 +52,14 @@ SUBROUTINE addusdens_g_gpu(rho)
|
|||
USE gvect_gpum, ONLY : gg_d, g_d, &
|
||||
eigts1_d, eigts2_d, eigts3_d, mill_d
|
||||
USE noncollin_module, ONLY : noncolin, nspin_mag
|
||||
USE uspp, ONLY : becsum, okvan
|
||||
USE uspp, ONLY : okvan
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE mp_pools, ONLY : inter_pool_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
USE uspp_gpum, ONLY : becsum_d, using_becsum_d
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(kind=dp), INTENT(inout) :: rho(dfftp%ngm,nspin_mag)
|
||||
|
@ -75,12 +77,12 @@ SUBROUTINE addusdens_g_gpu(rho)
|
|||
! modulus of G, spherical harmonics
|
||||
COMPLEX(DP), ALLOCATABLE :: skk_d(:,:), aux2_d(:,:)
|
||||
! structure factors, US contribution to rho
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_d (:,:), qgm_d(:), becsum_cpy_d(:,:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_d (:,:), qgm_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_h (:,:)
|
||||
! work space for rho(G,nspin), Fourier transform of q
|
||||
INTEGER :: ij, im
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: becsum_cpy_d, tbecsum_d, qmod_d, ylmk0_d, skk_d, &
|
||||
attributes(device) :: tbecsum_d, qmod_d, ylmk0_d, skk_d, &
|
||||
aux2_d, aux_d, qgm_d
|
||||
attributes(pinned) :: aux_h
|
||||
#endif
|
||||
|
@ -103,11 +105,8 @@ SUBROUTINE addusdens_g_gpu(rho)
|
|||
! for the extraordinary unlikely case of more processors than G-vectors
|
||||
IF ( ngm_l <= 0 ) GO TO 10
|
||||
!
|
||||
! ====== this should eventually go away
|
||||
nij = nhm*(nhm+1)/2
|
||||
ALLOCATE (becsum_cpy_d (nij,nat,nspin_mag) )
|
||||
becsum_cpy_d(1:nij,1:nat,1:nspin_mag) = becsum(1:nij,1:nat,1:nspin_mag)
|
||||
! ======
|
||||
! Sync becsum if needed
|
||||
CALL using_becsum_d(0)
|
||||
!
|
||||
ALLOCATE (qmod_d(ngm_l), qgm_d(ngm_l) )
|
||||
ALLOCATE (ylmk0_d(ngm_l, lmaxq * lmaxq) )
|
||||
|
@ -143,7 +142,7 @@ SUBROUTINE addusdens_g_gpu(rho)
|
|||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO im = 1, nspin_mag
|
||||
DO ij = 1, nij
|
||||
tbecsum_d(ij,nb,im) = becsum_cpy_d(ij,na,im)
|
||||
tbecsum_d(ij,nb,im) = becsum_d(ij,na,im)
|
||||
ENDDO
|
||||
ENDDO
|
||||
|
||||
|
@ -180,7 +179,7 @@ SUBROUTINE addusdens_g_gpu(rho)
|
|||
ENDDO
|
||||
!
|
||||
DEALLOCATE (ylmk0_d)
|
||||
DEALLOCATE (qgm_d, qmod_d, becsum_cpy_d)
|
||||
DEALLOCATE (qgm_d, qmod_d)
|
||||
!
|
||||
10 CONTINUE
|
||||
!
|
||||
|
|
|
@ -50,6 +50,8 @@ SUBROUTINE addusforce_g (forcenl)
|
|||
USE control_flags, ONLY : gamma_only
|
||||
USE fft_interfaces,ONLY : fwfft
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(INOUT) :: forcenl (3, nat)
|
||||
|
@ -107,6 +109,9 @@ SUBROUTINE addusforce_g (forcenl)
|
|||
qmod (ig) = sqrt (gg (ngm_s+ig-1) )
|
||||
ENDDO
|
||||
!
|
||||
! Sync if needed
|
||||
CALL using_becsum(0)
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
IF ( upf(nt)%tvanp ) THEN
|
||||
!
|
||||
|
|
|
@ -62,6 +62,8 @@ SUBROUTINE addusstress_g (sigmanlc)
|
|||
USE mp_pools, ONLY : inter_pool_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(inout) :: sigmanlc (3, 3)
|
||||
|
@ -118,6 +120,9 @@ SUBROUTINE addusstress_g (sigmanlc)
|
|||
qmod (ig) = sqrt (gg (ngm_s+ig-1) )
|
||||
ENDDO
|
||||
!
|
||||
! Sync bescum if needed
|
||||
CALL using_becsum(0)
|
||||
!
|
||||
! here we compute the integral Q*V for each atom,
|
||||
! I = sum_G i G_a exp(-iR.G) Q_nm v^*
|
||||
! (no contribution from G=0)
|
||||
|
|
|
@ -43,7 +43,7 @@ SUBROUTINE allocate_nlpot
|
|||
USE wvfct_gpum, ONLY : using_g2kin
|
||||
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0, using_indv_ijkb0_d, &
|
||||
using_deeq, using_deeq_nc, using_deeq_nc_d, &
|
||||
using_qq_at, using_qq_so
|
||||
using_qq_at, using_qq_so, using_becsum, using_ebecsum
|
||||
USE us_gpum, ONLY : using_tab, using_tab_at, using_tab_d2y, using_qrad
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -97,6 +97,7 @@ SUBROUTINE allocate_nlpot
|
|||
ALLOCATE (vkb( npwx, nkb))
|
||||
ALLOCATE (becsum( nhm * (nhm + 1)/2, nat, nspin))
|
||||
if (tqr) ALLOCATE (ebecsum( nhm * (nhm + 1)/2, nat, nspin))
|
||||
CALL using_becsum(2); IF (tqr) CALL using_ebecsum(2)
|
||||
!
|
||||
! Calculate dimensions for array tab (including a possible factor
|
||||
! coming from cell contraction during variable cell relaxation/MD)
|
||||
|
|
|
@ -34,7 +34,7 @@ SUBROUTINE compute_becsum ( iflag )
|
|||
bec_type, becp
|
||||
!
|
||||
USE wavefunctions_module_gpum, ONLY : using_evc
|
||||
USE uspp_gpum, ONLY : using_vkb
|
||||
USE uspp_gpum, ONLY : using_vkb, using_becsum
|
||||
USE becmod_subs_gpum, ONLY : using_becp_auto
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -54,6 +54,7 @@ SUBROUTINE compute_becsum ( iflag )
|
|||
!
|
||||
IF ( iflag == 1) CALL weights ( )
|
||||
!
|
||||
CALL using_becsum(2)
|
||||
becsum(:,:,:) = 0.D0
|
||||
CALL allocate_bec_type (nkb,nbnd, becp,intra_bgrp_comm)
|
||||
CALL using_becp_auto(2)
|
||||
|
|
|
@ -159,6 +159,8 @@ SUBROUTINE PAW_atomic_becsum()
|
|||
USE random_numbers, ONLY : randy
|
||||
USE basis, ONLY : starting_wfc
|
||||
USE noncollin_module, ONLY : nspin_mag, angle1, angle2
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum
|
||||
|
||||
IMPLICIT NONE
|
||||
!REAL(DP), INTENT(INOUT) :: becsum(nhm*(nhm+1)/2,nat,nspin)
|
||||
|
@ -175,6 +177,7 @@ SUBROUTINE PAW_atomic_becsum()
|
|||
IF ( starting_wfc=='random') noise = 0.10_dp
|
||||
!
|
||||
!
|
||||
CALL using_becsum(2)
|
||||
becsum=0.0_DP
|
||||
na_loop: DO na = 1, nat
|
||||
nt = ityp(na)
|
||||
|
|
|
@ -37,6 +37,8 @@ SUBROUTINE read_file()
|
|||
USE gvect, ONLY : ngm, g
|
||||
USE gvecw, ONLY : gcutw
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER :: ierr
|
||||
LOGICAL :: exst
|
||||
|
@ -77,6 +79,7 @@ SUBROUTINE read_file()
|
|||
IF (lda_plus_u .AND. (U_projection == 'pseudo')) CALL init_q_aeps()
|
||||
!
|
||||
IF (okpaw) THEN
|
||||
CALL using_becsum(2)
|
||||
becsum = rho%bec
|
||||
CALL PAW_potential(rho%bec, ddd_PAW)
|
||||
ENDIF
|
||||
|
|
|
@ -1185,6 +1185,8 @@ MODULE realus
|
|||
USE fft_interfaces, ONLY : fwfft
|
||||
USE wavefunctions_module, ONLY : psic
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
! The charge density to be augmented (in G-space)
|
||||
COMPLEX(kind=dp), INTENT(inout) :: rho(dfftp%ngm,nspin_mag)
|
||||
|
@ -1204,6 +1206,8 @@ MODULE realus
|
|||
!
|
||||
CALL start_clock( 'addusdens' )
|
||||
!
|
||||
CALL using_becsum(0)
|
||||
!
|
||||
ALLOCATE ( rho_1(dfftp%nnr,nspin_mag) )
|
||||
rho_1(:,:) = 0.0_dp
|
||||
DO is = 1, nspin_mag
|
||||
|
@ -1286,6 +1290,8 @@ MODULE realus
|
|||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum, using_ebecsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(INOUT) :: forcenl (3, nat)
|
||||
|
@ -1296,6 +1302,8 @@ MODULE realus
|
|||
!
|
||||
IF (.not.okvan) RETURN
|
||||
!
|
||||
CALL using_becsum(0); CALL using_ebecsum(0)
|
||||
!
|
||||
ALLOCATE ( forceq(3,nat) )
|
||||
forceq(:,:) = 0.0_dp
|
||||
!
|
||||
|
@ -1365,6 +1373,8 @@ MODULE realus
|
|||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
USE uspp_gpum, ONLY : using_becsum, using_ebecsum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(INOUT) :: sigmanl (3,3)
|
||||
|
@ -1375,6 +1385,8 @@ MODULE realus
|
|||
!
|
||||
IF (.not.okvan) RETURN
|
||||
!
|
||||
CALL using_becsum(0); CALL using_ebecsum(0)
|
||||
!
|
||||
sus(:,:) = 0.0_dp
|
||||
!
|
||||
DO na = 1, nat
|
||||
|
|
|
@ -46,7 +46,7 @@ SUBROUTINE sum_band()
|
|||
becp
|
||||
USE wavefunctions_module_gpum, ONLY : using_evc
|
||||
USE wvfct_gpum, ONLY : using_et
|
||||
USE uspp_gpum, ONLY : using_vkb
|
||||
USE uspp_gpum, ONLY : using_vkb, using_becsum, using_ebecsum
|
||||
USE becmod_subs_gpum, ONLY : using_becp_auto
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -65,7 +65,10 @@ SUBROUTINE sum_band()
|
|||
!
|
||||
CALL start_clock( 'sum_band' )
|
||||
!
|
||||
CALL using_becsum(2)
|
||||
!
|
||||
becsum(:,:,:) = 0.D0
|
||||
if (tqr) CALL using_ebecsum(2)
|
||||
if (tqr) ebecsum(:,:,:) = 0.D0
|
||||
rho%of_r(:,:) = 0.D0
|
||||
rho%of_g(:,:) = (0.D0, 0.D0)
|
||||
|
|
|
@ -13,6 +13,9 @@ SUBROUTINE sum_band_gpu()
|
|||
! ... Calculates the symmetrized charge density and related quantities
|
||||
! ... Also computes the occupations and the sum of occupied eigenvalues.
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE kinds, ONLY : DP
|
||||
USE ener, ONLY : eband
|
||||
USE control_flags, ONLY : diago_full_acc, gamma_only, lxdm, tqr
|
||||
|
@ -22,7 +25,7 @@ SUBROUTINE sum_band_gpu()
|
|||
USE fft_interfaces, ONLY : fwfft, invfft, fft_interpolate
|
||||
USE gvect, ONLY : ngm, g
|
||||
USE gvecs, ONLY : doublegrid
|
||||
USE klist, ONLY : nks, nkstot, wk, xk, ngk, igk_k
|
||||
USE klist, ONLY : nks, nkstot, wk, xk, ngk, igk_k, igk_k_d
|
||||
USE fixed_occ, ONLY : one_atom_occupations
|
||||
USE ldaU, ONLY : lda_plus_U
|
||||
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
|
||||
|
@ -32,7 +35,7 @@ SUBROUTINE sum_band_gpu()
|
|||
USE buffers, ONLY : get_buffer
|
||||
USE uspp, ONLY : nkb, vkb, becsum, ebecsum, nhtol, nhtoj, indv, okvan
|
||||
USE uspp_param, ONLY : upf, nh, nhm
|
||||
USE wavefunctions_module, ONLY : evc, psic, psic_nc
|
||||
USE wavefunctions_module, ONLY : evc, psic
|
||||
USE noncollin_module, ONLY : noncolin, npol, nspin_mag
|
||||
USE spin_orb, ONLY : lspinorb, domag, fcoef
|
||||
USE wvfct, ONLY : nbnd, npwx, wg, et, btype
|
||||
|
@ -44,9 +47,11 @@ SUBROUTINE sum_band_gpu()
|
|||
USE paw_variables, ONLY : okpaw
|
||||
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, &
|
||||
becp
|
||||
USE wavefunctions_module_gpum, ONLY : using_evc
|
||||
USE wavefunctions_module_gpum, ONLY : evc_d, using_evc, using_evc_d
|
||||
USE wvfct_gpum, ONLY : using_et
|
||||
USE uspp_gpum, ONLY : using_vkb
|
||||
USE uspp_gpum, ONLY : vkb_d, using_vkb, &
|
||||
using_vkb_d, using_becsum_d, using_ebecsum_d, &
|
||||
using_becsum, using_ebecsum
|
||||
USE becmod_subs_gpum, ONLY : using_becp_auto
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -65,7 +70,10 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
CALL start_clock( 'sum_band' )
|
||||
!
|
||||
CALL using_becsum(2)
|
||||
!
|
||||
becsum(:,:,:) = 0.D0
|
||||
if (tqr) CALL using_ebecsum(2)
|
||||
if (tqr) ebecsum(:,:,:) = 0.D0
|
||||
rho%of_r(:,:) = 0.D0
|
||||
rho%of_g(:,:) = (0.D0, 0.D0)
|
||||
|
@ -157,13 +165,17 @@ SUBROUTINE sum_band_gpu()
|
|||
! ... becsum is summed over bands (if bgrp_parallelization is done)
|
||||
! ... and over k-points (but it is not symmetrized)
|
||||
!
|
||||
CALL using_becsum(1) ! use host copy to do the comunication. This avoids going back an forth GPU data
|
||||
CALL mp_sum(becsum, inter_bgrp_comm )
|
||||
CALL mp_sum(becsum, inter_pool_comm )
|
||||
CALL using_becsum_d(0)
|
||||
!
|
||||
! ... same for ebecsum, a correction to becsum (?) in real space
|
||||
!
|
||||
IF (tqr) CALL using_ebecsum(1)
|
||||
IF (tqr) CALL mp_sum(ebecsum, inter_pool_comm )
|
||||
IF (tqr) CALL mp_sum(ebecsum, inter_bgrp_comm )
|
||||
IF (tqr) CALL using_ebecsum_d(0)
|
||||
!
|
||||
! ... PAW: symmetrize becsum and store it
|
||||
! ... FIXME: the same should be done for USPP as well
|
||||
|
@ -245,13 +257,23 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
REAL(DP) :: w1, w2
|
||||
! weights
|
||||
INTEGER :: npw, idx, ioff, ioff_tg, nxyp, incr, v_siz, j, ir3
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psi(:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho(:)
|
||||
INTEGER :: npw, idx, ioff, ioff_tg, nxyp, incr, v_siz, j
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psi_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psic_d(:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho_d(:), tg_rho_h(:)
|
||||
REAL(DP), ALLOCATABLE :: rho_d(:,:)
|
||||
INTEGER, POINTER :: dffts_nl_d(:), dffts_nlm_d(:)
|
||||
LOGICAL :: use_tg
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: psic_d, tg_psi_d, tg_rho_d, rho_d
|
||||
attributes(device) :: dffts_nl_d, dffts_nlm_d
|
||||
attributes(pinned) :: tg_rho_h
|
||||
#endif
|
||||
!
|
||||
CALL using_evc(0); CALL using_et(0)
|
||||
CALL using_evc_d(0); CALL using_et(0)
|
||||
dffts_nl_d => dffts%nl_d
|
||||
dffts_nlm_d => dffts%nlm_d
|
||||
!
|
||||
! ... here we sum for each k point the contribution
|
||||
! ... of the wavefunctions to the charge
|
||||
|
@ -264,27 +286,32 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
v_siz = dffts%nnr_tg
|
||||
!
|
||||
ALLOCATE( tg_psi( v_siz ) )
|
||||
ALLOCATE( tg_rho( v_siz ) )
|
||||
ALLOCATE( tg_psi_d( v_siz ) )
|
||||
ALLOCATE( tg_rho_d( v_siz ) )
|
||||
ALLOCATE( tg_rho_h( v_siz ) )
|
||||
!
|
||||
incr = 2 * fftx_ntgrp(dffts)
|
||||
!
|
||||
ELSE
|
||||
ALLOCATE( rho_d, MOLD=rho%of_r ) ! OPTIMIZE HERE, use buffers (and batched FFT)
|
||||
ALLOCATE(psic_d(dfftp%nnr))
|
||||
rho_d = 0.0_DP
|
||||
END IF
|
||||
!
|
||||
k_loop: DO ik = 1, nks
|
||||
!
|
||||
IF ( use_tg ) tg_rho = 0.0_DP
|
||||
IF ( use_tg ) tg_rho_d = 0.0_DP
|
||||
IF ( lsda ) current_spin = isk(ik)
|
||||
!
|
||||
npw = ngk(ik)
|
||||
!
|
||||
IF ( nks > 1 ) &
|
||||
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
|
||||
IF ( nks > 1 ) CALL using_evc(1) ! get_buffer(evc, ...) evc is updated (intent out)
|
||||
IF ( nks > 1 ) CALL using_evc(2) ! get_buffer(evc, ...) evc is updated (intent out)
|
||||
IF ( nks > 1 ) CALL using_evc_d(0) ! sync on the GPU
|
||||
!
|
||||
IF ( nkb > 0 ) CALL using_vkb(1)
|
||||
IF ( nkb > 0 ) &
|
||||
CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb )
|
||||
IF ( nkb > 0 ) CALL using_vkb_d(2)
|
||||
IF ( nkb > 0 ) CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d )
|
||||
!
|
||||
! ... here we compute the band energy: the sum of the eigenvalues
|
||||
!
|
||||
|
@ -302,7 +329,7 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
tg_psi(:) = ( 0.D0, 0.D0 )
|
||||
tg_psi_d(:) = ( 0.D0, 0.D0 )
|
||||
ioff = 0
|
||||
!
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
|
@ -313,16 +340,18 @@ SUBROUTINE sum_band_gpu()
|
|||
! ... 2*ntgrp ffts at the same time
|
||||
!
|
||||
IF( idx + ibnd - 1 < ibnd_end ) THEN
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO j = 1, npw
|
||||
tg_psi(dffts%nl (j)+ioff)= evc(j,idx+ibnd-1)+&
|
||||
(0.0d0,1.d0) * evc(j,idx+ibnd)
|
||||
tg_psi(dffts%nlm(j)+ioff)=CONJG(evc(j,idx+ibnd-1) -&
|
||||
(0.0d0,1.d0) * evc(j,idx+ibnd) )
|
||||
tg_psi_d(dffts_nl_d (j)+ioff)= evc_d(j,idx+ibnd-1)+&
|
||||
(0.0d0,1.d0) * evc_d(j,idx+ibnd)
|
||||
tg_psi_d(dffts_nlm_d(j)+ioff)=CONJG(evc_d(j,idx+ibnd-1) -&
|
||||
(0.0d0,1.d0) * evc_d(j,idx+ibnd) )
|
||||
END DO
|
||||
ELSE IF( idx + ibnd - 1 == ibnd_end ) THEN
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO j = 1, npw
|
||||
tg_psi(dffts%nl (j)+ioff)= evc(j,idx+ibnd-1)
|
||||
tg_psi(dffts%nlm(j)+ioff)=CONJG( evc(j,idx+ibnd-1) )
|
||||
tg_psi_d(dffts_nl_d (j)+ioff)= evc_d(j,idx+ibnd-1)
|
||||
tg_psi_d(dffts_nlm_d(j)+ioff)=CONJG( evc_d(j,idx+ibnd-1) )
|
||||
END DO
|
||||
END IF
|
||||
|
||||
|
@ -330,7 +359,7 @@ SUBROUTINE sum_band_gpu()
|
|||
|
||||
END DO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psi, dffts )
|
||||
CALL invfft ('tgWave', tg_psi_d, dffts )
|
||||
!
|
||||
! Now the first proc of the group holds the first two bands
|
||||
! of the 2*ntgrp bands that we are processing at the same time,
|
||||
|
@ -361,29 +390,35 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
CALL get_rho_gamma_gpu(tg_rho, dffts%nr1x*dffts%nr2x*right_nr3, w1, w2, tg_psi)
|
||||
CALL get_rho_gamma_gpu(tg_rho_d, dffts%nr1x*dffts%nr2x*right_nr3, w1, w2, tg_psi_d)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
psic(:) = ( 0.D0, 0.D0 )
|
||||
psic_d(:) = ( 0.D0, 0.D0 )
|
||||
!
|
||||
IF ( ibnd < ibnd_end ) THEN
|
||||
!
|
||||
! ... two ffts at the same time
|
||||
!
|
||||
psic(dffts%nl(1:npw)) = evc(1:npw,ibnd) + &
|
||||
( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1)
|
||||
psic(dffts%nlm(1:npw)) = CONJG( evc(1:npw,ibnd) - &
|
||||
( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) )
|
||||
!$cuf kernel do(1)
|
||||
DO j=1,npw
|
||||
psic_d(dffts_nl_d(j)) = evc_d(j,ibnd) + &
|
||||
( 0.D0, 1.D0 ) * evc_d(j,ibnd+1)
|
||||
psic_d(dffts_nlm_d(j)) = CONJG( evc_d(j,ibnd) - &
|
||||
( 0.D0, 1.D0 ) * evc_d(j,ibnd+1) )
|
||||
END DO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
psic(dffts%nl (1:npw)) = evc(1:npw,ibnd)
|
||||
psic(dffts%nlm(1:npw)) = CONJG( evc(1:npw,ibnd) )
|
||||
!$cuf kernel do(1)
|
||||
DO j=1,npw
|
||||
psic_d(dffts_nl_d (j)) = evc_d(j,ibnd)
|
||||
psic_d(dffts_nlm_d(j)) = CONJG( evc_d(j,ibnd) )
|
||||
END DO
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL invfft ('Wave', psic, dffts)
|
||||
CALL invfft ('Wave', psic_d, dffts)
|
||||
!
|
||||
w1 = wg(ibnd,ik) / omega
|
||||
!
|
||||
|
@ -401,7 +436,7 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
END IF
|
||||
!
|
||||
CALL get_rho_gamma_gpu(rho%of_r(:,current_spin), dffts%nnr, w1, w2, psic)
|
||||
CALL get_rho_gamma_gpu(rho_d(:,current_spin), dffts%nnr, w1, w2, psic_d(:))
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -444,8 +479,8 @@ SUBROUTINE sum_band_gpu()
|
|||
END DO
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
|
||||
CALL tg_reduce_rho( rho%of_r, tg_rho, current_spin, dffts )
|
||||
tg_rho_h = tg_rho_d
|
||||
CALL tg_reduce_rho( rho%of_r, tg_rho_h, current_spin, dffts )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -455,14 +490,24 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
END DO k_loop
|
||||
!
|
||||
IF( .not. use_tg ) THEN
|
||||
rho%of_r = rho_d
|
||||
END IF
|
||||
!
|
||||
! ... with distributed <beta|psi>, sum over bands
|
||||
!
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() ) CALL using_becsum(1)
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() ) CALL mp_sum( becsum, becp%comm )
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() .and. tqr ) CALL using_ebecsum(1)
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() .and. tqr ) CALL mp_sum( ebecsum, becp%comm )
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_psi )
|
||||
DEALLOCATE( tg_rho )
|
||||
DEALLOCATE( tg_psi_d )
|
||||
DEALLOCATE( tg_rho_d )
|
||||
DEALLOCATE( tg_rho_h )
|
||||
ELSE
|
||||
DEALLOCATE(rho_d)
|
||||
DEALLOCATE(psic_d)
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
|
@ -476,6 +521,7 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
! ... k-points version
|
||||
!
|
||||
USE wavefunctions_module_gpum, ONLY : psic_nc_d
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE mp, ONLY : mp_sum, mp_get_comm_null
|
||||
USE fft_helper_subroutines
|
||||
|
@ -488,13 +534,24 @@ SUBROUTINE sum_band_gpu()
|
|||
! weights
|
||||
INTEGER :: npw, ipol, na, np
|
||||
!
|
||||
INTEGER :: idx, ioff, ioff_tg, nxyp, incr, v_siz, j, ir3
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psi(:), tg_psi_nc(:,:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho(:), tg_rho_nc(:,:)
|
||||
INTEGER :: idx, ioff, ioff_tg, nxyp, incr, v_siz, j
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psi_d(:), tg_psi_nc_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho_d(:), tg_rho_nc_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rho_h(:), tg_rho_nc_h(:,:)
|
||||
REAL(DP), ALLOCATABLE :: rho_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psic_d(:)
|
||||
INTEGER, POINTER :: dffts_nl_d(:)
|
||||
LOGICAL :: use_tg
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp
|
||||
INTEGER :: right_nnr, right_nr3, right_inc, ntgrp, ierr
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: psic_d, tg_psi_d, tg_rho_d, tg_psi_nc_d, tg_rho_nc_d
|
||||
attributes(device) :: rho_d, dffts_nl_d
|
||||
attributes(pinned) :: tg_rho_h, tg_rho_nc_h
|
||||
#endif
|
||||
!
|
||||
CALL using_evc(0); CALL using_et(0)
|
||||
dffts_nl_d => dffts%nl_d
|
||||
!
|
||||
!
|
||||
! ... here we sum for each k point the contribution
|
||||
|
@ -509,24 +566,31 @@ SUBROUTINE sum_band_gpu()
|
|||
v_siz = dffts%nnr_tg
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
ALLOCATE( tg_psi_nc( v_siz, npol ) )
|
||||
ALLOCATE( tg_rho_nc( v_siz, nspin_mag ) )
|
||||
ALLOCATE( tg_psi_nc_d( v_siz, npol ) )
|
||||
ALLOCATE( tg_rho_nc_d( v_siz, nspin_mag ) )
|
||||
ALLOCATE( tg_rho_nc_h( v_siz, nspin_mag ) )
|
||||
ELSE
|
||||
ALLOCATE( tg_psi( v_siz ) )
|
||||
ALLOCATE( tg_rho( v_siz ) )
|
||||
ALLOCATE( tg_psi_d( v_siz ) )
|
||||
ALLOCATE( tg_rho_d( v_siz ) )
|
||||
ALLOCATE( tg_rho_h( v_siz ) )
|
||||
ENDIF
|
||||
!
|
||||
incr = fftx_ntgrp(dffts)
|
||||
!
|
||||
ELSE
|
||||
ALLOCATE(rho_d, MOLD=rho%of_r) ! OPTIMIZE HERE, use buffers!
|
||||
ALLOCATE(psic_d(dffts%nnr))
|
||||
! This is used as reduction variable on the device
|
||||
rho_d = 0.0_DP
|
||||
END IF
|
||||
!
|
||||
k_loop: DO ik = 1, nks
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
IF (noncolin) THEN
|
||||
tg_rho_nc = 0.0_DP
|
||||
tg_rho_nc_d = 0.0_DP
|
||||
ELSE
|
||||
tg_rho = 0.0_DP
|
||||
tg_rho_d = 0.0_DP
|
||||
ENDIF
|
||||
ENDIF
|
||||
|
||||
|
@ -535,11 +599,12 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
IF ( nks > 1 ) &
|
||||
CALL get_buffer ( evc, nwordwfc, iunwfc, ik )
|
||||
IF ( nks > 1 ) CALL using_evc(1)
|
||||
IF ( nks > 1 ) CALL using_evc(2)
|
||||
IF ( nks > 1 ) CALL using_evc_d(0) ! sync evc on GPU, OPTIMIZE (use async here)
|
||||
!
|
||||
IF ( nkb > 0 ) CALL using_vkb(1)
|
||||
IF ( nkb > 0 ) CALL using_vkb_d(2)
|
||||
IF ( nkb > 0 ) &
|
||||
CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb )
|
||||
CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d )
|
||||
!
|
||||
! ... here we compute the band energy: the sum of the eigenvalues
|
||||
!
|
||||
|
@ -561,7 +626,7 @@ SUBROUTINE sum_band_gpu()
|
|||
IF (noncolin) THEN
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
tg_psi_nc = ( 0.D0, 0.D0 )
|
||||
tg_psi_nc_d = ( 0.D0, 0.D0 )
|
||||
!
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
ntgrp = fftx_ntgrp( dffts )
|
||||
|
@ -573,11 +638,12 @@ SUBROUTINE sum_band_gpu()
|
|||
! ... ntgrp ffts at the same time
|
||||
!
|
||||
IF( idx + ibnd - 1 <= ibnd_end ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO j = 1, npw
|
||||
tg_psi_nc( dffts%nl(igk_k(j,ik) ) + ioff, 1 ) = &
|
||||
evc( j, idx+ibnd-1 )
|
||||
tg_psi_nc( dffts%nl(igk_k(j,ik) ) + ioff, 2 ) = &
|
||||
evc( j+npwx, idx+ibnd-1 )
|
||||
tg_psi_nc_d( dffts_nl_d(igk_k_d(j,ik) ) + ioff, 1 ) = &
|
||||
evc_d( j, idx+ibnd-1 )
|
||||
tg_psi_nc_d( dffts_nl_d(igk_k_d(j,ik) ) + ioff, 2 ) = &
|
||||
evc_d( j+npwx, idx+ibnd-1 )
|
||||
END DO
|
||||
END IF
|
||||
|
||||
|
@ -585,8 +651,8 @@ SUBROUTINE sum_band_gpu()
|
|||
|
||||
END DO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psi_nc(:,1), dffts )
|
||||
CALL invfft ('tgWave', tg_psi_nc(:,2), dffts)
|
||||
CALL invfft ('tgWave', tg_psi_nc_d(:,1), dffts)
|
||||
CALL invfft ('tgWave', tg_psi_nc_d(:,2), dffts)
|
||||
!
|
||||
! Now the first proc of the group holds the first band
|
||||
! of the ntgrp bands that we are processing at the same time,
|
||||
|
@ -609,37 +675,43 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
! OPTIMIZE HERE : this is a sum of all densities in first spin channel
|
||||
DO ipol=1,npol
|
||||
CALL get_rho_gpu(tg_rho_nc(:,1), dffts%nr1x * dffts%nr2x* right_nr3, w1, tg_psi_nc(:,ipol))
|
||||
CALL get_rho_gpu(tg_rho_nc_d(:,1), dffts%nr1x * dffts%nr2x* right_nr3, w1, tg_psi_nc_d(:,ipol))
|
||||
ENDDO
|
||||
!
|
||||
IF (domag) CALL get_rho_domag_gpu(tg_rho_nc(:,:), dffts%nr1x*dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc(:,:))
|
||||
IF (domag) CALL get_rho_domag_gpu(tg_rho_nc_d(:,:), dffts%nr1x*dffts%nr2x*dffts%my_nr3p, w1, tg_psi_nc_d(:,:))
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! Noncollinear case without task groups
|
||||
!
|
||||
psic_nc = (0.D0,0.D0)
|
||||
DO ig = 1, npw
|
||||
psic_nc(dffts%nl(igk_k(ig,ik)),1)=evc(ig ,ibnd)
|
||||
psic_nc(dffts%nl(igk_k(ig,ik)),2)=evc(ig+npwx,ibnd)
|
||||
psic_nc_d = (0.D0,0.D0)
|
||||
|
||||
!$cuf kernel do(1)
|
||||
DO j = 1, npw
|
||||
psic_nc_d( dffts_nl_d(igk_k_d(j,ik) ), 1 ) = &
|
||||
evc_d( j, ibnd )
|
||||
psic_nc_d( dffts_nl_d(igk_k_d(j,ik) ), 2 ) = &
|
||||
evc_d( j+npwx, ibnd )
|
||||
END DO
|
||||
CALL invfft ('Wave', psic_nc(:,1), dffts)
|
||||
CALL invfft ('Wave', psic_nc(:,2), dffts)
|
||||
!
|
||||
CALL invfft ('Wave', psic_nc_d(:,1), dffts)
|
||||
CALL invfft ('Wave', psic_nc_d(:,2), dffts)
|
||||
!
|
||||
! increment the charge density ...
|
||||
!
|
||||
DO ipol=1,npol
|
||||
CALL get_rho_gpu(rho%of_r(:,1), dffts%nnr, w1, psic_nc(:,ipol))
|
||||
CALL get_rho_gpu(rho_d(:,1), dffts%nnr, w1, psic_nc_d(:,ipol))
|
||||
END DO
|
||||
!
|
||||
! In this case, calculate also the three
|
||||
! components of the magnetization (stored in rho%of_r(ir,2-4))
|
||||
!
|
||||
IF (domag) THEN
|
||||
CALL get_rho_domag_gpu(rho%of_r(:,:), dffts%nnr, w1, psic_nc(:,:))
|
||||
CALL get_rho_domag_gpu(rho_d(1:,1:), dffts%nnr, w1, psic_nc_d(1:,1:))
|
||||
ELSE
|
||||
rho%of_r(:,2:4)=0.0_DP
|
||||
rho_d(:,2:4)=0.0_DP ! OPTIMIZE HERE: this memset can be avoided
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
|
@ -648,12 +720,10 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
!$omp parallel default(shared), private(j,ioff,idx)
|
||||
!$omp do
|
||||
DO j = 1, SIZE( tg_psi )
|
||||
tg_psi(j) = ( 0.D0, 0.D0 )
|
||||
!$cuf kernel do(1)
|
||||
DO j = 1, SIZE( tg_psi_d )
|
||||
tg_psi_d(j) = ( 0.D0, 0.D0 )
|
||||
END DO
|
||||
!$omp end do
|
||||
!
|
||||
ioff = 0
|
||||
!
|
||||
|
@ -665,19 +735,17 @@ SUBROUTINE sum_band_gpu()
|
|||
! ... ntgrp ffts at the same time
|
||||
!
|
||||
IF( idx + ibnd - 1 <= ibnd_end ) THEN
|
||||
!$omp do
|
||||
!$cuf kernel do(1)
|
||||
DO j = 1, npw
|
||||
tg_psi( dffts%nl(igk_k(j,ik))+ioff ) = evc(j,idx+ibnd-1)
|
||||
tg_psi_d( dffts_nl_d(igk_k_d(j,ik))+ioff ) = evc_d(j,idx+ibnd-1)
|
||||
END DO
|
||||
!$omp end do
|
||||
END IF
|
||||
|
||||
ioff = ioff + right_nnr
|
||||
|
||||
END DO
|
||||
!$omp end parallel
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psi, dffts)
|
||||
CALL invfft ('tgWave', tg_psi_d, dffts)
|
||||
!
|
||||
! Now the first proc of the group holds the first band
|
||||
! of the ntgrp bands that we are processing at the same time,
|
||||
|
@ -700,19 +768,22 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
CALL get_rho_gpu(tg_rho, dffts%nr1x * dffts%nr2x * right_nr3, w1, tg_psi)
|
||||
CALL get_rho_gpu(tg_rho_d, dffts%nr1x * dffts%nr2x * right_nr3, w1, tg_psi_d)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
psic(:) = ( 0.D0, 0.D0 )
|
||||
psic_d(:) = ( 0.D0, 0.D0 )
|
||||
!
|
||||
psic(dffts%nl(igk_k(1:npw,ik))) = evc(1:npw,ibnd)
|
||||
!$cuf kernel do(1)
|
||||
DO j = 1, npw
|
||||
psic_d(dffts_nl_d(igk_k_d(j,ik))) = evc_d(j,ibnd)
|
||||
END DO
|
||||
!
|
||||
CALL invfft ('Wave', psic, dffts)
|
||||
CALL invfft ('Wave', psic_d, dffts)
|
||||
!
|
||||
! ... increment the charge density ...
|
||||
!
|
||||
CALL get_rho_gpu(rho%of_r(:,current_spin), dffts%nnr, w1, psic)
|
||||
CALL get_rho_gpu(rho_d(:,current_spin), dffts%nnr, w1, psic_d(:))
|
||||
|
||||
END IF
|
||||
!
|
||||
|
@ -721,14 +792,14 @@ SUBROUTINE sum_band_gpu()
|
|||
psic(:) = ( 0.D0, 0.D0 )
|
||||
!
|
||||
kplusg (1:npw) = (xk(j,ik)+g(j,igk_k(1:npw,ik))) * tpiba
|
||||
psic(dffts%nl(igk_k(1:npw,ik)))=CMPLX(0d0,kplusg(1:npw),kind=DP) * &
|
||||
psic(dffts_nl_d(igk_k(1:npw,ik)))=CMPLX(0d0,kplusg(1:npw),kind=DP) * &
|
||||
evc(1:npw,ibnd)
|
||||
!
|
||||
CALL invfft ('Wave', psic, dffts)
|
||||
!
|
||||
! ... increment the kinetic energy density ...
|
||||
!
|
||||
CALL get_rho_gpu(rho%kin_r(:,current_spin), dffts%nnr, w1, psic)
|
||||
CALL get_rho(rho%kin_r(:,current_spin), dffts%nnr, w1, psic)
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
|
@ -740,7 +811,9 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
! reduce the charge across task group
|
||||
!
|
||||
CALL tg_reduce_rho( rho%of_r, tg_rho_nc, tg_rho, current_spin, noncolin, domag, dffts )
|
||||
IF (noncolin) tg_rho_nc_h = tg_rho_nc_d
|
||||
IF (.not. noncolin) tg_rho_h = tg_rho_d
|
||||
CALL tg_reduce_rho( rho%of_r, tg_rho_nc_h, tg_rho_h, current_spin, noncolin, domag, dffts )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -750,19 +823,30 @@ SUBROUTINE sum_band_gpu()
|
|||
!
|
||||
END DO k_loop
|
||||
!
|
||||
IF (.not. use_tg ) THEN
|
||||
rho%of_r = rho_d
|
||||
END IF
|
||||
!
|
||||
! ... with distributed <beta|psi>, sum over bands
|
||||
!
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() ) CALL using_becsum(1)
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() ) CALL mp_sum( becsum, becp%comm )
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() .and. tqr ) CALL using_ebecsum(1)
|
||||
IF( okvan .AND. becp%comm /= mp_get_comm_null() .and. tqr ) CALL mp_sum( ebecsum, becp%comm )
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
IF (noncolin) THEN
|
||||
DEALLOCATE( tg_psi_nc )
|
||||
DEALLOCATE( tg_rho_nc )
|
||||
DEALLOCATE( tg_psi_nc_d )
|
||||
DEALLOCATE( tg_rho_nc_d )
|
||||
DEALLOCATE( tg_rho_nc_h )
|
||||
ELSE
|
||||
DEALLOCATE( tg_psi )
|
||||
DEALLOCATE( tg_rho )
|
||||
DEALLOCATE( tg_psi_d )
|
||||
DEALLOCATE( tg_rho_d )
|
||||
DEALLOCATE( tg_rho_h )
|
||||
END IF
|
||||
ELSE
|
||||
DEALLOCATE(rho_d) ! OPTIMIZE HERE, use buffers!
|
||||
DEALLOCATE(psic_d) ! OPTIMIZE HERE, use buffers!
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
|
@ -770,81 +854,104 @@ SUBROUTINE sum_band_gpu()
|
|||
END SUBROUTINE sum_band_k_gpu
|
||||
!
|
||||
!
|
||||
SUBROUTINE get_rho_gpu(rho_loc, nrxxs_loc, w1_loc, psic_loc)
|
||||
SUBROUTINE get_rho_gpu(rho_loc_d, nrxxs_loc, w1_loc, psic_loc_d)
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
INTEGER :: nrxxs_loc
|
||||
REAL(DP) :: rho_loc(nrxxs_loc)
|
||||
REAL(DP) :: rho_loc_d(:)
|
||||
REAL(DP) :: w1_loc
|
||||
COMPLEX(DP) :: psic_loc(nrxxs_loc)
|
||||
|
||||
COMPLEX(DP) :: psic_loc_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: rho_loc_d, psic_loc_d
|
||||
#endif
|
||||
INTEGER :: ir
|
||||
|
||||
!$omp parallel do
|
||||
!$cuf kernel do(1)
|
||||
DO ir = 1, nrxxs_loc
|
||||
!
|
||||
rho_loc(ir) = rho_loc(ir) + &
|
||||
w1_loc * ( DBLE( psic_loc(ir) )**2 + &
|
||||
AIMAG( psic_loc(ir) )**2 )
|
||||
rho_loc_d(ir) = rho_loc_d(ir) + &
|
||||
w1_loc * ( DBLE( psic_loc_d(ir) )**2 + &
|
||||
AIMAG( psic_loc_d(ir) )**2 )
|
||||
!
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
|
||||
END SUBROUTINE get_rho_gpu
|
||||
|
||||
SUBROUTINE get_rho_gamma_gpu(rho_loc, nrxxs_loc, w1_loc, w2_loc, psic_loc)
|
||||
!
|
||||
SUBROUTINE get_rho(rho_loc_h, nrxxs_loc, w1_loc, psic_loc_h)
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
INTEGER :: nrxxs_loc
|
||||
REAL(DP) :: rho_loc(nrxxs_loc)
|
||||
REAL(DP) :: w1_loc, w2_loc
|
||||
COMPLEX(DP) :: psic_loc(nrxxs_loc)
|
||||
|
||||
REAL(DP) :: rho_loc_h(nrxxs_loc)
|
||||
REAL(DP) :: w1_loc
|
||||
COMPLEX(DP) :: psic_loc_h(nrxxs_loc)
|
||||
INTEGER :: ir
|
||||
|
||||
!$omp parallel do
|
||||
DO ir = 1, nrxxs_loc
|
||||
!
|
||||
rho_loc(ir) = rho_loc(ir) + &
|
||||
w1_loc * DBLE( psic_loc(ir) )**2 + &
|
||||
w2_loc * AIMAG( psic_loc(ir) )**2
|
||||
rho_loc_h(ir) = rho_loc_h(ir) + &
|
||||
w1_loc * ( DBLE( psic_loc_h(ir) )**2 + &
|
||||
AIMAG( psic_loc_h(ir) )**2 )
|
||||
!
|
||||
END DO
|
||||
|
||||
END SUBROUTINE get_rho
|
||||
|
||||
SUBROUTINE get_rho_gamma_gpu(rho_loc_d, nrxxs_loc, w1_loc, w2_loc, psic_loc_d)
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
INTEGER :: nrxxs_loc
|
||||
REAL(DP) :: rho_loc_d(nrxxs_loc)
|
||||
REAL(DP) :: w1_loc, w2_loc
|
||||
COMPLEX(DP) :: psic_loc_d(nrxxs_loc)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: rho_loc_d, psic_loc_d
|
||||
#endif
|
||||
INTEGER :: ir
|
||||
|
||||
!$cuf kernel do(1)
|
||||
DO ir = 1, nrxxs_loc
|
||||
!
|
||||
rho_loc_d(ir) = rho_loc_d(ir) + &
|
||||
w1_loc * DBLE( psic_loc_d(ir) )**2 + &
|
||||
w2_loc * AIMAG( psic_loc_d(ir) )**2
|
||||
!
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
|
||||
END SUBROUTINE get_rho_gamma_gpu
|
||||
|
||||
|
||||
SUBROUTINE get_rho_domag_gpu(rho_loc, nrxxs_loc, w1_loc, psic_loc)
|
||||
SUBROUTINE get_rho_domag_gpu(rho_loc_d, nrxxs_loc, w1_loc, psic_loc_d)
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
INTEGER :: nrxxs_loc
|
||||
REAL(DP) :: rho_loc(:, :)
|
||||
REAL(DP) :: rho_loc_d(:, :)
|
||||
REAL(DP) :: w1_loc
|
||||
COMPLEX(DP) :: psic_loc(:, :)
|
||||
|
||||
COMPLEX(DP) :: psic_loc_d(:, :)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: rho_loc_d, psic_loc_d
|
||||
#endif
|
||||
INTEGER :: ir
|
||||
|
||||
!$omp parallel do
|
||||
!$cuf kernel do(1)
|
||||
DO ir = 1, nrxxs_loc
|
||||
!
|
||||
rho_loc(ir,2) = rho_loc(ir,2) + w1_loc*2.D0* &
|
||||
(DBLE(psic_loc(ir,1))* DBLE(psic_loc(ir,2)) + &
|
||||
AIMAG(psic_loc(ir,1))*AIMAG(psic_loc(ir,2)))
|
||||
rho_loc_d(ir,2) = rho_loc_d(ir,2) + w1_loc*2.D0* &
|
||||
(DBLE(psic_loc_d(ir,1))* DBLE(psic_loc_d(ir,2)) + &
|
||||
AIMAG(psic_loc_d(ir,1))*AIMAG(psic_loc_d(ir,2)))
|
||||
|
||||
rho_loc(ir,3) = rho_loc(ir,3) + w1_loc*2.D0* &
|
||||
(DBLE(psic_loc(ir,1))*AIMAG(psic_loc(ir,2)) - &
|
||||
DBLE(psic_loc(ir,2))*AIMAG(psic_loc(ir,1)))
|
||||
rho_loc_d(ir,3) = rho_loc_d(ir,3) + w1_loc*2.D0* &
|
||||
(DBLE(psic_loc_d(ir,1))*AIMAG(psic_loc_d(ir,2)) - &
|
||||
DBLE(psic_loc_d(ir,2))*AIMAG(psic_loc_d(ir,1)))
|
||||
|
||||
rho_loc(ir,4) = rho_loc(ir,4) + w1_loc* &
|
||||
(DBLE(psic_loc(ir,1))**2+AIMAG(psic_loc(ir,1))**2 &
|
||||
-DBLE(psic_loc(ir,2))**2-AIMAG(psic_loc(ir,2))**2)
|
||||
rho_loc_d(ir,4) = rho_loc_d(ir,4) + w1_loc* &
|
||||
(DBLE(psic_loc_d(ir,1))**2+AIMAG(psic_loc_d(ir,1))**2 &
|
||||
-DBLE(psic_loc_d(ir,2))**2-AIMAG(psic_loc_d(ir,2))**2)
|
||||
!
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
|
||||
END SUBROUTINE get_rho_domag_gpu
|
||||
|
||||
|
@ -880,7 +987,8 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
|
|||
USE mp, ONLY : mp_sum
|
||||
USE wavefunctions_module_gpum, ONLY : using_evc
|
||||
USE wvfct_gpum, ONLY : using_et
|
||||
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0
|
||||
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0, &
|
||||
using_becsum, using_ebecsum
|
||||
USE becmod_subs_gpum, ONLY : using_becp_auto
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -898,6 +1006,8 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
|
|||
CALL using_vkb(0)
|
||||
CALL using_indv_ijkb0(0)
|
||||
CALL using_becp_auto(2)
|
||||
CALL using_becsum(1)
|
||||
if (tqr) CALL using_ebecsum(1)
|
||||
!
|
||||
npw = ngk(ik)
|
||||
IF ( .NOT. real_space ) THEN
|
||||
|
|
Loading…
Reference in New Issue