Merge branch 'qe_dav' into 'develop'

Porting on GPU of turbo_Davidson with openAcc

See merge request QEF/q-e!2311
This commit is contained in:
giannozz 2024-03-25 07:53:18 +00:00
commit 1b7d1c2295
11 changed files with 834 additions and 487 deletions

View File

@ -54,8 +54,11 @@ FUNCTION lr_dot(x,y)
!
IF (gamma_only) THEN
!
!$acc data present_or_copyin(x,y) copyin(wg) copy(temp_gamma)
CALL lr_dot_gamma()
!$acc end data
lr_dot = cmplx(temp_gamma,0.0d0,dp)
lr_dot = lr_dot/degspin
!
ELSEIF (noncolin) THEN
!
@ -65,11 +68,10 @@ FUNCTION lr_dot(x,y)
ELSE
!
CALL lr_dot_k()
lr_dot = lr_dot/degspin
!
ENDIF
!
lr_dot = lr_dot/degspin
!
CALL stop_clock ('lr_dot')
!
RETURN
@ -80,7 +82,29 @@ CONTAINS
!
! Optical case: gamma_only
! Noncollinear case is not implemented
!
USE wvfct, ONLY : npwx,nbnd,wg
#if defined(__CUDA)
use cublas
#endif
!
INTEGER :: ibnd
REAL(DP), EXTERNAL :: MYDDOT_VECTOR_GPU
!$acc routine(MYDDOT_VECTOR_GPU) vector
!
! !$acc data present(x,y,wg,temp_gamma)
#if defined(__CUDA)
!$acc parallel loop reduction(temp_gamma)
DO ibnd=1,nbnd
!
temp_gamma = temp_gamma + 2.D0*wg(ibnd,1)*MYDDOT_VECTOR_GPU(2*ngk(1),x(:,ibnd,1),y(:,ibnd,1))
!
! G=0 has been accounted twice, so we subtract one contribution.
!
IF (gstart==2) temp_gamma = temp_gamma - wg(ibnd,1)*dble(x(1,ibnd,1))*dble(y(1,ibnd,1))
!
ENDDO
#else
DO ibnd=1,nbnd
!
temp_gamma = temp_gamma + 2.D0*wg(ibnd,1)*DDOT(2*ngk(1),x(:,ibnd,1),1,y(:,ibnd,1),1)
@ -90,11 +114,15 @@ CONTAINS
IF (gstart==2) temp_gamma = temp_gamma - wg(ibnd,1)*dble(x(1,ibnd,1))*dble(y(1,ibnd,1))
!
ENDDO
!
#if defined(__MPI)
CALL mp_sum(temp_gamma, intra_bgrp_comm)
#endif
!
#if defined(__MPI)
!$acc host_data use_device(temp_gamma)
CALL mp_sum(temp_gamma, intra_bgrp_comm)
!$acc end host_data
#endif
!
! !$acc end data
RETURN
!
END SUBROUTINE lr_dot_gamma

View File

@ -46,7 +46,9 @@ SUBROUTINE lr_sm1_psi (ik, lda, n, m, psi, spsi)
CALL start_clock( 'lr_sm1_psi' )
!
IF ( gamma_only ) THEN
!$acc data present_or_copyin(psi) present_or_copyout(spsi)
CALL sm1_psi_gamma()
!$acc end data
ELSEIF (noncolin) THEN
CALL sm1_psi_nc()
ELSE
@ -74,6 +76,11 @@ CONTAINS
calbec_rs_gamma, add_vuspsir_gamma, &
v_loc_psir, s_psir_gamma
USE lrus, ONLY : bbg
USE uspp, ONLY : vkb
#if defined(__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
!
@ -83,9 +90,15 @@ CONTAINS
! counters
REAL(DP), ALLOCATABLE :: ps(:,:)
!
!
! Initialize spsi : spsi = psi
!
CALL ZCOPY( lda * npol * m, psi, 1, spsi, 1 )
! !$acc data present(psi,spsi)
!
!$acc kernels
spsi(:,:) = psi(:,:)
!$acc end kernels
!!CALL ZCOPY( lda * npol * m, psi, 1, spsi, 1 )
!
IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
!
@ -97,7 +110,7 @@ CONTAINS
ENDDO
!
ELSE
CALL calbec(n,vkb,psi,becp,m)
CALL calbec(n,vkb,psi,becp,m)
ENDIF
!
! Use the array ps as a workspace
@ -114,8 +127,13 @@ CONTAINS
!
! Step 2 : |spsi> = S^{-1} * |psi> = |psi> + ps * |beta>
!
!$acc enter data copyin(ps)
!$acc host_data use_device(vkb, ps, spsi)
call DGEMM('N','N',2*n,m,nkb,1.d0,vkb,2*lda,ps,nkb,1.d0,spsi,2*lda)
!$acc end host_data
!
!$acc exit data delete(ps)
! !$acc end data
DEALLOCATE(ps)
!
RETURN

View File

@ -69,7 +69,7 @@ SUBROUTINE orthogonalize(dvpsi, evq, ikk, ikq, dpsi, npwq, dpsi_computed)
!
ALLOCATE(ps(nbnd,nbnd))
!
!$acc data copyin(evq) copy(dvpsi,dpsi) create(ps(1:nbnd, 1:nbnd), ps_r(1:nbnd, 1:nbnd))
!$acc data copyin(evq) present_or_copy(dvpsi) copy(dpsi) create(ps(1:nbnd, 1:nbnd), ps_r(1:nbnd, 1:nbnd))
IF (gamma_only) THEN
!$acc kernels
ps_r(:,:) = 0.0d0

View File

@ -2198,14 +2198,17 @@ MODULE realus
INTEGER :: ebnd
!
!-------------------TEMPORARY-----------
INTEGER :: ngk1
LOGICAL :: is_present, acc_is_present
! INTEGER :: ngk1
! LOGICAL :: is_present, acc_is_present
!---------------------------------------
!
!Task groups
!
!The new task group version based on vloc_psi
!print *, "->Real space"
!
!$acc data present_or_copyin(orbital) present_or_copyout(psic)
!
CALL start_clock( 'invfft_orbital' )
!
IF( dffts%has_task_groups ) THEN
@ -2227,11 +2230,11 @@ MODULE realus
CALL wave_g2r( orbital(1:ngk(1),ibnd:ebnd), psic, dffts )
!
!-------------TEMPORARY---------------------
ngk1 = SIZE(psic)
#if defined(_OPENACC)
is_present = acc_is_present(psic,ngk1)
!$acc update self(psic) if (is_present)
#endif
! ngk1 = SIZE(psic)
!#if defined(_OPENACC)
! is_present = acc_is_present(psic,ngk1)
! !$acc update self(psic) if (is_present)
!#endif
!-------------------------------------------
!
IF (PRESENT(conserved)) THEN
@ -2245,6 +2248,8 @@ MODULE realus
!
CALL stop_clock( 'invfft_orbital' )
!
!$acc end data
!
END SUBROUTINE invfft_orbital_gamma
!
!--------------------------------------------------------------------------
@ -2290,14 +2295,16 @@ MODULE realus
COMPLEX(DP), ALLOCATABLE :: psio(:,:)
!
!-------------------TEMPORARY-----------
INTEGER :: ngk1
LOGICAL :: is_present, acc_is_present
! INTEGER :: ngk1
! LOGICAL :: is_present, acc_is_present
!---------------------------------------
!
! ... Task groups
!print *, "->Fourier space"
CALL start_clock( 'fwfft_orbital' )
!
!$acc data present_or_copyin(psic) present_or_copy(orbital)
!
add_to_orbital_=.FALSE. ; IF( PRESENT(add_to_orbital)) add_to_orbital_ = add_to_orbital
!
! ... New task_groups versions
@ -2352,30 +2359,32 @@ MODULE realus
CALL wave_r2g( psic(1:dffts%nnr), psio, dffts )
!
!-------------TEMPORARY---------------------
ngk1 = SIZE(psic)
#if defined(_OPENACC)
is_present = acc_is_present(psic,ngk1)
!$acc update self(psic) if (is_present)
#endif
! ngk1 = SIZE(psic)
!#if defined(_OPENACC)
! is_present = acc_is_present(psic,ngk1)
! !$acc update self(psic) if (is_present)
!#endif
!-------------------------------------------
!
fac = 1.d0
IF ( ibnd<last ) fac = 0.5d0
!
IF ( add_to_orbital_ ) THEN
!$omp parallel do
!$acc parallel loop
! !$omp parallel do
DO j = 1, ngk(1)
orbital(j,ibnd) = orbital(j,ibnd) + fac*psio(j,1)
IF (ibnd<last) orbital(j,ibnd+1) = orbital(j,ibnd+1) + fac*psio(j,2)
ENDDO
!$omp end parallel do
! !$omp end parallel do
ELSE
!$omp parallel do
!$acc parallel loop
! !$omp parallel do
DO j = 1, ngk(1)
orbital(j,ibnd) = fac*psio(j,1)
IF (ibnd<last) orbital(j,ibnd+1) = fac*psio(j,2)
ENDDO
!$omp end parallel do
! !$omp end parallel do
ENDIF
!
DEALLOCATE( psio )
@ -2388,6 +2397,8 @@ MODULE realus
!
ENDIF
!
!$acc end data
!
CALL stop_clock( 'fwfft_orbital' )
!
END SUBROUTINE fwfft_orbital_gamma

View File

@ -973,6 +973,7 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
USE mp_bands, ONLY : nbgrp,inter_bgrp_comm
USE mp, ONLY : mp_sum
USE upf_spinorb, ONLY : fcoef
USE wavefunctions, ONLY : psic
!
! Used to avoid unnecessary memcopy
USE xc_lib, ONLY : xclib_dft_is
@ -1000,6 +1001,7 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
if (gamma_only) then
do ibnd = ibnd_start, ibnd_end, 2
call invfft_orbital_gamma(evc,ibnd,ibnd_end)
!$acc update self(psic)
call calbec_rs_gamma(ibnd,ibnd_end,becp%r)
enddo
call mp_sum(becp%r,inter_bgrp_comm)

View File

@ -66,6 +66,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
#if defined (__ENVIRON)
USE plugin_flags, ONLY : use_environ
USE environ_td_module, ONLY : calc_environ_dpotential
#endif
!
#if defined(__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
@ -82,26 +86,30 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
& w1(:), w2(:)
COMPLEX(DP), ALLOCATABLE :: dvrs_temp(:,:), spsi1(:,:), dvrsc(:,:), &
& dvrssc(:), sevc1_new(:,:,:)
INTEGER :: nnr_siz
!
IF (lr_verbosity > 5) THEN
WRITE(stdout,'("<lr_apply_liouvillian>")')
ENDIF
!
CALL start_clock('lr_apply')
CALL start_clock_gpu('lr_apply')
!
IF (interaction) CALL start_clock('lr_apply_int')
IF (.not.interaction) CALL start_clock('lr_apply_no')
IF (interaction) CALL start_clock_gpu('lr_apply_int')
IF (.not.interaction) CALL start_clock_gpu('lr_apply_no')
!
ALLOCATE( d_deeq(nhm, nhm, nat, nspin) )
d_deeq(:,:,:,:)=0.0d0
!
ALLOCATE( spsi1(npwx, nbnd) )
ALLOCATE( sevc1_new(npwx*npol,nbnd,nks))
!
nnr_siz= dffts%nnr
!$acc data present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyout(evc1_new(1:npwx*npol,1:nbnd,1:nks)) copyout(sevc1_new(1:npwx*npol,1:nbnd,1:nks)) create(spsi1(1:npwx, 1:nbnd)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1))
!
d_deeq(:,:,:,:)=0.0d0
!$acc kernels
spsi1(:,:)=(0.0d0,0.0d0)
!
ALLOCATE(sevc1_new(npwx*npol,nbnd,nks))
sevc1_new(:,:,:) = (0.0d0,0.0d0)
!
evc1_new(:,:,:) = (0.0d0,0.0d0)
!$acc end kernels
!
IF ( interaction ) THEN
!
@ -232,8 +240,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
! Here we add the two terms:
! [H(k) - E(k)] * evc1(k) + dV_HXC * revc0(k)
!
CALL zaxpy(size_evc,CMPLX(1.0d0,0.0d0,kind=DP),&
!$acc host_data use_device(evc1_new, sevc1_new)
CALL zaxpy(size_evc,CMPLX(1.0d0,0.0d0,kind=DP),&
& evc1_new(:,:,:), 1, sevc1_new(:,:,:), 1)
!$acc end host_data
!
ELSEIF ( interaction .and. ltammd ) THEN
!
@ -244,8 +254,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
!
! Here evc1_new contains the interaction
!
!$acc host_data use_device(evc1_new, sevc1_new)
CALL zaxpy(size_evc,CMPLX(0.5d0,0.0d0,kind=DP),&
& evc1_new(:,:,:) , 1, sevc1_new(:,:,:),1)
!$acc end host_data
!
ELSE
!
@ -256,21 +268,27 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
!
ENDIF
!
IF (gstart == 2 .AND. gamma_only ) sevc1_new(1,:,:) = &
& CMPLX( REAL( sevc1_new(1,:,:), DP ), 0.0d0, DP )
!
IF (gstart==2 .and. gamma_only) THEN
DO ik=1,nks
DO ibnd=1,nbnd
IF (abs(aimag(sevc1_new(1,ibnd,ik)))>1.0d-12) THEN
!
CALL errore(' lr_apply_liouvillian ',&
'Imaginary part of G=0 '// &
'component does not equal zero',1)
ENDIF
ENDDO
ENDDO
IF (gstart == 2 .AND. gamma_only ) THEN
!$acc kernels
sevc1_new(1,:,:) = &
& CMPLX( REAL( sevc1_new(1,:,:), DP ), 0.0d0, DP )
!$acc end kernels
ENDIF
!
! IF (gstart==2 .and. gamma_only) THEN
! DO ik=1,nks
! DO ibnd=1,nbnd
! IF (abs(aimag(sevc1_new(1,ibnd,ik)))>1.0d-12) THEN
! !
! CALL errore(' lr_apply_liouvillian ',&
! 'Imaginary part of G=0 '// &
! 'component does not equal zero',1)
! ENDIF
! ENDDO
! ENDDO
! ENDIF
!
! Apply the projector on empty states P_c^+.
! Note: The projector P_c^+ can be applied only
@ -289,10 +307,13 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
!
CALL orthogonalize(sevc1_new(:,:,ik), evc0(:,:,ik), ik, ik, &
& sevc0(:,:,ik), ngk(ik), .true.)
!$acc kernels
sevc1_new(:,:,ik) = -sevc1_new(:,:,ik)
!$acc end kernels
!
ENDDO
!
!
! Here we apply the S^{-1} operator.
! See equations after Eq.(47) of B. Walker et al., J. Chem. Phys.
! 127, 164106 (2007).
@ -305,16 +326,17 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
& sevc1_new(1,1,ik), evc1_new(1,1,ik))
ENDDO
!
!$acc end data
IF (allocated(dvrs)) DEALLOCATE(dvrs)
IF (allocated(dvrss)) DEALLOCATE(dvrss)
DEALLOCATE(d_deeq)
DEALLOCATE(spsi1)
DEALLOCATE(sevc1_new)
!
IF (interaction) CALL stop_clock('lr_apply_int')
IF (.not.interaction) CALL stop_clock('lr_apply_no')
IF (interaction) CALL stop_clock_gpu('lr_apply_int')
IF (.not.interaction) CALL stop_clock_gpu('lr_apply_no')
!
CALL stop_clock('lr_apply')
CALL stop_clock_gpu('lr_apply')
!
RETURN
!
@ -330,13 +352,17 @@ CONTAINS
USE mp_global, ONLY : ibnd_start, ibnd_end, inter_bgrp_comm
USE mp, ONLY : mp_sum
USE lr_exx_kernel, ONLY : lr_exx_sum_int
#if defined(__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
!
REAL(DP), ALLOCATABLE :: becp2(:,:)
REAL(DP), ALLOCATABLE :: tg_dvrss(:)
INTEGER :: v_siz, incr, ioff
INTEGER :: ibnd_start_gamma, ibnd_end_gamma
COMPLEX(DP) :: coef
INTEGER :: v_siz, incr, ioff, ir, nnr_siz
INTEGER :: ibnd_start_gamma, ibnd_end_gamma, ibnd, ngk1
!
incr = 2
!
@ -350,9 +376,13 @@ CONTAINS
! Now apply to the ground state wavefunctions
! and convert to real space
!
nnr_siz= dffts%nnr
!$acc data create (psic(1:nnr_siz))
!
IF ( interaction ) THEN
!
CALL start_clock('interaction')
!
CALL start_clock_gpu('interaction')
!
IF (nkb > 0 .and. okvan) THEN
! calculation of becp2
@ -393,23 +423,25 @@ CONTAINS
ENDDO
!end: calculation of becp2
ENDIF
IF ( dffts%has_task_groups ) THEN
!
v_siz = dffts%nnr_tg
!
incr = 2 * fftx_ntgrp(dffts)
!
ALLOCATE( tg_dvrss(1:v_siz) )
tg_dvrss=0.0d0
!
CALL tg_gather(dffts, dvrss, tg_dvrss)
!
ENDIF
!
IF ( dffts%has_task_groups ) THEN
!
v_siz = dffts%nnr_tg
!
incr = 2 * fftx_ntgrp(dffts)
!
ALLOCATE( tg_dvrss(1:v_siz) )
tg_dvrss=0.0d0
!
CALL tg_gather(dffts, dvrss, tg_dvrss)
!
ENDIF
!
! evc1_new is used as a container for the interaction
!
!$acc kernels
evc1_new(:,:,:) = (0.0d0,0.0d0)
!$acc end kernels
!
ibnd_start_gamma = ibnd_start
IF (MOD(ibnd_start, 2)==0) ibnd_start_gamma = ibnd_start + 1
@ -417,8 +449,8 @@ CONTAINS
!
IF (lr_exx) CALL lr_exx_sum_int()
!
!$acc enter data copyin(dvrss(1:nnr_siz))
DO ibnd = ibnd_start_gamma ,ibnd_end_gamma, incr
! DO ibnd = 1,nbnd,2
!
! Product with the potential vrs = (vltot+vr)
! revc0 is on smooth grid. psic is used up to smooth grid
@ -433,7 +465,9 @@ CONTAINS
!
ELSE
!
DO ir = 1,dffts%nnr
!DO ir = 1,dffts%nnr
!$acc parallel loop
DO ir = 1, nnr_siz
!
psic(ir) = revc0(ir,ibnd,1)*CMPLX(dvrss(ir),0.0d0,DP)
!
@ -446,65 +480,65 @@ CONTAINS
ENDIF
!
IF (real_space .and. okvan .and. nkb > 0) THEN
!THE REAL SPACE PART (modified from s_psi)
!fac = sqrt(omega)
!
ijkb0 = 0
iqs = 0
jqs = 0
!
DO nt = 1, ntyp
!
DO ia = 1, nat
!THE REAL SPACE PART (modified from s_psi)
!fac = sqrt(omega)
!
ijkb0 = 0
iqs = 0
jqs = 0
!
DO nt = 1, ntyp
!
DO ia = 1, nat
!
IF ( ityp(ia) == nt ) THEN
!
IF ( ityp(ia) == nt ) THEN
!
mbia = maxbox_beta(ia)
ALLOCATE( w1(nh(nt)), w2(nh(nt)) )
w1 = 0.D0
w2 = 0.D0
!
DO ih = 1, nh(nt)
!
DO jh = 1, nh(nt)
!
jkb = ijkb0 + jh
w1(ih) = w1(ih) + becp2(jkb, ibnd)
IF ( ibnd+1 <= nbnd ) w2(ih) = w2(ih) + &
mbia = maxbox_beta(ia)
ALLOCATE( w1(nh(nt)), w2(nh(nt)) )
w1 = 0.D0
w2 = 0.D0
!
DO ih = 1, nh(nt)
!
DO jh = 1, nh(nt)
!
jkb = ijkb0 + jh
w1(ih) = w1(ih) + becp2(jkb, ibnd)
IF ( ibnd+1 <= nbnd ) w2(ih) = w2(ih) + &
&becp2(jkb, ibnd+1)
!
ENDDO
!
ENDDO
!
!w1 = w1 * fac
!w2 = w2 * fac
ijkb0 = ijkb0 + nh(nt)
!
DO ih = 1, nh(nt)
!
DO ir = 1, mbia
!
iqs = jqs + ir
psic( box_beta(box0(ia)+ir) ) = &
!
ENDDO
!
ENDDO
!
!w1 = w1 * fac
!w2 = w2 * fac
ijkb0 = ijkb0 + nh(nt)
!
DO ih = 1, nh(nt)
!
DO ir = 1, mbia
!
iqs = jqs + ir
psic( box_beta(box0(ia)+ir) ) = &
&psic( box_beta(box0(ia)+ir) ) + &
&betasave(box0(ia)+ir,ih)*&
&CMPLX( w1(ih), w2(ih), KIND=dp )
!
ENDDO
!
jqs = iqs
!
ENDDO
!
!
ENDDO
!
jqs = iqs
!
ENDDO
!
DEALLOCATE( w1, w2 )
!
ENDIF
!
ENDDO
!
ENDDO
!
ENDIF
!
ENDDO
!
ENDDO
!
ENDIF
!
! Back to reciprocal space
@ -513,8 +547,12 @@ CONTAINS
!
ENDDO
!
!$acc exit data delete (dvrss)
!
#if defined(__MPI)
!$acc host_data use_device(evc1_new)
CALL mp_sum( evc1_new(:,:,1), inter_bgrp_comm )
!$acc end host_data
#endif
IF (dffts%has_task_groups) DEALLOCATE (tg_dvrss)
!
@ -525,7 +563,7 @@ CONTAINS
!
ENDIF
!
CALL stop_clock('interaction')
CALL stop_clock_gpu('interaction')
!
ENDIF
!
@ -541,11 +579,9 @@ CONTAINS
! Compute sevc1_new = H*evc1
!
#if defined(__CUDA)
!$acc data copyin(evc1) copyout(sevc1_new)
!$acc host_data use_device(evc1, sevc1_new)
CALL h_psi_gpu (npwx,ngk(1),nbnd,evc1(1,1,1),sevc1_new(1,1,1))
!$acc end host_data
!$acc end data
#else
CALL h_psi(npwx,ngk(1),nbnd,evc1(1,1,1),sevc1_new(1,1,1))
#endif
@ -562,11 +598,9 @@ CONTAINS
ENDDO
ELSE
#if defined(__CUDA)
!$acc data copyin(evc1) copyout(spsi1)
!$acc host_data use_device(evc1, spsi1)
CALL s_psi_acc (npwx,ngk(1),nbnd,evc1(1,1,1),spsi1)
!$acc end host_data
!$acc end data
#else
CALL s_psi(npwx,ngk(1),nbnd,evc1(1,1,1),spsi1)
#endif
@ -576,11 +610,16 @@ CONTAINS
!
DO ibnd = 1,nbnd
!
CALL zaxpy(ngk(1), CMPLX(-(et(ibnd,1)-scissor),0.0d0,DP), &
& spsi1(:,ibnd), 1, sevc1_new(:,ibnd,1), 1)
coef = CMPLX(-(et(ibnd,1)-scissor),0.0d0)
!$acc host_data use_device(spsi1, sevc1_new)
CALL zaxpy(ngk(1), coef, &
& spsi1(1,ibnd), 1, sevc1_new(1,ibnd,1), 1)
!$acc end host_data
!
ENDDO
!
!$acc end data
!
IF ( nkb > 0 .and. okvan ) DEALLOCATE(becp2)
!
RETURN

View File

@ -60,6 +60,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
USE lr_exx_kernel, ONLY : lr_exx_kernel_int, revc_int,&
& revc_int_c
USE constants, ONLY : eps12
USE qpoint, ONLY : nksq
USE fft_helper_subroutines
!
IMPLICIT NONE
@ -75,6 +76,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
INTEGER :: ir, ik, ibnd, jbnd, ig, ijkb0, np, na
INTEGER :: ijh,ih,jh,ikb,jkb,is
INTEGER :: i, j, k, l
INTEGER :: v_siz, nnr_siz, irho
REAL(kind=dp) :: w1, w2, scal, rho_sum
! These are temporary buffers for the response
REAL(kind=dp), ALLOCATABLE :: rho_sum_resp_x(:), rho_sum_resp_y(:),&
@ -86,13 +88,23 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
WRITE(stdout,'("<lr_calc_dens>")')
ENDIF
!
CALL start_clock('lr_calc_dens')
CALL start_clock_gpu('lr_calc_dens')
!
ALLOCATE( psic(dfftp%nnr) )
v_siz = dfftp%nnr
nnr_siz = dffts%nnr
!
!$acc data create(psic(1:v_siz)) present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1:nksq)) copyout( rho_1(1:v_siz,1:nspin_mag))
!
!$acc kernels
psic(:) = (0.0d0, 0.0d0)
!$acc end kernels
!
IF (gamma_only) THEN
!$acc kernels
rho_1(:,:) = 0.0d0
!$acc end kernels
ELSE
rho_1c(:,:) = (0.0d0, 0.0d0)
ENDIF
@ -105,10 +117,17 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
! If a double grid is used, interpolate onto the fine grid
!
IF ( doublegrid ) CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
IF ( doublegrid ) THEN
print *, 'doublegrid', doublegrid
!$acc host_data use_device(rho_1(:,1))
CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
!$acc end host_data
ENDIF
!
#if defined(__MPI)
!$acc host_data use_device(rho_1)
CALL mp_sum(rho_1, inter_bgrp_comm)
!$acc end host_data
#endif
!
ELSE
@ -162,11 +181,11 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
! The psic workspace can present a memory bottleneck
!
DEALLOCATE ( psic )
!
#if defined(__MPI)
IF(gamma_only) THEN
!$acc host_data use_device(rho_1)
CALL mp_sum(rho_1, inter_pool_comm)
!$acc end host_data
ELSE
CALL mp_sum(rho_1c, inter_pool_comm)
ENDIF
@ -179,7 +198,11 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
DO is = 1, nspin_mag
!
rho_sum = 0.0d0
rho_sum = SUM(rho_1(:,is))
! rho_sum = SUM(rho_1(:,is))
!$acc parallel loop private(rho_sum) copy(rho_sum)
do irho = 1, v_siz
rho_sum = rho_sum + rho_1(i,is)
enddo
!
#if defined(__MPI)
CALL mp_sum(rho_sum, intra_bgrp_comm )
@ -213,6 +236,10 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
ENDIF
!
!$acc end data
!
DEALLOCATE ( psic )
!
IF (charge_response == 2 .AND. LR_iteration /=0) THEN
!
ALLOCATE( rho_sum_resp_x( dfftp%nr1 ) )
@ -335,7 +362,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
ENDIF
!
CALL stop_clock('lr_calc_dens')
CALL stop_clock_gpu('lr_calc_dens')
!
RETURN
!
@ -360,15 +387,18 @@ CONTAINS
IMPLICIT NONE
!
INTEGER :: ibnd_start_gamma, ibnd_end_gamma
INTEGER :: ibnd_start_gamma, ibnd_end_gamma, ibnd, ebnd
INTEGER :: v_siz, incr, ir3, ioff, ioff_tg, nxyp, idx
REAL(DP), ALLOCATABLE :: tg_rho(:)
INTEGER :: nnr_siz, pnnr_siz, ir
!
ibnd_start_gamma = ibnd_start
IF (MOD(ibnd_start, 2)==0) ibnd_start_gamma = ibnd_start + 1
ibnd_end_gamma = MAX(ibnd_end, ibnd_start_gamma)
!
incr = 2
nnr_siz = dffts%nnr
pnnr_siz = dfftp%nnr
!
IF ( dffts%has_task_groups ) THEN
!
@ -385,6 +415,9 @@ CONTAINS
!
! FFT: evc1 -> psic
!
ebnd = ibnd
IF ( ibnd < nbnd ) ebnd = ebnd + 1
!
CALL invfft_orbital_gamma(evc1(:,:,1),ibnd,nbnd)
!
IF (dffts%has_task_groups) THEN
@ -446,9 +479,10 @@ CONTAINS
! in no way the final response charge density.
! The loop is over real space points.
!
DO ir = 1, dffts%nnr
!$acc parallel loop
DO ir = 1, nnr_siz
rho_1(ir,1) = rho_1(ir,1) &
+ 2.0d0*(w1*real(revc0(ir,ibnd,1),dp)*real(psic(ir),dp)&
+ 2.0d0*(w1*dble(revc0(ir,ibnd,1))*dble(psic(ir))&
+ w2*aimag(revc0(ir,ibnd,1))*aimag(psic(ir)))
ENDDO
!

View File

@ -46,7 +46,6 @@ PROGRAM lr_dav_main
LOGICAL, EXTERNAL :: check_gpu_support
use_gpu = check_gpu_support()
if(use_gpu) Call errore('lr_dav_main', 'turbo_davidson with GPU NYI', 1)
#if defined(__MPI)
CALL mp_startup ( )
@ -97,6 +96,7 @@ PROGRAM lr_dav_main
CALL lr_dv_setup()
! Davidson loop
!$acc data copyin(revc0(:,:,:))
if (precondition) write(stdout,'(/5x,"Precondition is used in the algorithm,")')
do while (.not. dav_conv .and. dav_iter .lt. max_iter)
dav_iter=dav_iter+1
@ -111,13 +111,15 @@ PROGRAM lr_dav_main
! Check to see if the wall time limit has been exceeded.
if ( check_stop_now() ) then
call lr_write_restart_dav()
goto 100
!! goto 100
exit
endif
!
enddo
!$acc end data
! call check_hermitian()
! Extract physical meaning from the solution
if ( check_stop_now() ) goto 100
call interpret_eign('END')
! The check_orth at the end may take quite a lot of time in the case of
! USPP because we didn't store the S* vector basis. Turn this step on only

File diff suppressed because it is too large Load Diff

View File

@ -301,7 +301,7 @@ SUBROUTINE one_lanczos_step()
!
ENDIF
!
! X. Ge: To increase the stability, apply lr_ortho.
! X. Ge: To increase the stability, apply orthogonalize.
! I.Timrov: Actually, without this trick, it turns out that
! the Lanczos chain is not stable when pseudo_hermitian=.false.,
! because there is a warning from lr_calc_dens that the integral of
@ -311,9 +311,11 @@ SUBROUTINE one_lanczos_step()
IF (.not.eels .and. .not. magnons) THEN
!
DO ik=1, nks
! CALL orthogonalize(evc1(:,:,ik,1), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),npwx,.true.)
CALL lr_ortho(evc1(:,:,ik,1), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
IF (.not. pseudo_hermitian) &
CALL lr_ortho(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
CALL lr_ortho(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
! CALL orthogonalize(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),npwx,.true.)
ENDDO
!
ENDIF

View File

@ -355,6 +355,21 @@ SUBROUTINE lr_readin
!
ENDIF
!
IF (davidson) THEN
!
! check and set num_init and num_basis_max
!
IF (num_init < num_eign ) THEN
WRITE(stdout,'(5X,"num_init is too small, set to num_init = 2*num_eign")')
num_init = 2 * num_eign
ENDIF
IF (num_basis_max < 2*num_init ) THEN
WRITE(stdout,'(5X,"num_basis_max is too small, set to num_basis_max = 4*num_init")')
num_basis_max = 4 * num_init
ENDIF
!
ENDIF
!
#if defined(__MPI)
ENDIF
!