Merge branch 'qe_dav' into 'develop'

Porting on GPU of turbo_Davidson with openAcc

See merge request QEF/q-e!2311
This commit is contained in:
giannozz 2024-03-25 07:53:18 +00:00
commit 1b7d1c2295
11 changed files with 834 additions and 487 deletions

View File

@ -54,8 +54,11 @@ FUNCTION lr_dot(x,y)
!
IF (gamma_only) THEN
!
!$acc data present_or_copyin(x,y) copyin(wg) copy(temp_gamma)
CALL lr_dot_gamma()
!$acc end data
lr_dot = cmplx(temp_gamma,0.0d0,dp)
lr_dot = lr_dot/degspin
!
ELSEIF (noncolin) THEN
!
@ -65,11 +68,10 @@ FUNCTION lr_dot(x,y)
ELSE
!
CALL lr_dot_k()
lr_dot = lr_dot/degspin
!
ENDIF
!
lr_dot = lr_dot/degspin
!
CALL stop_clock ('lr_dot')
!
RETURN
@ -81,6 +83,28 @@ CONTAINS
! Optical case: gamma_only
! Noncollinear case is not implemented
!
USE wvfct, ONLY : npwx,nbnd,wg
#if defined(__CUDA)
use cublas
#endif
!
INTEGER :: ibnd
REAL(DP), EXTERNAL :: MYDDOT_VECTOR_GPU
!$acc routine(MYDDOT_VECTOR_GPU) vector
!
! !$acc data present(x,y,wg,temp_gamma)
#if defined(__CUDA)
!$acc parallel loop reduction(temp_gamma)
DO ibnd=1,nbnd
!
temp_gamma = temp_gamma + 2.D0*wg(ibnd,1)*MYDDOT_VECTOR_GPU(2*ngk(1),x(:,ibnd,1),y(:,ibnd,1))
!
! G=0 has been accounted twice, so we subtract one contribution.
!
IF (gstart==2) temp_gamma = temp_gamma - wg(ibnd,1)*dble(x(1,ibnd,1))*dble(y(1,ibnd,1))
!
ENDDO
#else
DO ibnd=1,nbnd
!
temp_gamma = temp_gamma + 2.D0*wg(ibnd,1)*DDOT(2*ngk(1),x(:,ibnd,1),1,y(:,ibnd,1),1)
@ -90,11 +114,15 @@ CONTAINS
IF (gstart==2) temp_gamma = temp_gamma - wg(ibnd,1)*dble(x(1,ibnd,1))*dble(y(1,ibnd,1))
!
ENDDO
!
#if defined(__MPI)
CALL mp_sum(temp_gamma, intra_bgrp_comm)
#endif
!
#if defined(__MPI)
!$acc host_data use_device(temp_gamma)
CALL mp_sum(temp_gamma, intra_bgrp_comm)
!$acc end host_data
#endif
!
! !$acc end data
RETURN
!
END SUBROUTINE lr_dot_gamma

View File

@ -46,7 +46,9 @@ SUBROUTINE lr_sm1_psi (ik, lda, n, m, psi, spsi)
CALL start_clock( 'lr_sm1_psi' )
!
IF ( gamma_only ) THEN
!$acc data present_or_copyin(psi) present_or_copyout(spsi)
CALL sm1_psi_gamma()
!$acc end data
ELSEIF (noncolin) THEN
CALL sm1_psi_nc()
ELSE
@ -74,6 +76,11 @@ CONTAINS
calbec_rs_gamma, add_vuspsir_gamma, &
v_loc_psir, s_psir_gamma
USE lrus, ONLY : bbg
USE uspp, ONLY : vkb
#if defined(__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
!
@ -83,9 +90,15 @@ CONTAINS
! counters
REAL(DP), ALLOCATABLE :: ps(:,:)
!
!
! Initialize spsi : spsi = psi
!
CALL ZCOPY( lda * npol * m, psi, 1, spsi, 1 )
! !$acc data present(psi,spsi)
!
!$acc kernels
spsi(:,:) = psi(:,:)
!$acc end kernels
!!CALL ZCOPY( lda * npol * m, psi, 1, spsi, 1 )
!
IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
!
@ -114,8 +127,13 @@ CONTAINS
!
! Step 2 : |spsi> = S^{-1} * |psi> = |psi> + ps * |beta>
!
!$acc enter data copyin(ps)
!$acc host_data use_device(vkb, ps, spsi)
call DGEMM('N','N',2*n,m,nkb,1.d0,vkb,2*lda,ps,nkb,1.d0,spsi,2*lda)
!$acc end host_data
!
!$acc exit data delete(ps)
! !$acc end data
DEALLOCATE(ps)
!
RETURN

View File

@ -69,7 +69,7 @@ SUBROUTINE orthogonalize(dvpsi, evq, ikk, ikq, dpsi, npwq, dpsi_computed)
!
ALLOCATE(ps(nbnd,nbnd))
!
!$acc data copyin(evq) copy(dvpsi,dpsi) create(ps(1:nbnd, 1:nbnd), ps_r(1:nbnd, 1:nbnd))
!$acc data copyin(evq) present_or_copy(dvpsi) copy(dpsi) create(ps(1:nbnd, 1:nbnd), ps_r(1:nbnd, 1:nbnd))
IF (gamma_only) THEN
!$acc kernels
ps_r(:,:) = 0.0d0

View File

@ -2198,14 +2198,17 @@ MODULE realus
INTEGER :: ebnd
!
!-------------------TEMPORARY-----------
INTEGER :: ngk1
LOGICAL :: is_present, acc_is_present
! INTEGER :: ngk1
! LOGICAL :: is_present, acc_is_present
!---------------------------------------
!
!Task groups
!
!The new task group version based on vloc_psi
!print *, "->Real space"
!
!$acc data present_or_copyin(orbital) present_or_copyout(psic)
!
CALL start_clock( 'invfft_orbital' )
!
IF( dffts%has_task_groups ) THEN
@ -2227,11 +2230,11 @@ MODULE realus
CALL wave_g2r( orbital(1:ngk(1),ibnd:ebnd), psic, dffts )
!
!-------------TEMPORARY---------------------
ngk1 = SIZE(psic)
#if defined(_OPENACC)
is_present = acc_is_present(psic,ngk1)
!$acc update self(psic) if (is_present)
#endif
! ngk1 = SIZE(psic)
!#if defined(_OPENACC)
! is_present = acc_is_present(psic,ngk1)
! !$acc update self(psic) if (is_present)
!#endif
!-------------------------------------------
!
IF (PRESENT(conserved)) THEN
@ -2245,6 +2248,8 @@ MODULE realus
!
CALL stop_clock( 'invfft_orbital' )
!
!$acc end data
!
END SUBROUTINE invfft_orbital_gamma
!
!--------------------------------------------------------------------------
@ -2290,14 +2295,16 @@ MODULE realus
COMPLEX(DP), ALLOCATABLE :: psio(:,:)
!
!-------------------TEMPORARY-----------
INTEGER :: ngk1
LOGICAL :: is_present, acc_is_present
! INTEGER :: ngk1
! LOGICAL :: is_present, acc_is_present
!---------------------------------------
!
! ... Task groups
!print *, "->Fourier space"
CALL start_clock( 'fwfft_orbital' )
!
!$acc data present_or_copyin(psic) present_or_copy(orbital)
!
add_to_orbital_=.FALSE. ; IF( PRESENT(add_to_orbital)) add_to_orbital_ = add_to_orbital
!
! ... New task_groups versions
@ -2352,30 +2359,32 @@ MODULE realus
CALL wave_r2g( psic(1:dffts%nnr), psio, dffts )
!
!-------------TEMPORARY---------------------
ngk1 = SIZE(psic)
#if defined(_OPENACC)
is_present = acc_is_present(psic,ngk1)
!$acc update self(psic) if (is_present)
#endif
! ngk1 = SIZE(psic)
!#if defined(_OPENACC)
! is_present = acc_is_present(psic,ngk1)
! !$acc update self(psic) if (is_present)
!#endif
!-------------------------------------------
!
fac = 1.d0
IF ( ibnd<last ) fac = 0.5d0
!
IF ( add_to_orbital_ ) THEN
!$omp parallel do
!$acc parallel loop
! !$omp parallel do
DO j = 1, ngk(1)
orbital(j,ibnd) = orbital(j,ibnd) + fac*psio(j,1)
IF (ibnd<last) orbital(j,ibnd+1) = orbital(j,ibnd+1) + fac*psio(j,2)
ENDDO
!$omp end parallel do
! !$omp end parallel do
ELSE
!$omp parallel do
!$acc parallel loop
! !$omp parallel do
DO j = 1, ngk(1)
orbital(j,ibnd) = fac*psio(j,1)
IF (ibnd<last) orbital(j,ibnd+1) = fac*psio(j,2)
ENDDO
!$omp end parallel do
! !$omp end parallel do
ENDIF
!
DEALLOCATE( psio )
@ -2388,6 +2397,8 @@ MODULE realus
!
ENDIF
!
!$acc end data
!
CALL stop_clock( 'fwfft_orbital' )
!
END SUBROUTINE fwfft_orbital_gamma

View File

@ -973,6 +973,7 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
USE mp_bands, ONLY : nbgrp,inter_bgrp_comm
USE mp, ONLY : mp_sum
USE upf_spinorb, ONLY : fcoef
USE wavefunctions, ONLY : psic
!
! Used to avoid unnecessary memcopy
USE xc_lib, ONLY : xclib_dft_is
@ -1000,6 +1001,7 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
if (gamma_only) then
do ibnd = ibnd_start, ibnd_end, 2
call invfft_orbital_gamma(evc,ibnd,ibnd_end)
!$acc update self(psic)
call calbec_rs_gamma(ibnd,ibnd_end,becp%r)
enddo
call mp_sum(becp%r,inter_bgrp_comm)

View File

@ -66,6 +66,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
#if defined (__ENVIRON)
USE plugin_flags, ONLY : use_environ
USE environ_td_module, ONLY : calc_environ_dpotential
#endif
!
#if defined(__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
@ -82,26 +86,30 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
& w1(:), w2(:)
COMPLEX(DP), ALLOCATABLE :: dvrs_temp(:,:), spsi1(:,:), dvrsc(:,:), &
& dvrssc(:), sevc1_new(:,:,:)
INTEGER :: nnr_siz
!
IF (lr_verbosity > 5) THEN
WRITE(stdout,'("<lr_apply_liouvillian>")')
ENDIF
!
CALL start_clock('lr_apply')
CALL start_clock_gpu('lr_apply')
!
IF (interaction) CALL start_clock('lr_apply_int')
IF (.not.interaction) CALL start_clock('lr_apply_no')
IF (interaction) CALL start_clock_gpu('lr_apply_int')
IF (.not.interaction) CALL start_clock_gpu('lr_apply_no')
!
ALLOCATE( d_deeq(nhm, nhm, nat, nspin) )
d_deeq(:,:,:,:)=0.0d0
!
ALLOCATE( spsi1(npwx, nbnd) )
ALLOCATE( sevc1_new(npwx*npol,nbnd,nks))
!
nnr_siz= dffts%nnr
!$acc data present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyout(evc1_new(1:npwx*npol,1:nbnd,1:nks)) copyout(sevc1_new(1:npwx*npol,1:nbnd,1:nks)) create(spsi1(1:npwx, 1:nbnd)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1))
!
d_deeq(:,:,:,:)=0.0d0
!$acc kernels
spsi1(:,:)=(0.0d0,0.0d0)
!
ALLOCATE(sevc1_new(npwx*npol,nbnd,nks))
sevc1_new(:,:,:) = (0.0d0,0.0d0)
!
evc1_new(:,:,:) = (0.0d0,0.0d0)
!$acc end kernels
!
IF ( interaction ) THEN
!
@ -232,8 +240,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
! Here we add the two terms:
! [H(k) - E(k)] * evc1(k) + dV_HXC * revc0(k)
!
!$acc host_data use_device(evc1_new, sevc1_new)
CALL zaxpy(size_evc,CMPLX(1.0d0,0.0d0,kind=DP),&
& evc1_new(:,:,:), 1, sevc1_new(:,:,:), 1)
!$acc end host_data
!
ELSEIF ( interaction .and. ltammd ) THEN
!
@ -244,8 +254,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
!
! Here evc1_new contains the interaction
!
!$acc host_data use_device(evc1_new, sevc1_new)
CALL zaxpy(size_evc,CMPLX(0.5d0,0.0d0,kind=DP),&
& evc1_new(:,:,:) , 1, sevc1_new(:,:,:),1)
!$acc end host_data
!
ELSE
!
@ -256,21 +268,27 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
!
ENDIF
!
IF (gstart == 2 .AND. gamma_only ) sevc1_new(1,:,:) = &
!
IF (gstart == 2 .AND. gamma_only ) THEN
!$acc kernels
sevc1_new(1,:,:) = &
& CMPLX( REAL( sevc1_new(1,:,:), DP ), 0.0d0, DP )
!
IF (gstart==2 .and. gamma_only) THEN
DO ik=1,nks
DO ibnd=1,nbnd
IF (abs(aimag(sevc1_new(1,ibnd,ik)))>1.0d-12) THEN
!
CALL errore(' lr_apply_liouvillian ',&
'Imaginary part of G=0 '// &
'component does not equal zero',1)
ENDIF
ENDDO
ENDDO
!$acc end kernels
ENDIF
!
! IF (gstart==2 .and. gamma_only) THEN
! DO ik=1,nks
! DO ibnd=1,nbnd
! IF (abs(aimag(sevc1_new(1,ibnd,ik)))>1.0d-12) THEN
! !
! CALL errore(' lr_apply_liouvillian ',&
! 'Imaginary part of G=0 '// &
! 'component does not equal zero',1)
! ENDIF
! ENDDO
! ENDDO
! ENDIF
!
! Apply the projector on empty states P_c^+.
! Note: The projector P_c^+ can be applied only
@ -289,10 +307,13 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
!
CALL orthogonalize(sevc1_new(:,:,ik), evc0(:,:,ik), ik, ik, &
& sevc0(:,:,ik), ngk(ik), .true.)
!$acc kernels
sevc1_new(:,:,ik) = -sevc1_new(:,:,ik)
!$acc end kernels
!
ENDDO
!
!
! Here we apply the S^{-1} operator.
! See equations after Eq.(47) of B. Walker et al., J. Chem. Phys.
! 127, 164106 (2007).
@ -305,16 +326,17 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
& sevc1_new(1,1,ik), evc1_new(1,1,ik))
ENDDO
!
!$acc end data
IF (allocated(dvrs)) DEALLOCATE(dvrs)
IF (allocated(dvrss)) DEALLOCATE(dvrss)
DEALLOCATE(d_deeq)
DEALLOCATE(spsi1)
DEALLOCATE(sevc1_new)
!
IF (interaction) CALL stop_clock('lr_apply_int')
IF (.not.interaction) CALL stop_clock('lr_apply_no')
IF (interaction) CALL stop_clock_gpu('lr_apply_int')
IF (.not.interaction) CALL stop_clock_gpu('lr_apply_no')
!
CALL stop_clock('lr_apply')
CALL stop_clock_gpu('lr_apply')
!
RETURN
!
@ -330,13 +352,17 @@ CONTAINS
USE mp_global, ONLY : ibnd_start, ibnd_end, inter_bgrp_comm
USE mp, ONLY : mp_sum
USE lr_exx_kernel, ONLY : lr_exx_sum_int
#if defined(__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
!
REAL(DP), ALLOCATABLE :: becp2(:,:)
REAL(DP), ALLOCATABLE :: tg_dvrss(:)
INTEGER :: v_siz, incr, ioff
INTEGER :: ibnd_start_gamma, ibnd_end_gamma
COMPLEX(DP) :: coef
INTEGER :: v_siz, incr, ioff, ir, nnr_siz
INTEGER :: ibnd_start_gamma, ibnd_end_gamma, ibnd, ngk1
!
incr = 2
!
@ -350,9 +376,13 @@ CONTAINS
! Now apply to the ground state wavefunctions
! and convert to real space
!
nnr_siz= dffts%nnr
!$acc data create (psic(1:nnr_siz))
!
IF ( interaction ) THEN
!
CALL start_clock('interaction')
!
CALL start_clock_gpu('interaction')
!
IF (nkb > 0 .and. okvan) THEN
! calculation of becp2
@ -393,7 +423,7 @@ CONTAINS
ENDDO
!end: calculation of becp2
ENDIF
!
IF ( dffts%has_task_groups ) THEN
!
v_siz = dffts%nnr_tg
@ -409,7 +439,9 @@ CONTAINS
!
! evc1_new is used as a container for the interaction
!
!$acc kernels
evc1_new(:,:,:) = (0.0d0,0.0d0)
!$acc end kernels
!
ibnd_start_gamma = ibnd_start
IF (MOD(ibnd_start, 2)==0) ibnd_start_gamma = ibnd_start + 1
@ -417,8 +449,8 @@ CONTAINS
!
IF (lr_exx) CALL lr_exx_sum_int()
!
!$acc enter data copyin(dvrss(1:nnr_siz))
DO ibnd = ibnd_start_gamma ,ibnd_end_gamma, incr
! DO ibnd = 1,nbnd,2
!
! Product with the potential vrs = (vltot+vr)
! revc0 is on smooth grid. psic is used up to smooth grid
@ -433,7 +465,9 @@ CONTAINS
!
ELSE
!
DO ir = 1,dffts%nnr
!DO ir = 1,dffts%nnr
!$acc parallel loop
DO ir = 1, nnr_siz
!
psic(ir) = revc0(ir,ibnd,1)*CMPLX(dvrss(ir),0.0d0,DP)
!
@ -504,7 +538,7 @@ CONTAINS
ENDDO
!
ENDDO
!
ENDIF
!
! Back to reciprocal space
@ -513,8 +547,12 @@ CONTAINS
!
ENDDO
!
!$acc exit data delete (dvrss)
!
#if defined(__MPI)
!$acc host_data use_device(evc1_new)
CALL mp_sum( evc1_new(:,:,1), inter_bgrp_comm )
!$acc end host_data
#endif
IF (dffts%has_task_groups) DEALLOCATE (tg_dvrss)
!
@ -525,7 +563,7 @@ CONTAINS
!
ENDIF
!
CALL stop_clock('interaction')
CALL stop_clock_gpu('interaction')
!
ENDIF
!
@ -541,11 +579,9 @@ CONTAINS
! Compute sevc1_new = H*evc1
!
#if defined(__CUDA)
!$acc data copyin(evc1) copyout(sevc1_new)
!$acc host_data use_device(evc1, sevc1_new)
CALL h_psi_gpu (npwx,ngk(1),nbnd,evc1(1,1,1),sevc1_new(1,1,1))
!$acc end host_data
!$acc end data
#else
CALL h_psi(npwx,ngk(1),nbnd,evc1(1,1,1),sevc1_new(1,1,1))
#endif
@ -562,11 +598,9 @@ CONTAINS
ENDDO
ELSE
#if defined(__CUDA)
!$acc data copyin(evc1) copyout(spsi1)
!$acc host_data use_device(evc1, spsi1)
CALL s_psi_acc (npwx,ngk(1),nbnd,evc1(1,1,1),spsi1)
!$acc end host_data
!$acc end data
#else
CALL s_psi(npwx,ngk(1),nbnd,evc1(1,1,1),spsi1)
#endif
@ -576,11 +610,16 @@ CONTAINS
!
DO ibnd = 1,nbnd
!
CALL zaxpy(ngk(1), CMPLX(-(et(ibnd,1)-scissor),0.0d0,DP), &
& spsi1(:,ibnd), 1, sevc1_new(:,ibnd,1), 1)
coef = CMPLX(-(et(ibnd,1)-scissor),0.0d0)
!$acc host_data use_device(spsi1, sevc1_new)
CALL zaxpy(ngk(1), coef, &
& spsi1(1,ibnd), 1, sevc1_new(1,ibnd,1), 1)
!$acc end host_data
!
ENDDO
!
!$acc end data
!
IF ( nkb > 0 .and. okvan ) DEALLOCATE(becp2)
!
RETURN

View File

@ -60,6 +60,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
USE lr_exx_kernel, ONLY : lr_exx_kernel_int, revc_int,&
& revc_int_c
USE constants, ONLY : eps12
USE qpoint, ONLY : nksq
USE fft_helper_subroutines
!
IMPLICIT NONE
@ -75,6 +76,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
INTEGER :: ir, ik, ibnd, jbnd, ig, ijkb0, np, na
INTEGER :: ijh,ih,jh,ikb,jkb,is
INTEGER :: i, j, k, l
INTEGER :: v_siz, nnr_siz, irho
REAL(kind=dp) :: w1, w2, scal, rho_sum
! These are temporary buffers for the response
REAL(kind=dp), ALLOCATABLE :: rho_sum_resp_x(:), rho_sum_resp_y(:),&
@ -86,13 +88,23 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
WRITE(stdout,'("<lr_calc_dens>")')
ENDIF
!
CALL start_clock('lr_calc_dens')
CALL start_clock_gpu('lr_calc_dens')
!
ALLOCATE( psic(dfftp%nnr) )
v_siz = dfftp%nnr
nnr_siz = dffts%nnr
!
!$acc data create(psic(1:v_siz)) present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1:nksq)) copyout( rho_1(1:v_siz,1:nspin_mag))
!
!$acc kernels
psic(:) = (0.0d0, 0.0d0)
!$acc end kernels
!
IF (gamma_only) THEN
!$acc kernels
rho_1(:,:) = 0.0d0
!$acc end kernels
ELSE
rho_1c(:,:) = (0.0d0, 0.0d0)
ENDIF
@ -105,10 +117,17 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
! If a double grid is used, interpolate onto the fine grid
!
IF ( doublegrid ) CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
IF ( doublegrid ) THEN
print *, 'doublegrid', doublegrid
!$acc host_data use_device(rho_1(:,1))
CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
!$acc end host_data
ENDIF
!
#if defined(__MPI)
!$acc host_data use_device(rho_1)
CALL mp_sum(rho_1, inter_bgrp_comm)
!$acc end host_data
#endif
!
ELSE
@ -162,11 +181,11 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
! The psic workspace can present a memory bottleneck
!
DEALLOCATE ( psic )
!
#if defined(__MPI)
IF(gamma_only) THEN
!$acc host_data use_device(rho_1)
CALL mp_sum(rho_1, inter_pool_comm)
!$acc end host_data
ELSE
CALL mp_sum(rho_1c, inter_pool_comm)
ENDIF
@ -179,7 +198,11 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
DO is = 1, nspin_mag
!
rho_sum = 0.0d0
rho_sum = SUM(rho_1(:,is))
! rho_sum = SUM(rho_1(:,is))
!$acc parallel loop private(rho_sum) copy(rho_sum)
do irho = 1, v_siz
rho_sum = rho_sum + rho_1(i,is)
enddo
!
#if defined(__MPI)
CALL mp_sum(rho_sum, intra_bgrp_comm )
@ -213,6 +236,10 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
ENDIF
!
!$acc end data
!
DEALLOCATE ( psic )
!
IF (charge_response == 2 .AND. LR_iteration /=0) THEN
!
ALLOCATE( rho_sum_resp_x( dfftp%nr1 ) )
@ -335,7 +362,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
!
ENDIF
!
CALL stop_clock('lr_calc_dens')
CALL stop_clock_gpu('lr_calc_dens')
!
RETURN
!
@ -360,15 +387,18 @@ CONTAINS
IMPLICIT NONE
!
INTEGER :: ibnd_start_gamma, ibnd_end_gamma
INTEGER :: ibnd_start_gamma, ibnd_end_gamma, ibnd, ebnd
INTEGER :: v_siz, incr, ir3, ioff, ioff_tg, nxyp, idx
REAL(DP), ALLOCATABLE :: tg_rho(:)
INTEGER :: nnr_siz, pnnr_siz, ir
!
ibnd_start_gamma = ibnd_start
IF (MOD(ibnd_start, 2)==0) ibnd_start_gamma = ibnd_start + 1
ibnd_end_gamma = MAX(ibnd_end, ibnd_start_gamma)
!
incr = 2
nnr_siz = dffts%nnr
pnnr_siz = dfftp%nnr
!
IF ( dffts%has_task_groups ) THEN
!
@ -385,6 +415,9 @@ CONTAINS
!
! FFT: evc1 -> psic
!
ebnd = ibnd
IF ( ibnd < nbnd ) ebnd = ebnd + 1
!
CALL invfft_orbital_gamma(evc1(:,:,1),ibnd,nbnd)
!
IF (dffts%has_task_groups) THEN
@ -446,9 +479,10 @@ CONTAINS
! in no way the final response charge density.
! The loop is over real space points.
!
DO ir = 1, dffts%nnr
!$acc parallel loop
DO ir = 1, nnr_siz
rho_1(ir,1) = rho_1(ir,1) &
+ 2.0d0*(w1*real(revc0(ir,ibnd,1),dp)*real(psic(ir),dp)&
+ 2.0d0*(w1*dble(revc0(ir,ibnd,1))*dble(psic(ir))&
+ w2*aimag(revc0(ir,ibnd,1))*aimag(psic(ir)))
ENDDO
!

View File

@ -46,7 +46,6 @@ PROGRAM lr_dav_main
LOGICAL, EXTERNAL :: check_gpu_support
use_gpu = check_gpu_support()
if(use_gpu) Call errore('lr_dav_main', 'turbo_davidson with GPU NYI', 1)
#if defined(__MPI)
CALL mp_startup ( )
@ -97,6 +96,7 @@ PROGRAM lr_dav_main
CALL lr_dv_setup()
! Davidson loop
!$acc data copyin(revc0(:,:,:))
if (precondition) write(stdout,'(/5x,"Precondition is used in the algorithm,")')
do while (.not. dav_conv .and. dav_iter .lt. max_iter)
dav_iter=dav_iter+1
@ -111,13 +111,15 @@ PROGRAM lr_dav_main
! Check to see if the wall time limit has been exceeded.
if ( check_stop_now() ) then
call lr_write_restart_dav()
goto 100
!! goto 100
exit
endif
!
enddo
!$acc end data
! call check_hermitian()
! Extract physical meaning from the solution
if ( check_stop_now() ) goto 100
call interpret_eign('END')
! The check_orth at the end may take quite a lot of time in the case of
! USPP because we didn't store the S* vector basis. Turn this step on only

View File

@ -454,57 +454,135 @@ contains
write(stdout,'(7x,"num of basis:",I5,3x,"total built basis:",I5)') num_basis,num_basis_tot
! Add new matrix elements to the M_C and M_D(in the subspace)(part 1)
if (.not.ltammd) then
if (.not. okvan) then !norm conserving
!$acc data copyin(vec_b(:,:,:,num_basis_old+1:num_basis)) create(vecwork(:,:,:)) copyout(D_vec_b(:,:,:,num_basis_old+1:num_basis), C_vec_b(:,:,:,num_basis_old+1:num_basis))
do ibr = num_basis_old+1, num_basis
if(.not.ltammd) then
! Calculate new D*vec_b
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.false.)
if(.not. poor_of_ram2) D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
if (.not. poor_of_ram2) THEN
!$acc kernels
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
!$acc end kernels
endif
!
! Add new M_D
do ibl = 1, ibr
! Here there's a choice between saving memory and saving calculation
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
else ! Less calculation, double memory required
M_D(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
endif
M_D(ibl,ibr)=lr_dot(vec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
enddo
! Calculate new C*vec_b
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
if (.not. poor_of_ram2) THEN
!$acc kernels
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
!$acc end kernels
endif
!
! Add new M_C
do ibl = 1, ibr
M_C(ibl,ibr)=lr_dot(vec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
enddo
!
enddo
!$acc end data
else if(poor_of_ram) then ! Less memory needed
do ibr = num_basis_old+1, num_basis
! Calculate new D*vec_b
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.false.)
if(.not. poor_of_ram2) D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
!
! Add new M_D
do ibl = 1, ibr
! Here there's a choice between saving memory and saving calculation
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
enddo
! Calculate new C*vec_b
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
if(.not. poor_of_ram2) C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
! Add new M_C
do ibl = 1, ibr
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
M_C(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
else ! Less calculation, double memory required
M_C(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
endif
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
enddo
else ! ltammd
enddo
else !uspp
do ibr = num_basis_old+1, num_basis
! Calculate new D*vec_b
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.false.)
if(.not. poor_of_ram2) D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
!
! Add new M_D
do ibl = 1, ibr
! Here there's a choice between saving memory and saving calculation
M_D(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
enddo
! Calculate new C*vec_b
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
if(.not. poor_of_ram2) then
if(.not. poor_of_ram2) C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
! Add new M_C
do ibl = 1, ibr
M_C(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
enddo
enddo
endif
else
if (.not. okvan) then
!$acc data copyin(vec_b(:,:,:,num_basis_old+1:num_basis)) create(vecwork(:,:,:)) copyout(D_vec_b(:,:,:,num_basis_old+1:num_basis), C_vec_b(:,:,:,num_basis_old+1:num_basis))
do ibr = num_basis_old+1, num_basis
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
if (.not. poor_of_ram2) then
!$acc kernels
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
!$acc end kernels
endif
! Add new M_D, M_C
do ibl = 1, ibr
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
else ! Less calculation, double memory required
M_D(ibl,ibr)=lr_dot_us(svec_b(1,1,1,ibl),vecwork(1,1,1))
endif
M_D(ibl,ibr)=lr_dot(vec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
M_C(ibl,ibr)=M_D(ibl,ibr)
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
enddo
endif
enddo
!$acc end data
elseif (poor_of_ram) then ! Less memory needed
do ibr = num_basis_old+1, num_basis
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
if (.not. poor_of_ram2) then
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
endif
! Add new M_D, M_C
do ibl = 1, ibr
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
M_C(ibl,ibr)=M_D(ibl,ibr)
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
enddo
enddo
else ! Less calculation, double memory required
do ibr = num_basis_old+1, num_basis
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
if (.not. poor_of_ram2) then
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
endif
! Add new M_D, M_C
do ibl = 1, ibr
M_D(ibl,ibr)=lr_dot_us(svec_b(1,1,1,ibl),vecwork(1,1,1))
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
M_C(ibl,ibr)=M_D(ibl,ibr)
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
enddo
enddo
endif
endif
call solve_M_DC()
@ -654,11 +732,13 @@ contains
use kinds, only : dp
use io_global, only : stdout
use wvfct, only : nbnd, npwx
use uspp, only : okvan
use lr_dav_debug
use lr_us
implicit none
integer :: ieign, flag,ibr
complex(kind=dp),external :: lr_dot
integer :: ieign, flag, ibr
logical :: discharged
complex(kind=dp) :: temp(npwx,nbnd)
@ -672,20 +752,28 @@ contains
call start_clock("calc_residue")
!$acc data copyout(left_res(:,:,:,:), right_res(:,:,:,:)) copyin(right_M(:,:), left_M(:,:), C_vec_b(:,:,:,:), D_vec_b(:,:,:,:), left_full(:,:,:,:), right_full(:,:,:,:))
do ieign = 1, num_eign
if(poor_of_ram2) then ! If D_ C_ basis are not stored, we have to apply liouvillian again
if (poor_of_ram2) then ! If D_ C_ basis are not stored, we have to apply liouvillian again
call lr_apply_liouvillian(right_full(:,:,:,ieign),right_res(:,:,:,ieign),.true.) ! Apply lanczos
call lr_apply_liouvillian(left_full(:,:,:,ieign),left_res(:,:,:,ieign),.false.)
else ! Otherwise they are be recovered directly by the combination of C_ and D_ basis
!$acc kernels
left_res(:,:,:,ieign)=0.0d0
right_res(:,:,:,ieign)=0.0d0
!$acc end kernels
do ibr = 1, num_basis
!$acc kernels async
right_res(:,:,1,ieign)=right_res(:,:,1,ieign)+right_M(ibr,eign_value_order(ieign))*C_vec_b(:,:,1,ibr)
left_res(:,:,1,ieign)=left_res(:,:,1,ieign)+left_M(ibr,eign_value_order(ieign))*D_vec_b(:,:,1,ibr)
!$acc end kernels
enddo
!$acc wait
endif
! The reason of using this method
call lr_1to1orth(right_res(1,1,1,ieign),left_full(1,1,1,ieign))
call lr_1to1orth(left_res(1,1,1,ieign),right_full(1,1,1,ieign))
! Instead of this will be explained in the document
@ -693,37 +781,47 @@ contains
! left_res(:,:,:,ieign)=left_res(:,:,:,ieign)-sqrt(eign_value(eign_value_order(ieign),1))*right_full(:,:,:,ieign)
! Update kill_r/l
if (okvan) then
right2(ieign)=lr_dot_us(right_res(1,1,1,ieign),right_res(1,1,1,ieign))
else
right2(ieign)=lr_dot(right_res(1,1,1,ieign),right_res(1,1,1,ieign))
endif
if (abs(aimag(right2(ieign))) .gt. zero .or. dble(right2(ieign)) .lt. 0.0D0) then
write(stdout,'(7x,"Warning! Wanging! the residue is weird.")')
endif
if( dble(right2(ieign)) .lt. residue_conv_thr ) then
if (dble(right2(ieign)) .lt. residue_conv_thr ) then
kill_right(ieign)=.true.
toadd=toadd-1
endif
if( dble(right2(ieign)) .gt. max_res ) max_res = dble(right2(ieign))
if (okvan) then
left2(ieign)=lr_dot_us(left_res(1,1,1,ieign),left_res(1,1,1,ieign))
else
left2(ieign)=lr_dot(left_res(1,1,1,ieign),left_res(1,1,1,ieign))
endif
if (abs(aimag(left2(ieign))) .gt. zero .or. dble(left2(ieign)) .lt. 0.0D0) then
write(stdout,'(7x,"Warning! Wanging! the residue is weird.")')
endif
if( dble(left2(ieign)) .lt. residue_conv_thr ) then
if (dble(left2(ieign)) .lt. residue_conv_thr ) then
kill_left(ieign)=.true.
toadd=toadd-1
endif
if( dble(left2(ieign)) .gt. max_res ) max_res = dble(left2(ieign))
if(dble(left2(ieign)) .gt. max_res ) max_res = dble(left2(ieign))
write (stdout,'(5x,"Residue(Squared modulus):",I5,2x,2F15.7)') ieign, &
dble(right2(ieign)), dble(left2(ieign))
enddo
!$acc end data
write(stdout,'(7x,"Largest residue:",5x,F20.12)') max_res
if(max_res .lt. residue_conv_thr) dav_conv=.true.
call stop_clock("calc_residue")
! Forseen that the number of basis will be too large
if(num_basis+toadd .gt. num_basis_max) then
if(discharged) &
if (num_basis+toadd .gt. num_basis_max) then
if (discharged) &
call errore('lr_discharge',"The num_basis_max is too small that even discharge &
& cannot work. Please increase its value in the input.",1)
discharged = .true.
@ -745,6 +843,7 @@ contains
use uspp, only : okvan
use lr_us
use lr_dav_debug
use wvfct, only : npwx
implicit none
integer :: ieign, flag
@ -756,24 +855,25 @@ contains
if (precondition) then
do ieign = 1, num_eign
if(.not. kill_left(ieign)) then
if (.not. kill_left(ieign)) then
call treat_residue(left_res(:,:,1,ieign),ieign)
call lr_norm(left_res(1,1,1,ieign))
call lr_ortho(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
call orthogonalize(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
call lr_norm(left_res(1,1,1,ieign))
endif
if(.not. kill_right(ieign)) then
if (.not. kill_right(ieign)) then
call treat_residue(right_res(:,:,1,ieign),ieign)
call lr_norm(right_res(1,1,1,ieign))
call lr_ortho(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
call lr_norm(left_res(1,1,1,ieign))
call orthogonalize(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
call lr_norm(right_res(1,1,1,ieign))
endif
enddo
endif
! Here mGS are called three times and lr_ortho is called once for increasing
! Here mGS are called three times and orthogonalize is called once for increasing
! numerical stability of orthonalization
!$acc data copy(left_res(:,:,:,:), right_res(:,:,:,:), vec_b(:,:,:,:))
call lr_mGS_orth() ! 1st
call lr_mGS_orth_pp()
call lr_mGS_orth() ! 2nd
@ -782,20 +882,19 @@ contains
call lr_mGS_orth_pp()
do ieign = 1, num_eign
call lr_ortho(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
call lr_ortho(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
call orthogonalize(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
call orthogonalize(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
call lr_norm(right_res(:,:,:,ieign))
call lr_norm(left_res(:,:,:,ieign))
enddo
if(toadd .eq. 0) then
!
if (toadd .eq. 0) then
write(stdout,'("TOADD is zero !!")')
dav_conv=.true.
return
go to 10
endif
!if( dav_iter .gt. max_iter .or. num_basis+toadd .gt. num_basis_max ) then
if( dav_iter .gt. max_iter ) then
!
if ( dav_iter .gt. max_iter ) then
write(stdout,'(/5x,"!!!! We have arrived maximum number of iterations. We have to stop &
&here, and the result will not be trustable !!!!! ")')
dav_conv=.true.
@ -804,29 +903,35 @@ contains
num_basis_tot=num_basis_tot+toadd
! Expand the basis
do ieign = 1, num_eign
if(.not. kill_left(ieign)) then
if (.not. kill_left(ieign)) then
num_basis=num_basis+1
!$acc kernels
vec_b(:,:,:,num_basis)=left_res(:,:,:,ieign)
if(.not. poor_of_ram .and. okvan) &
!$acc end kernels
if (.not. poor_of_ram .and. okvan) &
call lr_apply_s(vec_b(:,:,:,num_basis),svec_b(:,:,:,num_basis))
endif
if(.not. kill_right(ieign)) then
if (.not. kill_right(ieign)) then
num_basis=num_basis+1
!$acc kernels
vec_b(:,:,:,num_basis)=right_res(:,:,:,ieign)
if(.not. poor_of_ram .and. okvan) &
!$acc end kernels
if (.not. poor_of_ram .and. okvan) &
call lr_apply_s(vec_b(:,:,:,num_basis),svec_b(:,:,:,num_basis))
endif
enddo
endif
if(conv_assistant) then
if(max_res .lt. 10*residue_conv_thr .and. .not. ploted(1)) then
!
10 call stop_clock("expan_basis")
!$acc end data
!
if (conv_assistant) then
if (max_res .lt. 10*residue_conv_thr .and. .not. ploted(1)) then
call interpret_eign('10')
ploted(1)=.true.
endif
endif
call stop_clock("expan_basis")
!
return
end subroutine dav_expan_basis
!-------------------------------------------------------------------------------
@ -847,26 +952,35 @@ contains
real(dp) :: temp
call start_clock("mGS_orth")
!$acc data present_or_copy(left_res(:,:,:,:), right_res(:,:,:,:)) present_or_copyin(vec_b(:,:,:,:))
! first orthogonalize to old basis
do ib = 1, num_basis
do ieign = 1, num_eign
if (.not. kill_left(ieign)) then
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
if (poor_of_ram .or. .not. okvan) then ! Less memory needed
do ib = 1, num_basis
call lr_1to1orth(left_res(1,1,1,ieign),vec_b(1,1,1,ib))
enddo
else ! Less calculation, double memory required
do ib = 1, num_basis
call lr_bi_1to1orth(left_res(1,1,1,ieign),vec_b(1,1,1,ib),svec_b(1,1,1,ib))
enddo
endif
endif
if (.not. kill_right(ieign)) then
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
if (poor_of_ram .or. .not. okvan) then ! Less memory needed
do ib = 1, num_basis
call lr_1to1orth(right_res(1,1,1,ieign),vec_b(1,1,1,ib))
else ! Less calculation, double memory required
enddo
else
do ib = 1, num_basis
call lr_bi_1to1orth(right_res(1,1,1,ieign),vec_b(1,1,1,ib),svec_b(1,1,1,ib))
enddo
endif
endif
enddo
enddo
!
! orthogonalize between new basis themselves
do ieign = 1, num_eign
if (.not. kill_left(ieign) .and. .not. kill_right(ieign)) then
@ -886,6 +1000,7 @@ contains
call lr_1to1orth(right_res(1,1,1,ieign2),right_res(1,1,1,ieign))
enddo
enddo
!$acc end data
call stop_clock("mGS_orth")
return
end subroutine lr_mGS_orth
@ -901,18 +1016,25 @@ contains
use klist, only : nks
use wvfct, only : npwx,nbnd
use io_global, only : stdout
use uspp, only : okvan
use lr_dav_variables
use lr_us
implicit none
complex(kind=dp),external :: lr_dot
integer :: ieign,ia
real(dp) :: norm_res
call start_clock("mGS_orth_pp")
!$acc data present_or_copy(left_res(:,:,:,:), right_res(:,:,:,:))
do ieign = 1, num_eign
if(.not. kill_left(ieign)) then
if (.not. kill_left(ieign)) then
if (okvan) then
norm_res = dble(lr_dot_us(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
if(norm_res .lt. residue_conv_thr) then
else
norm_res = dble(lr_dot(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
endif
if (norm_res .lt. residue_conv_thr) then
kill_left(ieign) = .true.
write(stdout,'("One residue is eliminated:",5x,E20.12)') norm_res
toadd=toadd-1
@ -921,9 +1043,13 @@ contains
endif
endif
if(.not. kill_right(ieign)) then
if (.not. kill_right(ieign)) then
if (okvan) then
norm_res = dble(lr_dot_us(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
if(norm_res .lt. residue_conv_thr) then
else
norm_res = dble(lr_dot(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
endif
if (norm_res .lt. residue_conv_thr) then
kill_right(ieign) = .true.
write(stdout,'("One residue is eliminated:",5x,E20.12)') norm_res
toadd=toadd-1
@ -932,6 +1058,8 @@ contains
endif
endif
enddo
!$acc end data
call stop_clock("mGS_orth_pp")
return
end subroutine lr_mGS_orth_pp
@ -947,14 +1075,26 @@ contains
use klist, only : nks
use wvfct, only : npwx,nbnd
use lr_us
use uspp, only : okvan
implicit none
complex(dp) :: vect(npwx,nbnd,nks),svect(npwx,nbnd,nks)
complex(dp) :: vect(npwx,nbnd,nks)
complex(kind=dp),external :: lr_dot
real(dp) :: temp
!
!$acc data present_or_copy(vect(:,:,:))
!
if (okvan) then
temp=dble(lr_dot_us(vect(1,1,1),vect(1,1,1)))
else
temp=dble(lr_dot(vect(1,1,1),vect(1,1,1)))
endif
!$acc kernels copyin(temp)
vect(:,:,:)=vect(:,:,:)/sqrt(temp)
!$acc end kernels
!
!$acc end data
!
return
end subroutine lr_norm
!-------------------------------------------------------------------------------
@ -970,11 +1110,33 @@ contains
use klist, only : nks
use wvfct, only : npwx,nbnd
use lr_us
use uspp, only : okvan
implicit none
complex(kind=dp),external :: lr_dot
complex(dp) :: vect1(npwx,nbnd,nks),vect2(npwx,nbnd,nks)
vect1(:,:,1)=vect1(:,:,1)-(lr_dot_us(vect1(1,1,1),vect2(1,1,1))/lr_dot_us(vect2(1,1,1),vect2(1,1,1)))*vect2(:,:,1)
complex(dp) :: temp_dot1, temp_dot2, temp_dot
!
!$acc data present_or_copy(vect1(:,:,:)) present_or_copyin(vect2(:,:,:))
!
if (okvan) then
temp_dot1 = lr_dot_us(vect1(1,1,1),vect2(1,1,1))
temp_dot2 = lr_dot_us(vect2(1,1,1),vect2(1,1,1))
temp_dot = temp_dot1/temp_dot2
!$acc kernels copyin(temp_dot)
vect1(:,:,1)=vect1(:,:,1)-temp_dot*vect2(:,:,1)
!$acc end kernels
else
temp_dot1 = lr_dot(vect1(1,1,1),vect2(1,1,1))
temp_dot2 = lr_dot(vect2(1,1,1),vect2(1,1,1))
temp_dot = temp_dot1/temp_dot2
!$acc kernels copyin(temp_dot)
vect1(:,:,1)=vect1(:,:,1)-temp_dot*vect2(:,:,1)
!$acc end kernels
endif
!
!$acc end data
!
return
end subroutine lr_1to1orth
!-------------------------------------------------------------------------------
@ -993,8 +1155,17 @@ contains
implicit none
complex(kind=dp),external :: lr_dot
complex(dp) :: vect1(npwx,nbnd,nks),vect2(npwx,nbnd,nks),svect2(npwx,nbnd,nks)
vect1(:,:,1)=vect1(:,:,1)-(lr_dot(svect2(1,1,1),vect1(1,1,1))/lr_dot(svect2(1,1,1),vect2(1,1,1)))*vect2(:,:,1)
complex(dp) :: temp_dot1, temp_dot2, temp_dot
!
!$acc data present_or_copy(vect1(:,:,:)) present_or_copyin(svect2(:,:,:), vect2(:,:,:))
temp_dot1 = lr_dot(svect2(1,1,1),vect1(1,1,1))
temp_dot2 = lr_dot(svect2(1,1,1),vect2(1,1,1))
temp_dot = temp_dot1/temp_dot2
!
!$acc kernels copyin(temp_dot)
vect1(:,:,1)=vect1(:,:,1)-temp_dot*vect2(:,:,1)
!$acc end kernels
!$acc end data
return
end subroutine lr_bi_1to1orth
!-------------------------------------------------------------------------------
@ -1045,18 +1216,20 @@ contains
use io_global, only : stdout,ionode,ionode_id
use mp, only : mp_bcast,mp_barrier
use mp_world, only : world_comm
use uspp, only : okvan
use lr_us
implicit none
complex(dp), external :: lr_dot
character(len=*) :: message
integer :: ieign, ia,ic,iv,ipol
real(kind=dp), external :: ddot
real(dp) :: norm, normx, normy, alpha, shouldbe1,temp
real(dp) :: C_right_M(num_basis_max),D_left_M(num_basis_max)
if(.not. allocated(norm_F)) allocate(norm_F(num_eign))
if (.not. allocated(norm_F)) allocate(norm_F(num_eign))
if(message=="END") then
if (message=="END") then
write(stdout,'(/7x,"================================================================")')
write(stdout,'(/7x,"Davidson diagonalization has finished in",I5," steps.")') dav_iter
write(stdout,'(10x,"the number of current basis is",I5)') num_basis
@ -1065,20 +1238,20 @@ contains
write(stdout,'(/7x,"Now print out information of eigenstates")')
endif
if(message=="10")&
if (message=="10")&
write(stdout,'(/7x,"Quasi convergence(10*residue_conv_thr) has been arrived in",I5," steps.")') dav_iter
if(.not. done_calc_R) then
if (.not. done_calc_R) then
call lr_calc_R()
done_calc_R=.true.
endif
! Print out Oscilation strength
if(message == "END") then
if (message == "END") then
write(stdout,'(/,/5x,"K-S Oscillator strengths")')
write(stdout,'(5x,"occ",1x,"con",8x,"R-x",14x,"R-y",14x,"R-z")')
do iv=nbnd-p_nbnd_occ+1, nbnd
do ic=1,p_nbnd_virt
do iv = nbnd-p_nbnd_occ+1, nbnd
do ic = 1,p_nbnd_virt
write(stdout,'(5x,i3,1x,i3,3x,E16.8,2X,E16.8,2X,E16.8)') &
&iv,ic,dble(R(iv,ic,1)),dble(R(iv,ic,2)),dble(R(iv,ic,3))
enddo
@ -1089,10 +1262,10 @@ contains
do ieign = 1, num_eign
#if defined(__MPI)
! This part is calculated in serial
if(ionode) then
if (ionode) then
#endif
ia = eign_value_order(ieign)
if(message=="END")&
if (message=="END")&
write(stdout,'(/7x,"! The",I5,1x,"-th eigen state. The transition&
& energy is: ", 5x, F12.8)') ieign, tr_energy(ia)
! Please see Documentation for the explaination of the next four steps
@ -1102,23 +1275,23 @@ contains
&num_basis_max,right_M(:,ia),1,(0.0D0,0.0D0),C_right_M,1)
omegar(ieign)=ddot(num_basis,left_M(:,ia),1,left_M(:,ia),1)
omegar(ieign)=ddot(num_basis,C_right_M,1,left_M(:,ia),1)
!
! Apply D to the left eigen state in order to calculate the left omega
call dgemv('N',num_basis,num_basis,(1.0D0,0.0D0),dble(M_D),&
&num_basis_max,left_M(:,ia),1,(0.0D0,0.0D0),D_left_M,1)
omegal(ieign)=ddot(num_basis,right_M(:,ia),1,right_M(:,ia),1)
omegal(ieign)=ddot(num_basis,D_left_M,1,right_M(:,ia),1)
if(abs(omegal(ieign)*omegar(ieign)-tr_energy(ia)**2) .gt. zero) then
!
if (abs(omegal(ieign)*omegar(ieign)-tr_energy(ia)**2) .gt. zero) then
write(stdout,'(/5x,"Warning !!! : The interpretation of the eigenstates may have problems.")')
write(stdout,'(10x,"omegal*omegar = ",F20.12,10x,"omega^2 = ",F20.12)') &
&omegal(ieign)*omegar(ieign),tr_energy(ia)**2
endif
!
! scale right_full and left_full in order to make omegar = omegal
omegar(ieign)=sqrt(dble(omegar(ieign)))
omegal(ieign)=sqrt(dble(omegal(ieign)))
!
#if defined(__MPI)
endif
call mp_barrier(world_comm)
@ -1130,7 +1303,11 @@ contains
left_full(:,:,:,ieign)=left_full(:,:,:,ieign)/dble(omegal(ieign))
! Normalize the vector
if (okvan) then
norm=2.0d0*dble(lr_dot_us(right_full(1,1,1,ieign),left_full(1,1,1,ieign)))
else
norm=2.0d0*dble(lr_dot(right_full(1,1,1,ieign),left_full(1,1,1,ieign)))
endif
norm=sqrt(abs(norm))
right_full(:,:,:,ieign)=right_full(:,:,:,ieign)/norm
@ -1140,11 +1317,16 @@ contains
left_res(:,:,:,ieign)=(right_full(:,:,:,ieign)+left_full(:,:,:,ieign))/sqrt(2.0d0) !X
right_res(:,:,:,ieign)=(right_full(:,:,:,ieign)-left_full(:,:,:,ieign))/sqrt(2.0d0) !Y
if (okvan) then
normx=dble(lr_dot_us(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
normy=-dble(lr_dot_us(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
else
normx=dble(lr_dot(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
normy=-dble(lr_dot(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
endif
norm_F(ieign)=normx+normy !! Actually norm_F should always be one since it was previously normalized
if(message=="END") then
if (message=="END") then
write(stdout,'(/5x,"The two digitals below indicate the importance of doing beyong TDA: ")')
write(stdout,'(/5x,"Components: X",2x,F12.5,";",4x,"Y",2x,F12.5)') &
normx/norm_F(ieign),normy/norm_F(ieign)
@ -1160,14 +1342,14 @@ contains
normy=normy-Fy(iv,ic)*Fy(iv,ic)
enddo
enddo
if( normx+normy .lt. 0.0d0 ) then
if ( normx+normy .lt. 0.0d0 ) then
normx=-normx
normy=-normy
endif
if(message=="END") then
if (message=="END") then
write (stdout,'(/5x,"In the occ-virt project subspace the total Fxy is:")')
write(stdout,'(/5x,"X",2x,F12.5,";",4x,"Y",2x,F12.5,4x,"total",2x,F12.5,2x,&
write (stdout,'(/5x,"X",2x,F12.5,";",4x,"Y",2x,F12.5,4x,"total",2x,F12.5,2x,&
&"/ ",F12.5)') normx,normy,normx+normy,norm_F(ieign)
endif
@ -1178,7 +1360,7 @@ contains
total_chi(ieign)=chi_dav(1,ieign)+chi_dav(2,ieign)+chi_dav(3,ieign)
if(message=="END") then
if (message=="END") then
write (stdout,'(/5x,"The Chi_i_i is",5x,"Total",10x,"1",15x,"2",15x,"3")')
write (stdout,'(/12x,8x,E15.8,3x,E15.8,3x,E15.8,3x,E15.8)') total_chi(ieign),&
&chi_dav(1,ieign),chi_dav(2,ieign), chi_dav(3,ieign)
@ -1200,7 +1382,7 @@ contains
enddo
#if defined(__MPI)
if(ionode) then
if (ionode) then
#endif
call write_eigenvalues(message)
call write_spectrum(message)
@ -1219,18 +1401,32 @@ contains
use lr_dav_variables
use lr_variables, only : d0psi
use kinds, only : dp
use uspp, only : okvan
use lr_us
implicit none
character :: flag_calc
integer :: ieign,ipol
if( flag_calc .eq. "X" ) then
complex(kind=dp), external :: lr_dot
!
if (okvan) then
if ( flag_calc .eq. "X" ) then
dav_calc_chi=lr_dot_us(d0psi(:,:,1,ipol), left_res(:,:,1,ieign))
else if (flag_calc .eq. "Y") then
dav_calc_chi=lr_dot_us(d0psi(:,:,1,ipol), right_res(:,:,1,ieign))
endif
else
if ( flag_calc .eq. "X" ) then
!$acc data copyin(d0psi(:,:,1,ipol), left_res(:,:,1,ieign))
dav_calc_chi=lr_dot(d0psi(:,:,1,ipol), left_res(:,:,1,ieign))
!$acc end data
else if (flag_calc .eq. "Y") then
!$acc data copyin(d0psi(:,:,1,ipol), right_res(:,:,1,ieign))
dav_calc_chi=lr_dot(d0psi(:,:,1,ipol), right_res(:,:,1,ieign))
!$acc end data
endif
endif
!
return
end function dav_calc_chi
!-------------------------------------------------------------------------------
@ -1502,7 +1698,7 @@ contains
! Orthogonalize to occupied states
do ib = 1, num_init
call lr_ortho(vec_b(:,:,:,ib), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
call orthogonalize(vec_b(:,:,:,ib), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
call lr_norm(vec_b(1,1,1,ib))
enddo

View File

@ -301,7 +301,7 @@ SUBROUTINE one_lanczos_step()
!
ENDIF
!
! X. Ge: To increase the stability, apply lr_ortho.
! X. Ge: To increase the stability, apply orthogonalize.
! I.Timrov: Actually, without this trick, it turns out that
! the Lanczos chain is not stable when pseudo_hermitian=.false.,
! because there is a warning from lr_calc_dens that the integral of
@ -311,9 +311,11 @@ SUBROUTINE one_lanczos_step()
IF (.not.eels .and. .not. magnons) THEN
!
DO ik=1, nks
! CALL orthogonalize(evc1(:,:,ik,1), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),npwx,.true.)
CALL lr_ortho(evc1(:,:,ik,1), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
IF (.not. pseudo_hermitian) &
CALL lr_ortho(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
! CALL orthogonalize(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),npwx,.true.)
ENDDO
!
ENDIF

View File

@ -355,6 +355,21 @@ SUBROUTINE lr_readin
!
ENDIF
!
IF (davidson) THEN
!
! check and set num_init and num_basis_max
!
IF (num_init < num_eign ) THEN
WRITE(stdout,'(5X,"num_init is too small, set to num_init = 2*num_eign")')
num_init = 2 * num_eign
ENDIF
IF (num_basis_max < 2*num_init ) THEN
WRITE(stdout,'(5X,"num_basis_max is too small, set to num_basis_max = 4*num_init")')
num_basis_max = 4 * num_init
ENDIF
!
ENDIF
!
#if defined(__MPI)
ENDIF
!