mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'qe_dav' into 'develop'
Porting on GPU of turbo_Davidson with openAcc See merge request QEF/q-e!2311
This commit is contained in:
commit
1b7d1c2295
|
@ -54,8 +54,11 @@ FUNCTION lr_dot(x,y)
|
|||
!
|
||||
IF (gamma_only) THEN
|
||||
!
|
||||
!$acc data present_or_copyin(x,y) copyin(wg) copy(temp_gamma)
|
||||
CALL lr_dot_gamma()
|
||||
!$acc end data
|
||||
lr_dot = cmplx(temp_gamma,0.0d0,dp)
|
||||
lr_dot = lr_dot/degspin
|
||||
!
|
||||
ELSEIF (noncolin) THEN
|
||||
!
|
||||
|
@ -65,11 +68,10 @@ FUNCTION lr_dot(x,y)
|
|||
ELSE
|
||||
!
|
||||
CALL lr_dot_k()
|
||||
lr_dot = lr_dot/degspin
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
lr_dot = lr_dot/degspin
|
||||
!
|
||||
CALL stop_clock ('lr_dot')
|
||||
!
|
||||
RETURN
|
||||
|
@ -81,6 +83,28 @@ CONTAINS
|
|||
! Optical case: gamma_only
|
||||
! Noncollinear case is not implemented
|
||||
!
|
||||
USE wvfct, ONLY : npwx,nbnd,wg
|
||||
#if defined(__CUDA)
|
||||
use cublas
|
||||
#endif
|
||||
!
|
||||
INTEGER :: ibnd
|
||||
REAL(DP), EXTERNAL :: MYDDOT_VECTOR_GPU
|
||||
!$acc routine(MYDDOT_VECTOR_GPU) vector
|
||||
!
|
||||
! !$acc data present(x,y,wg,temp_gamma)
|
||||
#if defined(__CUDA)
|
||||
!$acc parallel loop reduction(temp_gamma)
|
||||
DO ibnd=1,nbnd
|
||||
!
|
||||
temp_gamma = temp_gamma + 2.D0*wg(ibnd,1)*MYDDOT_VECTOR_GPU(2*ngk(1),x(:,ibnd,1),y(:,ibnd,1))
|
||||
!
|
||||
! G=0 has been accounted twice, so we subtract one contribution.
|
||||
!
|
||||
IF (gstart==2) temp_gamma = temp_gamma - wg(ibnd,1)*dble(x(1,ibnd,1))*dble(y(1,ibnd,1))
|
||||
!
|
||||
ENDDO
|
||||
#else
|
||||
DO ibnd=1,nbnd
|
||||
!
|
||||
temp_gamma = temp_gamma + 2.D0*wg(ibnd,1)*DDOT(2*ngk(1),x(:,ibnd,1),1,y(:,ibnd,1),1)
|
||||
|
@ -90,11 +114,15 @@ CONTAINS
|
|||
IF (gstart==2) temp_gamma = temp_gamma - wg(ibnd,1)*dble(x(1,ibnd,1))*dble(y(1,ibnd,1))
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL mp_sum(temp_gamma, intra_bgrp_comm)
|
||||
#endif
|
||||
!
|
||||
#if defined(__MPI)
|
||||
!$acc host_data use_device(temp_gamma)
|
||||
CALL mp_sum(temp_gamma, intra_bgrp_comm)
|
||||
!$acc end host_data
|
||||
#endif
|
||||
!
|
||||
! !$acc end data
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE lr_dot_gamma
|
||||
|
|
|
@ -46,7 +46,9 @@ SUBROUTINE lr_sm1_psi (ik, lda, n, m, psi, spsi)
|
|||
CALL start_clock( 'lr_sm1_psi' )
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!$acc data present_or_copyin(psi) present_or_copyout(spsi)
|
||||
CALL sm1_psi_gamma()
|
||||
!$acc end data
|
||||
ELSEIF (noncolin) THEN
|
||||
CALL sm1_psi_nc()
|
||||
ELSE
|
||||
|
@ -74,6 +76,11 @@ CONTAINS
|
|||
calbec_rs_gamma, add_vuspsir_gamma, &
|
||||
v_loc_psir, s_psir_gamma
|
||||
USE lrus, ONLY : bbg
|
||||
USE uspp, ONLY : vkb
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -83,9 +90,15 @@ CONTAINS
|
|||
! counters
|
||||
REAL(DP), ALLOCATABLE :: ps(:,:)
|
||||
!
|
||||
!
|
||||
! Initialize spsi : spsi = psi
|
||||
!
|
||||
CALL ZCOPY( lda * npol * m, psi, 1, spsi, 1 )
|
||||
! !$acc data present(psi,spsi)
|
||||
!
|
||||
!$acc kernels
|
||||
spsi(:,:) = psi(:,:)
|
||||
!$acc end kernels
|
||||
!!CALL ZCOPY( lda * npol * m, psi, 1, spsi, 1 )
|
||||
!
|
||||
IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
|
||||
!
|
||||
|
@ -114,8 +127,13 @@ CONTAINS
|
|||
!
|
||||
! Step 2 : |spsi> = S^{-1} * |psi> = |psi> + ps * |beta>
|
||||
!
|
||||
!$acc enter data copyin(ps)
|
||||
!$acc host_data use_device(vkb, ps, spsi)
|
||||
call DGEMM('N','N',2*n,m,nkb,1.d0,vkb,2*lda,ps,nkb,1.d0,spsi,2*lda)
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc exit data delete(ps)
|
||||
! !$acc end data
|
||||
DEALLOCATE(ps)
|
||||
!
|
||||
RETURN
|
||||
|
|
|
@ -69,7 +69,7 @@ SUBROUTINE orthogonalize(dvpsi, evq, ikk, ikq, dpsi, npwq, dpsi_computed)
|
|||
!
|
||||
ALLOCATE(ps(nbnd,nbnd))
|
||||
!
|
||||
!$acc data copyin(evq) copy(dvpsi,dpsi) create(ps(1:nbnd, 1:nbnd), ps_r(1:nbnd, 1:nbnd))
|
||||
!$acc data copyin(evq) present_or_copy(dvpsi) copy(dpsi) create(ps(1:nbnd, 1:nbnd), ps_r(1:nbnd, 1:nbnd))
|
||||
IF (gamma_only) THEN
|
||||
!$acc kernels
|
||||
ps_r(:,:) = 0.0d0
|
||||
|
|
|
@ -2198,14 +2198,17 @@ MODULE realus
|
|||
INTEGER :: ebnd
|
||||
!
|
||||
!-------------------TEMPORARY-----------
|
||||
INTEGER :: ngk1
|
||||
LOGICAL :: is_present, acc_is_present
|
||||
! INTEGER :: ngk1
|
||||
! LOGICAL :: is_present, acc_is_present
|
||||
!---------------------------------------
|
||||
!
|
||||
!Task groups
|
||||
!
|
||||
!The new task group version based on vloc_psi
|
||||
!print *, "->Real space"
|
||||
!
|
||||
!$acc data present_or_copyin(orbital) present_or_copyout(psic)
|
||||
!
|
||||
CALL start_clock( 'invfft_orbital' )
|
||||
!
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
|
@ -2227,11 +2230,11 @@ MODULE realus
|
|||
CALL wave_g2r( orbital(1:ngk(1),ibnd:ebnd), psic, dffts )
|
||||
!
|
||||
!-------------TEMPORARY---------------------
|
||||
ngk1 = SIZE(psic)
|
||||
#if defined(_OPENACC)
|
||||
is_present = acc_is_present(psic,ngk1)
|
||||
!$acc update self(psic) if (is_present)
|
||||
#endif
|
||||
! ngk1 = SIZE(psic)
|
||||
!#if defined(_OPENACC)
|
||||
! is_present = acc_is_present(psic,ngk1)
|
||||
! !$acc update self(psic) if (is_present)
|
||||
!#endif
|
||||
!-------------------------------------------
|
||||
!
|
||||
IF (PRESENT(conserved)) THEN
|
||||
|
@ -2245,6 +2248,8 @@ MODULE realus
|
|||
!
|
||||
CALL stop_clock( 'invfft_orbital' )
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
END SUBROUTINE invfft_orbital_gamma
|
||||
!
|
||||
!--------------------------------------------------------------------------
|
||||
|
@ -2290,14 +2295,16 @@ MODULE realus
|
|||
COMPLEX(DP), ALLOCATABLE :: psio(:,:)
|
||||
!
|
||||
!-------------------TEMPORARY-----------
|
||||
INTEGER :: ngk1
|
||||
LOGICAL :: is_present, acc_is_present
|
||||
! INTEGER :: ngk1
|
||||
! LOGICAL :: is_present, acc_is_present
|
||||
!---------------------------------------
|
||||
!
|
||||
! ... Task groups
|
||||
!print *, "->Fourier space"
|
||||
CALL start_clock( 'fwfft_orbital' )
|
||||
!
|
||||
!$acc data present_or_copyin(psic) present_or_copy(orbital)
|
||||
!
|
||||
add_to_orbital_=.FALSE. ; IF( PRESENT(add_to_orbital)) add_to_orbital_ = add_to_orbital
|
||||
!
|
||||
! ... New task_groups versions
|
||||
|
@ -2352,30 +2359,32 @@ MODULE realus
|
|||
CALL wave_r2g( psic(1:dffts%nnr), psio, dffts )
|
||||
!
|
||||
!-------------TEMPORARY---------------------
|
||||
ngk1 = SIZE(psic)
|
||||
#if defined(_OPENACC)
|
||||
is_present = acc_is_present(psic,ngk1)
|
||||
!$acc update self(psic) if (is_present)
|
||||
#endif
|
||||
! ngk1 = SIZE(psic)
|
||||
!#if defined(_OPENACC)
|
||||
! is_present = acc_is_present(psic,ngk1)
|
||||
! !$acc update self(psic) if (is_present)
|
||||
!#endif
|
||||
!-------------------------------------------
|
||||
!
|
||||
fac = 1.d0
|
||||
IF ( ibnd<last ) fac = 0.5d0
|
||||
!
|
||||
IF ( add_to_orbital_ ) THEN
|
||||
!$omp parallel do
|
||||
!$acc parallel loop
|
||||
! !$omp parallel do
|
||||
DO j = 1, ngk(1)
|
||||
orbital(j,ibnd) = orbital(j,ibnd) + fac*psio(j,1)
|
||||
IF (ibnd<last) orbital(j,ibnd+1) = orbital(j,ibnd+1) + fac*psio(j,2)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
! !$omp end parallel do
|
||||
ELSE
|
||||
!$omp parallel do
|
||||
!$acc parallel loop
|
||||
! !$omp parallel do
|
||||
DO j = 1, ngk(1)
|
||||
orbital(j,ibnd) = fac*psio(j,1)
|
||||
IF (ibnd<last) orbital(j,ibnd+1) = fac*psio(j,2)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
! !$omp end parallel do
|
||||
ENDIF
|
||||
!
|
||||
DEALLOCATE( psio )
|
||||
|
@ -2388,6 +2397,8 @@ MODULE realus
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
CALL stop_clock( 'fwfft_orbital' )
|
||||
!
|
||||
END SUBROUTINE fwfft_orbital_gamma
|
||||
|
|
|
@ -973,6 +973,7 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
|
|||
USE mp_bands, ONLY : nbgrp,inter_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE upf_spinorb, ONLY : fcoef
|
||||
USE wavefunctions, ONLY : psic
|
||||
!
|
||||
! Used to avoid unnecessary memcopy
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
|
@ -1000,6 +1001,7 @@ SUBROUTINE sum_bec_gpu ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
|
|||
if (gamma_only) then
|
||||
do ibnd = ibnd_start, ibnd_end, 2
|
||||
call invfft_orbital_gamma(evc,ibnd,ibnd_end)
|
||||
!$acc update self(psic)
|
||||
call calbec_rs_gamma(ibnd,ibnd_end,becp%r)
|
||||
enddo
|
||||
call mp_sum(becp%r,inter_bgrp_comm)
|
||||
|
|
|
@ -66,6 +66,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
#if defined (__ENVIRON)
|
||||
USE plugin_flags, ONLY : use_environ
|
||||
USE environ_td_module, ONLY : calc_environ_dpotential
|
||||
#endif
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -82,26 +86,30 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
& w1(:), w2(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: dvrs_temp(:,:), spsi1(:,:), dvrsc(:,:), &
|
||||
& dvrssc(:), sevc1_new(:,:,:)
|
||||
INTEGER :: nnr_siz
|
||||
!
|
||||
IF (lr_verbosity > 5) THEN
|
||||
WRITE(stdout,'("<lr_apply_liouvillian>")')
|
||||
ENDIF
|
||||
!
|
||||
CALL start_clock('lr_apply')
|
||||
CALL start_clock_gpu('lr_apply')
|
||||
!
|
||||
IF (interaction) CALL start_clock('lr_apply_int')
|
||||
IF (.not.interaction) CALL start_clock('lr_apply_no')
|
||||
IF (interaction) CALL start_clock_gpu('lr_apply_int')
|
||||
IF (.not.interaction) CALL start_clock_gpu('lr_apply_no')
|
||||
!
|
||||
ALLOCATE( d_deeq(nhm, nhm, nat, nspin) )
|
||||
d_deeq(:,:,:,:)=0.0d0
|
||||
!
|
||||
ALLOCATE( spsi1(npwx, nbnd) )
|
||||
ALLOCATE( sevc1_new(npwx*npol,nbnd,nks))
|
||||
!
|
||||
nnr_siz= dffts%nnr
|
||||
!$acc data present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyout(evc1_new(1:npwx*npol,1:nbnd,1:nks)) copyout(sevc1_new(1:npwx*npol,1:nbnd,1:nks)) create(spsi1(1:npwx, 1:nbnd)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1))
|
||||
!
|
||||
d_deeq(:,:,:,:)=0.0d0
|
||||
!$acc kernels
|
||||
spsi1(:,:)=(0.0d0,0.0d0)
|
||||
!
|
||||
ALLOCATE(sevc1_new(npwx*npol,nbnd,nks))
|
||||
sevc1_new(:,:,:) = (0.0d0,0.0d0)
|
||||
!
|
||||
evc1_new(:,:,:) = (0.0d0,0.0d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
IF ( interaction ) THEN
|
||||
!
|
||||
|
@ -232,8 +240,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
! Here we add the two terms:
|
||||
! [H(k) - E(k)] * evc1(k) + dV_HXC * revc0(k)
|
||||
!
|
||||
!$acc host_data use_device(evc1_new, sevc1_new)
|
||||
CALL zaxpy(size_evc,CMPLX(1.0d0,0.0d0,kind=DP),&
|
||||
& evc1_new(:,:,:), 1, sevc1_new(:,:,:), 1)
|
||||
!$acc end host_data
|
||||
!
|
||||
ELSEIF ( interaction .and. ltammd ) THEN
|
||||
!
|
||||
|
@ -244,8 +254,10 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
!
|
||||
! Here evc1_new contains the interaction
|
||||
!
|
||||
!$acc host_data use_device(evc1_new, sevc1_new)
|
||||
CALL zaxpy(size_evc,CMPLX(0.5d0,0.0d0,kind=DP),&
|
||||
& evc1_new(:,:,:) , 1, sevc1_new(:,:,:),1)
|
||||
!$acc end host_data
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
|
@ -256,21 +268,27 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
IF (gstart == 2 .AND. gamma_only ) sevc1_new(1,:,:) = &
|
||||
!
|
||||
IF (gstart == 2 .AND. gamma_only ) THEN
|
||||
!$acc kernels
|
||||
sevc1_new(1,:,:) = &
|
||||
& CMPLX( REAL( sevc1_new(1,:,:), DP ), 0.0d0, DP )
|
||||
!
|
||||
IF (gstart==2 .and. gamma_only) THEN
|
||||
DO ik=1,nks
|
||||
DO ibnd=1,nbnd
|
||||
IF (abs(aimag(sevc1_new(1,ibnd,ik)))>1.0d-12) THEN
|
||||
!
|
||||
CALL errore(' lr_apply_liouvillian ',&
|
||||
'Imaginary part of G=0 '// &
|
||||
'component does not equal zero',1)
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
|
||||
!
|
||||
! IF (gstart==2 .and. gamma_only) THEN
|
||||
! DO ik=1,nks
|
||||
! DO ibnd=1,nbnd
|
||||
! IF (abs(aimag(sevc1_new(1,ibnd,ik)))>1.0d-12) THEN
|
||||
! !
|
||||
! CALL errore(' lr_apply_liouvillian ',&
|
||||
! 'Imaginary part of G=0 '// &
|
||||
! 'component does not equal zero',1)
|
||||
! ENDIF
|
||||
! ENDDO
|
||||
! ENDDO
|
||||
! ENDIF
|
||||
!
|
||||
! Apply the projector on empty states P_c^+.
|
||||
! Note: The projector P_c^+ can be applied only
|
||||
|
@ -289,10 +307,13 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
!
|
||||
CALL orthogonalize(sevc1_new(:,:,ik), evc0(:,:,ik), ik, ik, &
|
||||
& sevc0(:,:,ik), ngk(ik), .true.)
|
||||
!$acc kernels
|
||||
sevc1_new(:,:,ik) = -sevc1_new(:,:,ik)
|
||||
!$acc end kernels
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!
|
||||
! Here we apply the S^{-1} operator.
|
||||
! See equations after Eq.(47) of B. Walker et al., J. Chem. Phys.
|
||||
! 127, 164106 (2007).
|
||||
|
@ -305,16 +326,17 @@ SUBROUTINE lr_apply_liouvillian( evc1, evc1_new, interaction )
|
|||
& sevc1_new(1,1,ik), evc1_new(1,1,ik))
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
IF (allocated(dvrs)) DEALLOCATE(dvrs)
|
||||
IF (allocated(dvrss)) DEALLOCATE(dvrss)
|
||||
DEALLOCATE(d_deeq)
|
||||
DEALLOCATE(spsi1)
|
||||
DEALLOCATE(sevc1_new)
|
||||
!
|
||||
IF (interaction) CALL stop_clock('lr_apply_int')
|
||||
IF (.not.interaction) CALL stop_clock('lr_apply_no')
|
||||
IF (interaction) CALL stop_clock_gpu('lr_apply_int')
|
||||
IF (.not.interaction) CALL stop_clock_gpu('lr_apply_no')
|
||||
!
|
||||
CALL stop_clock('lr_apply')
|
||||
CALL stop_clock_gpu('lr_apply')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
@ -330,13 +352,17 @@ CONTAINS
|
|||
USE mp_global, ONLY : ibnd_start, ibnd_end, inter_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE lr_exx_kernel, ONLY : lr_exx_sum_int
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: becp2(:,:)
|
||||
REAL(DP), ALLOCATABLE :: tg_dvrss(:)
|
||||
INTEGER :: v_siz, incr, ioff
|
||||
INTEGER :: ibnd_start_gamma, ibnd_end_gamma
|
||||
COMPLEX(DP) :: coef
|
||||
INTEGER :: v_siz, incr, ioff, ir, nnr_siz
|
||||
INTEGER :: ibnd_start_gamma, ibnd_end_gamma, ibnd, ngk1
|
||||
!
|
||||
incr = 2
|
||||
!
|
||||
|
@ -350,9 +376,13 @@ CONTAINS
|
|||
! Now apply to the ground state wavefunctions
|
||||
! and convert to real space
|
||||
!
|
||||
nnr_siz= dffts%nnr
|
||||
!$acc data create (psic(1:nnr_siz))
|
||||
!
|
||||
IF ( interaction ) THEN
|
||||
!
|
||||
CALL start_clock('interaction')
|
||||
!
|
||||
CALL start_clock_gpu('interaction')
|
||||
!
|
||||
IF (nkb > 0 .and. okvan) THEN
|
||||
! calculation of becp2
|
||||
|
@ -393,7 +423,7 @@ CONTAINS
|
|||
ENDDO
|
||||
!end: calculation of becp2
|
||||
ENDIF
|
||||
|
||||
!
|
||||
IF ( dffts%has_task_groups ) THEN
|
||||
!
|
||||
v_siz = dffts%nnr_tg
|
||||
|
@ -409,7 +439,9 @@ CONTAINS
|
|||
!
|
||||
! evc1_new is used as a container for the interaction
|
||||
!
|
||||
!$acc kernels
|
||||
evc1_new(:,:,:) = (0.0d0,0.0d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
ibnd_start_gamma = ibnd_start
|
||||
IF (MOD(ibnd_start, 2)==0) ibnd_start_gamma = ibnd_start + 1
|
||||
|
@ -417,8 +449,8 @@ CONTAINS
|
|||
!
|
||||
IF (lr_exx) CALL lr_exx_sum_int()
|
||||
!
|
||||
!$acc enter data copyin(dvrss(1:nnr_siz))
|
||||
DO ibnd = ibnd_start_gamma ,ibnd_end_gamma, incr
|
||||
! DO ibnd = 1,nbnd,2
|
||||
!
|
||||
! Product with the potential vrs = (vltot+vr)
|
||||
! revc0 is on smooth grid. psic is used up to smooth grid
|
||||
|
@ -433,7 +465,9 @@ CONTAINS
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
DO ir = 1,dffts%nnr
|
||||
!DO ir = 1,dffts%nnr
|
||||
!$acc parallel loop
|
||||
DO ir = 1, nnr_siz
|
||||
!
|
||||
psic(ir) = revc0(ir,ibnd,1)*CMPLX(dvrss(ir),0.0d0,DP)
|
||||
!
|
||||
|
@ -504,7 +538,7 @@ CONTAINS
|
|||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Back to reciprocal space
|
||||
|
@ -513,8 +547,12 @@ CONTAINS
|
|||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc exit data delete (dvrss)
|
||||
!
|
||||
#if defined(__MPI)
|
||||
!$acc host_data use_device(evc1_new)
|
||||
CALL mp_sum( evc1_new(:,:,1), inter_bgrp_comm )
|
||||
!$acc end host_data
|
||||
#endif
|
||||
IF (dffts%has_task_groups) DEALLOCATE (tg_dvrss)
|
||||
!
|
||||
|
@ -525,7 +563,7 @@ CONTAINS
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
CALL stop_clock('interaction')
|
||||
CALL stop_clock_gpu('interaction')
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
|
@ -541,11 +579,9 @@ CONTAINS
|
|||
! Compute sevc1_new = H*evc1
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!$acc data copyin(evc1) copyout(sevc1_new)
|
||||
!$acc host_data use_device(evc1, sevc1_new)
|
||||
CALL h_psi_gpu (npwx,ngk(1),nbnd,evc1(1,1,1),sevc1_new(1,1,1))
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
#else
|
||||
CALL h_psi(npwx,ngk(1),nbnd,evc1(1,1,1),sevc1_new(1,1,1))
|
||||
#endif
|
||||
|
@ -562,11 +598,9 @@ CONTAINS
|
|||
ENDDO
|
||||
ELSE
|
||||
#if defined(__CUDA)
|
||||
!$acc data copyin(evc1) copyout(spsi1)
|
||||
!$acc host_data use_device(evc1, spsi1)
|
||||
CALL s_psi_acc (npwx,ngk(1),nbnd,evc1(1,1,1),spsi1)
|
||||
!$acc end host_data
|
||||
!$acc end data
|
||||
#else
|
||||
CALL s_psi(npwx,ngk(1),nbnd,evc1(1,1,1),spsi1)
|
||||
#endif
|
||||
|
@ -576,11 +610,16 @@ CONTAINS
|
|||
!
|
||||
DO ibnd = 1,nbnd
|
||||
!
|
||||
CALL zaxpy(ngk(1), CMPLX(-(et(ibnd,1)-scissor),0.0d0,DP), &
|
||||
& spsi1(:,ibnd), 1, sevc1_new(:,ibnd,1), 1)
|
||||
coef = CMPLX(-(et(ibnd,1)-scissor),0.0d0)
|
||||
!$acc host_data use_device(spsi1, sevc1_new)
|
||||
CALL zaxpy(ngk(1), coef, &
|
||||
& spsi1(1,ibnd), 1, sevc1_new(1,ibnd,1), 1)
|
||||
!$acc end host_data
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
IF ( nkb > 0 .and. okvan ) DEALLOCATE(becp2)
|
||||
!
|
||||
RETURN
|
||||
|
|
|
@ -60,6 +60,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
USE lr_exx_kernel, ONLY : lr_exx_kernel_int, revc_int,&
|
||||
& revc_int_c
|
||||
USE constants, ONLY : eps12
|
||||
USE qpoint, ONLY : nksq
|
||||
USE fft_helper_subroutines
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -75,6 +76,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
INTEGER :: ir, ik, ibnd, jbnd, ig, ijkb0, np, na
|
||||
INTEGER :: ijh,ih,jh,ikb,jkb,is
|
||||
INTEGER :: i, j, k, l
|
||||
INTEGER :: v_siz, nnr_siz, irho
|
||||
REAL(kind=dp) :: w1, w2, scal, rho_sum
|
||||
! These are temporary buffers for the response
|
||||
REAL(kind=dp), ALLOCATABLE :: rho_sum_resp_x(:), rho_sum_resp_y(:),&
|
||||
|
@ -86,13 +88,23 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
WRITE(stdout,'("<lr_calc_dens>")')
|
||||
ENDIF
|
||||
!
|
||||
CALL start_clock('lr_calc_dens')
|
||||
CALL start_clock_gpu('lr_calc_dens')
|
||||
!
|
||||
ALLOCATE( psic(dfftp%nnr) )
|
||||
|
||||
v_siz = dfftp%nnr
|
||||
nnr_siz = dffts%nnr
|
||||
!
|
||||
!$acc data create(psic(1:v_siz)) present_or_copyin(evc1(1:npwx*npol,1:nbnd,1:nks)) present_or_copyin(revc0(1:nnr_siz,1:nbnd,1:nksq)) copyout( rho_1(1:v_siz,1:nspin_mag))
|
||||
!
|
||||
!$acc kernels
|
||||
psic(:) = (0.0d0, 0.0d0)
|
||||
!$acc end kernels
|
||||
!
|
||||
IF (gamma_only) THEN
|
||||
!$acc kernels
|
||||
rho_1(:,:) = 0.0d0
|
||||
!$acc end kernels
|
||||
ELSE
|
||||
rho_1c(:,:) = (0.0d0, 0.0d0)
|
||||
ENDIF
|
||||
|
@ -105,10 +117,17 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
!
|
||||
! If a double grid is used, interpolate onto the fine grid
|
||||
!
|
||||
IF ( doublegrid ) CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
|
||||
IF ( doublegrid ) THEN
|
||||
print *, 'doublegrid', doublegrid
|
||||
!$acc host_data use_device(rho_1(:,1))
|
||||
CALL fft_interpolate(dffts, rho_1(:,1), dfftp, rho_1(:,1))
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
!
|
||||
#if defined(__MPI)
|
||||
!$acc host_data use_device(rho_1)
|
||||
CALL mp_sum(rho_1, inter_bgrp_comm)
|
||||
!$acc end host_data
|
||||
#endif
|
||||
!
|
||||
ELSE
|
||||
|
@ -162,11 +181,11 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
!
|
||||
! The psic workspace can present a memory bottleneck
|
||||
!
|
||||
DEALLOCATE ( psic )
|
||||
!
|
||||
#if defined(__MPI)
|
||||
IF(gamma_only) THEN
|
||||
!$acc host_data use_device(rho_1)
|
||||
CALL mp_sum(rho_1, inter_pool_comm)
|
||||
!$acc end host_data
|
||||
ELSE
|
||||
CALL mp_sum(rho_1c, inter_pool_comm)
|
||||
ENDIF
|
||||
|
@ -179,7 +198,11 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
DO is = 1, nspin_mag
|
||||
!
|
||||
rho_sum = 0.0d0
|
||||
rho_sum = SUM(rho_1(:,is))
|
||||
! rho_sum = SUM(rho_1(:,is))
|
||||
!$acc parallel loop private(rho_sum) copy(rho_sum)
|
||||
do irho = 1, v_siz
|
||||
rho_sum = rho_sum + rho_1(i,is)
|
||||
enddo
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL mp_sum(rho_sum, intra_bgrp_comm )
|
||||
|
@ -213,6 +236,10 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE ( psic )
|
||||
!
|
||||
IF (charge_response == 2 .AND. LR_iteration /=0) THEN
|
||||
!
|
||||
ALLOCATE( rho_sum_resp_x( dfftp%nr1 ) )
|
||||
|
@ -335,7 +362,7 @@ SUBROUTINE lr_calc_dens( evc1, response_calc )
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
CALL stop_clock('lr_calc_dens')
|
||||
CALL stop_clock_gpu('lr_calc_dens')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
|
@ -360,15 +387,18 @@ CONTAINS
|
|||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ibnd_start_gamma, ibnd_end_gamma
|
||||
INTEGER :: ibnd_start_gamma, ibnd_end_gamma, ibnd, ebnd
|
||||
INTEGER :: v_siz, incr, ir3, ioff, ioff_tg, nxyp, idx
|
||||
REAL(DP), ALLOCATABLE :: tg_rho(:)
|
||||
INTEGER :: nnr_siz, pnnr_siz, ir
|
||||
!
|
||||
ibnd_start_gamma = ibnd_start
|
||||
IF (MOD(ibnd_start, 2)==0) ibnd_start_gamma = ibnd_start + 1
|
||||
ibnd_end_gamma = MAX(ibnd_end, ibnd_start_gamma)
|
||||
!
|
||||
incr = 2
|
||||
nnr_siz = dffts%nnr
|
||||
pnnr_siz = dfftp%nnr
|
||||
!
|
||||
IF ( dffts%has_task_groups ) THEN
|
||||
!
|
||||
|
@ -385,6 +415,9 @@ CONTAINS
|
|||
!
|
||||
! FFT: evc1 -> psic
|
||||
!
|
||||
ebnd = ibnd
|
||||
IF ( ibnd < nbnd ) ebnd = ebnd + 1
|
||||
!
|
||||
CALL invfft_orbital_gamma(evc1(:,:,1),ibnd,nbnd)
|
||||
!
|
||||
IF (dffts%has_task_groups) THEN
|
||||
|
@ -446,9 +479,10 @@ CONTAINS
|
|||
! in no way the final response charge density.
|
||||
! The loop is over real space points.
|
||||
!
|
||||
DO ir = 1, dffts%nnr
|
||||
!$acc parallel loop
|
||||
DO ir = 1, nnr_siz
|
||||
rho_1(ir,1) = rho_1(ir,1) &
|
||||
+ 2.0d0*(w1*real(revc0(ir,ibnd,1),dp)*real(psic(ir),dp)&
|
||||
+ 2.0d0*(w1*dble(revc0(ir,ibnd,1))*dble(psic(ir))&
|
||||
+ w2*aimag(revc0(ir,ibnd,1))*aimag(psic(ir)))
|
||||
ENDDO
|
||||
!
|
||||
|
|
|
@ -46,7 +46,6 @@ PROGRAM lr_dav_main
|
|||
LOGICAL, EXTERNAL :: check_gpu_support
|
||||
|
||||
use_gpu = check_gpu_support()
|
||||
if(use_gpu) Call errore('lr_dav_main', 'turbo_davidson with GPU NYI', 1)
|
||||
|
||||
#if defined(__MPI)
|
||||
CALL mp_startup ( )
|
||||
|
@ -97,6 +96,7 @@ PROGRAM lr_dav_main
|
|||
CALL lr_dv_setup()
|
||||
|
||||
! Davidson loop
|
||||
!$acc data copyin(revc0(:,:,:))
|
||||
if (precondition) write(stdout,'(/5x,"Precondition is used in the algorithm,")')
|
||||
do while (.not. dav_conv .and. dav_iter .lt. max_iter)
|
||||
dav_iter=dav_iter+1
|
||||
|
@ -111,13 +111,15 @@ PROGRAM lr_dav_main
|
|||
! Check to see if the wall time limit has been exceeded.
|
||||
if ( check_stop_now() ) then
|
||||
call lr_write_restart_dav()
|
||||
goto 100
|
||||
!! goto 100
|
||||
exit
|
||||
endif
|
||||
!
|
||||
enddo
|
||||
!$acc end data
|
||||
! call check_hermitian()
|
||||
! Extract physical meaning from the solution
|
||||
|
||||
if ( check_stop_now() ) goto 100
|
||||
call interpret_eign('END')
|
||||
! The check_orth at the end may take quite a lot of time in the case of
|
||||
! USPP because we didn't store the S* vector basis. Turn this step on only
|
||||
|
|
|
@ -454,57 +454,135 @@ contains
|
|||
write(stdout,'(7x,"num of basis:",I5,3x,"total built basis:",I5)') num_basis,num_basis_tot
|
||||
|
||||
! Add new matrix elements to the M_C and M_D(in the subspace)(part 1)
|
||||
if (.not.ltammd) then
|
||||
if (.not. okvan) then !norm conserving
|
||||
!$acc data copyin(vec_b(:,:,:,num_basis_old+1:num_basis)) create(vecwork(:,:,:)) copyout(D_vec_b(:,:,:,num_basis_old+1:num_basis), C_vec_b(:,:,:,num_basis_old+1:num_basis))
|
||||
do ibr = num_basis_old+1, num_basis
|
||||
if(.not.ltammd) then
|
||||
! Calculate new D*vec_b
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.false.)
|
||||
if(.not. poor_of_ram2) D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
|
||||
if (.not. poor_of_ram2) THEN
|
||||
!$acc kernels
|
||||
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
!$acc end kernels
|
||||
endif
|
||||
!
|
||||
! Add new M_D
|
||||
do ibl = 1, ibr
|
||||
! Here there's a choice between saving memory and saving calculation
|
||||
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
else ! Less calculation, double memory required
|
||||
M_D(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
endif
|
||||
M_D(ibl,ibr)=lr_dot(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
|
||||
enddo
|
||||
! Calculate new C*vec_b
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
|
||||
if (.not. poor_of_ram2) THEN
|
||||
!$acc kernels
|
||||
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
!$acc end kernels
|
||||
endif
|
||||
!
|
||||
! Add new M_C
|
||||
do ibl = 1, ibr
|
||||
M_C(ibl,ibr)=lr_dot(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
|
||||
enddo
|
||||
!
|
||||
enddo
|
||||
!$acc end data
|
||||
else if(poor_of_ram) then ! Less memory needed
|
||||
do ibr = num_basis_old+1, num_basis
|
||||
! Calculate new D*vec_b
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.false.)
|
||||
if(.not. poor_of_ram2) D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
!
|
||||
! Add new M_D
|
||||
do ibl = 1, ibr
|
||||
! Here there's a choice between saving memory and saving calculation
|
||||
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
|
||||
enddo
|
||||
|
||||
! Calculate new C*vec_b
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
|
||||
if(.not. poor_of_ram2) C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
|
||||
! Add new M_C
|
||||
do ibl = 1, ibr
|
||||
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
M_C(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
else ! Less calculation, double memory required
|
||||
M_C(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
endif
|
||||
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
|
||||
enddo
|
||||
|
||||
else ! ltammd
|
||||
enddo
|
||||
else !uspp
|
||||
do ibr = num_basis_old+1, num_basis
|
||||
! Calculate new D*vec_b
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.false.)
|
||||
if(.not. poor_of_ram2) D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
!
|
||||
! Add new M_D
|
||||
do ibl = 1, ibr
|
||||
! Here there's a choice between saving memory and saving calculation
|
||||
M_D(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
|
||||
enddo
|
||||
! Calculate new C*vec_b
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
|
||||
if(.not. poor_of_ram2) then
|
||||
if(.not. poor_of_ram2) C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
! Add new M_C
|
||||
do ibl = 1, ibr
|
||||
M_C(ibl,ibr)=lr_dot(svec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
|
||||
enddo
|
||||
enddo
|
||||
endif
|
||||
else
|
||||
if (.not. okvan) then
|
||||
!$acc data copyin(vec_b(:,:,:,num_basis_old+1:num_basis)) create(vecwork(:,:,:)) copyout(D_vec_b(:,:,:,num_basis_old+1:num_basis), C_vec_b(:,:,:,num_basis_old+1:num_basis))
|
||||
do ibr = num_basis_old+1, num_basis
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
|
||||
if (.not. poor_of_ram2) then
|
||||
!$acc kernels
|
||||
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
!$acc end kernels
|
||||
endif
|
||||
|
||||
! Add new M_D, M_C
|
||||
do ibl = 1, ibr
|
||||
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
else ! Less calculation, double memory required
|
||||
M_D(ibl,ibr)=lr_dot_us(svec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
endif
|
||||
M_D(ibl,ibr)=lr_dot(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
|
||||
M_C(ibl,ibr)=M_D(ibl,ibr)
|
||||
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
|
||||
enddo
|
||||
endif
|
||||
enddo
|
||||
!$acc end data
|
||||
elseif (poor_of_ram) then ! Less memory needed
|
||||
do ibr = num_basis_old+1, num_basis
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
|
||||
if (.not. poor_of_ram2) then
|
||||
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
endif
|
||||
! Add new M_D, M_C
|
||||
do ibl = 1, ibr
|
||||
M_D(ibl,ibr)=lr_dot_us(vec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
|
||||
M_C(ibl,ibr)=M_D(ibl,ibr)
|
||||
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
|
||||
enddo
|
||||
enddo
|
||||
else ! Less calculation, double memory required
|
||||
do ibr = num_basis_old+1, num_basis
|
||||
call lr_apply_liouvillian(vec_b(:,:,:,ibr),vecwork(:,:,:),.true.)
|
||||
if (.not. poor_of_ram2) then
|
||||
D_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
C_vec_b(:,:,:,ibr)=vecwork(:,:,:)
|
||||
endif
|
||||
! Add new M_D, M_C
|
||||
do ibl = 1, ibr
|
||||
M_D(ibl,ibr)=lr_dot_us(svec_b(1,1,1,ibl),vecwork(1,1,1))
|
||||
if(ibl /= ibr) M_D(ibr,ibl)=M_D(ibl,ibr)
|
||||
M_C(ibl,ibr)=M_D(ibl,ibr)
|
||||
if(ibl /= ibr) M_C(ibr,ibl)=M_C(ibl,ibr)
|
||||
enddo
|
||||
enddo
|
||||
endif
|
||||
endif
|
||||
|
||||
call solve_M_DC()
|
||||
|
||||
|
@ -654,11 +732,13 @@ contains
|
|||
use kinds, only : dp
|
||||
use io_global, only : stdout
|
||||
use wvfct, only : nbnd, npwx
|
||||
use uspp, only : okvan
|
||||
use lr_dav_debug
|
||||
use lr_us
|
||||
|
||||
implicit none
|
||||
integer :: ieign, flag,ibr
|
||||
complex(kind=dp),external :: lr_dot
|
||||
integer :: ieign, flag, ibr
|
||||
logical :: discharged
|
||||
complex(kind=dp) :: temp(npwx,nbnd)
|
||||
|
||||
|
@ -672,20 +752,28 @@ contains
|
|||
|
||||
call start_clock("calc_residue")
|
||||
|
||||
!$acc data copyout(left_res(:,:,:,:), right_res(:,:,:,:)) copyin(right_M(:,:), left_M(:,:), C_vec_b(:,:,:,:), D_vec_b(:,:,:,:), left_full(:,:,:,:), right_full(:,:,:,:))
|
||||
do ieign = 1, num_eign
|
||||
if(poor_of_ram2) then ! If D_ C_ basis are not stored, we have to apply liouvillian again
|
||||
|
||||
if (poor_of_ram2) then ! If D_ C_ basis are not stored, we have to apply liouvillian again
|
||||
call lr_apply_liouvillian(right_full(:,:,:,ieign),right_res(:,:,:,ieign),.true.) ! Apply lanczos
|
||||
call lr_apply_liouvillian(left_full(:,:,:,ieign),left_res(:,:,:,ieign),.false.)
|
||||
else ! Otherwise they are be recovered directly by the combination of C_ and D_ basis
|
||||
!$acc kernels
|
||||
left_res(:,:,:,ieign)=0.0d0
|
||||
right_res(:,:,:,ieign)=0.0d0
|
||||
!$acc end kernels
|
||||
do ibr = 1, num_basis
|
||||
!$acc kernels async
|
||||
right_res(:,:,1,ieign)=right_res(:,:,1,ieign)+right_M(ibr,eign_value_order(ieign))*C_vec_b(:,:,1,ibr)
|
||||
left_res(:,:,1,ieign)=left_res(:,:,1,ieign)+left_M(ibr,eign_value_order(ieign))*D_vec_b(:,:,1,ibr)
|
||||
!$acc end kernels
|
||||
enddo
|
||||
!$acc wait
|
||||
endif
|
||||
|
||||
! The reason of using this method
|
||||
|
||||
call lr_1to1orth(right_res(1,1,1,ieign),left_full(1,1,1,ieign))
|
||||
call lr_1to1orth(left_res(1,1,1,ieign),right_full(1,1,1,ieign))
|
||||
! Instead of this will be explained in the document
|
||||
|
@ -693,37 +781,47 @@ contains
|
|||
! left_res(:,:,:,ieign)=left_res(:,:,:,ieign)-sqrt(eign_value(eign_value_order(ieign),1))*right_full(:,:,:,ieign)
|
||||
|
||||
! Update kill_r/l
|
||||
if (okvan) then
|
||||
right2(ieign)=lr_dot_us(right_res(1,1,1,ieign),right_res(1,1,1,ieign))
|
||||
else
|
||||
right2(ieign)=lr_dot(right_res(1,1,1,ieign),right_res(1,1,1,ieign))
|
||||
endif
|
||||
if (abs(aimag(right2(ieign))) .gt. zero .or. dble(right2(ieign)) .lt. 0.0D0) then
|
||||
write(stdout,'(7x,"Warning! Wanging! the residue is weird.")')
|
||||
endif
|
||||
if( dble(right2(ieign)) .lt. residue_conv_thr ) then
|
||||
if (dble(right2(ieign)) .lt. residue_conv_thr ) then
|
||||
kill_right(ieign)=.true.
|
||||
toadd=toadd-1
|
||||
endif
|
||||
if( dble(right2(ieign)) .gt. max_res ) max_res = dble(right2(ieign))
|
||||
|
||||
if (okvan) then
|
||||
left2(ieign)=lr_dot_us(left_res(1,1,1,ieign),left_res(1,1,1,ieign))
|
||||
else
|
||||
left2(ieign)=lr_dot(left_res(1,1,1,ieign),left_res(1,1,1,ieign))
|
||||
endif
|
||||
|
||||
if (abs(aimag(left2(ieign))) .gt. zero .or. dble(left2(ieign)) .lt. 0.0D0) then
|
||||
write(stdout,'(7x,"Warning! Wanging! the residue is weird.")')
|
||||
endif
|
||||
if( dble(left2(ieign)) .lt. residue_conv_thr ) then
|
||||
if (dble(left2(ieign)) .lt. residue_conv_thr ) then
|
||||
kill_left(ieign)=.true.
|
||||
toadd=toadd-1
|
||||
endif
|
||||
if( dble(left2(ieign)) .gt. max_res ) max_res = dble(left2(ieign))
|
||||
if(dble(left2(ieign)) .gt. max_res ) max_res = dble(left2(ieign))
|
||||
|
||||
write (stdout,'(5x,"Residue(Squared modulus):",I5,2x,2F15.7)') ieign, &
|
||||
dble(right2(ieign)), dble(left2(ieign))
|
||||
enddo
|
||||
!$acc end data
|
||||
|
||||
write(stdout,'(7x,"Largest residue:",5x,F20.12)') max_res
|
||||
if(max_res .lt. residue_conv_thr) dav_conv=.true.
|
||||
call stop_clock("calc_residue")
|
||||
|
||||
! Forseen that the number of basis will be too large
|
||||
if(num_basis+toadd .gt. num_basis_max) then
|
||||
if(discharged) &
|
||||
if (num_basis+toadd .gt. num_basis_max) then
|
||||
if (discharged) &
|
||||
call errore('lr_discharge',"The num_basis_max is too small that even discharge &
|
||||
& cannot work. Please increase its value in the input.",1)
|
||||
discharged = .true.
|
||||
|
@ -745,6 +843,7 @@ contains
|
|||
use uspp, only : okvan
|
||||
use lr_us
|
||||
use lr_dav_debug
|
||||
use wvfct, only : npwx
|
||||
|
||||
implicit none
|
||||
integer :: ieign, flag
|
||||
|
@ -756,24 +855,25 @@ contains
|
|||
|
||||
if (precondition) then
|
||||
do ieign = 1, num_eign
|
||||
if(.not. kill_left(ieign)) then
|
||||
if (.not. kill_left(ieign)) then
|
||||
call treat_residue(left_res(:,:,1,ieign),ieign)
|
||||
call lr_norm(left_res(1,1,1,ieign))
|
||||
call lr_ortho(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
|
||||
call orthogonalize(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
|
||||
call lr_norm(left_res(1,1,1,ieign))
|
||||
endif
|
||||
|
||||
if(.not. kill_right(ieign)) then
|
||||
if (.not. kill_right(ieign)) then
|
||||
call treat_residue(right_res(:,:,1,ieign),ieign)
|
||||
call lr_norm(right_res(1,1,1,ieign))
|
||||
call lr_ortho(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
|
||||
call lr_norm(left_res(1,1,1,ieign))
|
||||
call orthogonalize(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
|
||||
call lr_norm(right_res(1,1,1,ieign))
|
||||
endif
|
||||
enddo
|
||||
endif
|
||||
|
||||
! Here mGS are called three times and lr_ortho is called once for increasing
|
||||
! Here mGS are called three times and orthogonalize is called once for increasing
|
||||
! numerical stability of orthonalization
|
||||
!$acc data copy(left_res(:,:,:,:), right_res(:,:,:,:), vec_b(:,:,:,:))
|
||||
call lr_mGS_orth() ! 1st
|
||||
call lr_mGS_orth_pp()
|
||||
call lr_mGS_orth() ! 2nd
|
||||
|
@ -782,20 +882,19 @@ contains
|
|||
call lr_mGS_orth_pp()
|
||||
|
||||
do ieign = 1, num_eign
|
||||
call lr_ortho(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
|
||||
call lr_ortho(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
|
||||
call orthogonalize(right_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
|
||||
call orthogonalize(left_res(:,:,:,ieign), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
|
||||
call lr_norm(right_res(:,:,:,ieign))
|
||||
call lr_norm(left_res(:,:,:,ieign))
|
||||
enddo
|
||||
|
||||
if(toadd .eq. 0) then
|
||||
!
|
||||
if (toadd .eq. 0) then
|
||||
write(stdout,'("TOADD is zero !!")')
|
||||
dav_conv=.true.
|
||||
return
|
||||
go to 10
|
||||
endif
|
||||
|
||||
!if( dav_iter .gt. max_iter .or. num_basis+toadd .gt. num_basis_max ) then
|
||||
if( dav_iter .gt. max_iter ) then
|
||||
!
|
||||
if ( dav_iter .gt. max_iter ) then
|
||||
write(stdout,'(/5x,"!!!! We have arrived maximum number of iterations. We have to stop &
|
||||
&here, and the result will not be trustable !!!!! ")')
|
||||
dav_conv=.true.
|
||||
|
@ -804,29 +903,35 @@ contains
|
|||
num_basis_tot=num_basis_tot+toadd
|
||||
! Expand the basis
|
||||
do ieign = 1, num_eign
|
||||
if(.not. kill_left(ieign)) then
|
||||
if (.not. kill_left(ieign)) then
|
||||
num_basis=num_basis+1
|
||||
!$acc kernels
|
||||
vec_b(:,:,:,num_basis)=left_res(:,:,:,ieign)
|
||||
if(.not. poor_of_ram .and. okvan) &
|
||||
!$acc end kernels
|
||||
if (.not. poor_of_ram .and. okvan) &
|
||||
call lr_apply_s(vec_b(:,:,:,num_basis),svec_b(:,:,:,num_basis))
|
||||
endif
|
||||
if(.not. kill_right(ieign)) then
|
||||
if (.not. kill_right(ieign)) then
|
||||
num_basis=num_basis+1
|
||||
!$acc kernels
|
||||
vec_b(:,:,:,num_basis)=right_res(:,:,:,ieign)
|
||||
if(.not. poor_of_ram .and. okvan) &
|
||||
!$acc end kernels
|
||||
if (.not. poor_of_ram .and. okvan) &
|
||||
call lr_apply_s(vec_b(:,:,:,num_basis),svec_b(:,:,:,num_basis))
|
||||
endif
|
||||
enddo
|
||||
endif
|
||||
if(conv_assistant) then
|
||||
if(max_res .lt. 10*residue_conv_thr .and. .not. ploted(1)) then
|
||||
!
|
||||
10 call stop_clock("expan_basis")
|
||||
!$acc end data
|
||||
!
|
||||
if (conv_assistant) then
|
||||
if (max_res .lt. 10*residue_conv_thr .and. .not. ploted(1)) then
|
||||
call interpret_eign('10')
|
||||
ploted(1)=.true.
|
||||
endif
|
||||
endif
|
||||
|
||||
call stop_clock("expan_basis")
|
||||
|
||||
!
|
||||
return
|
||||
end subroutine dav_expan_basis
|
||||
!-------------------------------------------------------------------------------
|
||||
|
@ -847,26 +952,35 @@ contains
|
|||
real(dp) :: temp
|
||||
|
||||
call start_clock("mGS_orth")
|
||||
|
||||
!$acc data present_or_copy(left_res(:,:,:,:), right_res(:,:,:,:)) present_or_copyin(vec_b(:,:,:,:))
|
||||
|
||||
! first orthogonalize to old basis
|
||||
do ib = 1, num_basis
|
||||
do ieign = 1, num_eign
|
||||
if (.not. kill_left(ieign)) then
|
||||
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
if (poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
do ib = 1, num_basis
|
||||
call lr_1to1orth(left_res(1,1,1,ieign),vec_b(1,1,1,ib))
|
||||
enddo
|
||||
else ! Less calculation, double memory required
|
||||
do ib = 1, num_basis
|
||||
call lr_bi_1to1orth(left_res(1,1,1,ieign),vec_b(1,1,1,ib),svec_b(1,1,1,ib))
|
||||
enddo
|
||||
endif
|
||||
endif
|
||||
if (.not. kill_right(ieign)) then
|
||||
if(poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
if (poor_of_ram .or. .not. okvan) then ! Less memory needed
|
||||
do ib = 1, num_basis
|
||||
call lr_1to1orth(right_res(1,1,1,ieign),vec_b(1,1,1,ib))
|
||||
else ! Less calculation, double memory required
|
||||
enddo
|
||||
else
|
||||
do ib = 1, num_basis
|
||||
call lr_bi_1to1orth(right_res(1,1,1,ieign),vec_b(1,1,1,ib),svec_b(1,1,1,ib))
|
||||
enddo
|
||||
endif
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
|
||||
!
|
||||
! orthogonalize between new basis themselves
|
||||
do ieign = 1, num_eign
|
||||
if (.not. kill_left(ieign) .and. .not. kill_right(ieign)) then
|
||||
|
@ -886,6 +1000,7 @@ contains
|
|||
call lr_1to1orth(right_res(1,1,1,ieign2),right_res(1,1,1,ieign))
|
||||
enddo
|
||||
enddo
|
||||
!$acc end data
|
||||
call stop_clock("mGS_orth")
|
||||
return
|
||||
end subroutine lr_mGS_orth
|
||||
|
@ -901,18 +1016,25 @@ contains
|
|||
use klist, only : nks
|
||||
use wvfct, only : npwx,nbnd
|
||||
use io_global, only : stdout
|
||||
use uspp, only : okvan
|
||||
use lr_dav_variables
|
||||
use lr_us
|
||||
|
||||
implicit none
|
||||
complex(kind=dp),external :: lr_dot
|
||||
integer :: ieign,ia
|
||||
real(dp) :: norm_res
|
||||
|
||||
call start_clock("mGS_orth_pp")
|
||||
!$acc data present_or_copy(left_res(:,:,:,:), right_res(:,:,:,:))
|
||||
do ieign = 1, num_eign
|
||||
if(.not. kill_left(ieign)) then
|
||||
if (.not. kill_left(ieign)) then
|
||||
if (okvan) then
|
||||
norm_res = dble(lr_dot_us(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
|
||||
if(norm_res .lt. residue_conv_thr) then
|
||||
else
|
||||
norm_res = dble(lr_dot(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
|
||||
endif
|
||||
if (norm_res .lt. residue_conv_thr) then
|
||||
kill_left(ieign) = .true.
|
||||
write(stdout,'("One residue is eliminated:",5x,E20.12)') norm_res
|
||||
toadd=toadd-1
|
||||
|
@ -921,9 +1043,13 @@ contains
|
|||
endif
|
||||
endif
|
||||
|
||||
if(.not. kill_right(ieign)) then
|
||||
if (.not. kill_right(ieign)) then
|
||||
if (okvan) then
|
||||
norm_res = dble(lr_dot_us(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
|
||||
if(norm_res .lt. residue_conv_thr) then
|
||||
else
|
||||
norm_res = dble(lr_dot(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
|
||||
endif
|
||||
if (norm_res .lt. residue_conv_thr) then
|
||||
kill_right(ieign) = .true.
|
||||
write(stdout,'("One residue is eliminated:",5x,E20.12)') norm_res
|
||||
toadd=toadd-1
|
||||
|
@ -932,6 +1058,8 @@ contains
|
|||
endif
|
||||
endif
|
||||
enddo
|
||||
!$acc end data
|
||||
|
||||
call stop_clock("mGS_orth_pp")
|
||||
return
|
||||
end subroutine lr_mGS_orth_pp
|
||||
|
@ -947,14 +1075,26 @@ contains
|
|||
use klist, only : nks
|
||||
use wvfct, only : npwx,nbnd
|
||||
use lr_us
|
||||
use uspp, only : okvan
|
||||
|
||||
implicit none
|
||||
complex(dp) :: vect(npwx,nbnd,nks),svect(npwx,nbnd,nks)
|
||||
complex(dp) :: vect(npwx,nbnd,nks)
|
||||
complex(kind=dp),external :: lr_dot
|
||||
real(dp) :: temp
|
||||
|
||||
!
|
||||
!$acc data present_or_copy(vect(:,:,:))
|
||||
!
|
||||
if (okvan) then
|
||||
temp=dble(lr_dot_us(vect(1,1,1),vect(1,1,1)))
|
||||
else
|
||||
temp=dble(lr_dot(vect(1,1,1),vect(1,1,1)))
|
||||
endif
|
||||
!$acc kernels copyin(temp)
|
||||
vect(:,:,:)=vect(:,:,:)/sqrt(temp)
|
||||
|
||||
!$acc end kernels
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
return
|
||||
end subroutine lr_norm
|
||||
!-------------------------------------------------------------------------------
|
||||
|
@ -970,11 +1110,33 @@ contains
|
|||
use klist, only : nks
|
||||
use wvfct, only : npwx,nbnd
|
||||
use lr_us
|
||||
use uspp, only : okvan
|
||||
|
||||
implicit none
|
||||
complex(kind=dp),external :: lr_dot
|
||||
complex(dp) :: vect1(npwx,nbnd,nks),vect2(npwx,nbnd,nks)
|
||||
|
||||
vect1(:,:,1)=vect1(:,:,1)-(lr_dot_us(vect1(1,1,1),vect2(1,1,1))/lr_dot_us(vect2(1,1,1),vect2(1,1,1)))*vect2(:,:,1)
|
||||
complex(dp) :: temp_dot1, temp_dot2, temp_dot
|
||||
!
|
||||
!$acc data present_or_copy(vect1(:,:,:)) present_or_copyin(vect2(:,:,:))
|
||||
!
|
||||
if (okvan) then
|
||||
temp_dot1 = lr_dot_us(vect1(1,1,1),vect2(1,1,1))
|
||||
temp_dot2 = lr_dot_us(vect2(1,1,1),vect2(1,1,1))
|
||||
temp_dot = temp_dot1/temp_dot2
|
||||
!$acc kernels copyin(temp_dot)
|
||||
vect1(:,:,1)=vect1(:,:,1)-temp_dot*vect2(:,:,1)
|
||||
!$acc end kernels
|
||||
else
|
||||
temp_dot1 = lr_dot(vect1(1,1,1),vect2(1,1,1))
|
||||
temp_dot2 = lr_dot(vect2(1,1,1),vect2(1,1,1))
|
||||
temp_dot = temp_dot1/temp_dot2
|
||||
!$acc kernels copyin(temp_dot)
|
||||
vect1(:,:,1)=vect1(:,:,1)-temp_dot*vect2(:,:,1)
|
||||
!$acc end kernels
|
||||
endif
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
return
|
||||
end subroutine lr_1to1orth
|
||||
!-------------------------------------------------------------------------------
|
||||
|
@ -993,8 +1155,17 @@ contains
|
|||
implicit none
|
||||
complex(kind=dp),external :: lr_dot
|
||||
complex(dp) :: vect1(npwx,nbnd,nks),vect2(npwx,nbnd,nks),svect2(npwx,nbnd,nks)
|
||||
|
||||
vect1(:,:,1)=vect1(:,:,1)-(lr_dot(svect2(1,1,1),vect1(1,1,1))/lr_dot(svect2(1,1,1),vect2(1,1,1)))*vect2(:,:,1)
|
||||
complex(dp) :: temp_dot1, temp_dot2, temp_dot
|
||||
!
|
||||
!$acc data present_or_copy(vect1(:,:,:)) present_or_copyin(svect2(:,:,:), vect2(:,:,:))
|
||||
temp_dot1 = lr_dot(svect2(1,1,1),vect1(1,1,1))
|
||||
temp_dot2 = lr_dot(svect2(1,1,1),vect2(1,1,1))
|
||||
temp_dot = temp_dot1/temp_dot2
|
||||
!
|
||||
!$acc kernels copyin(temp_dot)
|
||||
vect1(:,:,1)=vect1(:,:,1)-temp_dot*vect2(:,:,1)
|
||||
!$acc end kernels
|
||||
!$acc end data
|
||||
return
|
||||
end subroutine lr_bi_1to1orth
|
||||
!-------------------------------------------------------------------------------
|
||||
|
@ -1045,18 +1216,20 @@ contains
|
|||
use io_global, only : stdout,ionode,ionode_id
|
||||
use mp, only : mp_bcast,mp_barrier
|
||||
use mp_world, only : world_comm
|
||||
use uspp, only : okvan
|
||||
use lr_us
|
||||
|
||||
implicit none
|
||||
complex(dp), external :: lr_dot
|
||||
character(len=*) :: message
|
||||
integer :: ieign, ia,ic,iv,ipol
|
||||
real(kind=dp), external :: ddot
|
||||
real(dp) :: norm, normx, normy, alpha, shouldbe1,temp
|
||||
real(dp) :: C_right_M(num_basis_max),D_left_M(num_basis_max)
|
||||
|
||||
if(.not. allocated(norm_F)) allocate(norm_F(num_eign))
|
||||
if (.not. allocated(norm_F)) allocate(norm_F(num_eign))
|
||||
|
||||
if(message=="END") then
|
||||
if (message=="END") then
|
||||
write(stdout,'(/7x,"================================================================")')
|
||||
write(stdout,'(/7x,"Davidson diagonalization has finished in",I5," steps.")') dav_iter
|
||||
write(stdout,'(10x,"the number of current basis is",I5)') num_basis
|
||||
|
@ -1065,20 +1238,20 @@ contains
|
|||
write(stdout,'(/7x,"Now print out information of eigenstates")')
|
||||
endif
|
||||
|
||||
if(message=="10")&
|
||||
if (message=="10")&
|
||||
write(stdout,'(/7x,"Quasi convergence(10*residue_conv_thr) has been arrived in",I5," steps.")') dav_iter
|
||||
|
||||
if(.not. done_calc_R) then
|
||||
if (.not. done_calc_R) then
|
||||
call lr_calc_R()
|
||||
done_calc_R=.true.
|
||||
endif
|
||||
|
||||
! Print out Oscilation strength
|
||||
if(message == "END") then
|
||||
if (message == "END") then
|
||||
write(stdout,'(/,/5x,"K-S Oscillator strengths")')
|
||||
write(stdout,'(5x,"occ",1x,"con",8x,"R-x",14x,"R-y",14x,"R-z")')
|
||||
do iv=nbnd-p_nbnd_occ+1, nbnd
|
||||
do ic=1,p_nbnd_virt
|
||||
do iv = nbnd-p_nbnd_occ+1, nbnd
|
||||
do ic = 1,p_nbnd_virt
|
||||
write(stdout,'(5x,i3,1x,i3,3x,E16.8,2X,E16.8,2X,E16.8)') &
|
||||
&iv,ic,dble(R(iv,ic,1)),dble(R(iv,ic,2)),dble(R(iv,ic,3))
|
||||
enddo
|
||||
|
@ -1089,10 +1262,10 @@ contains
|
|||
do ieign = 1, num_eign
|
||||
#if defined(__MPI)
|
||||
! This part is calculated in serial
|
||||
if(ionode) then
|
||||
if (ionode) then
|
||||
#endif
|
||||
ia = eign_value_order(ieign)
|
||||
if(message=="END")&
|
||||
if (message=="END")&
|
||||
write(stdout,'(/7x,"! The",I5,1x,"-th eigen state. The transition&
|
||||
& energy is: ", 5x, F12.8)') ieign, tr_energy(ia)
|
||||
! Please see Documentation for the explaination of the next four steps
|
||||
|
@ -1102,23 +1275,23 @@ contains
|
|||
&num_basis_max,right_M(:,ia),1,(0.0D0,0.0D0),C_right_M,1)
|
||||
omegar(ieign)=ddot(num_basis,left_M(:,ia),1,left_M(:,ia),1)
|
||||
omegar(ieign)=ddot(num_basis,C_right_M,1,left_M(:,ia),1)
|
||||
|
||||
!
|
||||
! Apply D to the left eigen state in order to calculate the left omega
|
||||
call dgemv('N',num_basis,num_basis,(1.0D0,0.0D0),dble(M_D),&
|
||||
&num_basis_max,left_M(:,ia),1,(0.0D0,0.0D0),D_left_M,1)
|
||||
omegal(ieign)=ddot(num_basis,right_M(:,ia),1,right_M(:,ia),1)
|
||||
omegal(ieign)=ddot(num_basis,D_left_M,1,right_M(:,ia),1)
|
||||
|
||||
if(abs(omegal(ieign)*omegar(ieign)-tr_energy(ia)**2) .gt. zero) then
|
||||
!
|
||||
if (abs(omegal(ieign)*omegar(ieign)-tr_energy(ia)**2) .gt. zero) then
|
||||
write(stdout,'(/5x,"Warning !!! : The interpretation of the eigenstates may have problems.")')
|
||||
write(stdout,'(10x,"omegal*omegar = ",F20.12,10x,"omega^2 = ",F20.12)') &
|
||||
&omegal(ieign)*omegar(ieign),tr_energy(ia)**2
|
||||
endif
|
||||
|
||||
!
|
||||
! scale right_full and left_full in order to make omegar = omegal
|
||||
omegar(ieign)=sqrt(dble(omegar(ieign)))
|
||||
omegal(ieign)=sqrt(dble(omegal(ieign)))
|
||||
|
||||
!
|
||||
#if defined(__MPI)
|
||||
endif
|
||||
call mp_barrier(world_comm)
|
||||
|
@ -1130,7 +1303,11 @@ contains
|
|||
left_full(:,:,:,ieign)=left_full(:,:,:,ieign)/dble(omegal(ieign))
|
||||
|
||||
! Normalize the vector
|
||||
if (okvan) then
|
||||
norm=2.0d0*dble(lr_dot_us(right_full(1,1,1,ieign),left_full(1,1,1,ieign)))
|
||||
else
|
||||
norm=2.0d0*dble(lr_dot(right_full(1,1,1,ieign),left_full(1,1,1,ieign)))
|
||||
endif
|
||||
norm=sqrt(abs(norm))
|
||||
|
||||
right_full(:,:,:,ieign)=right_full(:,:,:,ieign)/norm
|
||||
|
@ -1140,11 +1317,16 @@ contains
|
|||
left_res(:,:,:,ieign)=(right_full(:,:,:,ieign)+left_full(:,:,:,ieign))/sqrt(2.0d0) !X
|
||||
right_res(:,:,:,ieign)=(right_full(:,:,:,ieign)-left_full(:,:,:,ieign))/sqrt(2.0d0) !Y
|
||||
|
||||
if (okvan) then
|
||||
normx=dble(lr_dot_us(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
|
||||
normy=-dble(lr_dot_us(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
|
||||
else
|
||||
normx=dble(lr_dot(left_res(1,1,1,ieign),left_res(1,1,1,ieign)))
|
||||
normy=-dble(lr_dot(right_res(1,1,1,ieign),right_res(1,1,1,ieign)))
|
||||
endif
|
||||
norm_F(ieign)=normx+normy !! Actually norm_F should always be one since it was previously normalized
|
||||
|
||||
if(message=="END") then
|
||||
if (message=="END") then
|
||||
write(stdout,'(/5x,"The two digitals below indicate the importance of doing beyong TDA: ")')
|
||||
write(stdout,'(/5x,"Components: X",2x,F12.5,";",4x,"Y",2x,F12.5)') &
|
||||
normx/norm_F(ieign),normy/norm_F(ieign)
|
||||
|
@ -1160,14 +1342,14 @@ contains
|
|||
normy=normy-Fy(iv,ic)*Fy(iv,ic)
|
||||
enddo
|
||||
enddo
|
||||
if( normx+normy .lt. 0.0d0 ) then
|
||||
if ( normx+normy .lt. 0.0d0 ) then
|
||||
normx=-normx
|
||||
normy=-normy
|
||||
endif
|
||||
|
||||
if(message=="END") then
|
||||
if (message=="END") then
|
||||
write (stdout,'(/5x,"In the occ-virt project subspace the total Fxy is:")')
|
||||
write(stdout,'(/5x,"X",2x,F12.5,";",4x,"Y",2x,F12.5,4x,"total",2x,F12.5,2x,&
|
||||
write (stdout,'(/5x,"X",2x,F12.5,";",4x,"Y",2x,F12.5,4x,"total",2x,F12.5,2x,&
|
||||
&"/ ",F12.5)') normx,normy,normx+normy,norm_F(ieign)
|
||||
endif
|
||||
|
||||
|
@ -1178,7 +1360,7 @@ contains
|
|||
|
||||
total_chi(ieign)=chi_dav(1,ieign)+chi_dav(2,ieign)+chi_dav(3,ieign)
|
||||
|
||||
if(message=="END") then
|
||||
if (message=="END") then
|
||||
write (stdout,'(/5x,"The Chi_i_i is",5x,"Total",10x,"1",15x,"2",15x,"3")')
|
||||
write (stdout,'(/12x,8x,E15.8,3x,E15.8,3x,E15.8,3x,E15.8)') total_chi(ieign),&
|
||||
&chi_dav(1,ieign),chi_dav(2,ieign), chi_dav(3,ieign)
|
||||
|
@ -1200,7 +1382,7 @@ contains
|
|||
enddo
|
||||
|
||||
#if defined(__MPI)
|
||||
if(ionode) then
|
||||
if (ionode) then
|
||||
#endif
|
||||
call write_eigenvalues(message)
|
||||
call write_spectrum(message)
|
||||
|
@ -1219,18 +1401,32 @@ contains
|
|||
use lr_dav_variables
|
||||
use lr_variables, only : d0psi
|
||||
use kinds, only : dp
|
||||
use uspp, only : okvan
|
||||
use lr_us
|
||||
|
||||
implicit none
|
||||
character :: flag_calc
|
||||
integer :: ieign,ipol
|
||||
|
||||
if( flag_calc .eq. "X" ) then
|
||||
complex(kind=dp), external :: lr_dot
|
||||
!
|
||||
if (okvan) then
|
||||
if ( flag_calc .eq. "X" ) then
|
||||
dav_calc_chi=lr_dot_us(d0psi(:,:,1,ipol), left_res(:,:,1,ieign))
|
||||
else if (flag_calc .eq. "Y") then
|
||||
dav_calc_chi=lr_dot_us(d0psi(:,:,1,ipol), right_res(:,:,1,ieign))
|
||||
endif
|
||||
|
||||
else
|
||||
if ( flag_calc .eq. "X" ) then
|
||||
!$acc data copyin(d0psi(:,:,1,ipol), left_res(:,:,1,ieign))
|
||||
dav_calc_chi=lr_dot(d0psi(:,:,1,ipol), left_res(:,:,1,ieign))
|
||||
!$acc end data
|
||||
else if (flag_calc .eq. "Y") then
|
||||
!$acc data copyin(d0psi(:,:,1,ipol), right_res(:,:,1,ieign))
|
||||
dav_calc_chi=lr_dot(d0psi(:,:,1,ipol), right_res(:,:,1,ieign))
|
||||
!$acc end data
|
||||
endif
|
||||
endif
|
||||
!
|
||||
return
|
||||
end function dav_calc_chi
|
||||
!-------------------------------------------------------------------------------
|
||||
|
@ -1502,7 +1698,7 @@ contains
|
|||
|
||||
! Orthogonalize to occupied states
|
||||
do ib = 1, num_init
|
||||
call lr_ortho(vec_b(:,:,:,ib), evc0(:,:,1), 1, 1,sevc0(:,:,1),.true.)
|
||||
call orthogonalize(vec_b(:,:,:,ib), evc0(:,:,1), 1, 1,sevc0(:,:,1),npwx,.true.)
|
||||
call lr_norm(vec_b(1,1,1,ib))
|
||||
enddo
|
||||
|
||||
|
|
|
@ -301,7 +301,7 @@ SUBROUTINE one_lanczos_step()
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
! X. Ge: To increase the stability, apply lr_ortho.
|
||||
! X. Ge: To increase the stability, apply orthogonalize.
|
||||
! I.Timrov: Actually, without this trick, it turns out that
|
||||
! the Lanczos chain is not stable when pseudo_hermitian=.false.,
|
||||
! because there is a warning from lr_calc_dens that the integral of
|
||||
|
@ -311,9 +311,11 @@ SUBROUTINE one_lanczos_step()
|
|||
IF (.not.eels .and. .not. magnons) THEN
|
||||
!
|
||||
DO ik=1, nks
|
||||
! CALL orthogonalize(evc1(:,:,ik,1), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),npwx,.true.)
|
||||
CALL lr_ortho(evc1(:,:,ik,1), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
|
||||
IF (.not. pseudo_hermitian) &
|
||||
CALL lr_ortho(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),.true.)
|
||||
! CALL orthogonalize(evc1(:,:,ik,2), evc0(:,:,ik), ik, ik, sevc0(:,:,ik),npwx,.true.)
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
|
|
|
@ -355,6 +355,21 @@ SUBROUTINE lr_readin
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
IF (davidson) THEN
|
||||
!
|
||||
! check and set num_init and num_basis_max
|
||||
!
|
||||
IF (num_init < num_eign ) THEN
|
||||
WRITE(stdout,'(5X,"num_init is too small, set to num_init = 2*num_eign")')
|
||||
num_init = 2 * num_eign
|
||||
ENDIF
|
||||
IF (num_basis_max < 2*num_init ) THEN
|
||||
WRITE(stdout,'(5X,"num_basis_max is too small, set to num_basis_max = 4*num_init")')
|
||||
num_basis_max = 4 * num_init
|
||||
ENDIF
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
#if defined(__MPI)
|
||||
ENDIF
|
||||
!
|
||||
|
|
Loading…
Reference in New Issue