Merge branch 'vloc_clean' into 'develop'

More Vloc*psi cleanup

See merge request QEF/q-e!2394
giannozz 2024-07-17 11:49:59 +00:00
commit f22bd7b35d
22 changed files with 508 additions and 821 deletions
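Note: the common thread of this merge is the removal of the hand-maintained device mirror vrs_d (module scf_gpum, deleted below) and of its using_vrs/using_vrs_d synchronization calls, replaced by a single host array vrs registered with the OpenACC runtime. A minimal, self-contained sketch of the resulting lifecycle, using a hypothetical array a rather than the actual vrs:

PROGRAM acc_lifecycle_sketch
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
   REAL(DP), ALLOCATABLE :: a(:)
   ALLOCATE( a(1000) )           ! host allocation (cf. allocate_fft)
   !$acc enter data create(a)
   a = 1.0_DP                    ! arithmetic stays on the host
   !$acc update device(a)
   ! ... the kernels region below updates the device copy in place
   ! ... (cf. the sign flips in solve_linter); without OpenACC the
   ! ... directives are plain comments and the statement runs on the host
   !$acc kernels
   a(:) = -a(:)
   !$acc end kernels
   ! ... release the device copy first, then the host array (cf. clean_pw)
   !$acc exit data delete(a)
   DEALLOCATE( a )
END PROGRAM acc_lifecycle_sketch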

View File

@@ -45,9 +45,6 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
USE wvfct, ONLY : nbnd, npwx
USE scf, ONLY : rho, vrs
#if defined(__CUDA)
USE scf_gpum, ONLY : vrs_d
#endif
USE uspp, ONLY : okvan, vkb, deeq_nc
USE uspp_param, ONLY : nhm
USE noncollin_module, ONLY : noncolin, domag, npol, nspin_mag
@@ -331,10 +328,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
dvscfins(:, 2:4, :) = -dvscfins(:, 2:4, :)
IF (okvan) int3_nc(:,:,:,:,:) = int3_save(:,:,:,:,:,2)
ENDIF
!$acc kernels
vrs(:, 2:4) = -vrs(:, 2:4)
#if defined(__CUDA)
vrs_d = vrs
#endif
!$acc end kernels
IF (okvan) THEN
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
!$acc update device(deeq_nc)
@@ -362,10 +358,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
dvscfins(:, 2:4, :) = -dvscfins(:, 2:4, :)
IF (okvan) int3_nc(:,:,:,:,:) = int3_save(:,:,:,:,:,1)
ENDIF
!$acc kernels
vrs(:, 2:4) = -vrs(:, 2:4)
#if defined(__CUDA)
vrs_d = vrs
#endif
!$acc end kernels
IF (okvan) THEN
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1)
!$acc update device(deeq_nc)
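Note: each of the two sign flips of the magnetic components of vrs above changes from a host update mirrored explicitly to the device,

vrs(:, 2:4) = -vrs(:, 2:4)
#if defined(__CUDA)
vrs_d = vrs
#endif

to a single statement inside an OpenACC region,

!$acc kernels
vrs(:, 2:4) = -vrs(:, 2:4)
!$acc end kernels

With OpenACC enabled the assignment negates the device copy of vrs in place; in a CPU build the directives are inert comments and the same statement runs on the host, so one code path serves both builds.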

View File

@@ -245,7 +245,6 @@ set(src_pw
src/wannier_occ.f90
src/d3hess_mod.f90
# GPU
src/scf_mod_gpu.f90
src/g_psi_mod_gpu.f90
src/newd_gpu.f90
src/add_paw_to_deeq_gpu.f90

View File

@@ -278,7 +278,6 @@ PWLIBS += \
h_psi_gpu.o \
vhpsi_gpu.o \
vloc_psi_gpu.o \
scf_mod_gpu.o \
usnldiag_gpu.o \
add_vuspsi_gpu.o \
newd_gpu.o \

View File

@@ -27,8 +27,6 @@ SUBROUTINE allocate_fft
USE wavefunctions, ONLY : psic, psic_nc
USE xc_lib, ONLY : xclib_dft_is
!
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
! ... First a bunch of checks
@@ -68,9 +66,7 @@ SUBROUTINE allocate_fft
ALLOCATE( rhog_core(ngm) )
ALLOCATE( psic(dfftp%nnr) )
ALLOCATE( vrs(dfftp%nnr,nspin) )
#if defined(__CUDA)
CALL using_vrs(2)
#endif
!$acc enter data create (vrs)
!
IF (noncolin) ALLOCATE( psic_nc(dfftp%nnr,npol) )
!

View File

@@ -64,8 +64,6 @@ SUBROUTINE clean_pw( lflag )
USE libmbd_interface, ONLY : clean_mbd
USE dftd3_qe, ONLY : dftd3_clean
!
USE scf_gpum, ONLY : deallocate_scf_gpu
!
USE control_flags, ONLY : sic, scissor
USE sic_mod, ONLY : deallocate_sic
USE sci_mod, ONLY : deallocate_scissor
@@ -142,8 +140,8 @@ SUBROUTINE clean_pw( lflag )
IF ( ALLOCATED( rhog_core ) ) DEALLOCATE( rhog_core )
IF ( ALLOCATED( psic ) ) DEALLOCATE( psic )
IF ( ALLOCATED( psic_nc ) ) DEALLOCATE( psic_nc )
!$acc exit data delete(vrs)
IF ( ALLOCATED( vrs ) ) DEALLOCATE( vrs )
CALL deallocate_scf_gpu()
!
! ... arrays allocated in allocate_locpot.f90 ( and never deallocated )
!

View File

@@ -47,8 +47,6 @@ SUBROUTINE electrons()
USE loc_scdm, ONLY : use_scdm, localize_orbitals
USE loc_scdm_k, ONLY : localize_orbitals_k
!
USE scf_gpum, ONLY : using_vrs
!
USE add_dmft_occ, ONLY : dmft
USE rism_module, ONLY : lrism, rism_calc3d
USE makovpayne, ONLY : makov_payne
@@ -145,7 +143,6 @@ SUBROUTINE electrons()
ehart, etxc, vtxc, eth, etotefield, charge, v)
IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2)
IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw)
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
nspin, doublegrid )
!
@@ -242,7 +239,6 @@ SUBROUTINE electrons()
IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2)
!
IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw)
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
nspin, doublegrid )
!
@@ -462,7 +458,6 @@ SUBROUTINE electrons_scf ( printout, exxen )
USE libmbd_interface, ONLY : EmbdvdW
USE add_dmft_occ, ONLY : dmft, dmft_update, v_dmft, dmft_updated
!
USE scf_gpum, ONLY : using_vrs
USE device_fbuff_m, ONLY : dev_buf, pin_buf
USE pwcom, ONLY : report_mag
USE makovpayne, ONLY : makov_payne
@@ -987,13 +982,8 @@ SUBROUTINE electrons_scf ( printout, exxen )
!
! ... define the total local potential (external + scf)
!
CALL using_vrs(1)
CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs )
!
! ... interpolate the total local potential
!
CALL using_vrs(1) ! redundant
CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs )
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
nspin, doublegrid )
!
! ... in the US case we have to recompute the self-consistent
! ... term in the nonlocal potential

View File

@@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_apply( lda, n, nbande, psi, hpsi, pdir, e_field )
USE noncollin_module, ONLY : noncolin, npol, lspinorb
USE wvfct, ONLY : npwx, nbnd, ik => current_k
USE lsda_mod, ONLY : current_spin, nspin
USE scf, ONLY : vrs
USE gvect
USE uspp, ONLY : okvan, nkb, vkb, qq_so, qq_at
USE uspp_param, ONLY : nh, nhm, nbetam

View File

@@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_set( pdir, e_field )
USE noncollin_module, ONLY: noncolin, npol, lspinorb
USE wvfct, ONLY: npwx, nbnd
USE lsda_mod, ONLY: current_spin, nspin
USE scf, ONLY: vrs
USE gvect
USE fft_base, ONLY: dfftp
USE uspp, ONLY: okvan, nkb, vkb

View File

@@ -106,7 +106,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
USE sci_mod, ONLY: p_psi
USE fft_helper_subroutines
!
USE scf_gpum, ONLY: using_vrs
#if defined(__OSCDFT)
USE plugin_flags, ONLY : use_oscdft
USE oscdft_base, ONLY : oscdft_ctx
@@ -133,10 +132,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
!
!
CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
CALL using_vrs(0) ! vloc_psi_gamma (intent:in)
!
! ... Here we set the kinetic energy (k+G)^2 psi and clean up garbage
!
@@ -162,9 +157,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
IF ( dffts%has_task_groups ) &
CALL errore( 'h_psi', 'task_groups not implemented with real_space', 1 )
DO ibnd = 1, m, 2
! ... transform psi to real space -> psic
CALL invfft_orbital_gamma( psi, ibnd, m )
@@ -180,6 +172,9 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
CALL fwfft_orbital_gamma( hpsi, ibnd, m, add_to_orbital=.TRUE. )
ENDDO
!
ELSE IF ( dffts%has_task_groups ) THEN
! ... usual reciprocal-space algorithm, with task groups
CALL vloc_psi_tg_gamma( lda, n, m, psi, vrs(1,current_spin), hpsi )
ELSE
! ... usual reciprocal-space algorithm
CALL vloc_psi_gamma( lda, n, m, psi, vrs(1,current_spin), hpsi )
@@ -188,7 +183,11 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
!
ELSEIF ( noncolin ) THEN
!
CALL vloc_psi_nc( lda, n, m, psi, vrs, hpsi )
IF ( dffts%has_task_groups ) THEN
CALL vloc_psi_tg_nc( lda, n, m, psi, vrs, hpsi )
ELSE
CALL vloc_psi_nc( lda, n, m, psi, vrs, hpsi )
END IF
!
ELSE
!
@@ -197,9 +196,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
IF ( dffts%has_task_groups ) &
CALL errore( 'h_psi', 'task_groups not implemented with real_space', 1 )
!
DO ibnd = 1, m
! ... transform psi to real space -> psic
CALL invfft_orbital_k( psi, ibnd, m )
@@ -216,9 +212,12 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
!
ENDDO
!
ELSE IF ( dffts%has_task_groups ) THEN
! ... usual reciprocal-space algorithm, with task groups
CALL vloc_psi_tg_k( lda, n, m, psi, vrs(1,current_spin), hpsi )
ELSE
!
CALL vloc_psi_k( lda, n, m, psi, vrs(1,current_spin), hpsi )
! ... usual reciprocal-space algorithm
CALL vloc_psi_k( lda, n, m, psi, vrs(1,current_spin), hpsi )
!
ENDIF
!
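Note: the task-group branching that used to live inside the vloc_psi_* routines now sits in the caller, so each worker routine handles exactly one data layout. The resulting dispatch in h_psi_ is, in outline (gamma case shown; real_space_path stands for the actual real-space test, which this sketch does not reproduce):

IF ( real_space_path ) THEN
   ! ... band-by-band real-space multiplication
   ! ... (invfft_orbital_gamma / fwfft_orbital_gamma, cf. the hunk above)
ELSE IF ( dffts%has_task_groups ) THEN
   ! ... task-group distributed FFTs
   CALL vloc_psi_tg_gamma( lda, n, m, psi, vrs(1,current_spin), hpsi )
ELSE
   ! ... one or two bands per FFT
   CALL vloc_psi_gamma( lda, n, m, psi, vrs(1,current_spin), hpsi )
ENDIF

The k-point and noncollinear branches mirror this, and each vloc_psi variant now guards its own precondition with a call to errore.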

View File

@@ -97,7 +97,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
USE bp, ONLY: lelfield, l3dstring, gdir, efield, efield_cry
USE becmod, ONLY: bec_type, becp
USE lsda_mod, ONLY: current_spin
USE scf_gpum, ONLY: vrs_d, using_vrs_d
USE uspp, ONLY: nkb, vkb
USE ldaU, ONLY: lda_plus_u, lda_plus_u_kind, Hubbard_projectors
USE gvect, ONLY: gstart
@@ -113,6 +112,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
USE fft_helper_subroutines
USE device_memcpy_m, ONLY: dev_memcpy, dev_memset
!
USE scf, ONLY: vrs
USE wvfct, ONLY: g2kin
#if defined(__OSCDFT)
USE plugin_flags, ONLY : use_oscdft
@@ -140,7 +140,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
LOGICAL :: need_host_copy
!
CALL start_clock_gpu( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
CALL using_vrs_d(0)
!
! ... Here we add the kinetic energy (k+G)^2 psi and clean up garbage
!
@@ -208,13 +207,13 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
!
ELSE
! ... usual reciprocal-space algorithm
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d )
!
ENDIF
!
ELSE IF ( noncolin ) THEN
!
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d )
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs, hpsi_d )
!
ELSE
!
@@ -246,7 +245,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
!
ELSE
!
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d )
!
ENDIF
!
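Note: on the GPU side the potential now arrives as the ordinary host symbol vrs instead of the device-resident vrs_d; the worker routines mark it present(v) in their OpenACC regions and the runtime resolves it to the device copy created in allocate_fft. A minimal sketch of a dummy argument used this way (hypothetical routine, not part of the patch; both v and x are assumed to be present on the device already):

SUBROUTINE scale_by_v( n, v, x )
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
   INTEGER, INTENT(IN) :: n
   REAL(DP), INTENT(IN) :: v(n)       ! plain host dummy, no device attribute
   COMPLEX(DP), INTENT(INOUT) :: x(n)
   INTEGER :: j
   ! present(v,x) looks both arrays up in the OpenACC present table
   ! instead of copying them
   !$acc parallel loop present(v, x)
   DO j = 1, n
      x(j) = x(j) * v(j)
   ENDDO
END SUBROUTINE scale_by_v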

View File

@@ -35,7 +35,6 @@ SUBROUTINE hinit1()
USE dfunct_gpum, ONLY : newd_gpu
USE exx_base, ONLY : coulomb_fac, coulomb_done
!
USE scf_gpum, ONLY : using_vrs
USE ener, ONLY : esol, vsol
USE rism_module, ONLY : lrism, rism_update_pos, rism_calc3d
!
@@ -113,7 +112,6 @@ SUBROUTINE hinit1()
!
! ... define the total local potential (external+scf)
!
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, &
doublegrid )
!

View File

@@ -47,7 +47,6 @@ SUBROUTINE orbm_kubo()
USE gvecs, ONLY : doublegrid
USE mp_pools, ONLY : intra_pool_comm
USE mp_world, ONLY : world_comm
USE scf_gpum, ONLY : using_vrs
USE uspp_init, ONLY : init_us_2
!
IMPLICIT NONE
@@ -112,7 +111,6 @@ SUBROUTINE orbm_kubo()
ALLOCATE( H_evc(npol*npwx,nbnd) )
ALLOCATE( temp(ngm) )
!
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
CALL allocate_bec_type( nkb, nbnd, becp )
! Initializations

View File

@@ -55,7 +55,6 @@ SUBROUTINE potinit()
USE paw_init, ONLY : PAW_atomic_becsum
USE paw_onecenter, ONLY : PAW_potential
!
USE scf_gpum, ONLY : using_vrs
USE pwcom, ONLY : report_mag
USE rism_module, ONLY : lrism, rism_init3d, rism_calc3d
!
@@ -278,9 +277,7 @@ SUBROUTINE potinit()
!
! ... define the total local potential (external+scf)
!
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
!
! ... write on output the parameters used in the DFT+U(+V) calculation
!
IF ( lda_plus_u ) THEN

View File

@@ -2613,7 +2613,6 @@ MODULE realus
USE fft_base, ONLY : dffts
USE scf, ONLY : vrs
USE lsda_mod, ONLY : current_spin
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
@@ -2629,8 +2628,6 @@ MODULE realus
!
CALL start_clock( 'v_loc_psir' )
CALL using_vrs(0) ! tg_gather (intent: in)
IF( dffts%has_task_groups ) THEN
IF (ibnd == 1 ) THEN
CALL tg_gather( dffts, vrs(:,current_spin), tg_v )
@@ -2672,7 +2669,6 @@ MODULE realus
USE fft_base, ONLY : dffts
USE scf, ONLY : vrs
USE lsda_mod, ONLY : current_spin
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
@@ -2688,8 +2684,6 @@ MODULE realus
!
CALL start_clock( 'v_loc_psir' )
CALL using_vrs(0) ! tg_gather (intent: in)
IF( dffts%has_task_groups ) THEN
IF (ibnd == 1 ) THEN
CALL tg_gather( dffts, vrs(:,current_spin), tg_v )

View File

@@ -98,9 +98,6 @@ MODULE scf
REAL(DP), ALLOCATABLE :: vltot(:)
!! the local potential in real space
REAL(DP), ALLOCATABLE :: vrs(:,:)
#if defined(__CUDA)
attributes(pinned) :: vrs
#endif
!! the total pot. in real space (smooth grid)
REAL(DP), ALLOCATABLE :: rho_core(:)
!! the core charge in real space

View File

@@ -1,124 +0,0 @@
!
! Copyright (C) 2002-2011 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define DIMS1D(arr) lbound(arr,1):ubound(arr,1)
#define DIMS2D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2)
#define DIMS3D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3)
#define DIMS4D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4)
#define DIMS5D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4),lbound(arr,5):ubound(arr,5)
!=----------------------------------------------------------------------------=!
MODULE scf_gpum
!=----------------------------------------------------------------------------=!
#if defined(__CUDA)
USE cudafor
#endif
IMPLICIT NONE
SAVE
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
INTEGER, PARAMETER :: sgl = selected_real_kind(6,30)
INTEGER, PARAMETER :: i4b = selected_int_kind(9)
INTEGER, PARAMETER :: i8b = selected_int_kind(18)
#if defined(__DEBUG)
INTEGER :: iverbosity = 1
#else
INTEGER :: iverbosity = 0
#endif
!
REAL(DP), ALLOCATABLE :: vrs_d(:, :)
!
#if defined(__CUDA)
attributes (DEVICE) :: vrs_d
#endif
LOGICAL :: vrs_ood = .false. ! used to flag out of date variables
LOGICAL :: vrs_d_ood = .false. ! used to flag out of date variables
!
CONTAINS
!
SUBROUTINE using_vrs(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE scf, ONLY : vrs
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, "using_vrs ", debug_info, vrs_ood
!
IF (vrs_ood) THEN
IF ((.not. allocated(vrs_d)) .and. (intento_ < 2)) THEN
CALL errore('using_vrs_d', 'PANIC: sync of vrs from vrs_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(vrs)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of vrs with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
! IF (intento_ > 0) vrs_d_ood = .true.
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied vrs D->H"
vrs = vrs_d
END IF
vrs_ood = .false.
ENDIF
IF (intento_ > 0) vrs_d_ood = .true.
#endif
END SUBROUTINE using_vrs
!
SUBROUTINE using_vrs_d(intento, debug_info)
!
USE scf, ONLY : vrs
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, "using_vrs_d ", debug_info, vrs_d_ood
!
IF (.not. allocated(vrs)) THEN
IF (intento /= 2) print *, "WARNING: sync of vrs_d with unallocated array and intento /= 2?"
IF (allocated(vrs_d)) DEALLOCATE(vrs_d)
vrs_d_ood = .false.
RETURN
END IF
! here we know that vrs is allocated, check if size is 0
IF ( SIZE(vrs) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array vrs_d. If used, code will crash."
RETURN
END IF
!
IF (vrs_d_ood) THEN
IF ( allocated(vrs_d) .and. (SIZE(vrs_d)/=SIZE(vrs))) deallocate(vrs_d)
IF (.not. allocated(vrs_d)) ALLOCATE(vrs_d(DIMS2D(vrs))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied vrs H->D"
vrs_d = vrs
END IF
vrs_d_ood = .false.
ENDIF
IF (intento > 0) vrs_ood = .true.
#else
CALL errore('using_vrs_d', 'Trying to use device data without device compiled code!', 1)
#endif
END SUBROUTINE using_vrs_d
!
SUBROUTINE deallocate_scf_gpu
IF( ALLOCATED( vrs_d ) ) DEALLOCATE( vrs_d )
vrs_d_ood = .false.
END SUBROUTINE deallocate_scf_gpu
!=----------------------------------------------------------------------------=!
END MODULE scf_gpum
!=----------------------------------------------------------------------------=!
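Note: for context, this deleted module implemented a hand-rolled coherence protocol: every host access to vrs was to be preceded by CALL using_vrs(intento) and every device access by CALL using_vrs_d(intento), with intento encoding in/inout/out (0/1/2) so that a pair of out-of-date flags could decide whether a copy was needed. A toy re-enactment with hypothetical names a/a_d, following the intento convention documented above:

PROGRAM intento_sketch
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
   REAL(DP) :: a(4), a_d(4)                    ! host array and its mirror
   LOGICAL  :: a_ood = .FALSE., a_d_ood = .FALSE.
   CALL using_a(2)     ! intento=2 (out): no sync needed, mirror goes stale
   a = 2.0_DP
   CALL using_a_d(0)   ! intento=0 (in): mirror is stale, triggers the copy
   PRINT *, a_d
CONTAINS
   SUBROUTINE using_a( intento )
      INTEGER, INTENT(IN) :: intento
      IF ( a_ood .AND. intento < 2 ) a = a_d    ! "Really copied ... D->H"
      a_ood = .FALSE.
      IF ( intento > 0 ) a_d_ood = .TRUE.
   END SUBROUTINE using_a
   SUBROUTINE using_a_d( intento )
      INTEGER, INTENT(IN) :: intento
      IF ( a_d_ood .AND. intento < 2 ) a_d = a  ! "Really copied ... H->D"
      a_d_ood = .FALSE.
      IF ( intento > 0 ) a_ood = .TRUE.
   END SUBROUTINE using_a_d
END PROGRAM intento_sketch

The OpenACC scheme replacing it ties the device copy's lifetime to the host allocation and reduces the flag bookkeeping to explicit update directives.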

View File

@@ -1,5 +1,5 @@
!
! Copyright (C) 2001 PWSCF group
! Copyright (C) 2001-2024 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@@ -13,7 +13,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid )
!! the sum of all the local pseudopotential contributions.
!
USE kinds
USE fft_base, ONLY : dffts
USE xc_lib, ONLY : xclib_dft_is
USE fft_base, ONLY : dffts, dfftp
USE fft_interfaces, ONLY : fft_interpolate
!
IMPLICIT NONE
!
@@ -33,42 +35,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid )
!! the kinetic energy density in R-space
LOGICAL :: doublegrid
! input: true if a doublegrid is used
!
CALL sum_vrs( nrxx, nspin, vltot, vr, vrs )
!
CALL interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
!
RETURN
!
END SUBROUTINE set_vrs
!
!
!--------------------------------------------------------------------
SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs )
!--------------------------------------------------------------------
!! Accumulates local potential contributions into vrs (the total local potential).
!
USE kinds
!
IMPLICIT NONE
!
INTEGER :: nspin
!! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
INTEGER :: nrxx
!! input: the fft grid dimension
REAL(DP) :: vrs(nrxx,nspin)
!! output: total local potential on the smooth grid:
!! \(\text{vrs}=\text{vltot}+\text{vr}\)
REAL(DP) :: vltot(nrxx)
!! input: the total local pseudopotential
REAL(DP) :: vr(nrxx,nspin)
!! input: the scf(H+xc) part of the local potential
!
! ... local variable
!
INTEGER :: is
!
!
!$acc data present(vrs)
DO is = 1, nspin
!
! define the total local potential (external + scf) for each spin ...
@@ -84,39 +53,6 @@ SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs )
!
ENDDO
!
RETURN
!
END SUBROUTINE sum_vrs
!
!--------------------------------------------------------------------------
SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
!--------------------------------------------------------------------------
!! Interpolates local potential on the smooth mesh if necessary.
!
USE kinds
USE xc_lib, ONLY : xclib_dft_is
USE fft_base, ONLY : dffts, dfftp
USE fft_interfaces, ONLY : fft_interpolate
!
IMPLICIT NONE
!
INTEGER :: nspin
!! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
INTEGER :: nrxx
!! input: the fft grid dimension
REAL(DP) :: vrs(nrxx,nspin)
!! output: total local potential interpolated on the smooth grid
REAL(DP) :: kedtau(dffts%nnr,nspin)
!! position dependent kinetic energy enhancement factor
REAL(DP) :: kedtaur(nrxx,nspin)
!! the kinetic energy density in R-space
LOGICAL :: doublegrid
!! input: true if a doublegrid is used
!
! ... local variable
!
INTEGER :: is
!
! ... interpolate it on the smooth mesh if necessary
!
DO is = 1, nspin
@@ -124,6 +60,10 @@ SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
IF (xclib_dft_is('meta')) CALL fft_interpolate( dfftp, kedtaur(:,is), dffts, kedtau(:,is) )
ENDDO
!
!$acc update device(vrs)
!
!$acc end data
!
RETURN
!
END SUBROUTINE interpolate_vrs
END SUBROUTINE set_vrs
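Note: the merged set_vrs keeps all arithmetic on the host (the vltot+vr sum and, for meta-GGA, the fft_interpolate calls) and pushes the result to the device once at the end via a single update directive, instead of the per-call-site using_vrs bookkeeping. A simplified, hypothetical analogue of that shape (ignoring the noncollinear and meta-GGA special cases; the device copy of v is assumed to exist already):

SUBROUTINE set_v( v, vext, vscf, nrxx, nspin )
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
   INTEGER,  INTENT(IN)    :: nrxx, nspin
   REAL(DP), INTENT(INOUT) :: v(nrxx,nspin)
   REAL(DP), INTENT(IN)    :: vext(nrxx), vscf(nrxx,nspin)
   INTEGER :: is
   !$acc data present(v)
   DO is = 1, nspin
      v(:,is) = vext(:) + vscf(:,is)   ! host arithmetic only
   ENDDO
   ! one host-to-device transfer, after all host updates are done
   !$acc update device(v)
   !$acc end data
END SUBROUTINE set_v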

View File

@@ -484,8 +484,6 @@ SUBROUTINE sum_band()
! ... here we sum for each k point the contribution
! ... of the wavefunctions to the charge
!
incr = 1
!
IF(sic) THEN
ALLOCATE(rho_p(dffts%nnr))
ALLOCATE(psic_p(dffts%nnr*2)) ! why *2?

View File

@@ -1,5 +1,5 @@
!
! Copyright (C) 2003-2013 PWSCF group
! Copyright (C) 2003-2024 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@@ -7,9 +7,9 @@
!
!
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_gamma( lda, n, m, psi, v, hpsi )
SUBROUTINE vloc_psi_tg_gamma( lda, n, m, psi, v, hpsi )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
!! Calculation of Vloc*psi using dual-space technique and "task groups"
!
USE parallel_include
USE kinds, ONLY : DP
@@ -41,113 +41,72 @@ SUBROUTINE vloc_psi_gamma( lda, n, m, psi, v, hpsi )
COMPLEX(DP) :: fp, fm
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
! ... Variables for task groups
LOGICAL :: use_tg
INTEGER :: v_siz, idx, ebnd, brange
REAL(DP) :: fac
REAL(DP), ALLOCATABLE :: tg_v(:)
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
!
CALL start_clock( 'vloc_psi' )
incr = 2
!
use_tg = dffts%has_task_groups
IF (.not. dffts%has_task_groups ) CALL errore('vloc_psi','no task groups?',1)
!
IF( use_tg ) THEN
CALL start_clock( 'vloc_psi:tg_gather' )
v_siz = dffts%nnr_tg
ALLOCATE( tg_v(v_siz) )
ALLOCATE( tg_psic(v_siz) )
CALL tg_gather( dffts, v, tg_v )
incr = 2*fftx_ntgrp(dffts)
ALLOCATE( tg_vpsi(n,incr) )
CALL stop_clock( 'vloc_psi:tg_gather' )
ELSE
ALLOCATE( vpsi(n,incr) )
ENDIF
CALL start_clock( 'vloc_psi:tg_gather' )
incr = 2*fftx_ntgrp(dffts)
v_siz = dffts%nnr_tg
ALLOCATE( tg_v(v_siz) )
ALLOCATE( tg_psic(v_siz) )
CALL tg_gather( dffts, v, tg_v )
ALLOCATE( tg_vpsi(n,incr) )
CALL stop_clock( 'vloc_psi:tg_gather' )
!
IF ( use_tg ) THEN
DO ibnd = 1, m, incr
!
DO ibnd = 1, m, incr
!
! ... the local potential V_Loc psi. First bring psi to real space
!
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n )
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
! ... product with the potential v on the smooth grid
!
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j) = tg_psic(j) * tg_v(j)
ENDDO
!
! ... back to reciprocal space
! ... addition to the total product
!
brange = m-ibnd+1
!
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n )
!
DO idx = 1, 2*fftx_ntgrp(dffts), 2
IF ( idx+ibnd-1<m ) THEN
DO j = 1, n
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + 0.5d0 * tg_vpsi(j,idx)
hpsi(j,ibnd+idx) = hpsi(j,ibnd+idx) + 0.5d0 * tg_vpsi(j,idx+1)
ENDDO
ELSEIF ( idx+ibnd-1==m ) THEN
DO j = 1, n
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + tg_vpsi(j,idx)
ENDDO
ENDIF
ENDDO
!
! ... the local potential V_Loc psi. First bring psi to real space
!
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n )
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
! ... product with the potential v on the smooth grid
!
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j) = tg_psic(j) * tg_v(j)
ENDDO
!
ELSE
! ... back to reciprocal space
! ... addition to the total product
!
DO ibnd = 1, m, incr
!
ebnd = ibnd
IF ( ibnd < m ) ebnd = ibnd + 1
!
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
!
DO j = 1, dffts%nnr
psic(j) = psic(j) * v(j)
ENDDO
!
brange=1 ; fac=1.d0
IF ( ibnd<m ) THEN
brange=2 ; fac=0.5d0
brange = m-ibnd+1
!
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n )
!
DO idx = 1, 2*fftx_ntgrp(dffts), 2
IF ( idx+ibnd-1<m ) THEN
DO j = 1, n
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + 0.5d0 * tg_vpsi(j,idx)
hpsi(j,ibnd+idx) = hpsi(j,ibnd+idx) + 0.5d0 * tg_vpsi(j,idx+1)
ENDDO
ELSEIF ( idx+ibnd-1==m ) THEN
DO j = 1, n
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + tg_vpsi(j,idx)
ENDDO
ENDIF
!
CALL wave_r2g( psic(1:dffts%nnr), vpsi(:,1:brange), dffts )
!
DO j = 1, n
hpsi(j,ibnd) = hpsi(j,ibnd) + fac*vpsi(j,1)
IF ( ibnd<m ) hpsi(j,ibnd+1) = hpsi(j,ibnd+1) + fac*vpsi(j,2)
ENDDO
!
ENDDO
!
ENDIF
ENDDO
!
IF( use_tg ) THEN
DEALLOCATE( tg_psic )
DEALLOCATE( tg_v )
DEALLOCATE( tg_vpsi )
ELSE
DEALLOCATE( vpsi )
ENDIF
DEALLOCATE( tg_psic )
DEALLOCATE( tg_v )
DEALLOCATE( tg_vpsi )
!
CALL stop_clock( 'vloc_psi' )
!
RETURN
!
END SUBROUTINE vloc_psi_gamma
END SUBROUTINE vloc_psi_tg_gamma
!
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
SUBROUTINE vloc_psi_tg_k( lda, n, m, psi, v, hpsi )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - k-points:
!
@@ -190,107 +149,63 @@ SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
INTEGER, PARAMETER :: blocksize = 256
INTEGER :: numblock
! ... Task Groups
LOGICAL :: use_tg
REAL(DP), ALLOCATABLE :: tg_v(:)
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
INTEGER :: idx, brange, dffts_nnr
INTEGER :: v_siz, idx, brange
!
IF (.not. dffts%has_task_groups ) CALL errore('vloc_psi','no task groups?',2)
!
CALL start_clock( 'vloc_psi' )
use_tg = dffts%has_task_groups
!
IF( use_tg ) THEN
CALL start_clock( 'vloc_psi:tg_gather' )
dffts_nnr = dffts%nnr_tg
incr = fftx_ntgrp(dffts)
ALLOCATE( tg_v(dffts_nnr) )
ALLOCATE( tg_psic(dffts_nnr), tg_vpsi(lda,incr) )
CALL tg_gather( dffts, v, tg_v )
CALL stop_clock( 'vloc_psi:tg_gather' )
ELSE
dffts_nnr = dffts%nnr
ALLOCATE( vpsi(lda,1) )
ENDIF
CALL start_clock( 'vloc_psi:tg_gather' )
v_siz = dffts%nnr_tg
incr = fftx_ntgrp(dffts)
ALLOCATE( tg_v(v_siz) )
ALLOCATE( tg_psic(v_siz), tg_vpsi(lda,incr) )
CALL tg_gather( dffts, v, tg_v )
CALL stop_clock( 'vloc_psi:tg_gather' )
!
IF ( use_tg ) THEN
CALL tg_get_nnr( dffts, right_nnr )
!
! ... compute the number of chunks
numblock = (n+blocksize-1)/blocksize
!
DO ibnd = 1, m, fftx_ntgrp(dffts)
!
CALL tg_get_nnr( dffts, right_nnr )
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n, igk_k(:,current_k) )
!
! ... compute the number of chunks
numblock = (n+blocksize-1)/blocksize
! write (6,*) 'wfc R '
! write (6,99) (tg_psic(i), i=1,400)
!
DO ibnd = 1, m, fftx_ntgrp(dffts)
!
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n, igk_k(:,current_k) )
!
! write (6,*) 'wfc R '
! write (6,99) (tg_psic(i), i=1,400)
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
!$omp parallel do
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j) = tg_psic(j) * tg_v(j)
ENDDO
!$omp end parallel do
!
! write (6,*) 'v psi R '
! write (6,99) (tg_psic(i), i=1,400)
!
brange = m-ibnd+1
!
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
!
!$omp parallel do collapse(2)
DO idx = 0, MIN(fftx_ntgrp(dffts)-1, m-ibnd)
DO j = 1, numblock
DO iin = (j-1)*blocksize+1, MIN(j*blocksize,n)
hpsi(iin,ibnd+idx) = hpsi(iin,ibnd+idx) + tg_vpsi(iin,idx+1)
ENDDO
CALL tg_get_group_nr3( dffts, right_nr3 )
!
!$omp parallel do
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j) = tg_psic(j) * tg_v(j)
ENDDO
!$omp end parallel do
!
! write (6,*) 'v psi R '
! write (6,99) (tg_psic(i), i=1,400)
!
brange = m-ibnd+1
!
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
!
!$omp parallel do collapse(2)
DO idx = 0, MIN(fftx_ntgrp(dffts)-1, m-ibnd)
DO j = 1, numblock
DO iin = (j-1)*blocksize+1, MIN(j*blocksize,n)
hpsi(iin,ibnd+idx) = hpsi(iin,ibnd+idx) + tg_vpsi(iin,idx+1)
ENDDO
ENDDO
!$omp end parallel do
!
ENDDO
!$omp end parallel do
!
ELSE
!
DO ibnd = 1, m
!
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
!
! write (6,*) 'wfc R '
! write (6,99) (psic(i), i=1,400)
!
!$omp parallel do
DO j = 1, dffts_nnr
psic(j) = psic(j) * v(j)
ENDDO
!$omp end parallel do
!
! write (6,*) 'v psi R '
! write (6,99) (psic(i), i=1,400)
!
CALL wave_r2g( psic(1:dffts_nnr), vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
!
!$omp parallel do
DO i = 1, n
hpsi(i,ibnd) = hpsi(i,ibnd) + vpsi(i,1)
ENDDO
!$omp end parallel do
!
! write (6,*) 'v psi G ', ibnd
! write (6,99) (psic(i), i=1,400)
!
ENDDO
!
ENDIF
ENDDO
!
IF ( use_tg ) THEN
DEALLOCATE( tg_psic, tg_vpsi )
DEALLOCATE( tg_v )
ELSE
DEALLOCATE( vpsi )
ENDIF
DEALLOCATE( tg_psic, tg_vpsi )
DEALLOCATE( tg_v )
!
CALL stop_clock( 'vloc_psi' )
!
@@ -298,10 +213,10 @@ SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
!
RETURN
!
END SUBROUTINE vloc_psi_k
END SUBROUTINE vloc_psi_tg_k
!
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_nc( lda, n, m, psi, v, hpsi )
SUBROUTINE vloc_psi_tg_nc( lda, n, m, psi, v, hpsi )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - noncollinear case.
!
@@ -345,132 +260,333 @@ SUBROUTINE vloc_psi_nc( lda, n, m, psi, v, hpsi )
INTEGER :: v_siz, idx, ioff, brange
INTEGER :: right_nr3, right_inc
!
IF (.not. dffts%has_task_groups ) CALL errore('vloc_psi','no task groups?',3)
!
CALL start_clock( 'vloc_psi' )
!
CALL start_clock( 'vloc_psi:tg_gather' )
incr = fftx_ntgrp(dffts)
v_siz = dffts%nnr_tg
IF (domag) THEN
ALLOCATE( tg_v(v_siz,4) )
DO is = 1, nspin
CALL tg_gather( dffts, v(:,is), tg_v(:,is) )
ENDDO
ELSE
ALLOCATE( tg_v(v_siz,1) )
CALL tg_gather( dffts, v(:,1), tg_v(:,1) )
ENDIF
ALLOCATE( tg_psic(v_siz,npol), tg_vpsi(lda,incr) )
CALL stop_clock( 'vloc_psi:tg_gather' )
!
DO ibnd = 1, m, incr
!
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*(ipol-1)+n
CALL tgwave_g2r( psi(ii:ie,ibnd:m), tg_psic(:,ipol), dffts, n, &
igk_k(:,current_k) )
ENDDO
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
IF (domag) THEN
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
sup = tg_psic(j,1) * (tg_v(j,1)+tg_v(j,4)) + &
tg_psic(j,2) * (tg_v(j,2)-(0.d0,1.d0)*tg_v(j,3))
sdwn = tg_psic(j,2) * (tg_v(j,1)-tg_v(j,4)) + &
tg_psic(j,1) * (tg_v(j,2)+(0.d0,1.d0)*tg_v(j,3))
tg_psic(j,1) = sup
tg_psic(j,2) = sdwn
ENDDO
ELSE
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j,:) = tg_psic(j,:) * tg_v(j,1)
ENDDO
ENDIF
!
brange = m-ibnd+1
!
DO ipol = 1, npol
!
CALL tgwave_r2g( tg_psic(:,ipol), tg_vpsi(:,1:brange), dffts, n, &
igk_k(:,current_k) )
!
CALL tg_get_recip_inc( dffts, right_inc )
!
ioff = 0
!$omp parallel do
DO idx = 1, fftx_ntgrp(dffts)
IF ( idx+ibnd-1<=m ) THEN
DO j = 1, n
hpsi(j,ipol,ibnd+idx-1) = hpsi(j,ipol,ibnd+idx-1) + tg_vpsi(j,idx)
ENDDO
ENDIF
ioff = ioff + right_inc
ENDDO
!$omp end parallel do
!
ENDDO
!
ENDDO
!
DEALLOCATE( tg_v )
DEALLOCATE( tg_psic, tg_vpsi )
!
CALL stop_clock ('vloc_psi')
!
RETURN
!
END SUBROUTINE vloc_psi_tg_nc
!
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_gamma( lda, n, m, psi, v, hpsi )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
!
USE parallel_include
USE kinds, ONLY : DP
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dffts
USE fft_wave
USE wavefunctions, ONLY : psic
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda
!! leading dimension of arrays psi, hpsi
INTEGER, INTENT(IN) :: n
!! true dimension of psi, hpsi
INTEGER, INTENT(IN) :: m
!! number of states psi
COMPLEX(DP), INTENT(IN) :: psi(lda,m)
!! the wavefunction
COMPLEX(DP), INTENT(INOUT) :: hpsi(lda,m)
!! Hamiltonian dot psi
REAL(DP), INTENT(IN) :: v(dffts%nnr)
!! the total pot. in real space (smooth grid) for current spin
!
! ... local variables
!
INTEGER :: ibnd, j, incr, brange, ebnd
REAL(DP) :: fac
COMPLEX(DP) :: fp, fm
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
!
CALL start_clock( 'vloc_psi' )
incr = 2
!
IF ( dffts%has_task_groups ) CALL errore('vloc_psi','no task groups!',1)
ALLOCATE( vpsi(n,incr) )
!
DO ibnd = 1, m, incr
!
ebnd = ibnd
IF ( ibnd < m ) ebnd = ibnd + 1
!
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
!
DO j = 1, dffts%nnr
psic(j) = psic(j) * v(j)
ENDDO
!
brange=1 ; fac=1.d0
IF ( ibnd<m ) THEN
brange=2 ; fac=0.5d0
ENDIF
!
CALL wave_r2g( psic(1:dffts%nnr), vpsi(:,1:brange), dffts )
!
DO j = 1, n
hpsi(j,ibnd) = hpsi(j,ibnd) + fac*vpsi(j,1)
IF ( ibnd<m ) hpsi(j,ibnd+1) = hpsi(j,ibnd+1) + fac*vpsi(j,2)
ENDDO
!
ENDDO
!
DEALLOCATE( vpsi )
!
CALL stop_clock( 'vloc_psi' )
!
RETURN
!
END SUBROUTINE vloc_psi_gamma
!
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - k-points:
!
!! * fft to real space;
!! * product with the potential v on the smooth grid;
!! * back to reciprocal space;
!! * addition to the hpsi.
!
USE parallel_include
USE kinds, ONLY : DP
USE wvfct, ONLY : current_k
USE klist, ONLY : igk_k
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dffts
USE fft_wave
USE wavefunctions, ONLY : psic
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda
!! leading dimension of arrays psi, hpsi
INTEGER, INTENT(IN) :: n
!! true dimension of psi, hpsi
INTEGER, INTENT(IN) :: m
!! number of states psi
COMPLEX(DP), INTENT(IN) :: psi(lda,m)
!! the wavefunction
COMPLEX(DP), INTENT(INOUT) :: hpsi(lda,m)
!! Hamiltonian dot psi
REAL(DP), INTENT(IN) :: v(dffts%nnr)
!! the total pot. in real space (smooth grid) for current spin
!
! ... local variables
!
INTEGER :: ibnd, j, incr
INTEGER :: i, iin
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
! ... chunking parameters
INTEGER, PARAMETER :: blocksize = 256
INTEGER :: numblock
INTEGER :: idx, brange, v_siz
!
CALL start_clock( 'vloc_psi' )
!
IF (dffts%has_task_groups ) CALL errore('vloc_psi','no task groups!',2)
!
v_siz = dffts%nnr
ALLOCATE( vpsi(lda,1) )
!
DO ibnd = 1, m
!
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
!
! write (6,*) 'wfc R '
! write (6,99) (psic(i), i=1,400)
!
!$omp parallel do
DO j = 1, v_siz
psic(j) = psic(j) * v(j)
ENDDO
!$omp end parallel do
!
! write (6,*) 'v psi R '
! write (6,99) (psic(i), i=1,400)
!
CALL wave_r2g( psic(1:v_siz), vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
!
!$omp parallel do
DO i = 1, n
hpsi(i,ibnd) = hpsi(i,ibnd) + vpsi(i,1)
ENDDO
!$omp end parallel do
!
! write (6,*) 'v psi G ', ibnd
! write (6,99) (psic(i), i=1,400)
!
ENDDO
!
DEALLOCATE( vpsi )
!
CALL stop_clock( 'vloc_psi' )
!
99 format ( 20 ('(',2f12.9,')') )
!
RETURN
!
END SUBROUTINE vloc_psi_k
!
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_nc( lda, n, m, psi, v, hpsi )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - noncollinear case.
!
USE parallel_include
USE kinds, ONLY : DP
USE wvfct, ONLY : current_k
USE klist, ONLY : igk_k
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dffts, dfftp
USE fft_wave
USE lsda_mod, ONLY : nspin
USE noncollin_module, ONLY : npol, domag
USE wavefunctions, ONLY : psic_nc
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda
!! leading dimension of arrays psi, hpsi
INTEGER, INTENT(IN) :: n
!! true dimension of psi, hpsi
INTEGER, INTENT(IN) :: m
!! number of states psi
REAL(DP), INTENT(IN) :: v(dfftp%nnr,4) ! beware dimensions!
!! the total pot. in real space (smooth grid)
COMPLEX(DP), INTENT(IN) :: psi(lda*npol,m)
!! the wavefunction
COMPLEX(DP), INTENT(INOUT) :: hpsi(lda,npol,m)
!! Hamiltonian dot psi
!
! ... local variables
!
INTEGER :: ibnd, j,ipol, incr, is, ii, ie
COMPLEX(DP) :: sup, sdwn
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
INTEGER :: v_siz, idx, ioff, brange
!
CALL start_clock( 'vloc_psi' )
!
incr = 1
!
use_tg = dffts%has_task_groups
IF (dffts%has_task_groups ) CALL errore('vloc_psi','no task groups!',3)
!
IF( use_tg ) THEN
CALL start_clock( 'vloc_psi:tg_gather' )
v_siz = dffts%nnr_tg
ALLOCATE( vpsi(lda,1) )
!
DO ibnd = 1, m, incr
!
psic_nc = (0.d0,0.d0)
!
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*(ipol-1)+n
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
igk=igk_k(:,current_k) )
ENDDO
!
IF (domag) THEN
ALLOCATE( tg_v(v_siz,4) )
DO is = 1, nspin
CALL tg_gather( dffts, v(:,is), tg_v(:,is) )
DO j = 1, dffts%nnr
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
psic_nc(j,1) = sup
psic_nc(j,2) = sdwn
ENDDO
ELSE
ALLOCATE( tg_v(v_siz,1) )
CALL tg_gather( dffts, v(:,1), tg_v(:,1) )
DO j = 1, dffts%nnr
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
ENDDO
ENDIF
incr = fftx_ntgrp(dffts)
ALLOCATE( tg_psic(v_siz,npol), tg_vpsi(lda,incr) )
CALL stop_clock( 'vloc_psi:tg_gather' )
ELSE
ALLOCATE( vpsi(lda,1) )
ENDIF
!
IF( use_tg ) THEN
!
DO ibnd = 1, m, incr
!
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*(ipol-1)+n
CALL tgwave_g2r( psi(ii:ie,ibnd:m), tg_psic(:,ipol), dffts, n, &
igk_k(:,current_k) )
DO ipol = 1, npol
CALL wave_r2g( psic_nc(1:dffts%nnr,ipol), vpsi(1:n,:), dffts, &
igk=igk_k(:,current_k) )
!$omp parallel do
DO j = 1, n
hpsi(j,ipol,ibnd) = hpsi(j,ipol,ibnd) + vpsi(j,1)
ENDDO
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
IF (domag) THEN
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
sup = tg_psic(j,1) * (tg_v(j,1)+tg_v(j,4)) + &
tg_psic(j,2) * (tg_v(j,2)-(0.d0,1.d0)*tg_v(j,3))
sdwn = tg_psic(j,2) * (tg_v(j,1)-tg_v(j,4)) + &
tg_psic(j,1) * (tg_v(j,2)+(0.d0,1.d0)*tg_v(j,3))
tg_psic(j,1) = sup
tg_psic(j,2) = sdwn
ENDDO
ELSE
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j,:) = tg_psic(j,:) * tg_v(j,1)
ENDDO
ENDIF
!
brange = m-ibnd+1
!
DO ipol = 1, npol
!
CALL tgwave_r2g( tg_psic(:,ipol), tg_vpsi(:,1:brange), dffts, n, &
igk_k(:,current_k) )
!
CALL tg_get_recip_inc( dffts, right_inc )
!
ioff = 0
!$omp parallel do
DO idx = 1, fftx_ntgrp(dffts)
IF ( idx+ibnd-1<=m ) THEN
DO j = 1, n
hpsi(j,ipol,ibnd+idx-1) = hpsi(j,ipol,ibnd+idx-1) + tg_vpsi(j,idx)
ENDDO
ENDIF
ioff = ioff + right_inc
ENDDO
!$omp end parallel do
!
ENDDO
!
!$omp end parallel do
ENDDO
!
ELSE
!
DO ibnd = 1, m, incr
!
psic_nc = (0.d0,0.d0)
!
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*(ipol-1)+n
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
igk=igk_k(:,current_k) )
ENDDO
!
IF (domag) THEN
DO j = 1, dffts%nnr
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
psic_nc(j,1) = sup
psic_nc(j,2) = sdwn
ENDDO
ELSE
DO j = 1, dffts%nnr
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
ENDDO
ENDIF
!
DO ipol = 1, npol
CALL wave_r2g( psic_nc(1:dffts%nnr,ipol), vpsi(1:n,:), dffts, &
igk=igk_k(:,current_k) )
!$omp parallel do
DO j = 1, n
hpsi(j,ipol,ibnd) = hpsi(j,ipol,ibnd) + vpsi(j,1)
ENDDO
!$omp end parallel do
ENDDO
!
ENDDO
!
ENDIF
ENDDO
!
IF( use_tg ) THEN
DEALLOCATE( tg_v )
DEALLOCATE( tg_psic, tg_vpsi )
ELSE
DEALLOCATE( vpsi )
ENDIF
DEALLOCATE( vpsi )
!
CALL stop_clock ('vloc_psi')
!
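Note: the accumulation loop in vloc_psi_tg_k is blocked in chunks of blocksize = 256 G-vectors so that the two outer loops can be collapsed by OpenMP over a regular iteration space; the block count is plain ceiling division, with the MIN in the inner bound absorbing the remainder. A self-contained sketch of the index arithmetic:

PROGRAM chunk_sketch
   IMPLICIT NONE
   INTEGER, PARAMETER :: blocksize = 256
   INTEGER :: n, numblock, j, iin
   n = 1000
   numblock = (n + blocksize - 1) / blocksize   ! ceiling(n/blocksize) = 4
   DO j = 1, numblock
      DO iin = (j-1)*blocksize + 1, MIN( j*blocksize, n )
         ! ... one G-vector component per iteration; the last chunk
         ! ... runs 769..1000, the others are full 256-wide blocks
      ENDDO
   ENDDO
END PROGRAM chunk_sketch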

View File

@@ -10,7 +10,7 @@
#define tg_gather_gpu tg_gather
#endif
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v, hpsi_d )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
!
@@ -30,9 +30,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
INTEGER, INTENT(in) :: lda, n, m
COMPLEX(DP), INTENT(in) :: psi_d(lda,m)
COMPLEX(DP), INTENT(inout):: hpsi_d(lda,m)
REAL(DP), INTENT(in) :: v_d(dffts%nnr)
REAL(DP), INTENT(in) :: v(dffts%nnr)
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, hpsi_d, v_d
attributes(DEVICE) :: psi_d, hpsi_d
#endif
!
! ... local variables
@@ -42,12 +42,7 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
COMPLEX(DP), ALLOCATABLE :: psi(:,:)
COMPLEX(DP), ALLOCATABLE :: psic(:), vpsi(:,:)
! ... Variables for task groups
LOGICAL :: use_tg
REAL(DP), POINTER :: tg_v_d(:)
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
#if defined(__CUDA)
attributes(DEVICE) :: tg_v_d
INTEGER :: dffts_nnr, idx, ebnd, brange
INTEGER :: ierr, ioff
! ... Variables to handle batched FFT
@@ -64,63 +59,17 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
incr = 2*many_fft
!
use_tg = dffts%has_task_groups
IF ( dffts%has_task_groups ) CALL errore('Vloc_psi_gpu','no task groups!',1)
!
IF ( use_tg ) THEN
CALL start_clock_gpu( 'vloc_psi:tg_gather' )
dffts_nnr = dffts%nnr_tg
incr = 2*fftx_ntgrp(dffts)
CALL dev_buf%lock_buffer( tg_v_d, dffts_nnr, ierr )
ALLOCATE( tg_psic(dffts_nnr), tg_vpsi(dffts_nnr,incr) )
CALL tg_gather_gpu( dffts, v_d, tg_v_d )
CALL stop_clock_gpu( 'vloc_psi:tg_gather' )
ELSE
dffts_nnr = dffts%nnr
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
ENDIF
dffts_nnr = dffts%nnr
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
!
! ... The local potential V_Loc psi:
! - fft to real space;
! - product with the potential v on the smooth grid;
! - back to reciprocal space.
!
IF( use_tg ) THEN
!
!$acc data create(tg_psic,tg_vpsi)
DO ibnd = 1, m, incr
!
CALL tgwave_g2r( psi(1:n,ibnd:m), tg_psic, dffts, n )
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
!$acc parallel loop
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j) = tg_psic(j) * tg_v_d(j)
ENDDO
!
brange = m-ibnd+1
!
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n )
!
DO idx = 1, 2*fftx_ntgrp(dffts), 2
IF ( idx+ibnd-1<m ) THEN
!$acc parallel loop
DO j = 1, n
hpsi_d(j,ibnd+idx-1) = hpsi_d(j,ibnd+idx-1) + 0.5d0 * tg_vpsi(j,idx)
hpsi_d(j,ibnd+idx) = hpsi_d(j,ibnd+idx) + 0.5d0 * tg_vpsi(j,idx+1)
ENDDO
ELSEIF ( idx+ibnd-1==m ) THEN
!$acc parallel loop
DO j = 1, n
hpsi_d(j,ibnd+idx-1) = hpsi_d(j,ibnd+idx-1) + tg_vpsi(j,idx)
ENDDO
ENDIF
ENDDO
!
ENDDO
!$acc end data
!
ELSEIF (many_fft > 1) THEN
IF (many_fft > 1) THEN
!
!$acc data create(psic,vpsi)
DO ibnd = 1, m, incr
@@ -133,10 +82,10 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
CALL wave_g2r( psi(:,ibnd:ibnd+group_size-1), psic, dffts, howmany_set=hm_vec )
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2) present(v)
DO idx = 0, howmany-1
DO j = 1, dffts_nnr
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j)
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j)
ENDDO
ENDDO
!
@@ -173,9 +122,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
!
!$acc parallel loop
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
psic(j) = psic(j) * v_d(j)
psic(j) = psic(j) * v(j)
ENDDO
!
brange=1 ; fac=1.d0
@@ -199,12 +148,7 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!$acc end data
DEALLOCATE( psi )
!
IF( use_tg ) THEN
DEALLOCATE( tg_psic, tg_vpsi )
CALL dev_buf%release_buffer( tg_v_d, ierr )
ELSE
DEALLOCATE( psic, vpsi )
ENDIF
DEALLOCATE( psic, vpsi )
!
CALL stop_clock_gpu ('vloc_psi')
#endif
@@ -214,7 +158,7 @@ END SUBROUTINE vloc_psi_gamma_gpu
!
!@njs: vloc_psi_k
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v, hpsi_d )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - k-points. GPU double.
!
@@ -241,9 +185,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
INTEGER, INTENT(IN) :: lda, n, m
COMPLEX(DP), INTENT(IN) :: psi_d(lda,m)
COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,m)
REAL(DP), INTENT(IN) :: v_d(dffts%nnr)
REAL(DP), INTENT(IN) :: v(dffts%nnr)
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, hpsi_d, v_d
attributes(DEVICE) :: psi_d, hpsi_d
#endif
!
! ... local variables
@@ -253,18 +197,14 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
COMPLEX(DP), ALLOCATABLE :: psi(:,:)
COMPLEX(DP), ALLOCATABLE :: psic(:), vpsi(:,:)
! ... Task Groups
LOGICAL :: use_tg
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
REAL(DP), POINTER :: tg_v_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: tg_v_d
!
INTEGER :: dffts_nnr, idx, group_size, hm_vec(3)
INTEGER :: ierr, brange
!
CALL start_clock_gpu ('vloc_psi')
use_tg = dffts%has_task_groups
IF ( dffts%has_task_groups ) CALL errore('Vloc_psi_gpu','no task groups!',2)
!
ALLOCATE( psi(lda,m) )
!$acc data create( psi )
@@ -274,52 +214,10 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
incr = many_fft
!
IF( use_tg ) THEN
CALL start_clock_gpu ('vloc_psi:tg_gather')
dffts_nnr = dffts%nnr_tg
incr = fftx_ntgrp(dffts)
CALL dev_buf%lock_buffer( tg_v_d, dffts_nnr, ierr )
ALLOCATE( tg_psic(dffts_nnr), tg_vpsi(dffts_nnr,incr) )
CALL tg_gather_gpu( dffts, v_d, tg_v_d )
CALL stop_clock_gpu ('vloc_psi:tg_gather')
ELSE
dffts_nnr = dffts%nnr
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
ENDIF
dffts_nnr = dffts%nnr
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
!
IF( use_tg ) THEN
!
CALL tg_get_nnr( dffts, right_nnr )
!
!$acc data create(tg_psic,tg_vpsi)
DO ibnd = 1, m, fftx_ntgrp(dffts)
!
CALL tgwave_g2r( psi(1:n,ibnd:m), tg_psic, dffts, n, igk_k(:,current_k) )
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
!$acc parallel loop
DO j = 1, dffts%nr1x*dffts%nr2x* right_nr3
tg_psic(j) = tg_psic(j) * tg_v_d(j)
ENDDO
!
brange = m-ibnd+1
!
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
!
DO idx = 1, fftx_ntgrp(dffts)
IF ( idx+ibnd-1 <= m ) THEN
!$acc parallel loop
DO j = 1, n
hpsi_d(j,ibnd+idx-1) = hpsi_d(j,ibnd+idx-1) + tg_vpsi(j,idx-1)
ENDDO
ENDIF
ENDDO
!
ENDDO
!$acc end data
!
ELSEIF (many_fft > 1) THEN
IF (many_fft > 1) THEN
!
!$acc data create(psic,vpsi)
DO ibnd = 1, m, incr
@@ -331,10 +229,10 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
CALL wave_g2r( psi(:,ibnd:ebnd), psic, dffts, igk=igk_k(:,current_k), &
howmany_set=hm_vec )
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2) present(v)
DO idx = 0, group_size-1
DO j = 1, dffts_nnr
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j)
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j)
ENDDO
ENDDO
!
@@ -358,9 +256,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
!
!$acc parallel loop
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
psic(j) = psic(j) * v_d(j)
psic(j) = psic(j) * v(j)
ENDDO
!
CALL wave_r2g( psic, vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
@@ -378,12 +276,7 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!$acc end data
DEALLOCATE( psi )
!
IF( use_tg ) THEN
DEALLOCATE( tg_psic, tg_vpsi )
CALL dev_buf%release_buffer( tg_v_d, ierr )
ELSE
DEALLOCATE( psic, vpsi )
ENDIF
DEALLOCATE( psic, vpsi )
!
CALL stop_clock_gpu( 'vloc_psi' )
#endif
@@ -396,7 +289,7 @@ END SUBROUTINE vloc_psi_k_gpu
!
!@njs: vloc_psi_nc
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v, hpsi_d )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - non-collinear -
!! GPU version.
@@ -415,11 +308,11 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n, m
REAL(DP), INTENT(IN) :: v_d(dfftp%nnr,4) ! beware dimensions!
REAL(DP), INTENT(IN) :: v(dfftp%nnr,4) ! beware dimensions!
COMPLEX(DP), INTENT(IN) :: psi_d(lda*npol,m)
COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,npol,m)
#if defined(__CUDA)
attributes(DEVICE) :: v_d, psi_d, hpsi_d
attributes(DEVICE) :: psi_d, hpsi_d
#endif
!
! ... local variables
@@ -428,19 +321,14 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
COMPLEX(DP) :: sup, sdwn
!
COMPLEX(DP), ALLOCATABLE :: psi(:,:), psic_nc(:,:), vpsic_nc(:,:)
! ... Variables for task groups
LOGICAL :: use_tg
REAL(DP), ALLOCATABLE :: tg_v_d(:,:)
COMPLEX(DP), ALLOCATABLE :: tg_psic(:,:), tg_vpsi(:,:)
#if defined(__CUDA)
attributes(DEVICE) :: tg_v_d
INTEGER :: dffts_nnr, idx, ioff, ii, ie, brange
INTEGER :: right_nnr, right_nr3, right_inc
!
CALL start_clock_gpu ('vloc_psi')
!
incr = 1
use_tg = dffts%has_task_groups
IF ( dffts%has_task_groups ) CALL errore('Vloc_psi_gpu','no task groups!',3)
!
ALLOCATE( psi(lda*npol,m) )
!$acc data create( psi )
@@ -448,134 +336,54 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
psi = psi_d
!$acc end kernels
!
IF( use_tg ) THEN
CALL start_clock_gpu( 'vloc_psi:tg_gather' )
dffts_nnr = dffts%nnr_tg
incr = fftx_ntgrp(dffts)
IF (domag) THEN
ALLOCATE( tg_v_d(dffts_nnr,4) )
DO is = 1, nspin
CALL tg_gather_gpu( dffts, v_d(:,is), tg_v_d(:,is) )
ENDDO
ELSE
ALLOCATE( tg_v_d(dffts_nnr,1) )
CALL tg_gather_gpu( dffts, v_d(:,1), tg_v_d(:,1) )
ENDIF
ALLOCATE( tg_psic(dffts_nnr,npol), tg_vpsi(lda,incr) )
CALL stop_clock_gpu( 'vloc_psi:tg_gather' )
ELSE
dffts_nnr = dffts%nnr
ALLOCATE( psic_nc(dffts_nnr,npol), vpsic_nc(lda,1) )
ENDIF
dffts_nnr = dffts%nnr
ALLOCATE( psic_nc(dffts_nnr,npol), vpsic_nc(lda,1) )
!
! ... the local potential V_Loc psi. First the psi in real space
!
IF( use_tg ) THEN
!$acc data create( psic_nc, vpsic_nc )
DO ibnd = 1, m, incr
!
!$acc data create( tg_psic, tg_vpsi )
DO ibnd = 1, m, incr
!
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*ipol
CALL tgwave_g2r( psi(ii:ie,ibnd:m), tg_psic(:,ipol), dffts, n, &
igk_k(:,current_k) )
ENDDO
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
IF (domag) THEN
!$acc parallel loop
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
sup = tg_psic(j,1) * (tg_v_d(j,1)+tg_v_d(j,4)) + &
tg_psic(j,2) * (tg_v_d(j,2)-(0.d0,1.d0)*tg_v_d(j,3))
sdwn = tg_psic(j,2) * (tg_v_d(j,1)-tg_v_d(j,4)) + &
tg_psic(j,1) * (tg_v_d(j,2)+(0.d0,1.d0)*tg_v_d(j,3))
tg_psic(j,1) = sup
tg_psic(j,2) = sdwn
ENDDO
ELSE
!$acc parallel loop
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j,:) = tg_psic(j,:) * tg_v_d(j,1)
ENDDO
ENDIF
!
brange = m-ibnd+1
!
DO ipol = 1, npol
CALL tgwave_r2g( tg_psic(:,ipol), tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
!
CALL tg_get_recip_inc( dffts, right_inc )
!
ioff = 0
!
DO idx = 1, fftx_ntgrp(dffts)
IF ( idx+ibnd-1<=m ) THEN
!$acc parallel loop
DO j = 1, n
hpsi_d(j,ipol,ibnd+idx-1) = hpsi_d(j,ipol,ibnd+idx-1) + tg_vpsi(j,idx)
ENDDO
ENDIF
ioff = ioff + right_inc
ENDDO
ENDDO
!
!$acc kernels
psic_nc = (0.d0,0.d0)
!$acc end kernels
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*(ipol-1)+n
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
igk=igk_k(:,current_k) )
ENDDO
!$acc end data
!
ELSE
IF (domag) THEN
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
psic_nc(j,1) = sup
psic_nc(j,2) = sdwn
ENDDO
ELSE
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
ENDDO
ENDIF
!
!$acc data create( psic_nc, vpsic_nc )
DO ibnd = 1, m, incr
DO ipol = 1, npol
CALL wave_r2g( psic_nc(:,ipol), vpsic_nc(1:n,:), dffts, &
igk=igk_k(:,current_k) )
!
!$acc kernels
psic_nc = (0.d0,0.d0)
!$acc end kernels
DO ipol = 1, npol
ii = lda*(ipol-1)+1
ie = lda*(ipol-1)+n
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
igk=igk_k(:,current_k) )
!$acc parallel loop
DO j = 1, n
hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + vpsic_nc(j,1)
ENDDO
!
IF (domag) THEN
!$acc parallel loop
DO j = 1, dffts_nnr
sup = psic_nc(j,1) * (v_d(j,1)+v_d(j,4)) + &
psic_nc(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3))
sdwn = psic_nc(j,2) * (v_d(j,1)-v_d(j,4)) + &
psic_nc(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3))
psic_nc(j,1) = sup
psic_nc(j,2) = sdwn
ENDDO
ELSE
!$acc parallel loop
DO j = 1, dffts_nnr
psic_nc(j,:) = psic_nc(j,:) * v_d(j,1)
ENDDO
ENDIF
!
DO ipol = 1, npol
CALL wave_r2g( psic_nc(:,ipol), vpsic_nc(1:n,:), dffts, &
igk=igk_k(:,current_k) )
!
!$acc parallel loop
DO j = 1, n
hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + vpsic_nc(j,1)
ENDDO
ENDDO
!
ENDDO
!$acc end data
!
ENDIF
!
IF( use_tg ) THEN
DEALLOCATE( tg_v_d, tg_psic, tg_vpsi )
ELSE
DEALLOCATE( psic_nc, vpsic_nc )
ENDIF
ENDDO
!$acc end data
DEALLOCATE( psic_nc, vpsic_nc )
!
!$acc end data
DEALLOCATE( psi )

View File

@@ -39,7 +39,6 @@ SUBROUTINE lr_apply_liouvillian_magnons( evc1, evc1_new, L_dag )
USE io_global, ONLY : stdout
USE uspp_init, ONLY : init_us_2
USE scf_gpum, ONLY : vrs_d
IMPLICIT NONE
!
@@ -402,13 +401,12 @@ SUBROUTINE lr_apply_liouvillian_magnons( evc1, evc1_new, L_dag )
!
! Change the sign of b_xc
!
!$acc kernels
vrs(:,2) = - vrs(:,2)
vrs(:,3) = - vrs(:,3)
vrs(:,4) = - vrs(:,4)
!$acc end kernels
!
#if defined(__CUDA)
vrs_d = vrs
#endif
! Apply the operator ( H - \epsilon S + alpha_pv P_v) to evc1
! where alpha_pv = 0
!
@@ -418,13 +416,11 @@ SUBROUTINE lr_apply_liouvillian_magnons( evc1, evc1_new, L_dag )
!
! Change the sign of b_xc back
!
!$acc kernels
vrs(:,2) = - vrs(:,2)
vrs(:,3) = - vrs(:,3)
vrs(:,4) = - vrs(:,4)
!
#if defined(__CUDA)
vrs_d = vrs
#endif
!$acc end kernels
!
IF (ALLOCATED(psic_nc)) DEALLOCATE(psic_nc)
!

View File

@@ -55,7 +55,6 @@ SUBROUTINE lr_readin
USE constants, ONLY : eps4, rytoev
USE control_lr, ONLY : lrpa, alpha_mix, ethr_nscf
USE mp_world, ONLY : world_comm
USE scf_gpum, ONLY: vrs_d
#if defined (__ENVIRON)
USE plugin_flags, ONLY : use_environ
USE environ_base_module, ONLY : read_environ_input, init_environ_setup, &
@@ -504,9 +503,6 @@ SUBROUTINE lr_readin
! vrs = vltot + v%of_r
!
CALL set_vrs ( vrs, vltot, v%of_r, 0, 0, dfftp%nnr, nspin, doublegrid )
#if defined(__CUDA)
vrs_d = vrs
#endif
!
DEALLOCATE( vltot )
CALL destroy_scf_type(v)