More Vloc*psi cleanup

Variable "vrs" used in the Hamiltonian is now an ACC variable, replaces vrs_d.
vrs is copied to device in set_vrs. Obsolete using_vrs* machinery deleted.
Paolo Giannozzi 2024-07-17 10:04:16 +02:00
parent 7bdae85022
commit 457c7d55ef
17 changed files with 42 additions and 266 deletions
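In outline, the OpenACC data lifecycle that this commit adopts for vrs looks like the sketch below. This is a condensed, hypothetical rendering: the module and subroutine names are illustrative stand-ins; only the vrs array and the directives mirror the actual changes in allocate_fft, set_vrs, and clean_pw.

MODULE scf_sketch
   ! Hypothetical stand-in for the scf module: vrs lives on the host,
   ! OpenACC keeps a mirrored copy on the device.
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
   REAL(DP), ALLOCATABLE :: vrs(:,:)
END MODULE scf_sketch
!
SUBROUTINE allocate_vrs_sketch( nnr, nspin )
   ! As in allocate_fft: allocate on the host, create the device copy once.
   USE scf_sketch, ONLY : vrs
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: nnr, nspin
   ALLOCATE( vrs(nnr,nspin) )
   !$acc enter data create(vrs)
END SUBROUTINE allocate_vrs_sketch
!
SUBROUTINE set_vrs_sketch( nnr, nspin, vltot, vr )
   ! As in set_vrs: build the total local potential on the host, then push
   ! it to the device in one shot; this replaces the using_vrs bookkeeping.
   USE scf_sketch, ONLY : DP, vrs
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: nnr, nspin
   REAL(DP), INTENT(IN) :: vltot(nnr), vr(nnr,nspin)
   INTEGER :: is
   DO is = 1, nspin
      vrs(:,is) = vltot(:) + vr(:,is)
   END DO
   !$acc update device(vrs)
END SUBROUTINE set_vrs_sketch
!
SUBROUTINE deallocate_vrs_sketch( )
   ! As in clean_pw: release the device copy before the host array.
   USE scf_sketch, ONLY : vrs
   IMPLICIT NONE
   !$acc exit data delete(vrs)
   IF ( ALLOCATED(vrs) ) DEALLOCATE( vrs )
END SUBROUTINE deallocate_vrs_sketch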

View File

@ -245,7 +245,6 @@ set(src_pw
src/wannier_occ.f90
src/d3hess_mod.f90
# GPU
src/scf_mod_gpu.f90
src/g_psi_mod_gpu.f90
src/newd_gpu.f90
src/add_paw_to_deeq_gpu.f90

View File

@ -278,7 +278,6 @@ PWLIBS += \
h_psi_gpu.o \
vhpsi_gpu.o \
vloc_psi_gpu.o \
scf_mod_gpu.o \
usnldiag_gpu.o \
add_vuspsi_gpu.o \
newd_gpu.o \

View File

@ -27,8 +27,6 @@ SUBROUTINE allocate_fft
USE wavefunctions, ONLY : psic, psic_nc
USE xc_lib, ONLY : xclib_dft_is
!
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
! ... First a bunch of checks
@ -68,9 +66,7 @@ SUBROUTINE allocate_fft
ALLOCATE( rhog_core(ngm) )
ALLOCATE( psic(dfftp%nnr) )
ALLOCATE( vrs(dfftp%nnr,nspin) )
#if defined(__CUDA)
CALL using_vrs(2)
#endif
!$acc enter data create (vrs)
!
IF (noncolin) ALLOCATE( psic_nc(dfftp%nnr,npol) )
!

View File

@ -142,8 +142,8 @@ SUBROUTINE clean_pw( lflag )
IF ( ALLOCATED( rhog_core ) ) DEALLOCATE( rhog_core )
IF ( ALLOCATED( psic ) ) DEALLOCATE( psic )
IF ( ALLOCATED( psic_nc ) ) DEALLOCATE( psic_nc )
!$acc exit data delete(vrs)
IF ( ALLOCATED( vrs ) ) DEALLOCATE( vrs )
CALL deallocate_scf_gpu()
!
! ... arrays allocated in allocate_locpot.f90 ( and never deallocated )
!

View File

@ -47,8 +47,6 @@ SUBROUTINE electrons()
USE loc_scdm, ONLY : use_scdm, localize_orbitals
USE loc_scdm_k, ONLY : localize_orbitals_k
!
USE scf_gpum, ONLY : using_vrs
!
USE add_dmft_occ, ONLY : dmft
USE rism_module, ONLY : lrism, rism_calc3d
USE makovpayne, ONLY : makov_payne
@ -145,7 +143,6 @@ SUBROUTINE electrons()
ehart, etxc, vtxc, eth, etotefield, charge, v)
IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2)
IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw)
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
nspin, doublegrid )
!
@ -242,7 +239,6 @@ SUBROUTINE electrons()
IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2)
!
IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw)
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
nspin, doublegrid )
!
@ -462,7 +458,6 @@ SUBROUTINE electrons_scf ( printout, exxen )
USE libmbd_interface, ONLY : EmbdvdW
USE add_dmft_occ, ONLY : dmft, dmft_update, v_dmft, dmft_updated
!
USE scf_gpum, ONLY : using_vrs
USE device_fbuff_m, ONLY : dev_buf, pin_buf
USE pwcom, ONLY : report_mag
USE makovpayne, ONLY : makov_payne
@ -987,13 +982,8 @@ SUBROUTINE electrons_scf ( printout, exxen )
!
! ... define the total local potential (external + scf)
!
CALL using_vrs(1)
CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs )
!
! ... interpolate the total local potential
!
CALL using_vrs(1) ! redundant
CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs )
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
nspin, doublegrid )
!
! ... in the US case we have to recompute the self-consistent
! ... term in the nonlocal potential

View File

@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_apply( lda, n, nbande, psi, hpsi, pdir, e_field )
USE noncollin_module, ONLY : noncolin, npol, lspinorb
USE wvfct, ONLY : npwx, nbnd, ik => current_k
USE lsda_mod, ONLY : current_spin, nspin
USE scf, ONLY : vrs
USE gvect
USE uspp, ONLY : okvan, nkb, vkb, qq_so, qq_at
USE uspp_param, ONLY : nh, nhm, nbetam

View File

@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_set( pdir, e_field )
USE noncollin_module, ONLY: noncolin, npol, lspinorb
USE wvfct, ONLY: npwx, nbnd
USE lsda_mod, ONLY: current_spin, nspin
USE scf, ONLY: vrs
USE gvect
USE fft_base, ONLY: dfftp
USE uspp, ONLY: okvan, nkb, vkb

View File

@ -106,7 +106,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
USE sci_mod, ONLY: p_psi
USE fft_helper_subroutines
!
USE scf_gpum, ONLY: using_vrs
#if defined(__OSCDFT)
USE plugin_flags, ONLY : use_oscdft
USE oscdft_base, ONLY : oscdft_ctx
@ -133,10 +132,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
!
!
CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
CALL using_vrs(0) ! vloc_psi_gamma (intent:in)
!
! ... Here we set the kinetic energy (k+G)^2 psi and clean up garbage
!

View File

@ -97,7 +97,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
USE bp, ONLY: lelfield, l3dstring, gdir, efield, efield_cry
USE becmod, ONLY: bec_type, becp
USE lsda_mod, ONLY: current_spin
USE scf_gpum, ONLY: vrs_d, using_vrs_d
USE uspp, ONLY: nkb, vkb
USE ldaU, ONLY: lda_plus_u, lda_plus_u_kind, Hubbard_projectors
USE gvect, ONLY: gstart
@ -113,6 +112,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
USE fft_helper_subroutines
USE device_memcpy_m, ONLY: dev_memcpy, dev_memset
!
USE scf, ONLY: vrs
USE wvfct, ONLY: g2kin
#if defined(__OSCDFT)
USE plugin_flags, ONLY : use_oscdft
@ -140,7 +140,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
LOGICAL :: need_host_copy
!
CALL start_clock_gpu( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
CALL using_vrs_d(0)
!
! ... Here we add the kinetic energy (k+G)^2 psi and clean up garbage
!
@ -208,13 +207,13 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
!
ELSE
! ... usual reciprocal-space algorithm
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d )
!
ENDIF
!
ELSE IF ( noncolin ) THEN
!
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d )
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs, hpsi_d )
!
ELSE
!
@ -246,7 +245,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
!
ELSE
!
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d )
!
ENDIF
!
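On the consumer side, h_psi__gpu now passes the ordinary host array vrs to the vloc_psi_*_gpu kernels (their diffs appear further below), which assert its device presence with present(v) instead of declaring a CUDA attributes(DEVICE) dummy. A minimal sketch of that argument pattern; the routine and variable names here are hypothetical, only the directive usage reflects the change:

SUBROUTINE apply_vloc_sketch( nnr, v, psic )
   ! v is a plain host dummy whose device copy was created elsewhere
   ! (acc enter data / update device); present(v) asserts it is already mapped.
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
   INTEGER, INTENT(IN) :: nnr
   REAL(DP), INTENT(IN) :: v(nnr)
   COMPLEX(DP), INTENT(INOUT) :: psic(nnr)
   INTEGER :: j
   !$acc parallel loop present(v) copy(psic)
   DO j = 1, nnr
      psic(j) = psic(j) * v(j)   ! multiply the wavefunction by the local potential on the device
   ENDDO
END SUBROUTINE apply_vloc_sketch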

View File

@ -35,7 +35,6 @@ SUBROUTINE hinit1()
USE dfunct_gpum, ONLY : newd_gpu
USE exx_base, ONLY : coulomb_fac, coulomb_done
!
USE scf_gpum, ONLY : using_vrs
USE ener, ONLY : esol, vsol
USE rism_module, ONLY : lrism, rism_update_pos, rism_calc3d
!
@ -113,7 +112,6 @@ SUBROUTINE hinit1()
!
! ... define the total local potential (external+scf)
!
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, &
doublegrid )
!

View File

@ -47,7 +47,6 @@ SUBROUTINE orbm_kubo()
USE gvecs, ONLY : doublegrid
USE mp_pools, ONLY : intra_pool_comm
USE mp_world, ONLY : world_comm
USE scf_gpum, ONLY : using_vrs
USE uspp_init, ONLY : init_us_2
!
IMPLICIT NONE
@ -112,7 +111,6 @@ SUBROUTINE orbm_kubo()
ALLOCATE( H_evc(npol*npwx,nbnd) )
ALLOCATE( temp(ngm) )
!
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
CALL allocate_bec_type( nkb, nbnd, becp )
! Initializations

View File

@ -55,7 +55,6 @@ SUBROUTINE potinit()
USE paw_init, ONLY : PAW_atomic_becsum
USE paw_onecenter, ONLY : PAW_potential
!
USE scf_gpum, ONLY : using_vrs
USE pwcom, ONLY : report_mag
USE rism_module, ONLY : lrism, rism_init3d, rism_calc3d
!
@ -278,9 +277,7 @@ SUBROUTINE potinit()
!
! ... define the total local potential (external+scf)
!
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
!
! ... write on output the parameters used in the DFT+U(+V) calculation
!
IF ( lda_plus_u ) THEN

View File

@ -2613,7 +2613,6 @@ MODULE realus
USE fft_base, ONLY : dffts
USE scf, ONLY : vrs
USE lsda_mod, ONLY : current_spin
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
@ -2629,8 +2628,6 @@ MODULE realus
!
CALL start_clock( 'v_loc_psir' )
CALL using_vrs(0) ! tg_gather (intent: in)
IF( dffts%has_task_groups ) THEN
IF (ibnd == 1 ) THEN
CALL tg_gather( dffts, vrs(:,current_spin), tg_v )
@ -2672,7 +2669,6 @@ MODULE realus
USE fft_base, ONLY : dffts
USE scf, ONLY : vrs
USE lsda_mod, ONLY : current_spin
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
@ -2688,8 +2684,6 @@ MODULE realus
!
CALL start_clock( 'v_loc_psir' )
CALL using_vrs(0) ! tg_gather (intent: in)
IF( dffts%has_task_groups ) THEN
IF (ibnd == 1 ) THEN
CALL tg_gather( dffts, vrs(:,current_spin), tg_v )

View File

@ -98,9 +98,6 @@ MODULE scf
REAL(DP), ALLOCATABLE :: vltot(:)
!! the local potential in real space
REAL(DP), ALLOCATABLE :: vrs(:,:)
#if defined(__CUDA)
attributes(pinned) :: vrs
#endif
!! the total pot. in real space (smooth grid)
REAL(DP), ALLOCATABLE :: rho_core(:)
!! the core charge in real space

View File

@ -1,124 +0,0 @@
!
! Copyright (C) 2002-2011 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define DIMS1D(arr) lbound(arr,1):ubound(arr,1)
#define DIMS2D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2)
#define DIMS3D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3)
#define DIMS4D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4)
#define DIMS5D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4),lbound(arr,5):ubound(arr,5)
!=----------------------------------------------------------------------------=!
MODULE scf_gpum
!=----------------------------------------------------------------------------=!
#if defined(__CUDA)
USE cudafor
#endif
IMPLICIT NONE
SAVE
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
INTEGER, PARAMETER :: sgl = selected_real_kind(6,30)
INTEGER, PARAMETER :: i4b = selected_int_kind(9)
INTEGER, PARAMETER :: i8b = selected_int_kind(18)
#if defined(__DEBUG)
INTEGER :: iverbosity = 1
#else
INTEGER :: iverbosity = 0
#endif
!
REAL(DP), ALLOCATABLE :: vrs_d(:, :)
!
#if defined(__CUDA)
attributes (DEVICE) :: vrs_d
#endif
LOGICAL :: vrs_ood = .false. ! used to flag out of date variables
LOGICAL :: vrs_d_ood = .false. ! used to flag out of date variables
!
CONTAINS
!
SUBROUTINE using_vrs(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE scf, ONLY : vrs
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, "using_vrs ", debug_info, vrs_ood
!
IF (vrs_ood) THEN
IF ((.not. allocated(vrs_d)) .and. (intento_ < 2)) THEN
CALL errore('using_vrs_d', 'PANIC: sync of vrs from vrs_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(vrs)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of vrs with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
! IF (intento_ > 0) vrs_d_ood = .true.
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied vrs D->H"
vrs = vrs_d
END IF
vrs_ood = .false.
ENDIF
IF (intento_ > 0) vrs_d_ood = .true.
#endif
END SUBROUTINE using_vrs
!
SUBROUTINE using_vrs_d(intento, debug_info)
!
USE scf, ONLY : vrs
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, "using_vrs_d ", debug_info, vrs_d_ood
!
IF (.not. allocated(vrs)) THEN
IF (intento /= 2) print *, "WARNING: sync of vrs_d with unallocated array and intento /= 2?"
IF (allocated(vrs_d)) DEALLOCATE(vrs_d)
vrs_d_ood = .false.
RETURN
END IF
! here we know that vrs is allocated, check if size is 0
IF ( SIZE(vrs) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array vrs_d. If used, code will crash."
RETURN
END IF
!
IF (vrs_d_ood) THEN
IF ( allocated(vrs_d) .and. (SIZE(vrs_d)/=SIZE(vrs))) deallocate(vrs_d)
IF (.not. allocated(vrs_d)) ALLOCATE(vrs_d(DIMS2D(vrs))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied vrs H->D"
vrs_d = vrs
END IF
vrs_d_ood = .false.
ENDIF
IF (intento > 0) vrs_ood = .true.
#else
CALL errore('using_vrs_d', 'Trying to use device data without device compiled code!', 1)
#endif
END SUBROUTINE using_vrs_d
!
SUBROUTINE deallocate_scf_gpu
IF( ALLOCATED( vrs_d ) ) DEALLOCATE( vrs_d )
vrs_d_ood = .false.
END SUBROUTINE deallocate_scf_gpu
!=----------------------------------------------------------------------------=!
END MODULE scf_gpum
!=----------------------------------------------------------------------------=!

View File

@ -1,5 +1,5 @@
!
! Copyright (C) 2001 PWSCF group
! Copyright (C) 2001-2024 Quantum ESPRESSO Foundation
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@ -13,7 +13,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid )
!! the sum of all the local pseudopotential contributions.
!
USE kinds
USE fft_base, ONLY : dffts
USE xc_lib, ONLY : xclib_dft_is
USE fft_base, ONLY : dffts, dfftp
USE fft_interfaces, ONLY : fft_interpolate
!
IMPLICIT NONE
!
@ -33,42 +35,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid )
!! the kinetic energy density in R-space
LOGICAL :: doublegrid
! input: true if a doublegrid is used
!
CALL sum_vrs( nrxx, nspin, vltot, vr, vrs )
!
CALL interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
!
RETURN
!
END SUBROUTINE set_vrs
!
!
!--------------------------------------------------------------------
SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs )
!--------------------------------------------------------------------
!! Accumulates local potential contributions into vrs (the total local potential).
!
USE kinds
!
IMPLICIT NONE
!
INTEGER :: nspin
!! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
INTEGER :: nrxx
!! input: the fft grid dimension
REAL(DP) :: vrs(nrxx,nspin)
!! output: total local potential on the smooth grid:
!! \(\text{vrs}=\text{vltot}+\text{vr}\)
REAL(DP) :: vltot(nrxx)
!! input: the total local pseudopotential
REAL(DP) :: vr(nrxx,nspin)
!! input: the scf(H+xc) part of the local potential
!
! ... local variable
!
INTEGER :: is
!
!
!$acc data present(vrs)
DO is = 1, nspin
!
! define the total local potential (external + scf) for each spin ...
@ -84,39 +53,6 @@ SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs )
!
ENDDO
!
RETURN
!
END SUBROUTINE sum_vrs
!
!--------------------------------------------------------------------------
SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
!--------------------------------------------------------------------------
!! Interpolates local potential on the smooth mesh if necessary.
!
USE kinds
USE xc_lib, ONLY : xclib_dft_is
USE fft_base, ONLY : dffts, dfftp
USE fft_interfaces, ONLY : fft_interpolate
!
IMPLICIT NONE
!
INTEGER :: nspin
!! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
INTEGER :: nrxx
!! input: the fft grid dimension
REAL(DP) :: vrs(nrxx,nspin)
!! output: total local potential interpolated on the smooth grid
REAL(DP) :: kedtau(dffts%nnr,nspin)
!! position dependent kinetic energy enhancement factor
REAL(DP) :: kedtaur(nrxx,nspin)
!! the kinetic energy density in R-space
LOGICAL :: doublegrid
!! input: true if a doublegrid is used
!
! ... local variable
!
INTEGER :: is
!
! ... interpolate it on the smooth mesh if necessary
!
DO is = 1, nspin
@ -124,6 +60,10 @@ SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
IF (xclib_dft_is('meta')) CALL fft_interpolate( dfftp, kedtaur(:,is), dffts, kedtau(:,is) )
ENDDO
!
!$acc update device(vrs)
!
!$acc end data
!
RETURN
!
END SUBROUTINE interpolate_vrs
END SUBROUTINE set_vrs
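
At the call sites touched above, the net effect is that the old three-step sequence collapses into a single call; schematically, drawn from the hunks in this commit and using the same variables as those call sites:

! Old pattern, removed by this commit:
!    CALL using_vrs(1)
!    CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs )
!    CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs )
!
! New pattern: one call, which also refreshes the device copy of vrs via the
! "acc update device(vrs)" now executed inside set_vrs:
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )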

View File

@ -10,7 +10,7 @@
#define tg_gather_gpu tg_gather
#endif
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v, hpsi_d )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
!
@ -30,9 +30,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
INTEGER, INTENT(in) :: lda, n, m
COMPLEX(DP), INTENT(in) :: psi_d(lda,m)
COMPLEX(DP), INTENT(inout):: hpsi_d(lda,m)
REAL(DP), INTENT(in) :: v_d(dffts%nnr)
REAL(DP), INTENT(in) :: v(dffts%nnr)
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, hpsi_d, v_d
attributes(DEVICE) :: psi_d, hpsi_d
#endif
!
! ... local variables
@ -82,10 +82,10 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
CALL wave_g2r( psi(:,ibnd:ibnd+group_size-1), psic, dffts, howmany_set=hm_vec )
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2) present(v)
DO idx = 0, howmany-1
DO j = 1, dffts_nnr
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j)
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j)
ENDDO
ENDDO
!
@ -122,9 +122,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
!
!$acc parallel loop
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
psic(j) = psic(j) * v_d(j)
psic(j) = psic(j) * v(j)
ENDDO
!
brange=1 ; fac=1.d0
@ -158,7 +158,7 @@ END SUBROUTINE vloc_psi_gamma_gpu
!
!@njs: vloc_psi_k
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v, hpsi_d )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - k-points. GPU double.
!
@ -185,9 +185,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
INTEGER, INTENT(IN) :: lda, n, m
COMPLEX(DP), INTENT(IN) :: psi_d(lda,m)
COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,m)
REAL(DP), INTENT(IN) :: v_d(dffts%nnr)
REAL(DP), INTENT(IN) :: v(dffts%nnr)
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, hpsi_d, v_d
attributes(DEVICE) :: psi_d, hpsi_d
#endif
!
! ... local variables
@ -229,10 +229,10 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
CALL wave_g2r( psi(:,ibnd:ebnd), psic, dffts, igk=igk_k(:,current_k), &
howmany_set=hm_vec )
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2) present(v)
DO idx = 0, group_size-1
DO j = 1, dffts_nnr
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j)
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j)
ENDDO
ENDDO
!
@ -256,9 +256,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
!
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
!
!$acc parallel loop
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
psic(j) = psic(j) * v_d(j)
psic(j) = psic(j) * v(j)
ENDDO
!
CALL wave_r2g( psic, vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
@ -289,7 +289,7 @@ END SUBROUTINE vloc_psi_k_gpu
!
!@njs: vloc_psi_nc
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v, hpsi_d )
!-----------------------------------------------------------------------
!! Calculation of Vloc*psi using dual-space technique - non-collinear -
!! GPU version.
@ -308,11 +308,11 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n, m
REAL(DP), INTENT(IN) :: v_d(dfftp%nnr,4) ! beware dimensions!
REAL(DP), INTENT(IN) :: v(dfftp%nnr,4) ! beware dimensions!
COMPLEX(DP), INTENT(IN) :: psi_d(lda*npol,m)
COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,npol,m)
#if defined(__CUDA)
attributes(DEVICE) :: v_d, psi_d, hpsi_d
attributes(DEVICE) :: psi_d, hpsi_d
#endif
!
! ... local variables
@ -355,19 +355,19 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
ENDDO
!
IF (domag) THEN
!$acc parallel loop
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
sup = psic_nc(j,1) * (v_d(j,1)+v_d(j,4)) + &
psic_nc(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3))
sdwn = psic_nc(j,2) * (v_d(j,1)-v_d(j,4)) + &
psic_nc(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3))
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
psic_nc(j,1) = sup
psic_nc(j,2) = sdwn
ENDDO
ELSE
!$acc parallel loop
!$acc parallel loop present(v)
DO j = 1, dffts_nnr
psic_nc(j,:) = psic_nc(j,:) * v_d(j,1)
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
ENDDO
ENDIF
!