diff --git a/PW/CMakeLists.txt b/PW/CMakeLists.txt index 828d3afcc..7d2be85b8 100644 --- a/PW/CMakeLists.txt +++ b/PW/CMakeLists.txt @@ -245,7 +245,6 @@ set(src_pw src/wannier_occ.f90 src/d3hess_mod.f90 # GPU - src/scf_mod_gpu.f90 src/g_psi_mod_gpu.f90 src/newd_gpu.f90 src/add_paw_to_deeq_gpu.f90 diff --git a/PW/src/Makefile b/PW/src/Makefile index 20b94cc3b..4704862fb 100644 --- a/PW/src/Makefile +++ b/PW/src/Makefile @@ -278,7 +278,6 @@ PWLIBS += \ h_psi_gpu.o \ vhpsi_gpu.o \ vloc_psi_gpu.o \ - scf_mod_gpu.o \ usnldiag_gpu.o \ add_vuspsi_gpu.o \ newd_gpu.o \ diff --git a/PW/src/allocate_fft.f90 b/PW/src/allocate_fft.f90 index f15ddee01..4705cda85 100644 --- a/PW/src/allocate_fft.f90 +++ b/PW/src/allocate_fft.f90 @@ -27,8 +27,6 @@ SUBROUTINE allocate_fft USE wavefunctions, ONLY : psic, psic_nc USE xc_lib, ONLY : xclib_dft_is ! - USE scf_gpum, ONLY : using_vrs - ! IMPLICIT NONE ! ! ... First a bunch of checks @@ -68,9 +66,7 @@ SUBROUTINE allocate_fft ALLOCATE( rhog_core(ngm) ) ALLOCATE( psic(dfftp%nnr) ) ALLOCATE( vrs(dfftp%nnr,nspin) ) -#if defined(__CUDA) - CALL using_vrs(2) -#endif + !$acc enter data create (vrs) ! IF (noncolin) ALLOCATE( psic_nc(dfftp%nnr,npol) ) ! diff --git a/PW/src/clean_pw.f90 b/PW/src/clean_pw.f90 index d906c5389..5d7e19f66 100644 --- a/PW/src/clean_pw.f90 +++ b/PW/src/clean_pw.f90 @@ -142,8 +142,8 @@ SUBROUTINE clean_pw( lflag ) IF ( ALLOCATED( rhog_core ) ) DEALLOCATE( rhog_core ) IF ( ALLOCATED( psic ) ) DEALLOCATE( psic ) IF ( ALLOCATED( psic_nc ) ) DEALLOCATE( psic_nc ) + !$acc exit data delete(vrs) IF ( ALLOCATED( vrs ) ) DEALLOCATE( vrs ) - CALL deallocate_scf_gpu() ! ! ... arrays allocated in allocate_locpot.f90 ( and never deallocated ) ! diff --git a/PW/src/electrons.f90 b/PW/src/electrons.f90 index 7cf4c68a8..9ab34e7db 100644 --- a/PW/src/electrons.f90 +++ b/PW/src/electrons.f90 @@ -47,8 +47,6 @@ SUBROUTINE electrons() USE loc_scdm, ONLY : use_scdm, localize_orbitals USE loc_scdm_k, ONLY : localize_orbitals_k ! - USE scf_gpum, ONLY : using_vrs - ! USE add_dmft_occ, ONLY : dmft USE rism_module, ONLY : lrism, rism_calc3d USE makovpayne, ONLY : makov_payne @@ -145,7 +143,6 @@ SUBROUTINE electrons() ehart, etxc, vtxc, eth, etotefield, charge, v) IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2) IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw) - CALL using_vrs(1) CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, & nspin, doublegrid ) ! @@ -242,7 +239,6 @@ SUBROUTINE electrons() IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2) ! IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw) - CALL using_vrs(1) CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, & nspin, doublegrid ) ! @@ -462,7 +458,6 @@ SUBROUTINE electrons_scf ( printout, exxen ) USE libmbd_interface, ONLY : EmbdvdW USE add_dmft_occ, ONLY : dmft, dmft_update, v_dmft, dmft_updated ! - USE scf_gpum, ONLY : using_vrs USE device_fbuff_m, ONLY : dev_buf, pin_buf USE pwcom, ONLY : report_mag USE makovpayne, ONLY : makov_payne @@ -987,13 +982,8 @@ SUBROUTINE electrons_scf ( printout, exxen ) ! ! ... define the total local potential (external + scf) ! - CALL using_vrs(1) - CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs ) - ! - ! ... interpolate the total local potential - ! - CALL using_vrs(1) ! redundant - CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs ) + CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, & + nspin, doublegrid ) ! ! ... in the US case we have to recompute the self-consistent ! ... term in the nonlocal potential diff --git a/PW/src/h_epsi_her_apply.f90 b/PW/src/h_epsi_her_apply.f90 index d34a56936..86c1be94d 100644 --- a/PW/src/h_epsi_her_apply.f90 +++ b/PW/src/h_epsi_her_apply.f90 @@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_apply( lda, n, nbande, psi, hpsi, pdir, e_field ) USE noncollin_module, ONLY : noncolin, npol, lspinorb USE wvfct, ONLY : npwx, nbnd, ik => current_k USE lsda_mod, ONLY : current_spin, nspin - USE scf, ONLY : vrs USE gvect USE uspp, ONLY : okvan, nkb, vkb, qq_so, qq_at USE uspp_param, ONLY : nh, nhm, nbetam diff --git a/PW/src/h_epsi_her_set.f90 b/PW/src/h_epsi_her_set.f90 index 1c3820bce..d77ceb2bb 100644 --- a/PW/src/h_epsi_her_set.f90 +++ b/PW/src/h_epsi_her_set.f90 @@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_set( pdir, e_field ) USE noncollin_module, ONLY: noncolin, npol, lspinorb USE wvfct, ONLY: npwx, nbnd USE lsda_mod, ONLY: current_spin, nspin - USE scf, ONLY: vrs USE gvect USE fft_base, ONLY: dfftp USE uspp, ONLY: okvan, nkb, vkb diff --git a/PW/src/h_psi.f90 b/PW/src/h_psi.f90 index c08cca721..2c6d19f25 100644 --- a/PW/src/h_psi.f90 +++ b/PW/src/h_psi.f90 @@ -106,7 +106,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi ) USE sci_mod, ONLY: p_psi USE fft_helper_subroutines ! - USE scf_gpum, ONLY: using_vrs #if defined(__OSCDFT) USE plugin_flags, ONLY : use_oscdft USE oscdft_base, ONLY : oscdft_ctx @@ -133,10 +132,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi ) ! ! CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6) - - CALL using_vrs(0) ! vloc_psi_gamma (intent:in) - - ! ! ... Here we set the kinetic energy (k+G)^2 psi and clean up garbage ! diff --git a/PW/src/h_psi_gpu.f90 b/PW/src/h_psi_gpu.f90 index 1b0a34731..435f70581 100644 --- a/PW/src/h_psi_gpu.f90 +++ b/PW/src/h_psi_gpu.f90 @@ -97,7 +97,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d ) USE bp, ONLY: lelfield, l3dstring, gdir, efield, efield_cry USE becmod, ONLY: bec_type, becp USE lsda_mod, ONLY: current_spin - USE scf_gpum, ONLY: vrs_d, using_vrs_d USE uspp, ONLY: nkb, vkb USE ldaU, ONLY: lda_plus_u, lda_plus_u_kind, Hubbard_projectors USE gvect, ONLY: gstart @@ -113,6 +112,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d ) USE fft_helper_subroutines USE device_memcpy_m, ONLY: dev_memcpy, dev_memset ! + USE scf, ONLY: vrs USE wvfct, ONLY: g2kin #if defined(__OSCDFT) USE plugin_flags, ONLY : use_oscdft @@ -140,7 +140,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d ) LOGICAL :: need_host_copy ! CALL start_clock_gpu( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6) - CALL using_vrs_d(0) ! ! ... Here we add the kinetic energy (k+G)^2 psi and clean up garbage ! @@ -208,13 +207,13 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d ) ! ELSE ! ... usual reciprocal-space algorithm - CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d ) + CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d ) ! ENDIF ! ELSE IF ( noncolin ) THEN ! - CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d ) + CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs, hpsi_d ) ! ELSE ! @@ -246,7 +245,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d ) ! ELSE ! - CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d ) + CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d ) ! ENDIF ! diff --git a/PW/src/hinit1.f90 b/PW/src/hinit1.f90 index 233a4206f..3d3765dc6 100644 --- a/PW/src/hinit1.f90 +++ b/PW/src/hinit1.f90 @@ -35,7 +35,6 @@ SUBROUTINE hinit1() USE dfunct_gpum, ONLY : newd_gpu USE exx_base, ONLY : coulomb_fac, coulomb_done ! - USE scf_gpum, ONLY : using_vrs USE ener, ONLY : esol, vsol USE rism_module, ONLY : lrism, rism_update_pos, rism_calc3d ! @@ -113,7 +112,6 @@ SUBROUTINE hinit1() ! ! ... define the total local potential (external+scf) ! - CALL using_vrs(1) CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, & doublegrid ) ! diff --git a/PW/src/orbm_kubo.f90 b/PW/src/orbm_kubo.f90 index 7b4d86699..77f1c7007 100644 --- a/PW/src/orbm_kubo.f90 +++ b/PW/src/orbm_kubo.f90 @@ -47,7 +47,6 @@ SUBROUTINE orbm_kubo() USE gvecs, ONLY : doublegrid USE mp_pools, ONLY : intra_pool_comm USE mp_world, ONLY : world_comm - USE scf_gpum, ONLY : using_vrs USE uspp_init, ONLY : init_us_2 ! IMPLICIT NONE @@ -112,7 +111,6 @@ SUBROUTINE orbm_kubo() ALLOCATE( H_evc(npol*npwx,nbnd) ) ALLOCATE( temp(ngm) ) ! - CALL using_vrs(1) CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid ) CALL allocate_bec_type( nkb, nbnd, becp ) ! Initializations diff --git a/PW/src/potinit.f90 b/PW/src/potinit.f90 index 37bdff3c3..0fad36ad9 100644 --- a/PW/src/potinit.f90 +++ b/PW/src/potinit.f90 @@ -55,7 +55,6 @@ SUBROUTINE potinit() USE paw_init, ONLY : PAW_atomic_becsum USE paw_onecenter, ONLY : PAW_potential ! - USE scf_gpum, ONLY : using_vrs USE pwcom, ONLY : report_mag USE rism_module, ONLY : lrism, rism_init3d, rism_calc3d ! @@ -278,9 +277,7 @@ SUBROUTINE potinit() ! ! ... define the total local potential (external+scf) ! - CALL using_vrs(1) CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid ) - ! ! ... write on output the parameters used in the DFT+U(+V) calculation ! IF ( lda_plus_u ) THEN diff --git a/PW/src/realus.f90 b/PW/src/realus.f90 index b918864e5..b792f88b1 100644 --- a/PW/src/realus.f90 +++ b/PW/src/realus.f90 @@ -2613,7 +2613,6 @@ MODULE realus USE fft_base, ONLY : dffts USE scf, ONLY : vrs USE lsda_mod, ONLY : current_spin - USE scf_gpum, ONLY : using_vrs ! IMPLICIT NONE ! @@ -2629,8 +2628,6 @@ MODULE realus ! CALL start_clock( 'v_loc_psir' ) - CALL using_vrs(0) ! tg_gather (intent: in) - IF( dffts%has_task_groups ) THEN IF (ibnd == 1 ) THEN CALL tg_gather( dffts, vrs(:,current_spin), tg_v ) @@ -2672,7 +2669,6 @@ MODULE realus USE fft_base, ONLY : dffts USE scf, ONLY : vrs USE lsda_mod, ONLY : current_spin - USE scf_gpum, ONLY : using_vrs ! IMPLICIT NONE ! @@ -2688,8 +2684,6 @@ MODULE realus ! CALL start_clock( 'v_loc_psir' ) - CALL using_vrs(0) ! tg_gather (intent: in) - IF( dffts%has_task_groups ) THEN IF (ibnd == 1 ) THEN CALL tg_gather( dffts, vrs(:,current_spin), tg_v ) diff --git a/PW/src/scf_mod.f90 b/PW/src/scf_mod.f90 index a8b19fba0..dc7fc5744 100644 --- a/PW/src/scf_mod.f90 +++ b/PW/src/scf_mod.f90 @@ -98,9 +98,6 @@ MODULE scf REAL(DP), ALLOCATABLE :: vltot(:) !! the local potential in real space REAL(DP), ALLOCATABLE :: vrs(:,:) -#if defined(__CUDA) - attributes(pinned) :: vrs -#endif !! the total pot. in real space (smooth grid) REAL(DP), ALLOCATABLE :: rho_core(:) !! the core charge in real space diff --git a/PW/src/scf_mod_gpu.f90 b/PW/src/scf_mod_gpu.f90 deleted file mode 100644 index 271c5d0a4..000000000 --- a/PW/src/scf_mod_gpu.f90 +++ /dev/null @@ -1,124 +0,0 @@ -! -! Copyright (C) 2002-2011 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -#define DIMS1D(arr) lbound(arr,1):ubound(arr,1) -#define DIMS2D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2) -#define DIMS3D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3) -#define DIMS4D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4) -#define DIMS5D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4),lbound(arr,5):ubound(arr,5) -!=----------------------------------------------------------------------------=! - MODULE scf_gpum -!=----------------------------------------------------------------------------=! -#if defined(__CUDA) - USE cudafor -#endif - IMPLICIT NONE - SAVE - INTEGER, PARAMETER :: DP = selected_real_kind(14,200) - INTEGER, PARAMETER :: sgl = selected_real_kind(6,30) - INTEGER, PARAMETER :: i4b = selected_int_kind(9) - INTEGER, PARAMETER :: i8b = selected_int_kind(18) -#if defined(__DEBUG) - INTEGER :: iverbosity = 1 -#else - INTEGER :: iverbosity = 0 -#endif - ! - REAL(DP), ALLOCATABLE :: vrs_d(:, :) - ! -#if defined(__CUDA) - attributes (DEVICE) :: vrs_d -#endif - - LOGICAL :: vrs_ood = .false. ! used to flag out of date variables - LOGICAL :: vrs_d_ood = .false. ! used to flag out of date variables - ! - CONTAINS - ! - SUBROUTINE using_vrs(intento, debug_info) - ! - ! intento is used to specify what the variable will be used for : - ! 0 -> in , the variable needs to be synchronized but won't be changed - ! 1 -> inout , the variable needs to be synchronized AND will be changed - ! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten - ! - USE scf, ONLY : vrs - implicit none - INTEGER, INTENT(IN) :: intento - CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info -#if defined(__CUDA) || defined(__CUDA_GNU) - INTEGER :: intento_ - intento_ = intento - ! - IF (PRESENT(debug_info) ) print *, "using_vrs ", debug_info, vrs_ood - ! - IF (vrs_ood) THEN - IF ((.not. allocated(vrs_d)) .and. (intento_ < 2)) THEN - CALL errore('using_vrs_d', 'PANIC: sync of vrs from vrs_d with unallocated array. Bye!!', 1) - stop - END IF - IF (.not. allocated(vrs)) THEN - IF (intento_ /= 2) THEN - print *, "WARNING: sync of vrs with unallocated array and intento /= 2? Changed to 2!" - intento_ = 2 - END IF - ! IF (intento_ > 0) vrs_d_ood = .true. - END IF - IF (intento_ < 2) THEN - IF ( iverbosity > 0 ) print *, "Really copied vrs D->H" - vrs = vrs_d - END IF - vrs_ood = .false. - ENDIF - IF (intento_ > 0) vrs_d_ood = .true. -#endif - END SUBROUTINE using_vrs - ! - SUBROUTINE using_vrs_d(intento, debug_info) - ! - USE scf, ONLY : vrs - implicit none - INTEGER, INTENT(IN) :: intento - CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info -#if defined(__CUDA) || defined(__CUDA_GNU) - ! - IF (PRESENT(debug_info) ) print *, "using_vrs_d ", debug_info, vrs_d_ood - ! - IF (.not. allocated(vrs)) THEN - IF (intento /= 2) print *, "WARNING: sync of vrs_d with unallocated array and intento /= 2?" - IF (allocated(vrs_d)) DEALLOCATE(vrs_d) - vrs_d_ood = .false. - RETURN - END IF - ! here we know that vrs is allocated, check if size is 0 - IF ( SIZE(vrs) == 0 ) THEN - print *, "Refusing to allocate 0 dimensional array vrs_d. If used, code will crash." - RETURN - END IF - ! - IF (vrs_d_ood) THEN - IF ( allocated(vrs_d) .and. (SIZE(vrs_d)/=SIZE(vrs))) deallocate(vrs_d) - IF (.not. allocated(vrs_d)) ALLOCATE(vrs_d(DIMS2D(vrs))) ! MOLD does not work on all compilers - IF (intento < 2) THEN - IF ( iverbosity > 0 ) print *, "Really copied vrs H->D" - vrs_d = vrs - END IF - vrs_d_ood = .false. - ENDIF - IF (intento > 0) vrs_ood = .true. -#else - CALL errore('using_vrs_d', 'Trying to use device data without device compiled code!', 1) -#endif - END SUBROUTINE using_vrs_d - ! - SUBROUTINE deallocate_scf_gpu - IF( ALLOCATED( vrs_d ) ) DEALLOCATE( vrs_d ) - vrs_d_ood = .false. - END SUBROUTINE deallocate_scf_gpu -!=----------------------------------------------------------------------------=! - END MODULE scf_gpum -!=----------------------------------------------------------------------------=! diff --git a/PW/src/set_vrs.f90 b/PW/src/set_vrs.f90 index fc683a4bb..a5c2ec685 100644 --- a/PW/src/set_vrs.f90 +++ b/PW/src/set_vrs.f90 @@ -1,5 +1,5 @@ ! -! Copyright (C) 2001 PWSCF group +! Copyright (C) 2001-2024 Quantum ESPESSO Foundation ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' ! in the root directory of the present distribution, @@ -13,7 +13,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid ) !! the sum of all the local pseudopotential contributions. ! USE kinds - USE fft_base, ONLY : dffts + USE xc_lib, ONLY : xclib_dft_is + USE fft_base, ONLY : dffts, dfftp + USE fft_interfaces, ONLY : fft_interpolate ! IMPLICIT NONE ! @@ -33,42 +35,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid ) !! the kinetic energy density in R-space LOGICAL :: doublegrid ! input: true if a doublegrid is used - ! - CALL sum_vrs( nrxx, nspin, vltot, vr, vrs ) - ! - CALL interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs ) - ! - RETURN - ! -END SUBROUTINE set_vrs -! -! -!-------------------------------------------------------------------- -SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs ) - !-------------------------------------------------------------------- - !! Accumulates local potential contributions into vrs (the total local potential). - ! - USE kinds - ! - IMPLICIT NONE - ! - INTEGER :: nspin - !! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear - INTEGER :: nrxx - !! input: the fft grid dimension - REAL(DP) :: vrs(nrxx,nspin) - !! output: total local potential on the smooth grid: - !! \(\text{vrs\}=\text{vltot}+\text{vr}\) - REAL(DP) :: vltot(nrxx) - !! input: the total local pseudopotential - REAL(DP) :: vr(nrxx,nspin) - !! input: the scf(H+xc) part of the local potential - ! - ! ... local variable - ! INTEGER :: is ! - ! + !$acc data present(vrs) DO is = 1, nspin ! ! define the total local potential (external + scf) for each spin ... @@ -84,39 +53,6 @@ SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs ) ! ENDDO ! - RETURN - ! -END SUBROUTINE sum_vrs -! -!-------------------------------------------------------------------------- -SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs ) - !-------------------------------------------------------------------------- - !! Interpolates local potential on the smooth mesh if necessary. - ! - USE kinds - USE xc_lib, ONLY : xclib_dft_is - USE fft_base, ONLY : dffts, dfftp - USE fft_interfaces, ONLY : fft_interpolate - ! - IMPLICIT NONE - ! - INTEGER :: nspin - !! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear - INTEGER :: nrxx - !! input: the fft grid dimension - REAL(DP) :: vrs(nrxx,nspin) - !! output: total local potential interpolated on the smooth grid - REAL(DP) :: kedtau(dffts%nnr,nspin) - !! position dependent kinetic energy enhancement factor - REAL(DP) :: kedtaur(nrxx,nspin) - !! the kinetic energy density in R-space - LOGICAL :: doublegrid - !! input: true if a doublegrid is used - ! - ! ... local variable - ! - INTEGER :: is - ! ! ... interpolate it on the smooth mesh if necessary ! DO is = 1, nspin @@ -124,6 +60,10 @@ SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs ) IF (xclib_dft_is('meta')) CALL fft_interpolate( dfftp, kedtaur(:,is), dffts, kedtau(:,is) ) ENDDO ! + !$acc update device(vrs) + ! + !$acc end data + ! RETURN ! -END SUBROUTINE interpolate_vrs +END SUBROUTINE set_vrs diff --git a/PW/src/vloc_psi_gpu.f90 b/PW/src/vloc_psi_gpu.f90 index 2fbc46b41..a0d924236 100644 --- a/PW/src/vloc_psi_gpu.f90 +++ b/PW/src/vloc_psi_gpu.f90 @@ -10,7 +10,7 @@ #define tg_gather_gpu tg_gather #endif !----------------------------------------------------------------------- -SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d ) +SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v, hpsi_d ) !----------------------------------------------------------------------- !! Calculation of Vloc*psi using dual-space technique - Gamma point. ! @@ -30,9 +30,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d ) INTEGER, INTENT(in) :: lda, n, m COMPLEX(DP), INTENT(in) :: psi_d(lda,m) COMPLEX(DP), INTENT(inout):: hpsi_d(lda,m) - REAL(DP), INTENT(in) :: v_d(dffts%nnr) + REAL(DP), INTENT(in) :: v(dffts%nnr) #if defined(__CUDA) - attributes(DEVICE) :: psi_d, hpsi_d, v_d + attributes(DEVICE) :: psi_d, hpsi_d #endif ! ! ... local variables @@ -82,10 +82,10 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d ) ! CALL wave_g2r( psi(:,ibnd:ibnd+group_size-1), psic, dffts, howmany_set=hm_vec ) ! - !$acc parallel loop collapse(2) + !$acc parallel loop collapse(2) present(v) DO idx = 0, howmany-1 DO j = 1, dffts_nnr - psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j) + psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j) ENDDO ENDDO ! @@ -122,9 +122,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d ) ! CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts ) ! - !$acc parallel loop + !$acc parallel loop present(v) DO j = 1, dffts_nnr - psic(j) = psic(j) * v_d(j) + psic(j) = psic(j) * v(j) ENDDO ! brange=1 ; fac=1.d0 @@ -158,7 +158,7 @@ END SUBROUTINE vloc_psi_gamma_gpu ! !@njs: vloc_psi_k !----------------------------------------------------------------------- -SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d ) +SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v, hpsi_d ) !----------------------------------------------------------------------- !! Calculation of Vloc*psi using dual-space technique - k-points. GPU double. ! @@ -185,9 +185,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d ) INTEGER, INTENT(IN) :: lda, n, m COMPLEX(DP), INTENT(IN) :: psi_d(lda,m) COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,m) - REAL(DP), INTENT(IN) :: v_d(dffts%nnr) + REAL(DP), INTENT(IN) :: v(dffts%nnr) #if defined(__CUDA) - attributes(DEVICE) :: psi_d, hpsi_d, v_d + attributes(DEVICE) :: psi_d, hpsi_d #endif ! ! ... local variables @@ -229,10 +229,10 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d ) CALL wave_g2r( psi(:,ibnd:ebnd), psic, dffts, igk=igk_k(:,current_k), & howmany_set=hm_vec ) ! - !$acc parallel loop collapse(2) + !$acc parallel loop collapse(2) present(v) DO idx = 0, group_size-1 DO j = 1, dffts_nnr - psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j) + psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j) ENDDO ENDDO ! @@ -256,9 +256,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d ) ! CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) ) ! - !$acc parallel loop + !$acc parallel loop present(v) DO j = 1, dffts_nnr - psic(j) = psic(j) * v_d(j) + psic(j) = psic(j) * v(j) ENDDO ! CALL wave_r2g( psic, vpsi(1:n,:), dffts, igk=igk_k(:,current_k) ) @@ -289,7 +289,7 @@ END SUBROUTINE vloc_psi_k_gpu ! !@njs: vloc_psi_nc !----------------------------------------------------------------------- -SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d ) +SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v, hpsi_d ) !----------------------------------------------------------------------- !! Calculation of Vloc*psi using dual-space technique - non-collinear - !! GPU version. @@ -308,11 +308,11 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d ) IMPLICIT NONE ! INTEGER, INTENT(IN) :: lda, n, m - REAL(DP), INTENT(IN) :: v_d(dfftp%nnr,4) ! beware dimensions! + REAL(DP), INTENT(IN) :: v(dfftp%nnr,4) ! beware dimensions! COMPLEX(DP), INTENT(IN) :: psi_d(lda*npol,m) COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,npol,m) #if defined(__CUDA) - attributes(DEVICE) :: v_d, psi_d, hpsi_d + attributes(DEVICE) :: psi_d, hpsi_d #endif ! ! ... local variables @@ -355,19 +355,19 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d ) ENDDO ! IF (domag) THEN - !$acc parallel loop + !$acc parallel loop present(v) DO j = 1, dffts_nnr - sup = psic_nc(j,1) * (v_d(j,1)+v_d(j,4)) + & - psic_nc(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3)) - sdwn = psic_nc(j,2) * (v_d(j,1)-v_d(j,4)) + & - psic_nc(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3)) + sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + & + psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3)) + sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + & + psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3)) psic_nc(j,1) = sup psic_nc(j,2) = sdwn ENDDO ELSE - !$acc parallel loop + !$acc parallel loop present(v) DO j = 1, dffts_nnr - psic_nc(j,:) = psic_nc(j,:) * v_d(j,1) + psic_nc(j,:) = psic_nc(j,:) * v(j,1) ENDDO ENDIF !