mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'vloc_clean' into 'develop'
More Vloc*psi cleanup See merge request QEF/q-e!2394
This commit is contained in:
commit
f22bd7b35d
|
@ -45,9 +45,6 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
USE lsda_mod, ONLY : lsda, nspin, current_spin, isk
|
||||
USE wvfct, ONLY : nbnd, npwx
|
||||
USE scf, ONLY : rho, vrs
|
||||
#if defined(__CUDA)
|
||||
USE scf_gpum, ONLY : vrs_d
|
||||
#endif
|
||||
USE uspp, ONLY : okvan, vkb, deeq_nc
|
||||
USE uspp_param, ONLY : nhm
|
||||
USE noncollin_module, ONLY : noncolin, domag, npol, nspin_mag
|
||||
|
@ -331,10 +328,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
dvscfins(:, 2:4, :) = -dvscfins(:, 2:4, :)
|
||||
IF (okvan) int3_nc(:,:,:,:,:) = int3_save(:,:,:,:,:,2)
|
||||
ENDIF
|
||||
!$acc kernels
|
||||
vrs(:, 2:4) = -vrs(:, 2:4)
|
||||
#if defined(__CUDA)
|
||||
vrs_d = vrs
|
||||
#endif
|
||||
!$acc end kernels
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2)
|
||||
!$acc update device(deeq_nc)
|
||||
|
@ -362,10 +358,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
dvscfins(:, 2:4, :) = -dvscfins(:, 2:4, :)
|
||||
IF (okvan) int3_nc(:,:,:,:,:) = int3_save(:,:,:,:,:,1)
|
||||
ENDIF
|
||||
!$acc kernels
|
||||
vrs(:, 2:4) = -vrs(:, 2:4)
|
||||
#if defined(__CUDA)
|
||||
vrs_d = vrs
|
||||
#endif
|
||||
!$acc end kernels
|
||||
IF (okvan) THEN
|
||||
deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1)
|
||||
!$acc update device(deeq_nc)
|
||||
|
|
|
@ -245,7 +245,6 @@ set(src_pw
|
|||
src/wannier_occ.f90
|
||||
src/d3hess_mod.f90
|
||||
# GPU
|
||||
src/scf_mod_gpu.f90
|
||||
src/g_psi_mod_gpu.f90
|
||||
src/newd_gpu.f90
|
||||
src/add_paw_to_deeq_gpu.f90
|
||||
|
|
|
@ -278,7 +278,6 @@ PWLIBS += \
|
|||
h_psi_gpu.o \
|
||||
vhpsi_gpu.o \
|
||||
vloc_psi_gpu.o \
|
||||
scf_mod_gpu.o \
|
||||
usnldiag_gpu.o \
|
||||
add_vuspsi_gpu.o \
|
||||
newd_gpu.o \
|
||||
|
|
|
@ -27,8 +27,6 @@ SUBROUTINE allocate_fft
|
|||
USE wavefunctions, ONLY : psic, psic_nc
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
!
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... First a bunch of checks
|
||||
|
@ -68,9 +66,7 @@ SUBROUTINE allocate_fft
|
|||
ALLOCATE( rhog_core(ngm) )
|
||||
ALLOCATE( psic(dfftp%nnr) )
|
||||
ALLOCATE( vrs(dfftp%nnr,nspin) )
|
||||
#if defined(__CUDA)
|
||||
CALL using_vrs(2)
|
||||
#endif
|
||||
!$acc enter data create (vrs)
|
||||
!
|
||||
IF (noncolin) ALLOCATE( psic_nc(dfftp%nnr,npol) )
|
||||
!
|
||||
|
|
|
@ -64,8 +64,6 @@ SUBROUTINE clean_pw( lflag )
|
|||
USE libmbd_interface, ONLY : clean_mbd
|
||||
USE dftd3_qe, ONLY : dftd3_clean
|
||||
!
|
||||
USE scf_gpum, ONLY : deallocate_scf_gpu
|
||||
!
|
||||
USE control_flags, ONLY : sic, scissor
|
||||
USE sic_mod, ONLY : deallocate_sic
|
||||
USE sci_mod, ONLY : deallocate_scissor
|
||||
|
@ -142,8 +140,8 @@ SUBROUTINE clean_pw( lflag )
|
|||
IF ( ALLOCATED( rhog_core ) ) DEALLOCATE( rhog_core )
|
||||
IF ( ALLOCATED( psic ) ) DEALLOCATE( psic )
|
||||
IF ( ALLOCATED( psic_nc ) ) DEALLOCATE( psic_nc )
|
||||
!$acc exit data delete(vrs)
|
||||
IF ( ALLOCATED( vrs ) ) DEALLOCATE( vrs )
|
||||
CALL deallocate_scf_gpu()
|
||||
!
|
||||
! ... arrays allocated in allocate_locpot.f90 ( and never deallocated )
|
||||
!
|
||||
|
|
|
@ -47,8 +47,6 @@ SUBROUTINE electrons()
|
|||
USE loc_scdm, ONLY : use_scdm, localize_orbitals
|
||||
USE loc_scdm_k, ONLY : localize_orbitals_k
|
||||
!
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
!
|
||||
USE add_dmft_occ, ONLY : dmft
|
||||
USE rism_module, ONLY : lrism, rism_calc3d
|
||||
USE makovpayne, ONLY : makov_payne
|
||||
|
@ -145,7 +143,6 @@ SUBROUTINE electrons()
|
|||
ehart, etxc, vtxc, eth, etotefield, charge, v)
|
||||
IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2)
|
||||
IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw)
|
||||
CALL using_vrs(1)
|
||||
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
|
||||
nspin, doublegrid )
|
||||
!
|
||||
|
@ -242,7 +239,6 @@ SUBROUTINE electrons()
|
|||
IF (lrism) CALL rism_calc3d(rho%of_g(:, 1), esol, vsol, v%of_r, tr2)
|
||||
!
|
||||
IF (okpaw) CALL PAW_potential(rho%bec, ddd_PAW, epaw,etot_cmp_paw)
|
||||
CALL using_vrs(1)
|
||||
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
|
||||
nspin, doublegrid )
|
||||
!
|
||||
|
@ -462,7 +458,6 @@ SUBROUTINE electrons_scf ( printout, exxen )
|
|||
USE libmbd_interface, ONLY : EmbdvdW
|
||||
USE add_dmft_occ, ONLY : dmft, dmft_update, v_dmft, dmft_updated
|
||||
!
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
USE device_fbuff_m, ONLY : dev_buf, pin_buf
|
||||
USE pwcom, ONLY : report_mag
|
||||
USE makovpayne, ONLY : makov_payne
|
||||
|
@ -987,13 +982,8 @@ SUBROUTINE electrons_scf ( printout, exxen )
|
|||
!
|
||||
! ... define the total local potential (external + scf)
|
||||
!
|
||||
CALL using_vrs(1)
|
||||
CALL sum_vrs( dfftp%nnr, nspin, vltot, v%of_r, vrs )
|
||||
!
|
||||
! ... interpolate the total local potential
|
||||
!
|
||||
CALL using_vrs(1) ! redundant
|
||||
CALL interpolate_vrs( dfftp%nnr, nspin, doublegrid, kedtau, v%kin_r, vrs )
|
||||
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, &
|
||||
nspin, doublegrid )
|
||||
!
|
||||
! ... in the US case we have to recompute the self-consistent
|
||||
! ... term in the nonlocal potential
|
||||
|
|
|
@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_apply( lda, n, nbande, psi, hpsi, pdir, e_field )
|
|||
USE noncollin_module, ONLY : noncolin, npol, lspinorb
|
||||
USE wvfct, ONLY : npwx, nbnd, ik => current_k
|
||||
USE lsda_mod, ONLY : current_spin, nspin
|
||||
USE scf, ONLY : vrs
|
||||
USE gvect
|
||||
USE uspp, ONLY : okvan, nkb, vkb, qq_so, qq_at
|
||||
USE uspp_param, ONLY : nh, nhm, nbetam
|
||||
|
|
|
@ -19,7 +19,6 @@ SUBROUTINE h_epsi_her_set( pdir, e_field )
|
|||
USE noncollin_module, ONLY: noncolin, npol, lspinorb
|
||||
USE wvfct, ONLY: npwx, nbnd
|
||||
USE lsda_mod, ONLY: current_spin, nspin
|
||||
USE scf, ONLY: vrs
|
||||
USE gvect
|
||||
USE fft_base, ONLY: dfftp
|
||||
USE uspp, ONLY: okvan, nkb, vkb
|
||||
|
|
|
@ -106,7 +106,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
USE sci_mod, ONLY: p_psi
|
||||
USE fft_helper_subroutines
|
||||
!
|
||||
USE scf_gpum, ONLY: using_vrs
|
||||
#if defined(__OSCDFT)
|
||||
USE plugin_flags, ONLY : use_oscdft
|
||||
USE oscdft_base, ONLY : oscdft_ctx
|
||||
|
@ -133,10 +132,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
!
|
||||
!
|
||||
CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
|
||||
|
||||
CALL using_vrs(0) ! vloc_psi_gamma (intent:in)
|
||||
|
||||
|
||||
!
|
||||
! ... Here we set the kinetic energy (k+G)^2 psi and clean up garbage
|
||||
!
|
||||
|
@ -162,9 +157,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
! ... real-space algorithm
|
||||
! ... fixme: real_space without beta functions does not make sense
|
||||
!
|
||||
IF ( dffts%has_task_groups ) &
|
||||
CALL errore( 'h_psi', 'task_groups not implemented with real_space', 1 )
|
||||
|
||||
DO ibnd = 1, m, 2
|
||||
! ... transform psi to real space -> psic
|
||||
CALL invfft_orbital_gamma( psi, ibnd, m )
|
||||
|
@ -180,6 +172,9 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
CALL fwfft_orbital_gamma( hpsi, ibnd, m, add_to_orbital=.TRUE. )
|
||||
ENDDO
|
||||
!
|
||||
ELSE IF ( dffts%has_task_groups ) THEN
|
||||
! ... usual reciprocal-space algorithm, with task groups
|
||||
CALL vloc_psi_tg_gamma( lda, n, m, psi, vrs(1,current_spin), hpsi )
|
||||
ELSE
|
||||
! ... usual reciprocal-space algorithm
|
||||
CALL vloc_psi_gamma( lda, n, m, psi, vrs(1,current_spin), hpsi )
|
||||
|
@ -188,7 +183,11 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
!
|
||||
ELSEIF ( noncolin ) THEN
|
||||
!
|
||||
CALL vloc_psi_nc( lda, n, m, psi, vrs, hpsi )
|
||||
IF ( dffts%has_task_groups ) THEN
|
||||
CALL vloc_psi_tg_nc( lda, n, m, psi, vrs, hpsi )
|
||||
ELSE
|
||||
CALL vloc_psi_nc( lda, n, m, psi, vrs, hpsi )
|
||||
END IF
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
|
@ -197,9 +196,6 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
! ... real-space algorithm
|
||||
! ... fixme: real_space without beta functions does not make sense
|
||||
!
|
||||
IF ( dffts%has_task_groups ) &
|
||||
CALL errore( 'h_psi', 'task_groups not implemented with real_space', 1 )
|
||||
!
|
||||
DO ibnd = 1, m
|
||||
! ... transform psi to real space -> psic
|
||||
CALL invfft_orbital_k( psi, ibnd, m )
|
||||
|
@ -216,9 +212,12 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
|
|||
!
|
||||
ENDDO
|
||||
!
|
||||
ELSE IF ( dffts%has_task_groups ) THEN
|
||||
! ... usual reciprocal-space algorithm, with task groups
|
||||
CALL vloc_psi_tg_k( lda, n, m, psi, vrs(1,current_spin), hpsi )
|
||||
ELSE
|
||||
!
|
||||
CALL vloc_psi_k( lda, n, m, psi, vrs(1,current_spin), hpsi )
|
||||
! ... usual reciprocal-space algorithm
|
||||
CALL vloc_psi_k( lda, n, m, psi, vrs(1,current_spin), hpsi )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
|
|
|
@ -97,7 +97,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
|||
USE bp, ONLY: lelfield, l3dstring, gdir, efield, efield_cry
|
||||
USE becmod, ONLY: bec_type, becp
|
||||
USE lsda_mod, ONLY: current_spin
|
||||
USE scf_gpum, ONLY: vrs_d, using_vrs_d
|
||||
USE uspp, ONLY: nkb, vkb
|
||||
USE ldaU, ONLY: lda_plus_u, lda_plus_u_kind, Hubbard_projectors
|
||||
USE gvect, ONLY: gstart
|
||||
|
@ -113,6 +112,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
|||
USE fft_helper_subroutines
|
||||
USE device_memcpy_m, ONLY: dev_memcpy, dev_memset
|
||||
!
|
||||
USE scf, ONLY: vrs
|
||||
USE wvfct, ONLY: g2kin
|
||||
#if defined(__OSCDFT)
|
||||
USE plugin_flags, ONLY : use_oscdft
|
||||
|
@ -140,7 +140,6 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
|||
LOGICAL :: need_host_copy
|
||||
!
|
||||
CALL start_clock_gpu( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
|
||||
CALL using_vrs_d(0)
|
||||
!
|
||||
! ... Here we add the kinetic energy (k+G)^2 psi and clean up garbage
|
||||
!
|
||||
|
@ -208,13 +207,13 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
|||
!
|
||||
ELSE
|
||||
! ... usual reciprocal-space algorithm
|
||||
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
|
||||
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ELSE IF ( noncolin ) THEN
|
||||
!
|
||||
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d )
|
||||
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs, hpsi_d )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
|
@ -246,7 +245,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
|
||||
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs(1,current_spin), hpsi_d )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
|
|
|
@ -35,7 +35,6 @@ SUBROUTINE hinit1()
|
|||
USE dfunct_gpum, ONLY : newd_gpu
|
||||
USE exx_base, ONLY : coulomb_fac, coulomb_done
|
||||
!
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
USE ener, ONLY : esol, vsol
|
||||
USE rism_module, ONLY : lrism, rism_update_pos, rism_calc3d
|
||||
!
|
||||
|
@ -113,7 +112,6 @@ SUBROUTINE hinit1()
|
|||
!
|
||||
! ... define the total local potential (external+scf)
|
||||
!
|
||||
CALL using_vrs(1)
|
||||
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, &
|
||||
doublegrid )
|
||||
!
|
||||
|
|
|
@ -47,7 +47,6 @@ SUBROUTINE orbm_kubo()
|
|||
USE gvecs, ONLY : doublegrid
|
||||
USE mp_pools, ONLY : intra_pool_comm
|
||||
USE mp_world, ONLY : world_comm
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
USE uspp_init, ONLY : init_us_2
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -112,7 +111,6 @@ SUBROUTINE orbm_kubo()
|
|||
ALLOCATE( H_evc(npol*npwx,nbnd) )
|
||||
ALLOCATE( temp(ngm) )
|
||||
!
|
||||
CALL using_vrs(1)
|
||||
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
|
||||
CALL allocate_bec_type( nkb, nbnd, becp )
|
||||
! Initializations
|
||||
|
|
|
@ -55,7 +55,6 @@ SUBROUTINE potinit()
|
|||
USE paw_init, ONLY : PAW_atomic_becsum
|
||||
USE paw_onecenter, ONLY : PAW_potential
|
||||
!
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
USE pwcom, ONLY : report_mag
|
||||
USE rism_module, ONLY : lrism, rism_init3d, rism_calc3d
|
||||
!
|
||||
|
@ -278,9 +277,7 @@ SUBROUTINE potinit()
|
|||
!
|
||||
! ... define the total local potential (external+scf)
|
||||
!
|
||||
CALL using_vrs(1)
|
||||
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
|
||||
!
|
||||
! ... write on output the parameters used in the DFT+U(+V) calculation
|
||||
!
|
||||
IF ( lda_plus_u ) THEN
|
||||
|
|
|
@ -2613,7 +2613,6 @@ MODULE realus
|
|||
USE fft_base, ONLY : dffts
|
||||
USE scf, ONLY : vrs
|
||||
USE lsda_mod, ONLY : current_spin
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -2629,8 +2628,6 @@ MODULE realus
|
|||
!
|
||||
CALL start_clock( 'v_loc_psir' )
|
||||
|
||||
CALL using_vrs(0) ! tg_gather (intent: in)
|
||||
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
IF (ibnd == 1 ) THEN
|
||||
CALL tg_gather( dffts, vrs(:,current_spin), tg_v )
|
||||
|
@ -2672,7 +2669,6 @@ MODULE realus
|
|||
USE fft_base, ONLY : dffts
|
||||
USE scf, ONLY : vrs
|
||||
USE lsda_mod, ONLY : current_spin
|
||||
USE scf_gpum, ONLY : using_vrs
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -2688,8 +2684,6 @@ MODULE realus
|
|||
!
|
||||
CALL start_clock( 'v_loc_psir' )
|
||||
|
||||
CALL using_vrs(0) ! tg_gather (intent: in)
|
||||
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
IF (ibnd == 1 ) THEN
|
||||
CALL tg_gather( dffts, vrs(:,current_spin), tg_v )
|
||||
|
|
|
@ -98,9 +98,6 @@ MODULE scf
|
|||
REAL(DP), ALLOCATABLE :: vltot(:)
|
||||
!! the local potential in real space
|
||||
REAL(DP), ALLOCATABLE :: vrs(:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes(pinned) :: vrs
|
||||
#endif
|
||||
!! the total pot. in real space (smooth grid)
|
||||
REAL(DP), ALLOCATABLE :: rho_core(:)
|
||||
!! the core charge in real space
|
||||
|
|
|
@ -1,124 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2002-2011 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
#define DIMS1D(arr) lbound(arr,1):ubound(arr,1)
|
||||
#define DIMS2D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2)
|
||||
#define DIMS3D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3)
|
||||
#define DIMS4D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4)
|
||||
#define DIMS5D(arr) lbound(arr,1):ubound(arr,1),lbound(arr,2):ubound(arr,2),lbound(arr,3):ubound(arr,3),lbound(arr,4):ubound(arr,4),lbound(arr,5):ubound(arr,5)
|
||||
!=----------------------------------------------------------------------------=!
|
||||
MODULE scf_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
|
||||
INTEGER, PARAMETER :: sgl = selected_real_kind(6,30)
|
||||
INTEGER, PARAMETER :: i4b = selected_int_kind(9)
|
||||
INTEGER, PARAMETER :: i8b = selected_int_kind(18)
|
||||
#if defined(__DEBUG)
|
||||
INTEGER :: iverbosity = 1
|
||||
#else
|
||||
INTEGER :: iverbosity = 0
|
||||
#endif
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: vrs_d(:, :)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes (DEVICE) :: vrs_d
|
||||
#endif
|
||||
|
||||
LOGICAL :: vrs_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: vrs_d_ood = .false. ! used to flag out of date variables
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE using_vrs(intento, debug_info)
|
||||
!
|
||||
! intento is used to specify what the variable will be used for :
|
||||
! 0 -> in , the variable needs to be synchronized but won't be changed
|
||||
! 1 -> inout , the variable needs to be synchronized AND will be changed
|
||||
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
|
||||
!
|
||||
USE scf, ONLY : vrs
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
INTEGER :: intento_
|
||||
intento_ = intento
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, "using_vrs ", debug_info, vrs_ood
|
||||
!
|
||||
IF (vrs_ood) THEN
|
||||
IF ((.not. allocated(vrs_d)) .and. (intento_ < 2)) THEN
|
||||
CALL errore('using_vrs_d', 'PANIC: sync of vrs from vrs_d with unallocated array. Bye!!', 1)
|
||||
stop
|
||||
END IF
|
||||
IF (.not. allocated(vrs)) THEN
|
||||
IF (intento_ /= 2) THEN
|
||||
print *, "WARNING: sync of vrs with unallocated array and intento /= 2? Changed to 2!"
|
||||
intento_ = 2
|
||||
END IF
|
||||
! IF (intento_ > 0) vrs_d_ood = .true.
|
||||
END IF
|
||||
IF (intento_ < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied vrs D->H"
|
||||
vrs = vrs_d
|
||||
END IF
|
||||
vrs_ood = .false.
|
||||
ENDIF
|
||||
IF (intento_ > 0) vrs_d_ood = .true.
|
||||
#endif
|
||||
END SUBROUTINE using_vrs
|
||||
!
|
||||
SUBROUTINE using_vrs_d(intento, debug_info)
|
||||
!
|
||||
USE scf, ONLY : vrs
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, "using_vrs_d ", debug_info, vrs_d_ood
|
||||
!
|
||||
IF (.not. allocated(vrs)) THEN
|
||||
IF (intento /= 2) print *, "WARNING: sync of vrs_d with unallocated array and intento /= 2?"
|
||||
IF (allocated(vrs_d)) DEALLOCATE(vrs_d)
|
||||
vrs_d_ood = .false.
|
||||
RETURN
|
||||
END IF
|
||||
! here we know that vrs is allocated, check if size is 0
|
||||
IF ( SIZE(vrs) == 0 ) THEN
|
||||
print *, "Refusing to allocate 0 dimensional array vrs_d. If used, code will crash."
|
||||
RETURN
|
||||
END IF
|
||||
!
|
||||
IF (vrs_d_ood) THEN
|
||||
IF ( allocated(vrs_d) .and. (SIZE(vrs_d)/=SIZE(vrs))) deallocate(vrs_d)
|
||||
IF (.not. allocated(vrs_d)) ALLOCATE(vrs_d(DIMS2D(vrs))) ! MOLD does not work on all compilers
|
||||
IF (intento < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied vrs H->D"
|
||||
vrs_d = vrs
|
||||
END IF
|
||||
vrs_d_ood = .false.
|
||||
ENDIF
|
||||
IF (intento > 0) vrs_ood = .true.
|
||||
#else
|
||||
CALL errore('using_vrs_d', 'Trying to use device data without device compiled code!', 1)
|
||||
#endif
|
||||
END SUBROUTINE using_vrs_d
|
||||
!
|
||||
SUBROUTINE deallocate_scf_gpu
|
||||
IF( ALLOCATED( vrs_d ) ) DEALLOCATE( vrs_d )
|
||||
vrs_d_ood = .false.
|
||||
END SUBROUTINE deallocate_scf_gpu
|
||||
!=----------------------------------------------------------------------------=!
|
||||
END MODULE scf_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
|
@ -1,5 +1,5 @@
|
|||
!
|
||||
! Copyright (C) 2001 PWSCF group
|
||||
! Copyright (C) 2001-2024 Quantum ESPESSO Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
|
@ -13,7 +13,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid )
|
|||
!! the sum of all the local pseudopotential contributions.
|
||||
!
|
||||
USE kinds
|
||||
USE fft_base, ONLY : dffts
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
USE fft_base, ONLY : dffts, dfftp
|
||||
USE fft_interfaces, ONLY : fft_interpolate
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -33,42 +35,9 @@ SUBROUTINE set_vrs( vrs, vltot, vr, kedtau, kedtaur, nrxx, nspin, doublegrid )
|
|||
!! the kinetic energy density in R-space
|
||||
LOGICAL :: doublegrid
|
||||
! input: true if a doublegrid is used
|
||||
!
|
||||
CALL sum_vrs( nrxx, nspin, vltot, vr, vrs )
|
||||
!
|
||||
CALL interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE set_vrs
|
||||
!
|
||||
!
|
||||
!--------------------------------------------------------------------
|
||||
SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs )
|
||||
!--------------------------------------------------------------------
|
||||
!! Accumulates local potential contributions into vrs (the total local potential).
|
||||
!
|
||||
USE kinds
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: nspin
|
||||
!! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
|
||||
INTEGER :: nrxx
|
||||
!! input: the fft grid dimension
|
||||
REAL(DP) :: vrs(nrxx,nspin)
|
||||
!! output: total local potential on the smooth grid:
|
||||
!! \(\text{vrs\}=\text{vltot}+\text{vr}\)
|
||||
REAL(DP) :: vltot(nrxx)
|
||||
!! input: the total local pseudopotential
|
||||
REAL(DP) :: vr(nrxx,nspin)
|
||||
!! input: the scf(H+xc) part of the local potential
|
||||
!
|
||||
! ... local variable
|
||||
!
|
||||
INTEGER :: is
|
||||
!
|
||||
!
|
||||
!$acc data present(vrs)
|
||||
DO is = 1, nspin
|
||||
!
|
||||
! define the total local potential (external + scf) for each spin ...
|
||||
|
@ -84,39 +53,6 @@ SUBROUTINE sum_vrs( nrxx, nspin, vltot, vr, vrs )
|
|||
!
|
||||
ENDDO
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE sum_vrs
|
||||
!
|
||||
!--------------------------------------------------------------------------
|
||||
SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
|
||||
!--------------------------------------------------------------------------
|
||||
!! Interpolates local potential on the smooth mesh if necessary.
|
||||
!
|
||||
USE kinds
|
||||
USE xc_lib, ONLY : xclib_dft_is
|
||||
USE fft_base, ONLY : dffts, dfftp
|
||||
USE fft_interfaces, ONLY : fft_interpolate
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: nspin
|
||||
!! input: number of spin components: 1 if lda, 2 if lsd, 4 if noncolinear
|
||||
INTEGER :: nrxx
|
||||
!! input: the fft grid dimension
|
||||
REAL(DP) :: vrs(nrxx,nspin)
|
||||
!! output: total local potential interpolated on the smooth grid
|
||||
REAL(DP) :: kedtau(dffts%nnr,nspin)
|
||||
!! position dependent kinetic energy enhancement factor
|
||||
REAL(DP) :: kedtaur(nrxx,nspin)
|
||||
!! the kinetic energy density in R-space
|
||||
LOGICAL :: doublegrid
|
||||
!! input: true if a doublegrid is used
|
||||
!
|
||||
! ... local variable
|
||||
!
|
||||
INTEGER :: is
|
||||
!
|
||||
! ... interpolate it on the smooth mesh if necessary
|
||||
!
|
||||
DO is = 1, nspin
|
||||
|
@ -124,6 +60,10 @@ SUBROUTINE interpolate_vrs( nrxx, nspin, doublegrid, kedtau, kedtaur, vrs )
|
|||
IF (xclib_dft_is('meta')) CALL fft_interpolate( dfftp, kedtaur(:,is), dffts, kedtau(:,is) )
|
||||
ENDDO
|
||||
!
|
||||
!$acc update device(vrs)
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE interpolate_vrs
|
||||
END SUBROUTINE set_vrs
|
||||
|
|
|
@ -484,8 +484,6 @@ SUBROUTINE sum_band()
|
|||
! ... here we sum for each k point the contribution
|
||||
! ... of the wavefunctions to the charge
|
||||
!
|
||||
incr = 1
|
||||
!
|
||||
IF(sic) THEN
|
||||
ALLOCATE(rho_p(dffts%nnr))
|
||||
ALLOCATE(psic_p(dffts%nnr*2)) ! why *2?
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
!
|
||||
! Copyright (C) 2003-2013 PWSCF group
|
||||
! Copyright (C) 2003-2024 QUantum ESPRESS Foundation
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
|
@ -7,9 +7,9 @@
|
|||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_gamma( lda, n, m, psi, v, hpsi )
|
||||
SUBROUTINE vloc_psi_tg_gamma( lda, n, m, psi, v, hpsi )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
|
||||
!! Calculation of Vloc*psi using dual-space technique and "task groups"
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
|
@ -41,113 +41,72 @@ SUBROUTINE vloc_psi_gamma( lda, n, m, psi, v, hpsi )
|
|||
COMPLEX(DP) :: fp, fm
|
||||
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
|
||||
! ... Variables for task groups
|
||||
LOGICAL :: use_tg
|
||||
INTEGER :: v_siz, idx, ebnd, brange
|
||||
REAL(DP) :: fac
|
||||
REAL(DP), ALLOCATABLE :: tg_v(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
|
||||
!
|
||||
CALL start_clock( 'vloc_psi' )
|
||||
incr = 2
|
||||
!
|
||||
use_tg = dffts%has_task_groups
|
||||
IF (.not. dffts%has_task_groups ) CALL errore('vloc_psi','no task groups?',1)
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL start_clock( 'vloc_psi:tg_gather' )
|
||||
v_siz = dffts%nnr_tg
|
||||
ALLOCATE( tg_v(v_siz) )
|
||||
ALLOCATE( tg_psic(v_siz) )
|
||||
CALL tg_gather( dffts, v, tg_v )
|
||||
incr = 2*fftx_ntgrp(dffts)
|
||||
ALLOCATE( tg_vpsi(n,incr) )
|
||||
CALL stop_clock( 'vloc_psi:tg_gather' )
|
||||
ELSE
|
||||
ALLOCATE( vpsi(n,incr) )
|
||||
ENDIF
|
||||
CALL start_clock( 'vloc_psi:tg_gather' )
|
||||
incr = 2*fftx_ntgrp(dffts)
|
||||
v_siz = dffts%nnr_tg
|
||||
ALLOCATE( tg_v(v_siz) )
|
||||
ALLOCATE( tg_psic(v_siz) )
|
||||
CALL tg_gather( dffts, v, tg_v )
|
||||
ALLOCATE( tg_vpsi(n,incr) )
|
||||
CALL stop_clock( 'vloc_psi:tg_gather' )
|
||||
!
|
||||
IF ( use_tg ) THEN
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
! ... the local potential V_Loc psi. First bring psi to real space
|
||||
!
|
||||
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n )
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
! ... product with the potential v on the smooth grid
|
||||
!
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j) = tg_psic(j) * tg_v(j)
|
||||
ENDDO
|
||||
!
|
||||
! ... back to reciprocal space
|
||||
! ... addition to the total product
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n )
|
||||
!
|
||||
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
||||
IF ( idx+ibnd-1<m ) THEN
|
||||
DO j = 1, n
|
||||
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + 0.5d0 * tg_vpsi(j,idx)
|
||||
hpsi(j,ibnd+idx) = hpsi(j,ibnd+idx) + 0.5d0 * tg_vpsi(j,idx+1)
|
||||
ENDDO
|
||||
ELSEIF ( idx+ibnd-1==m ) THEN
|
||||
DO j = 1, n
|
||||
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + tg_vpsi(j,idx)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
! ... the local potential V_Loc psi. First bring psi to real space
|
||||
!
|
||||
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n )
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
! ... product with the potential v on the smooth grid
|
||||
!
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j) = tg_psic(j) * tg_v(j)
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
! ... back to reciprocal space
|
||||
! ... addition to the total product
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
ebnd = ibnd
|
||||
IF ( ibnd < m ) ebnd = ibnd + 1
|
||||
!
|
||||
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
|
||||
!
|
||||
DO j = 1, dffts%nnr
|
||||
psic(j) = psic(j) * v(j)
|
||||
ENDDO
|
||||
!
|
||||
brange=1 ; fac=1.d0
|
||||
IF ( ibnd<m ) THEN
|
||||
brange=2 ; fac=0.5d0
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n )
|
||||
!
|
||||
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
||||
IF ( idx+ibnd-1<m ) THEN
|
||||
DO j = 1, n
|
||||
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + 0.5d0 * tg_vpsi(j,idx)
|
||||
hpsi(j,ibnd+idx) = hpsi(j,ibnd+idx) + 0.5d0 * tg_vpsi(j,idx+1)
|
||||
ENDDO
|
||||
ELSEIF ( idx+ibnd-1==m ) THEN
|
||||
DO j = 1, n
|
||||
hpsi(j,ibnd+idx-1) = hpsi(j,ibnd+idx-1) + tg_vpsi(j,idx)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
CALL wave_r2g( psic(1:dffts%nnr), vpsi(:,1:brange), dffts )
|
||||
!
|
||||
DO j = 1, n
|
||||
hpsi(j,ibnd) = hpsi(j,ibnd) + fac*vpsi(j,1)
|
||||
IF ( ibnd<m ) hpsi(j,ibnd+1) = hpsi(j,ibnd+1) + fac*vpsi(j,2)
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_psic )
|
||||
DEALLOCATE( tg_v )
|
||||
DEALLOCATE( tg_vpsi )
|
||||
ELSE
|
||||
DEALLOCATE( vpsi )
|
||||
ENDIF
|
||||
DEALLOCATE( tg_psic )
|
||||
DEALLOCATE( tg_v )
|
||||
DEALLOCATE( tg_vpsi )
|
||||
!
|
||||
CALL stop_clock( 'vloc_psi' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE vloc_psi_gamma
|
||||
END SUBROUTINE vloc_psi_tg_gamma
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
|
||||
SUBROUTINE vloc_psi_tg_k( lda, n, m, psi, v, hpsi )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - k-points:
|
||||
!
|
||||
|
@ -190,107 +149,63 @@ SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
|
|||
INTEGER, PARAMETER :: blocksize = 256
|
||||
INTEGER :: numblock
|
||||
! ... Task Groups
|
||||
LOGICAL :: use_tg
|
||||
REAL(DP), ALLOCATABLE :: tg_v(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
|
||||
INTEGER :: idx, brange, dffts_nnr
|
||||
INTEGER :: v_siz, idx, brange
|
||||
!
|
||||
IF (.not. dffts%has_task_groups ) CALL errore('vloc_psi','no task groups?',2)
|
||||
!
|
||||
CALL start_clock( 'vloc_psi' )
|
||||
use_tg = dffts%has_task_groups
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL start_clock( 'vloc_psi:tg_gather' )
|
||||
dffts_nnr = dffts%nnr_tg
|
||||
incr = fftx_ntgrp(dffts)
|
||||
ALLOCATE( tg_v(dffts_nnr) )
|
||||
ALLOCATE( tg_psic(dffts_nnr), tg_vpsi(lda,incr) )
|
||||
CALL tg_gather( dffts, v, tg_v )
|
||||
CALL stop_clock( 'vloc_psi:tg_gather' )
|
||||
ELSE
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( vpsi(lda,1) )
|
||||
ENDIF
|
||||
CALL start_clock( 'vloc_psi:tg_gather' )
|
||||
v_siz = dffts%nnr_tg
|
||||
incr = fftx_ntgrp(dffts)
|
||||
ALLOCATE( tg_v(v_siz) )
|
||||
ALLOCATE( tg_psic(v_siz), tg_vpsi(lda,incr) )
|
||||
CALL tg_gather( dffts, v, tg_v )
|
||||
CALL stop_clock( 'vloc_psi:tg_gather' )
|
||||
!
|
||||
IF ( use_tg ) THEN
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
!
|
||||
! ... compute the number of chuncks
|
||||
numblock = (n+blocksize-1)/blocksize
|
||||
!
|
||||
DO ibnd = 1, m, fftx_ntgrp(dffts)
|
||||
!
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
! ... compute the number of chuncks
|
||||
numblock = (n+blocksize-1)/blocksize
|
||||
! write (6,*) 'wfc R '
|
||||
! write (6,99) (tg_psic(i), i=1,400)
|
||||
!
|
||||
DO ibnd = 1, m, fftx_ntgrp(dffts)
|
||||
!
|
||||
CALL tgwave_g2r( psi(:,ibnd:m), tg_psic, dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
! write (6,*) 'wfc R '
|
||||
! write (6,99) (tg_psic(i), i=1,400)
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
!$omp parallel do
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j) = tg_psic(j) * tg_v(j)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
! write (6,*) 'v psi R '
|
||||
! write (6,99) (tg_psic(i), i=1,400)
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
!$omp parallel do collapse(2)
|
||||
DO idx = 0, MIN(fftx_ntgrp(dffts)-1, m-ibnd)
|
||||
DO j = 1, numblock
|
||||
DO iin = (j-1)*blocksize+1, MIN(j*blocksize,n)
|
||||
hpsi(iin,ibnd+idx) = hpsi(iin,ibnd+idx) + tg_vpsi(iin,idx+1)
|
||||
ENDDO
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
!$omp parallel do
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j) = tg_psic(j) * tg_v(j)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
! write (6,*) 'v psi R '
|
||||
! write (6,99) (tg_psic(i), i=1,400)
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
!$omp parallel do collapse(2)
|
||||
DO idx = 0, MIN(fftx_ntgrp(dffts)-1, m-ibnd)
|
||||
DO j = 1, numblock
|
||||
DO iin = (j-1)*blocksize+1, MIN(j*blocksize,n)
|
||||
hpsi(iin,ibnd+idx) = hpsi(iin,ibnd+idx) + tg_vpsi(iin,idx+1)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
DO ibnd = 1, m
|
||||
!
|
||||
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
|
||||
!
|
||||
! write (6,*) 'wfc R '
|
||||
! write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
!$omp parallel do
|
||||
DO j = 1, dffts_nnr
|
||||
psic(j) = psic(j) * v(j)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
! write (6,*) 'v psi R '
|
||||
! write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
CALL wave_r2g( psic(1:dffts_nnr), vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
|
||||
!
|
||||
!$omp parallel do
|
||||
DO i = 1, n
|
||||
hpsi(i,ibnd) = hpsi(i,ibnd) + vpsi(i,1)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
! write (6,*) 'v psi G ', ibnd
|
||||
! write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
IF ( use_tg ) THEN
|
||||
DEALLOCATE( tg_psic, tg_vpsi )
|
||||
DEALLOCATE( tg_v )
|
||||
ELSE
|
||||
DEALLOCATE( vpsi )
|
||||
ENDIF
|
||||
DEALLOCATE( tg_psic, tg_vpsi )
|
||||
DEALLOCATE( tg_v )
|
||||
!
|
||||
CALL stop_clock( 'vloc_psi' )
|
||||
!
|
||||
|
@ -298,10 +213,10 @@ SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
|
|||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE vloc_psi_k
|
||||
END SUBROUTINE vloc_psi_tg_k
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_nc( lda, n, m, psi, v, hpsi )
|
||||
SUBROUTINE vloc_psi_tg_nc( lda, n, m, psi, v, hpsi )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - noncollinear case.
|
||||
!
|
||||
|
@ -345,132 +260,333 @@ SUBROUTINE vloc_psi_nc( lda, n, m, psi, v, hpsi )
|
|||
INTEGER :: v_siz, idx, ioff, brange
|
||||
INTEGER :: right_nr3, right_inc
|
||||
!
|
||||
IF (.not. dffts%has_task_groups ) CALL errore('vloc_psi','no task groups?',3)
|
||||
!
|
||||
CALL start_clock( 'vloc_psi' )
|
||||
!
|
||||
CALL start_clock( 'vloc_psi:tg_gather' )
|
||||
incr = fftx_ntgrp(dffts)
|
||||
v_siz = dffts%nnr_tg
|
||||
IF (domag) THEN
|
||||
ALLOCATE( tg_v(v_siz,4) )
|
||||
DO is = 1, nspin
|
||||
CALL tg_gather( dffts, v(:,is), tg_v(:,is) )
|
||||
ENDDO
|
||||
ELSE
|
||||
ALLOCATE( tg_v(v_siz,1) )
|
||||
CALL tg_gather( dffts, v(:,1), tg_v(:,1) )
|
||||
ENDIF
|
||||
ALLOCATE( tg_psic(v_siz,npol), tg_vpsi(lda,incr) )
|
||||
CALL stop_clock( 'vloc_psi:tg_gather' )
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*(ipol-1)+n
|
||||
CALL tgwave_g2r( psi(ii:ie,ibnd:m), tg_psic(:,ipol), dffts, n, &
|
||||
igk_k(:,current_k) )
|
||||
ENDDO
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
IF (domag) THEN
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
sup = tg_psic(j,1) * (tg_v(j,1)+tg_v(j,4)) + &
|
||||
tg_psic(j,2) * (tg_v(j,2)-(0.d0,1.d0)*tg_v(j,3))
|
||||
sdwn = tg_psic(j,2) * (tg_v(j,1)-tg_v(j,4)) + &
|
||||
tg_psic(j,1) * (tg_v(j,2)+(0.d0,1.d0)*tg_v(j,3))
|
||||
tg_psic(j,1) = sup
|
||||
tg_psic(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j,:) = tg_psic(j,:) * tg_v(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic(:,ipol), tg_vpsi(:,1:brange), dffts, n, &
|
||||
igk_k(:,current_k) )
|
||||
!
|
||||
CALL tg_get_recip_inc( dffts, right_inc )
|
||||
!
|
||||
ioff = 0
|
||||
!$omp parallel do
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
IF ( idx+ibnd-1<=m ) THEN
|
||||
DO j = 1, n
|
||||
hpsi(j,ipol,ibnd+idx-1) = hpsi(j,ipol,ibnd+idx-1) + tg_vpsi(j,idx)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ioff = ioff + right_inc
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( tg_v )
|
||||
DEALLOCATE( tg_psic, tg_vpsi )
|
||||
!
|
||||
CALL stop_clock ('vloc_psi')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE vloc_psi_tg_nc
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_gamma( lda, n, m, psi, v, hpsi )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE fft_base, ONLY : dffts
|
||||
USE fft_wave
|
||||
USE wavefunctions, ONLY : psic
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda
|
||||
!! leading dimension of arrays psi, hpsi
|
||||
INTEGER, INTENT(IN) :: n
|
||||
!! true dimension of psi, hpsi
|
||||
INTEGER, INTENT(IN) :: m
|
||||
!! number of states psi
|
||||
COMPLEX(DP), INTENT(IN) :: psi(lda,m)
|
||||
!! the wavefunction
|
||||
COMPLEX(DP), INTENT(INOUT) :: hpsi(lda,m)
|
||||
!! Hamiltonian dot psi
|
||||
REAL(DP), INTENT(IN) :: v(dffts%nnr)
|
||||
!! the total pot. in real space (smooth grid) for current spin
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ibnd, j, incr, brange, ebnd
|
||||
REAL(DP) :: fac
|
||||
COMPLEX(DP) :: fp, fm
|
||||
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
|
||||
!
|
||||
CALL start_clock( 'vloc_psi' )
|
||||
incr = 2
|
||||
!
|
||||
IF ( dffts%has_task_groups ) CALL errore('vloc_psi','no task groups!',1)
|
||||
ALLOCATE( vpsi(n,incr) )
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
ebnd = ibnd
|
||||
IF ( ibnd < m ) ebnd = ibnd + 1
|
||||
!
|
||||
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
|
||||
!
|
||||
DO j = 1, dffts%nnr
|
||||
psic(j) = psic(j) * v(j)
|
||||
ENDDO
|
||||
!
|
||||
brange=1 ; fac=1.d0
|
||||
IF ( ibnd<m ) THEN
|
||||
brange=2 ; fac=0.5d0
|
||||
ENDIF
|
||||
!
|
||||
CALL wave_r2g( psic(1:dffts%nnr), vpsi(:,1:brange), dffts )
|
||||
!
|
||||
DO j = 1, n
|
||||
hpsi(j,ibnd) = hpsi(j,ibnd) + fac*vpsi(j,1)
|
||||
IF ( ibnd<m ) hpsi(j,ibnd+1) = hpsi(j,ibnd+1) + fac*vpsi(j,2)
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( vpsi )
|
||||
!
|
||||
CALL stop_clock( 'vloc_psi' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE vloc_psi_gamma
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_k( lda, n, m, psi, v, hpsi )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - k-points:
|
||||
!
|
||||
!! * fft to real space;
|
||||
!! * product with the potential v on the smooth grid;
|
||||
!! * back to reciprocal space;
|
||||
!! * addition to the hpsi.
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE wvfct, ONLY : current_k
|
||||
USE klist, ONLY : igk_k
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE fft_base, ONLY : dffts
|
||||
USE fft_wave
|
||||
USE wavefunctions, ONLY : psic
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda
|
||||
!! leading dimension of arrays psi, hpsi
|
||||
INTEGER, INTENT(IN) :: n
|
||||
!! true dimension of psi, hpsi
|
||||
INTEGER, INTENT(IN) :: m
|
||||
!! number of states psi
|
||||
COMPLEX(DP), INTENT(IN) :: psi(lda,m)
|
||||
!! the wavefunction
|
||||
COMPLEX(DP), INTENT(INOUT) :: hpsi(lda,m)
|
||||
!! Hamiltonian dot psi
|
||||
REAL(DP), INTENT(IN) :: v(dffts%nnr)
|
||||
!! the total pot. in real space (smooth grid) for current spin
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ibnd, j, incr
|
||||
INTEGER :: i, iin
|
||||
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
|
||||
! ... chunking parameters
|
||||
INTEGER, PARAMETER :: blocksize = 256
|
||||
INTEGER :: numblock
|
||||
INTEGER :: idx, brange, v_siz
|
||||
!
|
||||
CALL start_clock( 'vloc_psi' )
|
||||
!
|
||||
IF (dffts%has_task_groups ) CALL errore('vloc_psi','no task groups!',2)
|
||||
!
|
||||
v_siz = dffts%nnr
|
||||
ALLOCATE( vpsi(lda,1) )
|
||||
!
|
||||
DO ibnd = 1, m
|
||||
!
|
||||
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
|
||||
!
|
||||
! write (6,*) 'wfc R '
|
||||
! write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
!$omp parallel do
|
||||
DO j = 1, v_siz
|
||||
psic(j) = psic(j) * v(j)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
! write (6,*) 'v psi R '
|
||||
! write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
CALL wave_r2g( psic(1:v_siz), vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
|
||||
!
|
||||
!$omp parallel do
|
||||
DO i = 1, n
|
||||
hpsi(i,ibnd) = hpsi(i,ibnd) + vpsi(i,1)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
! write (6,*) 'v psi G ', ibnd
|
||||
! write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( vpsi )
|
||||
!
|
||||
CALL stop_clock( 'vloc_psi' )
|
||||
!
|
||||
99 format ( 20 ('(',2f12.9,')') )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE vloc_psi_k
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_nc( lda, n, m, psi, v, hpsi )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - noncollinear case.
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE wvfct, ONLY : current_k
|
||||
USE klist, ONLY : igk_k
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE fft_base, ONLY : dffts, dfftp
|
||||
USE fft_wave
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE noncollin_module, ONLY : npol, domag
|
||||
USE wavefunctions, ONLY : psic_nc
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda
|
||||
!! leading dimension of arrays psi, hpsi
|
||||
INTEGER, INTENT(IN) :: n
|
||||
!! true dimension of psi, hpsi
|
||||
INTEGER, INTENT(IN) :: m
|
||||
!! number of states psi
|
||||
REAL(DP), INTENT(IN) :: v(dfftp%nnr,4) ! beware dimensions!
|
||||
!! the total pot. in real space (smooth grid)
|
||||
COMPLEX(DP), INTENT(IN) :: psi(lda*npol,m)
|
||||
!! the wavefunction
|
||||
COMPLEX(DP), INTENT(INOUT) :: hpsi(lda,npol,m)
|
||||
!! Hamiltonian dot psi
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ibnd, j,ipol, incr, is, ii, ie
|
||||
COMPLEX(DP) :: sup, sdwn
|
||||
COMPLEX(DP), ALLOCATABLE :: vpsi(:,:)
|
||||
INTEGER :: v_siz, idx, ioff, brange
|
||||
!
|
||||
CALL start_clock( 'vloc_psi' )
|
||||
!
|
||||
incr = 1
|
||||
!
|
||||
use_tg = dffts%has_task_groups
|
||||
IF (dffts%has_task_groups ) CALL errore('vloc_psi','no task groups!',3)
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL start_clock( 'vloc_psi:tg_gather' )
|
||||
v_siz = dffts%nnr_tg
|
||||
ALLOCATE( vpsi(lda,1) )
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
psic_nc = (0.d0,0.d0)
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*(ipol-1)+n
|
||||
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
ENDDO
|
||||
!
|
||||
IF (domag) THEN
|
||||
ALLOCATE( tg_v(v_siz,4) )
|
||||
DO is = 1, nspin
|
||||
CALL tg_gather( dffts, v(:,is), tg_v(:,is) )
|
||||
DO j = 1, dffts%nnr
|
||||
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
|
||||
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
|
||||
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
|
||||
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
|
||||
psic_nc(j,1) = sup
|
||||
psic_nc(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
ALLOCATE( tg_v(v_siz,1) )
|
||||
CALL tg_gather( dffts, v(:,1), tg_v(:,1) )
|
||||
DO j = 1, dffts%nnr
|
||||
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
incr = fftx_ntgrp(dffts)
|
||||
ALLOCATE( tg_psic(v_siz,npol), tg_vpsi(lda,incr) )
|
||||
CALL stop_clock( 'vloc_psi:tg_gather' )
|
||||
ELSE
|
||||
ALLOCATE( vpsi(lda,1) )
|
||||
ENDIF
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*(ipol-1)+n
|
||||
CALL tgwave_g2r( psi(ii:ie,ibnd:m), tg_psic(:,ipol), dffts, n, &
|
||||
igk_k(:,current_k) )
|
||||
DO ipol = 1, npol
|
||||
CALL wave_r2g( psic_nc(1:dffts%nnr,ipol), vpsi(1:n,:), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
!$omp parallel do
|
||||
DO j = 1, n
|
||||
hpsi(j,ipol,ibnd) = hpsi(j,ipol,ibnd) + vpsi(j,1)
|
||||
ENDDO
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
IF (domag) THEN
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
sup = tg_psic(j,1) * (tg_v(j,1)+tg_v(j,4)) + &
|
||||
tg_psic(j,2) * (tg_v(j,2)-(0.d0,1.d0)*tg_v(j,3))
|
||||
sdwn = tg_psic(j,2) * (tg_v(j,1)-tg_v(j,4)) + &
|
||||
tg_psic(j,1) * (tg_v(j,2)+(0.d0,1.d0)*tg_v(j,3))
|
||||
tg_psic(j,1) = sup
|
||||
tg_psic(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j,:) = tg_psic(j,:) * tg_v(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic(:,ipol), tg_vpsi(:,1:brange), dffts, n, &
|
||||
igk_k(:,current_k) )
|
||||
!
|
||||
CALL tg_get_recip_inc( dffts, right_inc )
|
||||
!
|
||||
ioff = 0
|
||||
!$omp parallel do
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
IF ( idx+ibnd-1<=m ) THEN
|
||||
DO j = 1, n
|
||||
hpsi(j,ipol,ibnd+idx-1) = hpsi(j,ipol,ibnd+idx-1) + tg_vpsi(j,idx)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ioff = ioff + right_inc
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$omp end parallel do
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
psic_nc = (0.d0,0.d0)
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*(ipol-1)+n
|
||||
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
ENDDO
|
||||
!
|
||||
IF (domag) THEN
|
||||
DO j = 1, dffts%nnr
|
||||
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
|
||||
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
|
||||
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
|
||||
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
|
||||
psic_nc(j,1) = sup
|
||||
psic_nc(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
DO j = 1, dffts%nnr
|
||||
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
CALL wave_r2g( psic_nc(1:dffts%nnr,ipol), vpsi(1:n,:), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
!$omp parallel do
|
||||
DO j = 1, n
|
||||
hpsi(j,ipol,ibnd) = hpsi(j,ipol,ibnd) + vpsi(j,1)
|
||||
ENDDO
|
||||
!$omp end parallel do
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_v )
|
||||
DEALLOCATE( tg_psic, tg_vpsi )
|
||||
ELSE
|
||||
DEALLOCATE( vpsi )
|
||||
ENDIF
|
||||
DEALLOCATE( vpsi )
|
||||
!
|
||||
CALL stop_clock ('vloc_psi')
|
||||
!
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
#define tg_gather_gpu tg_gather
|
||||
#endif
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
||||
SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v, hpsi_d )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - Gamma point.
|
||||
!
|
||||
|
@ -30,9 +30,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
INTEGER, INTENT(in) :: lda, n, m
|
||||
COMPLEX(DP), INTENT(in) :: psi_d(lda,m)
|
||||
COMPLEX(DP), INTENT(inout):: hpsi_d(lda,m)
|
||||
REAL(DP), INTENT(in) :: v_d(dffts%nnr)
|
||||
REAL(DP), INTENT(in) :: v(dffts%nnr)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, hpsi_d, v_d
|
||||
attributes(DEVICE) :: psi_d, hpsi_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
|
@ -42,12 +42,7 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psic(:), vpsi(:,:)
|
||||
! ... Variables for task groups
|
||||
LOGICAL :: use_tg
|
||||
REAL(DP), POINTER :: tg_v_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: tg_v_d
|
||||
INTEGER :: dffts_nnr, idx, ebnd, brange
|
||||
INTEGER :: ierr, ioff
|
||||
! ... Variables to handle batched FFT
|
||||
|
@ -64,63 +59,17 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
incr = 2*many_fft
|
||||
!
|
||||
use_tg = dffts%has_task_groups
|
||||
IF ( dffts%has_task_groups ) CALL errore('Vloc_psi_gpu','no task groups!',1)
|
||||
!
|
||||
IF ( use_tg ) THEN
|
||||
CALL start_clock_gpu( 'vloc_psi:tg_gather' )
|
||||
dffts_nnr = dffts%nnr_tg
|
||||
incr = 2*fftx_ntgrp(dffts)
|
||||
CALL dev_buf%lock_buffer( tg_v_d, dffts_nnr, ierr )
|
||||
ALLOCATE( tg_psic(dffts_nnr), tg_vpsi(dffts_nnr,incr) )
|
||||
CALL tg_gather_gpu( dffts, v_d, tg_v_d )
|
||||
CALL stop_clock_gpu( 'vloc_psi:tg_gather' )
|
||||
ELSE
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
|
||||
ENDIF
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
|
||||
!
|
||||
! ... The local potential V_Loc psi:
|
||||
! - fft to real space;
|
||||
! - product with the potential v on the smooth grid;
|
||||
! - back to reciprocal space.
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
!$acc data create(tg_psic,tg_vpsi)
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
CALL tgwave_g2r( psi(1:n,ibnd:m), tg_psic, dffts, n )
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j) = tg_psic(j) * tg_v_d(j)
|
||||
ENDDO
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n )
|
||||
!
|
||||
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
||||
IF ( idx+ibnd-1<m ) THEN
|
||||
!$acc parallel loop
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ibnd+idx-1) = hpsi_d(j,ibnd+idx-1) + 0.5d0 * tg_vpsi(j,idx)
|
||||
hpsi_d(j,ibnd+idx) = hpsi_d(j,ibnd+idx) + 0.5d0 * tg_vpsi(j,idx+1)
|
||||
ENDDO
|
||||
ELSEIF ( idx+ibnd-1==m ) THEN
|
||||
!$acc parallel loop
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ibnd+idx-1) = hpsi_d(j,ibnd+idx-1) + tg_vpsi(j,idx)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
ELSEIF (many_fft > 1) THEN
|
||||
IF (many_fft > 1) THEN
|
||||
!
|
||||
!$acc data create(psic,vpsi)
|
||||
DO ibnd = 1, m, incr
|
||||
|
@ -133,10 +82,10 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
CALL wave_g2r( psi(:,ibnd:ibnd+group_size-1), psic, dffts, howmany_set=hm_vec )
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$acc parallel loop collapse(2) present(v)
|
||||
DO idx = 0, howmany-1
|
||||
DO j = 1, dffts_nnr
|
||||
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j)
|
||||
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
|
@ -173,9 +122,9 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
CALL wave_g2r( psi(1:n,ibnd:ebnd), psic, dffts )
|
||||
!
|
||||
!$acc parallel loop
|
||||
!$acc parallel loop present(v)
|
||||
DO j = 1, dffts_nnr
|
||||
psic(j) = psic(j) * v_d(j)
|
||||
psic(j) = psic(j) * v(j)
|
||||
ENDDO
|
||||
!
|
||||
brange=1 ; fac=1.d0
|
||||
|
@ -199,12 +148,7 @@ SUBROUTINE vloc_psi_gamma_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!$acc end data
|
||||
DEALLOCATE( psi )
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_psic, tg_vpsi )
|
||||
CALL dev_buf%release_buffer( tg_v_d, ierr )
|
||||
ELSE
|
||||
DEALLOCATE( psic, vpsi )
|
||||
ENDIF
|
||||
DEALLOCATE( psic, vpsi )
|
||||
!
|
||||
CALL stop_clock_gpu ('vloc_psi')
|
||||
#endif
|
||||
|
@ -214,7 +158,7 @@ END SUBROUTINE vloc_psi_gamma_gpu
|
|||
!
|
||||
!@njs: vloc_psi_k
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
||||
SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v, hpsi_d )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - k-points. GPU double.
|
||||
!
|
||||
|
@ -241,9 +185,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
INTEGER, INTENT(IN) :: lda, n, m
|
||||
COMPLEX(DP), INTENT(IN) :: psi_d(lda,m)
|
||||
COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,m)
|
||||
REAL(DP), INTENT(IN) :: v_d(dffts%nnr)
|
||||
REAL(DP), INTENT(IN) :: v(dffts%nnr)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, hpsi_d, v_d
|
||||
attributes(DEVICE) :: psi_d, hpsi_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
|
@ -253,18 +197,14 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psic(:), vpsi(:,:)
|
||||
! ... Task Groups
|
||||
LOGICAL :: use_tg
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psic(:), tg_vpsi(:,:)
|
||||
REAL(DP), POINTER :: tg_v_d(:)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: tg_v_d
|
||||
!
|
||||
INTEGER :: dffts_nnr, idx, group_size, hm_vec(3)
|
||||
INTEGER :: ierr, brange
|
||||
!
|
||||
CALL start_clock_gpu ('vloc_psi')
|
||||
use_tg = dffts%has_task_groups
|
||||
IF ( dffts%has_task_groups ) CALL errore('Vloc_psi_gpu','no task groups!',2)
|
||||
!
|
||||
ALLOCATE( psi(lda,m) )
|
||||
!$acc data create( psi )
|
||||
|
@ -274,52 +214,10 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
incr = many_fft
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL start_clock_gpu ('vloc_psi:tg_gather')
|
||||
dffts_nnr = dffts%nnr_tg
|
||||
incr = fftx_ntgrp(dffts)
|
||||
CALL dev_buf%lock_buffer( tg_v_d, dffts_nnr, ierr )
|
||||
ALLOCATE( tg_psic(dffts_nnr), tg_vpsi(dffts_nnr,incr) )
|
||||
CALL tg_gather_gpu( dffts, v_d, tg_v_d )
|
||||
CALL stop_clock_gpu ('vloc_psi:tg_gather')
|
||||
ELSE
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
|
||||
ENDIF
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( psic(dffts_nnr*incr), vpsi(dffts_nnr,incr) )
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
!
|
||||
!$acc data create(tg_psic,tg_vpsi)
|
||||
DO ibnd = 1, m, fftx_ntgrp(dffts)
|
||||
!
|
||||
CALL tgwave_g2r( psi(1:n,ibnd:m), tg_psic, dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x* right_nr3
|
||||
tg_psic(j) = tg_psic(j) * tg_v_d(j)
|
||||
ENDDO
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
CALL tgwave_r2g( tg_psic, tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
IF ( idx+ibnd-1 <= m ) THEN
|
||||
!$acc parallel loop
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ibnd+idx-1) = hpsi_d(j,ibnd+idx-1) + tg_vpsi(j,idx-1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
ELSEIF (many_fft > 1) THEN
|
||||
IF (many_fft > 1) THEN
|
||||
!
|
||||
!$acc data create(psic,vpsi)
|
||||
DO ibnd = 1, m, incr
|
||||
|
@ -331,10 +229,10 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
CALL wave_g2r( psi(:,ibnd:ebnd), psic, dffts, igk=igk_k(:,current_k), &
|
||||
howmany_set=hm_vec )
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$acc parallel loop collapse(2) present(v)
|
||||
DO idx = 0, group_size-1
|
||||
DO j = 1, dffts_nnr
|
||||
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v_d(j)
|
||||
psic(idx*dffts_nnr+j) = psic(idx*dffts_nnr+j) * v(j)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
|
@ -358,9 +256,9 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!
|
||||
CALL wave_g2r( psi(1:n,ibnd:ibnd), psic, dffts, igk=igk_k(:,current_k) )
|
||||
!
|
||||
!$acc parallel loop
|
||||
!$acc parallel loop present(v)
|
||||
DO j = 1, dffts_nnr
|
||||
psic(j) = psic(j) * v_d(j)
|
||||
psic(j) = psic(j) * v(j)
|
||||
ENDDO
|
||||
!
|
||||
CALL wave_r2g( psic, vpsi(1:n,:), dffts, igk=igk_k(:,current_k) )
|
||||
|
@ -378,12 +276,7 @@ SUBROUTINE vloc_psi_k_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
!$acc end data
|
||||
DEALLOCATE( psi )
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_psic, tg_vpsi )
|
||||
CALL dev_buf%release_buffer( tg_v_d, ierr )
|
||||
ELSE
|
||||
DEALLOCATE( psic, vpsi )
|
||||
ENDIF
|
||||
DEALLOCATE( psic, vpsi )
|
||||
!
|
||||
CALL stop_clock_gpu( 'vloc_psi' )
|
||||
#endif
|
||||
|
@ -396,7 +289,7 @@ END SUBROUTINE vloc_psi_k_gpu
|
|||
!
|
||||
!@njs: vloc_psi_nc
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
||||
SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v, hpsi_d )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Calculation of Vloc*psi using dual-space technique - non-collinear -
|
||||
!! GPU version.
|
||||
|
@ -415,11 +308,11 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda, n, m
|
||||
REAL(DP), INTENT(IN) :: v_d(dfftp%nnr,4) ! beware dimensions!
|
||||
REAL(DP), INTENT(IN) :: v(dfftp%nnr,4) ! beware dimensions!
|
||||
COMPLEX(DP), INTENT(IN) :: psi_d(lda*npol,m)
|
||||
COMPLEX(DP), INTENT(INOUT):: hpsi_d(lda,npol,m)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: v_d, psi_d, hpsi_d
|
||||
attributes(DEVICE) :: psi_d, hpsi_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
|
@ -428,19 +321,14 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
COMPLEX(DP) :: sup, sdwn
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:), psic_nc(:,:), vpsic_nc(:,:)
|
||||
! ... Variables for task groups
|
||||
LOGICAL :: use_tg
|
||||
REAL(DP), ALLOCATABLE :: tg_v_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tg_psic(:,:), tg_vpsi(:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: tg_v_d
|
||||
INTEGER :: dffts_nnr, idx, ioff, ii, ie, brange
|
||||
INTEGER :: right_nnr, right_nr3, right_inc
|
||||
!
|
||||
CALL start_clock_gpu ('vloc_psi')
|
||||
!
|
||||
incr = 1
|
||||
use_tg = dffts%has_task_groups
|
||||
IF ( dffts%has_task_groups ) CALL errore('Vloc_psi_gpu','no task groups!',3)
|
||||
!
|
||||
ALLOCATE( psi(lda*npol,m) )
|
||||
!$acc data create( psi )
|
||||
|
@ -448,134 +336,54 @@ SUBROUTINE vloc_psi_nc_gpu( lda, n, m, psi_d, v_d, hpsi_d )
|
|||
psi = psi_d
|
||||
!$acc end kernels
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL start_clock_gpu( 'vloc_psi:tg_gather' )
|
||||
dffts_nnr = dffts%nnr_tg
|
||||
incr = fftx_ntgrp(dffts)
|
||||
IF (domag) THEN
|
||||
ALLOCATE( tg_v_d(dffts_nnr,4) )
|
||||
DO is = 1, nspin
|
||||
CALL tg_gather_gpu( dffts, v_d(:,is), tg_v_d(:,is) )
|
||||
ENDDO
|
||||
ELSE
|
||||
ALLOCATE( tg_v_d(dffts_nnr,1) )
|
||||
CALL tg_gather_gpu( dffts, v_d(:,1), tg_v_d(:,1) )
|
||||
ENDIF
|
||||
ALLOCATE( tg_psic(dffts_nnr,npol), tg_vpsi(lda,incr) )
|
||||
CALL stop_clock_gpu( 'vloc_psi:tg_gather' )
|
||||
ELSE
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( psic_nc(dffts_nnr,npol), vpsic_nc(lda,1) )
|
||||
ENDIF
|
||||
dffts_nnr = dffts%nnr
|
||||
ALLOCATE( psic_nc(dffts_nnr,npol), vpsic_nc(lda,1) )
|
||||
!
|
||||
! ... the local potential V_Loc psi. First the psi in real space
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!$acc data create( psic_nc, vpsic_nc )
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
!$acc data create( tg_psic, tg_vpsi )
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*ipol
|
||||
CALL tgwave_g2r( psi(ii:ie,ibnd:m), tg_psic(:,ipol), dffts, n, &
|
||||
igk_k(:,current_k) )
|
||||
ENDDO
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
IF (domag) THEN
|
||||
!$acc parallel loop
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
sup = tg_psic(j,1) * (tg_v_d(j,1)+tg_v_d(j,4)) + &
|
||||
tg_psic(j,2) * (tg_v_d(j,2)-(0.d0,1.d0)*tg_v_d(j,3))
|
||||
sdwn = tg_psic(j,2) * (tg_v_d(j,1)-tg_v_d(j,4)) + &
|
||||
tg_psic(j,1) * (tg_v_d(j,2)+(0.d0,1.d0)*tg_v_d(j,3))
|
||||
tg_psic(j,1) = sup
|
||||
tg_psic(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
!$acc parallel loop
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic(j,:) = tg_psic(j,:) * tg_v_d(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
brange = m-ibnd+1
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
CALL tgwave_r2g( tg_psic(:,ipol), tg_vpsi(:,1:brange), dffts, n, igk_k(:,current_k) )
|
||||
!
|
||||
CALL tg_get_recip_inc( dffts, right_inc )
|
||||
!
|
||||
ioff = 0
|
||||
!
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
IF ( idx+ibnd-1<=m ) THEN
|
||||
!$acc parallel loop
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ipol,ibnd+idx-1) = hpsi_d(j,ipol,ibnd+idx-1) + tg_vpsi(j,idx)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ioff = ioff + right_inc
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
!$acc kernels
|
||||
psic_nc = (0.d0,0.d0)
|
||||
!$acc end kernels
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*(ipol-1)+n
|
||||
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
ELSE
|
||||
IF (domag) THEN
|
||||
!$acc parallel loop present(v)
|
||||
DO j = 1, dffts_nnr
|
||||
sup = psic_nc(j,1) * (v(j,1)+v(j,4)) + &
|
||||
psic_nc(j,2) * (v(j,2)-(0.d0,1.d0)*v(j,3))
|
||||
sdwn = psic_nc(j,2) * (v(j,1)-v(j,4)) + &
|
||||
psic_nc(j,1) * (v(j,2)+(0.d0,1.d0)*v(j,3))
|
||||
psic_nc(j,1) = sup
|
||||
psic_nc(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
!$acc parallel loop present(v)
|
||||
DO j = 1, dffts_nnr
|
||||
psic_nc(j,:) = psic_nc(j,:) * v(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
!$acc data create( psic_nc, vpsic_nc )
|
||||
DO ibnd = 1, m, incr
|
||||
DO ipol = 1, npol
|
||||
CALL wave_r2g( psic_nc(:,ipol), vpsic_nc(1:n,:), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
!
|
||||
!$acc kernels
|
||||
psic_nc = (0.d0,0.d0)
|
||||
!$acc end kernels
|
||||
DO ipol = 1, npol
|
||||
ii = lda*(ipol-1)+1
|
||||
ie = lda*(ipol-1)+n
|
||||
CALL wave_g2r( psi(ii:ie,ibnd:ibnd), psic_nc(:,ipol), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
!$acc parallel loop
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + vpsic_nc(j,1)
|
||||
ENDDO
|
||||
!
|
||||
IF (domag) THEN
|
||||
!$acc parallel loop
|
||||
DO j = 1, dffts_nnr
|
||||
sup = psic_nc(j,1) * (v_d(j,1)+v_d(j,4)) + &
|
||||
psic_nc(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3))
|
||||
sdwn = psic_nc(j,2) * (v_d(j,1)-v_d(j,4)) + &
|
||||
psic_nc(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3))
|
||||
psic_nc(j,1) = sup
|
||||
psic_nc(j,2) = sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
!$acc parallel loop
|
||||
DO j = 1, dffts_nnr
|
||||
psic_nc(j,:) = psic_nc(j,:) * v_d(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
CALL wave_r2g( psic_nc(:,ipol), vpsic_nc(1:n,:), dffts, &
|
||||
igk=igk_k(:,current_k) )
|
||||
!
|
||||
!$acc parallel loop
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + vpsic_nc(j,1)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
!$acc end data
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
DEALLOCATE( tg_v_d, tg_psic, tg_vpsi )
|
||||
ELSE
|
||||
DEALLOCATE( psic_nc, vpsic_nc )
|
||||
ENDIF
|
||||
ENDDO
|
||||
!$acc end data
|
||||
DEALLOCATE( psic_nc, vpsic_nc )
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( psi )
|
||||
|
|
|
@ -39,7 +39,6 @@ SUBROUTINE lr_apply_liouvillian_magnons( evc1, evc1_new, L_dag )
|
|||
|
||||
USE io_global, ONLY : stdout
|
||||
USE uspp_init, ONLY : init_us_2
|
||||
USE scf_gpum, ONLY : vrs_d
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -402,13 +401,12 @@ SUBROUTINE lr_apply_liouvillian_magnons( evc1, evc1_new, L_dag )
|
|||
!
|
||||
! Change the sign of b_xc
|
||||
!
|
||||
!$acc kernels
|
||||
vrs(:,2) = - vrs(:,2)
|
||||
vrs(:,3) = - vrs(:,3)
|
||||
vrs(:,4) = - vrs(:,4)
|
||||
!$acc end kernels
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
vrs_d = vrs
|
||||
#endif
|
||||
! Apply the operator ( H - \epsilon S + alpha_pv P_v) to evc1
|
||||
! where alpha_pv = 0
|
||||
!
|
||||
|
@ -418,13 +416,11 @@ SUBROUTINE lr_apply_liouvillian_magnons( evc1, evc1_new, L_dag )
|
|||
!
|
||||
! Change the sign of b_xc back
|
||||
!
|
||||
!$acc kernels
|
||||
vrs(:,2) = - vrs(:,2)
|
||||
vrs(:,3) = - vrs(:,3)
|
||||
vrs(:,4) = - vrs(:,4)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
vrs_d = vrs
|
||||
#endif
|
||||
!$acc end kernels
|
||||
!
|
||||
IF (ALLOCATED(psic_nc)) DEALLOCATE(psic_nc)
|
||||
!
|
||||
|
|
|
@ -55,7 +55,6 @@ SUBROUTINE lr_readin
|
|||
USE constants, ONLY : eps4, rytoev
|
||||
USE control_lr, ONLY : lrpa, alpha_mix, ethr_nscf
|
||||
USE mp_world, ONLY : world_comm
|
||||
USE scf_gpum, ONLY: vrs_d
|
||||
#if defined (__ENVIRON)
|
||||
USE plugin_flags, ONLY : use_environ
|
||||
USE environ_base_module, ONLY : read_environ_input, init_environ_setup, &
|
||||
|
@ -504,9 +503,6 @@ SUBROUTINE lr_readin
|
|||
! vrs = vltot + v%of_r
|
||||
!
|
||||
CALL set_vrs ( vrs, vltot, v%of_r, 0, 0, dfftp%nnr, nspin, doublegrid )
|
||||
#if defined(__CUDA)
|
||||
vrs_d = vrs
|
||||
#endif
|
||||
!
|
||||
DEALLOCATE( vltot )
|
||||
CALL destroy_scf_type(v)
|
||||
|
|
Loading…
Reference in New Issue