mirror of https://gitlab.com/QEF/q-e.git
Added GPU version for h_psi, s_psi, g_psi and vloc. psic and psic_nc module variables should be standardized. Many data copies between CPU and GPU should be replaced by data in modules. All test passing except for pw_vc-relax/vc-relax3.in which has too loose convergence (noticed by Josh Romero) and for dsygvdx_gpu (real problem) occasionally failing
This commit is contained in:
parent
df72013a32
commit
5f6d231bdd
|
@ -281,7 +281,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
|
|||
!END DO
|
||||
! ========= TO HERE, REPLACED BY =======
|
||||
|
||||
CALL reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, vc_d)
|
||||
CALL reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, vc_d)
|
||||
!
|
||||
nb1 = nbase + 1
|
||||
!
|
||||
|
@ -608,62 +608,62 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
|
|||
!
|
||||
RETURN
|
||||
!
|
||||
CONTAINS
|
||||
SUBROUTINE reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
|
||||
USE david_param, ONLY : DP
|
||||
USE david_buffer, ONLY : buffer
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
|
||||
LOGICAL, INTENT(IN) :: conv(nvec)
|
||||
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
|
||||
COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
|
||||
!
|
||||
INTEGER :: n, np, info
|
||||
INTEGER, ALLOCATABLE :: conv_idx(:)
|
||||
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
|
||||
COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
|
||||
!
|
||||
np = 0
|
||||
ALLOCATE(conv_idx(nvec))
|
||||
DO n = 1, nvec
|
||||
conv_idx(n) = -1
|
||||
IF ( .NOT. conv(n) ) THEN
|
||||
np = np + 1
|
||||
conv_idx(n) = np
|
||||
END IF
|
||||
END DO
|
||||
|
||||
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
|
||||
|
||||
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
|
||||
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
|
||||
|
||||
conv_idx_d(1:nvec) = conv_idx(1:nvec)
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
vtmp_d(k,j) = v_d(k,j)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
IF(conv_idx_d(j) /= -1) THEN
|
||||
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
|
||||
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
CALL buffer%release_buffer(conv_idx_d, info)
|
||||
CALL buffer%release_buffer(vtmp_d, info)
|
||||
!
|
||||
DEALLOCATE(conv_idx)
|
||||
END SUBROUTINE
|
||||
|
||||
END SUBROUTINE cegterg_gpu
|
||||
|
||||
SUBROUTINE reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
|
||||
USE david_param, ONLY : DP
|
||||
USE david_buffer, ONLY : buffer
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
|
||||
LOGICAL, INTENT(IN) :: conv(nvec)
|
||||
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
|
||||
COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
|
||||
!
|
||||
INTEGER :: j, k, n, np, info
|
||||
INTEGER, ALLOCATABLE :: conv_idx(:)
|
||||
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
|
||||
COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
|
||||
!
|
||||
np = 0
|
||||
ALLOCATE(conv_idx(nvec))
|
||||
DO n = 1, nvec
|
||||
conv_idx(n) = -1
|
||||
IF ( .NOT. conv(n) ) THEN
|
||||
np = np + 1
|
||||
conv_idx(n) = np
|
||||
END IF
|
||||
END DO
|
||||
|
||||
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
|
||||
|
||||
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
|
||||
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
|
||||
|
||||
conv_idx_d(1:nvec) = conv_idx(1:nvec)
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
vtmp_d(k,j) = v_d(k,j)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
IF(conv_idx_d(j) /= -1) THEN
|
||||
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
|
||||
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
CALL buffer%release_buffer(conv_idx_d, info)
|
||||
CALL buffer%release_buffer(vtmp_d, info)
|
||||
!
|
||||
DEALLOCATE(conv_idx)
|
||||
END SUBROUTINE reorder_evals_cevecs
|
||||
|
||||
!
|
||||
! Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
|
||||
!
|
||||
|
|
|
@ -272,7 +272,7 @@ SUBROUTINE regterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
|
|||
!END DO
|
||||
! ========= TO HERE, REPLACED BY =======
|
||||
|
||||
CALL reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, vr_d)
|
||||
CALL reorder_evals_revecs(nbase, nvec, nvecx, conv, e_d, ew_d, vr_d)
|
||||
!
|
||||
nb1 = nbase + 1
|
||||
!
|
||||
|
@ -567,62 +567,63 @@ SUBROUTINE regterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
|
|||
!
|
||||
RETURN
|
||||
!
|
||||
CONTAINS
|
||||
SUBROUTINE reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
|
||||
USE david_param, ONLY : DP
|
||||
USE david_buffer, ONLY : buffer
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
|
||||
LOGICAL, INTENT(IN) :: conv(nvec)
|
||||
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
|
||||
REAL(DP), DEVICE :: v_d(nvecx,nvecx)
|
||||
!
|
||||
INTEGER :: n, np, info
|
||||
INTEGER, ALLOCATABLE :: conv_idx(:)
|
||||
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
|
||||
REAL(DP), DEVICE, POINTER :: vtmp_d(:,:)
|
||||
!
|
||||
np = 0
|
||||
ALLOCATE(conv_idx(nvec))
|
||||
DO n = 1, nvec
|
||||
conv_idx(n) = -1
|
||||
IF ( .NOT. conv(n) ) THEN
|
||||
np = np + 1
|
||||
conv_idx(n) = np
|
||||
END IF
|
||||
END DO
|
||||
|
||||
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
|
||||
|
||||
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
|
||||
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
|
||||
|
||||
conv_idx_d(1:nvec) = conv_idx(1:nvec)
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
vtmp_d(k,j) = v_d(k,j)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
IF(conv_idx_d(j) /= -1) THEN
|
||||
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
|
||||
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
CALL buffer%release_buffer(conv_idx_d, info)
|
||||
CALL buffer%release_buffer(vtmp_d, info)
|
||||
!
|
||||
DEALLOCATE(conv_idx)
|
||||
END SUBROUTINE
|
||||
|
||||
END SUBROUTINE regterg_gpu
|
||||
|
||||
SUBROUTINE reorder_evals_revecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
|
||||
USE david_param, ONLY : DP
|
||||
USE david_buffer, ONLY : buffer
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
|
||||
LOGICAL, INTENT(IN) :: conv(nvec)
|
||||
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
|
||||
REAL(DP), DEVICE :: v_d(nvecx,nvecx)
|
||||
!
|
||||
INTEGER :: j, k, n, np, info
|
||||
INTEGER, ALLOCATABLE :: conv_idx(:)
|
||||
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
|
||||
REAL(DP), DEVICE, POINTER :: vtmp_d(:,:)
|
||||
!
|
||||
np = 0
|
||||
ALLOCATE(conv_idx(nvec))
|
||||
DO n = 1, nvec
|
||||
conv_idx(n) = -1
|
||||
IF ( .NOT. conv(n) ) THEN
|
||||
np = np + 1
|
||||
conv_idx(n) = np
|
||||
END IF
|
||||
END DO
|
||||
|
||||
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
|
||||
|
||||
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
|
||||
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
|
||||
|
||||
conv_idx_d(1:nvec) = conv_idx(1:nvec)
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
vtmp_d(k,j) = v_d(k,j)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,nvec
|
||||
DO k=1,nvecx
|
||||
IF(conv_idx_d(j) /= -1) THEN
|
||||
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
|
||||
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
CALL buffer%release_buffer(conv_idx_d, info)
|
||||
CALL buffer%release_buffer(vtmp_d, info)
|
||||
!
|
||||
DEALLOCATE(conv_idx)
|
||||
END SUBROUTINE reorder_evals_revecs
|
||||
|
||||
|
||||
!
|
||||
! Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
|
||||
!
|
||||
|
|
|
@ -162,6 +162,8 @@ w0gauss.o \
|
|||
w1gauss.o \
|
||||
deviatoric.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
TLDEPS=libfft
|
||||
|
||||
all : libqemod.a
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
!
|
||||
! Copyright (C) Quantum ESPRESSO group
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
MODULE buffers_module
|
||||
#if defined(__CUDA)
|
||||
USE fbuf, ONLY : buf_t
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(buf_t) :: buffer
|
||||
#endif
|
||||
END MODULE buffers_module
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,4 @@
|
|||
MODULES += \
|
||||
fbuf.o \
|
||||
buffers_module.o \
|
||||
wavefunctions_gpu.o
|
|
@ -0,0 +1,60 @@
|
|||
!
|
||||
! Copyright (C) 2002-2011 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
MODULE wavefunctions_module_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
||||
USE kinds, ONLY : DP
|
||||
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
|
||||
!
|
||||
COMPLEX(DP), DEVICE, ALLOCATABLE, TARGET :: &
|
||||
evc_d(:,:) ! wavefunctions in the PW basis set
|
||||
! noncolinear case: first index
|
||||
! is a combined PW + spin index
|
||||
!
|
||||
COMPLEX(DP) , DEVICE, ALLOCATABLE, TARGET :: &
|
||||
psic_d(:), & ! additional memory for FFT
|
||||
psic_nc_d(:,:) ! as above for the noncolinear case
|
||||
!
|
||||
|
||||
CONTAINS
|
||||
!
|
||||
! ==== SUBS FOR EVC
|
||||
!
|
||||
SUBROUTINE start_sync_evc_dev()
|
||||
use wavefunctions_module, only: evc
|
||||
implicit none
|
||||
if (.not. allocated(evc_d)) then
|
||||
allocate(evc_d, source=evc)
|
||||
return
|
||||
end if
|
||||
evc_d = evc
|
||||
!
|
||||
END SUBROUTINE start_sync_evc_dev
|
||||
!
|
||||
SUBROUTINE start_sync_evc_hst()
|
||||
use wavefunctions_module, only: evc
|
||||
implicit none
|
||||
evc = evc_d
|
||||
END SUBROUTINE start_sync_evc_hst
|
||||
|
||||
SUBROUTINE wait_sync_evc_dev()
|
||||
RETURN
|
||||
END SUBROUTINE wait_sync_evc_dev
|
||||
SUBROUTINE wait_sync_evc_hst()
|
||||
RETURN
|
||||
END SUBROUTINE wait_sync_evc_hst
|
||||
|
||||
|
||||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
END MODULE wavefunctions_module_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
|
@ -163,6 +163,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
USE noncollin_module, ONLY : noncolin, npol
|
||||
USE wavefunctions_module, ONLY : evc
|
||||
USE g_psi_mod, ONLY : h_diag, s_diag
|
||||
USE g_psi_mod_gpum, ONLY : using_h_diag, using_s_diag
|
||||
USE scf, ONLY : v_of_0
|
||||
USE bp, ONLY : lelfield, evcel, evcelp, evcelm, bec_evcel,&
|
||||
gdir, l3dstring, efield, efield_cry
|
||||
|
@ -208,6 +209,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
! In addition to the above ithe initial wfc rotation uses h_psi, and s_psi
|
||||
|
||||
ALLOCATE( h_diag( npwx, npol ), STAT=ierr )
|
||||
call using_h_diag(.true.); call using_s_diag(.true.)
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' diag_bands ', ' cannot allocate h_diag ', ABS(ierr) )
|
||||
!
|
||||
|
@ -240,6 +242,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
CALL deallocate_bec_type ( becp )
|
||||
DEALLOCATE( s_diag )
|
||||
DEALLOCATE( h_diag )
|
||||
call using_h_diag(.true.); call using_s_diag(.true.)
|
||||
!
|
||||
IF ( notconv > MAX( 5, nbnd / 4 ) ) THEN
|
||||
!
|
||||
|
@ -278,6 +281,7 @@ CONTAINS
|
|||
h_diag(ig,1) = 1.D0 + g2kin(ig) + SQRT( 1.D0 + ( g2kin(ig) - 1.D0 )**2 )
|
||||
!
|
||||
END FORALL
|
||||
call using_h_diag(.true.)
|
||||
!
|
||||
ntry = 0
|
||||
!
|
||||
|
@ -318,6 +322,7 @@ CONTAINS
|
|||
h_diag(1:npw, 1) = g2kin(1:npw) + v_of_0
|
||||
!
|
||||
CALL usnldiag( npw, h_diag, s_diag )
|
||||
call using_h_diag(.true.); call using_s_diag(.true.);
|
||||
!
|
||||
ntry = 0
|
||||
!
|
||||
|
@ -438,6 +443,7 @@ CONTAINS
|
|||
h_diag(ig,:) = 1.D0 + g2kin(ig) + SQRT( 1.D0 + ( g2kin(ig) - 1.D0 )**2 )
|
||||
!
|
||||
END FORALL
|
||||
call using_h_diag(.true.);
|
||||
!
|
||||
ntry = 0
|
||||
!
|
||||
|
@ -482,6 +488,7 @@ CONTAINS
|
|||
END DO
|
||||
!
|
||||
CALL usnldiag( npw, h_diag, s_diag )
|
||||
call using_h_diag(.true.); call using_s_diag(.true.);
|
||||
!
|
||||
ntry = 0
|
||||
!
|
||||
|
|
|
@ -16,6 +16,7 @@ subroutine g_psi (lda, n, m, npol, psi, e)
|
|||
!
|
||||
USE kinds
|
||||
USE g_psi_mod
|
||||
USE g_psi_mod_gpum, ONLY : using_h_diag, using_s_diag
|
||||
implicit none
|
||||
integer :: lda, n, m, npol, ipol
|
||||
! input: the leading dimension of psi
|
||||
|
@ -37,6 +38,7 @@ subroutine g_psi (lda, n, m, npol, psi, e)
|
|||
! counter on psi functions
|
||||
! counter on G vectors
|
||||
!
|
||||
call using_h_diag(.false.); call using_s_diag(.false.)
|
||||
call start_clock ('g_psi')
|
||||
!
|
||||
#ifdef TEST_NEW_PRECONDITIONING
|
||||
|
|
|
@ -10,6 +10,77 @@
|
|||
!-----------------------------------------------------------------------
|
||||
#if defined(__CUDA)
|
||||
subroutine g_psi_gpu (lda, n, m, npol, psi_d, e_d)
|
||||
USE kinds
|
||||
|
||||
USE cudafor
|
||||
USE g_psi_mod_gpum, ONLY : h_diag_d, s_diag_d, using_h_diag_d, using_s_diag_d
|
||||
|
||||
implicit none
|
||||
integer :: lda, n, m, npol, ipol
|
||||
! input: the leading dimension of psi
|
||||
! input: the real dimension of psi
|
||||
! input: the number of bands
|
||||
! input: the number of coordinates of psi
|
||||
! local variable: counter of coordinates of psi
|
||||
real(DP) :: e_d (m)
|
||||
! input: the eigenvectors
|
||||
complex(DP) :: psi_d (lda, npol, m)
|
||||
! inp/out: the psi vector
|
||||
|
||||
attributes( device ) :: e_d, psi_d
|
||||
|
||||
!
|
||||
! Local variables
|
||||
!
|
||||
real(DP), parameter :: eps = 1.0d-4
|
||||
! a small number
|
||||
real(DP) :: x, scala, denm
|
||||
integer :: k, i
|
||||
! counter on psi functions
|
||||
! counter on G vectors
|
||||
!
|
||||
call using_h_diag_d(.false.)
|
||||
call using_s_diag_d(.false.)
|
||||
!
|
||||
call start_clock ('g_psi')
|
||||
!
|
||||
#ifdef TEST_NEW_PRECONDITIONING
|
||||
scala = 1.d0
|
||||
|
||||
!$cuf kernel do(3) <<<*,*>>>
|
||||
do ipol=1,npol
|
||||
do k = 1, m
|
||||
do i = 1, n
|
||||
x = (h_diag_d(i,ipol) - e_d(k)*s_diag_d(i,ipol))*scala
|
||||
denm = 0.5_dp*(1.d0+x+sqrt(1.d0+(x-1)*(x-1.d0)))/scala
|
||||
psi_d (i, ipol, k) = psi_d (i, ipol, k) / denm
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
#else
|
||||
!$cuf kernel do(3) <<<*,*>>>
|
||||
do ipol=1,npol
|
||||
do k = 1, m
|
||||
do i = 1, n
|
||||
denm = h_diag_d (i,ipol) - e_d (k) * s_diag_d (i,ipol)
|
||||
!
|
||||
! denm = g2+v(g=0) - e(k)
|
||||
!
|
||||
if (abs (denm) < eps) denm = sign (eps, denm)
|
||||
!
|
||||
! denm = sign( max( abs(denm),eps ), denm )
|
||||
!
|
||||
psi_d (i, ipol, k) = psi_d (i, ipol, k) / denm
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
#endif
|
||||
|
||||
call stop_clock ('g_psi')
|
||||
return
|
||||
end subroutine g_psi_gpu
|
||||
|
||||
subroutine g_psi_gpu_compatibility (lda, n, m, npol, psi_d, e_d)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! This routine computes an estimate of the inverse Hamiltonian
|
||||
|
@ -57,5 +128,5 @@ subroutine g_psi_gpu (lda, n, m, npol, psi_d, e_d)
|
|||
DEALLOCATE(psi, e )
|
||||
|
||||
return
|
||||
end subroutine g_psi_gpu
|
||||
end subroutine g_psi_gpu_compatibility
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,93 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 PWSCF group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
MODULE g_psi_mod_gpum
|
||||
!
|
||||
! ... These are the variables needed in g_psi
|
||||
!
|
||||
USE kinds, only : DP
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), DEVICE, ALLOCATABLE :: &
|
||||
h_diag_d (:,:),& ! diagonal part of the Hamiltonian
|
||||
s_diag_d (:,:) ! diagonal part of the overlap matrix
|
||||
|
||||
LOGICAL :: h_diag_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: s_diag_ood = .false.
|
||||
LOGICAL :: h_diag_d_ood = .false.
|
||||
LOGICAL :: s_diag_d_ood = .false.
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE using_h_diag(writing)
|
||||
USE g_psi_mod, ONLY : h_diag
|
||||
implicit none
|
||||
LOGICAL, INTENT(IN) :: writing
|
||||
!
|
||||
IF (h_diag_ood) THEN
|
||||
h_diag = h_diag_d
|
||||
h_diag_ood = .false.
|
||||
ENDIF
|
||||
IF (writing) h_diag_d_ood = .true.
|
||||
END SUBROUTINE using_h_diag
|
||||
!
|
||||
SUBROUTINE using_h_diag_d(writing)
|
||||
USE g_psi_mod, ONLY : h_diag
|
||||
implicit none
|
||||
LOGICAL, INTENT(IN) :: writing
|
||||
!
|
||||
IF (.not. allocated(h_diag)) THEN
|
||||
IF (allocated(h_diag_d)) DEALLOCATE(h_diag_d)
|
||||
h_diag_d_ood = .false.
|
||||
RETURN
|
||||
END IF
|
||||
IF (h_diag_d_ood) THEN
|
||||
IF (.not. allocated(h_diag_d)) THEN
|
||||
ALLOCATE(h_diag_d, SOURCE=h_diag)
|
||||
ELSE
|
||||
h_diag_d = h_diag
|
||||
ENDIF
|
||||
h_diag_d_ood = .false.
|
||||
ENDIF
|
||||
IF (writing) h_diag_ood = .true.
|
||||
END SUBROUTINE using_h_diag_d
|
||||
!
|
||||
SUBROUTINE using_s_diag(writing)
|
||||
USE g_psi_mod, ONLY : s_diag
|
||||
implicit none
|
||||
LOGICAL, INTENT(IN) :: writing
|
||||
!
|
||||
IF (s_diag_ood) THEN
|
||||
s_diag = s_diag_d
|
||||
s_diag_ood = .false.
|
||||
ENDIF
|
||||
IF (writing) s_diag_d_ood = .true.
|
||||
END SUBROUTINE using_s_diag
|
||||
!
|
||||
SUBROUTINE using_s_diag_d(writing)
|
||||
USE g_psi_mod, ONLY : s_diag
|
||||
implicit none
|
||||
LOGICAL, INTENT(IN) :: writing
|
||||
!
|
||||
IF (.not. allocated(s_diag)) THEN
|
||||
IF (allocated(s_diag_d)) DEALLOCATE(s_diag_d)
|
||||
s_diag_d_ood = .false.
|
||||
RETURN
|
||||
END IF
|
||||
IF (s_diag_d_ood) THEN
|
||||
IF (.not. allocated(s_diag_d)) THEN
|
||||
ALLOCATE(s_diag_d, SOURCE=s_diag)
|
||||
ELSE
|
||||
s_diag_d = s_diag
|
||||
END IF
|
||||
s_diag_d_ood = .false.
|
||||
ENDIF
|
||||
IF (writing) s_diag_ood = .true.
|
||||
END SUBROUTINE using_s_diag_d
|
||||
!
|
||||
END MODULE g_psi_mod_gpum
|
|
@ -7,7 +7,324 @@
|
|||
!
|
||||
!----------------------------------------------------------------------------
|
||||
#if defined(__CUDA)
|
||||
|
||||
|
||||
|
||||
! Copyright (C) 2002-2016 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE h_psi_gpu( lda, n, m, psi_d, hpsi_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... This routine computes the product of the Hamiltonian
|
||||
! ... matrix with m wavefunctions contained in psi
|
||||
!
|
||||
! ... input:
|
||||
! ... lda leading dimension of arrays psi, spsi, hpsi
|
||||
! ... n true dimension of psi, spsi, hpsi
|
||||
! ... m number of states psi
|
||||
! ... psi
|
||||
!
|
||||
! ... output:
|
||||
! ... hpsi H*psi
|
||||
!
|
||||
! --- Wrapper routine: performs bgrp parallelization on non-distributed bands
|
||||
! --- if suitable and required, calls old H\psi routine h_psi_
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE noncollin_module, ONLY : npol
|
||||
USE funct, ONLY : exx_is_active
|
||||
USE mp_bands, ONLY : use_bgrp_in_hpsi, inter_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda, n, m
|
||||
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
|
||||
COMPLEX(DP), DEVICE, INTENT(OUT) :: hpsi_d(lda*npol,m)
|
||||
!
|
||||
INTEGER :: m_start, m_end
|
||||
!
|
||||
CALL start_clock( 'h_psi_bgrp' ); !write (*,*) 'start h_psi_bgrp'; FLUSH(6)
|
||||
|
||||
! band parallelization with non-distributed bands is performed if
|
||||
! 1. enabled (variable use_bgrp_in_hpsi must be set to .T.)
|
||||
! 2. exact exchange is not active (if it is, band parallelization is already
|
||||
! used in exx routines called by Hpsi)
|
||||
! 3. there is more than one band, otherwise there is nothing to parallelize
|
||||
!
|
||||
IF (use_bgrp_in_hpsi .AND. .NOT. exx_is_active() .AND. m > 1) THEN
|
||||
! use band parallelization here
|
||||
hpsi_d(:,:) = (0.d0,0.d0)
|
||||
CALL divide(inter_bgrp_comm,m,m_start,m_end)
|
||||
! Check if there at least one band in this band group
|
||||
IF (m_end >= m_start) &
|
||||
CALL h_psi__gpu( lda, n, m_end-m_start+1, psi_d(1,m_start), hpsi_d(1,m_start) )
|
||||
CALL mp_sum(hpsi_d,inter_bgrp_comm)
|
||||
ELSE
|
||||
! don't use band parallelization here
|
||||
CALL h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
||||
END IF
|
||||
|
||||
CALL stop_clock( 'h_psi_bgrp' )
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE h_psi_gpu
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... This routine computes the product of the Hamiltonian
|
||||
! ... matrix with m wavefunctions contained in psi
|
||||
!
|
||||
! ... input:
|
||||
! ... lda leading dimension of arrays psi, spsi, hpsi
|
||||
! ... n true dimension of psi, spsi, hpsi
|
||||
! ... m number of states psi
|
||||
! ... psi
|
||||
!
|
||||
! ... output:
|
||||
! ... hpsi H*psi
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE bp, ONLY : lelfield,l3dstring,gdir, efield, efield_cry
|
||||
USE becmod, ONLY : bec_type, becp, calbec
|
||||
USE lsda_mod, ONLY : current_spin
|
||||
USE scf, ONLY : vrs
|
||||
USE wvfct, ONLY : g2kin
|
||||
USE uspp, ONLY : vkb, nkb
|
||||
USE ldaU, ONLY : lda_plus_u, U_projection
|
||||
USE gvect, ONLY : gstart
|
||||
USE funct, ONLY : dft_is_meta
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE realus, ONLY : real_space, &
|
||||
invfft_orbital_gamma, fwfft_orbital_gamma, calbec_rs_gamma, add_vuspsir_gamma, &
|
||||
invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, add_vuspsir_k, &
|
||||
v_loc_psir_inplace
|
||||
USE fft_base, ONLY : dffts
|
||||
USE exx, ONLY : use_ace, vexx, vexxace_gamma, vexxace_k
|
||||
USE funct, ONLY : exx_is_active
|
||||
USE fft_helper_subroutines
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda, n, m
|
||||
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
|
||||
COMPLEX(DP), DEVICE, INTENT(OUT) :: hpsi_d(lda*npol,m)
|
||||
!
|
||||
REAL(DP), DEVICE, ALLOCATABLE :: g2kin_d(:), vrs_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_host(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: hpsi_host(:,:)
|
||||
!
|
||||
INTEGER :: ipol, ibnd, incr, i
|
||||
REAL(dp) :: ee
|
||||
!
|
||||
LOGICAL :: need_host_copy
|
||||
!
|
||||
CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
|
||||
ALLOCATE(g2kin_d, SOURCE=g2kin)
|
||||
ALLOCATE(vrs_d, SOURCE=vrs)
|
||||
hpsi_d (:, 1:m) = (0.0_dp, 0.0_dp)
|
||||
|
||||
need_host_copy = ( real_space .and. nkb > 0 ) .OR. &
|
||||
dft_is_meta() .OR. &
|
||||
(lda_plus_u .AND. U_projection.NE."pseudo" ) .OR. &
|
||||
( nkb > 0 .AND. .NOT. real_space) .OR. &
|
||||
exx_is_active() .OR. lelfield
|
||||
|
||||
if (need_host_copy) then
|
||||
ALLOCATE(psi_host(lda*npol,m) , hpsi_host(lda*npol,m) )
|
||||
psi_host = psi_d
|
||||
hpsi_host = hpsi_d ! this is not needed
|
||||
end if
|
||||
|
||||
CALL start_clock( 'h_psi:pot' ); !write (*,*) 'start h_pot';FLUSH(6)
|
||||
!
|
||||
! ... Here the product with the local potential V_loc psi
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
IF ( real_space .and. nkb > 0 ) then
|
||||
!
|
||||
! ... real-space algorithm
|
||||
! ... fixme: real_space without beta functions does not make sense
|
||||
!
|
||||
IF ( dffts%has_task_groups ) then
|
||||
incr = 2 * fftx_ntgrp(dffts)
|
||||
ELSE
|
||||
incr = 2
|
||||
ENDIF
|
||||
DO ibnd = 1, m, incr
|
||||
! ... transform psi to real space -> psic
|
||||
CALL invfft_orbital_gamma(psi_host,ibnd,m)
|
||||
! ... compute becp%r = < beta|psi> from psic in real space
|
||||
CALL start_clock( 'h_psi:calbec' )
|
||||
CALL calbec_rs_gamma(ibnd,m,becp%r)
|
||||
CALL stop_clock( 'h_psi:calbec' )
|
||||
! ... psic -> vrs * psic (psic overwritten will become hpsi)
|
||||
CALL v_loc_psir_inplace(ibnd,m)
|
||||
! ... psic (hpsi) -> psic + vusp
|
||||
CALL add_vuspsir_gamma(ibnd,m)
|
||||
! ... transform psic back in reciprocal space and assign it to hpsi
|
||||
CALL fwfft_orbital_gamma(hpsi_host,ibnd,m)
|
||||
END DO
|
||||
hpsi_d = hpsi_host
|
||||
!
|
||||
ELSE
|
||||
! ... usual reciprocal-space algorithm
|
||||
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
ELSE IF ( noncolin ) THEN
|
||||
!
|
||||
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF ( real_space .and. nkb > 0 ) then
|
||||
!
|
||||
! ... real-space algorithm
|
||||
! ... fixme: real_space without beta functions does not make sense
|
||||
!
|
||||
IF ( dffts%has_task_groups ) then
|
||||
incr = fftx_ntgrp(dffts)
|
||||
ELSE
|
||||
incr = 1
|
||||
ENDIF
|
||||
DO ibnd = 1, m
|
||||
! ... transform psi to real space -> psic
|
||||
CALL invfft_orbital_k(psi_host,ibnd,m)
|
||||
! ... compute becp%r = < beta|psi> from psic in real space
|
||||
CALL start_clock( 'h_psi:calbec' )
|
||||
CALL calbec_rs_k(ibnd,m)
|
||||
CALL stop_clock( 'h_psi:calbec' )
|
||||
! ... psic -> vrs * psic (psic overwritten will become hpsi)
|
||||
CALL v_loc_psir_inplace(ibnd,m)
|
||||
! ... psic (hpsi) -> psic + vusp
|
||||
CALL add_vuspsir_k(ibnd,m)
|
||||
! ... transform psic back in reciprocal space and assign it to hpsi
|
||||
CALL fwfft_orbital_k(hpsi_host,ibnd,m)
|
||||
END DO
|
||||
IF (need_host_copy) hpsi_d = hpsi_host
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... Here the product with the non local potential V_NL psi
|
||||
! ... (not in the real-space case: it is done together with V_loc)
|
||||
!
|
||||
IF ( nkb > 0 .AND. .NOT. real_space) THEN
|
||||
!
|
||||
CALL start_clock( 'h_psi:calbec' )
|
||||
CALL calbec ( n, vkb, psi_host, becp, m )
|
||||
CALL stop_clock( 'h_psi:calbec' )
|
||||
hpsi_host = hpsi_d
|
||||
CALL add_vuspsi( lda, n, m, hpsi_host )
|
||||
hpsi_d = hpsi_host
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL stop_clock( 'h_psi:pot' )
|
||||
!
|
||||
! ... Here we add the kinetic energy (k+G)^2 psi
|
||||
!
|
||||
!$cuf kernel do(2)
|
||||
DO ibnd = 1, m
|
||||
DO i=1, n
|
||||
hpsi_d (i, ibnd) = hpsi_d(i, ibnd) + g2kin_d (i) * psi_d (i, ibnd)
|
||||
IF ( noncolin ) THEN
|
||||
hpsi_d (lda+i, ibnd) = hpsi_d(lda+i,ibnd) + g2kin_d (i) * psi_d (lda+i, ibnd)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
if (dft_is_meta()) then
|
||||
hpsi_host = hpsi_d
|
||||
call h_psi_meta (lda, n, m, psi_host, hpsi_host)
|
||||
hpsi_d = hpsi_host
|
||||
end if
|
||||
!
|
||||
! ... Here we add the Hubbard potential times psi
|
||||
!
|
||||
IF ( lda_plus_u .AND. U_projection.NE."pseudo" ) THEN
|
||||
!
|
||||
hpsi_host = hpsi_d
|
||||
IF (noncolin) THEN
|
||||
CALL vhpsi_nc( lda, n, m, psi_host, hpsi_host )
|
||||
ELSE
|
||||
call vhpsi( lda, n, m, psi_host, hpsi_host )
|
||||
ENDIF
|
||||
hpsi_d = hpsi_host
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! ... Here the exact-exchange term Vxx psi
|
||||
!
|
||||
IF ( exx_is_active() ) THEN
|
||||
hpsi_host = hpsi_d
|
||||
IF ( use_ace) THEN
|
||||
IF (gamma_only) THEN
|
||||
CALL vexxace_gamma(lda,m,psi_host,ee,hpsi_host)
|
||||
ELSE
|
||||
CALL vexxace_k(lda,m,psi_host,ee,hpsi_host)
|
||||
END IF
|
||||
ELSE
|
||||
CALL vexx( lda, n, m, psi_host, hpsi_host, becp )
|
||||
END IF
|
||||
hpsi_d = hpsi_host
|
||||
END IF
|
||||
!
|
||||
! ... electric enthalpy if required
|
||||
!
|
||||
IF ( lelfield ) THEN
|
||||
!
|
||||
hpsi_host = hpsi_d
|
||||
IF ( .NOT.l3dstring ) THEN
|
||||
CALL h_epsi_her_apply( lda, n, m, psi_host, hpsi_host,gdir, efield )
|
||||
ELSE
|
||||
DO ipol=1,3
|
||||
CALL h_epsi_her_apply( lda, n, m, psi_host, hpsi_host,ipol,efield_cry(ipol) )
|
||||
END DO
|
||||
END IF
|
||||
hpsi_d = hpsi_host
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... With Gamma-only trick, Im(H*psi)(G=0) = 0 by definition,
|
||||
! ... but it is convenient to explicitly set it to 0 to prevent trouble
|
||||
!
|
||||
IF ( gamma_only .AND. gstart == 2 ) then
|
||||
!$cuf kernel do(1)
|
||||
do i=1,m
|
||||
hpsi_d(1,i) = CMPLX( DBLE( hpsi_d(1,i) ), 0.D0 ,kind=DP)
|
||||
end do
|
||||
end if
|
||||
!
|
||||
if (need_host_copy) then
|
||||
DEALLOCATE(psi_host , hpsi_host )
|
||||
end if
|
||||
DEALLOCATE(g2kin_d, vrs_d)
|
||||
CALL stop_clock( 'h_psi' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE h_psi__gpu
|
||||
|
||||
|
||||
|
||||
SUBROUTINE h_psi_gpu_compatibility( lda, n, m, psi_d, hpsi_d )
|
||||
USE kinds, ONLY : DP
|
||||
USE noncollin_module, ONLY : npol
|
||||
USE cudafor
|
||||
|
@ -31,5 +348,5 @@ SUBROUTINE h_psi_gpu( lda, n, m, psi_d, hpsi_d )
|
|||
|
||||
DEALLOCATE(psi, hpsi )
|
||||
|
||||
END SUBROUTINE h_psi_gpu
|
||||
END SUBROUTINE h_psi_gpu_compatibility
|
||||
#endif
|
||||
|
|
|
@ -33,8 +33,10 @@ SUBROUTINE init_run()
|
|||
USE esm, ONLY : do_comp_esm, esm_init
|
||||
USE tsvdw_module, ONLY : tsvdw_initialize
|
||||
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_fact
|
||||
USE buffers_module, ONLY : buffer
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER :: ierr
|
||||
!
|
||||
CALL start_clock( 'init_run' )
|
||||
!
|
||||
|
@ -51,6 +53,10 @@ SUBROUTINE init_run()
|
|||
CALL summary()
|
||||
CALL memory_report()
|
||||
!
|
||||
! ... allocate GPU buffers
|
||||
!
|
||||
CALL buffer%init(10, ierr)
|
||||
!
|
||||
! ... allocate memory for G- and R-space fft arrays
|
||||
!
|
||||
CALL allocate_fft()
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
PWLIBS += \
|
||||
g_psi_mod_gpu.o \
|
||||
g_psi_gpu.o \
|
||||
h_psi_gpu.o \
|
||||
s_psi_gpu.o
|
||||
s_psi_gpu.o \
|
||||
vloc_psi_gpu.o
|
||||
|
||||
c_bands.o : g_psi_mod_gpu.o
|
||||
|
|
|
@ -212,6 +212,9 @@ MODULE wvfct
|
|||
et(:,:), &! eigenvalues of the hamiltonian
|
||||
wg(:,:), &! the weight of each k point and band
|
||||
g2kin(:) ! kinetic energy
|
||||
#if defined(__CUDA)
|
||||
attributes(pinned) :: g2kin
|
||||
#endif
|
||||
INTEGER, ALLOCATABLE :: &
|
||||
btype(:,:) ! one if the corresponding state has to be
|
||||
! converged to full accuracy, zero otherwise
|
||||
|
|
|
@ -8,7 +8,460 @@
|
|||
!
|
||||
!----------------------------------------------------------------------------
|
||||
#if defined(__CUDA)
|
||||
|
||||
!@njs: s_psi, psi, spsi, s_psi_, s_psi_k, s_psi_gamma, s_psi_nc
|
||||
SUBROUTINE s_psi_gpu( lda, n, m, psi_d, spsi_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... This routine applies the S matrix to m wavefunctions psi
|
||||
! ... and puts the results in spsi.
|
||||
! ... Requires the products of psi with all beta functions
|
||||
! ... in array becp(nkb,m) (calculated in h_psi or by calbec)
|
||||
!
|
||||
! ... input:
|
||||
!
|
||||
! ... lda leading dimension of arrays psi, spsi
|
||||
! ... n true dimension of psi, spsi
|
||||
! ... m number of states psi
|
||||
! ... psi
|
||||
!
|
||||
! ... output:
|
||||
!
|
||||
! ... spsi S*psi
|
||||
!
|
||||
! --- Wrapper routine: performs bgrp parallelization on non-distributed bands
|
||||
! --- if suitable and required, calls old S\psi routine s_psi_
|
||||
! --- See comments in h_psi.f90 about band parallelization
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE noncollin_module, ONLY : npol
|
||||
USE funct, ONLY : exx_is_active
|
||||
USE mp_bands, ONLY : use_bgrp_in_hpsi, inter_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda, n, m
|
||||
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
|
||||
COMPLEX(DP), DEVICE, INTENT(OUT)::spsi_d(lda*npol,m)
|
||||
!
|
||||
INTEGER :: m_start, m_end, i
|
||||
!
|
||||
CALL start_clock( 's_psi_bgrp' )
|
||||
|
||||
IF (use_bgrp_in_hpsi .AND. .NOT. exx_is_active() .AND. m > 1) THEN
|
||||
! use band parallelization here
|
||||
spsi_d(:,:) = (0.d0,0.d0)
|
||||
CALL divide(inter_bgrp_comm,m,m_start,m_end)
|
||||
!write(6,*) m, m_start,m_end
|
||||
! Check if there at least one band in this band group
|
||||
IF (m_end >= m_start) &
|
||||
CALL s_psi__gpu( lda, n, m_end-m_start+1, psi_d(1,m_start), spsi_d(1,m_start) )
|
||||
CALL mp_sum(spsi_d,inter_bgrp_comm)
|
||||
ELSE
|
||||
! don't use band parallelization here
|
||||
CALL s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
||||
END IF
|
||||
|
||||
CALL stop_clock( 's_psi_bgrp' )
|
||||
RETURN
|
||||
|
||||
END SUBROUTINE s_psi_gpu
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... This routine applies the S matrix to m wavefunctions psi
|
||||
! ... and puts the results in spsi.
|
||||
! ... Requires the products of psi with all beta functions
|
||||
! ... in array becp(nkb,m) (calculated in h_psi or by calbec)
|
||||
!
|
||||
! ... input:
|
||||
!
|
||||
! ... lda leading dimension of arrays psi, spsi
|
||||
! ... n true dimension of psi, spsi
|
||||
! ... m number of states psi
|
||||
! ... psi
|
||||
!
|
||||
! ... output:
|
||||
!
|
||||
! ... spsi S*psi
|
||||
!
|
||||
USE cudafor
|
||||
USE cublas
|
||||
USE buffers_module, ONLY : buffer
|
||||
USE kinds, ONLY : DP
|
||||
USE becmod, ONLY : becp
|
||||
USE uspp, ONLY : vkb, nkb, okvan, qq_at, qq_so, indv_ijkb0
|
||||
USE spin_orb, ONLY : lspinorb
|
||||
USE uspp_param, ONLY : upf, nh, nhm
|
||||
USE ions_base, ONLY : nat, nsp, ityp
|
||||
USE control_flags, ONLY: gamma_only
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE realus, ONLY : real_space, &
|
||||
invfft_orbital_gamma, fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma, &
|
||||
invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, s_psir_k
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda, n, m
|
||||
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
|
||||
COMPLEX(DP), DEVICE, INTENT(OUT)::spsi_d(lda*npol,m)
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_host(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE ::spsi_host(:,:)
|
||||
!
|
||||
INTEGER :: ibnd
|
||||
!
|
||||
LOGICAL :: need_host_copy
|
||||
!
|
||||
! ... initialize spsi
|
||||
!
|
||||
spsi_d = psi_d
|
||||
!
|
||||
IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
|
||||
!
|
||||
need_host_copy = real_space
|
||||
IF (need_host_copy) THEN
|
||||
ALLOCATE(psi_host(lda*npol,m), spsi_host(lda*npol,m))
|
||||
psi_host = psi_d
|
||||
spsi_host = spsi_d
|
||||
END IF
|
||||
!
|
||||
CALL start_clock( 's_psi' )
|
||||
!
|
||||
! ... The product with the beta functions
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
IF (real_space ) THEN
|
||||
!
|
||||
DO ibnd = 1, m, 2
|
||||
! transform the orbital to real space
|
||||
CALL invfft_orbital_gamma(psi_host,ibnd,m)
|
||||
CALL s_psir_gamma(ibnd,m)
|
||||
CALL fwfft_orbital_gamma(spsi_host,ibnd,m)
|
||||
END DO
|
||||
spsi_d = spsi_host
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL s_psi_gamma_gpu()
|
||||
!
|
||||
END IF
|
||||
!
|
||||
ELSE IF ( noncolin ) THEN
|
||||
!
|
||||
CALL s_psi_nc_gpu()
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF (real_space ) THEN
|
||||
!
|
||||
DO ibnd = 1, m
|
||||
! transform the orbital to real space
|
||||
CALL invfft_orbital_k(psi_host,ibnd,m)
|
||||
CALL s_psir_k(ibnd,m)
|
||||
CALL fwfft_orbital_k(spsi_host,ibnd,m)
|
||||
spsi_d = spsi_host
|
||||
END DO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL s_psi_k_gpu()
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL stop_clock( 's_psi' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE s_psi_gamma_gpu()
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... gamma version
|
||||
!
|
||||
USE mp, ONLY: mp_get_comm_null, mp_circular_shift_left
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... here the local variables
|
||||
!
|
||||
INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ierr
|
||||
! counters
|
||||
INTEGER :: nproc, mype, m_loc, m_begin, ibnd_loc, icyc, icur_blk, m_max
|
||||
! data distribution indexes
|
||||
INTEGER, EXTERNAL :: ldim_block, gind_block
|
||||
! data distribution functions
|
||||
REAL(DP), ALLOCATABLE :: ps(:,:)
|
||||
REAL(DP), POINTER, CONTIGUOUS, DEVICE :: ps_d(:,:)
|
||||
COMPLEX(DP), POINTER, CONTIGUOUS, DEVICE :: vkb_d(:,:)
|
||||
! the product vkb and psi
|
||||
!
|
||||
IF( becp%comm == mp_get_comm_null() ) THEN
|
||||
nproc = 1
|
||||
mype = 0
|
||||
m_loc = m
|
||||
m_begin = 1
|
||||
m_max = m
|
||||
ELSE
|
||||
!
|
||||
! becp(l,i) = <beta_l|psi_i>, with vkb(n,l)=|beta_l>
|
||||
! in this case becp(l,i) are distributed (index i is)
|
||||
!
|
||||
nproc = becp%nproc
|
||||
mype = becp%mype
|
||||
m_loc = becp%nbnd_loc
|
||||
m_begin = becp%ibnd_begin
|
||||
m_max = SIZE( becp%r, 2 )
|
||||
IF( ( m_begin + m_loc - 1 ) > m ) m_loc = m - m_begin + 1
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( ps( nkb, m_max ), STAT=ierr )
|
||||
CALL buffer%lock_buffer(ps_d, (/ nkb, m_max /), ierr)
|
||||
CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
|
||||
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' s_psi_gamma ', ' cannot allocate memory (ps) ', ABS(ierr) )
|
||||
!
|
||||
ps(:,:) = 0.D0
|
||||
!
|
||||
! In becp=<vkb_i|psi_j> terms corresponding to atom na of type nt
|
||||
! run from index i=indv_ijkb0(na)+1 to i=indv_ijkb0(na)+nh(nt)
|
||||
!
|
||||
DO nt = 1, nsp
|
||||
IF ( upf(nt)%tvanp ) THEN
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
!
|
||||
! Next operation computes ps(l',i)=\sum_m qq(l,m) becp(m',i)
|
||||
! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
|
||||
!
|
||||
IF ( m_loc > 0 ) THEN
|
||||
CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
|
||||
qq_at(1,1,na), nhm, becp%r(indv_ijkb0(na)+1,1),&
|
||||
nkb, 0.0_dp, ps(indv_ijkb0(na)+1,1), nkb )
|
||||
END IF
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
ps_d = ps
|
||||
vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
|
||||
!
|
||||
IF( becp%comm == mp_get_comm_null() ) THEN
|
||||
IF ( m == 1 ) THEN
|
||||
CALL myDGEMV( 'N', 2 * n, nkb, 1.D0, vkb_d, &
|
||||
2 * lda, ps_d, 1, 1.D0, spsi_d, 1 )
|
||||
ELSE
|
||||
CALL cublasDGEMM( 'N', 'N', 2 * n, m, nkb, 1.D0, vkb_d, &
|
||||
2 * lda, ps_d, nkb, 1.D0, spsi_d, 2 * lda )
|
||||
END IF
|
||||
ELSE
|
||||
!
|
||||
! parallel block multiplication of vkb and ps
|
||||
!
|
||||
icur_blk = mype
|
||||
!
|
||||
DO icyc = 0, nproc - 1
|
||||
|
||||
m_loc = ldim_block( becp%nbnd , nproc, icur_blk )
|
||||
m_begin = gind_block( 1, becp%nbnd, nproc, icur_blk )
|
||||
|
||||
IF( ( m_begin + m_loc - 1 ) > m ) m_loc = m - m_begin + 1
|
||||
|
||||
IF( m_loc > 0 ) THEN
|
||||
CALL cublasDGEMM( 'N', 'N', 2 * n, m_loc, nkb, 1.D0, vkb_d, &
|
||||
2 * lda, ps_d, nkb, 1.D0, spsi_d( 1, m_begin ), 2 * lda )
|
||||
END IF
|
||||
|
||||
! block rotation
|
||||
!
|
||||
CALL mp_circular_shift_left( ps_d, icyc, becp%comm )
|
||||
|
||||
icur_blk = icur_blk + 1
|
||||
IF( icur_blk == nproc ) icur_blk = 0
|
||||
|
||||
END DO
|
||||
!
|
||||
END IF
|
||||
!
|
||||
DEALLOCATE( ps )
|
||||
CALL buffer%release_buffer(ps_d, ierr)
|
||||
CALL buffer%release_buffer(vkb_d, ierr)
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE s_psi_gamma_gpu
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE s_psi_k_gpu()
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... k-points version
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ierr
|
||||
! counters
|
||||
COMPLEX(DP), ALLOCATABLE :: ps(:,:), qqc(:,:)
|
||||
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: ps_d(:,:)
|
||||
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: vkb_d(:,:)
|
||||
! ps = product vkb and psi ; qqc = complex version of qq
|
||||
!
|
||||
ALLOCATE( ps( nkb, m ), STAT=ierr )
|
||||
!
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' s_psi_k ', ' cannot allocate memory (ps) ', ABS(ierr) )
|
||||
CALL buffer%lock_buffer(ps_d, (/ nkb, m /), ierr)
|
||||
CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
|
||||
!ALLOCATE(vkb_d, SOURCE=vkb)
|
||||
|
||||
!
|
||||
ps(:,:) = ( 0.D0, 0.D0 )
|
||||
!
|
||||
DO nt = 1, nsp
|
||||
IF ( upf(nt)%tvanp ) THEN
|
||||
! qq is real: copy it into a complex variable to perform
|
||||
! a zgemm - simple but sub-optimal solution
|
||||
ALLOCATE( qqc(nh(nt),nh(nt)) )
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
qqc(:,:) = CMPLX ( qq_at(1:nh(nt),1:nh(nt),na), 0.0_dp, KIND=dp )
|
||||
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
|
||||
qqc, nh(nt), becp%k(indv_ijkb0(na)+1,1), nkb, &
|
||||
(0.0_dp,0.0_dp), ps(indv_ijkb0(na)+1,1), nkb )
|
||||
!
|
||||
END IF
|
||||
END DO
|
||||
DEALLOCATE (qqc)
|
||||
END IF
|
||||
END DO
|
||||
ps_d = ps
|
||||
vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
|
||||
!
|
||||
IF ( m == 1 ) THEN
|
||||
!
|
||||
CALL ZGEMV( 'N', n, nkb, ( 1.D0, 0.D0 ), vkb_d, &
|
||||
lda, ps_d, 1, ( 1.D0, 0.D0 ), spsi_d, 1 )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', n, m, nkb, ( 1.D0, 0.D0 ), vkb_d, &
|
||||
lda, ps_d, nkb, ( 1.D0, 0.D0 ), spsi_d, lda )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
DEALLOCATE( ps )
|
||||
CALL buffer%release_buffer(ps_d, ierr)
|
||||
CALL buffer%release_buffer(vkb_d, ierr)
|
||||
!DEALLOCATE(vkb_d)
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE s_psi_k_gpu
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE s_psi_nc_gpu ( )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... k-points noncolinear/spinorbit version
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! here the local variables
|
||||
!
|
||||
INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ipol, ierr
|
||||
! counters
|
||||
COMPLEX (DP), ALLOCATABLE :: ps (:,:,:)
|
||||
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: ps_d(:,:,:)
|
||||
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: vkb_d(:,:)
|
||||
! the product vkb and psi
|
||||
!
|
||||
ALLOCATE (ps(nkb,npol,m),STAT=ierr)
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' s_psi_nc ', ' cannot allocate memory (ps) ', ABS(ierr) )
|
||||
|
||||
|
||||
ps(:,:,:) = (0.D0,0.D0)
|
||||
!
|
||||
DO nt = 1, nsp
|
||||
!
|
||||
IF ( upf(nt)%tvanp ) THEN
|
||||
!
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
DO ih = 1,nh(nt)
|
||||
ikb = indv_ijkb0(na) + ih
|
||||
DO jh = 1, nh (nt)
|
||||
jkb = indv_ijkb0(na) + jh
|
||||
IF ( .NOT. lspinorb ) THEN
|
||||
DO ipol=1,npol
|
||||
DO ibnd = 1, m
|
||||
ps(ikb,ipol,ibnd) = ps(ikb,ipol,ibnd) + &
|
||||
qq_at(ih,jh,na)*becp%nc(jkb,ipol,ibnd)
|
||||
END DO
|
||||
END DO
|
||||
ELSE
|
||||
DO ibnd = 1, m
|
||||
ps(ikb,1,ibnd)=ps(ikb,1,ibnd) + &
|
||||
qq_so(ih,jh,1,nt)*becp%nc(jkb,1,ibnd)+ &
|
||||
qq_so(ih,jh,2,nt)*becp%nc(jkb,2,ibnd)
|
||||
ps(ikb,2,ibnd)=ps(ikb,2,ibnd) + &
|
||||
qq_so(ih,jh,3,nt)*becp%nc(jkb,1,ibnd)+ &
|
||||
qq_so(ih,jh,4,nt)*becp%nc(jkb,2,ibnd)
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
CALL buffer%lock_buffer(ps_d, (/ nkb,npol,m /), ierr)
|
||||
CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
|
||||
|
||||
ps_d = ps
|
||||
vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
|
||||
|
||||
|
||||
call ZGEMM ('N', 'N', n, m*npol, nkb, (1.d0, 0.d0) , vkb_d, &
|
||||
lda, ps_d, nkb, (1.d0, 0.d0) , spsi_d(1,1), lda)
|
||||
|
||||
DEALLOCATE(ps)
|
||||
CALL buffer%release_buffer(ps_d, ierr)
|
||||
CALL buffer%release_buffer(vkb_d, ierr)
|
||||
|
||||
RETURN
|
||||
|
||||
END SUBROUTINE s_psi_nc_gpu
|
||||
|
||||
END SUBROUTINE s_psi__gpu
|
||||
!@nje
|
||||
|
||||
SUBROUTINE myDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
use cudafor
|
||||
use cublas
|
||||
implicit none
|
||||
DOUBLE PRECISION ALPHA,BETA
|
||||
INTEGER INCX,INCY,LDA,M,N
|
||||
CHARACTER TRANS
|
||||
DOUBLE PRECISION, DEVICE :: A(LDA,*),X(*),Y(*)
|
||||
!
|
||||
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
!
|
||||
END SUBROUTINE myDGEMV
|
||||
|
||||
SUBROUTINE s_psi_gpu_compatibility( lda, n, m, psi_d, spsi_d )
|
||||
USE kinds, ONLY : DP
|
||||
USE noncollin_module, ONLY : npol
|
||||
USE cudafor
|
||||
|
@ -32,5 +485,5 @@ SUBROUTINE s_psi_gpu( lda, n, m, psi_d, spsi_d )
|
|||
|
||||
DEALLOCATE(psi, spsi )
|
||||
|
||||
END SUBROUTINE s_psi_gpu
|
||||
END SUBROUTINE s_psi_gpu_compatibility
|
||||
#endif
|
||||
|
|
|
@ -72,6 +72,9 @@ MODULE scf
|
|||
vrs(:,:), &! the total pot. in real space (smooth grid)
|
||||
rho_core(:), &! the core charge in real space
|
||||
kedtau(:,:) ! position dependent kinetic energy enhancement factor
|
||||
#if defined(__CUDA)
|
||||
attributes(pinned) :: vrs
|
||||
#endif
|
||||
COMPLEX(DP), ALLOCATABLE :: &
|
||||
rhog_core(:) ! the core charge in reciprocal space
|
||||
|
||||
|
|
|
@ -0,0 +1,623 @@
|
|||
!
|
||||
! Copyright (C) 2003-2013 PWSCF group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
#if defined(__CUDA)
|
||||
!@njs: vloc_psi_gamma, psi, v, hpsi, psic
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_gamma_gpu(lda, n, m, psi_d, v_d, hpsi_d)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! Calculation of Vloc*psi using dual-space technique - Gamma point
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE fft_base, ONLY : dffts
|
||||
USE fft_interfaces,ONLY : fwfft, invfft
|
||||
USE wavefunctions_module, ONLY: psic_h => psic
|
||||
USE wavefunctions_module_gpum, ONLY: psic_d
|
||||
USE fft_helper_subroutines
|
||||
USE buffers_module, ONLY : buffer
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(in) :: lda, n, m
|
||||
COMPLEX(DP), DEVICE, INTENT(in) :: psi_d (lda, m)
|
||||
COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda, m)
|
||||
REAL(DP), DEVICE, INTENT(in) :: v_d(dffts%nnr)
|
||||
!
|
||||
INTEGER :: ibnd, j, incr, right_nnr, right_nr3, right_inc
|
||||
COMPLEX(DP) :: fp, fm
|
||||
!
|
||||
LOGICAL :: use_tg
|
||||
! Variables for task groups
|
||||
!@njs: tg_v, tg_psic
|
||||
REAL(DP), DEVICE, POINTER :: tg_v_d(:)
|
||||
COMPLEX(DP), DEVICE, POINTER :: tg_psic_d(:)
|
||||
INTEGER, DEVICE, POINTER :: dffts_nl_d(:), dffts_nlm_d(:)
|
||||
INTEGER :: v_siz, idx, ioff
|
||||
INTEGER :: ierr
|
||||
!
|
||||
CALL start_clock ('vloc_psi')
|
||||
incr = 2
|
||||
!
|
||||
IF (.not. allocated(psic_d)) ALLOCATE(psic_d, SOURCE=psic_h)
|
||||
use_tg = dffts%has_task_groups
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL start_clock ('vloc_psi:tg_gather')
|
||||
v_siz = dffts%nnr_tg
|
||||
!
|
||||
CALL buffer%lock_buffer( tg_v_d, v_siz, ierr )
|
||||
CALL buffer%lock_buffer( tg_psic_d, v_siz, ierr )
|
||||
!
|
||||
CALL tg_gather_gpu( dffts, v_d, tg_v_d )
|
||||
CALL stop_clock ('vloc_psi:tg_gather')
|
||||
!
|
||||
incr = 2 * fftx_ntgrp(dffts)
|
||||
!
|
||||
ENDIF
|
||||
! Sync fft data
|
||||
CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl), ierr )
|
||||
CALL buffer%lock_buffer( dffts_nlm_d, size(dffts%nlm), ierr )
|
||||
dffts_nl_d(1:n) = dffts%nl(1:n)
|
||||
dffts_nlm_d(1:n) = dffts%nlm(1:n)
|
||||
! End Sync
|
||||
|
||||
!
|
||||
! the local potential V_Loc psi. First bring psi to real space
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
!
|
||||
tg_psic_d = (0.d0, 0.d0)
|
||||
ioff = 0
|
||||
!
|
||||
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
||||
IF( idx + ibnd - 1 < m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
tg_psic_d(dffts_nl_d (j)+ioff) = psi_d(j,idx+ibnd-1) + &
|
||||
(0.0d0,1.d0) * psi_d(j,idx+ibnd)
|
||||
tg_psic_d(dffts_nlm_d(j)+ioff) = conjg( psi_d(j,idx+ibnd-1) - &
|
||||
(0.0d0,1.d0) * psi_d(j,idx+ibnd) )
|
||||
ENDDO
|
||||
ELSEIF( idx + ibnd - 1 == m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
tg_psic_d(dffts_nl_d (j)+ioff) = psi_d(j,idx+ibnd-1)
|
||||
tg_psic_d(dffts_nlm_d(j)+ioff) = conjg( psi_d(j,idx+ibnd-1) )
|
||||
ENDDO
|
||||
ENDIF
|
||||
|
||||
ioff = ioff + right_nnr
|
||||
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
psic_d(:) = (0.d0, 0.d0)
|
||||
IF (ibnd < m) THEN
|
||||
! two ffts at the same time
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
psic_d(dffts_nl_d (j))= psi_d(j,ibnd) + (0.0d0,1.d0)*psi_d(j,ibnd+1)
|
||||
psic_d(dffts_nlm_d(j))=conjg(psi_d(j,ibnd) - (0.0d0,1.d0)*psi_d(j,ibnd+1))
|
||||
ENDDO
|
||||
ELSE
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
psic_d (dffts_nl_d (j)) = psi_d(j, ibnd)
|
||||
psic_d (dffts_nlm_d(j)) = conjg(psi_d(j, ibnd))
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! fft to real space
|
||||
! product with the potential v on the smooth grid
|
||||
! back to reciprocal space
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psic_d, dffts )
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, dffts%nr1x * dffts%nr2x * right_nr3
|
||||
tg_psic_d (j) = tg_psic_d (j) * tg_v_d(j)
|
||||
ENDDO
|
||||
!
|
||||
CALL fwfft ('tgWave', tg_psic_d, dffts )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL invfft ('Wave', psic_d, dffts)
|
||||
!
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, dffts%nnr
|
||||
psic_d (j) = psic_d (j) * v_d(j)
|
||||
ENDDO
|
||||
!
|
||||
CALL fwfft ('Wave', psic_d, dffts)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! addition to the total product
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
ioff = 0
|
||||
!
|
||||
CALL tg_get_recip_inc( dffts, right_inc )
|
||||
!
|
||||
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
||||
!
|
||||
IF( idx + ibnd - 1 < m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
fp= ( tg_psic_d( dffts_nl_d(j) + ioff ) + &
|
||||
tg_psic_d( dffts_nlm_d(j) + ioff ) ) * 0.5d0
|
||||
fm= ( tg_psic_d( dffts_nl_d(j) + ioff ) - &
|
||||
tg_psic_d( dffts_nlm_d(j) + ioff ) ) * 0.5d0
|
||||
hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
|
||||
cmplx( dble(fp), aimag(fm),kind=DP)
|
||||
hpsi_d (j, ibnd+idx ) = hpsi_d (j, ibnd+idx ) + &
|
||||
cmplx(aimag(fp),- dble(fm),kind=DP)
|
||||
ENDDO
|
||||
ELSEIF( idx + ibnd - 1 == m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
|
||||
tg_psic_d( dffts_nl_d(j) + ioff )
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
ioff = ioff + right_inc
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
IF (ibnd < m) THEN
|
||||
! two ffts at the same time
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
fp = (psic_d (dffts_nl_d(j)) + psic_d (dffts_nlm_d(j)))*0.5d0
|
||||
fm = (psic_d (dffts_nl_d(j)) - psic_d (dffts_nlm_d(j)))*0.5d0
|
||||
hpsi_d (j, ibnd) = hpsi_d (j, ibnd) + &
|
||||
cmplx( dble(fp), aimag(fm),kind=DP)
|
||||
hpsi_d (j, ibnd+1) = hpsi_d (j, ibnd+1) + &
|
||||
cmplx(aimag(fp),- dble(fm),kind=DP)
|
||||
ENDDO
|
||||
ELSE
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
hpsi_d (j, ibnd) = hpsi_d (j, ibnd) + psic_d (dffts_nl_d(j))
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL buffer%release_buffer( tg_psic_d, ierr )
|
||||
CALL buffer%release_buffer( tg_v_d, ierr )
|
||||
!
|
||||
ENDIF
|
||||
CALL buffer%release_buffer( dffts_nl_d, ierr )
|
||||
CALL buffer%release_buffer( dffts_nlm_d, ierr )
|
||||
CALL stop_clock ('vloc_psi')
|
||||
!
|
||||
RETURN
|
||||
END SUBROUTINE vloc_psi_gamma_gpu
|
||||
!
|
||||
!@njs: vloc_psi_k
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_k_gpu(lda, n, m, psi_d, v_d, hpsi_d)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! Calculation of Vloc*psi using dual-space technique - k-points
|
||||
!
|
||||
! fft to real space
|
||||
! product with the potential v on the smooth grid
|
||||
! back to reciprocal space
|
||||
! addition to the hpsi
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE wvfct, ONLY : current_k
|
||||
USE klist, ONLY : igk_k_host => igk_k
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE fft_base, ONLY : dffts
|
||||
USE fft_interfaces,ONLY : fwfft, invfft
|
||||
USE fft_helper_subroutines
|
||||
USE wavefunctions_module, ONLY: psic_h => psic
|
||||
USE wavefunctions_module_gpum, ONLY: psic_d
|
||||
USE buffers_module, ONLY : buffer
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(in) :: lda, n, m
|
||||
COMPLEX(DP), DEVICE, INTENT(in) :: psi_d (lda, m)
|
||||
COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda, m)
|
||||
REAL(DP), DEVICE, INTENT(in) :: v_d(dffts%nnr)
|
||||
!
|
||||
INTEGER :: ibnd, j, incr
|
||||
INTEGER :: i, right_nnr, right_nr3, right_inc
|
||||
!
|
||||
LOGICAL :: use_tg
|
||||
! Task Groups
|
||||
REAL(DP), DEVICE, POINTER :: tg_v_d(:)
|
||||
COMPLEX(DP), DEVICE, POINTER :: tg_psic_d(:)
|
||||
INTEGER, DEVICE, POINTER :: dffts_nl_d(:)
|
||||
INTEGER, DEVICE, POINTER :: igk_k_d(:)
|
||||
INTEGER :: v_siz, idx, ioff
|
||||
INTEGER :: ierr
|
||||
!
|
||||
IF (.not. allocated(psic_d)) ALLOCATE(psic_d, SOURCE=psic_h)
|
||||
CALL start_clock ('vloc_psi')
|
||||
use_tg = dffts%has_task_groups
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL start_clock ('vloc_psi:tg_gather')
|
||||
v_siz = dffts%nnr_tg
|
||||
!
|
||||
CALL buffer%lock_buffer( tg_v_d, v_siz, ierr )
|
||||
CALL buffer%lock_buffer( tg_psic_d, v_siz, ierr )
|
||||
!
|
||||
CALL tg_gather( dffts, v_d, tg_v_d )
|
||||
CALL stop_clock ('vloc_psi:tg_gather')
|
||||
|
||||
ENDIF
|
||||
CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl) , ierr )
|
||||
CALL buffer%lock_buffer( igk_k_d, n, ierr )
|
||||
!
|
||||
dffts_nl_d = dffts%nl
|
||||
igk_k_d(1:n) = igk_k_host(1:n, current_k)
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
|
||||
DO ibnd = 1, m, fftx_ntgrp(dffts)
|
||||
!
|
||||
tg_psic_d = (0.d0, 0.d0)
|
||||
ioff = 0
|
||||
!
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
|
||||
IF( idx + ibnd - 1 <= m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
tg_psic_d(dffts_nl_d (igk_k_d(j))+ioff) = psi_d(j,idx+ibnd-1)
|
||||
ENDDO
|
||||
|
||||
ENDIF
|
||||
|
||||
!write (6,*) 'wfc G ', idx+ibnd-1
|
||||
!write (6,99) (tg_psic(i+ioff), i=1,400)
|
||||
|
||||
ioff = ioff + right_nnr
|
||||
ENDDO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psic_d, dffts )
|
||||
!write (6,*) 'wfc R '
|
||||
!write (6,99) (tg_psic(i), i=1,400)
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
!
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, dffts%nr1x*dffts%nr2x* right_nr3
|
||||
tg_psic_d (j) = tg_psic_d (j) * tg_v_d(j)
|
||||
ENDDO
|
||||
!
|
||||
!write (6,*) 'v psi R '
|
||||
!write (6,99) (tg_psic(i), i=1,400)
|
||||
!
|
||||
CALL fwfft ('tgWave', tg_psic_d, dffts )
|
||||
!
|
||||
! addition to the total product
|
||||
!
|
||||
ioff = 0
|
||||
!
|
||||
CALL tg_get_recip_inc( dffts, right_inc )
|
||||
!
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
!
|
||||
IF( idx + ibnd - 1 <= m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
|
||||
tg_psic_d( dffts_nl_d(igk_k_d(j)) + ioff )
|
||||
ENDDO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!write (6,*) 'v psi G ', idx+ibnd-1
|
||||
!write (6,99) (tg_psic(i+ioff), i=1,400)
|
||||
|
||||
ioff = ioff + right_inc
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ENDDO
|
||||
ELSE
|
||||
DO ibnd = 1, m
|
||||
!
|
||||
!!! == OPTIMIZE HERE == (setting to 0 and setting elements!)
|
||||
psic_d(:) = (0.d0, 0.d0)
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
psic_d (dffts_nl_d (igk_k_d(j))) = psi_d(j, ibnd)
|
||||
END DO
|
||||
!
|
||||
!write (6,*) 'wfc G ', ibnd
|
||||
!write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
CALL invfft ('Wave', psic_d, dffts)
|
||||
!write (6,*) 'wfc R '
|
||||
!write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, dffts%nnr
|
||||
psic_d (j) = psic_d (j) * v_d(j)
|
||||
ENDDO
|
||||
!
|
||||
!write (6,*) 'v psi R '
|
||||
!write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
CALL fwfft ('Wave', psic_d, dffts)
|
||||
!
|
||||
! addition to the total product
|
||||
!
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
hpsi_d (j, ibnd) = hpsi_d (j, ibnd) + psic_d (dffts_nl_d(igk_k_d(j)))
|
||||
ENDDO
|
||||
!
|
||||
!write (6,*) 'v psi G ', ibnd
|
||||
!write (6,99) (psic(i), i=1,400)
|
||||
!
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL buffer%release_buffer( tg_psic_d, ierr )
|
||||
CALL buffer%release_buffer( tg_v_d, ierr )
|
||||
!
|
||||
ENDIF
|
||||
CALL buffer%release_buffer( dffts_nl_d, ierr )
|
||||
CALL buffer%release_buffer( igk_k_d, ierr )
|
||||
CALL stop_clock ('vloc_psi')
|
||||
!
|
||||
99 format ( 20 ('(',2f12.9,')') )
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE vloc_psi_k_gpu
|
||||
!
|
||||
!@njs: vloc_psi_nc
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE vloc_psi_nc_gpu (lda, n, m, psi_d, v_d, hpsi_d)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! Calculation of Vloc*psi using dual-space technique - noncolinear
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE wvfct, ONLY : current_k
|
||||
USE klist, ONLY : igk_k_host => igk_k
|
||||
USE mp_bands, ONLY : me_bgrp
|
||||
USE fft_base, ONLY : dffts, dfftp
|
||||
USE fft_interfaces,ONLY : fwfft, invfft
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE spin_orb, ONLY : domag
|
||||
USE noncollin_module, ONLY: npol
|
||||
USE wavefunctions_module, ONLY: psic_nc_h => psic_nc
|
||||
USE wavefunctions_module_gpum, ONLY: psic_nc_d
|
||||
USE fft_helper_subroutines
|
||||
USE buffers_module, ONLY : buffer
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(in) :: lda, n, m
|
||||
REAL(DP), DEVICE, INTENT(in) :: v_d(dfftp%nnr,4) ! beware dimensions!
|
||||
COMPLEX(DP), DEVICE, INTENT(in) :: psi_d (lda*npol, m)
|
||||
COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda,npol,m)
|
||||
INTEGER, DEVICE, POINTER :: dffts_nl_d(:)
|
||||
INTEGER, DEVICE, POINTER :: igk_k_d(:)
|
||||
!
|
||||
INTEGER :: ibnd, j,ipol, incr, is
|
||||
COMPLEX(DP) :: sup, sdwn
|
||||
!
|
||||
LOGICAL :: use_tg
|
||||
! Variables for task groups
|
||||
REAL(DP), DEVICE, ALLOCATABLE :: tg_v_d(:,:)
|
||||
COMPLEX(DP), DEVICE, ALLOCATABLE :: tg_psic_d(:,:)
|
||||
INTEGER :: v_siz, idx, ioff
|
||||
INTEGER :: right_nnr, right_nr3, right_inc
|
||||
INTEGER :: ierr
|
||||
!
|
||||
CALL start_clock ('vloc_psi')
|
||||
!
|
||||
IF (.not. allocated(psic_nc_d)) ALLOCATE(psic_nc_d, SOURCE=psic_nc_h)
|
||||
incr = 1
|
||||
!
|
||||
use_tg = dffts%has_task_groups
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL start_clock ('vloc_psi:tg_gather')
|
||||
v_siz = dffts%nnr_tg
|
||||
IF (domag) THEN
|
||||
ALLOCATE( tg_v_d( v_siz, 4 ) )
|
||||
DO is=1,nspin
|
||||
CALL tg_gather( dffts, v_d(:,is), tg_v_d(:,is) )
|
||||
ENDDO
|
||||
ELSE
|
||||
ALLOCATE( tg_v_d( v_siz, 1 ) )
|
||||
CALL tg_gather( dffts, v_d(:,1), tg_v_d(:,1) )
|
||||
ENDIF
|
||||
ALLOCATE( tg_psic_d( v_siz, npol ) )
|
||||
CALL stop_clock ('vloc_psi:tg_gather')
|
||||
|
||||
incr = fftx_ntgrp(dffts)
|
||||
ENDIF
|
||||
CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl), ierr )
|
||||
CALL buffer%lock_buffer( igk_k_d, n, ierr )
|
||||
dffts_nl_d = dffts%nl
|
||||
igk_k_d(1:n) = igk_k_host(1:n, current_k)
|
||||
!
|
||||
! the local potential V_Loc psi. First the psi in real space
|
||||
!
|
||||
DO ibnd = 1, m, incr
|
||||
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
CALL tg_get_nnr( dffts, right_nnr )
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
!
|
||||
! == OPTIMIZE HERE == setting data twice
|
||||
tg_psic_d(:,ipol) = ( 0.D0, 0.D0 )
|
||||
ioff = 0
|
||||
!
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
!
|
||||
IF( idx + ibnd - 1 <= m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
tg_psic_d( dffts_nl_d( igk_k_d(j) ) + ioff, ipol ) = &
|
||||
psi_d( j +(ipol-1)*lda, idx+ibnd-1 )
|
||||
ENDDO
|
||||
ENDIF
|
||||
|
||||
ioff = ioff + right_nnr
|
||||
|
||||
ENDDO
|
||||
!
|
||||
CALL invfft ('tgWave', tg_psic_d(:,ipol), dffts )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
psic_nc_d = (0.d0,0.d0)
|
||||
DO ipol=1,npol
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
psic_nc_d(dffts_nl_d(igk_k_d(j)),ipol) = psi_d(j+(ipol-1)*lda,ibnd)
|
||||
ENDDO
|
||||
CALL invfft ('Wave', psic_nc_d(:,ipol), dffts)
|
||||
ENDDO
|
||||
ENDIF
|
||||
|
||||
!
|
||||
! product with the potential v = (vltot+vr) on the smooth grid
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
CALL tg_get_group_nr3( dffts, right_nr3 )
|
||||
IF (domag) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j=1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
sup = tg_psic_d(j,1) * (tg_v_d(j,1)+tg_v_d(j,4)) + &
|
||||
tg_psic_d(j,2) * (tg_v_d(j,2)-(0.d0,1.d0)*tg_v_d(j,3))
|
||||
sdwn = tg_psic_d(j,2) * (tg_v_d(j,1)-tg_v_d(j,4)) + &
|
||||
tg_psic_d(j,1) * (tg_v_d(j,2)+(0.d0,1.d0)*tg_v_d(j,3))
|
||||
tg_psic_d(j,1)=sup
|
||||
tg_psic_d(j,2)=sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j=1, dffts%nr1x*dffts%nr2x*right_nr3
|
||||
tg_psic_d(j,:) = tg_psic_d(j,:) * tg_v_d(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ELSE
|
||||
IF (domag) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j=1, dffts%nnr
|
||||
sup = psic_nc_d(j,1) * (v_d(j,1)+v_d(j,4)) + &
|
||||
psic_nc_d(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3))
|
||||
sdwn = psic_nc_d(j,2) * (v_d(j,1)-v_d(j,4)) + &
|
||||
psic_nc_d(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3))
|
||||
psic_nc_d(j,1)=sup
|
||||
psic_nc_d(j,2)=sdwn
|
||||
ENDDO
|
||||
ELSE
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j=1, dffts%nnr
|
||||
psic_nc_d(j,:) = psic_nc_d(j,:) * v_d(j,1)
|
||||
ENDDO
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
! back to reciprocal space
|
||||
!
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
DO ipol = 1, npol
|
||||
|
||||
CALL fwfft ('tgWave', tg_psic_d(:,ipol), dffts )
|
||||
!
|
||||
ioff = 0
|
||||
!
|
||||
CALL tg_get_recip_inc( dffts, right_inc )
|
||||
!
|
||||
DO idx = 1, fftx_ntgrp(dffts)
|
||||
!
|
||||
IF( idx + ibnd - 1 <= m ) THEN
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
hpsi_d (j, ipol, ibnd+idx-1) = hpsi_d (j, ipol, ibnd+idx-1) + &
|
||||
tg_psic_d( dffts_nl_d(igk_k_d(j)) + ioff, ipol )
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
ioff = ioff + right_inc
|
||||
!
|
||||
ENDDO
|
||||
|
||||
ENDDO
|
||||
!
|
||||
ELSE
|
||||
|
||||
DO ipol=1,npol
|
||||
CALL fwfft ('Wave', psic_nc_d(:,ipol), dffts)
|
||||
ENDDO
|
||||
!
|
||||
! addition to the total product
|
||||
!
|
||||
DO ipol=1,npol
|
||||
!$cuf kernel do(1) <<<,>>>
|
||||
DO j = 1, n
|
||||
hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + &
|
||||
psic_nc_d(dffts_nl_d(igk_k_d(j)),ipol)
|
||||
ENDDO
|
||||
ENDDO
|
||||
|
||||
ENDIF
|
||||
|
||||
ENDDO
|
||||
|
||||
IF( use_tg ) THEN
|
||||
!
|
||||
DEALLOCATE(tg_v_d, tg_psic_d)
|
||||
!
|
||||
ENDIF
|
||||
CALL buffer%release_buffer( dffts_nl_d, ierr )
|
||||
CALL buffer%release_buffer( igk_k_d, ierr )
|
||||
CALL stop_clock ('vloc_psi')
|
||||
!
|
||||
RETURN
|
||||
END SUBROUTINE vloc_psi_nc_gpu
|
||||
!@nje
|
||||
#endif
|
Loading…
Reference in New Issue