Added GPU version for h_psi, s_psi, g_psi and vloc. psic and psic_nc module variables should be standardized. Many data copies between CPU and GPU should be replaced by data in modules. All test passing except for pw_vc-relax/vc-relax3.in which has too loose convergence (noticed by Josh Romero) and for dsygvdx_gpu (real problem) occasionally failing

This commit is contained in:
Pietro Bonfa 2018-04-18 13:28:38 +02:00
parent df72013a32
commit 5f6d231bdd
18 changed files with 2935 additions and 116 deletions

View File

@ -281,7 +281,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!END DO
! ========= TO HERE, REPLACED BY =======
CALL reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, vc_d)
CALL reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, vc_d)
!
nb1 = nbase + 1
!
@ -608,62 +608,62 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
RETURN
!
CONTAINS
SUBROUTINE reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
USE david_param, ONLY : DP
USE david_buffer, ONLY : buffer
implicit none
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
LOGICAL, INTENT(IN) :: conv(nvec)
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
!
INTEGER :: n, np, info
INTEGER, ALLOCATABLE :: conv_idx(:)
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
!
np = 0
ALLOCATE(conv_idx(nvec))
DO n = 1, nvec
conv_idx(n) = -1
IF ( .NOT. conv(n) ) THEN
np = np + 1
conv_idx(n) = np
END IF
END DO
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
conv_idx_d(1:nvec) = conv_idx(1:nvec)
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
vtmp_d(k,j) = v_d(k,j)
END DO
END DO
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
IF(conv_idx_d(j) /= -1) THEN
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
END IF
END DO
END DO
!
CALL buffer%release_buffer(conv_idx_d, info)
CALL buffer%release_buffer(vtmp_d, info)
!
DEALLOCATE(conv_idx)
END SUBROUTINE
END SUBROUTINE cegterg_gpu
SUBROUTINE reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
USE david_param, ONLY : DP
USE david_buffer, ONLY : buffer
implicit none
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
LOGICAL, INTENT(IN) :: conv(nvec)
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
!
INTEGER :: j, k, n, np, info
INTEGER, ALLOCATABLE :: conv_idx(:)
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
!
np = 0
ALLOCATE(conv_idx(nvec))
DO n = 1, nvec
conv_idx(n) = -1
IF ( .NOT. conv(n) ) THEN
np = np + 1
conv_idx(n) = np
END IF
END DO
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
conv_idx_d(1:nvec) = conv_idx(1:nvec)
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
vtmp_d(k,j) = v_d(k,j)
END DO
END DO
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
IF(conv_idx_d(j) /= -1) THEN
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
END IF
END DO
END DO
!
CALL buffer%release_buffer(conv_idx_d, info)
CALL buffer%release_buffer(vtmp_d, info)
!
DEALLOCATE(conv_idx)
END SUBROUTINE reorder_evals_cevecs
!
! Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
!

View File

@ -272,7 +272,7 @@ SUBROUTINE regterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!END DO
! ========= TO HERE, REPLACED BY =======
CALL reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, vr_d)
CALL reorder_evals_revecs(nbase, nvec, nvecx, conv, e_d, ew_d, vr_d)
!
nb1 = nbase + 1
!
@ -567,62 +567,63 @@ SUBROUTINE regterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
RETURN
!
CONTAINS
SUBROUTINE reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
USE david_param, ONLY : DP
USE david_buffer, ONLY : buffer
implicit none
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
LOGICAL, INTENT(IN) :: conv(nvec)
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
REAL(DP), DEVICE :: v_d(nvecx,nvecx)
!
INTEGER :: n, np, info
INTEGER, ALLOCATABLE :: conv_idx(:)
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
REAL(DP), DEVICE, POINTER :: vtmp_d(:,:)
!
np = 0
ALLOCATE(conv_idx(nvec))
DO n = 1, nvec
conv_idx(n) = -1
IF ( .NOT. conv(n) ) THEN
np = np + 1
conv_idx(n) = np
END IF
END DO
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
conv_idx_d(1:nvec) = conv_idx(1:nvec)
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
vtmp_d(k,j) = v_d(k,j)
END DO
END DO
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
IF(conv_idx_d(j) /= -1) THEN
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
END IF
END DO
END DO
!
CALL buffer%release_buffer(conv_idx_d, info)
CALL buffer%release_buffer(vtmp_d, info)
!
DEALLOCATE(conv_idx)
END SUBROUTINE
END SUBROUTINE regterg_gpu
SUBROUTINE reorder_evals_revecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
USE david_param, ONLY : DP
USE david_buffer, ONLY : buffer
implicit none
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
LOGICAL, INTENT(IN) :: conv(nvec)
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
REAL(DP), DEVICE :: v_d(nvecx,nvecx)
!
INTEGER :: j, k, n, np, info
INTEGER, ALLOCATABLE :: conv_idx(:)
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
REAL(DP), DEVICE, POINTER :: vtmp_d(:,:)
!
np = 0
ALLOCATE(conv_idx(nvec))
DO n = 1, nvec
conv_idx(n) = -1
IF ( .NOT. conv(n) ) THEN
np = np + 1
conv_idx(n) = np
END IF
END DO
IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
CALL buffer%lock_buffer(conv_idx_d, nvec, info)
CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
conv_idx_d(1:nvec) = conv_idx(1:nvec)
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
vtmp_d(k,j) = v_d(k,j)
END DO
END DO
!$cuf kernel do(2) <<<*,*>>>
DO j=1,nvec
DO k=1,nvecx
IF(conv_idx_d(j) /= -1) THEN
v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
END IF
END DO
END DO
!
CALL buffer%release_buffer(conv_idx_d, info)
CALL buffer%release_buffer(vtmp_d, info)
!
DEALLOCATE(conv_idx)
END SUBROUTINE reorder_evals_revecs
!
! Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
!

View File

@ -162,6 +162,8 @@ w0gauss.o \
w1gauss.o \
deviatoric.o
-include make.gpu
TLDEPS=libfft
all : libqemod.a

View File

@ -0,0 +1,17 @@
!
! Copyright (C) Quantum ESPRESSO group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
MODULE buffers_module
#if defined(__CUDA)
USE fbuf, ONLY : buf_t
!
IMPLICIT NONE
!
TYPE(buf_t) :: buffer
#endif
END MODULE buffers_module

1153
Modules/fbuf.f90 Normal file

File diff suppressed because it is too large Load Diff

4
Modules/make.gpu Normal file
View File

@ -0,0 +1,4 @@
MODULES += \
fbuf.o \
buffers_module.o \
wavefunctions_gpu.o

View File

@ -0,0 +1,60 @@
!
! Copyright (C) 2002-2011 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!=----------------------------------------------------------------------------=!
MODULE wavefunctions_module_gpum
!=----------------------------------------------------------------------------=!
USE kinds, ONLY : DP
IMPLICIT NONE
SAVE
!
COMPLEX(DP), DEVICE, ALLOCATABLE, TARGET :: &
evc_d(:,:) ! wavefunctions in the PW basis set
! noncolinear case: first index
! is a combined PW + spin index
!
COMPLEX(DP) , DEVICE, ALLOCATABLE, TARGET :: &
psic_d(:), & ! additional memory for FFT
psic_nc_d(:,:) ! as above for the noncolinear case
!
CONTAINS
!
! ==== SUBS FOR EVC
!
SUBROUTINE start_sync_evc_dev()
use wavefunctions_module, only: evc
implicit none
if (.not. allocated(evc_d)) then
allocate(evc_d, source=evc)
return
end if
evc_d = evc
!
END SUBROUTINE start_sync_evc_dev
!
SUBROUTINE start_sync_evc_hst()
use wavefunctions_module, only: evc
implicit none
evc = evc_d
END SUBROUTINE start_sync_evc_hst
SUBROUTINE wait_sync_evc_dev()
RETURN
END SUBROUTINE wait_sync_evc_dev
SUBROUTINE wait_sync_evc_hst()
RETURN
END SUBROUTINE wait_sync_evc_hst
!=----------------------------------------------------------------------------=!
END MODULE wavefunctions_module_gpum
!=----------------------------------------------------------------------------=!

View File

@ -163,6 +163,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
USE noncollin_module, ONLY : noncolin, npol
USE wavefunctions_module, ONLY : evc
USE g_psi_mod, ONLY : h_diag, s_diag
USE g_psi_mod_gpum, ONLY : using_h_diag, using_s_diag
USE scf, ONLY : v_of_0
USE bp, ONLY : lelfield, evcel, evcelp, evcelm, bec_evcel,&
gdir, l3dstring, efield, efield_cry
@ -208,6 +209,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
! In addition to the above ithe initial wfc rotation uses h_psi, and s_psi
ALLOCATE( h_diag( npwx, npol ), STAT=ierr )
call using_h_diag(.true.); call using_s_diag(.true.)
IF( ierr /= 0 ) &
CALL errore( ' diag_bands ', ' cannot allocate h_diag ', ABS(ierr) )
!
@ -240,6 +242,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
CALL deallocate_bec_type ( becp )
DEALLOCATE( s_diag )
DEALLOCATE( h_diag )
call using_h_diag(.true.); call using_s_diag(.true.)
!
IF ( notconv > MAX( 5, nbnd / 4 ) ) THEN
!
@ -278,6 +281,7 @@ CONTAINS
h_diag(ig,1) = 1.D0 + g2kin(ig) + SQRT( 1.D0 + ( g2kin(ig) - 1.D0 )**2 )
!
END FORALL
call using_h_diag(.true.)
!
ntry = 0
!
@ -318,6 +322,7 @@ CONTAINS
h_diag(1:npw, 1) = g2kin(1:npw) + v_of_0
!
CALL usnldiag( npw, h_diag, s_diag )
call using_h_diag(.true.); call using_s_diag(.true.);
!
ntry = 0
!
@ -438,6 +443,7 @@ CONTAINS
h_diag(ig,:) = 1.D0 + g2kin(ig) + SQRT( 1.D0 + ( g2kin(ig) - 1.D0 )**2 )
!
END FORALL
call using_h_diag(.true.);
!
ntry = 0
!
@ -482,6 +488,7 @@ CONTAINS
END DO
!
CALL usnldiag( npw, h_diag, s_diag )
call using_h_diag(.true.); call using_s_diag(.true.);
!
ntry = 0
!

View File

@ -16,6 +16,7 @@ subroutine g_psi (lda, n, m, npol, psi, e)
!
USE kinds
USE g_psi_mod
USE g_psi_mod_gpum, ONLY : using_h_diag, using_s_diag
implicit none
integer :: lda, n, m, npol, ipol
! input: the leading dimension of psi
@ -37,6 +38,7 @@ subroutine g_psi (lda, n, m, npol, psi, e)
! counter on psi functions
! counter on G vectors
!
call using_h_diag(.false.); call using_s_diag(.false.)
call start_clock ('g_psi')
!
#ifdef TEST_NEW_PRECONDITIONING

View File

@ -10,6 +10,77 @@
!-----------------------------------------------------------------------
#if defined(__CUDA)
subroutine g_psi_gpu (lda, n, m, npol, psi_d, e_d)
USE kinds
USE cudafor
USE g_psi_mod_gpum, ONLY : h_diag_d, s_diag_d, using_h_diag_d, using_s_diag_d
implicit none
integer :: lda, n, m, npol, ipol
! input: the leading dimension of psi
! input: the real dimension of psi
! input: the number of bands
! input: the number of coordinates of psi
! local variable: counter of coordinates of psi
real(DP) :: e_d (m)
! input: the eigenvectors
complex(DP) :: psi_d (lda, npol, m)
! inp/out: the psi vector
attributes( device ) :: e_d, psi_d
!
! Local variables
!
real(DP), parameter :: eps = 1.0d-4
! a small number
real(DP) :: x, scala, denm
integer :: k, i
! counter on psi functions
! counter on G vectors
!
call using_h_diag_d(.false.)
call using_s_diag_d(.false.)
!
call start_clock ('g_psi')
!
#ifdef TEST_NEW_PRECONDITIONING
scala = 1.d0
!$cuf kernel do(3) <<<*,*>>>
do ipol=1,npol
do k = 1, m
do i = 1, n
x = (h_diag_d(i,ipol) - e_d(k)*s_diag_d(i,ipol))*scala
denm = 0.5_dp*(1.d0+x+sqrt(1.d0+(x-1)*(x-1.d0)))/scala
psi_d (i, ipol, k) = psi_d (i, ipol, k) / denm
enddo
enddo
enddo
#else
!$cuf kernel do(3) <<<*,*>>>
do ipol=1,npol
do k = 1, m
do i = 1, n
denm = h_diag_d (i,ipol) - e_d (k) * s_diag_d (i,ipol)
!
! denm = g2+v(g=0) - e(k)
!
if (abs (denm) < eps) denm = sign (eps, denm)
!
! denm = sign( max( abs(denm),eps ), denm )
!
psi_d (i, ipol, k) = psi_d (i, ipol, k) / denm
enddo
enddo
enddo
#endif
call stop_clock ('g_psi')
return
end subroutine g_psi_gpu
subroutine g_psi_gpu_compatibility (lda, n, m, npol, psi_d, e_d)
!-----------------------------------------------------------------------
!
! This routine computes an estimate of the inverse Hamiltonian
@ -57,5 +128,5 @@ subroutine g_psi_gpu (lda, n, m, npol, psi_d, e_d)
DEALLOCATE(psi, e )
return
end subroutine g_psi_gpu
end subroutine g_psi_gpu_compatibility
#endif

93
PW/src/g_psi_mod_gpu.f90 Normal file
View File

@ -0,0 +1,93 @@
!
! Copyright (C) 2001-2007 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
MODULE g_psi_mod_gpum
!
! ... These are the variables needed in g_psi
!
USE kinds, only : DP
!
IMPLICIT NONE
!
REAL(DP), DEVICE, ALLOCATABLE :: &
h_diag_d (:,:),& ! diagonal part of the Hamiltonian
s_diag_d (:,:) ! diagonal part of the overlap matrix
LOGICAL :: h_diag_ood = .false. ! used to flag out of date variables
LOGICAL :: s_diag_ood = .false.
LOGICAL :: h_diag_d_ood = .false.
LOGICAL :: s_diag_d_ood = .false.
!
CONTAINS
!
SUBROUTINE using_h_diag(writing)
USE g_psi_mod, ONLY : h_diag
implicit none
LOGICAL, INTENT(IN) :: writing
!
IF (h_diag_ood) THEN
h_diag = h_diag_d
h_diag_ood = .false.
ENDIF
IF (writing) h_diag_d_ood = .true.
END SUBROUTINE using_h_diag
!
SUBROUTINE using_h_diag_d(writing)
USE g_psi_mod, ONLY : h_diag
implicit none
LOGICAL, INTENT(IN) :: writing
!
IF (.not. allocated(h_diag)) THEN
IF (allocated(h_diag_d)) DEALLOCATE(h_diag_d)
h_diag_d_ood = .false.
RETURN
END IF
IF (h_diag_d_ood) THEN
IF (.not. allocated(h_diag_d)) THEN
ALLOCATE(h_diag_d, SOURCE=h_diag)
ELSE
h_diag_d = h_diag
ENDIF
h_diag_d_ood = .false.
ENDIF
IF (writing) h_diag_ood = .true.
END SUBROUTINE using_h_diag_d
!
SUBROUTINE using_s_diag(writing)
USE g_psi_mod, ONLY : s_diag
implicit none
LOGICAL, INTENT(IN) :: writing
!
IF (s_diag_ood) THEN
s_diag = s_diag_d
s_diag_ood = .false.
ENDIF
IF (writing) s_diag_d_ood = .true.
END SUBROUTINE using_s_diag
!
SUBROUTINE using_s_diag_d(writing)
USE g_psi_mod, ONLY : s_diag
implicit none
LOGICAL, INTENT(IN) :: writing
!
IF (.not. allocated(s_diag)) THEN
IF (allocated(s_diag_d)) DEALLOCATE(s_diag_d)
s_diag_d_ood = .false.
RETURN
END IF
IF (s_diag_d_ood) THEN
IF (.not. allocated(s_diag_d)) THEN
ALLOCATE(s_diag_d, SOURCE=s_diag)
ELSE
s_diag_d = s_diag
END IF
s_diag_d_ood = .false.
ENDIF
IF (writing) s_diag_ood = .true.
END SUBROUTINE using_s_diag_d
!
END MODULE g_psi_mod_gpum

View File

@ -7,7 +7,324 @@
!
!----------------------------------------------------------------------------
#if defined(__CUDA)
! Copyright (C) 2002-2016 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE h_psi_gpu( lda, n, m, psi_d, hpsi_d )
!----------------------------------------------------------------------------
!
! ... This routine computes the product of the Hamiltonian
! ... matrix with m wavefunctions contained in psi
!
! ... input:
! ... lda leading dimension of arrays psi, spsi, hpsi
! ... n true dimension of psi, spsi, hpsi
! ... m number of states psi
! ... psi
!
! ... output:
! ... hpsi H*psi
!
! --- Wrapper routine: performs bgrp parallelization on non-distributed bands
! --- if suitable and required, calls old H\psi routine h_psi_
!
USE kinds, ONLY : DP
USE noncollin_module, ONLY : npol
USE funct, ONLY : exx_is_active
USE mp_bands, ONLY : use_bgrp_in_hpsi, inter_bgrp_comm
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n, m
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
COMPLEX(DP), DEVICE, INTENT(OUT) :: hpsi_d(lda*npol,m)
!
INTEGER :: m_start, m_end
!
CALL start_clock( 'h_psi_bgrp' ); !write (*,*) 'start h_psi_bgrp'; FLUSH(6)
! band parallelization with non-distributed bands is performed if
! 1. enabled (variable use_bgrp_in_hpsi must be set to .T.)
! 2. exact exchange is not active (if it is, band parallelization is already
! used in exx routines called by Hpsi)
! 3. there is more than one band, otherwise there is nothing to parallelize
!
IF (use_bgrp_in_hpsi .AND. .NOT. exx_is_active() .AND. m > 1) THEN
! use band parallelization here
hpsi_d(:,:) = (0.d0,0.d0)
CALL divide(inter_bgrp_comm,m,m_start,m_end)
! Check if there at least one band in this band group
IF (m_end >= m_start) &
CALL h_psi__gpu( lda, n, m_end-m_start+1, psi_d(1,m_start), hpsi_d(1,m_start) )
CALL mp_sum(hpsi_d,inter_bgrp_comm)
ELSE
! don't use band parallelization here
CALL h_psi__gpu( lda, n, m, psi_d, hpsi_d )
END IF
CALL stop_clock( 'h_psi_bgrp' )
RETURN
!
END SUBROUTINE h_psi_gpu
!
!----------------------------------------------------------------------------
SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
!----------------------------------------------------------------------------
!
! ... This routine computes the product of the Hamiltonian
! ... matrix with m wavefunctions contained in psi
!
! ... input:
! ... lda leading dimension of arrays psi, spsi, hpsi
! ... n true dimension of psi, spsi, hpsi
! ... m number of states psi
! ... psi
!
! ... output:
! ... hpsi H*psi
!
USE kinds, ONLY : DP
USE bp, ONLY : lelfield,l3dstring,gdir, efield, efield_cry
USE becmod, ONLY : bec_type, becp, calbec
USE lsda_mod, ONLY : current_spin
USE scf, ONLY : vrs
USE wvfct, ONLY : g2kin
USE uspp, ONLY : vkb, nkb
USE ldaU, ONLY : lda_plus_u, U_projection
USE gvect, ONLY : gstart
USE funct, ONLY : dft_is_meta
USE control_flags, ONLY : gamma_only
USE noncollin_module, ONLY: npol, noncolin
USE realus, ONLY : real_space, &
invfft_orbital_gamma, fwfft_orbital_gamma, calbec_rs_gamma, add_vuspsir_gamma, &
invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, add_vuspsir_k, &
v_loc_psir_inplace
USE fft_base, ONLY : dffts
USE exx, ONLY : use_ace, vexx, vexxace_gamma, vexxace_k
USE funct, ONLY : exx_is_active
USE fft_helper_subroutines
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n, m
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
COMPLEX(DP), DEVICE, INTENT(OUT) :: hpsi_d(lda*npol,m)
!
REAL(DP), DEVICE, ALLOCATABLE :: g2kin_d(:), vrs_d(:,:)
COMPLEX(DP), ALLOCATABLE :: psi_host(:,:)
COMPLEX(DP), ALLOCATABLE :: hpsi_host(:,:)
!
INTEGER :: ipol, ibnd, incr, i
REAL(dp) :: ee
!
LOGICAL :: need_host_copy
!
CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
ALLOCATE(g2kin_d, SOURCE=g2kin)
ALLOCATE(vrs_d, SOURCE=vrs)
hpsi_d (:, 1:m) = (0.0_dp, 0.0_dp)
need_host_copy = ( real_space .and. nkb > 0 ) .OR. &
dft_is_meta() .OR. &
(lda_plus_u .AND. U_projection.NE."pseudo" ) .OR. &
( nkb > 0 .AND. .NOT. real_space) .OR. &
exx_is_active() .OR. lelfield
if (need_host_copy) then
ALLOCATE(psi_host(lda*npol,m) , hpsi_host(lda*npol,m) )
psi_host = psi_d
hpsi_host = hpsi_d ! this is not needed
end if
CALL start_clock( 'h_psi:pot' ); !write (*,*) 'start h_pot';FLUSH(6)
!
! ... Here the product with the local potential V_loc psi
!
IF ( gamma_only ) THEN
!
IF ( real_space .and. nkb > 0 ) then
!
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
IF ( dffts%has_task_groups ) then
incr = 2 * fftx_ntgrp(dffts)
ELSE
incr = 2
ENDIF
DO ibnd = 1, m, incr
! ... transform psi to real space -> psic
CALL invfft_orbital_gamma(psi_host,ibnd,m)
! ... compute becp%r = < beta|psi> from psic in real space
CALL start_clock( 'h_psi:calbec' )
CALL calbec_rs_gamma(ibnd,m,becp%r)
CALL stop_clock( 'h_psi:calbec' )
! ... psic -> vrs * psic (psic overwritten will become hpsi)
CALL v_loc_psir_inplace(ibnd,m)
! ... psic (hpsi) -> psic + vusp
CALL add_vuspsir_gamma(ibnd,m)
! ... transform psic back in reciprocal space and assign it to hpsi
CALL fwfft_orbital_gamma(hpsi_host,ibnd,m)
END DO
hpsi_d = hpsi_host
!
ELSE
! ... usual reciprocal-space algorithm
CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
!
ENDIF
!
ELSE IF ( noncolin ) THEN
!
CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d )
!
ELSE
!
IF ( real_space .and. nkb > 0 ) then
!
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
IF ( dffts%has_task_groups ) then
incr = fftx_ntgrp(dffts)
ELSE
incr = 1
ENDIF
DO ibnd = 1, m
! ... transform psi to real space -> psic
CALL invfft_orbital_k(psi_host,ibnd,m)
! ... compute becp%r = < beta|psi> from psic in real space
CALL start_clock( 'h_psi:calbec' )
CALL calbec_rs_k(ibnd,m)
CALL stop_clock( 'h_psi:calbec' )
! ... psic -> vrs * psic (psic overwritten will become hpsi)
CALL v_loc_psir_inplace(ibnd,m)
! ... psic (hpsi) -> psic + vusp
CALL add_vuspsir_k(ibnd,m)
! ... transform psic back in reciprocal space and assign it to hpsi
CALL fwfft_orbital_k(hpsi_host,ibnd,m)
END DO
IF (need_host_copy) hpsi_d = hpsi_host
!
ELSE
!
CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
!
END IF
!
END IF
!
! ... Here the product with the non local potential V_NL psi
! ... (not in the real-space case: it is done together with V_loc)
!
IF ( nkb > 0 .AND. .NOT. real_space) THEN
!
CALL start_clock( 'h_psi:calbec' )
CALL calbec ( n, vkb, psi_host, becp, m )
CALL stop_clock( 'h_psi:calbec' )
hpsi_host = hpsi_d
CALL add_vuspsi( lda, n, m, hpsi_host )
hpsi_d = hpsi_host
!
END IF
!
CALL stop_clock( 'h_psi:pot' )
!
! ... Here we add the kinetic energy (k+G)^2 psi
!
!$cuf kernel do(2)
DO ibnd = 1, m
DO i=1, n
hpsi_d (i, ibnd) = hpsi_d(i, ibnd) + g2kin_d (i) * psi_d (i, ibnd)
IF ( noncolin ) THEN
hpsi_d (lda+i, ibnd) = hpsi_d(lda+i,ibnd) + g2kin_d (i) * psi_d (lda+i, ibnd)
END IF
END DO
END DO
!
if (dft_is_meta()) then
hpsi_host = hpsi_d
call h_psi_meta (lda, n, m, psi_host, hpsi_host)
hpsi_d = hpsi_host
end if
!
! ... Here we add the Hubbard potential times psi
!
IF ( lda_plus_u .AND. U_projection.NE."pseudo" ) THEN
!
hpsi_host = hpsi_d
IF (noncolin) THEN
CALL vhpsi_nc( lda, n, m, psi_host, hpsi_host )
ELSE
call vhpsi( lda, n, m, psi_host, hpsi_host )
ENDIF
hpsi_d = hpsi_host
!
ENDIF
!
! ... Here the exact-exchange term Vxx psi
!
IF ( exx_is_active() ) THEN
hpsi_host = hpsi_d
IF ( use_ace) THEN
IF (gamma_only) THEN
CALL vexxace_gamma(lda,m,psi_host,ee,hpsi_host)
ELSE
CALL vexxace_k(lda,m,psi_host,ee,hpsi_host)
END IF
ELSE
CALL vexx( lda, n, m, psi_host, hpsi_host, becp )
END IF
hpsi_d = hpsi_host
END IF
!
! ... electric enthalpy if required
!
IF ( lelfield ) THEN
!
hpsi_host = hpsi_d
IF ( .NOT.l3dstring ) THEN
CALL h_epsi_her_apply( lda, n, m, psi_host, hpsi_host,gdir, efield )
ELSE
DO ipol=1,3
CALL h_epsi_her_apply( lda, n, m, psi_host, hpsi_host,ipol,efield_cry(ipol) )
END DO
END IF
hpsi_d = hpsi_host
!
END IF
!
! ... With Gamma-only trick, Im(H*psi)(G=0) = 0 by definition,
! ... but it is convenient to explicitly set it to 0 to prevent trouble
!
IF ( gamma_only .AND. gstart == 2 ) then
!$cuf kernel do(1)
do i=1,m
hpsi_d(1,i) = CMPLX( DBLE( hpsi_d(1,i) ), 0.D0 ,kind=DP)
end do
end if
!
if (need_host_copy) then
DEALLOCATE(psi_host , hpsi_host )
end if
DEALLOCATE(g2kin_d, vrs_d)
CALL stop_clock( 'h_psi' )
!
RETURN
!
END SUBROUTINE h_psi__gpu
SUBROUTINE h_psi_gpu_compatibility( lda, n, m, psi_d, hpsi_d )
USE kinds, ONLY : DP
USE noncollin_module, ONLY : npol
USE cudafor
@ -31,5 +348,5 @@ SUBROUTINE h_psi_gpu( lda, n, m, psi_d, hpsi_d )
DEALLOCATE(psi, hpsi )
END SUBROUTINE h_psi_gpu
END SUBROUTINE h_psi_gpu_compatibility
#endif

View File

@ -33,8 +33,10 @@ SUBROUTINE init_run()
USE esm, ONLY : do_comp_esm, esm_init
USE tsvdw_module, ONLY : tsvdw_initialize
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_fact
USE buffers_module, ONLY : buffer
!
IMPLICIT NONE
INTEGER :: ierr
!
CALL start_clock( 'init_run' )
!
@ -51,6 +53,10 @@ SUBROUTINE init_run()
CALL summary()
CALL memory_report()
!
! ... allocate GPU buffers
!
CALL buffer%init(10, ierr)
!
! ... allocate memory for G- and R-space fft arrays
!
CALL allocate_fft()

View File

@ -1,4 +1,8 @@
PWLIBS += \
g_psi_mod_gpu.o \
g_psi_gpu.o \
h_psi_gpu.o \
s_psi_gpu.o
s_psi_gpu.o \
vloc_psi_gpu.o
c_bands.o : g_psi_mod_gpu.o

View File

@ -212,6 +212,9 @@ MODULE wvfct
et(:,:), &! eigenvalues of the hamiltonian
wg(:,:), &! the weight of each k point and band
g2kin(:) ! kinetic energy
#if defined(__CUDA)
attributes(pinned) :: g2kin
#endif
INTEGER, ALLOCATABLE :: &
btype(:,:) ! one if the corresponding state has to be
! converged to full accuracy, zero otherwise

View File

@ -8,7 +8,460 @@
!
!----------------------------------------------------------------------------
#if defined(__CUDA)
!@njs: s_psi, psi, spsi, s_psi_, s_psi_k, s_psi_gamma, s_psi_nc
SUBROUTINE s_psi_gpu( lda, n, m, psi_d, spsi_d )
!----------------------------------------------------------------------------
!
! ... This routine applies the S matrix to m wavefunctions psi
! ... and puts the results in spsi.
! ... Requires the products of psi with all beta functions
! ... in array becp(nkb,m) (calculated in h_psi or by calbec)
!
! ... input:
!
! ... lda leading dimension of arrays psi, spsi
! ... n true dimension of psi, spsi
! ... m number of states psi
! ... psi
!
! ... output:
!
! ... spsi S*psi
!
! --- Wrapper routine: performs bgrp parallelization on non-distributed bands
! --- if suitable and required, calls old S\psi routine s_psi_
! --- See comments in h_psi.f90 about band parallelization
!
USE kinds, ONLY : DP
USE noncollin_module, ONLY : npol
USE funct, ONLY : exx_is_active
USE mp_bands, ONLY : use_bgrp_in_hpsi, inter_bgrp_comm
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n, m
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
COMPLEX(DP), DEVICE, INTENT(OUT)::spsi_d(lda*npol,m)
!
INTEGER :: m_start, m_end, i
!
CALL start_clock( 's_psi_bgrp' )
IF (use_bgrp_in_hpsi .AND. .NOT. exx_is_active() .AND. m > 1) THEN
! use band parallelization here
spsi_d(:,:) = (0.d0,0.d0)
CALL divide(inter_bgrp_comm,m,m_start,m_end)
!write(6,*) m, m_start,m_end
! Check if there at least one band in this band group
IF (m_end >= m_start) &
CALL s_psi__gpu( lda, n, m_end-m_start+1, psi_d(1,m_start), spsi_d(1,m_start) )
CALL mp_sum(spsi_d,inter_bgrp_comm)
ELSE
! don't use band parallelization here
CALL s_psi__gpu( lda, n, m, psi_d, spsi_d )
END IF
CALL stop_clock( 's_psi_bgrp' )
RETURN
END SUBROUTINE s_psi_gpu
!
!----------------------------------------------------------------------------
SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
!----------------------------------------------------------------------------
!
! ... This routine applies the S matrix to m wavefunctions psi
! ... and puts the results in spsi.
! ... Requires the products of psi with all beta functions
! ... in array becp(nkb,m) (calculated in h_psi or by calbec)
!
! ... input:
!
! ... lda leading dimension of arrays psi, spsi
! ... n true dimension of psi, spsi
! ... m number of states psi
! ... psi
!
! ... output:
!
! ... spsi S*psi
!
USE cudafor
USE cublas
USE buffers_module, ONLY : buffer
USE kinds, ONLY : DP
USE becmod, ONLY : becp
USE uspp, ONLY : vkb, nkb, okvan, qq_at, qq_so, indv_ijkb0
USE spin_orb, ONLY : lspinorb
USE uspp_param, ONLY : upf, nh, nhm
USE ions_base, ONLY : nat, nsp, ityp
USE control_flags, ONLY: gamma_only
USE noncollin_module, ONLY: npol, noncolin
USE realus, ONLY : real_space, &
invfft_orbital_gamma, fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma, &
invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, s_psir_k
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n, m
COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
COMPLEX(DP), DEVICE, INTENT(OUT)::spsi_d(lda*npol,m)
!
COMPLEX(DP), ALLOCATABLE :: psi_host(:,:)
COMPLEX(DP), ALLOCATABLE ::spsi_host(:,:)
!
INTEGER :: ibnd
!
LOGICAL :: need_host_copy
!
! ... initialize spsi
!
spsi_d = psi_d
!
IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
!
need_host_copy = real_space
IF (need_host_copy) THEN
ALLOCATE(psi_host(lda*npol,m), spsi_host(lda*npol,m))
psi_host = psi_d
spsi_host = spsi_d
END IF
!
CALL start_clock( 's_psi' )
!
! ... The product with the beta functions
!
IF ( gamma_only ) THEN
!
IF (real_space ) THEN
!
DO ibnd = 1, m, 2
! transform the orbital to real space
CALL invfft_orbital_gamma(psi_host,ibnd,m)
CALL s_psir_gamma(ibnd,m)
CALL fwfft_orbital_gamma(spsi_host,ibnd,m)
END DO
spsi_d = spsi_host
!
ELSE
!
CALL s_psi_gamma_gpu()
!
END IF
!
ELSE IF ( noncolin ) THEN
!
CALL s_psi_nc_gpu()
!
ELSE
!
IF (real_space ) THEN
!
DO ibnd = 1, m
! transform the orbital to real space
CALL invfft_orbital_k(psi_host,ibnd,m)
CALL s_psir_k(ibnd,m)
CALL fwfft_orbital_k(spsi_host,ibnd,m)
spsi_d = spsi_host
END DO
!
ELSE
!
CALL s_psi_k_gpu()
!
END IF
!
END IF
!
CALL stop_clock( 's_psi' )
!
RETURN
!
CONTAINS
!
!-----------------------------------------------------------------------
SUBROUTINE s_psi_gamma_gpu()
!-----------------------------------------------------------------------
!
! ... gamma version
!
USE mp, ONLY: mp_get_comm_null, mp_circular_shift_left
!
IMPLICIT NONE
!
! ... here the local variables
!
INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ierr
! counters
INTEGER :: nproc, mype, m_loc, m_begin, ibnd_loc, icyc, icur_blk, m_max
! data distribution indexes
INTEGER, EXTERNAL :: ldim_block, gind_block
! data distribution functions
REAL(DP), ALLOCATABLE :: ps(:,:)
REAL(DP), POINTER, CONTIGUOUS, DEVICE :: ps_d(:,:)
COMPLEX(DP), POINTER, CONTIGUOUS, DEVICE :: vkb_d(:,:)
! the product vkb and psi
!
IF( becp%comm == mp_get_comm_null() ) THEN
nproc = 1
mype = 0
m_loc = m
m_begin = 1
m_max = m
ELSE
!
! becp(l,i) = <beta_l|psi_i>, with vkb(n,l)=|beta_l>
! in this case becp(l,i) are distributed (index i is)
!
nproc = becp%nproc
mype = becp%mype
m_loc = becp%nbnd_loc
m_begin = becp%ibnd_begin
m_max = SIZE( becp%r, 2 )
IF( ( m_begin + m_loc - 1 ) > m ) m_loc = m - m_begin + 1
END IF
!
ALLOCATE( ps( nkb, m_max ), STAT=ierr )
CALL buffer%lock_buffer(ps_d, (/ nkb, m_max /), ierr)
CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
IF( ierr /= 0 ) &
CALL errore( ' s_psi_gamma ', ' cannot allocate memory (ps) ', ABS(ierr) )
!
ps(:,:) = 0.D0
!
! In becp=<vkb_i|psi_j> terms corresponding to atom na of type nt
! run from index i=indv_ijkb0(na)+1 to i=indv_ijkb0(na)+nh(nt)
!
DO nt = 1, nsp
IF ( upf(nt)%tvanp ) THEN
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
!
! Next operation computes ps(l',i)=\sum_m qq(l,m) becp(m',i)
! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
!
IF ( m_loc > 0 ) THEN
CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
qq_at(1,1,na), nhm, becp%r(indv_ijkb0(na)+1,1),&
nkb, 0.0_dp, ps(indv_ijkb0(na)+1,1), nkb )
END IF
END IF
END DO
END IF
END DO
ps_d = ps
vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
!
IF( becp%comm == mp_get_comm_null() ) THEN
IF ( m == 1 ) THEN
CALL myDGEMV( 'N', 2 * n, nkb, 1.D0, vkb_d, &
2 * lda, ps_d, 1, 1.D0, spsi_d, 1 )
ELSE
CALL cublasDGEMM( 'N', 'N', 2 * n, m, nkb, 1.D0, vkb_d, &
2 * lda, ps_d, nkb, 1.D0, spsi_d, 2 * lda )
END IF
ELSE
!
! parallel block multiplication of vkb and ps
!
icur_blk = mype
!
DO icyc = 0, nproc - 1
m_loc = ldim_block( becp%nbnd , nproc, icur_blk )
m_begin = gind_block( 1, becp%nbnd, nproc, icur_blk )
IF( ( m_begin + m_loc - 1 ) > m ) m_loc = m - m_begin + 1
IF( m_loc > 0 ) THEN
CALL cublasDGEMM( 'N', 'N', 2 * n, m_loc, nkb, 1.D0, vkb_d, &
2 * lda, ps_d, nkb, 1.D0, spsi_d( 1, m_begin ), 2 * lda )
END IF
! block rotation
!
CALL mp_circular_shift_left( ps_d, icyc, becp%comm )
icur_blk = icur_blk + 1
IF( icur_blk == nproc ) icur_blk = 0
END DO
!
END IF
!
DEALLOCATE( ps )
CALL buffer%release_buffer(ps_d, ierr)
CALL buffer%release_buffer(vkb_d, ierr)
!
RETURN
!
END SUBROUTINE s_psi_gamma_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE s_psi_k_gpu()
!-----------------------------------------------------------------------
!
! ... k-points version
!
IMPLICIT NONE
!
! ... local variables
!
INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ierr
! counters
COMPLEX(DP), ALLOCATABLE :: ps(:,:), qqc(:,:)
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: ps_d(:,:)
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: vkb_d(:,:)
! ps = product vkb and psi ; qqc = complex version of qq
!
ALLOCATE( ps( nkb, m ), STAT=ierr )
!
IF( ierr /= 0 ) &
CALL errore( ' s_psi_k ', ' cannot allocate memory (ps) ', ABS(ierr) )
CALL buffer%lock_buffer(ps_d, (/ nkb, m /), ierr)
CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
!ALLOCATE(vkb_d, SOURCE=vkb)
!
ps(:,:) = ( 0.D0, 0.D0 )
!
DO nt = 1, nsp
IF ( upf(nt)%tvanp ) THEN
! qq is real: copy it into a complex variable to perform
! a zgemm - simple but sub-optimal solution
ALLOCATE( qqc(nh(nt),nh(nt)) )
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
qqc(:,:) = CMPLX ( qq_at(1:nh(nt),1:nh(nt),na), 0.0_dp, KIND=dp )
CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
qqc, nh(nt), becp%k(indv_ijkb0(na)+1,1), nkb, &
(0.0_dp,0.0_dp), ps(indv_ijkb0(na)+1,1), nkb )
!
END IF
END DO
DEALLOCATE (qqc)
END IF
END DO
ps_d = ps
vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
!
IF ( m == 1 ) THEN
!
CALL ZGEMV( 'N', n, nkb, ( 1.D0, 0.D0 ), vkb_d, &
lda, ps_d, 1, ( 1.D0, 0.D0 ), spsi_d, 1 )
!
ELSE
!
CALL ZGEMM( 'N', 'N', n, m, nkb, ( 1.D0, 0.D0 ), vkb_d, &
lda, ps_d, nkb, ( 1.D0, 0.D0 ), spsi_d, lda )
!
END IF
!
DEALLOCATE( ps )
CALL buffer%release_buffer(ps_d, ierr)
CALL buffer%release_buffer(vkb_d, ierr)
!DEALLOCATE(vkb_d)
!
RETURN
!
END SUBROUTINE s_psi_k_gpu
!
!
!-----------------------------------------------------------------------
SUBROUTINE s_psi_nc_gpu ( )
!-----------------------------------------------------------------------
!
! ... k-points noncolinear/spinorbit version
!
IMPLICIT NONE
!
! here the local variables
!
INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ipol, ierr
! counters
COMPLEX (DP), ALLOCATABLE :: ps (:,:,:)
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: ps_d(:,:,:)
COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: vkb_d(:,:)
! the product vkb and psi
!
ALLOCATE (ps(nkb,npol,m),STAT=ierr)
IF( ierr /= 0 ) &
CALL errore( ' s_psi_nc ', ' cannot allocate memory (ps) ', ABS(ierr) )
ps(:,:,:) = (0.D0,0.D0)
!
DO nt = 1, nsp
!
IF ( upf(nt)%tvanp ) THEN
!
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
DO ih = 1,nh(nt)
ikb = indv_ijkb0(na) + ih
DO jh = 1, nh (nt)
jkb = indv_ijkb0(na) + jh
IF ( .NOT. lspinorb ) THEN
DO ipol=1,npol
DO ibnd = 1, m
ps(ikb,ipol,ibnd) = ps(ikb,ipol,ibnd) + &
qq_at(ih,jh,na)*becp%nc(jkb,ipol,ibnd)
END DO
END DO
ELSE
DO ibnd = 1, m
ps(ikb,1,ibnd)=ps(ikb,1,ibnd) + &
qq_so(ih,jh,1,nt)*becp%nc(jkb,1,ibnd)+ &
qq_so(ih,jh,2,nt)*becp%nc(jkb,2,ibnd)
ps(ikb,2,ibnd)=ps(ikb,2,ibnd) + &
qq_so(ih,jh,3,nt)*becp%nc(jkb,1,ibnd)+ &
qq_so(ih,jh,4,nt)*becp%nc(jkb,2,ibnd)
END DO
END IF
END DO
END DO
END IF
END DO
END IF
END DO
CALL buffer%lock_buffer(ps_d, (/ nkb,npol,m /), ierr)
CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
ps_d = ps
vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
call ZGEMM ('N', 'N', n, m*npol, nkb, (1.d0, 0.d0) , vkb_d, &
lda, ps_d, nkb, (1.d0, 0.d0) , spsi_d(1,1), lda)
DEALLOCATE(ps)
CALL buffer%release_buffer(ps_d, ierr)
CALL buffer%release_buffer(vkb_d, ierr)
RETURN
END SUBROUTINE s_psi_nc_gpu
END SUBROUTINE s_psi__gpu
!@nje
SUBROUTINE myDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
use cudafor
use cublas
implicit none
DOUBLE PRECISION ALPHA,BETA
INTEGER INCX,INCY,LDA,M,N
CHARACTER TRANS
DOUBLE PRECISION, DEVICE :: A(LDA,*),X(*),Y(*)
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE myDGEMV
SUBROUTINE s_psi_gpu_compatibility( lda, n, m, psi_d, spsi_d )
USE kinds, ONLY : DP
USE noncollin_module, ONLY : npol
USE cudafor
@ -32,5 +485,5 @@ SUBROUTINE s_psi_gpu( lda, n, m, psi_d, spsi_d )
DEALLOCATE(psi, spsi )
END SUBROUTINE s_psi_gpu
END SUBROUTINE s_psi_gpu_compatibility
#endif

View File

@ -72,6 +72,9 @@ MODULE scf
vrs(:,:), &! the total pot. in real space (smooth grid)
rho_core(:), &! the core charge in real space
kedtau(:,:) ! position dependent kinetic energy enhancement factor
#if defined(__CUDA)
attributes(pinned) :: vrs
#endif
COMPLEX(DP), ALLOCATABLE :: &
rhog_core(:) ! the core charge in reciprocal space

623
PW/src/vloc_psi_gpu.f90 Normal file
View File

@ -0,0 +1,623 @@
!
! Copyright (C) 2003-2013 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
#if defined(__CUDA)
!@njs: vloc_psi_gamma, psi, v, hpsi, psic
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_gamma_gpu(lda, n, m, psi_d, v_d, hpsi_d)
!-----------------------------------------------------------------------
!
! Calculation of Vloc*psi using dual-space technique - Gamma point
!
USE parallel_include
USE kinds, ONLY : DP
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dffts
USE fft_interfaces,ONLY : fwfft, invfft
USE wavefunctions_module, ONLY: psic_h => psic
USE wavefunctions_module_gpum, ONLY: psic_d
USE fft_helper_subroutines
USE buffers_module, ONLY : buffer
!
IMPLICIT NONE
!
INTEGER, INTENT(in) :: lda, n, m
COMPLEX(DP), DEVICE, INTENT(in) :: psi_d (lda, m)
COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda, m)
REAL(DP), DEVICE, INTENT(in) :: v_d(dffts%nnr)
!
INTEGER :: ibnd, j, incr, right_nnr, right_nr3, right_inc
COMPLEX(DP) :: fp, fm
!
LOGICAL :: use_tg
! Variables for task groups
!@njs: tg_v, tg_psic
REAL(DP), DEVICE, POINTER :: tg_v_d(:)
COMPLEX(DP), DEVICE, POINTER :: tg_psic_d(:)
INTEGER, DEVICE, POINTER :: dffts_nl_d(:), dffts_nlm_d(:)
INTEGER :: v_siz, idx, ioff
INTEGER :: ierr
!
CALL start_clock ('vloc_psi')
incr = 2
!
IF (.not. allocated(psic_d)) ALLOCATE(psic_d, SOURCE=psic_h)
use_tg = dffts%has_task_groups
!
IF( use_tg ) THEN
!
CALL start_clock ('vloc_psi:tg_gather')
v_siz = dffts%nnr_tg
!
CALL buffer%lock_buffer( tg_v_d, v_siz, ierr )
CALL buffer%lock_buffer( tg_psic_d, v_siz, ierr )
!
CALL tg_gather_gpu( dffts, v_d, tg_v_d )
CALL stop_clock ('vloc_psi:tg_gather')
!
incr = 2 * fftx_ntgrp(dffts)
!
ENDIF
! Sync fft data
CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl), ierr )
CALL buffer%lock_buffer( dffts_nlm_d, size(dffts%nlm), ierr )
dffts_nl_d(1:n) = dffts%nl(1:n)
dffts_nlm_d(1:n) = dffts%nlm(1:n)
! End Sync
!
! the local potential V_Loc psi. First bring psi to real space
!
DO ibnd = 1, m, incr
!
IF( use_tg ) THEN
!
CALL tg_get_nnr( dffts, right_nnr )
!
tg_psic_d = (0.d0, 0.d0)
ioff = 0
!
DO idx = 1, 2*fftx_ntgrp(dffts), 2
IF( idx + ibnd - 1 < m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
tg_psic_d(dffts_nl_d (j)+ioff) = psi_d(j,idx+ibnd-1) + &
(0.0d0,1.d0) * psi_d(j,idx+ibnd)
tg_psic_d(dffts_nlm_d(j)+ioff) = conjg( psi_d(j,idx+ibnd-1) - &
(0.0d0,1.d0) * psi_d(j,idx+ibnd) )
ENDDO
ELSEIF( idx + ibnd - 1 == m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
tg_psic_d(dffts_nl_d (j)+ioff) = psi_d(j,idx+ibnd-1)
tg_psic_d(dffts_nlm_d(j)+ioff) = conjg( psi_d(j,idx+ibnd-1) )
ENDDO
ENDIF
ioff = ioff + right_nnr
ENDDO
!
ELSE
!
psic_d(:) = (0.d0, 0.d0)
IF (ibnd < m) THEN
! two ffts at the same time
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
psic_d(dffts_nl_d (j))= psi_d(j,ibnd) + (0.0d0,1.d0)*psi_d(j,ibnd+1)
psic_d(dffts_nlm_d(j))=conjg(psi_d(j,ibnd) - (0.0d0,1.d0)*psi_d(j,ibnd+1))
ENDDO
ELSE
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
psic_d (dffts_nl_d (j)) = psi_d(j, ibnd)
psic_d (dffts_nlm_d(j)) = conjg(psi_d(j, ibnd))
ENDDO
ENDIF
!
ENDIF
!
! fft to real space
! product with the potential v on the smooth grid
! back to reciprocal space
!
IF( use_tg ) THEN
!
CALL invfft ('tgWave', tg_psic_d, dffts )
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
!$cuf kernel do(1) <<<,>>>
DO j = 1, dffts%nr1x * dffts%nr2x * right_nr3
tg_psic_d (j) = tg_psic_d (j) * tg_v_d(j)
ENDDO
!
CALL fwfft ('tgWave', tg_psic_d, dffts )
!
ELSE
!
CALL invfft ('Wave', psic_d, dffts)
!
!$cuf kernel do(1) <<<,>>>
DO j = 1, dffts%nnr
psic_d (j) = psic_d (j) * v_d(j)
ENDDO
!
CALL fwfft ('Wave', psic_d, dffts)
!
ENDIF
!
! addition to the total product
!
IF( use_tg ) THEN
!
ioff = 0
!
CALL tg_get_recip_inc( dffts, right_inc )
!
DO idx = 1, 2*fftx_ntgrp(dffts), 2
!
IF( idx + ibnd - 1 < m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
fp= ( tg_psic_d( dffts_nl_d(j) + ioff ) + &
tg_psic_d( dffts_nlm_d(j) + ioff ) ) * 0.5d0
fm= ( tg_psic_d( dffts_nl_d(j) + ioff ) - &
tg_psic_d( dffts_nlm_d(j) + ioff ) ) * 0.5d0
hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
cmplx( dble(fp), aimag(fm),kind=DP)
hpsi_d (j, ibnd+idx ) = hpsi_d (j, ibnd+idx ) + &
cmplx(aimag(fp),- dble(fm),kind=DP)
ENDDO
ELSEIF( idx + ibnd - 1 == m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
tg_psic_d( dffts_nl_d(j) + ioff )
ENDDO
ENDIF
!
ioff = ioff + right_inc
!
ENDDO
!
ELSE
IF (ibnd < m) THEN
! two ffts at the same time
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
fp = (psic_d (dffts_nl_d(j)) + psic_d (dffts_nlm_d(j)))*0.5d0
fm = (psic_d (dffts_nl_d(j)) - psic_d (dffts_nlm_d(j)))*0.5d0
hpsi_d (j, ibnd) = hpsi_d (j, ibnd) + &
cmplx( dble(fp), aimag(fm),kind=DP)
hpsi_d (j, ibnd+1) = hpsi_d (j, ibnd+1) + &
cmplx(aimag(fp),- dble(fm),kind=DP)
ENDDO
ELSE
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
hpsi_d (j, ibnd) = hpsi_d (j, ibnd) + psic_d (dffts_nl_d(j))
ENDDO
ENDIF
ENDIF
!
ENDDO
!
IF( use_tg ) THEN
!
CALL buffer%release_buffer( tg_psic_d, ierr )
CALL buffer%release_buffer( tg_v_d, ierr )
!
ENDIF
CALL buffer%release_buffer( dffts_nl_d, ierr )
CALL buffer%release_buffer( dffts_nlm_d, ierr )
CALL stop_clock ('vloc_psi')
!
RETURN
END SUBROUTINE vloc_psi_gamma_gpu
!
!@njs: vloc_psi_k
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_k_gpu(lda, n, m, psi_d, v_d, hpsi_d)
!-----------------------------------------------------------------------
!
! Calculation of Vloc*psi using dual-space technique - k-points
!
! fft to real space
! product with the potential v on the smooth grid
! back to reciprocal space
! addition to the hpsi
!
USE parallel_include
USE kinds, ONLY : DP
USE wvfct, ONLY : current_k
USE klist, ONLY : igk_k_host => igk_k
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dffts
USE fft_interfaces,ONLY : fwfft, invfft
USE fft_helper_subroutines
USE wavefunctions_module, ONLY: psic_h => psic
USE wavefunctions_module_gpum, ONLY: psic_d
USE buffers_module, ONLY : buffer
!
IMPLICIT NONE
!
INTEGER, INTENT(in) :: lda, n, m
COMPLEX(DP), DEVICE, INTENT(in) :: psi_d (lda, m)
COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda, m)
REAL(DP), DEVICE, INTENT(in) :: v_d(dffts%nnr)
!
INTEGER :: ibnd, j, incr
INTEGER :: i, right_nnr, right_nr3, right_inc
!
LOGICAL :: use_tg
! Task Groups
REAL(DP), DEVICE, POINTER :: tg_v_d(:)
COMPLEX(DP), DEVICE, POINTER :: tg_psic_d(:)
INTEGER, DEVICE, POINTER :: dffts_nl_d(:)
INTEGER, DEVICE, POINTER :: igk_k_d(:)
INTEGER :: v_siz, idx, ioff
INTEGER :: ierr
!
IF (.not. allocated(psic_d)) ALLOCATE(psic_d, SOURCE=psic_h)
CALL start_clock ('vloc_psi')
use_tg = dffts%has_task_groups
!
IF( use_tg ) THEN
!
CALL start_clock ('vloc_psi:tg_gather')
v_siz = dffts%nnr_tg
!
CALL buffer%lock_buffer( tg_v_d, v_siz, ierr )
CALL buffer%lock_buffer( tg_psic_d, v_siz, ierr )
!
CALL tg_gather( dffts, v_d, tg_v_d )
CALL stop_clock ('vloc_psi:tg_gather')
ENDIF
CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl) , ierr )
CALL buffer%lock_buffer( igk_k_d, n, ierr )
!
dffts_nl_d = dffts%nl
igk_k_d(1:n) = igk_k_host(1:n, current_k)
!
IF( use_tg ) THEN
CALL tg_get_nnr( dffts, right_nnr )
DO ibnd = 1, m, fftx_ntgrp(dffts)
!
tg_psic_d = (0.d0, 0.d0)
ioff = 0
!
DO idx = 1, fftx_ntgrp(dffts)
IF( idx + ibnd - 1 <= m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
tg_psic_d(dffts_nl_d (igk_k_d(j))+ioff) = psi_d(j,idx+ibnd-1)
ENDDO
ENDIF
!write (6,*) 'wfc G ', idx+ibnd-1
!write (6,99) (tg_psic(i+ioff), i=1,400)
ioff = ioff + right_nnr
ENDDO
!
CALL invfft ('tgWave', tg_psic_d, dffts )
!write (6,*) 'wfc R '
!write (6,99) (tg_psic(i), i=1,400)
!
CALL tg_get_group_nr3( dffts, right_nr3 )
!
!$cuf kernel do(1) <<<,>>>
DO j = 1, dffts%nr1x*dffts%nr2x* right_nr3
tg_psic_d (j) = tg_psic_d (j) * tg_v_d(j)
ENDDO
!
!write (6,*) 'v psi R '
!write (6,99) (tg_psic(i), i=1,400)
!
CALL fwfft ('tgWave', tg_psic_d, dffts )
!
! addition to the total product
!
ioff = 0
!
CALL tg_get_recip_inc( dffts, right_inc )
!
DO idx = 1, fftx_ntgrp(dffts)
!
IF( idx + ibnd - 1 <= m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
tg_psic_d( dffts_nl_d(igk_k_d(j)) + ioff )
ENDDO
!
ENDIF
!
!write (6,*) 'v psi G ', idx+ibnd-1
!write (6,99) (tg_psic(i+ioff), i=1,400)
ioff = ioff + right_inc
!
ENDDO
!
ENDDO
ELSE
DO ibnd = 1, m
!
!!! == OPTIMIZE HERE == (setting to 0 and setting elements!)
psic_d(:) = (0.d0, 0.d0)
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
psic_d (dffts_nl_d (igk_k_d(j))) = psi_d(j, ibnd)
END DO
!
!write (6,*) 'wfc G ', ibnd
!write (6,99) (psic(i), i=1,400)
!
CALL invfft ('Wave', psic_d, dffts)
!write (6,*) 'wfc R '
!write (6,99) (psic(i), i=1,400)
!
!$cuf kernel do(1) <<<,>>>
DO j = 1, dffts%nnr
psic_d (j) = psic_d (j) * v_d(j)
ENDDO
!
!write (6,*) 'v psi R '
!write (6,99) (psic(i), i=1,400)
!
CALL fwfft ('Wave', psic_d, dffts)
!
! addition to the total product
!
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
hpsi_d (j, ibnd) = hpsi_d (j, ibnd) + psic_d (dffts_nl_d(igk_k_d(j)))
ENDDO
!
!write (6,*) 'v psi G ', ibnd
!write (6,99) (psic(i), i=1,400)
!
ENDDO
ENDIF
!
IF( use_tg ) THEN
!
CALL buffer%release_buffer( tg_psic_d, ierr )
CALL buffer%release_buffer( tg_v_d, ierr )
!
ENDIF
CALL buffer%release_buffer( dffts_nl_d, ierr )
CALL buffer%release_buffer( igk_k_d, ierr )
CALL stop_clock ('vloc_psi')
!
99 format ( 20 ('(',2f12.9,')') )
RETURN
END SUBROUTINE vloc_psi_k_gpu
!
!@njs: vloc_psi_nc
!-----------------------------------------------------------------------
SUBROUTINE vloc_psi_nc_gpu (lda, n, m, psi_d, v_d, hpsi_d)
!-----------------------------------------------------------------------
!
! Calculation of Vloc*psi using dual-space technique - noncolinear
!
USE parallel_include
USE kinds, ONLY : DP
USE wvfct, ONLY : current_k
USE klist, ONLY : igk_k_host => igk_k
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dffts, dfftp
USE fft_interfaces,ONLY : fwfft, invfft
USE lsda_mod, ONLY : nspin
USE spin_orb, ONLY : domag
USE noncollin_module, ONLY: npol
USE wavefunctions_module, ONLY: psic_nc_h => psic_nc
USE wavefunctions_module_gpum, ONLY: psic_nc_d
USE fft_helper_subroutines
USE buffers_module, ONLY : buffer
!
IMPLICIT NONE
!
INTEGER, INTENT(in) :: lda, n, m
REAL(DP), DEVICE, INTENT(in) :: v_d(dfftp%nnr,4) ! beware dimensions!
COMPLEX(DP), DEVICE, INTENT(in) :: psi_d (lda*npol, m)
COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda,npol,m)
INTEGER, DEVICE, POINTER :: dffts_nl_d(:)
INTEGER, DEVICE, POINTER :: igk_k_d(:)
!
INTEGER :: ibnd, j,ipol, incr, is
COMPLEX(DP) :: sup, sdwn
!
LOGICAL :: use_tg
! Variables for task groups
REAL(DP), DEVICE, ALLOCATABLE :: tg_v_d(:,:)
COMPLEX(DP), DEVICE, ALLOCATABLE :: tg_psic_d(:,:)
INTEGER :: v_siz, idx, ioff
INTEGER :: right_nnr, right_nr3, right_inc
INTEGER :: ierr
!
CALL start_clock ('vloc_psi')
!
IF (.not. allocated(psic_nc_d)) ALLOCATE(psic_nc_d, SOURCE=psic_nc_h)
incr = 1
!
use_tg = dffts%has_task_groups
!
IF( use_tg ) THEN
CALL start_clock ('vloc_psi:tg_gather')
v_siz = dffts%nnr_tg
IF (domag) THEN
ALLOCATE( tg_v_d( v_siz, 4 ) )
DO is=1,nspin
CALL tg_gather( dffts, v_d(:,is), tg_v_d(:,is) )
ENDDO
ELSE
ALLOCATE( tg_v_d( v_siz, 1 ) )
CALL tg_gather( dffts, v_d(:,1), tg_v_d(:,1) )
ENDIF
ALLOCATE( tg_psic_d( v_siz, npol ) )
CALL stop_clock ('vloc_psi:tg_gather')
incr = fftx_ntgrp(dffts)
ENDIF
CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl), ierr )
CALL buffer%lock_buffer( igk_k_d, n, ierr )
dffts_nl_d = dffts%nl
igk_k_d(1:n) = igk_k_host(1:n, current_k)
!
! the local potential V_Loc psi. First the psi in real space
!
DO ibnd = 1, m, incr
IF( use_tg ) THEN
!
CALL tg_get_nnr( dffts, right_nnr )
!
DO ipol = 1, npol
!
! == OPTIMIZE HERE == setting data twice
tg_psic_d(:,ipol) = ( 0.D0, 0.D0 )
ioff = 0
!
DO idx = 1, fftx_ntgrp(dffts)
!
IF( idx + ibnd - 1 <= m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
tg_psic_d( dffts_nl_d( igk_k_d(j) ) + ioff, ipol ) = &
psi_d( j +(ipol-1)*lda, idx+ibnd-1 )
ENDDO
ENDIF
ioff = ioff + right_nnr
ENDDO
!
CALL invfft ('tgWave', tg_psic_d(:,ipol), dffts )
!
ENDDO
!
ELSE
psic_nc_d = (0.d0,0.d0)
DO ipol=1,npol
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
psic_nc_d(dffts_nl_d(igk_k_d(j)),ipol) = psi_d(j+(ipol-1)*lda,ibnd)
ENDDO
CALL invfft ('Wave', psic_nc_d(:,ipol), dffts)
ENDDO
ENDIF
!
! product with the potential v = (vltot+vr) on the smooth grid
!
IF( use_tg ) THEN
CALL tg_get_group_nr3( dffts, right_nr3 )
IF (domag) THEN
!$cuf kernel do(1) <<<,>>>
DO j=1, dffts%nr1x*dffts%nr2x*right_nr3
sup = tg_psic_d(j,1) * (tg_v_d(j,1)+tg_v_d(j,4)) + &
tg_psic_d(j,2) * (tg_v_d(j,2)-(0.d0,1.d0)*tg_v_d(j,3))
sdwn = tg_psic_d(j,2) * (tg_v_d(j,1)-tg_v_d(j,4)) + &
tg_psic_d(j,1) * (tg_v_d(j,2)+(0.d0,1.d0)*tg_v_d(j,3))
tg_psic_d(j,1)=sup
tg_psic_d(j,2)=sdwn
ENDDO
ELSE
!$cuf kernel do(1) <<<,>>>
DO j=1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic_d(j,:) = tg_psic_d(j,:) * tg_v_d(j,1)
ENDDO
ENDIF
ELSE
IF (domag) THEN
!$cuf kernel do(1) <<<,>>>
DO j=1, dffts%nnr
sup = psic_nc_d(j,1) * (v_d(j,1)+v_d(j,4)) + &
psic_nc_d(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3))
sdwn = psic_nc_d(j,2) * (v_d(j,1)-v_d(j,4)) + &
psic_nc_d(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3))
psic_nc_d(j,1)=sup
psic_nc_d(j,2)=sdwn
ENDDO
ELSE
!$cuf kernel do(1) <<<,>>>
DO j=1, dffts%nnr
psic_nc_d(j,:) = psic_nc_d(j,:) * v_d(j,1)
ENDDO
ENDIF
ENDIF
!
! back to reciprocal space
!
IF( use_tg ) THEN
!
DO ipol = 1, npol
CALL fwfft ('tgWave', tg_psic_d(:,ipol), dffts )
!
ioff = 0
!
CALL tg_get_recip_inc( dffts, right_inc )
!
DO idx = 1, fftx_ntgrp(dffts)
!
IF( idx + ibnd - 1 <= m ) THEN
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
hpsi_d (j, ipol, ibnd+idx-1) = hpsi_d (j, ipol, ibnd+idx-1) + &
tg_psic_d( dffts_nl_d(igk_k_d(j)) + ioff, ipol )
ENDDO
ENDIF
!
ioff = ioff + right_inc
!
ENDDO
ENDDO
!
ELSE
DO ipol=1,npol
CALL fwfft ('Wave', psic_nc_d(:,ipol), dffts)
ENDDO
!
! addition to the total product
!
DO ipol=1,npol
!$cuf kernel do(1) <<<,>>>
DO j = 1, n
hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + &
psic_nc_d(dffts_nl_d(igk_k_d(j)),ipol)
ENDDO
ENDDO
ENDIF
ENDDO
IF( use_tg ) THEN
!
DEALLOCATE(tg_v_d, tg_psic_d)
!
ENDIF
CALL buffer%release_buffer( dffts_nl_d, ierr )
CALL buffer%release_buffer( igk_k_d, ierr )
CALL stop_clock ('vloc_psi')
!
RETURN
END SUBROUTINE vloc_psi_nc_gpu
!@nje
#endif