Fix CG with k points (CUF-->OpenACC)

This commit is contained in:
Ivan Carnimeo 2024-07-17 08:58:47 +02:00
parent cce06d5692
commit aa2b133609
4 changed files with 174 additions and 103 deletions

View File

@ -44,7 +44,7 @@ END FUNCTION KSDdot
!
!----------------------------------------------------------------------------
SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
npwx, npw, nbnd, npol, psi_d, e_d, btype, &
npwx, npw, nbnd, npol, psi, e_d, btype, &
ethr, maxter, reorder, notconv, avg_iter )
!----------------------------------------------------------------------------
!
@ -77,21 +77,22 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
INTEGER, INTENT(IN) :: npwx, npw, nbnd, npol, maxter
INTEGER, INTENT(IN) :: btype(nbnd)
REAL(DP), INTENT(IN) :: precondition_d(npwx*npol), ethr
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nbnd)
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nbnd)
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
INTEGER, INTENT(OUT) :: notconv
REAL(DP), INTENT(OUT) :: avg_iter
#if defined(__CUDA)
attributes(DEVICE) :: precondition_d, psi_d, e_d
attributes(DEVICE) :: precondition_d, e_d
#endif
!
! ... local variables
!
INTEGER :: i, j, k, m, m_start, m_end, iter, moved
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:)
COMPLEX(DP), ALLOCATABLE :: scg_d(:), ppsi_d(:), g0_d(:), lagrange_d(:)
COMPLEX(DP), ALLOCATABLE :: hpsi(:), spsi(:), g(:), cg(:)
COMPLEX(DP), ALLOCATABLE :: scg(:), ppsi(:), g0_d(:), lagrange_d(:)
!$acc declare device_resident(g, scg, hpsi, spsi, cg, ppsi)
#if defined(__CUDA)
attributes(DEVICE) :: hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d, lagrange_d
attributes(DEVICE) :: g0_d, lagrange_d
#endif
COMPLEX(DP), ALLOCATABLE :: lagrange(:)
REAL(DP) :: gamma, ddot_temp, es_1, es(2)
@ -127,24 +128,24 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
kdim2 = 2 * kdim
!
ALLOCATE( hpsi_d(kdmx), STAT=ierr )
ALLOCATE( hpsi(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate hpsi_d ', ABS(ierr) )
ALLOCATE( spsi_d(kdmx), STAT=ierr )
CALL errore( ' ccgdiagg ',' cannot allocate hpsi ', ABS(ierr) )
ALLOCATE( spsi(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate spsi_d ', ABS(ierr) )
ALLOCATE( g_d(kdmx), STAT=ierr )
CALL errore( ' ccgdiagg ',' cannot allocate spsi ', ABS(ierr) )
ALLOCATE( g(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate g_d ', ABS(ierr) )
ALLOCATE( cg_d(kdmx), STAT=ierr )
CALL errore( ' ccgdiagg ',' cannot allocate g ', ABS(ierr) )
ALLOCATE( cg(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate cg_d ', ABS(ierr) )
ALLOCATE( scg_d(kdmx), STAT=ierr )
CALL errore( ' ccgdiagg ',' cannot allocate cg ', ABS(ierr) )
ALLOCATE( scg(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate scg_d ', ABS(ierr) )
ALLOCATE( ppsi_d(kdmx), STAT=ierr )
CALL errore( ' ccgdiagg ',' cannot allocate scg ', ABS(ierr) )
ALLOCATE( ppsi(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate ppsi_d ', ABS(ierr) )
CALL errore( ' ccgdiagg ',' cannot allocate ppsi ', ABS(ierr) )
ALLOCATE( g0_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate g0_d ', ABS(ierr) )
@ -176,26 +177,44 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
END IF
!
CALL dev_memset( spsi_d , ZERO )
CALL dev_memset( scg_d , ZERO )
CALL dev_memset( hpsi_d , ZERO )
CALL dev_memset( g_d , ZERO )
CALL dev_memset( cg_d , ZERO )
!$acc host_data use_device(spsi)
CALL dev_memset( spsi , ZERO )
!$acc end host_data
!$acc host_data use_device(hpsi)
CALL dev_memset( hpsi , ZERO )
!$acc end host_data
!$acc host_data use_device(scg)
CALL dev_memset( scg , ZERO )
!$acc end host_data
!$acc host_data use_device(g)
CALL dev_memset( g , ZERO )
!$acc end host_data
!$acc host_data use_device(cg)
CALL dev_memset( cg , ZERO )
!$acc end host_data
CALL dev_memset( g0_d , ZERO )
CALL dev_memset( ppsi_d , ZERO )
!$acc host_data use_device(ppsi)
CALL dev_memset( ppsi , ZERO )
!$acc end host_data
CALL dev_memset( lagrange_d , ZERO )
!
! ... calculate S|psi>
!
CALL s_1psi_ptr( npwx, npw, psi_d(1,m), spsi_d )
write(*,*) '@@4.1'
CALL s_1psi_ptr( npwx, npw, psi(1,m), spsi )
write(*,*) '@@4.2'
!
! ... orthogonalize starting eigenfunction to those already calculated
!
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
lagrange = ZERO
if(m_start.le.m_end) &
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
kdmx, spsi_d, 1, ZERO, lagrange_d(m_start), 1 )
IF(m_start.le.m_end) THEN
!$acc host_data use_device(psi, spsi)
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi(1,m_start), &
kdmx, spsi, 1, ZERO, lagrange_d(m_start), 1 )
!$acc end host_data
END IF
write(*,*) '@@4.3'
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
@ -206,34 +225,42 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, m - 1
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
!
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
psi(i,m) = psi(i,m) - lagrange_d(j) * psi(i,j)
!
END DO
!$acc end kernels
!
psi_norm = psi_norm - ( DBLE( lagrange(j) )**2 + AIMAG( lagrange(j) )**2 )
!
END DO
write(*,*) '@@4.4'
!
psi_norm = SQRT( psi_norm )
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
psi_d(i,m) = psi_d(i,m) / psi_norm
psi(i,m) = psi(i,m) / psi_norm
END DO
!$acc end kernels
!
write(*,*) '@@4.5'
! ... calculate starting gradient (|hpsi> = H|psi>) ...
!
CALL hs_1psi_ptr( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
CALL hs_1psi_ptr( npwx, npw, psi(1,m), hpsi, spsi )
!
write(*,*) '@@4.6'
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
!
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
!
e(m) = ksDdot( kdim2, psi_d(1,m), 1, hpsi_d, 1 )
!$acc host_data use_device(psi, hpsi)
e(m) = ksDdot( kdim2, psi(1,m), 1, hpsi, 1 )
!$acc end host_data
!
write(*,*) '@@4.7'
CALL mp_sum( e(m), intra_bgrp_comm )
!
! ... start iteration for this band
@ -243,26 +270,32 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
! ... calculate P (PHP)|y>
! ... ( P = preconditioning matrix, assumed diagonal )
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
g_d(i) = hpsi_d(i) / precondition_d(i)
ppsi_d(i) = spsi_d(i) / precondition_d(i)
g(i) = hpsi(i) / precondition_d(i)
ppsi(i) = spsi(i) / precondition_d(i)
END DO
!$acc end kernels
write(*,*) '@@4.8'
!
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
!
es(1) = ksDdot( kdim2, spsi_d(1), 1, g_d(1), 1 )
es(2) = ksDdot( kdim2, spsi_d(1), 1, ppsi_d(1), 1 )
!$acc host_data use_device(spsi, g, ppsi)
es(1) = ksDdot( kdim2, spsi(1), 1, g(1), 1 )
es(2) = ksDdot( kdim2, spsi(1), 1, ppsi(1), 1 )
!$acc end host_data
write(*,*) '@@4.9'
!
CALL mp_sum( es , intra_bgrp_comm )
!
es(1) = es(1) / es(2)
es_1 = es(1)
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
g_d(i) = g_d(i) - es_1 * ppsi_d(i)
g(i) = g(i) - es_1 * ppsi(i)
END DO
!$acc end kernels
!
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
! ... <g| S P^2|y> = 0
@ -270,13 +303,19 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
! ... scg is used as workspace
!
CALL s_1psi_ptr( npwx, npw, g_d(1), scg_d(1) )
write(*,*) '@@4.10'
CALL s_1psi_ptr( npwx, npw, g(1), scg(1) )
write(*,*) '@@4.11'
!
lagrange(1:m-1) = ZERO
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
if(m_start.le.m_end) &
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
kdmx, scg_d, 1, ZERO, lagrange_d(m_start), 1 )
IF(m_start.le.m_end) THEN
!$acc host_data use_device(psi, scg)
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi(1,m_start), &
kdmx, scg, 1, ZERO, lagrange_d(m_start), 1 )
!$acc end host_data
END IF
write(*,*) '@@4.12'
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
!
@ -286,19 +325,24 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
DO j = 1, ( m - 1 )
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
g_d(i) = g_d(i) - lagrange_d(j) * psi_d(i,j)
scg_d(i) = scg_d(i) - lagrange_d(j) * psi_d(i,j)
g(i) = g(i) - lagrange_d(j) * psi(i,j)
scg(i) = scg(i) - lagrange_d(j) * psi(i,j)
END DO
!$acc end kernels
!
END DO
write(*,*) '@@4.11'
!
IF ( iter /= 1 ) THEN
!
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
gg1 = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
!$acc host_data use_device(g)
gg1 = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
!$acc end host_data
write(*,*) '@@4.12'
!
CALL mp_sum( gg1, intra_bgrp_comm )
!
@ -306,12 +350,17 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
! ... gg is <g(n+1)|S|g(n+1)>
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
g0_d(i) = scg_d(i) * precondition_d(i)
g0_d(i) = scg(i) * precondition_d(i)
END DO
!$acc end kernels
write(*,*) '@@4.13'
!
gg = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
!$acc host_data use_device(g)
gg = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
!$acc end host_data
write(*,*) '@@4.14'
!
CALL mp_sum( gg, intra_bgrp_comm )
!
@ -321,10 +370,12 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
gg0 = gg
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
cg_d(i) = g_d(i)
cg(i) = g(i)
END DO
!$acc end kernels
write(*,*) '@@4.15'
!
ELSE
!
@ -339,7 +390,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
! See comment below
!!DO i = 1, kdmx
!! cg_d(i) = g_d(i) + cg_d(i) * gamma
!! cg(i) = g(i) + cg(i) * gamma
!!END DO
!
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
@ -348,11 +399,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
psi_norm = gamma * cg0 * sint
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
! v== this breaks the logic, done here for performance
cg_d(i) = (g_d(i) + cg_d(i) * gamma) - psi_norm * psi_d(i,m)
cg(i) = (g(i) + cg(i) * gamma) - psi_norm * psi(i,m)
END DO
!$acc end kernels
write(*,*) '@@4.16'
!
END IF
!
@ -360,10 +413,15 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
! ... |scg> is S|cg>
!
CALL hs_1psi_ptr( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
write(*,*) '@@4.17'
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
write(*,*) '@@4.18'
!
cg0 = ksDdot( kdim2, cg_d(1), 1, scg_d(1), 1 )
!$acc host_data use_device(scg, cg)
cg0 = ksDdot( kdim2, cg(1), 1, scg(1), 1 )
!$acc end host_data
!
write(*,*) '@@4.19'
CALL mp_sum( cg0 , intra_bgrp_comm )
!
cg0 = SQRT( cg0 )
@ -376,11 +434,16 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
! ... so that the result is correctly normalized :
! ... <y(t)|P^2S|y(t)> = 1
!
a0 = 2.D0 * ksDdot( kdim2, psi_d(1,m), 1, ppsi_d(1), 1 ) / cg0
!$acc host_data use_device(psi, ppsi)
a0 = 2.D0 * ksDdot( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0
!$acc end host_data
!
write(*,*) '@@4.20'
CALL mp_sum( a0 , intra_bgrp_comm )
!
b0 = ksDdot( kdim2, cg_d(1), 1, ppsi_d(1), 1 ) / cg0**2
!$acc host_data use_device(cg, ppsi)
b0 = ksDdot( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2
!$acc end host_data
!
CALL mp_sum( b0 , intra_bgrp_comm )
!
@ -413,10 +476,11 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
e(m) = MIN( es(1), es(2) )
! ... upgrade |psi>
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
psi(i,m) = cost * psi(i,m) + sint / cg0 * cg(i)
END DO
!$acc end kernels
!
! ... here one could test convergence on the energy
!
@ -424,12 +488,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
! ... upgrade H|psi> and S|psi>
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO i = 1, kdmx
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
spsi(i) = cost * spsi(i) + sint / cg0 * scg(i)
!
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
hpsi(i) = cost * hpsi(i) + sint / cg0 * ppsi(i)
END DO
!$acc end kernels
!
END DO iterate
!
@ -470,28 +535,31 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
!
e0 = e(m)
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO k = 1, kdmx
ppsi_d(k) = psi_d(k,m)
ppsi(k) = psi(k,m)
END DO
!$acc end kernels
!
DO j = m, i + 1, - 1
!
e(j) = e(j-1)
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO k = 1, kdmx
psi_d(k,j) = psi_d(k,j-1)
psi(k,j) = psi(k,j-1)
END DO
!$acc end kernels
!
END DO
!
e(i) = e0
!
!$cuf kernel do(1) <<<*,*>>>
!$acc kernels
DO k = 1, kdmx
psi_d(k,i) = ppsi_d(k)
psi(k,i) = ppsi(k)
END DO
!$acc end kernels
!
! ... this procedure should be good if only a few inversions occur,
! ... extremely inefficient if eigenvectors are often in bad order
@ -511,13 +579,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
DEALLOCATE( lagrange )
DEALLOCATE( e )
DEALLOCATE( lagrange_d )
DEALLOCATE( ppsi_d )
DEALLOCATE( ppsi )
DEALLOCATE( g0_d )
DEALLOCATE( cg_d )
DEALLOCATE( g_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( scg_d )
DEALLOCATE( spsi_d )
DEALLOCATE( cg )
DEALLOCATE( g )
DEALLOCATE( hpsi )
DEALLOCATE( scg )
DEALLOCATE( spsi )
!
CALL stop_clock( 'ccgdiagg' )
!

View File

@ -687,6 +687,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
! --- Define a small number ---
INTEGER :: j
!
write(*,*) '@0'
!write (*,*) ' enter diag_bands_k'; FLUSH(6)
IF ( lelfield ) THEN
!
@ -747,6 +748,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
!
ntry = 0
!
write(*,*) '@1'
CG_loop : DO
!
IF ( isolve == 1 .OR. isolve == 2 ) THEN
@ -757,9 +759,11 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
IF ( .not. use_gpu ) THEN
CALL rotate_wfc( npwx, npw, nbnd, gstart, nbnd, evc, npol, okvan, evc, et(1,ik) )
ELSE
!$acc host_data use_device(evc, et)
write(*,*) '@2'
!$acc host_data use_device(et)
CALL rotate_wfc_gpu( npwx, npw, nbnd, gstart, nbnd, evc, npol, okvan, evc, et(1,ik) )
!$acc end host_data
write(*,*) '@3'
END IF
!
avg_iter = avg_iter + 1.D0
@ -773,12 +777,14 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
npwx, npw, nbnd, npol, evc, et(1,ik), btype(1,ik), &
ethr, max_cg_iter, .NOT. lscf, notconv, cg_iter )
ELSE
write(*,*) '@4'
CALL using_h_diag_d(0)
!$acc host_data use_device(evc, et)
!$acc host_data use_device(et)
CALL ccgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, h_diag_d, &
npwx, npw, nbnd, npol, evc, et(1,ik), btype(1,ik), &
ethr, max_cg_iter, .NOT. lscf, notconv, cg_iter )
!$acc end host_data
write(*,*) '@5'
END IF
!
avg_iter = avg_iter + cg_iter

View File

@ -6,7 +6,7 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
SUBROUTINE hs_1psi_gpu( lda, n, psi, hpsi, spsi )
!----------------------------------------------------------------------------
!! This routine applies the Hamiltonian and the S matrix
!! to a vector psi and puts the result in hpsi and spsi.
@ -26,10 +26,7 @@ SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: lda, n
COMPLEX (DP) :: psi_d(lda*npol,1), hpsi_d(n), spsi_d(n,1)
#if defined (__CUDA)
attributes(DEVICE) :: psi_d, hpsi_d, spsi_d
#endif
COMPLEX (DP) :: psi(lda*npol,1), hpsi(n), spsi(n,1)
COMPLEX (DP), ALLOCATABLE :: psi_h(:,:), spsi_h(:,:)
!
!
@ -39,8 +36,9 @@ SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
! makes it easier to debug probable errors, please do not "beautify"
if (real_space) then
ALLOCATE(psi_h(lda*npol,1), spsi_h(n,1))
psi_h(1:lda*npol,1) = psi_d(1:lda*npol,1)
CALL h_psi_gpu( lda, n, 1, psi_d, hpsi_d )
!$acc update self(psi)
psi_h(1:lda*npol,1) = psi(1:lda*npol,1)
CALL h_psi_gpu( lda, n, 1, psi, hpsi )
if (gamma_only) then
call invfft_orbital_gamma(psi_h,1,1) !transform the orbital to real space
call s_psir_gamma(1,1)
@ -50,11 +48,12 @@ SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
call s_psir_k(1,1)
call fwfft_orbital_k(spsi_h,1,1)
end if
spsi_d(1:n,1) = spsi_h(1:n,1)
spsi(1:n,1) = spsi_h(1:n,1)
!$acc update device(psi)
DEALLOCATE(psi_h, spsi_h)
else
CALL h_psi_gpu( lda, n, 1, psi_d, hpsi_d ) ! apply H to a single wfc (no bgrp parallelization here)
CALL s_psi_acc( lda, n, 1, psi_d, spsi_d ) ! apply S to a single wfc (no bgrp parallelization here)
CALL h_psi_gpu( lda, n, 1, psi, hpsi ) ! apply H to a single wfc (no bgrp parallelization here)
CALL s_psi_acc( lda, n, 1, psi, spsi ) ! apply S to a single wfc (no bgrp parallelization here)
endif
!
CALL stop_clock_gpu( 'hs_1psi' )

View File

@ -6,7 +6,7 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
SUBROUTINE s_1psi_gpu( npwx, n, psi, spsi )
!----------------------------------------------------------------------------
!! spsi = S*psi for one wavefunction. Wrapper routine - calls \(texttt{calbec}
!! and \texttt{s_psi}.
@ -15,7 +15,7 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
USE uspp, ONLY: nkb, vkb
USE becmod, ONLY: bec_type, becp
#if defined(__CUDA)
USE becmod, ONLY: calbec_cuf
USE becmod, ONLY: calbec
#endif
USE control_flags, ONLY: gamma_only, offload_type
USE noncollin_module, ONLY: noncolin, npol
@ -30,14 +30,11 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
!! maximum number of PW for wavefunctions
INTEGER :: n
!! the number of plane waves
COMPLEX(DP) :: psi_d(npwx*npol,1)
COMPLEX(DP) :: psi(npwx*npol,1)
!! input vector
COMPLEX(DP) :: spsi_d(npwx*npol,1)
COMPLEX(DP) :: spsi(npwx*npol,1)
!! S*psi
!
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, spsi_d
#endif
COMPLEX(DP), ALLOCATABLE :: psi_h(:,:), spsi_h(:,:)
! ... local variables
!
@ -49,8 +46,9 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
IF ( real_space) THEN
!
ALLOCATE(psi_h(npwx*npol,1), spsi_h(npwx*npol,1))
psi_h(1:npwx*npol,1) = psi_d(1:npwx*npol,1)
spsi_h(1:npwx*npol,1) = spsi_d(1:npwx*npol,1)
!$acc update self(psi, spsi)
psi_h(1:npwx*npol,1) = psi(1:npwx*npol,1)
spsi_h(1:npwx*npol,1) = spsi(1:npwx*npol,1)
IF ( gamma_only ) THEN
!
DO ibnd = 1, nbnd, 2
@ -77,15 +75,15 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
!
ENDIF
!
spsi_d(1:npwx*npol,1) = spsi_h(1:npwx*npol,1)
spsi(1:npwx*npol,1) = spsi_h(1:npwx*npol,1)
!$acc update device(spsi)
DEALLOCATE(psi_h, spsi_h)
ELSE
!
#if defined(__CUDA)
!civn: remove psi_d and use calbec instead
CAll calbec_cuf(offload_type, n, vkb, psi_d, becp )
CAll calbec(offload_type, n, vkb, psi, becp )
#endif
CALL s_psi_acc( npwx, n, 1, psi_d, spsi_d )
CALL s_psi_acc( npwx, n, 1, psi, spsi )
!
ENDIF