mirror of https://gitlab.com/QEF/q-e.git
Fix CG with k points (CUF-->OpenACC)
This commit is contained in:
parent
cce06d5692
commit
aa2b133609
|
@ -44,7 +44,7 @@ END FUNCTION KSDdot
|
|||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
||||
npwx, npw, nbnd, npol, psi_d, e_d, btype, &
|
||||
npwx, npw, nbnd, npol, psi, e_d, btype, &
|
||||
ethr, maxter, reorder, notconv, avg_iter )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
|
@ -77,21 +77,22 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
INTEGER, INTENT(IN) :: npwx, npw, nbnd, npol, maxter
|
||||
INTEGER, INTENT(IN) :: btype(nbnd)
|
||||
REAL(DP), INTENT(IN) :: precondition_d(npwx*npol), ethr
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nbnd)
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nbnd)
|
||||
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
|
||||
INTEGER, INTENT(OUT) :: notconv
|
||||
REAL(DP), INTENT(OUT) :: avg_iter
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: precondition_d, psi_d, e_d
|
||||
attributes(DEVICE) :: precondition_d, e_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: i, j, k, m, m_start, m_end, iter, moved
|
||||
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: scg_d(:), ppsi_d(:), g0_d(:), lagrange_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: hpsi(:), spsi(:), g(:), cg(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: scg(:), ppsi(:), g0_d(:), lagrange_d(:)
|
||||
!$acc declare device_resident(g, scg, hpsi, spsi, cg, ppsi)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d, lagrange_d
|
||||
attributes(DEVICE) :: g0_d, lagrange_d
|
||||
#endif
|
||||
COMPLEX(DP), ALLOCATABLE :: lagrange(:)
|
||||
REAL(DP) :: gamma, ddot_temp, es_1, es(2)
|
||||
|
@ -127,24 +128,24 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
kdim2 = 2 * kdim
|
||||
!
|
||||
ALLOCATE( hpsi_d(kdmx), STAT=ierr )
|
||||
ALLOCATE( hpsi(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate hpsi_d ', ABS(ierr) )
|
||||
ALLOCATE( spsi_d(kdmx), STAT=ierr )
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate hpsi ', ABS(ierr) )
|
||||
ALLOCATE( spsi(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate spsi_d ', ABS(ierr) )
|
||||
ALLOCATE( g_d(kdmx), STAT=ierr )
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate spsi ', ABS(ierr) )
|
||||
ALLOCATE( g(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate g_d ', ABS(ierr) )
|
||||
ALLOCATE( cg_d(kdmx), STAT=ierr )
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate g ', ABS(ierr) )
|
||||
ALLOCATE( cg(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate cg_d ', ABS(ierr) )
|
||||
ALLOCATE( scg_d(kdmx), STAT=ierr )
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate cg ', ABS(ierr) )
|
||||
ALLOCATE( scg(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate scg_d ', ABS(ierr) )
|
||||
ALLOCATE( ppsi_d(kdmx), STAT=ierr )
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate scg ', ABS(ierr) )
|
||||
ALLOCATE( ppsi(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate ppsi_d ', ABS(ierr) )
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate ppsi ', ABS(ierr) )
|
||||
ALLOCATE( g0_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate g0_d ', ABS(ierr) )
|
||||
|
@ -176,26 +177,44 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
END IF
|
||||
!
|
||||
CALL dev_memset( spsi_d , ZERO )
|
||||
CALL dev_memset( scg_d , ZERO )
|
||||
CALL dev_memset( hpsi_d , ZERO )
|
||||
CALL dev_memset( g_d , ZERO )
|
||||
CALL dev_memset( cg_d , ZERO )
|
||||
!$acc host_data use_device(spsi)
|
||||
CALL dev_memset( spsi , ZERO )
|
||||
!$acc end host_data
|
||||
!$acc host_data use_device(hpsi)
|
||||
CALL dev_memset( hpsi , ZERO )
|
||||
!$acc end host_data
|
||||
!$acc host_data use_device(scg)
|
||||
CALL dev_memset( scg , ZERO )
|
||||
!$acc end host_data
|
||||
!$acc host_data use_device(g)
|
||||
CALL dev_memset( g , ZERO )
|
||||
!$acc end host_data
|
||||
!$acc host_data use_device(cg)
|
||||
CALL dev_memset( cg , ZERO )
|
||||
!$acc end host_data
|
||||
CALL dev_memset( g0_d , ZERO )
|
||||
CALL dev_memset( ppsi_d , ZERO )
|
||||
!$acc host_data use_device(ppsi)
|
||||
CALL dev_memset( ppsi , ZERO )
|
||||
!$acc end host_data
|
||||
CALL dev_memset( lagrange_d , ZERO )
|
||||
!
|
||||
! ... calculate S|psi>
|
||||
!
|
||||
CALL s_1psi_ptr( npwx, npw, psi_d(1,m), spsi_d )
|
||||
write(*,*) '@@4.1'
|
||||
CALL s_1psi_ptr( npwx, npw, psi(1,m), spsi )
|
||||
write(*,*) '@@4.2'
|
||||
!
|
||||
! ... orthogonalize starting eigenfunction to those already calculated
|
||||
!
|
||||
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
|
||||
lagrange = ZERO
|
||||
if(m_start.le.m_end) &
|
||||
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
|
||||
kdmx, spsi_d, 1, ZERO, lagrange_d(m_start), 1 )
|
||||
IF(m_start.le.m_end) THEN
|
||||
!$acc host_data use_device(psi, spsi)
|
||||
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi(1,m_start), &
|
||||
kdmx, spsi, 1, ZERO, lagrange_d(m_start), 1 )
|
||||
!$acc end host_data
|
||||
END IF
|
||||
write(*,*) '@@4.3'
|
||||
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
|
||||
|
||||
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
|
||||
|
@ -206,34 +225,42 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
lagrange_d(1:m) = lagrange(1:m)
|
||||
!
|
||||
DO j = 1, m - 1
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
!
|
||||
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
|
||||
psi(i,m) = psi(i,m) - lagrange_d(j) * psi(i,j)
|
||||
!
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
psi_norm = psi_norm - ( DBLE( lagrange(j) )**2 + AIMAG( lagrange(j) )**2 )
|
||||
!
|
||||
END DO
|
||||
write(*,*) '@@4.4'
|
||||
!
|
||||
psi_norm = SQRT( psi_norm )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
psi_d(i,m) = psi_d(i,m) / psi_norm
|
||||
psi(i,m) = psi(i,m) / psi_norm
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
write(*,*) '@@4.5'
|
||||
! ... calculate starting gradient (|hpsi> = H|psi>) ...
|
||||
!
|
||||
CALL hs_1psi_ptr( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
|
||||
CALL hs_1psi_ptr( npwx, npw, psi(1,m), hpsi, spsi )
|
||||
!
|
||||
write(*,*) '@@4.6'
|
||||
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
|
||||
!
|
||||
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
|
||||
!
|
||||
e(m) = ksDdot( kdim2, psi_d(1,m), 1, hpsi_d, 1 )
|
||||
!$acc host_data use_device(psi, hpsi)
|
||||
e(m) = ksDdot( kdim2, psi(1,m), 1, hpsi, 1 )
|
||||
!$acc end host_data
|
||||
!
|
||||
write(*,*) '@@4.7'
|
||||
CALL mp_sum( e(m), intra_bgrp_comm )
|
||||
!
|
||||
! ... start iteration for this band
|
||||
|
@ -243,26 +270,32 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
! ... calculate P (PHP)|y>
|
||||
! ... ( P = preconditioning matrix, assumed diagonal )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
g_d(i) = hpsi_d(i) / precondition_d(i)
|
||||
ppsi_d(i) = spsi_d(i) / precondition_d(i)
|
||||
g(i) = hpsi(i) / precondition_d(i)
|
||||
ppsi(i) = spsi(i) / precondition_d(i)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
write(*,*) '@@4.8'
|
||||
!
|
||||
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
||||
!
|
||||
es(1) = ksDdot( kdim2, spsi_d(1), 1, g_d(1), 1 )
|
||||
es(2) = ksDdot( kdim2, spsi_d(1), 1, ppsi_d(1), 1 )
|
||||
!$acc host_data use_device(spsi, g, ppsi)
|
||||
es(1) = ksDdot( kdim2, spsi(1), 1, g(1), 1 )
|
||||
es(2) = ksDdot( kdim2, spsi(1), 1, ppsi(1), 1 )
|
||||
!$acc end host_data
|
||||
write(*,*) '@@4.9'
|
||||
!
|
||||
CALL mp_sum( es , intra_bgrp_comm )
|
||||
!
|
||||
es(1) = es(1) / es(2)
|
||||
es_1 = es(1)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
g_d(i) = g_d(i) - es_1 * ppsi_d(i)
|
||||
g(i) = g(i) - es_1 * ppsi(i)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
|
||||
! ... <g| S P^2|y> = 0
|
||||
|
@ -270,13 +303,19 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
! ... scg is used as workspace
|
||||
!
|
||||
CALL s_1psi_ptr( npwx, npw, g_d(1), scg_d(1) )
|
||||
write(*,*) '@@4.10'
|
||||
CALL s_1psi_ptr( npwx, npw, g(1), scg(1) )
|
||||
write(*,*) '@@4.11'
|
||||
!
|
||||
lagrange(1:m-1) = ZERO
|
||||
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
|
||||
if(m_start.le.m_end) &
|
||||
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
|
||||
kdmx, scg_d, 1, ZERO, lagrange_d(m_start), 1 )
|
||||
IF(m_start.le.m_end) THEN
|
||||
!$acc host_data use_device(psi, scg)
|
||||
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi(1,m_start), &
|
||||
kdmx, scg, 1, ZERO, lagrange_d(m_start), 1 )
|
||||
!$acc end host_data
|
||||
END IF
|
||||
write(*,*) '@@4.12'
|
||||
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
|
||||
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
|
||||
!
|
||||
|
@ -286,19 +325,24 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
DO j = 1, ( m - 1 )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
g_d(i) = g_d(i) - lagrange_d(j) * psi_d(i,j)
|
||||
scg_d(i) = scg_d(i) - lagrange_d(j) * psi_d(i,j)
|
||||
g(i) = g(i) - lagrange_d(j) * psi(i,j)
|
||||
scg(i) = scg(i) - lagrange_d(j) * psi(i,j)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
END DO
|
||||
write(*,*) '@@4.11'
|
||||
!
|
||||
IF ( iter /= 1 ) THEN
|
||||
!
|
||||
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
||||
!
|
||||
gg1 = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
|
||||
!$acc host_data use_device(g)
|
||||
gg1 = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
|
||||
!$acc end host_data
|
||||
write(*,*) '@@4.12'
|
||||
!
|
||||
CALL mp_sum( gg1, intra_bgrp_comm )
|
||||
!
|
||||
|
@ -306,12 +350,17 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
! ... gg is <g(n+1)|S|g(n+1)>
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
g0_d(i) = scg_d(i) * precondition_d(i)
|
||||
g0_d(i) = scg(i) * precondition_d(i)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
write(*,*) '@@4.13'
|
||||
!
|
||||
gg = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
|
||||
!$acc host_data use_device(g)
|
||||
gg = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
|
||||
!$acc end host_data
|
||||
write(*,*) '@@4.14'
|
||||
!
|
||||
CALL mp_sum( gg, intra_bgrp_comm )
|
||||
!
|
||||
|
@ -321,10 +370,12 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
gg0 = gg
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
cg_d(i) = g_d(i)
|
||||
cg(i) = g(i)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
write(*,*) '@@4.15'
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
|
@ -339,7 +390,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
! See comment below
|
||||
!!DO i = 1, kdmx
|
||||
!! cg_d(i) = g_d(i) + cg_d(i) * gamma
|
||||
!! cg(i) = g(i) + cg(i) * gamma
|
||||
!!END DO
|
||||
!
|
||||
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
|
||||
|
@ -348,11 +399,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
psi_norm = gamma * cg0 * sint
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
! v== this breaks the logic, done here for performance
|
||||
cg_d(i) = (g_d(i) + cg_d(i) * gamma) - psi_norm * psi_d(i,m)
|
||||
cg(i) = (g(i) + cg(i) * gamma) - psi_norm * psi(i,m)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
write(*,*) '@@4.16'
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -360,10 +413,15 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
! ... |scg> is S|cg>
|
||||
!
|
||||
CALL hs_1psi_ptr( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
|
||||
write(*,*) '@@4.17'
|
||||
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
|
||||
write(*,*) '@@4.18'
|
||||
!
|
||||
cg0 = ksDdot( kdim2, cg_d(1), 1, scg_d(1), 1 )
|
||||
!$acc host_data use_device(scg, cg)
|
||||
cg0 = ksDdot( kdim2, cg(1), 1, scg(1), 1 )
|
||||
!$acc end host_data
|
||||
!
|
||||
write(*,*) '@@4.19'
|
||||
CALL mp_sum( cg0 , intra_bgrp_comm )
|
||||
!
|
||||
cg0 = SQRT( cg0 )
|
||||
|
@ -376,11 +434,16 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
! ... so that the result is correctly normalized :
|
||||
! ... <y(t)|P^2S|y(t)> = 1
|
||||
!
|
||||
a0 = 2.D0 * ksDdot( kdim2, psi_d(1,m), 1, ppsi_d(1), 1 ) / cg0
|
||||
!$acc host_data use_device(psi, ppsi)
|
||||
a0 = 2.D0 * ksDdot( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0
|
||||
!$acc end host_data
|
||||
!
|
||||
write(*,*) '@@4.20'
|
||||
CALL mp_sum( a0 , intra_bgrp_comm )
|
||||
!
|
||||
b0 = ksDdot( kdim2, cg_d(1), 1, ppsi_d(1), 1 ) / cg0**2
|
||||
!$acc host_data use_device(cg, ppsi)
|
||||
b0 = ksDdot( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2
|
||||
!$acc end host_data
|
||||
!
|
||||
CALL mp_sum( b0 , intra_bgrp_comm )
|
||||
!
|
||||
|
@ -413,10 +476,11 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
e(m) = MIN( es(1), es(2) )
|
||||
! ... upgrade |psi>
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
|
||||
psi(i,m) = cost * psi(i,m) + sint / cg0 * cg(i)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
! ... here one could test convergence on the energy
|
||||
!
|
||||
|
@ -424,12 +488,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
! ... upgrade H|psi> and S|psi>
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO i = 1, kdmx
|
||||
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
|
||||
spsi(i) = cost * spsi(i) + sint / cg0 * scg(i)
|
||||
!
|
||||
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
|
||||
hpsi(i) = cost * hpsi(i) + sint / cg0 * ppsi(i)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
END DO iterate
|
||||
!
|
||||
|
@ -470,28 +535,31 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
!
|
||||
e0 = e(m)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO k = 1, kdmx
|
||||
ppsi_d(k) = psi_d(k,m)
|
||||
ppsi(k) = psi(k,m)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
DO j = m, i + 1, - 1
|
||||
!
|
||||
e(j) = e(j-1)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO k = 1, kdmx
|
||||
psi_d(k,j) = psi_d(k,j-1)
|
||||
psi(k,j) = psi(k,j-1)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
END DO
|
||||
!
|
||||
e(i) = e0
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
!$acc kernels
|
||||
DO k = 1, kdmx
|
||||
psi_d(k,i) = ppsi_d(k)
|
||||
psi(k,i) = ppsi(k)
|
||||
END DO
|
||||
!$acc end kernels
|
||||
!
|
||||
! ... this procedure should be good if only a few inversions occur,
|
||||
! ... extremely inefficient if eigenvectors are often in bad order
|
||||
|
@ -511,13 +579,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition_d, &
|
|||
DEALLOCATE( lagrange )
|
||||
DEALLOCATE( e )
|
||||
DEALLOCATE( lagrange_d )
|
||||
DEALLOCATE( ppsi_d )
|
||||
DEALLOCATE( ppsi )
|
||||
DEALLOCATE( g0_d )
|
||||
DEALLOCATE( cg_d )
|
||||
DEALLOCATE( g_d )
|
||||
DEALLOCATE( hpsi_d )
|
||||
DEALLOCATE( scg_d )
|
||||
DEALLOCATE( spsi_d )
|
||||
DEALLOCATE( cg )
|
||||
DEALLOCATE( g )
|
||||
DEALLOCATE( hpsi )
|
||||
DEALLOCATE( scg )
|
||||
DEALLOCATE( spsi )
|
||||
!
|
||||
CALL stop_clock( 'ccgdiagg' )
|
||||
!
|
||||
|
|
|
@ -687,6 +687,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
! --- Define a small number ---
|
||||
INTEGER :: j
|
||||
!
|
||||
write(*,*) '@0'
|
||||
!write (*,*) ' enter diag_bands_k'; FLUSH(6)
|
||||
IF ( lelfield ) THEN
|
||||
!
|
||||
|
@ -747,6 +748,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
!
|
||||
ntry = 0
|
||||
!
|
||||
write(*,*) '@1'
|
||||
CG_loop : DO
|
||||
!
|
||||
IF ( isolve == 1 .OR. isolve == 2 ) THEN
|
||||
|
@ -757,9 +759,11 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
IF ( .not. use_gpu ) THEN
|
||||
CALL rotate_wfc( npwx, npw, nbnd, gstart, nbnd, evc, npol, okvan, evc, et(1,ik) )
|
||||
ELSE
|
||||
!$acc host_data use_device(evc, et)
|
||||
write(*,*) '@2'
|
||||
!$acc host_data use_device(et)
|
||||
CALL rotate_wfc_gpu( npwx, npw, nbnd, gstart, nbnd, evc, npol, okvan, evc, et(1,ik) )
|
||||
!$acc end host_data
|
||||
write(*,*) '@3'
|
||||
END IF
|
||||
!
|
||||
avg_iter = avg_iter + 1.D0
|
||||
|
@ -773,12 +777,14 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
npwx, npw, nbnd, npol, evc, et(1,ik), btype(1,ik), &
|
||||
ethr, max_cg_iter, .NOT. lscf, notconv, cg_iter )
|
||||
ELSE
|
||||
write(*,*) '@4'
|
||||
CALL using_h_diag_d(0)
|
||||
!$acc host_data use_device(evc, et)
|
||||
!$acc host_data use_device(et)
|
||||
CALL ccgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, h_diag_d, &
|
||||
npwx, npw, nbnd, npol, evc, et(1,ik), btype(1,ik), &
|
||||
ethr, max_cg_iter, .NOT. lscf, notconv, cg_iter )
|
||||
!$acc end host_data
|
||||
write(*,*) '@5'
|
||||
END IF
|
||||
!
|
||||
avg_iter = avg_iter + cg_iter
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
|
||||
SUBROUTINE hs_1psi_gpu( lda, n, psi, hpsi, spsi )
|
||||
!----------------------------------------------------------------------------
|
||||
!! This routine applies the Hamiltonian and the S matrix
|
||||
!! to a vector psi and puts the result in hpsi and spsi.
|
||||
|
@ -26,10 +26,7 @@ SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
|
|||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: lda, n
|
||||
COMPLEX (DP) :: psi_d(lda*npol,1), hpsi_d(n), spsi_d(n,1)
|
||||
#if defined (__CUDA)
|
||||
attributes(DEVICE) :: psi_d, hpsi_d, spsi_d
|
||||
#endif
|
||||
COMPLEX (DP) :: psi(lda*npol,1), hpsi(n), spsi(n,1)
|
||||
COMPLEX (DP), ALLOCATABLE :: psi_h(:,:), spsi_h(:,:)
|
||||
!
|
||||
!
|
||||
|
@ -39,8 +36,9 @@ SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
|
|||
! makes it easier to debug probable errors, please do not "beautify"
|
||||
if (real_space) then
|
||||
ALLOCATE(psi_h(lda*npol,1), spsi_h(n,1))
|
||||
psi_h(1:lda*npol,1) = psi_d(1:lda*npol,1)
|
||||
CALL h_psi_gpu( lda, n, 1, psi_d, hpsi_d )
|
||||
!$acc update self(psi)
|
||||
psi_h(1:lda*npol,1) = psi(1:lda*npol,1)
|
||||
CALL h_psi_gpu( lda, n, 1, psi, hpsi )
|
||||
if (gamma_only) then
|
||||
call invfft_orbital_gamma(psi_h,1,1) !transform the orbital to real space
|
||||
call s_psir_gamma(1,1)
|
||||
|
@ -50,11 +48,12 @@ SUBROUTINE hs_1psi_gpu( lda, n, psi_d, hpsi_d, spsi_d )
|
|||
call s_psir_k(1,1)
|
||||
call fwfft_orbital_k(spsi_h,1,1)
|
||||
end if
|
||||
spsi_d(1:n,1) = spsi_h(1:n,1)
|
||||
spsi(1:n,1) = spsi_h(1:n,1)
|
||||
!$acc update device(psi)
|
||||
DEALLOCATE(psi_h, spsi_h)
|
||||
else
|
||||
CALL h_psi_gpu( lda, n, 1, psi_d, hpsi_d ) ! apply H to a single wfc (no bgrp parallelization here)
|
||||
CALL s_psi_acc( lda, n, 1, psi_d, spsi_d ) ! apply S to a single wfc (no bgrp parallelization here)
|
||||
CALL h_psi_gpu( lda, n, 1, psi, hpsi ) ! apply H to a single wfc (no bgrp parallelization here)
|
||||
CALL s_psi_acc( lda, n, 1, psi, spsi ) ! apply S to a single wfc (no bgrp parallelization here)
|
||||
endif
|
||||
!
|
||||
CALL stop_clock_gpu( 'hs_1psi' )
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
|
||||
SUBROUTINE s_1psi_gpu( npwx, n, psi, spsi )
|
||||
!----------------------------------------------------------------------------
|
||||
!! spsi = S*psi for one wavefunction. Wrapper routine - calls \(texttt{calbec}
|
||||
!! and \texttt{s_psi}.
|
||||
|
@ -15,7 +15,7 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
|
|||
USE uspp, ONLY: nkb, vkb
|
||||
USE becmod, ONLY: bec_type, becp
|
||||
#if defined(__CUDA)
|
||||
USE becmod, ONLY: calbec_cuf
|
||||
USE becmod, ONLY: calbec
|
||||
#endif
|
||||
USE control_flags, ONLY: gamma_only, offload_type
|
||||
USE noncollin_module, ONLY: noncolin, npol
|
||||
|
@ -30,14 +30,11 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
|
|||
!! maximum number of PW for wavefunctions
|
||||
INTEGER :: n
|
||||
!! the number of plane waves
|
||||
COMPLEX(DP) :: psi_d(npwx*npol,1)
|
||||
COMPLEX(DP) :: psi(npwx*npol,1)
|
||||
!! input vector
|
||||
COMPLEX(DP) :: spsi_d(npwx*npol,1)
|
||||
COMPLEX(DP) :: spsi(npwx*npol,1)
|
||||
!! S*psi
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, spsi_d
|
||||
#endif
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_h(:,:), spsi_h(:,:)
|
||||
! ... local variables
|
||||
!
|
||||
|
@ -49,8 +46,9 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
|
|||
IF ( real_space) THEN
|
||||
!
|
||||
ALLOCATE(psi_h(npwx*npol,1), spsi_h(npwx*npol,1))
|
||||
psi_h(1:npwx*npol,1) = psi_d(1:npwx*npol,1)
|
||||
spsi_h(1:npwx*npol,1) = spsi_d(1:npwx*npol,1)
|
||||
!$acc update self(psi, spsi)
|
||||
psi_h(1:npwx*npol,1) = psi(1:npwx*npol,1)
|
||||
spsi_h(1:npwx*npol,1) = spsi(1:npwx*npol,1)
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
DO ibnd = 1, nbnd, 2
|
||||
|
@ -77,15 +75,15 @@ SUBROUTINE s_1psi_gpu( npwx, n, psi_d, spsi_d )
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
spsi_d(1:npwx*npol,1) = spsi_h(1:npwx*npol,1)
|
||||
spsi(1:npwx*npol,1) = spsi_h(1:npwx*npol,1)
|
||||
!$acc update device(spsi)
|
||||
DEALLOCATE(psi_h, spsi_h)
|
||||
ELSE
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!civn: remove psi_d and use calbec instead
|
||||
CAll calbec_cuf(offload_type, n, vkb, psi_d, becp )
|
||||
CAll calbec(offload_type, n, vkb, psi, becp )
|
||||
#endif
|
||||
CALL s_psi_acc( npwx, n, 1, psi_d, spsi_d )
|
||||
CALL s_psi_acc( npwx, n, 1, psi, spsi )
|
||||
|
||||
!
|
||||
ENDIF
|
||||
|
|
Loading…
Reference in New Issue