mirror of https://gitlab.com/QEF/q-e.git
The test small2 (nscf, conv_thr=1E-12) crashes when rotate_HSpsi_k_gpu, as currently implemented, is used.
In this commit rotate_HSpsi_k_gpu is bypassed: the host routine rotate_HSpsi_k is called instead, and the test passes. In c_bands there is also a commented-out line that would force ethr to be at least 1.D-13, as is done in the scf case (see electrons.f90).
This commit is contained in:
parent
5d644ec0c9
commit
8928d0f195
|
@ -176,7 +176,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
|||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO ii = 1, kdmx
|
||||
DO jj = n_start, n_end
|
||||
psi_d(ii,jj) = aux_d(ii, jj)
|
||||
END DO
|
||||
|
@ -189,7 +189,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
|||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO ii = 1, kdmx
|
||||
DO jj = n_start, n_end
|
||||
hpsi_d(ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
|
@ -204,7 +204,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
|||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO ii = 1, kdmx
|
||||
DO jj = n_start, n_end
|
||||
spsi_d(ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
|
@ -217,7 +217,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
|||
ELSE IF (present(spsi_d)) THEN
|
||||
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO ii = 1, kdmx
|
||||
DO jj = 1, nbnd
|
||||
spsi_d(ii,jj) = psi_d(ii,jj)
|
||||
END DO
|
||||
|
@ -234,7 +234,6 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
|||
DEALLOCATE( ss_d )
|
||||
DEALLOCATE( hh_d )
|
||||
DEALLOCATE( en_d )
|
||||
!
|
||||
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
|
||||
!call print_clock('rotHSw')
|
||||
!call print_clock('rotHSw:hc')
|
||||
|
|
|
@ -90,20 +90,20 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
recv_counts(nbgrp), displs(nbgrp), column_type
|
||||
|
||||
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
|
||||
REAL(DP) :: tmp ! host auxiliary variable for some host <-> device array copy
|
||||
|
||||
!
|
||||
!civn 2fix: these are needed only for __MPI = true (protate)
|
||||
REAL(DP), ALLOCATABLE :: ew(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
|
||||
!
|
||||
!
|
||||
! .. device variables
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: ew_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
|
||||
LOGICAL, ALLOCATABLE :: conv_d(:)
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: evc_d, eig_d
|
||||
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
|
||||
attributes(device) :: conv_d
|
||||
#endif
|
||||
!
|
||||
! ... init local variables
|
||||
|
@ -118,10 +118,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
|
||||
|
||||
ALLOCATE ( ew_d(nvecx), conv(nbnd) )
|
||||
ALLOCATE ( conv_d(nbnd) )
|
||||
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
|
||||
|
||||
CALL start_clock( 'paro:init' );
|
||||
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
|
||||
conv_d = conv
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO jj = 1, nbnd
|
||||
|
@ -137,7 +139,18 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
#if defined(__MPI)
|
||||
IF ( nproc_ortho == 1 ) THEN
|
||||
#endif
|
||||
CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d )
|
||||
!civn 2fix
|
||||
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) )
|
||||
psi = psi_d
|
||||
hpsi = hpsi_d
|
||||
spsi = spsi_d
|
||||
eig = eig_d
|
||||
CALL rotate_HSpsi_k ( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig )
|
||||
psi_d = psi
|
||||
hpsi_d = hpsi
|
||||
spsi_d = spsi
|
||||
eig_d = eig
|
||||
DEALLOCATE ( psi, hpsi, spsi, eig )
|
||||
#if defined(__MPI)
|
||||
ELSE
|
||||
!civn 2fix
|
||||
|
@ -219,21 +232,21 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO jj = nbase+1, nbase+how_many
|
||||
kk = jj + ibnd_start - 1
|
||||
psi_d (ii,jj) = psi_d (ii,kk)
|
||||
hpsi_d(ii,jj) = hpsi_d(ii,kk)
|
||||
spsi_d(ii,jj) = spsi_d(ii,kk)
|
||||
DO jj = 1, how_many
|
||||
kk = jj + ibnd_start - 1
|
||||
DO ii = 1, npwx*npol
|
||||
psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk)
|
||||
hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk)
|
||||
spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, how_many
|
||||
ew_d(ii) = ew_d(ibnd_start+ii-1)
|
||||
ew_d(ii) = ew_d(ii+ibnd_start-1)
|
||||
END DO
|
||||
CALL stop_clock( 'paro:pack' );
|
||||
|
||||
!write (6,*) ' check nactive = ', lbnd, nactive
|
||||
write (6,*) ' check nactive = ', lbnd, nactive, nconv
|
||||
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
|
||||
|
||||
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
|
||||
|
@ -261,7 +274,19 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
#if defined(__MPI)
|
||||
IF ( nproc_ortho == 1 ) THEN
|
||||
#endif
|
||||
CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
|
||||
!CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
|
||||
!civn 2fix
|
||||
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) )
|
||||
psi = psi_d
|
||||
hpsi = hpsi_d
|
||||
spsi = spsi_d
|
||||
ew = ew_d
|
||||
CALL rotate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew )
|
||||
psi_d = psi
|
||||
hpsi_d = hpsi
|
||||
spsi_d = spsi
|
||||
ew_d = ew
|
||||
DEALLOCATE ( psi, hpsi, spsi, ew )
|
||||
#if defined(__MPI)
|
||||
ELSE
|
||||
!civn 2fix
|
||||
|
@ -282,11 +307,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
! only the first nbnd eigenvalues are relevant for convergence
|
||||
! but only those that have actually been corrected should be trusted
|
||||
conv(1:nbnd) = .FALSE.
|
||||
conv_d = conv
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, ntrust
|
||||
tmp = ew_d(ii)
|
||||
tmp = tmp - eig_d(ii)
|
||||
conv(ii) = ABS(tmp).LT.ethr
|
||||
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
|
||||
END DO
|
||||
conv = conv_d
|
||||
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, nbnd
|
||||
|
@ -305,7 +331,7 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
|||
|
||||
CALL mp_sum(nhpsi,inter_bgrp_comm)
|
||||
|
||||
DEALLOCATE ( ew_d, conv )
|
||||
DEALLOCATE ( ew_d, conv, conv_d )
|
||||
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
|
||||
|
||||
CALL mp_type_free( column_type )
|
||||
|
|
|
@ -637,6 +637,9 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
|||
END IF
|
||||
ELSE
|
||||
!
|
||||
!civn
|
||||
! ethr = MAX( ethr, 1.D-13 )
|
||||
!
|
||||
IF ( .not. use_gpu ) THEN
|
||||
CALL using_evc(1); CALL using_et(1); CALL using_h_diag(0)
|
||||
CALL paro_k_new( h_psi, s_psi, hs_psi, g_1psi, okvan, &
|
||||
|
|
Loading…
Reference in New Issue