The small2 test (nscf, conv_thr=1E-12) crashes when rotate_HSpsi_k_gpu is used in its current implementation.

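In this commit rotate_HSpsi_k_gpu is bypassed: the arrays are copied to the host, the host routine rotate_HSpsi_k is called instead, the results are copied back to the device, and the test passes.
c_bands also gains a commented-out line that would force ethr to be at least 1.D-13, as is done in the scf case (see electrons.f90).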
This commit is contained in:
Ivan Carnimeo 2020-09-15 15:21:08 +02:00 committed by Ivan Carnimeo
parent 5d644ec0c9
commit 8928d0f195
3 changed files with 50 additions and 22 deletions

View File

@ -176,7 +176,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
if (n_start .le. n_end) & if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx ) CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
!$cuf kernel do(2) !$cuf kernel do(2)
DO ii = 1, npwx*npol DO ii = 1, kdmx
DO jj = n_start, n_end DO jj = n_start, n_end
psi_d(ii,jj) = aux_d(ii, jj) psi_d(ii,jj) = aux_d(ii, jj)
END DO END DO
@ -189,7 +189,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
if (n_start .le. n_end) & if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx ) CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
!$cuf kernel do(2) !$cuf kernel do(2)
DO ii = 1, npwx*npol DO ii = 1, kdmx
DO jj = n_start, n_end DO jj = n_start, n_end
hpsi_d(ii,jj) = aux_d(ii,jj) hpsi_d(ii,jj) = aux_d(ii,jj)
END DO END DO
@ -204,7 +204,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
if (n_start .le. n_end) & if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx ) CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
!$cuf kernel do(2) !$cuf kernel do(2)
DO ii = 1, npwx*npol DO ii = 1, kdmx
DO jj = n_start, n_end DO jj = n_start, n_end
spsi_d(ii,jj) = aux_d(ii,jj) spsi_d(ii,jj) = aux_d(ii,jj)
END DO END DO
@ -217,7 +217,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
ELSE IF (present(spsi_d)) THEN ELSE IF (present(spsi_d)) THEN
!$cuf kernel do(2) !$cuf kernel do(2)
DO ii = 1, npwx*npol DO ii = 1, kdmx
DO jj = 1, nbnd DO jj = 1, nbnd
spsi_d(ii,jj) = psi_d(ii,jj) spsi_d(ii,jj) = psi_d(ii,jj)
END DO END DO
@ -234,7 +234,6 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
DEALLOCATE( ss_d ) DEALLOCATE( ss_d )
DEALLOCATE( hh_d ) DEALLOCATE( hh_d )
DEALLOCATE( en_d ) DEALLOCATE( en_d )
!
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6) call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
!call print_clock('rotHSw') !call print_clock('rotHSw')
!call print_clock('rotHSw:hc') !call print_clock('rotHSw:hc')

View File

@ -90,20 +90,20 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
recv_counts(nbgrp), displs(nbgrp), column_type recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
REAL(DP) :: tmp ! host auxiliary variable for some host <-> device array copy !
!civn 2fix: these are needed only for __MPI = true (protate) !civn 2fix: these are needed only for __MPI = true (protate)
REAL(DP), ALLOCATABLE :: ew(:) REAL(DP), ALLOCATABLE :: ew(:)
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:) COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
!
! !
! .. device variables ! .. device variables
! !
REAL(DP), ALLOCATABLE :: ew_d(:) REAL(DP), ALLOCATABLE :: ew_d(:)
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:) COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
LOGICAL, ALLOCATABLE :: conv_d(:)
#if defined (__CUDA) #if defined (__CUDA)
attributes(device) :: evc_d, eig_d attributes(device) :: evc_d, eig_d
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
attributes(device) :: conv_d
#endif #endif
! !
! ... init local variables ! ... init local variables
@ -118,10 +118,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type) CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
ALLOCATE ( ew_d(nvecx), conv(nbnd) ) ALLOCATE ( ew_d(nvecx), conv(nbnd) )
ALLOCATE ( conv_d(nbnd) )
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) ) ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
CALL start_clock( 'paro:init' ); CALL start_clock( 'paro:init' );
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) ) conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
conv_d = conv
!$cuf kernel do(2) !$cuf kernel do(2)
DO ii = 1, npwx*npol DO ii = 1, npwx*npol
DO jj = 1, nbnd DO jj = 1, nbnd
@ -137,7 +139,18 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
#if defined(__MPI) #if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN IF ( nproc_ortho == 1 ) THEN
#endif #endif
CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d ) !civn 2fix
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) )
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
eig = eig_d
CALL rotate_HSpsi_k ( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig )
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
eig_d = eig
DEALLOCATE ( psi, hpsi, spsi, eig )
#if defined(__MPI) #if defined(__MPI)
ELSE ELSE
!civn 2fix !civn 2fix
@ -219,21 +232,21 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END DO END DO
!$cuf kernel do(2) !$cuf kernel do(2)
DO ii = 1, npwx*npol DO jj = 1, how_many
DO jj = nbase+1, nbase+how_many kk = jj + ibnd_start - 1
kk = jj + ibnd_start - 1 DO ii = 1, npwx*npol
psi_d (ii,jj) = psi_d (ii,kk) psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk)
hpsi_d(ii,jj) = hpsi_d(ii,kk) hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk)
spsi_d(ii,jj) = spsi_d(ii,kk) spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk)
END DO END DO
END DO END DO
!$cuf kernel do(1) !$cuf kernel do(1)
DO ii = 1, how_many DO ii = 1, how_many
ew_d(ii) = ew_d(ibnd_start+ii-1) ew_d(ii) = ew_d(ii+ibnd_start-1)
END DO END DO
CALL stop_clock( 'paro:pack' ); CALL stop_clock( 'paro:pack' );
!write (6,*) ' check nactive = ', lbnd, nactive write (6,*) ' check nactive = ', lbnd, nactive, nconv
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED ' if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, & CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
@ -261,7 +274,19 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
#if defined(__MPI) #if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN IF ( nproc_ortho == 1 ) THEN
#endif #endif
CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d ) !CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
!civn 2fix
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) )
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
ew = ew_d
CALL rotate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew )
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
ew_d = ew
DEALLOCATE ( psi, hpsi, spsi, ew )
#if defined(__MPI) #if defined(__MPI)
ELSE ELSE
!civn 2fix !civn 2fix
@ -282,11 +307,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
! only the first nbnd eigenvalues are relevant for convergence ! only the first nbnd eigenvalues are relevant for convergence
! but only those that have actually been corrected should be trusted ! but only those that have actually been corrected should be trusted
conv(1:nbnd) = .FALSE. conv(1:nbnd) = .FALSE.
conv_d = conv
!$cuf kernel do(1)
DO ii = 1, ntrust DO ii = 1, ntrust
tmp = ew_d(ii) conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
tmp = tmp - eig_d(ii)
conv(ii) = ABS(tmp).LT.ethr
END DO END DO
conv = conv_d
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
!$cuf kernel do(1) !$cuf kernel do(1)
DO ii = 1, nbnd DO ii = 1, nbnd
@ -305,7 +331,7 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
CALL mp_sum(nhpsi,inter_bgrp_comm) CALL mp_sum(nhpsi,inter_bgrp_comm)
DEALLOCATE ( ew_d, conv ) DEALLOCATE ( ew_d, conv, conv_d )
DEALLOCATE ( psi_d, hpsi_d, spsi_d ) DEALLOCATE ( psi_d, hpsi_d, spsi_d )
CALL mp_type_free( column_type ) CALL mp_type_free( column_type )

View File

@ -637,6 +637,9 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
END IF END IF
ELSE ELSE
! !
!civn
! ethr = MAX( ethr, 1.D-13 )
!
IF ( .not. use_gpu ) THEN IF ( .not. use_gpu ) THEN
CALL using_evc(1); CALL using_et(1); CALL using_h_diag(0) CALL using_evc(1); CALL using_et(1); CALL using_h_diag(0)
CALL paro_k_new( h_psi, s_psi, hs_psi, g_1psi, okvan, & CALL paro_k_new( h_psi, s_psi, hs_psi, g_1psi, okvan, &