mirror of https://gitlab.com/QEF/q-e.git
The small2 test (nscf, conv_thr=1E-12) crashes when rotate_HSpsi_k_gpu is used as currently implemented.
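In this commit rotate_HSpsi_k_gpu is bypassed: the device arrays are copied to the host, the CPU routine rotate_HSpsi_k is called instead, and the results are copied back to the device; with this workaround the test works. In c_bands there is also a commented-out line in which ethr would be forced to be at least 1.D-13 (ethr = MAX( ethr, 1.D-13 )), similarly to the scf case (see electrons.f90).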
This commit is contained in:
parent
5d644ec0c9
commit
8928d0f195
|
@ -176,7 +176,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, npwx*npol
|
DO ii = 1, kdmx
|
||||||
DO jj = n_start, n_end
|
DO jj = n_start, n_end
|
||||||
psi_d(ii,jj) = aux_d(ii, jj)
|
psi_d(ii,jj) = aux_d(ii, jj)
|
||||||
END DO
|
END DO
|
||||||
|
@ -189,7 +189,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, npwx*npol
|
DO ii = 1, kdmx
|
||||||
DO jj = n_start, n_end
|
DO jj = n_start, n_end
|
||||||
hpsi_d(ii,jj) = aux_d(ii,jj)
|
hpsi_d(ii,jj) = aux_d(ii,jj)
|
||||||
END DO
|
END DO
|
||||||
|
@ -204,7 +204,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, npwx*npol
|
DO ii = 1, kdmx
|
||||||
DO jj = n_start, n_end
|
DO jj = n_start, n_end
|
||||||
spsi_d(ii,jj) = aux_d(ii,jj)
|
spsi_d(ii,jj) = aux_d(ii,jj)
|
||||||
END DO
|
END DO
|
||||||
|
@ -217,7 +217,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
||||||
ELSE IF (present(spsi_d)) THEN
|
ELSE IF (present(spsi_d)) THEN
|
||||||
|
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, npwx*npol
|
DO ii = 1, kdmx
|
||||||
DO jj = 1, nbnd
|
DO jj = 1, nbnd
|
||||||
spsi_d(ii,jj) = psi_d(ii,jj)
|
spsi_d(ii,jj) = psi_d(ii,jj)
|
||||||
END DO
|
END DO
|
||||||
|
@ -234,7 +234,6 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove
|
||||||
DEALLOCATE( ss_d )
|
DEALLOCATE( ss_d )
|
||||||
DEALLOCATE( hh_d )
|
DEALLOCATE( hh_d )
|
||||||
DEALLOCATE( en_d )
|
DEALLOCATE( en_d )
|
||||||
!
|
|
||||||
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
|
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
|
||||||
!call print_clock('rotHSw')
|
!call print_clock('rotHSw')
|
||||||
!call print_clock('rotHSw:hc')
|
!call print_clock('rotHSw:hc')
|
||||||
|
|
|
@ -90,20 +90,20 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
recv_counts(nbgrp), displs(nbgrp), column_type
|
recv_counts(nbgrp), displs(nbgrp), column_type
|
||||||
|
|
||||||
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
|
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
|
||||||
REAL(DP) :: tmp ! host auxiliary variable for some host <-> device array copy
|
!
|
||||||
|
|
||||||
!civn 2fix: these are needed only for __MPI = true (protate)
|
!civn 2fix: these are needed only for __MPI = true (protate)
|
||||||
REAL(DP), ALLOCATABLE :: ew(:)
|
REAL(DP), ALLOCATABLE :: ew(:)
|
||||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
|
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
|
||||||
!
|
|
||||||
!
|
!
|
||||||
! .. device variables
|
! .. device variables
|
||||||
!
|
!
|
||||||
REAL(DP), ALLOCATABLE :: ew_d(:)
|
REAL(DP), ALLOCATABLE :: ew_d(:)
|
||||||
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
|
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
|
||||||
|
LOGICAL, ALLOCATABLE :: conv_d(:)
|
||||||
#if defined (__CUDA)
|
#if defined (__CUDA)
|
||||||
attributes(device) :: evc_d, eig_d
|
attributes(device) :: evc_d, eig_d
|
||||||
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
|
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
|
||||||
|
attributes(device) :: conv_d
|
||||||
#endif
|
#endif
|
||||||
!
|
!
|
||||||
! ... init local variables
|
! ... init local variables
|
||||||
|
@ -118,10 +118,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
|
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
|
||||||
|
|
||||||
ALLOCATE ( ew_d(nvecx), conv(nbnd) )
|
ALLOCATE ( ew_d(nvecx), conv(nbnd) )
|
||||||
|
ALLOCATE ( conv_d(nbnd) )
|
||||||
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
|
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
|
||||||
|
|
||||||
CALL start_clock( 'paro:init' );
|
CALL start_clock( 'paro:init' );
|
||||||
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
|
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
|
||||||
|
conv_d = conv
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, npwx*npol
|
DO ii = 1, npwx*npol
|
||||||
DO jj = 1, nbnd
|
DO jj = 1, nbnd
|
||||||
|
@ -137,7 +139,18 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
#if defined(__MPI)
|
#if defined(__MPI)
|
||||||
IF ( nproc_ortho == 1 ) THEN
|
IF ( nproc_ortho == 1 ) THEN
|
||||||
#endif
|
#endif
|
||||||
CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d )
|
!civn 2fix
|
||||||
|
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) )
|
||||||
|
psi = psi_d
|
||||||
|
hpsi = hpsi_d
|
||||||
|
spsi = spsi_d
|
||||||
|
eig = eig_d
|
||||||
|
CALL rotate_HSpsi_k ( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig )
|
||||||
|
psi_d = psi
|
||||||
|
hpsi_d = hpsi
|
||||||
|
spsi_d = spsi
|
||||||
|
eig_d = eig
|
||||||
|
DEALLOCATE ( psi, hpsi, spsi, eig )
|
||||||
#if defined(__MPI)
|
#if defined(__MPI)
|
||||||
ELSE
|
ELSE
|
||||||
!civn 2fix
|
!civn 2fix
|
||||||
|
@ -219,21 +232,21 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
||||||
END DO
|
END DO
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, npwx*npol
|
DO jj = 1, how_many
|
||||||
DO jj = nbase+1, nbase+how_many
|
kk = jj + ibnd_start - 1
|
||||||
kk = jj + ibnd_start - 1
|
DO ii = 1, npwx*npol
|
||||||
psi_d (ii,jj) = psi_d (ii,kk)
|
psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk)
|
||||||
hpsi_d(ii,jj) = hpsi_d(ii,kk)
|
hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk)
|
||||||
spsi_d(ii,jj) = spsi_d(ii,kk)
|
spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk)
|
||||||
END DO
|
END DO
|
||||||
END DO
|
END DO
|
||||||
!$cuf kernel do(1)
|
!$cuf kernel do(1)
|
||||||
DO ii = 1, how_many
|
DO ii = 1, how_many
|
||||||
ew_d(ii) = ew_d(ibnd_start+ii-1)
|
ew_d(ii) = ew_d(ii+ibnd_start-1)
|
||||||
END DO
|
END DO
|
||||||
CALL stop_clock( 'paro:pack' );
|
CALL stop_clock( 'paro:pack' );
|
||||||
|
|
||||||
!write (6,*) ' check nactive = ', lbnd, nactive
|
write (6,*) ' check nactive = ', lbnd, nactive, nconv
|
||||||
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
|
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
|
||||||
|
|
||||||
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
|
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
|
||||||
|
@ -261,7 +274,19 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
#if defined(__MPI)
|
#if defined(__MPI)
|
||||||
IF ( nproc_ortho == 1 ) THEN
|
IF ( nproc_ortho == 1 ) THEN
|
||||||
#endif
|
#endif
|
||||||
CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
|
!CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
|
||||||
|
!civn 2fix
|
||||||
|
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) )
|
||||||
|
psi = psi_d
|
||||||
|
hpsi = hpsi_d
|
||||||
|
spsi = spsi_d
|
||||||
|
ew = ew_d
|
||||||
|
CALL rotate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew )
|
||||||
|
psi_d = psi
|
||||||
|
hpsi_d = hpsi
|
||||||
|
spsi_d = spsi
|
||||||
|
ew_d = ew
|
||||||
|
DEALLOCATE ( psi, hpsi, spsi, ew )
|
||||||
#if defined(__MPI)
|
#if defined(__MPI)
|
||||||
ELSE
|
ELSE
|
||||||
!civn 2fix
|
!civn 2fix
|
||||||
|
@ -282,11 +307,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
! only the first nbnd eigenvalues are relevant for convergence
|
! only the first nbnd eigenvalues are relevant for convergence
|
||||||
! but only those that have actually been corrected should be trusted
|
! but only those that have actually been corrected should be trusted
|
||||||
conv(1:nbnd) = .FALSE.
|
conv(1:nbnd) = .FALSE.
|
||||||
|
conv_d = conv
|
||||||
|
!$cuf kernel do(1)
|
||||||
DO ii = 1, ntrust
|
DO ii = 1, ntrust
|
||||||
tmp = ew_d(ii)
|
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
|
||||||
tmp = tmp - eig_d(ii)
|
|
||||||
conv(ii) = ABS(tmp).LT.ethr
|
|
||||||
END DO
|
END DO
|
||||||
|
conv = conv_d
|
||||||
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
|
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
|
||||||
!$cuf kernel do(1)
|
!$cuf kernel do(1)
|
||||||
DO ii = 1, nbnd
|
DO ii = 1, nbnd
|
||||||
|
@ -305,7 +331,7 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap
|
||||||
|
|
||||||
CALL mp_sum(nhpsi,inter_bgrp_comm)
|
CALL mp_sum(nhpsi,inter_bgrp_comm)
|
||||||
|
|
||||||
DEALLOCATE ( ew_d, conv )
|
DEALLOCATE ( ew_d, conv, conv_d )
|
||||||
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
|
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
|
||||||
|
|
||||||
CALL mp_type_free( column_type )
|
CALL mp_type_free( column_type )
|
||||||
|
|
|
@ -637,6 +637,9 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
|
||||||
END IF
|
END IF
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
|
!civn
|
||||||
|
! ethr = MAX( ethr, 1.D-13 )
|
||||||
|
!
|
||||||
IF ( .not. use_gpu ) THEN
|
IF ( .not. use_gpu ) THEN
|
||||||
CALL using_evc(1); CALL using_et(1); CALL using_h_diag(0)
|
CALL using_evc(1); CALL using_et(1); CALL using_h_diag(0)
|
||||||
CALL paro_k_new( h_psi, s_psi, hs_psi, g_1psi, okvan, &
|
CALL paro_k_new( h_psi, s_psi, hs_psi, g_1psi, okvan, &
|
||||||
|
|
Loading…
Reference in New Issue