From 8928d0f195ab224e65ce657d53f39ae0788606df Mon Sep 17 00:00:00 2001 From: Ivan Carnimeo Date: Tue, 15 Sep 2020 15:21:08 +0200 Subject: [PATCH] Test small2, nscf, conv_thr=1E-12 crashes when rotate_HSpsi_k_gpu is used as implemented so far. In this commit rotate_HSpsi_k_gpu is skipped, rotate_HSpsi_k is used and the test works. In c_bands there is also a commented line where ethr is forced greater than 1.D-13, similarly to the scf case (see electrons.f90). --- KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 | 9 ++-- KS_Solvers/ParO/paro_k_new_gpu.f90 | 60 ++++++++++++++++++------- PW/src/c_bands.f90 | 3 ++ 3 files changed, 50 insertions(+), 22 deletions(-) diff --git a/KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 b/KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 index 47fafcea0..7c7a071e1 100644 --- a/KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 +++ b/KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 @@ -176,7 +176,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove if (n_start .le. n_end) & CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx ) !$cuf kernel do(2) - DO ii = 1, npwx*npol + DO ii = 1, kdmx DO jj = n_start, n_end psi_d(ii,jj) = aux_d(ii, jj) END DO @@ -189,7 +189,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove if (n_start .le. n_end) & CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx ) !$cuf kernel do(2) - DO ii = 1, npwx*npol + DO ii = 1, kdmx DO jj = n_start, n_end hpsi_d(ii,jj) = aux_d(ii,jj) END DO @@ -204,7 +204,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove if (n_start .le. n_end) & CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx ) !$cuf kernel do(2) - DO ii = 1, npwx*npol + DO ii = 1, kdmx DO jj = n_start, n_end spsi_d(ii,jj) = aux_d(ii,jj) END DO @@ -217,7 +217,7 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove ELSE IF (present(spsi_d)) THEN !$cuf kernel do(2) - DO ii = 1, npwx*npol + DO ii = 1, kdmx DO jj = 1, nbnd spsi_d(ii,jj) = psi_d(ii,jj) END DO @@ -234,7 +234,6 @@ SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, ove DEALLOCATE( ss_d ) DEALLOCATE( hh_d ) DEALLOCATE( en_d ) -! call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6) !call print_clock('rotHSw') !call print_clock('rotHSw:hc') diff --git a/KS_Solvers/ParO/paro_k_new_gpu.f90 b/KS_Solvers/ParO/paro_k_new_gpu.f90 index 4f8ef626c..f29183542 100644 --- a/KS_Solvers/ParO/paro_k_new_gpu.f90 +++ b/KS_Solvers/ParO/paro_k_new_gpu.f90 @@ -90,20 +90,20 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap recv_counts(nbgrp), displs(nbgrp), column_type INTEGER :: ii, jj, kk ! indexes for cuf kernel loops - REAL(DP) :: tmp ! host auxiliary variable for some host <-> device array copy - +! !civn 2fix: these are needed only for __MPI = true (protate) REAL(DP), ALLOCATABLE :: ew(:) COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:) -! ! ! .. device variables ! REAL(DP), ALLOCATABLE :: ew_d(:) COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:) + LOGICAL, ALLOCATABLE :: conv_d(:) #if defined (__CUDA) attributes(device) :: evc_d, eig_d attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d + attributes(device) :: conv_d #endif ! ! ... init local variables @@ -118,10 +118,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type) ALLOCATE ( ew_d(nvecx), conv(nbnd) ) + ALLOCATE ( conv_d(nbnd) ) ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) ) CALL start_clock( 'paro:init' ); conv(:) = .FALSE. ; nconv = COUNT ( conv(:) ) + conv_d = conv !$cuf kernel do(2) DO ii = 1, npwx*npol DO jj = 1, nbnd @@ -137,7 +139,18 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap #if defined(__MPI) IF ( nproc_ortho == 1 ) THEN #endif - CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d ) +!civn 2fix + ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) ) + psi = psi_d + hpsi = hpsi_d + spsi = spsi_d + eig = eig_d + CALL rotate_HSpsi_k ( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig ) + psi_d = psi + hpsi_d = hpsi + spsi_d = spsi + eig_d = eig + DEALLOCATE ( psi, hpsi, spsi, eig ) #if defined(__MPI) ELSE !civn 2fix @@ -219,21 +232,21 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive END DO !$cuf kernel do(2) - DO ii = 1, npwx*npol - DO jj = nbase+1, nbase+how_many - kk = jj + ibnd_start - 1 - psi_d (ii,jj) = psi_d (ii,kk) - hpsi_d(ii,jj) = hpsi_d(ii,kk) - spsi_d(ii,jj) = spsi_d(ii,kk) + DO jj = 1, how_many + kk = jj + ibnd_start - 1 + DO ii = 1, npwx*npol + psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk) + hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk) + spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk) END DO END DO !$cuf kernel do(1) DO ii = 1, how_many - ew_d(ii) = ew_d(ibnd_start+ii-1) + ew_d(ii) = ew_d(ii+ibnd_start-1) END DO CALL stop_clock( 'paro:pack' ); - !write (6,*) ' check nactive = ', lbnd, nactive + write (6,*) ' check nactive = ', lbnd, nactive, nconv if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED ' CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, & @@ -261,7 +274,19 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap #if defined(__MPI) IF ( nproc_ortho == 1 ) THEN #endif - CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d ) + !CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d ) +!civn 2fix + ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) ) + psi = psi_d + hpsi = hpsi_d + spsi = spsi_d + ew = ew_d + CALL rotate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew ) + psi_d = psi + hpsi_d = hpsi + spsi_d = spsi + ew_d = ew + DEALLOCATE ( psi, hpsi, spsi, ew ) #if defined(__MPI) ELSE !civn 2fix @@ -282,11 +307,12 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap ! only the first nbnd eigenvalues are relevant for convergence ! but only those that have actually been corrected should be trusted conv(1:nbnd) = .FALSE. + conv_d = conv +!$cuf kernel do(1) DO ii = 1, ntrust - tmp = ew_d(ii) - tmp = tmp - eig_d(ii) - conv(ii) = ABS(tmp).LT.ethr + conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr END DO + conv = conv_d nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv !$cuf kernel do(1) DO ii = 1, nbnd @@ -305,7 +331,7 @@ SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap CALL mp_sum(nhpsi,inter_bgrp_comm) - DEALLOCATE ( ew_d, conv ) + DEALLOCATE ( ew_d, conv, conv_d ) DEALLOCATE ( psi_d, hpsi_d, spsi_d ) CALL mp_type_free( column_type ) diff --git a/PW/src/c_bands.f90 b/PW/src/c_bands.f90 index b1908d38e..dc504f0c1 100644 --- a/PW/src/c_bands.f90 +++ b/PW/src/c_bands.f90 @@ -637,6 +637,9 @@ SUBROUTINE diag_bands( iter, ik, avg_iter ) END IF ELSE ! +!civn +! ethr = MAX( ethr, 1.D-13 ) +! IF ( .not. use_gpu ) THEN CALL using_evc(1); CALL using_et(1); CALL using_h_diag(0) CALL paro_k_new( h_psi, s_psi, hs_psi, g_1psi, okvan, &