Aligned cegterg_gpu to CPU version.

This commit is contained in:
Pietro Bonfa 2018-11-13 13:32:25 +01:00
parent db6593df01
commit a0470a9e67
1 changed files with 153 additions and 148 deletions

View File

@ -7,15 +7,12 @@
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
#if defined(__CUDA)
!
!----------------------------------------------------------------------------
SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
npw, npwx, nvec, nvecx, npol, evc_d, ethr, &
e_d, btype, notcnv, lrot, dav_iter )
!----------------------------------------------------------------------------
! PB : 17/8/18, restored original algorithm since aligned version
! in commit 354a86b is much slower, probably due to MPI and/or
! load unbalance. More careful analysis needed.
!
! ... iterative solution of the eigenvalue problem:
!
@ -24,13 +21,18 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
! ... where H is an hermitean operator, e is a real scalar,
! ... S is an overlap matrix, evc is a complex vector
!
#if defined(__CUDA)
use cudafor
use cublas
#endif
USE LAXlib, ONLY : diaghg
USE david_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id,&
nbgrp, my_bgrp_id
USE mp, ONLY : mp_sum, mp_bcast
nbgrp, my_bgrp_id
USE mp, ONLY : mp_sum, mp_gather, mp_bcast, mp_size,&
mp_type_create_column_section, mp_type_free
USE gbuffers, ONLY : gbuf => pin_buf
USE cuda_util, ONLY : cuf_memcpy, cuf_memset
!
IMPLICIT NONE
!
@ -41,7 +43,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
! maximum dimension of the reduced basis set :
! (the basis set is refreshed when its dimension would exceed nvecx)
! number of spin polarizations
COMPLEX(DP), DEVICE, INTENT(INOUT) :: evc_d(npwx,npol,nvec)
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx,npol,nvec)
! evc contains the refined estimates of the eigenvectors
REAL(DP), INTENT(IN) :: ethr
! energy threshold for convergence :
@ -53,14 +55,16 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
! band type ( 1 = occupied, 0 = empty )
LOGICAL, INTENT(IN) :: lrot
! .TRUE. if the wfc have already been rotated
REAL(DP), DEVICE, INTENT(OUT) :: e_d(nvec)
REAL(DP), INTENT(OUT) :: e_d(nvec)
! contains the estimated roots.
INTEGER, INTENT(OUT) :: dav_iter, notcnv
! integer number of iterations performed
! number of unconverged roots
#if defined(__CUDA)
attributes(DEVICE) :: evc_d, e_d
#endif
!
! ... LOCAL variables
!@njs: hc, sc, vc, psi, hpsi, spsi
!
INTEGER, PARAMETER :: maxter = 20
! maximum number of iterations
@ -72,14 +76,16 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
! adapted npw and npwx
! do-loop counters
INTEGER :: n_start, n_end, my_n
INTEGER :: column_section_type
! defines a column section for communication
INTEGER :: ierr
COMPLEX(DP), DEVICE, ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
COMPLEX(DP), ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
! Hamiltonian on the reduced basis
! S matrix on the reduced basis
! the eigenvectors of the Hamiltonian
REAL(DP), DEVICE, ALLOCATABLE :: ew_d(:)
REAL(DP), ALLOCATABLE :: ew_d(:)
! eigenvalues of the reduced hamiltonian
COMPLEX(DP), DEVICE, ALLOCATABLE :: psi_d(:,:,:), hpsi_d(:,:,:), spsi_d(:,:,:)
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:,:), hpsi_d(:,:,:), spsi_d(:,:,:)
! work space, contains psi
! the product of H and psi
! the product of S and psi
@ -87,11 +93,18 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
! true if the root is converged
REAL(DP) :: empty_ethr
! threshold for empty bands
INTEGER, ALLOCATABLE :: recv_counts(:), displs(:)
! receive counts and memory offsets
COMPLEX(DP), POINTER :: pinned_buffer(:,:)
! auxiliary variable for performing MPI operation and overcome CUDAFortran limitations
REAL(DP), ALLOCATABLE :: ew_host(:)
REAL(DP), ALLOCATABLE :: e_host(:)
! auxiliary variables for performing dot product
INTEGER :: i,j,k
!
!
#if defined(__CUDA)
attributes(DEVICE) :: hc_d, sc_d, vc_d, ew_d, psi_d, hpsi_d, spsi_d
#endif
!
EXTERNAL h_psi_gpu, s_psi_gpu, g_psi_gpu
! h_psi(npwx,npw,nvec,psi,hpsi)
@ -157,23 +170,20 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
ALLOCATE( conv( nvec ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' cegterg ',' cannot allocate conv ', ABS(ierr) )
ALLOCATE( recv_counts(mp_size(inter_bgrp_comm)), displs(mp_size(inter_bgrp_comm)) )
!
! This buffer is used to perform MPI calls with non-contiguous slices.
! In order to limit the number of allocated buffers, a rather large,
! but hopefully 'repetitive' size is selected (as of today buffers are
! selected according to the leading dimension(s) )
!
CALL gbuf%lock_buffer(pinned_buffer, (/nvecx, nvecx/), ierr)
!
notcnv = nvec
nbase = nvec
conv = .FALSE.
!
IF ( uspp ) spsi_d = ZERO
!
hpsi_d = ZERO
psi_d = ZERO
!$cuf kernel do(3) <<<*,*>>>
DO k=1,nvec
DO j=1,npol
DO i=1,npwx
psi_d(i,j,k) = evc_d(i,j,k)
END DO
END DO
END DO
CALL cuf_memcpy(psi_d, evc_d, (/1 , npwx/), (/1 , npol/), (/1 ,nvec/))
!
! ... hpsi contains h times the basis vectors
!
@ -187,17 +197,22 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
! ... space vc contains the eigenvectors of hc
!
CALL start_clock( 'cegterg:init' )
hc_d(:,:) = ZERO
sc_d(:,:) = ZERO
vc_d(:,:) = ZERO
!
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
CALL divide_all(inter_bgrp_comm,nbase,n_start,n_end,recv_counts,displs)
CALL mp_type_create_column_section(sc_d(1,1), 0, nbase, nvecx, column_section_type)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', nbase, my_n, kdim, ONE, psi_d, kdmx, hpsi_d(1,1,n_start), kdmx, ZERO, hc_d(1,n_start), nvecx )
CALL mp_sum( hc_d( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( hc_d( :, 1:nbase ), intra_bgrp_comm )
if (n_start .le. n_end) then
!
pinned_buffer(1:nbase, n_start:n_end) = hc_d( 1:nbase, n_start:n_end )
CALL mp_sum( pinned_buffer(1:nbase, n_start:n_end), intra_bgrp_comm )
hc_d( 1:nbase, n_start:n_end ) = pinned_buffer(1:nbase, n_start:n_end)
!
end if
CALL mp_gather( hc_d, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
!
IF ( uspp ) THEN
!
@ -212,14 +227,39 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
ZERO, sc_d(1,n_start), nvecx )
!
END IF
CALL mp_sum( sc_d( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( sc_d( :, 1:nbase ), intra_bgrp_comm )
if ((n_start .le. n_end) .and. (mp_size(intra_bgrp_comm) > 1 )) then
pinned_buffer(1:nbase, n_start:n_end) = sc_d( 1:nbase, n_start:n_end )
CALL mp_sum( pinned_buffer( 1:nbase, n_start:n_end ), intra_bgrp_comm )
sc_d( 1:nbase, n_start:n_end ) = pinned_buffer(1:nbase, n_start:n_end)
end if
CALL mp_gather( sc_d, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
!
CALL mp_type_free( column_section_type )
!
!$cuf kernel do
DO n = 1, nbase
!
! ... the diagonal of hc and sc must be strictly real
!
hc_d(n,n) = CMPLX( REAL( hc_d(n,n) ), 0.D0 ,kind=DP)
sc_d(n,n) = CMPLX( REAL( sc_d(n,n) ), 0.D0 ,kind=DP)
!
DO m = n + 1, nbase
!
hc_d(n,m) = CONJG( hc_d(m,n) )
sc_d(n,m) = CONJG( sc_d(m,n) )
!
END DO
!
END DO
!
CALL stop_clock( 'cegterg:init' )
!
IF ( lrot ) THEN
!
CALL cuf_memset(vc_d, ZERO, (/1, nbase/), (/1, nbase/))
!
!$cuf kernel do(1) <<<*,*>>>
DO n = 1, nbase
!
@ -229,6 +269,8 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
END DO
!
CALL mp_bcast( e_d, root_bgrp_id, inter_bgrp_comm )
!
ELSE
!
! ... diagonalize the reduced hamiltonian
@ -243,10 +285,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
ENDIF
CALL stop_clock( 'cegterg:diag' )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, nvec
e_d(i) = ew_d(i)
END DO
CALL cuf_memcpy (e_d, ew_d, (/ 1, nvec /) )
!
END IF
!
@ -292,14 +331,6 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$cuf kernel do(3) <<<*,*>>>
DO i=1, notcnv
DO j=1,npol
DO k=npw,npwx ! only cleanup what needs to be cleaned up
psi_d(k,j,nbase+i)=ZERO
END DO
END DO
END DO
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
@ -315,8 +346,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
END IF
! NB: must not call mp_sum over inter_bgrp_comm here because it is done later to the full correction
!
!$cuf kernel do(3) <<<*,*>>>
!$cuf kernel do(3) <<<*,*>>>
DO np=1,notcnv
DO j=1,npol
DO k=1,npwx
@ -330,6 +360,9 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
ONE, psi_d(1,1,nb1), kdmx )
CALL mp_sum( psi_d(:,:,nb1:nbase+notcnv), inter_bgrp_comm )
!
! clean up garbage if there is any
IF (npw < npwx) CALL cuf_memset(psi_d, ZERO, (/npw+1,npwx/),(/1, npol/),(/nb1, nbase+notcnv/))
!
CALL stop_clock( 'cegterg:update' )
!
! ... approximate inverse iteration
@ -366,7 +399,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!$cuf kernel do(3) <<<*,*>>>
DO i = 1,notcnv
DO j=1,npol
DO k=1,npwx
DO k=1,npw
psi_d(k,j,nbase+i) = psi_d(k,j,nbase+i)/SQRT( ew_d(i) )
END DO
END DO
@ -382,33 +415,42 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
CALL start_clock( 'cegterg:overlap' )
!
!$cuf kernel do(2) <<<*,*>>>
DO i=0,notcnv-1
DO j=1, nvecx
hc_d( j, nb1+i )=ZERO
sc_d( j, nb1+i )=ZERO
END DO
END DO
CALL divide_all(inter_bgrp_comm,nbase+notcnv,n_start,n_end,recv_counts,displs)
CALL mp_type_create_column_section(sc_d(1,1), nbase, notcnv, nvecx, column_section_type)
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
!
CALL ZGEMM( 'C','N', notcnv, my_n, kdim, ONE, hpsi_d(1,1,nb1), kdmx, psi_d(1,1,n_start), kdmx, &
ZERO, hc_d(nb1,n_start), nvecx )
!
if ((n_start .le. n_end) .and. (mp_size(intra_bgrp_comm) > 1 )) then
pinned_buffer(nb1:nbase+notcnv, n_start:n_end) = hc_d( nb1:nbase+notcnv, n_start:n_end )
CALL mp_sum( pinned_buffer( nb1:nbase+notcnv, n_start:n_end ), intra_bgrp_comm )
hc_d( nb1:nbase+notcnv, n_start:n_end ) = pinned_buffer(nb1:nbase+notcnv, n_start:n_end)
end if
CALL mp_gather( hc_d, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
!
CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
CALL ZGEMM( 'C','N', my_n, notcnv, kdim, ONE, psi_d(1,1,n_start), kdmx, hpsi_d(1,1,nb1), kdmx, &
ZERO, hc_d(n_start,nb1), nvecx )
CALL mp_sum( hc_d( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
CALL mp_sum( hc_d( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!
IF ( uspp ) THEN
!
CALL ZGEMM( 'C','N', my_n, notcnv, kdim, ONE, psi_d(1,1,n_start), kdmx, spsi_d(1,1,nb1), kdmx, &
ZERO, sc_d(n_start,nb1), nvecx )
CALL ZGEMM( 'C','N', notcnv, my_n, kdim, ONE, spsi_d(1,1,nb1), kdmx, psi_d(1,1,n_start), kdmx, &
ZERO, sc_d(nb1,n_start), nvecx )
!
ELSE
!
CALL ZGEMM( 'C','N', my_n, notcnv, kdim, ONE, psi_d(1,1,n_start), kdmx, psi_d(1,1,nb1), kdmx, &
ZERO, sc_d(n_start,nb1), nvecx )
CALL ZGEMM( 'C','N', notcnv, my_n, kdim, ONE, psi_d(1,1,nb1), kdmx, psi_d(1,1,n_start), kdmx, &
ZERO, sc_d(nb1,n_start), nvecx )
!
END IF
CALL mp_sum( sc_d( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
CALL mp_sum( sc_d( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!
if ( (n_start .le. n_end) .and. (mp_size(intra_bgrp_comm) > 1 ) ) then
pinned_buffer( nb1:nbase+notcnv, n_start:n_end ) = sc_d( nb1:nbase+notcnv, n_start:n_end )
CALL mp_sum( pinned_buffer( nb1:nbase+notcnv, n_start:n_end ), intra_bgrp_comm )
sc_d( nb1:nbase+notcnv, n_start:n_end ) = pinned_buffer( nb1:nbase+notcnv, n_start:n_end )
end if
CALL mp_gather( sc_d, column_section_type, recv_counts, displs, root_bgrp_id, inter_bgrp_comm )
!
CALL mp_type_free( column_section_type )
!
CALL stop_clock( 'cegterg:overlap' )
!
@ -417,15 +459,17 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!$cuf kernel do(1) <<<*,*>>>
DO n = 1, nbase
!
! ... the diagonal of hc and sc must be strictly real
! ... the diagonal of hc and sc must be strictly real
!
hc_d(n,n) = CMPLX( REAL( hc_d(n,n) ), 0.D0 ,kind=DP)
sc_d(n,n) = CMPLX( REAL( sc_d(n,n) ), 0.D0 ,kind=DP)
IF( n>=nb1 ) THEN
hc_d(n,n) = CMPLX( REAL( hc_d(n,n) ), 0.D0 ,kind=DP)
sc_d(n,n) = CMPLX( REAL( sc_d(n,n) ), 0.D0 ,kind=DP)
ENDIF
!
DO m = n + 1, nbase
DO m = MAX(n+1,nb1), nbase
!
hc_d(m,n) = CONJG( hc_d(n,m) )
sc_d(m,n) = CONJG( sc_d(n,m) )
hc_d(n,m) = CONJG( hc_d(m,n) )
sc_d(n,m) = CONJG( sc_d(m,n) )
!
END DO
!
@ -461,10 +505,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
notcnv = COUNT( .NOT. conv(:) )
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1,nvec
e_d(i) = ew_d(i)
END DO
CALL cuf_memcpy (e_d, ew_d, (/ 1, nvec /) )
!
! ... if overall convergence has been achieved, or the dimension of
! ... the reduced basis set is becoming too large, or in any case if
@ -477,15 +518,6 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
CALL start_clock( 'cegterg:last' )
!
! Only reset what needs to be reset
!$cuf kernel do(3) <<<*,*>>>
DO k=1,nvec
DO j=1,npol
DO i=npw,npwx
evc_d(i,j,k) = ZERO
END DO
END DO
END DO
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
CALL ZGEMM( 'N','N', kdim, nvec, my_n, ONE, psi_d(1,1,n_start), kdmx, vc_d(n_start,1), nvecx, &
@ -515,81 +547,44 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
! ... refresh psi, H*psi and S*psi
!
!$cuf kernel do(3) <<<*,*>>>
DO i=1,nvec
DO j=1,npol
DO k=1,npwx
psi_d(k,j,i) = evc_d(k,j,i)
END DO
END DO
END DO
CALL cuf_memcpy(psi_d, evc_d, (/ 1, npwx /), (/1 , npol /), (/1, nvec/))
!
IF ( uspp ) THEN
!
!psi_d(:,:,nvec+1:nvec+nvec) = ZERO (only clean what needs to be cleaned)
!$cuf kernel do(3) <<<*,*>>>
DO i=1, nvec
DO j=1, npol
DO k=npw, npwx
psi_d(k,j,nvec+i) = ZERO
END DO
END DO
END DO
CALL ZGEMM( 'N','N', kdim, nvec, my_n, ONE, spsi_d(1,1,n_start), kdmx, vc_d(n_start,1), nvecx, &
ZERO, psi_d(1,1,nvec+1), kdmx)
CALL mp_sum( psi_d(:,:,nvec+1:nvec+nvec), inter_bgrp_comm )
!
!spsi_d(:,:,1:nvec) = psi_d(:,:,nvec+1:nvec+nvec)
!$cuf kernel do(3) <<<*,*>>>
DO i=1,nvec
DO j=1, npol
DO k=1, npwx
spsi_d(k,j,i) = psi_d(k,j,i+nvec)
END DO
END DO
END DO
CALL cuf_memcpy(spsi_d, psi_d(:,:,nvec+1:), (/1 ,npwx/), (/1,npol/), (/1, nvec/))
CALL mp_sum( spsi_d(:,:,1:nvec), inter_bgrp_comm )
!
END IF
!
!psi_d(:,:,nvec+1:nvec+nvec) = ZERO (only clean what needs to be cleaned)
!$cuf kernel do(3) <<<*,*>>>
DO i=1, nvec
DO j=1, npol
DO k=npw, npwx
psi_d(k,j,nvec+i) = ZERO
END DO
END DO
END DO
CALL ZGEMM( 'N','N', kdim, nvec, my_n, ONE, hpsi_d(1,1,n_start), kdmx, vc_d(n_start,1), nvecx, &
ZERO, psi_d(1,1,nvec+1), kdmx )
CALL mp_sum( psi_d(:,:,nvec+1:nvec+nvec), inter_bgrp_comm )
!
!hpsi_d(:,:,1:nvec) = psi_d(:,:,nvec+1:nvec+nvec)
!$cuf kernel do(3) <<<*,*>>>
DO i=1,nvec
DO j=1, npol
DO k=1, npwx
hpsi_d(k,j,i) = psi_d(k,j,i+nvec)
END DO
END DO
END DO
CALL cuf_memcpy(hpsi_d, psi_d(:,:,nvec+1:), (/1 ,npwx/), (/1,npol/), (/1, nvec/))
CALL mp_sum( hpsi_d(:,:,1:nvec), inter_bgrp_comm )
!
! ... refresh the reduced hamiltonian
!
nbase = nvec
!
hc_d(:,1:nbase) = ZERO
sc_d(:,1:nbase) = ZERO
vc_d(:,1:nbase) = ZERO
! These variables are set to ZERO in the CUF Kernel below
!hc_d(1:nbase,1:nbase) = ZERO
!sc_d(1:nbase,1:nbase) = ZERO
!vc_d(1:nbase,1:nbase) = ZERO
!
!$cuf kernel do(1) <<<*,*>>>
!$cuf kernel do(2) <<<*,*>>>
DO n = 1, nbase
!
! hc(n,n) = REAL( e(n) )
hc_d(n,n) = CMPLX( e_d(n), 0.0_DP ,kind=DP)
!
sc_d(n,n) = ONE
vc_d(n,n) = ONE
DO j = 1, nbase
!
IF ( j == n ) THEN
hc_d(j,n) = CMPLX( e_d(n), 0.0_DP ,kind=DP)
!
sc_d(j,n) = ONE
vc_d(j,n) = ONE
ELSE
hc_d(j,n) = ZERO; sc_d(j,n) = ZERO; vc_d(j,n) = ZERO
END IF
END DO
!
END DO
!
@ -599,6 +594,9 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
!
END DO iterate
!
CALL gbuf%release_buffer(pinned_buffer, ierr)
DEALLOCATE( recv_counts )
DEALLOCATE( displs )
DEALLOCATE( conv )
DEALLOCATE( e_host, ew_host, ew_d )
DEALLOCATE( vc_d )
@ -628,13 +626,19 @@ SUBROUTINE reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
implicit none
INTEGER, INTENT(IN) :: nbase, nvec, nvecx
LOGICAL, INTENT(IN) :: conv(nvec)
REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
REAL(DP) :: e_d(nvecx), ew_d(nvecx)
COMPLEX(DP) :: v_d(nvecx,nvecx)
#if defined(__CUDA)
attributes(DEVICE) :: e_d, ew_d, v_d
#endif
!
INTEGER :: j, k, n, np, info
INTEGER, ALLOCATABLE :: conv_idx(:)
INTEGER, DEVICE, POINTER :: conv_idx_d(:)
COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
INTEGER, POINTER :: conv_idx_d(:)
COMPLEX(DP), POINTER :: vtmp_d(:,:)
#if defined(__CUDA)
attributes(DEVICE) :: conv_idx_d, vtmp_d
#endif
!
np = 0
ALLOCATE(conv_idx(nvec))
@ -674,6 +678,7 @@ SUBROUTINE reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
DEALLOCATE(conv_idx)
END SUBROUTINE reorder_evals_cevecs
#if defined(__CUDA)
!
! Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
!