Merge branch 'develop_omp5' into 'develop_omp5'

regterg to OpenMP offload

See merge request icarnimeo/q-e!11
Ivan Carnimeo 2023-04-20 13:45:19 +00:00
commit a864194949
10 changed files with 573 additions and 304 deletions
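The changes below follow one recurring OpenMP-offload pattern: device copies of the work arrays are created with target enter data, the hot loops run under target teams distribute parallel do, and target update keeps the host and device copies coherent around MPI calls and external routines. A minimal, self-contained sketch of that pattern (the array name and the work done in the loop are illustrative, not taken from the diff):

SUBROUTINE offload_pattern_sketch( n, a )
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: n
   COMPLEX(8), INTENT(INOUT) :: a(n)
   INTEGER :: i
   !
   ! create a device copy of a and transfer the host data to it
   !$omp target enter data map(to:a)
   !
   ! offloaded loop, the analogue of the "target teams distribute parallel do" loops below
   !$omp target teams distribute parallel do
   DO i = 1, n
      a(i) = 2.d0 * a(i)
   END DO
   !
   ! refresh the host copy before host-side work (MPI reductions, external BLAS, ...)
   !$omp target update from(a)
   ! ... host-side work on a ...
   !$omp target update to(a)
   !
   ! release the device copy
   !$omp target exit data map(delete:a)
END SUBROUTINE offload_pattern_sketch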

View File

@ -1387,7 +1387,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
INTEGER :: k, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom, offset
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz
!
INTEGER :: iter, dest, sorc
INTEGER :: iter, dest, sorc, ncp_me, npp_gproc
INTEGER :: istatus(MPI_STATUS_SIZE)
me = dfft%mype + 1
@ -1422,22 +1422,24 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
!$omp target data use_device_addr(f_in)
#endif
DO gproc = 1, nprocp
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
kdest = ( gproc - 1 ) * ncpx
kfrom = ( gproc - 1 ) * npp_(gproc)
istat = int(omp_target_memcpy_rect(c_loc(f_aux), c_loc(f_in), &
int(sizeof(dummy),c_size_t), &
int(2,c_int), &
int((/ ncp_(me), npp_(gproc) /),c_size_t), &
int((/ kdest, 0 /),c_size_t), &
int((/ 0, kfrom /),c_size_t), &
int((/ (nxx_/nppx), nppx /),c_size_t), &
int((/ (nxx_/nr3x), nr3x /),c_size_t), &
kfrom = ( gproc - 1 ) * nppx
istat = int(omp_target_memcpy_rect(c_loc(f_aux), c_loc(f_in), &
int(sizeof(dummy),c_size_t), &
int(2,c_int), &
int((/ ncp_me, npp_gproc /),c_size_t), &
int((/ kdest, 0 /),c_size_t), &
int((/ 0, kfrom /),c_size_t), &
int((/ (nxx_/nppx), nppx /),c_size_t), &
int((/ (nxx_/nr3x), nr3x /),c_size_t), &
#ifdef __GPU_MPI
int(omp_get_default_device(),c_int), &
int(omp_get_default_device(),c_int), &
#else
int(omp_get_initial_device(),c_int), &
int(omp_get_initial_device(),c_int), &
#endif
int(omp_get_default_device(),c_int)), &
int(omp_get_default_device(),c_int)), &
kind(istat))
ENDDO
!$omp end target data
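For reference, a stand-alone sketch of the omp_target_memcpy_rect call used above: it copies a 2-D sub-block between a host array and a device-resident array that have different leading dimensions, with the second (last) dimension taken as the contiguous one, as in the packing loop it replaces. Buffer names and extents are illustrative and an OpenMP 5.x runtime is assumed.

PROGRAM memcpy_rect_sketch
   USE omp_lib
   USE, INTRINSIC :: iso_c_binding
   IMPLICIT NONE
   INTEGER, PARAMETER :: nrow   = 4     ! outer (slow) extent of the copy
   INTEGER, PARAMETER :: ld_src = 8     ! contiguous leading dimension on the host
   INTEGER, PARAMETER :: ld_dst = 6     ! contiguous leading dimension on the device
   COMPLEX(c_double_complex), ALLOCATABLE, TARGET :: src(:,:), dst(:,:)
   COMPLEX(c_double_complex) :: dummy
   INTEGER :: istat
   !
   ALLOCATE( src(ld_src,nrow), dst(ld_dst,nrow) )
   src = (1.d0, 0.d0)
   !$omp target enter data map(alloc:dst)
   !
   ! copy an nrow x ld_dst block from the host buffer into the device buffer
   !$omp target data use_device_addr(dst)
   istat = int(omp_target_memcpy_rect(c_loc(dst), c_loc(src),   &
               c_sizeof(dummy),                                 &  ! element size
               int(2,c_int),                                    &  ! rank of the copy
               int((/ nrow, ld_dst /),c_size_t),                &  ! volume to copy
               int((/ 0, 0 /),c_size_t),                        &  ! dst offsets
               int((/ 0, 0 /),c_size_t),                        &  ! src offsets
               int((/ nrow, ld_dst /),c_size_t),                &  ! dst dimensions
               int((/ nrow, ld_src /),c_size_t),                &  ! src dimensions
               int(omp_get_default_device(),c_int),             &  ! dst lives on the device
               int(omp_get_initial_device(),c_int)),            &  ! src lives on the host
               kind(istat))
   !$omp end target data
   !
   !$omp target exit data map(delete:dst)
   DEALLOCATE( src, dst )
END PROGRAM memcpy_rect_sketch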
@ -1447,13 +1449,15 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
DO gproc = 1, nprocp
kdest = ( gproc - 1 ) * sendsiz
kfrom = offset
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
!$omp target teams distribute parallel do collapse(2)
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
DO k = 1, ncp_me
DO i = 1, npp_gproc
f_aux( kdest + i + (k-1)*nppx ) = f_in( kfrom + i + (k-1)*nr3x )
END DO
END DO
offset = offset + npp_ ( gproc )
offset = offset + npp_gproc
ENDDO
#ifndef __GPU_MPI
!$omp target update from (f_aux)
@ -1513,7 +1517,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
10 CONTINUE
!$omp target teams distribute parallel do
do i = lbound(f_aux,1), ubound(f_aux,1)
do i = 1, nxx_
f_aux(i) = (0.d0, 0.d0)
end do
!$omp end target teams distribute parallel do
@ -1559,7 +1563,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
DO ip = 1, nprocp
ioff = dfft%iss( ip )
nswip = dfft%nsp( ip )
!$omp target teams distribute parallel do collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO omp_j = 1, npp
DO omp_i = 1, nswip
mc = dfft%ismap( omp_i + ioff )
@ -1567,13 +1571,12 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
f_in( omp_j + it ) = f_aux( mc + ( omp_j - 1 ) * nnp )
ENDDO
ENDDO
!$omp end target teams distribute parallel do
ENDDO
ELSE
DO gproc = 1, nprocp
ioff = dfft%iss( gproc )
nswip = dfft%nsw( gproc )
!$omp target teams distribute parallel do collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO omp_j = 1, npp
DO omp_i = 1, nswip
mc = dfft%ismap( omp_i + ioff )
@ -1581,7 +1584,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
f_in( omp_j + it ) = f_aux( mc + ( omp_j - 1 ) * nnp )
ENDDO
ENDDO
!$omp end target teams distribute parallel do
ENDDO
END IF
!
@ -1592,12 +1594,12 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
gcomm = dfft%comm
!
#ifndef __GPU_MPI
!$omp target update from (f_in(1:sendsiz*nprocp))
!$omp target update from (f_in(1:sendsiz*nprocp))
#endif
!
CALL start_clock ('a2a_bw')
#ifdef __GPU_MPI
!$omp target data use_device_ptr(f_in, f_aux)
!$omp target data use_device_ptr(f_in, f_aux)
DO iter = 2, nprocp
IF(IAND(nprocp, nprocp-1) == 0) THEN
sorc = IEOR( me-1, iter-1 )
@ -1620,14 +1622,13 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
ENDDO
!$omp target teams distribute parallel do
!$omp target teams distribute parallel do
DO i=(me-1)*sendsiz + 1, me*sendsiz
f_aux(i) = f_in(i)
ENDDO
!$omp end target teams distribute parallel do
call MPI_WAITALL(2*nprocp-2, srh, MPI_STATUSES_IGNORE, ierr)
!$omp end target data
!$omp end target data
#else
CALL mpi_alltoall (f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr)
#ifndef __MEMCPY_RECT
@ -1649,21 +1650,23 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
!$omp target data use_device_addr(f_in)
#endif
DO gproc = 1, nprocp
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
kdest = ( gproc - 1 ) * ncpx
kfrom = ( gproc - 1 ) * npp_(gproc)
istat = int(omp_target_memcpy_rect(c_loc(f_in), c_loc(f_aux), &
int(sizeof(dummy),c_size_t), &
int(2,c_int), &
int((/ ncp_(me), npp_(gproc) /),c_size_t), &
int((/ 0, kfrom /),c_size_t), &
int((/ kdest, 0 /),c_size_t), &
int((/ (nxx_/nr3x), nr3x /),c_size_t), &
int((/ (nxx_/nppx), nppx /),c_size_t), &
int(omp_get_default_device(),c_int), &
kfrom = ( gproc - 1 ) * nppx
istat = int(omp_target_memcpy_rect(c_loc(f_in), c_loc(f_aux), &
int(sizeof(dummy),c_size_t), &
int(2,c_int), &
int((/ ncp_me, npp_gproc /),c_size_t), &
int((/ 0, kfrom /),c_size_t), &
int((/ kdest, 0 /),c_size_t), &
int((/ (nxx_/nr3x), nr3x /),c_size_t), &
int((/ (nxx_/nppx), nppx /),c_size_t), &
int(omp_get_default_device(),c_int), &
#ifdef __GPU_MPI
int(omp_get_default_device(),c_int)), &
int(omp_get_default_device(),c_int)), &
#else
int(omp_get_initial_device(),c_int)), &
int(omp_get_initial_device(),c_int)), &
#endif
kind(istat))
ENDDO
@ -1674,13 +1677,15 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
DO gproc = 1, nprocp
kdest = ( gproc - 1 ) * sendsiz
kfrom = offset
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
!$omp target teams distribute parallel do collapse(2)
DO k = 1, ncp_(me)
DO i = 1, npp_( gproc )
DO k = 1, ncp_me
DO i = 1, npp_gproc
f_in( kfrom + i + (k-1)*nr3x ) = f_aux( kdest + i + (k-1)*nppx )
END DO
END DO
offset = offset + npp_( gproc )
offset = offset + npp_gproc
ENDDO
!
#endif

View File

@ -322,9 +322,6 @@ CONTAINS
ALLOCATE( desc%tg_rdsp( desc%nproc2) ) ; desc%tg_rdsp = 0
#if defined (__OPENMP_GPU)
#ifndef __CRAY
!$omp target enter data map(alloc:desc)
#endif
!$omp target enter data map(always,alloc:desc%nsp)
!$omp target enter data map(always,alloc:desc%nsw)
!$omp target enter data map(always,alloc:desc%ismap)
@ -337,7 +334,7 @@ CONTAINS
nsubbatches = ceiling(real(desc%batchsize)/desc%subbatchsize)
ALLOCATE( desc%srh(2*nproc, nsubbatches))
!$omp target enter data map(alloc:desc%srh)
!$omp target enter data map(always,alloc:desc%srh)
#endif
#if defined(__CUDA)
@ -429,9 +426,6 @@ CONTAINS
IF (OMP_TARGET_IS_PRESENT(c_loc(desc%nlm), OMP_GET_DEFAULT_DEVICE()) == 1) THEN
!$omp target exit data map(delete:desc%nlm)
ENDIF
#ifndef __CRAY
!$omp target exit data map(delete:desc)
#endif
#endif
IF ( ALLOCATED( desc%aux ) ) DEALLOCATE( desc%aux )
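The hunks above drop the mapping of the descriptor itself and map its allocatable components individually, deleting on exit only what is actually present on the device. A condensed sketch of that pattern, with illustrative type and field names, components assumed allocated, and an OpenMP 5.x compiler assumed:

MODULE desc_map_sketch
   USE omp_lib
   USE, INTRINSIC :: iso_c_binding
   IMPLICIT NONE
   TYPE :: fft_desc_t                    ! illustrative stand-in for the real descriptor
      INTEGER, ALLOCATABLE :: nsp(:), ismap(:)
   END TYPE fft_desc_t
CONTAINS
   SUBROUTINE map_desc( d )
      TYPE(fft_desc_t), TARGET, INTENT(INOUT) :: d
      ! map the allocatable components one by one instead of the derived type itself
      !$omp target enter data map(always,alloc:d%nsp)
      !$omp target enter data map(always,alloc:d%ismap)
   END SUBROUTINE map_desc
   SUBROUTINE unmap_desc( d )
      TYPE(fft_desc_t), TARGET, INTENT(INOUT) :: d
      ! delete only the components that are actually present on the device
      IF (omp_target_is_present(c_loc(d%nsp), omp_get_default_device()) == 1) THEN
         !$omp target exit data map(delete:d%nsp)
      END IF
      IF (omp_target_is_present(c_loc(d%ismap), omp_get_default_device()) == 1) THEN
         !$omp target exit data map(delete:d%ismap)
      END IF
   END SUBROUTINE unmap_desc
END MODULE desc_map_sketch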

View File

@ -18,7 +18,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
USE device_memcpy_m, ONLY: dev_memcpy
!
@ -31,7 +31,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
INTEGER, INTENT(IN) :: &
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
npwx, & ! leading dimension of the wavefunction-related matrices
nstart, & ! input number of states
nstart, & ! input number of states
nbnd ! output number of states
COMPLEX(DP), INTENT(INOUT) :: psi(npwx,nstart), hpsi(npwx,nstart) ! input and output psi, Hpsi,
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi(npwx,nstart) ! ... and optionally Spsi
@ -63,7 +63,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
if (overlap) spsi(1,1:nstart) = CMPLX( DBLE( spsi(1,1:nstart) ), 0.D0,kind=DP)
!$acc end kernels
END IF
kdim = 2 * npw
kdmx = 2 * npwx
!
@ -86,7 +86,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL MYDGEMM2( 'T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, hpsi(1,n_start),kdmx, 0.D0, hh(1,n_start),nstart,.FALSE. )
IF ( gstart == 2 ) call MYDGER( nstart, my_n, -1.D0, psi,kdmx, hpsi(1,n_start),kdmx, hh(1,n_start),nstart )
IF ( gstart == 2 ) call MYDGER2( nstart, my_n, -1.D0, psi,kdmx, hpsi(1,n_start),kdmx, hh(1,n_start),nstart,.FALSE. )
call start_clock('rotHSw:hc:s1')
CALL mp_sum( hh(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s1')
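The 2.D0 prefactor in MYDGEMM2 together with the rank-1 MYDGER/MYDGER2 update (applied only when gstart == 2, i.e. when the G = 0 component is stored on this processor) implements the Gamma-point trick for real wavefunctions. A worked form of the identity, assuming Im psi(G=0) = 0 and that only half of the G sphere plus G = 0 is stored:

$$
\langle \psi_i | H\psi_j \rangle
  = \sum_{\mathbf G} \psi_i^*(\mathbf G)\,(H\psi_j)(\mathbf G)
  = 2\,\mathrm{Re}\sum_{\mathbf G\ \mathrm{stored}} \psi_i^*(\mathbf G)\,(H\psi_j)(\mathbf G)
    \;-\; \psi_i(0)\,(H\psi_j)(0) ,
$$

so the DGEMM over the real-valued view of the arrays (prefactor 2) produces the first term and the DGER (prefactor -1) removes the doubly counted G = 0 contribution.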
@ -99,13 +99,13 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
!
if (n_start .le. n_end) &
CALL MYDGEMM2('T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, spsi(1,n_start),kdmx, 0.D0, ss(1,n_start),nstart,.FALSE. )
IF ( gstart == 2 ) CALL MYDGER(nstart, my_n, -1.D0, psi,kdmx, spsi(1,n_start),kdmx, ss(1,n_start),nstart)
IF ( gstart == 2 ) CALL MYDGER2(nstart, my_n, -1.D0, psi,kdmx, spsi(1,n_start),kdmx, ss(1,n_start),nstart,.FALSE. )
!
ELSE
!
if (n_start .le. n_end) &
CALL MYDGEMM2('T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, psi(1,n_start),kdmx, 0.D0, ss(1,n_start),nstart,.FALSE. )
IF ( gstart == 2 ) CALL MYDGER(nstart, my_n, -1.D0, psi,kdmx, psi(1,n_start),kdmx, ss(1,n_start),nstart)
IF ( gstart == 2 ) CALL MYDGER2(nstart, my_n, -1.D0, psi,kdmx, psi(1,n_start),kdmx, ss(1,n_start),nstart,.FALSE. )
!
END IF
call start_clock('rotHSw:hc:s3')
@ -151,7 +151,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
if (n_start .le. n_end) &
CALL MYDGEMM2( 'N','N', kdim,my_n,nstart, 1.D0,hpsi,kdmx,vv(1,n_start),nstart, 0.D0, aux(1,n_start),kdmx,.FALSE. )
CALL dev_memcpy(hpsi, aux, [1, npwx], 1, [n_start,n_end])
CALL dev_memcpy(hpsi, aux, [1, npwx], 1, [n_start,n_end])
! call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
call start_clock('rotHSw:ev:s6')
CALL mp_allgather(hpsi(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
@ -161,7 +161,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
if (n_start .le. n_end) &
CALL MYDGEMM2( 'N','N', kdim,my_n,nstart, 1.D0,spsi,kdmx,vv(1,n_start),nstart, 0.D0, aux(1,n_start),kdmx,.FALSE. )
CALL dev_memcpy(spsi, aux, [1, npwx], 1, [n_start,n_end])
CALL dev_memcpy(spsi, aux, [1, npwx], 1, [n_start,n_end])
! call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
call start_clock('rotHSw:ev:s7')
CALL mp_allgather(spsi(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
@ -207,7 +207,7 @@ SUBROUTINE protate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, sps
!
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum, mp_barrier, mp_rank
USE mp, ONLY : mp_allgather, mp_type_create_column_section, mp_type_free
!
@ -411,7 +411,7 @@ CONTAINS
!write (6,*) 'nx: ', nx
!write (6,*) 'ipc, ipr, root', ipc, ipr, root
!write (6,*) 'ic, nc, ir, nr', ic, nc, ir, nr
!write (6,*) 'ic, nc, ir, nr', ic, nc, ir, nr
!do ix=1,nx
! write (6,'(16f12.8)') work(1:nx,ix)
!enddo
@ -469,7 +469,7 @@ CONTAINS
nr = idesc_ip(LAX_DESC_NR, ipr, ipc )
ir = idesc_ip(LAX_DESC_IR, ipr, ipc )
!
root = rank_ip( ipr, ipc ) ! this proc has the needed block
root = rank_ip( ipr, ipc ) ! this proc has the needed block
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) vtmp(:,1:nc) = vv(:,1:nc)
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm ) ! the other ones will receive it
!

View File

@ -5,9 +5,9 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! NOTE (Ivan Carnimeo, May, 05th, 2022):
! cegterg and regterg have been ported to GPU with OpenACC,
! the previous CUF versions (cegterg_gpu and regterg_gpu) have been removed,
! NOTE (Ivan Carnimeo, May, 05th, 2022):
! cegterg and regterg have been ported to GPU with OpenACC,
! the previous CUF versions (cegterg_gpu and regterg_gpu) have been removed,
! and now cegterg and regterg are used for both CPU and GPU execution.
! If you want to see the previous code checkout to commit: df3080b231c5daf52295c23501fbcaa9bfc4bfcc (on Thu Apr 21 06:18:02 2022 +0000)
!
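A condensed sketch of the dual OpenACC / OpenMP-offload pattern that this merge request introduces throughout regterg, either guarded by the __OPENMP_GPU preprocessor flag as below or by simply stacking both directive sets before a loop (loop body and names are illustrative):

SUBROUTINE zero_block_sketch( m, n, a )
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: m, n
   COMPLEX(8), INTENT(INOUT) :: a(m,n)
   INTEGER :: i, j
#if defined(__OPENMP_GPU)
   ! OpenMP offload build
   !$omp target teams distribute parallel do collapse(2)
#else
   ! OpenACC build
   !$acc parallel loop collapse(2)
#endif
   DO j = 1, n
      DO i = 1, m
         a(i,j) = (0.d0, 0.d0)
      END DO
   END DO
END SUBROUTINE zero_block_sketch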
@ -93,16 +93,16 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! the product of S and psi
LOGICAL, ALLOCATABLE :: conv(:)
! true if the root is converged
REAL(DP) :: empty_ethr
REAL(DP) :: empty_ethr
! threshold for empty bands
INTEGER :: i,j,k
!
REAL(DP), EXTERNAL :: MYDDOT_VECTOR_GPU
REAL(DP), EXTERNAL :: MYDDOT_VECTOR_GPU
!$acc routine(MYDDOT_VECTOR_GPU) vector
!
EXTERNAL h_psi, s_psi, g_psi
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! calculates H|psi>
! s_psi(npwx,npw,nvec,psi,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,nvec)
@ -110,8 +110,10 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
! the first nvec columns contain the trial eigenvectors
!
!$omp target enter data map(to:evc)
!$omp target enter data map(alloc:e)
CALL start_clock( 'regterg' ) !; write(6,*) 'enter regterg' ; FLUSH(6)
!
!
!$acc data deviceptr(evc, e)
!
IF ( nvec > nvecx / 2 ) CALL errore( 'regter', 'nvecx is too small', 1 )
@ -140,6 +142,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
ALLOCATE( spsi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' regterg ',' cannot allocate spsi ', ABS(ierr) )
!$omp target enter data map(alloc:spsi)
END IF
!
ALLOCATE( sr( nvecx, nvecx ), STAT=ierr )
@ -154,6 +157,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
ALLOCATE( ew( nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate ew ', ABS(ierr) )
!$omp target enter data map(alloc:sr, hr, vr, ew)
ALLOCATE( conv( nvec ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate conv ', ABS(ierr) )
@ -161,79 +165,137 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
notcnv = nvec
nbase = nvec
conv = .FALSE.
!
!
#if defined(__OPENMP_GPU)
!$omp target teams distribute parallel do collapse(2)
do j=1,nvecx
do i=1,npwx
hpsi(i,j) = ZERO
psi (i,j) = ZERO
enddo
enddo
IF ( uspp ) then
!$omp target teams distribute parallel do collapse(2)
do j=1,nvecx
do i=1,npwx
spsi(i,j) = ZERO
enddo
enddo
endif
!$omp target teams distribute parallel do
DO k=1,nvec
psi(1,k) = evc(1,k)
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (gstart == 2) psi(1,k) = CMPLX( DBLE( psi(1,k) ), 0.D0 ,kind=DP)
DO i=2,npwx
psi(i,k) = evc(i,k)
END DO
END DO
!$acc end parallel
#else
!$acc kernels
hpsi = ZERO
psi = ZERO
IF ( uspp ) spsi = ZERO
!$acc end kernels
!
!$acc parallel vector_length(64)
!$acc parallel vector_length(64)
!$acc loop gang independent
DO k=1,nvec
psi(1,k) = evc(1,k)
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (gstart == 2) psi(1,k) = CMPLX( DBLE( psi(1,k) ), 0.D0 ,kind=DP)
!$acc loop vector
!$acc loop vector
DO i=2,npwx
psi(i,k) = evc(i,k)
END DO
END DO
!$acc end parallel
#endif
!
! ... hpsi contains h times the basis vectors
!
!$acc host_data use_device(psi, hpsi, spsi)
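! (OpenMP offload path) refresh the host copies of psi and hpsi from the device
! before they are handed to the external h_psi / s_psi routines below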
!$omp target update from(psi,hpsi)
CALL h_psi( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nvec
!
! ... spsi contains s times the basis vectors
!
IF ( uspp ) CALL s_psi( npwx, npw, nvec, psi, spsi )
IF ( uspp ) then
!$omp target update from(spsi)
CALL s_psi( npwx, npw, nvec, psi, spsi )
endif
!$acc end host_data
!
! ... hr contains the projection of the hamiltonian onto the reduced
! ... space vr contains the eigenvectors of hr
!
CALL start_clock( 'regterg:init' )
CALL start_clock( 'regterg:init' )
#if !defined(__OPENMP_GPU)
!$acc kernels
hr(:,:) = 0.D0
sr(:,:) = 0.D0
vr(:,:) = 0.D0
!$acc end kernels
#else
!$omp target teams distribute parallel do collapse(2)
do j=1,nvecx
do i=1,nvecx
hr(i,j) = 0.D0
sr(i,j) = 0.D0
vr(i,j) = 0.D0
enddo
enddo
#endif
!
!$acc host_data use_device(psi, hpsi, spsi, hr, sr)
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
if (n_start .le. n_end) then
!$omp target update to(hpsi)
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
endif
IF ( gstart == 2 ) THEN
!$omp target update to(hpsi)
CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
ENDIF
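! (OpenMP offload path) hr is reduced with MPI on the host: copy it off the device,
! accumulate it with mp_sum, then push the summed block back with target update to(hr)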
!$omp target update from(hr)
CALL mp_sum( hr( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( hr( :, 1:nbase ), intra_bgrp_comm )
!$omp target update to(hr)
!
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
if (n_start .le. n_end) then
!$omp target update to(spsi)
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
endif
IF ( gstart == 2 ) THEN
!$omp target update to(spsi)
CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
ENDIF
!
ELSE
!
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, psi(1,n_start), npwx2, sr(1,n_start), nvecx )
!
END IF
!$omp target update from(sr)
CALL mp_sum( sr( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( sr( :, 1:nbase ), intra_bgrp_comm )
!$omp target update to(sr)
!$acc end host_data
!
CALL stop_clock( 'regterg:init' )
!
IF ( lrot ) THEN
!
!$acc parallel loop
!$acc parallel loop
!$omp target teams distribute parallel do
DO n = 1, nbase
!
e(n) = hr(n,n)
@ -254,10 +316,12 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL mp_bcast( vr, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
CALL stop_clock( 'regterg:diag' )
CALL stop_clock( 'regterg:diag' )
!$acc end host_data
!$omp target update to(vr,ew)
!
!$acc parallel loop
!$acc parallel loop
!$omp target teams distribute parallel do
DO i = 1, nvec
e(i) = ew(i)
END DO
@ -273,31 +337,34 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL start_clock( 'regterg:update' )
!
np = 0
!
!
DO n = 1, nvec
!
IF ( .NOT. conv(n) ) THEN
!
! ... this root not yet converged ...
! ... this root not yet converged ...
!
np = np + 1
!
! ... reorder eigenvectors so that coefficients for unconverged
! ... roots come first. This allows to use quick matrix-matrix
! ... roots come first. This allows to use quick matrix-matrix
! ... multiplications to set a new basis vector (see below)
!
IF ( np /= n ) THEN
!$acc parallel loop
IF ( np /= n ) THEN
!$acc parallel loop
!$omp target teams distribute parallel do
DO i = 1, nvecx
vr(i,np) = vr(i,n)
END DO
END DO
END IF
!
! ... for use in g_psi
!
!$acc kernels
!$acc kernels
!$omp target
ew(nbase+np) = e(n)
!$acc end kernels
!$omp end target
!
END IF
!
@ -310,6 +377,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1, notcnv
DO k=1,npwx
psi(k,nbase+i)=ZERO
@ -318,19 +386,22 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!$acc host_data use_device(psi, spsi, vr)
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
!
if (n_start .le. n_end) then
!$omp target update to(spsi)
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
endif
!
ELSE
!
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
!
END IF
!$acc end host_data
! NB: must not call mp_sum over inter_bgrp_comm here because it is done later to the full correction
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO np=1,notcnv
DO k=1,npwx
psi(k,nbase+np) = - ew(nbase+np) * psi(k,nbase+np)
@ -338,25 +409,32 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
END DO
!
!$acc host_data use_device(psi, hpsi, vr, ew)
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
if (n_start .le. n_end) then
!$omp target update to(hpsi)
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
endif
!
!$omp target update from(psi)
CALL mp_sum( psi(:,nb1:nbase+notcnv), inter_bgrp_comm )
!
CALL stop_clock( 'regterg:update' )
!
! ... approximate inverse iteration
!
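! (OpenMP offload path) copy the device copy of ew back to the host before it is
! passed to g_psi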
!$omp target update from(ew)
CALL g_psi( npwx, npw, notcnv, 1, psi(1,nb1), ew(nb1) )
!$acc end host_data
!
! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
! ... order to improve numerical stability of subspace diagonalization
! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
! ... order to improve numerical stability of subspace diagonalization
! ... (rdiaghg) ew is used as work array :
!
! ... ew = <psi_i|psi_i>, i = nbase + 1, nbase + notcnv
!
!$acc parallel vector_length(96)
!$acc loop gang private(nbn)
!$acc parallel vector_length(96)
!$acc loop gang private(nbn)
!$omp target update to(psi,ew)
!$omp target teams distribute private(nbn)
DO n = 1, notcnv
!
nbn = nbase + n
@ -366,18 +444,21 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
END DO
!$acc end parallel
!$omp target update from(ew)
!
!$acc host_data use_device(ew)
CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
!$acc end host_data
!
!$acc parallel vector_length(96)
!$acc loop gang
!$omp target update to(ew)
!$acc parallel vector_length(96)
!$acc loop gang
!$omp target teams distribute parallel do
DO i = 1,notcnv
psi(1,nbase+i) = psi(1,nbase+i)/SQRT( ew(i) )
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (gstart == 2) psi(1,nbase+i) = CMPLX( DBLE(psi(1,nbase+i)), 0.D0 ,kind=DP)
!$acc loop vector
!$acc loop vector
DO k=2,npwx
psi(k,nbase+i) = psi(k,nbase+i)/SQRT( ew(i) )
END DO
@ -387,9 +468,13 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! ... here compute the hpsi and spsi of the new functions
!
!$acc host_data use_device(psi, hpsi, spsi)
!$omp target update from(psi,hpsi)
CALL h_psi( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
!
IF ( uspp ) CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
IF ( uspp ) THEN
!$omp target update from(spsi)
CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
ENDIF
!$acc end host_data
!
! ... update the reduced hamiltonian
@ -397,6 +482,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL start_clock( 'regterg:overlap' )
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=0,notcnv-1
DO j=1, nvecx
hr( j, nb1+i )=0.d0
@ -406,14 +492,18 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!$acc host_data use_device(psi, hpsi, hr)
CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
!$omp target update to(hpsi)
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, hr(n_start,nb1), nvecx )
!$omp target update from(hr)
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
!
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!$omp target update to(hr)
!$acc end host_data
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=0,notcnv-1
DO j=1, nvecx
sr( j, nb1+i )=0.d0
@ -425,29 +515,33 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
IF ( uspp ) THEN
!
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
!$omp target update to(spsi)
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
!
ELSE
!
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
!
END IF
!$omp target update from(sr)
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
!
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!$omp target update to(sr)
!$acc end host_data
!
CALL stop_clock( 'regterg:overlap' )
!
nbase = nbase + notcnv
!
!$acc parallel
!$acc parallel
!$acc loop gang
!$omp target teams distribute parallel do
DO n = 1, nbase
!
!$acc loop vector
!$acc loop vector
DO m = n + 1, nbase
!
hr(m,n) = hr(n,m)
@ -456,12 +550,13 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
END DO
!
END DO
!$acc end parallel
!$acc end parallel
!
! ... diagonalize the reduced hamiltonian
!
CALL start_clock( 'regterg:diag' )
!$acc host_data use_device(hr, sr, ew, vr)
!$omp target update from(hr, sr)
IF( my_bgrp_id == root_bgrp_id ) THEN
CALL diaghg( nbase, nvec, hr, sr, nvecx, ew, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
END IF
@ -470,25 +565,28 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
!$acc end host_data
!$omp target update to(hr,sr,vr,ew)
CALL stop_clock( 'regterg:diag' )
!
! ... test for convergence
!
!$acc parallel loop copy(conv(1:nvec)) copyin(btype(1:nvec))
!$omp target teams distribute parallel do map(tofrom:conv) map(to:btype)
DO i = 1, nvec
IF(btype(i) == 1) THEN
conv(i) = ( ( ABS( ew(i) - e(i) ) < ethr ) )
ELSE
conv(i) = ( ( ABS( ew(i) - e(i) ) < empty_ethr ) )
END IF
END DO
END IF
END DO
!
! ... next line useful for band parallelization of exact exchange
IF ( nbgrp > 1 ) CALL mp_bcast(conv,root_bgrp_id,inter_bgrp_comm)
!
notcnv = COUNT( .NOT. conv(:) )
!
!$acc parallel loop
!$acc parallel loop
!$omp target teams distribute parallel do
DO i=1,nvec
e(i) = ew(i)
END DO
@ -504,7 +602,8 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
CALL start_clock( 'regterg:last' )
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO k=1,nvec
DO i=1,npwx
evc(i,k) = ZERO
@ -514,9 +613,11 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$acc host_data use_device(psi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
!$omp target update from(evc)
!$acc end host_data
CALL mp_sum( evc, inter_bgrp_comm )
!$omp target update to(evc)
!
IF ( notcnv == 0 ) THEN
!
@ -541,7 +642,8 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
! ... refresh psi, H*psi and S*psi
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1,nvec
DO k=1,npwx
psi(k,i) = evc(k,i)
@ -551,18 +653,22 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
IF ( uspp ) THEN
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i = 1, npwx
DO j = nvec+1, nvec+nvec
psi(i,j) = ZERO
END DO
END DO
END DO
END DO
!
!$acc host_data use_device(psi, spsi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
!$omp target update from(psi)
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
!$omp target update to(psi)
!$acc end host_data
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1,nvec
DO k=1,npwx
spsi(k,i) = psi(k,i+nvec)
@ -571,15 +677,27 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
END IF
!
#if defined(__OPENMP_GPU)
!$omp target teams distribute parallel do collapse(2)
do j=nvec+1,nvec+nvec
do i=1,npwx
psi(i,j) = ZERO
enddo
enddo
#else
!$acc kernels
psi(:,nvec+1:nvec+nvec) = ZERO
!$acc end kernels
#endif
!$acc host_data use_device(psi, hpsi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
!$omp target update from(psi)
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
!$omp target update to(psi)
!$acc end host_data
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1,nvec
DO k=1, npwx
hpsi(k,i) = psi(k,i+nvec)
@ -590,21 +708,23 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
nbase = nvec
!
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i = 1, nvecx
DO j = 1, nbase
hr(i,j) = 0.D0
sr(i,j) = 0.D0
vr(i,j) = 0.D0
END DO
END DO
END DO
END DO
!
!$acc parallel loop
!$acc parallel loop
!$omp target teams distribute parallel do
DO j = 1, nbase
hr(j,j) = e(j)
sr(j,j) = 1.D0
vr(j,j) = 1.D0
END DO
END DO
!
CALL stop_clock( 'regterg:last' )
!
@ -612,19 +732,26 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
END DO iterate
!
!$omp target exit data map(delete:evc)
!$omp target exit data map(from:e)
DEALLOCATE( conv )
!$omp target exit data map(delete:ew)
DEALLOCATE( ew )
!$omp target exit data map(delete:vr)
DEALLOCATE( vr )
!$omp target exit data map(delete:hr)
DEALLOCATE( hr )
!$omp target exit data map(delete:sr)
DEALLOCATE( sr )
!
!$omp target exit data map(delete:spsi)
IF ( uspp ) DEALLOCATE( spsi )
!
#if defined(__OPENMP_GPU)
!$omp end target data
#endif
DEALLOCATE( hpsi )
DEALLOCATE( psi )
DEALLOCATE( psi )
!
!$acc end data
!
@ -715,7 +842,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
! the product of S and psi
LOGICAL, ALLOCATABLE :: conv(:)
! true if the root is converged
REAL(DP) :: empty_ethr
REAL(DP) :: empty_ethr
! threshold for empty bands
INTEGER :: npw2, npwx2
INTEGER :: idesc(LAX_DESC_SIZE), idesc_old(LAX_DESC_SIZE)
@ -737,7 +864,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!
EXTERNAL h_psi, s_psi, g_psi
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! calculates H|psi>
! s_psi(npwx,npw,nvec,psi,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,nvec)
@ -747,10 +874,10 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!
!
CALL start_clock( 'regterg' ) !; write(6,*) 'enter pregterg' ; FLUSH(6)
!
!
CALL laxlib_getval( np_ortho = np_ortho, ortho_parent_comm = ortho_parent_comm, &
do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp )
!
!
IF ( nvec > nvecx / 2 ) CALL errore( 'pregter', 'nvecx is too small', 1 )
!
IF ( gstart == -1 ) CALL errore( 'pregter', 'gstart variable not initialized', 1 )
@ -791,7 +918,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!
IF( la_proc ) THEN
!
! only procs involved in the diagonalization need to allocate local
! only procs involved in the diagonalization need to allocate local
! matrix block.
!
ALLOCATE( vl( nx , nx ), STAT=ierr )
@ -921,8 +1048,8 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!
CALL g_psi( npwx, npw, notcnv, 1, psi(1,nb1), ew(nb1) )
!
! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
! ... order to improve numerical stability of subspace diagonalization
! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
! ... order to improve numerical stability of subspace diagonalization
! ... (rdiaghg) ew is used as work array :
!
! ... ew = <psi_i|psi_i>, i = nbase + 1, nbase + notcnv
@ -955,7 +1082,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!
CALL start_clock( 'regterg:overlap' )
!
! we need to save the old descriptor in order to redistribute matrices
! we need to save the old descriptor in order to redistribute matrices
!
idesc_old = idesc
!
@ -1051,7 +1178,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!
CALL start_clock( 'regterg:last' )
!
CALL refresh_evc()
CALL refresh_evc()
!
IF ( notcnv == 0 ) THEN
!
@ -1081,7 +1208,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
IF ( uspp ) THEN
!
CALL refresh_spsi()
!
!
END IF
!
CALL refresh_hpsi()
@ -1138,7 +1265,7 @@ SUBROUTINE pregterg(h_psi, s_psi, uspp, g_psi, &
!$omp end target data
#endif
DEALLOCATE( hpsi )
DEALLOCATE( psi )
DEALLOCATE( psi )
!
CALL stop_clock( 'regterg' )
!call print_clock( 'regterg' )
@ -1164,7 +1291,7 @@ CONTAINS
DO i = 1, idesc(LAX_DESC_NC)
distmat( i, i ) = 1_DP
END DO
END IF
END IF
RETURN
END SUBROUTINE set_to_identity
!
@ -1196,14 +1323,14 @@ CONTAINS
!
IF ( .NOT. conv(n) ) THEN
!
! ... this root not yet converged ...
! ... this root not yet converged ...
!
np = np + 1
npl = npl + 1
IF( npl == 1 ) ic_notcnv( ipc ) = np
!
! ... reorder eigenvectors so that coefficients for unconverged
! ... roots come first. This allows to use quick matrix-matrix
! ... roots come first. This allows to use quick matrix-matrix
! ... multiplications to set a new basis vector (see below)
!
notcnv_ip( ipc ) = notcnv_ip( ipc ) + 1
@ -1217,7 +1344,7 @@ CONTAINS
! ... for use in g_psi
!
ew(nbase+np) = e(n)
!
!
END IF
!
END DO
@ -1245,7 +1372,7 @@ CONTAINS
IF( notcnv_ip( ipc ) > 0 ) THEN
notcl = notcnv_ip( ipc )
ic = ic_notcnv( ipc )
ic = ic_notcnv( ipc )
beta = 0.0d0
@ -1261,7 +1388,7 @@ CONTAINS
END IF
CALL mp_bcast( vtmp(:,1:notcl), root, ortho_parent_comm )
!
!
IF ( uspp ) THEN
!
CALL DGEMM( 'N', 'N', npw2, notcl, nr, 1.D0, &
@ -1291,7 +1418,6 @@ CONTAINS
!
END DO
DEALLOCATE( vtmp )
DEALLOCATE( ptmp )
@ -1329,19 +1455,19 @@ CONTAINS
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
!
CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
psi(1,ir), npwx2, vl, nx, beta, evc(1,ic), npwx2 )
ELSE
!
! all other procs receive
!
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
psi(1,ir), npwx2, vtmp, nx, beta, evc(1,ic), npwx2 )
END IF
!
!
beta = 1.0d0
@ -1387,19 +1513,19 @@ CONTAINS
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
!
CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
spsi(1,ir), npwx2, vl, nx, beta, psi(1,nvec+ic), npwx2 )
ELSE
!
! all other procs receive
!
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
spsi(1,ir), npwx2, vtmp, nx, beta, psi(1,nvec+ic), npwx2 )
END IF
!
!
beta = 1_DP
END DO
@ -1447,19 +1573,19 @@ CONTAINS
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
!
CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
hpsi(1,ir), npwx2, vl, nx, beta, psi(1,nvec+ic), npwx2 )
ELSE
!
! all other procs receive
!
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
hpsi(1,ir), npwx2, vtmp, nx, beta, psi(1,nvec+ic), npwx2 )
END IF
!
!
beta = 1.0d0
END DO
@ -1480,7 +1606,7 @@ CONTAINS
SUBROUTINE compute_distmat( dm, v, w )
!
! This subroutine compute <vi|wj> and store the
! result in distributed matrix dm
! result in distributed matrix dm
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
@ -1492,7 +1618,7 @@ CONTAINS
!
work = 0.0d0
!
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
@ -1580,7 +1706,6 @@ CONTAINS
CALL mp_root_sum( vtmp(:,1:nc), dm, root, ortho_parent_comm )
END IF
END DO
!
END IF

View File

@ -14,15 +14,15 @@
!
! The following algorithm is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel , where $\rho_{in}^{n}$ is the input charge density obtained by the orbits obtained in the
! in parallel , where $\rho_{in}^{n}$ is the input charge density obtained by the orbits obtained in the
! $n$-th iteration or the former iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
@ -30,7 +30,7 @@
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
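In bpcg_gamma below, step 2 is realized as a block preconditioned CG solve of the correction equation, carried out approximately to the threshold ethr_cg. Schematically, in the notation of the algorithm above,

$$ (\hat H - \lambda_i^{n}\hat S)\, e_i^{n+1/2} \;=\; \lambda_i^{n}\hat S\,u_i^{n} - \hat H u_i^{n} \;\equiv\; b_i ,$$

which is why the initial gradient (and saved right-hand side) is set to b(:,l) = e(i)*spsi(:,i) - hpsi(:,i) at the top of the main loop.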
@ -68,17 +68,17 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
COMPLEX(DP),INTENT(IN) :: psi0(npwx,nbnd) ! psi0 needed to compute the Pv projection
COMPLEX(DP),INTENT(IN) :: spsi0(npwx,nbnd) ! Spsi0 needed to compute the Pv projection
INTEGER, INTENT(IN) :: npw, npwx, nbnd, nvec ! input dimensions
INTEGER, INTENT(IN) :: npw, npwx, nbnd, nvec ! input dimensions
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
REAL(DP), INTENT(INOUT) :: e(nvec) ! current estimate of the target eigenvalues
COMPLEX(DP),INTENT(INOUT) :: psi(npwx,nvec),hpsi(npwx,nvec),spsi(npwx,nvec) !
COMPLEX(DP),INTENT(INOUT) :: psi(npwx,nvec),hpsi(npwx,nvec),spsi(npwx,nvec) !
! input: the current estimate of the wfcs
! output: the estimated correction vectors
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
!
COMPLEX(DP), ALLOCATABLE :: b(:,:), & ! RHS for testing
p(:,:), hp(:,:), sp(:,:), z(:,:) ! additional working vectors
@ -112,7 +112,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!
npw2 = 2*npw
npwx2 = 2*npwx
block_size = min(nvec,64)
block_size = min(nvec,64)
!
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), alpha( block_size ), gamma( block_size ) )
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
@ -123,7 +123,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
done = 0 ! the number of correction vectors already solved
nactive = 0 ! the number of correction vectors currently being updated
!$acc kernels
!$acc kernels
cg_iter = 0 ! how many iteration each active vector has completed (<= maxter)
!$acc end kernels
@ -132,7 +132,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the search
if ( nnew > 0 ) then ! add nnew vectors to the active list
!write(6,*) ' nnew =', nnew
!$acc parallel
!$acc parallel
!$acc loop gang private(i)
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
@ -140,18 +140,18 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!$acc loop vector
DO ii = 1, npwx
b(ii,l) = e(i) * spsi(ii,i) - hpsi(ii,i) ! initial gradient and saved RHS for later
END DO
END DO
!$acc loop vector
DO ii = 1, npwx
z(ii,l) = b(ii,l)
END DO
END DO
end do
!$acc end parallel
! initial preconditioned gradient
! initial preconditioned gradient
!$acc host_data use_device(z, e)
do l=nactive+1,nactive+nnew; i=l+done
call g_1psi(npwx,npw,z(:,l),e(i))
call g_1psi(npwx,npw,z(:,l),e(i))
end do
!$acc end host_data
@ -159,14 +159,14 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
CALL start_clock( 'pcg:ortho' )
!$acc host_data use_device(spsi0, psi0, z, spsi0vec)
CALL MYDGEMM2( 'T','N', nbnd,nnew,npw2, 2.D0, spsi0, npwx2, z(:,nactive+1), npwx2, 0.D0, spsi0vec, nbnd,.FALSE. )
IF ( gstart == 2 ) CALL MYDGER( nbnd, nnew, -1.D0, spsi0, npwx2, z(:,nactive+1), npwx2, spsi0vec, nbnd )
IF ( gstart == 2 ) CALL MYDGER2( nbnd, nnew, -1.D0, spsi0, npwx2, z(:,nactive+1), npwx2, spsi0vec, nbnd,.FALSE. )
CALL mp_sum( spsi0vec, intra_bgrp_comm )
CALL MYDGEMM2( 'N','N', npw2,nnew,nbnd,-1.D0, psi0, npwx2, spsi0vec, nbnd, 1.D0, z(:,nactive+1), npwx2,.FALSE. )
!$acc end host_data
CALL stop_clock( 'pcg:ortho' )
!-
!$acc parallel loop
!$acc parallel loop
do l=nactive+1,nactive+nnew
g0(l) = 2.D0*MYDDOT_VECTOR_GPU(npw2,z(:,l),b(:,l))
IF (gstart==2) g0(l)=g0(l)-CONJG(z(1,l))*b(1,l)
@ -174,9 +174,9 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!$acc host_data use_device(g0)
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
!$acc end host_data
!$acc end host_data
!$acc parallel
!$acc parallel
!$acc loop gang private(i)
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
@ -184,20 +184,20 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!write (6,*) 0, g0(l), ff(l)
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
! fixed function of the RHS (see ethr_cg update later).
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
!write(6,*) 'ethr_cg :', ethr_cg(l)
!$acc loop vector
!$acc loop vector
do ii = 1, npwx
! zero the trial solution
psi(ii,i) = ZERO
hpsi(ii,i) = ZERO
psi(ii,i) = ZERO
hpsi(ii,i) = ZERO
spsi(ii,i) = ZERO
! initial search direction
p(ii,l) = z(ii,l)
end do
end do
cg_iter(l) = 0 ! this is a new correction vector, reset its iteration count
end do
!$acc end parallel
@ -210,7 +210,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
!$acc kernels
!$acc kernels
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update iteration counters
!$acc end kernels
@ -223,10 +223,10 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!$acc end host_data
CALL stop_clock( 'pcg:hs_1psi' )
!$acc parallel loop private(i)
!$acc parallel loop private(i)
do l = 1, nactive; i=l+done
gamma(l) = 2.D0*MYDDOT_VECTOR_GPU(npw2,p(:,l),hp(:,l)) &
- e(i) * 2.D0*MYDDOT_VECTOR_GPU(npw2,p(:,l),sp(:,l))
- e(i) * 2.D0*MYDDOT_VECTOR_GPU(npw2,p(:,l),sp(:,l))
IF (gstart==2) gamma(l) = gamma(l) - CONJG(p(1,l))*hp(1,l) + e(i) * CONJG(p(1,l))*sp(1,l)
end do
@ -234,19 +234,19 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
!$acc end host_data
!$acc parallel
!$acc parallel
!$acc loop gang private(i)
do l = 1, nactive; i=l+done
!write(6,*) ' l =',l,' i =',i
alpha(l) = g0(l)/gamma(l)
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha(l)
!$acc loop vector
!$acc loop vector
DO ii = 1, npwx
psi(ii,i) = psi(ii,i) + alpha(l) * p(ii,l) ! updated solution
hpsi(ii,i) = hpsi(ii,i) + alpha(l) * hp(ii,l) ! updated solution
spsi(ii,i) = spsi(ii,i) + alpha(l) * sp(ii,l) ! updated solution
END DO
END DO
g2(l) = 2.D0 * ( MYDDOT_VECTOR_GPU(npw2,z(:,l),b(:,l)) &
+ e(i) * MYDDOT_VECTOR_GPU(npw2,z(:,l),spsi(:,i)) &
@ -263,8 +263,8 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
do l = 1, nactive ! update the preconditioned gradient
DO ii = 1, npwx
i=l+done
z(ii,l) = b(ii,l) + e(i) * spsi(ii,i) - hpsi(ii,i)
END DO
z(ii,l) = b(ii,l) + e(i) * spsi(ii,i) - hpsi(ii,i)
END DO
end do
!$acc host_data use_device(z, e)
@ -277,13 +277,13 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
CALL start_clock( 'pcg:ortho' )
!$acc host_data use_device(spsi0, psi0, z, spsi0vec)
CALL MYDGEMM2( 'T','N', nbnd,nactive,npw2, 2.D0, spsi0, npwx2, z, npwx2, 0.D0, spsi0vec, nbnd, .FALSE. )
IF ( gstart == 2 ) CALL MYDGER( nbnd, nactive, -1.D0, spsi0, npwx2, z, npwx2, spsi0vec, nbnd )
IF ( gstart == 2 ) CALL MYDGER2( nbnd, nactive, -1.D0, spsi0, npwx2, z, npwx2, spsi0vec, nbnd, .FALSE. )
CALL mp_sum( spsi0vec, intra_bgrp_comm )
CALL MYDGEMM2( 'N','N', npw2,nactive,nbnd,-1.D0, psi0, npwx2, spsi0vec, nbnd, 1.D0, z, npwx2, .FALSE. )
!$acc end host_data
CALL stop_clock( 'pcg:ortho' )
!-
!$acc parallel loop private(i)
!$acc parallel loop private(i)
do l = 1, nactive; i=l+done
g1(l) = 2.D0 * ( MYDDOT_VECTOR_GPU(npw2,z(:,l),b(:,l)) &
+ e(i) * MYDDOT_VECTOR_GPU(npw2,z(:,l),spsi(:,i)) &
@ -293,9 +293,9 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!$acc host_data use_device(g1)
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
!$acc end host_data
!$acc end host_data
!$acc parallel loop private(i)
!$acc parallel loop private(i)
do l = 1, nactive; i = l + done ! evaluate the function ff
ff(l) = - ( e(i)*MYDDOT_VECTOR_GPU(npw2,psi(:,i),spsi(:,i)) &
-MYDDOT_VECTOR_GPU(npw2,psi(:,i),hpsi(:,i)) ) &
@ -313,22 +313,22 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
!$acc serial copyout(ff_check, ff0_check, g1_check, cg_iter_l, iter_check) copyin(maxter)
ff_check = ff(l) > ff0(l)
ff0_check = ff0(l) < 0.d0
g1_check = ABS ( g1(l) ) < ethr_cg(l)
ff_check = ff(l) > ff0(l)
ff0_check = ff0(l) < 0.d0
g1_check = ABS ( g1(l) ) < ethr_cg(l)
cg_iter_l = cg_iter(l)
iter_check = cg_iter_l == maxter
!$acc end serial
iter_check = cg_iter_l == maxter
!$acc end serial
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
IF ( ff_check .AND. ff0_check ) THEN
!$acc parallel loop
!$acc parallel loop
DO ii = 1, npwx
psi(ii,i) = psi(ii,i) - alpha(l) * p(ii,l) ! fallback solution: if last iter failed to improve ff0
hpsi(ii,i) = hpsi(ii,i) - alpha(l) * hp(ii,l) ! exit without updating and ...
spsi(ii,i) = spsi(ii,i) - alpha(l) * sp(ii,l) ! hope next time it'll be better
END DO
END DO
END IF
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
@ -355,10 +355,10 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
p (ii,l) = psi (ii,done+newdone) ; psi (ii,done+newdone) = psi (ii,i) ; psi (ii,i) = p (ii,l)
hp(ii,l) = hpsi(ii,done+newdone) ; hpsi(ii,done+newdone) = hpsi(ii,i) ; hpsi(ii,i) = hp(ii,l)
sp(ii,l) = spsi(ii,done+newdone) ; spsi(ii,done+newdone) = spsi(ii,i) ; spsi(ii,i) = sp(ii,l)
END DO
END DO
ee = e(done+newdone)
e(done+newdone) = e(i)
ee = e(done+newdone)
e(done+newdone) = e(i)
e(i) = ee
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
@ -383,7 +383,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
!$acc loop independent
DO ii = 1, npwx
p(ii,l) = z(ii,l) + beta * p(ii,l) ! updated search direction
END DO
END DO
!write(6,*) 'beta :', beta
ff0(l) = ff(l) ! updated minimum value reached by the function
@ -405,11 +405,11 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
do l=1, nactive
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
!$acc parallel loop
!$acc parallel loop
DO ii = 1, npwx
p(ii,l) = p(ii,l+newdone) ; hp(ii,l) = hp(ii,l+newdone) ; sp(ii,l) = sp(ii,l+newdone)
b(ii,l) = b(ii,l+newdone) ; z(ii,l) = z(ii,l+newdone)
END DO
b(ii,l) = b(ii,l+newdone) ; z(ii,l) = z(ii,l+newdone)
END DO
!$acc kernels
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
@ -421,7 +421,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
END IF
! END DO iterate Here is where the pcg loop would terminate
END DO MAIN_LOOP
!write (6,*) ' exit pcg loop'

View File

@ -165,7 +165,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
kedtaue2(k,1) = kedtau(k,1) / e2
ENDDO
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtaue2, sx, sc, &
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=gpuarg )
!$acc parallel loop
DO k = 1, nrxx
v2c(k,1) = v2cm(1,k,1)
@ -205,7 +205,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
kedtaue2(k,1:nspin0) = kedtau(k,1:nspin0) / e2
ENDDO
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtaue2, sx, sc, &
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=gpuarg )
!$acc parallel loop
DO k = 1, nrxx
v2c(k,:) = v2cm(1,k,:)
@ -218,7 +218,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
ALLOCATE( v2c_ud(nrxx) )
!$acc data create( v2c_ud )
!
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=.TRUE. )
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=gpuarg )
!
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
@ -265,7 +265,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
sigma_gradcorr(2,2) = sigma_gc22
sigma_gradcorr(3,1) = sigma_gc31
sigma_gradcorr(3,2) = sigma_gc32
sigma_gradcorr(3,3) = sigma_gc33
sigma_gradcorr(3,3) = sigma_gc33
!
!$acc end data
!$acc end data

View File

@ -20,7 +20,7 @@ SUBROUTINE v_of_rho( rho, rho_core, rhog_core, &
USE noncollin_module, ONLY : noncolin, nspin_lsda
USE ions_base, ONLY : nat, tau
USE ldaU, ONLY : lda_plus_u, lda_plus_u_kind, ldmx_b, &
nsg, v_nsg
nsg, v_nsg
USE xc_lib, ONLY : xclib_dft_is
USE scf, ONLY : scf_type
USE cell_base, ONLY : alat
@ -35,10 +35,10 @@ SUBROUTINE v_of_rho( rho, rho_core, rhog_core, &
TYPE(scf_type), INTENT(INOUT) :: rho
!! the valence charge
TYPE(scf_type), INTENT(INOUT) :: v
!! the scf (Hxc) potential
!=================> NB: NOTE that in F90 derived data type must be INOUT and
!! the scf (Hxc) potential
!=================> NB: NOTE that in F90 derived data type must be INOUT and
!=================> not just OUT because otherwise their allocatable or pointer
!=================> components are NOT defined
!=================> components are NOT defined
REAL(DP), INTENT(IN) :: rho_core(dfftp%nnr)
!! the core charge
COMPLEX(DP), INTENT(IN) :: rhog_core(ngm)
@ -78,7 +78,7 @@ SUBROUTINE v_of_rho( rho, rho_core, rhog_core, &
!
CALL v_h( rho%of_g(:,1), ehart, charge, v%of_r )
!
! ... DFT+U(+V): build up (extended) Hubbard potential
! ... DFT+U(+V): build up (extended) Hubbard potential
!
IF (lda_plus_u) THEN
!
@ -119,7 +119,7 @@ SUBROUTINE v_of_rho( rho, rho_core, rhog_core, &
ENDIF
!
! ... add an electric field
!
!
DO is = 1, nspin_lsda
CALL add_efield(v%of_r(1,is), etotefield, rho%of_r(:,1), .false. )
END DO
@ -177,7 +177,7 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
REAL(DP), INTENT(INOUT) :: v(dfftp%nnr,nspin)
!! V_xc potential
REAL(DP), INTENT(INOUT) :: kedtaur(dfftp%nnr,nspin)
!! local K energy density
!! local K energy density
REAL(DP), INTENT(INOUT) :: vtxc
!! integral V_xc * rho
REAL(DP), INTENT(INOUT) :: etxc
@ -236,7 +236,7 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
rhogsum(k) = fac*rhog_core(k) + ( rho%of_g(k,1) + sgn_is*rho%of_g(k,nspin) )*0.5D0
ENDDO
!
CALL fft_gradient_g2r( dfftp, rhogsum, g, grho(:,:,is) )
CALL fft_gradient_g2r( dfftp, rhogsum, g, grho(:,:,is) )
!
ENDDO
!
@ -283,7 +283,7 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
!$acc data create( rho_updw )
!
!$acc parallel loop present(rho)
DO k = 1, dfftp%nnr
DO k = 1, dfftp%nnr
rho_updw(k,1) = ( rho%of_r(k,1) + rho%of_r(k,2) ) * 0.5d0
rho_updw(k,2) = ( rho%of_r(k,1) - rho%of_r(k,2) ) * 0.5d0
ENDDO
@ -363,7 +363,7 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
write (stdout, '(/,5x, "negative rho (up,down): ", 2es10.3)') rhoneg1, rhoneg2
ENDIF
!
vtxc = omega * vtxc / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
vtxc = omega * vtxc / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
etxc = omega * etxc / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
!
IF ( dft_is_nonlocc() ) CALL nlc( rho%of_r, rho_core, nspin, etxc, vtxc, v )
@ -470,7 +470,7 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
!$acc data create( ex, ec, vx, vc )
!$acc parallel loop
#elif defined(__OPENMP_GPU)
!$omp target data map(to:rho%of_r,rho_core) map(alloc:ex,ec,vx,vc) map(from:v)
!$omp target data map(always,to:rho%of_r) map(to:rho_core) map(alloc:ex,ec,vx,vc) map(from:v)
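! the "always" map modifier forces the host-to-device transfer of rho%of_r even when
! the array is already present in the device data environment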
!$omp target teams distribute parallel do
#endif
DO ir = 1, dfftp_nnr
@ -625,7 +625,7 @@ SUBROUTINE v_h( rhog, ehart, charge, v )
USE mp, ONLY : mp_sum
USE martyna_tuckerman, ONLY : wg_corr_h, do_comp_mt
USE esm, ONLY : do_comp_esm, esm_hartree, esm_bc
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_2D, cutoff_hartree
USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_2D, cutoff_hartree
!
IMPLICIT NONE
!
@ -678,7 +678,7 @@ SUBROUTINE v_h( rhog, ehart, charge, v )
!$omp parallel do private( fac, rgtot_re, rgtot_im ), reduction(+:ehart)
DO ig = gstart, ngm
!
fac = 1.D0 / gg(ig)
fac = 1.D0 / gg(ig)
!
rgtot_re = REAL( rhog(ig) )
rgtot_im = AIMAG( rhog(ig) )
@ -809,11 +809,11 @@ SUBROUTINE v_hubbard( ns, v_hub, eth )
!
! ... local variables
!
REAL(DP) :: effU, sgn(2)
REAL(DP) :: effU, sgn(2)
INTEGER :: is, isop, na, nt, m1, m2
!
eth = 0.d0
sgn(1) = 1.d0
sgn(1) = 1.d0
sgn(2) = -1.d0
!
v_hub(:,:,:,:) = 0.d0
@ -828,8 +828,8 @@ SUBROUTINE v_hubbard( ns, v_hub, eth )
effU = Hubbard_U(nt) - Hubbard_J0(nt)
ELSE
effU = Hubbard_U(nt)
ENDIF
!
ENDIF
!
DO is = 1, nspin
DO m1 = 1, 2 * Hubbard_l(nt) + 1
eth = eth + ( Hubbard_alpha(nt) + 0.5D0*effU )*ns(m1,m1,is,na)
@ -862,7 +862,7 @@ SUBROUTINE v_hubbard( ns, v_hub, eth )
ENDDO
!
END IF
!
!
ENDDO
!
IF (nspin==1) eth = 2.d0 * eth
@ -915,7 +915,7 @@ SUBROUTINE v_hubbard_b (ns, v_hub, eth)
CALL errore('v_hubbard_b', 'J0 is not supported in DFT+U with multiple channels per atomic type',1)
!
IF (Hubbard_beta(nt).NE.0.d0) &
CALL errore('v_hubbard_b', 'Hubbard_beta is not supported in DFT+U with multiple channels per atomic type',1)
CALL errore('v_hubbard_b', 'Hubbard_beta is not supported in DFT+U with multiple channels per atomic type',1)
!
IF (is_hubbard_back(nt)) THEN
!
@ -923,7 +923,7 @@ SUBROUTINE v_hubbard_b (ns, v_hub, eth)
!
DO is = 1, nspin
!
DO m1 = 1, ldim_back(nt)
DO m1 = 1, ldim_back(nt)
!
eth = eth + ( Hubbard_alpha_back(nt) + 0.5D0 * effU ) * &
ns(m1,m1,is,na)
@ -1004,7 +1004,7 @@ SUBROUTINE v_hubbard_full( ns, v_hub, eth )
!
IF (Hubbard_U(nt)/=0.d0) THEN
!
! Initialize U(m1,m2,m3,m4) matrix
! Initialize U(m1,m2,m3,m4) matrix
!
CALL hubbard_matrix( Hubbard_lmax, Hubbard_l(nt), Hubbard_U(nt), &
Hubbard_J(1,nt), u_matrix )
@ -1022,7 +1022,7 @@ SUBROUTINE v_hubbard_full( ns, v_hub, eth )
IF (nspin==1) n_tot = 2.d0 * n_tot
!
mag2 = 0.d0
!
!
IF (nspin==2) THEN
DO m1 = 1, 2 * Hubbard_l(nt) + 1
mag2 = mag2 + ns(m1,m1,1,na) - ns(m1,m1,2,na)
@ -1047,12 +1047,12 @@ SUBROUTINE v_hubbard_full( ns, v_hub, eth )
!
DO m1 = 1, 2 * Hubbard_l(nt) + 1
!
! Hubbard potential: DC contribution
! Hubbard potential: DC contribution
!
v_hub(m1,m1,is,na) = v_hub(m1,m1,is,na) + Hubbard_J(1,nt)*n_spin + &
0.5d0*(Hubbard_U(nt)-Hubbard_J(1,nt)) - Hubbard_U(nt)*n_tot
!
! +U contributions
! +U contributions
!
DO m2 = 1, 2 * Hubbard_l(nt) + 1
DO m3 = 1, 2 * Hubbard_l(nt) + 1
@ -1080,7 +1080,7 @@ SUBROUTINE v_hubbard_full( ns, v_hub, eth )
ENDIF
!
ENDDO ! na
!
!
IF (nspin==1) eth_u = 2.d0 * eth_u
eth = eth_u - eth_dc
!
@ -1132,19 +1132,19 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
ALLOCATE( u_matrix(2*Hubbard_lmax+1, 2*Hubbard_lmax+1, 2*Hubbard_lmax+1, 2*Hubbard_lmax+1) )
!
eth = 0.d0
eth_dc = 0.d0
eth_noflip = 0.d0
eth_flip = 0.d0
eth_dc = 0.d0
eth_noflip = 0.d0
eth_flip = 0.d0
!
v_hub(:,:,:,:) = 0.d0
!
DO na = 1, nat
DO na = 1, nat
!
nt = ityp (na)
nt = ityp (na)
!
IF (Hubbard_U(nt) /= 0.d0) THEN
IF (Hubbard_U(nt) /= 0.d0) THEN
!
! Initialize U(m1,m2,m3,m4) matrix
! Initialize U(m1,m2,m3,m4) matrix
!
CALL hubbard_matrix( Hubbard_lmax, Hubbard_l(nt), Hubbard_U(nt), &
Hubbard_J(1,nt), u_matrix )
@ -1160,17 +1160,17 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
mx = mx + DBLE( ns(m1, m1, 2, na) + ns(m1, m1, 3, na) )
my = my + 2.d0 * AIMAG( ns(m1, m1, 2, na) )
mz = mz + DBLE( ns(m1, m1, 1, na) - ns(m1, m1, 4, na) )
ENDDO
mag2 = mx**2 + my**2 + mz**2
ENDDO
mag2 = mx**2 + my**2 + mz**2
!
! Hubbard energy: DC term
!
mx = REAL(n_tot)
eth_dc = eth_dc + 0.5d0*( Hubbard_U(nt)*mx*(mx-1.d0) - &
Hubbard_J(1,nt)*mx*(0.5d0*mx-1.d0) - &
0.5d0*Hubbard_J(1,nt)*mag2 )
0.5d0*Hubbard_J(1,nt)*mag2 )
!
DO is = 1, nspin
DO is = 1, nspin
!
IF (is == 2) THEN
is1 = 3
@ -1191,7 +1191,7 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
DO m3 = 1, 2*Hubbard_l(nt)+1
DO m4 = 1, 2*Hubbard_l(nt)+1
eth_noflip = eth_noflip + 0.5d0*( &
( u_matrix(m1,m2,m3,m4)-u_matrix(m1,m2,m4,m3) )* &
( u_matrix(m1,m2,m3,m4)-u_matrix(m1,m2,m4,m3) )* &
ns(m1,m3,is,na)*ns(m2,m4,is,na) + &
u_matrix(m1,m2,m3,m4)*ns(m1,m3,is,na)*ns(m2,m4,nspin+1-is,na) )
ENDDO
@ -1200,7 +1200,7 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
ENDDO
!
ELSE
!
!
! Spin-flip contribution
!
DO m1 = 1, 2*Hubbard_l(nt)+1
@ -1208,7 +1208,7 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
DO m3 = 1, 2*Hubbard_l(nt)+1
DO m4 = 1, 2*Hubbard_l(nt)+1
eth_flip = eth_flip - 0.5d0*u_matrix(m1,m2,m4,m3)* &
ns(m1,m3,is,na)*ns(m2,m4,is1,na)
ns(m1,m3,is,na)*ns(m2,m4,is1,na)
ENDDO
ENDDO
ENDDO
@ -1216,7 +1216,7 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
!
ENDIF
!
! Hubbard potential: non spin-flip contribution
! Hubbard potential: non spin-flip contribution
!
IF (is1 == is) THEN
!
@ -1225,8 +1225,8 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
DO m3 = 1, 2*Hubbard_l(nt)+1
DO m4 = 1, 2*Hubbard_l(nt)+1
v_hub(m1,m2,is,na) = v_hub(m1,m2,is,na) + &
u_matrix(m1,m3,m2,m4)*( ns(m3,m4,1,na)+ns(m3,m4,4,na) )
ENDDO
u_matrix(m1,m3,m2,m4)*( ns(m3,m4,1,na)+ns(m3,m4,4,na) )
ENDDO
ENDDO
ENDDO
ENDDO
@ -1237,28 +1237,28 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
!
n_aux = 0.d0
DO m1 = 1, 2*Hubbard_l(nt)+1
n_aux = n_aux + ns(m1,m1,is1,na)
n_aux = n_aux + ns(m1,m1,is1,na)
ENDDO
!
DO m1 = 1, 2*Hubbard_l(nt)+1
!
! Hubbard potential: DC contribution
!
! Hubbard potential: DC contribution
!
v_hub(m1,m1,is,na) = v_hub(m1,m1,is,na) + Hubbard_J(1,nt)*n_aux
!
IF (is1 == is) THEN
v_hub(m1,m1,is,na) = v_hub(m1,m1,is,na) + &
0.5d0*(Hubbard_U(nt)-Hubbard_J(1,nt)) - Hubbard_U(nt)*n_tot
0.5d0*(Hubbard_U(nt)-Hubbard_J(1,nt)) - Hubbard_U(nt)*n_tot
ENDIF
!
! Hubbard potential: spin-flip contribution
!
DO m2 = 1, 2*Hubbard_l(nt)+1
DO m2 = 1, 2*Hubbard_l(nt)+1
DO m3 = 1, 2*Hubbard_l(nt)+1
DO m4 = 1, 2*Hubbard_l(nt)+1
v_hub(m1,m2,is,na) = v_hub(m1,m2,is,na) - &
u_matrix(m1,m3,m4,m2) * ns(m3,m4,is1,na)
ENDDO
u_matrix(m1,m3,m4,m2) * ns(m3,m4,is1,na)
ENDDO
ENDDO
ENDDO
!
@ -1277,7 +1277,7 @@ SUBROUTINE v_hubbard_full_nc( ns, v_hub, eth )
IF ( iverbosity > 0 ) THEN
WRITE(stdout,*) '--- in v_hubbard ---'
WRITE(stdout,'("Hub. E (dc, noflip, flip, total) ",4f9.4)') &
eth_dc, eth_noflip, eth_flip, eth
eth_dc, eth_noflip, eth_flip, eth
WRITE(stdout,*) '-------'
ENDIF
!
@ -1310,8 +1310,8 @@ SUBROUTINE v_hubbard_extended (nsg, v_hub, eth)
COMPLEX(DP), INTENT(IN) :: nsg (ldmx_tot, ldmx_tot, max_num_neighbors, nat, nspin)
COMPLEX(DP), INTENT(OUT) :: v_hub(ldmx_tot, ldmx_tot, max_num_neighbors, nat, nspin)
REAL(DP), INTENT(OUT) :: eth
!
! Local variables
!
! Local variables
!
INTEGER :: is, isop, na, na1, na2, nt, nt1, nt2, m1, m2, viz, equiv_na2, i_type
INTEGER, EXTERNAL :: type_interaction, find_viz
@ -1360,14 +1360,14 @@ SUBROUTINE v_hubbard_extended (nsg, v_hub, eth)
na = find_viz(na1,na1)
!
! This is the diagonal term (like in the DFT+U only case)
!
!
DO m1 = 1, ldim_u(nt1)
!
i_type = type_interaction(na1,m1,equiv_na2,m1)
!
v_hub(m1,m1,na,na1,is) = v_hub(m1,m1,na,na1,is) &
+ Hubbard_V(na1,na1,i_type) * 0.5d0
!
!
eth = eth + nsg(m1,m1,na,na1,is) &
* Hubbard_V(na1,na1,i_type) * 0.5d0
!
@ -1420,7 +1420,7 @@ SUBROUTINE v_hubbard_extended (nsg, v_hub, eth)
ENDDO ! viz
!
ENDDO ! is
!
!
ENDIF
!
! Hubbard_alpha or Hubbard_alpha_back
@ -1468,7 +1468,7 @@ END SUBROUTINE v_hubbard_extended
!----------------------------------------------------------------------------
SUBROUTINE v_h_of_rho_r( rhor, ehart, charge, v )
!----------------------------------------------------------------------------
!! Hartree potential VH(r) from a density in R space n(r)
!! Hartree potential VH(r) from a density in R space n(r)
!
USE kinds, ONLY : DP
USE fft_base, ONLY : dfftp
@ -1515,7 +1515,7 @@ END SUBROUTINE v_h_of_rho_r
!----------------------------------------------------------------------------
SUBROUTINE gradv_h_of_rho_r( rho, gradv )
!----------------------------------------------------------------------------
!! Gradient of Hartree potential in R space from a total
!! Gradient of Hartree potential in R space from a total
!! (spinless) density in R space n(r)
!
USE kinds, ONLY : DP
@ -1545,7 +1545,7 @@ SUBROUTINE gradv_h_of_rho_r( rho, gradv )
! ... Bring rho to G space
!
ALLOCATE( rhoaux( dfftp%nnr ) )
rhoaux( : ) = CMPLX( rho( : ), 0.D0, KIND=dp )
rhoaux( : ) = CMPLX( rho( : ), 0.D0, KIND=dp )
!
CALL fwfft( 'Rho', rhoaux, dfftp )
!
@ -1560,25 +1560,25 @@ SUBROUTINE gradv_h_of_rho_r( rho, gradv )
DO ig = gstart, ngm
!
fac = g(ipol,ig) / gg(ig)
gaux(dfftp%nl(ig)) = CMPLX(-AIMAG(rhoaux(dfftp%nl(ig))),REAL(rhoaux(dfftp%nl(ig))),KIND=dp)*fac
gaux(dfftp%nl(ig)) = CMPLX(-AIMAG(rhoaux(dfftp%nl(ig))),REAL(rhoaux(dfftp%nl(ig))),KIND=dp)*fac
!
ENDDO
!
! ...and add the factor e2*fpi/2\pi/a coming from the missing prefactor of
! V = e2 * fpi divided by the 2\pi/a factor missing in G
! ...and add the factor e2*fpi/2\pi/a coming from the missing prefactor of
! V = e2 * fpi divided by the 2\pi/a factor missing in G
!
fac = e2 * fpi / tpiba
gaux = gaux * fac
gaux = gaux * fac
!
! ...add martyna-tuckerman correction, if needed
!
!
IF (do_comp_mt) THEN
ALLOCATE( vaux( ngm ), rgtot(ngm) )
rgtot(1:ngm) = rhoaux(dfftp%nl(1:ngm))
CALL wg_corr_h( omega, ngm, rgtot, vaux, eh_corr )
DO ig = gstart, ngm
fac = g(ipol,ig) * tpiba
gaux(dfftp%nl(ig)) = gaux(dfftp%nl(ig)) + CMPLX(-AIMAG(vaux(ig)),REAL(vaux(ig)),kind=dp)*fac
gaux(dfftp%nl(ig)) = gaux(dfftp%nl(ig)) + CMPLX(-AIMAG(vaux(ig)),REAL(vaux(ig)),kind=dp)*fac
END DO
DEALLOCATE( rgtot, vaux )
ENDIF

View File

@ -7,7 +7,7 @@
!
! This file initiated by Carlo Cavazzoni 2020
!
! Purpose: collect miscellaneous subroutines to help deal with
! Purpose: collect miscellaneous subroutines to help deal with
! accelerator devices
! In principle this can go away .......
@ -15,6 +15,12 @@ SUBROUTINE MYDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
use onemkl_blas_gpu
#elif defined(__ROCBLAS)
use rocblas_utils
#endif
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
@ -23,8 +29,18 @@ SUBROUTINE MYDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
!$omp target variant dispatch use_device_ptr(A, X, Y)
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
!$omp end target variant dispatch
#elif defined(__ROCBLAS)
CALL rocblas_dger( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#endif
#else
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#endif
END SUBROUTINE MYDGER
@ -35,6 +51,12 @@ SUBROUTINE MYDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC
#if defined(__CUDA)
use cudafor
use cublas
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
use onemkl_blas_gpu
#elif defined(__ROCBLAS)
use rocblas_utils
#endif
#endif
CHARACTER*1, INTENT(IN) :: TRANSA, TRANSB
INTEGER, INTENT(IN) :: M, N, K, LDA, LDB, LDC
@ -43,6 +65,14 @@ SUBROUTINE MYDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC
#if defined(__CUDA)
attributes(device) :: A, B, C
CALL cublasdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
!$omp target variant dispatch use_device_ptr(A, B, C)
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
!$omp end target variant dispatch
#elif defined(__ROCBLAS)
CALL rocblas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#else
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
@ -86,9 +116,51 @@ END SUBROUTINE MYZGEMM
!=============================================================================================
! The following two are PROVISIONAL routines for omp5 porting. They do the same as MYDGEMM and
! MYZGEMM, but with an additional variable (OMP_OFFLOAD) to decide whether to perform a CPU
! MYZGEMM, but with an additional variable (OMP_OFFLOAD) to decide whether to perform a CPU
! _gemm or call a rocBLAS _gemm, which takes GPU-only arguments.
!
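! A minimal, hypothetical call-site sketch (the names n, ham, smat, ovl and
! use_gpu are illustrative only, not part of this file): when use_gpu is
! .TRUE., the arrays are assumed to be already present on the device (e.g.
! inside an !$omp target data region) and the offloaded _gemm path is taken;
! otherwise the plain CPU BLAS is called.
!
!   CALL MYDGEMM2( 'N', 'N', n, n, n, 1.d0, ham, n, smat, n, 0.d0, ovl, n, use_gpu )
!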
SUBROUTINE MYDGER2 ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA, OMP_OFFLOAD )
#if defined(__CUDA)
use cudafor
use cublas
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
use onemkl_blas_gpu
#elif defined(__ROCBLAS)
use rocblas_utils
#endif
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
LOGICAL, INTENT(IN) :: OMP_OFFLOAD
#if defined(__CUDA)
attributes(device) :: A, X, Y
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
IF (OMP_OFFLOAD) THEN
!$omp target variant dispatch use_device_ptr(A, X, Y)
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
!$omp end target variant dispatch
ELSE
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
ENDIF
#elif defined(__ROCBLAS)
IF (OMP_OFFLOAD) THEN
CALL rocblas_dger( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
ELSE
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
ENDIF
#endif
#else
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#endif
END SUBROUTINE MYDGER2
SUBROUTINE MYDGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, OMP_OFFLOAD )
#if defined(__CUDA)
use cudafor
@ -109,20 +181,20 @@ SUBROUTINE MYDGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LD
#if defined(__CUDA)
attributes(device) :: A, B, C
CALL cublasdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
#if defined(__ONEMKL)
!$omp target variant dispatch use_device_ptr(A, B, C)
#endif
#if defined(__ROCBLAS)
#elif defined(__ONEMKL)
IF (OMP_OFFLOAD) THEN
!$omp target variant dispatch use_device_ptr(A, B, C)
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
!$omp end target variant dispatch
ELSE
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
ENDIF
#elif defined(__ROCBLAS)
IF (OMP_OFFLOAD) CALL rocblas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
IF (.NOT. OMP_OFFLOAD) CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#if defined(__ONEMKL)
!$omp end target variant dispatch
#endif
#endif
END SUBROUTINE MYDGEMM2
@ -146,20 +218,20 @@ SUBROUTINE MYZGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LD
#if defined(__CUDA)
attributes(device) :: A, B, C
CALL cublaszgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
#if defined(__ONEMKL)
!$omp target variant dispatch use_device_ptr(A, B, C)
#endif
#if defined(__ROCBLAS)
#elif defined(__ONEMKL)
IF (OMP_OFFLOAD) THEN
!$omp target variant dispatch use_device_ptr(A, B, C)
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
!$omp end target variant dispatch
ELSE
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
ENDIF
#elif defined(__ROCBLAS)
IF (OMP_OFFLOAD) CALL rocblas_zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
IF (.NOT. OMP_OFFLOAD) CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#if defined(__ONEMKL)
!$omp end target variant dispatch
#endif
#endif
END SUBROUTINE MYZGEMM2
!========================================================================================================
@ -216,9 +288,9 @@ DOUBLE PRECISION FUNCTION MYDDOT_VECTOR_GPU(N,DX,DY)
!$omp declare target
#endif
RES = 0.0d0
MYDDOT_VECTOR_GPU = 0.0d0
MYDDOT_VECTOR_GPU = 0.0d0
IF (N.LE.0) RETURN
! code for unequal increments or equal increments not equal to 1 NOT implemented
! code for unequal increments or equal increments not equal to 1 NOT implemented
M = mod(N,5)
IF(M.NE.0) THEN
!$acc loop vector reduction(+:RES)
@ -226,8 +298,8 @@ DOUBLE PRECISION FUNCTION MYDDOT_VECTOR_GPU(N,DX,DY)
!$omp parallel do simd reduction(+:RES)
#endif
DO I = 1, M
RES = RES + DX(I) * DY(I)
END DO
RES = RES + DX(I) * DY(I)
END DO
#if defined __OPENMP_GPU
!$omp end parallel do simd
#endif
@ -235,15 +307,15 @@ DOUBLE PRECISION FUNCTION MYDDOT_VECTOR_GPU(N,DX,DY)
MYDDOT_VECTOR_GPU = RES
RETURN
END IF
END IF
MP1 = M + 1
END IF
MP1 = M + 1
!$acc loop vector reduction(+:RES)
#if defined __OPENMP_GPU
!$omp parallel do simd reduction(+:RES)
#endif
DO I = MP1, n, 5
RES = RES + DX(I)*DY(I) + DX(I+1)*DY(I+1) + DX(I+2)*DY(I+2) + DX(I+3)*DY(I+3) + DX(I+4)*DY(I+4)
END DO
END DO
#if defined __OPENMP_GPU
!$omp end parallel do simd
#endif

View File

@ -49,7 +49,6 @@ MODULE rocblas
END INTERFACE
CONTAINS
FUNCTION rocblas_get_operation(op)
@ -202,6 +201,29 @@ MODULE rocblas_utils
INTERFACE rocblas_dgemm
MODULE PROCEDURE rocblas_ddgemm, rocblas_dzgemm1, rocblas_dzgemm2, rocblas_dzgemm3
END INTERFACE
INTERFACE
FUNCTION rocblas_dger_(handle, m, n, alpha, x, incx, y, incy, A, lda) &
BIND(C, NAME="rocblas_dger")
USE ISO_C_BINDING
IMPLICIT NONE
TYPE(C_PTR), VALUE :: handle
INTEGER(rocblas_int), VALUE :: m, n
REAL(c_double) :: alpha
TYPE(C_PTR), VALUE :: x
INTEGER(rocblas_int), VALUE :: incx
TYPE(C_PTR), VALUE :: y
INTEGER(rocblas_int), VALUE :: incy
TYPE(C_PTR), VALUE :: A
INTEGER(rocblas_int), VALUE :: lda
INTEGER :: rocblas_dger_
END FUNCTION rocblas_dger_
END INTERFACE
INTERFACE rocblas_dger
MODULE PROCEDURE rocblas_dger1, rocblas_dzger
END INTERFACE
CONTAINS
@ -403,5 +425,51 @@ MODULE rocblas_utils
END SUBROUTINE
SUBROUTINE rocblas_dger1(m, n, alpha, x, incx, y, incy, A, lda)
USE ISO_C_BINDING
IMPLICIT NONE
INTEGER, INTENT(IN) :: m, n, incx, incy, lda
REAL(DP), INTENT(IN) :: alpha
REAL(DP), INTENT(IN), TARGET :: x(m), y(n)
REAL(DP), INTENT(INOUT), TARGET :: A(lda,*)
INTEGER(rocblas_int) :: rm, rn, rincx, rincy, rlda
INTEGER :: stat
rm = int(m, kind(rocblas_int))
rn = int(n, kind(rocblas_int))
rincx = int(incx, kind(rocblas_int))
rincy = int(incy, kind(rocblas_int))
rlda = int(lda, kind(rocblas_int))
!$omp target data use_device_ptr(A, x, y)
stat = rocblas_dger_(handle, rm, rn, alpha, c_loc(x), rincx, c_loc(y), rincy, &
c_loc(A), rlda)
!$omp end target data
CALL rocblas_check(stat, "DGER1")
END SUBROUTINE
SUBROUTINE rocblas_dzger(m, n, alpha, x, incx, y, incy, A, lda)
USE ISO_C_BINDING
IMPLICIT NONE
INTEGER, INTENT(IN) :: m, n, incx, incy, lda
REAL(DP), INTENT(IN) :: alpha
COMPLEX(DP), INTENT(IN), TARGET :: x(m), y(n)
REAL(DP), INTENT(INOUT), TARGET :: A(lda,*)
INTEGER(rocblas_int) :: rm, rn, rincx, rincy, rlda
INTEGER :: stat
rm = int(m, kind(rocblas_int))
rn = int(n, kind(rocblas_int))
rincx = int(incx, kind(rocblas_int))
rincy = int(incy, kind(rocblas_int))
rlda = int(lda, kind(rocblas_int))
!$omp target data use_device_ptr(A, x, y)
stat = rocblas_dger_(handle, rm, rn, alpha, c_loc(x), rincx, c_loc(y), rincy, &
c_loc(A), rlda)
!$omp end target data
CALL rocblas_check(stat, "DZGER")
END SUBROUTINE
END MODULE rocblas_utils
#endif

View File

@ -112,6 +112,11 @@ for dir in $dirs; do
# list of all external library modules or include files
libdeps="mpi omp_lib hdf5 mkl_dfti mkl_dfti.f90 fftw3.f03 fftw3.f \
mkl_omp_offload mkl_omp_offload.f90 \
onemkl_blas_omp_offload_ilp64 onemkl_blas_omp_offload_lp64 \
onemkl_blas_omp_offload_ilp64_no_array_check onemkl_blas_omp_offload_lp64_no_array_check \
onemkl_lapack_omp_offload_ilp64 onemkl_lapack_omp_offload_lp64 \
onemkl_vsl_omp_offload_ilp64 onemkl_vsl_omp_offload_lp64 \
mkl_dfti_omp_offload mkl_dfti_omp_offload.f90 \
xc_version.h xc_f03_lib_m elpa elpa1 \
mbd w90_io fox_dom fox_wxml m_common_io \