mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'develop_omp5' into 'develop_omp5'
regterg to OpenMP offload See merge request icarnimeo/q-e!11
This commit is contained in:
commit
a864194949
|
@ -1387,7 +1387,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
INTEGER :: k, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom, offset
|
||||
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz
|
||||
!
|
||||
INTEGER :: iter, dest, sorc
|
||||
INTEGER :: iter, dest, sorc, ncp_me, npp_gproc
|
||||
INTEGER :: istatus(MPI_STATUS_SIZE)
|
||||
|
||||
me = dfft%mype + 1
|
||||
|
@ -1422,12 +1422,14 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
!$omp target data use_device_addr(f_in)
|
||||
#endif
|
||||
DO gproc = 1, nprocp
|
||||
ncp_me = ncp_(me)
|
||||
npp_gproc = npp_(gproc)
|
||||
kdest = ( gproc - 1 ) * ncpx
|
||||
kfrom = ( gproc - 1 ) * npp_(gproc)
|
||||
kfrom = ( gproc - 1 ) * nppx
|
||||
istat = int(omp_target_memcpy_rect(c_loc(f_aux), c_loc(f_in), &
|
||||
int(sizeof(dummy),c_size_t), &
|
||||
int(2,c_int), &
|
||||
int((/ ncp_(me), npp_(gproc) /),c_size_t), &
|
||||
int((/ ncp_me, npp_gproc /),c_size_t), &
|
||||
int((/ kdest, 0 /),c_size_t), &
|
||||
int((/ 0, kfrom /),c_size_t), &
|
||||
int((/ (nxx_/nppx), nppx /),c_size_t), &
|
||||
|
@ -1447,13 +1449,15 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
DO gproc = 1, nprocp
|
||||
kdest = ( gproc - 1 ) * sendsiz
|
||||
kfrom = offset
|
||||
ncp_me = ncp_(me)
|
||||
npp_gproc = npp_(gproc)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO k = 1, ncp_ (me)
|
||||
DO i = 1, npp_ ( gproc )
|
||||
DO k = 1, ncp_me
|
||||
DO i = 1, npp_gproc
|
||||
f_aux( kdest + i + (k-1)*nppx ) = f_in( kfrom + i + (k-1)*nr3x )
|
||||
END DO
|
||||
END DO
|
||||
offset = offset + npp_ ( gproc )
|
||||
offset = offset + npp_gproc
|
||||
ENDDO
|
||||
#ifndef __GPU_MPI
|
||||
!$omp target update from (f_aux)
|
||||
|
@ -1513,7 +1517,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
10 CONTINUE
|
||||
|
||||
!$omp target teams distribute parallel do
|
||||
do i = lbound(f_aux,1), ubound(f_aux,1)
|
||||
do i = 1, nxx_
|
||||
f_aux(i) = (0.d0, 0.d0)
|
||||
end do
|
||||
!$omp end target teams distribute parallel do
|
||||
|
@ -1567,7 +1571,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
f_in( omp_j + it ) = f_aux( mc + ( omp_j - 1 ) * nnp )
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$omp end target teams distribute parallel do
|
||||
ENDDO
|
||||
ELSE
|
||||
DO gproc = 1, nprocp
|
||||
|
@ -1581,7 +1584,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
f_in( omp_j + it ) = f_aux( mc + ( omp_j - 1 ) * nnp )
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$omp end target teams distribute parallel do
|
||||
ENDDO
|
||||
END IF
|
||||
!
|
||||
|
@ -1624,7 +1626,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
DO i=(me-1)*sendsiz + 1, me*sendsiz
|
||||
f_aux(i) = f_in(i)
|
||||
ENDDO
|
||||
!$omp end target teams distribute parallel do
|
||||
|
||||
call MPI_WAITALL(2*nprocp-2, srh, MPI_STATUSES_IGNORE, ierr)
|
||||
!$omp end target data
|
||||
|
@ -1649,12 +1650,14 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
!$omp target data use_device_addr(f_in)
|
||||
#endif
|
||||
DO gproc = 1, nprocp
|
||||
ncp_me = ncp_(me)
|
||||
npp_gproc = npp_(gproc)
|
||||
kdest = ( gproc - 1 ) * ncpx
|
||||
kfrom = ( gproc - 1 ) * npp_(gproc)
|
||||
kfrom = ( gproc - 1 ) * nppx
|
||||
istat = int(omp_target_memcpy_rect(c_loc(f_in), c_loc(f_aux), &
|
||||
int(sizeof(dummy),c_size_t), &
|
||||
int(2,c_int), &
|
||||
int((/ ncp_(me), npp_(gproc) /),c_size_t), &
|
||||
int((/ ncp_me, npp_gproc /),c_size_t), &
|
||||
int((/ 0, kfrom /),c_size_t), &
|
||||
int((/ kdest, 0 /),c_size_t), &
|
||||
int((/ (nxx_/nr3x), nr3x /),c_size_t), &
|
||||
|
@ -1674,13 +1677,15 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
|
|||
DO gproc = 1, nprocp
|
||||
kdest = ( gproc - 1 ) * sendsiz
|
||||
kfrom = offset
|
||||
ncp_me = ncp_(me)
|
||||
npp_gproc = npp_(gproc)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO k = 1, ncp_(me)
|
||||
DO i = 1, npp_( gproc )
|
||||
DO k = 1, ncp_me
|
||||
DO i = 1, npp_gproc
|
||||
f_in( kfrom + i + (k-1)*nr3x ) = f_aux( kdest + i + (k-1)*nppx )
|
||||
END DO
|
||||
END DO
|
||||
offset = offset + npp_( gproc )
|
||||
offset = offset + npp_gproc
|
||||
ENDDO
|
||||
!
|
||||
#endif
|
||||
|
|
|
@ -322,9 +322,6 @@ CONTAINS
|
|||
ALLOCATE( desc%tg_rdsp( desc%nproc2) ) ; desc%tg_rdsp = 0
|
||||
|
||||
#if defined (__OPENMP_GPU)
|
||||
#ifndef __CRAY
|
||||
!$omp target enter data map(alloc:desc)
|
||||
#endif
|
||||
!$omp target enter data map(always,alloc:desc%nsp)
|
||||
!$omp target enter data map(always,alloc:desc%nsw)
|
||||
!$omp target enter data map(always,alloc:desc%ismap)
|
||||
|
@ -337,7 +334,7 @@ CONTAINS
|
|||
|
||||
nsubbatches = ceiling(real(desc%batchsize)/desc%subbatchsize)
|
||||
ALLOCATE( desc%srh(2*nproc, nsubbatches))
|
||||
!$omp target enter data map(alloc:desc%srh)
|
||||
!$omp target enter data map(always,alloc:desc%srh)
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA)
|
||||
|
@ -429,9 +426,6 @@ CONTAINS
|
|||
IF (OMP_TARGET_IS_PRESENT(c_loc(desc%nlm), OMP_GET_DEFAULT_DEVICE()) == 1) THEN
|
||||
!$omp target exit data map(delete:desc%nlm)
|
||||
ENDIF
|
||||
#ifndef __CRAY
|
||||
!$omp target exit data map(delete:desc)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
IF ( ALLOCATED( desc%aux ) ) DEALLOCATE( desc%aux )
|
||||
|
|
|
@ -86,7 +86,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
|
|||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL MYDGEMM2( 'T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, hpsi(1,n_start),kdmx, 0.D0, hh(1,n_start),nstart,.FALSE. )
|
||||
IF ( gstart == 2 ) call MYDGER( nstart, my_n, -1.D0, psi,kdmx, hpsi(1,n_start),kdmx, hh(1,n_start),nstart )
|
||||
IF ( gstart == 2 ) call MYDGER2( nstart, my_n, -1.D0, psi,kdmx, hpsi(1,n_start),kdmx, hh(1,n_start),nstart,.FALSE. )
|
||||
call start_clock('rotHSw:hc:s1')
|
||||
CALL mp_sum( hh(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
|
||||
call stop_clock('rotHSw:hc:s1')
|
||||
|
@ -99,13 +99,13 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
|
|||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL MYDGEMM2('T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, spsi(1,n_start),kdmx, 0.D0, ss(1,n_start),nstart,.FALSE. )
|
||||
IF ( gstart == 2 ) CALL MYDGER(nstart, my_n, -1.D0, psi,kdmx, spsi(1,n_start),kdmx, ss(1,n_start),nstart)
|
||||
IF ( gstart == 2 ) CALL MYDGER2(nstart, my_n, -1.D0, psi,kdmx, spsi(1,n_start),kdmx, ss(1,n_start),nstart,.FALSE. )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL MYDGEMM2('T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, psi(1,n_start),kdmx, 0.D0, ss(1,n_start),nstart,.FALSE. )
|
||||
IF ( gstart == 2 ) CALL MYDGER(nstart, my_n, -1.D0, psi,kdmx, psi(1,n_start),kdmx, ss(1,n_start),nstart)
|
||||
IF ( gstart == 2 ) CALL MYDGER2(nstart, my_n, -1.D0, psi,kdmx, psi(1,n_start),kdmx, ss(1,n_start),nstart,.FALSE. )
|
||||
!
|
||||
END IF
|
||||
call start_clock('rotHSw:hc:s3')
|
||||
|
|
|
@ -110,6 +110,8 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
! calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
|
||||
! the first nvec columns contain the trial eigenvectors
|
||||
!
|
||||
!$omp target enter data map(to:evc)
|
||||
!$omp target enter data map(alloc:e)
|
||||
CALL start_clock( 'regterg' ) !; write(6,*) 'enter regterg' ; FLUSH(6)
|
||||
!
|
||||
!$acc data deviceptr(evc, e)
|
||||
|
@ -140,6 +142,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
ALLOCATE( spsi( npwx, nvecx ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' regterg ',' cannot allocate spsi ', ABS(ierr) )
|
||||
!$omp target enter data map(alloc:spsi)
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( sr( nvecx, nvecx ), STAT=ierr )
|
||||
|
@ -154,6 +157,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
ALLOCATE( ew( nvecx ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( 'regterg ',' cannot allocate ew ', ABS(ierr) )
|
||||
!$omp target enter data map(alloc:sr, hr, vr, ew)
|
||||
ALLOCATE( conv( nvec ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( 'regterg ',' cannot allocate conv ', ABS(ierr) )
|
||||
|
@ -162,6 +166,33 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
nbase = nvec
|
||||
conv = .FALSE.
|
||||
!
|
||||
#if defined(__OPENMP_GPU)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
do j=1,nvecx
|
||||
do i=1,npwx
|
||||
hpsi(i,j) = ZERO
|
||||
psi (i,j) = ZERO
|
||||
enddo
|
||||
enddo
|
||||
IF ( uspp ) then
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
do j=1,nvecx
|
||||
do i=1,npwx
|
||||
spsi(i,j) = ZERO
|
||||
enddo
|
||||
enddo
|
||||
endif
|
||||
!$omp target teams distribute parallel do
|
||||
DO k=1,nvec
|
||||
psi(1,k) = evc(1,k)
|
||||
! ... set Im[ psi(G=0) ] - needed for numerical stability
|
||||
IF (gstart == 2) psi(1,k) = CMPLX( DBLE( psi(1,k) ), 0.D0 ,kind=DP)
|
||||
DO i=2,npwx
|
||||
psi(i,k) = evc(i,k)
|
||||
END DO
|
||||
END DO
|
||||
!$acc end parallel
|
||||
#else
|
||||
!$acc kernels
|
||||
hpsi = ZERO
|
||||
psi = ZERO
|
||||
|
@ -180,53 +211,83 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
END DO
|
||||
END DO
|
||||
!$acc end parallel
|
||||
#endif
|
||||
!
|
||||
! ... hpsi contains h times the basis vectors
|
||||
!
|
||||
!$acc host_data use_device(psi, hpsi, spsi)
|
||||
!$omp target update from(psi,hpsi)
|
||||
CALL h_psi( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nvec
|
||||
!
|
||||
! ... spsi contains s times the basis vectors
|
||||
!
|
||||
IF ( uspp ) CALL s_psi( npwx, npw, nvec, psi, spsi )
|
||||
IF ( uspp ) then
|
||||
!$omp target update from(spsi)
|
||||
CALL s_psi( npwx, npw, nvec, psi, spsi )
|
||||
endif
|
||||
!$acc end host_data
|
||||
!
|
||||
! ... hr contains the projection of the hamiltonian onto the reduced
|
||||
! ... space vr contains the eigenvectors of hr
|
||||
!
|
||||
CALL start_clock( 'regterg:init' )
|
||||
#if !defined(__OPENMP_GPU)
|
||||
!$acc kernels
|
||||
hr(:,:) = 0.D0
|
||||
sr(:,:) = 0.D0
|
||||
vr(:,:) = 0.D0
|
||||
!$acc end kernels
|
||||
#else
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
do j=1,nvecx
|
||||
do i=1,nvecx
|
||||
hr(i,j) = 0.D0
|
||||
sr(i,j) = 0.D0
|
||||
vr(i,j) = 0.D0
|
||||
enddo
|
||||
enddo
|
||||
#endif
|
||||
!
|
||||
!$acc host_data use_device(psi, hpsi, spsi, hr, sr)
|
||||
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
|
||||
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
|
||||
if (n_start .le. n_end) then
|
||||
!$omp target update to(hpsi)
|
||||
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
|
||||
endif
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$omp target update to(hpsi)
|
||||
CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
|
||||
ENDIF
|
||||
!$omp target update from(hr)
|
||||
CALL mp_sum( hr( :, 1:nbase ), inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( hr( :, 1:nbase ), intra_bgrp_comm )
|
||||
!$omp target update to(hr)
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
|
||||
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
|
||||
if (n_start .le. n_end) then
|
||||
!$omp target update to(spsi)
|
||||
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
|
||||
endif
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$omp target update to(spsi)
|
||||
CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
|
||||
ENDIF
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
|
||||
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
|
||||
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, psi(1,n_start), npwx2, sr(1,n_start), nvecx )
|
||||
!
|
||||
END IF
|
||||
!$omp target update from(sr)
|
||||
CALL mp_sum( sr( :, 1:nbase ), inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( sr( :, 1:nbase ), intra_bgrp_comm )
|
||||
!$omp target update to(sr)
|
||||
!$acc end host_data
|
||||
!
|
||||
CALL stop_clock( 'regterg:init' )
|
||||
|
@ -234,6 +295,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
IF ( lrot ) THEN
|
||||
!
|
||||
!$acc parallel loop
|
||||
!$omp target teams distribute parallel do
|
||||
DO n = 1, nbase
|
||||
!
|
||||
e(n) = hr(n,n)
|
||||
|
@ -256,8 +318,10 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
ENDIF
|
||||
CALL stop_clock( 'regterg:diag' )
|
||||
!$acc end host_data
|
||||
!$omp target update to(vr,ew)
|
||||
!
|
||||
!$acc parallel loop
|
||||
!$omp target teams distribute parallel do
|
||||
DO i = 1, nvec
|
||||
e(i) = ew(i)
|
||||
END DO
|
||||
|
@ -288,6 +352,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
IF ( np /= n ) THEN
|
||||
!$acc parallel loop
|
||||
!$omp target teams distribute parallel do
|
||||
DO i = 1, nvecx
|
||||
vr(i,np) = vr(i,n)
|
||||
END DO
|
||||
|
@ -296,8 +361,10 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
! ... for use in g_psi
|
||||
!
|
||||
!$acc kernels
|
||||
!$omp target
|
||||
ew(nbase+np) = e(n)
|
||||
!$acc end kernels
|
||||
!$omp end target
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -310,6 +377,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i=1, notcnv
|
||||
DO k=1,npwx
|
||||
psi(k,nbase+i)=ZERO
|
||||
|
@ -318,19 +386,22 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!$acc host_data use_device(psi, spsi, vr)
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
|
||||
if (n_start .le. n_end) then
|
||||
!$omp target update to(spsi)
|
||||
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
|
||||
endif
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
|
||||
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
|
||||
!
|
||||
END IF
|
||||
!$acc end host_data
|
||||
! NB: must not call mp_sum over inter_bgrp_comm here because it is done later to the full correction
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO np=1,notcnv
|
||||
DO k=1,npwx
|
||||
psi(k,nbase+np) = - ew(nbase+np) * psi(k,nbase+np)
|
||||
|
@ -338,14 +409,19 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
END DO
|
||||
!
|
||||
!$acc host_data use_device(psi, hpsi, vr, ew)
|
||||
if (n_start .le. n_end) &
|
||||
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
|
||||
if (n_start .le. n_end) then
|
||||
!$omp target update to(hpsi)
|
||||
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
|
||||
endif
|
||||
!
|
||||
!$omp target update from(psi)
|
||||
CALL mp_sum( psi(:,nb1:nbase+notcnv), inter_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock( 'regterg:update' )
|
||||
!
|
||||
! ... approximate inverse iteration
|
||||
!
|
||||
!$omp target update from(ew)
|
||||
CALL g_psi( npwx, npw, notcnv, 1, psi(1,nb1), ew(nb1) )
|
||||
!$acc end host_data
|
||||
!
|
||||
|
@ -357,6 +433,8 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
!$acc parallel vector_length(96)
|
||||
!$acc loop gang private(nbn)
|
||||
!$omp target update to(psi,ew)
|
||||
!$omp target teams distribute private(nbn)
|
||||
DO n = 1, notcnv
|
||||
!
|
||||
nbn = nbase + n
|
||||
|
@ -366,13 +444,16 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
END DO
|
||||
!$acc end parallel
|
||||
!$omp target update from(ew)
|
||||
!
|
||||
!$acc host_data use_device(ew)
|
||||
CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
|
||||
!$acc end host_data
|
||||
!
|
||||
!$omp target update to(ew)
|
||||
!$acc parallel vector_length(96)
|
||||
!$acc loop gang
|
||||
!$omp target teams distribute parallel do
|
||||
DO i = 1,notcnv
|
||||
psi(1,nbase+i) = psi(1,nbase+i)/SQRT( ew(i) )
|
||||
! ... set Im[ psi(G=0) ] - needed for numerical stability
|
||||
|
@ -387,9 +468,13 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
! ... here compute the hpsi and spsi of the new functions
|
||||
!
|
||||
!$acc host_data use_device(psi, hpsi, spsi)
|
||||
!$omp target update from(psi,hpsi)
|
||||
CALL h_psi( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
|
||||
!
|
||||
IF ( uspp ) CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
|
||||
IF ( uspp ) THEN
|
||||
!$omp target update from(spsi)
|
||||
CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
|
||||
ENDIF
|
||||
!$acc end host_data
|
||||
!
|
||||
! ... update the reduced hamiltonian
|
||||
|
@ -397,6 +482,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
CALL start_clock( 'regterg:overlap' )
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i=0,notcnv-1
|
||||
DO j=1, nvecx
|
||||
hr( j, nb1+i )=0.d0
|
||||
|
@ -406,14 +492,18 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!$acc host_data use_device(psi, hpsi, hr)
|
||||
CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
|
||||
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
|
||||
!$omp target update to(hpsi)
|
||||
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
|
||||
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, hr(n_start,nb1), nvecx )
|
||||
!$omp target update from(hr)
|
||||
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
|
||||
!$omp target update to(hr)
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i=0,notcnv-1
|
||||
DO j=1, nvecx
|
||||
sr( j, nb1+i )=0.d0
|
||||
|
@ -425,18 +515,21 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
|
||||
!$omp target update to(spsi)
|
||||
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
|
||||
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
|
||||
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
|
||||
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
|
||||
!
|
||||
END IF
|
||||
!$omp target update from(sr)
|
||||
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
|
||||
!$omp target update to(sr)
|
||||
!$acc end host_data
|
||||
!
|
||||
CALL stop_clock( 'regterg:overlap' )
|
||||
|
@ -445,6 +538,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
!$acc parallel
|
||||
!$acc loop gang
|
||||
!$omp target teams distribute parallel do
|
||||
DO n = 1, nbase
|
||||
!
|
||||
!$acc loop vector
|
||||
|
@ -462,6 +556,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
CALL start_clock( 'regterg:diag' )
|
||||
!$acc host_data use_device(hr, sr, ew, vr)
|
||||
!$omp target update from(hr, sr)
|
||||
IF( my_bgrp_id == root_bgrp_id ) THEN
|
||||
CALL diaghg( nbase, nvec, hr, sr, nvecx, ew, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
END IF
|
||||
|
@ -470,11 +565,13 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
|
||||
ENDIF
|
||||
!$acc end host_data
|
||||
!$omp target update to(hr,sr,vr,ew)
|
||||
CALL stop_clock( 'regterg:diag' )
|
||||
!
|
||||
! ... test for convergence
|
||||
!
|
||||
!$acc parallel loop copy(conv(1:nvec)) copyin(btype(1:nvec))
|
||||
!$omp target teams distribute parallel do map(tofrom:conv) map(to:btype)
|
||||
DO i = 1, nvec
|
||||
IF(btype(i) == 1) THEN
|
||||
conv(i) = ( ( ABS( ew(i) - e(i) ) < ethr ) )
|
||||
|
@ -489,6 +586,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
notcnv = COUNT( .NOT. conv(:) )
|
||||
!
|
||||
!$acc parallel loop
|
||||
!$omp target teams distribute parallel do
|
||||
DO i=1,nvec
|
||||
e(i) = ew(i)
|
||||
END DO
|
||||
|
@ -505,6 +603,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
CALL start_clock( 'regterg:last' )
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO k=1,nvec
|
||||
DO i=1,npwx
|
||||
evc(i,k) = ZERO
|
||||
|
@ -514,9 +613,11 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
|
||||
!$acc host_data use_device(psi, vr)
|
||||
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
|
||||
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
|
||||
!$omp target update from(evc)
|
||||
!$acc end host_data
|
||||
CALL mp_sum( evc, inter_bgrp_comm )
|
||||
!$omp target update to(evc)
|
||||
!
|
||||
IF ( notcnv == 0 ) THEN
|
||||
!
|
||||
|
@ -542,6 +643,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
! ... refresh psi, H*psi and S*psi
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i=1,nvec
|
||||
DO k=1,npwx
|
||||
psi(k,i) = evc(k,i)
|
||||
|
@ -551,6 +653,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
IF ( uspp ) THEN
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i = 1, npwx
|
||||
DO j = nvec+1, nvec+nvec
|
||||
psi(i,j) = ZERO
|
||||
|
@ -558,11 +661,14 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
END DO
|
||||
!
|
||||
!$acc host_data use_device(psi, spsi, vr)
|
||||
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
|
||||
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
|
||||
!$omp target update from(psi)
|
||||
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
|
||||
!$omp target update to(psi)
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i=1,nvec
|
||||
DO k=1,npwx
|
||||
spsi(k,i) = psi(k,i+nvec)
|
||||
|
@ -571,15 +677,27 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
END IF
|
||||
!
|
||||
#if defined(__OPENMP_GPU)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
do j=nvec+1,nvec+nvec
|
||||
do i=1,npwx
|
||||
psi(i,j) = ZERO
|
||||
enddo
|
||||
enddo
|
||||
#else
|
||||
!$acc kernels
|
||||
psi(:,nvec+1:nvec+nvec) = ZERO
|
||||
!$acc end kernels
|
||||
#endif
|
||||
!$acc host_data use_device(psi, hpsi, vr)
|
||||
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
|
||||
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
|
||||
!$omp target update from(psi)
|
||||
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
|
||||
!$omp target update to(psi)
|
||||
!$acc end host_data
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i=1,nvec
|
||||
DO k=1, npwx
|
||||
hpsi(k,i) = psi(k,i+nvec)
|
||||
|
@ -591,6 +709,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
nbase = nvec
|
||||
!
|
||||
!$acc parallel loop collapse(2)
|
||||
!$omp target teams distribute parallel do collapse(2)
|
||||
DO i = 1, nvecx
|
||||
DO j = 1, nbase
|
||||
hr(i,j) = 0.D0
|
||||
|
@ -600,6 +719,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
END DO
|
||||
!
|
||||
!$acc parallel loop
|
||||
!$omp target teams distribute parallel do
|
||||
DO j = 1, nbase
|
||||
hr(j,j) = e(j)
|
||||
sr(j,j) = 1.D0
|
||||
|
@ -612,12 +732,19 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
!
|
||||
END DO iterate
|
||||
!
|
||||
!$omp target exit data map(delete:evc)
|
||||
!$omp target exit data map(from:e)
|
||||
DEALLOCATE( conv )
|
||||
!$omp target exit data map(delete:ew)
|
||||
DEALLOCATE( ew )
|
||||
!$omp target exit data map(delete:vr)
|
||||
DEALLOCATE( vr )
|
||||
!$omp target exit data map(delete:hr)
|
||||
DEALLOCATE( hr )
|
||||
!$omp target exit data map(delete:sr)
|
||||
DEALLOCATE( sr )
|
||||
!
|
||||
!$omp target exit data map(delete:spsi)
|
||||
IF ( uspp ) DEALLOCATE( spsi )
|
||||
!
|
||||
#if defined(__OPENMP_GPU)
|
||||
|
@ -1291,7 +1418,6 @@ CONTAINS
|
|||
!
|
||||
END DO
|
||||
|
||||
|
||||
DEALLOCATE( vtmp )
|
||||
DEALLOCATE( ptmp )
|
||||
|
||||
|
@ -1580,7 +1706,6 @@ CONTAINS
|
|||
CALL mp_root_sum( vtmp(:,1:nc), dm, root, ortho_parent_comm )
|
||||
END IF
|
||||
|
||||
|
||||
END DO
|
||||
!
|
||||
END IF
|
||||
|
|
|
@ -159,7 +159,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
|
|||
CALL start_clock( 'pcg:ortho' )
|
||||
!$acc host_data use_device(spsi0, psi0, z, spsi0vec)
|
||||
CALL MYDGEMM2( 'T','N', nbnd,nnew,npw2, 2.D0, spsi0, npwx2, z(:,nactive+1), npwx2, 0.D0, spsi0vec, nbnd,.FALSE. )
|
||||
IF ( gstart == 2 ) CALL MYDGER( nbnd, nnew, -1.D0, spsi0, npwx2, z(:,nactive+1), npwx2, spsi0vec, nbnd )
|
||||
IF ( gstart == 2 ) CALL MYDGER2( nbnd, nnew, -1.D0, spsi0, npwx2, z(:,nactive+1), npwx2, spsi0vec, nbnd,.FALSE. )
|
||||
CALL mp_sum( spsi0vec, intra_bgrp_comm )
|
||||
CALL MYDGEMM2( 'N','N', npw2,nnew,nbnd,-1.D0, psi0, npwx2, spsi0vec, nbnd, 1.D0, z(:,nactive+1), npwx2,.FALSE. )
|
||||
!$acc end host_data
|
||||
|
@ -277,7 +277,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
|
|||
CALL start_clock( 'pcg:ortho' )
|
||||
!$acc host_data use_device(spsi0, psi0, z, spsi0vec)
|
||||
CALL MYDGEMM2( 'T','N', nbnd,nactive,npw2, 2.D0, spsi0, npwx2, z, npwx2, 0.D0, spsi0vec, nbnd, .FALSE. )
|
||||
IF ( gstart == 2 ) CALL MYDGER( nbnd, nactive, -1.D0, spsi0, npwx2, z, npwx2, spsi0vec, nbnd )
|
||||
IF ( gstart == 2 ) CALL MYDGER2( nbnd, nactive, -1.D0, spsi0, npwx2, z, npwx2, spsi0vec, nbnd, .FALSE. )
|
||||
CALL mp_sum( spsi0vec, intra_bgrp_comm )
|
||||
CALL MYDGEMM2( 'N','N', npw2,nactive,nbnd,-1.D0, psi0, npwx2, spsi0vec, nbnd, 1.D0, z, npwx2, .FALSE. )
|
||||
!$acc end host_data
|
||||
|
|
|
@ -165,7 +165,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
|
|||
kedtaue2(k,1) = kedtau(k,1) / e2
|
||||
ENDDO
|
||||
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtaue2, sx, sc, &
|
||||
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
|
||||
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=gpuarg )
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
v2c(k,1) = v2cm(1,k,1)
|
||||
|
@ -205,7 +205,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
|
|||
kedtaue2(k,1:nspin0) = kedtau(k,1:nspin0) / e2
|
||||
ENDDO
|
||||
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtaue2, sx, sc, &
|
||||
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
|
||||
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=gpuarg )
|
||||
!$acc parallel loop
|
||||
DO k = 1, nrxx
|
||||
v2c(k,:) = v2cm(1,k,:)
|
||||
|
@ -218,7 +218,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
|
|||
ALLOCATE( v2c_ud(nrxx) )
|
||||
!$acc data create( v2c_ud )
|
||||
!
|
||||
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=.TRUE. )
|
||||
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=gpuarg )
|
||||
!
|
||||
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
|
||||
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
|
||||
|
|
|
@ -470,7 +470,7 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
|
|||
!$acc data create( ex, ec, vx, vc )
|
||||
!$acc parallel loop
|
||||
#elif defined(__OPENMP_GPU)
|
||||
!$omp target data map(to:rho%of_r,rho_core) map(alloc:ex,ec,vx,vc) map(from:v)
|
||||
!$omp target data map(always,to:rho%of_r) map(to:rho_core) map(alloc:ex,ec,vx,vc) map(from:v)
|
||||
!$omp target teams distribute parallel do
|
||||
#endif
|
||||
DO ir = 1, dfftp_nnr
|
||||
|
|
|
@ -15,6 +15,12 @@ SUBROUTINE MYDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
|||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#elif defined(__OPENMP_GPU)
|
||||
#if defined(__ONEMKL)
|
||||
use onemkl_blas_gpu
|
||||
#elif defined(__ROCBLAS)
|
||||
use rocblas_utils
|
||||
#endif
|
||||
#endif
|
||||
! .. Scalar Arguments ..
|
||||
DOUBLE PRECISION :: ALPHA
|
||||
|
@ -23,8 +29,18 @@ SUBROUTINE MYDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
|||
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
#endif
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#elif defined(__OPENMP_GPU)
|
||||
#if defined(__ONEMKL)
|
||||
!$omp target variant dispatch use_device_ptr(A, X, Y)
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
!$omp end target variant dispatch
|
||||
#elif defined(__ROCBLAS)
|
||||
CALL rocblas_dger( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#endif
|
||||
#else
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#endif
|
||||
|
||||
END SUBROUTINE MYDGER
|
||||
|
||||
|
@ -35,6 +51,12 @@ SUBROUTINE MYDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC
|
|||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#elif defined(__OPENMP_GPU)
|
||||
#if defined(__ONEMKL)
|
||||
use onemkl_blas_gpu
|
||||
#elif defined(__ROCBLAS)
|
||||
use rocblas_utils
|
||||
#endif
|
||||
#endif
|
||||
CHARACTER*1, INTENT(IN) :: TRANSA, TRANSB
|
||||
INTEGER, INTENT(IN) :: M, N, K, LDA, LDB, LDC
|
||||
|
@ -43,6 +65,14 @@ SUBROUTINE MYDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC
|
|||
#if defined(__CUDA)
|
||||
attributes(device) :: A, B, C
|
||||
CALL cublasdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#elif defined(__OPENMP_GPU)
|
||||
#if defined(__ONEMKL)
|
||||
!$omp target variant dispatch use_device_ptr(A, B, C)
|
||||
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
!$omp end target variant dispatch
|
||||
#elif defined(__ROCBLAS)
|
||||
CALL rocblas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#endif
|
||||
#else
|
||||
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#endif
|
||||
|
@ -89,6 +119,48 @@ END SUBROUTINE MYZGEMM
|
|||
! MYZGEMM, but with an additional variable (OMP_OFFLOAD) to decide wether to perform a cpu
|
||||
! _gemm or call a rocblas _gemm which takes gpu_only arguments.
|
||||
!
|
||||
SUBROUTINE MYDGER2 ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA, OMP_OFFLOAD )
|
||||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#elif defined(__OPENMP_GPU)
|
||||
#if defined(__ONEMKL)
|
||||
use onemkl_blas_gpu
|
||||
#elif defined(__ROCBLAS)
|
||||
use rocblas_utils
|
||||
#endif
|
||||
#endif
|
||||
! .. Scalar Arguments ..
|
||||
DOUBLE PRECISION :: ALPHA
|
||||
INTEGER :: INCX, INCY, LDA, M, N
|
||||
! .. Array Arguments ..
|
||||
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
|
||||
LOGICAL, INTENT(IN) :: OMP_OFFLOAD
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#elif defined(__OPENMP_GPU)
|
||||
#if defined(__ONEMKL)
|
||||
IF (OMP_OFFLOAD) THEN
|
||||
!$omp target variant dispatch use_device_ptr(A, X, Y)
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
!$omp end target variant dispatch
|
||||
ELSE
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
ENDIF
|
||||
#elif defined(__ROCBLAS)
|
||||
IF (OMP_OFFLOAD) THEN
|
||||
CALL rocblas_dger( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
ELSE
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
ENDIF
|
||||
#endif
|
||||
#else
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#endif
|
||||
|
||||
END SUBROUTINE MYDGER2
|
||||
|
||||
SUBROUTINE MYDGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, OMP_OFFLOAD )
|
||||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
|
@ -109,20 +181,20 @@ SUBROUTINE MYDGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LD
|
|||
#if defined(__CUDA)
|
||||
attributes(device) :: A, B, C
|
||||
CALL cublasdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#else
|
||||
#if defined(__ONEMKL)
|
||||
#elif defined(__ONEMKL)
|
||||
IF (OMP_OFFLOAD) THEN
|
||||
!$omp target variant dispatch use_device_ptr(A, B, C)
|
||||
#endif
|
||||
#if defined(__ROCBLAS)
|
||||
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
!$omp end target variant dispatch
|
||||
ELSE
|
||||
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
ENDIF
|
||||
#elif defined(__ROCBLAS)
|
||||
IF (OMP_OFFLOAD) CALL rocblas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
IF (.NOT. OMP_OFFLOAD) CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#else
|
||||
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#endif
|
||||
#if defined(__ONEMKL)
|
||||
!$omp end target variant dispatch
|
||||
#endif
|
||||
#endif
|
||||
|
||||
END SUBROUTINE MYDGEMM2
|
||||
|
||||
|
@ -146,20 +218,20 @@ SUBROUTINE MYZGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LD
|
|||
#if defined(__CUDA)
|
||||
attributes(device) :: A, B, C
|
||||
CALL cublaszgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#else
|
||||
#if defined(__ONEMKL)
|
||||
#elif defined(__ONEMKL)
|
||||
IF (OMP_OFFLOAD) THEN
|
||||
!$omp target variant dispatch use_device_ptr(A, B, C)
|
||||
#endif
|
||||
#if defined(__ROCBLAS)
|
||||
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
!$omp end target variant dispatch
|
||||
ELSE
|
||||
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
ENDIF
|
||||
#elif defined(__ROCBLAS)
|
||||
IF (OMP_OFFLOAD) CALL rocblas_zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
IF (.NOT. OMP_OFFLOAD) CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#else
|
||||
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
|
||||
#endif
|
||||
#if defined(__ONEMKL)
|
||||
!$omp end target variant dispatch
|
||||
#endif
|
||||
#endif
|
||||
|
||||
END SUBROUTINE MYZGEMM2
|
||||
!========================================================================================================
|
||||
|
|
|
@ -49,7 +49,6 @@ MODULE rocblas
|
|||
|
||||
END INTERFACE
|
||||
|
||||
|
||||
CONTAINS
|
||||
|
||||
FUNCTION rocblas_get_operation(op)
|
||||
|
@ -203,6 +202,29 @@ MODULE rocblas_utils
|
|||
MODULE PROCEDURE rocblas_ddgemm, rocblas_dzgemm1, rocblas_dzgemm2, rocblas_dzgemm3
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE
|
||||
FUNCTION rocblas_dger_(handle, m, n, alpha, x, incx, y, incy, A, lda) &
|
||||
BIND(C, NAME="rocblas_dger")
|
||||
USE ISO_C_BINDING
|
||||
IMPLICIT NONE
|
||||
TYPE(C_PTR), VALUE :: handle
|
||||
INTEGER(rocblas_int), VALUE :: m, n
|
||||
REAL(c_double) :: alpha
|
||||
TYPE(C_PTR), VALUE :: x
|
||||
INTEGER(rocblas_int), VALUE :: incx
|
||||
TYPE(C_PTR), VALUE :: y
|
||||
INTEGER(rocblas_int), VALUE :: incy
|
||||
TYPE(C_PTR), VALUE :: A
|
||||
INTEGER(rocblas_int), VALUE :: lda
|
||||
INTEGER :: rocblas_dger_
|
||||
END FUNCTION rocblas_dger_
|
||||
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE rocblas_dger
|
||||
MODULE PROCEDURE rocblas_dger1, rocblas_dzger
|
||||
END INTERFACE
|
||||
|
||||
CONTAINS
|
||||
|
||||
SUBROUTINE rocblas_zgemm(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
||||
|
@ -403,5 +425,51 @@ MODULE rocblas_utils
|
|||
|
||||
END SUBROUTINE
|
||||
|
||||
SUBROUTINE rocblas_dger1(m, n, alpha, x, incx, y, incy, A, lda)
|
||||
USE ISO_C_BINDING
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: m, n, incx, incy, lda
|
||||
REAL(DP), INTENT(IN) :: alpha
|
||||
REAL(DP), INTENT(IN), TARGET :: x(m), y(n)
|
||||
REAL(DP), INTENT(INOUT), TARGET :: A(lda,*)
|
||||
INTEGER :: rm, rn, rincx, rincy, rlda
|
||||
INTEGER :: stat
|
||||
rm = int(m, kind(rocblas_int))
|
||||
rn = int(n, kind(rocblas_int))
|
||||
rincx = int(incx, kind(rocblas_int))
|
||||
rincy = int(incx, kind(rocblas_int))
|
||||
rlda = int(lda, kind(rocblas_int))
|
||||
|
||||
!$omp target data use_device_ptr(A, x, y)
|
||||
stat = rocblas_dger_(handle, rm, rn, alpha, c_loc(x), rincx, c_loc(y), rincy, &
|
||||
c_loc(A), rlda)
|
||||
!$omp end target data
|
||||
CALL rocblas_check(stat, "DGER1")
|
||||
|
||||
END SUBROUTINE
|
||||
|
||||
SUBROUTINE rocblas_dzger(m, n, alpha, x, incx, y, incy, A, lda)
|
||||
USE ISO_C_BINDING
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: m, n, incx, incy, lda
|
||||
REAL(DP), INTENT(IN) :: alpha
|
||||
COMPLEX(DP), INTENT(IN) :: x(m), y(n)
|
||||
REAL(DP), INTENT(INOUT) :: A(lda,*)
|
||||
INTEGER :: rm, rn, rincx, rincy, rlda
|
||||
INTEGER :: stat
|
||||
rm = int(m, kind(rocblas_int))
|
||||
rn = int(n, kind(rocblas_int))
|
||||
rincx = int(incx, kind(rocblas_int))
|
||||
rincy = int(incx, kind(rocblas_int))
|
||||
rlda = int(lda, kind(rocblas_int))
|
||||
|
||||
!$omp target data use_device_ptr(A, x, y)
|
||||
stat = rocblas_dger_(handle, rm, rn, alpha, c_loc(x), rincx, c_loc(y), rincy, &
|
||||
c_loc(A), rlda)
|
||||
!$omp end target data
|
||||
CALL rocblas_check(stat, "DZGER")
|
||||
|
||||
END SUBROUTINE
|
||||
|
||||
END MODULE rocblas_utils
|
||||
#endif
|
||||
|
|
|
@ -112,6 +112,11 @@ for dir in $dirs; do
|
|||
|
||||
# list of all external library modules or include files
|
||||
libdeps="mpi omp_lib hdf5 mkl_dfti mkl_dfti.f90 fftw3.f03 fftw3.f \
|
||||
mkl_omp_offload mkl_omp_offload.f90 \
|
||||
onemkl_blas_omp_offload_ilp64 onemkl_blas_omp_offload_lp64 \
|
||||
onemkl_blas_omp_offload_ilp64_no_array_check onemkl_blas_omp_offload_lp64_no_array_check \
|
||||
onemkl_lapack_omp_offload_ilp64 onemkl_lapack_omp_offload_lp64 \
|
||||
onemkl_vsl_omp_offload_ilp64 onemkl_vsl_omp_offload_lp64 \
|
||||
mkl_dfti_omp_offload mkl_dfti_omp_offload.f90 \
|
||||
xc_version.h xc_f03_lib_m elpa elpa1 \
|
||||
mbd w90_io fox_dom fox_wxml m_common_io \
|
||||
|
|
Loading…
Reference in New Issue