Merge branch 'develop_omp5' into 'develop_omp5'

regterg to OpenMP offload

See merge request icarnimeo/q-e!11
commit a864194949
Ivan Carnimeo  2023-04-20 13:45:19 +00:00
10 changed files with 573 additions and 304 deletions

View File

@ -1387,7 +1387,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
INTEGER :: k, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom, offset
INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz
!
INTEGER :: iter, dest, sorc
INTEGER :: iter, dest, sorc, ncp_me, npp_gproc
INTEGER :: istatus(MPI_STATUS_SIZE)
me = dfft%mype + 1
@ -1422,12 +1422,14 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
!$omp target data use_device_addr(f_in)
#endif
DO gproc = 1, nprocp
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
kdest = ( gproc - 1 ) * ncpx
kfrom = ( gproc - 1 ) * npp_(gproc)
kfrom = ( gproc - 1 ) * nppx
istat = int(omp_target_memcpy_rect(c_loc(f_aux), c_loc(f_in), &
int(sizeof(dummy),c_size_t), &
int(2,c_int), &
int((/ ncp_(me), npp_(gproc) /),c_size_t), &
int((/ ncp_me, npp_gproc /),c_size_t), &
int((/ kdest, 0 /),c_size_t), &
int((/ 0, kfrom /),c_size_t), &
int((/ (nxx_/nppx), nppx /),c_size_t), &
@ -1447,13 +1449,15 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
DO gproc = 1, nprocp
kdest = ( gproc - 1 ) * sendsiz
kfrom = offset
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
!$omp target teams distribute parallel do collapse(2)
DO k = 1, ncp_ (me)
DO i = 1, npp_ ( gproc )
DO k = 1, ncp_me
DO i = 1, npp_gproc
f_aux( kdest + i + (k-1)*nppx ) = f_in( kfrom + i + (k-1)*nr3x )
END DO
END DO
offset = offset + npp_ ( gproc )
offset = offset + npp_gproc
ENDDO
#ifndef __GPU_MPI
!$omp target update from (f_aux)
@ -1513,7 +1517,7 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
10 CONTINUE
!$omp target teams distribute parallel do
do i = lbound(f_aux,1), ubound(f_aux,1)
do i = 1, nxx_
f_aux(i) = (0.d0, 0.d0)
end do
!$omp end target teams distribute parallel do
@ -1567,7 +1571,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
f_in( omp_j + it ) = f_aux( mc + ( omp_j - 1 ) * nnp )
ENDDO
ENDDO
!$omp end target teams distribute parallel do
ENDDO
ELSE
DO gproc = 1, nprocp
@ -1581,7 +1584,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
f_in( omp_j + it ) = f_aux( mc + ( omp_j - 1 ) * nnp )
ENDDO
ENDDO
!$omp end target teams distribute parallel do
ENDDO
END IF
!
@ -1624,7 +1626,6 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
DO i=(me-1)*sendsiz + 1, me*sendsiz
f_aux(i) = f_in(i)
ENDDO
!$omp end target teams distribute parallel do
call MPI_WAITALL(2*nprocp-2, srh, MPI_STATUSES_IGNORE, ierr)
!$omp end target data
@ -1649,12 +1650,14 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
!$omp target data use_device_addr(f_in)
#endif
DO gproc = 1, nprocp
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
kdest = ( gproc - 1 ) * ncpx
kfrom = ( gproc - 1 ) * npp_(gproc)
kfrom = ( gproc - 1 ) * nppx
istat = int(omp_target_memcpy_rect(c_loc(f_in), c_loc(f_aux), &
int(sizeof(dummy),c_size_t), &
int(2,c_int), &
int((/ ncp_(me), npp_(gproc) /),c_size_t), &
int((/ ncp_me, npp_gproc /),c_size_t), &
int((/ 0, kfrom /),c_size_t), &
int((/ kdest, 0 /),c_size_t), &
int((/ (nxx_/nr3x), nr3x /),c_size_t), &
@ -1674,13 +1677,15 @@ SUBROUTINE fft_scatter_omp ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn )
DO gproc = 1, nprocp
kdest = ( gproc - 1 ) * sendsiz
kfrom = offset
ncp_me = ncp_(me)
npp_gproc = npp_(gproc)
!$omp target teams distribute parallel do collapse(2)
DO k = 1, ncp_(me)
DO i = 1, npp_( gproc )
DO k = 1, ncp_me
DO i = 1, npp_gproc
f_in( kfrom + i + (k-1)*nr3x ) = f_aux( kdest + i + (k-1)*nppx )
END DO
END DO
offset = offset + npp_( gproc )
offset = offset + npp_gproc
ENDDO
!
#endif
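
The hunks above hoist ncp_(me) and npp_(gproc) into scalars (ncp_me, npp_gproc) before they are used in the OpenMP calls and loops, and perform the strided pack/unpack with omp_target_memcpy_rect. Below is a minimal, self-contained sketch of that runtime call; all names and sizes are illustrative, not taken from the patch, and the copy is done host-to-host so the example runs on any OpenMP 5.x runtime.

! Sketch only: copy a 2x3 rectangle between two flattened nrow x ncol buffers,
! mirroring the volume/offset/dimension argument pattern used for f_in/f_aux above.
program memcpy_rect_sketch
  use omp_lib
  use, intrinsic :: iso_c_binding
  implicit none
  integer, parameter :: nrow = 4, ncol = 8            ! full extents of both buffers
  complex(c_double_complex), target :: src(nrow*ncol), dst(nrow*ncol)
  complex(c_double_complex) :: dummy
  integer :: istat, hostdev
  src = (1.0_c_double, 0.0_c_double)
  dst = (0.0_c_double, 0.0_c_double)
  hostdev = omp_get_initial_device()                  ! host-to-host copy for the demo
  istat = omp_target_memcpy_rect(c_loc(dst), c_loc(src),            &
              c_sizeof(dummy),                                      & ! element size
              int(2, c_int),                                        & ! rank of the copy
              int((/ 2, 3 /), c_size_t),                            & ! volume: 2 rows x 3 cols
              int((/ 1, 0 /), c_size_t),                            & ! dst offsets (row, col)
              int((/ 0, 2 /), c_size_t),                            & ! src offsets (row, col)
              int((/ nrow, ncol /), c_size_t),                      & ! dst full dimensions
              int((/ nrow, ncol /), c_size_t),                      & ! src full dimensions
              int(hostdev, c_int), int(hostdev, c_int))               ! dst/src device numbers
  print *, 'omp_target_memcpy_rect returned ', istat
end program memcpy_rect_sketch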

View File

@ -322,9 +322,6 @@ CONTAINS
ALLOCATE( desc%tg_rdsp( desc%nproc2) ) ; desc%tg_rdsp = 0
#if defined (__OPENMP_GPU)
#ifndef __CRAY
!$omp target enter data map(alloc:desc)
#endif
!$omp target enter data map(always,alloc:desc%nsp)
!$omp target enter data map(always,alloc:desc%nsw)
!$omp target enter data map(always,alloc:desc%ismap)
@ -337,7 +334,7 @@ CONTAINS
nsubbatches = ceiling(real(desc%batchsize)/desc%subbatchsize)
ALLOCATE( desc%srh(2*nproc, nsubbatches))
!$omp target enter data map(alloc:desc%srh)
!$omp target enter data map(always,alloc:desc%srh)
#endif
#if defined(__CUDA)
@ -429,9 +426,6 @@ CONTAINS
IF (OMP_TARGET_IS_PRESENT(c_loc(desc%nlm), OMP_GET_DEFAULT_DEVICE()) == 1) THEN
!$omp target exit data map(delete:desc%nlm)
ENDIF
#ifndef __CRAY
!$omp target exit data map(delete:desc)
#endif
#endif
IF ( ALLOCATED( desc%aux ) ) DEALLOCATE( desc%aux )
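
The hunks above drop the whole-descriptor map(alloc:desc) (previously guarded by __CRAY) and map only the components that are actually needed with map(always,alloc:...); the matching teardown checks OMP_TARGET_IS_PRESENT before issuing the delete. A minimal sketch of that guarded cleanup, with an illustrative array name:

! Sketch only: delete the device copy of an array only if it is actually mapped,
! as done above for desc%nlm.
subroutine free_device_copy(a)
  use omp_lib
  use, intrinsic :: iso_c_binding, only : c_loc
  implicit none
  integer, allocatable, target, intent(inout) :: a(:)
  if (allocated(a)) then
     ! nonzero return value means the storage is present on the default device
     if (omp_target_is_present(c_loc(a), omp_get_default_device()) /= 0) then
        !$omp target exit data map(delete: a)
     end if
     deallocate(a)
  end if
end subroutine free_device_copy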

View File

@ -86,7 +86,7 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL MYDGEMM2( 'T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, hpsi(1,n_start),kdmx, 0.D0, hh(1,n_start),nstart,.FALSE. )
IF ( gstart == 2 ) call MYDGER( nstart, my_n, -1.D0, psi,kdmx, hpsi(1,n_start),kdmx, hh(1,n_start),nstart )
IF ( gstart == 2 ) call MYDGER2( nstart, my_n, -1.D0, psi,kdmx, hpsi(1,n_start),kdmx, hh(1,n_start),nstart,.FALSE. )
call start_clock('rotHSw:hc:s1')
CALL mp_sum( hh(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s1')
@ -99,13 +99,13 @@ SUBROUTINE rotate_HSpsi_gamma( npwx, npw, nstart, nbnd, psi, hpsi, overlap, spsi
!
if (n_start .le. n_end) &
CALL MYDGEMM2('T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, spsi(1,n_start),kdmx, 0.D0, ss(1,n_start),nstart,.FALSE. )
IF ( gstart == 2 ) CALL MYDGER(nstart, my_n, -1.D0, psi,kdmx, spsi(1,n_start),kdmx, ss(1,n_start),nstart)
IF ( gstart == 2 ) CALL MYDGER2(nstart, my_n, -1.D0, psi,kdmx, spsi(1,n_start),kdmx, ss(1,n_start),nstart,.FALSE. )
!
ELSE
!
if (n_start .le. n_end) &
CALL MYDGEMM2('T','N', nstart,my_n,kdim, 2.D0, psi,kdmx, psi(1,n_start),kdmx, 0.D0, ss(1,n_start),nstart,.FALSE. )
IF ( gstart == 2 ) CALL MYDGER(nstart, my_n, -1.D0, psi,kdmx, psi(1,n_start),kdmx, ss(1,n_start),nstart)
IF ( gstart == 2 ) CALL MYDGER2(nstart, my_n, -1.D0, psi,kdmx, psi(1,n_start),kdmx, ss(1,n_start),nstart,.FALSE. )
!
END IF
call start_clock('rotHSw:hc:s3')

View File

@ -110,6 +110,8 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
! the first nvec columns contain the trial eigenvectors
!
!$omp target enter data map(to:evc)
!$omp target enter data map(alloc:e)
CALL start_clock( 'regterg' ) !; write(6,*) 'enter regterg' ; FLUSH(6)
!
!$acc data deviceptr(evc, e)
@ -140,6 +142,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
ALLOCATE( spsi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' regterg ',' cannot allocate spsi ', ABS(ierr) )
!$omp target enter data map(alloc:spsi)
END IF
!
ALLOCATE( sr( nvecx, nvecx ), STAT=ierr )
@ -154,6 +157,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
ALLOCATE( ew( nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate ew ', ABS(ierr) )
!$omp target enter data map(alloc:sr, hr, vr, ew)
ALLOCATE( conv( nvec ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate conv ', ABS(ierr) )
@ -162,6 +166,33 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
nbase = nvec
conv = .FALSE.
!
#if defined(__OPENMP_GPU)
!$omp target teams distribute parallel do collapse(2)
do j=1,nvecx
do i=1,npwx
hpsi(i,j) = ZERO
psi (i,j) = ZERO
enddo
enddo
IF ( uspp ) then
!$omp target teams distribute parallel do collapse(2)
do j=1,nvecx
do i=1,npwx
spsi(i,j) = ZERO
enddo
enddo
endif
!$omp target teams distribute parallel do
DO k=1,nvec
psi(1,k) = evc(1,k)
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (gstart == 2) psi(1,k) = CMPLX( DBLE( psi(1,k) ), 0.D0 ,kind=DP)
DO i=2,npwx
psi(i,k) = evc(i,k)
END DO
END DO
!$acc end parallel
#else
!$acc kernels
hpsi = ZERO
psi = ZERO
@ -180,53 +211,83 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
END DO
END DO
!$acc end parallel
#endif
!
! ... hpsi contains h times the basis vectors
!
!$acc host_data use_device(psi, hpsi, spsi)
!$omp target update from(psi,hpsi)
CALL h_psi( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nvec
!
! ... spsi contains s times the basis vectors
!
IF ( uspp ) CALL s_psi( npwx, npw, nvec, psi, spsi )
IF ( uspp ) then
!$omp target update from(spsi)
CALL s_psi( npwx, npw, nvec, psi, spsi )
endif
!$acc end host_data
!
! ... hr contains the projection of the hamiltonian onto the reduced
! ... space vr contains the eigenvectors of hr
!
CALL start_clock( 'regterg:init' )
#if !defined(__OPENMP_GPU)
!$acc kernels
hr(:,:) = 0.D0
sr(:,:) = 0.D0
vr(:,:) = 0.D0
!$acc end kernels
#else
!$omp target teams distribute parallel do collapse(2)
do j=1,nvecx
do i=1,nvecx
hr(i,j) = 0.D0
sr(i,j) = 0.D0
vr(i,j) = 0.D0
enddo
enddo
#endif
!
!$acc host_data use_device(psi, hpsi, spsi, hr, sr)
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
if (n_start .le. n_end) then
!$omp target update to(hpsi)
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
endif
IF ( gstart == 2 ) THEN
!$omp target update to(hpsi)
CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
ENDIF
!$omp target update from(hr)
CALL mp_sum( hr( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( hr( :, 1:nbase ), intra_bgrp_comm )
!$omp target update to(hr)
!
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
if (n_start .le. n_end) then
!$omp target update to(spsi)
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
endif
IF ( gstart == 2 ) THEN
!$omp target update to(spsi)
CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
ENDIF
!
ELSE
!
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
CALL MYDGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, psi(1,n_start), npwx2, sr(1,n_start), nvecx )
!
END IF
!$omp target update from(sr)
CALL mp_sum( sr( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( sr( :, 1:nbase ), intra_bgrp_comm )
!$omp target update to(sr)
!$acc end host_data
!
CALL stop_clock( 'regterg:init' )
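
Throughout this hunk the pattern is the same: the projection (MYDGEMM/MYDGER) works on device-resident operands, the result is staged to the host with target update from(...), reduced across MPI ranks with mp_sum, and staged back with target update to(...). A minimal sketch of that staging, assuming hr is already mapped on the device and that QE's mp_sum wrapper is used as above:

! Sketch only: host-side MPI reduction of a device-resident matrix.
subroutine reduce_on_host(nvecx, nbase, hr, comm)
  use mp, only : mp_sum                  ! QE's MPI wrapper, as used above
  implicit none
  integer, intent(in) :: nvecx, nbase, comm
  real(8), intent(inout) :: hr(nvecx, nvecx)
  ! hr is assumed to be already mapped, e.g. via "target enter data map(alloc:hr)"
  !$omp target update from(hr)           ! device -> host
  call mp_sum(hr(:, 1:nbase), comm)      ! reduction on the host copy
  !$omp target update to(hr)             ! host -> device
end subroutine reduce_on_host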
@ -234,6 +295,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
IF ( lrot ) THEN
!
!$acc parallel loop
!$omp target teams distribute parallel do
DO n = 1, nbase
!
e(n) = hr(n,n)
@ -256,8 +318,10 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
ENDIF
CALL stop_clock( 'regterg:diag' )
!$acc end host_data
!$omp target update to(vr,ew)
!
!$acc parallel loop
!$omp target teams distribute parallel do
DO i = 1, nvec
e(i) = ew(i)
END DO
@ -288,6 +352,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
IF ( np /= n ) THEN
!$acc parallel loop
!$omp target teams distribute parallel do
DO i = 1, nvecx
vr(i,np) = vr(i,n)
END DO
@ -296,8 +361,10 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! ... for use in g_psi
!
!$acc kernels
!$omp target
ew(nbase+np) = e(n)
!$acc end kernels
!$omp end target
!
END IF
!
@ -310,6 +377,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1, notcnv
DO k=1,npwx
psi(k,nbase+i)=ZERO
@ -318,19 +386,22 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!$acc host_data use_device(psi, spsi, vr)
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
if (n_start .le. n_end) then
!$omp target update to(spsi)
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
endif
!
ELSE
!
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
!
END IF
!$acc end host_data
! NB: must not call mp_sum over inter_bgrp_comm here because it is done later to the full correction
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO np=1,notcnv
DO k=1,npwx
psi(k,nbase+np) = - ew(nbase+np) * psi(k,nbase+np)
@ -338,14 +409,19 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
END DO
!
!$acc host_data use_device(psi, hpsi, vr, ew)
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
if (n_start .le. n_end) then
!$omp target update to(hpsi)
CALL MYDGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
endif
!
!$omp target update from(psi)
CALL mp_sum( psi(:,nb1:nbase+notcnv), inter_bgrp_comm )
!
CALL stop_clock( 'regterg:update' )
!
! ... approximate inverse iteration
!
!$omp target update from(ew)
CALL g_psi( npwx, npw, notcnv, 1, psi(1,nb1), ew(nb1) )
!$acc end host_data
!
@ -357,6 +433,8 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
!$acc parallel vector_length(96)
!$acc loop gang private(nbn)
!$omp target update to(psi,ew)
!$omp target teams distribute private(nbn)
DO n = 1, notcnv
!
nbn = nbase + n
@ -366,13 +444,16 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
END DO
!$acc end parallel
!$omp target update from(ew)
!
!$acc host_data use_device(ew)
CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
!$acc end host_data
!
!$omp target update to(ew)
!$acc parallel vector_length(96)
!$acc loop gang
!$omp target teams distribute parallel do
DO i = 1,notcnv
psi(1,nbase+i) = psi(1,nbase+i)/SQRT( ew(i) )
! ... set Im[ psi(G=0) ] - needed for numerical stability
@ -387,9 +468,13 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! ... here compute the hpsi and spsi of the new functions
!
!$acc host_data use_device(psi, hpsi, spsi)
!$omp target update from(psi,hpsi)
CALL h_psi( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
!
IF ( uspp ) CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
IF ( uspp ) THEN
!$omp target update from(spsi)
CALL s_psi( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
ENDIF
!$acc end host_data
!
! ... update the reduced hamiltonian
@ -397,6 +482,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL start_clock( 'regterg:overlap' )
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=0,notcnv-1
DO j=1, nvecx
hr( j, nb1+i )=0.d0
@ -406,14 +492,18 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!$acc host_data use_device(psi, hpsi, hr)
CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
!$omp target update to(hpsi)
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, hr(n_start,nb1), nvecx )
!$omp target update from(hr)
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
!
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!$omp target update to(hr)
!$acc end host_data
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=0,notcnv-1
DO j=1, nvecx
sr( j, nb1+i )=0.d0
@ -425,18 +515,21 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
IF ( uspp ) THEN
!
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
!$omp target update to(spsi)
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
!
ELSE
!
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
CALL MYDGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
!
END IF
!$omp target update from(sr)
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
!
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!$omp target update to(sr)
!$acc end host_data
!
CALL stop_clock( 'regterg:overlap' )
@ -445,6 +538,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
!$acc parallel
!$acc loop gang
!$omp target teams distribute parallel do
DO n = 1, nbase
!
!$acc loop vector
@ -462,6 +556,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
CALL start_clock( 'regterg:diag' )
!$acc host_data use_device(hr, sr, ew, vr)
!$omp target update from(hr, sr)
IF( my_bgrp_id == root_bgrp_id ) THEN
CALL diaghg( nbase, nvec, hr, sr, nvecx, ew, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
END IF
@ -470,11 +565,13 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
!$acc end host_data
!$omp target update to(hr,sr,vr,ew)
CALL stop_clock( 'regterg:diag' )
!
! ... test for convergence
!
!$acc parallel loop copy(conv(1:nvec)) copyin(btype(1:nvec))
!$omp target teams distribute parallel do map(tofrom:conv) map(to:btype)
DO i = 1, nvec
IF(btype(i) == 1) THEN
conv(i) = ( ( ABS( ew(i) - e(i) ) < ethr ) )
@ -489,6 +586,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
notcnv = COUNT( .NOT. conv(:) )
!
!$acc parallel loop
!$omp target teams distribute parallel do
DO i=1,nvec
e(i) = ew(i)
END DO
@ -505,6 +603,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL start_clock( 'regterg:last' )
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO k=1,nvec
DO i=1,npwx
evc(i,k) = ZERO
@ -514,9 +613,11 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$acc host_data use_device(psi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
!$omp target update from(evc)
!$acc end host_data
CALL mp_sum( evc, inter_bgrp_comm )
!$omp target update to(evc)
!
IF ( notcnv == 0 ) THEN
!
@ -542,6 +643,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! ... refresh psi, H*psi and S*psi
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1,nvec
DO k=1,npwx
psi(k,i) = evc(k,i)
@ -551,6 +653,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
IF ( uspp ) THEN
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i = 1, npwx
DO j = nvec+1, nvec+nvec
psi(i,j) = ZERO
@ -558,11 +661,14 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
END DO
!
!$acc host_data use_device(psi, spsi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
!$omp target update from(psi)
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
!$omp target update to(psi)
!$acc end host_data
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1,nvec
DO k=1,npwx
spsi(k,i) = psi(k,i+nvec)
@ -571,15 +677,27 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
END IF
!
#if defined(__OPENMP_GPU)
!$omp target teams distribute parallel do collapse(2)
do j=nvec+1,nvec+nvec
do i=1,npwx
psi(i,j) = ZERO
enddo
enddo
#else
!$acc kernels
psi(:,nvec+1:nvec+nvec) = ZERO
!$acc end kernels
#endif
!$acc host_data use_device(psi, hpsi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
CALL MYDGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
!$omp target update from(psi)
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
!$omp target update to(psi)
!$acc end host_data
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i=1,nvec
DO k=1, npwx
hpsi(k,i) = psi(k,i+nvec)
@ -591,6 +709,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
nbase = nvec
!
!$acc parallel loop collapse(2)
!$omp target teams distribute parallel do collapse(2)
DO i = 1, nvecx
DO j = 1, nbase
hr(i,j) = 0.D0
@ -600,6 +719,7 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
END DO
!
!$acc parallel loop
!$omp target teams distribute parallel do
DO j = 1, nbase
hr(j,j) = e(j)
sr(j,j) = 1.D0
@ -612,12 +732,19 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
!
END DO iterate
!
!$omp target exit data map(delete:evc)
!$omp target exit data map(from:e)
DEALLOCATE( conv )
!$omp target exit data map(delete:ew)
DEALLOCATE( ew )
!$omp target exit data map(delete:vr)
DEALLOCATE( vr )
!$omp target exit data map(delete:hr)
DEALLOCATE( hr )
!$omp target exit data map(delete:sr)
DEALLOCATE( sr )
!
!$omp target exit data map(delete:spsi)
IF ( uspp ) DEALLOCATE( spsi )
!
#if defined(__OPENMP_GPU)
@ -1291,7 +1418,6 @@ CONTAINS
!
END DO
DEALLOCATE( vtmp )
DEALLOCATE( ptmp )
@ -1580,7 +1706,6 @@ CONTAINS
CALL mp_root_sum( vtmp(:,1:nc), dm, root, ortho_parent_comm )
END IF
END DO
!
END IF
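
The enter data directives added near the top of regterg and the exit data directives added before the deallocations bracket the lifetime of each work array: map right after ALLOCATE, unmap right before DEALLOCATE, with map(from:e) used for the one array whose final values are needed on the host. A minimal sketch of that lifecycle, with illustrative names:

! Sketch only: device-memory lifetime tied to the host ALLOCATE/DEALLOCATE.
subroutine lifecycle_sketch(nvecx)
  implicit none
  integer, intent(in) :: nvecx
  real(8), allocatable :: ew(:), e(:)
  allocate(ew(nvecx), e(nvecx))
  !$omp target enter data map(alloc: ew)   ! device storage only, no copy
  !$omp target enter data map(alloc: e)
  ! ... device kernels read and write ew and e here ...
  !$omp target exit data map(delete: ew)   ! discard the scratch array
  !$omp target exit data map(from:  e)     ! copy the results back first
  deallocate(ew, e)
end subroutine lifecycle_sketch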

View File

@ -159,7 +159,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
CALL start_clock( 'pcg:ortho' )
!$acc host_data use_device(spsi0, psi0, z, spsi0vec)
CALL MYDGEMM2( 'T','N', nbnd,nnew,npw2, 2.D0, spsi0, npwx2, z(:,nactive+1), npwx2, 0.D0, spsi0vec, nbnd,.FALSE. )
IF ( gstart == 2 ) CALL MYDGER( nbnd, nnew, -1.D0, spsi0, npwx2, z(:,nactive+1), npwx2, spsi0vec, nbnd )
IF ( gstart == 2 ) CALL MYDGER2( nbnd, nnew, -1.D0, spsi0, npwx2, z(:,nactive+1), npwx2, spsi0vec, nbnd,.FALSE. )
CALL mp_sum( spsi0vec, intra_bgrp_comm )
CALL MYDGEMM2( 'N','N', npw2,nnew,nbnd,-1.D0, psi0, npwx2, spsi0vec, nbnd, 1.D0, z(:,nactive+1), npwx2,.FALSE. )
!$acc end host_data
@ -277,7 +277,7 @@ SUBROUTINE bpcg_gamma( hs_psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, nvec, psi,
CALL start_clock( 'pcg:ortho' )
!$acc host_data use_device(spsi0, psi0, z, spsi0vec)
CALL MYDGEMM2( 'T','N', nbnd,nactive,npw2, 2.D0, spsi0, npwx2, z, npwx2, 0.D0, spsi0vec, nbnd, .FALSE. )
IF ( gstart == 2 ) CALL MYDGER( nbnd, nactive, -1.D0, spsi0, npwx2, z, npwx2, spsi0vec, nbnd )
IF ( gstart == 2 ) CALL MYDGER2( nbnd, nactive, -1.D0, spsi0, npwx2, z, npwx2, spsi0vec, nbnd, .FALSE. )
CALL mp_sum( spsi0vec, intra_bgrp_comm )
CALL MYDGEMM2( 'N','N', npw2,nactive,nbnd,-1.D0, psi0, npwx2, spsi0vec, nbnd, 1.D0, z, npwx2, .FALSE. )
!$acc end host_data

View File

@ -165,7 +165,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
kedtaue2(k,1) = kedtau(k,1) / e2
ENDDO
CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtaue2, sx, sc, &
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=gpuarg )
!$acc parallel loop
DO k = 1, nrxx
v2c(k,1) = v2cm(1,k,1)
@ -205,7 +205,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
kedtaue2(k,1:nspin0) = kedtau(k,1:nspin0) / e2
ENDDO
CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtaue2, sx, sc, &
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. )
v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=gpuarg )
!$acc parallel loop
DO k = 1, nrxx
v2c(k,:) = v2cm(1,k,:)
@ -218,7 +218,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, &
ALLOCATE( v2c_ud(nrxx) )
!$acc data create( v2c_ud )
!
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=.TRUE. )
CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=gpuarg )
!
!$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, &
!$acc& sigma_gc31,sigma_gc32,sigma_gc33)
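
Here the hard-coded gpu_args_=.TRUE. is replaced by the variable gpuarg, so the same call sites work whether or not the xc driver is offloaded. A minimal sketch of that calling convention, using a hypothetical routine (axpy_maybe_gpu is not part of the patch):

! Sketch only: an optional logical selects host or offloaded execution.
subroutine axpy_maybe_gpu(n, a, x, y, gpu_args_)
  implicit none
  integer, intent(in)    :: n
  real(8), intent(in)    :: a, x(n)
  real(8), intent(inout) :: y(n)
  logical, intent(in), optional :: gpu_args_
  logical :: on_gpu
  integer :: i
  on_gpu = .false.
  if (present(gpu_args_)) on_gpu = gpu_args_
  if (on_gpu) then
     ! x and y are implicitly mapped here; in QE they are kept device-resident
     !$omp target teams distribute parallel do
     do i = 1, n
        y(i) = y(i) + a*x(i)
     end do
  else
     do i = 1, n
        y(i) = y(i) + a*x(i)
     end do
  end if
end subroutine axpy_maybe_gpu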

View File

@ -470,7 +470,7 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
!$acc data create( ex, ec, vx, vc )
!$acc parallel loop
#elif defined(__OPENMP_GPU)
!$omp target data map(to:rho%of_r,rho_core) map(alloc:ex,ec,vx,vc) map(from:v)
!$omp target data map(always,to:rho%of_r) map(to:rho_core) map(alloc:ex,ec,vx,vc) map(from:v)
!$omp target teams distribute parallel do
#endif
DO ir = 1, dfftp_nnr

View File

@ -15,6 +15,12 @@ SUBROUTINE MYDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
use onemkl_blas_gpu
#elif defined(__ROCBLAS)
use rocblas_utils
#endif
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
@ -23,8 +29,18 @@ SUBROUTINE MYDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
!$omp target variant dispatch use_device_ptr(A, X, Y)
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
!$omp end target variant dispatch
#elif defined(__ROCBLAS)
CALL rocblas_dger( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#endif
#else
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#endif
END SUBROUTINE MYDGER
@ -35,6 +51,12 @@ SUBROUTINE MYDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC
#if defined(__CUDA)
use cudafor
use cublas
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
use onemkl_blas_gpu
#elif defined(__ROCBLAS)
use rocblas_utils
#endif
#endif
CHARACTER*1, INTENT(IN) :: TRANSA, TRANSB
INTEGER, INTENT(IN) :: M, N, K, LDA, LDB, LDC
@ -43,6 +65,14 @@ SUBROUTINE MYDGEMM( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC
#if defined(__CUDA)
attributes(device) :: A, B, C
CALL cublasdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
!$omp target variant dispatch use_device_ptr(A, B, C)
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
!$omp end target variant dispatch
#elif defined(__ROCBLAS)
CALL rocblas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#else
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
@ -89,6 +119,48 @@ END SUBROUTINE MYZGEMM
! MYZGEMM, but with an additional argument (OMP_OFFLOAD) to decide whether to perform a CPU
! _gemm or call a rocblas _gemm which takes GPU-only arguments.
!
SUBROUTINE MYDGER2 ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA, OMP_OFFLOAD )
#if defined(__CUDA)
use cudafor
use cublas
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
use onemkl_blas_gpu
#elif defined(__ROCBLAS)
use rocblas_utils
#endif
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
LOGICAL, INTENT(IN) :: OMP_OFFLOAD
#if defined(__CUDA)
attributes(device) :: A, X, Y
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#elif defined(__OPENMP_GPU)
#if defined(__ONEMKL)
IF (OMP_OFFLOAD) THEN
!$omp target variant dispatch use_device_ptr(A, X, Y)
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
!$omp end target variant dispatch
ELSE
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
ENDIF
#elif defined(__ROCBLAS)
IF (OMP_OFFLOAD) THEN
CALL rocblas_dger( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
ELSE
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
ENDIF
#endif
#else
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#endif
END SUBROUTINE MYDGER2
SUBROUTINE MYDGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, OMP_OFFLOAD )
#if defined(__CUDA)
use cudafor
@ -109,20 +181,20 @@ SUBROUTINE MYDGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LD
#if defined(__CUDA)
attributes(device) :: A, B, C
CALL cublasdgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
#if defined(__ONEMKL)
#elif defined(__ONEMKL)
IF (OMP_OFFLOAD) THEN
!$omp target variant dispatch use_device_ptr(A, B, C)
#endif
#if defined(__ROCBLAS)
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
!$omp end target variant dispatch
ELSE
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
ENDIF
#elif defined(__ROCBLAS)
IF (OMP_OFFLOAD) CALL rocblas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
IF (.NOT. OMP_OFFLOAD) CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
CALL dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#if defined(__ONEMKL)
!$omp end target variant dispatch
#endif
#endif
END SUBROUTINE MYDGEMM2
@ -146,20 +218,20 @@ SUBROUTINE MYZGEMM2( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LD
#if defined(__CUDA)
attributes(device) :: A, B, C
CALL cublaszgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
#if defined(__ONEMKL)
#elif defined(__ONEMKL)
IF (OMP_OFFLOAD) THEN
!$omp target variant dispatch use_device_ptr(A, B, C)
#endif
#if defined(__ROCBLAS)
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
!$omp end target variant dispatch
ELSE
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
ENDIF
#elif defined(__ROCBLAS)
IF (OMP_OFFLOAD) CALL rocblas_zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
IF (.NOT. OMP_OFFLOAD) CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#else
CALL zgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
#endif
#if defined(__ONEMKL)
!$omp end target variant dispatch
#endif
#endif
END SUBROUTINE MYZGEMM2
!========================================================================================================
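
A minimal usage sketch of the *2 wrappers defined above: the trailing OMP_OFFLOAD logical selects the offloaded path (oneMKL dispatch or rocBLAS) or the plain host BLAS, so a single call site serves both builds. The routine and array names below are illustrative, not taken from the patch; when offload is .TRUE., the arrays are assumed to be already mapped to the device.

! Sketch only: project hpsi onto the psi block, on host or device.
subroutine project_block(n, m, psi, hpsi, hh, offload)
  implicit none
  integer, intent(in)    :: n, m
  real(8), intent(in)    :: psi(n, m), hpsi(n, m)
  real(8), intent(inout) :: hh(m, m)
  logical, intent(in)    :: offload
  ! hh = 2 * psi^T * hpsi, executed on the device when offload is .TRUE.
  call MYDGEMM2('T', 'N', m, m, n, 2.d0, psi, n, hpsi, n, 0.d0, hh, m, offload)
  ! rank-1 correction built from the first rows of psi and hpsi (stride n),
  ! mirroring the gstart == 2 term in the solvers above
  call MYDGER2(m, m, -1.d0, psi, n, hpsi, n, hh, m, offload)
end subroutine project_block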

View File

@ -49,7 +49,6 @@ MODULE rocblas
END INTERFACE
CONTAINS
FUNCTION rocblas_get_operation(op)
@ -203,6 +202,29 @@ MODULE rocblas_utils
MODULE PROCEDURE rocblas_ddgemm, rocblas_dzgemm1, rocblas_dzgemm2, rocblas_dzgemm3
END INTERFACE
INTERFACE
FUNCTION rocblas_dger_(handle, m, n, alpha, x, incx, y, incy, A, lda) &
BIND(C, NAME="rocblas_dger")
USE ISO_C_BINDING
IMPLICIT NONE
TYPE(C_PTR), VALUE :: handle
INTEGER(rocblas_int), VALUE :: m, n
REAL(c_double) :: alpha
TYPE(C_PTR), VALUE :: x
INTEGER(rocblas_int), VALUE :: incx
TYPE(C_PTR), VALUE :: y
INTEGER(rocblas_int), VALUE :: incy
TYPE(C_PTR), VALUE :: A
INTEGER(rocblas_int), VALUE :: lda
INTEGER :: rocblas_dger_
END FUNCTION rocblas_dger_
END INTERFACE
INTERFACE rocblas_dger
MODULE PROCEDURE rocblas_dger1, rocblas_dzger
END INTERFACE
CONTAINS
SUBROUTINE rocblas_zgemm(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
@ -403,5 +425,51 @@ MODULE rocblas_utils
END SUBROUTINE
SUBROUTINE rocblas_dger1(m, n, alpha, x, incx, y, incy, A, lda)
USE ISO_C_BINDING
IMPLICIT NONE
INTEGER, INTENT(IN) :: m, n, incx, incy, lda
REAL(DP), INTENT(IN) :: alpha
REAL(DP), INTENT(IN), TARGET :: x(m), y(n)
REAL(DP), INTENT(INOUT), TARGET :: A(lda,*)
INTEGER :: rm, rn, rincx, rincy, rlda
INTEGER :: stat
rm = int(m, kind(rocblas_int))
rn = int(n, kind(rocblas_int))
rincx = int(incx, kind(rocblas_int))
rincy = int(incy, kind(rocblas_int))
rlda = int(lda, kind(rocblas_int))
!$omp target data use_device_ptr(A, x, y)
stat = rocblas_dger_(handle, rm, rn, alpha, c_loc(x), rincx, c_loc(y), rincy, &
c_loc(A), rlda)
!$omp end target data
CALL rocblas_check(stat, "DGER1")
END SUBROUTINE
SUBROUTINE rocblas_dzger(m, n, alpha, x, incx, y, incy, A, lda)
USE ISO_C_BINDING
IMPLICIT NONE
INTEGER, INTENT(IN) :: m, n, incx, incy, lda
REAL(DP), INTENT(IN) :: alpha
COMPLEX(DP), INTENT(IN), TARGET :: x(m), y(n)
REAL(DP), INTENT(INOUT), TARGET :: A(lda,*)
INTEGER :: rm, rn, rincx, rincy, rlda
INTEGER :: stat
rm = int(m, kind(rocblas_int))
rn = int(n, kind(rocblas_int))
rincx = int(incx, kind(rocblas_int))
rincy = int(incy, kind(rocblas_int))
rlda = int(lda, kind(rocblas_int))
!$omp target data use_device_ptr(A, x, y)
stat = rocblas_dger_(handle, rm, rn, alpha, c_loc(x), rincx, c_loc(y), rincy, &
c_loc(A), rlda)
!$omp end target data
CALL rocblas_check(stat, "DZGER")
END SUBROUTINE
END MODULE rocblas_utils
#endif

View File

@ -112,6 +112,11 @@ for dir in $dirs; do
# list of all external library modules or include files
libdeps="mpi omp_lib hdf5 mkl_dfti mkl_dfti.f90 fftw3.f03 fftw3.f \
mkl_omp_offload mkl_omp_offload.f90 \
onemkl_blas_omp_offload_ilp64 onemkl_blas_omp_offload_lp64 \
onemkl_blas_omp_offload_ilp64_no_array_check onemkl_blas_omp_offload_lp64_no_array_check \
onemkl_lapack_omp_offload_ilp64 onemkl_lapack_omp_offload_lp64 \
onemkl_vsl_omp_offload_ilp64 onemkl_vsl_omp_offload_lp64 \
mkl_dfti_omp_offload mkl_dfti_omp_offload.f90 \
xc_version.h xc_f03_lib_m elpa elpa1 \
mbd w90_io fox_dom fox_wxml m_common_io \