From 4c4b7a9e1768df3dabdd2725fd70667a89e92caf Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Mon, 7 Feb 2022 15:26:06 +0100 Subject: [PATCH] XClib-acc - present directive only in GGA --- XClib/qe_drivers_d_gga.f90 | 10 ++-------- XClib/qe_drivers_gga.f90 | 14 +++++--------- XClib/xc_wrapper_d_gga.f90 | 14 +------------- XClib/xc_wrapper_gga.f90 | 30 +++--------------------------- 4 files changed, 11 insertions(+), 57 deletions(-) diff --git a/XClib/qe_drivers_d_gga.f90 b/XClib/qe_drivers_d_gga.f90 index 0a59d664e..69c6e4695 100644 --- a/XClib/qe_drivers_d_gga.f90 +++ b/XClib/qe_drivers_d_gga.f90 @@ -67,7 +67,7 @@ SUBROUTINE dgcxc_unpol( length, r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc REAL(DP), ALLOCATABLE :: sx(:), sc(:) REAL(DP), PARAMETER :: small = 1.E-30_DP ! - !$acc data deviceptr( r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc ) + !$acc data present( r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc ) ! ALLOCATE( raux(4*length), s2aux(4*length), dr(length), s(length), ds(length) ) ALLOCATE( v1x(4*length), v2x(4*length), sx(4*length) ) @@ -97,9 +97,7 @@ SUBROUTINE dgcxc_unpol( length, r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc raux(i4-1+ir) = r_in(ir) ; s2aux(i4-1+ir) = (s(ir)-ds(ir))**2 ENDDO ! - !$acc host_data use_device( raux, s2aux, sx, sc, v1x, v2x, v1c, v2c ) CALL gcxc( length*4, raux, s2aux, sx, sc, v1x, v2x, v1c, v2c ) - !$acc end host_data ! !$acc parallel loop DO ir = 1, length @@ -191,7 +189,7 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, & REAL(DP), PARAMETER :: rho_trash = 0.4_DP, zeta_trash = 0.2_DP, & s2_trash = 0.1_DP ! - !$acc data deviceptr( r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, vssc, vrzc ) + !$acc data present( r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, vssc, vrzc ) ! igcx_=igcx igcc_=igcc @@ -255,9 +253,7 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, & raux(i8-1+ir,2) = r_dw ; s2aux(i8-1+ir,2) = (s_dw-ds_dw)**2 ENDDO ! - !$acc host_data use_device( raux, s2aux, sx, v1x, v2x ) CALL gcx_spin( length*8, raux, s2aux, sx, v1x, v2x ) - !$acc end host_data ! !$acc parallel loop DO ir = 1, length @@ -348,9 +344,7 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, & rtaux(i6-1+ir) = rt ; s2taux(i6-1+ir) = s2t ; zetaux(i6-1+ir) = zeta-dz ENDDO ! - !$acc host_data use_device( rtaux, zetaux, s2taux, sc, v1c, v2c ) CALL gcc_spin( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c ) - !$acc end host_data ! !$acc parallel loop DO ir = 1, length diff --git a/XClib/qe_drivers_gga.f90 b/XClib/qe_drivers_gga.f90 index d1484c0b2..f3b8519dd 100644 --- a/XClib/qe_drivers_gga.f90 +++ b/XClib/qe_drivers_gga.f90 @@ -81,8 +81,7 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, & #endif ! #if defined(_OPENACC) -!$acc data deviceptr( rho_in(length), grho_in(length), sx_out(length), sc_out(length), & -!$acc& v1x_out(length), v2x_out(length), v1c_out(length), v2c_out(length) ) +!$acc data present( rho_in, grho_in, sx_out, sc_out, v1x_out, v2x_out, v1c_out, v2c_out ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & @@ -494,8 +493,7 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out ) !sx_tot = 0.0_DP ! #if defined(_OPENACC) -!$acc data deviceptr( rho_in(length,2), grho2_in(length,2), sx_tot(length), & -!$acc& v1x_out(length,2), v2x_out(length,2) ) +!$acc data present( rho_in, grho2_in, sx_tot, v1x_out, v2x_out ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & @@ -1036,8 +1034,7 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out #endif ! #if defined(_OPENACC) -!$acc data deviceptr( rho_in(length), zeta_io(length), grho_in(length), & -!$acc& sc_out(length), v1c_out(length,2), v2c_out(length) ) +!$acc data present( rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & @@ -1151,7 +1148,7 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, & REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v2c !! correlation potential (gradient part) REAL(DP), INTENT(OUT), DIMENSION(length) :: v2c_ud - !!correlation potential (off-diag. term) + !! correlation potential (off-diag. term) ! ! ... local variables ! @@ -1166,8 +1163,7 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, & #endif ! #if defined(_OPENACC) -!$acc data deviceptr( rho_in(length,2), grho_in(length,2), grho_ud_in(length), & -!$acc& sc(length), v1c(length,2), v2c(length,2), v2c_ud(length) ) +!$acc data present( rho_in, grho_in, grho_ud_in, sc, v1c, v2c, v2c_ud ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & diff --git a/XClib/xc_wrapper_d_gga.f90 b/XClib/xc_wrapper_d_gga.f90 index f4319fe9c..00220352d 100644 --- a/XClib/xc_wrapper_d_gga.f90 +++ b/XClib/xc_wrapper_d_gga.f90 @@ -36,17 +36,13 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss, gpu_args_ ) IF ( gpu_args ) THEN ! !$acc data present( r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) - !$acc host_data use_device( r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) CALL dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) - !$acc end host_data !$acc end data ! ELSE ! !$acc data copyin( r_in, g_in ), copyout( dvxc_rr, dvxc_sr, dvxc_ss ) - !$acc host_data use_device( r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) CALL dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) - !$acc end host_data !$acc end data ! ENDIF @@ -108,7 +104,7 @@ SUBROUTINE dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) REAL(DP), PARAMETER :: small = 1.E-10_DP, rho_trash = 0.5_DP REAL(DP), PARAMETER :: epsr=1.0d-6, epsg=1.0d-6 ! - !$acc data deviceptr( r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) + !$acc data present( r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ! IF ( ANY(.NOT.is_libxc(3:4)) ) THEN rho_threshold_gga = small ; grho_threshold_gga = small @@ -293,10 +289,8 @@ SUBROUTINE dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ENDIF ! !$acc data copyin( sigma ) - !$acc host_data use_device( sigma, vrrx, vsrx, vssx, vrrc, vsrc, vssc ) CALL dgcxc_unpol( length, r_in(:,1), sigma, vrrx(:,1), vsrx(:,1), vssx(:,1), & vrrc(:,1), vsrc(:,1), vssc ) - !$acc end host_data !$acc end data ! !$acc parallel loop @@ -311,9 +305,7 @@ SUBROUTINE dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ALLOCATE( vrzc(length,sp) ) !$acc data create( vrzc ) ! - !$acc host_data use_device( vrrx, vsrx, vssx, vrrc, vsrc, vssc, vrzc ) CALL dgcxc_spin( length, r_in, g_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc, vrzc ) - !$acc end host_data ! !$acc parallel loop DO k = 1, length @@ -369,10 +361,8 @@ SUBROUTINE dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) sigma(k) = g_in(k,1,1)**2 + g_in(k,2,1)**2 + g_in(k,3,1)**2 ENDDO ! - !$acc host_data use_device( sigma, vrrx, vsrx, vssx, vrrc, vsrc, vssc ) CALL dgcxc_unpol( length, r_in(:,1), sigma, vrrx(:,1), vsrx(:,1), vssx(:,1), & vrrc(:,1), vsrc(:,1), vssc ) - !$acc end host_data ! !$acc end data DEALLOCATE( sigma ) @@ -389,9 +379,7 @@ SUBROUTINE dgcxc_( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ALLOCATE( vrzc(length,sp) ) !$acc data create( vrzc ) ! - !$acc host_data use_device( vrrx, vsrx, vssx, vrrc, vsrc, vssc, vrzc ) CALL dgcxc_spin( length, r_in, g_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc, vrzc ) - !$acc end host_data ! !$acc parallel loop DO k = 1, length diff --git a/XClib/xc_wrapper_gga.f90 b/XClib/xc_wrapper_gga.f90 index a850e71cc..5c2d8f258 100644 --- a/XClib/xc_wrapper_gga.f90 +++ b/XClib/xc_wrapper_gga.f90 @@ -55,16 +55,12 @@ SUBROUTINE xc_gcx( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud, & !$acc data present( rho, grho, ex, ec, v1x, v2x, v1c, v2c ) IF (PRESENT(v2c_ud)) THEN !$acc data present( v2c_ud ) - !$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) - !$acc end host_data !$acc end data ELSE ALLOCATE( v2c_dummy(length) ) !$acc data create( v2c_dummy ) - !$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) - !$acc end host_data !$acc end data DEALLOCATE( v2c_dummy ) ENDIF @@ -75,16 +71,12 @@ SUBROUTINE xc_gcx( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud, & !$acc data copyin( rho, grho ), copyout( ex, ec, v1x, v2x, v1c, v2c ) IF (PRESENT(v2c_ud)) THEN !$acc data copyout( v2c_ud ) - !$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) - !$acc end host_data !$acc end data ELSE ALLOCATE( v2c_dummy(length) ) !$acc data create( v2c_dummy ) - !$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) - !$acc end host_data !$acc end data DEALLOCATE( v2c_dummy ) ENDIF @@ -154,9 +146,7 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) REAL(DP) :: rho_up, rho_dw, grho_up, grho_dw, sgn1 REAL(DP), PARAMETER :: small = 1.E-10_DP ! - !$acc data deviceptr( rho(length,ns), grho(3,length,ns), ex(length), ec(length), & - !$acc& v1x(length,ns), v2x(length,ns), v1c(length,ns), v2c(length,ns), & - !$acc& v2c_ud(length) ) + !$acc data present( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ! #if defined(__LIBXC) ! @@ -214,9 +204,7 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ! IF ( ns==1 .AND. ANY(.NOT.is_libxc(3:4)) ) THEN ! - !$acc host_data use_device( rho_lxc, sigma ) CALL gcxc( length, rho_lxc, sigma, ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) ) - !$acc end host_data ! !$acc parallel loop DO k = 1, length @@ -305,9 +293,8 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) grho2(k,2) = sigma(3*k) grho_ud(k) = sigma(3*k-1) ENDDO - !$acc host_data use_device( grho2, grho_ud ) + ! CALL gcc_spin_more( length, rho, grho2, grho_ud, ec, v1c, v2c, v2c_ud ) - !$acc end host_data !$acc end data DEALLOCATE( grho_ud ) ! @@ -329,9 +316,7 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) grho2(k,2) = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2 ENDDO ! - !$acc host_data use_device( rh, zeta, grho2 ) CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) - !$acc end host_data ! !$acc parallel loop DO k = 1, length @@ -420,9 +405,8 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2 ENDDO ENDDO - !$acc host_data use_device( grho2 ) + ! CALL gcx_spin( length, rho, grho2, ex, v1x, v2x ) - !$acc end host_data !$acc end data DEALLOCATE( grho2 ) ! @@ -442,7 +426,6 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ! ALLOCATE( rh(length) ) !$acc data create( rh ) - !$acc host_data use_device( rh, grho2 ) !$acc parallel loop DO k = 1, length rh(k) = ABS(rho(k,1)) @@ -459,13 +442,11 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ec(k) = ec(k) * sgn1 ENDDO ! - !$acc end host_data !$acc end data DEALLOCATE( rh ) ! ELSE ! - !$acc host_data use_device( grho2 ) !$acc parallel loop collapse(2) DO is = 1, ns DO k = 1, length @@ -474,13 +455,11 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ENDDO ! CALL gcx_spin( length, rho, grho2, ex, v1x, v2x ) - !$acc end host_data ! IF (igcc==3 .OR. igcc==7 .OR. igcc==13 ) THEN ! ALLOCATE( grho_ud(length) ) !$acc data create( grho_ud ) - !$acc host_data use_device( grho_ud, grho2 ) !$acc parallel loop DO k = 1, length grho_ud(k) = grho(1,k,1) * grho(1,k,2) + grho(2,k,1) * grho(2,k,2) + & @@ -489,7 +468,6 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ! CALL gcc_spin_more( length, rho, grho2, grho_ud, ec, v1c, v2c, v2c_ud ) ! - !$acc end host_data !$acc end data DEALLOCATE( grho_ud ) ! @@ -497,7 +475,6 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ! ALLOCATE( rh(length), zeta(length) ) !$acc data create( rh, zeta ) - !$acc host_data use_device( rh, zeta, grho2 ) !$acc parallel loop DO k = 1, length rh(k) = rho(k,1) + rho(k,2) @@ -519,7 +496,6 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) IF ( ns==2 ) v2c_ud(k) = v2c(k,1) ENDDO ! - !$acc end host_data !$acc end data DEALLOCATE( rh, zeta ) !