diff --git a/CMakeLists.txt b/CMakeLists.txt index 80235cb8c..46b979e84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,11 +306,18 @@ endif(QE_ENABLE_OPENMP) # OpenACC # The following targets will be defined: add_library(qe_openacc_fortran INTERFACE) -qe_install_targets(qe_openacc_fortran) +add_library(qe_openacc_c INTERFACE) +qe_install_targets(qe_openacc_fortran qe_openacc_c) ########################################################### if(QE_ENABLE_OPENACC) + if(CMAKE_VERSION VERSION_LESS 3.16.0) + message(FATAL_ERROR "CMake versions >= 3.16 required for QE_ENABLE_OPENACC=ON!") + endif() + find_package(OpenACC REQUIRED Fortran C) target_link_libraries(qe_openacc_fortran INTERFACE OpenACC::OpenACC_Fortran) + target_link_libraries(qe_openacc_c + INTERFACE OpenACC::OpenACC_C) endif(QE_ENABLE_OPENACC) ########################################################### diff --git a/CPV/src/exch_corr.f90 b/CPV/src/exch_corr.f90 index 51157c8e0..2faa0c8cb 100644 --- a/CPV/src/exch_corr.f90 +++ b/CPV/src/exch_corr.f90 @@ -373,7 +373,7 @@ subroutine exch_corr_cp(nnr,nspin,grhor,rhor,etxc) use kinds, only: DP use xc_lib, only: xclib_dft_is, xclib_dft_is_libxc, xclib_get_id, & - xclib_set_threshold, xc_gcx + xclib_set_threshold, xc, xc_gcx implicit none integer, intent(in) :: nnr integer, intent(in) :: nspin diff --git a/CPV/src/metaxc.f90 b/CPV/src/metaxc.f90 index bec71f946..3f7746177 100644 --- a/CPV/src/metaxc.f90 +++ b/CPV/src/metaxc.f90 @@ -11,7 +11,7 @@ SUBROUTINE tpssmeta(nnr, nspin,grho,rho,kedtau,etxc) ! =================== !-------------------------------------------------------------------- use kinds, only: dp - use xc_lib, only: xclib_set_threshold + use xc_lib, only: xclib_set_threshold, xc_metagcx ! IMPLICIT NONE ! diff --git a/FFTXlib/fft_helper_subroutines.f90 b/FFTXlib/fft_helper_subroutines.f90 index 30ed4cc24..962ad47f9 100644 --- a/FFTXlib/fft_helper_subroutines.f90 +++ b/FFTXlib/fft_helper_subroutines.f90 @@ -23,7 +23,7 @@ MODULE fft_helper_subroutines #endif END INTERFACE PRIVATE - PUBLIC :: fftx_threed2oned, fftx_oned2threed + PUBLIC :: fftx_threed2oned, fftx_threed2oned_gpu, fftx_oned2threed PUBLIC :: tg_reduce_rho PUBLIC :: tg_get_nnr, tg_get_recip_inc, fftx_ntgrp, fftx_tgpe, & tg_get_group_nr3 @@ -469,6 +469,8 @@ CONTAINS END SUBROUTINE SUBROUTINE fftx_threed2oned( desc, vin, vout1, vout2 ) + !! Copy charge density from 3D array to 1D array in Fourier + !! space. USE fft_param USE fft_types, ONLY : fft_type_descriptor TYPE(fft_type_descriptor), INTENT(in) :: desc @@ -490,7 +492,40 @@ CONTAINS END DO END IF END SUBROUTINE - + + SUBROUTINE fftx_threed2oned_gpu( desc, vin_d, vout1_d, vout2_d ) + !! GPU version of \(\texttt{fftx_threed2oned}\). + USE fft_param + USE fft_types, ONLY : fft_type_descriptor + TYPE(fft_type_descriptor), INTENT(in) :: desc + complex(DP), INTENT(OUT) :: vout1_d(:) + complex(DP), OPTIONAL, INTENT(OUT) :: vout2_d(:) + complex(DP), INTENT(IN) :: vin_d(:) + INTEGER, POINTER :: nl_d(:), nlm_d(:) + COMPLEX(DP) :: fp, fm + INTEGER :: ig +#if defined(__CUDA) && defined(_OPENACC) + attributes(DEVICE) :: nl_d, nlm_d + !$acc data deviceptr( vout1_d(:), vout2_d(:), vin_d(:) ) + nl_d => desc%nl_d + nlm_d => desc%nlm_d + IF( PRESENT( vout2_d ) ) THEN + !$acc parallel loop + DO ig=1,desc%ngm + fp=vin_d(nl_d(ig))+vin_d(nlm_d(ig)) + fm=vin_d(nl_d(ig))-vin_d(nlm_d(ig)) + vout1_d(ig) = CMPLX(0.5d0,0.d0,kind=DP)*CMPLX( DBLE(fp),AIMAG(fm),kind=DP) + vout2_d(ig) = CMPLX(0.5d0,0.d0,kind=DP)*CMPLX(AIMAG(fp),-DBLE(fm),kind=DP) + END DO + ELSE + !$acc parallel loop + DO ig=1,desc%ngm + vout1_d(ig) = vin_d(nl_d(ig)) + END DO + END IF + !$acc end data +#endif + END SUBROUTINE SUBROUTINE fftx_psi2c_gamma( desc, vin, vout1, vout2 ) USE fft_param diff --git a/Modules/fft_rho.f90 b/Modules/fft_rho.f90 index 6732a3b36..25eabcdf2 100644 --- a/Modules/fft_rho.f90 +++ b/Modules/fft_rho.f90 @@ -16,7 +16,7 @@ MODULE fft_rho ! IMPLICIT NONE PRIVATE - PUBLIC :: rho_r2g, rho_g2r + PUBLIC :: rho_r2g, rho_g2r, rho_r2g_gpu ! INTERFACE rho_g2r MODULE PROCEDURE rho_g2r_1, rho_g2r_2, rho_g2r_sum_components @@ -24,7 +24,10 @@ MODULE fft_rho ! CONTAINS ! + !------------------------------------------------------------- SUBROUTINE rho_r2g ( desc, rhor, rhog, v ) + !------------------------------------------------------ + !! Bring charge density rho from real to G- space USE fft_types, ONLY: fft_type_descriptor USE fft_helper_subroutines, ONLY: fftx_threed2oned ! @@ -93,6 +96,97 @@ CONTAINS END SUBROUTINE rho_r2g ! + !----------------------------------------------------------------- + SUBROUTINE rho_r2g_gpu( desc, rhor_d, rhog_d, v_d ) + !--------------------------------------------------------------- + !! Bring charge density rho from real to G- space - GPU version. + ! + USE fft_types, ONLY: fft_type_descriptor + USE fft_helper_subroutines, ONLY: fftx_threed2oned_gpu + ! + TYPE(fft_type_descriptor), INTENT(in) :: desc + REAL(dp), INTENT(in) :: rhor_d(:,:) + !! rho in real space + COMPLEX(dp), INTENT(OUT):: rhog_d(:,:) + !! rho in G-space + REAL(dp), OPTIONAL, INTENT(in) :: v_d(:) + ! + ! ... local variables + ! + INTEGER :: ir, ig, iss, isup, isdw + INTEGER :: nspin + COMPLEX(dp):: fp, fm + COMPLEX(dp), ALLOCATABLE :: psi_d(:) + ! + !$acc data deviceptr( rhor_d(:,:), rhog_d(:,:), v_d(:) ) + ! + nspin= SIZE(rhor_d, 2) + ! + ALLOCATE( psi_d(desc%nnr) ) + !$acc data create( psi_d(desc%nnr) ) + !$acc host_data use_device( psi_d ) + IF( nspin == 1 ) THEN + iss=1 + IF( PRESENT( v_d ) ) THEN + !$acc parallel loop + DO ir=1,desc%nnr + psi_d(ir)=CMPLX(rhor_d(ir,iss)+v_d(ir),0.0_dp,kind=dp) + END DO + ELSE + !$acc parallel loop + DO ir=1,desc%nnr + psi_d(ir)=CMPLX(rhor_d(ir,iss),0.0_dp,kind=dp) + END DO + END IF + CALL fwfft('Rho', psi_d, desc ) + CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,iss) ) + ELSE + IF ( gamma_only ) THEN + ! nspin/2 = 1 for LSDA, = 2 for noncolinear + DO iss=1,nspin/2 + isup=1+(iss-1)*nspin/2 ! 1 for LSDA, 1 and 3 for noncolinear + isdw=2+(iss-1)*nspin/2 ! 2 for LSDA, 2 and 4 for noncolinear + IF( PRESENT( v_d ) ) THEN + !$acc parallel loop + DO ir=1,desc%nnr + psi_d(ir)=CMPLX(rhor_d(ir,isup)+v_d(ir),rhor_d(ir,isdw)+v_d(ir),kind=dp) + END DO + ELSE + !$acc parallel loop + DO ir=1,desc%nnr + psi_d(ir)=CMPLX(rhor_d(ir,isup),rhor_d(ir,isdw),kind=dp) + END DO + END IF + CALL fwfft('Rho', psi_d, desc ) + CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,isup), rhog_d(:,isdw) ) + END DO + ELSE + DO iss=1,nspin + IF( PRESENT( v_d ) ) THEN + !$acc parallel loop + DO ir=1,desc%nnr + psi_d(ir)=CMPLX(rhor_d(ir,iss)+v_d(ir),0.0_dp,kind=dp) + END DO + ELSE + !$acc parallel loop + DO ir=1,desc%nnr + psi_d(ir)=CMPLX(rhor_d(ir,iss),0.0_dp,kind=dp) + END DO + END IF + CALL fwfft('Rho', psi_d, desc ) + CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,iss) ) + END DO + END IF + ENDIF + !$acc end host_data + !$acc end data + DEALLOCATE( psi_d ) + ! + !$acc end data + ! + END SUBROUTINE rho_r2g_gpu + ! + ! SUBROUTINE rho_g2r_1 ( desc, rhog, rhor ) USE fft_types, ONLY: fft_type_descriptor USE fft_helper_subroutines, ONLY: fftx_threed2oned, fftx_oned2threed diff --git a/Modules/gradutils.f90 b/Modules/gradutils.f90 index fb6d77b6b..217c05403 100644 --- a/Modules/gradutils.f90 +++ b/Modules/gradutils.f90 @@ -264,7 +264,152 @@ SUBROUTINE fft_gradient_g2r( dfft, a, g, ga ) RETURN ! END SUBROUTINE fft_gradient_g2r - +! +! +!---------------------------------------------------------------------------- +SUBROUTINE fft_gradient_g2r_gpu( dfft, a_d, g_d, ga_d ) + !---------------------------------------------------------------------------- + !! GPU double of \(\texttt{fft_gradient_g2r}\). + ! + USE cell_base, ONLY : tpiba + USE kinds, ONLY : DP + USE fft_interfaces, ONLY : invfft + USE fft_types, ONLY : fft_type_descriptor + ! + IMPLICIT NONE + ! + TYPE(fft_type_descriptor),INTENT(IN) :: dfft + !! FFT descriptor + COMPLEX(DP), INTENT(IN) :: a_d(dfft%ngm) + !! a(G), a complex function in G-space + REAL(DP), INTENT(IN) :: g_d(3,dfft%ngm) + !! G-vectors, in \( 2\pi/a \) units + REAL(DP), INTENT(OUT) :: ga_d(3,dfft%nnr) + !! The gradient of \({\bf a}\), real, on the real-space FFT grid + ! + ! ... local variables + ! + INTEGER :: ipol, n, ip + INTEGER, POINTER :: nl_d(:), nlm_d(:) + COMPLEX(DP), ALLOCATABLE :: gaux_d(:) + ! +#if defined(__CUDA) && defined(_OPENACC) + attributes(DEVICE) :: g_d, nl_d, nlm_d + ! + !$acc data deviceptr( a_d(dfft%ngm), ga_d(3,dfft%nnr) ) + ! + nl_d => dfft%nl_d + nlm_d => dfft%nlm_d + ! + ALLOCATE( gaux_d(dfft%nnr) ) + !$acc data create( gaux_d ) + !$acc host_data use_device( gaux_d ) + ! + IF ( dfft%lgamma ) THEN + ! + ! ... Gamma tricks: perform 2 FFT's in a single shot + ! x and y + ipol = 1 + !$acc parallel loop + DO n = 1, dfft%nnr + DO ip = 1, 3 + ga_d(ip,n) = 0.D0 + ENDDO + gaux_d(n) = (0.0_dp,0.0_dp) + ENDDO + ! + ! ... multiply a(G) by iG to get the gradient in real space + ! + !$acc parallel loop + DO n = 1, dfft%ngm + gaux_d(nl_d(n) ) = CMPLX( 0.0_dp, g_d(ipol,n),kind=DP)* a_d(n) - & + CMPLX(g_d(ipol+1,n),kind=DP) * a_d(n) + gaux_d(nlm_d(n)) = CMPLX( 0.0_dp,-g_d(ipol,n),kind=DP)*CONJG(a_d(n)) + & + CMPLX(g_d(ipol+1,n),kind=DP) * CONJG(a_d(n)) + ENDDO + ! + ! ... bring back to R-space, (\grad_ipol a)(r) ... + ! + CALL invfft( 'Rho', gaux_d, dfft ) + ! + ! ... bring back to R-space, (\grad_ipol a)(r) + ! ... add the factor 2\pi/a missing in the definition of q+G + ! + !$acc parallel loop + DO n = 1, dfft%nnr + ga_d(ipol, n) = REAL( gaux_d(n), kind=DP ) * tpiba + ga_d(ipol+1, n) = AIMAG( gaux_d(n) ) * tpiba + ENDDO + ! ... for z + ipol = 3 + !$acc parallel loop + DO n = 1, dfft%nnr + gaux_d(n) = (0.0_dp,0.0_dp) + ENDDO + ! + ! ... multiply a(G) by iG to get the gradient in real space + ! + !$acc parallel loop + DO n = 1, dfft%ngm + gaux_d(nl_d(n)) = CMPLX(g_d(ipol,n),kind=DP) * CMPLX( -AIMAG(a_d(n)), REAL(a_d(n)),kind=DP) + gaux_d(nlm_d(n)) = CONJG( gaux_d(nl_d(n)) ) + ENDDO + ! + ! ... bring back to R-space, (\grad_ipol a)(r) ... + ! + CALL invfft( 'Rho', gaux_d, dfft ) + ! + ! ...and add the factor 2\pi/a missing in the definition of G + ! + !$acc parallel loop + DO n = 1, dfft%nnr + ga_d(ipol,n) = tpiba * REAL( gaux_d(n), kind=DP ) + ENDDO + ! + ELSE + ! + DO ipol = 1, 3 + ! + !$acc parallel loop + DO n = 1, dfft%nnr + ga_d(ipol,n) = 0.D0 + gaux_d(n) = (0.0_dp,0.0_dp) + ENDDO + ! + ! ... multiply a(G) by iG to get the gradient in real space + ! + !$acc parallel loop + DO n = 1, dfft%ngm + gaux_d(nl_d(n)) = CMPLX(g_d(ipol,n), kind=DP) * CMPLX( -AIMAG(a_d(n)), REAL(a_d(n)), kind=DP) + ENDDO + ! + ! ... bring back to R-space, (\grad_ipol a)(r) ... + ! + CALL invfft( 'Rho', gaux_d, dfft ) + ! + ! ...and add the factor 2\pi/a missing in the definition of G + ! + !$acc parallel loop + DO n = 1, dfft%nnr + ga_d(ipol,n) = tpiba * REAL( gaux_d(n), kind=DP ) + ENDDO + ! + ENDDO + ! + ENDIF + ! + !$acc end host_data + !$acc end data + DEALLOCATE( gaux_d ) + ! + !$acc end data + ! +#endif + ! + RETURN + ! +END SUBROUTINE fft_gradient_g2r_gpu +! !---------------------------------------------------------------------------- SUBROUTINE fft_graddot( dfft, a, g, da ) !---------------------------------------------------------------------------- @@ -373,7 +518,152 @@ SUBROUTINE fft_graddot( dfft, a, g, da ) RETURN ! END SUBROUTINE fft_graddot - +! +! +!---------------------------------------------------------------------------- +SUBROUTINE fft_graddot_gpu( dfft, a_d, g_d, da_d ) + !--------------------------------------------------------------------------- + !! Calculates \( da = \sum_i \nabla_i a_i \) in R-space. + ! + USE cell_base, ONLY : tpiba + USE kinds, ONLY : DP + USE fft_interfaces, ONLY : fwfft, invfft + USE fft_types, ONLY : fft_type_descriptor + ! + IMPLICIT NONE + ! + TYPE(fft_type_descriptor),INTENT(IN) :: dfft + !! FFT descriptor + REAL(DP), INTENT(IN) :: a_d(3,dfft%nnr) + !! A real function on the real-space FFT grid + REAL(DP), INTENT(IN) :: g_d(3,dfft%ngm) + !! G-vectors, in \( 2\pi/a \) units + REAL(DP), INTENT(OUT) :: da_d(dfft%nnr) + !! \( \sum_i \nabla_i a_i \), real, on the real-space FFT grid + ! + ! ... local variables + ! + INTEGER :: n, ipol + INTEGER, POINTER :: nl_d(:), nlm_d(:) + COMPLEX(DP), ALLOCATABLE :: aux_d(:), gaux_d(:) + COMPLEX(DP) :: fp, fm, aux1, aux2 + ! +#if defined(__CUDA) && defined(_OPENACC) + attributes(DEVICE) :: g_d, nl_d, nlm_d + ! + !$acc data deviceptr( a_d(3,dfft%ngm), da_d(dfft%nnr) ) + ! + nl_d => dfft%nl_d + nlm_d => dfft%nlm_d + ! + ALLOCATE( aux_d(dfft%nnr), gaux_d(dfft%nnr) ) + !$acc data create( aux_d, gaux_d ) + !$acc host_data use_device( aux_d, gaux_d ) + ! + !$acc parallel loop + DO n = 1, dfft%nnr + gaux_d(n) = (0.0_dp,0.0_dp) + ENDDO + ! + IF ( dfft%lgamma ) THEN + ! + ! Gamma tricks: perform 2 FFT's in a single shot + ! x and y + ipol = 1 + ! + !$acc parallel loop + DO n = 1, dfft%nnr + aux_d(n) = CMPLX( a_d(ipol,n), a_d(ipol+1,n), kind=DP) + ENDDO + ! + ! ... bring a(ipol,r) to G-space, a(G) ... + ! + CALL fwfft( 'Rho', aux_d, dfft ) + ! + ! ... multiply by iG to get the gradient in G-space + ! + !$acc parallel loop + DO n = 1, dfft%ngm + fp = (aux_d(nl_d(n)) + aux_d(nlm_d(n)))*0.5_dp + fm = (aux_d(nl_d(n)) - aux_d(nlm_d(n)))*0.5_dp + aux1 = CMPLX( REAL(fp), AIMAG(fm), kind=DP) + aux2 = CMPLX(AIMAG(fp), -REAL(fm), kind=DP) + gaux_d(nl_d(n)) = & + CMPLX(0.0_dp, g_d(ipol ,n),kind=DP) * aux1 + & + CMPLX(0.0_dp, g_d(ipol+1,n),kind=DP) * aux2 + ENDDO + ! ... for z + ipol = 3 + !$acc parallel loop + DO n = 1, dfft%nnr + aux_d(n) = CMPLX( a_d(ipol,n), 0.0_dp, kind=DP) + ENDDO + ! + ! ... bring a(ipol,r) to G-space, a(G) ... + ! + CALL fwfft( 'Rho', aux_d, dfft ) + ! + ! ... multiply by iG to get the gradient in G-space + ! ... fill both gaux(G) and gaux(-G) = gaux*(G) + ! + !$acc parallel loop + DO n = 1, dfft%ngm + gaux_d(nl_d(n)) = gaux_d(nl_d(n)) + CMPLX(g_d(ipol,n),kind=DP) * & + CMPLX( -AIMAG( aux_d(nl_d(n)) ), & + REAL( aux_d(nl_d(n)) ), kind=DP) + gaux_d(nlm_d(n)) = CONJG( gaux_d(nl_d(n)) ) + ENDDO + ! + ELSE + ! + DO ipol = 1, 3 + ! + !$acc parallel loop + DO n = 1, dfft%nnr + aux_d(n) = CMPLX( a_d(ipol,n), 0.0_dp, kind=DP) + ENDDO + ! + ! ... bring a(ipol,r) to G-space, a(G) ... + ! + CALL fwfft( 'Rho', aux_d, dfft ) + ! + ! ... multiply by iG to get the gradient in G-space + ! + !$acc parallel loop + DO n = 1, dfft%ngm + gaux_d(nl_d(n)) = gaux_d(nl_d(n)) + CMPLX(g_d(ipol,n),kind=DP) * & + CMPLX( -AIMAG( aux_d(nl_d(n)) ), & + REAL( aux_d(nl_d(n)),kind=DP ), kind=DP) + ENDDO + ! + ENDDO + ! + ENDIF + ! + ! ... bring back to R-space, (\grad_ipol a)(r) ... + ! + CALL invfft( 'Rho', gaux_d, dfft ) + ! + ! ... add the factor 2\pi/a missing in the definition of G and sum + ! + !$acc parallel loop + DO n = 1, dfft%nnr + da_d(n) = tpiba * REAL( gaux_d(n), kind=DP ) + ENDDO + ! + !$acc end host_data + !$acc end data + DEALLOCATE( aux_d, gaux_d ) + ! + !$acc end data + ! +#endif + ! + RETURN + ! +END SUBROUTINE fft_graddot_gpu +! +! !-------------------------------------------------------------------- SUBROUTINE fft_qgraddot ( dfft, a, xq, g, da) !-------------------------------------------------------------------- diff --git a/PP/src/ppacf.f90 b/PP/src/ppacf.f90 index 0014617d5..114df7699 100644 --- a/PP/src/ppacf.f90 +++ b/PP/src/ppacf.f90 @@ -55,7 +55,7 @@ PROGRAM do_ppacf USE funct, ONLY : dft_is_nonlocc, nlc, enforce_input_dft USE xc_lib, ONLY : xclib_get_id, xclib_set_auxiliary_flags, & xclib_set_exx_fraction, xclib_dft_is_libxc, & - xclib_set_threshold, xc_gcx + xclib_set_threshold, xc, xc_gcx USE wvfct, ONLY : npw, npwx USE environment, ONLY : environment_start, environment_end USE vdW_DF, ONLY : Nqs, vdW_DF_potential, vdW_DF_energy, vdW_DF_analysis diff --git a/PW/src/compute_rho.f90 b/PW/src/compute_rho.f90 index 69054f7e7..14eb833b8 100644 --- a/PW/src/compute_rho.f90 +++ b/PW/src/compute_rho.f90 @@ -53,3 +53,60 @@ END IF RETURN END SUBROUTINE compute_rho +! +! +!------------------------------------------------------------------------------ +SUBROUTINE compute_rho_gpu( rho_d, rhoout_d, segni_d, nrxx ) + !-------------------------------------------------------------------------- + !! GPU double of \(\texttt{compute_rho}\). + ! + USE kinds, ONLY : DP + USE noncollin_module, ONLY : lsign, ux + ! + IMPLICIT NONE + ! + INTEGER :: nrxx + !! input: the dimension of the mesh + REAL(DP), INTENT(IN) :: rho_d(nrxx,4) + !! the four components of the charge + REAL(DP), INTENT(OUT) :: rhoout_d(nrxx,2) + !! the spin up and spin down charge + REAL(DP), INTENT(OUT) :: segni_d(nrxx) + !! the orientation when needed + ! + ! ... local variables + ! + REAL(DP) :: ux1, ux2, ux3, amag + INTEGER :: ir ! counter on mesh points + ! + !$acc data deviceptr( rho_d(nrxx,4), rhoout_d(nrxx,2), segni_d(nrxx) ) + ! + IF (lsign) THEN + ! + ux1 = ux(1) ; ux2 = ux(2) ; ux3 = ux(3) + ! + !$acc parallel loop + DO ir = 1, nrxx + segni_d(ir) = SIGN(1.0_DP,rho_d(ir,2)*ux1+rho_d(ir,3)*ux2+rho_d(ir,4)*ux3) + amag = SQRT(rho_d(ir,2)**2+rho_d(ir,3)**2+rho_d(ir,4)**2) + rhoout_d(ir,1) = 0.5d0*(rho_d(ir,1)+segni_d(ir)*amag) + rhoout_d(ir,2) = 0.5d0*(rho_d(ir,1)-segni_d(ir)*amag) + ENDDO + ! + ELSE + ! + !$acc parallel loop + DO ir = 1, nrxx + segni_d(ir) = 1.0_DP + amag = SQRT(rho_d(ir,2)**2+rho_d(ir,3)**2+rho_d(ir,4)**2) + rhoout_d(ir,1) = 0.5d0*(rho_d(ir,1) + amag) + rhoout_d(ir,2) = 0.5d0*(rho_d(ir,1) - amag) + ENDDO + ! + ENDIF + ! + !$acc end data + ! + RETURN + ! +END SUBROUTINE compute_rho_gpu diff --git a/PW/src/gradcorr.f90 b/PW/src/gradcorr.f90 index fc573b487..d81f43bfc 100644 --- a/PW/src/gradcorr.f90 +++ b/PW/src/gradcorr.f90 @@ -13,14 +13,15 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v ) ! USE constants, ONLY : e2 USE kinds, ONLY : DP - USE gvect, ONLY : ngm, g + USE gvect, ONLY : ngm, g, g_d USE lsda_mod, ONLY : nspin USE cell_base, ONLY : omega USE xc_lib, ONLY : igcc_is_lyp, xclib_dft_is, xc_gcx USE noncollin_module, ONLY : domag USE fft_base, ONLY : dfftp USE fft_interfaces, ONLY : fwfft - USE fft_rho, ONLY: rho_r2g + USE fft_rho, ONLY : rho_r2g, rho_r2g_gpu + USE control_flags, ONLY : use_gpu ! IMPLICIT NONE ! @@ -39,15 +40,15 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v ) REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:), v2c_ud(:) REAL(DP) :: sx(dfftp%nnr), sc(dfftp%nnr) ! - REAL(DP) :: sgn(2), etxcgc, vtxcgc, fac, amag - ! + REAL(DP) :: sgn_is, etxcgc, vtxcgc, fac, amag REAL(DP) :: grup, grdw - ! REAL(DP), PARAMETER :: epsr = 1.D-6, epsg = 1.D-10 ! - ! IF ( .NOT. xclib_dft_is('gradient') ) RETURN ! + !$acc data deviceptr( rho(dfftp%nnr,nspin), rho_core(dfftp%nnr), & + !$acc& rhog(ngm,nspin), rhog_core(ngm), v(dfftp%nnr,nspin) ) + ! etxcgc = 0.0_DP vtxcgc = 0.0_DP ! @@ -55,77 +56,111 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v ) IF ( nspin==4 ) nspin0 = 1 IF ( nspin==4 .AND. domag ) nspin0 = 2 fac = 1.0_DP / DBLE( nspin0 ) - sgn(1) = 1._DP ; sgn(2) = -1._DP ! ALLOCATE( h(3,dfftp%nnr,nspin0) ) ALLOCATE( grho(3,dfftp%nnr,nspin0) ) ALLOCATE( rhoaux(dfftp%nnr,nspin0) ) - ! ALLOCATE( v1x(dfftp%nnr,nspin0), v2x(dfftp%nnr,nspin0) ) ALLOCATE( v1c(dfftp%nnr,nspin0), v2c(dfftp%nnr,nspin0) ) + !$acc data create( rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, h ) ! - IF ( nspin==4 .AND. domag ) THEN - ALLOCATE( vgg(dfftp%nnr,nspin0) ) - ALLOCATE( vsave(dfftp%nnr,nspin) ) - ALLOCATE( segni(dfftp%nnr) ) - vsave=v - v=0._DP - ENDIF - ! - ALLOCATE( rhogaux( ngm, nspin0 ) ) + ALLOCATE( rhogaux(ngm,nspin0) ) + !$acc data create( rhogaux ) ! ! ... calculate the gradient of rho + rho_core in real space ! IF ( nspin == 4 .AND. domag ) THEN - ! - CALL compute_rho( rho, rhoaux, segni, dfftp%nnr ) - ! - ! ... bring starting rhoaux to G-space - ! - CALL rho_r2g ( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) ) - ! + ALLOCATE( vsave(dfftp%nnr,nspin) ) + ALLOCATE( segni(dfftp%nnr) ) + !$acc data copyout( segni, vsave ) + ! + !$acc parallel loop collapse(2) + DO is = 1, nspin + DO ir = 1, dfftp%nnr + vsave(ir,is) = v(ir,is) + v(ir,is) = 0._DP + ENDDO + ENDDO + ! + ! ... bring starting rhoaux to G-space + IF ( use_gpu ) THEN + !$acc host_data use_device( rhoaux, rhogaux, segni ) + CALL compute_rho_gpu( rho, rhoaux, segni, dfftp%nnr ) + CALL rho_r2g_gpu( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) ) + !$acc end host_data + ELSE + CALL compute_rho( rho, rhoaux, segni, dfftp%nnr ) + CALL rho_r2g( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) ) + ENDIF + !$acc end data ELSE - ! - ! ... for convenience rhoaux and rhogaux are in (up,down) format, when LSDA - ! - DO is = 1, nspin0 - rhoaux(:,is) = ( rho(:,1) + sgn(is) * rho(:,nspin0) ) * 0.5_DP - rhogaux(:,is) = ( rhog(:,1) + sgn(is) * rhog(:,nspin0) ) * 0.5_DP - ENDDO - ! + ! ... for convenience rhoaux and rhogaux are in (up,down) format, when LSDA + ! + !$acc parallel loop collapse(2) + DO is = 1, nspin0 + DO ir = 1, dfftp%nnr + sgn_is = DBLE(3-2*is) + rhoaux(ir,is) = ( rho(ir,1) + sgn_is * rho(ir,nspin0) ) * 0.5_DP + ENDDO + ENDDO + !$acc parallel loop collapse(2) + DO is = 1, nspin0 + DO ir = 1, ngm + sgn_is = DBLE(3-2*is) + rhogaux(ir,is) = ( rhog(ir,1) + CMPLX(sgn_is) * rhog(ir,nspin0) ) & + * (0.5_DP,0._DP) + ENDDO + ENDDO + ! ENDIF ! + !$acc parallel loop collapse(2) DO is = 1, nspin0 - ! - rhoaux(:,is) = fac * rho_core(:) + rhoaux(:,is) - rhogaux(:,is) = fac * rhog_core(:) + rhogaux(:,is) - ! - CALL fft_gradient_g2r( dfftp, rhogaux(1,is), g, grho(1,1,is) ) - ! + DO ir = 1, dfftp%nnr + rhoaux(ir,is) = fac * rho_core(ir) + rhoaux(ir,is) + ENDDO + ENDDO + !$acc parallel loop collapse(2) + DO is = 1, nspin0 + DO ir = 1, ngm + rhogaux(ir,is) = CMPLX(fac,kind=DP) * rhog_core(ir) + rhogaux(ir,is) + ENDDO ENDDO ! - DEALLOCATE( rhogaux ) + DO is = 1, nspin0 + IF ( use_gpu ) THEN + !$acc host_data use_device( rhogaux, grho ) + CALL fft_gradient_g2r_gpu( dfftp, rhogaux(:,is), g_d, grho(:,:,is) ) + !$acc end host_data + ELSE + CALL fft_gradient_g2r( dfftp, rhogaux(:,is), g, grho(:,:,is) ) + ENDIF + ENDDO ! + !$acc end data + DEALLOCATE( rhogaux ) ! IF ( nspin0 == 1 ) THEN ! ! ... This is the spin-unpolarised case ! - CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c ) + !$acc host_data use_device( rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c ) + CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, & + gpu_args_=.TRUE. ) + !$acc end host_data ! + !$acc parallel loop reduction(+:etxcgc) reduction(+:vtxcgc) DO k = 1, dfftp%nnr - ! ! ... first term of the gradient correction : D(rho*Exc)/D(rho) v(k,1) = v(k,1) + e2 * ( v1x(k,1) + v1c(k,1) ) - ! ! ... h contains: D(rho*Exc) / D(|grad rho|) * (grad rho) / |grad rho| - h(:,k,1) = e2 * ( v2x(k,1) + v2c(k,1) ) * grho(:,k,1) + DO ipol = 1, 3 + h(ipol,k,1) = e2 * ( v2x(k,1) + v2c(k,1) ) * grho(ipol,k,1) + ENDDO ! vtxcgc = vtxcgc + e2 * ( v1x(k,1) + v1c(k,1) ) * & - (rhoaux(k,1) - rho_core(k) ) - ! + ( rhoaux(k,1) - rho_core(k) ) etxcgc = etxcgc + e2 * ( sx(k) + sc(k) ) - ! ENDDO ! ELSE @@ -133,71 +168,84 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v ) ! ... spin-polarised case ! ALLOCATE( v2c_ud(dfftp%nnr) ) - ! - ! - CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud ) + !$acc data create( v2c_ud ) ! + !$acc host_data use_device( rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud ) + CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, & + v2c_ud, gpu_args_=.TRUE. ) + !$acc end host_data ! ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| ! + !$acc parallel loop reduction(+:etxcgc) reduction(+:vtxcgc) DO k = 1, dfftp%nnr - v(k,1:nspin0) = v(k,1:nspin0) + e2*( v1x(k,1:nspin0) + v1c(k,1:nspin0)) + ! + DO is = 1, nspin0 + v(k,is) = v(k,is) + e2*(v1x(k,is) + v1c(k,is)) + ENDDO + ! DO ipol = 1, 3 - ! grup = grho(ipol,k,1) grdw = grho(ipol,k,2) h(ipol,k,1) = e2*( (v2x(k,1) + v2c(k,1))*grup + v2c_ud(k)*grdw ) h(ipol,k,2) = e2*( (v2x(k,2) + v2c(k,2))*grdw + v2c_ud(k)*grup ) - ! ENDDO ! - vtxcgc = vtxcgc + & - e2 * ( v1x(k,1) + v1c(k,1) ) * ( rhoaux(k,1) - rho_core(k) * fac ) - vtxcgc = vtxcgc + & - e2 * ( v1x(k,2) + v1c(k,2) ) * ( rhoaux(k,2) - rho_core(k) * fac ) + vtxcgc = vtxcgc + e2 * ( & + ( v1x(k,1) + v1c(k,1) ) * ( rhoaux(k,1) - rho_core(k) * fac ) + & + ( v1x(k,2) + v1c(k,2) ) * ( rhoaux(k,2) - rho_core(k) * fac ) ) etxcgc = etxcgc + e2 * ( sx(k) + sc(k) ) ! ENDDO ! + !$acc end data DEALLOCATE( v2c_ud ) ! ENDIF ! - ! + !$acc parallel loop collapse(2) DO is = 1, nspin0 - ! - rhoaux(:,is) = rhoaux(:,is) - fac * rho_core(:) - ! - END DO + DO k = 1, dfftp%nnr + rhoaux(k,is) = rhoaux(k,is) - fac * rho_core(k) + ENDDO + ENDDO ! - DEALLOCATE( grho ) - DEALLOCATE( v1x, v2x ) - DEALLOCATE( v1c, v2c ) - ! - ALLOCATE( dh(dfftp%nnr) ) + ALLOCATE( dh(dfftp%nnr) ) + !$acc data create( dh ) ! ! ... second term of the gradient correction : ! ... \sum_alpha (D / D r_alpha) ( D(rho*Exc)/D(grad_alpha rho) ) ! + !$acc host_data use_device( rhoaux, dh, h ) DO is = 1, nspin0 - ! - CALL fft_graddot( dfftp, h(1,1,is), g, dh ) - ! - v(:,is) = v(:,is) - dh(:) - ! - vtxcgc = vtxcgc - SUM( dh(:) * rhoaux(:,is) ) - ! - END DO + IF ( use_gpu ) CALL fft_graddot_gpu( dfftp, h(1,1,is), g_d, dh ) + IF ( .NOT. use_gpu ) CALL fft_graddot( dfftp, h(1,1,is), g, dh ) + !$acc parallel loop + DO k = 1, dfftp%nnr + v(k,is) = v(k,is) - dh(k) + vtxcgc = vtxcgc - dh(k) * rhoaux(k,is) + ENDDO + ENDDO + !$acc end host_data + !$acc end data ! vtxc = vtxc + omega * vtxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 ) etxc = etxc + omega * etxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 ) ! IF (nspin==4 .AND. domag) THEN - DO is = 1, nspin0 - vgg(:,is) = v(:,is) + ALLOCATE( vgg(dfftp%nnr,nspin0) ) + !$acc data copyin( segni, vsave ) create( vgg ) + !$acc host_data use_device( segni, vgg, vsave ) + ! + !$acc parallel loop collapse(2) + DO is = 1, nspin + DO ir = 1, dfftp%nnr + IF (is<=nspin0) vgg(ir,is) = v(ir,is) + v(ir,is) = vsave(ir,is) + ENDDO ENDDO ! - v = vsave + !$acc parallel loop DO k = 1, dfftp%nnr v(k,1) = v(k,1) + 0.5d0*(vgg(k,1)+vgg(k,2)) amag = SQRT(rho(k,2)**2+rho(k,3)**2+rho(k,4)**2) @@ -207,17 +255,21 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v ) v(k,4) = v(k,4) + segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,4)/amag ENDIF ENDDO - ENDIF - ! - DEALLOCATE( dh ) - DEALLOCATE( h ) - DEALLOCATE( rhoaux ) - IF (nspin==4 .AND. domag) THEN - DEALLOCATE( vgg ) - DEALLOCATE( vsave ) + ! + !$acc end host_data + !$acc end data DEALLOCATE( segni ) + DEALLOCATE( vgg, vsave ) ENDIF ! + !$acc end data + ! + DEALLOCATE( rhoaux, grho ) + DEALLOCATE( v1x, v2x ) + DEALLOCATE( v1c, v2c ) + DEALLOCATE( dh, h ) + ! + !$acc end data ! RETURN ! diff --git a/PW/src/paw_onecenter.f90 b/PW/src/paw_onecenter.f90 index e06c93e82..44cce6131 100644 --- a/PW/src/paw_onecenter.f90 +++ b/PW/src/paw_onecenter.f90 @@ -429,7 +429,7 @@ MODULE paw_onecenter USE constants, ONLY : e2, eps12 USE lsda_mod, ONLY : nspin USE atom, ONLY : g => rgrid - USE xc_lib, ONLY : xclib_dft_is + USE xc_lib, ONLY : xclib_dft_is, xc USE constants, ONLY : fpi ! REMOVE ! TYPE(paw_info), INTENT(IN) :: i diff --git a/PW/src/stres_gradcorr.f90 b/PW/src/stres_gradcorr.f90 index 163b52953..76f0ab00a 100644 --- a/PW/src/stres_gradcorr.f90 +++ b/PW/src/stres_gradcorr.f90 @@ -12,7 +12,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & !---------------------------------------------------------------------------- ! USE kinds, ONLY: DP - USE xc_lib, ONLY: xclib_dft_is, xclib_get_id, xc_gcx + USE xc_lib, ONLY: xclib_dft_is, xclib_get_id, xc_gcx, xc_metagcx USE noncollin_module, ONLY: domag USE mp_bands, ONLY: intra_bgrp_comm USE mp, ONLY: mp_sum diff --git a/PW/src/v_of_rho.f90 b/PW/src/v_of_rho.f90 index a42ab7eee..8891224c1 100644 --- a/PW/src/v_of_rho.f90 +++ b/PW/src/v_of_rho.f90 @@ -63,7 +63,7 @@ SUBROUTINE v_of_rho( rho, rho_core, rhog_core, & ! ! ... calculate exchange-correlation potential ! - IF (xclib_dft_is('meta')) then + IF (xclib_dft_is('meta')) THEN CALL v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v%of_r, v%kin_r ) ELSE CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, v%of_r ) @@ -154,13 +154,15 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) USE constants, ONLY : e2, eps8 USE io_global, ONLY : stdout USE fft_base, ONLY : dfftp - USE gvect, ONLY : g, ngm + USE gvect, ONLY : g, g_d, ngm USE lsda_mod, ONLY : nspin USE cell_base, ONLY : omega USE funct, ONLY : dft_is_nonlocc, nlc - USE scf, ONLY : scf_type, rhoz_or_updw + USE scf, ONLY : scf_type + USE xc_lib, ONLY : xc_metagcx USE mp, ONLY : mp_sum USE mp_bands, ONLY : intra_bgrp_comm + USE control_flags, ONLY : use_gpu ! IMPLICIT NONE ! @@ -181,20 +183,19 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) ! ! ... local variables ! - REAL(DP) :: zeta, rh, sgn(2) + REAL(DP) :: zeta, rh, sgn_is INTEGER :: k, ipol, is, np ! REAL(DP), ALLOCATABLE :: ex(:), ec(:) REAL(DP), ALLOCATABLE :: v1x(:,:), v2x(:,:), v3x(:,:) REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:,:), v3c(:,:) ! - REAL(DP) :: fac - - REAL(DP), DIMENSION(2) :: grho2, rhoneg + REAL(DP) :: fac, rhoneg1, rhoneg2 + REAL(DP), DIMENSION(2) :: grho2 REAL(DP), DIMENSION(3) :: grhoup, grhodw ! - REAL(DP), ALLOCATABLE :: grho(:,:,:), h(:,:,:), dh(:) - REAL(DP), ALLOCATABLE :: rhoout(:) + REAL(DP), ALLOCATABLE :: h(:,:,:), dh(:) + REAL(DP), ALLOCATABLE :: rho_updw(:,:), grho(:,:,:), tau(:,:) COMPLEX(DP), ALLOCATABLE :: rhogsum(:) REAL(DP), PARAMETER :: eps12 = 1.0d-12, zero=0._dp ! @@ -203,15 +204,17 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) etxc = zero vtxc = zero v(:,:) = zero - rhoneg(:) = zero - sgn(1) = 1._dp ; sgn(2) = -1._dp + rhoneg1 = zero ; rhoneg2 = zero fac = 1.D0 / DBLE( nspin ) np = 1 IF (nspin==2) np=3 ! + !$acc data copyin( rho ) copyout( kedtaur, v ) + ! ALLOCATE( grho(3,dfftp%nnr,nspin) ) ALLOCATE( h(3,dfftp%nnr,nspin) ) - ALLOCATE( rhogsum(ngm) ) + ALLOCATE( rhogsum(ngm), tau(dfftp%nnr,nspin) ) + !$acc data create( tau, grho, h ) ! ALLOCATE( ex(dfftp%nnr), ec(dfftp%nnr) ) ALLOCATE( v1x(dfftp%nnr,nspin), v2x(dfftp%nnr,nspin) , v3x(dfftp%nnr,nspin) ) @@ -220,104 +223,158 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) ! ... calculate the gradient of rho + rho_core in real space ! ... in LSDA case rhogsum is in (up,down) format ! + !$acc data create( rhogsum ) copyin( rhog_core, rho%of_g, rho%kin_r ) DO is = 1, nspin ! - rhogsum(:) = fac*rhog_core(:) + ( rho%of_g(:,1) + sgn(is)*rho%of_g(:,nspin) )*0.5D0 + sgn_is = (-1.d0)**(is+1) ! - CALL fft_gradient_g2r( dfftp, rhogsum, g, grho(1,1,is) ) + !$acc parallel loop present(rho) + DO k = 1, ngm + rhogsum(k) = fac*rhog_core(k) + ( rho%of_g(k,1) + sgn_is*rho%of_g(k,nspin) )*0.5D0 + ENDDO + ! + IF ( use_gpu ) THEN + !$acc host_data use_device( rhogsum, grho ) + CALL fft_gradient_g2r_gpu( dfftp, rhogsum, g_d, grho(:,:,is) ) + !$acc end host_data + ELSE + CALL fft_gradient_g2r( dfftp, rhogsum, g, grho(:,:,is) ) + ENDIF ! ENDDO - DEALLOCATE(rhogsum) ! + !$acc parallel loop collapse(2) present(rho) + DO is = 1, nspin + DO k = 1, dfftp%nnr + tau(k,is) = rho%kin_r(k,is)/e2 + ENDDO + ENDDO ! + !$acc end data + DEALLOCATE( rhogsum ) + ! + !$acc data copyin( rho%of_r ) + !$acc data create( ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) IF (nspin == 1) THEN ! - CALL xc_metagcx( dfftp%nnr, 1, np, rho%of_r, grho, rho%kin_r/e2, ex, ec, & - v1x, v2x, v3x, v1c, v2c, v3c ) + !$acc host_data use_device( rho%of_r, grho, tau, ex, ec, & + !$acc& v1x, v2x, v3x, v1c, v2c, v3c ) + CALL xc_metagcx( dfftp%nnr, 1, np, rho%of_r, grho, tau, ex, ec, & + v1x, v2x, v3x, v1c, v2c, v3c, gpu_args_=.TRUE. ) + !$acc end host_data ! + !$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) & + !$acc& reduction(-:rhoneg2) present(rho) DO k = 1, dfftp%nnr ! v(k,1) = (v1x(k,1)+v1c(k,1)) * e2 ! - ! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| - h(:,k,1) = (v2x(k,1)+v2c(1,k,1)) * grho(:,k,1) * e2 + ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| + DO ipol = 1, 3 + h(ipol,k,1) = (v2x(k,1)+v2c(1,k,1)) * grho(ipol,k,1) * e2 + ENDDO ! kedtaur(k,1) = (v3x(k,1)+v3c(k,1)) * 0.5d0 * e2 ! etxc = etxc + (ex(k)+ec(k)) * e2 vtxc = vtxc + (v1x(k,1)+v1c(k,1)) * e2 * ABS(rho%of_r(k,1)) ! - IF (rho%of_r(k,1) < zero) rhoneg(1) = rhoneg(1)-rho%of_r(k,1) + IF (rho%of_r(k,1) < zero) rhoneg1 = rhoneg1-rho%of_r(k,1) ! ENDDO ! ELSE ! - CALL rhoz_or_updw( rho, 'only_r', '->updw' ) + ALLOCATE( rho_updw(dfftp%nnr,2) ) + !$acc data create( rho_updw ) ! - CALL xc_metagcx( dfftp%nnr, 2, np, rho%of_r, grho, rho%kin_r/e2, ex, ec, & - v1x, v2x, v3x, v1c, v2c, v3c ) + !$acc parallel loop present(rho) + DO k = 1, dfftp%nnr + rho_updw(k,1) = ( rho%of_r(k,1) + rho%of_r(k,2) ) * 0.5d0 + rho_updw(k,2) = ( rho%of_r(k,1) - rho%of_r(k,2) ) * 0.5d0 + ENDDO ! - ! first term of the gradient correction : D(rho*Exc)/D(rho) + !$acc host_data use_device( rho_updw, grho, tau, ex, ec, & + !$acc& v1x, v2x, v3x, v1c, v2c, v3c ) + CALL xc_metagcx( dfftp%nnr, 2, np, rho_updw, grho, tau, ex, ec, & + v1x, v2x, v3x, v1c, v2c, v3c, gpu_args_=.TRUE. ) + !$acc end host_data ! + ! ... first term of the gradient correction : D(rho*Exc)/D(rho) + ! + !$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) & + !$acc& reduction(-:rhoneg2) DO k = 1, dfftp%nnr ! v(k,1) = (v1x(k,1) + v1c(k,1)) * e2 v(k,2) = (v1x(k,2) + v1c(k,2)) * e2 ! - ! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| + ! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho| ! - h(:,k,1) = (v2x(k,1) * grho(:,k,1) + v2c(:,k,1)) * e2 - h(:,k,2) = (v2x(k,2) * grho(:,k,2) + v2c(:,k,2)) * e2 + DO ipol = 1, 3 + h(ipol,k,1) = (v2x(k,1) * grho(ipol,k,1) + v2c(ipol,k,1)) * e2 + h(ipol,k,2) = (v2x(k,2) * grho(ipol,k,2) + v2c(ipol,k,2)) * e2 + ENDDO ! kedtaur(k,1) = (v3x(k,1) + v3c(k,1)) * 0.5d0 * e2 kedtaur(k,2) = (v3x(k,2) + v3c(k,2)) * 0.5d0 * e2 ! etxc = etxc + (ex(k)+ec(k)) * e2 - vtxc = vtxc + (v1x(k,1)+v1c(k,1)) * ABS(rho%of_r(k,1)) * e2 - vtxc = vtxc + (v1x(k,2)+v1c(k,2)) * ABS(rho%of_r(k,2)) * e2 + vtxc = vtxc + (v1x(k,1)+v1c(k,1)) * ABS(rho_updw(k,1)) * e2 + & + (v1x(k,2)+v1c(k,2)) * ABS(rho_updw(k,2)) * e2 ! - IF ( rho%of_r(k,1) < 0.d0 ) rhoneg(1) = rhoneg(1) - rho%of_r(k,1) - IF ( rho%of_r(k,2) < 0.d0 ) rhoneg(2) = rhoneg(2) - rho%of_r(k,2) + IF ( rho_updw(k,1) < 0.d0 ) rhoneg1 = rhoneg1 - rho_updw(k,1) + IF ( rho_updw(k,2) < 0.d0 ) rhoneg2 = rhoneg2 - rho_updw(k,2) ! ENDDO ! - CALL rhoz_or_updw( rho, 'only_r', '->rhoz' ) + !$acc end data + DEALLOCATE( rho_updw ) ! ENDIF ! + !$acc end data DEALLOCATE( ex, ec ) DEALLOCATE( v1x, v2x, v3x ) DEALLOCATE( v1c, v2c, v3c ) ! - ! - ALLOCATE( dh( dfftp%nnr ) ) + ALLOCATE( dh( dfftp%nnr ) ) + !$acc data create( dh ) ! ! ... second term of the gradient correction : ! ... \sum_alpha (D / D r_alpha) ( D(rho*Exc)/D(grad_alpha rho) ) ! - ALLOCATE (rhoout(dfftp%nnr)) + !$acc host_data use_device( rho%of_r, dh, h ) DO is = 1, nspin ! - CALL fft_graddot( dfftp, h(1,1,is), g, dh ) + sgn_is = (-1.d0)**(is+1) ! - v(:,is) = v(:,is) - dh(:) + IF ( use_gpu ) CALL fft_graddot_gpu( dfftp, h(1,1,is), g_d, dh ) + IF ( .NOT. use_gpu ) CALL fft_graddot( dfftp, h(1,1,is), g, dh ) ! - ! ... rhoout is in (up,down) format + !$acc parallel loop reduction(-:vtxc) present(rho) + DO k = 1, dfftp%nnr + v(k,is) = v(k,is) - dh(k) + vtxc = vtxc - dh(k) * ( rho%of_r(k,1) + sgn_is*rho%of_r(k,nspin) )*0.5D0 + ENDDO ! - rhoout(:) = ( rho%of_r(:,1) + sgn(is)*rho%of_r(:,nspin) )*0.5D0 - vtxc = vtxc - SUM( dh(:) * rhoout(:) ) - ! - END DO - DEALLOCATE(rhoout) - DEALLOCATE(dh) + ENDDO + !$acc end host_data + !$acc end data + DEALLOCATE( dh ) ! - CALL mp_sum( rhoneg, intra_bgrp_comm ) + !$acc end data + !$acc end data + !$acc end data ! - rhoneg(:) = rhoneg(:) * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) + CALL mp_sum( rhoneg1, intra_bgrp_comm ) + CALL mp_sum( rhoneg2, intra_bgrp_comm ) ! - IF ((rhoneg(1) > eps8) .OR. (rhoneg(2) > eps8)) THEN - write (stdout, '(/,5x, "negative rho (up,down): ", 2es10.3)') rhoneg(:) + rhoneg1 = rhoneg1 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) + rhoneg2 = rhoneg2 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) + ! + IF ((rhoneg1 > eps8) .OR. (rhoneg2 > eps8)) THEN + write (stdout, '(/,5x, "negative rho (up,down): ", 2es10.3)') rhoneg1, rhoneg2 ENDIF ! vtxc = omega * vtxc / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) @@ -328,8 +385,8 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) CALL mp_sum( vtxc , intra_bgrp_comm ) CALL mp_sum( etxc , intra_bgrp_comm ) ! - DEALLOCATE(grho) - DEALLOCATE(h) + DEALLOCATE( tau, grho ) + DEALLOCATE( h ) ! CALL stop_clock( 'v_xc_meta' ) ! @@ -337,10 +394,10 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur ) ! END SUBROUTINE v_xc_meta ! -! +!------------------------------------------------------------------------------ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v ) !---------------------------------------------------------------------------- - !! Exchange-Correlation potential Vxc(r) from n(r) + !! Exchange-Correlation potential from charge density. ! USE kinds, ONLY : DP USE constants, ONLY : e2, eps8 @@ -352,8 +409,10 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v ) USE noncollin_module, ONLY : domag USE funct, ONLY : nlc, dft_is_nonlocc USE scf, ONLY : scf_type + USE xc_lib, ONLY : xc USE mp_bands, ONLY : intra_bgrp_comm USE mp, ONLY : mp_sum + USE control_flags, ONLY : use_gpu ! IMPLICIT NONE ! @@ -364,19 +423,17 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v ) COMPLEX(DP), INTENT(IN) :: rhog_core(ngm) !! the core charge in reciprocal space REAL(DP), INTENT(OUT) :: v(dfftp%nnr,nspin) - !! V_xc potential + !! \(V_{xc}\) potential REAL(DP), INTENT(OUT) :: vtxc - !! integral V_xc * rho + !! integral \(V_{xc}\cdot\text{rho}\) REAL(DP), INTENT(OUT) :: etxc - !! E_xc energy + !! \(E_{xc}\) energy ! ! ... local variables ! REAL(DP) :: rhoneg(2), vs - ! - !REAL(DP), ALLOCATABLE :: arhox(:), amag(:), zeta(:) - REAL(DP) :: arho, amag - REAL(DP) :: rhoup2, rhodw2 + REAL(DP) :: rhoup2, rhodw2, rhoneg1, rhoneg2 + REAL(DP) :: arho, amag, vtxc24 REAL(DP), ALLOCATABLE :: ex(:), ec(:) REAL(DP), ALLOCATABLE :: vx(:,:), vc(:,:) ! In order: @@ -390,98 +447,110 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v ) INTEGER :: ir, ipol ! counter on mesh points ! counter on nspin - ! + ! number of mesh points (=dfftp%nnr) REAL(DP), PARAMETER :: vanishing_charge = 1.D-10, & vanishing_mag = 1.D-20 ! - ! CALL start_clock( 'v_xc' ) ! - ALLOCATE( ex(dfftp%nnr) ) - ALLOCATE( ec(dfftp%nnr) ) - ALLOCATE( vx(dfftp%nnr,nspin) ) - ALLOCATE( vc(dfftp%nnr,nspin) ) + etxc = 0.D0 ; rhoneg1 = 0.D0 + vtxc = 0.D0 ; rhoneg2 = 0.D0 ! - etxc = 0.D0 - vtxc = 0.D0 - v(:,:) = 0.D0 - rhoneg = 0.D0 + !$acc data copyin( rho_core, rhog_core, rho ) copyout( v ) + !$acc data copyin( rho%of_r, rho%of_g ) ! + ALLOCATE( ex(dfftp%nnr), vx(dfftp%nnr,nspin) ) + ALLOCATE( ec(dfftp%nnr), vc(dfftp%nnr,nspin) ) + !$acc data create( ex, ec, vx, vc ) ! - rho%of_r(:,1) = rho%of_r(:,1) + rho_core(:) + !$acc host_data use_device( rho%of_r, rho%of_g, rho_core, rhog_core, v,& + !$acc& ex, ec, vx, vc ) + ! + !$acc parallel loop + DO ir = 1, dfftp%nnr + rho%of_r(ir,1) = rho%of_r(ir,1) + rho_core(ir) + ENDDO ! IF ( nspin == 1 .OR. ( nspin == 4 .AND. .NOT. domag ) ) THEN ! ... spin-unpolarized case ! - CALL xc( dfftp%nnr, 1, 1, rho%of_r, ex, ec, vx, vc ) + CALL xc( dfftp%nnr, 1, 1, rho%of_r, ex, ec, vx, vc, gpu_args_=.TRUE. ) ! + !$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) & + !$acc& present(rho) DO ir = 1, dfftp%nnr v(ir,1) = e2*( vx(ir,1) + vc(ir,1) ) etxc = etxc + e2*( ex(ir) + ec(ir) )*rho%of_r(ir,1) rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir) vtxc = vtxc + v(ir,1)*rho%of_r(ir,1) - IF (rho%of_r(ir,1) < 0.D0) rhoneg(1) = rhoneg(1)-rho%of_r(ir,1) + IF (rho%of_r(ir,1) < 0.D0) rhoneg1 = rhoneg1-rho%of_r(ir,1) ENDDO ! ! ELSEIF ( nspin == 2 ) THEN ! ... spin-polarized case ! - CALL xc( dfftp%nnr, 2, 2, rho%of_r, ex, ec, vx, vc ) + CALL xc( dfftp%nnr, 2, 2, rho%of_r, ex, ec, vx, vc, gpu_args_=.TRUE. ) ! - DO ir = 1, dfftp%nnr !OMP ? - v(ir,:) = e2*( vx(ir,:) + vc(ir,:) ) + !$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) & + !$acc& reduction(-:rhoneg2) present(rho) + DO ir = 1, dfftp%nnr + v(ir,1) = e2*( vx(ir,1) + vc(ir,1) ) + v(ir,2) = e2*( vx(ir,2) + vc(ir,2) ) etxc = etxc + e2*( (ex(ir) + ec(ir))*rho%of_r(ir,1) ) rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir) vtxc = vtxc + ( ( v(ir,1) + v(ir,2) )*rho%of_r(ir,1) + & - ( v(ir,1) - v(ir,2) )*rho%of_r(ir,2) ) + ( v(ir,1) - v(ir,2) )*rho%of_r(ir,2) )*0.5d0 ! rhoup2 = rho%of_r(ir,1)+rho%of_r(ir,2) rhodw2 = rho%of_r(ir,1)-rho%of_r(ir,2) - IF (rhoup2 < 0.d0) rhoneg(1) = rhoneg(1) - rhoup2 - IF (rhodw2 < 0.d0) rhoneg(2) = rhoneg(2) - rhodw2 + IF (rhoup2 < 0.d0) rhoneg1 = rhoneg1 - rhoup2*0.5d0 + IF (rhodw2 < 0.d0) rhoneg2 = rhoneg2 - rhodw2*0.5d0 ENDDO ! - vtxc = 0.5d0 * vtxc - rhoneg = 0.5d0 * rhoneg - ! - ! - ELSE IF ( nspin == 4 ) THEN - ! ... noncolinear case - ! - CALL xc( dfftp%nnr, 4, 2, rho%of_r, ex, ec, vx, vc ) - ! - DO ir = 1, dfftp%nnr !OMP ? - arho = ABS( rho%of_r(ir,1) ) - IF ( arho < vanishing_charge ) CYCLE - vs = 0.5D0*( vx(ir,1) + vc(ir,1) - vx(ir,2) - vc(ir,2) ) - v(ir,1) = e2*( 0.5D0*( vx(ir,1) + vc(ir,1) + vx(ir,2) + vc(ir,2) ) ) - ! - amag = SQRT( SUM( rho%of_r(ir,2:4)**2 ) ) - IF ( amag > vanishing_mag ) THEN - v(ir,2:4) = e2 * vs * rho%of_r(ir,2:4) / amag - vtxc = vtxc + SUM( v(ir,2:4) * rho%of_r(ir,2:4) ) - ENDIF - etxc = etxc + e2*( ex(ir) + ec(ir) ) * arho - ! - rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir) - IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1) - IF ( amag / arho > 1.D0 ) rhoneg(2) = rhoneg(2) + 1.D0/omega - vtxc = vtxc + v(ir,1) * rho%of_r(ir,1) - ENDDO - ! - ! + ELSEIF ( nspin == 4 ) THEN + ! ... noncollinear case + ! + CALL xc( dfftp%nnr, 4, 2, rho%of_r, ex, ec, vx, vc, gpu_args_=.TRUE. ) + ! + !$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) & + !$acc& reduction(+:rhoneg2) present(rho) + DO ir = 1, dfftp%nnr + arho = ABS( rho%of_r(ir,1) ) + IF ( arho < vanishing_charge ) CYCLE + vs = 0.5D0*( vx(ir,1) + vc(ir,1) - vx(ir,2) - vc(ir,2) ) + v(ir,1) = e2*( 0.5D0*( vx(ir,1) + vc(ir,1) + vx(ir,2) + vc(ir,2) ) ) + ! + amag = SQRT( SUM( rho%of_r(ir,2:4)**2 ) ) + IF ( amag > vanishing_mag ) THEN + v(ir,2:4) = e2 * vs * rho%of_r(ir,2:4) / amag + vtxc24 = SUM( v(ir,2:4) * rho%of_r(ir,2:4) ) + ELSE + vtxc24 = 0.d0 + ENDIF + etxc = etxc + e2*( ex(ir) + ec(ir) ) * arho + ! + rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir) + IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg1 = rhoneg1 - rho%of_r(ir,1) + IF ( amag / arho > 1.D0 ) rhoneg2 = rhoneg2 + 1.D0/omega + vtxc = vtxc + vtxc24 + v(ir,1) * rho%of_r(ir,1) + ENDDO + ! ENDIF ! - DEALLOCATE( ex, ec ) - DEALLOCATE( vx, vc ) + !$acc end host_data + !$acc end data + DEALLOCATE( ex, vx ) + DEALLOCATE( ec, vc ) ! - CALL mp_sum( rhoneg , intra_bgrp_comm ) + CALL mp_sum( rhoneg1 , intra_bgrp_comm ) + CALL mp_sum( rhoneg2 , intra_bgrp_comm ) ! - rhoneg(:) = rhoneg(:) * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) + rhoneg1 = rhoneg1 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) + rhoneg2 = rhoneg2 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 ) ! - IF ( rhoneg(1) > eps8 .OR. rhoneg(2) > eps8 ) & - WRITE( stdout,'(/,5X,"negative rho (up, down): ",2ES10.3)') rhoneg + IF ( rhoneg1 > eps8 .OR. rhoneg2 > eps8 ) & + WRITE( stdout,'(/,5X,"negative rho (up, down): ",2ES10.3)') rhoneg1, rhoneg2 ! ! ... energy terms, local-density contribution ! @@ -490,7 +559,12 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v ) ! ! ... add gradient corrections (if any) ! + !$acc host_data use_device( rho%of_r, rho%of_g, rho_core, rhog_core, v ) CALL gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, etxc, vtxc, v ) + !$acc end host_data + ! + !$acc end data + !$acc end data ! ! ... add non local corrections (if any) ! diff --git a/XClib/CMakeLists.txt b/XClib/CMakeLists.txt index 0db519f24..6848b4a1f 100644 --- a/XClib/CMakeLists.txt +++ b/XClib/CMakeLists.txt @@ -44,6 +44,8 @@ set(sources_libbeef beefun.c) qe_add_library(qe_libbeef ${sources_libbeef}) target_link_libraries(qe_libbeef + PUBLIC + qe_openacc_c PRIVATE qe_lapack) @@ -87,6 +89,7 @@ if(QE_ENABLE_TEST) qe_mpi_fortran qe_upflib qe_external_libxc + qe_libbeef qe_xclib) find_program(BASH_PROGRAM bash) diff --git a/XClib/Makefile b/XClib/Makefile index afccc9ecb..db6ec9cdd 100644 --- a/XClib/Makefile +++ b/XClib/Makefile @@ -41,6 +41,7 @@ xclib_utils_and_para.o \ dft_setting_params.o \ dft_setting_routines.o \ qe_dft_list.o \ +xc_beef_interface.o \ xc_lib.o all: xc_lib.a diff --git a/XClib/beefleg.h b/XClib/beefleg.h index 8aac045d1..57880bb70 100755 --- a/XClib/beefleg.h +++ b/XClib/beefleg.h @@ -8,14 +8,25 @@ #define nmax 30 +#pragma acc routine (dgemv_) seq extern void dgemv_(const char *, const int *, const int *, const double *, double *, const int *, double *, const int *, const double *, double *, const int *); +#pragma acc routine seq +double ddot1(double v[], double u[], int n) +{ + double result = 0.0; + for (int i = 0; i < n; i++) + result += v[i]*u[i]; + return result; +} + extern double ddot_(const int *, double *, const int *, double *, const int *); //beef exchange enhancement factor legendre polynomial coefficients -static double mi[] = { +#pragma acc declare copyin (mi[0:nmax-1]) +static double mi[nmax] = { 1.516501714304992365356, 0.441353209874497942611, -0.091821352411060291887, @@ -392,7 +403,7 @@ static double beefmat[] = { L[1] = x; int i=0;\ for(i=2;i=0) { - (*LdLn[beeforder])(t, &fx, &dl); - + //(*LdLn[beeforder])(t, &fx, &dl); + LdLnACC(t, &fx, &dl, beeforder); dfx = dl*( 4.*s / (4.+s2) - 4.*s2*s/sq(4.+s2) ); *dr = dx*fx - 4./3.*s2/(s*(*r))*sx*dfx; *dg = sx*dfx*pix/(s*r83); @@ -60,13 +63,13 @@ void beefx_(double *r, double *g, double *e, double *dr, double *dg, int *addlda *dg = 0.; *e = 0.; } - break; } } // evaluate local part of bee correlation and its derivatives de/drho and ( de/d|grad rho| ) / |grad rho| +#pragma acc routine seq void beeflocalcorr_(double *r, double *g, double *e, double *dr, double *dg, int *addlda) { double rs, ldac, ldadr, pbec, pbedr, pbed2rho; @@ -80,10 +83,9 @@ void beeflocalcorr_(double *r, double *g, double *e, double *dr, double *dg, int } switch(beeftype) { - case 0: //BEEF-vdW xc + case 0: //BEEF-vdW xc rs = invpi075tothird / pow(*r,1./3.); - corpbe(rs, 0.5/r2k * sqrt(*g*rs) / (*r), - (beeforder>-3), 1, &ldac, &ldadr, &pbec, &pbedr, &pbed2rho); + corpbe(rs, 0.5/r2k * sqrt(*g*rs) / (*r),(beeforder>-3), 1, &ldac, &ldadr, &pbec, &pbedr, &pbed2rho); if(beeforder==-1) { @@ -131,9 +133,11 @@ void beefxpot_(double *r, double *g, double *e, int *addlda) const int n=nmax; const int i1=1; const int i2=1; + + double L[nmax]={1.}; switch(beeftype) { - case 0: //BEEF-vdW xc + case 0: //BEEF-vdW xc r43 = pow(*r, 4./3.); s2 = *g*pix / (r43*r43); @@ -171,7 +175,7 @@ void beeflocalcorrpot_(double *r, double *g, double *e, int *addlda) } switch(beeftype) { - case 0: //BEEF-vdW xc + case 0: //BEEF-vdW xc rs = invpi075tothird / pow(*r,1./3.); corpbe(rs, 0.5/r2k * sqrt(*g*rs) / (*r), (beeforder>-3), 0, &ldac, &ldadr, &pbec, &pbedr, &pbed2rho); @@ -199,6 +203,7 @@ void beeflocalcorrpot_(double *r, double *g, double *e, int *addlda) // evaluate local part of bee correlation for spin polarized system +#pragma acc routine seq void beeflocalcorrspin_(double *r, double *z, double *g, double *e, double *drup, double *drdown, double *dg, int *addlda) { double rs, ldac, ldadrup, ldadrdown, pbec, pbedrup, pbedrdown, pbed2rho; @@ -213,7 +218,7 @@ void beeflocalcorrspin_(double *r, double *z, double *g, double *e, } switch(beeftype) { - case 0: //BEEF-vdW xc + case 0: //BEEF-vdW xc rs = invpi075tothird / pow(*r,1./3.); corpbespin(rs, 0.5/r2k * sqrt(*g*rs) / (*r), *z, (beeforder>-3), 1, &ldac, &ldadrup, &ldadrdown, &pbec, @@ -275,7 +280,7 @@ void beeflocalcorrpotspin_(double *r, double *z, double *g, double *e, int *addl } switch(beeftype) { - case 0: //BEEF-vdW xc + case 0: //BEEF-vdW xc rs = invpi075tothird / pow(*r,1./3.); corpbespin(rs, 0.5/r2k * sqrt(*g*rs) / (*r), *z, (beeforder>-3), 0, &ldac, &ldadrup, &ldadrdown, &pbec, @@ -311,6 +316,7 @@ void beeflocalcorrpotspin_(double *r, double *z, double *g, double *e, int *addl void beefsetmode_(int *mode) { beeforder = *mode; +#pragma acc update device(beeforder) } // initialize pseudo random number generator @@ -357,6 +363,7 @@ void beefensemble_(double *beefxc, double *ensemble) int beef_set_type_(int *tbeef, int *ionode) { beeftype = *tbeef; +#pragma acc update device(beeftype) if(*ionode) { diff --git a/XClib/dft_setting_routines.f90 b/XClib/dft_setting_routines.f90 index 225a1a651..764a8b78d 100644 --- a/XClib/dft_setting_routines.f90 +++ b/XClib/dft_setting_routines.f90 @@ -546,7 +546,6 @@ CONTAINS DO l = 1, LEN_TRIM(lxc_name) lxc_name(l:l) = capital( lxc_name(l:l) ) ENDDO - nlxc = 0 ii = 0 DO WHILE ( ii < LEN_TRIM(lxc_name) ) ii = ii + 1 diff --git a/XClib/pbecor.c b/XClib/pbecor.c index b408a4189..dfd48fa1f 100755 --- a/XClib/pbecor.c +++ b/XClib/pbecor.c @@ -29,9 +29,9 @@ /* ---------------------------------------------------------------------- */ /* ###################################################################### */ /* ---------------------------------------------------------------------- */ -/* Subroutine */ static void gcor2(double a, double a1, double b1, - double b2, double b3, double b4, double rtrs, - double *gg, double *ggrs) +/* Subroutine */ +#pragma acc routine seq +static void gcor2(double a, double a1, double b1, double b2, double b3, double b4, double rtrs, double *gg, double *ggrs) { /* Local variables */ double q0, q1, q2, q3; @@ -61,10 +61,9 @@ /* ###################################################################### */ /* ---------------------------------------------------------------------- */ -/* Subroutine */ void corpbe(double rs, double t, - int lgga, - int lpot, double *ec, double *vc, - double *h__, double *dvc, double *dv2rho) +/* Subroutine */ +#pragma acc routine seq +void corpbe(double rs, double t, int lgga, int lpot, double *ec, double *vc, double *h__, double *dvc, double *dv2rho) { /* Local variables */ double b, b2, q4, t2, q5, t4, @@ -168,7 +167,9 @@ /* ###################################################################### */ /* ---------------------------------------------------------------------- */ -/* Subroutine */ void corpbespin(double rs, double t, double zet, +/* Subroutine */ +#pragma acc routine seq +void corpbespin(double rs, double t, double zet, int lgga, int lpot, double *ec, double *vcup, double *vcdown, double *h__, double *dvcup, double *dvcdown, double *dv2rho) diff --git a/XClib/pbecor.h b/XClib/pbecor.h index a52bf8f18..b2b3c162d 100755 --- a/XClib/pbecor.h +++ b/XClib/pbecor.h @@ -1,10 +1,12 @@ #ifndef pbecor_h #define pbecor_h 1 +#pragma acc routine seq extern void corpbe(double, double, int, int, double *, double *, double *, double *, double *); +#pragma acc routine seq void corpbespin(double, double, double, int, int, double *, double *, double *, double *, double *, double *, double *); diff --git a/XClib/qe_drivers_d_gga.f90 b/XClib/qe_drivers_d_gga.f90 index 32b44136f..f26adee53 100644 --- a/XClib/qe_drivers_d_gga.f90 +++ b/XClib/qe_drivers_d_gga.f90 @@ -35,7 +35,7 @@ SUBROUTINE dgcxc_unpol( length, r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc !! This routine computes the derivative of the exchange and correlation !! potentials of GGA family. ! - USE qe_drivers_gga, ONLY: gcxc, gcxc_beef + USE qe_drivers_gga, ONLY: gcxc ! IMPLICIT NONE ! @@ -90,10 +90,11 @@ SUBROUTINE dgcxc_unpol( length, r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc raux(i3:f3) = r_in ; s2aux(i3:f3) = (s+ds)**2 raux(i4:f4) = r_in ; s2aux(i4:f4) = (s-ds)**2 ! + !$acc data copyin( raux, s2aux ) copyout( sx, sc, v1x, v2x, v1c, v2c ) + !$acc host_data use_device( raux, s2aux, sx, sc, v1x, v2x, v1c, v2c ) CALL gcxc( length*4, raux, s2aux, sx, sc, v1x, v2x, v1c, v2c ) - ! - IF ( igcx==43 .OR. igcc==14 ) CALL gcxc_beef( length*4, raux, s2aux, & - sx, sc, v1x, v2x, v1c, v2c ) + !$acc end host_data + !$acc end data ! ! ... to avoid NaN in the next operations WHERE( r_in<=small .OR. s2_in<=small ) @@ -129,7 +130,7 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, & !! This routine computes the derivative of the exchange and correlation !! potentials in the spin-polarized case. ! - USE qe_drivers_gga, ONLY: gcx_spin, gcc_spin, gcx_spin_beef, gcc_spin_beef + USE qe_drivers_gga, ONLY: gcx_spin, gcc_spin ! IMPLICIT NONE ! @@ -239,8 +240,11 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, & raux(i8:f8,:) = r ; s2aux(i8:f8,:) = (s-dsdw)**2 ! ! - IF ( igcx/=43 ) CALL gcx_spin( length*8, raux, s2aux, sx, v1x, v2x ) - IF ( igcx==43 ) CALL gcx_spin_beef( length*8, raux, s2aux, sx, v1x, v2x ) + !$acc data copyin( raux, s2aux ) copyout( sx, v1x, v2x ) + !$acc host_data use_device( raux, s2aux, sx, v1x, v2x ) + CALL gcx_spin( length*8, raux, s2aux, sx, v1x, v2x ) + !$acc end host_data + !$acc end data ! ! ... up vrrx(:,1) = 0.5_DP * (v1x(i1:f1,1) - v1x(i2:f2,1)) / drup(:,1) @@ -311,8 +315,11 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, & rtaux(i5:f5) = rt ; s2taux(i5:f5) = s2t ; zetaux(i5:f5) = zeta+dz rtaux(i6:f6) = rt ; s2taux(i6:f6) = s2t ; zetaux(i6:f6) = zeta-dz ! - IF ( igcc/=14 ) CALL gcc_spin( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c ) - IF ( igcc==14 ) CALL gcc_spin_beef( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c ) + !$acc data copyin( rtaux, zetaux, s2taux ) copyout( sc, v1c, v2c ) + !$acc host_data use_device( rtaux, zetaux, s2taux, sc, v1c, v2c ) + CALL gcc_spin( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c ) + !$acc end host_data + !$acc end data ! vrrc(:,1) = 0.5_DP * (v1c(i1:f1,1) - v1c(i2:f2,1)) / dr * null_v(:,1) vrrc(:,2) = 0.5_DP * (v1c(i1:f1,2) - v1c(i2:f2,2)) / dr * null_v(:,1) diff --git a/XClib/qe_drivers_d_lda_lsda.f90 b/XClib/qe_drivers_d_lda_lsda.f90 index c6d740e77..d2f933a99 100644 --- a/XClib/qe_drivers_d_lda_lsda.f90 +++ b/XClib/qe_drivers_d_lda_lsda.f90 @@ -125,7 +125,11 @@ SUBROUTINE dmxc_lda( length, rho_in, dmuxc ) rhoaux(i1:f1) = arho+dr rhoaux(i2:f2) = arho-dr ! + !$acc data copyin( rhoaux ) copyout( ex, ec, vx, vc ) + !$acc host_data use_device( rhoaux, ex, ec, vx, vc ) CALL xc_lda( length*2, rhoaux, ex, ec, vx, vc ) + !$acc end host_data + !$acc end data ! WHERE ( arho < small ) dr = 1.0_DP ! ... to avoid NaN in the next operation ! @@ -316,7 +320,11 @@ SUBROUTINE dmxc_lsda( length, rho_in, dmuxc ) rhoaux(i3:f3) = rhotot ; zetaux(i3:f3) = zeta_eff + dz rhoaux(i4:f4) = rhotot ; zetaux(i4:f4) = zeta_eff - dz ! + !$acc data copyin( rhoaux, zetaux ) copyout( aux1, aux2, vx, vc ) + !$acc host_data use_device( rhoaux, zetaux, aux1, aux2, vx, vc ) CALL xc_lsda( length*4, rhoaux, zetaux, aux1, aux2, vx, vc ) + !$acc end host_data + !$acc end data ! WHERE (rhotot <= small) ! ... to avoid NaN in the next operations dr=1.0_DP ; rhotot=0.5d0 @@ -438,7 +446,11 @@ SUBROUTINE dmxc_nc( length, rho_in, m, dmuxc ) rhoaux(i5:f5) = rhotot ; zetaux(i5:f5) = zeta_eff - dz ! ! + !$acc data copyin( rhoaux, zetaux ) copyout( aux1, aux2, vx, vc ) + !$acc host_data use_device( rhoaux, zetaux, aux1, aux2, vx, vc ) CALL xc_lsda( length*5, rhoaux, zetaux, aux1, aux2, vx, vc ) + !$acc end host_data + !$acc end data ! ! vs(:) = 0.5_DP*( vx(i1:f1,1)+vc(i1:f1,1)-vx(i1:f1,2)-vc(i1:f1,2) ) diff --git a/XClib/qe_drivers_gga.f90 b/XClib/qe_drivers_gga.f90 index 0bddeb8cf..28913c469 100644 --- a/XClib/qe_drivers_gga.f90 +++ b/XClib/qe_drivers_gga.f90 @@ -26,8 +26,7 @@ MODULE qe_drivers_gga ! PRIVATE ! - PUBLIC :: gcxc, gcx_spin, gcc_spin, gcc_spin_more, gcxc_beef, gcx_spin_beef, & - gcc_spin_beef + PUBLIC :: gcxc, gcx_spin, gcc_spin, gcc_spin_more ! ! CONTAINS @@ -41,6 +40,7 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, & ! USE exch_gga USE corr_gga + USE beef_interface, ONLY: beefx, beeflocalcorr ! IMPLICIT NONE ! @@ -80,12 +80,11 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, & ntids = omp_get_num_threads() #endif ! - ! #if defined(_OPENACC) -!$acc data copyin(rho_in,grho_in), copyout(sx_out,sc_out,v1x_out,v2x_out,v1c_out,v2c_out) +!$acc data deviceptr( rho_in(length), grho_in(length), sx_out(length), sc_out(length), & +!$acc& v1x_out(length), v2x_out(length), v1c_out(length), v2c_out(length) ) !$acc parallel loop -#endif -#if defined(_OPENMP) && !defined(_OPENACC) +#else !$omp parallel if(ntids==1) default(none) & !$omp private( rho, grho, sx, sx_, sxsr, v1x, v1x_, v1xsr, & !$omp v2x, v2x_, v2xsr, sc, v1c, v2c ) & @@ -298,6 +297,10 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, & v2x = (1.0_DP - exx_fraction) * v2x ENDIF ! + CASE( 43 ) ! 'BEEX' + ! + CALL beefx( rho, grho, sx, v1x, v2x, 0 ) + ! CASE( 44 ) ! 'RPBE' ! CALL pbex( rho, grho, 8, sx, v1x, v2x ) @@ -369,6 +372,11 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, & v2c = 0.871_DP * v2c ENDIF ! + CASE( 14 ) !'BEEC' + ! last parameter 0 means: do not add lda contributions + ! espresso will do that itself + CALL beeflocalcorr( rho, grho, sc, v1c, v2c, 0 ) + ! CASE DEFAULT ! sc = 0.0_DP @@ -402,6 +410,7 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out ) !! Gradient corrections for exchange - Hartree a.u. ! USE exch_gga + USE beef_interface, ONLY: beefx ! IMPLICIT NONE ! @@ -439,10 +448,11 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out ) ntids = omp_get_num_threads() #endif ! - sx_tot = 0.0_DP + !sx_tot = 0.0_DP ! #if defined(_OPENACC) -!$acc data copyin(rho_in, grho2_in), copyout(sx_tot, v1x_out, v2x_out) +!$acc data deviceptr( rho_in(length,2), grho2_in(length,2), sx_tot(length), & +!$acc& v1x_out(length,2), v2x_out(length,2) ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & @@ -824,6 +834,18 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out ) ! case igcx == 7 (meta-GGA) must be treated in a separate call to another ! routine: needs kinetic energy density in addition to rho and grad rho ! + CASE( 43 ) ! BEEX + ! + rho_up = 2.0_DP * rho_up ; rho_dw = 2.0_DP * rho_dw + grho2_up = 4.0_DP * grho2_up ; grho2_dw = 4.0_DP * grho2_dw + ! + CALL beefx( rho_up, grho2_up, sx_up, v1x_up, v2x_up, 0 ) + CALL beefx( rho_dw, grho2_dw, sx_dw, v1x_dw, v2x_dw, 0 ) + ! + sx_tot(ir) = 0.5_DP * (sx_up*rnull_up + sx_dw*rnull_dw) + v2x_up = 2.0_DP * v2x_up + v2x_dw = 2.0_DP * v2x_dw + ! CASE DEFAULT ! sx_tot(ir) = 0.0_DP @@ -858,6 +880,7 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out !! Implemented: Perdew86, GGA (PW91), PBE ! USE corr_gga + USE beef_interface, ONLY: beeflocalcorrspin ! IMPLICIT NONE ! @@ -891,7 +914,8 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out #endif ! #if defined(_OPENACC) -!$acc data copyin(rho_in, grho_in), copyout(sc_out, v1c_out, v2c_out), copy(zeta_io) +!$acc data deviceptr( rho_in(length), zeta_io(length), grho_in(length), & +!$acc& sc_out(length), v1c_out(length,2), v2c_out(length) ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & @@ -941,6 +965,10 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out ! CALL pbec_spin( rho, zeta, grho, 2, sc, v1c_up, v1c_dw, v2c ) ! + CASE( 14 ) + ! + CALL beeflocalcorrspin( rho, zeta, grho, sc, v1c_up, v1c_dw, v2c, 0 ) + ! CASE DEFAULT ! sc = 0.0_DP @@ -1015,13 +1043,9 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, & ntids = omp_get_num_threads() #endif ! - sc = 0.0_DP - v1c = 0.0_DP - v2c = 0.0_DP - v2c_ud = 0.0_DP - ! #if defined(_OPENACC) -!$acc data copyin(rho_in, grho_in, grho_ud_in), copyout(sc, v1c, v2c, v2c_ud) +!$acc data deviceptr( rho_in(length,2), grho_in(length,2), grho_ud_in(length), & +!$acc& sc(length), v1c(length,2), v2c(length,2), v2c_ud(length) ) !$acc parallel loop #else !$omp parallel if(ntids==1) default(none) & @@ -1080,7 +1104,7 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, & ! CASE DEFAULT ! - !CALL xclib_error(" gcc_spin_more "," gradient correction not implemented ",1) !***acc test + !CALL xclib_error(" gcc_spin_more "," gradient correction not implemented ",1) ! END SELECT ! @@ -1091,220 +1115,10 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, & !$omp end do !$omp end parallel #endif - ! RETURN ! END SUBROUTINE gcc_spin_more ! ! -! ========> BEEF GGA DRIVERS <======================== -! -!------------------------------------------------------------------------ -SUBROUTINE gcxc_beef( length, rho_in, grho_in, sx_out, sc_out, v1x_out, & - v2x_out, v1c_out, v2c_out ) - !--------------------------------------------------------------------- - !! Driver for BEEF gga xc terms. Unpolarized case. - ! - USE beef_interface, ONLY: beefx, beeflocalcorr - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: length - REAL(DP), INTENT(IN), DIMENSION(length) :: rho_in - REAL(DP), INTENT(IN), DIMENSION(length) :: grho_in - REAL(DP), INTENT(OUT), DIMENSION(length) :: sx_out, sc_out - REAL(DP), INTENT(OUT), DIMENSION(length) :: v1x_out, v2x_out - REAL(DP), INTENT(OUT), DIMENSION(length) :: v1c_out, v2c_out - ! - ! ... local variables - ! - INTEGER :: ir - REAL(DP) :: rho, grho - REAL(DP) :: sx, v1x, v2x - REAL(DP) :: sc, v1c, v2c - ! - IF (igcx == 43 .OR. igcc == 14) THEN - ! - DO ir = 1, length - grho = grho_in(ir) - IF ( rho_in(ir) <= rho_threshold_gga .OR. grho <= grho_threshold_gga ) THEN - sx_out(ir) = 0.0_DP ; sc_out(ir) = 0.0_DP - v1x_out(ir) = 0.0_DP ; v1c_out(ir) = 0.0_DP - v2x_out(ir) = 0.0_DP ; v2c_out(ir) = 0.0_DP - CYCLE - ENDIF - ! - rho = ABS(rho_in(ir)) - ! - IF ( igcx == 43 ) THEN - ! last parameter = 0 means do not add LDA (=Slater) exchange - ! (espresso) will add it itself - CALL beefx( rho, grho, sx, v1x, v2x, 0 ) - sx_out(ir) = sx - v1x_out(ir) = v1x - v2x_out(ir) = v2x - ENDIF - ! - IF ( igcc == 14 ) THEN - ! last parameter 0 means: do not add lda contributions - ! espresso will do that itself - CALL beeflocalcorr( rho, grho, sc, v1c, v2c, 0 ) - sc_out(ir) = sc - v1c_out(ir) = v1c - v2c_out(ir) = v2c - ENDIF - ENDDO - ! - ENDIF - ! - RETURN - ! -END SUBROUTINE gcxc_beef -! -!----------------------------------------------------------------------------- -SUBROUTINE gcx_spin_beef( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out ) - !-------------------------------------------------------------------------- - !! Driver for BEEF gga exchange term. Polarized case. - ! - USE beef_interface, ONLY: beefx - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: length - REAL(DP), INTENT(IN), DIMENSION(length,2) :: rho_in - REAL(DP), INTENT(IN), DIMENSION(length,2) :: grho2_in - REAL(DP), INTENT(OUT), DIMENSION(length) :: sx_tot - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1x_out - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v2x_out - ! - ! ... local variables - ! - INTEGER :: ir - REAL(DP) :: rho_up, rho_dw, grho2_up, grho2_dw - REAL(DP) :: v1x_up, v1x_dw, v2x_up, v2x_dw - REAL(DP) :: sx_up, sx_dw, rnull_up, rnull_dw - ! - REAL(DP), PARAMETER :: small=1.D-10 - REAL(DP), PARAMETER :: rho_trash=0.5_DP, grho2_trash=0.2_DP - - ! ... Temporary workaround for BEEF, which does not support GPU. - IF ( igcx == 43 ) THEN - ! - DO ir = 1, length - ! - rho_up = rho_in(ir,1) - rho_dw = rho_in(ir,2) - grho2_up = grho2_in(ir,1) - grho2_dw = grho2_in(ir,2) - rnull_up = 1.0_DP - rnull_dw = 1.0_DP - ! - IF ( rho_up+rho_dw <= small ) THEN - sx_tot(ir) = 0.0_DP - v1x_out(ir,1) = 0.0_DP - v2x_out(ir,1) = 0.0_DP - v1x_out(ir,2) = 0.0_DP - v2x_out(ir,2) = 0.0_DP - CYCLE - ELSE - IF ( rho_up<=small .OR. SQRT(ABS(grho2_up))<=small ) THEN - rho_up = rho_trash - grho2_up = grho2_trash - rnull_up = 0.0_DP - ENDIF - IF ( rho_dw<=small .OR. SQRT(ABS(grho2_dw))<=small ) THEN - rho_dw = rho_trash - grho2_dw = grho2_trash - rnull_dw = 0.0_DP - ENDIF - ENDIF - ! - rho_up = 2.0_DP * rho_up ; rho_dw = 2.0_DP * rho_dw - grho2_up = 4.0_DP * grho2_up ; grho2_dw = 4.0_DP * grho2_dw - ! - CALL beefx(rho_up, grho2_up, sx_up, v1x_up, v2x_up, 0) - CALL beefx(rho_dw, grho2_dw, sx_dw, v1x_dw, v2x_dw, 0) - ! - sx_tot(ir) = 0.5_DP * (sx_up*rnull_up + sx_dw*rnull_dw) - v2x_up = 2.0_DP * v2x_up - v2x_dw = 2.0_DP * v2x_dw - ! - v1x_out(ir,1) = v1x_up * rnull_up - v1x_out(ir,2) = v1x_dw * rnull_dw - v2x_out(ir,1) = v2x_up * rnull_up - v2x_out(ir,2) = v2x_dw * rnull_dw - ! - ENDDO - ! - ENDIF - ! - RETURN - ! -END SUBROUTINE gcx_spin_beef -! -! -!-------------------------------------------------------------------------------- -SUBROUTINE gcc_spin_beef( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out ) - !------------------------------------------------------------------------------- - !! BEEF Gradient correction. - ! - USE beef_interface, ONLY: beeflocalcorrspin - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: length - REAL(DP), INTENT(IN), DIMENSION(length) :: rho_in - REAL(DP), INTENT(INOUT), DIMENSION(length) :: zeta_io - REAL(DP), INTENT(IN), DIMENSION(length) :: grho_in - REAL(DP), INTENT(OUT), DIMENSION(length) :: sc_out - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1c_out - REAL(DP), INTENT(OUT), DIMENSION(length) :: v2c_out - ! - ! ... local variables - ! - INTEGER :: ir - REAL(DP) :: rho, zeta, grho - REAL(DP) :: sc, v1c_up, v1c_dw, v2c - ! - IF ( igcc == 14 ) THEN - ! - DO ir = 1, length - ! - rho = rho_in(ir) - grho = grho_in(ir) - IF ( ABS(zeta_io(ir))<=1.0_DP ) zeta_io(ir) = SIGN( MIN(ABS(zeta_io(ir)), & - (1.0_DP-rho_threshold_gga)), zeta_io(ir) ) - zeta = zeta_io(ir) - ! - IF ( ABS(zeta)>1.0_DP .OR. rho<=rho_threshold_gga .OR. & - SQRT(ABS(grho))<=rho_threshold_gga ) THEN - sc_out(ir) = 0.0_DP - v1c_out(ir,1) = 0.0_DP ; v2c_out(ir) = 0.0_DP - v1c_out(ir,2) = 0.0_DP - CYCLE - ENDIF - rho = rho_in(ir) - grho = grho_in(ir) - zeta = zeta_io(ir) - ! - IF ( ABS(zeta)>1.0_DP .OR. rho<=rho_threshold_gga .OR. & - SQRT(ABS(grho))<=rho_threshold_gga ) CYCLE - ! - CALL beeflocalcorrspin( rho, zeta, grho, sc, v1c_up, v1c_dw, v2c, 0 ) - ! - sc_out(ir) = sc - v1c_out(ir,1) = v1c_up - v1c_out(ir,2) = v1c_dw - v2c_out(ir) = v2c - ! - ENDDO - ! - ENDIF - ! - RETURN - ! -END SUBROUTINE gcc_spin_beef -! -! END MODULE qe_drivers_gga diff --git a/XClib/qe_drivers_lda_lsda.f90 b/XClib/qe_drivers_lda_lsda.f90 index 24f39d236..60a9e901c 100644 --- a/XClib/qe_drivers_lda_lsda.f90 +++ b/XClib/qe_drivers_lda_lsda.f90 @@ -90,10 +90,9 @@ SUBROUTINE xc_lda( length, rho_in, ex_out, ec_out, vx_out, vc_out ) ! ! #if defined(_OPENACC) -!$acc data copyin(rho_in), copyout(ex_out, vx_out, ec_out, vc_out) +!$acc data deviceptr( rho_in(length),ex_out(length),vx_out(length),ec_out(length),vc_out(length) ) !$acc parallel loop -#endif -#if defined(_OPENMP) && !defined(_OPENACC) +#else !$omp parallel if(ntids==1) default(none) & !$omp private( rho, rs, ex, ec, ec_, vx, vc, vc_ ) & !$omp shared( rho_in, length, iexch, icorr, ex_out, ec_out, vx_out, vc_out, & @@ -257,8 +256,7 @@ SUBROUTINE xc_lda( length, rho_in, ex_out, ec_out, vx_out, vc_out ) ENDDO #if defined(_OPENACC) !$acc end data -#endif -#if defined(_OPENMP) && !defined(_OPENACC) +#else !$omp end do !$omp end parallel #endif @@ -317,10 +315,9 @@ SUBROUTINE xc_lsda( length, rho_in, zeta_in, ex_out, ec_out, vx_out, vc_out ) #endif ! #if defined(_OPENACC) -!$acc data copyin(rho_in, zeta_in), copyout(ex_out, vx_out, ec_out, vc_out) +!$acc data deviceptr( rho_in(length), zeta_in(length) ,ex_out(length),vx_out(length,2),ec_out(length),vc_out(length,2) ) !$acc parallel loop -#endif -#if defined(_OPENMP) && !defined(_OPENACC) +#else !$omp parallel if(ntids==1) default(none) & !$omp private( rho, rs, zeta, ex, ec, ec_, vx_up, vx_dw, vc_up, & !$omp vc_dw, vc_up_, vc_dw_ ) & @@ -477,14 +474,12 @@ SUBROUTINE xc_lsda( length, rho_in, zeta_in, ex_out, ec_out, vx_out, vc_out ) ec_out(ir) = ec ; vc_out(ir,1) = vc_up ; vc_out(ir,2) = vc_dw ! ENDDO -#if defined(_OPENMP) && !defined(_OPENACC) +#if defined(_OPENACC) +!$acc end data +#else !$omp end do !$omp end parallel #endif -#if defined(_OPENACC) -!$acc end data -#endif - ! ! RETURN ! diff --git a/XClib/qe_drivers_mgga.f90 b/XClib/qe_drivers_mgga.f90 index cb63e559c..6f7c955a6 100644 --- a/XClib/qe_drivers_mgga.f90 +++ b/XClib/qe_drivers_mgga.f90 @@ -17,7 +17,6 @@ MODULE qe_drivers_mgga USE kind_l, ONLY: DP USE dft_setting_params, ONLY: imeta, imetac, rho_threshold_mgga, & grho2_threshold_mgga, tau_threshold_mgga - USE metagga ! IMPLICIT NONE ! @@ -37,31 +36,33 @@ SUBROUTINE tau_xc( length, rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c !! Available cases: M06L and TPSS. Other mGGA functionals can be used !! through Libxc. ! + USE metagga + ! IMPLICIT NONE ! INTEGER, INTENT(IN) :: length !! Number of k-points - REAL(DP), DIMENSION(length) :: rho + REAL(DP), INTENT(IN) :: rho(length) !! Charge density - REAL(DP), DIMENSION(length) :: grho2 + REAL(DP), INTENT(IN) :: grho2(length) !! Square modulus of the density gradient - REAL(DP), DIMENSION(length) :: tau + REAL(DP), INTENT(IN) :: tau(length) !! Laplacian of the density - REAL(DP), DIMENSION(length) :: ex + REAL(DP), INTENT(OUT) :: ex(length) !! \(E_x = \int e_x(\text{rho},\text{grho}) dr \) - REAL(DP), DIMENSION(length) :: ec + REAL(DP), INTENT(OUT) :: ec(length) !! \(E_x = \int e_x(\text{rho},\text{grho}) dr \) - REAL(DP), DIMENSION(length) :: v1x + REAL(DP), INTENT(OUT) :: v1x(length) !! \( D\ E_x\ /\ D\ \text{rho} \) - REAL(DP), DIMENSION(length) :: v2x + REAL(DP), INTENT(OUT) :: v2x(length) !! \( D\ E_x\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \) - REAL(DP), DIMENSION(length) :: v3x + REAL(DP), INTENT(OUT) :: v3x(length) !! \( D\ E_x\ /\ D\ \text{tau} \) - REAL(DP), DIMENSION(length) :: v1c + REAL(DP), INTENT(OUT) :: v1c(length) !! \( D\ E_c\ /\ D\ \text{rho} \) - REAL(DP), DIMENSION(length) :: v2c + REAL(DP), INTENT(OUT) :: v2c(1,length,1) !! \( D\ E_c\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \) - REAL(DP), DIMENSION(length) :: v3c + REAL(DP), INTENT(OUT) :: v3c(length) !! \( D\ E_c\ /\ D\ \text{tau} \) ! ! ... local variables @@ -69,15 +70,23 @@ SUBROUTINE tau_xc( length, rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c INTEGER :: k REAL(DP) :: arho ! - v1x=0.d0 ; v2x=0.d0 ; v3x=0.d0 ; ex=0.d0 - v1c=0.d0 ; v2c=0.d0 ; v3c=0.d0 ; ec=0.d0 +#if defined(_OPENACC) + !$acc data deviceptr( rho(length), grho2(length), tau(length), ex(length), ec(length), & + !$acc& v1x(length), v2x(length), v3x(length), v1c(length), v2c(1,length,1), & + !$acc& v3c(length) ) ! + !$acc parallel loop +#endif DO k = 1, length ! arho = ABS(rho(k)) ! IF ( (arho<=rho_threshold_mgga).OR.(grho2(k)<=grho2_threshold_mgga).OR. & - (ABS(tau(k))<=rho_threshold_mgga) ) CYCLE + (ABS(tau(k))<=rho_threshold_mgga) ) THEN + v1x(k)=0.d0 ; v2x(k)=0.d0 ; v3x(k)=0.d0 ; ex(k)=0.d0 + v1c(k)=0.d0 ; v2c(1,k,1)=0.d0 ; v3c(k)=0.d0 ; ec(k)=0.d0 + CYCLE + ENDIF ! ! ...libxc-like threshold management !grho2(k) = MIN( grho2(k), (8.d0*rho(k)*tau(k))**2 ) @@ -86,21 +95,25 @@ SUBROUTINE tau_xc( length, rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c CASE( 1 ) ! CALL tpsscxc( arho, grho2(k), tau(k), ex(k), ec(k), v1x(k), v2x(k), & - v3x(k), v1c(k), v2c(k), v3c(k) ) + v3x(k), v1c(k), v2c(1,k,1), v3c(k) ) ! CASE( 2 ) ! CALL m06lxc( arho, grho2(k), tau(k), ex(k), ec(k), v1x(k), v2x(k), & - v3x(k), v1c(k), v2c(k), v3c(k) ) + v3x(k), v1c(k), v2c(1,k,1), v3c(k) ) ! CASE DEFAULT ! - CALL xclib_error( 'tau_xc', 'This case is not implemented', imeta ) + !CALL xclib_error( 'tau_xc', 'This case is not implemented', imeta ) ! END SELECT ! ENDDO ! +#if defined(_OPENACC) + !$acc end data +#endif + ! RETURN ! END SUBROUTINE tau_xc @@ -114,65 +127,74 @@ SUBROUTINE tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, !! Available cases: M06L and TPSS. Other mGGA functionals can be used !! through Libxc. ! + USE metagga + ! IMPLICIT NONE ! INTEGER, INTENT(IN) :: length !! Number of k-points - REAL(DP), INTENT(IN), DIMENSION(length,2) :: rho + REAL(DP), INTENT(IN) :: rho(length,2) !! Charge density - REAL(DP), INTENT(IN), DIMENSION(3,length,2) :: grho + REAL(DP), INTENT(IN) :: grho(3,length,2) !! The density gradient - REAL(DP), INTENT(IN), DIMENSION(length,2) :: tau + REAL(DP), INTENT(IN) :: tau(length,2) !! Laplacian of the density - REAL(DP), INTENT(OUT), DIMENSION(length) :: ex + REAL(DP), INTENT(OUT) :: ex(length) !! \(E_x = \int e_x(\text{rho},\text{grho}) dr \) - REAL(DP), INTENT(OUT), DIMENSION(length) :: ec + REAL(DP), INTENT(OUT) :: ec(length) !! \(E_x = \int e_x(\text{rho},\text{grho}) dr \) - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1x + REAL(DP), INTENT(OUT) :: v1x(length,2) !! \( D\ E_x\ /\ D\ \text{rho} \) - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v2x + REAL(DP), INTENT(OUT) :: v2x(length,2) !! \( D\ E_x\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \) - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v3x + REAL(DP), INTENT(OUT) :: v3x(length,2) !! \( D\ E_x\ /\ D\ \text{tau} \) - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1c + REAL(DP), INTENT(OUT) :: v1c(length,2) !! \( D\ E_c\ /\ D\ \text{rho} \) - REAL(DP), INTENT(OUT), DIMENSION(3,length,2) :: v2c + REAL(DP), INTENT(OUT) :: v2c(3,length,2) !! \( D\ E_c\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \) - REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v3c + REAL(DP), INTENT(OUT) :: v3c(length,2) !! \( D\ E_c\ /\ D\ \text{tau} \) ! ! ... local variables ! INTEGER :: k - REAL(DP) :: rh, zeta, atau, grho2(2), ggrho2 + REAL(DP) :: rh, zeta, atau, grho2up, grho2dw, ggrho2 REAL(DP) :: v2cup, v2cdw ! - ex=0.0_DP ; v1x=0.0_DP ; v2x=0.0_DP ; v3x=0.0_DP - ec=0.0_DP ; v1c=0.0_DP ; v2c=0.0_DP ; v3c=0.0_DP +#if defined(_OPENACC) + !$acc data deviceptr( rho(length,2), grho(3,length,2), tau(length,2), ex(length), & + !$acc& ec(length), v1x(length,2), v2x(length,2), v3x(length,2), & + !$acc& v1c(length,2), v2c(3,length,2), v3c(length,2) ) ! + !$acc parallel loop +#endif DO k = 1, length ! rh = rho(k,1) + rho(k,2) atau = tau(k,1) + tau(k,2) ! KE-density in Hartree ! ...libxc-like threshold management - !grho2(1) = MIN( SUM(grho(:,k,1)**2), (8.d0*rho(k,1)*tau(k,1))**2 ) - !grho2(2) = MIN( SUM(grho(:,k,2)**2), (8.d0*rho(k,2)*tau(k,2))**2 ) - grho2(1) = SUM(grho(:,k,1)**2) - grho2(2) = SUM(grho(:,k,2)**2) - ggrho2 = ( grho2(1) + grho2(2) ) * 4.0_DP + !grho2up = MIN( SUM(grho(:,k,1)**2), (8.d0*rho(k,1)*tau(k,1))**2 ) + !grho2dw = MIN( SUM(grho(:,k,2)**2), (8.d0*rho(k,2)*tau(k,2))**2 ) + grho2up = SUM(grho(:,k,1)**2) + grho2dw = SUM(grho(:,k,2)**2) + ggrho2 = ( grho2up + grho2dw ) * 4.0_DP ! IF ( (rh <= rho_threshold_mgga).OR.(ggrho2 <= grho2_threshold_mgga).OR.& - (ABS(atau) <= tau_threshold_mgga) ) CYCLE + (ABS(atau) <= tau_threshold_mgga) ) THEN + v1x(k,:)=0.d0 ; v2x(k,:)=0.d0 ; v3x(k,:)=0.d0 ; ex(k)=0.d0 + v1c(k,:)=0.d0 ; v2c(:,k,:)=0.d0 ; v3c(k,:)=0.d0 ; ec(k)=0.d0 + CYCLE + ENDIF ! SELECT CASE( imeta ) CASE( 1 ) ! - CALL tpsscx_spin( rho(k,1), rho(k,2), grho2(1), grho2(2), tau(k,1), & + CALL tpsscx_spin( rho(k,1), rho(k,2), grho2up, grho2dw, tau(k,1), & tau(k,2), ex(k), v1x(k,1), v1x(k,2), v2x(k,1), & v2x(k,2), v3x(k,1), v3x(k,2) ) ! - zeta = (rho(k,1) - rho(k,2)) / rh - zeta = MAX( MIN( 0.99999999_DP, zeta ), -0.99999999_DP ) + zeta = MAX( MIN( 0.99999999_DP, (rho(k,1)-rho(k,2))/rh ), -0.99999999_DP ) ! CALL tpsscc_spin( rh, zeta, grho(:,k,1), grho(:,k,2), atau, ec(k), & v1c(k,1), v1c(k,2), v2c(:,k,1), v2c(:,k,2), & @@ -180,7 +202,7 @@ SUBROUTINE tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, ! CASE( 2 ) ! - CALL m06lxc_spin( rho(k,1), rho(k,2), grho2(1), grho2(2), tau(k,1), & + CALL m06lxc_spin( rho(k,1), rho(k,2), grho2up, grho2dw, tau(k,1), & tau(k,2), ex(k), ec(k), v1x(k,1), v1x(k,2), & v2x(k,1), v2x(k,2), v3x(k,1), v3x(k,2), v1c(k,1), & v1c(k,2), v2cup, v2cdw, v3c(k,1), v3c(k,2) ) @@ -190,12 +212,16 @@ SUBROUTINE tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, ! CASE DEFAULT ! - CALL xclib_error( 'tau_xc_spin', 'This case not implemented', imeta ) + !CALL xclib_error( 'tau_xc_spin', 'This case not implemented', imeta ) ! END SELECT ! ENDDO ! +#if defined(_OPENACC) + !$acc end data +#endif + ! RETURN ! END SUBROUTINE tau_xc_spin diff --git a/XClib/qe_funct_mgga.f90 b/XClib/qe_funct_mgga.f90 index 79bf5d20e..d13e96481 100644 --- a/XClib/qe_funct_mgga.f90 +++ b/XClib/qe_funct_mgga.f90 @@ -6,22 +6,23 @@ ! or http://www.gnu.org/copyleft/gpl.txt . ! !------------------------------------------------------------------------ -MODULE metagga !metagga_gpu> +MODULE metagga !------------------------------------------------------------------------- !! MetaGGA functionals. Available functionals: ! !! * TPSS (Tao, Perdew, Staroverov & Scuseria) !! * M06L ! -USE exch_lda, ONLY: slater !slater_d,exch_lda=>exch_lda_gpu> -USE corr_lda, ONLY: pw, pw_spin !pw_d,pw_spin=>pw_spin_d,corr_lda=>corr_lda_gpu> -USE corr_gga, ONLY: pbec, pbec_spin !pbec_d,pbec_spin=>pbec_spin_d,corr_gga=>corr_gga_gpu> +USE exch_lda, ONLY: slater +USE corr_lda, ONLY: pw, pw_spin +USE corr_gga, ONLY: pbec, pbec_spin ! CONTAINS ! TPSS ! !------------------------------------------------------------------------- -SUBROUTINE tpsscxc( rho, grho, tau, sx, sc, v1x, v2x, v3x, v1c, v2c, v3c ) ! +SUBROUTINE tpsscxc( rho, grho, tau, sx, sc, v1x, v2x, v3x, v1c, v2c, v3c ) +!$acc routine seq !----------------------------------------------------------------------- !! TPSS metaGGA corrections for exchange and correlation - Hartree a.u. !! Definition: \(E_x = \int E_x(\text{rho},\text{grho}) dr\) @@ -70,9 +71,9 @@ SUBROUTINE tpsscxc( rho, grho, tau, sx, sc, v1x, v2x, v3x, v1c, v2c, v3c ) ENDIF ! ! exchange - CALL metax( rho, grho, tau, sx, v1x, v2x, v3x ) !metax_d> + CALL metax( rho, grho, tau, sx, v1x, v2x, v3x ) ! correlation - CALL metac( rho, grho, tau, sc, v1c, v2c, v3c ) !metac_d> + CALL metac( rho, grho, tau, sc, v1c, v2c, v3c ) ! RETURN ! @@ -80,7 +81,8 @@ END SUBROUTINE tpsscxc ! ! !------------------------------------------------------------------------- -SUBROUTINE metax( rho, grho2, tau, ex, v1x, v2x, v3x ) ! +SUBROUTINE metax( rho, grho2, tau, ex, v1x, v2x, v3x ) +!$acc routine seq !-------------------------------------------------------------------- !! TPSS meta-GGA exchange potential and energy. ! @@ -133,8 +135,8 @@ SUBROUTINE metax( rho, grho2, tau, ex, v1x, v2x, v3x ) !slater_d> - CALL metaFX( rho, grho2, tau, fx, f1x, f2x, f3x ) !metaFX_d> + CALL slater( rs, ex_unif, vx_unif ) + CALL metaFX( rho, grho2, tau, fx, f1x, f2x, f3x ) ! ex = rho*ex_unif v1x = vx_unif*fx + ex*f1x @@ -148,7 +150,8 @@ END SUBROUTINE metax ! ! !------------------------------------------------------------------ -SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) ! +SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) +!$acc routine seq !-------------------------------------------------------------- !! TPSS meta-GGA correlation energy and potentials. ! @@ -208,11 +211,11 @@ SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) ! small) THEN ! rs = (pi34/rhoup)**third - CALL pw_spin( rs, 1.0_DP, ec_unif, vc_unif_s(1), vc_unif_s(2) ) !pw_spin_d> + CALL pw_spin( rs, 1.0_DP, ec_unif, vc_unif_s(1), vc_unif_s(2) ) ! IF (ABS(grhoup) > small) THEN ! zeta=1.0_DP-small to avoid pow_e of 0 in pbec_spin - CALL pbec_spin( rhoup, 1.0_DP-small, grhoup**2, 1, & !pbec_spin_d> + CALL pbec_spin( rhoup, 1.0_DP-small, grhoup**2, 1, & ec_sum, v1c_sum(1), v1c_sum(2), v2c_sum ) ELSE ec_sum = 0.0_DP @@ -229,13 +232,13 @@ SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) !pw_d> + CALL pw( rs, 1, ec_unif, vc_unif ) ! ! ... PBE correlation energy and potential: ! ec_pbe=rho*H, not rho*(epsion_c_uinf + H) ! v1c_pbe=D (rho*H) /D rho ! v2c_pbe= for rho, 2 for - CALL pbec( rho, grho2, 1, ec_pbe, v1c_pbe, v2c_pbe ) !pbec_d> + CALL pbec( rho, grho2, 1, ec_pbe, v1c_pbe, v2c_pbe ) ! ec_pbe = ec_pbe/rho+ec_unif v1c_pbe = (v1c_pbe+vc_unif-ec_pbe)/rho @@ -275,7 +278,8 @@ END SUBROUTINE metac ! ! !------------------------------------------------------------------------- -SUBROUTINE metaFX( rho, grho2, tau, fx, f1x, f2x, f3x ) ! +SUBROUTINE metaFX( rho, grho2, tau, fx, f1x, f2x, f3x ) +!$acc routine seq !------------------------------------------------------------------------- ! USE kind_l, ONLY : DP @@ -397,8 +401,9 @@ END SUBROUTINE metaFX ! ! !--------------------------------------------------------------------------- -SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, & ! +SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, & v1xup, v1xdw, v2xup, v2xdw, v3xup, v3xdw ) +!$acc routine seq !----------------------------------------------------------------------- !! TPSS metaGGA for exchange - Hartree a.u. ! @@ -442,7 +447,7 @@ SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, & rho = rhoup + rhodw IF (rhoup>small .AND. SQRT(ABS(grhoup2))>small & .AND. ABS(tauup) > small) THEN - CALL metax( 2.0_DP*rhoup, 4.0_DP*grhoup2, & !metax_d> + CALL metax( 2.0_DP*rhoup, 4.0_DP*grhoup2, & 2.0_DP*tauup, sxup, v1xup, v2xup, v3xup ) ELSE sxup = 0.0_DP @@ -453,7 +458,7 @@ SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, & ! IF (rhodw > small .AND. SQRT(ABS(grhodw2)) > small & .AND. ABS(taudw) > small) THEN - CALL metax( 2.0_DP*rhodw, 4.0_DP*grhodw2, & !metax_d> + CALL metax( 2.0_DP*rhodw, 4.0_DP*grhodw2, & 2.0_DP*taudw, sxdw, v1xdw, v2xdw, v3xdw ) ELSE sxdw = 0.0_DP @@ -472,8 +477,9 @@ END SUBROUTINE tpsscx_spin ! ! !--------------------------------------------------------------------------- -SUBROUTINE tpsscc_spin( rho, zeta, grhoup, grhodw, & ! +SUBROUTINE tpsscc_spin( rho, zeta, grhoup, grhodw, & atau, sc, v1cup, v1cdw, v2cup, v2cdw, v3cup, v3cdw ) +!$acc routine seq !-------------------------------------------------------------------------- !! TPSS metaGGA for correlations - Hartree a.u. ! @@ -536,7 +542,7 @@ SUBROUTINE tpsscc_spin( rho, zeta, grhoup, grhodw, & ! ! v3c = 0.0_DP ELSE - CALL metac_spin( rho, zeta, grhoup, grhodw, & !metac_spin_d> + CALL metac_spin( rho, zeta, grhoup, grhodw, & atau, sc, v1cup, v1cdw, v2cup, v2cdw, v3c ) ENDIF ! @@ -549,8 +555,9 @@ END SUBROUTINE tpsscc_spin ! ! !--------------------------------------------------------------- -SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & ! +SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & tau, sc, v1up, v1dw, v2up, v2dw, v3 ) +!$acc routine seq !--------------------------------------------------------------- USE kind_l, ONLY : DP ! @@ -634,10 +641,10 @@ SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & ! ! v2_tmp = 0.0_DP rs = (pi34/rho)**third - CALL pw_spin( rs, zeta, ec_u, vc_u(1), vc_u(2) ) !pw_spin_d> + CALL pw_spin( rs, zeta, ec_u, vc_u(1), vc_u(2) ) ! IF ((ABS(grho) > small) .AND. (zeta <= 1.0_DP)) THEN - CALL pbec_spin( rho, zeta, grho2, 1, ec_pbe, v1_pbe(1), v1_pbe(2), v2_tmp ) !pbec_spin_d> + CALL pbec_spin( rho, zeta, grho2, 1, ec_pbe, v1_pbe(1), v1_pbe(2), v2_tmp ) v1up_pbe=v1_pbe(1) v1dw_pbe=v1_pbe(2) ELSE @@ -668,10 +675,10 @@ SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & ! v2_tmp = 0.0_DP ! rs = (pi34/rhoup)**third - CALL pw_spin( rs, 1.0_DP, ec_u, vc_u(1), vc_u(2) ) !pw_spin_d> + CALL pw_spin( rs, 1.0_DP, ec_u, vc_u(1), vc_u(2) ) ! IF (SQRT(grhoup2) > small) THEN - CALL pbec_spin( rhoup, 1.0_DP-small, grhoup2, 1, & !pbec_spin_d> + CALL pbec_spin( rhoup, 1.0_DP-small, grhoup2, 1, & ecup_0, v1_0v(1), v1_0v(2), v2_tmp ) v1up_0 = v1_0v(1) v1dw_0 = v1_0v(2) @@ -709,11 +716,11 @@ SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & ! v2_tmp = 0.0_DP ! rs = (pi34/rhodw)**third - CALL pw_spin( rs, -1.0_DP, ec_u, vc_u(1), vc_u(2) ) !pw_spin_d> + CALL pw_spin( rs, -1.0_DP, ec_u, vc_u(1), vc_u(2) ) ! IF (SQRT(grhodw2) > small) THEN ! - CALL pbec_spin( rhodw, -1.0_DP+small, grhodw2, 1, & !pbec_spin_d> + CALL pbec_spin( rhodw, -1.0_DP+small, grhodw2, 1, & ecdw_0, v1_0v(1), v1_0v(2), v2_tmp ) ! v1up_0 = v1_0v(1) @@ -877,7 +884,8 @@ END SUBROUTINE metac_spin ! ec, v1c, v2c, v3c as above for correlation ! !------------------------------------------------------------------------- -SUBROUTINE m06lxc( rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) ! +SUBROUTINE m06lxc( rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) +!$acc routine seq !----------------------------------------------------------------------- ! USE kind_l, ONLY : DP @@ -923,14 +931,14 @@ SUBROUTINE m06lxc( rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) taub = taua ! Tau is defined as summ_i( |nabla phi_i|**2 ) ! in the following M06L routines ! - CALL m06lx( rhoa, grho2a, taua, ex, v1x, v2x, v3x ) !m06lx_d> + CALL m06lx( rhoa, grho2a, taua, ex, v1x, v2x, v3x ) ! ex = two * ex ! Add the two components up + dw ! v2x = 0.5_dp * v2x !v3x = 2.0_dp * v3x !**mismatch with Libxc by a factor of 2 ! - CALL m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c, v2c, v3c, & !m06lc_d> + CALL m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c, v2c, v3c, & v1cb, v2cb, v3cb ) ! v2c = 0.5_dp * v2c @@ -940,9 +948,10 @@ END SUBROUTINE m06lxc ! ! !------------------------------------------------------------------------- -SUBROUTINE m06lxc_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, & ! +SUBROUTINE m06lxc_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, & ex, ec, v1xup, v1xdw, v2xup, v2xdw, v3xup, v3xdw, & v1cup, v1cdw, v2cup, v2cdw, v3cup, v3cdw ) +!$acc routine seq !----------------------------------------------------------------------- ! USE kind_l, ONLY : DP @@ -959,14 +968,14 @@ SUBROUTINE m06lxc_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, & taua = tauup * two ! Tau is defined as summ_i( |nabla phi_i|**2 ) taub = taudw * two ! in the rest of the routine ! - CALL m06lx( rhoup, grhoup2, taua, exup, v1xup, v2xup, v3xup ) !m06lx_d> - CALL m06lx( rhodw, grhodw2, taub, exdw, v1xdw, v2xdw, v3xdw ) !m06lx_d> + CALL m06lx( rhoup, grhoup2, taua, exup, v1xup, v2xup, v3xup ) + CALL m06lx( rhodw, grhodw2, taub, exdw, v1xdw, v2xdw, v3xdw ) ! ex = exup + exdw !v3xup = 2.0_dp * v3xup !**mismatch with Libxc by a factor of 2 !v3xdw = 2.0_dp * v3xdw !** ! - CALL m06lc( rhoup, rhodw, grhoup2, grhodw2, taua, taub, & !m06lc_d> + CALL m06lc( rhoup, rhodw, grhoup2, grhodw2, taua, taub, & ec, v1cup, v2cup, v3cup, v1cdw, v2cdw, v3cdw ) !v3cup = 2.0_dp * v3cup !** !v3cdw = 2.0_dp * v3cdw !** @@ -977,7 +986,8 @@ END SUBROUTINE m06lxc_spin !=============================== M06L exchange ========================== ! !------------------------------------------------------------------------------- -SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) ! +SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) +!$acc routine seq !--------------------------------------------------------------------------- !! M06L exchange. ! @@ -1082,7 +1092,7 @@ SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) ! gh = one + alpha * (xs2 + zs) ! IF (gh >= small) THEN - CALL gvt4( xs2, zs, d0, d1, d2, d3, d4, d5, alpha, hg, dhg_dxs2, dhg_dzs ) !gvt4_d> + CALL gvt4( xs2, zs, d0, d1, d2, d3, d4, d5, alpha, hg, dhg_dxs2, dhg_dzs ) ELSE hg = zero dhg_dxs2 = zero @@ -1121,7 +1131,7 @@ SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) ! dfws_drho = dfws*dws_dts*dts_drho dfws_dtau = dfws*dws_dts*dts_dtau ! - CALL pbex_m06l( two*rho, four*grho2, sx_pbe, v1x_pbe, v2x_pbe ) !pbex_m06l_d> + CALL pbex_m06l( two*rho, four*grho2, sx_pbe, v1x_pbe, v2x_pbe ) ! v1x_unif = f43 * CX * rho13 ! @@ -1143,7 +1153,8 @@ END SUBROUTINE m06lx ! ! !------------------------------------------------------------------- -SUBROUTINE pbex_m06l( rho, grho2, sx, v1x, v2x ) ! +SUBROUTINE pbex_m06l( rho, grho2, sx, v1x, v2x ) +!$acc routine seq !--------------------------------------------------------------- !! PBE exchange (without Slater exchange): !! J.P.Perdew, K.Burke, M.Ernzerhof, PRL 77, 3865 (1996). @@ -1211,8 +1222,9 @@ END SUBROUTINE pbex_m06l !=============================== M06L correlation ========================== ! !--------------------------------------------------------------------------------- -SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, & ! +SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, & v3c_up, v1c_dw, v2c_dw, v3c_dw ) +!$acc routine seq !------------------------------------------------------------------------------- !! M06L correlation. ! @@ -1366,12 +1378,12 @@ SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, & ! rs = rsa zeta = one - CALL pw_spin( rs, zeta, ec_pw_a, vc_v(1), vc_v(2) ) !pw_spin_d> + CALL pw_spin( rs, zeta, ec_pw_a, vc_v(1), vc_v(2) ) vc_pw_a = vc_v(1) vv = vc_v(2) ! - CALL gvt4( xs2a, zsa, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hga, dhga_dxs2a, dhga_dzsa ) !gvt4_d> - CALL gfunc( cs, gama_s, xs2a, gsa, dgsa_dxs2a ) !gfunc_d> + CALL gvt4( xs2a, zsa, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hga, dhga_dxs2a, dhga_dzsa ) + CALL gfunc( cs, gama_s, xs2a, gsa, dgsa_dxs2a ) ! Ec_UEG_aa = rhoa*ec_pw_a num = (dgsa_dxs2a + dhga_dxs2a)*Dsa + (gsa + hga)*dDsa_dxs2a @@ -1426,12 +1438,12 @@ SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, & ! zeta = one rs = rsb - CALL pw_spin( rs, zeta, ec_pw_b, vc_v(1), vc_v(2) ) !pw_spin_d> + CALL pw_spin( rs, zeta, ec_pw_b, vc_v(1), vc_v(2) ) vc_pw_b = vc_v(1) vv = vc_v(2) ! - CALL gvt4( xs2b, zsb, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hgb, dhgb_dxs2b, dhgb_dzsb ) !gvt4_d> - CALL gfunc( cs, gama_s, xs2b, gsb, dgsb_dxs2b ) !gfunc_d> + CALL gvt4( xs2b, zsb, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hgb, dhgb_dxs2b, dhgb_dzsb ) + CALL gfunc( cs, gama_s, xs2b, gsb, dgsb_dxs2b ) ! Ec_UEG_bb = rhob*ec_pw_b num = (dgsb_dxs2b + dhgb_dxs2b)*Dsb + (gsb + hgb)*dDsb_dxs2b @@ -1468,13 +1480,13 @@ SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, & zeta = (rhoa - rhob)/rho rs = (pi34/rho)**f13 ! - CALL gvt4( xs2ab, zsab, dab0, dab1, dab2, dab3, dab4, dab5, alpha_ab, hgab, & !gvt4_d> + CALL gvt4( xs2ab, zsab, dab0, dab1, dab2, dab3, dab4, dab5, alpha_ab, hgab, & dhgab_dxs2ab, dhgab_dzsab ) ! - CALL pw_spin( rs, zeta, ec_pw_ab, vc_v(1), vc_v(2) ) !pw_spin_d> + CALL pw_spin( rs, zeta, ec_pw_ab, vc_v(1), vc_v(2) ) vc_pw_up=vc_v(1) ; vc_pw_dw=vc_v(2) ! - CALL gfunc( cab, gama_ab, xs2ab, gsab, dgsab_dxs2ab ) !gfunc_d> + CALL gfunc( cab, gama_ab, xs2ab, gsab, dgsab_dxs2ab ) ! decab_drhoa = vc_pw_up - vc_pw_a decab_drhob = vc_pw_dw - vc_pw_b @@ -1515,7 +1527,8 @@ END SUBROUTINE m06lc ! ! !------------------------------------------------------------------- -SUBROUTINE gfunc( cspin, gama, xspin, gs, dgs_dx ) ! +SUBROUTINE gfunc( cspin, gama, xspin, gs, dgs_dx ) +!$acc routine seq !----------------------------------------------------------------- ! USE kind_l, ONLY : DP @@ -1545,7 +1558,8 @@ SUBROUTINE gfunc( cspin, gama, xspin, gs, dgs_dx ) ! ! ! !------------------------------------------------------------------------- -SUBROUTINE gvt4( x, z, a, b, c, d, e, f, alpha, hg, dh_dx, dh_dz ) ! +SUBROUTINE gvt4( x, z, a, b, c, d, e, f, alpha, hg, dh_dx, dh_dz ) +!$acc routine seq !---------------------------------------------------------------------- ! USE kind_l, ONLY : DP diff --git a/XClib/xc_beef_interface.f90 b/XClib/xc_beef_interface.f90 index b5ac12cda..9555abf0d 100644 --- a/XClib/xc_beef_interface.f90 +++ b/XClib/xc_beef_interface.f90 @@ -22,20 +22,21 @@ MODULE beef_interface INTERFACE ! SUBROUTINE beefx( r, g, e, dr, dg, addlda ) BIND(C, NAME="beefx_") + !$acc routine seq USE iso_c_binding REAL (C_DOUBLE) :: r, g, e, dr, dg INTEGER(C_INT), INTENT(IN) :: addlda END SUBROUTINE beefx ! - SUBROUTINE beeflocalcorr( r, g, e, dr, dg, addlda) & - BIND(C, NAME="beeflocalcorr_") + SUBROUTINE beeflocalcorr( r, g, e, dr, dg, addlda) BIND(C, NAME="beeflocalcorr_") + !$acc routine seq USE iso_c_binding REAL (C_DOUBLE), INTENT(INOUT) :: r, g, e, dr, dg INTEGER(C_INT), INTENT(IN) :: addlda END SUBROUTINE beeflocalcorr ! - SUBROUTINE beeflocalcorrspin(r, z, g, e, drup, drdown, dg, addlda) & - BIND(C, NAME="beeflocalcorrspin_") + SUBROUTINE beeflocalcorrspin(r, z, g, e, drup, drdown, dg, addlda) BIND(C, NAME="beeflocalcorrspin_") + !$acc routine seq USE iso_c_binding REAL (C_DOUBLE), INTENT(INOUT) :: r, z, g, e, drup, drdown, dg INTEGER(C_INT), INTENT(IN) :: addlda diff --git a/XClib/xc_lib.f90 b/XClib/xc_lib.f90 index c6bb3ab5d..e0116ea79 100644 --- a/XClib/xc_lib.f90 +++ b/XClib/xc_lib.f90 @@ -14,16 +14,16 @@ MODULE xc_lib ! IMPLICIT NONE ! - SAVE - ! + SAVE + ! PRIVATE ! ! - !PUBLIC :: xc, dmxc !LDA + PUBLIC :: xc !, dmxc !LDA ! PUBLIC :: xc_gcx !, dgcxc !GGA ! - !PUBLIC :: xc_metagcx !MGGA + PUBLIC :: xc_metagcx !MGGA ! PUBLIC :: xclib_set_dft_from_name, & xclib_set_dft_IDs, & @@ -56,17 +56,46 @@ MODULE xc_lib xclib_reset_dft, & dft_force_hybrid ! + INTERFACE xc + SUBROUTINE xc( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out, gpu_args_ ) + USE kind_l, ONLY: DP + IMPLICIT NONE + INTEGER, INTENT(IN) :: length, srd, svd + REAL(DP), INTENT(IN) :: rho_in(length,srd) + REAL(DP), INTENT(OUT) :: ex_out(length), ec_out(length) + REAL(DP), INTENT(OUT) :: vx_out(length,svd), vc_out(length,svd) + LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_ + END SUBROUTINE + END INTERFACE + ! ! INTERFACE xc_gcx - SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) - USE kind_l, ONLY: DP + SUBROUTINE xc_gcx( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud, & + gpu_args_ ) + USE kind_l, ONLY: DP IMPLICIT NONE INTEGER, INTENT(IN) :: length, ns - REAL(DP), INTENT(IN) :: rho(:,:), grho(:,:,:) - REAL(DP), INTENT(OUT) :: ex(:), ec(:) - REAL(DP), INTENT(OUT) :: v1x(:,:), v2x(:,:) - REAL(DP), INTENT(OUT) :: v1c(:,:), v2c(:,:) - REAL(DP), OPTIONAL, INTENT(OUT) :: v2c_ud(:) + REAL(DP), INTENT(IN) :: rho(length,ns), grho(3,length,ns) + REAL(DP), INTENT(OUT) :: ex(length), ec(length) + REAL(DP), INTENT(OUT) :: v1x(length,ns), v2x(length,ns) + REAL(DP), INTENT(OUT) :: v1c(length,ns), v2c(length,ns) + REAL(DP), OPTIONAL, INTENT(OUT) :: v2c_ud(length) + LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_ + END SUBROUTINE + END INTERFACE + ! + ! + INTERFACE xc_metagcx + SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, & + v1c, v2c, v3c, gpu_args_ ) + USE kind_l, ONLY: DP + IMPLICIT NONE + INTEGER, INTENT(IN) :: length, ns, np + REAL(DP), INTENT(IN) :: rho(length,ns), grho(3,length,ns), tau(length,ns) + REAL(DP), INTENT(OUT) :: ex(length), ec(length) + REAL(DP), INTENT(OUT) :: v1x(length,ns), v2x(length,ns), v3x(length,ns) + REAL(DP), INTENT(OUT) :: v1c(length,ns), v2c(np,length,ns), v3c(length,ns) + LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_ END SUBROUTINE END INTERFACE ! diff --git a/XClib/xc_wrapper_d_gga.f90 b/XClib/xc_wrapper_d_gga.f90 index 0938884c2..4c12eb5bc 100644 --- a/XClib/xc_wrapper_d_gga.f90 +++ b/XClib/xc_wrapper_d_gga.f90 @@ -42,13 +42,13 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ! ... local variables ! REAL(DP), ALLOCATABLE :: vrrx(:,:), vsrx(:,:), vssx(:,:) - REAL(DP), ALLOCATABLE :: vrrc(:,:), vsrc(:,:), vssc(:), vrzc(:,:) + REAL(DP), ALLOCATABLE :: vrrc(:,:), vsrc(:,:), vssc(:), vrzc(:,:) ! #if defined(__LIBXC) INTEGER :: fkind=-10 REAL(DP), ALLOCATABLE :: rho_lxc(:) - REAL(DP), ALLOCATABLE :: v2rho2_x(:), v2rhosigma_x(:), v2sigma2_x(:) - REAL(DP), ALLOCATABLE :: v2rho2_c(:), v2rhosigma_c(:), v2sigma2_c(:) + REAL(DP), ALLOCATABLE :: v2rho2_x(:), v2rhosigma_x(:), v2sigma2_x(:) + REAL(DP), ALLOCATABLE :: v2rho2_c(:), v2rhosigma_c(:), v2sigma2_c(:) #if (XC_MAJOR_VERSION > 4) INTEGER(8) :: lengthxc #else @@ -83,7 +83,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) CASE( 1 ) ! DO k = 1, length - rho_lxc(k) = r_in(k,1) + rho_lxc(k) = r_in(k,1) sigma(k) = g_in(k,1,1)**2 + g_in(k,2,1)**2 + g_in(k,3,1)**2 ENDDO ! @@ -130,7 +130,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) v2sigma2_c(length_dlxc*sp) ) ! ... DERIVATIVE FOR CORRELATION v2rho2_c = 0._DP ; v2rhosigma_c = 0._DP ; v2sigma2_c = 0._DP - IF (igcc /= 0) THEN + IF (igcc /= 0) THEN fkind = xc_f03_func_info_get_kind( xc_info(4) ) CALL xc_f03_func_set_dens_threshold( xc_func(4), epsr ) CALL xc_f03_gga_fxc( xc_func(4), lengthxc, rho_lxc(1), sigma(1), v2rho2_c(1), & @@ -153,7 +153,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) IF (.NOT. ALLOCATED(sigma)) THEN ALLOCATE( sigma(length) ) sigma(:) = g_in(:,1,1)**2 + g_in(:,2,1)**2 + g_in(:,3,1)**2 - ENDIF + ENDIF ! CALL dgcxc_unpol( length, r_in(:,1), sigma, vrrx(:,1), vsrx(:,1), vssx(:,1), & vrrc(:,1), vsrc(:,1), vssc ) @@ -176,12 +176,12 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) dvxc_rr(k,1,1) = e2 * (vrrx(k,1) + vrrc(k,1)) dvxc_sr(k,1,1) = e2 * (vsrx(k,1) + vsrc(k,1)) dvxc_ss(k,1,1) = e2 * (vssx(k,1) + vssc(k) ) - ENDDO + ENDDO ! DEALLOCATE( vrrx, vsrx, vssx ) DEALLOCATE( vrrc, vsrc, vssc ) ! - ENDIF + ENDIF ! IF ( is_libxc(3) ) THEN DO k = 1, length @@ -189,7 +189,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) IF ( rho_lxc(k)>rho_threshold_lda ) THEN dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_x(k) dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_x(k)*2._DP - ENDIF + ENDIF dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_x(k)*4._DP ENDDO ENDIF @@ -199,7 +199,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) IF ( rho_lxc(k)>rho_threshold_lda ) THEN dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_c(k) dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_c(k)*2._DP - ENDIF + ENDIF dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_c(k)*4._DP ENDDO ENDIF @@ -230,23 +230,23 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) dvxc_ss(k,2,1) = e2*vssc(k) dvxc_ss(k,2,2) = e2*(vssx(k,2) + vssc(k)) ! - ENDDO + ENDDO ! DEALLOCATE( vrrx, vsrx, vssx ) DEALLOCATE( vrrc, vsrc, vssc ) DEALLOCATE( vrzc ) ! ENDIF - ! - IF ( is_libxc(3) ) THEN - ! + ! + IF ( is_libxc(3) ) THEN + ! DO k = 1, length thr_up_cond = r_in(k,1)>epsr .AND. SQRT(ABS(sigma(3*k-2)))>epsg thr_dw_cond = r_in(k,2)>epsr .AND. SQRT(ABS(sigma(3*k)))>epsg - IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE + IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_x(3*k-2) dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_x(6*k-5)*4._DP - dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_x(3*k) + dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_x(3*k) dvxc_ss(k,2,2) = dvxc_ss(k,2,2) + e2 * v2sigma2_x(6*k)*4._DP dvxc_rr(k,1,2) = dvxc_rr(k,1,2) + e2 * v2rho2_x(3*k-1) dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_x(6*k-5)*2._DP @@ -260,30 +260,30 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ! IF ( is_libxc(4) ) THEN ! - DO k = 1, length + DO k = 1, length thr_up_cond = r_in(k,1)>epsr .AND. SQRT(ABS(sigma(3*k-2)))>epsg thr_dw_cond = r_in(k,2)>epsr .AND. SQRT(ABS(sigma(3*k)))>epsg - IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE - dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_c(3*k-2) - dvxc_rr(k,1,2) = dvxc_rr(k,1,2) + e2 * v2rho2_c(3*k-1) - dvxc_rr(k,2,1) = dvxc_rr(k,2,1) + e2 * v2rho2_c(3*k-1) - dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_c(3*k) - dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_c(6*k-5)*2.d0 - dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_c(6*k)*4.d0 - dvxc_sr(k,1,2) = dvxc_sr(k,1,2) + e2 * v2rhosigma_c(6*k-4) - dvxc_sr(k,2,1) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k-1) - dvxc_ss(k,1,2) = dvxc_ss(k,1,2) + e2 * v2sigma2_c(6*k-2) - dvxc_ss(k,2,1) = dvxc_ss(k,2,1) + e2 * v2sigma2_c(6*k-2) - dvxc_sr(k,2,2) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k)*2.d0 - dvxc_ss(k,2,2) = dvxc_ss(k,2,2) + e2 * v2sigma2_c(6*k)*4.d0 - ENDDO - ! - DEALLOCATE( v2rho2_c, v2rhosigma_c, v2sigma2_c ) - ! + IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE + dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_c(3*k-2) + dvxc_rr(k,1,2) = dvxc_rr(k,1,2) + e2 * v2rho2_c(3*k-1) + dvxc_rr(k,2,1) = dvxc_rr(k,2,1) + e2 * v2rho2_c(3*k-1) + dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_c(3*k) + dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_c(6*k-5)*2.d0 + dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_c(6*k)*4.d0 + dvxc_sr(k,1,2) = dvxc_sr(k,1,2) + e2 * v2rhosigma_c(6*k-4) + dvxc_sr(k,2,1) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k-1) + dvxc_ss(k,1,2) = dvxc_ss(k,1,2) + e2 * v2sigma2_c(6*k-2) + dvxc_ss(k,2,1) = dvxc_ss(k,2,1) + e2 * v2sigma2_c(6*k-2) + dvxc_sr(k,2,2) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k)*2.d0 + dvxc_ss(k,2,2) = dvxc_ss(k,2,2) + e2 * v2sigma2_c(6*k)*4.d0 + ENDDO + ! + DEALLOCATE( v2rho2_c, v2rhosigma_c, v2sigma2_c ) + ! ENDIF ! IF (ANY(is_libxc(3:4))) DEALLOCATE( rho_lxc ) - ! + ! ENDIF IF ( ALLOCATED(sigma) ) DEALLOCATE( sigma ) ! @@ -303,7 +303,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) ! dvxc_rr(:,1,1) = e2*(vrrx(:,1) + vrrc(:,1)) dvxc_sr(:,1,1) = e2*(vsrx(:,1) + vsrc(:,1)) - dvxc_ss(:,1,1) = e2*(vssx(:,1) + vssc(:) ) + dvxc_ss(:,1,1) = e2*(vssx(:,1) + vssc(:) ) ! CASE( 2 ) ! @@ -323,14 +323,14 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss ) dvxc_rr(k,2,2) = e2*(vrrx(k,2) + vrrc(k,2) - vrzc(k,2)*(1.d0 + zeta)/rht) ENDIF ! - dvxc_sr(k,1,1) = e2 * (vsrx(k,1) + vsrc(k,1)) - dvxc_sr(k,1,2) = e2 * vsrc(k,1) - dvxc_sr(k,2,1) = e2 * vsrc(k,2) - dvxc_sr(k,2,2) = e2 * (vsrx(k,2) + vsrc(k,2)) + dvxc_sr(k,1,1) = e2 * (vsrx(k,1) + vsrc(k,1)) + dvxc_sr(k,1,2) = e2 * vsrc(k,1) + dvxc_sr(k,2,1) = e2 * vsrc(k,2) + dvxc_sr(k,2,2) = e2 * (vsrx(k,2) + vsrc(k,2)) ! - dvxc_ss(k,1,1) = e2 * (vssx(k,1) + vssc(k)) - dvxc_ss(k,1,2) = e2 * vssc(k) - dvxc_ss(k,2,1) = e2 * vssc(k) + dvxc_ss(k,1,1) = e2 * (vssx(k,1) + vssc(k)) + dvxc_ss(k,1,2) = e2 * vssc(k) + dvxc_ss(k,2,1) = e2 * vssc(k) dvxc_ss(k,2,2) = e2 * (vssx(k,2) + vssc(k)) ENDDO ! diff --git a/XClib/xc_wrapper_gga.f90 b/XClib/xc_wrapper_gga.f90 index 7a4f4a712..2ef083323 100644 --- a/XClib/xc_wrapper_gga.f90 +++ b/XClib/xc_wrapper_gga.f90 @@ -6,13 +6,98 @@ ! or http://www.gnu.org/copyleft/gpl.txt . ! !--------------------------------------------------------------------------- +SUBROUTINE xc_gcx( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud, & + gpu_args_ ) + !------------------------------------------------------------------------- + !! Wrapper to gpu or non gpu version of \(\texttt{xc_gcx}\). + ! + USE kind_l, ONLY: DP + ! + IMPLICIT NONE + ! + INTEGER, INTENT(IN) :: length + !! length of the I/O arrays + INTEGER, INTENT(IN) :: ns + !! spin dimension for input + REAL(DP), INTENT(IN) :: rho(length,ns) + !! Charge density + REAL(DP), INTENT(IN) :: grho(3,length,ns) + !! gradient + REAL(DP), INTENT(OUT) :: ex(length) + !! exchange energy + REAL(DP), INTENT(OUT) :: ec(length) + !! correlation energy + REAL(DP), INTENT(OUT) :: v1x(length,ns) + !! exchange potential (density part) + REAL(DP), INTENT(OUT) :: v2x(length,ns) + !! exchange potential (gradient part) + REAL(DP), INTENT(OUT) :: v1c(length,ns) + !! correlation potential (density part) + REAL(DP), INTENT(OUT) :: v2c(length,ns) + !! correlation potential (gradient part) + REAL(DP), INTENT(OUT), OPTIONAL :: v2c_ud(length) + !! correlation potential, cross term + LOGICAL, INTENT(IN), OPTIONAL :: gpu_args_ + !! whether you wish to run on gpu in case use_gpu is true + ! + LOGICAL :: gpu_args + REAL(DP), ALLOCATABLE :: v2c_dummy(:) + ! + gpu_args = .FALSE. + ! + IF ( PRESENT(gpu_args_) ) gpu_args = gpu_args_ + ! + IF (ns==2 .AND. .NOT. PRESENT(v2c_ud)) CALL xclib_infomsg( 'xc_gcx', 'WARNING: cross & + &term v2c_ud not found xc_gcx (gga) call with polarized case' ) + ! + IF ( gpu_args ) THEN + ! + !$acc data deviceptr( rho(length,ns), grho(3,length,ns), ex(length), ec(length), & + !$acc& v1x(length,ns), v2x(length,ns), v1c(length,ns), v2c(length,ns) ) + IF (PRESENT(v2c_ud)) THEN + !$acc data deviceptr( v2c_ud(length) ) + CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) + !$acc end data + ELSE + ALLOCATE( v2c_dummy(length) ) + !$acc data deviceptr( v2c_dummy(length) ) + CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) + !$acc end data + DEALLOCATE( v2c_dummy ) + ENDIF + !$acc end data + ! + ELSE + ! + !$acc data copyin( rho, grho ), copyout( ex, ec, v1x, v2x, v1c, v2c ) + IF (PRESENT(v2c_ud)) THEN + !$acc data copyout( v2c_ud ) + !$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) + CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) + !$acc end host_data + !$acc end data + ELSE + ALLOCATE( v2c_dummy(length) ) + !$acc data create( v2c_dummy ) + !$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) + CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy ) + !$acc end host_data + !$acc end data + DEALLOCATE( v2c_dummy ) + ENDIF + !$acc end data + ! + ENDIF + ! + RETURN + ! +END SUBROUTINE +! +! +!--------------------------------------------------------------------------- SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) !------------------------------------------------------------------------- - !! Wrapper routine. Calls xc_gga-driver internal routines or the external - !! ones from libxc, depending on the input choice. - ! - !! NOTE: differently from 'xc_lda_drivers', here the input rho is in (up,down) - !! form (in the LSDA case). + !! GGA wrapper routine - gpu double. ! #if defined(__LIBXC) #include "xc_version.h" @@ -28,27 +113,16 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) IMPLICIT NONE ! INTEGER, INTENT(IN) :: length - !! length of the I/O arrays INTEGER, INTENT(IN) :: ns - !! spin dimension for input - REAL(DP), INTENT(IN) :: rho(:,:) - !! Charge density - REAL(DP), INTENT(IN) :: grho(:,:,:) - !! gradient - REAL(DP), INTENT(OUT) :: ex(:) - !! exchange energy - REAL(DP), INTENT(OUT) :: ec(:) - !! correlation energy - REAL(DP), INTENT(OUT) :: v1x(:,:) - !! exchange potential (density part) - REAL(DP), INTENT(OUT) :: v2x(:,:) - !! exchange potential (gradient part) - REAL(DP), INTENT(OUT) :: v1c(:,:) - !! correlation potential (density part) - REAL(DP), INTENT(OUT) :: v2c(:,:) - !! correlation potential (gradient part) - REAL(DP), INTENT(OUT), OPTIONAL :: v2c_ud(:) - !! correlation potential, cross term + REAL(DP), INTENT(IN) :: rho(length,ns) + REAL(DP), INTENT(IN) :: grho(3,length,ns) + REAL(DP), INTENT(OUT) :: ex(length) + REAL(DP), INTENT(OUT) :: ec(length) + REAL(DP), INTENT(OUT) :: v1x(length,ns) + REAL(DP), INTENT(OUT) :: v2x(length,ns) + REAL(DP), INTENT(OUT) :: v1c(length,ns) + REAL(DP), INTENT(OUT) :: v2c(length,ns) + REAL(DP), INTENT(OUT) :: v2c_ud(length) ! ! ... local variables ! @@ -59,32 +133,27 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) REAL(DP), ALLOCATABLE :: vc_rho(:), vc_sigma(:) ! INTEGER :: fkind_x, np + REAL(DP) :: rs, rtot, zet, vc_2(2), arho_k REAL(DP), PARAMETER :: pi34 = 0.6203504908994_DP ! LOGICAL :: POLARIZED - INTEGER :: pol_unpol + INTEGER :: ildax, ildac, pol_unpol #if (XC_MAJOR_VERSION > 4) INTEGER(8) :: lengthxc #else INTEGER :: lengthxc #endif #endif - REAL(DP), ALLOCATABLE :: arho(:,:) REAL(DP), ALLOCATABLE :: rh(:), zeta(:) REAL(DP), ALLOCATABLE :: grho2(:,:), grho_ud(:) ! INTEGER :: k, is - REAL(DP) :: sgn(2) - REAL(DP) :: rho_up, rho_dw, grho_up, grho_dw + REAL(DP) :: rho_up, rho_dw, grho_up, grho_dw, sgn1 REAL(DP), PARAMETER :: small = 1.E-10_DP ! - ! - IF (ns==2 .AND. .NOT. PRESENT(v2c_ud)) CALL xclib_infomsg( 'xc_gcx', 'WARNING: cross & - &term v2c_ud not found xc_gcx (gga) call with polarized case' ) - ! - ex = 0.0_DP ; v1x = 0.0_DP ; v2x = 0.0_DP - ec = 0.0_DP ; v1c = 0.0_DP ; v2c = 0.0_DP - IF ( PRESENT(v2c_ud) ) v2c_ud = 0.0_DP + !$acc data deviceptr( rho(length,ns), grho(3,length,ns), ex(length), ec(length), & + !$acc& v1x(length,ns), v2x(length,ns), v1c(length,ns), v2c(length,ns), & + !$acc& v2c_ud(length) ) ! #if defined(__LIBXC) ! @@ -103,24 +172,31 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) np = 3 ENDIF ! - ALLOCATE( rho_lxc(length*ns) ) - ALLOCATE( sigma(length*np) ) - ! - ALLOCATE( ex_lxc(length) , ec_lxc(length) ) - ALLOCATE( vx_rho(length*ns) , vx_sigma(length*np) ) - ALLOCATE( vc_rho(length*ns) , vc_sigma(length*np) ) + ALLOCATE( rho_lxc(length*ns), sigma(length*np) ) + !$acc data create( rho_lxc, sigma ) ! + IF ( is_libxc(3) ) THEN + ALLOCATE( ex_lxc(length) ) + ALLOCATE( vx_rho(length*ns), vx_sigma(length*np) ) + ENDIF + IF ( is_libxc(4) ) THEN + ALLOCATE( ec_lxc(length) ) + ALLOCATE( vc_rho(length*ns), vc_sigma(length*np) ) + ENDIF ! IF ( ns == 1 ) THEN ! + !$acc parallel loop DO k = 1, length - rho_lxc(k) = ABS( rho(k,1) ) - IF ( rho_lxc(k) > rho_threshold_gga ) & + arho_k = ABS(rho(k,1)) + rho_lxc(k) = arho_k + IF ( arho_k > rho_threshold_gga ) & sigma(k) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 ENDDO ! ELSE ! + !$acc parallel loop DO k = 1, length rho_lxc(2*k-1) = rho(k,1) rho_lxc(2*k) = rho(k,2) @@ -135,25 +211,26 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ! IF ( ns==1 .AND. ANY(.NOT.is_libxc(3:4)) ) THEN ! - CALL gcxc( length, ABS(rho(:,1)), sigma, ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) ) - ! - IF ( (igcx==43.AND..NOT.is_libxc(3)) .OR. (igcc==14.AND..NOT.is_libxc(4)) ) & - CALL gcxc_beef( length, ABS(rho(:,1)), sigma, ex, ec, v1x(:,1), v2x(:,1), & - v1c(:,1), v2c(:,1) ) + !$acc host_data use_device( rho_lxc, sigma ) + CALL gcxc( length, rho_lxc, sigma, ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) ) + !$acc end host_data ! + !$acc parallel loop DO k = 1, length - sgn(1) = SIGN(1._DP, rho(k,1)) - ex(k) = ex(k) * sgn(1) - ec(k) = ec(k) * sgn(1) + sgn1 = SIGN(1._DP, rho(k,1)) + ex(k) = ex(k) * sgn1 + ec(k) = ec(k) * sgn1 ENDDO ! ENDIF ! + !$acc update self( rho_lxc, sigma ) + ! ! ---- GGA CORRELATION ! - IF (is_libxc(4)) fkind_x = xc_f03_func_info_get_kind( xc_info(4) ) - ! IF ( is_libxc(4) ) THEN !lda part of LYP not present in libxc (still so? - check) + ! + fkind_x = xc_f03_func_info_get_kind( xc_info(4) ) ! CALL xc_f03_func_set_dens_threshold( xc_func(4), small )!rho_threshold_gga ) IF (libxc_flags(4,0)==1) THEN @@ -163,89 +240,112 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ec_lxc = 0.d0 ENDIF ! + !$acc data copyin( ec_lxc, vc_rho, vc_sigma ) IF (.NOT. POLARIZED) THEN + !$acc parallel loop DO k = 1, length - IF ( rho_lxc(k) <= rho_threshold_lda ) CYCLE + IF ( rho_lxc(k) <= rho_threshold_lda ) THEN + ec(k)=0.d0 ; v1c(k,1)=0.d0 ; v2c(k,1)=0.d0 + CYCLE + ENDIF ec(k) = ec_lxc(k) * rho_lxc(k) * SIGN(1.0_DP, rho(k,1)) v1c(k,1) = vc_rho(k) IF ( rho_lxc(k) <= rho_threshold_gga .OR. & - SQRT(ABS(sigma(k))) <= grho_threshold_gga) CYCLE + SQRT(ABS(sigma(k))) <= grho_threshold_gga) THEN + v2c(k,1) = 0.d0 + CYCLE + ENDIF v2c(k,1) = vc_sigma(k)*2.d0 ENDDO ELSE + !$acc parallel loop DO k = 1, length rho_up = rho_lxc(2*k-1) rho_dw = rho_lxc(2*k) grho_up = SQRT(ABS(sigma(3*k-2))) grho_dw = SQRT(ABS(sigma(3*k))) - IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) CYCLE + IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) THEN + ec(k) = 0.d0 ; v1c(k,1) = 0.d0 ; v1c(k,2) = 0.d0 + v2c(k,1) = 0.d0 ; v2c_ud(k)= 0.d0 ; v2c(k,2) = 0.d0 + CYCLE + ENDIF ec(k) = ec_lxc(k) * (rho_up+rho_dw) v1c(k,1) = vc_rho(2*k-1) v1c(k,2) = vc_rho(2*k) IF ( rho_up <= rho_threshold_gga .OR. rho_dw <= rho_threshold_gga .OR. & - grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) CYCLE + grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) THEN + v2c(k,1) = 0.d0 ; v2c_ud(k)= 0.d0 ; v2c(k,2) = 0.d0 + CYCLE + ENDIF v2c(k,1) = vc_sigma(3*k-2)*2.d0 v2c_ud(k)= vc_sigma(3*k-1) v2c(k,2) = vc_sigma(3*k)*2.d0 ENDDO ENDIF - ! + ! + !$acc end data + DEALLOCATE( ec_lxc, vc_rho, vc_sigma ) + ! ELSEIF ( (.NOT.is_libxc(4)) .AND. fkind_x/=XC_EXCHANGE_CORRELATION ) THEN ! - ALLOCATE( arho(length,ns), grho2(length,ns) ) + ALLOCATE( grho2(length,ns) ) ! IF ( ns /= 1 ) THEN - ! - DO is = 1, 2 - grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2 - ENDDO ! IF (igcc==3 .OR. igcc==7 .OR. igcc==13 ) THEN ! ALLOCATE( grho_ud(length) ) - ! - grho_ud = grho(1,:,1) * grho(1,:,2) + grho(2,:,1) * grho(2,:,2) + & - grho(3,:,1) * grho(3,:,2) - ! - arho = rho - ! - WHERE ( rho(:,1)+rho(:,2) < rho_threshold_gga ) - arho(:,1) = 0.0_DP - arho(:,2) = 0.0_DP - ENDWHERE - ! - CALL gcc_spin_more( length, arho, grho2, grho_ud, ec, v1c, v2c, v2c_ud ) - ! + !$acc data create( grho2, grho_ud ) + !$acc parallel loop + DO k = 1, length + grho2(k,1) = sigma(3*k-2) + grho2(k,2) = sigma(3*k) + grho_ud(k) = sigma(3*k-1) + ENDDO + !$acc host_data use_device( grho2, grho_ud ) + CALL gcc_spin_more( length, rho, grho2, grho_ud, ec, v1c, v2c, v2c_ud ) + !$acc end host_data + !$acc end data DEALLOCATE( grho_ud ) ! ELSE ! ALLOCATE( rh(length), zeta(length) ) + !$acc data create( rh, zeta, grho2 ) + !$acc parallel loop + DO k = 1, length + rh(k) = rho(k,1) + rho(k,2) + IF ( rh(k) > rho_threshold_gga ) THEN + zeta(k) = ( rho(k,1) - rho(k,2) ) / rh(k) + ELSE + zeta(k) = 2.0_DP ! trash value, gcc-routines get rid of it when present + ENDIF + grho2(k,1) = ( grho(1,k,1) + grho(1,k,2) )**2 + & + ( grho(2,k,1) + grho(2,k,2) )**2 + & + ( grho(3,k,1) + grho(3,k,2) )**2 + grho2(k,2) = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2 + ENDDO ! - rh = rho(:,1) + rho(:,2) + !$acc host_data use_device( rh, zeta, grho2 ) + CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) + !$acc end host_data ! - zeta = 2.0_DP ! trash value, gcc-routines get rid of it when present - WHERE ( rh > rho_threshold_gga ) zeta = ( rho(:,1) - rho(:,2) ) / rh(:) - ! - grho2(:,1) = ( grho(1,:,1) + grho(1,:,2) )**2 + & - ( grho(2,:,1) + grho(2,:,2) )**2 + & - ( grho(3,:,1) + grho(3,:,2) )**2 - ! - IF ( igcc/=14 ) CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) - IF ( igcc==14 ) CALL gcc_spin_beef( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) - ! - v2c(:,2) = v2c(:,1) - IF ( PRESENT(v2c_ud) ) v2c_ud(:) = v2c(:,1) + !$acc parallel loop + DO k = 1, length + v2c(k,2) = v2c(k,1) + IF ( ns==2 ) v2c_ud(k) = v2c(k,1) + ENDDO ! + !$acc end data DEALLOCATE( rh, zeta ) ! ENDIF - ! + ! ENDIF ! - DEALLOCATE( arho, grho2 ) + DEALLOCATE( grho2 ) ! - ENDIF + ENDIF ! ! --- GGA EXCHANGE ! @@ -259,134 +359,177 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud ) ex_lxc = 0.d0 ENDIF ! + !$acc data copyin( ex_lxc, vx_rho, vx_sigma ) IF (.NOT. POLARIZED) THEN + !$acc parallel loop DO k = 1, length - IF ( rho_lxc(k) <= rho_threshold_lda ) CYCLE + IF ( rho_lxc(k) <= rho_threshold_lda ) THEN + ex(k) = 0.d0 ; v1x(k,1) = 0.d0 ; v2x(k,1) = 0.d0 + CYCLE + ENDIF ex(k) = ex_lxc(k) * rho_lxc(k) * SIGN(1.0_DP, rho(k,1)) v1x(k,1) = vx_rho(k) IF ( rho_lxc(k) <= rho_threshold_gga .OR. & - SQRT(ABS(sigma(k))) <= grho_threshold_gga) CYCLE + SQRT(ABS(sigma(k))) <= grho_threshold_gga) THEN + v2x(k,1) = 0.d0 + CYCLE + ENDIF v2x(k,1) = vx_sigma(k)*2.d0 ENDDO ELSE + !$acc parallel loop DO k = 1, length rho_up = rho_lxc(2*k-1) rho_dw = rho_lxc(2*k) grho_up = SQRT(ABS(sigma(3*k-2))) grho_dw = SQRT(ABS(sigma(3*k))) - IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) CYCLE + IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) THEN + ex(k) = 0.d0 + v1x(k,1) = 0.d0 ; v1x(k,2) = 0.d0 + v2x(k,1) = 0.d0 ; v2x(k,2) = 0.d0 + CYCLE + ENDIF ex(k) = ex_lxc(k) * (rho_up+rho_dw) v1x(k,1) = vx_rho(2*k-1) v1x(k,2) = vx_rho(2*k) IF ( rho_up <= rho_threshold_gga .OR. rho_dw <= rho_threshold_gga .OR. & - grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) CYCLE + grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) THEN + v2x(k,1) = 0.d0 ; v2x(k,2) = 0.d0 + CYCLE + ENDIF v2x(k,1) = vx_sigma(3*k-2)*2.d0 v2x(k,2) = vx_sigma(3*k)*2.d0 ENDDO ENDIF ! + !$acc end data + DEALLOCATE( ex_lxc, vx_rho, vx_sigma ) + ! ELSE - ! - ALLOCATE( grho2(length,ns) ) ! IF ( ns /= 1 ) THEN - ! - DO is = 1, 2 - grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2 - ENDDO - ! - IF ( igcx/=43 ) CALL gcx_spin( length, rho, grho2, ex, v1x, v2x ) - IF ( igcx==43 ) CALL gcx_spin_beef( length, rho, grho2, ex, v1x, v2x ) - ! + ! + ALLOCATE( grho2(length,ns) ) + !$acc data create( grho2 ) + !$acc parallel loop collapse(2) + DO is = 1, ns + DO k = 1, length + grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2 + ENDDO + ENDDO + !$acc host_data use_device( grho2 ) + CALL gcx_spin( length, rho, grho2, ex, v1x, v2x ) + !$acc end host_data + !$acc end data + DEALLOCATE( grho2 ) + ! ENDIF ! - DEALLOCATE( grho2 ) - ! ENDIF ! + !$acc end data DEALLOCATE( rho_lxc, sigma ) - DEALLOCATE( ex_lxc , ec_lxc ) - DEALLOCATE( vx_rho , vx_sigma ) - DEALLOCATE( vc_rho , vc_sigma ) ! #else ! - ALLOCATE( arho(length,ns), grho2(length,ns) ) - arho = 0.0_DP - grho2 = 0.0_DP + ALLOCATE( grho2(length,ns) ) + !$acc data create( grho2 ) ! IF ( ns == 1 ) THEN ! - ! ... This is the spin-unpolarised case + ALLOCATE( rh(length) ) + !$acc data create( rh ) + !$acc host_data use_device( rh, grho2 ) + !$acc parallel loop DO k = 1, length - IF ( ABS(rho(k,1)) > rho_threshold_gga ) & - grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 + rh(k) = ABS(rho(k,1)) + IF ( rh(k) > rho_threshold_gga ) & + grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 ENDDO ! + CALL gcxc( length, rh, grho2(:,1), ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) ) ! - CALL gcxc( length, ABS(rho(:,1)), grho2(:,1), ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) ) - ! - IF ( igcx==43 .OR. igcc==14 ) CALL gcxc_beef( length, ABS(rho(:,1)), grho2(:,1), ex, ec, & - v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) ) - ! + !$acc parallel loop DO k = 1, length - sgn(1) = SIGN(1._DP, rho(k,1)) - ex(k) = ex(k) * sgn(1) - ec(k) = ec(k) * sgn(1) + sgn1 = SIGN(1._DP, rho(k,1)) + ex(k) = ex(k) * sgn1 + ec(k) = ec(k) * sgn1 ENDDO ! + !$acc end host_data + !$acc end data + DEALLOCATE( rh ) + ! ELSE ! - DO is = 1, 2 - grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2 + !$acc host_data use_device( grho2 ) + !$acc parallel loop collapse(2) + DO is = 1, ns + DO k = 1, length + grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2 + ENDDO ENDDO ! - IF ( igcx/=43 ) CALL gcx_spin( length, rho, grho2, ex, v1x, v2x ) - IF ( igcx==43 ) CALL gcx_spin_beef( length, rho, grho2, ex, v1x, v2x ) + CALL gcx_spin( length, rho, grho2, ex, v1x, v2x ) + !$acc end host_data ! IF (igcc==3 .OR. igcc==7 .OR. igcc==13 ) THEN ! ALLOCATE( grho_ud(length) ) + !$acc data create( grho_ud ) + !$acc host_data use_device( grho_ud, grho2 ) + !$acc parallel loop + DO k = 1, length + grho_ud(k) = grho(1,k,1) * grho(1,k,2) + grho(2,k,1) * grho(2,k,2) + & + grho(3,k,1) * grho(3,k,2) + ENDDO ! - grho_ud = grho(1,:,1) * grho(1,:,2) + grho(2,:,1) * grho(2,:,2) + & - grho(3,:,1) * grho(3,:,2) - ! - arho = rho - ! - CALL gcc_spin_more( length, arho, grho2, grho_ud, ec, v1c, v2c, v2c_ud ) + CALL gcc_spin_more( length, rho, grho2, grho_ud, ec, v1c, v2c, v2c_ud ) ! + !$acc end host_data + !$acc end data DEALLOCATE( grho_ud ) ! ELSE ! ALLOCATE( rh(length), zeta(length) ) + !$acc data create( rh, zeta ) + !$acc host_data use_device( rh, zeta, grho2 ) + !$acc parallel loop + DO k = 1, length + rh(k) = rho(k,1) + rho(k,2) + IF ( rh(k) > rho_threshold_gga ) THEN + zeta(k) = ( rho(k,1) - rho(k,2) ) / rh(k) + ELSE + zeta(k) = 2.0_DP ! trash value, gcc-routines get rid of it when present + ENDIF + grho2(k,1) = ( grho(1,k,1) + grho(1,k,2) )**2 + & + ( grho(2,k,1) + grho(2,k,2) )**2 + & + ( grho(3,k,1) + grho(3,k,2) )**2 + ENDDO ! - rh = rho(:,1) + rho(:,2) + CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) ! - zeta = 2.0_DP ! trash value, gcc-routines get rid of it when present - WHERE ( rh > rho_threshold_gga ) zeta = ( rho(:,1) - rho(:,2) ) / rh(:) - ! - grho2(:,1) = ( grho(1,:,1) + grho(1,:,2) )**2 + & - ( grho(2,:,1) + grho(2,:,2) )**2 + & - ( grho(3,:,1) + grho(3,:,2) )**2 - ! - - IF ( igcc/=14 ) CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) - IF ( igcc==14 ) CALL gcc_spin_beef( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) ) - ! - v2c(:,2) = v2c(:,1) - IF ( PRESENT(v2c_ud) ) v2c_ud(:) = v2c(:,1) + !$acc parallel loop + DO k = 1, length + v2c(k,2) = v2c(k,1) + IF ( ns==2 ) v2c_ud(k) = v2c(k,1) + ENDDO ! + !$acc end host_data + !$acc end data DEALLOCATE( rh, zeta ) ! ENDIF - ! + ! ENDIF ! - DEALLOCATE( arho, grho2 ) + !$acc end data + DEALLOCATE( grho2 ) ! #endif ! + !$acc end data ! RETURN ! diff --git a/XClib/xc_wrapper_lda_lsda.f90 b/XClib/xc_wrapper_lda_lsda.f90 index d5bf6ce58..95932dd69 100644 --- a/XClib/xc_wrapper_lda_lsda.f90 +++ b/XClib/xc_wrapper_lda_lsda.f90 @@ -5,16 +5,71 @@ ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! +!------------------------------------------------------------------------------------------ +SUBROUTINE xc( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out, gpu_args_ ) + !-------------------------------------------------------------------------------------- + !! Wrapper routine to \(\texttt{xc_}\) or \(\texttt{xc_gpu}\). + ! + USE kind_l, ONLY: DP + ! + IMPLICIT NONE + ! + INTEGER, INTENT(IN) :: length + !! length of the I/O arrays + INTEGER, INTENT(IN) :: srd + !! spin dimension of rho + INTEGER, INTENT(IN) :: svd + !! spin dimension of v + REAL(DP), INTENT(IN) :: rho_in(length,srd) + !! Charge density + REAL(DP), INTENT(OUT) :: ex_out(length) + !! \(\epsilon_x(rho)\) ( NOT \(E_x(\text{rho})\) ) + REAL(DP), INTENT(OUT) :: vx_out(length,svd) + !! \(dE_x(\text{rho})/d\text{rho} ( NOT d\epsilon_x(\text{rho})/d\text{rho} ) + REAL(DP), INTENT(OUT) :: ec_out(length) + !! \(\epsilon_c(rho)\) ( NOT \(E_c(\text{rho})\) ) + REAL(DP), INTENT(OUT) :: vc_out(length,svd) + !! \(dE_c(\text{rho})/d\text{rho} ( NOT d\epsilon_c(\text{rho})/d\text{rho} ) + LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_ + !! whether you wish to run on gpu in case use_gpu is true + ! + LOGICAL :: gpu_args + ! + gpu_args = .FALSE. + IF ( PRESENT(gpu_args_) ) gpu_args = gpu_args_ + ! + IF ( gpu_args ) THEN + ! + !$acc data deviceptr( rho_in(length,srd), ex_out(length), ec_out(length), & + !$acc& vx_out(length,svd), vc_out(length,svd) ) + CALL xc_( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out ) + !$acc end data + ! + ELSE + ! + !$acc data copyin( rho_in ), copyout( ex_out, ec_out, vx_out, vc_out ) + !$acc host_data use_device( rho_in, ex_out, ec_out, vx_out, vc_out ) + CALL xc_( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out ) + !$acc end host_data + !$acc end data + ! + ENDIF + ! + RETURN + ! +END SUBROUTINE +! +! !--------------------------------------------------------------------------- -SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out ) +SUBROUTINE xc_( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out ) !------------------------------------------------------------------------- - !! Wrapper routine. Calls internal XC-driver routines or external ones - !! from Libxc, depending on the input choice. + !! Wrapper xc LDA - openACC version. + !! See comments in routine \(\texttt{xc}\) for variable explanations. ! #if defined(__LIBXC) #include "xc_version.h" USE xc_f03_lib_m - USE dft_setting_params, ONLY: xc_func, xc_info, libxc_flags + USE dft_setting_params, ONLY: xc_func, xc_info, libxc_flags #endif ! USE kind_l, ONLY: DP @@ -25,28 +80,19 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out ) IMPLICIT NONE ! INTEGER, INTENT(IN) :: length - !! length of the I/O arrays - INTEGER, INTENT(IN) :: sr_d - !! spin dimension of rho - INTEGER, INTENT(IN) :: sv_d - !! spin dimension of v - REAL(DP), INTENT(IN) :: rho_in(length,sr_d) - !! Charge density - REAL(DP), INTENT(OUT) :: ex_out(length) - !! \(\epsilon_x(rho)\) ( NOT \(E_x(\text{rho})\) ) - REAL(DP), INTENT(OUT) :: vx_out(length,sv_d) - !! \(dE_x(\text{rho})/d\text{rho} ( NOT d\epsilon_x(\text{rho})/d\text{rho} ) - REAL(DP), INTENT(OUT) :: ec_out(length) - !! \(\epsilon_c(rho)\) ( NOT \(E_c(\text{rho})\) ) - REAL(DP), INTENT(OUT) :: vc_out(length,sv_d) - !! \(dE_c(\text{rho})/d\text{rho} ( NOT d\epsilon_c(\text{rho})/d\text{rho} ) + INTEGER, INTENT(IN) :: srd, svd + REAL(DP), INTENT(IN) :: rho_in(length,srd) + REAL(DP), INTENT(OUT) :: ex_out(length), vx_out(length,svd) + REAL(DP), INTENT(OUT) :: ec_out(length), vc_out(length,svd) ! ! ... local variables ! #if defined(__LIBXC) - INTEGER :: fkind_x, eflag + LOGICAL :: is_libxc1, is_libxc2 + INTEGER :: fkind_x REAL(DP) :: amag REAL(DP), ALLOCATABLE :: rho_lxc(:) + REAL(DP), ALLOCATABLE :: ex_lxc(:), ec_lxc(:) REAL(DP), ALLOCATABLE :: vx_lxc(:), vc_lxc(:) #if (XC_MAJOR_VERSION > 4) INTEGER(8) :: lengthxc @@ -54,104 +100,129 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out ) INTEGER :: lengthxc #endif #endif - ! - REAL(DP), ALLOCATABLE :: arho(:), zeta(:) - ! + REAL(DP), ALLOCATABLE :: zeta(:) + REAL(DP) :: arho_ir INTEGER :: ir ! - ex_out = 0.0_DP ; vx_out = 0.0_DP - ec_out = 0.0_DP ; vc_out = 0.0_DP + !$acc data deviceptr( rho_in(length,srd), ex_out(length), ec_out(length), & + !$acc& vx_out(length,svd), vc_out(length,svd) ) ! #if defined(__LIBXC) ! + is_libxc1 = is_libxc(1) + is_libxc2 = is_libxc(2) fkind_x = -1 lengthxc = length ! - IF ( ANY(is_libxc(1:2)) ) THEN + IF ( is_libxc(1) ) ALLOCATE( ex_lxc(length), vx_lxc(length*svd) ) + IF ( is_libxc(2) ) ALLOCATE( ec_lxc(length), vc_lxc(length*svd) ) + ! + IF ( is_libxc(1) .OR. is_libxc(2) ) THEN ! - ALLOCATE( rho_lxc(length*sv_d) ) - ALLOCATE( vx_lxc(length*sv_d), vc_lxc(length*sv_d) ) + ALLOCATE( rho_lxc(length*svd) ) + !$acc data copyout( rho_lxc ) ! - ! ... set libxc input - SELECT CASE( sr_d ) + SELECT CASE( srd ) CASE( 1 ) - ! - rho_lxc(:) = ABS(rho_in(:,1)) - ! + ! + !$acc parallel loop + DO ir = 1, length + rho_lxc(ir) = ABS(rho_in(ir,1)) + ENDDO + ! CASE( 2 ) - ! - DO ir = 1, length - rho_lxc(2*ir-1) = (rho_in(ir,1) + rho_in(ir,2)) * 0.5_DP - rho_lxc(2*ir) = (rho_in(ir,1) - rho_in(ir,2)) * 0.5_DP - ENDDO - ! + ! + !$acc parallel loop + DO ir = 1, length + rho_lxc(2*ir-1) = (rho_in(ir,1) + rho_in(ir,2)) * 0.5_DP + rho_lxc(2*ir) = (rho_in(ir,1) - rho_in(ir,2)) * 0.5_DP + ENDDO + ! CASE( 4 ) - ! - DO ir = 1, length - amag = SQRT( SUM(rho_in(ir,2:4)**2) ) - rho_lxc(2*ir-1) = (rho_in(ir,1) + amag) * 0.5_DP - rho_lxc(2*ir) = (rho_in(ir,1) - amag) * 0.5_DP - ENDDO - ! + ! + !$acc parallel loop + DO ir = 1, length + amag = SQRT( SUM(rho_in(ir,2:4)**2) ) + rho_lxc(2*ir-1) = (rho_in(ir,1) + amag) * 0.5_DP + rho_lxc(2*ir) = (rho_in(ir,1) - amag) * 0.5_DP + ENDDO + ! CASE DEFAULT - ! - CALL xclib_error( 'xc_LDA', 'Wrong number of spin dimensions', 1 ) - ! + ! + CALL xclib_error( 'xc_LDA', 'Wrong number of spin dimensions', 1 ) + ! END SELECT ! + !$acc end data + ! ENDIF ! - ! ! ... EXCHANGE IF ( is_libxc(1) ) THEN CALL xc_f03_func_set_dens_threshold( xc_func(1), rho_threshold_lda ) IF (libxc_flags(1,0)==1) THEN - CALL xc_f03_lda_exc_vxc( xc_func(1), lengthxc, rho_lxc(1), ex_out(1), vx_lxc(1) ) + CALL xc_f03_lda_exc_vxc( xc_func(1), lengthxc, rho_lxc(1), ex_lxc(1), vx_lxc(1) ) ELSE CALL xc_f03_lda_vxc( xc_func(1), lengthxc, rho_lxc(1), vx_lxc(1) ) + ex_lxc = 0.d0 ENDIF ENDIF ! ! ... CORRELATION IF ( is_libxc(2) ) THEN - CALL xc_f03_func_set_dens_threshold( xc_func(2), rho_threshold_lda ) - fkind_x = xc_f03_func_info_get_kind( xc_info(2) ) - IF (libxc_flags(2,0)==1) THEN - CALL xc_f03_lda_exc_vxc( xc_func(2), lengthxc, rho_lxc(1), ec_out(1), vc_lxc(1) ) - ELSE - CALL xc_f03_lda_vxc( xc_func(2), lengthxc, rho_lxc(1), vc_lxc(1) ) - ENDIF + CALL xc_f03_func_set_dens_threshold( xc_func(2), rho_threshold_lda ) + fkind_x = xc_f03_func_info_get_kind( xc_info(2) ) + IF (libxc_flags(2,0)==1) THEN + CALL xc_f03_lda_exc_vxc( xc_func(2), lengthxc, rho_lxc(1), ec_lxc(1), vc_lxc(1) ) + ELSE + CALL xc_f03_lda_vxc( xc_func(2), lengthxc, rho_lxc(1), vc_lxc(1) ) + ec_lxc = 0.d0 + ENDIF ENDIF ! - IF ( ((.NOT.is_libxc(1)) .OR. (.NOT.is_libxc(2))) & - .AND. fkind_x/=XC_EXCHANGE_CORRELATION ) THEN + IF ( is_libxc(1) .OR. is_libxc(2) ) DEALLOCATE( rho_lxc ) + ! + IF ( ((.NOT.is_libxc(1)) .OR. (.NOT.is_libxc(2))) ) THEN ! - SELECT CASE( sr_d ) + SELECT CASE( srd ) CASE( 1 ) ! - IF ((iexch==8 .AND. .NOT.is_libxc(1)) .OR. (icorr==10 .AND. & - .NOT. is_libxc(2))) THEN + IF ((iexch==8 .AND. .NOT.is_libxc1) .OR. (icorr==10 .AND. .NOT.is_libxc2)) THEN IF (.NOT. finite_size_cell_volume_set) CALL xclib_error( 'XC',& 'finite size corrected exchange used w/o initialization', 1 ) ENDIF - CALL xc_lda( length, ABS(rho_in(:,1)), ex_out, ec_out, vx_out(:,1), vc_out(:,1) ) + CALL xc_lda( length, rho_in(:,1), ex_out, ec_out, vx_out(:,1), vc_out(:,1) ) ! CASE( 2 ) ! - ALLOCATE( arho(length), zeta(length) ) - arho = ABS(rho_in(:,1)) - WHERE (arho > rho_threshold_lda) zeta(:) = rho_in(:,2) / arho(:) - CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out ) - DEALLOCATE( arho, zeta ) + ALLOCATE( zeta(length) ) + !$acc data create( zeta ) + !$acc parallel loop + DO ir = 1, length + arho_ir = ABS(rho_in(ir,1)) + IF (arho_ir > rho_threshold_lda) zeta(ir) = rho_in(ir,2) / arho_ir + ENDDO + !$acc host_data use_device( zeta ) + CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out ) + !$acc end host_data + !$acc end data + DEALLOCATE( zeta ) ! CASE( 4 ) ! - ALLOCATE( arho(length), zeta(length) ) - arho = ABS( rho_in(:,1) ) - WHERE (arho > rho_threshold_lda) zeta(:) = SQRT( rho_in(:,2)**2 + rho_in(:,3)**2 + & - rho_in(:,4)**2 ) / arho(:) ! amag/arho - CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out ) - DEALLOCATE( arho, zeta ) + ALLOCATE( zeta(length) ) + !$acc data create( zeta ) + !$acc parallel loop + DO ir = 1, length + arho_ir = ABS( rho_in(ir,1) ) + IF (arho_ir > rho_threshold_lda) zeta(ir) = SQRT( rho_in(ir,2)**2 + rho_in(ir,3)**2 + & + rho_in(ir,4)**2 ) / arho_ir ! amag/arho + ENDDO + !$acc host_data use_device( zeta ) + CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out ) + !$acc end host_data + !$acc end data + DEALLOCATE( zeta ) ! CASE DEFAULT ! @@ -162,67 +233,83 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out ) ENDIF ! ! ... fill output arrays - ! - IF (sv_d == 1) THEN - IF (is_libxc(1)) vx_out(:,1) = vx_lxc(:) - IF (is_libxc(2)) vc_out(:,1) = vc_lxc(:) - ELSE - IF (is_libxc(1)) THEN - DO ir = 1, length - vx_out(ir,1) = vx_lxc(2*ir-1) - vx_out(ir,2) = vx_lxc(2*ir) - ENDDO - ENDIF - IF (is_libxc(2)) THEN - DO ir = 1, length - vc_out(ir,1) = vc_lxc(2*ir-1) - vc_out(ir,2) = vc_lxc(2*ir) - ENDDO - ENDIF + ! + IF ( is_libxc(1) ) THEN + !$acc data copyin( ex_lxc, vx_lxc ) + !$acc parallel loop + DO ir = 1, length + ex_out(ir) = ex_lxc(ir) + IF (svd == 1) THEN + vx_out(ir,1) = vx_lxc(ir) + ELSE + vx_out(ir,1) = vx_lxc(2*ir-1) + vx_out(ir,2) = vx_lxc(2*ir) + ENDIF + ENDDO + !$acc end data + DEALLOCATE( ex_lxc, vx_lxc ) ENDIF ! - IF (ANY(is_libxc(1:2))) THEN - DEALLOCATE( rho_lxc ) - DEALLOCATE( vx_lxc, vc_lxc ) + IF ( is_libxc(2) ) THEN + !$acc data copyin( ec_lxc, vc_lxc ) + !$acc parallel loop + DO ir = 1, length + ec_out(ir) = ec_lxc(ir) + IF (svd == 1) THEN + vc_out(ir,1) = vc_lxc(ir) + ELSE + vc_out(ir,1) = vc_lxc(2*ir-1) + vc_out(ir,2) = vc_lxc(2*ir) + ENDIF + ENDDO + !$acc end data + DEALLOCATE( ec_lxc, vc_lxc ) ENDIF ! #else ! - SELECT CASE( sr_d ) + SELECT CASE( srd ) CASE( 1 ) ! - IF ((iexch==8 .AND. .NOT.is_libxc(1)) .OR. (icorr==10 .AND. & - .NOT. is_libxc(2))) THEN + IF (iexch==8 .OR. icorr==10) THEN IF (.NOT. finite_size_cell_volume_set) CALL xclib_error( 'XC',& 'finite size corrected exchange used w/o initialization', 1 ) ENDIF ! - CALL xc_lda( length, ABS(rho_in(:,1)), ex_out, ec_out, vx_out(:,1), vc_out(:,1) ) + CALL xc_lda( length, rho_in(:,1), ex_out, ec_out, vx_out(:,1), vc_out(:,1) ) ! CASE( 2 ) ! - ALLOCATE( arho(length), zeta(length) ) + ALLOCATE( zeta(length) ) + !$acc data create( zeta ) + !$acc parallel loop + DO ir = 1, length + arho_ir = ABS(rho_in(ir,1)) + IF (arho_ir > rho_threshold_lda) zeta(ir) = rho_in(ir,2) / arho_ir + ENDDO + !$acc host_data use_device( zeta ) + CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out ) + !$acc end host_data + !$acc end data + DEALLOCATE( zeta ) ! - arho = ABS(rho_in(:,1)) - WHERE (arho > rho_threshold_lda) zeta(:) = rho_in(:,2) / arho(:) + CASE( 4 ) ! - CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out ) + ALLOCATE( zeta(length) ) + !$acc data create( zeta ) + !$acc parallel loop + DO ir = 1, length + arho_ir = ABS(rho_in(ir,1)) + IF (arho_ir > rho_threshold_lda) zeta(ir) = SQRT( rho_in(ir,2)**2 + rho_in(ir,3)**2 + & + rho_in(ir,4)**2 ) / arho_ir ! amag/arho + ENDDO + !$acc host_data use_device( zeta ) + CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out ) + !$acc end host_data + !$acc end data + DEALLOCATE( zeta ) ! - DEALLOCATE( arho, zeta ) - ! - CASE( 4 ) - ! - ALLOCATE( arho(length), zeta(length) ) - ! - arho = ABS( rho_in(:,1) ) - WHERE (arho > rho_threshold_lda) zeta(:) = SQRT( rho_in(:,2)**2 + rho_in(:,3)**2 + & - rho_in(:,4)**2 ) / arho(:) ! amag/arho - ! - CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out ) - ! - DEALLOCATE( arho, zeta ) - ! - CASE DEFAULT + CASE DEFAULT ! CALL xclib_error( 'xc_LDA', 'Wrong ns input', 2 ) ! @@ -230,7 +317,8 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out ) ! #endif ! + !$acc end data ! RETURN ! -END SUBROUTINE xc +END SUBROUTINE xc_ diff --git a/XClib/xc_wrapper_mgga.f90 b/XClib/xc_wrapper_mgga.f90 index 1b2c9628f..7982c3ca7 100644 --- a/XClib/xc_wrapper_mgga.f90 +++ b/XClib/xc_wrapper_mgga.f90 @@ -5,8 +5,80 @@ ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! +! +!------------------------------------------------------------------------------------- +SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, & + v2c, v3c, gpu_args_ ) + !---------------------------------------------------------------------------------- + !! Wrapper to gpu or non gpu version of \(\texttt{xc_metagcx}\). + ! + USE kind_l, ONLY: DP + ! + IMPLICIT NONE + ! + INTEGER, INTENT(IN) :: length + !! length of the I/O arrays + INTEGER, INTENT(IN) :: ns + !! spin components + INTEGER, INTENT(IN) :: np + !! first dimension of v2c + REAL(DP), INTENT(IN) :: rho(length,ns) + !! the charge density + REAL(DP), INTENT(IN) :: grho(3,length,ns) + !! grho = \nabla rho + REAL(DP), INTENT(IN) :: tau(length,ns) + !! kinetic energy density + REAL(DP), INTENT(OUT) :: ex(length) + !! sx = E_x(rho,grho) + REAL(DP), INTENT(OUT) :: ec(length) + !! sc = E_c(rho,grho) + REAL(DP), INTENT(OUT) :: v1x(length,ns) + !! v1x = D(E_x)/D(rho) + REAL(DP), INTENT(OUT) :: v2x(length,ns) + !! v2x = D(E_x)/D( D rho/D r_alpha ) / |\nabla rho| + REAL(DP), INTENT(OUT) :: v3x(length,ns) + !! v3x = D(E_x)/D(tau) + REAL(DP), INTENT(OUT) :: v1c(length,ns) + !! v1c = D(E_c)/D(rho) + REAL(DP), INTENT(OUT) :: v2c(np,length,ns) + !! v2c = D(E_c)/D( D rho/D r_alpha ) / |\nabla rho| + REAL(DP), INTENT(OUT) :: v3c(length,ns) + !! v3c = D(E_c)/D(tau) + LOGICAL, INTENT(IN), OPTIONAL :: gpu_args_ + !! whether you wish to run on gpu in case use_gpu is true + ! + LOGICAL :: gpu_args + ! + gpu_args = .FALSE. + IF ( PRESENT(gpu_args_) ) gpu_args = gpu_args_ + ! + IF ( gpu_args ) THEN + ! + !$acc data deviceptr( rho(length,ns), grho(3,length,ns), tau(length,ns), & + !$acc& ex(length), ec(length), v1x(length,ns), v2x(length,ns), v3x(length,ns), & + !$acc& v1c(length,ns), v2c(np,length,ns), v3c(length,ns) ) + CALL xc_metagcx_( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, & + v2c, v3c ) + !$acc end data + ! + ELSE + ! + !$acc data copyin( rho, grho, tau ), copyout( ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) + !$acc host_data use_device( rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) + CALL xc_metagcx_( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, & + v2c, v3c ) + !$acc end host_data + !$acc end data + ! + ENDIF + ! + RETURN + ! +END SUBROUTINE +! +! !---------------------------------------------------------------------------------------- -SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) +SUBROUTINE xc_metagcx_( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) !------------------------------------------------------------------------------------- !! Wrapper routine. Calls internal metaGGA drivers or the Libxc ones, !! depending on the input choice. @@ -18,8 +90,8 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 #endif ! USE kind_l, ONLY: DP - USE dft_setting_params, ONLY: imetac, is_libxc, rho_threshold_mgga, & - grho2_threshold_mgga, tau_threshold_mgga, & + USE dft_setting_params, ONLY: imeta, imetac, is_libxc, rho_threshold_mgga,& + grho2_threshold_mgga, tau_threshold_mgga, & scan_exx, exx_started, exx_fraction USE qe_drivers_mgga ! @@ -56,7 +128,7 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 ! ! ... local variables ! - INTEGER :: is + INTEGER :: k, is, ipol REAL(DP), ALLOCATABLE :: grho2(:,:) REAL(DP), PARAMETER :: small = 1.E-10_DP ! @@ -67,48 +139,70 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 REAL(DP), ALLOCATABLE :: vc_rho(:), vc_sigma(:), vc_tau(:) REAL(DP), ALLOCATABLE :: lapl_rho(:), vlapl_rho(:) ! not used in QE ! - INTEGER :: k, ipol, pol_unpol - LOGICAL :: POLARIZED REAL(DP) :: rh, ggrho2, atau #if (XC_MAJOR_VERSION > 4) INTEGER(8) :: lengthxc #else INTEGER :: lengthxc +#endif #endif ! + !$acc data deviceptr( rho(length,ns), grho(3,length,ns), tau(length,ns), ex(length), & + !$acc& ec(length), v1x(length,ns), v2x(length,ns), v3x(length,ns), & + !$acc& v1c(length,ns), v2c(np,length,ns), v3c(length,ns) ) + ! +#if defined(__LIBXC) lengthxc = length ! - ex = 0.0_DP ; v1x = 0.0_DP ; v2x = 0.0_DP ; v3x = 0.0_DP - ec = 0.0_DP ; v1c = 0.0_DP ; v2c = 0.0_DP ; v3c = 0.0_DP + ALLOCATE( rho_lxc(length*ns), sigma(length*np) ) + ALLOCATE( tau_lxc(length*ns), lapl_rho(length*ns) ) + !$acc data create( rho_lxc, sigma, tau_lxc, lapl_rho ) ! - POLARIZED = .FALSE. - IF (ns == 2) THEN - POLARIZED = .TRUE. + IF ( is_libxc(5) ) THEN + ALLOCATE( ex_lxc(length) ) + ALLOCATE( vx_rho(length*ns), vx_sigma(length*np), vx_tau(length*ns) ) + IF ( imetac==0 ) THEN + !$acc parallel loop + DO k = 1, length + ec(k) = 0.d0 + DO is = 1, ns + v1c(k,is) = 0.d0 ; v3c(k,is) = 0.d0 + DO ipol = 1, np + v2c(ipol,k,is) = 0.d0 + ENDDO + ENDDO + ENDDO + ENDIF ENDIF - ! - pol_unpol = ns - ! - ALLOCATE( rho_lxc(length*ns), sigma(length*np), tau_lxc(length*ns) ) - ALLOCATE( lapl_rho(length*ns) ) - ! - ALLOCATE( ex_lxc(length) , ec_lxc(length) ) - ALLOCATE( vx_rho(length*ns) , vx_sigma(length*np), vx_tau(length*ns) ) - ALLOCATE( vc_rho(length*ns) , vc_sigma(length*np), vc_tau(length*ns) ) - ALLOCATE( vlapl_rho(length*ns) ) - ! + IF ( is_libxc(6) ) THEN + ALLOCATE( ec_lxc(length) ) + ALLOCATE( vc_rho(length*ns), vc_sigma(length*np), vc_tau(length*ns) ) + IF ( imeta==0 ) THEN + !$acc parallel loop + DO k = 1, length + ex(k) = 0.d0 + DO is = 1, ns + v1x(k,is) = 0.d0 ; v2x(k,is) = 0.d0 ; v3x(k,is) = 0.d0 + ENDDO + ENDDO + ENDIF + ENDIF + IF ( ANY(is_libxc(5:6)) ) ALLOCATE( vlapl_rho(length*ns) ) ! IF ( ns == 1 ) THEN ! + !$acc parallel loop DO k = 1, length rho_lxc(k) = ABS( rho(k,1) ) sigma(k) = MAX( grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2, & grho2_threshold_mgga ) tau_lxc(k) = MAX( tau(k,1), tau_threshold_mgga ) - vlapl_rho(k) = 0._DP + lapl_rho(k) = 0.d0 ENDDO ! ELSE ! + !$acc parallel loop DO k = 1, length rho_lxc(2*k-1) = ABS( rho(k,1) ) rho_lxc(2*k) = ABS( rho(k,2) ) @@ -120,28 +214,28 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 grho2_threshold_mgga ) tau_lxc(2*k-1) = MAX( tau(k,1), small ) tau_lxc(2*k) = MAX( tau(k,2), small ) - vlapl_rho(2*k-1) = 0._DP - vlapl_rho(2*k) = 0._DP + lapl_rho(2*k-1) = 0.d0 + lapl_rho(2*k) = 0.d0 ENDDO ! ENDIF ! + !$acc update self( rho_lxc, sigma, tau_lxc, lapl_rho ) + ! IF ( .NOT.is_libxc(5) .AND. imetac==0 ) THEN - ! - ALLOCATE( grho2(length,ns) ) - ! - DO is = 1, ns - grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2 - ENDDO - ! - IF (ns == 1) CALL tau_xc( length, rho(:,1), grho2(:,1), tau(:,1), ex, ec, & - v1x(:,1), v2x(:,1), v3x(:,1), v1c(:,1), v2c(1,:,1),& - v3c(:,1) ) - IF (ns == 2) CALL tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x,& - v1c, v2c, v3c ) - ! - DEALLOCATE( grho2 ) - ! + IF (ns == 1) THEN + ! + !$acc host_data use_device( sigma ) + CALL tau_xc( length, rho(:,1), sigma, tau(:,1), ex, ec, v1x(:,1), & + v2x(:,1), v3x(:,1), v1c(:,1), v2c, v3c(:,1) ) + !$acc end host_data + ! + ELSEIF (ns == 2) THEN + ! + CALL tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, & + v1c, v2c, v3c ) + ! + ENDIF ENDIF ! ! ... META EXCHANGE @@ -157,19 +251,31 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 ex_lxc = 0.d0 ENDIF ! - IF (.NOT. POLARIZED) THEN + !$acc data copyin( ex_lxc, vx_rho, vx_sigma, vx_tau ) + IF ( ns==1 ) THEN + !$acc parallel loop DO k = 1, length IF ( ABS(rho_lxc(k))<=rho_threshold_mgga .OR. & sigma(k)<=grho2_threshold_mgga .OR. & - ABS(tau_lxc(k))<=rho_threshold_mgga ) CYCLE + ABS(tau_lxc(k))<=rho_threshold_mgga ) THEN + ex(k) = 0.d0 ; v1x(k,1) = 0.d0 + v2x(k,1) = 0.d0 ; v3x(k,1) = 0.d0 + CYCLE + ENDIF ex(k) = ex_lxc(k) * rho_lxc(k) v1x(k,1) = vx_rho(k) v2x(k,1) = vx_sigma(k) * 2.0_DP v3x(k,1) = vx_tau(k) ENDDO ELSE + !$acc parallel loop DO k = 1, length - IF (rho_lxc(2*k-1)+rho_lxc(2*k) <= rho_threshold_mgga) CYCLE + IF (rho_lxc(2*k-1)+rho_lxc(2*k) <= rho_threshold_mgga) THEN + ex(k) = 0.d0 + v1x(k,1) = 0.d0 ; v2x(k,1) = 0.d0 ; v3x(k,1) = 0.d0 + v1x(k,2) = 0.d0 ; v2x(k,2) = 0.d0 ; v3x(k,2) = 0.d0 + CYCLE + ENDIF ex(k) = ex_lxc(k) * (rho_lxc(2*k-1)+rho_lxc(2*k)) IF ( ABS(rho_lxc(2*k-1))>rho_threshold_mgga .AND. & sigma(3*k-2)>grho2_threshold_mgga .AND. & @@ -177,6 +283,8 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 v1x(k,1) = vx_rho(2*k-1) v2x(k,1) = vx_sigma(3*k-2)*2.d0 v3x(k,1) = vx_tau(2*k-1) + ELSE + v1x(k,1) = 0.d0 ; v2x(k,1) = 0.d0 ; v3x(k,1) = 0.d0 ENDIF IF ( ABS(rho_lxc(2*k))>rho_threshold_mgga .AND. & sigma(3*k)>grho2_threshold_mgga .AND. & @@ -184,20 +292,14 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 v1x(k,2) = vx_rho(2*k) v2x(k,2) = vx_sigma(3*k)*2.d0 v3x(k,2) = vx_tau(2*k) + ELSE + v1x(k,2) = 0.d0 ; v2x(k,2) = 0.d0 ; v3x(k,2) = 0.d0 ENDIF ENDDO ENDIF ! - ! **Now this is done directly inside Libxc for SCAN0** - ! ... only for HK/MCA: SCAN0 (used in CPV) - !IF ( scan_exx ) THEN - ! IF (exx_started) THEN - ! ex = (1.0_DP - exx_fraction) * ex - ! v1x = (1.0_DP - exx_fraction) * v1x - ! v2x = (1.0_DP - exx_fraction) * v2x - ! v3x = (1.0_DP - exx_fraction) * v3x - ! ENDIF - !ENDIF + !$acc end data + DEALLOCATE( ex_lxc, vx_rho, vx_sigma, vx_tau ) ! ENDIF ! @@ -215,24 +317,40 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 ec_lxc = 0.d0 ENDIF ! - IF (.NOT. POLARIZED) THEN + !$acc data copyin( ec_lxc, vc_rho, vc_sigma, vc_tau ) + IF ( ns==1 ) THEN + !$acc parallel loop DO k = 1, length IF ( ABS(rho_lxc(k))<=rho_threshold_mgga .OR. & sigma(k)<=grho2_threshold_mgga .OR. & - ABS(tau_lxc(k))<=rho_threshold_mgga ) CYCLE + ABS(tau_lxc(k))<=rho_threshold_mgga ) THEN + ec(k) = 0.d0 ; v1c(k,1) = 0.d0 + v2c(1,k,1) = 0.d0 ; v3c(k,1) = 0.d0 + CYCLE + ENDIF ec(k) = ec_lxc(k) * rho_lxc(k) v1c(k,1) = vc_rho(k) v2c(1,k,1) = vc_sigma(k) * 2.0_DP v3c(k,1) = vc_tau(k) ENDDO ELSE + !$acc parallel loop DO k = 1, length rh = rho_lxc(2*k-1) + rho_lxc(2*k) atau = ABS(tau_lxc(2*k-1) + tau_lxc(2*k)) ggrho2 = (sigma(3*k-2) + sigma(3*k))*4.0_DP IF ( rh <= rho_threshold_mgga .OR. & ggrho2 <= grho2_threshold_mgga .OR. & - atau <= tau_threshold_mgga ) CYCLE + atau <= tau_threshold_mgga ) THEN + ec(k) = 0.d0 + v1c(k,1) = 0.d0 ; v3c(k,1) = 0.d0 + v1c(k,2) = 0.d0 ; v3c(k,2) = 0.d0 + DO ipol = 1, 3 + v2c(ipol,k,1) = 0.d0 + v2c(ipol,k,2) = 0.d0 + ENDDO + CYCLE + ENDIF ec(k) = ec_lxc(k) * (rho_lxc(2*k-1)+rho_lxc(2*k)) v1c(k,1) = vc_rho(2*k-1) v1c(k,2) = vc_rho(2*k) @@ -244,26 +362,33 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 v3c(k,2) = vc_tau(2*k) ENDDO ENDIF + !$acc end data ! + DEALLOCATE( ec_lxc, vc_rho, vc_sigma, vc_tau ) ENDIF ! + IF ( ANY(is_libxc(5:6)) ) DEALLOCATE( vlapl_rho ) + ! + !$acc end data DEALLOCATE( rho_lxc, sigma, tau_lxc, lapl_rho ) - DEALLOCATE( ex_lxc , ec_lxc ) - DEALLOCATE( vx_rho , vx_sigma, vx_tau ) - DEALLOCATE( vc_rho , vc_sigma, vc_tau, vlapl_rho ) ! #else ! ALLOCATE( grho2(length,ns) ) + !$acc data create( grho2 ) + !$acc host_data use_device( grho2 ) ! + !$acc parallel loop collapse(2) DO is = 1, ns - grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2 + DO k = 1, length + grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2 + ENDDO ENDDO ! IF (ns == 1) THEN ! CALL tau_xc( length, rho(:,1), grho2(:,1), tau(:,1), ex, ec, v1x(:,1), & - v2x(:,1), v3x(:,1), v1c(:,1), v2c(1,:,1), v3c(:,1) ) + v2x(:,1), v3x(:,1), v1c(:,1), v2c, v3c(:,1) ) ! ELSEIF (ns == 2) THEN ! @@ -272,10 +397,14 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1 ! ENDIF ! + !$acc end host_data + !$acc end data DEALLOCATE( grho2 ) ! #endif + ! + !$acc end data ! RETURN ! -END SUBROUTINE xc_metagcx +END SUBROUTINE xc_metagcx_ diff --git a/XClib/xclib_test.f90 b/XClib/xclib_test.f90 index cb0291d8d..bf41f048c 100644 --- a/XClib/xclib_test.f90 +++ b/XClib/xclib_test.f90 @@ -31,18 +31,19 @@ PROGRAM xclib_test ! !! See README.TEST file for more details. ! - USE kind_l, ONLY: DP - USE constants_l, ONLY: pi - USE xc_lib, ONLY: xclib_set_dft_from_name, xclib_set_exx_fraction, & - xclib_get_ID, xclib_reset_dft, xc_gcx, & - xclib_dft_is_libxc, xclib_init_libxc, & - xclib_finalize_libxc, xclib_set_finite_size_volume, & - xclib_set_auxiliary_flags, xclib_dft_is, start_exx + USE kind_l, ONLY: DP + USE constants_l, ONLY: pi + USE beef_interface, ONLY: beefsetmode + USE xc_lib, ONLY: xclib_set_dft_from_name, xclib_set_exx_fraction, & + xclib_get_ID, xclib_reset_dft, xc, xc_gcx, & + xc_metagcx, xclib_dft_is_libxc, xclib_init_libxc,& + xclib_finalize_libxc, xclib_set_finite_size_volume,& + xclib_set_auxiliary_flags, xclib_dft_is, start_exx USE xclib_utils_and_para !--xml - USE xmltools, ONLY: xml_open_file, xml_closefile,xmlr_readtag, & - xmlw_writetag, xmlw_opentag, xmlw_closetag, & - xmlr_opentag, xmlr_closetag, get_attr, add_attr + USE xmltools, ONLY: xml_open_file, xml_closefile,xmlr_readtag, & + xmlw_writetag, xmlw_opentag, xmlw_closetag, & + xmlr_opentag, xmlr_closetag, get_attr, add_attr #if defined(__LIBXC) #include "xc_version.h" USE xc_f03_lib_m @@ -379,6 +380,15 @@ PROGRAM xclib_test IF (nr==1) nrpe = mype*nnr IF (nr==0) nrpe = npoints-(npoints/npes)*(npes-mype-1) ! + ! ... openacc init (otherwise it offsets the wall time of the first test) + ! +#if defined(_OPENACC) + !$acc data create( time ) + !$acc end data +#endif + ! + ! ... capitalize input vars + ! IF (TRIM(polarization)=='UNPOLARIZED' ) THEN is_min = 1 ; is_max = 1 ELSEIF (TRIM(polarization)=='POLARIZED' ) THEN @@ -979,6 +989,9 @@ PROGRAM xclib_test ex1 = 0.d0 ; ec1 = 0.d0 ENDIF ! + IF ( dft(1:3)=='BEE' ) CALL beefsetmode(-1) !** beeforder can be manually + ! ! changed here for other checks + ! IF (mype==root) time(3) = get_wall_time() CALL xc_gcx( nnr, ns, rho, grho, exg1, ecg1, v1x1, v2x1, v1c1, & v2c1, v2c_ud1 ) @@ -1798,9 +1811,9 @@ PROGRAM xclib_test WRITE(test_output_exe(56:96), '(a)') TRIM(status) IF (status==passed .AND. show_time) THEN WRITE(test_output_exe(80:85), '(a)') 'time:' - WRITE(test_output_exe(86:92), '(F6.3)') time_tot2 + WRITE(test_output_exe(86:92), '(F6.3)') time_tot1 WRITE(test_output_exe(93:100), '(a)') 's incr:' - WRITE(test_output_exe(101:110), '(F8.3)') time_tot2/time_tot1*100.d0-100.d0 + WRITE(test_output_exe(101:110), '(F8.3)') time_tot1/time_tot2*100.d0-100.d0 WRITE(test_output_exe(111:111), '(a)') '%' ENDIF WRITE(stdout,*) test_output_exe diff --git a/atomic/src/vxcgc.f90 b/atomic/src/vxcgc.f90 index 293993ad8..1ccd55e35 100644 --- a/atomic/src/vxcgc.f90 +++ b/atomic/src/vxcgc.f90 @@ -13,7 +13,8 @@ subroutine vxc_t(lsd,rho,rhoc,exc,vxc) ! this function returns the XC potential and energy in LDA or ! LSDA approximation ! - use kinds, only : DP + use kinds, only : DP + use xc_lib, only : xc implicit none integer, intent(in) :: lsd ! 1 in the LSDA case, 0 otherwise real(DP), intent(in) :: rho(2), rhoc ! the system density @@ -77,7 +78,8 @@ subroutine vxcgc( ndm, mesh, nspin, r, r2, rho, rhoc, vgc, egc, & ! use kinds, only : DP use constants, only : fpi, e2 - use xc_lib, only : xclib_set_threshold, xclib_dft_is, xc_gcx + use xc_lib, only : xclib_set_threshold, xclib_dft_is, xc_gcx, & + xc_metagcx implicit none integer, intent(in) :: ndm,mesh,nspin,iflag real(DP), intent(in) :: r(mesh), r2(mesh), rho(ndm,2), rhoc(ndm) diff --git a/cmake/NVFortranCompiler.cmake b/cmake/NVFortranCompiler.cmake index 483930fc1..1e186c1e5 100644 --- a/cmake/NVFortranCompiler.cmake +++ b/cmake/NVFortranCompiler.cmake @@ -5,13 +5,6 @@ qe_add_global_compile_definitions(__PGI) # set optimization specific flags set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcache_align -Mlarge_arrays") -if(QE_ENABLE_OPENACC) - add_library(OpenACC::OpenACC_Fortran INTERFACE IMPORTED) - set_target_properties(OpenACC::OpenACC_Fortran PROPERTIES - INTERFACE_COMPILE_OPTIONS "-acc" - INTERFACE_LINK_OPTIONS "-acc") -endif() - if(QE_ENABLE_CUDA) if(CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 20.7) set(CUDA_FLAG "-cuda") diff --git a/install/configure b/install/configure index 2bc13c527..a17a20d90 100755 --- a/install/configure +++ b/install/configure @@ -4793,6 +4793,7 @@ $as_echo "$as_me: WARNING: Using legacy custom solver." >&2;} if test "$enable_openacc" == "yes"; then ldflags="$ldflags -acc" cuda_fflags="$cuda_fflags -acc" + CUDA_CFLAGS="$CUDA_CFLAGS -acc -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime" fi fi diff --git a/install/m4/x_ac_qe_cuda.m4 b/install/m4/x_ac_qe_cuda.m4 index ce8052eb4..44621a564 100644 --- a/install/m4/x_ac_qe_cuda.m4 +++ b/install/m4/x_ac_qe_cuda.m4 @@ -209,6 +209,7 @@ EOF if test "$enable_openacc" == "yes"; then ldflags="$ldflags -acc" cuda_fflags="$cuda_fflags -acc" + CUDA_CFLAGS="$CUDA_CFLAGS -acc -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime" fi fi diff --git a/install/make.inc.in b/install/make.inc.in index 076cd4c1e..dd29f714b 100644 --- a/install/make.inc.in +++ b/install/make.inc.in @@ -101,6 +101,9 @@ CUDA_RUNTIME=@gpu_runtime@ # CUDA F90 Flags CUDA_F90FLAGS=@cuda_fflags@ $(MOD_FLAG)$(TOPDIR)/external/devxlib/src +# CUDA C Flags +CUDA_CFLAGS=@CUDA_CFLAGS@ + # C preprocessor and preprocessing flags - for explicit preprocessing, # if needed (see the compilation rules above) # preprocessing flags must include DFLAGS and IFLAGS @@ -112,7 +115,7 @@ CPPFLAGS = @cppflags@ $(DFLAGS) $(IFLAGS) # C flags must include DFLAGS and IFLAGS # F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax -CFLAGS = @cflags@ $(DFLAGS) $(IFLAGS) +CFLAGS = @cflags@ $(DFLAGS) $(IFLAGS) $(CUDA_CFLAGS) F90FLAGS = @f90flags@ @pre_fdflags@$(FDFLAGS) $(CUDA_F90FLAGS) $(IFLAGS) $(MODFLAGS) # compiler flags with and without optimization for fortran-77