Merge branch 'Vxc_on_gpu2' into 'develop'

Vxc on gpu

See merge request QEF/q-e!1630
This commit is contained in:
Pietro Delugas 2021-11-20 16:34:19 +00:00
commit 5f9ebda198
37 changed files with 1946 additions and 999 deletions

View File

@ -306,11 +306,18 @@ endif(QE_ENABLE_OPENMP)
# OpenACC
# The following targets will be defined:
add_library(qe_openacc_fortran INTERFACE)
qe_install_targets(qe_openacc_fortran)
add_library(qe_openacc_c INTERFACE)
qe_install_targets(qe_openacc_fortran qe_openacc_c)
###########################################################
if(QE_ENABLE_OPENACC)
if(CMAKE_VERSION VERSION_LESS 3.16.0)
message(FATAL_ERROR "CMake versions >= 3.16 required for QE_ENABLE_OPENACC=ON!")
endif()
find_package(OpenACC REQUIRED Fortran C)
target_link_libraries(qe_openacc_fortran
INTERFACE OpenACC::OpenACC_Fortran)
target_link_libraries(qe_openacc_c
INTERFACE OpenACC::OpenACC_C)
endif(QE_ENABLE_OPENACC)
###########################################################

View File

@ -373,7 +373,7 @@
subroutine exch_corr_cp(nnr,nspin,grhor,rhor,etxc)
use kinds, only: DP
use xc_lib, only: xclib_dft_is, xclib_dft_is_libxc, xclib_get_id, &
xclib_set_threshold, xc_gcx
xclib_set_threshold, xc, xc_gcx
implicit none
integer, intent(in) :: nnr
integer, intent(in) :: nspin

View File

@ -11,7 +11,7 @@ SUBROUTINE tpssmeta(nnr, nspin,grho,rho,kedtau,etxc)
! ===================
!--------------------------------------------------------------------
use kinds, only: dp
use xc_lib, only: xclib_set_threshold
use xc_lib, only: xclib_set_threshold, xc_metagcx
!
IMPLICIT NONE
!

View File

@ -23,7 +23,7 @@ MODULE fft_helper_subroutines
#endif
END INTERFACE
PRIVATE
PUBLIC :: fftx_threed2oned, fftx_oned2threed
PUBLIC :: fftx_threed2oned, fftx_threed2oned_gpu, fftx_oned2threed
PUBLIC :: tg_reduce_rho
PUBLIC :: tg_get_nnr, tg_get_recip_inc, fftx_ntgrp, fftx_tgpe, &
tg_get_group_nr3
@ -469,6 +469,8 @@ CONTAINS
END SUBROUTINE
SUBROUTINE fftx_threed2oned( desc, vin, vout1, vout2 )
!! Copy charge density from 3D array to 1D array in Fourier
!! space.
USE fft_param
USE fft_types, ONLY : fft_type_descriptor
TYPE(fft_type_descriptor), INTENT(in) :: desc
@ -490,7 +492,40 @@ CONTAINS
END DO
END IF
END SUBROUTINE
SUBROUTINE fftx_threed2oned_gpu( desc, vin_d, vout1_d, vout2_d )
!! GPU version of \(\texttt{fftx_threed2oned}\).
USE fft_param
USE fft_types, ONLY : fft_type_descriptor
TYPE(fft_type_descriptor), INTENT(in) :: desc
complex(DP), INTENT(OUT) :: vout1_d(:)
complex(DP), OPTIONAL, INTENT(OUT) :: vout2_d(:)
complex(DP), INTENT(IN) :: vin_d(:)
INTEGER, POINTER :: nl_d(:), nlm_d(:)
COMPLEX(DP) :: fp, fm
INTEGER :: ig
#if defined(__CUDA) && defined(_OPENACC)
attributes(DEVICE) :: nl_d, nlm_d
!$acc data deviceptr( vout1_d(:), vout2_d(:), vin_d(:) )
nl_d => desc%nl_d
nlm_d => desc%nlm_d
IF( PRESENT( vout2_d ) ) THEN
!$acc parallel loop
DO ig=1,desc%ngm
fp=vin_d(nl_d(ig))+vin_d(nlm_d(ig))
fm=vin_d(nl_d(ig))-vin_d(nlm_d(ig))
vout1_d(ig) = CMPLX(0.5d0,0.d0,kind=DP)*CMPLX( DBLE(fp),AIMAG(fm),kind=DP)
vout2_d(ig) = CMPLX(0.5d0,0.d0,kind=DP)*CMPLX(AIMAG(fp),-DBLE(fm),kind=DP)
END DO
ELSE
!$acc parallel loop
DO ig=1,desc%ngm
vout1_d(ig) = vin_d(nl_d(ig))
END DO
END IF
!$acc end data
#endif
END SUBROUTINE
SUBROUTINE fftx_psi2c_gamma( desc, vin, vout1, vout2 )
USE fft_param

View File

@ -16,7 +16,7 @@ MODULE fft_rho
!
IMPLICIT NONE
PRIVATE
PUBLIC :: rho_r2g, rho_g2r
PUBLIC :: rho_r2g, rho_g2r, rho_r2g_gpu
!
INTERFACE rho_g2r
MODULE PROCEDURE rho_g2r_1, rho_g2r_2, rho_g2r_sum_components
@ -24,7 +24,10 @@ MODULE fft_rho
!
CONTAINS
!
!-------------------------------------------------------------
SUBROUTINE rho_r2g ( desc, rhor, rhog, v )
!------------------------------------------------------
!! Bring charge density rho from real to G- space
USE fft_types, ONLY: fft_type_descriptor
USE fft_helper_subroutines, ONLY: fftx_threed2oned
!
@ -93,6 +96,97 @@ CONTAINS
END SUBROUTINE rho_r2g
!
!-----------------------------------------------------------------
SUBROUTINE rho_r2g_gpu( desc, rhor_d, rhog_d, v_d )
!---------------------------------------------------------------
!! Bring charge density rho from real to G- space - GPU version.
!
USE fft_types, ONLY: fft_type_descriptor
USE fft_helper_subroutines, ONLY: fftx_threed2oned_gpu
!
TYPE(fft_type_descriptor), INTENT(in) :: desc
REAL(dp), INTENT(in) :: rhor_d(:,:)
!! rho in real space
COMPLEX(dp), INTENT(OUT):: rhog_d(:,:)
!! rho in G-space
REAL(dp), OPTIONAL, INTENT(in) :: v_d(:)
!
! ... local variables
!
INTEGER :: ir, ig, iss, isup, isdw
INTEGER :: nspin
COMPLEX(dp):: fp, fm
COMPLEX(dp), ALLOCATABLE :: psi_d(:)
!
!$acc data deviceptr( rhor_d(:,:), rhog_d(:,:), v_d(:) )
!
nspin= SIZE(rhor_d, 2)
!
ALLOCATE( psi_d(desc%nnr) )
!$acc data create( psi_d(desc%nnr) )
!$acc host_data use_device( psi_d )
IF( nspin == 1 ) THEN
iss=1
IF( PRESENT( v_d ) ) THEN
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss)+v_d(ir),0.0_dp,kind=dp)
END DO
ELSE
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss),0.0_dp,kind=dp)
END DO
END IF
CALL fwfft('Rho', psi_d, desc )
CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,iss) )
ELSE
IF ( gamma_only ) THEN
! nspin/2 = 1 for LSDA, = 2 for noncolinear
DO iss=1,nspin/2
isup=1+(iss-1)*nspin/2 ! 1 for LSDA, 1 and 3 for noncolinear
isdw=2+(iss-1)*nspin/2 ! 2 for LSDA, 2 and 4 for noncolinear
IF( PRESENT( v_d ) ) THEN
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,isup)+v_d(ir),rhor_d(ir,isdw)+v_d(ir),kind=dp)
END DO
ELSE
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,isup),rhor_d(ir,isdw),kind=dp)
END DO
END IF
CALL fwfft('Rho', psi_d, desc )
CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,isup), rhog_d(:,isdw) )
END DO
ELSE
DO iss=1,nspin
IF( PRESENT( v_d ) ) THEN
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss)+v_d(ir),0.0_dp,kind=dp)
END DO
ELSE
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss),0.0_dp,kind=dp)
END DO
END IF
CALL fwfft('Rho', psi_d, desc )
CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,iss) )
END DO
END IF
ENDIF
!$acc end host_data
!$acc end data
DEALLOCATE( psi_d )
!
!$acc end data
!
END SUBROUTINE rho_r2g_gpu
!
!
SUBROUTINE rho_g2r_1 ( desc, rhog, rhor )
USE fft_types, ONLY: fft_type_descriptor
USE fft_helper_subroutines, ONLY: fftx_threed2oned, fftx_oned2threed

View File

@ -264,7 +264,152 @@ SUBROUTINE fft_gradient_g2r( dfft, a, g, ga )
RETURN
!
END SUBROUTINE fft_gradient_g2r
!
!
!----------------------------------------------------------------------------
SUBROUTINE fft_gradient_g2r_gpu( dfft, a_d, g_d, ga_d )
!----------------------------------------------------------------------------
!! GPU double of \(\texttt{fft_gradient_g2r}\).
!
USE cell_base, ONLY : tpiba
USE kinds, ONLY : DP
USE fft_interfaces, ONLY : invfft
USE fft_types, ONLY : fft_type_descriptor
!
IMPLICIT NONE
!
TYPE(fft_type_descriptor),INTENT(IN) :: dfft
!! FFT descriptor
COMPLEX(DP), INTENT(IN) :: a_d(dfft%ngm)
!! a(G), a complex function in G-space
REAL(DP), INTENT(IN) :: g_d(3,dfft%ngm)
!! G-vectors, in \( 2\pi/a \) units
REAL(DP), INTENT(OUT) :: ga_d(3,dfft%nnr)
!! The gradient of \({\bf a}\), real, on the real-space FFT grid
!
! ... local variables
!
INTEGER :: ipol, n, ip
INTEGER, POINTER :: nl_d(:), nlm_d(:)
COMPLEX(DP), ALLOCATABLE :: gaux_d(:)
!
#if defined(__CUDA) && defined(_OPENACC)
attributes(DEVICE) :: g_d, nl_d, nlm_d
!
!$acc data deviceptr( a_d(dfft%ngm), ga_d(3,dfft%nnr) )
!
nl_d => dfft%nl_d
nlm_d => dfft%nlm_d
!
ALLOCATE( gaux_d(dfft%nnr) )
!$acc data create( gaux_d )
!$acc host_data use_device( gaux_d )
!
IF ( dfft%lgamma ) THEN
!
! ... Gamma tricks: perform 2 FFT's in a single shot
! x and y
ipol = 1
!$acc parallel loop
DO n = 1, dfft%nnr
DO ip = 1, 3
ga_d(ip,n) = 0.D0
ENDDO
gaux_d(n) = (0.0_dp,0.0_dp)
ENDDO
!
! ... multiply a(G) by iG to get the gradient in real space
!
!$acc parallel loop
DO n = 1, dfft%ngm
gaux_d(nl_d(n) ) = CMPLX( 0.0_dp, g_d(ipol,n),kind=DP)* a_d(n) - &
CMPLX(g_d(ipol+1,n),kind=DP) * a_d(n)
gaux_d(nlm_d(n)) = CMPLX( 0.0_dp,-g_d(ipol,n),kind=DP)*CONJG(a_d(n)) + &
CMPLX(g_d(ipol+1,n),kind=DP) * CONJG(a_d(n))
ENDDO
!
! ... bring back to R-space, (\grad_ipol a)(r) ...
!
CALL invfft( 'Rho', gaux_d, dfft )
!
! ... bring back to R-space, (\grad_ipol a)(r)
! ... add the factor 2\pi/a missing in the definition of q+G
!
!$acc parallel loop
DO n = 1, dfft%nnr
ga_d(ipol, n) = REAL( gaux_d(n), kind=DP ) * tpiba
ga_d(ipol+1, n) = AIMAG( gaux_d(n) ) * tpiba
ENDDO
! ... for z
ipol = 3
!$acc parallel loop
DO n = 1, dfft%nnr
gaux_d(n) = (0.0_dp,0.0_dp)
ENDDO
!
! ... multiply a(G) by iG to get the gradient in real space
!
!$acc parallel loop
DO n = 1, dfft%ngm
gaux_d(nl_d(n)) = CMPLX(g_d(ipol,n),kind=DP) * CMPLX( -AIMAG(a_d(n)), REAL(a_d(n)),kind=DP)
gaux_d(nlm_d(n)) = CONJG( gaux_d(nl_d(n)) )
ENDDO
!
! ... bring back to R-space, (\grad_ipol a)(r) ...
!
CALL invfft( 'Rho', gaux_d, dfft )
!
! ...and add the factor 2\pi/a missing in the definition of G
!
!$acc parallel loop
DO n = 1, dfft%nnr
ga_d(ipol,n) = tpiba * REAL( gaux_d(n), kind=DP )
ENDDO
!
ELSE
!
DO ipol = 1, 3
!
!$acc parallel loop
DO n = 1, dfft%nnr
ga_d(ipol,n) = 0.D0
gaux_d(n) = (0.0_dp,0.0_dp)
ENDDO
!
! ... multiply a(G) by iG to get the gradient in real space
!
!$acc parallel loop
DO n = 1, dfft%ngm
gaux_d(nl_d(n)) = CMPLX(g_d(ipol,n), kind=DP) * CMPLX( -AIMAG(a_d(n)), REAL(a_d(n)), kind=DP)
ENDDO
!
! ... bring back to R-space, (\grad_ipol a)(r) ...
!
CALL invfft( 'Rho', gaux_d, dfft )
!
! ...and add the factor 2\pi/a missing in the definition of G
!
!$acc parallel loop
DO n = 1, dfft%nnr
ga_d(ipol,n) = tpiba * REAL( gaux_d(n), kind=DP )
ENDDO
!
ENDDO
!
ENDIF
!
!$acc end host_data
!$acc end data
DEALLOCATE( gaux_d )
!
!$acc end data
!
#endif
!
RETURN
!
END SUBROUTINE fft_gradient_g2r_gpu
!
!----------------------------------------------------------------------------
SUBROUTINE fft_graddot( dfft, a, g, da )
!----------------------------------------------------------------------------
@ -373,7 +518,152 @@ SUBROUTINE fft_graddot( dfft, a, g, da )
RETURN
!
END SUBROUTINE fft_graddot
!
!
!----------------------------------------------------------------------------
SUBROUTINE fft_graddot_gpu( dfft, a_d, g_d, da_d )
!---------------------------------------------------------------------------
!! Calculates \( da = \sum_i \nabla_i a_i \) in R-space.
!
USE cell_base, ONLY : tpiba
USE kinds, ONLY : DP
USE fft_interfaces, ONLY : fwfft, invfft
USE fft_types, ONLY : fft_type_descriptor
!
IMPLICIT NONE
!
TYPE(fft_type_descriptor),INTENT(IN) :: dfft
!! FFT descriptor
REAL(DP), INTENT(IN) :: a_d(3,dfft%nnr)
!! A real function on the real-space FFT grid
REAL(DP), INTENT(IN) :: g_d(3,dfft%ngm)
!! G-vectors, in \( 2\pi/a \) units
REAL(DP), INTENT(OUT) :: da_d(dfft%nnr)
!! \( \sum_i \nabla_i a_i \), real, on the real-space FFT grid
!
! ... local variables
!
INTEGER :: n, ipol
INTEGER, POINTER :: nl_d(:), nlm_d(:)
COMPLEX(DP), ALLOCATABLE :: aux_d(:), gaux_d(:)
COMPLEX(DP) :: fp, fm, aux1, aux2
!
#if defined(__CUDA) && defined(_OPENACC)
attributes(DEVICE) :: g_d, nl_d, nlm_d
!
!$acc data deviceptr( a_d(3,dfft%ngm), da_d(dfft%nnr) )
!
nl_d => dfft%nl_d
nlm_d => dfft%nlm_d
!
ALLOCATE( aux_d(dfft%nnr), gaux_d(dfft%nnr) )
!$acc data create( aux_d, gaux_d )
!$acc host_data use_device( aux_d, gaux_d )
!
!$acc parallel loop
DO n = 1, dfft%nnr
gaux_d(n) = (0.0_dp,0.0_dp)
ENDDO
!
IF ( dfft%lgamma ) THEN
!
! Gamma tricks: perform 2 FFT's in a single shot
! x and y
ipol = 1
!
!$acc parallel loop
DO n = 1, dfft%nnr
aux_d(n) = CMPLX( a_d(ipol,n), a_d(ipol+1,n), kind=DP)
ENDDO
!
! ... bring a(ipol,r) to G-space, a(G) ...
!
CALL fwfft( 'Rho', aux_d, dfft )
!
! ... multiply by iG to get the gradient in G-space
!
!$acc parallel loop
DO n = 1, dfft%ngm
fp = (aux_d(nl_d(n)) + aux_d(nlm_d(n)))*0.5_dp
fm = (aux_d(nl_d(n)) - aux_d(nlm_d(n)))*0.5_dp
aux1 = CMPLX( REAL(fp), AIMAG(fm), kind=DP)
aux2 = CMPLX(AIMAG(fp), -REAL(fm), kind=DP)
gaux_d(nl_d(n)) = &
CMPLX(0.0_dp, g_d(ipol ,n),kind=DP) * aux1 + &
CMPLX(0.0_dp, g_d(ipol+1,n),kind=DP) * aux2
ENDDO
! ... for z
ipol = 3
!$acc parallel loop
DO n = 1, dfft%nnr
aux_d(n) = CMPLX( a_d(ipol,n), 0.0_dp, kind=DP)
ENDDO
!
! ... bring a(ipol,r) to G-space, a(G) ...
!
CALL fwfft( 'Rho', aux_d, dfft )
!
! ... multiply by iG to get the gradient in G-space
! ... fill both gaux(G) and gaux(-G) = gaux*(G)
!
!$acc parallel loop
DO n = 1, dfft%ngm
gaux_d(nl_d(n)) = gaux_d(nl_d(n)) + CMPLX(g_d(ipol,n),kind=DP) * &
CMPLX( -AIMAG( aux_d(nl_d(n)) ), &
REAL( aux_d(nl_d(n)) ), kind=DP)
gaux_d(nlm_d(n)) = CONJG( gaux_d(nl_d(n)) )
ENDDO
!
ELSE
!
DO ipol = 1, 3
!
!$acc parallel loop
DO n = 1, dfft%nnr
aux_d(n) = CMPLX( a_d(ipol,n), 0.0_dp, kind=DP)
ENDDO
!
! ... bring a(ipol,r) to G-space, a(G) ...
!
CALL fwfft( 'Rho', aux_d, dfft )
!
! ... multiply by iG to get the gradient in G-space
!
!$acc parallel loop
DO n = 1, dfft%ngm
gaux_d(nl_d(n)) = gaux_d(nl_d(n)) + CMPLX(g_d(ipol,n),kind=DP) * &
CMPLX( -AIMAG( aux_d(nl_d(n)) ), &
REAL( aux_d(nl_d(n)),kind=DP ), kind=DP)
ENDDO
!
ENDDO
!
ENDIF
!
! ... bring back to R-space, (\grad_ipol a)(r) ...
!
CALL invfft( 'Rho', gaux_d, dfft )
!
! ... add the factor 2\pi/a missing in the definition of G and sum
!
!$acc parallel loop
DO n = 1, dfft%nnr
da_d(n) = tpiba * REAL( gaux_d(n), kind=DP )
ENDDO
!
!$acc end host_data
!$acc end data
DEALLOCATE( aux_d, gaux_d )
!
!$acc end data
!
#endif
!
RETURN
!
END SUBROUTINE fft_graddot_gpu
!
!
!--------------------------------------------------------------------
SUBROUTINE fft_qgraddot ( dfft, a, xq, g, da)
!--------------------------------------------------------------------

View File

@ -55,7 +55,7 @@ PROGRAM do_ppacf
USE funct, ONLY : dft_is_nonlocc, nlc, enforce_input_dft
USE xc_lib, ONLY : xclib_get_id, xclib_set_auxiliary_flags, &
xclib_set_exx_fraction, xclib_dft_is_libxc, &
xclib_set_threshold, xc_gcx
xclib_set_threshold, xc, xc_gcx
USE wvfct, ONLY : npw, npwx
USE environment, ONLY : environment_start, environment_end
USE vdW_DF, ONLY : Nqs, vdW_DF_potential, vdW_DF_energy, vdW_DF_analysis

View File

@ -53,3 +53,60 @@
END IF
RETURN
END SUBROUTINE compute_rho
!
!
!------------------------------------------------------------------------------
SUBROUTINE compute_rho_gpu( rho_d, rhoout_d, segni_d, nrxx )
!--------------------------------------------------------------------------
!! GPU double of \(\texttt{compute_rho}\).
!
USE kinds, ONLY : DP
USE noncollin_module, ONLY : lsign, ux
!
IMPLICIT NONE
!
INTEGER :: nrxx
!! input: the dimension of the mesh
REAL(DP), INTENT(IN) :: rho_d(nrxx,4)
!! the four components of the charge
REAL(DP), INTENT(OUT) :: rhoout_d(nrxx,2)
!! the spin up and spin down charge
REAL(DP), INTENT(OUT) :: segni_d(nrxx)
!! the orientation when needed
!
! ... local variables
!
REAL(DP) :: ux1, ux2, ux3, amag
INTEGER :: ir ! counter on mesh points
!
!$acc data deviceptr( rho_d(nrxx,4), rhoout_d(nrxx,2), segni_d(nrxx) )
!
IF (lsign) THEN
!
ux1 = ux(1) ; ux2 = ux(2) ; ux3 = ux(3)
!
!$acc parallel loop
DO ir = 1, nrxx
segni_d(ir) = SIGN(1.0_DP,rho_d(ir,2)*ux1+rho_d(ir,3)*ux2+rho_d(ir,4)*ux3)
amag = SQRT(rho_d(ir,2)**2+rho_d(ir,3)**2+rho_d(ir,4)**2)
rhoout_d(ir,1) = 0.5d0*(rho_d(ir,1)+segni_d(ir)*amag)
rhoout_d(ir,2) = 0.5d0*(rho_d(ir,1)-segni_d(ir)*amag)
ENDDO
!
ELSE
!
!$acc parallel loop
DO ir = 1, nrxx
segni_d(ir) = 1.0_DP
amag = SQRT(rho_d(ir,2)**2+rho_d(ir,3)**2+rho_d(ir,4)**2)
rhoout_d(ir,1) = 0.5d0*(rho_d(ir,1) + amag)
rhoout_d(ir,2) = 0.5d0*(rho_d(ir,1) - amag)
ENDDO
!
ENDIF
!
!$acc end data
!
RETURN
!
END SUBROUTINE compute_rho_gpu

View File

@ -13,14 +13,15 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
!
USE constants, ONLY : e2
USE kinds, ONLY : DP
USE gvect, ONLY : ngm, g
USE gvect, ONLY : ngm, g, g_d
USE lsda_mod, ONLY : nspin
USE cell_base, ONLY : omega
USE xc_lib, ONLY : igcc_is_lyp, xclib_dft_is, xc_gcx
USE noncollin_module, ONLY : domag
USE fft_base, ONLY : dfftp
USE fft_interfaces, ONLY : fwfft
USE fft_rho, ONLY: rho_r2g
USE fft_rho, ONLY : rho_r2g, rho_r2g_gpu
USE control_flags, ONLY : use_gpu
!
IMPLICIT NONE
!
@ -39,15 +40,15 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:), v2c_ud(:)
REAL(DP) :: sx(dfftp%nnr), sc(dfftp%nnr)
!
REAL(DP) :: sgn(2), etxcgc, vtxcgc, fac, amag
!
REAL(DP) :: sgn_is, etxcgc, vtxcgc, fac, amag
REAL(DP) :: grup, grdw
!
REAL(DP), PARAMETER :: epsr = 1.D-6, epsg = 1.D-10
!
!
IF ( .NOT. xclib_dft_is('gradient') ) RETURN
!
!$acc data deviceptr( rho(dfftp%nnr,nspin), rho_core(dfftp%nnr), &
!$acc& rhog(ngm,nspin), rhog_core(ngm), v(dfftp%nnr,nspin) )
!
etxcgc = 0.0_DP
vtxcgc = 0.0_DP
!
@ -55,77 +56,111 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
IF ( nspin==4 ) nspin0 = 1
IF ( nspin==4 .AND. domag ) nspin0 = 2
fac = 1.0_DP / DBLE( nspin0 )
sgn(1) = 1._DP ; sgn(2) = -1._DP
!
ALLOCATE( h(3,dfftp%nnr,nspin0) )
ALLOCATE( grho(3,dfftp%nnr,nspin0) )
ALLOCATE( rhoaux(dfftp%nnr,nspin0) )
!
ALLOCATE( v1x(dfftp%nnr,nspin0), v2x(dfftp%nnr,nspin0) )
ALLOCATE( v1c(dfftp%nnr,nspin0), v2c(dfftp%nnr,nspin0) )
!$acc data create( rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, h )
!
IF ( nspin==4 .AND. domag ) THEN
ALLOCATE( vgg(dfftp%nnr,nspin0) )
ALLOCATE( vsave(dfftp%nnr,nspin) )
ALLOCATE( segni(dfftp%nnr) )
vsave=v
v=0._DP
ENDIF
!
ALLOCATE( rhogaux( ngm, nspin0 ) )
ALLOCATE( rhogaux(ngm,nspin0) )
!$acc data create( rhogaux )
!
! ... calculate the gradient of rho + rho_core in real space
!
IF ( nspin == 4 .AND. domag ) THEN
!
CALL compute_rho( rho, rhoaux, segni, dfftp%nnr )
!
! ... bring starting rhoaux to G-space
!
CALL rho_r2g ( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
!
ALLOCATE( vsave(dfftp%nnr,nspin) )
ALLOCATE( segni(dfftp%nnr) )
!$acc data copyout( segni, vsave )
!
!$acc parallel loop collapse(2)
DO is = 1, nspin
DO ir = 1, dfftp%nnr
vsave(ir,is) = v(ir,is)
v(ir,is) = 0._DP
ENDDO
ENDDO
!
! ... bring starting rhoaux to G-space
IF ( use_gpu ) THEN
!$acc host_data use_device( rhoaux, rhogaux, segni )
CALL compute_rho_gpu( rho, rhoaux, segni, dfftp%nnr )
CALL rho_r2g_gpu( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
!$acc end host_data
ELSE
CALL compute_rho( rho, rhoaux, segni, dfftp%nnr )
CALL rho_r2g( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
ENDIF
!$acc end data
ELSE
!
! ... for convenience rhoaux and rhogaux are in (up,down) format, when LSDA
!
DO is = 1, nspin0
rhoaux(:,is) = ( rho(:,1) + sgn(is) * rho(:,nspin0) ) * 0.5_DP
rhogaux(:,is) = ( rhog(:,1) + sgn(is) * rhog(:,nspin0) ) * 0.5_DP
ENDDO
!
! ... for convenience rhoaux and rhogaux are in (up,down) format, when LSDA
!
!$acc parallel loop collapse(2)
DO is = 1, nspin0
DO ir = 1, dfftp%nnr
sgn_is = DBLE(3-2*is)
rhoaux(ir,is) = ( rho(ir,1) + sgn_is * rho(ir,nspin0) ) * 0.5_DP
ENDDO
ENDDO
!$acc parallel loop collapse(2)
DO is = 1, nspin0
DO ir = 1, ngm
sgn_is = DBLE(3-2*is)
rhogaux(ir,is) = ( rhog(ir,1) + CMPLX(sgn_is) * rhog(ir,nspin0) ) &
* (0.5_DP,0._DP)
ENDDO
ENDDO
!
ENDIF
!
!$acc parallel loop collapse(2)
DO is = 1, nspin0
!
rhoaux(:,is) = fac * rho_core(:) + rhoaux(:,is)
rhogaux(:,is) = fac * rhog_core(:) + rhogaux(:,is)
!
CALL fft_gradient_g2r( dfftp, rhogaux(1,is), g, grho(1,1,is) )
!
DO ir = 1, dfftp%nnr
rhoaux(ir,is) = fac * rho_core(ir) + rhoaux(ir,is)
ENDDO
ENDDO
!$acc parallel loop collapse(2)
DO is = 1, nspin0
DO ir = 1, ngm
rhogaux(ir,is) = CMPLX(fac,kind=DP) * rhog_core(ir) + rhogaux(ir,is)
ENDDO
ENDDO
!
DEALLOCATE( rhogaux )
DO is = 1, nspin0
IF ( use_gpu ) THEN
!$acc host_data use_device( rhogaux, grho )
CALL fft_gradient_g2r_gpu( dfftp, rhogaux(:,is), g_d, grho(:,:,is) )
!$acc end host_data
ELSE
CALL fft_gradient_g2r( dfftp, rhogaux(:,is), g, grho(:,:,is) )
ENDIF
ENDDO
!
!$acc end data
DEALLOCATE( rhogaux )
!
IF ( nspin0 == 1 ) THEN
!
! ... This is the spin-unpolarised case
!
CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c )
!$acc host_data use_device( rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c )
CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, &
gpu_args_=.TRUE. )
!$acc end host_data
!
!$acc parallel loop reduction(+:etxcgc) reduction(+:vtxcgc)
DO k = 1, dfftp%nnr
!
! ... first term of the gradient correction : D(rho*Exc)/D(rho)
v(k,1) = v(k,1) + e2 * ( v1x(k,1) + v1c(k,1) )
!
! ... h contains: D(rho*Exc) / D(|grad rho|) * (grad rho) / |grad rho|
h(:,k,1) = e2 * ( v2x(k,1) + v2c(k,1) ) * grho(:,k,1)
DO ipol = 1, 3
h(ipol,k,1) = e2 * ( v2x(k,1) + v2c(k,1) ) * grho(ipol,k,1)
ENDDO
!
vtxcgc = vtxcgc + e2 * ( v1x(k,1) + v1c(k,1) ) * &
(rhoaux(k,1) - rho_core(k) )
!
( rhoaux(k,1) - rho_core(k) )
etxcgc = etxcgc + e2 * ( sx(k) + sc(k) )
!
ENDDO
!
ELSE
@ -133,71 +168,84 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
! ... spin-polarised case
!
ALLOCATE( v2c_ud(dfftp%nnr) )
!
!
CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud )
!$acc data create( v2c_ud )
!
!$acc host_data use_device( rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud )
CALL xc_gcx( dfftp%nnr, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, &
v2c_ud, gpu_args_=.TRUE. )
!$acc end host_data
!
! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
!
!$acc parallel loop reduction(+:etxcgc) reduction(+:vtxcgc)
DO k = 1, dfftp%nnr
v(k,1:nspin0) = v(k,1:nspin0) + e2*( v1x(k,1:nspin0) + v1c(k,1:nspin0))
!
DO is = 1, nspin0
v(k,is) = v(k,is) + e2*(v1x(k,is) + v1c(k,is))
ENDDO
!
DO ipol = 1, 3
!
grup = grho(ipol,k,1)
grdw = grho(ipol,k,2)
h(ipol,k,1) = e2*( (v2x(k,1) + v2c(k,1))*grup + v2c_ud(k)*grdw )
h(ipol,k,2) = e2*( (v2x(k,2) + v2c(k,2))*grdw + v2c_ud(k)*grup )
!
ENDDO
!
vtxcgc = vtxcgc + &
e2 * ( v1x(k,1) + v1c(k,1) ) * ( rhoaux(k,1) - rho_core(k) * fac )
vtxcgc = vtxcgc + &
e2 * ( v1x(k,2) + v1c(k,2) ) * ( rhoaux(k,2) - rho_core(k) * fac )
vtxcgc = vtxcgc + e2 * ( &
( v1x(k,1) + v1c(k,1) ) * ( rhoaux(k,1) - rho_core(k) * fac ) + &
( v1x(k,2) + v1c(k,2) ) * ( rhoaux(k,2) - rho_core(k) * fac ) )
etxcgc = etxcgc + e2 * ( sx(k) + sc(k) )
!
ENDDO
!
!$acc end data
DEALLOCATE( v2c_ud )
!
ENDIF
!
!
!$acc parallel loop collapse(2)
DO is = 1, nspin0
!
rhoaux(:,is) = rhoaux(:,is) - fac * rho_core(:)
!
END DO
DO k = 1, dfftp%nnr
rhoaux(k,is) = rhoaux(k,is) - fac * rho_core(k)
ENDDO
ENDDO
!
DEALLOCATE( grho )
DEALLOCATE( v1x, v2x )
DEALLOCATE( v1c, v2c )
!
ALLOCATE( dh(dfftp%nnr) )
ALLOCATE( dh(dfftp%nnr) )
!$acc data create( dh )
!
! ... second term of the gradient correction :
! ... \sum_alpha (D / D r_alpha) ( D(rho*Exc)/D(grad_alpha rho) )
!
!$acc host_data use_device( rhoaux, dh, h )
DO is = 1, nspin0
!
CALL fft_graddot( dfftp, h(1,1,is), g, dh )
!
v(:,is) = v(:,is) - dh(:)
!
vtxcgc = vtxcgc - SUM( dh(:) * rhoaux(:,is) )
!
END DO
IF ( use_gpu ) CALL fft_graddot_gpu( dfftp, h(1,1,is), g_d, dh )
IF ( .NOT. use_gpu ) CALL fft_graddot( dfftp, h(1,1,is), g, dh )
!$acc parallel loop
DO k = 1, dfftp%nnr
v(k,is) = v(k,is) - dh(k)
vtxcgc = vtxcgc - dh(k) * rhoaux(k,is)
ENDDO
ENDDO
!$acc end host_data
!$acc end data
!
vtxc = vtxc + omega * vtxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 )
etxc = etxc + omega * etxcgc / ( dfftp%nr1 * dfftp%nr2 * dfftp%nr3 )
!
IF (nspin==4 .AND. domag) THEN
DO is = 1, nspin0
vgg(:,is) = v(:,is)
ALLOCATE( vgg(dfftp%nnr,nspin0) )
!$acc data copyin( segni, vsave ) create( vgg )
!$acc host_data use_device( segni, vgg, vsave )
!
!$acc parallel loop collapse(2)
DO is = 1, nspin
DO ir = 1, dfftp%nnr
IF (is<=nspin0) vgg(ir,is) = v(ir,is)
v(ir,is) = vsave(ir,is)
ENDDO
ENDDO
!
v = vsave
!$acc parallel loop
DO k = 1, dfftp%nnr
v(k,1) = v(k,1) + 0.5d0*(vgg(k,1)+vgg(k,2))
amag = SQRT(rho(k,2)**2+rho(k,3)**2+rho(k,4)**2)
@ -207,17 +255,21 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
v(k,4) = v(k,4) + segni(k)*0.5d0*(vgg(k,1)-vgg(k,2))*rho(k,4)/amag
ENDIF
ENDDO
ENDIF
!
DEALLOCATE( dh )
DEALLOCATE( h )
DEALLOCATE( rhoaux )
IF (nspin==4 .AND. domag) THEN
DEALLOCATE( vgg )
DEALLOCATE( vsave )
!
!$acc end host_data
!$acc end data
DEALLOCATE( segni )
DEALLOCATE( vgg, vsave )
ENDIF
!
!$acc end data
!
DEALLOCATE( rhoaux, grho )
DEALLOCATE( v1x, v2x )
DEALLOCATE( v1c, v2c )
DEALLOCATE( dh, h )
!
!$acc end data
!
RETURN
!

View File

@ -429,7 +429,7 @@ MODULE paw_onecenter
USE constants, ONLY : e2, eps12
USE lsda_mod, ONLY : nspin
USE atom, ONLY : g => rgrid
USE xc_lib, ONLY : xclib_dft_is
USE xc_lib, ONLY : xclib_dft_is, xc
USE constants, ONLY : fpi ! REMOVE
!
TYPE(paw_info), INTENT(IN) :: i

View File

@ -12,7 +12,7 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, &
!----------------------------------------------------------------------------
!
USE kinds, ONLY: DP
USE xc_lib, ONLY: xclib_dft_is, xclib_get_id, xc_gcx
USE xc_lib, ONLY: xclib_dft_is, xclib_get_id, xc_gcx, xc_metagcx
USE noncollin_module, ONLY: domag
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_sum

View File

@ -63,7 +63,7 @@ SUBROUTINE v_of_rho( rho, rho_core, rhog_core, &
!
! ... calculate exchange-correlation potential
!
IF (xclib_dft_is('meta')) then
IF (xclib_dft_is('meta')) THEN
CALL v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v%of_r, v%kin_r )
ELSE
CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, v%of_r )
@ -154,13 +154,15 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
USE constants, ONLY : e2, eps8
USE io_global, ONLY : stdout
USE fft_base, ONLY : dfftp
USE gvect, ONLY : g, ngm
USE gvect, ONLY : g, g_d, ngm
USE lsda_mod, ONLY : nspin
USE cell_base, ONLY : omega
USE funct, ONLY : dft_is_nonlocc, nlc
USE scf, ONLY : scf_type, rhoz_or_updw
USE scf, ONLY : scf_type
USE xc_lib, ONLY : xc_metagcx
USE mp, ONLY : mp_sum
USE mp_bands, ONLY : intra_bgrp_comm
USE control_flags, ONLY : use_gpu
!
IMPLICIT NONE
!
@ -181,20 +183,19 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
!
! ... local variables
!
REAL(DP) :: zeta, rh, sgn(2)
REAL(DP) :: zeta, rh, sgn_is
INTEGER :: k, ipol, is, np
!
REAL(DP), ALLOCATABLE :: ex(:), ec(:)
REAL(DP), ALLOCATABLE :: v1x(:,:), v2x(:,:), v3x(:,:)
REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:,:), v3c(:,:)
!
REAL(DP) :: fac
REAL(DP), DIMENSION(2) :: grho2, rhoneg
REAL(DP) :: fac, rhoneg1, rhoneg2
REAL(DP), DIMENSION(2) :: grho2
REAL(DP), DIMENSION(3) :: grhoup, grhodw
!
REAL(DP), ALLOCATABLE :: grho(:,:,:), h(:,:,:), dh(:)
REAL(DP), ALLOCATABLE :: rhoout(:)
REAL(DP), ALLOCATABLE :: h(:,:,:), dh(:)
REAL(DP), ALLOCATABLE :: rho_updw(:,:), grho(:,:,:), tau(:,:)
COMPLEX(DP), ALLOCATABLE :: rhogsum(:)
REAL(DP), PARAMETER :: eps12 = 1.0d-12, zero=0._dp
!
@ -203,15 +204,17 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
etxc = zero
vtxc = zero
v(:,:) = zero
rhoneg(:) = zero
sgn(1) = 1._dp ; sgn(2) = -1._dp
rhoneg1 = zero ; rhoneg2 = zero
fac = 1.D0 / DBLE( nspin )
np = 1
IF (nspin==2) np=3
!
!$acc data copyin( rho ) copyout( kedtaur, v )
!
ALLOCATE( grho(3,dfftp%nnr,nspin) )
ALLOCATE( h(3,dfftp%nnr,nspin) )
ALLOCATE( rhogsum(ngm) )
ALLOCATE( rhogsum(ngm), tau(dfftp%nnr,nspin) )
!$acc data create( tau, grho, h )
!
ALLOCATE( ex(dfftp%nnr), ec(dfftp%nnr) )
ALLOCATE( v1x(dfftp%nnr,nspin), v2x(dfftp%nnr,nspin) , v3x(dfftp%nnr,nspin) )
@ -220,104 +223,158 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
! ... calculate the gradient of rho + rho_core in real space
! ... in LSDA case rhogsum is in (up,down) format
!
!$acc data create( rhogsum ) copyin( rhog_core, rho%of_g, rho%kin_r )
DO is = 1, nspin
!
rhogsum(:) = fac*rhog_core(:) + ( rho%of_g(:,1) + sgn(is)*rho%of_g(:,nspin) )*0.5D0
sgn_is = (-1.d0)**(is+1)
!
CALL fft_gradient_g2r( dfftp, rhogsum, g, grho(1,1,is) )
!$acc parallel loop present(rho)
DO k = 1, ngm
rhogsum(k) = fac*rhog_core(k) + ( rho%of_g(k,1) + sgn_is*rho%of_g(k,nspin) )*0.5D0
ENDDO
!
IF ( use_gpu ) THEN
!$acc host_data use_device( rhogsum, grho )
CALL fft_gradient_g2r_gpu( dfftp, rhogsum, g_d, grho(:,:,is) )
!$acc end host_data
ELSE
CALL fft_gradient_g2r( dfftp, rhogsum, g, grho(:,:,is) )
ENDIF
!
ENDDO
DEALLOCATE(rhogsum)
!
!$acc parallel loop collapse(2) present(rho)
DO is = 1, nspin
DO k = 1, dfftp%nnr
tau(k,is) = rho%kin_r(k,is)/e2
ENDDO
ENDDO
!
!$acc end data
DEALLOCATE( rhogsum )
!
!$acc data copyin( rho%of_r )
!$acc data create( ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
IF (nspin == 1) THEN
!
CALL xc_metagcx( dfftp%nnr, 1, np, rho%of_r, grho, rho%kin_r/e2, ex, ec, &
v1x, v2x, v3x, v1c, v2c, v3c )
!$acc host_data use_device( rho%of_r, grho, tau, ex, ec, &
!$acc& v1x, v2x, v3x, v1c, v2c, v3c )
CALL xc_metagcx( dfftp%nnr, 1, np, rho%of_r, grho, tau, ex, ec, &
v1x, v2x, v3x, v1c, v2c, v3c, gpu_args_=.TRUE. )
!$acc end host_data
!
!$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) &
!$acc& reduction(-:rhoneg2) present(rho)
DO k = 1, dfftp%nnr
!
v(k,1) = (v1x(k,1)+v1c(k,1)) * e2
!
! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
h(:,k,1) = (v2x(k,1)+v2c(1,k,1)) * grho(:,k,1) * e2
! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
DO ipol = 1, 3
h(ipol,k,1) = (v2x(k,1)+v2c(1,k,1)) * grho(ipol,k,1) * e2
ENDDO
!
kedtaur(k,1) = (v3x(k,1)+v3c(k,1)) * 0.5d0 * e2
!
etxc = etxc + (ex(k)+ec(k)) * e2
vtxc = vtxc + (v1x(k,1)+v1c(k,1)) * e2 * ABS(rho%of_r(k,1))
!
IF (rho%of_r(k,1) < zero) rhoneg(1) = rhoneg(1)-rho%of_r(k,1)
IF (rho%of_r(k,1) < zero) rhoneg1 = rhoneg1-rho%of_r(k,1)
!
ENDDO
!
ELSE
!
CALL rhoz_or_updw( rho, 'only_r', '->updw' )
ALLOCATE( rho_updw(dfftp%nnr,2) )
!$acc data create( rho_updw )
!
CALL xc_metagcx( dfftp%nnr, 2, np, rho%of_r, grho, rho%kin_r/e2, ex, ec, &
v1x, v2x, v3x, v1c, v2c, v3c )
!$acc parallel loop present(rho)
DO k = 1, dfftp%nnr
rho_updw(k,1) = ( rho%of_r(k,1) + rho%of_r(k,2) ) * 0.5d0
rho_updw(k,2) = ( rho%of_r(k,1) - rho%of_r(k,2) ) * 0.5d0
ENDDO
!
! first term of the gradient correction : D(rho*Exc)/D(rho)
!$acc host_data use_device( rho_updw, grho, tau, ex, ec, &
!$acc& v1x, v2x, v3x, v1c, v2c, v3c )
CALL xc_metagcx( dfftp%nnr, 2, np, rho_updw, grho, tau, ex, ec, &
v1x, v2x, v3x, v1c, v2c, v3c, gpu_args_=.TRUE. )
!$acc end host_data
!
! ... first term of the gradient correction : D(rho*Exc)/D(rho)
!
!$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) &
!$acc& reduction(-:rhoneg2)
DO k = 1, dfftp%nnr
!
v(k,1) = (v1x(k,1) + v1c(k,1)) * e2
v(k,2) = (v1x(k,2) + v1c(k,2)) * e2
!
! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
! ... h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
!
h(:,k,1) = (v2x(k,1) * grho(:,k,1) + v2c(:,k,1)) * e2
h(:,k,2) = (v2x(k,2) * grho(:,k,2) + v2c(:,k,2)) * e2
DO ipol = 1, 3
h(ipol,k,1) = (v2x(k,1) * grho(ipol,k,1) + v2c(ipol,k,1)) * e2
h(ipol,k,2) = (v2x(k,2) * grho(ipol,k,2) + v2c(ipol,k,2)) * e2
ENDDO
!
kedtaur(k,1) = (v3x(k,1) + v3c(k,1)) * 0.5d0 * e2
kedtaur(k,2) = (v3x(k,2) + v3c(k,2)) * 0.5d0 * e2
!
etxc = etxc + (ex(k)+ec(k)) * e2
vtxc = vtxc + (v1x(k,1)+v1c(k,1)) * ABS(rho%of_r(k,1)) * e2
vtxc = vtxc + (v1x(k,2)+v1c(k,2)) * ABS(rho%of_r(k,2)) * e2
vtxc = vtxc + (v1x(k,1)+v1c(k,1)) * ABS(rho_updw(k,1)) * e2 + &
(v1x(k,2)+v1c(k,2)) * ABS(rho_updw(k,2)) * e2
!
IF ( rho%of_r(k,1) < 0.d0 ) rhoneg(1) = rhoneg(1) - rho%of_r(k,1)
IF ( rho%of_r(k,2) < 0.d0 ) rhoneg(2) = rhoneg(2) - rho%of_r(k,2)
IF ( rho_updw(k,1) < 0.d0 ) rhoneg1 = rhoneg1 - rho_updw(k,1)
IF ( rho_updw(k,2) < 0.d0 ) rhoneg2 = rhoneg2 - rho_updw(k,2)
!
ENDDO
!
CALL rhoz_or_updw( rho, 'only_r', '->rhoz' )
!$acc end data
DEALLOCATE( rho_updw )
!
ENDIF
!
!$acc end data
DEALLOCATE( ex, ec )
DEALLOCATE( v1x, v2x, v3x )
DEALLOCATE( v1c, v2c, v3c )
!
!
ALLOCATE( dh( dfftp%nnr ) )
ALLOCATE( dh( dfftp%nnr ) )
!$acc data create( dh )
!
! ... second term of the gradient correction :
! ... \sum_alpha (D / D r_alpha) ( D(rho*Exc)/D(grad_alpha rho) )
!
ALLOCATE (rhoout(dfftp%nnr))
!$acc host_data use_device( rho%of_r, dh, h )
DO is = 1, nspin
!
CALL fft_graddot( dfftp, h(1,1,is), g, dh )
sgn_is = (-1.d0)**(is+1)
!
v(:,is) = v(:,is) - dh(:)
IF ( use_gpu ) CALL fft_graddot_gpu( dfftp, h(1,1,is), g_d, dh )
IF ( .NOT. use_gpu ) CALL fft_graddot( dfftp, h(1,1,is), g, dh )
!
! ... rhoout is in (up,down) format
!$acc parallel loop reduction(-:vtxc) present(rho)
DO k = 1, dfftp%nnr
v(k,is) = v(k,is) - dh(k)
vtxc = vtxc - dh(k) * ( rho%of_r(k,1) + sgn_is*rho%of_r(k,nspin) )*0.5D0
ENDDO
!
rhoout(:) = ( rho%of_r(:,1) + sgn(is)*rho%of_r(:,nspin) )*0.5D0
vtxc = vtxc - SUM( dh(:) * rhoout(:) )
!
END DO
DEALLOCATE(rhoout)
DEALLOCATE(dh)
ENDDO
!$acc end host_data
!$acc end data
DEALLOCATE( dh )
!
CALL mp_sum( rhoneg, intra_bgrp_comm )
!$acc end data
!$acc end data
!$acc end data
!
rhoneg(:) = rhoneg(:) * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
CALL mp_sum( rhoneg1, intra_bgrp_comm )
CALL mp_sum( rhoneg2, intra_bgrp_comm )
!
IF ((rhoneg(1) > eps8) .OR. (rhoneg(2) > eps8)) THEN
write (stdout, '(/,5x, "negative rho (up,down): ", 2es10.3)') rhoneg(:)
rhoneg1 = rhoneg1 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
rhoneg2 = rhoneg2 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
!
IF ((rhoneg1 > eps8) .OR. (rhoneg2 > eps8)) THEN
write (stdout, '(/,5x, "negative rho (up,down): ", 2es10.3)') rhoneg1, rhoneg2
ENDIF
!
vtxc = omega * vtxc / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
@ -328,8 +385,8 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
CALL mp_sum( vtxc , intra_bgrp_comm )
CALL mp_sum( etxc , intra_bgrp_comm )
!
DEALLOCATE(grho)
DEALLOCATE(h)
DEALLOCATE( tau, grho )
DEALLOCATE( h )
!
CALL stop_clock( 'v_xc_meta' )
!
@ -337,10 +394,10 @@ SUBROUTINE v_xc_meta( rho, rho_core, rhog_core, etxc, vtxc, v, kedtaur )
!
END SUBROUTINE v_xc_meta
!
!
!------------------------------------------------------------------------------
SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
!----------------------------------------------------------------------------
!! Exchange-Correlation potential Vxc(r) from n(r)
!! Exchange-Correlation potential from charge density.
!
USE kinds, ONLY : DP
USE constants, ONLY : e2, eps8
@ -352,8 +409,10 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
USE noncollin_module, ONLY : domag
USE funct, ONLY : nlc, dft_is_nonlocc
USE scf, ONLY : scf_type
USE xc_lib, ONLY : xc
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
USE control_flags, ONLY : use_gpu
!
IMPLICIT NONE
!
@ -364,19 +423,17 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
COMPLEX(DP), INTENT(IN) :: rhog_core(ngm)
!! the core charge in reciprocal space
REAL(DP), INTENT(OUT) :: v(dfftp%nnr,nspin)
!! V_xc potential
!! \(V_{xc}\) potential
REAL(DP), INTENT(OUT) :: vtxc
!! integral V_xc * rho
!! integral \(V_{xc}\cdot\text{rho}\)
REAL(DP), INTENT(OUT) :: etxc
!! E_xc energy
!! \(E_{xc}\) energy
!
! ... local variables
!
REAL(DP) :: rhoneg(2), vs
!
!REAL(DP), ALLOCATABLE :: arhox(:), amag(:), zeta(:)
REAL(DP) :: arho, amag
REAL(DP) :: rhoup2, rhodw2
REAL(DP) :: rhoup2, rhodw2, rhoneg1, rhoneg2
REAL(DP) :: arho, amag, vtxc24
REAL(DP), ALLOCATABLE :: ex(:), ec(:)
REAL(DP), ALLOCATABLE :: vx(:,:), vc(:,:)
! In order:
@ -390,98 +447,110 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
INTEGER :: ir, ipol
! counter on mesh points
! counter on nspin
!
! number of mesh points (=dfftp%nnr)
REAL(DP), PARAMETER :: vanishing_charge = 1.D-10, &
vanishing_mag = 1.D-20
!
!
CALL start_clock( 'v_xc' )
!
ALLOCATE( ex(dfftp%nnr) )
ALLOCATE( ec(dfftp%nnr) )
ALLOCATE( vx(dfftp%nnr,nspin) )
ALLOCATE( vc(dfftp%nnr,nspin) )
etxc = 0.D0 ; rhoneg1 = 0.D0
vtxc = 0.D0 ; rhoneg2 = 0.D0
!
etxc = 0.D0
vtxc = 0.D0
v(:,:) = 0.D0
rhoneg = 0.D0
!$acc data copyin( rho_core, rhog_core, rho ) copyout( v )
!$acc data copyin( rho%of_r, rho%of_g )
!
ALLOCATE( ex(dfftp%nnr), vx(dfftp%nnr,nspin) )
ALLOCATE( ec(dfftp%nnr), vc(dfftp%nnr,nspin) )
!$acc data create( ex, ec, vx, vc )
!
rho%of_r(:,1) = rho%of_r(:,1) + rho_core(:)
!$acc host_data use_device( rho%of_r, rho%of_g, rho_core, rhog_core, v,&
!$acc& ex, ec, vx, vc )
!
!$acc parallel loop
DO ir = 1, dfftp%nnr
rho%of_r(ir,1) = rho%of_r(ir,1) + rho_core(ir)
ENDDO
!
IF ( nspin == 1 .OR. ( nspin == 4 .AND. .NOT. domag ) ) THEN
! ... spin-unpolarized case
!
CALL xc( dfftp%nnr, 1, 1, rho%of_r, ex, ec, vx, vc )
CALL xc( dfftp%nnr, 1, 1, rho%of_r, ex, ec, vx, vc, gpu_args_=.TRUE. )
!
!$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) &
!$acc& present(rho)
DO ir = 1, dfftp%nnr
v(ir,1) = e2*( vx(ir,1) + vc(ir,1) )
etxc = etxc + e2*( ex(ir) + ec(ir) )*rho%of_r(ir,1)
rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir)
vtxc = vtxc + v(ir,1)*rho%of_r(ir,1)
IF (rho%of_r(ir,1) < 0.D0) rhoneg(1) = rhoneg(1)-rho%of_r(ir,1)
IF (rho%of_r(ir,1) < 0.D0) rhoneg1 = rhoneg1-rho%of_r(ir,1)
ENDDO
!
!
ELSEIF ( nspin == 2 ) THEN
! ... spin-polarized case
!
CALL xc( dfftp%nnr, 2, 2, rho%of_r, ex, ec, vx, vc )
CALL xc( dfftp%nnr, 2, 2, rho%of_r, ex, ec, vx, vc, gpu_args_=.TRUE. )
!
DO ir = 1, dfftp%nnr !OMP ?
v(ir,:) = e2*( vx(ir,:) + vc(ir,:) )
!$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) &
!$acc& reduction(-:rhoneg2) present(rho)
DO ir = 1, dfftp%nnr
v(ir,1) = e2*( vx(ir,1) + vc(ir,1) )
v(ir,2) = e2*( vx(ir,2) + vc(ir,2) )
etxc = etxc + e2*( (ex(ir) + ec(ir))*rho%of_r(ir,1) )
rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir)
vtxc = vtxc + ( ( v(ir,1) + v(ir,2) )*rho%of_r(ir,1) + &
( v(ir,1) - v(ir,2) )*rho%of_r(ir,2) )
( v(ir,1) - v(ir,2) )*rho%of_r(ir,2) )*0.5d0
!
rhoup2 = rho%of_r(ir,1)+rho%of_r(ir,2)
rhodw2 = rho%of_r(ir,1)-rho%of_r(ir,2)
IF (rhoup2 < 0.d0) rhoneg(1) = rhoneg(1) - rhoup2
IF (rhodw2 < 0.d0) rhoneg(2) = rhoneg(2) - rhodw2
IF (rhoup2 < 0.d0) rhoneg1 = rhoneg1 - rhoup2*0.5d0
IF (rhodw2 < 0.d0) rhoneg2 = rhoneg2 - rhodw2*0.5d0
ENDDO
!
vtxc = 0.5d0 * vtxc
rhoneg = 0.5d0 * rhoneg
!
!
ELSE IF ( nspin == 4 ) THEN
! ... noncolinear case
!
CALL xc( dfftp%nnr, 4, 2, rho%of_r, ex, ec, vx, vc )
!
DO ir = 1, dfftp%nnr !OMP ?
arho = ABS( rho%of_r(ir,1) )
IF ( arho < vanishing_charge ) CYCLE
vs = 0.5D0*( vx(ir,1) + vc(ir,1) - vx(ir,2) - vc(ir,2) )
v(ir,1) = e2*( 0.5D0*( vx(ir,1) + vc(ir,1) + vx(ir,2) + vc(ir,2) ) )
!
amag = SQRT( SUM( rho%of_r(ir,2:4)**2 ) )
IF ( amag > vanishing_mag ) THEN
v(ir,2:4) = e2 * vs * rho%of_r(ir,2:4) / amag
vtxc = vtxc + SUM( v(ir,2:4) * rho%of_r(ir,2:4) )
ENDIF
etxc = etxc + e2*( ex(ir) + ec(ir) ) * arho
!
rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir)
IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg(1) = rhoneg(1) - rho%of_r(ir,1)
IF ( amag / arho > 1.D0 ) rhoneg(2) = rhoneg(2) + 1.D0/omega
vtxc = vtxc + v(ir,1) * rho%of_r(ir,1)
ENDDO
!
!
ELSEIF ( nspin == 4 ) THEN
! ... noncollinear case
!
CALL xc( dfftp%nnr, 4, 2, rho%of_r, ex, ec, vx, vc, gpu_args_=.TRUE. )
!
!$acc parallel loop reduction(+:etxc) reduction(+:vtxc) reduction(-:rhoneg1) &
!$acc& reduction(+:rhoneg2) present(rho)
DO ir = 1, dfftp%nnr
arho = ABS( rho%of_r(ir,1) )
IF ( arho < vanishing_charge ) CYCLE
vs = 0.5D0*( vx(ir,1) + vc(ir,1) - vx(ir,2) - vc(ir,2) )
v(ir,1) = e2*( 0.5D0*( vx(ir,1) + vc(ir,1) + vx(ir,2) + vc(ir,2) ) )
!
amag = SQRT( SUM( rho%of_r(ir,2:4)**2 ) )
IF ( amag > vanishing_mag ) THEN
v(ir,2:4) = e2 * vs * rho%of_r(ir,2:4) / amag
vtxc24 = SUM( v(ir,2:4) * rho%of_r(ir,2:4) )
ELSE
vtxc24 = 0.d0
ENDIF
etxc = etxc + e2*( ex(ir) + ec(ir) ) * arho
!
rho%of_r(ir,1) = rho%of_r(ir,1) - rho_core(ir)
IF ( rho%of_r(ir,1) < 0.D0 ) rhoneg1 = rhoneg1 - rho%of_r(ir,1)
IF ( amag / arho > 1.D0 ) rhoneg2 = rhoneg2 + 1.D0/omega
vtxc = vtxc + vtxc24 + v(ir,1) * rho%of_r(ir,1)
ENDDO
!
ENDIF
!
DEALLOCATE( ex, ec )
DEALLOCATE( vx, vc )
!$acc end host_data
!$acc end data
DEALLOCATE( ex, vx )
DEALLOCATE( ec, vc )
!
CALL mp_sum( rhoneg , intra_bgrp_comm )
CALL mp_sum( rhoneg1 , intra_bgrp_comm )
CALL mp_sum( rhoneg2 , intra_bgrp_comm )
!
rhoneg(:) = rhoneg(:) * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
rhoneg1 = rhoneg1 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
rhoneg2 = rhoneg2 * omega / ( dfftp%nr1*dfftp%nr2*dfftp%nr3 )
!
IF ( rhoneg(1) > eps8 .OR. rhoneg(2) > eps8 ) &
WRITE( stdout,'(/,5X,"negative rho (up, down): ",2ES10.3)') rhoneg
IF ( rhoneg1 > eps8 .OR. rhoneg2 > eps8 ) &
WRITE( stdout,'(/,5X,"negative rho (up, down): ",2ES10.3)') rhoneg1, rhoneg2
!
! ... energy terms, local-density contribution
!
@ -490,7 +559,12 @@ SUBROUTINE v_xc( rho, rho_core, rhog_core, etxc, vtxc, v )
!
! ... add gradient corrections (if any)
!
!$acc host_data use_device( rho%of_r, rho%of_g, rho_core, rhog_core, v )
CALL gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, etxc, vtxc, v )
!$acc end host_data
!
!$acc end data
!$acc end data
!
! ... add non local corrections (if any)
!

View File

@ -44,6 +44,8 @@ set(sources_libbeef
beefun.c)
qe_add_library(qe_libbeef ${sources_libbeef})
target_link_libraries(qe_libbeef
PUBLIC
qe_openacc_c
PRIVATE
qe_lapack)
@ -87,6 +89,7 @@ if(QE_ENABLE_TEST)
qe_mpi_fortran
qe_upflib
qe_external_libxc
qe_libbeef
qe_xclib)
find_program(BASH_PROGRAM bash)

View File

@ -41,6 +41,7 @@ xclib_utils_and_para.o \
dft_setting_params.o \
dft_setting_routines.o \
qe_dft_list.o \
xc_beef_interface.o \
xc_lib.o
all: xc_lib.a

View File

@ -8,14 +8,25 @@
#define nmax 30
#pragma acc routine (dgemv_) seq
extern void dgemv_(const char *, const int *, const int *, const double *,
double *, const int *, double *, const int *, const double *, double *,
const int *);
#pragma acc routine seq
double ddot1(double v[], double u[], int n)
{
double result = 0.0;
for (int i = 0; i < n; i++)
result += v[i]*u[i];
return result;
}
extern double ddot_(const int *, double *, const int *, double *, const int *);
//beef exchange enhancement factor legendre polynomial coefficients
static double mi[] = {
#pragma acc declare copyin (mi[0:nmax-1])
static double mi[nmax] = {
1.516501714304992365356,
0.441353209874497942611,
-0.091821352411060291887,
@ -392,7 +403,7 @@ static double beefmat[] = {
L[1] = x; int i=0;\
for(i=2;i<nmax;i++) { \
L[i] = 2.*(x)*L[i-1] - L[i-2] - ((x)*L[i-1] - L[i-2])/((double) i); \
dL[i] = L[i-1]*i + dL[i-1]*(x); \
dL[i] = L[i-1]*((double) i) + dL[i-1]*(x); \
} \
}
@ -419,7 +430,10 @@ static double beefmat[] = {
return L[n]; \
}
#define PRAGMA_ACC_ROUTINE _Pragma("acc routine seq")
#define defLdL(n) \
PRAGMA_ACC_ROUTINE \
static void LdL ## n (double x, double *y, double *z) { \
double L[n+1]; \
double dL[n+1]; \
@ -497,8 +511,9 @@ static double(*Ln[])(double) = {
L29
};
#pragma acc routine seq
static void LdL0(double x, double *y, double *z) {*y=1.; *z=0.;}
#pragma acc routine seq
static void LdL1(double x, double *y, double *z) {*y=x; *z=1.;}
defLdL(2)
defLdL(3)
@ -562,6 +577,42 @@ static void(*LdLn[])(double,double *,double *) = {
LdL29
};
#pragma acc routine seq
static void LdLnACC(double t,double *fx,double *dl, int bo) {
switch(bo) {
case 0: LdL0(t ,fx ,dl ); break;
case 1: LdL1(t ,fx ,dl ); break;
case 2: LdL2(t ,fx ,dl ); break;
case 3: LdL3(t ,fx ,dl ); break;
case 4: LdL4(t ,fx ,dl ); break;
case 5: LdL5(t ,fx ,dl ); break;
case 6: LdL6(t ,fx ,dl ); break;
case 7: LdL7(t ,fx ,dl ); break;
case 8: LdL8(t ,fx ,dl ); break;
case 9: LdL9(t ,fx ,dl ); break;
case 10: LdL10(t ,fx ,dl ); break;
case 11: LdL11(t ,fx ,dl ); break;
case 12: LdL12(t ,fx ,dl ); break;
case 13: LdL13(t ,fx ,dl ); break;
case 14: LdL14(t ,fx ,dl ); break;
case 15: LdL15(t ,fx ,dl ); break;
case 16: LdL16(t ,fx ,dl ); break;
case 17: LdL17(t ,fx ,dl ); break;
case 18: LdL18(t ,fx ,dl ); break;
case 19: LdL19(t ,fx ,dl ); break;
case 20: LdL20(t ,fx ,dl ); break;
case 21: LdL21(t ,fx ,dl ); break;
case 22: LdL22(t ,fx ,dl ); break;
case 23: LdL23(t ,fx ,dl ); break;
case 24: LdL24(t ,fx ,dl ); break;
case 25: LdL25(t ,fx ,dl ); break;
case 26: LdL26(t ,fx ,dl ); break;
case 27: LdL27(t ,fx ,dl ); break;
case 28: LdL28(t ,fx ,dl ); break;
case 29: LdL29(t ,fx ,dl ); break;
}
}
// LDA and PBEc coefficients
//pix = 1./(2.*(3.*pi**2)**(1./3.))**2
@ -582,21 +633,24 @@ static void(*LdLn[])(double,double *,double *) = {
#define normrand() \
( sqrt(-2.*log(unirandex0())) * cos(M_PI*2.*unirandinc0()) )
// beeforder==-1 : calculate beefxc with standard expansion coefficients
// (is changed by beefsetmode_)
static int beeforder = -1;
#pragma acc declare copyin (beeforder)
//arrays holding current Legendre polynomials and derivatives
static __thread double L[nmax] = {1.};
static __thread double dL[nmax] = {0.,1.};
//static __thread double L[nmax] = {1.};
//static __thread double dL[nmax] = {0.,1.};
//static double L[nmax] = {1.};
//static double dL[nmax] = {0.,1.};
static inline double sq(double x) {return x*x;}
// beeftype is a switch set by beef_set_type
// which determines which version/type of beef is used
static int beeftype = 0;
#pragma acc declare copyin (beeftype)
#define output_spacing " "
#define output_marker "**************************************************************************"

View File

@ -10,15 +10,17 @@
#include "pbecor.h"
// evaluate bee exchange energy and its derivatives de/drho and ( de/d|grad rho| ) / |grad rho|
#pragma acc routine seq
void beefx_(double *r, double *g, double *e, double *dr, double *dg, int *addlda)
{
double s2,t,r43,r83,s,sx,dx,fx,dl,dfx;
const int n=nmax;
const int i1=1;
const int i2=1;
double L[nmax]={1.};
double dL[nmax]={0.,1.};
switch(beeftype) {
case 0: //BEEF-vdW xc
case 0: //BEEF-vdW xc
r43 = pow(*r, 4./3.);
r83 = r43*r43;
sx = r2e * r43;
@ -31,13 +33,14 @@ void beefx_(double *r, double *g, double *e, double *dr, double *dg, int *addlda
if(beeforder==-1)
{
calclegdleg(t);
if(!(*addlda))
fx = ddot_(&n, mi, &i1, L, &i2) - 1.;
else
fx = ddot_(&n, mi, &i1, L, &i2);
dl = ddot_(&n, mi, &i1, dL, &i2);
if(!(*addlda)){
fx = ddot1(mi,L,n) - 1.;
}
else{
fx = ddot1(mi,L,n);
}
dl = ddot1(mi,dL,n);
dfx = dl*( 4.*s / (4.+s2) - 4.*s2*s/sq(4.+s2) );
*dr = dx*fx - 4./3.*s2/(s*(*r))*sx*dfx;
*dg = sx*dfx*pix/(s*r83);
@ -47,8 +50,8 @@ void beefx_(double *r, double *g, double *e, double *dr, double *dg, int *addlda
if(beeforder>=0)
{
(*LdLn[beeforder])(t, &fx, &dl);
//(*LdLn[beeforder])(t, &fx, &dl);
LdLnACC(t, &fx, &dl, beeforder);
dfx = dl*( 4.*s / (4.+s2) - 4.*s2*s/sq(4.+s2) );
*dr = dx*fx - 4./3.*s2/(s*(*r))*sx*dfx;
*dg = sx*dfx*pix/(s*r83);
@ -60,13 +63,13 @@ void beefx_(double *r, double *g, double *e, double *dr, double *dg, int *addlda
*dg = 0.;
*e = 0.;
}
break;
}
}
// evaluate local part of bee correlation and its derivatives de/drho and ( de/d|grad rho| ) / |grad rho|
#pragma acc routine seq
void beeflocalcorr_(double *r, double *g, double *e, double *dr, double *dg, int *addlda)
{
double rs, ldac, ldadr, pbec, pbedr, pbed2rho;
@ -80,10 +83,9 @@ void beeflocalcorr_(double *r, double *g, double *e, double *dr, double *dg, int
}
switch(beeftype) {
case 0: //BEEF-vdW xc
case 0: //BEEF-vdW xc
rs = invpi075tothird / pow(*r,1./3.);
corpbe(rs, 0.5/r2k * sqrt(*g*rs) / (*r),
(beeforder>-3), 1, &ldac, &ldadr, &pbec, &pbedr, &pbed2rho);
corpbe(rs, 0.5/r2k * sqrt(*g*rs) / (*r),(beeforder>-3), 1, &ldac, &ldadr, &pbec, &pbedr, &pbed2rho);
if(beeforder==-1)
{
@ -131,9 +133,11 @@ void beefxpot_(double *r, double *g, double *e, int *addlda)
const int n=nmax;
const int i1=1;
const int i2=1;
double L[nmax]={1.};
switch(beeftype) {
case 0: //BEEF-vdW xc
case 0: //BEEF-vdW xc
r43 = pow(*r, 4./3.);
s2 = *g*pix / (r43*r43);
@ -171,7 +175,7 @@ void beeflocalcorrpot_(double *r, double *g, double *e, int *addlda)
}
switch(beeftype) {
case 0: //BEEF-vdW xc
case 0: //BEEF-vdW xc
rs = invpi075tothird / pow(*r,1./3.);
corpbe(rs, 0.5/r2k * sqrt(*g*rs) / (*r),
(beeforder>-3), 0, &ldac, &ldadr, &pbec, &pbedr, &pbed2rho);
@ -199,6 +203,7 @@ void beeflocalcorrpot_(double *r, double *g, double *e, int *addlda)
// evaluate local part of bee correlation for spin polarized system
#pragma acc routine seq
void beeflocalcorrspin_(double *r, double *z, double *g, double *e,
double *drup, double *drdown, double *dg, int *addlda) {
double rs, ldac, ldadrup, ldadrdown, pbec, pbedrup, pbedrdown, pbed2rho;
@ -213,7 +218,7 @@ void beeflocalcorrspin_(double *r, double *z, double *g, double *e,
}
switch(beeftype) {
case 0: //BEEF-vdW xc
case 0: //BEEF-vdW xc
rs = invpi075tothird / pow(*r,1./3.);
corpbespin(rs, 0.5/r2k * sqrt(*g*rs) / (*r), *z,
(beeforder>-3), 1, &ldac, &ldadrup, &ldadrdown, &pbec,
@ -275,7 +280,7 @@ void beeflocalcorrpotspin_(double *r, double *z, double *g, double *e, int *addl
}
switch(beeftype) {
case 0: //BEEF-vdW xc
case 0: //BEEF-vdW xc
rs = invpi075tothird / pow(*r,1./3.);
corpbespin(rs, 0.5/r2k * sqrt(*g*rs) / (*r), *z,
(beeforder>-3), 0, &ldac, &ldadrup, &ldadrdown, &pbec,
@ -311,6 +316,7 @@ void beeflocalcorrpotspin_(double *r, double *z, double *g, double *e, int *addl
void beefsetmode_(int *mode)
{
beeforder = *mode;
#pragma acc update device(beeforder)
}
// initialize pseudo random number generator
@ -357,6 +363,7 @@ void beefensemble_(double *beefxc, double *ensemble)
int beef_set_type_(int *tbeef, int *ionode)
{
beeftype = *tbeef;
#pragma acc update device(beeftype)
if(*ionode)
{

View File

@ -546,7 +546,6 @@ CONTAINS
DO l = 1, LEN_TRIM(lxc_name)
lxc_name(l:l) = capital( lxc_name(l:l) )
ENDDO
nlxc = 0
ii = 0
DO WHILE ( ii < LEN_TRIM(lxc_name) )
ii = ii + 1

View File

@ -29,9 +29,9 @@
/* ---------------------------------------------------------------------- */
/* ###################################################################### */
/* ---------------------------------------------------------------------- */
/* Subroutine */ static void gcor2(double a, double a1, double b1,
double b2, double b3, double b4, double rtrs,
double *gg, double *ggrs)
/* Subroutine */
#pragma acc routine seq
static void gcor2(double a, double a1, double b1, double b2, double b3, double b4, double rtrs, double *gg, double *ggrs)
{
/* Local variables */
double q0, q1, q2, q3;
@ -61,10 +61,9 @@
/* ###################################################################### */
/* ---------------------------------------------------------------------- */
/* Subroutine */ void corpbe(double rs, double t,
int lgga,
int lpot, double *ec, double *vc,
double *h__, double *dvc, double *dv2rho)
/* Subroutine */
#pragma acc routine seq
void corpbe(double rs, double t, int lgga, int lpot, double *ec, double *vc, double *h__, double *dvc, double *dv2rho)
{
/* Local variables */
double b, b2, q4, t2, q5, t4,
@ -168,7 +167,9 @@
/* ###################################################################### */
/* ---------------------------------------------------------------------- */
/* Subroutine */ void corpbespin(double rs, double t, double zet,
/* Subroutine */
#pragma acc routine seq
void corpbespin(double rs, double t, double zet,
int lgga,
int lpot, double *ec, double *vcup, double *vcdown,
double *h__, double *dvcup, double *dvcdown, double *dv2rho)

View File

@ -1,10 +1,12 @@
#ifndef pbecor_h
#define pbecor_h 1
#pragma acc routine seq
extern void corpbe(double, double,
int, int, double *, double *,
double *, double *, double *);
#pragma acc routine seq
void corpbespin(double, double, double,
int, int, double *, double *, double *,
double *, double *, double *, double *);

View File

@ -35,7 +35,7 @@ SUBROUTINE dgcxc_unpol( length, r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc
!! This routine computes the derivative of the exchange and correlation
!! potentials of GGA family.
!
USE qe_drivers_gga, ONLY: gcxc, gcxc_beef
USE qe_drivers_gga, ONLY: gcxc
!
IMPLICIT NONE
!
@ -90,10 +90,11 @@ SUBROUTINE dgcxc_unpol( length, r_in, s2_in, vrrx, vsrx, vssx, vrrc, vsrc, vssc
raux(i3:f3) = r_in ; s2aux(i3:f3) = (s+ds)**2
raux(i4:f4) = r_in ; s2aux(i4:f4) = (s-ds)**2
!
!$acc data copyin( raux, s2aux ) copyout( sx, sc, v1x, v2x, v1c, v2c )
!$acc host_data use_device( raux, s2aux, sx, sc, v1x, v2x, v1c, v2c )
CALL gcxc( length*4, raux, s2aux, sx, sc, v1x, v2x, v1c, v2c )
!
IF ( igcx==43 .OR. igcc==14 ) CALL gcxc_beef( length*4, raux, s2aux, &
sx, sc, v1x, v2x, v1c, v2c )
!$acc end host_data
!$acc end data
!
! ... to avoid NaN in the next operations
WHERE( r_in<=small .OR. s2_in<=small )
@ -129,7 +130,7 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, &
!! This routine computes the derivative of the exchange and correlation
!! potentials in the spin-polarized case.
!
USE qe_drivers_gga, ONLY: gcx_spin, gcc_spin, gcx_spin_beef, gcc_spin_beef
USE qe_drivers_gga, ONLY: gcx_spin, gcc_spin
!
IMPLICIT NONE
!
@ -239,8 +240,11 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, &
raux(i8:f8,:) = r ; s2aux(i8:f8,:) = (s-dsdw)**2
!
!
IF ( igcx/=43 ) CALL gcx_spin( length*8, raux, s2aux, sx, v1x, v2x )
IF ( igcx==43 ) CALL gcx_spin_beef( length*8, raux, s2aux, sx, v1x, v2x )
!$acc data copyin( raux, s2aux ) copyout( sx, v1x, v2x )
!$acc host_data use_device( raux, s2aux, sx, v1x, v2x )
CALL gcx_spin( length*8, raux, s2aux, sx, v1x, v2x )
!$acc end host_data
!$acc end data
!
! ... up
vrrx(:,1) = 0.5_DP * (v1x(i1:f1,1) - v1x(i2:f2,1)) / drup(:,1)
@ -311,8 +315,11 @@ SUBROUTINE dgcxc_spin( length, r_in, g_in, vrrx, vrsx, vssx, vrrc, vrsc, &
rtaux(i5:f5) = rt ; s2taux(i5:f5) = s2t ; zetaux(i5:f5) = zeta+dz
rtaux(i6:f6) = rt ; s2taux(i6:f6) = s2t ; zetaux(i6:f6) = zeta-dz
!
IF ( igcc/=14 ) CALL gcc_spin( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c )
IF ( igcc==14 ) CALL gcc_spin_beef( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c )
!$acc data copyin( rtaux, zetaux, s2taux ) copyout( sc, v1c, v2c )
!$acc host_data use_device( rtaux, zetaux, s2taux, sc, v1c, v2c )
CALL gcc_spin( length*6, rtaux, zetaux, s2taux, sc, v1c, v2c )
!$acc end host_data
!$acc end data
!
vrrc(:,1) = 0.5_DP * (v1c(i1:f1,1) - v1c(i2:f2,1)) / dr * null_v(:,1)
vrrc(:,2) = 0.5_DP * (v1c(i1:f1,2) - v1c(i2:f2,2)) / dr * null_v(:,1)

View File

@ -125,7 +125,11 @@ SUBROUTINE dmxc_lda( length, rho_in, dmuxc )
rhoaux(i1:f1) = arho+dr
rhoaux(i2:f2) = arho-dr
!
!$acc data copyin( rhoaux ) copyout( ex, ec, vx, vc )
!$acc host_data use_device( rhoaux, ex, ec, vx, vc )
CALL xc_lda( length*2, rhoaux, ex, ec, vx, vc )
!$acc end host_data
!$acc end data
!
WHERE ( arho < small ) dr = 1.0_DP ! ... to avoid NaN in the next operation
!
@ -316,7 +320,11 @@ SUBROUTINE dmxc_lsda( length, rho_in, dmuxc )
rhoaux(i3:f3) = rhotot ; zetaux(i3:f3) = zeta_eff + dz
rhoaux(i4:f4) = rhotot ; zetaux(i4:f4) = zeta_eff - dz
!
!$acc data copyin( rhoaux, zetaux ) copyout( aux1, aux2, vx, vc )
!$acc host_data use_device( rhoaux, zetaux, aux1, aux2, vx, vc )
CALL xc_lsda( length*4, rhoaux, zetaux, aux1, aux2, vx, vc )
!$acc end host_data
!$acc end data
!
WHERE (rhotot <= small) ! ... to avoid NaN in the next operations
dr=1.0_DP ; rhotot=0.5d0
@ -438,7 +446,11 @@ SUBROUTINE dmxc_nc( length, rho_in, m, dmuxc )
rhoaux(i5:f5) = rhotot ; zetaux(i5:f5) = zeta_eff - dz
!
!
!$acc data copyin( rhoaux, zetaux ) copyout( aux1, aux2, vx, vc )
!$acc host_data use_device( rhoaux, zetaux, aux1, aux2, vx, vc )
CALL xc_lsda( length*5, rhoaux, zetaux, aux1, aux2, vx, vc )
!$acc end host_data
!$acc end data
!
!
vs(:) = 0.5_DP*( vx(i1:f1,1)+vc(i1:f1,1)-vx(i1:f1,2)-vc(i1:f1,2) )

View File

@ -26,8 +26,7 @@ MODULE qe_drivers_gga
!
PRIVATE
!
PUBLIC :: gcxc, gcx_spin, gcc_spin, gcc_spin_more, gcxc_beef, gcx_spin_beef, &
gcc_spin_beef
PUBLIC :: gcxc, gcx_spin, gcc_spin, gcc_spin_more
!
!
CONTAINS
@ -41,6 +40,7 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, &
!
USE exch_gga
USE corr_gga
USE beef_interface, ONLY: beefx, beeflocalcorr
!
IMPLICIT NONE
!
@ -80,12 +80,11 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, &
ntids = omp_get_num_threads()
#endif
!
!
#if defined(_OPENACC)
!$acc data copyin(rho_in,grho_in), copyout(sx_out,sc_out,v1x_out,v2x_out,v1c_out,v2c_out)
!$acc data deviceptr( rho_in(length), grho_in(length), sx_out(length), sc_out(length), &
!$acc& v1x_out(length), v2x_out(length), v1c_out(length), v2c_out(length) )
!$acc parallel loop
#endif
#if defined(_OPENMP) && !defined(_OPENACC)
#else
!$omp parallel if(ntids==1) default(none) &
!$omp private( rho, grho, sx, sx_, sxsr, v1x, v1x_, v1xsr, &
!$omp v2x, v2x_, v2xsr, sc, v1c, v2c ) &
@ -298,6 +297,10 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, &
v2x = (1.0_DP - exx_fraction) * v2x
ENDIF
!
CASE( 43 ) ! 'BEEX'
!
CALL beefx( rho, grho, sx, v1x, v2x, 0 )
!
CASE( 44 ) ! 'RPBE'
!
CALL pbex( rho, grho, 8, sx, v1x, v2x )
@ -369,6 +372,11 @@ SUBROUTINE gcxc( length, rho_in, grho_in, sx_out, sc_out, v1x_out, &
v2c = 0.871_DP * v2c
ENDIF
!
CASE( 14 ) !'BEEC'
! last parameter 0 means: do not add lda contributions
! espresso will do that itself
CALL beeflocalcorr( rho, grho, sc, v1c, v2c, 0 )
!
CASE DEFAULT
!
sc = 0.0_DP
@ -402,6 +410,7 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out )
!! Gradient corrections for exchange - Hartree a.u.
!
USE exch_gga
USE beef_interface, ONLY: beefx
!
IMPLICIT NONE
!
@ -439,10 +448,11 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out )
ntids = omp_get_num_threads()
#endif
!
sx_tot = 0.0_DP
!sx_tot = 0.0_DP
!
#if defined(_OPENACC)
!$acc data copyin(rho_in, grho2_in), copyout(sx_tot, v1x_out, v2x_out)
!$acc data deviceptr( rho_in(length,2), grho2_in(length,2), sx_tot(length), &
!$acc& v1x_out(length,2), v2x_out(length,2) )
!$acc parallel loop
#else
!$omp parallel if(ntids==1) default(none) &
@ -824,6 +834,18 @@ SUBROUTINE gcx_spin( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out )
! case igcx == 7 (meta-GGA) must be treated in a separate call to another
! routine: needs kinetic energy density in addition to rho and grad rho
!
CASE( 43 ) ! BEEX
!
rho_up = 2.0_DP * rho_up ; rho_dw = 2.0_DP * rho_dw
grho2_up = 4.0_DP * grho2_up ; grho2_dw = 4.0_DP * grho2_dw
!
CALL beefx( rho_up, grho2_up, sx_up, v1x_up, v2x_up, 0 )
CALL beefx( rho_dw, grho2_dw, sx_dw, v1x_dw, v2x_dw, 0 )
!
sx_tot(ir) = 0.5_DP * (sx_up*rnull_up + sx_dw*rnull_dw)
v2x_up = 2.0_DP * v2x_up
v2x_dw = 2.0_DP * v2x_dw
!
CASE DEFAULT
!
sx_tot(ir) = 0.0_DP
@ -858,6 +880,7 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out
!! Implemented: Perdew86, GGA (PW91), PBE
!
USE corr_gga
USE beef_interface, ONLY: beeflocalcorrspin
!
IMPLICIT NONE
!
@ -891,7 +914,8 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out
#endif
!
#if defined(_OPENACC)
!$acc data copyin(rho_in, grho_in), copyout(sc_out, v1c_out, v2c_out), copy(zeta_io)
!$acc data deviceptr( rho_in(length), zeta_io(length), grho_in(length), &
!$acc& sc_out(length), v1c_out(length,2), v2c_out(length) )
!$acc parallel loop
#else
!$omp parallel if(ntids==1) default(none) &
@ -941,6 +965,10 @@ SUBROUTINE gcc_spin( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out
!
CALL pbec_spin( rho, zeta, grho, 2, sc, v1c_up, v1c_dw, v2c )
!
CASE( 14 )
!
CALL beeflocalcorrspin( rho, zeta, grho, sc, v1c_up, v1c_dw, v2c, 0 )
!
CASE DEFAULT
!
sc = 0.0_DP
@ -1015,13 +1043,9 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, &
ntids = omp_get_num_threads()
#endif
!
sc = 0.0_DP
v1c = 0.0_DP
v2c = 0.0_DP
v2c_ud = 0.0_DP
!
#if defined(_OPENACC)
!$acc data copyin(rho_in, grho_in, grho_ud_in), copyout(sc, v1c, v2c, v2c_ud)
!$acc data deviceptr( rho_in(length,2), grho_in(length,2), grho_ud_in(length), &
!$acc& sc(length), v1c(length,2), v2c(length,2), v2c_ud(length) )
!$acc parallel loop
#else
!$omp parallel if(ntids==1) default(none) &
@ -1080,7 +1104,7 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, &
!
CASE DEFAULT
!
!CALL xclib_error(" gcc_spin_more "," gradient correction not implemented ",1) !***acc test
!CALL xclib_error(" gcc_spin_more "," gradient correction not implemented ",1)
!
END SELECT
!
@ -1091,220 +1115,10 @@ SUBROUTINE gcc_spin_more( length, rho_in, grho_in, grho_ud_in, &
!$omp end do
!$omp end parallel
#endif
!
RETURN
!
END SUBROUTINE gcc_spin_more
!
!
! ========> BEEF GGA DRIVERS <========================
!
!------------------------------------------------------------------------
SUBROUTINE gcxc_beef( length, rho_in, grho_in, sx_out, sc_out, v1x_out, &
v2x_out, v1c_out, v2c_out )
!---------------------------------------------------------------------
!! Driver for BEEF gga xc terms. Unpolarized case.
!
USE beef_interface, ONLY: beefx, beeflocalcorr
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
REAL(DP), INTENT(IN), DIMENSION(length) :: rho_in
REAL(DP), INTENT(IN), DIMENSION(length) :: grho_in
REAL(DP), INTENT(OUT), DIMENSION(length) :: sx_out, sc_out
REAL(DP), INTENT(OUT), DIMENSION(length) :: v1x_out, v2x_out
REAL(DP), INTENT(OUT), DIMENSION(length) :: v1c_out, v2c_out
!
! ... local variables
!
INTEGER :: ir
REAL(DP) :: rho, grho
REAL(DP) :: sx, v1x, v2x
REAL(DP) :: sc, v1c, v2c
!
IF (igcx == 43 .OR. igcc == 14) THEN
!
DO ir = 1, length
grho = grho_in(ir)
IF ( rho_in(ir) <= rho_threshold_gga .OR. grho <= grho_threshold_gga ) THEN
sx_out(ir) = 0.0_DP ; sc_out(ir) = 0.0_DP
v1x_out(ir) = 0.0_DP ; v1c_out(ir) = 0.0_DP
v2x_out(ir) = 0.0_DP ; v2c_out(ir) = 0.0_DP
CYCLE
ENDIF
!
rho = ABS(rho_in(ir))
!
IF ( igcx == 43 ) THEN
! last parameter = 0 means do not add LDA (=Slater) exchange
! (espresso) will add it itself
CALL beefx( rho, grho, sx, v1x, v2x, 0 )
sx_out(ir) = sx
v1x_out(ir) = v1x
v2x_out(ir) = v2x
ENDIF
!
IF ( igcc == 14 ) THEN
! last parameter 0 means: do not add lda contributions
! espresso will do that itself
CALL beeflocalcorr( rho, grho, sc, v1c, v2c, 0 )
sc_out(ir) = sc
v1c_out(ir) = v1c
v2c_out(ir) = v2c
ENDIF
ENDDO
!
ENDIF
!
RETURN
!
END SUBROUTINE gcxc_beef
!
!-----------------------------------------------------------------------------
SUBROUTINE gcx_spin_beef( length, rho_in, grho2_in, sx_tot, v1x_out, v2x_out )
!--------------------------------------------------------------------------
!! Driver for BEEF gga exchange term. Polarized case.
!
USE beef_interface, ONLY: beefx
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
REAL(DP), INTENT(IN), DIMENSION(length,2) :: rho_in
REAL(DP), INTENT(IN), DIMENSION(length,2) :: grho2_in
REAL(DP), INTENT(OUT), DIMENSION(length) :: sx_tot
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1x_out
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v2x_out
!
! ... local variables
!
INTEGER :: ir
REAL(DP) :: rho_up, rho_dw, grho2_up, grho2_dw
REAL(DP) :: v1x_up, v1x_dw, v2x_up, v2x_dw
REAL(DP) :: sx_up, sx_dw, rnull_up, rnull_dw
!
REAL(DP), PARAMETER :: small=1.D-10
REAL(DP), PARAMETER :: rho_trash=0.5_DP, grho2_trash=0.2_DP
! ... Temporary workaround for BEEF, which does not support GPU.
IF ( igcx == 43 ) THEN
!
DO ir = 1, length
!
rho_up = rho_in(ir,1)
rho_dw = rho_in(ir,2)
grho2_up = grho2_in(ir,1)
grho2_dw = grho2_in(ir,2)
rnull_up = 1.0_DP
rnull_dw = 1.0_DP
!
IF ( rho_up+rho_dw <= small ) THEN
sx_tot(ir) = 0.0_DP
v1x_out(ir,1) = 0.0_DP
v2x_out(ir,1) = 0.0_DP
v1x_out(ir,2) = 0.0_DP
v2x_out(ir,2) = 0.0_DP
CYCLE
ELSE
IF ( rho_up<=small .OR. SQRT(ABS(grho2_up))<=small ) THEN
rho_up = rho_trash
grho2_up = grho2_trash
rnull_up = 0.0_DP
ENDIF
IF ( rho_dw<=small .OR. SQRT(ABS(grho2_dw))<=small ) THEN
rho_dw = rho_trash
grho2_dw = grho2_trash
rnull_dw = 0.0_DP
ENDIF
ENDIF
!
rho_up = 2.0_DP * rho_up ; rho_dw = 2.0_DP * rho_dw
grho2_up = 4.0_DP * grho2_up ; grho2_dw = 4.0_DP * grho2_dw
!
CALL beefx(rho_up, grho2_up, sx_up, v1x_up, v2x_up, 0)
CALL beefx(rho_dw, grho2_dw, sx_dw, v1x_dw, v2x_dw, 0)
!
sx_tot(ir) = 0.5_DP * (sx_up*rnull_up + sx_dw*rnull_dw)
v2x_up = 2.0_DP * v2x_up
v2x_dw = 2.0_DP * v2x_dw
!
v1x_out(ir,1) = v1x_up * rnull_up
v1x_out(ir,2) = v1x_dw * rnull_dw
v2x_out(ir,1) = v2x_up * rnull_up
v2x_out(ir,2) = v2x_dw * rnull_dw
!
ENDDO
!
ENDIF
!
RETURN
!
END SUBROUTINE gcx_spin_beef
!
!
!--------------------------------------------------------------------------------
SUBROUTINE gcc_spin_beef( length, rho_in, zeta_io, grho_in, sc_out, v1c_out, v2c_out )
!-------------------------------------------------------------------------------
!! BEEF Gradient correction.
!
USE beef_interface, ONLY: beeflocalcorrspin
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
REAL(DP), INTENT(IN), DIMENSION(length) :: rho_in
REAL(DP), INTENT(INOUT), DIMENSION(length) :: zeta_io
REAL(DP), INTENT(IN), DIMENSION(length) :: grho_in
REAL(DP), INTENT(OUT), DIMENSION(length) :: sc_out
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1c_out
REAL(DP), INTENT(OUT), DIMENSION(length) :: v2c_out
!
! ... local variables
!
INTEGER :: ir
REAL(DP) :: rho, zeta, grho
REAL(DP) :: sc, v1c_up, v1c_dw, v2c
!
IF ( igcc == 14 ) THEN
!
DO ir = 1, length
!
rho = rho_in(ir)
grho = grho_in(ir)
IF ( ABS(zeta_io(ir))<=1.0_DP ) zeta_io(ir) = SIGN( MIN(ABS(zeta_io(ir)), &
(1.0_DP-rho_threshold_gga)), zeta_io(ir) )
zeta = zeta_io(ir)
!
IF ( ABS(zeta)>1.0_DP .OR. rho<=rho_threshold_gga .OR. &
SQRT(ABS(grho))<=rho_threshold_gga ) THEN
sc_out(ir) = 0.0_DP
v1c_out(ir,1) = 0.0_DP ; v2c_out(ir) = 0.0_DP
v1c_out(ir,2) = 0.0_DP
CYCLE
ENDIF
rho = rho_in(ir)
grho = grho_in(ir)
zeta = zeta_io(ir)
!
IF ( ABS(zeta)>1.0_DP .OR. rho<=rho_threshold_gga .OR. &
SQRT(ABS(grho))<=rho_threshold_gga ) CYCLE
!
CALL beeflocalcorrspin( rho, zeta, grho, sc, v1c_up, v1c_dw, v2c, 0 )
!
sc_out(ir) = sc
v1c_out(ir,1) = v1c_up
v1c_out(ir,2) = v1c_dw
v2c_out(ir) = v2c
!
ENDDO
!
ENDIF
!
RETURN
!
END SUBROUTINE gcc_spin_beef
!
!
END MODULE qe_drivers_gga

View File

@ -90,10 +90,9 @@ SUBROUTINE xc_lda( length, rho_in, ex_out, ec_out, vx_out, vc_out )
!
!
#if defined(_OPENACC)
!$acc data copyin(rho_in), copyout(ex_out, vx_out, ec_out, vc_out)
!$acc data deviceptr( rho_in(length),ex_out(length),vx_out(length),ec_out(length),vc_out(length) )
!$acc parallel loop
#endif
#if defined(_OPENMP) && !defined(_OPENACC)
#else
!$omp parallel if(ntids==1) default(none) &
!$omp private( rho, rs, ex, ec, ec_, vx, vc, vc_ ) &
!$omp shared( rho_in, length, iexch, icorr, ex_out, ec_out, vx_out, vc_out, &
@ -257,8 +256,7 @@ SUBROUTINE xc_lda( length, rho_in, ex_out, ec_out, vx_out, vc_out )
ENDDO
#if defined(_OPENACC)
!$acc end data
#endif
#if defined(_OPENMP) && !defined(_OPENACC)
#else
!$omp end do
!$omp end parallel
#endif
@ -317,10 +315,9 @@ SUBROUTINE xc_lsda( length, rho_in, zeta_in, ex_out, ec_out, vx_out, vc_out )
#endif
!
#if defined(_OPENACC)
!$acc data copyin(rho_in, zeta_in), copyout(ex_out, vx_out, ec_out, vc_out)
!$acc data deviceptr( rho_in(length), zeta_in(length) ,ex_out(length),vx_out(length,2),ec_out(length),vc_out(length,2) )
!$acc parallel loop
#endif
#if defined(_OPENMP) && !defined(_OPENACC)
#else
!$omp parallel if(ntids==1) default(none) &
!$omp private( rho, rs, zeta, ex, ec, ec_, vx_up, vx_dw, vc_up, &
!$omp vc_dw, vc_up_, vc_dw_ ) &
@ -477,14 +474,12 @@ SUBROUTINE xc_lsda( length, rho_in, zeta_in, ex_out, ec_out, vx_out, vc_out )
ec_out(ir) = ec ; vc_out(ir,1) = vc_up ; vc_out(ir,2) = vc_dw
!
ENDDO
#if defined(_OPENMP) && !defined(_OPENACC)
#if defined(_OPENACC)
!$acc end data
#else
!$omp end do
!$omp end parallel
#endif
#if defined(_OPENACC)
!$acc end data
#endif
!
!
RETURN
!

View File

@ -17,7 +17,6 @@ MODULE qe_drivers_mgga
USE kind_l, ONLY: DP
USE dft_setting_params, ONLY: imeta, imetac, rho_threshold_mgga, &
grho2_threshold_mgga, tau_threshold_mgga
USE metagga
!
IMPLICIT NONE
!
@ -37,31 +36,33 @@ SUBROUTINE tau_xc( length, rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c
!! Available cases: M06L and TPSS. Other mGGA functionals can be used
!! through Libxc.
!
USE metagga
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! Number of k-points
REAL(DP), DIMENSION(length) :: rho
REAL(DP), INTENT(IN) :: rho(length)
!! Charge density
REAL(DP), DIMENSION(length) :: grho2
REAL(DP), INTENT(IN) :: grho2(length)
!! Square modulus of the density gradient
REAL(DP), DIMENSION(length) :: tau
REAL(DP), INTENT(IN) :: tau(length)
!! Laplacian of the density
REAL(DP), DIMENSION(length) :: ex
REAL(DP), INTENT(OUT) :: ex(length)
!! \(E_x = \int e_x(\text{rho},\text{grho}) dr \)
REAL(DP), DIMENSION(length) :: ec
REAL(DP), INTENT(OUT) :: ec(length)
!! \(E_x = \int e_x(\text{rho},\text{grho}) dr \)
REAL(DP), DIMENSION(length) :: v1x
REAL(DP), INTENT(OUT) :: v1x(length)
!! \( D\ E_x\ /\ D\ \text{rho} \)
REAL(DP), DIMENSION(length) :: v2x
REAL(DP), INTENT(OUT) :: v2x(length)
!! \( D\ E_x\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \)
REAL(DP), DIMENSION(length) :: v3x
REAL(DP), INTENT(OUT) :: v3x(length)
!! \( D\ E_x\ /\ D\ \text{tau} \)
REAL(DP), DIMENSION(length) :: v1c
REAL(DP), INTENT(OUT) :: v1c(length)
!! \( D\ E_c\ /\ D\ \text{rho} \)
REAL(DP), DIMENSION(length) :: v2c
REAL(DP), INTENT(OUT) :: v2c(1,length,1)
!! \( D\ E_c\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \)
REAL(DP), DIMENSION(length) :: v3c
REAL(DP), INTENT(OUT) :: v3c(length)
!! \( D\ E_c\ /\ D\ \text{tau} \)
!
! ... local variables
@ -69,15 +70,23 @@ SUBROUTINE tau_xc( length, rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c
INTEGER :: k
REAL(DP) :: arho
!
v1x=0.d0 ; v2x=0.d0 ; v3x=0.d0 ; ex=0.d0
v1c=0.d0 ; v2c=0.d0 ; v3c=0.d0 ; ec=0.d0
#if defined(_OPENACC)
!$acc data deviceptr( rho(length), grho2(length), tau(length), ex(length), ec(length), &
!$acc& v1x(length), v2x(length), v3x(length), v1c(length), v2c(1,length,1), &
!$acc& v3c(length) )
!
!$acc parallel loop
#endif
DO k = 1, length
!
arho = ABS(rho(k))
!
IF ( (arho<=rho_threshold_mgga).OR.(grho2(k)<=grho2_threshold_mgga).OR. &
(ABS(tau(k))<=rho_threshold_mgga) ) CYCLE
(ABS(tau(k))<=rho_threshold_mgga) ) THEN
v1x(k)=0.d0 ; v2x(k)=0.d0 ; v3x(k)=0.d0 ; ex(k)=0.d0
v1c(k)=0.d0 ; v2c(1,k,1)=0.d0 ; v3c(k)=0.d0 ; ec(k)=0.d0
CYCLE
ENDIF
!
! ...libxc-like threshold management
!grho2(k) = MIN( grho2(k), (8.d0*rho(k)*tau(k))**2 )
@ -86,21 +95,25 @@ SUBROUTINE tau_xc( length, rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c
CASE( 1 )
!
CALL tpsscxc( arho, grho2(k), tau(k), ex(k), ec(k), v1x(k), v2x(k), &
v3x(k), v1c(k), v2c(k), v3c(k) )
v3x(k), v1c(k), v2c(1,k,1), v3c(k) )
!
CASE( 2 )
!
CALL m06lxc( arho, grho2(k), tau(k), ex(k), ec(k), v1x(k), v2x(k), &
v3x(k), v1c(k), v2c(k), v3c(k) )
v3x(k), v1c(k), v2c(1,k,1), v3c(k) )
!
CASE DEFAULT
!
CALL xclib_error( 'tau_xc', 'This case is not implemented', imeta )
!CALL xclib_error( 'tau_xc', 'This case is not implemented', imeta )
!
END SELECT
!
ENDDO
!
#if defined(_OPENACC)
!$acc end data
#endif
!
RETURN
!
END SUBROUTINE tau_xc
@ -114,65 +127,74 @@ SUBROUTINE tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c,
!! Available cases: M06L and TPSS. Other mGGA functionals can be used
!! through Libxc.
!
USE metagga
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! Number of k-points
REAL(DP), INTENT(IN), DIMENSION(length,2) :: rho
REAL(DP), INTENT(IN) :: rho(length,2)
!! Charge density
REAL(DP), INTENT(IN), DIMENSION(3,length,2) :: grho
REAL(DP), INTENT(IN) :: grho(3,length,2)
!! The density gradient
REAL(DP), INTENT(IN), DIMENSION(length,2) :: tau
REAL(DP), INTENT(IN) :: tau(length,2)
!! Laplacian of the density
REAL(DP), INTENT(OUT), DIMENSION(length) :: ex
REAL(DP), INTENT(OUT) :: ex(length)
!! \(E_x = \int e_x(\text{rho},\text{grho}) dr \)
REAL(DP), INTENT(OUT), DIMENSION(length) :: ec
REAL(DP), INTENT(OUT) :: ec(length)
!! \(E_x = \int e_x(\text{rho},\text{grho}) dr \)
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1x
REAL(DP), INTENT(OUT) :: v1x(length,2)
!! \( D\ E_x\ /\ D\ \text{rho} \)
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v2x
REAL(DP), INTENT(OUT) :: v2x(length,2)
!! \( D\ E_x\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \)
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v3x
REAL(DP), INTENT(OUT) :: v3x(length,2)
!! \( D\ E_x\ /\ D\ \text{tau} \)
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v1c
REAL(DP), INTENT(OUT) :: v1c(length,2)
!! \( D\ E_c\ /\ D\ \text{rho} \)
REAL(DP), INTENT(OUT), DIMENSION(3,length,2) :: v2c
REAL(DP), INTENT(OUT) :: v2c(3,length,2)
!! \( D\ E_c\ /\ D( D\ \text{rho}/D\ r_\alpha )/|\nabla\text{rho}| \)
REAL(DP), INTENT(OUT), DIMENSION(length,2) :: v3c
REAL(DP), INTENT(OUT) :: v3c(length,2)
!! \( D\ E_c\ /\ D\ \text{tau} \)
!
! ... local variables
!
INTEGER :: k
REAL(DP) :: rh, zeta, atau, grho2(2), ggrho2
REAL(DP) :: rh, zeta, atau, grho2up, grho2dw, ggrho2
REAL(DP) :: v2cup, v2cdw
!
ex=0.0_DP ; v1x=0.0_DP ; v2x=0.0_DP ; v3x=0.0_DP
ec=0.0_DP ; v1c=0.0_DP ; v2c=0.0_DP ; v3c=0.0_DP
#if defined(_OPENACC)
!$acc data deviceptr( rho(length,2), grho(3,length,2), tau(length,2), ex(length), &
!$acc& ec(length), v1x(length,2), v2x(length,2), v3x(length,2), &
!$acc& v1c(length,2), v2c(3,length,2), v3c(length,2) )
!
!$acc parallel loop
#endif
DO k = 1, length
!
rh = rho(k,1) + rho(k,2)
atau = tau(k,1) + tau(k,2) ! KE-density in Hartree
! ...libxc-like threshold management
!grho2(1) = MIN( SUM(grho(:,k,1)**2), (8.d0*rho(k,1)*tau(k,1))**2 )
!grho2(2) = MIN( SUM(grho(:,k,2)**2), (8.d0*rho(k,2)*tau(k,2))**2 )
grho2(1) = SUM(grho(:,k,1)**2)
grho2(2) = SUM(grho(:,k,2)**2)
ggrho2 = ( grho2(1) + grho2(2) ) * 4.0_DP
!grho2up = MIN( SUM(grho(:,k,1)**2), (8.d0*rho(k,1)*tau(k,1))**2 )
!grho2dw = MIN( SUM(grho(:,k,2)**2), (8.d0*rho(k,2)*tau(k,2))**2 )
grho2up = SUM(grho(:,k,1)**2)
grho2dw = SUM(grho(:,k,2)**2)
ggrho2 = ( grho2up + grho2dw ) * 4.0_DP
!
IF ( (rh <= rho_threshold_mgga).OR.(ggrho2 <= grho2_threshold_mgga).OR.&
(ABS(atau) <= tau_threshold_mgga) ) CYCLE
(ABS(atau) <= tau_threshold_mgga) ) THEN
v1x(k,:)=0.d0 ; v2x(k,:)=0.d0 ; v3x(k,:)=0.d0 ; ex(k)=0.d0
v1c(k,:)=0.d0 ; v2c(:,k,:)=0.d0 ; v3c(k,:)=0.d0 ; ec(k)=0.d0
CYCLE
ENDIF
!
SELECT CASE( imeta )
CASE( 1 )
!
CALL tpsscx_spin( rho(k,1), rho(k,2), grho2(1), grho2(2), tau(k,1), &
CALL tpsscx_spin( rho(k,1), rho(k,2), grho2up, grho2dw, tau(k,1), &
tau(k,2), ex(k), v1x(k,1), v1x(k,2), v2x(k,1), &
v2x(k,2), v3x(k,1), v3x(k,2) )
!
zeta = (rho(k,1) - rho(k,2)) / rh
zeta = MAX( MIN( 0.99999999_DP, zeta ), -0.99999999_DP )
zeta = MAX( MIN( 0.99999999_DP, (rho(k,1)-rho(k,2))/rh ), -0.99999999_DP )
!
CALL tpsscc_spin( rh, zeta, grho(:,k,1), grho(:,k,2), atau, ec(k), &
v1c(k,1), v1c(k,2), v2c(:,k,1), v2c(:,k,2), &
@ -180,7 +202,7 @@ SUBROUTINE tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c,
!
CASE( 2 )
!
CALL m06lxc_spin( rho(k,1), rho(k,2), grho2(1), grho2(2), tau(k,1), &
CALL m06lxc_spin( rho(k,1), rho(k,2), grho2up, grho2dw, tau(k,1), &
tau(k,2), ex(k), ec(k), v1x(k,1), v1x(k,2), &
v2x(k,1), v2x(k,2), v3x(k,1), v3x(k,2), v1c(k,1), &
v1c(k,2), v2cup, v2cdw, v3c(k,1), v3c(k,2) )
@ -190,12 +212,16 @@ SUBROUTINE tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c,
!
CASE DEFAULT
!
CALL xclib_error( 'tau_xc_spin', 'This case not implemented', imeta )
!CALL xclib_error( 'tau_xc_spin', 'This case not implemented', imeta )
!
END SELECT
!
ENDDO
!
#if defined(_OPENACC)
!$acc end data
#endif
!
RETURN
!
END SUBROUTINE tau_xc_spin

View File

@ -6,22 +6,23 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!------------------------------------------------------------------------
MODULE metagga !<GPU:metagga=>metagga_gpu>
MODULE metagga
!-------------------------------------------------------------------------
!! MetaGGA functionals. Available functionals:
!
!! * TPSS (Tao, Perdew, Staroverov & Scuseria)
!! * M06L
!
USE exch_lda, ONLY: slater !<GPU:slater=>slater_d,exch_lda=>exch_lda_gpu>
USE corr_lda, ONLY: pw, pw_spin !<GPU:pw=>pw_d,pw_spin=>pw_spin_d,corr_lda=>corr_lda_gpu>
USE corr_gga, ONLY: pbec, pbec_spin !<GPU:pbec=>pbec_d,pbec_spin=>pbec_spin_d,corr_gga=>corr_gga_gpu>
USE exch_lda, ONLY: slater
USE corr_lda, ONLY: pw, pw_spin
USE corr_gga, ONLY: pbec, pbec_spin
!
CONTAINS
! TPSS
!
!-------------------------------------------------------------------------
SUBROUTINE tpsscxc( rho, grho, tau, sx, sc, v1x, v2x, v3x, v1c, v2c, v3c ) !<GPU:DEVICE>
SUBROUTINE tpsscxc( rho, grho, tau, sx, sc, v1x, v2x, v3x, v1c, v2c, v3c )
!$acc routine seq
!-----------------------------------------------------------------------
!! TPSS metaGGA corrections for exchange and correlation - Hartree a.u.
!! Definition: \(E_x = \int E_x(\text{rho},\text{grho}) dr\)
@ -70,9 +71,9 @@ SUBROUTINE tpsscxc( rho, grho, tau, sx, sc, v1x, v2x, v3x, v1c, v2c, v3c )
ENDIF
!
! exchange
CALL metax( rho, grho, tau, sx, v1x, v2x, v3x ) !<GPU:metax=>metax_d>
CALL metax( rho, grho, tau, sx, v1x, v2x, v3x )
! correlation
CALL metac( rho, grho, tau, sc, v1c, v2c, v3c ) !<GPU:metac=>metac_d>
CALL metac( rho, grho, tau, sc, v1c, v2c, v3c )
!
RETURN
!
@ -80,7 +81,8 @@ END SUBROUTINE tpsscxc
!
!
!-------------------------------------------------------------------------
SUBROUTINE metax( rho, grho2, tau, ex, v1x, v2x, v3x ) !<GPU:DEVICE>
SUBROUTINE metax( rho, grho2, tau, ex, v1x, v2x, v3x )
!$acc routine seq
!--------------------------------------------------------------------
!! TPSS meta-GGA exchange potential and energy.
!
@ -133,8 +135,8 @@ SUBROUTINE metax( rho, grho2, tau, ex, v1x, v2x, v3x ) !<GPU:
ENDIF
!
rs = pi34/rho**third
CALL slater( rs, ex_unif, vx_unif ) !<GPU:slater=>slater_d>
CALL metaFX( rho, grho2, tau, fx, f1x, f2x, f3x ) !<GPU:metaFX=>metaFX_d>
CALL slater( rs, ex_unif, vx_unif )
CALL metaFX( rho, grho2, tau, fx, f1x, f2x, f3x )
!
ex = rho*ex_unif
v1x = vx_unif*fx + ex*f1x
@ -148,7 +150,8 @@ END SUBROUTINE metax
!
!
!------------------------------------------------------------------
SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) !<GPU:DEVICE>
SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c )
!$acc routine seq
!--------------------------------------------------------------
!! TPSS meta-GGA correlation energy and potentials.
!
@ -208,11 +211,11 @@ SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) !<GPU:
IF (rhoup > small) THEN
!
rs = (pi34/rhoup)**third
CALL pw_spin( rs, 1.0_DP, ec_unif, vc_unif_s(1), vc_unif_s(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, 1.0_DP, ec_unif, vc_unif_s(1), vc_unif_s(2) )
!
IF (ABS(grhoup) > small) THEN
! zeta=1.0_DP-small to avoid pow_e of 0 in pbec_spin
CALL pbec_spin( rhoup, 1.0_DP-small, grhoup**2, 1, & !<GPU:pbec_spin=>pbec_spin_d>
CALL pbec_spin( rhoup, 1.0_DP-small, grhoup**2, 1, &
ec_sum, v1c_sum(1), v1c_sum(2), v2c_sum )
ELSE
ec_sum = 0.0_DP
@ -229,13 +232,13 @@ SUBROUTINE metac( rho, grho2, tau, ec, v1c, v2c, v3c ) !<GPU:
ENDIF
!
rs = (pi34/rho)**third
CALL pw( rs, 1, ec_unif, vc_unif ) !<GPU:pw=>pw_d>
CALL pw( rs, 1, ec_unif, vc_unif )
!
! ... PBE correlation energy and potential:
! ec_pbe=rho*H, not rho*(epsion_c_uinf + H)
! v1c_pbe=D (rho*H) /D rho
! v2c_pbe= for rho, 2 for
CALL pbec( rho, grho2, 1, ec_pbe, v1c_pbe, v2c_pbe ) !<GPU:pbec=>pbec_d>
CALL pbec( rho, grho2, 1, ec_pbe, v1c_pbe, v2c_pbe )
!
ec_pbe = ec_pbe/rho+ec_unif
v1c_pbe = (v1c_pbe+vc_unif-ec_pbe)/rho
@ -275,7 +278,8 @@ END SUBROUTINE metac
!
!
!-------------------------------------------------------------------------
SUBROUTINE metaFX( rho, grho2, tau, fx, f1x, f2x, f3x ) !<GPU:DEVICE>
SUBROUTINE metaFX( rho, grho2, tau, fx, f1x, f2x, f3x )
!$acc routine seq
!-------------------------------------------------------------------------
!
USE kind_l, ONLY : DP
@ -397,8 +401,9 @@ END SUBROUTINE metaFX
!
!
!---------------------------------------------------------------------------
SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, & !<GPU:DEVICE>
SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, &
v1xup, v1xdw, v2xup, v2xdw, v3xup, v3xdw )
!$acc routine seq
!-----------------------------------------------------------------------
!! TPSS metaGGA for exchange - Hartree a.u.
!
@ -442,7 +447,7 @@ SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, &
rho = rhoup + rhodw
IF (rhoup>small .AND. SQRT(ABS(grhoup2))>small &
.AND. ABS(tauup) > small) THEN
CALL metax( 2.0_DP*rhoup, 4.0_DP*grhoup2, & !<GPU:metax=>metax_d>
CALL metax( 2.0_DP*rhoup, 4.0_DP*grhoup2, &
2.0_DP*tauup, sxup, v1xup, v2xup, v3xup )
ELSE
sxup = 0.0_DP
@ -453,7 +458,7 @@ SUBROUTINE tpsscx_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, sx, &
!
IF (rhodw > small .AND. SQRT(ABS(grhodw2)) > small &
.AND. ABS(taudw) > small) THEN
CALL metax( 2.0_DP*rhodw, 4.0_DP*grhodw2, & !<GPU:metax=>metax_d>
CALL metax( 2.0_DP*rhodw, 4.0_DP*grhodw2, &
2.0_DP*taudw, sxdw, v1xdw, v2xdw, v3xdw )
ELSE
sxdw = 0.0_DP
@ -472,8 +477,9 @@ END SUBROUTINE tpsscx_spin
!
!
!---------------------------------------------------------------------------
SUBROUTINE tpsscc_spin( rho, zeta, grhoup, grhodw, & !<GPU:DEVICE>
SUBROUTINE tpsscc_spin( rho, zeta, grhoup, grhodw, &
atau, sc, v1cup, v1cdw, v2cup, v2cdw, v3cup, v3cdw )
!$acc routine seq
!--------------------------------------------------------------------------
!! TPSS metaGGA for correlations - Hartree a.u.
!
@ -536,7 +542,7 @@ SUBROUTINE tpsscc_spin( rho, zeta, grhoup, grhodw, & !<GPU:DEVICE>
!
v3c = 0.0_DP
ELSE
CALL metac_spin( rho, zeta, grhoup, grhodw, & !<GPU:metac_spin=>metac_spin_d>
CALL metac_spin( rho, zeta, grhoup, grhodw, &
atau, sc, v1cup, v1cdw, v2cup, v2cdw, v3c )
ENDIF
!
@ -549,8 +555,9 @@ END SUBROUTINE tpsscc_spin
!
!
!---------------------------------------------------------------
SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & !<GPU:DEVICE>
SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, &
tau, sc, v1up, v1dw, v2up, v2dw, v3 )
!$acc routine seq
!---------------------------------------------------------------
USE kind_l, ONLY : DP
!
@ -634,10 +641,10 @@ SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & !<GPU:DEVICE>
!
v2_tmp = 0.0_DP
rs = (pi34/rho)**third
CALL pw_spin( rs, zeta, ec_u, vc_u(1), vc_u(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, zeta, ec_u, vc_u(1), vc_u(2) )
!
IF ((ABS(grho) > small) .AND. (zeta <= 1.0_DP)) THEN
CALL pbec_spin( rho, zeta, grho2, 1, ec_pbe, v1_pbe(1), v1_pbe(2), v2_tmp ) !<GPU:pbec_spin=>pbec_spin_d>
CALL pbec_spin( rho, zeta, grho2, 1, ec_pbe, v1_pbe(1), v1_pbe(2), v2_tmp )
v1up_pbe=v1_pbe(1)
v1dw_pbe=v1_pbe(2)
ELSE
@ -668,10 +675,10 @@ SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & !<GPU:DEVICE>
v2_tmp = 0.0_DP
!
rs = (pi34/rhoup)**third
CALL pw_spin( rs, 1.0_DP, ec_u, vc_u(1), vc_u(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, 1.0_DP, ec_u, vc_u(1), vc_u(2) )
!
IF (SQRT(grhoup2) > small) THEN
CALL pbec_spin( rhoup, 1.0_DP-small, grhoup2, 1, & !<GPU:pbec_spin=>pbec_spin_d>
CALL pbec_spin( rhoup, 1.0_DP-small, grhoup2, 1, &
ecup_0, v1_0v(1), v1_0v(2), v2_tmp )
v1up_0 = v1_0v(1)
v1dw_0 = v1_0v(2)
@ -709,11 +716,11 @@ SUBROUTINE metac_spin( rho, zeta, grhoup, grhodw, & !<GPU:DEVICE>
v2_tmp = 0.0_DP
!
rs = (pi34/rhodw)**third
CALL pw_spin( rs, -1.0_DP, ec_u, vc_u(1), vc_u(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, -1.0_DP, ec_u, vc_u(1), vc_u(2) )
!
IF (SQRT(grhodw2) > small) THEN
!
CALL pbec_spin( rhodw, -1.0_DP+small, grhodw2, 1, & !<GPU:pbec_spin=>pbec_spin_d>
CALL pbec_spin( rhodw, -1.0_DP+small, grhodw2, 1, &
ecdw_0, v1_0v(1), v1_0v(2), v2_tmp )
!
v1up_0 = v1_0v(1)
@ -877,7 +884,8 @@ END SUBROUTINE metac_spin
! ec, v1c, v2c, v3c as above for correlation
!
!-------------------------------------------------------------------------
SUBROUTINE m06lxc( rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c ) !<GPU:DEVICE>
SUBROUTINE m06lxc( rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
!$acc routine seq
!-----------------------------------------------------------------------
!
USE kind_l, ONLY : DP
@ -923,14 +931,14 @@ SUBROUTINE m06lxc( rho, grho2, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
taub = taua ! Tau is defined as summ_i( |nabla phi_i|**2 )
! in the following M06L routines
!
CALL m06lx( rhoa, grho2a, taua, ex, v1x, v2x, v3x ) !<GPU:m06lx=>m06lx_d>
CALL m06lx( rhoa, grho2a, taua, ex, v1x, v2x, v3x )
!
ex = two * ex ! Add the two components up + dw
!
v2x = 0.5_dp * v2x
!v3x = 2.0_dp * v3x !**mismatch with Libxc by a factor of 2
!
CALL m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c, v2c, v3c, & !<GPU:m06lc=>m06lc_d>
CALL m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c, v2c, v3c, &
v1cb, v2cb, v3cb )
!
v2c = 0.5_dp * v2c
@ -940,9 +948,10 @@ END SUBROUTINE m06lxc
!
!
!-------------------------------------------------------------------------
SUBROUTINE m06lxc_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, & !<GPU:DEVICE>
SUBROUTINE m06lxc_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, &
ex, ec, v1xup, v1xdw, v2xup, v2xdw, v3xup, v3xdw, &
v1cup, v1cdw, v2cup, v2cdw, v3cup, v3cdw )
!$acc routine seq
!-----------------------------------------------------------------------
!
USE kind_l, ONLY : DP
@ -959,14 +968,14 @@ SUBROUTINE m06lxc_spin( rhoup, rhodw, grhoup2, grhodw2, tauup, taudw, &
taua = tauup * two ! Tau is defined as summ_i( |nabla phi_i|**2 )
taub = taudw * two ! in the rest of the routine
!
CALL m06lx( rhoup, grhoup2, taua, exup, v1xup, v2xup, v3xup ) !<GPU:m06lx=>m06lx_d>
CALL m06lx( rhodw, grhodw2, taub, exdw, v1xdw, v2xdw, v3xdw ) !<GPU:m06lx=>m06lx_d>
CALL m06lx( rhoup, grhoup2, taua, exup, v1xup, v2xup, v3xup )
CALL m06lx( rhodw, grhodw2, taub, exdw, v1xdw, v2xdw, v3xdw )
!
ex = exup + exdw
!v3xup = 2.0_dp * v3xup !**mismatch with Libxc by a factor of 2
!v3xdw = 2.0_dp * v3xdw !**
!
CALL m06lc( rhoup, rhodw, grhoup2, grhodw2, taua, taub, & !<GPU:m06lc=>m06lc_d>
CALL m06lc( rhoup, rhodw, grhoup2, grhodw2, taua, taub, &
ec, v1cup, v2cup, v3cup, v1cdw, v2cdw, v3cdw )
!v3cup = 2.0_dp * v3cup !**
!v3cdw = 2.0_dp * v3cdw !**
@ -977,7 +986,8 @@ END SUBROUTINE m06lxc_spin
!=============================== M06L exchange ==========================
!
!-------------------------------------------------------------------------------
SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) !<GPU:DEVICE>
SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x )
!$acc routine seq
!---------------------------------------------------------------------------
!! M06L exchange.
!
@ -1082,7 +1092,7 @@ SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) !<GPU:DEVICE>
gh = one + alpha * (xs2 + zs)
!
IF (gh >= small) THEN
CALL gvt4( xs2, zs, d0, d1, d2, d3, d4, d5, alpha, hg, dhg_dxs2, dhg_dzs ) !<GPU:gvt4=>gvt4_d>
CALL gvt4( xs2, zs, d0, d1, d2, d3, d4, d5, alpha, hg, dhg_dxs2, dhg_dzs )
ELSE
hg = zero
dhg_dxs2 = zero
@ -1121,7 +1131,7 @@ SUBROUTINE m06lx( rho, grho2, tau, ex, v1x, v2x, v3x ) !<GPU:DEVICE>
dfws_drho = dfws*dws_dts*dts_drho
dfws_dtau = dfws*dws_dts*dts_dtau
!
CALL pbex_m06l( two*rho, four*grho2, sx_pbe, v1x_pbe, v2x_pbe ) !<GPU:pbex_m06l=>pbex_m06l_d>
CALL pbex_m06l( two*rho, four*grho2, sx_pbe, v1x_pbe, v2x_pbe )
!
v1x_unif = f43 * CX * rho13
!
@ -1143,7 +1153,8 @@ END SUBROUTINE m06lx
!
!
!-------------------------------------------------------------------
SUBROUTINE pbex_m06l( rho, grho2, sx, v1x, v2x ) !<GPU:DEVICE>
SUBROUTINE pbex_m06l( rho, grho2, sx, v1x, v2x )
!$acc routine seq
!---------------------------------------------------------------
!! PBE exchange (without Slater exchange):
!! J.P.Perdew, K.Burke, M.Ernzerhof, PRL 77, 3865 (1996).
@ -1211,8 +1222,9 @@ END SUBROUTINE pbex_m06l
!=============================== M06L correlation ==========================
!
!---------------------------------------------------------------------------------
SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, & !<GPU:DEVICE>
SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, &
v3c_up, v1c_dw, v2c_dw, v3c_dw )
!$acc routine seq
!-------------------------------------------------------------------------------
!! M06L correlation.
!
@ -1366,12 +1378,12 @@ SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, &
!
rs = rsa
zeta = one
CALL pw_spin( rs, zeta, ec_pw_a, vc_v(1), vc_v(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, zeta, ec_pw_a, vc_v(1), vc_v(2) )
vc_pw_a = vc_v(1)
vv = vc_v(2)
!
CALL gvt4( xs2a, zsa, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hga, dhga_dxs2a, dhga_dzsa ) !<GPU:gvt4=>gvt4_d>
CALL gfunc( cs, gama_s, xs2a, gsa, dgsa_dxs2a ) !<GPU:gfunc=>gfunc_d>
CALL gvt4( xs2a, zsa, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hga, dhga_dxs2a, dhga_dzsa )
CALL gfunc( cs, gama_s, xs2a, gsa, dgsa_dxs2a )
!
Ec_UEG_aa = rhoa*ec_pw_a
num = (dgsa_dxs2a + dhga_dxs2a)*Dsa + (gsa + hga)*dDsa_dxs2a
@ -1426,12 +1438,12 @@ SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, &
!
zeta = one
rs = rsb
CALL pw_spin( rs, zeta, ec_pw_b, vc_v(1), vc_v(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, zeta, ec_pw_b, vc_v(1), vc_v(2) )
vc_pw_b = vc_v(1)
vv = vc_v(2)
!
CALL gvt4( xs2b, zsb, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hgb, dhgb_dxs2b, dhgb_dzsb ) !<GPU:gvt4=>gvt4_d>
CALL gfunc( cs, gama_s, xs2b, gsb, dgsb_dxs2b ) !<GPU:gfunc=>gfunc_d>
CALL gvt4( xs2b, zsb, ds0, ds1, ds2, ds3, ds4, ds5, alpha_s, hgb, dhgb_dxs2b, dhgb_dzsb )
CALL gfunc( cs, gama_s, xs2b, gsb, dgsb_dxs2b )
!
Ec_UEG_bb = rhob*ec_pw_b
num = (dgsb_dxs2b + dhgb_dxs2b)*Dsb + (gsb + hgb)*dDsb_dxs2b
@ -1468,13 +1480,13 @@ SUBROUTINE m06lc( rhoa, rhob, grho2a, grho2b, taua, taub, ec, v1c_up, v2c_up, &
zeta = (rhoa - rhob)/rho
rs = (pi34/rho)**f13
!
CALL gvt4( xs2ab, zsab, dab0, dab1, dab2, dab3, dab4, dab5, alpha_ab, hgab, & !<GPU:gvt4=>gvt4_d>
CALL gvt4( xs2ab, zsab, dab0, dab1, dab2, dab3, dab4, dab5, alpha_ab, hgab, &
dhgab_dxs2ab, dhgab_dzsab )
!
CALL pw_spin( rs, zeta, ec_pw_ab, vc_v(1), vc_v(2) ) !<GPU:pw_spin=>pw_spin_d>
CALL pw_spin( rs, zeta, ec_pw_ab, vc_v(1), vc_v(2) )
vc_pw_up=vc_v(1) ; vc_pw_dw=vc_v(2)
!
CALL gfunc( cab, gama_ab, xs2ab, gsab, dgsab_dxs2ab ) !<GPU:gfunc=>gfunc_d>
CALL gfunc( cab, gama_ab, xs2ab, gsab, dgsab_dxs2ab )
!
decab_drhoa = vc_pw_up - vc_pw_a
decab_drhob = vc_pw_dw - vc_pw_b
@ -1515,7 +1527,8 @@ END SUBROUTINE m06lc
!
!
!-------------------------------------------------------------------
SUBROUTINE gfunc( cspin, gama, xspin, gs, dgs_dx ) !<GPU:DEVICE>
SUBROUTINE gfunc( cspin, gama, xspin, gs, dgs_dx )
!$acc routine seq
!-----------------------------------------------------------------
!
USE kind_l, ONLY : DP
@ -1545,7 +1558,8 @@ SUBROUTINE gfunc( cspin, gama, xspin, gs, dgs_dx ) !<GPU:DEVICE>
!
!
!-------------------------------------------------------------------------
SUBROUTINE gvt4( x, z, a, b, c, d, e, f, alpha, hg, dh_dx, dh_dz ) !<GPU:DEVICE>
SUBROUTINE gvt4( x, z, a, b, c, d, e, f, alpha, hg, dh_dx, dh_dz )
!$acc routine seq
!----------------------------------------------------------------------
!
USE kind_l, ONLY : DP

View File

@ -22,20 +22,21 @@ MODULE beef_interface
INTERFACE
!
SUBROUTINE beefx( r, g, e, dr, dg, addlda ) BIND(C, NAME="beefx_")
!$acc routine seq
USE iso_c_binding
REAL (C_DOUBLE) :: r, g, e, dr, dg
INTEGER(C_INT), INTENT(IN) :: addlda
END SUBROUTINE beefx
!
SUBROUTINE beeflocalcorr( r, g, e, dr, dg, addlda) &
BIND(C, NAME="beeflocalcorr_")
SUBROUTINE beeflocalcorr( r, g, e, dr, dg, addlda) BIND(C, NAME="beeflocalcorr_")
!$acc routine seq
USE iso_c_binding
REAL (C_DOUBLE), INTENT(INOUT) :: r, g, e, dr, dg
INTEGER(C_INT), INTENT(IN) :: addlda
END SUBROUTINE beeflocalcorr
!
SUBROUTINE beeflocalcorrspin(r, z, g, e, drup, drdown, dg, addlda) &
BIND(C, NAME="beeflocalcorrspin_")
SUBROUTINE beeflocalcorrspin(r, z, g, e, drup, drdown, dg, addlda) BIND(C, NAME="beeflocalcorrspin_")
!$acc routine seq
USE iso_c_binding
REAL (C_DOUBLE), INTENT(INOUT) :: r, z, g, e, drup, drdown, dg
INTEGER(C_INT), INTENT(IN) :: addlda

View File

@ -14,16 +14,16 @@ MODULE xc_lib
!
IMPLICIT NONE
!
SAVE
!
SAVE
!
PRIVATE
!
!
!PUBLIC :: xc, dmxc !LDA
PUBLIC :: xc !, dmxc !LDA
!
PUBLIC :: xc_gcx !, dgcxc !GGA
!
!PUBLIC :: xc_metagcx !MGGA
PUBLIC :: xc_metagcx !MGGA
!
PUBLIC :: xclib_set_dft_from_name, &
xclib_set_dft_IDs, &
@ -56,17 +56,46 @@ MODULE xc_lib
xclib_reset_dft, &
dft_force_hybrid
!
INTERFACE xc
SUBROUTINE xc( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out, gpu_args_ )
USE kind_l, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: length, srd, svd
REAL(DP), INTENT(IN) :: rho_in(length,srd)
REAL(DP), INTENT(OUT) :: ex_out(length), ec_out(length)
REAL(DP), INTENT(OUT) :: vx_out(length,svd), vc_out(length,svd)
LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_
END SUBROUTINE
END INTERFACE
!
!
INTERFACE xc_gcx
SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
USE kind_l, ONLY: DP
SUBROUTINE xc_gcx( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud, &
gpu_args_ )
USE kind_l, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: length, ns
REAL(DP), INTENT(IN) :: rho(:,:), grho(:,:,:)
REAL(DP), INTENT(OUT) :: ex(:), ec(:)
REAL(DP), INTENT(OUT) :: v1x(:,:), v2x(:,:)
REAL(DP), INTENT(OUT) :: v1c(:,:), v2c(:,:)
REAL(DP), OPTIONAL, INTENT(OUT) :: v2c_ud(:)
REAL(DP), INTENT(IN) :: rho(length,ns), grho(3,length,ns)
REAL(DP), INTENT(OUT) :: ex(length), ec(length)
REAL(DP), INTENT(OUT) :: v1x(length,ns), v2x(length,ns)
REAL(DP), INTENT(OUT) :: v1c(length,ns), v2c(length,ns)
REAL(DP), OPTIONAL, INTENT(OUT) :: v2c_ud(length)
LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_
END SUBROUTINE
END INTERFACE
!
!
INTERFACE xc_metagcx
SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, &
v1c, v2c, v3c, gpu_args_ )
USE kind_l, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: length, ns, np
REAL(DP), INTENT(IN) :: rho(length,ns), grho(3,length,ns), tau(length,ns)
REAL(DP), INTENT(OUT) :: ex(length), ec(length)
REAL(DP), INTENT(OUT) :: v1x(length,ns), v2x(length,ns), v3x(length,ns)
REAL(DP), INTENT(OUT) :: v1c(length,ns), v2c(np,length,ns), v3c(length,ns)
LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_
END SUBROUTINE
END INTERFACE
!

View File

@ -42,13 +42,13 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
! ... local variables
!
REAL(DP), ALLOCATABLE :: vrrx(:,:), vsrx(:,:), vssx(:,:)
REAL(DP), ALLOCATABLE :: vrrc(:,:), vsrc(:,:), vssc(:), vrzc(:,:)
REAL(DP), ALLOCATABLE :: vrrc(:,:), vsrc(:,:), vssc(:), vrzc(:,:)
!
#if defined(__LIBXC)
INTEGER :: fkind=-10
REAL(DP), ALLOCATABLE :: rho_lxc(:)
REAL(DP), ALLOCATABLE :: v2rho2_x(:), v2rhosigma_x(:), v2sigma2_x(:)
REAL(DP), ALLOCATABLE :: v2rho2_c(:), v2rhosigma_c(:), v2sigma2_c(:)
REAL(DP), ALLOCATABLE :: v2rho2_x(:), v2rhosigma_x(:), v2sigma2_x(:)
REAL(DP), ALLOCATABLE :: v2rho2_c(:), v2rhosigma_c(:), v2sigma2_c(:)
#if (XC_MAJOR_VERSION > 4)
INTEGER(8) :: lengthxc
#else
@ -83,7 +83,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
CASE( 1 )
!
DO k = 1, length
rho_lxc(k) = r_in(k,1)
rho_lxc(k) = r_in(k,1)
sigma(k) = g_in(k,1,1)**2 + g_in(k,2,1)**2 + g_in(k,3,1)**2
ENDDO
!
@ -130,7 +130,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
v2sigma2_c(length_dlxc*sp) )
! ... DERIVATIVE FOR CORRELATION
v2rho2_c = 0._DP ; v2rhosigma_c = 0._DP ; v2sigma2_c = 0._DP
IF (igcc /= 0) THEN
IF (igcc /= 0) THEN
fkind = xc_f03_func_info_get_kind( xc_info(4) )
CALL xc_f03_func_set_dens_threshold( xc_func(4), epsr )
CALL xc_f03_gga_fxc( xc_func(4), lengthxc, rho_lxc(1), sigma(1), v2rho2_c(1), &
@ -153,7 +153,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
IF (.NOT. ALLOCATED(sigma)) THEN
ALLOCATE( sigma(length) )
sigma(:) = g_in(:,1,1)**2 + g_in(:,2,1)**2 + g_in(:,3,1)**2
ENDIF
ENDIF
!
CALL dgcxc_unpol( length, r_in(:,1), sigma, vrrx(:,1), vsrx(:,1), vssx(:,1), &
vrrc(:,1), vsrc(:,1), vssc )
@ -176,12 +176,12 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
dvxc_rr(k,1,1) = e2 * (vrrx(k,1) + vrrc(k,1))
dvxc_sr(k,1,1) = e2 * (vsrx(k,1) + vsrc(k,1))
dvxc_ss(k,1,1) = e2 * (vssx(k,1) + vssc(k) )
ENDDO
ENDDO
!
DEALLOCATE( vrrx, vsrx, vssx )
DEALLOCATE( vrrc, vsrc, vssc )
!
ENDIF
ENDIF
!
IF ( is_libxc(3) ) THEN
DO k = 1, length
@ -189,7 +189,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
IF ( rho_lxc(k)>rho_threshold_lda ) THEN
dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_x(k)
dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_x(k)*2._DP
ENDIF
ENDIF
dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_x(k)*4._DP
ENDDO
ENDIF
@ -199,7 +199,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
IF ( rho_lxc(k)>rho_threshold_lda ) THEN
dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_c(k)
dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_c(k)*2._DP
ENDIF
ENDIF
dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_c(k)*4._DP
ENDDO
ENDIF
@ -230,23 +230,23 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
dvxc_ss(k,2,1) = e2*vssc(k)
dvxc_ss(k,2,2) = e2*(vssx(k,2) + vssc(k))
!
ENDDO
ENDDO
!
DEALLOCATE( vrrx, vsrx, vssx )
DEALLOCATE( vrrc, vsrc, vssc )
DEALLOCATE( vrzc )
!
ENDIF
!
IF ( is_libxc(3) ) THEN
!
!
IF ( is_libxc(3) ) THEN
!
DO k = 1, length
thr_up_cond = r_in(k,1)>epsr .AND. SQRT(ABS(sigma(3*k-2)))>epsg
thr_dw_cond = r_in(k,2)>epsr .AND. SQRT(ABS(sigma(3*k)))>epsg
IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE
IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE
dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_x(3*k-2)
dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_x(6*k-5)*4._DP
dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_x(3*k)
dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_x(3*k)
dvxc_ss(k,2,2) = dvxc_ss(k,2,2) + e2 * v2sigma2_x(6*k)*4._DP
dvxc_rr(k,1,2) = dvxc_rr(k,1,2) + e2 * v2rho2_x(3*k-1)
dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_x(6*k-5)*2._DP
@ -260,30 +260,30 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
!
IF ( is_libxc(4) ) THEN
!
DO k = 1, length
DO k = 1, length
thr_up_cond = r_in(k,1)>epsr .AND. SQRT(ABS(sigma(3*k-2)))>epsg
thr_dw_cond = r_in(k,2)>epsr .AND. SQRT(ABS(sigma(3*k)))>epsg
IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE
dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_c(3*k-2)
dvxc_rr(k,1,2) = dvxc_rr(k,1,2) + e2 * v2rho2_c(3*k-1)
dvxc_rr(k,2,1) = dvxc_rr(k,2,1) + e2 * v2rho2_c(3*k-1)
dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_c(3*k)
dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_c(6*k-5)*2.d0
dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_c(6*k)*4.d0
dvxc_sr(k,1,2) = dvxc_sr(k,1,2) + e2 * v2rhosigma_c(6*k-4)
dvxc_sr(k,2,1) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k-1)
dvxc_ss(k,1,2) = dvxc_ss(k,1,2) + e2 * v2sigma2_c(6*k-2)
dvxc_ss(k,2,1) = dvxc_ss(k,2,1) + e2 * v2sigma2_c(6*k-2)
dvxc_sr(k,2,2) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k)*2.d0
dvxc_ss(k,2,2) = dvxc_ss(k,2,2) + e2 * v2sigma2_c(6*k)*4.d0
ENDDO
!
DEALLOCATE( v2rho2_c, v2rhosigma_c, v2sigma2_c )
!
IF ( .NOT.thr_up_cond .OR. .NOT.thr_dw_cond ) CYCLE
dvxc_rr(k,1,1) = dvxc_rr(k,1,1) + e2 * v2rho2_c(3*k-2)
dvxc_rr(k,1,2) = dvxc_rr(k,1,2) + e2 * v2rho2_c(3*k-1)
dvxc_rr(k,2,1) = dvxc_rr(k,2,1) + e2 * v2rho2_c(3*k-1)
dvxc_rr(k,2,2) = dvxc_rr(k,2,2) + e2 * v2rho2_c(3*k)
dvxc_sr(k,1,1) = dvxc_sr(k,1,1) + e2 * v2rhosigma_c(6*k-5)*2.d0
dvxc_ss(k,1,1) = dvxc_ss(k,1,1) + e2 * v2sigma2_c(6*k)*4.d0
dvxc_sr(k,1,2) = dvxc_sr(k,1,2) + e2 * v2rhosigma_c(6*k-4)
dvxc_sr(k,2,1) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k-1)
dvxc_ss(k,1,2) = dvxc_ss(k,1,2) + e2 * v2sigma2_c(6*k-2)
dvxc_ss(k,2,1) = dvxc_ss(k,2,1) + e2 * v2sigma2_c(6*k-2)
dvxc_sr(k,2,2) = dvxc_sr(k,2,1) + e2 * v2rhosigma_c(6*k)*2.d0
dvxc_ss(k,2,2) = dvxc_ss(k,2,2) + e2 * v2sigma2_c(6*k)*4.d0
ENDDO
!
DEALLOCATE( v2rho2_c, v2rhosigma_c, v2sigma2_c )
!
ENDIF
!
IF (ANY(is_libxc(3:4))) DEALLOCATE( rho_lxc )
!
!
ENDIF
IF ( ALLOCATED(sigma) ) DEALLOCATE( sigma )
!
@ -303,7 +303,7 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
!
dvxc_rr(:,1,1) = e2*(vrrx(:,1) + vrrc(:,1))
dvxc_sr(:,1,1) = e2*(vsrx(:,1) + vsrc(:,1))
dvxc_ss(:,1,1) = e2*(vssx(:,1) + vssc(:) )
dvxc_ss(:,1,1) = e2*(vssx(:,1) + vssc(:) )
!
CASE( 2 )
!
@ -323,14 +323,14 @@ SUBROUTINE dgcxc( length, sp, r_in, g_in, dvxc_rr, dvxc_sr, dvxc_ss )
dvxc_rr(k,2,2) = e2*(vrrx(k,2) + vrrc(k,2) - vrzc(k,2)*(1.d0 + zeta)/rht)
ENDIF
!
dvxc_sr(k,1,1) = e2 * (vsrx(k,1) + vsrc(k,1))
dvxc_sr(k,1,2) = e2 * vsrc(k,1)
dvxc_sr(k,2,1) = e2 * vsrc(k,2)
dvxc_sr(k,2,2) = e2 * (vsrx(k,2) + vsrc(k,2))
dvxc_sr(k,1,1) = e2 * (vsrx(k,1) + vsrc(k,1))
dvxc_sr(k,1,2) = e2 * vsrc(k,1)
dvxc_sr(k,2,1) = e2 * vsrc(k,2)
dvxc_sr(k,2,2) = e2 * (vsrx(k,2) + vsrc(k,2))
!
dvxc_ss(k,1,1) = e2 * (vssx(k,1) + vssc(k))
dvxc_ss(k,1,2) = e2 * vssc(k)
dvxc_ss(k,2,1) = e2 * vssc(k)
dvxc_ss(k,1,1) = e2 * (vssx(k,1) + vssc(k))
dvxc_ss(k,1,2) = e2 * vssc(k)
dvxc_ss(k,2,1) = e2 * vssc(k)
dvxc_ss(k,2,2) = e2 * (vssx(k,2) + vssc(k))
ENDDO
!

View File

@ -6,13 +6,98 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!---------------------------------------------------------------------------
SUBROUTINE xc_gcx( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud, &
gpu_args_ )
!-------------------------------------------------------------------------
!! Wrapper to gpu or non gpu version of \(\texttt{xc_gcx}\).
!
USE kind_l, ONLY: DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! length of the I/O arrays
INTEGER, INTENT(IN) :: ns
!! spin dimension for input
REAL(DP), INTENT(IN) :: rho(length,ns)
!! Charge density
REAL(DP), INTENT(IN) :: grho(3,length,ns)
!! gradient
REAL(DP), INTENT(OUT) :: ex(length)
!! exchange energy
REAL(DP), INTENT(OUT) :: ec(length)
!! correlation energy
REAL(DP), INTENT(OUT) :: v1x(length,ns)
!! exchange potential (density part)
REAL(DP), INTENT(OUT) :: v2x(length,ns)
!! exchange potential (gradient part)
REAL(DP), INTENT(OUT) :: v1c(length,ns)
!! correlation potential (density part)
REAL(DP), INTENT(OUT) :: v2c(length,ns)
!! correlation potential (gradient part)
REAL(DP), INTENT(OUT), OPTIONAL :: v2c_ud(length)
!! correlation potential, cross term
LOGICAL, INTENT(IN), OPTIONAL :: gpu_args_
!! whether you wish to run on gpu in case use_gpu is true
!
LOGICAL :: gpu_args
REAL(DP), ALLOCATABLE :: v2c_dummy(:)
!
gpu_args = .FALSE.
!
IF ( PRESENT(gpu_args_) ) gpu_args = gpu_args_
!
IF (ns==2 .AND. .NOT. PRESENT(v2c_ud)) CALL xclib_infomsg( 'xc_gcx', 'WARNING: cross &
&term v2c_ud not found xc_gcx (gga) call with polarized case' )
!
IF ( gpu_args ) THEN
!
!$acc data deviceptr( rho(length,ns), grho(3,length,ns), ex(length), ec(length), &
!$acc& v1x(length,ns), v2x(length,ns), v1c(length,ns), v2c(length,ns) )
IF (PRESENT(v2c_ud)) THEN
!$acc data deviceptr( v2c_ud(length) )
CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
!$acc end data
ELSE
ALLOCATE( v2c_dummy(length) )
!$acc data deviceptr( v2c_dummy(length) )
CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy )
!$acc end data
DEALLOCATE( v2c_dummy )
ENDIF
!$acc end data
!
ELSE
!
!$acc data copyin( rho, grho ), copyout( ex, ec, v1x, v2x, v1c, v2c )
IF (PRESENT(v2c_ud)) THEN
!$acc data copyout( v2c_ud )
!$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
!$acc end host_data
!$acc end data
ELSE
ALLOCATE( v2c_dummy(length) )
!$acc data create( v2c_dummy )
!$acc host_data use_device( rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy )
CALL xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_dummy )
!$acc end host_data
!$acc end data
DEALLOCATE( v2c_dummy )
ENDIF
!$acc end data
!
ENDIF
!
RETURN
!
END SUBROUTINE
!
!
!---------------------------------------------------------------------------
SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
!-------------------------------------------------------------------------
!! Wrapper routine. Calls xc_gga-driver internal routines or the external
!! ones from libxc, depending on the input choice.
!
!! NOTE: differently from 'xc_lda_drivers', here the input rho is in (up,down)
!! form (in the LSDA case).
!! GGA wrapper routine - gpu double.
!
#if defined(__LIBXC)
#include "xc_version.h"
@ -28,27 +113,16 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! length of the I/O arrays
INTEGER, INTENT(IN) :: ns
!! spin dimension for input
REAL(DP), INTENT(IN) :: rho(:,:)
!! Charge density
REAL(DP), INTENT(IN) :: grho(:,:,:)
!! gradient
REAL(DP), INTENT(OUT) :: ex(:)
!! exchange energy
REAL(DP), INTENT(OUT) :: ec(:)
!! correlation energy
REAL(DP), INTENT(OUT) :: v1x(:,:)
!! exchange potential (density part)
REAL(DP), INTENT(OUT) :: v2x(:,:)
!! exchange potential (gradient part)
REAL(DP), INTENT(OUT) :: v1c(:,:)
!! correlation potential (density part)
REAL(DP), INTENT(OUT) :: v2c(:,:)
!! correlation potential (gradient part)
REAL(DP), INTENT(OUT), OPTIONAL :: v2c_ud(:)
!! correlation potential, cross term
REAL(DP), INTENT(IN) :: rho(length,ns)
REAL(DP), INTENT(IN) :: grho(3,length,ns)
REAL(DP), INTENT(OUT) :: ex(length)
REAL(DP), INTENT(OUT) :: ec(length)
REAL(DP), INTENT(OUT) :: v1x(length,ns)
REAL(DP), INTENT(OUT) :: v2x(length,ns)
REAL(DP), INTENT(OUT) :: v1c(length,ns)
REAL(DP), INTENT(OUT) :: v2c(length,ns)
REAL(DP), INTENT(OUT) :: v2c_ud(length)
!
! ... local variables
!
@ -59,32 +133,27 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
REAL(DP), ALLOCATABLE :: vc_rho(:), vc_sigma(:)
!
INTEGER :: fkind_x, np
REAL(DP) :: rs, rtot, zet, vc_2(2), arho_k
REAL(DP), PARAMETER :: pi34 = 0.6203504908994_DP
!
LOGICAL :: POLARIZED
INTEGER :: pol_unpol
INTEGER :: ildax, ildac, pol_unpol
#if (XC_MAJOR_VERSION > 4)
INTEGER(8) :: lengthxc
#else
INTEGER :: lengthxc
#endif
#endif
REAL(DP), ALLOCATABLE :: arho(:,:)
REAL(DP), ALLOCATABLE :: rh(:), zeta(:)
REAL(DP), ALLOCATABLE :: grho2(:,:), grho_ud(:)
!
INTEGER :: k, is
REAL(DP) :: sgn(2)
REAL(DP) :: rho_up, rho_dw, grho_up, grho_dw
REAL(DP) :: rho_up, rho_dw, grho_up, grho_dw, sgn1
REAL(DP), PARAMETER :: small = 1.E-10_DP
!
!
IF (ns==2 .AND. .NOT. PRESENT(v2c_ud)) CALL xclib_infomsg( 'xc_gcx', 'WARNING: cross &
&term v2c_ud not found xc_gcx (gga) call with polarized case' )
!
ex = 0.0_DP ; v1x = 0.0_DP ; v2x = 0.0_DP
ec = 0.0_DP ; v1c = 0.0_DP ; v2c = 0.0_DP
IF ( PRESENT(v2c_ud) ) v2c_ud = 0.0_DP
!$acc data deviceptr( rho(length,ns), grho(3,length,ns), ex(length), ec(length), &
!$acc& v1x(length,ns), v2x(length,ns), v1c(length,ns), v2c(length,ns), &
!$acc& v2c_ud(length) )
!
#if defined(__LIBXC)
!
@ -103,24 +172,31 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
np = 3
ENDIF
!
ALLOCATE( rho_lxc(length*ns) )
ALLOCATE( sigma(length*np) )
!
ALLOCATE( ex_lxc(length) , ec_lxc(length) )
ALLOCATE( vx_rho(length*ns) , vx_sigma(length*np) )
ALLOCATE( vc_rho(length*ns) , vc_sigma(length*np) )
ALLOCATE( rho_lxc(length*ns), sigma(length*np) )
!$acc data create( rho_lxc, sigma )
!
IF ( is_libxc(3) ) THEN
ALLOCATE( ex_lxc(length) )
ALLOCATE( vx_rho(length*ns), vx_sigma(length*np) )
ENDIF
IF ( is_libxc(4) ) THEN
ALLOCATE( ec_lxc(length) )
ALLOCATE( vc_rho(length*ns), vc_sigma(length*np) )
ENDIF
!
IF ( ns == 1 ) THEN
!
!$acc parallel loop
DO k = 1, length
rho_lxc(k) = ABS( rho(k,1) )
IF ( rho_lxc(k) > rho_threshold_gga ) &
arho_k = ABS(rho(k,1))
rho_lxc(k) = arho_k
IF ( arho_k > rho_threshold_gga ) &
sigma(k) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
ENDDO
!
ELSE
!
!$acc parallel loop
DO k = 1, length
rho_lxc(2*k-1) = rho(k,1)
rho_lxc(2*k) = rho(k,2)
@ -135,25 +211,26 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
!
IF ( ns==1 .AND. ANY(.NOT.is_libxc(3:4)) ) THEN
!
CALL gcxc( length, ABS(rho(:,1)), sigma, ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) )
!
IF ( (igcx==43.AND..NOT.is_libxc(3)) .OR. (igcc==14.AND..NOT.is_libxc(4)) ) &
CALL gcxc_beef( length, ABS(rho(:,1)), sigma, ex, ec, v1x(:,1), v2x(:,1), &
v1c(:,1), v2c(:,1) )
!$acc host_data use_device( rho_lxc, sigma )
CALL gcxc( length, rho_lxc, sigma, ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) )
!$acc end host_data
!
!$acc parallel loop
DO k = 1, length
sgn(1) = SIGN(1._DP, rho(k,1))
ex(k) = ex(k) * sgn(1)
ec(k) = ec(k) * sgn(1)
sgn1 = SIGN(1._DP, rho(k,1))
ex(k) = ex(k) * sgn1
ec(k) = ec(k) * sgn1
ENDDO
!
ENDIF
!
!$acc update self( rho_lxc, sigma )
!
! ---- GGA CORRELATION
!
IF (is_libxc(4)) fkind_x = xc_f03_func_info_get_kind( xc_info(4) )
!
IF ( is_libxc(4) ) THEN !lda part of LYP not present in libxc (still so? - check)
!
fkind_x = xc_f03_func_info_get_kind( xc_info(4) )
!
CALL xc_f03_func_set_dens_threshold( xc_func(4), small )!rho_threshold_gga )
IF (libxc_flags(4,0)==1) THEN
@ -163,89 +240,112 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
ec_lxc = 0.d0
ENDIF
!
!$acc data copyin( ec_lxc, vc_rho, vc_sigma )
IF (.NOT. POLARIZED) THEN
!$acc parallel loop
DO k = 1, length
IF ( rho_lxc(k) <= rho_threshold_lda ) CYCLE
IF ( rho_lxc(k) <= rho_threshold_lda ) THEN
ec(k)=0.d0 ; v1c(k,1)=0.d0 ; v2c(k,1)=0.d0
CYCLE
ENDIF
ec(k) = ec_lxc(k) * rho_lxc(k) * SIGN(1.0_DP, rho(k,1))
v1c(k,1) = vc_rho(k)
IF ( rho_lxc(k) <= rho_threshold_gga .OR. &
SQRT(ABS(sigma(k))) <= grho_threshold_gga) CYCLE
SQRT(ABS(sigma(k))) <= grho_threshold_gga) THEN
v2c(k,1) = 0.d0
CYCLE
ENDIF
v2c(k,1) = vc_sigma(k)*2.d0
ENDDO
ELSE
!$acc parallel loop
DO k = 1, length
rho_up = rho_lxc(2*k-1)
rho_dw = rho_lxc(2*k)
grho_up = SQRT(ABS(sigma(3*k-2)))
grho_dw = SQRT(ABS(sigma(3*k)))
IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) CYCLE
IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) THEN
ec(k) = 0.d0 ; v1c(k,1) = 0.d0 ; v1c(k,2) = 0.d0
v2c(k,1) = 0.d0 ; v2c_ud(k)= 0.d0 ; v2c(k,2) = 0.d0
CYCLE
ENDIF
ec(k) = ec_lxc(k) * (rho_up+rho_dw)
v1c(k,1) = vc_rho(2*k-1)
v1c(k,2) = vc_rho(2*k)
IF ( rho_up <= rho_threshold_gga .OR. rho_dw <= rho_threshold_gga .OR. &
grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) CYCLE
grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) THEN
v2c(k,1) = 0.d0 ; v2c_ud(k)= 0.d0 ; v2c(k,2) = 0.d0
CYCLE
ENDIF
v2c(k,1) = vc_sigma(3*k-2)*2.d0
v2c_ud(k)= vc_sigma(3*k-1)
v2c(k,2) = vc_sigma(3*k)*2.d0
ENDDO
ENDIF
!
!
!$acc end data
DEALLOCATE( ec_lxc, vc_rho, vc_sigma )
!
ELSEIF ( (.NOT.is_libxc(4)) .AND. fkind_x/=XC_EXCHANGE_CORRELATION ) THEN
!
ALLOCATE( arho(length,ns), grho2(length,ns) )
ALLOCATE( grho2(length,ns) )
!
IF ( ns /= 1 ) THEN
!
DO is = 1, 2
grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2
ENDDO
!
IF (igcc==3 .OR. igcc==7 .OR. igcc==13 ) THEN
!
ALLOCATE( grho_ud(length) )
!
grho_ud = grho(1,:,1) * grho(1,:,2) + grho(2,:,1) * grho(2,:,2) + &
grho(3,:,1) * grho(3,:,2)
!
arho = rho
!
WHERE ( rho(:,1)+rho(:,2) < rho_threshold_gga )
arho(:,1) = 0.0_DP
arho(:,2) = 0.0_DP
ENDWHERE
!
CALL gcc_spin_more( length, arho, grho2, grho_ud, ec, v1c, v2c, v2c_ud )
!
!$acc data create( grho2, grho_ud )
!$acc parallel loop
DO k = 1, length
grho2(k,1) = sigma(3*k-2)
grho2(k,2) = sigma(3*k)
grho_ud(k) = sigma(3*k-1)
ENDDO
!$acc host_data use_device( grho2, grho_ud )
CALL gcc_spin_more( length, rho, grho2, grho_ud, ec, v1c, v2c, v2c_ud )
!$acc end host_data
!$acc end data
DEALLOCATE( grho_ud )
!
ELSE
!
ALLOCATE( rh(length), zeta(length) )
!$acc data create( rh, zeta, grho2 )
!$acc parallel loop
DO k = 1, length
rh(k) = rho(k,1) + rho(k,2)
IF ( rh(k) > rho_threshold_gga ) THEN
zeta(k) = ( rho(k,1) - rho(k,2) ) / rh(k)
ELSE
zeta(k) = 2.0_DP ! trash value, gcc-routines get rid of it when present
ENDIF
grho2(k,1) = ( grho(1,k,1) + grho(1,k,2) )**2 + &
( grho(2,k,1) + grho(2,k,2) )**2 + &
( grho(3,k,1) + grho(3,k,2) )**2
grho2(k,2) = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2
ENDDO
!
rh = rho(:,1) + rho(:,2)
!$acc host_data use_device( rh, zeta, grho2 )
CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) )
!$acc end host_data
!
zeta = 2.0_DP ! trash value, gcc-routines get rid of it when present
WHERE ( rh > rho_threshold_gga ) zeta = ( rho(:,1) - rho(:,2) ) / rh(:)
!
grho2(:,1) = ( grho(1,:,1) + grho(1,:,2) )**2 + &
( grho(2,:,1) + grho(2,:,2) )**2 + &
( grho(3,:,1) + grho(3,:,2) )**2
!
IF ( igcc/=14 ) CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) )
IF ( igcc==14 ) CALL gcc_spin_beef( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) )
!
v2c(:,2) = v2c(:,1)
IF ( PRESENT(v2c_ud) ) v2c_ud(:) = v2c(:,1)
!$acc parallel loop
DO k = 1, length
v2c(k,2) = v2c(k,1)
IF ( ns==2 ) v2c_ud(k) = v2c(k,1)
ENDDO
!
!$acc end data
DEALLOCATE( rh, zeta )
!
ENDIF
!
!
ENDIF
!
DEALLOCATE( arho, grho2 )
DEALLOCATE( grho2 )
!
ENDIF
ENDIF
!
! --- GGA EXCHANGE
!
@ -259,134 +359,177 @@ SUBROUTINE xc_gcx_( length, ns, rho, grho, ex, ec, v1x, v2x, v1c, v2c, v2c_ud )
ex_lxc = 0.d0
ENDIF
!
!$acc data copyin( ex_lxc, vx_rho, vx_sigma )
IF (.NOT. POLARIZED) THEN
!$acc parallel loop
DO k = 1, length
IF ( rho_lxc(k) <= rho_threshold_lda ) CYCLE
IF ( rho_lxc(k) <= rho_threshold_lda ) THEN
ex(k) = 0.d0 ; v1x(k,1) = 0.d0 ; v2x(k,1) = 0.d0
CYCLE
ENDIF
ex(k) = ex_lxc(k) * rho_lxc(k) * SIGN(1.0_DP, rho(k,1))
v1x(k,1) = vx_rho(k)
IF ( rho_lxc(k) <= rho_threshold_gga .OR. &
SQRT(ABS(sigma(k))) <= grho_threshold_gga) CYCLE
SQRT(ABS(sigma(k))) <= grho_threshold_gga) THEN
v2x(k,1) = 0.d0
CYCLE
ENDIF
v2x(k,1) = vx_sigma(k)*2.d0
ENDDO
ELSE
!$acc parallel loop
DO k = 1, length
rho_up = rho_lxc(2*k-1)
rho_dw = rho_lxc(2*k)
grho_up = SQRT(ABS(sigma(3*k-2)))
grho_dw = SQRT(ABS(sigma(3*k)))
IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) CYCLE
IF ( rho_up <= rho_threshold_lda .OR. rho_dw <= rho_threshold_lda ) THEN
ex(k) = 0.d0
v1x(k,1) = 0.d0 ; v1x(k,2) = 0.d0
v2x(k,1) = 0.d0 ; v2x(k,2) = 0.d0
CYCLE
ENDIF
ex(k) = ex_lxc(k) * (rho_up+rho_dw)
v1x(k,1) = vx_rho(2*k-1)
v1x(k,2) = vx_rho(2*k)
IF ( rho_up <= rho_threshold_gga .OR. rho_dw <= rho_threshold_gga .OR. &
grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) CYCLE
grho_up<=grho_threshold_gga .OR. grho_dw<=grho_threshold_gga ) THEN
v2x(k,1) = 0.d0 ; v2x(k,2) = 0.d0
CYCLE
ENDIF
v2x(k,1) = vx_sigma(3*k-2)*2.d0
v2x(k,2) = vx_sigma(3*k)*2.d0
ENDDO
ENDIF
!
!$acc end data
DEALLOCATE( ex_lxc, vx_rho, vx_sigma )
!
ELSE
!
ALLOCATE( grho2(length,ns) )
!
IF ( ns /= 1 ) THEN
!
DO is = 1, 2
grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2
ENDDO
!
IF ( igcx/=43 ) CALL gcx_spin( length, rho, grho2, ex, v1x, v2x )
IF ( igcx==43 ) CALL gcx_spin_beef( length, rho, grho2, ex, v1x, v2x )
!
!
ALLOCATE( grho2(length,ns) )
!$acc data create( grho2 )
!$acc parallel loop collapse(2)
DO is = 1, ns
DO k = 1, length
grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2
ENDDO
ENDDO
!$acc host_data use_device( grho2 )
CALL gcx_spin( length, rho, grho2, ex, v1x, v2x )
!$acc end host_data
!$acc end data
DEALLOCATE( grho2 )
!
ENDIF
!
DEALLOCATE( grho2 )
!
ENDIF
!
!$acc end data
DEALLOCATE( rho_lxc, sigma )
DEALLOCATE( ex_lxc , ec_lxc )
DEALLOCATE( vx_rho , vx_sigma )
DEALLOCATE( vc_rho , vc_sigma )
!
#else
!
ALLOCATE( arho(length,ns), grho2(length,ns) )
arho = 0.0_DP
grho2 = 0.0_DP
ALLOCATE( grho2(length,ns) )
!$acc data create( grho2 )
!
IF ( ns == 1 ) THEN
!
! ... This is the spin-unpolarised case
ALLOCATE( rh(length) )
!$acc data create( rh )
!$acc host_data use_device( rh, grho2 )
!$acc parallel loop
DO k = 1, length
IF ( ABS(rho(k,1)) > rho_threshold_gga ) &
grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
rh(k) = ABS(rho(k,1))
IF ( rh(k) > rho_threshold_gga ) &
grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2
ENDDO
!
CALL gcxc( length, rh, grho2(:,1), ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) )
!
CALL gcxc( length, ABS(rho(:,1)), grho2(:,1), ex, ec, v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) )
!
IF ( igcx==43 .OR. igcc==14 ) CALL gcxc_beef( length, ABS(rho(:,1)), grho2(:,1), ex, ec, &
v1x(:,1), v2x(:,1), v1c(:,1), v2c(:,1) )
!
!$acc parallel loop
DO k = 1, length
sgn(1) = SIGN(1._DP, rho(k,1))
ex(k) = ex(k) * sgn(1)
ec(k) = ec(k) * sgn(1)
sgn1 = SIGN(1._DP, rho(k,1))
ex(k) = ex(k) * sgn1
ec(k) = ec(k) * sgn1
ENDDO
!
!$acc end host_data
!$acc end data
DEALLOCATE( rh )
!
ELSE
!
DO is = 1, 2
grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2
!$acc host_data use_device( grho2 )
!$acc parallel loop collapse(2)
DO is = 1, ns
DO k = 1, length
grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2
ENDDO
ENDDO
!
IF ( igcx/=43 ) CALL gcx_spin( length, rho, grho2, ex, v1x, v2x )
IF ( igcx==43 ) CALL gcx_spin_beef( length, rho, grho2, ex, v1x, v2x )
CALL gcx_spin( length, rho, grho2, ex, v1x, v2x )
!$acc end host_data
!
IF (igcc==3 .OR. igcc==7 .OR. igcc==13 ) THEN
!
ALLOCATE( grho_ud(length) )
!$acc data create( grho_ud )
!$acc host_data use_device( grho_ud, grho2 )
!$acc parallel loop
DO k = 1, length
grho_ud(k) = grho(1,k,1) * grho(1,k,2) + grho(2,k,1) * grho(2,k,2) + &
grho(3,k,1) * grho(3,k,2)
ENDDO
!
grho_ud = grho(1,:,1) * grho(1,:,2) + grho(2,:,1) * grho(2,:,2) + &
grho(3,:,1) * grho(3,:,2)
!
arho = rho
!
CALL gcc_spin_more( length, arho, grho2, grho_ud, ec, v1c, v2c, v2c_ud )
CALL gcc_spin_more( length, rho, grho2, grho_ud, ec, v1c, v2c, v2c_ud )
!
!$acc end host_data
!$acc end data
DEALLOCATE( grho_ud )
!
ELSE
!
ALLOCATE( rh(length), zeta(length) )
!$acc data create( rh, zeta )
!$acc host_data use_device( rh, zeta, grho2 )
!$acc parallel loop
DO k = 1, length
rh(k) = rho(k,1) + rho(k,2)
IF ( rh(k) > rho_threshold_gga ) THEN
zeta(k) = ( rho(k,1) - rho(k,2) ) / rh(k)
ELSE
zeta(k) = 2.0_DP ! trash value, gcc-routines get rid of it when present
ENDIF
grho2(k,1) = ( grho(1,k,1) + grho(1,k,2) )**2 + &
( grho(2,k,1) + grho(2,k,2) )**2 + &
( grho(3,k,1) + grho(3,k,2) )**2
ENDDO
!
rh = rho(:,1) + rho(:,2)
CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) )
!
zeta = 2.0_DP ! trash value, gcc-routines get rid of it when present
WHERE ( rh > rho_threshold_gga ) zeta = ( rho(:,1) - rho(:,2) ) / rh(:)
!
grho2(:,1) = ( grho(1,:,1) + grho(1,:,2) )**2 + &
( grho(2,:,1) + grho(2,:,2) )**2 + &
( grho(3,:,1) + grho(3,:,2) )**2
!
IF ( igcc/=14 ) CALL gcc_spin( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) )
IF ( igcc==14 ) CALL gcc_spin_beef( length, rh, zeta, grho2(:,1), ec, v1c, v2c(:,1) )
!
v2c(:,2) = v2c(:,1)
IF ( PRESENT(v2c_ud) ) v2c_ud(:) = v2c(:,1)
!$acc parallel loop
DO k = 1, length
v2c(k,2) = v2c(k,1)
IF ( ns==2 ) v2c_ud(k) = v2c(k,1)
ENDDO
!
!$acc end host_data
!$acc end data
DEALLOCATE( rh, zeta )
!
ENDIF
!
!
ENDIF
!
DEALLOCATE( arho, grho2 )
!$acc end data
DEALLOCATE( grho2 )
!
#endif
!
!$acc end data
!
RETURN
!

View File

@ -5,16 +5,71 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!------------------------------------------------------------------------------------------
SUBROUTINE xc( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out, gpu_args_ )
!--------------------------------------------------------------------------------------
!! Wrapper routine to \(\texttt{xc_}\) or \(\texttt{xc_gpu}\).
!
USE kind_l, ONLY: DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! length of the I/O arrays
INTEGER, INTENT(IN) :: srd
!! spin dimension of rho
INTEGER, INTENT(IN) :: svd
!! spin dimension of v
REAL(DP), INTENT(IN) :: rho_in(length,srd)
!! Charge density
REAL(DP), INTENT(OUT) :: ex_out(length)
!! \(\epsilon_x(rho)\) ( NOT \(E_x(\text{rho})\) )
REAL(DP), INTENT(OUT) :: vx_out(length,svd)
!! \(dE_x(\text{rho})/d\text{rho} ( NOT d\epsilon_x(\text{rho})/d\text{rho} )
REAL(DP), INTENT(OUT) :: ec_out(length)
!! \(\epsilon_c(rho)\) ( NOT \(E_c(\text{rho})\) )
REAL(DP), INTENT(OUT) :: vc_out(length,svd)
!! \(dE_c(\text{rho})/d\text{rho} ( NOT d\epsilon_c(\text{rho})/d\text{rho} )
LOGICAL, OPTIONAL, INTENT(IN) :: gpu_args_
!! whether you wish to run on gpu in case use_gpu is true
!
LOGICAL :: gpu_args
!
gpu_args = .FALSE.
IF ( PRESENT(gpu_args_) ) gpu_args = gpu_args_
!
IF ( gpu_args ) THEN
!
!$acc data deviceptr( rho_in(length,srd), ex_out(length), ec_out(length), &
!$acc& vx_out(length,svd), vc_out(length,svd) )
CALL xc_( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out )
!$acc end data
!
ELSE
!
!$acc data copyin( rho_in ), copyout( ex_out, ec_out, vx_out, vc_out )
!$acc host_data use_device( rho_in, ex_out, ec_out, vx_out, vc_out )
CALL xc_( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out )
!$acc end host_data
!$acc end data
!
ENDIF
!
RETURN
!
END SUBROUTINE
!
!
!---------------------------------------------------------------------------
SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out )
SUBROUTINE xc_( length, srd, svd, rho_in, ex_out, ec_out, vx_out, vc_out )
!-------------------------------------------------------------------------
!! Wrapper routine. Calls internal XC-driver routines or external ones
!! from Libxc, depending on the input choice.
!! Wrapper xc LDA - openACC version.
!! See comments in routine \(\texttt{xc}\) for variable explanations.
!
#if defined(__LIBXC)
#include "xc_version.h"
USE xc_f03_lib_m
USE dft_setting_params, ONLY: xc_func, xc_info, libxc_flags
USE dft_setting_params, ONLY: xc_func, xc_info, libxc_flags
#endif
!
USE kind_l, ONLY: DP
@ -25,28 +80,19 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out )
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! length of the I/O arrays
INTEGER, INTENT(IN) :: sr_d
!! spin dimension of rho
INTEGER, INTENT(IN) :: sv_d
!! spin dimension of v
REAL(DP), INTENT(IN) :: rho_in(length,sr_d)
!! Charge density
REAL(DP), INTENT(OUT) :: ex_out(length)
!! \(\epsilon_x(rho)\) ( NOT \(E_x(\text{rho})\) )
REAL(DP), INTENT(OUT) :: vx_out(length,sv_d)
!! \(dE_x(\text{rho})/d\text{rho} ( NOT d\epsilon_x(\text{rho})/d\text{rho} )
REAL(DP), INTENT(OUT) :: ec_out(length)
!! \(\epsilon_c(rho)\) ( NOT \(E_c(\text{rho})\) )
REAL(DP), INTENT(OUT) :: vc_out(length,sv_d)
!! \(dE_c(\text{rho})/d\text{rho} ( NOT d\epsilon_c(\text{rho})/d\text{rho} )
INTEGER, INTENT(IN) :: srd, svd
REAL(DP), INTENT(IN) :: rho_in(length,srd)
REAL(DP), INTENT(OUT) :: ex_out(length), vx_out(length,svd)
REAL(DP), INTENT(OUT) :: ec_out(length), vc_out(length,svd)
!
! ... local variables
!
#if defined(__LIBXC)
INTEGER :: fkind_x, eflag
LOGICAL :: is_libxc1, is_libxc2
INTEGER :: fkind_x
REAL(DP) :: amag
REAL(DP), ALLOCATABLE :: rho_lxc(:)
REAL(DP), ALLOCATABLE :: ex_lxc(:), ec_lxc(:)
REAL(DP), ALLOCATABLE :: vx_lxc(:), vc_lxc(:)
#if (XC_MAJOR_VERSION > 4)
INTEGER(8) :: lengthxc
@ -54,104 +100,129 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out )
INTEGER :: lengthxc
#endif
#endif
!
REAL(DP), ALLOCATABLE :: arho(:), zeta(:)
!
REAL(DP), ALLOCATABLE :: zeta(:)
REAL(DP) :: arho_ir
INTEGER :: ir
!
ex_out = 0.0_DP ; vx_out = 0.0_DP
ec_out = 0.0_DP ; vc_out = 0.0_DP
!$acc data deviceptr( rho_in(length,srd), ex_out(length), ec_out(length), &
!$acc& vx_out(length,svd), vc_out(length,svd) )
!
#if defined(__LIBXC)
!
is_libxc1 = is_libxc(1)
is_libxc2 = is_libxc(2)
fkind_x = -1
lengthxc = length
!
IF ( ANY(is_libxc(1:2)) ) THEN
IF ( is_libxc(1) ) ALLOCATE( ex_lxc(length), vx_lxc(length*svd) )
IF ( is_libxc(2) ) ALLOCATE( ec_lxc(length), vc_lxc(length*svd) )
!
IF ( is_libxc(1) .OR. is_libxc(2) ) THEN
!
ALLOCATE( rho_lxc(length*sv_d) )
ALLOCATE( vx_lxc(length*sv_d), vc_lxc(length*sv_d) )
ALLOCATE( rho_lxc(length*svd) )
!$acc data copyout( rho_lxc )
!
! ... set libxc input
SELECT CASE( sr_d )
SELECT CASE( srd )
CASE( 1 )
!
rho_lxc(:) = ABS(rho_in(:,1))
!
!
!$acc parallel loop
DO ir = 1, length
rho_lxc(ir) = ABS(rho_in(ir,1))
ENDDO
!
CASE( 2 )
!
DO ir = 1, length
rho_lxc(2*ir-1) = (rho_in(ir,1) + rho_in(ir,2)) * 0.5_DP
rho_lxc(2*ir) = (rho_in(ir,1) - rho_in(ir,2)) * 0.5_DP
ENDDO
!
!
!$acc parallel loop
DO ir = 1, length
rho_lxc(2*ir-1) = (rho_in(ir,1) + rho_in(ir,2)) * 0.5_DP
rho_lxc(2*ir) = (rho_in(ir,1) - rho_in(ir,2)) * 0.5_DP
ENDDO
!
CASE( 4 )
!
DO ir = 1, length
amag = SQRT( SUM(rho_in(ir,2:4)**2) )
rho_lxc(2*ir-1) = (rho_in(ir,1) + amag) * 0.5_DP
rho_lxc(2*ir) = (rho_in(ir,1) - amag) * 0.5_DP
ENDDO
!
!
!$acc parallel loop
DO ir = 1, length
amag = SQRT( SUM(rho_in(ir,2:4)**2) )
rho_lxc(2*ir-1) = (rho_in(ir,1) + amag) * 0.5_DP
rho_lxc(2*ir) = (rho_in(ir,1) - amag) * 0.5_DP
ENDDO
!
CASE DEFAULT
!
CALL xclib_error( 'xc_LDA', 'Wrong number of spin dimensions', 1 )
!
!
CALL xclib_error( 'xc_LDA', 'Wrong number of spin dimensions', 1 )
!
END SELECT
!
!$acc end data
!
ENDIF
!
!
! ... EXCHANGE
IF ( is_libxc(1) ) THEN
CALL xc_f03_func_set_dens_threshold( xc_func(1), rho_threshold_lda )
IF (libxc_flags(1,0)==1) THEN
CALL xc_f03_lda_exc_vxc( xc_func(1), lengthxc, rho_lxc(1), ex_out(1), vx_lxc(1) )
CALL xc_f03_lda_exc_vxc( xc_func(1), lengthxc, rho_lxc(1), ex_lxc(1), vx_lxc(1) )
ELSE
CALL xc_f03_lda_vxc( xc_func(1), lengthxc, rho_lxc(1), vx_lxc(1) )
ex_lxc = 0.d0
ENDIF
ENDIF
!
! ... CORRELATION
IF ( is_libxc(2) ) THEN
CALL xc_f03_func_set_dens_threshold( xc_func(2), rho_threshold_lda )
fkind_x = xc_f03_func_info_get_kind( xc_info(2) )
IF (libxc_flags(2,0)==1) THEN
CALL xc_f03_lda_exc_vxc( xc_func(2), lengthxc, rho_lxc(1), ec_out(1), vc_lxc(1) )
ELSE
CALL xc_f03_lda_vxc( xc_func(2), lengthxc, rho_lxc(1), vc_lxc(1) )
ENDIF
CALL xc_f03_func_set_dens_threshold( xc_func(2), rho_threshold_lda )
fkind_x = xc_f03_func_info_get_kind( xc_info(2) )
IF (libxc_flags(2,0)==1) THEN
CALL xc_f03_lda_exc_vxc( xc_func(2), lengthxc, rho_lxc(1), ec_lxc(1), vc_lxc(1) )
ELSE
CALL xc_f03_lda_vxc( xc_func(2), lengthxc, rho_lxc(1), vc_lxc(1) )
ec_lxc = 0.d0
ENDIF
ENDIF
!
IF ( ((.NOT.is_libxc(1)) .OR. (.NOT.is_libxc(2))) &
.AND. fkind_x/=XC_EXCHANGE_CORRELATION ) THEN
IF ( is_libxc(1) .OR. is_libxc(2) ) DEALLOCATE( rho_lxc )
!
IF ( ((.NOT.is_libxc(1)) .OR. (.NOT.is_libxc(2))) ) THEN
!
SELECT CASE( sr_d )
SELECT CASE( srd )
CASE( 1 )
!
IF ((iexch==8 .AND. .NOT.is_libxc(1)) .OR. (icorr==10 .AND. &
.NOT. is_libxc(2))) THEN
IF ((iexch==8 .AND. .NOT.is_libxc1) .OR. (icorr==10 .AND. .NOT.is_libxc2)) THEN
IF (.NOT. finite_size_cell_volume_set) CALL xclib_error( 'XC',&
'finite size corrected exchange used w/o initialization', 1 )
ENDIF
CALL xc_lda( length, ABS(rho_in(:,1)), ex_out, ec_out, vx_out(:,1), vc_out(:,1) )
CALL xc_lda( length, rho_in(:,1), ex_out, ec_out, vx_out(:,1), vc_out(:,1) )
!
CASE( 2 )
!
ALLOCATE( arho(length), zeta(length) )
arho = ABS(rho_in(:,1))
WHERE (arho > rho_threshold_lda) zeta(:) = rho_in(:,2) / arho(:)
CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out )
DEALLOCATE( arho, zeta )
ALLOCATE( zeta(length) )
!$acc data create( zeta )
!$acc parallel loop
DO ir = 1, length
arho_ir = ABS(rho_in(ir,1))
IF (arho_ir > rho_threshold_lda) zeta(ir) = rho_in(ir,2) / arho_ir
ENDDO
!$acc host_data use_device( zeta )
CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out )
!$acc end host_data
!$acc end data
DEALLOCATE( zeta )
!
CASE( 4 )
!
ALLOCATE( arho(length), zeta(length) )
arho = ABS( rho_in(:,1) )
WHERE (arho > rho_threshold_lda) zeta(:) = SQRT( rho_in(:,2)**2 + rho_in(:,3)**2 + &
rho_in(:,4)**2 ) / arho(:) ! amag/arho
CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out )
DEALLOCATE( arho, zeta )
ALLOCATE( zeta(length) )
!$acc data create( zeta )
!$acc parallel loop
DO ir = 1, length
arho_ir = ABS( rho_in(ir,1) )
IF (arho_ir > rho_threshold_lda) zeta(ir) = SQRT( rho_in(ir,2)**2 + rho_in(ir,3)**2 + &
rho_in(ir,4)**2 ) / arho_ir ! amag/arho
ENDDO
!$acc host_data use_device( zeta )
CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out )
!$acc end host_data
!$acc end data
DEALLOCATE( zeta )
!
CASE DEFAULT
!
@ -162,67 +233,83 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out )
ENDIF
!
! ... fill output arrays
!
IF (sv_d == 1) THEN
IF (is_libxc(1)) vx_out(:,1) = vx_lxc(:)
IF (is_libxc(2)) vc_out(:,1) = vc_lxc(:)
ELSE
IF (is_libxc(1)) THEN
DO ir = 1, length
vx_out(ir,1) = vx_lxc(2*ir-1)
vx_out(ir,2) = vx_lxc(2*ir)
ENDDO
ENDIF
IF (is_libxc(2)) THEN
DO ir = 1, length
vc_out(ir,1) = vc_lxc(2*ir-1)
vc_out(ir,2) = vc_lxc(2*ir)
ENDDO
ENDIF
!
IF ( is_libxc(1) ) THEN
!$acc data copyin( ex_lxc, vx_lxc )
!$acc parallel loop
DO ir = 1, length
ex_out(ir) = ex_lxc(ir)
IF (svd == 1) THEN
vx_out(ir,1) = vx_lxc(ir)
ELSE
vx_out(ir,1) = vx_lxc(2*ir-1)
vx_out(ir,2) = vx_lxc(2*ir)
ENDIF
ENDDO
!$acc end data
DEALLOCATE( ex_lxc, vx_lxc )
ENDIF
!
IF (ANY(is_libxc(1:2))) THEN
DEALLOCATE( rho_lxc )
DEALLOCATE( vx_lxc, vc_lxc )
IF ( is_libxc(2) ) THEN
!$acc data copyin( ec_lxc, vc_lxc )
!$acc parallel loop
DO ir = 1, length
ec_out(ir) = ec_lxc(ir)
IF (svd == 1) THEN
vc_out(ir,1) = vc_lxc(ir)
ELSE
vc_out(ir,1) = vc_lxc(2*ir-1)
vc_out(ir,2) = vc_lxc(2*ir)
ENDIF
ENDDO
!$acc end data
DEALLOCATE( ec_lxc, vc_lxc )
ENDIF
!
#else
!
SELECT CASE( sr_d )
SELECT CASE( srd )
CASE( 1 )
!
IF ((iexch==8 .AND. .NOT.is_libxc(1)) .OR. (icorr==10 .AND. &
.NOT. is_libxc(2))) THEN
IF (iexch==8 .OR. icorr==10) THEN
IF (.NOT. finite_size_cell_volume_set) CALL xclib_error( 'XC',&
'finite size corrected exchange used w/o initialization', 1 )
ENDIF
!
CALL xc_lda( length, ABS(rho_in(:,1)), ex_out, ec_out, vx_out(:,1), vc_out(:,1) )
CALL xc_lda( length, rho_in(:,1), ex_out, ec_out, vx_out(:,1), vc_out(:,1) )
!
CASE( 2 )
!
ALLOCATE( arho(length), zeta(length) )
ALLOCATE( zeta(length) )
!$acc data create( zeta )
!$acc parallel loop
DO ir = 1, length
arho_ir = ABS(rho_in(ir,1))
IF (arho_ir > rho_threshold_lda) zeta(ir) = rho_in(ir,2) / arho_ir
ENDDO
!$acc host_data use_device( zeta )
CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out )
!$acc end host_data
!$acc end data
DEALLOCATE( zeta )
!
arho = ABS(rho_in(:,1))
WHERE (arho > rho_threshold_lda) zeta(:) = rho_in(:,2) / arho(:)
CASE( 4 )
!
CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out )
ALLOCATE( zeta(length) )
!$acc data create( zeta )
!$acc parallel loop
DO ir = 1, length
arho_ir = ABS(rho_in(ir,1))
IF (arho_ir > rho_threshold_lda) zeta(ir) = SQRT( rho_in(ir,2)**2 + rho_in(ir,3)**2 + &
rho_in(ir,4)**2 ) / arho_ir ! amag/arho
ENDDO
!$acc host_data use_device( zeta )
CALL xc_lsda( length, rho_in(:,1), zeta, ex_out, ec_out, vx_out, vc_out )
!$acc end host_data
!$acc end data
DEALLOCATE( zeta )
!
DEALLOCATE( arho, zeta )
!
CASE( 4 )
!
ALLOCATE( arho(length), zeta(length) )
!
arho = ABS( rho_in(:,1) )
WHERE (arho > rho_threshold_lda) zeta(:) = SQRT( rho_in(:,2)**2 + rho_in(:,3)**2 + &
rho_in(:,4)**2 ) / arho(:) ! amag/arho
!
CALL xc_lsda( length, arho, zeta, ex_out, ec_out, vx_out, vc_out )
!
DEALLOCATE( arho, zeta )
!
CASE DEFAULT
CASE DEFAULT
!
CALL xclib_error( 'xc_LDA', 'Wrong ns input', 2 )
!
@ -230,7 +317,8 @@ SUBROUTINE xc( length, sr_d, sv_d, rho_in, ex_out, ec_out, vx_out, vc_out )
!
#endif
!
!$acc end data
!
RETURN
!
END SUBROUTINE xc
END SUBROUTINE xc_

View File

@ -5,8 +5,80 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-------------------------------------------------------------------------------------
SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, &
v2c, v3c, gpu_args_ )
!----------------------------------------------------------------------------------
!! Wrapper to gpu or non gpu version of \(\texttt{xc_metagcx}\).
!
USE kind_l, ONLY: DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
!! length of the I/O arrays
INTEGER, INTENT(IN) :: ns
!! spin components
INTEGER, INTENT(IN) :: np
!! first dimension of v2c
REAL(DP), INTENT(IN) :: rho(length,ns)
!! the charge density
REAL(DP), INTENT(IN) :: grho(3,length,ns)
!! grho = \nabla rho
REAL(DP), INTENT(IN) :: tau(length,ns)
!! kinetic energy density
REAL(DP), INTENT(OUT) :: ex(length)
!! sx = E_x(rho,grho)
REAL(DP), INTENT(OUT) :: ec(length)
!! sc = E_c(rho,grho)
REAL(DP), INTENT(OUT) :: v1x(length,ns)
!! v1x = D(E_x)/D(rho)
REAL(DP), INTENT(OUT) :: v2x(length,ns)
!! v2x = D(E_x)/D( D rho/D r_alpha ) / |\nabla rho|
REAL(DP), INTENT(OUT) :: v3x(length,ns)
!! v3x = D(E_x)/D(tau)
REAL(DP), INTENT(OUT) :: v1c(length,ns)
!! v1c = D(E_c)/D(rho)
REAL(DP), INTENT(OUT) :: v2c(np,length,ns)
!! v2c = D(E_c)/D( D rho/D r_alpha ) / |\nabla rho|
REAL(DP), INTENT(OUT) :: v3c(length,ns)
!! v3c = D(E_c)/D(tau)
LOGICAL, INTENT(IN), OPTIONAL :: gpu_args_
!! whether you wish to run on gpu in case use_gpu is true
!
LOGICAL :: gpu_args
!
gpu_args = .FALSE.
IF ( PRESENT(gpu_args_) ) gpu_args = gpu_args_
!
IF ( gpu_args ) THEN
!
!$acc data deviceptr( rho(length,ns), grho(3,length,ns), tau(length,ns), &
!$acc& ex(length), ec(length), v1x(length,ns), v2x(length,ns), v3x(length,ns), &
!$acc& v1c(length,ns), v2c(np,length,ns), v3c(length,ns) )
CALL xc_metagcx_( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, &
v2c, v3c )
!$acc end data
!
ELSE
!
!$acc data copyin( rho, grho, tau ), copyout( ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
!$acc host_data use_device( rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
CALL xc_metagcx_( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, &
v2c, v3c )
!$acc end host_data
!$acc end data
!
ENDIF
!
RETURN
!
END SUBROUTINE
!
!
!----------------------------------------------------------------------------------------
SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
SUBROUTINE xc_metagcx_( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1c, v2c, v3c )
!-------------------------------------------------------------------------------------
!! Wrapper routine. Calls internal metaGGA drivers or the Libxc ones,
!! depending on the input choice.
@ -18,8 +90,8 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
#endif
!
USE kind_l, ONLY: DP
USE dft_setting_params, ONLY: imetac, is_libxc, rho_threshold_mgga, &
grho2_threshold_mgga, tau_threshold_mgga, &
USE dft_setting_params, ONLY: imeta, imetac, is_libxc, rho_threshold_mgga,&
grho2_threshold_mgga, tau_threshold_mgga, &
scan_exx, exx_started, exx_fraction
USE qe_drivers_mgga
!
@ -56,7 +128,7 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
!
! ... local variables
!
INTEGER :: is
INTEGER :: k, is, ipol
REAL(DP), ALLOCATABLE :: grho2(:,:)
REAL(DP), PARAMETER :: small = 1.E-10_DP
!
@ -67,48 +139,70 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
REAL(DP), ALLOCATABLE :: vc_rho(:), vc_sigma(:), vc_tau(:)
REAL(DP), ALLOCATABLE :: lapl_rho(:), vlapl_rho(:) ! not used in QE
!
INTEGER :: k, ipol, pol_unpol
LOGICAL :: POLARIZED
REAL(DP) :: rh, ggrho2, atau
#if (XC_MAJOR_VERSION > 4)
INTEGER(8) :: lengthxc
#else
INTEGER :: lengthxc
#endif
#endif
!
!$acc data deviceptr( rho(length,ns), grho(3,length,ns), tau(length,ns), ex(length), &
!$acc& ec(length), v1x(length,ns), v2x(length,ns), v3x(length,ns), &
!$acc& v1c(length,ns), v2c(np,length,ns), v3c(length,ns) )
!
#if defined(__LIBXC)
lengthxc = length
!
ex = 0.0_DP ; v1x = 0.0_DP ; v2x = 0.0_DP ; v3x = 0.0_DP
ec = 0.0_DP ; v1c = 0.0_DP ; v2c = 0.0_DP ; v3c = 0.0_DP
ALLOCATE( rho_lxc(length*ns), sigma(length*np) )
ALLOCATE( tau_lxc(length*ns), lapl_rho(length*ns) )
!$acc data create( rho_lxc, sigma, tau_lxc, lapl_rho )
!
POLARIZED = .FALSE.
IF (ns == 2) THEN
POLARIZED = .TRUE.
IF ( is_libxc(5) ) THEN
ALLOCATE( ex_lxc(length) )
ALLOCATE( vx_rho(length*ns), vx_sigma(length*np), vx_tau(length*ns) )
IF ( imetac==0 ) THEN
!$acc parallel loop
DO k = 1, length
ec(k) = 0.d0
DO is = 1, ns
v1c(k,is) = 0.d0 ; v3c(k,is) = 0.d0
DO ipol = 1, np
v2c(ipol,k,is) = 0.d0
ENDDO
ENDDO
ENDDO
ENDIF
ENDIF
!
pol_unpol = ns
!
ALLOCATE( rho_lxc(length*ns), sigma(length*np), tau_lxc(length*ns) )
ALLOCATE( lapl_rho(length*ns) )
!
ALLOCATE( ex_lxc(length) , ec_lxc(length) )
ALLOCATE( vx_rho(length*ns) , vx_sigma(length*np), vx_tau(length*ns) )
ALLOCATE( vc_rho(length*ns) , vc_sigma(length*np), vc_tau(length*ns) )
ALLOCATE( vlapl_rho(length*ns) )
!
IF ( is_libxc(6) ) THEN
ALLOCATE( ec_lxc(length) )
ALLOCATE( vc_rho(length*ns), vc_sigma(length*np), vc_tau(length*ns) )
IF ( imeta==0 ) THEN
!$acc parallel loop
DO k = 1, length
ex(k) = 0.d0
DO is = 1, ns
v1x(k,is) = 0.d0 ; v2x(k,is) = 0.d0 ; v3x(k,is) = 0.d0
ENDDO
ENDDO
ENDIF
ENDIF
IF ( ANY(is_libxc(5:6)) ) ALLOCATE( vlapl_rho(length*ns) )
!
IF ( ns == 1 ) THEN
!
!$acc parallel loop
DO k = 1, length
rho_lxc(k) = ABS( rho(k,1) )
sigma(k) = MAX( grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2, &
grho2_threshold_mgga )
tau_lxc(k) = MAX( tau(k,1), tau_threshold_mgga )
vlapl_rho(k) = 0._DP
lapl_rho(k) = 0.d0
ENDDO
!
ELSE
!
!$acc parallel loop
DO k = 1, length
rho_lxc(2*k-1) = ABS( rho(k,1) )
rho_lxc(2*k) = ABS( rho(k,2) )
@ -120,28 +214,28 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
grho2_threshold_mgga )
tau_lxc(2*k-1) = MAX( tau(k,1), small )
tau_lxc(2*k) = MAX( tau(k,2), small )
vlapl_rho(2*k-1) = 0._DP
vlapl_rho(2*k) = 0._DP
lapl_rho(2*k-1) = 0.d0
lapl_rho(2*k) = 0.d0
ENDDO
!
ENDIF
!
!$acc update self( rho_lxc, sigma, tau_lxc, lapl_rho )
!
IF ( .NOT.is_libxc(5) .AND. imetac==0 ) THEN
!
ALLOCATE( grho2(length,ns) )
!
DO is = 1, ns
grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2
ENDDO
!
IF (ns == 1) CALL tau_xc( length, rho(:,1), grho2(:,1), tau(:,1), ex, ec, &
v1x(:,1), v2x(:,1), v3x(:,1), v1c(:,1), v2c(1,:,1),&
v3c(:,1) )
IF (ns == 2) CALL tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x,&
v1c, v2c, v3c )
!
DEALLOCATE( grho2 )
!
IF (ns == 1) THEN
!
!$acc host_data use_device( sigma )
CALL tau_xc( length, rho(:,1), sigma, tau(:,1), ex, ec, v1x(:,1), &
v2x(:,1), v3x(:,1), v1c(:,1), v2c, v3c(:,1) )
!$acc end host_data
!
ELSEIF (ns == 2) THEN
!
CALL tau_xc_spin( length, rho, grho, tau, ex, ec, v1x, v2x, v3x, &
v1c, v2c, v3c )
!
ENDIF
ENDIF
!
! ... META EXCHANGE
@ -157,19 +251,31 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
ex_lxc = 0.d0
ENDIF
!
IF (.NOT. POLARIZED) THEN
!$acc data copyin( ex_lxc, vx_rho, vx_sigma, vx_tau )
IF ( ns==1 ) THEN
!$acc parallel loop
DO k = 1, length
IF ( ABS(rho_lxc(k))<=rho_threshold_mgga .OR. &
sigma(k)<=grho2_threshold_mgga .OR. &
ABS(tau_lxc(k))<=rho_threshold_mgga ) CYCLE
ABS(tau_lxc(k))<=rho_threshold_mgga ) THEN
ex(k) = 0.d0 ; v1x(k,1) = 0.d0
v2x(k,1) = 0.d0 ; v3x(k,1) = 0.d0
CYCLE
ENDIF
ex(k) = ex_lxc(k) * rho_lxc(k)
v1x(k,1) = vx_rho(k)
v2x(k,1) = vx_sigma(k) * 2.0_DP
v3x(k,1) = vx_tau(k)
ENDDO
ELSE
!$acc parallel loop
DO k = 1, length
IF (rho_lxc(2*k-1)+rho_lxc(2*k) <= rho_threshold_mgga) CYCLE
IF (rho_lxc(2*k-1)+rho_lxc(2*k) <= rho_threshold_mgga) THEN
ex(k) = 0.d0
v1x(k,1) = 0.d0 ; v2x(k,1) = 0.d0 ; v3x(k,1) = 0.d0
v1x(k,2) = 0.d0 ; v2x(k,2) = 0.d0 ; v3x(k,2) = 0.d0
CYCLE
ENDIF
ex(k) = ex_lxc(k) * (rho_lxc(2*k-1)+rho_lxc(2*k))
IF ( ABS(rho_lxc(2*k-1))>rho_threshold_mgga .AND. &
sigma(3*k-2)>grho2_threshold_mgga .AND. &
@ -177,6 +283,8 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
v1x(k,1) = vx_rho(2*k-1)
v2x(k,1) = vx_sigma(3*k-2)*2.d0
v3x(k,1) = vx_tau(2*k-1)
ELSE
v1x(k,1) = 0.d0 ; v2x(k,1) = 0.d0 ; v3x(k,1) = 0.d0
ENDIF
IF ( ABS(rho_lxc(2*k))>rho_threshold_mgga .AND. &
sigma(3*k)>grho2_threshold_mgga .AND. &
@ -184,20 +292,14 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
v1x(k,2) = vx_rho(2*k)
v2x(k,2) = vx_sigma(3*k)*2.d0
v3x(k,2) = vx_tau(2*k)
ELSE
v1x(k,2) = 0.d0 ; v2x(k,2) = 0.d0 ; v3x(k,2) = 0.d0
ENDIF
ENDDO
ENDIF
!
! **Now this is done directly inside Libxc for SCAN0**
! ... only for HK/MCA: SCAN0 (used in CPV)
!IF ( scan_exx ) THEN
! IF (exx_started) THEN
! ex = (1.0_DP - exx_fraction) * ex
! v1x = (1.0_DP - exx_fraction) * v1x
! v2x = (1.0_DP - exx_fraction) * v2x
! v3x = (1.0_DP - exx_fraction) * v3x
! ENDIF
!ENDIF
!$acc end data
DEALLOCATE( ex_lxc, vx_rho, vx_sigma, vx_tau )
!
ENDIF
!
@ -215,24 +317,40 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
ec_lxc = 0.d0
ENDIF
!
IF (.NOT. POLARIZED) THEN
!$acc data copyin( ec_lxc, vc_rho, vc_sigma, vc_tau )
IF ( ns==1 ) THEN
!$acc parallel loop
DO k = 1, length
IF ( ABS(rho_lxc(k))<=rho_threshold_mgga .OR. &
sigma(k)<=grho2_threshold_mgga .OR. &
ABS(tau_lxc(k))<=rho_threshold_mgga ) CYCLE
ABS(tau_lxc(k))<=rho_threshold_mgga ) THEN
ec(k) = 0.d0 ; v1c(k,1) = 0.d0
v2c(1,k,1) = 0.d0 ; v3c(k,1) = 0.d0
CYCLE
ENDIF
ec(k) = ec_lxc(k) * rho_lxc(k)
v1c(k,1) = vc_rho(k)
v2c(1,k,1) = vc_sigma(k) * 2.0_DP
v3c(k,1) = vc_tau(k)
ENDDO
ELSE
!$acc parallel loop
DO k = 1, length
rh = rho_lxc(2*k-1) + rho_lxc(2*k)
atau = ABS(tau_lxc(2*k-1) + tau_lxc(2*k))
ggrho2 = (sigma(3*k-2) + sigma(3*k))*4.0_DP
IF ( rh <= rho_threshold_mgga .OR. &
ggrho2 <= grho2_threshold_mgga .OR. &
atau <= tau_threshold_mgga ) CYCLE
atau <= tau_threshold_mgga ) THEN
ec(k) = 0.d0
v1c(k,1) = 0.d0 ; v3c(k,1) = 0.d0
v1c(k,2) = 0.d0 ; v3c(k,2) = 0.d0
DO ipol = 1, 3
v2c(ipol,k,1) = 0.d0
v2c(ipol,k,2) = 0.d0
ENDDO
CYCLE
ENDIF
ec(k) = ec_lxc(k) * (rho_lxc(2*k-1)+rho_lxc(2*k))
v1c(k,1) = vc_rho(2*k-1)
v1c(k,2) = vc_rho(2*k)
@ -244,26 +362,33 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
v3c(k,2) = vc_tau(2*k)
ENDDO
ENDIF
!$acc end data
!
DEALLOCATE( ec_lxc, vc_rho, vc_sigma, vc_tau )
ENDIF
!
IF ( ANY(is_libxc(5:6)) ) DEALLOCATE( vlapl_rho )
!
!$acc end data
DEALLOCATE( rho_lxc, sigma, tau_lxc, lapl_rho )
DEALLOCATE( ex_lxc , ec_lxc )
DEALLOCATE( vx_rho , vx_sigma, vx_tau )
DEALLOCATE( vc_rho , vc_sigma, vc_tau, vlapl_rho )
!
#else
!
ALLOCATE( grho2(length,ns) )
!$acc data create( grho2 )
!$acc host_data use_device( grho2 )
!
!$acc parallel loop collapse(2)
DO is = 1, ns
grho2(:,is) = grho(1,:,is)**2 + grho(2,:,is)**2 + grho(3,:,is)**2
DO k = 1, length
grho2(k,is) = grho(1,k,is)**2 + grho(2,k,is)**2 + grho(3,k,is)**2
ENDDO
ENDDO
!
IF (ns == 1) THEN
!
CALL tau_xc( length, rho(:,1), grho2(:,1), tau(:,1), ex, ec, v1x(:,1), &
v2x(:,1), v3x(:,1), v1c(:,1), v2c(1,:,1), v3c(:,1) )
v2x(:,1), v3x(:,1), v1c(:,1), v2c, v3c(:,1) )
!
ELSEIF (ns == 2) THEN
!
@ -272,10 +397,14 @@ SUBROUTINE xc_metagcx( length, ns, np, rho, grho, tau, ex, ec, v1x, v2x, v3x, v1
!
ENDIF
!
!$acc end host_data
!$acc end data
DEALLOCATE( grho2 )
!
#endif
!
!$acc end data
!
RETURN
!
END SUBROUTINE xc_metagcx
END SUBROUTINE xc_metagcx_

View File

@ -31,18 +31,19 @@ PROGRAM xclib_test
!
!! See README.TEST file for more details.
!
USE kind_l, ONLY: DP
USE constants_l, ONLY: pi
USE xc_lib, ONLY: xclib_set_dft_from_name, xclib_set_exx_fraction, &
xclib_get_ID, xclib_reset_dft, xc_gcx, &
xclib_dft_is_libxc, xclib_init_libxc, &
xclib_finalize_libxc, xclib_set_finite_size_volume, &
xclib_set_auxiliary_flags, xclib_dft_is, start_exx
USE kind_l, ONLY: DP
USE constants_l, ONLY: pi
USE beef_interface, ONLY: beefsetmode
USE xc_lib, ONLY: xclib_set_dft_from_name, xclib_set_exx_fraction, &
xclib_get_ID, xclib_reset_dft, xc, xc_gcx, &
xc_metagcx, xclib_dft_is_libxc, xclib_init_libxc,&
xclib_finalize_libxc, xclib_set_finite_size_volume,&
xclib_set_auxiliary_flags, xclib_dft_is, start_exx
USE xclib_utils_and_para
!--xml
USE xmltools, ONLY: xml_open_file, xml_closefile,xmlr_readtag, &
xmlw_writetag, xmlw_opentag, xmlw_closetag, &
xmlr_opentag, xmlr_closetag, get_attr, add_attr
USE xmltools, ONLY: xml_open_file, xml_closefile,xmlr_readtag, &
xmlw_writetag, xmlw_opentag, xmlw_closetag, &
xmlr_opentag, xmlr_closetag, get_attr, add_attr
#if defined(__LIBXC)
#include "xc_version.h"
USE xc_f03_lib_m
@ -379,6 +380,15 @@ PROGRAM xclib_test
IF (nr==1) nrpe = mype*nnr
IF (nr==0) nrpe = npoints-(npoints/npes)*(npes-mype-1)
!
! ... openacc init (otherwise it offsets the wall time of the first test)
!
#if defined(_OPENACC)
!$acc data create( time )
!$acc end data
#endif
!
! ... capitalize input vars
!
IF (TRIM(polarization)=='UNPOLARIZED' ) THEN
is_min = 1 ; is_max = 1
ELSEIF (TRIM(polarization)=='POLARIZED' ) THEN
@ -979,6 +989,9 @@ PROGRAM xclib_test
ex1 = 0.d0 ; ec1 = 0.d0
ENDIF
!
IF ( dft(1:3)=='BEE' ) CALL beefsetmode(-1) !** beeforder can be manually
! ! changed here for other checks
!
IF (mype==root) time(3) = get_wall_time()
CALL xc_gcx( nnr, ns, rho, grho, exg1, ecg1, v1x1, v2x1, v1c1, &
v2c1, v2c_ud1 )
@ -1798,9 +1811,9 @@ PROGRAM xclib_test
WRITE(test_output_exe(56:96), '(a)') TRIM(status)
IF (status==passed .AND. show_time) THEN
WRITE(test_output_exe(80:85), '(a)') 'time:'
WRITE(test_output_exe(86:92), '(F6.3)') time_tot2
WRITE(test_output_exe(86:92), '(F6.3)') time_tot1
WRITE(test_output_exe(93:100), '(a)') 's incr:'
WRITE(test_output_exe(101:110), '(F8.3)') time_tot2/time_tot1*100.d0-100.d0
WRITE(test_output_exe(101:110), '(F8.3)') time_tot1/time_tot2*100.d0-100.d0
WRITE(test_output_exe(111:111), '(a)') '%'
ENDIF
WRITE(stdout,*) test_output_exe

View File

@ -13,7 +13,8 @@ subroutine vxc_t(lsd,rho,rhoc,exc,vxc)
! this function returns the XC potential and energy in LDA or
! LSDA approximation
!
use kinds, only : DP
use kinds, only : DP
use xc_lib, only : xc
implicit none
integer, intent(in) :: lsd ! 1 in the LSDA case, 0 otherwise
real(DP), intent(in) :: rho(2), rhoc ! the system density
@ -77,7 +78,8 @@ subroutine vxcgc( ndm, mesh, nspin, r, r2, rho, rhoc, vgc, egc, &
!
use kinds, only : DP
use constants, only : fpi, e2
use xc_lib, only : xclib_set_threshold, xclib_dft_is, xc_gcx
use xc_lib, only : xclib_set_threshold, xclib_dft_is, xc_gcx, &
xc_metagcx
implicit none
integer, intent(in) :: ndm,mesh,nspin,iflag
real(DP), intent(in) :: r(mesh), r2(mesh), rho(ndm,2), rhoc(ndm)

View File

@ -5,13 +5,6 @@ qe_add_global_compile_definitions(__PGI)
# set optimization specific flags
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mcache_align -Mlarge_arrays")
if(QE_ENABLE_OPENACC)
add_library(OpenACC::OpenACC_Fortran INTERFACE IMPORTED)
set_target_properties(OpenACC::OpenACC_Fortran PROPERTIES
INTERFACE_COMPILE_OPTIONS "-acc"
INTERFACE_LINK_OPTIONS "-acc")
endif()
if(QE_ENABLE_CUDA)
if(CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 20.7)
set(CUDA_FLAG "-cuda")

1
install/configure vendored
View File

@ -4793,6 +4793,7 @@ $as_echo "$as_me: WARNING: Using legacy custom solver." >&2;}
if test "$enable_openacc" == "yes"; then
ldflags="$ldflags -acc"
cuda_fflags="$cuda_fflags -acc"
CUDA_CFLAGS="$CUDA_CFLAGS -acc -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime"
fi
fi

View File

@ -209,6 +209,7 @@ EOF
if test "$enable_openacc" == "yes"; then
ldflags="$ldflags -acc"
cuda_fflags="$cuda_fflags -acc"
CUDA_CFLAGS="$CUDA_CFLAGS -acc -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime"
fi
fi

View File

@ -101,6 +101,9 @@ CUDA_RUNTIME=@gpu_runtime@
# CUDA F90 Flags
CUDA_F90FLAGS=@cuda_fflags@ $(MOD_FLAG)$(TOPDIR)/external/devxlib/src
# CUDA C Flags
CUDA_CFLAGS=@CUDA_CFLAGS@
# C preprocessor and preprocessing flags - for explicit preprocessing,
# if needed (see the compilation rules above)
# preprocessing flags must include DFLAGS and IFLAGS
@ -112,7 +115,7 @@ CPPFLAGS = @cppflags@ $(DFLAGS) $(IFLAGS)
# C flags must include DFLAGS and IFLAGS
# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax
CFLAGS = @cflags@ $(DFLAGS) $(IFLAGS)
CFLAGS = @cflags@ $(DFLAGS) $(IFLAGS) $(CUDA_CFLAGS)
F90FLAGS = @f90flags@ @pre_fdflags@$(FDFLAGS) $(CUDA_F90FLAGS) $(IFLAGS) $(MODFLAGS)
# compiler flags with and without optimization for fortran-77