Vxc_acc - rho_r2g on gpu

This commit is contained in:
fabrizio22 2021-11-12 18:18:46 +01:00
parent 5162d496e6
commit bba0b48bc8
3 changed files with 139 additions and 10 deletions

View File

@ -23,7 +23,7 @@ MODULE fft_helper_subroutines
#endif
END INTERFACE
PRIVATE
PUBLIC :: fftx_threed2oned, fftx_oned2threed
PUBLIC :: fftx_threed2oned, fftx_threed2oned_gpu, fftx_oned2threed
PUBLIC :: tg_reduce_rho
PUBLIC :: tg_get_nnr, tg_get_recip_inc, fftx_ntgrp, fftx_tgpe, &
tg_get_group_nr3
@ -469,6 +469,8 @@ CONTAINS
END SUBROUTINE
SUBROUTINE fftx_threed2oned( desc, vin, vout1, vout2 )
!! Copy charge density from 3D array to 1D array in Fourier
!! space.
USE fft_param
USE fft_types, ONLY : fft_type_descriptor
TYPE(fft_type_descriptor), INTENT(in) :: desc
@ -490,7 +492,40 @@ CONTAINS
END DO
END IF
END SUBROUTINE
SUBROUTINE fftx_threed2oned_gpu( desc, vin_d, vout1_d, vout2_d )
!! GPU version of \(\texttt{fftx_threed2oned}\).
USE fft_param
USE fft_types, ONLY : fft_type_descriptor
TYPE(fft_type_descriptor), INTENT(in) :: desc
complex(DP), INTENT(OUT) :: vout1_d(:)
complex(DP), OPTIONAL, INTENT(OUT) :: vout2_d(:)
complex(DP), INTENT(IN) :: vin_d(:)
INTEGER, POINTER :: nl_d(:), nlm_d(:)
COMPLEX(DP) :: fp, fm
INTEGER :: ig
#if defined(__CUDA) && defined(_OPENACC)
attributes(DEVICE) :: nl_d, nlm_d
!$acc data deviceptr( vout1_d(:), vout2_d(:), vin_d(:) )
nl_d => desc%nl_d
nlm_d => desc%nlm_d
IF( PRESENT( vout2_d ) ) THEN
!$acc parallel loop
DO ig=1,desc%ngm
fp=vin_d(nl_d(ig))+vin_d(nlm_d(ig))
fm=vin_d(nl_d(ig))-vin_d(nlm_d(ig))
vout1_d(ig) = CMPLX(0.5d0,0.d0,kind=DP)*CMPLX( DBLE(fp),AIMAG(fm),kind=DP)
vout2_d(ig) = CMPLX(0.5d0,0.d0,kind=DP)*CMPLX(AIMAG(fp),-DBLE(fm),kind=DP)
END DO
ELSE
!$acc parallel loop
DO ig=1,desc%ngm
vout1_d(ig) = vin_d(nl_d(ig))
END DO
END IF
!$acc end data
#endif
END SUBROUTINE
SUBROUTINE fftx_psi2c_gamma( desc, vin, vout1, vout2 )
USE fft_param

View File

@ -16,7 +16,7 @@ MODULE fft_rho
!
IMPLICIT NONE
PRIVATE
PUBLIC :: rho_r2g, rho_g2r
PUBLIC :: rho_r2g, rho_g2r, rho_r2g_gpu
!
INTERFACE rho_g2r
MODULE PROCEDURE rho_g2r_1, rho_g2r_2, rho_g2r_sum_components
@ -24,7 +24,10 @@ MODULE fft_rho
!
CONTAINS
!
!-------------------------------------------------------------
SUBROUTINE rho_r2g ( desc, rhor, rhog, v )
!------------------------------------------------------
!! Bring charge density rho from real to G- space
USE fft_types, ONLY: fft_type_descriptor
USE fft_helper_subroutines, ONLY: fftx_threed2oned
!
@ -93,6 +96,97 @@ CONTAINS
END SUBROUTINE rho_r2g
!
!-----------------------------------------------------------------
SUBROUTINE rho_r2g_gpu( desc, rhor_d, rhog_d, v_d )
!---------------------------------------------------------------
!! Bring charge density rho from real to G- space - GPU version.
!
USE fft_types, ONLY: fft_type_descriptor
USE fft_helper_subroutines, ONLY: fftx_threed2oned_gpu
!
TYPE(fft_type_descriptor), INTENT(in) :: desc
REAL(dp), INTENT(in) :: rhor_d(:,:)
!! rho in real space
COMPLEX(dp), INTENT(OUT):: rhog_d(:,:)
!! rho in G-space
REAL(dp), OPTIONAL, INTENT(in) :: v_d(:)
!
! ... local variables
!
INTEGER :: ir, ig, iss, isup, isdw
INTEGER :: nspin
COMPLEX(dp):: fp, fm
COMPLEX(dp), ALLOCATABLE :: psi_d(:)
!
!$acc data deviceptr( rhor_d(:,:), rhog_d(:,:), v_d(:) )
!
nspin= SIZE(rhor_d, 2)
!
ALLOCATE( psi_d(desc%nnr) )
!$acc data create( psi_d(desc%nnr) )
!$acc host_data use_device( psi_d )
IF( nspin == 1 ) THEN
iss=1
IF( PRESENT( v_d ) ) THEN
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss)+v_d(ir),0.0_dp,kind=dp)
END DO
ELSE
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss),0.0_dp,kind=dp)
END DO
END IF
CALL fwfft('Rho', psi_d, desc )
CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,iss) )
ELSE
IF ( gamma_only ) THEN
! nspin/2 = 1 for LSDA, = 2 for noncolinear
DO iss=1,nspin/2
isup=1+(iss-1)*nspin/2 ! 1 for LSDA, 1 and 3 for noncolinear
isdw=2+(iss-1)*nspin/2 ! 2 for LSDA, 2 and 4 for noncolinear
IF( PRESENT( v_d ) ) THEN
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,isup)+v_d(ir),rhor_d(ir,isdw)+v_d(ir),kind=dp)
END DO
ELSE
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,isup),rhor_d(ir,isdw),kind=dp)
END DO
END IF
CALL fwfft('Rho', psi_d, desc )
CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,isup), rhog_d(:,isdw) )
END DO
ELSE
DO iss=1,nspin
IF( PRESENT( v_d ) ) THEN
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss)+v_d(ir),0.0_dp,kind=dp)
END DO
ELSE
!$acc parallel loop
DO ir=1,desc%nnr
psi_d(ir)=CMPLX(rhor_d(ir,iss),0.0_dp,kind=dp)
END DO
END IF
CALL fwfft('Rho', psi_d, desc )
CALL fftx_threed2oned_gpu( desc, psi_d, rhog_d(:,iss) )
END DO
END IF
ENDIF
!$acc end host_data
!$acc end data
DEALLOCATE( psi_d )
!
!$acc end data
!
END SUBROUTINE rho_r2g_gpu
!
!
SUBROUTINE rho_g2r_1 ( desc, rhog, rhor )
USE fft_types, ONLY: fft_type_descriptor
USE fft_helper_subroutines, ONLY: fftx_threed2oned, fftx_oned2threed

View File

@ -20,7 +20,7 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
USE noncollin_module, ONLY : domag
USE fft_base, ONLY : dfftp
USE fft_interfaces, ONLY : fwfft
USE fft_rho, ONLY : rho_r2g
USE fft_rho, ONLY : rho_r2g, rho_r2g_gpu
USE control_flags, ONLY : use_gpu
!
IMPLICIT NONE
@ -80,17 +80,17 @@ SUBROUTINE gradcorr( rho, rhog, rho_core, rhog_core, etxc, vtxc, v )
IF ( nspin == 4 .AND. domag ) THEN
!
!$acc data copyin(rho) copyout(rhoaux, segni )
!$acc host_data use_device( rho, rhoaux, segni )
!$acc data copyin(rho) copyout(rhoaux, rhogaux, segni )
!$acc host_data use_device( rho, rhoaux, rhogaux, segni )
IF ( use_gpu ) CALL compute_rho_gpu( rho, rhoaux, segni, dfftp%nnr )
IF (.NOT. use_gpu) CALL compute_rho( rho, rhoaux, segni, dfftp%nnr )
!$acc end host_data
!$acc end data
!
! ... bring starting rhoaux to G-space
!
CALL rho_r2g ( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
IF ( use_gpu ) CALL rho_r2g_gpu( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
IF (.NOT. use_gpu) CALL rho_r2g( dfftp, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) )
!$acc end host_data
!$acc end data
!
ELSE
!