Merge branch 'orthoatwfc-GPU' into 'gpu-develop'

orthoatwfc gpu

See merge request QEF/q-e-gpu!14
This commit is contained in:
Pietro 2020-05-29 08:01:37 +00:00
commit 4885af13e3
5 changed files with 853 additions and 6 deletions

439
PW/src/atomic_wfc_gpu.f90 Normal file
View File

@ -0,0 +1,439 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d )
!-----------------------------------------------------------------------
!! This routine computes the superposition of atomic wavefunctions
!! for k-point "ik" - output in "wfcatom".
!
USE kinds, ONLY : DP
USE constants, ONLY : tpi, fpi, pi
USE cell_base, ONLY : omega, tpiba
USE ions_base, ONLY : nat, ntyp => nsp, ityp, tau
USE basis, ONLY : natomwfc
!USE gvect, ONLY : mill, eigts1, eigts2, eigts3, g
USE klist, ONLY : xk, ngk, igk_k_d !, igk_k
USE wvfct, ONLY : npwx
USE us, ONLY : tab_at, dq
USE uspp_param, ONLY : upf
USE noncollin_module, ONLY : noncolin, npol, angle1, angle2
USE spin_orb, ONLY : lspinorb, rot_ylm, fcoef, lmaxx, domag, &
starting_spin_angle
USE mp_bands, ONLY : inter_bgrp_comm
USE mp, ONLY : mp_sum
!
USE gvect_gpum, ONLY : mill_d, eigts1_d, eigts2_d, eigts3_d, g_d
USE us_gpum, ONLY : using_tab_at, using_tab_at_d, tab_at_d
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ik
!! k-point index
COMPLEX(DP), INTENT(OUT) :: wfcatom_d(npwx,npol,natomwfc)
!! Superposition of atomic wavefunctions
!
! ... local variables
!
INTEGER :: n_starting_wfc, lmax_wfc, nt, l, nb, na, m, lm, ig, iig, &
i0, i1, i2, i3, nwfcm, npw
COMPLEX(DP) :: kphase, lphase
REAL(DP) :: arg, px, ux, vx, wx
INTEGER :: ig_start, ig_end, mil1, mil2, mil3
!
REAL(DP) :: xk1, xk2, xk3, qgr
REAL(DP), ALLOCATABLE :: chiq_d(:,:,:)
REAL(DP), ALLOCATABLE :: ylm_d(:,:), gk_d(:,:), qg_d(:)
COMPLEX(DP), ALLOCATABLE :: sk_d(:), aux_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: wfcatom_d, ylm_d, gk_d, qg_d, sk_d, chiq_d, aux_d
#endif
!
CALL start_clock( 'atomic_wfc' )
! calculate max angular momentum required in wavefunctions
lmax_wfc = 0
DO nt = 1, ntyp
lmax_wfc = MAX( lmax_wfc, MAXVAL( upf(nt)%lchi(1:upf(nt)%nwfc) ) )
END DO
!
nwfcm = MAXVAL( upf(1:ntyp)%nwfc )
npw = ngk(ik)
!
ALLOCATE( ylm_d(npw,(lmax_wfc+1)**2), gk_d(3,npw), qg_d(npw) )
ALLOCATE( chiq_d(npw,nwfcm,ntyp), sk_d(npw) )
!
xk1 = xk(1,ik)
xk2 = xk(2,ik)
xk3 = xk(3,ik)
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, npw
iig = igk_k_d(ig,ik)
gk_d(1,ig) = xk1 + g_d(1,iig)
gk_d(2,ig) = xk2 + g_d(2,iig)
gk_d(3,ig) = xk3 + g_d(3,iig)
qg_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2
END DO
!
! ylm = spherical harmonics
!
CALL ylmr2_gpu( (lmax_wfc+1)**2, npw, gk_d, qg_d, ylm_d )
!
! from now to the end of the routine the ig loops are distributed across bgrp
CALL divide( inter_bgrp_comm, npw, ig_start, ig_end )
!
! set now q=|k+G| in atomic units
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
qg_d(ig) = SQRT( qg_d(ig) )*tpiba
END DO
!
n_starting_wfc = 0
!
! chiq = radial fourier transform of atomic orbitals chi
!
CALL using_tab_at(0)
CALL using_tab_at_d(0)
!
DO nt = 1, ntyp
DO nb = 1, upf(nt)%nwfc
IF ( upf(nt)%oc(nb) >= 0.d0 ) THEN
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
qgr = qg_d(ig)
px = qgr / dq - DBLE(INT(qgr/dq))
ux = 1.d0 - px
vx = 2.d0 - px
wx = 3.d0 - px
i0 = INT(qgr/dq) + 1
i1 = i0 + 1
i2 = i0 + 2
i3 = i0 + 3
chiq_d(ig,nb,nt) = &
tab_at_d(i0,nb,nt) * ux * vx * wx / 6.d0 + &
tab_at_d(i1,nb,nt) * px * vx * wx / 2.d0 - &
tab_at_d(i2,nb,nt) * px * ux * wx / 2.d0 + &
tab_at_d(i3,nb,nt) * px * ux * vx / 6.d0
END DO
!
END IF
END DO
END DO
DEALLOCATE( qg_d, gk_d )
!
IF (noncolin) ALLOCATE( aux_d(npw) )
!
wfcatom_d(:,:,:) = (0.0_dp, 0.0_dp)
!
DO na = 1, nat
arg = (xk1*tau(1,na) + xk2*tau(2,na) + xk3*tau(3,na)) * tpi
kphase = CMPLX( COS(arg), - SIN(arg) ,KIND=DP)
!
! sk is the structure factor
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
iig = igk_k_d(ig,ik)
mil1 = mill_d(1,iig)
mil2 = mill_d(2,iig)
mil3 = mill_d(3,iig)
sk_d(ig) = kphase * eigts1_d(mil1,na) * &
eigts2_d(mil2,na) * &
eigts3_d(mil3,na)
END DO
!
nt = ityp(na)
DO nb = 1, upf(nt)%nwfc
IF ( upf(nt)%oc(nb) >= 0.d0 ) THEN
l = upf(nt)%lchi(nb)
lphase = (0.d0,1.d0)**l
!
! the factor i^l MUST BE PRESENT in order to produce
! wavefunctions for k=0 that are real in real space
!
IF ( noncolin ) THEN
!
IF ( upf(nt)%has_so ) THEN
!
IF (starting_spin_angle.OR..NOT.domag) THEN
CALL atomic_wfc_so_gpu( )
ELSE
CALL atomic_wfc_so_mag_gpu( )
END IF
!
ELSE
!
CALL atomic_wfc_nc_gpu( )
!
END IF
!
ELSE
!
CALL atomic_wfc___gpu( )
!
END IF
!
END IF
!
END DO
!
END DO
IF ( n_starting_wfc /= natomwfc) call errore ('atomic_wfc', &
'internal error: some wfcs were lost ', 1 )
IF (noncolin) DEALLOCATE( aux_d )
DEALLOCATE( sk_d, chiq_d, ylm_d )
! collect results across bgrp
CALL mp_sum( wfcatom_d, inter_bgrp_comm )
CALL stop_clock( 'atomic_wfc' )
RETURN
CONTAINS
!----------------------------------------------------------------
SUBROUTINE atomic_wfc_so_gpu( )
!------------------------------------------------------------
!! Spin-orbit case.
!
REAL(DP) :: fact(2), fact_is, j
COMPLEX(DP) :: rot_ylm_in1
REAL(DP), EXTERNAL :: spinor
INTEGER :: ind, ind1, n1, is, sph_ind
!
j = upf(nt)%jchi(nb)
DO m = -l-1, l
fact(1) = spinor(l,j,m,1)
fact(2) = spinor(l,j,m,2)
IF ( ABS(fact(1)) > 1.d-8 .OR. ABS(fact(2)) > 1.d-8 ) THEN
n_starting_wfc = n_starting_wfc + 1
IF (n_starting_wfc > natomwfc) CALL errore &
('atomic_wfc_so', 'internal error: too many wfcs', 1)
!
DO is = 1, 2
fact_is = fact(is)
IF (ABS(fact(is)) > 1.d-8) THEN
ind = lmaxx + 1 + sph_ind(l,j,m,is)
aux_d = (0.d0,0.d0)
DO n1 = 1, 2*l+1
ind1 = l**2+n1
rot_ylm_in1 = rot_ylm(ind,n1)
IF (ABS(rot_ylm_in1) > 1.d-8) THEN
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
aux_d(ig) = aux_d(ig) + rot_ylm_in1 * &
CMPLX(ylm_d(ig,ind1))
ENDDO
ENDIF
ENDDO
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
wfcatom_d(ig,is,n_starting_wfc) = lphase * &
sk_d(ig)*aux_d(ig)*CMPLX(fact_is* &
chiq_d(ig,nb,nt))
END DO
ELSE
wfcatom_d(:,is,n_starting_wfc) = (0.d0,0.d0)
END IF
END DO
!
END IF
END DO
!
END SUBROUTINE atomic_wfc_so_gpu
!
SUBROUTINE atomic_wfc_so_mag_gpu( )
!
!! Spin-orbit case, magnetization along "angle1" and "angle2"
!! In the magnetic case we always assume that magnetism is much larger
!! than spin-orbit and average the wavefunctions at l+1/2 and l-1/2
!! filling then the up and down spinors with the average wavefunctions,
!! according to the direction of the magnetization, following what is
!! done in the noncollinear case.
!
REAL(DP) :: alpha, gamman, j
COMPLEX(DP) :: fup, fdown
REAL(DP), ALLOCATABLE :: chiaux_d(:)
INTEGER :: nc, ib
!
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: chiaux_d
#endif
!
j = upf(nt)%jchi(nb)
!
! This routine creates two functions only in the case j=l+1/2 or exit in the
! other case
!
IF (ABS(j-l+0.5_DP)<1.d-4) RETURN
ALLOCATE( chiaux_d(npw) )
!
! Find the functions j=l-1/2
!
IF (l == 0) THEN
chiaux_d(:) = chiq_d(:,nb,nt)
ELSE
DO ib = 1, upf(nt)%nwfc
IF ((upf(nt)%lchi(ib) == l).AND. &
(ABS(upf(nt)%jchi(ib)-l+0.5_DP)<1.d-4)) THEN
nc = ib
EXIT
ENDIF
ENDDO
!
! Average the two functions
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, npw
chiaux_d(ig) = (chiq_d(ig,nb,nt)*DBLE(l+1)+chiq_d(ig,nc,nt)*l)/ &
DBLE(2*l+1)
ENDDO
!
ENDIF
!
! and construct the starting wavefunctions as in the noncollinear case.
!
alpha = angle1(nt)
gamman = - angle2(nt) + 0.5d0*pi
!
DO m = 1, 2*l+1
lm = l**2+m
n_starting_wfc = n_starting_wfc + 1
IF ( n_starting_wfc + 2*l+1 > natomwfc ) CALL errore &
('atomic_wfc_nc', 'internal error: too many wfcs', 1)
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
aux_d(ig) = sk_d(ig)* CMPLX(ylm_d(ig,lm)*chiaux_d(ig))
END DO
!
! now, rotate wfc as needed
! first : rotation with angle alpha around (OX)
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
fup = CMPLX(COS(0.5d0*alpha))*aux_d(ig)
fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*alpha))*aux_d(ig)
!
! Now, build the orthogonal wfc
! first rotation with angle (alpha+pi) around (OX)
!
wfcatom_d(ig,1,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) &
+(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fup
wfcatom_d(ig,2,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) &
-(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown
!
! second: rotation with angle gamma around (OZ)
!
! Now, build the orthogonal wfc
! first rotation with angle (alpha+pi) around (OX)
!
fup = CMPLX(COS(0.5d0*(alpha+pi)))*aux_d(ig)
fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*(alpha+pi)))*aux_d(ig)
!
! second, rotation with angle gamma around (OZ)
!
wfcatom_d(ig,1,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) &
+(0.d0,1.d0)*CMPLX(SIN(0.5d0 *gamman)))*fup
wfcatom_d(ig,2,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) &
-(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown
END DO
END DO
!
n_starting_wfc = n_starting_wfc + 2*l+1
!
DEALLOCATE( chiaux_d )
!
END SUBROUTINE atomic_wfc_so_mag_gpu
!
!
SUBROUTINE atomic_wfc_nc_gpu( )
!
!! noncolinear case, magnetization along "angle1" and "angle2"
!
REAL(DP) :: alpha, gamman
COMPLEX(DP) :: fup, fdown
!
alpha = angle1(nt)
gamman = - angle2(nt) + 0.5d0*pi
!
DO m = 1, 2*l+1
lm = l**2 + m
n_starting_wfc = n_starting_wfc + 1
IF ( n_starting_wfc + 2*l+1 > natomwfc) CALL errore &
('atomic_wfc_nc', 'internal error: too many wfcs', 1)
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
aux_d(ig) = sk_d(ig)*CMPLX(ylm_d(ig,lm)*chiq_d(ig,nb,nt))
END DO
!
! now, rotate wfc as needed
! first : rotation with angle alpha around (OX)
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
fup = CMPLX(COS(0.5d0*alpha))*aux_d(ig)
fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*alpha))*aux_d(ig)
!
! Now, build the orthogonal wfc
! first rotation with angle (alpha+pi) around (OX)
!
wfcatom_d(ig,1,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) &
+(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fup
wfcatom_d(ig,2,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) &
-(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown
!
! second: rotation with angle gamma around (OZ)
!
! Now, build the orthogonal wfc
! first rotation with angle (alpha+pi) around (OX)
!
fup = CMPLX(COS(0.5d0*(alpha+pi)))*aux_d(ig)
fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*(alpha+pi)))*aux_d(ig)
!
! second, rotation with angle gamma around (OZ)
!
wfcatom_d(ig,1,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) &
+(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fup
wfcatom_d(ig,2,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) &
-(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown
END DO
END DO
n_starting_wfc = n_starting_wfc + 2*l+1
!
END SUBROUTINE atomic_wfc_nc_gpu
!
!
SUBROUTINE atomic_wfc___gpu( )
!
! ... LSDA or nonmagnetic case
!
DO m = 1, 2 * l + 1
lm = l**2 + m
n_starting_wfc = n_starting_wfc + 1
IF ( n_starting_wfc > natomwfc) CALL errore &
('atomic_wfc___', 'internal error: too many wfcs', 1)
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = ig_start, ig_end
wfcatom_d(ig,1,n_starting_wfc) = lphase * &
sk_d(ig) * CMPLX(ylm_d(ig,lm) * chiq_d(ig,nb,nt))
ENDDO
!
END DO
!
END SUBROUTINE atomic_wfc___gpu
!
END SUBROUTINE atomic_wfc_gpu

View File

@ -21,7 +21,7 @@ SUBROUTINE hinit1()
USE lsda_mod, ONLY : nspin
USE noncollin_module, ONLY : report
USE scf, ONLY : vrs, vltot, v, kedtau
USE control_flags, ONLY : tqr
USE control_flags, ONLY : tqr, use_gpu
USE realus, ONLY : generate_qpointlist, betapointlist, &
init_realspace_vars, real_space
USE wannier_new, ONLY : use_wannier
@ -33,6 +33,7 @@ SUBROUTINE hinit1()
USE dfunct, ONLY : newd
!
USE scf_gpum, ONLY : using_vrs
!
IMPLICIT NONE
!
@ -82,8 +83,13 @@ SUBROUTINE hinit1()
! ... and recalculate the products of the S with the atomic wfcs used
! ... in LDA+U calculations
!
IF ( lda_plus_u ) CALL orthoUwfc()
IF ( use_wannier ) CALL orthoatwfc( .TRUE. )
IF (.NOT. use_gpu) THEN
IF ( lda_plus_u ) CALL orthoUwfc()
IF ( use_wannier ) CALL orthoatwfc( .TRUE. )
ELSE
IF ( lda_plus_u ) CALL orthoUwfc_gpu()
IF ( use_wannier ) CALL orthoatwfc_gpu( .TRUE. )
ENDIF
!
!
RETURN

View File

@ -39,7 +39,9 @@ PWLIBS += \
stres_us_gpu.o \
compute_deff_gpu.o \
gen_us_dj_gpu.o \
gen_us_dy_gpu.o
gen_us_dy_gpu.o \
orthoatwfc_gpu.o \
atomic_wfc_gpu.o
MODFLAGS += \
$(MOD_FLAG)../../GScratch

400
PW/src/orthoatwfc_gpu.f90 Normal file
View File

@ -0,0 +1,400 @@
!
! Copyright (C) 2001-2020 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#if !defined(__CUDA)
#define cublasZgemm zgemm
#endif
!
!-----------------------------------------------------------------------
SUBROUTINE orthoUwfc_gpu
!-----------------------------------------------------------------------
!
! This routine saves to buffer "iunhub" atomic wavefunctions having an
! associated Hubbard U term * S, for DFT+U(+V) calculations. Same for
! "iunhub2" but without S (this is then used to computed Hubbard forces
! and stresses). Atomic wavefunctions
! are orthogonalized if desired, depending upon the value of "U_projection"
! "swfcatom" must NOT be allocated on input.
!
USE kinds, ONLY : DP
USE buffers, ONLY : get_buffer, save_buffer
USE io_global, ONLY : stdout
USE io_files, ONLY : iunhub, iunhub2, nwordwfcU
USE ions_base, ONLY : nat
USE basis, ONLY : natomwfc, swfcatom
USE klist, ONLY : nks, xk, ngk, igk_k, igk_k_d
USE ldaU, ONLY : U_projection, wfcU, nwfcU, copy_U_wfc
USE wvfct, ONLY : npwx
USE uspp, ONLY : nkb, vkb
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, &
bec_type, becp, calbec
USE control_flags, ONLY : gamma_only
USE noncollin_module, ONLY : noncolin, npol
USE mp_bands, ONLY : use_bgrp_in_hpsi
!
USE uspp_gpum, ONLY : using_vkb, using_vkb_d, vkb_d
USE becmod_gpum, ONLY : becp_d
USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, &
calbec_gpu
!
IMPLICIT NONE
!
!
INTEGER :: ik, ibnd, info, i, j, k, na, nb, nt, isym, n, ntemp, m, &
l, lm, ltot, ntot, ipol, npw
! ik: the k point under consideration
! ibnd: counter on bands
LOGICAL :: orthogonalize_wfc, normalize_only, save_flag
COMPLEX(DP) , ALLOCATABLE :: wfcatom (:,:)
!
COMPLEX(DP) , ALLOCATABLE :: wfcatom_d(:,:)
COMPLEX(DP) , ALLOCATABLE :: swfcatom_d(:,:)
!
#if defined(__CUDA)
attributes(DEVICE) :: wfcatom_d, swfcatom_d
#endif
!
IF ( U_projection == "pseudo" ) THEN
WRITE( stdout,*) 'Beta functions used for LDA+U Projector'
RETURN
ELSE IF (U_projection=="file") THEN
!
! Read atomic wavefunctions from file (produced by pmw.x). In this case,
! U-specific atomic wavefunctions wfcU coincide with atomic wavefunctions
!
WRITE( stdout,*) 'LDA+U Projector read from file '
DO ik = 1, nks
CALL get_buffer(wfcU, nwordwfcU, iunhub, ik)
END DO
RETURN
ELSE IF (U_projection=="atomic") THEN
orthogonalize_wfc = .FALSE.
normalize_only = .FALSE.
WRITE( stdout,*) 'Atomic wfc used for LDA+U Projector are NOT orthogonalized'
ELSE IF (U_projection=="ortho-atomic") THEN
orthogonalize_wfc = .TRUE.
normalize_only = .FALSE.
WRITE( stdout,*) 'Atomic wfc used for LDA+U Projector are orthogonalized'
IF (gamma_only) CALL errore('orthoatwfc', &
'Gamma-only calculation for this case not implemented', 1 )
ELSE IF (U_projection=="norm-atomic") THEN
orthogonalize_wfc = .TRUE.
normalize_only = .TRUE.
WRITE( stdout,*) 'Atomic wfc used for LDA+U Projector are normalized but NOT orthogonalized'
IF (gamma_only) CALL errore('orthoatwfc', &
'Gamma-only calculation for this case not implemented', 1 )
ELSE
WRITE( stdout,*) "U_projection_type =", U_projection
CALL errore ("orthoatwfc"," this U_projection_type is not valid",1)
END IF
!
ALLOCATE( wfcatom(npwx*npol,natomwfc), swfcatom(npwx*npol,natomwfc) )
ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) ) !
save_flag = use_bgrp_in_hpsi ; use_bgrp_in_hpsi=.false.
!
! Allocate the array becp = <beta|wfcatom>
CALL allocate_bec_type( nkb, natomwfc, becp )
CALL using_becp_auto(2)
!
!
DO ik = 1, nks
!
IF (noncolin) THEN
CALL atomic_wfc_nc_updown( ik, wfcatom )
wfcatom_d = wfcatom
ELSE
CALL atomic_wfc_gpu( ik, wfcatom_d )
ENDIF
!
npw = ngk(ik)
!CALL using_vkb(1)
CALL using_vkb_d(2)
CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d )
!
CALL using_becp_d_auto(2)
CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d )
!
CALL s_psi_gpu( npwx, npw, natomwfc, wfcatom_d, swfcatom_d )
!
IF (orthogonalize_wfc) CALL ortho_swfc_gpu( npw, normalize_only, &
natomwfc, wfcatom_d, swfcatom_d, .TRUE. )
!
! copy atomic wavefunctions with Hubbard U term only in wfcU
! (this is then used to compute Hubbard forces and stresses)
! save to unit iunhub2
!
wfcatom = wfcatom_d
swfcatom = swfcatom_d
!
CALL copy_U_wfc( wfcatom, noncolin )
CALL save_buffer( wfcU, nwordwfcU, iunhub2, ik )
!
! copy S * atomic wavefunctions with Hubbard U term only in wfcU
! (this is used during the self-consistent solution of Kohn-Sham equations)
! save to unit iunhub
!
CALL copy_U_wfc( swfcatom, noncolin )
IF ( nks > 1 ) &
CALL save_buffer( wfcU, nwordwfcU, iunhub, ik )
!
ENDDO
!
DEALLOCATE( wfcatom, swfcatom )
DEALLOCATE( wfcatom_d, swfcatom_d )
!
CALL deallocate_bec_type( becp )
CALL using_becp_auto(2)
!
use_bgrp_in_hpsi = save_flag
!
RETURN
END SUBROUTINE orthoUwfc_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc )
!-----------------------------------------------------------------------
!
! This routine calculates atomic wavefunctions, orthogonalizes them
! if "orthogonalzie_wfc" is .true., saves them into buffer "iunsat".
! "swfcatom" must be allocated on input.
! Useful for options "wannier" and "one_atom_occupations"
!
USE kinds, ONLY : DP
USE buffers, ONLY : save_buffer
USE io_global, ONLY : stdout
USE io_files, ONLY : iunsat, nwordatwfc
USE ions_base, ONLY : nat
USE basis, ONLY : natomwfc, swfcatom
USE klist, ONLY : nks, xk, ngk, igk_k, igk_k_d
USE wvfct, ONLY : npwx
USE uspp, ONLY : nkb, vkb
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, &
bec_type, becp, calbec
USE control_flags, ONLY : gamma_only
USE noncollin_module, ONLY : noncolin, npol
!
USE uspp_gpum, ONLY : using_vkb, using_vkb_d, vkb_d
USE becmod_gpum, ONLY : becp_d
USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, &
calbec_gpu
!
IMPLICIT NONE
!
LOGICAL, INTENT(IN) :: orthogonalize_wfc
!
INTEGER :: ik, ibnd, info, i, j, k, na, nb, nt, isym, n, ntemp, m, &
l, lm, ltot, ntot, ipol, npw
! ik: the k point under consideration
! ibnd: counter on bands
LOGICAL :: normalize_only = .FALSE.
COMPLEX(DP), ALLOCATABLE :: wfcatom(:,:)
COMPLEX(DP), ALLOCATABLE :: wfcatom_d(:,:), swfcatom_d(:,:)
!
#if defined(__CUDA)
attributes(DEVICE) :: wfcatom_d, swfcatom_d
#endif
!
normalize_only=.FALSE.
ALLOCATE( wfcatom(npwx*npol,natomwfc) )
ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) )
!
! Allocate the array becp = <beta|wfcatom>
CALL allocate_bec_type( nkb, natomwfc, becp )
!
DO ik = 1, nks
!
IF (noncolin) THEN
CALL atomic_wfc_nc_updown( ik, wfcatom )
wfcatom_d = wfcatom
ELSE
CALL atomic_wfc_gpu( ik, wfcatom_d )
ENDIF
!
npw = ngk(ik)
!CALL using_vkb(1)
CALL using_vkb_d(2)
CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d )
!
CALL using_becp_auto(2)
CALL using_becp_d_auto(2)
CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d )
!
CALL s_psi_gpu( npwx, npw, natomwfc, wfcatom_d, swfcatom_d )
!
IF (orthogonalize_wfc) CALL ortho_swfc_gpu( npw, normalize_only, &
natomwfc, wfcatom_d, swfcatom_d, .FALSE. )
!
! write S * atomic wfc to unit iunsat
!
swfcatom = swfcatom_d
!
CALL save_buffer( swfcatom, nwordatwfc, iunsat, ik )
!
ENDDO
!
DEALLOCATE( wfcatom )
DEALLOCATE( wfcatom_d, swfcatom_d )
CALL deallocate_bec_type( becp )
!
RETURN
!
END SUBROUTINE orthoatwfc_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc_d, swfc_d, lflag )
!-----------------------------------------------------------------------
!
! On input : wfc (npwx*npol,m) = \psi = a set of "m" (atomic) wavefcts
! swfc(npwx*npol,m) = S\psi
! normalize_only = only normalize, do not orthonormalize
!
! This routine will compute the overlap matrix O:
! O_ij = <wfc_i|S|wfc_j> = <wfc_i|swfc_j>
!
! On output: swfc = O^{-1/2} S\psi, i.e. S * orthonormalized wavefunctions
! If lflag=.FALSE. : wfc are unchanged on output (not orthonormalized), i.e.
! wfc = \psi
! If lflag=.TRUE. : wfc are orthonormalized on output, i.e.
! wfc = O^{-1/2} \psi, <wfc_i|S|wfc_j> = \delta_{ij}
!
#if defined(__CUDA)
USE cublas
#endif
!
USE kinds, ONLY : DP
USE wvfct, ONLY : npwx
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, root_bgrp
USE mp, ONLY : mp_sum
USE noncollin_module, ONLY : noncolin, npol
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: m, npw
LOGICAL, INTENT(IN) :: normalize_only
COMPLEX(DP), INTENT(INOUT) :: wfc_d(npwx*npol,m)
COMPLEX(DP), INTENT(INOUT) :: swfc_d(npwx*npol,m)
LOGICAL, INTENT(IN) :: lflag
!
! ... local variables
!
INTEGER :: i, j, k, ipol
COMPLEX(DP) :: temp
!
COMPLEX(DP), ALLOCATABLE :: overlap_d(:,:)
COMPLEX(DP), ALLOCATABLE :: work_d(:,:), s_d(:,:)
REAL(DP) , ALLOCATABLE :: e_d(:)
!
#if defined(__CUDA)
attributes(DEVICE) :: wfc_d, swfc_d, overlap_d, work_d, s_d, e_d
#endif
!
ALLOCATE( overlap_d(m,m) )
ALLOCATE( work_d(m,m), s_d(m,m) )
ALLOCATE( e_d(m) )
!
overlap_d(:,:) = (0.d0,0.d0)
work_d(:,:) = (0.d0,0.d0)
!
! calculate overlap matrix
!
IF (noncolin) THEN
CALL cublasZgemm( 'C', 'N', m, m, npwx*npol, (1.d0,0.d0), wfc_d, &
npwx*npol, swfc_d, npwx*npol, (0.d0,0.d0), overlap_d, m )
ELSE
CALL cublasZgemm( 'C', 'N', m, m, npw, (1.d0,0.d0), wfc_d, &
npwx, swfc_d, npwx, (0.d0,0.d0), overlap_d, m )
END IF
!
CALL mp_sum( overlap_d, intra_bgrp_comm )
!
IF ( normalize_only ) THEN
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, m
DO j = i+1, m
overlap_d(i,j) = CMPLX(0.d0,0.d0, kind=dp)
overlap_d(j,i) = CMPLX(0.d0,0.d0, kind=dp)
ENDDO
ENDDO
END IF
!
! find O^(-1/2)
!
s_d = CMPLX(0.d0,0.d0, kind=dp)
!$cuf kernel do (1)
DO i = 1, m
s_d(i,i) = CMPLX(1.d0,0.d0, kind=dp)
ENDDO
!
CALL laxlib_cdiaghg_gpu( m, m, overlap_d, s_d, m, e_d, work_d, me_bgrp, &
root_bgrp, intra_bgrp_comm )
!
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, m
e_d(i) = 1.d0 / SQRT(e_d(i))
ENDDO
!$cuf kernel do (1) <<<*,*>>>
DO i = 1, m
DO j = i, m
temp = (0.d0, 0.d0)
DO k = 1, m
temp = temp + e_d(k) * work_d(j,k) * CONJG(work_d(i,k))
ENDDO
overlap_d(i,j) = temp
IF (j /= i) overlap_d(j,i) = CONJG(temp)
ENDDO
ENDDO
!
! transform atomic orbitals O^(-1/2) S\psi
! FIXME: can be done in a faster way by using wfc as work space
!
DO i = 1, npw
work_d(:,1) = (0.d0,0.d0)
IF (noncolin) THEN
DO ipol=1,npol
j = i + (ipol-1)*npwx
CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, &
m, swfc_d(j,1), npwx*npol, (0.d0,0.d0), work_d, 1 )
CALL cublasZcopy( m, work_d, 1, swfc_d(j,1), npwx*npol )
END DO
ELSE
CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, &
m, swfc_d(i,1), npwx, (0.d0,0.d0), work_d, 1 )
CALL cublasZcopy( m, work_d, 1, swfc_d(i,1), npwx )
END IF
ENDDO
!
! If lflag=.TRUE. transform atomic orbitals without
! the ultrasoft S operator O^(-1/2) \psi
!
IF (lflag) THEN
DO i = 1, npw
work_d(:,1) = (0.d0,0.d0)
IF (noncolin) THEN
DO ipol = 1, npol
j = i + (ipol-1)*npwx
CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, &
m, wfc_d(j,1), npwx*npol, (0.d0,0.d0), work_d, 1 )
CALL cublasZcopy( m, work_d, 1, wfc_d(j,1), npwx*npol )
END DO
ELSE
CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, &
m, wfc_d(i,1), npwx, (0.d0,0.d0), work_d, 1 )
CALL cublasZcopy( m, work_d, 1, wfc_d(i,1), npwx )
END IF
ENDDO
ENDIF
!
DEALLOCATE( overlap_d, s_d )
DEALLOCATE( work_d, e_d )
!
RETURN
!
END SUBROUTINE ortho_swfc_gpu

View File

@ -51,8 +51,8 @@ SUBROUTINE wfcinit_gpu()
!
! ... Orthogonalized atomic functions needed for DFT+U and other cases
!
IF ( use_wannier .OR. one_atom_occupations ) CALL orthoatwfc ( use_wannier )
IF ( lda_plus_u ) CALL orthoUwfc()
IF ( use_wannier .OR. one_atom_occupations ) CALL orthoatwfc_gpu( use_wannier )
IF ( lda_plus_u ) CALL orthoUwfc_gpu()
!
! ... open files/buffer for wavefunctions (nwordwfc set in openfil)
! ... io_level > 1 : open file, otherwise: open buffer