Merge branch 'more_cuda_cleanup' into 'develop'

More cuda cleanup

See merge request QEF/q-e!2410
This commit is contained in:
giannozz 2024-08-13 07:01:00 +00:00
commit a59e791975
4 changed files with 127 additions and 391 deletions

View File

@ -1,5 +1,5 @@
!
! Copyright (C) 2001-2015 Quantum ESPRESSO group
! Copyright (C) 2001-2024 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@ -23,15 +23,15 @@ SUBROUTINE force_us( forcenl )
USE lsda_mod, ONLY : lsda, current_spin, isk, nspin
USE symme, ONLY : symvector
USE wavefunctions, ONLY : evc
USE noncollin_module, ONLY : npol, noncolin
USE noncollin_module, ONLY : npol, noncolin, lspinorb
USE io_files, ONLY : iunwfc, nwordwfc
USE buffers, ONLY : get_buffer
USE becmod, ONLY : calbec, becp, bec_type, &
allocate_bec_type, deallocate_bec_type, &
allocate_bec_type_acc, deallocate_bec_type_acc
USE mp_pools, ONLY : inter_pool_comm
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum, mp_get_comm_null
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
USE mp, ONLY : mp_sum
USE uspp_init, ONLY : init_us_2
!
IMPLICIT NONE
@ -42,52 +42,20 @@ SUBROUTINE force_us( forcenl )
! ... local variables
!
COMPLEX(DP), ALLOCATABLE :: vkb1(:,:) ! contains g*|beta>
!$acc declare device_resident(vkb1)
COMPLEX(DP), ALLOCATABLE :: deff_nc(:,:,:,:)
REAL(DP), ALLOCATABLE :: deff(:,:,:)
TYPE(bec_type) :: dbecp ! contains <dbeta|psi>
INTEGER :: npw, ik, ipol, ig, jkb
INTEGER :: itot, nt, na
INTEGER, ALLOCATABLE :: nt_list(:), na_list(:)
LOGICAL, ALLOCATABLE :: ismulti_np(:)
COMPLEX(DP), ALLOCATABLE :: becpnc(:,:,:), becpk(:,:), &
dbecpnc(:,:,:), dbecpk(:,:)
!$acc declare device_resident(becpnc,becpk,dbecpnc,dbecpk)
!civn: these buffers are kept here instead of inside force_us_k to
! save allocation/deallocation overhead inside the ik,ipol loops
TYPE(bec_type) :: becd ! contains <dbeta|psi>
COMPLEX(DP) :: deff_nc
REAL(DP) :: deff, fnl
INTEGER :: npw, ik, ipol, ig, na, na_s, na_e, mykey
INTEGER :: nt, ibnd, nhnt, ih, jh, ijkb0, ikb, jkb, is, js, ijs
!
forcenl(:,:) = 0.D0
!
CALL allocate_bec_type_acc( nkb, nbnd, becp, intra_bgrp_comm )
CALL allocate_bec_type_acc( nkb, nbnd, dbecp, intra_bgrp_comm )
!
CALL allocate_bec_type_acc( nkb, nbnd, becd, intra_bgrp_comm )
ALLOCATE( vkb1(npwx,nkb) )
IF (noncolin) THEN
ALLOCATE( becpnc(nkb,npol,nbnd), dbecpnc(nkb,npol,nbnd) )
ALLOCATE( deff_nc(nhm,nhm,nat,nspin) )
ELSEIF (.NOT. gamma_only ) THEN
ALLOCATE( becpk(nkb,nbnd), dbecpk(nkb,nbnd) )
ALLOCATE( deff(nhm,nhm,nat) )
ENDIF
!$acc data create(vkb1)
!
!$acc data create(deff,deff_nc)
!
ALLOCATE( nt_list(nat), na_list(nat), ismulti_np(nat) )
!
itot = 0
DO nt = 1, ntyp
DO na = 1, nat
IF ( ityp(na)==nt ) THEN
itot = itot + 1
nt_list(itot) = nt
na_list(itot) = na
ismulti_np(itot) = upf(nt)%tvanp .OR. upf(nt)%is_multiproj
ENDIF
ENDDO
ENDDO
IF (itot /= nat) CALL errore( 'force_us', 'Something wrong in atoms counting', 1 )
!
! ... the forces are a sum over the K points and over the bands
! ... the forces are summed over K-points
!
DO ik = 1, nks
!
@ -96,25 +64,18 @@ SUBROUTINE force_us( forcenl )
!
IF ( nks > 1 ) THEN
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
!$acc update device( evc )
ENDIF
!
!$acc update device( evc )
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
!$acc data present (evc, vkb, becp)
CALL calbec( offload_type, npw, vkb, evc, becp )
IF (noncolin) THEN
!$acc kernels
becpnc = becp%nc
!$acc end kernels
ELSEIF (.NOT. gamma_only ) THEN
!$acc kernels
becpk = becp%k
!$acc end kernels
ENDIF
!$acc end data
!
DO ipol = 1, 3
!
#if defined(_OPENACC)
!$acc parallel loop collapse(2)
!$acc parallel loop collapse(2) present(vkb, g, igk_k)
#else
!$omp parallel do collapse(2) private(ig)
#endif
@ -123,50 +84,96 @@ SUBROUTINE force_us( forcenl )
vkb1(ig,jkb) = vkb(ig,jkb) * (0.D0,-1.D0) * g(ipol,igk_k(ig,ik))
ENDDO
ENDDO
!$acc data present (evc, becd)
CALL calbec( offload_type, npw, vkb1, evc, becd )
!$acc end data
!
CALL calbec( offload_type, npw, vkb1, evc, dbecp )
IF (noncolin) THEN
!$acc kernels
dbecpnc = dbecp%nc
!$acc end kernels
ELSEIF (.NOT. gamma_only ) THEN
!$acc kernels
dbecpk = dbecp%k
!$acc end kernels
ENDIF
! becp = <beta|psi>, becd = <dbeta/dG_ipol|psi>
! Now sum over bands and over projectors belonging to each atom
!
!$acc data copyin(nt_list,na_list,ismulti_np)
IF ( gamma_only ) THEN
!
CALL force_us_gamma( forcenl )
!
ELSE
!
CALL force_us_k( forcenl )
!
ENDIF
! ... NOTE: calls to calbec are parallelized over the bgrp group
! ... The rest of the calculation is parallelized by subdividing
! ... the atoms over the bgrp group
!
CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
!
IF ( mykey /= 0 ) CYCLE
!
!$acc data present(becp, becd, deeq, qq_at, deeq_nc, qq_so, et) copyin(wg)
DO na = na_s, na_e
fnl = 0.0_dp
nt = ityp(na)
nhnt = nh(nt)
ijkb0 = ofsbeta(na)
IF ( gamma_only ) THEN
!$acc parallel loop collapse(3) present(becp%r,becd%r) reduction(+:fnl)
DO ibnd = 1, nbnd
DO ih = 1, nhnt
DO jh = 1, nhnt
ikb = ijkb0 + ih
jkb = ijkb0 + jh
deff = deeq(ih,jh,na,current_spin) - &
et(ibnd,ik) * qq_at(ih,jh,na)
fnl = fnl + wg(ibnd,ik) * deff * &
becd%r(ikb,ibnd) * becp%r(jkb,ibnd)
END DO
END DO
END DO
ELSE IF ( .NOT. noncolin ) THEN
!$acc parallel loop collapse(3) present(becp%k,becd%k) reduction(+:fnl)
DO ibnd = 1, nbnd
DO ih = 1, nhnt
DO jh = 1, nhnt
ikb = ijkb0 + ih
jkb = ijkb0 + jh
deff = deeq(ih,jh,na,current_spin) - et(ibnd,ik) * qq_at(ih,jh,na)
fnl = fnl + wg(ibnd,ik) * deff * &
DBLE(CONJG(becp%k(ikb,ibnd)) * becd%k(jkb,ibnd) )
END DO
END DO
END DO
ELSE
!$acc parallel loop collapse(3) present(becp%nc,becd%nc) reduction(+:fnl)
DO ibnd = 1, nbnd
DO ih = 1, nhnt
DO jh = 1, nhnt
ikb = ijkb0 + ih
jkb = ijkb0 + jh
!$acc loop seq collapse(2)
DO is = 1, npol
DO js = 1, npol
ijs = (is-1)*npol + js
deff_nc = deeq_nc(ih,jh,na,ijs)
IF ( lspinorb ) THEN
deff_nc = deff_nc - et(ibnd,ik) * qq_so(ih,jh,ijs,nt)
ELSE IF ( is == js ) THEN
deff_nc = deff_nc - et(ibnd,ik) * qq_at(ih,jh,na)
END IF
fnl = fnl + wg(ibnd,ik) * DBLE ( &
deff_nc * CONJG(becp%nc(ikb,is,ibnd)) * &
becd%nc(jkb,js,ibnd) )
END DO
END DO
END DO
END DO
END DO
END IF
! factor 2 from Ry a.u. (e^2=2)? tpiba from k+G, minus sign
forcenl(ipol,na) = forcenl(ipol,na) - 2.0_dp * tpiba* fnl
END DO
!$acc end data
!
ENDDO
ENDDO
!
! ... if sums over bands are parallelized over the band group
!
IF ( becp%comm /= mp_get_comm_null() ) CALL mp_sum( forcenl, becp%comm )
!
!$acc end data
DEALLOCATE( vkb1 )
IF ( noncolin ) THEN
DEALLOCATE( deff_nc )
ELSEIF ( .NOT. gamma_only ) THEN
DEALLOCATE( deff )
ENDIF
!
CALL deallocate_bec_type_acc( dbecp )
CALL deallocate_bec_type_acc( becd )
CALL deallocate_bec_type_acc( becp )
!
! ... collect contributions across pools from all k-points
! ... collect contributions across processors and pools from all k-points
!
CALL mp_sum( forcenl, intra_bgrp_comm )
CALL mp_sum( forcenl, inter_pool_comm )
!
! ... The total D matrix depends on the ionic position via the
@ -180,270 +187,6 @@ SUBROUTINE force_us( forcenl )
!
CALL symvector( nat, forcenl )
!
DEALLOCATE( nt_list, na_list, ismulti_np )
!
IF ( noncolin ) THEN
DEALLOCATE( becpnc, dbecpnc )
ELSEIF (.NOT. gamma_only ) THEN
DEALLOCATE( becpk, dbecpk )
ENDIF
!
RETURN
!
CONTAINS
!
!-----------------------------------------------------------------------
SUBROUTINE force_us_gamma( forcenl )
!-----------------------------------------------------------------------
!! Nonlocal contribution to the force. Calculation at Gamma.
!
! Important notice about parallelization over the band group of processors:
! 1) internally, "calbec" parallelises on plane waves over the band group
! 2) the results of "calbec" are distributed across processors of the band
! group: the band index of becp, dbecp is distributed
! 3) the band group is subsequently used to parallelize over bands
!
USE uspp, ONLY : qq_at, deeq
!
IMPLICIT NONE
!
REAL(DP) :: forcenl(3,nat)
!! the nonlocal contribution
!
! ... local variables
!
REAL(DP), ALLOCATABLE :: aux(:,:)
REAL(DP) :: forcenl_ipol
INTEGER :: nt, na, ibnd, ibnd_loc, ih, jh, ijkb0
INTEGER :: nh_nt, becp_ibnd_begin, becp_nbnd_loc, nbnd_siz
REAL(DP), ALLOCATABLE :: dbecprd(:,:), becprd(:,:)
!$acc declare device_resident(dbecprd, becprd)
!
nbnd_siz = nbnd / becp%nproc
ALLOCATE( becprd(nkb,nbnd_siz), dbecprd(nkb,nbnd_siz) )
!
!$acc kernels
dbecprd = dbecp%r
becprd = becp%r
!$acc end kernels
becp_nbnd_loc = becp%nbnd_loc
becp_ibnd_begin = becp%ibnd_begin
!
!$acc data copyin( et, wg )
!
DO nt = 1, ntyp
!
IF ( nh(nt) == 0 ) CYCLE
!
ALLOCATE( aux(nh(nt), becp%nbnd_loc) )
!$acc data create(aux)
!
nh_nt = nh(nt)
!
DO na = 1, nat
IF ( ityp(na) == nt ) THEN
ijkb0 = ofsbeta(na)
! ... this is \sum_j q_{ij} <beta_j|psi>
!
!$acc host_data use_device(aux, qq_at, becprd)
CALL MYDGEMM( 'N','N', nh(nt), becp_nbnd_loc, nh(nt), &
1.0_DP, qq_at(1,1,na), nhm, becprd(ijkb0+1,1), &
nkb, 0.0_DP, aux, nh(nt) )
!$acc end host_data
!
! ... multiply by -\epsilon_n
!
#if defined(_OPENACC)
!$acc parallel loop collapse(2)
#else
!$omp parallel do default(shared) private(ibnd_loc,ibnd,ih)
#endif
DO ih = 1, nh_nt
DO ibnd_loc = 1, becp_nbnd_loc
ibnd = ibnd_loc + becp_ibnd_begin - 1
aux(ih,ibnd_loc) = - et(ibnd,ik) * aux(ih,ibnd_loc)
ENDDO
ENDDO
#if !defined(_OPENACC)
!$omp end parallel do
#endif
!
! ... add \sum_j d_{ij} <beta_j|psi>
!
!$acc host_data use_device(aux, deeq, becprd)
CALL MYDGEMM( 'N','N', nh(nt), becp_nbnd_loc, nh(nt), &
1.0_DP, deeq(1,1,na,current_spin), nhm, &
becprd(ijkb0+1,1), nkb, 1.0_DP, aux, nh(nt) )
!$acc end host_data
!
! ... Auxiliary variable to perform the reduction with gpu kernels
forcenl_ipol = 0.0_DP
#if defined(_OPENACC)
!$acc parallel loop collapse(2) reduction(+:forcenl_ipol)
#else
!$omp parallel do default(shared) private(ibnd_loc,ibnd,ih) reduction(+:forcenl_ipol)
#endif
DO ih = 1, nh_nt
DO ibnd_loc = 1, becp_nbnd_loc
ibnd = ibnd_loc + becp_ibnd_begin - 1
forcenl_ipol = forcenl_ipol - 2.0_DP*tpiba * aux(ih,ibnd_loc) *&
dbecprd(ijkb0+ih,ibnd_loc) * wg(ibnd,ik)
ENDDO
ENDDO
#if !defined(_OPENACC)
!$omp end parallel do
#endif
!
forcenl(ipol,na) = forcenl(ipol,na) + forcenl_ipol
!
ENDIF
ENDDO
!
!$acc end data
DEALLOCATE( aux )
!
ENDDO
!
!$acc end data
!
DEALLOCATE( becprd, dbecprd )
!
END SUBROUTINE force_us_gamma
!
!
!-----------------------------------------------------------------------
SUBROUTINE force_us_k( forcenl )
!-----------------------------------------------------------------------
!! Nonlocal contribution to the force. Calculation for k-points.
!
IMPLICIT NONE
!
REAL(DP) :: forcenl(3,nat)
!! the nonlocal contribution
!
! ... local variables
!
REAL(DP) :: fac
REAL(DP) :: forcenl_p1, forcenl_p2
INTEGER :: ibnd, ih, jh, na, nt, ikb, jkb, ijkb0, is, js, ijs !counters
INTEGER :: nh_nt, it
!
!$acc data copy(forcenl)
!
DO ibnd = 1, nbnd
!
IF (noncolin) THEN
CALL compute_deff_nc( deff_nc, et(ibnd,ik) )
ELSE
CALL compute_deff( deff, et(ibnd,ik) )
ENDIF
!
fac = wg(ibnd,ik)*tpiba
!
#if defined(_OPENACC)
!$acc parallel loop gang reduction(+:forcenl_p2)
#else
!$omp parallel do private(nt,na,ijkb0,nh_nt,forcenl_p1,forcenl_p2,ih,&
!$omp ikb,is,js,ijs,jkb)
#endif
DO it = 1, nat
!
nt = nt_list(it)
na = na_list(it)
ijkb0 = ofsbeta(na)
nh_nt = nh(nt)
!
forcenl_p2 = 0.d0
!$acc loop vector reduction(+:forcenl_p2)
DO ih = 1, nh_nt
!
ikb = ijkb0 + ih
IF (noncolin) THEN
forcenl_p1 = 0.d0
!$acc loop seq collapse(2) reduction(+:forcenl_p1)
DO is = 1, npol
DO js = 1, npol
ijs = (is-1)*npol+js
forcenl_p1 = forcenl_p1 - &
deff_nc(ih,ih,na,ijs)*fac*( &
CONJG(dbecpnc(ikb,is,ibnd))* &
becpnc(ikb,js,ibnd)+ &
CONJG(becpnc(ikb,is,ibnd))* &
dbecpnc(ikb,js,ibnd) )
ENDDO
ENDDO
ELSE
forcenl_p1 = -2.D0 * fac * deff(ih,ih,na) * &
DBLE( CONJG( dbecpk(ikb,ibnd) ) * &
becpk(ikb,ibnd) )
ENDIF
!
forcenl_p2 = forcenl_p2 + forcenl_p1
!
ENDDO
!
forcenl(ipol,na) = forcenl(ipol,na) + forcenl_p2
!
IF ( ismulti_np(it) ) THEN
!
forcenl_p2 = 0.d0
!$acc loop vector reduction(+:forcenl_p2)
DO ih = 1, nh_nt
ikb = ijkb0 + ih
!
! ... in US case there is a contribution for jh<>ih.
! ... We use here the symmetry in the interchange
! ... of ih and jh
!
forcenl_p1 = 0.d0
!$acc loop seq
DO jh = ih+1, nh_nt
jkb = ijkb0 + jh
IF (noncolin) THEN
!$acc loop seq collapse(2) reduction(+:forcenl_p1)
DO is = 1, npol
DO js = 1, npol
ijs = (is-1)*npol+js
forcenl_p1 = forcenl_p1 - &
deff_nc(ih,jh,na,ijs)*fac*( &
CONJG(dbecpnc(ikb,is,ibnd))* &
becpnc(jkb,js,ibnd) + &
CONJG(becpnc(ikb,is,ibnd))* &
dbecpnc(jkb,js,ibnd)) - &
deff_nc(jh,ih,na,ijs)*fac*( &
CONJG(dbecpnc(jkb,is,ibnd))* &
becpnc(ikb,js,ibnd) + &
CONJG(becpnc(jkb,is,ibnd))* &
dbecpnc(ikb,js,ibnd) )
ENDDO
ENDDO
ELSE
forcenl_p1 = forcenl_p1 - &
2.D0 * fac * deff(ih,jh,na) * &
DBLE( CONJG( dbecpk(ikb,ibnd) ) * &
becpk(jkb,ibnd) + dbecpk(jkb,ibnd) &
* CONJG( becpk(ikb,ibnd) ) )
ENDIF
ENDDO !jh
!
forcenl_p2 = forcenl_p2 + forcenl_p1
!
ENDDO !ih
!
forcenl(ipol,na) = forcenl(ipol,na) + forcenl_p2
!
ENDIF ! tvanp
!
ENDDO ! it=nt|na
#if !defined(_OPENACC)
!$omp end parallel do
#endif
!
ENDDO ! nbnd
!
!$acc end data
!
END SUBROUTINE force_us_k
!
!
END SUBROUTINE force_us

View File

@ -222,7 +222,7 @@ SUBROUTINE newd_gpu( )
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp
USE lsda_mod, ONLY : nspin
USE uspp, ONLY : okvan, deeq, deeq_nc, dvan_d, dvan_so_d
USE uspp, ONLY : okvan, deeq, deeq_nc, dvan, dvan_so
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb
USE uspp, ONLY : nhtol, nhtolm
@ -241,7 +241,6 @@ SUBROUTINE newd_gpu( )
INTEGER :: ig, nt, ih, jh, na, is, nht, nb, mb, ierr
! counters on g vectors, atom type, beta functions x 2,
! atoms, spin, aux, aux, beta func x2 (again)
REAL(kind=dp), allocatable :: deeq_h( :,:,:,: )
INTEGER, POINTER :: ityp_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: ityp_d
@ -263,7 +262,7 @@ SUBROUTINE newd_gpu( )
DO na = 1, nat
DO jh = 1, nht
DO ih = 1, nht
IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so(ih,jh,is,nt)
END DO
END DO
END DO
@ -276,10 +275,10 @@ SUBROUTINE newd_gpu( )
DO jh = 1, nht
DO ih = 1, nht
IF ( ityp_d(na) == nt ) THEN
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt)
deeq_nc(ih,jh,na,1) = dvan(ih,jh,nt)
deeq_nc(ih,jh,na,2) = ( 0.D0, 0.D0 )
deeq_nc(ih,jh,na,3) = ( 0.D0, 0.D0 )
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt)
deeq_nc(ih,jh,na,4) = dvan(ih,jh,nt)
END IF
END DO
END DO
@ -294,7 +293,7 @@ SUBROUTINE newd_gpu( )
DO jh = 1, nht
DO ih = 1, nht
!
IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan_d(ih,jh,nt)
IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan(ih,jh,nt)
!
END DO
END DO
@ -323,7 +322,6 @@ SUBROUTINE newd_gpu( )
END IF
!
CALL start_clock_gpu( 'newd' )
allocate(deeq_h( nhm, nhm, nat, nspin ))
!
! move atom type info to GPU
CALL buffer%lock_buffer(ityp_d, nat, ierr)
@ -369,7 +367,7 @@ SUBROUTINE newd_gpu( )
DO ih = 1, nht
DO jh = 1, nht
IF ( ityp_d(na) == nt ) THEN
deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan_d(ih,jh,nt)
deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan(ih,jh,nt)
END IF
END DO
END DO
@ -437,7 +435,7 @@ SUBROUTINE newd_gpu( )
!
IF ( ityp_d(na) == nt ) THEN
!
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
deeq_nc(ih,jh,na,ijs) = dvan_so(ih,jh,ijs,nt)
!
DO kh = 1, nhnt
!
@ -479,7 +477,7 @@ SUBROUTINE newd_gpu( )
!
IF ( ityp_d(na) == nt ) THEN
!
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
deeq_nc(ih,jh,na,ijs) = dvan_so(ih,jh,ijs,nt)
!
DO kh = 1, nhnt
!
@ -533,17 +531,17 @@ SUBROUTINE newd_gpu( )
IF ( ityp_d(na) == nt ) THEN
!
IF (lspinorb) THEN
deeq_nc(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
deeq_nc(ih,jh,na,1) = dvan_so(ih,jh,1,nt) + &
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
!
deeq_nc(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
deeq_nc(ih,jh,na,4) = dvan_so(ih,jh,4,nt) + &
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
!
ELSE
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
deeq_nc(ih,jh,na,1) = dvan(ih,jh,nt) + &
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
!
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
deeq_nc(ih,jh,na,4) = dvan(ih,jh,nt) + &
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
!
END IF

View File

@ -28,7 +28,7 @@ subroutine init_us_1( nat, ityp, omega, qmax, intra_bgrp_comm )
USE upf_const, ONLY : fpi, sqrt2
USE uspp, ONLY : nhtol, nhtoj, nhtolm, ijtoh, dvan, qq_at, qq_nt, indv, &
ap, aainit, qq_so, dvan_so, okvan, ofsbeta, &
qq_nt_d, dvan_d, dvan_so_d
qq_nt_d
USE uspp_param, ONLY : upf, lmaxq, nh, nhm, lmaxkb, nsp
USE upf_spinorb, ONLY : is_spinorbit, rot_ylm, fcoef, lmaxx, &
transform_qq_so
@ -256,14 +256,14 @@ subroutine init_us_1( nat, ityp, omega, qmax, intra_bgrp_comm )
!$acc update device(nhtol)
!$acc update device(nhtoj)
!$acc update device(ijtoh)
qq_nt_d=qq_nt
!$acc update device(qq_at)
qq_nt_d=qq_nt
if (is_spinorbit) then
dvan_so_d=dvan_so
!$acc update device(fcoef)
!$acc update device(qq_so)
else
dvan_d=dvan
!$acc update device(dvan_so)
!$acc update device(fcoef)
!$acc update device(qq_so)
else
!$acc update device(dvan)
endif
endif
!

View File

@ -35,10 +35,9 @@ MODULE uspp
PUBLIC :: nlx, lpx, lpl, ap, aainit, indv, nhtol, nhtolm, ofsbeta, &
nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, &
becsum, ebecsum
PUBLIC :: dvan_d, qq_nt_d
PUBLIC :: qq_nt_d
PUBLIC :: okvan, nlcc_any
PUBLIC :: qq_so, dvan_so, deeq_nc, fcoef
PUBLIC :: dvan_so_d
PUBLIC :: dbeta
!
PUBLIC :: allocate_uspp, deallocate_uspp
@ -92,11 +91,9 @@ MODULE uspp
!
! GPU vars
!
REAL(DP), ALLOCATABLE :: dvan_d(:,:,:)
REAL(DP), ALLOCATABLE :: qq_nt_d(:,:,:)
COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:)
#if defined(__CUDA)
attributes (DEVICE) :: dvan_d, qq_nt_d, dvan_so_d
attributes (DEVICE) :: qq_nt_d
#endif
!
@ -343,10 +340,12 @@ CONTAINS
allocate( qq_so(nhm,nhm,4,nsp) )
!$acc enter data create(qq_so)
allocate( dvan_so(nhm,nhm,nspin,nsp) )
!$acc enter data create(dvan_so)
allocate( fcoef(nhm,nhm,2,2,nsp) )
!$acc enter data create(fcoef)
else
allocate( dvan(nhm,nhm,nsp) )
!$acc enter data create(dvan)
endif
allocate(becsum( nhm*(nhm+1)/2, nat, nspin))
!$acc enter data create(becsum)
@ -360,15 +359,7 @@ CONTAINS
!
if (use_gpu) then
!
if (nhm>0) then
allocate( qq_nt_d(nhm,nhm,nsp) )
if ( lspinorb ) then
allocate( dvan_so_d(nhm,nhm,nspin,nsp) )
else
allocate( dvan_d(nhm,nhm,nsp) )
endif
!
endif
if (nhm>0) allocate( qq_nt_d(nhm,nhm,nsp) )
!
endif
!
@ -412,7 +403,10 @@ CONTAINS
DEALLOCATE( qq_at )
ENDIF
IF( ALLOCATED( qq_nt ) ) DEALLOCATE( qq_nt )
IF( ALLOCATED( dvan ) ) DEALLOCATE( dvan )
IF( ALLOCATED( dvan ) ) THEN
!$acc exit data delete( dvan )
DEALLOCATE( dvan )
END IF
IF( ALLOCATED( deeq ) ) THEN
!$acc exit data delete( deeq )
DEALLOCATE( deeq )
@ -421,7 +415,10 @@ CONTAINS
!$acc exit data delete( qq_so )
DEALLOCATE( qq_so )
ENDIF
IF( ALLOCATED( dvan_so ) ) DEALLOCATE( dvan_so )
IF( ALLOCATED( dvan_so ) ) THEN
!$acc exit data delete( dvan_so )
DEALLOCATE( dvan_so )
END IF
IF( ALLOCATED( deeq_nc ) ) THEN
!$acc exit data delete( deeq_nc )
DEALLOCATE( deeq_nc )
@ -434,9 +431,7 @@ CONTAINS
IF( ALLOCATED( dbeta ) ) DEALLOCATE( dbeta )
!
! GPU variables
IF( ALLOCATED( dvan_d ) ) DEALLOCATE( dvan_d )
IF( ALLOCATED( qq_nt_d ) ) DEALLOCATE( qq_nt_d )
IF( ALLOCATED( dvan_so_d ) ) DEALLOCATE( dvan_so_d )
!
END SUBROUTINE deallocate_uspp
!