mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'more_cuda_cleanup' into 'develop'
More cuda cleanup See merge request QEF/q-e!2410
This commit is contained in:
commit
a59e791975
|
@ -1,5 +1,5 @@
|
|||
!
|
||||
! Copyright (C) 2001-2015 Quantum ESPRESSO group
|
||||
! Copyright (C) 2001-2024 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
|
@ -23,15 +23,15 @@ SUBROUTINE force_us( forcenl )
|
|||
USE lsda_mod, ONLY : lsda, current_spin, isk, nspin
|
||||
USE symme, ONLY : symvector
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE noncollin_module, ONLY : npol, noncolin
|
||||
USE noncollin_module, ONLY : npol, noncolin, lspinorb
|
||||
USE io_files, ONLY : iunwfc, nwordwfc
|
||||
USE buffers, ONLY : get_buffer
|
||||
USE becmod, ONLY : calbec, becp, bec_type, &
|
||||
allocate_bec_type, deallocate_bec_type, &
|
||||
allocate_bec_type_acc, deallocate_bec_type_acc
|
||||
USE mp_pools, ONLY : inter_pool_comm
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum, mp_get_comm_null
|
||||
USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
|
||||
USE mp, ONLY : mp_sum
|
||||
USE uspp_init, ONLY : init_us_2
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -42,52 +42,20 @@ SUBROUTINE force_us( forcenl )
|
|||
! ... local variables
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: vkb1(:,:) ! contains g*|beta>
|
||||
!$acc declare device_resident(vkb1)
|
||||
COMPLEX(DP), ALLOCATABLE :: deff_nc(:,:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: deff(:,:,:)
|
||||
TYPE(bec_type) :: dbecp ! contains <dbeta|psi>
|
||||
INTEGER :: npw, ik, ipol, ig, jkb
|
||||
INTEGER :: itot, nt, na
|
||||
INTEGER, ALLOCATABLE :: nt_list(:), na_list(:)
|
||||
LOGICAL, ALLOCATABLE :: ismulti_np(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: becpnc(:,:,:), becpk(:,:), &
|
||||
dbecpnc(:,:,:), dbecpk(:,:)
|
||||
!$acc declare device_resident(becpnc,becpk,dbecpnc,dbecpk)
|
||||
!civn: these buffers are kept here instead of inside force_us_k to
|
||||
! save allocation/deallocation overhead inside the ik,ipol loops
|
||||
TYPE(bec_type) :: becd ! contains <dbeta|psi>
|
||||
COMPLEX(DP) :: deff_nc
|
||||
REAL(DP) :: deff, fnl
|
||||
INTEGER :: npw, ik, ipol, ig, na, na_s, na_e, mykey
|
||||
INTEGER :: nt, ibnd, nhnt, ih, jh, ijkb0, ikb, jkb, is, js, ijs
|
||||
!
|
||||
forcenl(:,:) = 0.D0
|
||||
!
|
||||
CALL allocate_bec_type_acc( nkb, nbnd, becp, intra_bgrp_comm )
|
||||
CALL allocate_bec_type_acc( nkb, nbnd, dbecp, intra_bgrp_comm )
|
||||
!
|
||||
CALL allocate_bec_type_acc( nkb, nbnd, becd, intra_bgrp_comm )
|
||||
ALLOCATE( vkb1(npwx,nkb) )
|
||||
IF (noncolin) THEN
|
||||
ALLOCATE( becpnc(nkb,npol,nbnd), dbecpnc(nkb,npol,nbnd) )
|
||||
ALLOCATE( deff_nc(nhm,nhm,nat,nspin) )
|
||||
ELSEIF (.NOT. gamma_only ) THEN
|
||||
ALLOCATE( becpk(nkb,nbnd), dbecpk(nkb,nbnd) )
|
||||
ALLOCATE( deff(nhm,nhm,nat) )
|
||||
ENDIF
|
||||
!$acc data create(vkb1)
|
||||
!
|
||||
!$acc data create(deff,deff_nc)
|
||||
!
|
||||
ALLOCATE( nt_list(nat), na_list(nat), ismulti_np(nat) )
|
||||
!
|
||||
itot = 0
|
||||
DO nt = 1, ntyp
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na)==nt ) THEN
|
||||
itot = itot + 1
|
||||
nt_list(itot) = nt
|
||||
na_list(itot) = na
|
||||
ismulti_np(itot) = upf(nt)%tvanp .OR. upf(nt)%is_multiproj
|
||||
ENDIF
|
||||
ENDDO
|
||||
ENDDO
|
||||
IF (itot /= nat) CALL errore( 'force_us', 'Something wrong in atoms counting', 1 )
|
||||
!
|
||||
! ... the forces are a sum over the K points and over the bands
|
||||
! ... the forces are summed over K-points
|
||||
!
|
||||
DO ik = 1, nks
|
||||
!
|
||||
|
@ -96,25 +64,18 @@ SUBROUTINE force_us( forcenl )
|
|||
!
|
||||
IF ( nks > 1 ) THEN
|
||||
CALL get_buffer( evc, nwordwfc, iunwfc, ik )
|
||||
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
|
||||
!$acc update device( evc )
|
||||
ENDIF
|
||||
!
|
||||
!$acc update device( evc )
|
||||
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
|
||||
!$acc data present (evc, vkb, becp)
|
||||
CALL calbec( offload_type, npw, vkb, evc, becp )
|
||||
IF (noncolin) THEN
|
||||
!$acc kernels
|
||||
becpnc = becp%nc
|
||||
!$acc end kernels
|
||||
ELSEIF (.NOT. gamma_only ) THEN
|
||||
!$acc kernels
|
||||
becpk = becp%k
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!$acc end data
|
||||
!
|
||||
DO ipol = 1, 3
|
||||
!
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop collapse(2)
|
||||
!$acc parallel loop collapse(2) present(vkb, g, igk_k)
|
||||
#else
|
||||
!$omp parallel do collapse(2) private(ig)
|
||||
#endif
|
||||
|
@ -123,50 +84,96 @@ SUBROUTINE force_us( forcenl )
|
|||
vkb1(ig,jkb) = vkb(ig,jkb) * (0.D0,-1.D0) * g(ipol,igk_k(ig,ik))
|
||||
ENDDO
|
||||
ENDDO
|
||||
!$acc data present (evc, becd)
|
||||
CALL calbec( offload_type, npw, vkb1, evc, becd )
|
||||
!$acc end data
|
||||
!
|
||||
CALL calbec( offload_type, npw, vkb1, evc, dbecp )
|
||||
IF (noncolin) THEN
|
||||
!$acc kernels
|
||||
dbecpnc = dbecp%nc
|
||||
!$acc end kernels
|
||||
ELSEIF (.NOT. gamma_only ) THEN
|
||||
!$acc kernels
|
||||
dbecpk = dbecp%k
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
! becp = <beta|psi>, becd = <dbeta/dG_ipol|psi>
|
||||
! Now sum over bands and over projectors belonging to each atom
|
||||
!
|
||||
!$acc data copyin(nt_list,na_list,ismulti_np)
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
CALL force_us_gamma( forcenl )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL force_us_k( forcenl )
|
||||
!
|
||||
ENDIF
|
||||
! ... NOTE: calls to calbec are parallelized over the bgrp group
|
||||
! ... The rest of the calculation is parallelized by subdividing
|
||||
! ... the atoms over the bgrp group
|
||||
!
|
||||
CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
|
||||
!
|
||||
IF ( mykey /= 0 ) CYCLE
|
||||
!
|
||||
!$acc data present(becp, becd, deeq, qq_at, deeq_nc, qq_so, et) copyin(wg)
|
||||
DO na = na_s, na_e
|
||||
fnl = 0.0_dp
|
||||
nt = ityp(na)
|
||||
nhnt = nh(nt)
|
||||
ijkb0 = ofsbeta(na)
|
||||
IF ( gamma_only ) THEN
|
||||
!$acc parallel loop collapse(3) present(becp%r,becd%r) reduction(+:fnl)
|
||||
DO ibnd = 1, nbnd
|
||||
DO ih = 1, nhnt
|
||||
DO jh = 1, nhnt
|
||||
ikb = ijkb0 + ih
|
||||
jkb = ijkb0 + jh
|
||||
deff = deeq(ih,jh,na,current_spin) - &
|
||||
et(ibnd,ik) * qq_at(ih,jh,na)
|
||||
fnl = fnl + wg(ibnd,ik) * deff * &
|
||||
becd%r(ikb,ibnd) * becp%r(jkb,ibnd)
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
ELSE IF ( .NOT. noncolin ) THEN
|
||||
!$acc parallel loop collapse(3) present(becp%k,becd%k) reduction(+:fnl)
|
||||
DO ibnd = 1, nbnd
|
||||
DO ih = 1, nhnt
|
||||
DO jh = 1, nhnt
|
||||
ikb = ijkb0 + ih
|
||||
jkb = ijkb0 + jh
|
||||
deff = deeq(ih,jh,na,current_spin) - et(ibnd,ik) * qq_at(ih,jh,na)
|
||||
fnl = fnl + wg(ibnd,ik) * deff * &
|
||||
DBLE(CONJG(becp%k(ikb,ibnd)) * becd%k(jkb,ibnd) )
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
ELSE
|
||||
!$acc parallel loop collapse(3) present(becp%nc,becd%nc) reduction(+:fnl)
|
||||
DO ibnd = 1, nbnd
|
||||
DO ih = 1, nhnt
|
||||
DO jh = 1, nhnt
|
||||
ikb = ijkb0 + ih
|
||||
jkb = ijkb0 + jh
|
||||
!$acc loop seq collapse(2)
|
||||
DO is = 1, npol
|
||||
DO js = 1, npol
|
||||
ijs = (is-1)*npol + js
|
||||
deff_nc = deeq_nc(ih,jh,na,ijs)
|
||||
IF ( lspinorb ) THEN
|
||||
deff_nc = deff_nc - et(ibnd,ik) * qq_so(ih,jh,ijs,nt)
|
||||
ELSE IF ( is == js ) THEN
|
||||
deff_nc = deff_nc - et(ibnd,ik) * qq_at(ih,jh,na)
|
||||
END IF
|
||||
fnl = fnl + wg(ibnd,ik) * DBLE ( &
|
||||
deff_nc * CONJG(becp%nc(ikb,is,ibnd)) * &
|
||||
becd%nc(jkb,js,ibnd) )
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
! factor 2 from Ry a.u. (e^2=2)? tpiba from k+G, minus sign
|
||||
forcenl(ipol,na) = forcenl(ipol,na) - 2.0_dp * tpiba* fnl
|
||||
END DO
|
||||
!$acc end data
|
||||
!
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
! ... if sums over bands are parallelized over the band group
|
||||
!
|
||||
IF ( becp%comm /= mp_get_comm_null() ) CALL mp_sum( forcenl, becp%comm )
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( vkb1 )
|
||||
IF ( noncolin ) THEN
|
||||
DEALLOCATE( deff_nc )
|
||||
ELSEIF ( .NOT. gamma_only ) THEN
|
||||
DEALLOCATE( deff )
|
||||
ENDIF
|
||||
!
|
||||
CALL deallocate_bec_type_acc( dbecp )
|
||||
CALL deallocate_bec_type_acc( becd )
|
||||
CALL deallocate_bec_type_acc( becp )
|
||||
!
|
||||
! ... collect contributions across pools from all k-points
|
||||
! ... collect contributions across processors and pools from all k-points
|
||||
!
|
||||
CALL mp_sum( forcenl, intra_bgrp_comm )
|
||||
CALL mp_sum( forcenl, inter_pool_comm )
|
||||
!
|
||||
! ... The total D matrix depends on the ionic position via the
|
||||
|
@ -180,270 +187,6 @@ SUBROUTINE force_us( forcenl )
|
|||
!
|
||||
CALL symvector( nat, forcenl )
|
||||
!
|
||||
DEALLOCATE( nt_list, na_list, ismulti_np )
|
||||
!
|
||||
IF ( noncolin ) THEN
|
||||
DEALLOCATE( becpnc, dbecpnc )
|
||||
ELSEIF (.NOT. gamma_only ) THEN
|
||||
DEALLOCATE( becpk, dbecpk )
|
||||
ENDIF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE force_us_gamma( forcenl )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Nonlocal contribution to the force. Calculation at Gamma.
|
||||
!
|
||||
! Important notice about parallelization over the band group of processors:
|
||||
! 1) internally, "calbec" parallelises on plane waves over the band group
|
||||
! 2) the results of "calbec" are distributed across processors of the band
|
||||
! group: the band index of becp, dbecp is distributed
|
||||
! 3) the band group is subsequently used to parallelize over bands
|
||||
!
|
||||
USE uspp, ONLY : qq_at, deeq
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP) :: forcenl(3,nat)
|
||||
!! the nonlocal contribution
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: aux(:,:)
|
||||
REAL(DP) :: forcenl_ipol
|
||||
INTEGER :: nt, na, ibnd, ibnd_loc, ih, jh, ijkb0
|
||||
INTEGER :: nh_nt, becp_ibnd_begin, becp_nbnd_loc, nbnd_siz
|
||||
REAL(DP), ALLOCATABLE :: dbecprd(:,:), becprd(:,:)
|
||||
!$acc declare device_resident(dbecprd, becprd)
|
||||
!
|
||||
nbnd_siz = nbnd / becp%nproc
|
||||
ALLOCATE( becprd(nkb,nbnd_siz), dbecprd(nkb,nbnd_siz) )
|
||||
!
|
||||
!$acc kernels
|
||||
dbecprd = dbecp%r
|
||||
becprd = becp%r
|
||||
!$acc end kernels
|
||||
becp_nbnd_loc = becp%nbnd_loc
|
||||
becp_ibnd_begin = becp%ibnd_begin
|
||||
!
|
||||
!$acc data copyin( et, wg )
|
||||
!
|
||||
DO nt = 1, ntyp
|
||||
!
|
||||
IF ( nh(nt) == 0 ) CYCLE
|
||||
!
|
||||
ALLOCATE( aux(nh(nt), becp%nbnd_loc) )
|
||||
!$acc data create(aux)
|
||||
!
|
||||
nh_nt = nh(nt)
|
||||
!
|
||||
DO na = 1, nat
|
||||
IF ( ityp(na) == nt ) THEN
|
||||
ijkb0 = ofsbeta(na)
|
||||
! ... this is \sum_j q_{ij} <beta_j|psi>
|
||||
!
|
||||
!$acc host_data use_device(aux, qq_at, becprd)
|
||||
CALL MYDGEMM( 'N','N', nh(nt), becp_nbnd_loc, nh(nt), &
|
||||
1.0_DP, qq_at(1,1,na), nhm, becprd(ijkb0+1,1), &
|
||||
nkb, 0.0_DP, aux, nh(nt) )
|
||||
!$acc end host_data
|
||||
!
|
||||
! ... multiply by -\epsilon_n
|
||||
!
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop collapse(2)
|
||||
#else
|
||||
!$omp parallel do default(shared) private(ibnd_loc,ibnd,ih)
|
||||
#endif
|
||||
DO ih = 1, nh_nt
|
||||
DO ibnd_loc = 1, becp_nbnd_loc
|
||||
ibnd = ibnd_loc + becp_ibnd_begin - 1
|
||||
aux(ih,ibnd_loc) = - et(ibnd,ik) * aux(ih,ibnd_loc)
|
||||
ENDDO
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel do
|
||||
#endif
|
||||
!
|
||||
! ... add \sum_j d_{ij} <beta_j|psi>
|
||||
!
|
||||
!$acc host_data use_device(aux, deeq, becprd)
|
||||
CALL MYDGEMM( 'N','N', nh(nt), becp_nbnd_loc, nh(nt), &
|
||||
1.0_DP, deeq(1,1,na,current_spin), nhm, &
|
||||
becprd(ijkb0+1,1), nkb, 1.0_DP, aux, nh(nt) )
|
||||
!$acc end host_data
|
||||
!
|
||||
! ... Auxiliary variable to perform the reduction with gpu kernels
|
||||
forcenl_ipol = 0.0_DP
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop collapse(2) reduction(+:forcenl_ipol)
|
||||
#else
|
||||
!$omp parallel do default(shared) private(ibnd_loc,ibnd,ih) reduction(+:forcenl_ipol)
|
||||
#endif
|
||||
DO ih = 1, nh_nt
|
||||
DO ibnd_loc = 1, becp_nbnd_loc
|
||||
ibnd = ibnd_loc + becp_ibnd_begin - 1
|
||||
forcenl_ipol = forcenl_ipol - 2.0_DP*tpiba * aux(ih,ibnd_loc) *&
|
||||
dbecprd(ijkb0+ih,ibnd_loc) * wg(ibnd,ik)
|
||||
ENDDO
|
||||
ENDDO
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel do
|
||||
#endif
|
||||
!
|
||||
forcenl(ipol,na) = forcenl(ipol,na) + forcenl_ipol
|
||||
!
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
DEALLOCATE( aux )
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
DEALLOCATE( becprd, dbecprd )
|
||||
!
|
||||
END SUBROUTINE force_us_gamma
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE force_us_k( forcenl )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Nonlocal contribution to the force. Calculation for k-points.
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP) :: forcenl(3,nat)
|
||||
!! the nonlocal contribution
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
REAL(DP) :: fac
|
||||
REAL(DP) :: forcenl_p1, forcenl_p2
|
||||
INTEGER :: ibnd, ih, jh, na, nt, ikb, jkb, ijkb0, is, js, ijs !counters
|
||||
INTEGER :: nh_nt, it
|
||||
!
|
||||
!$acc data copy(forcenl)
|
||||
!
|
||||
DO ibnd = 1, nbnd
|
||||
!
|
||||
IF (noncolin) THEN
|
||||
CALL compute_deff_nc( deff_nc, et(ibnd,ik) )
|
||||
ELSE
|
||||
CALL compute_deff( deff, et(ibnd,ik) )
|
||||
ENDIF
|
||||
!
|
||||
fac = wg(ibnd,ik)*tpiba
|
||||
!
|
||||
#if defined(_OPENACC)
|
||||
!$acc parallel loop gang reduction(+:forcenl_p2)
|
||||
#else
|
||||
!$omp parallel do private(nt,na,ijkb0,nh_nt,forcenl_p1,forcenl_p2,ih,&
|
||||
!$omp ikb,is,js,ijs,jkb)
|
||||
#endif
|
||||
DO it = 1, nat
|
||||
!
|
||||
nt = nt_list(it)
|
||||
na = na_list(it)
|
||||
ijkb0 = ofsbeta(na)
|
||||
nh_nt = nh(nt)
|
||||
!
|
||||
forcenl_p2 = 0.d0
|
||||
!$acc loop vector reduction(+:forcenl_p2)
|
||||
DO ih = 1, nh_nt
|
||||
!
|
||||
ikb = ijkb0 + ih
|
||||
IF (noncolin) THEN
|
||||
forcenl_p1 = 0.d0
|
||||
!$acc loop seq collapse(2) reduction(+:forcenl_p1)
|
||||
DO is = 1, npol
|
||||
DO js = 1, npol
|
||||
ijs = (is-1)*npol+js
|
||||
forcenl_p1 = forcenl_p1 - &
|
||||
deff_nc(ih,ih,na,ijs)*fac*( &
|
||||
CONJG(dbecpnc(ikb,is,ibnd))* &
|
||||
becpnc(ikb,js,ibnd)+ &
|
||||
CONJG(becpnc(ikb,is,ibnd))* &
|
||||
dbecpnc(ikb,js,ibnd) )
|
||||
ENDDO
|
||||
ENDDO
|
||||
ELSE
|
||||
forcenl_p1 = -2.D0 * fac * deff(ih,ih,na) * &
|
||||
DBLE( CONJG( dbecpk(ikb,ibnd) ) * &
|
||||
becpk(ikb,ibnd) )
|
||||
ENDIF
|
||||
!
|
||||
forcenl_p2 = forcenl_p2 + forcenl_p1
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
forcenl(ipol,na) = forcenl(ipol,na) + forcenl_p2
|
||||
!
|
||||
IF ( ismulti_np(it) ) THEN
|
||||
!
|
||||
forcenl_p2 = 0.d0
|
||||
!$acc loop vector reduction(+:forcenl_p2)
|
||||
DO ih = 1, nh_nt
|
||||
ikb = ijkb0 + ih
|
||||
!
|
||||
! ... in US case there is a contribution for jh<>ih.
|
||||
! ... We use here the symmetry in the interchange
|
||||
! ... of ih and jh
|
||||
!
|
||||
forcenl_p1 = 0.d0
|
||||
!$acc loop seq
|
||||
DO jh = ih+1, nh_nt
|
||||
jkb = ijkb0 + jh
|
||||
IF (noncolin) THEN
|
||||
!$acc loop seq collapse(2) reduction(+:forcenl_p1)
|
||||
DO is = 1, npol
|
||||
DO js = 1, npol
|
||||
ijs = (is-1)*npol+js
|
||||
forcenl_p1 = forcenl_p1 - &
|
||||
deff_nc(ih,jh,na,ijs)*fac*( &
|
||||
CONJG(dbecpnc(ikb,is,ibnd))* &
|
||||
becpnc(jkb,js,ibnd) + &
|
||||
CONJG(becpnc(ikb,is,ibnd))* &
|
||||
dbecpnc(jkb,js,ibnd)) - &
|
||||
deff_nc(jh,ih,na,ijs)*fac*( &
|
||||
CONJG(dbecpnc(jkb,is,ibnd))* &
|
||||
becpnc(ikb,js,ibnd) + &
|
||||
CONJG(becpnc(jkb,is,ibnd))* &
|
||||
dbecpnc(ikb,js,ibnd) )
|
||||
ENDDO
|
||||
ENDDO
|
||||
ELSE
|
||||
forcenl_p1 = forcenl_p1 - &
|
||||
2.D0 * fac * deff(ih,jh,na) * &
|
||||
DBLE( CONJG( dbecpk(ikb,ibnd) ) * &
|
||||
becpk(jkb,ibnd) + dbecpk(jkb,ibnd) &
|
||||
* CONJG( becpk(ikb,ibnd) ) )
|
||||
ENDIF
|
||||
ENDDO !jh
|
||||
!
|
||||
forcenl_p2 = forcenl_p2 + forcenl_p1
|
||||
!
|
||||
ENDDO !ih
|
||||
!
|
||||
forcenl(ipol,na) = forcenl(ipol,na) + forcenl_p2
|
||||
!
|
||||
ENDIF ! tvanp
|
||||
!
|
||||
ENDDO ! it=nt|na
|
||||
#if !defined(_OPENACC)
|
||||
!$omp end parallel do
|
||||
#endif
|
||||
!
|
||||
ENDDO ! nbnd
|
||||
!
|
||||
!$acc end data
|
||||
!
|
||||
END SUBROUTINE force_us_k
|
||||
!
|
||||
!
|
||||
END SUBROUTINE force_us
|
||||
|
|
|
@ -222,7 +222,7 @@ SUBROUTINE newd_gpu( )
|
|||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE uspp, ONLY : okvan, deeq, deeq_nc, dvan_d, dvan_so_d
|
||||
USE uspp, ONLY : okvan, deeq, deeq_nc, dvan, dvan_so
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh, nhm
|
||||
USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb
|
||||
USE uspp, ONLY : nhtol, nhtolm
|
||||
|
@ -241,7 +241,6 @@ SUBROUTINE newd_gpu( )
|
|||
INTEGER :: ig, nt, ih, jh, na, is, nht, nb, mb, ierr
|
||||
! counters on g vectors, atom type, beta functions x 2,
|
||||
! atoms, spin, aux, aux, beta func x2 (again)
|
||||
REAL(kind=dp), allocatable :: deeq_h( :,:,:,: )
|
||||
INTEGER, POINTER :: ityp_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: ityp_d
|
||||
|
@ -263,7 +262,7 @@ SUBROUTINE newd_gpu( )
|
|||
DO na = 1, nat
|
||||
DO jh = 1, nht
|
||||
DO ih = 1, nht
|
||||
IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
|
||||
IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so(ih,jh,is,nt)
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
|
@ -276,10 +275,10 @@ SUBROUTINE newd_gpu( )
|
|||
DO jh = 1, nht
|
||||
DO ih = 1, nht
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt)
|
||||
deeq_nc(ih,jh,na,1) = dvan(ih,jh,nt)
|
||||
deeq_nc(ih,jh,na,2) = ( 0.D0, 0.D0 )
|
||||
deeq_nc(ih,jh,na,3) = ( 0.D0, 0.D0 )
|
||||
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt)
|
||||
deeq_nc(ih,jh,na,4) = dvan(ih,jh,nt)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
|
@ -294,7 +293,7 @@ SUBROUTINE newd_gpu( )
|
|||
DO jh = 1, nht
|
||||
DO ih = 1, nht
|
||||
!
|
||||
IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan_d(ih,jh,nt)
|
||||
IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan(ih,jh,nt)
|
||||
!
|
||||
END DO
|
||||
END DO
|
||||
|
@ -323,7 +322,6 @@ SUBROUTINE newd_gpu( )
|
|||
END IF
|
||||
!
|
||||
CALL start_clock_gpu( 'newd' )
|
||||
allocate(deeq_h( nhm, nhm, nat, nspin ))
|
||||
!
|
||||
! move atom type info to GPU
|
||||
CALL buffer%lock_buffer(ityp_d, nat, ierr)
|
||||
|
@ -369,7 +367,7 @@ SUBROUTINE newd_gpu( )
|
|||
DO ih = 1, nht
|
||||
DO jh = 1, nht
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan_d(ih,jh,nt)
|
||||
deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan(ih,jh,nt)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
|
@ -437,7 +435,7 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
!
|
||||
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
|
||||
deeq_nc(ih,jh,na,ijs) = dvan_so(ih,jh,ijs,nt)
|
||||
!
|
||||
DO kh = 1, nhnt
|
||||
!
|
||||
|
@ -479,7 +477,7 @@ SUBROUTINE newd_gpu( )
|
|||
!
|
||||
IF ( ityp_d(na) == nt ) THEN
|
||||
!
|
||||
deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
|
||||
deeq_nc(ih,jh,na,ijs) = dvan_so(ih,jh,ijs,nt)
|
||||
!
|
||||
DO kh = 1, nhnt
|
||||
!
|
||||
|
@ -533,17 +531,17 @@ SUBROUTINE newd_gpu( )
|
|||
IF ( ityp_d(na) == nt ) THEN
|
||||
!
|
||||
IF (lspinorb) THEN
|
||||
deeq_nc(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
|
||||
deeq_nc(ih,jh,na,1) = dvan_so(ih,jh,1,nt) + &
|
||||
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
|
||||
!
|
||||
deeq_nc(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
|
||||
deeq_nc(ih,jh,na,4) = dvan_so(ih,jh,4,nt) + &
|
||||
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
|
||||
!
|
||||
ELSE
|
||||
deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
|
||||
deeq_nc(ih,jh,na,1) = dvan(ih,jh,nt) + &
|
||||
deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
|
||||
!
|
||||
deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
|
||||
deeq_nc(ih,jh,na,4) = dvan(ih,jh,nt) + &
|
||||
deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
|
||||
!
|
||||
END IF
|
||||
|
|
|
@ -28,7 +28,7 @@ subroutine init_us_1( nat, ityp, omega, qmax, intra_bgrp_comm )
|
|||
USE upf_const, ONLY : fpi, sqrt2
|
||||
USE uspp, ONLY : nhtol, nhtoj, nhtolm, ijtoh, dvan, qq_at, qq_nt, indv, &
|
||||
ap, aainit, qq_so, dvan_so, okvan, ofsbeta, &
|
||||
qq_nt_d, dvan_d, dvan_so_d
|
||||
qq_nt_d
|
||||
USE uspp_param, ONLY : upf, lmaxq, nh, nhm, lmaxkb, nsp
|
||||
USE upf_spinorb, ONLY : is_spinorbit, rot_ylm, fcoef, lmaxx, &
|
||||
transform_qq_so
|
||||
|
@ -256,14 +256,14 @@ subroutine init_us_1( nat, ityp, omega, qmax, intra_bgrp_comm )
|
|||
!$acc update device(nhtol)
|
||||
!$acc update device(nhtoj)
|
||||
!$acc update device(ijtoh)
|
||||
qq_nt_d=qq_nt
|
||||
!$acc update device(qq_at)
|
||||
qq_nt_d=qq_nt
|
||||
if (is_spinorbit) then
|
||||
dvan_so_d=dvan_so
|
||||
!$acc update device(fcoef)
|
||||
!$acc update device(qq_so)
|
||||
else
|
||||
dvan_d=dvan
|
||||
!$acc update device(dvan_so)
|
||||
!$acc update device(fcoef)
|
||||
!$acc update device(qq_so)
|
||||
else
|
||||
!$acc update device(dvan)
|
||||
endif
|
||||
endif
|
||||
!
|
||||
|
|
|
@ -35,10 +35,9 @@ MODULE uspp
|
|||
PUBLIC :: nlx, lpx, lpl, ap, aainit, indv, nhtol, nhtolm, ofsbeta, &
|
||||
nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, &
|
||||
becsum, ebecsum
|
||||
PUBLIC :: dvan_d, qq_nt_d
|
||||
PUBLIC :: qq_nt_d
|
||||
PUBLIC :: okvan, nlcc_any
|
||||
PUBLIC :: qq_so, dvan_so, deeq_nc, fcoef
|
||||
PUBLIC :: dvan_so_d
|
||||
PUBLIC :: dbeta
|
||||
!
|
||||
PUBLIC :: allocate_uspp, deallocate_uspp
|
||||
|
@ -92,11 +91,9 @@ MODULE uspp
|
|||
!
|
||||
! GPU vars
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: dvan_d(:,:,:)
|
||||
REAL(DP), ALLOCATABLE :: qq_nt_d(:,:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes (DEVICE) :: dvan_d, qq_nt_d, dvan_so_d
|
||||
attributes (DEVICE) :: qq_nt_d
|
||||
#endif
|
||||
|
||||
!
|
||||
|
@ -343,10 +340,12 @@ CONTAINS
|
|||
allocate( qq_so(nhm,nhm,4,nsp) )
|
||||
!$acc enter data create(qq_so)
|
||||
allocate( dvan_so(nhm,nhm,nspin,nsp) )
|
||||
!$acc enter data create(dvan_so)
|
||||
allocate( fcoef(nhm,nhm,2,2,nsp) )
|
||||
!$acc enter data create(fcoef)
|
||||
else
|
||||
allocate( dvan(nhm,nhm,nsp) )
|
||||
!$acc enter data create(dvan)
|
||||
endif
|
||||
allocate(becsum( nhm*(nhm+1)/2, nat, nspin))
|
||||
!$acc enter data create(becsum)
|
||||
|
@ -360,15 +359,7 @@ CONTAINS
|
|||
!
|
||||
if (use_gpu) then
|
||||
!
|
||||
if (nhm>0) then
|
||||
allocate( qq_nt_d(nhm,nhm,nsp) )
|
||||
if ( lspinorb ) then
|
||||
allocate( dvan_so_d(nhm,nhm,nspin,nsp) )
|
||||
else
|
||||
allocate( dvan_d(nhm,nhm,nsp) )
|
||||
endif
|
||||
!
|
||||
endif
|
||||
if (nhm>0) allocate( qq_nt_d(nhm,nhm,nsp) )
|
||||
!
|
||||
endif
|
||||
!
|
||||
|
@ -412,7 +403,10 @@ CONTAINS
|
|||
DEALLOCATE( qq_at )
|
||||
ENDIF
|
||||
IF( ALLOCATED( qq_nt ) ) DEALLOCATE( qq_nt )
|
||||
IF( ALLOCATED( dvan ) ) DEALLOCATE( dvan )
|
||||
IF( ALLOCATED( dvan ) ) THEN
|
||||
!$acc exit data delete( dvan )
|
||||
DEALLOCATE( dvan )
|
||||
END IF
|
||||
IF( ALLOCATED( deeq ) ) THEN
|
||||
!$acc exit data delete( deeq )
|
||||
DEALLOCATE( deeq )
|
||||
|
@ -421,7 +415,10 @@ CONTAINS
|
|||
!$acc exit data delete( qq_so )
|
||||
DEALLOCATE( qq_so )
|
||||
ENDIF
|
||||
IF( ALLOCATED( dvan_so ) ) DEALLOCATE( dvan_so )
|
||||
IF( ALLOCATED( dvan_so ) ) THEN
|
||||
!$acc exit data delete( dvan_so )
|
||||
DEALLOCATE( dvan_so )
|
||||
END IF
|
||||
IF( ALLOCATED( deeq_nc ) ) THEN
|
||||
!$acc exit data delete( deeq_nc )
|
||||
DEALLOCATE( deeq_nc )
|
||||
|
@ -434,9 +431,7 @@ CONTAINS
|
|||
IF( ALLOCATED( dbeta ) ) DEALLOCATE( dbeta )
|
||||
!
|
||||
! GPU variables
|
||||
IF( ALLOCATED( dvan_d ) ) DEALLOCATE( dvan_d )
|
||||
IF( ALLOCATED( qq_nt_d ) ) DEALLOCATE( qq_nt_d )
|
||||
IF( ALLOCATED( dvan_so_d ) ) DEALLOCATE( dvan_so_d )
|
||||
!
|
||||
END SUBROUTINE deallocate_uspp
|
||||
!
|
||||
|
|
Loading…
Reference in New Issue