From 78bbaea11f50b152d13ff9bbed67821fc704f308 Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Mon, 25 Jul 2022 17:03:46 +0200 Subject: [PATCH] stress_acc - Ewald --- PW/CMakeLists.txt | 1 - PW/src/Coul_cut_2D.f90 | 153 +++++------------------- PW/src/Makefile | 1 - PW/src/stres_ewa.f90 | 71 ++++++++--- PW/src/stres_ewa_gpu.f90 | 246 --------------------------------------- PW/src/stress.f90 | 11 +- 6 files changed, 86 insertions(+), 397 deletions(-) delete mode 100644 PW/src/stres_ewa_gpu.f90 diff --git a/PW/CMakeLists.txt b/PW/CMakeLists.txt index c638ef9e1..aa123754e 100644 --- a/PW/CMakeLists.txt +++ b/PW/CMakeLists.txt @@ -274,7 +274,6 @@ set(src_pw src/stres_mgga_gpu.f90 src/atomic_wfc_gpu.f90 src/dvloc_of_g_gpu.f90 - src/stres_ewa_gpu.f90 src/stres_knl_gpu.f90 src/compute_deff_gpu.f90 src/add_vhub_to_deeq_gpu.f90 diff --git a/PW/src/Coul_cut_2D.f90 b/PW/src/Coul_cut_2D.f90 index b8f9b711e..7d5cf843a 100644 --- a/PW/src/Coul_cut_2D.f90 +++ b/PW/src/Coul_cut_2D.f90 @@ -759,87 +759,7 @@ SUBROUTINE cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa ) USE kinds USE ions_base, ONLY : nat, zv, tau, ityp USE constants, ONLY : e2, eps8 - USE gvect, ONLY : ngm, g, gg, gstart - USE cell_base, ONLY : tpiba2, alat, omega, tpiba - USE io_global, ONLY : stdout - ! - IMPLICIT NONE - ! - REAL(DP), INTENT(IN) :: alpha - !! tuning param for LR/SR separation - REAL(DP), INTENT(INOUT) :: sigmaewa(3,3) - !! ewald contribution to stress - REAL(DP), INTENT(INOUT) :: sdewald - !! constant and diagonal terms - ! - ! ... local variables - ! - INTEGER :: ng, na, l, m - REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact - COMPLEX(DP) :: rhostar - ! - ! g(1) is a problem if it's G=0, because we divide by G^2. - ! So start at gstart. - ! fact=1.0d0, gamma_only not implemented - ! G=0 componenent of the long-range part of the local part of the - ! pseudopotminus the Hartree potential is set to 0. - ! in other words, sdewald=0. - ! sdewald is the last term in equation B1 of PRB 32 3792. - ! See also similar comment for ewaldg in cutoff_ewald routine - ! - sdewald = 0._DP - DO ng = gstart, ngm - Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba - IF (Gp < eps8) THEN - G2lzo2Gp = 0._DP - beta = 0._DP - ELSE - G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp - beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng) - ENDIF - g2 = gg(ng) * tpiba2 - g2a = g2 / 4._DP / alpha - rhostar = (0._DP,0._DP) - DO na = 1, nat - arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + & - g(3,ng) * tau(3,na) ) * tpi - rhostar = rhostar + zv (ityp(na) ) * CMPLX(COS(arg), SIN(arg), KIND=DP) - ENDDO - rhostar = rhostar / omega - sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2 - ! ... sewald is an other diagonal term that is similar to the diagonal terms - ! in the other stress contributions. It basically gives a term prop to - ! the ewald energy - sdewald = sdewald-sewald - DO l = 1, 3 - IF (l == 3) THEN - fact = (g2a + 1.0d0) - ELSE - fact = (1.0d0+g2a-beta) - ENDIF - ! - DO m = 1, l - sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * & - g(l,ng) * g(m,ng) / g2 * fact - ENDDO - ENDDO - ! - ENDDO - ! - RETURN - ! -END SUBROUTINE cutoff_stres_sigmaewa -! -!---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) - !---------------------------------------------------------------------- - !! This subroutine cuts off the Ewald part of the stress. - !! See Eq. (64) in PRB 96 075448 - ! - USE kinds - USE ions_base, ONLY : nat, zv, tau, ityp - USE constants, ONLY : e2, eps8 - USE gvect, ONLY : ngm, gstart, g_d, gg_d + USE gvect, ONLY : ngm, gstart, g, gg USE cell_base, ONLY : tpiba2, alat, omega, tpiba USE io_global, ONLY : stdout ! @@ -858,72 +778,62 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 COMPLEX(DP) :: rhostar - ! - INTEGER , ALLOCATABLE :: ityp_d(:) - REAL(DP), ALLOCATABLE :: cutoff2D_d(:), tau_d(:,:), zv_d(:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: cutoff2D_d, tau_d, zv_d, ityp_d -#endif ! ntyp = SIZE(zv) - ALLOCATE( cutoff2D_d(ngm), tau_d(3,nat), zv_d(ntyp) ) - ALLOCATE( ityp_d(nat) ) - cutoff2D_d = cutoff_2D - tau_d = tau - zv_d = zv - ityp_d = ityp - ! g(1) is a problem if it's G=0, because we divide by G^2. - ! So start at gstart. - ! fact=1.0d0, gamma_only not implemented - ! G=0 componenent of the long-range part of the local part of the - ! pseudopotminus the Hartree potential is set to 0. - ! in other words, sdewald=0. - ! sdewald is the last term in equation B1 of PRB 32 3792. - ! See also similar comment for ewaldg in cutoff_ewald routine + ! + ! ... g(1) is a problem if it's G=0, because we divide by G^2. + ! So start at gstart. + ! fact=1.0d0, gamma_only not implemented + ! G=0 componenent of the long-range part of the local part of the + ! pseudopotminus the Hartree potential is set to 0. + ! in other words, sdewald=0. + ! sdewald is the last term in equation B1 of PRB 32 3792. + ! See also similar comment for ewaldg in cutoff_ewald routine ! sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP ! sdewald = 0._DP ! - !$cuf kernel do (1) <<<*,*>>> + !$acc parallel loop copyin(g,gg,cutoff_2D,tau,zv,ityp) & + !$acc& reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32, & + !$acc& sigma33) DO ng = gstart, ngm - Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba + Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba IF (Gp < eps8) THEN G2lzo2Gp = 0._DP beta = 0._DP ELSE - G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp - beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng) + G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp + beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng) ENDIF - g2 = gg_d(ng) * tpiba2 + g2 = gg(ng) * tpiba2 g2a = g2 / 4._DP / alpha rhostar = (0._DP,0._DP) DO na = 1, nat - arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + & - g_d(3,ng) * tau_d(3,na) ) * tpi - rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP) + arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + & + g(3,ng) * tau(3,na) ) * tpi + rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP) ENDDO rhostar = rhostar / CMPLX(omega) - sewald = tpi * e2 * EXP(-g2a) / g2* cutoff2D_d(ng) * ABS(rhostar)**2 + sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2 ! ... sewald is an other diagonal term that is similar to the diagonal terms - ! in the other stress contributions. It basically gives a term prop to - ! the ewald energy + ! in the other stress contributions. It basically gives a term prop to + ! the ewald energy ! sdewald = sdewald - sewald sigma11 = sigma11 + sewald * tpiba2 * 2._DP * & - g_d(1,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta) + g(1,ng) * g(1,ng) / g2 * (1._DP+g2a-beta) sigma21 = sigma21 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta) + g(2,ng) * g(1,ng) / g2 * (1._DP+g2a-beta) sigma22 = sigma22 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(2,ng) / g2 * (1._DP+g2a-beta) + g(2,ng) * g(2,ng) / g2 * (1._DP+g2a-beta) sigma31 = sigma31 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(1,ng) / g2 * (g2a+1._DP) + g(3,ng) * g(1,ng) / g2 * (g2a+1._DP) sigma32 = sigma32 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(2,ng) / g2 * (g2a+1._DP) + g(3,ng) * g(2,ng) / g2 * (g2a+1._DP) sigma33 = sigma33 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(3,ng) / g2 * (g2a+1._DP) + g(3,ng) * g(3,ng) / g2 * (g2a+1._DP) ! ENDDO ! @@ -934,11 +844,8 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) sigmaewa(3,2) = sigmaewa(3,2) + sigma32 sigmaewa(3,3) = sigmaewa(3,3) + sigma33 ! - DEALLOCATE( cutoff2D_d, tau_d, zv_d ) - DEALLOCATE( ityp_d ) - ! RETURN ! -END SUBROUTINE cutoff_stres_sigmaewa_gpu +END SUBROUTINE cutoff_stres_sigmaewa ! END MODULE Coul_cut_2D diff --git a/PW/src/Makefile b/PW/src/Makefile index 0016fd01d..81dbeb2b0 100644 --- a/PW/src/Makefile +++ b/PW/src/Makefile @@ -296,7 +296,6 @@ PWLIBS += \ stres_mgga_gpu.o \ stres_cc_gpu.o \ deriv_drhoc_gpu.o \ - stres_ewa_gpu.o \ stres_knl_gpu.o \ stres_us_gpu.o \ compute_deff_gpu.o \ diff --git a/PW/src/stres_ewa.f90 b/PW/src/stres_ewa.f90 index df3795d4e..5b76ba0b2 100644 --- a/PW/src/stres_ewa.f90 +++ b/PW/src/stres_ewa.f90 @@ -53,7 +53,7 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & REAL(DP), INTENT(IN) :: gcutm !! input: cut-off of g vectors REAL(DP), INTENT(OUT) :: sigmaewa(3,3) - ! output: the ewald stress + !! output: the ewald stress ! ! ... local variables ! @@ -85,6 +85,9 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & ! diagonal term ! nondiagonal term COMPLEX(DP) :: rhostar + REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 + ! + !$acc data present_or_copyin( g, gg ) ! tpiba2 = (tpi / alat)**2 sigmaewa(:,:) = 0.d0 @@ -94,15 +97,15 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & charge = charge + zv(ityp(na)) ENDDO ! - ! choose alpha in order to have convergence in the sum over G - ! upperbound is a safe upper bound for the error ON THE ENERGY + ! ... choose alpha in order to have convergence in the sum over G + ! upperbound is a safe upper bound for the error ON THE ENERGY ! alpha = 2.9d0 12 alpha = alpha - 0.1d0 ! IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 ) upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * & - erfc ( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) ) + ERFC( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) ) ! IF (upperbound > 1d-7) GOTO 12 ! @@ -118,40 +121,70 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & ! ! sdewald is the diagonal term IF (gamma_only) THEN - fact = 2.d0 + fact = 2.d0 ELSE fact = 1.d0 ENDIF ! IF (do_cutoff_2D) THEN + ! CALL cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa ) + ! ELSE -!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau, zv, ityp, nat, omega, fact)& -!$omp &private(g2, g2a, rhostar, na, arg, l, m, sewald)& -!$omp &reduction(+:sigmaewa,sdewald) + ! + sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP + sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP + ! +#if !defined(_OPENACC) +!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau,& +!$omp nat, zv, ityp, omega, fact) private(g2,g2a, rhostar, na, arg,& +!$omp sewald) reduction(+:sdewald,sigma11,sigma21,sigma22,sigma31,& +!$omp sigma32,sigma33) +#else +!$acc parallel loop copyin(tau,zv,ityp) reduction(+:sigma11,sigma21,sigma22,& +!$acc sigma31,sigma32,sigma33) reduction(-:sdewald) +#endif DO ng = gstart, ngm - g2 = gg (ng) * tpiba2 - g2a = g2 / 4.d0 / alpha - rhostar = (0.d0, 0.d0) + g2 = gg(ng) * tpiba2 + g2a = g2 / 4._DP / alpha + rhostar = (0._DP,0._DP) DO na = 1, nat arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + & g(3,ng) * tau(3,na) ) * tpi - rhostar = rhostar + zv(ityp(na)) * CMPLX(COS(arg), SIN(arg), KIND=DP) + rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP) ENDDO - rhostar = rhostar / omega + rhostar = rhostar / CMPLX(omega) sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2 sdewald = sdewald - sewald - DO l = 1, 3 - DO m = 1, l - sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * & - g(l,ng) * g(m,ng) / g2 * (g2a + 1) - ENDDO - ENDDO ! + sigma11 = sigma11 + sewald * tpiba2 * 2._DP * & + g(1,ng) * g(1,ng) / g2 * (g2a + 1) + sigma21 = sigma21 + sewald * tpiba2 * 2._DP * & + g(2,ng) * g(1,ng) / g2 * (g2a + 1) + sigma22 = sigma22 + sewald * tpiba2 * 2._DP * & + g(2,ng) * g(2,ng) / g2 * (g2a + 1) + sigma31 = sigma31 + sewald * tpiba2 * 2._DP * & + g(3,ng) * g(1,ng) / g2 * (g2a + 1) + sigma32 = sigma32 + sewald * tpiba2 * 2._DP * & + g(3,ng) * g(2,ng) / g2 * (g2a + 1) + sigma33 = sigma33 + sewald * tpiba2 * 2._DP * & + g(3,ng) * g(3,ng) / g2 * (g2a + 1) ENDDO +#if !defined(_OPENACC) !$omp end parallel do +#endif + ! + sigmaewa(1,1) = sigmaewa(1,1) + sigma11 + sigmaewa(2,1) = sigmaewa(2,1) + sigma21 + sigmaewa(2,2) = sigmaewa(2,2) + sigma22 + sigmaewa(3,1) = sigmaewa(3,1) + sigma31 + sigmaewa(3,2) = sigmaewa(3,2) + sigma32 + sigmaewa(3,3) = sigmaewa(3,3) + sigma33 + ! ENDIF ! + !$acc end data + ! DO l = 1, 3 sigmaewa(l,l) = sigmaewa(l,l) + sdewald ENDDO diff --git a/PW/src/stres_ewa_gpu.f90 b/PW/src/stres_ewa_gpu.f90 deleted file mode 100644 index 6a28db8e2..000000000 --- a/PW/src/stres_ewa_gpu.f90 +++ /dev/null @@ -1,246 +0,0 @@ -! -! Copyright (C) 2001-2009 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!----------------------------------------------------------------------- -SUBROUTINE stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg, tau, & - omega, g_d, gg_d, ngm, gstart, gamma_only, & - gcutm, sigmaewa ) - !--------------------------------------------------------------------- - !! Ewald contribution. Both real- and reciprocal-space terms are - !! present. - ! - USE kinds - USE constants, ONLY : tpi, e2, eps6 - USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp - USE mp, ONLY : mp_sum - USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_sigmaewa_gpu - ! - IMPLICIT NONE - ! - INTEGER :: nat - !! input: number of atoms in the unit cell - INTEGER :: ntyp - !! input: number of different types of atoms - INTEGER :: ityp(nat) - !! input: the type of each atom - INTEGER :: ngm - !! input: number of plane waves for G sum - INTEGER :: gstart - !! input: first nonzero g vector - LOGICAL, INTENT(IN) :: gamma_only - !! gamma point only - REAL(DP), INTENT(IN) :: tau(3,nat) - !! input: the positions of the atoms in the cell - REAL(DP), INTENT(IN) :: g_d(3,ngm) - !! input: the coordinates of G vectors - REAL(DP), INTENT(IN) :: gg_d(ngm) - !! input: the square moduli of G vectors - REAL(DP), INTENT(IN) :: zv(ntyp) - !! input: the charge of each type of atoms - REAL(DP), INTENT(IN) :: at(3,3) - !! input: the direct lattice vectors - REAL(DP), INTENT(IN) :: bg(3,3) - !! input: the reciprocal lattice vectors - REAL(DP), INTENT(IN) :: omega - !! input: the volume of the unit cell - REAL(DP), INTENT(IN) :: alat - !! input: measure of length - REAL(DP), INTENT(IN) :: gcutm - !! input: cut-off of g vectors - REAL(DP), INTENT(OUT) :: sigmaewa(3,3) - ! output: the ewald stress - ! - ! ... local variables - ! - INTEGER, PARAMETER :: mxr = 50 - ! the maximum number of R vectors included in r sum - INTEGER :: ng, nr, na, nb, l, m, nrm - ! counter over reciprocal G vectors - ! counter over direct vectors - ! counter on atoms - ! counter on atoms - ! counter on atoms - ! number of R vectors included in r sum - INTEGER :: na_s, na_e, mykey - ! - REAL(DP) :: charge, arg, tpiba2, dtau(3), alpha, r(3,mxr), & - r2(mxr), rmax, rr, upperbound, fact, fac, g2, g2a, & - sdewald, sewald - ! total ionic charge in the cell - ! the argument of the phase - ! length in reciprocal space - ! the difference tau_s - tau_s' - ! alpha term in ewald sum - ! input of the rgen routine ( not used here ) - ! the square modulus of R_j-tau_s-tau_s' - ! the maximum radius to consider real space sum - ! buffer variable - ! used to optimize alpha - ! auxiliary variables - ! diagonal term - ! nondiagonal term - ! - INTEGER :: ierr(2) - REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 - COMPLEX(DP) :: rhostar - ! - INTEGER, ALLOCATABLE :: ityp_d(:) - REAL(DP), ALLOCATABLE :: zv_d(:), tau_d(:,:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: g_d, gg_d, zv_d, ityp_d, tau_d -#endif - ! - tpiba2 = (tpi / alat)**2 - sigmaewa(:,:) = 0._DP - charge = 0._DP - ! - ALLOCATE( zv_d(ntyp), tau_d(3,nat) ) - zv_d = zv - tau_d = tau - ALLOCATE( ityp_d(nat) ) - ityp_d = ityp - ! - DO na = 1, nat - charge = charge + zv(ityp(na)) - ENDDO - ! - ! choose alpha in order to have convergence in the sum over G - ! upperbound is a safe upper bound for the error ON THE ENERGY - ! - alpha = 2.9_DP -12 alpha = alpha - 0.1_DP - ! - IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 ) - upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * & - erfc ( SQRT(tpiba2 * gcutm / 4._DP / alpha) ) - ! - IF (upperbound > 1d-7) GOTO 12 - ! - ! G-space sum here - ! - ! Determine if this processor contains G=0 and set the constant term - ! - IF (gstart == 2) THEN - sdewald = tpi * e2 / 4._DP / alpha * (charge / omega)**2 - ELSE - sdewald = 0._DP - ENDIF - ! - ! sdewald is the diagonal term - IF ( gamma_only ) THEN - fact = 2._DP - ELSE - fact = 1._DP - ENDIF - ! - IF ( do_cutoff_2D ) THEN - ! - CALL cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) - ! - ELSE - ! - sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP - sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP - ! - !$cuf kernel do (1) <<<*,*>>> - DO ng = gstart, ngm - g2 = gg_d(ng) * tpiba2 - g2a = g2 / 4._DP / alpha - rhostar = (0._DP,0._DP) - DO na = 1, nat - arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + & - g_d(3,ng) * tau_d(3,na) ) * tpi - rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP) - ENDDO - rhostar = rhostar / CMPLX(omega) - sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2 - sdewald = sdewald - sewald - ! - sigma11 = sigma11 + sewald * tpiba2 * 2._DP * & - g_d(1,ng) * g_d(1,ng) / g2 * (g2a + 1) - sigma21 = sigma21 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(1,ng) / g2 * (g2a + 1) - sigma22 = sigma22 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(2,ng) / g2 * (g2a + 1) - sigma31 = sigma31 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(1,ng) / g2 * (g2a + 1) - sigma32 = sigma32 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(2,ng) / g2 * (g2a + 1) - sigma33 = sigma33 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(3,ng) / g2 * (g2a + 1) - ! - ENDDO - ! - sigmaewa(1,1) = sigmaewa(1,1) + sigma11 - sigmaewa(2,1) = sigmaewa(2,1) + sigma21 - sigmaewa(2,2) = sigmaewa(2,2) + sigma22 - sigmaewa(3,1) = sigmaewa(3,1) + sigma31 - sigmaewa(3,2) = sigmaewa(3,2) + sigma32 - sigmaewa(3,3) = sigmaewa(3,3) + sigma33 - ! - ENDIF - ! - DO l = 1, 3 - sigmaewa(l,l) = sigmaewa(l,l) + sdewald - ENDDO - ! - ! R-space sum here (see ewald.f90 for details on parallelization) - ! - CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey ) - ! - IF ( mykey == 0 ) THEN - rmax = 4.0d0 / SQRT(alpha) / alat - ! - ! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1 - ! - DO na = na_s, na_e - DO nb = 1, nat - dtau(:) = tau(:,na) - tau(:,nb) - ! - ! generates nearest-neighbors shells r(i)=R(i)-dtau(i) - ! - CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm ) - ! - DO nr = 1, nrm - rr = SQRT(r2 (nr) ) * alat - fac = - e2 / 2.0_DP/ omega * alat**2 * zv(ityp(na)) * & - zv(ityp(nb)) / rr**3 * (erfc(SQRT(alpha) * rr) + & - rr * SQRT(8.0_DP * alpha / tpi) * EXP( - alpha * rr**2) ) - DO l = 1, 3 - DO m = 1, l - sigmaewa(l,m) = sigmaewa(l,m) + fac * r(l,nr) * r(m,nr) - ENDDO - ENDDO - ENDDO - ! - ENDDO - ENDDO - ENDIF - ! - DO l = 1, 3 - DO m = 1, l - 1 - sigmaewa(m,l) = sigmaewa(l,m) - ENDDO - ENDDO - ! - DO l = 1, 3 - DO m = 1, 3 - sigmaewa(l,m) = - sigmaewa(l,m) - ENDDO - ENDDO - ! - DEALLOCATE( zv_d, tau_d ) - DEALLOCATE( ityp_d ) - ! - CALL mp_sum( sigmaewa, intra_bgrp_comm ) - ! - RETURN - ! -END SUBROUTINE stres_ewa_gpu - diff --git a/PW/src/stress.f90 b/PW/src/stress.f90 index 3f86dcea8..cec9206ee 100644 --- a/PW/src/stress.f90 +++ b/PW/src/stress.f90 @@ -122,13 +122,10 @@ SUBROUTINE stress( sigma ) IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress CALL esm_stres_ewa( sigmaewa ) ELSE - IF (.NOT. use_gpu) CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, & - bg, tau, omega, g, gg, ngm, gstart, & - gamma_only, gcutm, sigmaewa ) - IF ( use_gpu) CALL stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg,& - tau, omega, g_d,gg_d, ngm, gstart,& - gamma_only, gcutm, sigmaewa ) - END IF + CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, & + tau, omega, g, gg, ngm, gstart, & + gamma_only, gcutm, sigmaewa ) + ENDIF ! ! semi-empirical dispersion contribution: Grimme-D2 and D3 !