From 78bbaea11f50b152d13ff9bbed67821fc704f308 Mon Sep 17 00:00:00 2001
From: fabrizio22 <ferrariruffino.fz@gmail.com>
Date: Mon, 25 Jul 2022 17:03:46 +0200
Subject: [PATCH] stress_acc - Ewald

---
 PW/CMakeLists.txt        |   1 -
 PW/src/Coul_cut_2D.f90   | 153 +++++-------------------
 PW/src/Makefile          |   1 -
 PW/src/stres_ewa.f90     |  71 ++++++++---
 PW/src/stres_ewa_gpu.f90 | 246 ---------------------------------------
 PW/src/stress.f90        |  11 +-
 6 files changed, 86 insertions(+), 397 deletions(-)
 delete mode 100644 PW/src/stres_ewa_gpu.f90

diff --git a/PW/CMakeLists.txt b/PW/CMakeLists.txt
index c638ef9e1..aa123754e 100644
--- a/PW/CMakeLists.txt
+++ b/PW/CMakeLists.txt
@@ -274,7 +274,6 @@ set(src_pw
    src/stres_mgga_gpu.f90
    src/atomic_wfc_gpu.f90
    src/dvloc_of_g_gpu.f90
-   src/stres_ewa_gpu.f90
    src/stres_knl_gpu.f90
    src/compute_deff_gpu.f90
    src/add_vhub_to_deeq_gpu.f90
diff --git a/PW/src/Coul_cut_2D.f90 b/PW/src/Coul_cut_2D.f90
index b8f9b711e..7d5cf843a 100644
--- a/PW/src/Coul_cut_2D.f90
+++ b/PW/src/Coul_cut_2D.f90
@@ -759,87 +759,7 @@ SUBROUTINE cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa )
   USE kinds
   USE ions_base,   ONLY : nat, zv, tau, ityp
   USE constants,   ONLY : e2, eps8
-  USE gvect,       ONLY : ngm, g, gg, gstart
-  USE cell_base,   ONLY : tpiba2, alat, omega, tpiba
-  USE io_global,   ONLY : stdout
-  !
-  IMPLICIT NONE
-  !
-  REAL(DP), INTENT(IN) :: alpha
-  !! tuning param for LR/SR separation
-  REAL(DP), INTENT(INOUT) :: sigmaewa(3,3)
-  !! ewald contribution to stress
-  REAL(DP), INTENT(INOUT) :: sdewald
-  !! constant and diagonal terms
-  !
-  ! ... local variables
-  !
-  INTEGER :: ng, na, l, m
-  REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact
-  COMPLEX(DP) :: rhostar
-  !
-  ! g(1) is a problem if it's G=0, because we divide by G^2. 
-  ! So start at gstart.
-  ! fact=1.0d0, gamma_only not implemented
-  ! G=0 componenent of the long-range part of the local part of the 
-  ! pseudopotminus the Hartree potential is set to 0.
-  ! in other words, sdewald=0.  
-  ! sdewald is the last term in equation B1 of PRB 32 3792.
-  ! See also similar comment for ewaldg in cutoff_ewald routine
-  !
-  sdewald = 0._DP
-  DO ng = gstart, ngm
-     Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
-     IF (Gp < eps8) THEN
-        G2lzo2Gp = 0._DP
-        beta = 0._DP
-     ELSE
-        G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
-        beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
-     ENDIF
-     g2 = gg(ng) * tpiba2
-     g2a = g2 / 4._DP / alpha
-     rhostar = (0._DP,0._DP)
-     DO na = 1, nat
-        arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
-               g(3,ng) * tau(3,na) ) * tpi
-        rhostar = rhostar + zv (ityp(na) ) * CMPLX(COS(arg), SIN(arg), KIND=DP)
-     ENDDO
-     rhostar = rhostar / omega
-     sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2
-     ! ... sewald is an other diagonal term that is similar to the diagonal terms 
-     ! in the other stress contributions. It basically gives a term prop to 
-     ! the ewald energy
-     sdewald = sdewald-sewald
-     DO l = 1, 3
-        IF (l == 3) THEN
-           fact = (g2a + 1.0d0)
-        ELSE
-           fact = (1.0d0+g2a-beta)
-        ENDIF
-        !
-        DO m = 1, l
-           sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * &
-                 g(l,ng) * g(m,ng) / g2 * fact
-        ENDDO
-     ENDDO
-     !
-  ENDDO
-  !
-  RETURN
-  !
-END SUBROUTINE cutoff_stres_sigmaewa
-!
-!----------------------------------------------------------------------
-SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
-  !----------------------------------------------------------------------
-  !! This subroutine cuts off the Ewald part of the stress.  
-  !! See Eq. (64) in PRB 96 075448
-  !
-  USE kinds
-  USE ions_base,   ONLY : nat, zv, tau, ityp
-  USE constants,   ONLY : e2, eps8
-  USE gvect,       ONLY : ngm, gstart, g_d, gg_d
+  USE gvect,       ONLY : ngm, gstart, g, gg
   USE cell_base,   ONLY : tpiba2, alat, omega, tpiba
   USE io_global,   ONLY : stdout
   !
@@ -858,72 +778,62 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
   REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact
   REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
   COMPLEX(DP) :: rhostar
-  !
-  INTEGER , ALLOCATABLE :: ityp_d(:)
-  REAL(DP), ALLOCATABLE :: cutoff2D_d(:), tau_d(:,:), zv_d(:)
-  !
-#if defined(__CUDA)
-  attributes(DEVICE) :: cutoff2D_d, tau_d, zv_d, ityp_d
-#endif
   !
   ntyp = SIZE(zv)
-  ALLOCATE( cutoff2D_d(ngm), tau_d(3,nat), zv_d(ntyp) )
-  ALLOCATE( ityp_d(nat) )
-  cutoff2D_d = cutoff_2D
-  tau_d = tau
-  zv_d = zv
-  ityp_d = ityp
-  ! g(1) is a problem if it's G=0, because we divide by G^2. 
-  ! So start at gstart.
-  ! fact=1.0d0, gamma_only not implemented
-  ! G=0 componenent of the long-range part of the local part of the 
-  ! pseudopotminus the Hartree potential is set to 0.
-  ! in other words, sdewald=0.  
-  ! sdewald is the last term in equation B1 of PRB 32 3792.
-  ! See also similar comment for ewaldg in cutoff_ewald routine
+  !
+  ! ... g(1) is a problem if it's G=0, because we divide by G^2. 
+  !     So start at gstart.
+  !     fact=1.0d0, gamma_only not implemented
+  !     G=0 componenent of the long-range part of the local part of the 
+  !     pseudopotminus the Hartree potential is set to 0.
+  !     in other words, sdewald=0.  
+  !     sdewald is the last term in equation B1 of PRB 32 3792.
+  !     See also similar comment for ewaldg in cutoff_ewald routine
   !
   sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
   sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
   !
   sdewald = 0._DP
   !
-  !$cuf kernel do (1) <<<*,*>>>
+  !$acc parallel loop copyin(g,gg,cutoff_2D,tau,zv,ityp) &
+  !$acc& reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32, &
+  !$acc&             sigma33)
   DO ng = gstart, ngm
-     Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba
+     Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba
      IF (Gp < eps8) THEN
         G2lzo2Gp = 0._DP
         beta = 0._DP
      ELSE
-        G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp
-        beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng)
+        G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp
+        beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng)
      ENDIF
-     g2 = gg_d(ng) * tpiba2
+     g2 = gg(ng) * tpiba2
      g2a = g2 / 4._DP / alpha
      rhostar = (0._DP,0._DP)
      DO na = 1, nat
-        arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + &
-               g_d(3,ng) * tau_d(3,na) ) * tpi
-        rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP)
+        arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
+               g(3,ng) * tau(3,na) ) * tpi
+        rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP)
      ENDDO
      rhostar = rhostar / CMPLX(omega)
-     sewald = tpi * e2 * EXP(-g2a) / g2* cutoff2D_d(ng) * ABS(rhostar)**2
+     sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2
      ! ... sewald is an other diagonal term that is similar to the diagonal terms 
-     ! in the other stress contributions. It basically gives a term prop to 
-     ! the ewald energy
+     !     in the other stress contributions. It basically gives a term prop to 
+     !     the ewald energy
      !
      sdewald = sdewald - sewald
      sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
-                 g_d(1,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta)
+                 g(1,ng) * g(1,ng) / g2 * (1._DP+g2a-beta)
      sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
-                 g_d(2,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta)          
+                 g(2,ng) * g(1,ng) / g2 * (1._DP+g2a-beta)          
      sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
-                 g_d(2,ng) * g_d(2,ng) / g2 * (1._DP+g2a-beta)         
+                 g(2,ng) * g(2,ng) / g2 * (1._DP+g2a-beta)         
      sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
-                 g_d(3,ng) * g_d(1,ng) / g2 * (g2a+1._DP)          
+                 g(3,ng) * g(1,ng) / g2 * (g2a+1._DP)          
      sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
-                 g_d(3,ng) * g_d(2,ng) / g2 * (g2a+1._DP)          
+                 g(3,ng) * g(2,ng) / g2 * (g2a+1._DP)          
      sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
-                 g_d(3,ng) * g_d(3,ng) / g2 * (g2a+1._DP)
+                 g(3,ng) * g(3,ng) / g2 * (g2a+1._DP)
      !
   ENDDO
   !
@@ -934,11 +844,8 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
   sigmaewa(3,2) = sigmaewa(3,2) + sigma32
   sigmaewa(3,3) = sigmaewa(3,3) + sigma33
   !
-  DEALLOCATE( cutoff2D_d, tau_d, zv_d )
-  DEALLOCATE( ityp_d )
-  !
   RETURN
   !
-END SUBROUTINE cutoff_stres_sigmaewa_gpu
+END SUBROUTINE cutoff_stres_sigmaewa
 !
 END MODULE Coul_cut_2D
diff --git a/PW/src/Makefile b/PW/src/Makefile
index 0016fd01d..81dbeb2b0 100644
--- a/PW/src/Makefile
+++ b/PW/src/Makefile
@@ -296,7 +296,6 @@ PWLIBS += \
   stres_mgga_gpu.o \
   stres_cc_gpu.o \
   deriv_drhoc_gpu.o \
-  stres_ewa_gpu.o \
   stres_knl_gpu.o \
   stres_us_gpu.o \
   compute_deff_gpu.o \
diff --git a/PW/src/stres_ewa.f90 b/PW/src/stres_ewa.f90
index df3795d4e..5b76ba0b2 100644
--- a/PW/src/stres_ewa.f90
+++ b/PW/src/stres_ewa.f90
@@ -53,7 +53,7 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
   REAL(DP), INTENT(IN) :: gcutm
   !! input: cut-off of g vectors
   REAL(DP), INTENT(OUT) :: sigmaewa(3,3)
-  ! output: the ewald stress
+  !! output: the ewald stress
   !
   ! ... local variables
   !
@@ -85,6 +85,9 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
   ! diagonal term
   ! nondiagonal term
   COMPLEX(DP) :: rhostar
+  REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
+  !
+  !$acc data present_or_copyin( g, gg )
   !
   tpiba2 = (tpi / alat)**2
   sigmaewa(:,:) = 0.d0
@@ -94,15 +97,15 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
      charge = charge + zv(ityp(na))
   ENDDO
   !
-  ! choose alpha in order to have convergence in the sum over G
-  ! upperbound is a safe upper bound for the error ON THE ENERGY
+  ! ... choose alpha in order to have convergence in the sum over G
+  !     upperbound is a safe upper bound for the error ON THE ENERGY
   !
   alpha = 2.9d0
 12 alpha = alpha - 0.1d0
   !
   IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
   upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * &
-               erfc ( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) )
+               ERFC( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) )
   !
   IF (upperbound > 1d-7) GOTO 12
   !
@@ -118,40 +121,70 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, &
   !
   ! sdewald is the diagonal term
   IF (gamma_only) THEN
-     fact = 2.d0
+    fact = 2.d0
   ELSE
     fact = 1.d0
   ENDIF
   !
   IF (do_cutoff_2D) THEN 
+     !
      CALL cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa )
+     !
   ELSE
-!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau, zv, ityp, nat, omega, fact)&
-!$omp                          &private(g2, g2a, rhostar, na, arg, l, m, sewald)&
-!$omp                          &reduction(+:sigmaewa,sdewald)
+     !
+     sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
+     sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
+     !
+#if !defined(_OPENACC)
+!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau,&
+!$omp   nat, zv, ityp, omega, fact) private(g2,g2a, rhostar, na, arg,&
+!$omp   sewald) reduction(+:sdewald,sigma11,sigma21,sigma22,sigma31,&
+!$omp   sigma32,sigma33)
+#else
+!$acc parallel loop copyin(tau,zv,ityp) reduction(+:sigma11,sigma21,sigma22,&
+!$acc        sigma31,sigma32,sigma33) reduction(-:sdewald)
+#endif
      DO ng = gstart, ngm
-        g2 = gg (ng) * tpiba2
-        g2a = g2 / 4.d0 / alpha
-        rhostar = (0.d0, 0.d0)
+        g2 = gg(ng) * tpiba2
+        g2a = g2 / 4._DP / alpha
+        rhostar = (0._DP,0._DP)
         DO na = 1, nat
            arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + &
                   g(3,ng) * tau(3,na) ) * tpi
-           rhostar = rhostar + zv(ityp(na)) * CMPLX(COS(arg), SIN(arg), KIND=DP)
+           rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP)
         ENDDO
-        rhostar = rhostar / omega
+        rhostar = rhostar / CMPLX(omega)
         sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2
         sdewald = sdewald - sewald
-        DO l = 1, 3
-           DO m = 1, l
-              sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * &
-                              g(l,ng) * g(m,ng) / g2 * (g2a + 1)
-           ENDDO
-        ENDDO
         !
+        sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
+                              g(1,ng) * g(1,ng) / g2 * (g2a + 1)
+        sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
+                              g(2,ng) * g(1,ng) / g2 * (g2a + 1)
+        sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
+                              g(2,ng) * g(2,ng) / g2 * (g2a + 1)
+        sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
+                              g(3,ng) * g(1,ng) / g2 * (g2a + 1)
+        sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
+                              g(3,ng) * g(2,ng) / g2 * (g2a + 1)
+        sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
+                              g(3,ng) * g(3,ng) / g2 * (g2a + 1)
      ENDDO
+#if !defined(_OPENACC)
 !$omp end parallel do
+#endif
+     !
+     sigmaewa(1,1) = sigmaewa(1,1) + sigma11
+     sigmaewa(2,1) = sigmaewa(2,1) + sigma21
+     sigmaewa(2,2) = sigmaewa(2,2) + sigma22
+     sigmaewa(3,1) = sigmaewa(3,1) + sigma31
+     sigmaewa(3,2) = sigmaewa(3,2) + sigma32
+     sigmaewa(3,3) = sigmaewa(3,3) + sigma33
+     !
   ENDIF
   !
+  !$acc end data
+  !
   DO l = 1, 3
      sigmaewa(l,l) = sigmaewa(l,l) + sdewald
   ENDDO
diff --git a/PW/src/stres_ewa_gpu.f90 b/PW/src/stres_ewa_gpu.f90
deleted file mode 100644
index 6a28db8e2..000000000
--- a/PW/src/stres_ewa_gpu.f90
+++ /dev/null
@@ -1,246 +0,0 @@
-!
-! Copyright (C) 2001-2009 Quantum ESPRESSO group
-! This file is distributed under the terms of the
-! GNU General Public License. See the file `License'
-! in the root directory of the present distribution,
-! or http://www.gnu.org/copyleft/gpl.txt .
-!
-!
-!-----------------------------------------------------------------------
-SUBROUTINE stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg, tau,    &
-                          omega, g_d, gg_d, ngm, gstart, gamma_only, &
-                          gcutm, sigmaewa )
-  !---------------------------------------------------------------------
-  !! Ewald contribution. Both real- and reciprocal-space terms are 
-  !! present.
-  !
-  USE kinds
-  USE constants,           ONLY : tpi, e2, eps6
-  USE mp_bands,            ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
-  USE mp,                  ONLY : mp_sum
-  USE Coul_cut_2D,         ONLY : do_cutoff_2D, cutoff_stres_sigmaewa_gpu
-  !
-  IMPLICIT NONE
-  !
-  INTEGER :: nat
-  !! input: number of atoms in the unit cell
-  INTEGER :: ntyp
-  !! input: number of different types of atoms
-  INTEGER :: ityp(nat)
-  !! input: the type of each atom
-  INTEGER :: ngm
-  !! input: number of plane waves for G sum
-  INTEGER :: gstart
-  !! input: first nonzero g vector
-  LOGICAL, INTENT(IN) :: gamma_only
-  !! gamma point only
-  REAL(DP), INTENT(IN) :: tau(3,nat)
-  !! input: the positions of the atoms in the cell
-  REAL(DP), INTENT(IN) :: g_d(3,ngm)
-  !! input: the coordinates of G vectors
-  REAL(DP), INTENT(IN) :: gg_d(ngm)
-  !! input: the square moduli of G vectors
-  REAL(DP), INTENT(IN) :: zv(ntyp)
-  !! input: the charge of each type of atoms
-  REAL(DP), INTENT(IN) :: at(3,3)
-  !! input: the direct lattice vectors
-  REAL(DP), INTENT(IN) :: bg(3,3)
-  !! input: the reciprocal lattice vectors
-  REAL(DP), INTENT(IN) :: omega
-  !! input: the volume of the unit cell
-  REAL(DP), INTENT(IN) :: alat
-  !! input: measure of length
-  REAL(DP), INTENT(IN) :: gcutm
-  !! input: cut-off of g vectors
-  REAL(DP), INTENT(OUT) :: sigmaewa(3,3)
-  ! output: the ewald stress
-  !
-  ! ... local variables
-  !
-  INTEGER, PARAMETER :: mxr = 50
-  ! the maximum number of R vectors included in r sum
-  INTEGER :: ng,  nr, na, nb, l, m, nrm
-  ! counter over reciprocal G vectors
-  ! counter over direct vectors
-  ! counter on atoms
-  ! counter on atoms
-  ! counter on atoms
-  ! number of R vectors included in r sum
-  INTEGER :: na_s, na_e, mykey
-  !
-  REAL(DP) :: charge, arg, tpiba2, dtau(3), alpha, r(3,mxr),     &
-              r2(mxr), rmax, rr, upperbound, fact, fac, g2, g2a, &
-              sdewald, sewald
-  ! total ionic charge in the cell
-  ! the argument of the phase
-  ! length in reciprocal space
-  ! the difference tau_s - tau_s'
-  ! alpha term in ewald sum
-  ! input of the rgen routine ( not used here )
-  ! the square modulus of R_j-tau_s-tau_s'
-  ! the maximum radius to consider real space sum
-  ! buffer variable
-  ! used to optimize alpha
-  ! auxiliary variables
-  ! diagonal term
-  ! nondiagonal term
-  !
-  INTEGER :: ierr(2)
-  REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33
-  COMPLEX(DP) :: rhostar
-  !
-  INTEGER,  ALLOCATABLE :: ityp_d(:)
-  REAL(DP), ALLOCATABLE :: zv_d(:), tau_d(:,:)
-  !
-#if defined(__CUDA)
-  attributes(DEVICE) :: g_d, gg_d, zv_d, ityp_d, tau_d
-#endif  
-  !
-  tpiba2 = (tpi / alat)**2
-  sigmaewa(:,:) = 0._DP
-  charge = 0._DP
-  !
-  ALLOCATE( zv_d(ntyp), tau_d(3,nat) )
-  zv_d  = zv
-  tau_d = tau
-  ALLOCATE( ityp_d(nat) )
-  ityp_d = ityp
-  !
-  DO na = 1, nat
-     charge = charge + zv(ityp(na))
-  ENDDO
-  !
-  ! choose alpha in order to have convergence in the sum over G
-  ! upperbound is a safe upper bound for the error ON THE ENERGY
-  !
-  alpha = 2.9_DP
-12 alpha = alpha - 0.1_DP
-  !
-  IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 )
-  upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * &
-               erfc ( SQRT(tpiba2 * gcutm / 4._DP / alpha) )
-  !
-  IF (upperbound > 1d-7) GOTO 12
-  !
-  ! G-space sum here
-  !
-  ! Determine if this processor contains G=0 and set the constant term
-  !
-  IF (gstart == 2) THEN
-     sdewald = tpi * e2 / 4._DP / alpha * (charge / omega)**2
-  ELSE
-     sdewald = 0._DP
-  ENDIF
-  !
-  ! sdewald is the diagonal term
-  IF ( gamma_only ) THEN
-     fact = 2._DP
-  ELSE
-     fact = 1._DP
-  ENDIF
-  !
-  IF ( do_cutoff_2D ) THEN 
-     !
-     CALL cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa )
-     !
-  ELSE
-     !
-     sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP
-     sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP
-     !
-     !$cuf kernel do (1) <<<*,*>>>
-     DO ng = gstart, ngm
-        g2 = gg_d(ng) * tpiba2
-        g2a = g2 / 4._DP / alpha
-        rhostar = (0._DP,0._DP)
-        DO na = 1, nat
-           arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + &
-                  g_d(3,ng) * tau_d(3,na) ) * tpi
-           rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP)
-        ENDDO
-        rhostar = rhostar / CMPLX(omega)
-        sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2
-        sdewald = sdewald - sewald
-        !
-        sigma11 = sigma11 + sewald * tpiba2 * 2._DP * &
-                              g_d(1,ng) * g_d(1,ng) / g2 * (g2a + 1)
-        sigma21 = sigma21 + sewald * tpiba2 * 2._DP * &
-                              g_d(2,ng) * g_d(1,ng) / g2 * (g2a + 1)
-        sigma22 = sigma22 + sewald * tpiba2 * 2._DP * &
-                              g_d(2,ng) * g_d(2,ng) / g2 * (g2a + 1)
-        sigma31 = sigma31 + sewald * tpiba2 * 2._DP * &
-                              g_d(3,ng) * g_d(1,ng) / g2 * (g2a + 1)
-        sigma32 = sigma32 + sewald * tpiba2 * 2._DP * &
-                              g_d(3,ng) * g_d(2,ng) / g2 * (g2a + 1)
-        sigma33 = sigma33 + sewald * tpiba2 * 2._DP * &
-                              g_d(3,ng) * g_d(3,ng) / g2 * (g2a + 1)
-        !
-     ENDDO
-     !
-     sigmaewa(1,1) = sigmaewa(1,1) + sigma11
-     sigmaewa(2,1) = sigmaewa(2,1) + sigma21
-     sigmaewa(2,2) = sigmaewa(2,2) + sigma22
-     sigmaewa(3,1) = sigmaewa(3,1) + sigma31
-     sigmaewa(3,2) = sigmaewa(3,2) + sigma32
-     sigmaewa(3,3) = sigmaewa(3,3) + sigma33
-     !
-  ENDIF
-  !
-  DO l = 1, 3
-     sigmaewa(l,l) = sigmaewa(l,l) + sdewald
-  ENDDO
-  !
-  ! R-space sum here (see ewald.f90 for details on parallelization)
-  !
-  CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
-  !
-  IF ( mykey == 0 ) THEN
-     rmax = 4.0d0 / SQRT(alpha) / alat
-     !
-     ! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1
-     !
-     DO na = na_s, na_e
-        DO nb = 1, nat
-           dtau(:) = tau(:,na) - tau(:,nb)
-           !
-           !     generates nearest-neighbors shells r(i)=R(i)-dtau(i)
-           !
-           CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm )
-           !
-           DO nr = 1, nrm
-              rr = SQRT(r2 (nr) ) * alat
-              fac = - e2 / 2.0_DP/ omega * alat**2 * zv(ityp(na)) * &
-                    zv(ityp(nb)) / rr**3 * (erfc(SQRT(alpha) * rr) + &
-                    rr * SQRT(8.0_DP * alpha / tpi) * EXP( - alpha * rr**2) )
-              DO l = 1, 3
-                 DO m = 1, l
-                    sigmaewa(l,m) = sigmaewa(l,m) + fac * r(l,nr) * r(m,nr)
-                 ENDDO
-              ENDDO
-           ENDDO
-           !
-        ENDDO
-     ENDDO
-  ENDIF
-  !
-  DO l = 1, 3
-     DO m = 1, l - 1
-        sigmaewa(m,l) = sigmaewa(l,m)
-     ENDDO
-  ENDDO
-  !
-  DO l = 1, 3
-     DO m = 1, 3
-        sigmaewa(l,m) = - sigmaewa(l,m)
-     ENDDO
-  ENDDO
-  !
-  DEALLOCATE( zv_d, tau_d )
-  DEALLOCATE( ityp_d )
-  !
-  CALL mp_sum( sigmaewa, intra_bgrp_comm )
-  !
-  RETURN
-  !
-END SUBROUTINE stres_ewa_gpu
-
diff --git a/PW/src/stress.f90 b/PW/src/stress.f90
index 3f86dcea8..cec9206ee 100644
--- a/PW/src/stress.f90
+++ b/PW/src/stress.f90
@@ -122,13 +122,10 @@ SUBROUTINE stress( sigma )
   IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress
      CALL esm_stres_ewa( sigmaewa )
   ELSE
-     IF (.NOT. use_gpu) CALL stres_ewa( alat, nat, ntyp, ityp, zv, at,      &
-                                        bg, tau, omega, g, gg, ngm, gstart, &
-                                        gamma_only, gcutm, sigmaewa )
-     IF (      use_gpu) CALL stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg,&
-                                            tau, omega, g_d,gg_d, ngm, gstart,&
-                                            gamma_only, gcutm, sigmaewa )
-  END IF
+     CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, &
+                     tau, omega, g, gg, ngm, gstart,    &
+                     gamma_only, gcutm, sigmaewa )
+  ENDIF
   !
   ! semi-empirical dispersion contribution: Grimme-D2 and D3
   !