Added GPU version for h_psi, s_psi, g_psi and vloc. psic and psic_nc module variables should be standardized. Many data copies between CPU and GPU should be replaced by data in modules. All test passing except for pw_vc-relax/vc-relax3.in which has too loose convergence (noticed by Josh Romero) and for dsygvdx_gpu (real problem) occasionally failing

2018-04-18 13:28:38 +02:00 · 2018-04-18 13:28:38 +02:00 · 5f6d231bdd
parent df72013a32
commit 5f6d231bdd
18 changed files with 2935 additions and 116 deletions
--- a/KS_Solvers/Davidson/cegterg_gpu.f90
+++ b/KS_Solvers/Davidson/cegterg_gpu.f90
@ -281,7 +281,7 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
     !END DO
     ! ========= TO HERE, REPLACED BY =======

-     CALL reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, vc_d)
+     CALL reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, vc_d)
     !
     nb1 = nbase + 1
     !
@ -608,62 +608,62 @@ SUBROUTINE cegterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
  !
  RETURN
  !
-  CONTAINS
-     SUBROUTINE reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
-       USE david_param,   ONLY : DP
-       USE david_buffer,  ONLY : buffer
-       implicit none
-       INTEGER, INTENT(IN) :: nbase, nvec, nvecx
-       LOGICAL, INTENT(IN) :: conv(nvec)
-       REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
-       COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
-       !
-       INTEGER :: n, np, info
-       INTEGER, ALLOCATABLE :: conv_idx(:)
-       INTEGER, DEVICE, POINTER :: conv_idx_d(:)
-       COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
-       !
-       np = 0
-       ALLOCATE(conv_idx(nvec))
-       DO n = 1, nvec
-          conv_idx(n) = -1
-          IF ( .NOT. conv(n) ) THEN
-             np = np + 1
-             conv_idx(n) = np
-          END IF
-       END DO
-
-       IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
-
-       CALL buffer%lock_buffer(conv_idx_d, nvec, info)
-       CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
-
-       conv_idx_d(1:nvec) = conv_idx(1:nvec)
-
-!$cuf kernel do(2) <<<*,*>>>
-       DO j=1,nvec
-          DO k=1,nvecx
-             vtmp_d(k,j) = v_d(k,j)
-          END DO
-       END DO
-
-!$cuf kernel do(2) <<<*,*>>>
-       DO j=1,nvec
-          DO k=1,nvecx
-             IF(conv_idx_d(j) /= -1) THEN
-               v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
-               IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
-             END IF
-          END DO
-       END DO
-       !
-       CALL buffer%release_buffer(conv_idx_d, info)
-       CALL buffer%release_buffer(vtmp_d, info)
-       !
-       DEALLOCATE(conv_idx)
-    END SUBROUTINE
-
 END SUBROUTINE cegterg_gpu
+
+SUBROUTINE reorder_evals_cevecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
+   USE david_param,   ONLY : DP
+   USE david_buffer,  ONLY : buffer
+   implicit none
+   INTEGER, INTENT(IN) :: nbase, nvec, nvecx
+   LOGICAL, INTENT(IN) :: conv(nvec)
+   REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
+   COMPLEX(DP), DEVICE :: v_d(nvecx,nvecx)
+   !
+   INTEGER :: j, k, n, np, info
+   INTEGER, ALLOCATABLE :: conv_idx(:)
+   INTEGER, DEVICE, POINTER :: conv_idx_d(:)
+   COMPLEX(DP), DEVICE, POINTER :: vtmp_d(:,:)
+   !
+   np = 0
+   ALLOCATE(conv_idx(nvec))
+   DO n = 1, nvec
+      conv_idx(n) = -1
+      IF ( .NOT. conv(n) ) THEN
+         np = np + 1
+         conv_idx(n) = np
+      END IF
+   END DO
+
+   IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
+
+   CALL buffer%lock_buffer(conv_idx_d, nvec, info)
+   CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
+
+   conv_idx_d(1:nvec) = conv_idx(1:nvec)
+
+!$cuf kernel do(2) <<<*,*>>>
+   DO j=1,nvec
+      DO k=1,nvecx
+         vtmp_d(k,j) = v_d(k,j)
+      END DO
+   END DO
+
+!$cuf kernel do(2) <<<*,*>>>
+   DO j=1,nvec
+      DO k=1,nvecx
+         IF(conv_idx_d(j) /= -1) THEN
+           v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
+           IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
+         END IF
+      END DO
+   END DO
+   !
+   CALL buffer%release_buffer(conv_idx_d, info)
+   CALL buffer%release_buffer(vtmp_d, info)
+   !
+   DEALLOCATE(conv_idx)
+END SUBROUTINE reorder_evals_cevecs
+
 !
 !  Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
 !
--- a/KS_Solvers/Davidson/regterg_gpu.f90
+++ b/KS_Solvers/Davidson/regterg_gpu.f90
@ -272,7 +272,7 @@ SUBROUTINE regterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
     !END DO
     ! ========= TO HERE, REPLACED BY =======

-     CALL reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, vr_d)
+     CALL reorder_evals_revecs(nbase, nvec, nvecx, conv, e_d, ew_d, vr_d)
     !
     nb1 = nbase + 1
     !
@ -567,62 +567,63 @@ SUBROUTINE regterg_gpu( h_psi_gpu, s_psi_gpu, uspp, g_psi_gpu, &
  !
  RETURN
  !
-  CONTAINS
-     SUBROUTINE reorder_evals_evecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
-       USE david_param,   ONLY : DP
-       USE david_buffer,  ONLY : buffer
-       implicit none
-       INTEGER, INTENT(IN) :: nbase, nvec, nvecx
-       LOGICAL, INTENT(IN) :: conv(nvec)
-       REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
-       REAL(DP), DEVICE :: v_d(nvecx,nvecx)
-       !
-       INTEGER :: n, np, info
-       INTEGER, ALLOCATABLE :: conv_idx(:)
-       INTEGER, DEVICE, POINTER :: conv_idx_d(:)
-       REAL(DP), DEVICE, POINTER :: vtmp_d(:,:)
-       !
-       np = 0
-       ALLOCATE(conv_idx(nvec))
-       DO n = 1, nvec
-          conv_idx(n) = -1
-          IF ( .NOT. conv(n) ) THEN
-             np = np + 1
-             conv_idx(n) = np
-          END IF
-       END DO
-
-       IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
-
-       CALL buffer%lock_buffer(conv_idx_d, nvec, info)
-       CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
-
-       conv_idx_d(1:nvec) = conv_idx(1:nvec)
-
-!$cuf kernel do(2) <<<*,*>>>
-       DO j=1,nvec
-          DO k=1,nvecx
-             vtmp_d(k,j) = v_d(k,j)
-          END DO
-       END DO
-
-!$cuf kernel do(2) <<<*,*>>>
-       DO j=1,nvec
-          DO k=1,nvecx
-             IF(conv_idx_d(j) /= -1) THEN
-               v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
-               IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
-             END IF
-          END DO
-       END DO
-       !
-       CALL buffer%release_buffer(conv_idx_d, info)
-       CALL buffer%release_buffer(vtmp_d, info)
-       !
-       DEALLOCATE(conv_idx)
-    END SUBROUTINE
-
 END SUBROUTINE regterg_gpu
+
+SUBROUTINE reorder_evals_revecs(nbase, nvec, nvecx, conv, e_d, ew_d, v_d)
+  USE david_param,   ONLY : DP
+  USE david_buffer,  ONLY : buffer
+  implicit none
+  INTEGER, INTENT(IN) :: nbase, nvec, nvecx
+  LOGICAL, INTENT(IN) :: conv(nvec)
+  REAL(DP), DEVICE :: e_d(nvecx), ew_d(nvecx)
+  REAL(DP), DEVICE :: v_d(nvecx,nvecx)
+  !
+  INTEGER :: j, k, n, np, info
+  INTEGER, ALLOCATABLE :: conv_idx(:)
+  INTEGER, DEVICE, POINTER :: conv_idx_d(:)
+  REAL(DP), DEVICE, POINTER :: vtmp_d(:,:)
+  !
+  np = 0
+  ALLOCATE(conv_idx(nvec))
+  DO n = 1, nvec
+     conv_idx(n) = -1
+     IF ( .NOT. conv(n) ) THEN
+        np = np + 1
+        conv_idx(n) = np
+     END IF
+  END DO
+
+  IF (.not. buffer%is_initialized) CALL buffer%init(3, info)
+
+  CALL buffer%lock_buffer(conv_idx_d, nvec, info)
+  CALL buffer%lock_buffer(vtmp_d, (/nvecx, nvecx/), info)
+
+  conv_idx_d(1:nvec) = conv_idx(1:nvec)
+
+!$cuf kernel do(2) <<<*,*>>>
+  DO j=1,nvec
+     DO k=1,nvecx
+        vtmp_d(k,j) = v_d(k,j)
+     END DO
+  END DO
+
+!$cuf kernel do(2) <<<*,*>>>
+   DO j=1,nvec
+      DO k=1,nvecx
+         IF(conv_idx_d(j) /= -1) THEN
+           v_d(k,conv_idx_d(j)) = vtmp_d(k,j)
+           IF(k==1) ew_d(nbase+conv_idx_d(j)) = e_d(j)
+         END IF
+      END DO
+   END DO
+   !
+   CALL buffer%release_buffer(conv_idx_d, info)
+   CALL buffer%release_buffer(vtmp_d, info)
+   !
+   DEALLOCATE(conv_idx)
+END SUBROUTINE reorder_evals_revecs
+
+
 !
 !  Wrapper for subroutine with distributed matrixes (written by Carlo Cavazzoni)
 !
--- a/Modules/Makefile
+++ b/Modules/Makefile
@ -162,6 +162,8 @@ w0gauss.o \
 w1gauss.o \
 deviatoric.o

+-include make.gpu
+
 TLDEPS=libfft

 all : libqemod.a
--- a/Modules/buffers_module.f90
+++ b/Modules/buffers_module.f90
@ -0,0 +1,17 @@
+!
+! Copyright (C) Quantum ESPRESSO group
+!
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!
+MODULE buffers_module
+#if defined(__CUDA)
+  USE fbuf, ONLY : buf_t
+  !
+  IMPLICIT NONE
+  !
+  TYPE(buf_t) :: buffer
+#endif
+END MODULE buffers_module
--- a/Modules/fbuf.f90
+++ b/Modules/fbuf.f90
--- a/Modules/make.gpu
+++ b/Modules/make.gpu
@ -0,0 +1,4 @@
+MODULES += \
+  fbuf.o \
+  buffers_module.o \
+  wavefunctions_gpu.o
--- a/Modules/wavefunctions_gpu.f90
+++ b/Modules/wavefunctions_gpu.f90
@ -0,0 +1,60 @@
+!
+! Copyright (C) 2002-2011 Quantum ESPRESSO group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!
+
+!=----------------------------------------------------------------------------=!
+   MODULE wavefunctions_module_gpum
+!=----------------------------------------------------------------------------=!
+     USE kinds, ONLY :  DP
+
+     IMPLICIT NONE
+     SAVE
+
+     !
+     COMPLEX(DP), DEVICE, ALLOCATABLE, TARGET :: &
+       evc_d(:,:)     ! wavefunctions in the PW basis set
+                    ! noncolinear case: first index
+                    ! is a combined PW + spin index
+     !
+     COMPLEX(DP) , DEVICE, ALLOCATABLE, TARGET :: &
+       psic_d(:), &      ! additional memory for FFT
+       psic_nc_d(:,:)    ! as above for the noncolinear case
+     !
+     
+  CONTAINS
+     !
+     ! ==== SUBS FOR EVC
+     !
+     SUBROUTINE start_sync_evc_dev()
+        use wavefunctions_module, only: evc
+        implicit none
+        if (.not. allocated(evc_d)) then 
+           allocate(evc_d, source=evc)
+           return
+        end if
+        evc_d = evc
+        !
+     END SUBROUTINE start_sync_evc_dev
+     !
+     SUBROUTINE start_sync_evc_hst()
+        use wavefunctions_module, only: evc
+        implicit none
+        evc = evc_d
+     END SUBROUTINE start_sync_evc_hst
+
+     SUBROUTINE wait_sync_evc_dev()
+        RETURN
+     END SUBROUTINE wait_sync_evc_dev
+     SUBROUTINE wait_sync_evc_hst()
+        RETURN
+     END SUBROUTINE wait_sync_evc_hst
+     
+     
+     
+!=----------------------------------------------------------------------------=!
+   END MODULE wavefunctions_module_gpum
+!=----------------------------------------------------------------------------=!
--- a/PW/src/c_bands.f90
+++ b/PW/src/c_bands.f90
@ -163,6 +163,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
  USE noncollin_module,     ONLY : noncolin, npol
  USE wavefunctions_module, ONLY : evc
  USE g_psi_mod,            ONLY : h_diag, s_diag
+  USE g_psi_mod_gpum,       ONLY : using_h_diag, using_s_diag
  USE scf,                  ONLY : v_of_0
  USE bp,                   ONLY : lelfield, evcel, evcelp, evcelm, bec_evcel,&
                                   gdir, l3dstring, efield, efield_cry
@ -208,6 +209,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
 ! In addition to the above ithe initial wfc rotation uses h_psi, and s_psi

  ALLOCATE( h_diag( npwx, npol ), STAT=ierr )
+  call using_h_diag(.true.); call using_s_diag(.true.)
  IF( ierr /= 0 ) &
     CALL errore( ' diag_bands ', ' cannot allocate h_diag ', ABS(ierr) )
  !
@ -240,6 +242,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
  CALL deallocate_bec_type ( becp )
  DEALLOCATE( s_diag )
  DEALLOCATE( h_diag )
+  call using_h_diag(.true.); call using_s_diag(.true.)
  !
  IF ( notconv > MAX( 5, nbnd / 4 ) ) THEN
     !
@ -278,6 +281,7 @@ CONTAINS
          h_diag(ig,1) = 1.D0 + g2kin(ig) + SQRT( 1.D0 + ( g2kin(ig) - 1.D0 )**2 )
          !
       END FORALL
+       call using_h_diag(.true.)
       !
       ntry = 0
       !
@ -318,6 +322,7 @@ CONTAINS
       h_diag(1:npw, 1) = g2kin(1:npw) + v_of_0
       !
       CALL usnldiag( npw, h_diag, s_diag )
+       call using_h_diag(.true.); call using_s_diag(.true.);
       !
       ntry = 0
       !
@ -438,6 +443,7 @@ CONTAINS
          h_diag(ig,:) = 1.D0 + g2kin(ig) + SQRT( 1.D0 + ( g2kin(ig) - 1.D0 )**2 )
          !
       END FORALL
+       call using_h_diag(.true.);
       !
       ntry = 0
       !
@ -482,6 +488,7 @@ CONTAINS
       END DO
       !
       CALL usnldiag( npw, h_diag, s_diag )
+       call using_h_diag(.true.); call using_s_diag(.true.);
       !
       ntry = 0
       !
--- a/PW/src/g_psi.f90
+++ b/PW/src/g_psi.f90
@ -16,6 +16,7 @@ subroutine g_psi (lda, n, m, npol, psi, e)
  !
  USE kinds
  USE g_psi_mod
+  USE g_psi_mod_gpum, ONLY : using_h_diag, using_s_diag
  implicit none
  integer :: lda, n, m, npol, ipol
  ! input: the leading dimension of psi
@ -37,6 +38,7 @@ subroutine g_psi (lda, n, m, npol, psi, e)
  ! counter on psi functions
  ! counter on G vectors
  !
+  call using_h_diag(.false.); call using_s_diag(.false.)
  call start_clock ('g_psi')
  !
 #ifdef TEST_NEW_PRECONDITIONING
--- a/PW/src/g_psi_gpu.f90
+++ b/PW/src/g_psi_gpu.f90
@ -10,6 +10,77 @@
 !-----------------------------------------------------------------------
 #if defined(__CUDA)
 subroutine g_psi_gpu (lda, n, m, npol, psi_d, e_d)
+  USE kinds
+
+  USE cudafor
+  USE g_psi_mod_gpum, ONLY : h_diag_d, s_diag_d, using_h_diag_d, using_s_diag_d
+
+  implicit none
+  integer :: lda, n, m, npol, ipol
+  ! input: the leading dimension of psi
+  ! input: the real dimension of psi
+  ! input: the number of bands
+  ! input: the number of coordinates of psi
+  ! local variable: counter of coordinates of psi
+  real(DP) :: e_d (m)
+  ! input: the eigenvectors
+  complex(DP) :: psi_d (lda, npol, m)
+  ! inp/out: the psi vector
+
+  attributes( device ) :: e_d, psi_d
+
+  !
+  !    Local variables
+  !
+  real(DP), parameter :: eps = 1.0d-4
+  ! a small number
+  real(DP) :: x, scala, denm
+  integer :: k, i
+  ! counter on psi functions
+  ! counter on G vectors
+  !
+  call using_h_diag_d(.false.)
+  call using_s_diag_d(.false.)
+  !
+  call start_clock ('g_psi')
+  !
+#ifdef TEST_NEW_PRECONDITIONING
+  scala = 1.d0
+
+!$cuf kernel do(3) <<<*,*>>>
+  do ipol=1,npol
+     do k = 1, m
+        do i = 1, n
+           x = (h_diag_d(i,ipol) - e_d(k)*s_diag_d(i,ipol))*scala
+           denm = 0.5_dp*(1.d0+x+sqrt(1.d0+(x-1)*(x-1.d0)))/scala
+           psi_d (i, ipol, k) = psi_d (i, ipol, k) / denm
+        enddo
+     enddo
+  enddo
+#else
+!$cuf kernel do(3) <<<*,*>>>
+  do ipol=1,npol
+     do k = 1, m
+        do i = 1, n
+           denm = h_diag_d (i,ipol) - e_d (k) * s_diag_d (i,ipol)
+        !
+        ! denm = g2+v(g=0) - e(k)
+        !
+           if (abs (denm) < eps) denm = sign (eps, denm)
+        !
+        ! denm = sign( max( abs(denm),eps ), denm )
+        !
+           psi_d (i, ipol, k) = psi_d (i, ipol, k) / denm
+        enddo
+     enddo
+  enddo
+#endif
+
+  call stop_clock ('g_psi')
+  return
+end subroutine g_psi_gpu
+
+subroutine g_psi_gpu_compatibility (lda, n, m, npol, psi_d, e_d)
  !-----------------------------------------------------------------------
  !
  !    This routine computes an estimate of the inverse Hamiltonian
@ -57,5 +128,5 @@ subroutine g_psi_gpu (lda, n, m, npol, psi_d, e_d)
  DEALLOCATE(psi, e )

  return
-end subroutine g_psi_gpu
+end subroutine g_psi_gpu_compatibility
 #endif 
--- a/PW/src/g_psi_mod_gpu.f90
+++ b/PW/src/g_psi_mod_gpu.f90
@ -0,0 +1,93 @@
+!
+! Copyright (C) 2001-2007 PWSCF group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!
+MODULE g_psi_mod_gpum
+  !
+  ! ... These are the variables needed in g_psi
+  !  
+  USE kinds, only : DP
+  !
+  IMPLICIT NONE
+  !
+  REAL(DP), DEVICE, ALLOCATABLE :: &
+    h_diag_d (:,:),&   ! diagonal part of the Hamiltonian
+    s_diag_d (:,:)     ! diagonal part of the overlap matrix
+  
+  LOGICAL :: h_diag_ood = .false.    ! used to flag out of date variables
+  LOGICAL :: s_diag_ood = .false.
+  LOGICAL :: h_diag_d_ood = .false.
+  LOGICAL :: s_diag_d_ood = .false.
+  !
+  CONTAINS
+  !
+  SUBROUTINE using_h_diag(writing)
+      USE g_psi_mod, ONLY : h_diag
+      implicit none
+      LOGICAL, INTENT(IN) :: writing
+      !
+      IF (h_diag_ood) THEN
+          h_diag = h_diag_d
+          h_diag_ood = .false.
+      ENDIF
+      IF (writing)    h_diag_d_ood = .true.
+  END SUBROUTINE using_h_diag
+  !
+  SUBROUTINE using_h_diag_d(writing)
+      USE g_psi_mod, ONLY : h_diag
+      implicit none
+      LOGICAL, INTENT(IN) :: writing
+      !
+      IF (.not. allocated(h_diag)) THEN
+          IF (allocated(h_diag_d)) DEALLOCATE(h_diag_d)
+          h_diag_d_ood = .false.
+          RETURN
+      END IF
+      IF (h_diag_d_ood) THEN
+          IF (.not. allocated(h_diag_d)) THEN
+              ALLOCATE(h_diag_d, SOURCE=h_diag)
+          ELSE
+              h_diag_d = h_diag
+          ENDIF
+          h_diag_d_ood = .false.
+      ENDIF
+      IF (writing)    h_diag_ood = .true.
+  END SUBROUTINE using_h_diag_d
+  !
+  SUBROUTINE using_s_diag(writing)
+      USE g_psi_mod, ONLY : s_diag
+      implicit none
+      LOGICAL, INTENT(IN) :: writing
+      !
+      IF (s_diag_ood) THEN
+          s_diag = s_diag_d
+          s_diag_ood = .false.
+      ENDIF
+      IF (writing)    s_diag_d_ood = .true.
+  END SUBROUTINE using_s_diag
+  !
+  SUBROUTINE using_s_diag_d(writing)
+      USE g_psi_mod, ONLY : s_diag
+      implicit none
+      LOGICAL, INTENT(IN) :: writing
+      !
+      IF (.not. allocated(s_diag)) THEN
+          IF (allocated(s_diag_d)) DEALLOCATE(s_diag_d)
+          s_diag_d_ood = .false.
+          RETURN
+      END IF
+      IF (s_diag_d_ood) THEN
+          IF (.not. allocated(s_diag_d)) THEN
+              ALLOCATE(s_diag_d, SOURCE=s_diag)
+          ELSE
+              s_diag_d = s_diag
+          END IF
+          s_diag_d_ood = .false.
+      ENDIF
+      IF (writing)    s_diag_ood = .true.
+  END SUBROUTINE using_s_diag_d
+  !
+END MODULE g_psi_mod_gpum
--- a/PW/src/h_psi_gpu.f90
+++ b/PW/src/h_psi_gpu.f90
@ -7,7 +7,324 @@
 !
 !----------------------------------------------------------------------------
 #if defined(__CUDA)
+
+
+
+! Copyright (C) 2002-2016 Quantum ESPRESSO group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!
+!----------------------------------------------------------------------------
 SUBROUTINE h_psi_gpu( lda, n, m, psi_d, hpsi_d )
+  !----------------------------------------------------------------------------
+  !
+  ! ... This routine computes the product of the Hamiltonian
+  ! ... matrix with m wavefunctions contained in psi
+  !
+  ! ... input:
+  ! ...    lda   leading dimension of arrays psi, spsi, hpsi
+  ! ...    n     true dimension of psi, spsi, hpsi
+  ! ...    m     number of states psi
+  ! ...    psi
+  !
+  ! ... output:
+  ! ...    hpsi  H*psi
+  !
+  ! --- Wrapper routine: performs bgrp parallelization on non-distributed bands
+  ! --- if suitable and required, calls old H\psi routine h_psi_
+  !
+  USE kinds,            ONLY : DP
+  USE noncollin_module, ONLY : npol
+  USE funct,            ONLY : exx_is_active
+  USE mp_bands,         ONLY : use_bgrp_in_hpsi, inter_bgrp_comm
+  USE mp,               ONLY : mp_sum
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(IN)      :: lda, n, m
+  COMPLEX(DP), DEVICE, INTENT(IN)  :: psi_d(lda*npol,m) 
+  COMPLEX(DP), DEVICE, INTENT(OUT) :: hpsi_d(lda*npol,m)   
+  !
+  INTEGER     :: m_start, m_end
+  !
+  CALL start_clock( 'h_psi_bgrp' ); !write (*,*) 'start h_psi_bgrp'; FLUSH(6)
+
+  ! band parallelization with non-distributed bands is performed if
+  ! 1. enabled (variable use_bgrp_in_hpsi must be set to .T.)
+  ! 2. exact exchange is not active (if it is, band parallelization is already
+  !    used in exx routines called by Hpsi)
+  ! 3. there is more than one band, otherwise there is nothing to parallelize
+  !
+  IF (use_bgrp_in_hpsi .AND. .NOT. exx_is_active() .AND. m > 1) THEN
+     ! use band parallelization here
+     hpsi_d(:,:) = (0.d0,0.d0)
+     CALL divide(inter_bgrp_comm,m,m_start,m_end)
+     ! Check if there at least one band in this band group
+     IF (m_end >= m_start) &
+        CALL h_psi__gpu( lda, n, m_end-m_start+1, psi_d(1,m_start), hpsi_d(1,m_start) )
+     CALL mp_sum(hpsi_d,inter_bgrp_comm)
+  ELSE
+     ! don't use band parallelization here
+     CALL h_psi__gpu( lda, n, m, psi_d, hpsi_d )
+  END IF
+
+  CALL stop_clock( 'h_psi_bgrp' )
+  RETURN
+  !
+END SUBROUTINE h_psi_gpu
+!
+!----------------------------------------------------------------------------
+SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
+  !----------------------------------------------------------------------------
+  !
+  ! ... This routine computes the product of the Hamiltonian
+  ! ... matrix with m wavefunctions contained in psi
+  !
+  ! ... input:
+  ! ...    lda   leading dimension of arrays psi, spsi, hpsi
+  ! ...    n     true dimension of psi, spsi, hpsi
+  ! ...    m     number of states psi
+  ! ...    psi
+  !
+  ! ... output:
+  ! ...    hpsi  H*psi
+  !
+  USE kinds,    ONLY : DP
+  USE bp,       ONLY : lelfield,l3dstring,gdir, efield, efield_cry
+  USE becmod,   ONLY : bec_type, becp, calbec
+  USE lsda_mod, ONLY : current_spin
+  USE scf,      ONLY : vrs
+  USE wvfct,    ONLY : g2kin
+  USE uspp,     ONLY : vkb, nkb
+  USE ldaU,     ONLY : lda_plus_u, U_projection
+  USE gvect,    ONLY : gstart
+  USE funct,    ONLY : dft_is_meta
+  USE control_flags,    ONLY : gamma_only
+  USE noncollin_module, ONLY: npol, noncolin
+  USE realus,   ONLY : real_space, &
+                       invfft_orbital_gamma, fwfft_orbital_gamma, calbec_rs_gamma, add_vuspsir_gamma, & 
+                       invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, add_vuspsir_k, & 
+                       v_loc_psir_inplace
+  USE fft_base, ONLY : dffts
+  USE exx,      ONLY : use_ace, vexx, vexxace_gamma, vexxace_k
+  USE funct,    ONLY : exx_is_active
+  USE fft_helper_subroutines
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(IN)     :: lda, n, m
+  COMPLEX(DP), DEVICE, INTENT(IN)  :: psi_d(lda*npol,m) 
+  COMPLEX(DP), DEVICE, INTENT(OUT) :: hpsi_d(lda*npol,m)   
+  !
+  REAL(DP), DEVICE, ALLOCATABLE :: g2kin_d(:), vrs_d(:,:)
+  COMPLEX(DP), ALLOCATABLE :: psi_host(:,:)
+  COMPLEX(DP), ALLOCATABLE :: hpsi_host(:,:)
+  !
+  INTEGER     :: ipol, ibnd, incr, i
+  REAL(dp)    :: ee
+  !
+  LOGICAL     :: need_host_copy
+  !
+  CALL start_clock( 'h_psi' ); !write (*,*) 'start h_psi';FLUSH(6)
+  ALLOCATE(g2kin_d, SOURCE=g2kin)
+  ALLOCATE(vrs_d, SOURCE=vrs)
+  hpsi_d (:, 1:m) = (0.0_dp, 0.0_dp)
+
+  need_host_copy = ( real_space .and. nkb > 0  ) .OR. &
+                    dft_is_meta() .OR. &
+                    (lda_plus_u .AND. U_projection.NE."pseudo" ) .OR. &
+                    ( nkb > 0 .AND. .NOT. real_space) .OR. &
+                    exx_is_active() .OR. lelfield
+
+  if (need_host_copy) then
+      ALLOCATE(psi_host(lda*npol,m) , hpsi_host(lda*npol,m) )
+      psi_host = psi_d
+      hpsi_host = hpsi_d ! this is not needed
+  end if
+
+  CALL start_clock( 'h_psi:pot' ); !write (*,*) 'start h_pot';FLUSH(6)
+  !
+  ! ... Here the product with the local potential V_loc psi
+  !
+  IF ( gamma_only ) THEN
+     ! 
+     IF ( real_space .and. nkb > 0  ) then 
+        !
+        ! ... real-space algorithm
+        ! ... fixme: real_space without beta functions does not make sense
+        !
+        IF ( dffts%has_task_groups ) then 
+           incr = 2 * fftx_ntgrp(dffts)
+        ELSE
+           incr = 2
+        ENDIF
+        DO ibnd = 1, m, incr
+           ! ... transform psi to real space -> psic 
+           CALL invfft_orbital_gamma(psi_host,ibnd,m) 
+           ! ... compute becp%r = < beta|psi> from psic in real space
+     CALL start_clock( 'h_psi:calbec' ) 
+           CALL calbec_rs_gamma(ibnd,m,becp%r) 
+     CALL stop_clock( 'h_psi:calbec' )
+           ! ... psic -> vrs * psic (psic overwritten will become hpsi)
+           CALL v_loc_psir_inplace(ibnd,m) 
+           ! ... psic (hpsi) -> psic + vusp
+           CALL  add_vuspsir_gamma(ibnd,m)
+           ! ... transform psic back in reciprocal space and assign it to hpsi
+           CALL fwfft_orbital_gamma(hpsi_host,ibnd,m) 
+        END DO
+        hpsi_d = hpsi_host
+        !
+     ELSE
+        ! ... usual reciprocal-space algorithm
+        CALL vloc_psi_gamma_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
+        !
+     ENDIF 
+     !
+  ELSE IF ( noncolin ) THEN 
+     !
+     CALL vloc_psi_nc_gpu ( lda, n, m, psi_d, vrs_d, hpsi_d )
+     !
+  ELSE  
+     ! 
+     IF ( real_space .and. nkb > 0  ) then 
+        !
+        ! ... real-space algorithm
+        ! ... fixme: real_space without beta functions does not make sense
+        !
+        IF ( dffts%has_task_groups ) then 
+           incr = fftx_ntgrp(dffts)
+        ELSE
+           incr = 1
+        ENDIF
+        DO ibnd = 1, m
+           ! ... transform psi to real space -> psic 
+           CALL invfft_orbital_k(psi_host,ibnd,m) 
+           ! ... compute becp%r = < beta|psi> from psic in real space
+     CALL start_clock( 'h_psi:calbec' )
+           CALL calbec_rs_k(ibnd,m) 
+     CALL stop_clock( 'h_psi:calbec' )
+           ! ... psic -> vrs * psic (psic overwritten will become hpsi)
+           CALL v_loc_psir_inplace(ibnd,m) 
+           ! ... psic (hpsi) -> psic + vusp
+           CALL  add_vuspsir_k(ibnd,m)
+           ! ... transform psic back in reciprocal space and assign it to hpsi
+           CALL fwfft_orbital_k(hpsi_host,ibnd,m) 
+        END DO
+        IF (need_host_copy) hpsi_d = hpsi_host
+        !
+     ELSE
+        !
+        CALL vloc_psi_k_gpu ( lda, n, m, psi_d, vrs_d(1,current_spin), hpsi_d )
+        !
+     END IF
+     !
+  END IF  
+  !
+  ! ... Here the product with the non local potential V_NL psi
+  ! ... (not in the real-space case: it is done together with V_loc)
+  !
+  IF ( nkb > 0 .AND. .NOT. real_space) THEN
+     !
+     CALL start_clock( 'h_psi:calbec' )
+     CALL calbec ( n, vkb, psi_host, becp, m )
+     CALL stop_clock( 'h_psi:calbec' )
+     hpsi_host = hpsi_d
+     CALL add_vuspsi( lda, n, m, hpsi_host )
+     hpsi_d = hpsi_host
+     !
+  END IF
+  !  
+  CALL stop_clock( 'h_psi:pot' )
+  !
+  ! ... Here we add the kinetic energy (k+G)^2 psi
+  !
+  !$cuf kernel do(2)
+  DO ibnd = 1, m
+     DO i=1, n
+        hpsi_d (i, ibnd) = hpsi_d(i, ibnd) + g2kin_d (i) * psi_d (i, ibnd)
+        IF ( noncolin ) THEN
+           hpsi_d (lda+i, ibnd) = hpsi_d(lda+i,ibnd) + g2kin_d (i) * psi_d (lda+i, ibnd)
+        END IF
+     END DO
+  END DO
+  !
+  if (dft_is_meta()) then
+     hpsi_host = hpsi_d
+     call h_psi_meta (lda, n, m, psi_host, hpsi_host)
+     hpsi_d = hpsi_host
+  end if
+  !
+  ! ... Here we add the Hubbard potential times psi
+  !
+  IF ( lda_plus_u .AND. U_projection.NE."pseudo" ) THEN
+     !
+     hpsi_host = hpsi_d
+     IF (noncolin) THEN
+        CALL vhpsi_nc( lda, n, m, psi_host, hpsi_host )
+     ELSE
+        call vhpsi( lda, n, m, psi_host, hpsi_host )
+     ENDIF
+     hpsi_d = hpsi_host
+     !
+  ENDIF
+  !
+  ! ... Here the exact-exchange term Vxx psi
+  !
+  IF ( exx_is_active() ) THEN
+     hpsi_host = hpsi_d
+     IF ( use_ace) THEN
+        IF (gamma_only) THEN
+           CALL vexxace_gamma(lda,m,psi_host,ee,hpsi_host)
+        ELSE
+           CALL vexxace_k(lda,m,psi_host,ee,hpsi_host) 
+        END IF
+     ELSE
+        CALL vexx( lda, n, m, psi_host, hpsi_host, becp )
+     END IF
+     hpsi_d = hpsi_host
+  END IF
+  !
+  ! ... electric enthalpy if required
+  !
+  IF ( lelfield ) THEN
+     !
+     hpsi_host = hpsi_d
+     IF ( .NOT.l3dstring ) THEN
+        CALL h_epsi_her_apply( lda, n, m, psi_host, hpsi_host,gdir, efield )
+     ELSE
+        DO ipol=1,3
+           CALL h_epsi_her_apply( lda, n, m, psi_host, hpsi_host,ipol,efield_cry(ipol) )
+        END DO
+     END IF
+     hpsi_d = hpsi_host
+     !
+  END IF
+  !
+  ! ... With Gamma-only trick, Im(H*psi)(G=0) = 0 by definition,
+  ! ... but it is convenient to explicitly set it to 0 to prevent trouble
+  !
+  IF ( gamma_only .AND. gstart == 2 ) then
+      !$cuf kernel do(1)
+      do i=1,m
+         hpsi_d(1,i) = CMPLX( DBLE( hpsi_d(1,i) ), 0.D0 ,kind=DP)
+      end do
+  end if
+  !
+  if (need_host_copy) then
+      DEALLOCATE(psi_host , hpsi_host )
+  end if
+  DEALLOCATE(g2kin_d, vrs_d)
+  CALL stop_clock( 'h_psi' )
+  !
+  RETURN
+  !
+END SUBROUTINE h_psi__gpu
+
+
+
+SUBROUTINE h_psi_gpu_compatibility( lda, n, m, psi_d, hpsi_d )
  USE kinds,            ONLY : DP
  USE noncollin_module, ONLY : npol
  USE cudafor
@ -31,5 +348,5 @@ SUBROUTINE h_psi_gpu( lda, n, m, psi_d, hpsi_d )
  
  DEALLOCATE(psi, hpsi )
  
-END SUBROUTINE h_psi_gpu
+END SUBROUTINE h_psi_gpu_compatibility
 #endif
--- a/PW/src/init_run.f90
+++ b/PW/src/init_run.f90
@ -33,8 +33,10 @@ SUBROUTINE init_run()
  USE esm,                ONLY : do_comp_esm, esm_init
  USE tsvdw_module,       ONLY : tsvdw_initialize
  USE Coul_cut_2D,        ONLY : do_cutoff_2D, cutoff_fact 
+  USE buffers_module,     ONLY : buffer
  !
  IMPLICIT NONE
+  INTEGER :: ierr
  !
  CALL start_clock( 'init_run' )
  !
@ -51,6 +53,10 @@ SUBROUTINE init_run()
  CALL summary()
  CALL memory_report()
  !
+  ! ... allocate GPU buffers
+  !
+  CALL buffer%init(10, ierr)
+  !
  ! ... allocate memory for G- and R-space fft arrays
  !
  CALL allocate_fft()
--- a/PW/src/make.gpu
+++ b/PW/src/make.gpu
@ -1,4 +1,8 @@
 PWLIBS += \
+  g_psi_mod_gpu.o \
  g_psi_gpu.o \
  h_psi_gpu.o \
-  s_psi_gpu.o
+  s_psi_gpu.o \
+  vloc_psi_gpu.o
+
+c_bands.o : g_psi_mod_gpu.o
--- a/PW/src/pwcom.f90
+++ b/PW/src/pwcom.f90
@ -212,6 +212,9 @@ MODULE wvfct
       et(:,:),          &! eigenvalues of the hamiltonian
       wg(:,:),          &! the weight of each k point and band
       g2kin(:)           ! kinetic energy
+#if defined(__CUDA)
+       attributes(pinned) :: g2kin
+#endif
  INTEGER, ALLOCATABLE :: &
       btype(:,:)         ! one if the corresponding state has to be
                          ! converged to full accuracy, zero otherwise
--- a/PW/src/s_psi_gpu.f90
+++ b/PW/src/s_psi_gpu.f90
@ -8,7 +8,460 @@
 !
 !----------------------------------------------------------------------------
 #if defined(__CUDA)
+
+!@njs: s_psi, psi, spsi, s_psi_, s_psi_k, s_psi_gamma, s_psi_nc
 SUBROUTINE s_psi_gpu( lda, n, m, psi_d, spsi_d )
+  !----------------------------------------------------------------------------
+  !
+  ! ... This routine applies the S matrix to m wavefunctions psi
+  ! ... and puts the results in spsi.
+  ! ... Requires the products of psi with all beta functions
+  ! ... in array becp(nkb,m) (calculated in h_psi or by calbec)
+  !
+  ! ... input:
+  !
+  ! ...    lda   leading dimension of arrays psi, spsi
+  ! ...    n     true dimension of psi, spsi
+  ! ...    m     number of states psi
+  ! ...    psi
+  !
+  ! ... output:
+  !
+  ! ...    spsi  S*psi
+  !
+  ! --- Wrapper routine: performs bgrp parallelization on non-distributed bands
+  ! --- if suitable and required, calls old S\psi routine s_psi_
+  ! --- See comments in h_psi.f90 about band parallelization
+  !
+  USE kinds,            ONLY : DP
+  USE noncollin_module, ONLY : npol
+  USE funct,            ONLY : exx_is_active
+  USE mp_bands,         ONLY : use_bgrp_in_hpsi, inter_bgrp_comm
+  USE mp,               ONLY : mp_sum
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(IN) :: lda, n, m
+  COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
+  COMPLEX(DP), DEVICE, INTENT(OUT)::spsi_d(lda*npol,m)
+  !
+  INTEGER     :: m_start, m_end, i
+  !
+  CALL start_clock( 's_psi_bgrp' )
+
+  IF (use_bgrp_in_hpsi .AND. .NOT. exx_is_active() .AND. m > 1) THEN
+     ! use band parallelization here
+     spsi_d(:,:) = (0.d0,0.d0)
+     CALL divide(inter_bgrp_comm,m,m_start,m_end)
+     !write(6,*) m, m_start,m_end
+     ! Check if there at least one band in this band group
+     IF (m_end >= m_start) &
+        CALL s_psi__gpu( lda, n, m_end-m_start+1, psi_d(1,m_start), spsi_d(1,m_start) )
+     CALL mp_sum(spsi_d,inter_bgrp_comm)
+  ELSE
+     ! don't use band parallelization here
+     CALL s_psi__gpu( lda, n, m, psi_d, spsi_d )
+  END IF
+
+  CALL stop_clock( 's_psi_bgrp' )
+  RETURN
+
+END SUBROUTINE s_psi_gpu
+!
+!----------------------------------------------------------------------------
+SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
+  !----------------------------------------------------------------------------
+  !
+  ! ... This routine applies the S matrix to m wavefunctions psi
+  ! ... and puts the results in spsi.
+  ! ... Requires the products of psi with all beta functions
+  ! ... in array becp(nkb,m) (calculated in h_psi or by calbec)
+  !
+  ! ... input:
+  !
+  ! ...    lda   leading dimension of arrays psi, spsi
+  ! ...    n     true dimension of psi, spsi
+  ! ...    m     number of states psi
+  ! ...    psi
+  !
+  ! ... output:
+  !
+  ! ...    spsi  S*psi
+  !
+  USE cudafor
+  USE cublas
+  USE buffers_module, ONLY : buffer
+  USE kinds,      ONLY : DP
+  USE becmod,     ONLY : becp
+  USE uspp,       ONLY : vkb, nkb, okvan, qq_at, qq_so, indv_ijkb0
+  USE spin_orb,   ONLY : lspinorb
+  USE uspp_param, ONLY : upf, nh, nhm
+  USE ions_base,  ONLY : nat, nsp, ityp
+  USE control_flags,    ONLY: gamma_only 
+  USE noncollin_module, ONLY: npol, noncolin
+  USE realus,     ONLY :  real_space, &
+                  invfft_orbital_gamma, fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma, &
+                  invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, s_psir_k
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(IN) :: lda, n, m
+  COMPLEX(DP), DEVICE, INTENT(IN) :: psi_d(lda*npol,m)
+  COMPLEX(DP), DEVICE, INTENT(OUT)::spsi_d(lda*npol,m)
+  !
+  COMPLEX(DP), ALLOCATABLE :: psi_host(:,:)
+  COMPLEX(DP), ALLOCATABLE ::spsi_host(:,:)
+  !
+  INTEGER :: ibnd
+  !
+  LOGICAL :: need_host_copy
+  !
+  ! ... initialize  spsi
+  !
+  spsi_d = psi_d
+  !
+  IF ( nkb == 0 .OR. .NOT. okvan ) RETURN
+  !
+  need_host_copy = real_space
+  IF (need_host_copy) THEN
+      ALLOCATE(psi_host(lda*npol,m), spsi_host(lda*npol,m))
+      psi_host  = psi_d
+      spsi_host = spsi_d
+  END IF
+  !
+  CALL start_clock( 's_psi' )  
+  !
+  ! ... The product with the beta functions
+  !
+  IF ( gamma_only ) THEN
+     !
+     IF (real_space ) THEN
+        !
+        DO ibnd = 1, m, 2
+           !   transform the orbital to real space
+           CALL invfft_orbital_gamma(psi_host,ibnd,m) 
+           CALL s_psir_gamma(ibnd,m)
+           CALL fwfft_orbital_gamma(spsi_host,ibnd,m)
+        END DO
+        spsi_d = spsi_host
+        !
+     ELSE
+        !
+        CALL s_psi_gamma_gpu()
+        !
+     END IF
+     !
+  ELSE IF ( noncolin ) THEN
+     !
+     CALL s_psi_nc_gpu()
+     !
+  ELSE 
+     !
+     IF (real_space ) THEN
+        !
+        DO ibnd = 1, m
+           !   transform the orbital to real space
+           CALL invfft_orbital_k(psi_host,ibnd,m) 
+           CALL s_psir_k(ibnd,m)
+           CALL fwfft_orbital_k(spsi_host,ibnd,m)
+           spsi_d = spsi_host
+        END DO
+        !
+     ELSE
+        !
+        CALL s_psi_k_gpu()
+        !
+     END IF    
+     !
+  END IF    
+  !
+  CALL stop_clock( 's_psi' )
+  !
+  RETURN
+  !
+  CONTAINS
+     !
+     !-----------------------------------------------------------------------
+     SUBROUTINE s_psi_gamma_gpu()
+       !-----------------------------------------------------------------------
+       ! 
+       ! ... gamma version
+       !
+       USE mp, ONLY: mp_get_comm_null, mp_circular_shift_left
+       !
+       IMPLICIT NONE  
+       !
+       ! ... here the local variables
+       !
+       INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ierr
+         ! counters
+       INTEGER :: nproc, mype, m_loc, m_begin, ibnd_loc, icyc, icur_blk, m_max
+         ! data distribution indexes
+       INTEGER, EXTERNAL :: ldim_block, gind_block
+         ! data distribution functions
+       REAL(DP), ALLOCATABLE :: ps(:,:)
+       REAL(DP), POINTER, CONTIGUOUS, DEVICE :: ps_d(:,:)
+       COMPLEX(DP), POINTER, CONTIGUOUS, DEVICE :: vkb_d(:,:)
+         ! the product vkb and psi
+       !
+       IF( becp%comm == mp_get_comm_null() ) THEN
+          nproc   = 1
+          mype    = 0
+          m_loc   = m
+          m_begin = 1
+          m_max   = m
+       ELSE
+          !
+          ! becp(l,i) = <beta_l|psi_i>, with vkb(n,l)=|beta_l>
+          ! in this case becp(l,i) are distributed (index i is)
+          !
+          nproc   = becp%nproc
+          mype    = becp%mype
+          m_loc   = becp%nbnd_loc
+          m_begin = becp%ibnd_begin
+          m_max   = SIZE( becp%r, 2 )
+          IF( ( m_begin + m_loc - 1 ) > m ) m_loc = m - m_begin + 1
+       END IF
+       !
+       ALLOCATE( ps( nkb, m_max ), STAT=ierr )
+       CALL buffer%lock_buffer(ps_d, (/ nkb, m_max /), ierr)
+       CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
+       
+       IF( ierr /= 0 ) &
+          CALL errore( ' s_psi_gamma ', ' cannot allocate memory (ps) ', ABS(ierr) )
+       !    
+       ps(:,:) = 0.D0
+       !
+       !   In becp=<vkb_i|psi_j> terms corresponding to atom na of type nt
+       !   run from index i=indv_ijkb0(na)+1 to i=indv_ijkb0(na)+nh(nt)
+       !
+       DO nt = 1, nsp
+          IF ( upf(nt)%tvanp ) THEN
+             DO na = 1, nat
+                IF ( ityp(na) == nt ) THEN
+                   !
+                   ! Next operation computes ps(l',i)=\sum_m qq(l,m) becp(m',i)
+                   ! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt))
+                   !
+                   IF ( m_loc > 0 ) THEN
+                      CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, &
+                                  qq_at(1,1,na), nhm, becp%r(indv_ijkb0(na)+1,1),&
+                                  nkb, 0.0_dp, ps(indv_ijkb0(na)+1,1), nkb )
+                   END IF
+                END IF
+             END DO
+          END IF
+       END DO
+       ps_d = ps
+       vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
+       !
+       IF( becp%comm == mp_get_comm_null() ) THEN
+          IF ( m == 1 ) THEN
+             CALL myDGEMV( 'N', 2 * n, nkb, 1.D0, vkb_d, &
+                  2 * lda, ps_d, 1, 1.D0, spsi_d, 1 )
+          ELSE
+             CALL cublasDGEMM( 'N', 'N', 2 * n, m, nkb, 1.D0, vkb_d, &
+                  2 * lda, ps_d, nkb, 1.D0, spsi_d, 2 * lda )
+          END IF
+       ELSE
+          !
+          ! parallel block multiplication of vkb and ps
+          !
+          icur_blk = mype
+          !
+          DO icyc = 0, nproc - 1
+
+             m_loc   = ldim_block( becp%nbnd , nproc, icur_blk )
+             m_begin = gind_block( 1,  becp%nbnd, nproc, icur_blk )
+
+             IF( ( m_begin + m_loc - 1 ) > m ) m_loc = m - m_begin + 1
+
+             IF( m_loc > 0 ) THEN
+                CALL cublasDGEMM( 'N', 'N', 2 * n, m_loc, nkb, 1.D0, vkb_d, &
+                            2 * lda, ps_d, nkb, 1.D0, spsi_d( 1, m_begin ), 2 * lda )
+             END IF
+
+             ! block rotation
+             !
+             CALL mp_circular_shift_left( ps_d, icyc, becp%comm )
+
+             icur_blk = icur_blk + 1
+             IF( icur_blk == nproc ) icur_blk = 0
+
+          END DO
+          !
+       END IF
+       !
+       DEALLOCATE( ps ) 
+       CALL buffer%release_buffer(ps_d, ierr)
+       CALL buffer%release_buffer(vkb_d, ierr)
+       !
+       RETURN
+       !
+     END SUBROUTINE s_psi_gamma_gpu
+     !
+     !-----------------------------------------------------------------------
+     SUBROUTINE s_psi_k_gpu()
+       !-----------------------------------------------------------------------
+       !
+       ! ... k-points version
+       !
+       IMPLICIT NONE
+       !
+       ! ... local variables
+       !
+       INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ierr
+         ! counters
+       COMPLEX(DP), ALLOCATABLE :: ps(:,:), qqc(:,:)
+       COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: ps_d(:,:)
+       COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: vkb_d(:,:)
+         ! ps = product vkb and psi ; qqc = complex version of qq
+       !
+       ALLOCATE( ps( nkb, m ), STAT=ierr )
+       !
+       IF( ierr /= 0 ) &
+          CALL errore( ' s_psi_k ', ' cannot allocate memory (ps) ', ABS(ierr) )
+       CALL buffer%lock_buffer(ps_d, (/ nkb, m /), ierr)
+       CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
+       !ALLOCATE(vkb_d, SOURCE=vkb)
+
+       !
+       ps(:,:) = ( 0.D0, 0.D0 )
+       !
+       DO nt = 1, nsp
+          IF ( upf(nt)%tvanp ) THEN
+             ! qq is real:  copy it into a complex variable to perform
+             ! a zgemm - simple but sub-optimal solution
+             ALLOCATE( qqc(nh(nt),nh(nt)) )
+             DO na = 1, nat
+                IF ( ityp(na) == nt ) THEN
+                   qqc(:,:) = CMPLX ( qq_at(1:nh(nt),1:nh(nt),na), 0.0_dp, KIND=dp )
+                   CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), &
+                        qqc, nh(nt), becp%k(indv_ijkb0(na)+1,1), nkb, &
+                        (0.0_dp,0.0_dp), ps(indv_ijkb0(na)+1,1), nkb )
+                   !
+                END IF
+             END DO
+             DEALLOCATE (qqc)
+          END IF
+       END DO
+       ps_d = ps
+       vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
+       !
+       IF ( m == 1 ) THEN
+          !
+          CALL ZGEMV( 'N', n, nkb, ( 1.D0, 0.D0 ), vkb_d, &
+                      lda, ps_d, 1, ( 1.D0, 0.D0 ), spsi_d, 1 )
+          !
+       ELSE
+          !
+          CALL ZGEMM( 'N', 'N', n, m, nkb, ( 1.D0, 0.D0 ), vkb_d, &
+                      lda, ps_d, nkb, ( 1.D0, 0.D0 ), spsi_d, lda )
+          !
+       END IF
+       !
+       DEALLOCATE( ps )
+       CALL buffer%release_buffer(ps_d, ierr)
+       CALL buffer%release_buffer(vkb_d, ierr)
+       !DEALLOCATE(vkb_d)
+       !
+       RETURN
+       !
+     END SUBROUTINE s_psi_k_gpu     
+     !
+     !
+     !-----------------------------------------------------------------------
+      SUBROUTINE s_psi_nc_gpu ( )
+     !-----------------------------------------------------------------------
+       !
+       ! ... k-points noncolinear/spinorbit version
+       !
+       IMPLICIT NONE
+       !
+       !    here the local variables
+       !
+       INTEGER :: ikb, jkb, ih, jh, na, nt, ibnd, ipol, ierr
+       ! counters
+       COMPLEX (DP), ALLOCATABLE :: ps (:,:,:)
+       COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: ps_d(:,:,:)
+       COMPLEX(DP), DEVICE, CONTIGUOUS, POINTER :: vkb_d(:,:)
+       ! the product vkb and psi
+       !
+       ALLOCATE (ps(nkb,npol,m),STAT=ierr)    
+       IF( ierr /= 0 ) &
+          CALL errore( ' s_psi_nc ', ' cannot allocate memory (ps) ', ABS(ierr) )
+
+
+       ps(:,:,:) = (0.D0,0.D0)
+       !
+       DO nt = 1, nsp
+          !
+          IF ( upf(nt)%tvanp ) THEN
+             !
+             DO na = 1, nat
+                IF ( ityp(na) == nt ) THEN
+                   DO ih = 1,nh(nt)
+                      ikb = indv_ijkb0(na) + ih
+                      DO jh = 1, nh (nt)
+                         jkb = indv_ijkb0(na) + jh
+                         IF ( .NOT. lspinorb ) THEN
+                            DO ipol=1,npol
+                               DO ibnd = 1, m
+                                  ps(ikb,ipol,ibnd) = ps(ikb,ipol,ibnd) + &
+                                       qq_at(ih,jh,na)*becp%nc(jkb,ipol,ibnd)
+                               END DO
+                            END DO
+                         ELSE
+                            DO ibnd = 1, m
+                               ps(ikb,1,ibnd)=ps(ikb,1,ibnd) + &
+                                    qq_so(ih,jh,1,nt)*becp%nc(jkb,1,ibnd)+ &
+                                    qq_so(ih,jh,2,nt)*becp%nc(jkb,2,ibnd)
+                               ps(ikb,2,ibnd)=ps(ikb,2,ibnd) + &
+                                    qq_so(ih,jh,3,nt)*becp%nc(jkb,1,ibnd)+ &
+                                    qq_so(ih,jh,4,nt)*becp%nc(jkb,2,ibnd)
+                            END DO
+                         END IF
+                      END DO
+                   END DO
+                END IF
+             END DO
+          END IF
+       END DO
+       CALL buffer%lock_buffer(ps_d, (/ nkb,npol,m /), ierr)
+       CALL buffer%lock_buffer(vkb_d, (/ lda, nkb /), ierr)
+       
+       ps_d = ps
+       vkb_d(1:lda, 1:nkb) = vkb(1:lda,1:nkb)
+       
+       
+       call ZGEMM ('N', 'N', n, m*npol, nkb, (1.d0, 0.d0) , vkb_d, &
+          lda, ps_d, nkb, (1.d0, 0.d0) , spsi_d(1,1), lda)
+
+       DEALLOCATE(ps)
+       CALL buffer%release_buffer(ps_d, ierr)
+       CALL buffer%release_buffer(vkb_d, ierr)
+
+       RETURN
+
+    END SUBROUTINE s_psi_nc_gpu
+
+END SUBROUTINE s_psi__gpu
+!@nje
+
+SUBROUTINE myDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
+    use cudafor
+    use cublas
+    implicit none
+    DOUBLE PRECISION ALPHA,BETA
+    INTEGER INCX,INCY,LDA,M,N
+    CHARACTER TRANS
+    DOUBLE PRECISION, DEVICE :: A(LDA,*),X(*),Y(*)
+    !
+    call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
+    !
+END SUBROUTINE myDGEMV
+
+SUBROUTINE s_psi_gpu_compatibility( lda, n, m, psi_d, spsi_d )
  USE kinds,            ONLY : DP
  USE noncollin_module, ONLY : npol
  USE cudafor
@ -32,5 +485,5 @@ SUBROUTINE s_psi_gpu( lda, n, m, psi_d, spsi_d )
  
  DEALLOCATE(psi, spsi )
  
-END SUBROUTINE s_psi_gpu
+END SUBROUTINE s_psi_gpu_compatibility
 #endif
--- a/PW/src/scf_mod.f90
+++ b/PW/src/scf_mod.f90
@ -72,6 +72,9 @@ MODULE scf
       vrs(:,:),       &! the total pot. in real space (smooth grid)
       rho_core(:),    &! the core charge in real space
       kedtau(:,:)      ! position dependent kinetic energy enhancement factor
+#if defined(__CUDA)
+       attributes(pinned) :: vrs
+#endif
  COMPLEX(DP), ALLOCATABLE :: &
       rhog_core(:)     ! the core charge in reciprocal space

--- a/PW/src/vloc_psi_gpu.f90
+++ b/PW/src/vloc_psi_gpu.f90
@ -0,0 +1,623 @@
+!
+! Copyright (C) 2003-2013 PWSCF group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+#if defined(__CUDA)
+!@njs: vloc_psi_gamma, psi, v, hpsi, psic
+!-----------------------------------------------------------------------
+SUBROUTINE vloc_psi_gamma_gpu(lda, n, m, psi_d, v_d, hpsi_d)
+  !-----------------------------------------------------------------------
+  !
+  ! Calculation of Vloc*psi using dual-space technique - Gamma point
+  !
+  USE parallel_include
+  USE kinds,   ONLY : DP
+  USE mp_bands,      ONLY : me_bgrp
+  USE fft_base,      ONLY : dffts
+  USE fft_interfaces,ONLY : fwfft, invfft
+  USE wavefunctions_module,       ONLY: psic_h => psic
+  USE wavefunctions_module_gpum,       ONLY: psic_d
+  USE fft_helper_subroutines
+  USE buffers_module,       ONLY : buffer
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(in) :: lda, n, m
+  COMPLEX(DP), DEVICE, INTENT(in)   :: psi_d (lda, m)
+  COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda, m)
+  REAL(DP), DEVICE, INTENT(in) :: v_d(dffts%nnr)
+  !
+  INTEGER :: ibnd, j, incr, right_nnr, right_nr3, right_inc
+  COMPLEX(DP) :: fp, fm
+  !
+  LOGICAL :: use_tg
+  ! Variables for task groups
+!@njs: tg_v, tg_psic
+  REAL(DP),    DEVICE, POINTER :: tg_v_d(:)
+  COMPLEX(DP), DEVICE, POINTER :: tg_psic_d(:)
+  INTEGER,     DEVICE, POINTER :: dffts_nl_d(:), dffts_nlm_d(:)
+  INTEGER :: v_siz, idx, ioff
+  INTEGER :: ierr
+  !
+  CALL start_clock ('vloc_psi')
+  incr = 2
+  !
+  IF (.not. allocated(psic_d)) ALLOCATE(psic_d, SOURCE=psic_h)
+  use_tg = dffts%has_task_groups 
+  !
+  IF( use_tg ) THEN
+     !
+     CALL start_clock ('vloc_psi:tg_gather')
+     v_siz =  dffts%nnr_tg
+     !
+     CALL buffer%lock_buffer( tg_v_d, v_siz, ierr )
+     CALL buffer%lock_buffer( tg_psic_d, v_siz, ierr )
+     !
+     CALL tg_gather_gpu( dffts, v_d, tg_v_d )
+     CALL stop_clock ('vloc_psi:tg_gather')
+     !
+     incr = 2 * fftx_ntgrp(dffts)
+     !
+  ENDIF
+  ! Sync fft data
+  CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl), ierr )
+  CALL buffer%lock_buffer( dffts_nlm_d, size(dffts%nlm), ierr )
+  dffts_nl_d(1:n) = dffts%nl(1:n)
+  dffts_nlm_d(1:n) = dffts%nlm(1:n)
+  ! End Sync
+  
+  !
+  ! the local potential V_Loc psi. First bring psi to real space
+  !
+  DO ibnd = 1, m, incr
+     !
+     IF( use_tg ) THEN
+        !
+        CALL tg_get_nnr( dffts, right_nnr )
+        !
+        tg_psic_d = (0.d0, 0.d0)
+        ioff   = 0
+        !
+        DO idx = 1, 2*fftx_ntgrp(dffts), 2
+           IF( idx + ibnd - 1 < m ) THEN
+              !$cuf kernel do(1) <<<,>>>
+              DO j = 1, n
+                 tg_psic_d(dffts_nl_d (j)+ioff) =        psi_d(j,idx+ibnd-1) + &
+                                      (0.0d0,1.d0) * psi_d(j,idx+ibnd)
+                 tg_psic_d(dffts_nlm_d(j)+ioff) = conjg( psi_d(j,idx+ibnd-1) - &
+                                      (0.0d0,1.d0) * psi_d(j,idx+ibnd) )
+              ENDDO
+           ELSEIF( idx + ibnd - 1 == m ) THEN
+              !$cuf kernel do(1) <<<,>>>
+              DO j = 1, n
+                 tg_psic_d(dffts_nl_d (j)+ioff) =        psi_d(j,idx+ibnd-1)
+                 tg_psic_d(dffts_nlm_d(j)+ioff) = conjg( psi_d(j,idx+ibnd-1) )
+              ENDDO
+           ENDIF
+
+           ioff = ioff + right_nnr
+
+        ENDDO
+        !
+     ELSE
+        !
+        psic_d(:) = (0.d0, 0.d0)
+        IF (ibnd < m) THEN
+           ! two ffts at the same time
+           !$cuf kernel do(1) <<<,>>>
+           DO j = 1, n
+              psic_d(dffts_nl_d (j))=      psi_d(j,ibnd) + (0.0d0,1.d0)*psi_d(j,ibnd+1)
+              psic_d(dffts_nlm_d(j))=conjg(psi_d(j,ibnd) - (0.0d0,1.d0)*psi_d(j,ibnd+1))
+           ENDDO
+        ELSE
+           !$cuf kernel do(1) <<<,>>>
+           DO j = 1, n
+              psic_d (dffts_nl_d (j)) =       psi_d(j, ibnd)
+              psic_d (dffts_nlm_d(j)) = conjg(psi_d(j, ibnd))
+           ENDDO
+        ENDIF
+        !
+     ENDIF
+     !
+     !   fft to real space
+     !   product with the potential v on the smooth grid
+     !   back to reciprocal space
+     !
+     IF( use_tg ) THEN
+        !
+        CALL invfft ('tgWave', tg_psic_d, dffts )
+        !
+        CALL tg_get_group_nr3( dffts, right_nr3 )
+        !
+        !$cuf kernel do(1) <<<,>>>
+        DO j = 1, dffts%nr1x * dffts%nr2x * right_nr3
+           tg_psic_d (j) = tg_psic_d (j) * tg_v_d(j)
+        ENDDO
+        !
+        CALL fwfft ('tgWave', tg_psic_d, dffts )
+        !
+     ELSE
+        !
+        CALL invfft ('Wave', psic_d, dffts)
+        !
+        !$cuf kernel do(1) <<<,>>>
+        DO j = 1, dffts%nnr
+           psic_d (j) = psic_d (j) * v_d(j)
+        ENDDO
+        !
+        CALL fwfft ('Wave', psic_d, dffts)
+        !
+     ENDIF
+     !
+     !   addition to the total product
+     !
+     IF( use_tg ) THEN
+        !
+        ioff   = 0
+        !
+        CALL tg_get_recip_inc( dffts, right_inc )
+        !
+        DO idx = 1, 2*fftx_ntgrp(dffts), 2
+           !
+           IF( idx + ibnd - 1 < m ) THEN
+              !$cuf kernel do(1) <<<,>>>
+              DO j = 1, n
+                 fp= ( tg_psic_d( dffts_nl_d(j) + ioff ) +  &
+                       tg_psic_d( dffts_nlm_d(j) + ioff ) ) * 0.5d0
+                 fm= ( tg_psic_d( dffts_nl_d(j) + ioff ) -  &
+                       tg_psic_d( dffts_nlm_d(j) + ioff ) ) * 0.5d0
+                 hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
+                                        cmplx( dble(fp), aimag(fm),kind=DP)
+                 hpsi_d (j, ibnd+idx  ) = hpsi_d (j, ibnd+idx  ) + &
+                                        cmplx(aimag(fp),- dble(fm),kind=DP)
+              ENDDO
+           ELSEIF( idx + ibnd - 1 == m ) THEN
+              !$cuf kernel do(1) <<<,>>>
+              DO j = 1, n
+                 hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
+                                         tg_psic_d( dffts_nl_d(j) + ioff )
+              ENDDO
+           ENDIF
+           !
+           ioff = ioff + right_inc
+           !
+        ENDDO
+        !
+     ELSE
+        IF (ibnd < m) THEN
+           ! two ffts at the same time
+           !$cuf kernel do(1) <<<,>>>
+           DO j = 1, n
+              fp = (psic_d (dffts_nl_d(j)) + psic_d (dffts_nlm_d(j)))*0.5d0
+              fm = (psic_d (dffts_nl_d(j)) - psic_d (dffts_nlm_d(j)))*0.5d0
+              hpsi_d (j, ibnd)   = hpsi_d (j, ibnd)   + &
+                                 cmplx( dble(fp), aimag(fm),kind=DP)
+              hpsi_d (j, ibnd+1) = hpsi_d (j, ibnd+1) + &
+                                 cmplx(aimag(fp),- dble(fm),kind=DP)
+           ENDDO
+        ELSE
+           !$cuf kernel do(1) <<<,>>>
+           DO j = 1, n
+              hpsi_d (j, ibnd)   = hpsi_d (j, ibnd)   + psic_d (dffts_nl_d(j))
+           ENDDO
+        ENDIF
+     ENDIF
+     !
+  ENDDO
+  !
+  IF( use_tg ) THEN
+     !
+     CALL buffer%release_buffer( tg_psic_d, ierr )
+     CALL buffer%release_buffer( tg_v_d, ierr )
+     !
+  ENDIF
+  CALL buffer%release_buffer( dffts_nl_d, ierr )
+  CALL buffer%release_buffer( dffts_nlm_d, ierr )
+  CALL stop_clock ('vloc_psi')
+  !
+  RETURN
+END SUBROUTINE vloc_psi_gamma_gpu
+!
+!@njs: vloc_psi_k
+!-----------------------------------------------------------------------
+SUBROUTINE vloc_psi_k_gpu(lda, n, m, psi_d, v_d, hpsi_d)
+  !-----------------------------------------------------------------------
+  !
+  ! Calculation of Vloc*psi using dual-space technique - k-points
+  !
+  !   fft to real space
+  !   product with the potential v on the smooth grid
+  !   back to reciprocal space
+  !   addition to the hpsi
+  !
+  USE parallel_include
+  USE kinds, ONLY : DP
+  USE wvfct, ONLY : current_k
+  USE klist, ONLY : igk_k_host => igk_k
+  USE mp_bands,      ONLY : me_bgrp
+  USE fft_base,      ONLY : dffts
+  USE fft_interfaces,ONLY : fwfft, invfft
+  USE fft_helper_subroutines
+  USE wavefunctions_module, ONLY: psic_h => psic
+  USE wavefunctions_module_gpum, ONLY: psic_d
+  USE buffers_module,       ONLY : buffer
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(in) :: lda, n, m
+  COMPLEX(DP), DEVICE, INTENT(in)   :: psi_d (lda, m)
+  COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda, m)
+  REAL(DP), DEVICE, INTENT(in) :: v_d(dffts%nnr)
+  !
+  INTEGER :: ibnd, j, incr
+  INTEGER :: i, right_nnr, right_nr3, right_inc
+  !
+  LOGICAL :: use_tg
+  ! Task Groups
+  REAL(DP),    DEVICE, POINTER :: tg_v_d(:)
+  COMPLEX(DP), DEVICE, POINTER :: tg_psic_d(:)
+  INTEGER,     DEVICE, POINTER :: dffts_nl_d(:)
+  INTEGER,     DEVICE, POINTER :: igk_k_d(:)
+  INTEGER :: v_siz, idx, ioff
+  INTEGER :: ierr
+  !
+  IF (.not. allocated(psic_d)) ALLOCATE(psic_d, SOURCE=psic_h)
+  CALL start_clock ('vloc_psi')
+  use_tg = dffts%has_task_groups 
+  !
+  IF( use_tg ) THEN
+     !
+     CALL start_clock ('vloc_psi:tg_gather')
+     v_siz =  dffts%nnr_tg
+     !
+     CALL buffer%lock_buffer( tg_v_d,    v_siz, ierr )
+     CALL buffer%lock_buffer( tg_psic_d, v_siz, ierr )
+     !
+     CALL tg_gather( dffts, v_d, tg_v_d )
+     CALL stop_clock ('vloc_psi:tg_gather')
+
+  ENDIF
+  CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl) , ierr )
+  CALL buffer%lock_buffer( igk_k_d, n, ierr )
+  !
+  dffts_nl_d = dffts%nl
+  igk_k_d(1:n) = igk_k_host(1:n, current_k)
+  !
+  IF( use_tg ) THEN
+
+     CALL tg_get_nnr( dffts, right_nnr )
+
+     DO ibnd = 1, m, fftx_ntgrp(dffts)
+        !
+        tg_psic_d = (0.d0, 0.d0)
+        ioff   = 0
+        !
+        DO idx = 1, fftx_ntgrp(dffts)
+
+           IF( idx + ibnd - 1 <= m ) THEN
+!$cuf kernel do(1) <<<,>>>
+              DO j = 1, n
+                 tg_psic_d(dffts_nl_d (igk_k_d(j))+ioff) =  psi_d(j,idx+ibnd-1)
+              ENDDO
+
+           ENDIF
+
+        !write (6,*) 'wfc G ', idx+ibnd-1
+        !write (6,99) (tg_psic(i+ioff), i=1,400)
+
+           ioff = ioff + right_nnr
+        ENDDO
+        !
+        CALL  invfft ('tgWave', tg_psic_d, dffts )
+        !write (6,*) 'wfc R ' 
+        !write (6,99) (tg_psic(i), i=1,400)
+        !
+        CALL tg_get_group_nr3( dffts, right_nr3 )
+        !
+!$cuf kernel do(1) <<<,>>>
+        DO j = 1, dffts%nr1x*dffts%nr2x* right_nr3
+           tg_psic_d (j) = tg_psic_d (j) * tg_v_d(j)
+        ENDDO
+!
+        !write (6,*) 'v psi R ' 
+        !write (6,99) (tg_psic(i), i=1,400)
+        !
+        CALL fwfft ('tgWave',  tg_psic_d, dffts )
+        !
+        !   addition to the total product
+        !
+        ioff   = 0
+        !
+        CALL tg_get_recip_inc( dffts, right_inc )
+        !
+        DO idx = 1, fftx_ntgrp(dffts)
+           !
+           IF( idx + ibnd - 1 <= m ) THEN
+!$cuf kernel do(1) <<<,>>>
+              DO j = 1, n
+                 hpsi_d (j, ibnd+idx-1) = hpsi_d (j, ibnd+idx-1) + &
+                    tg_psic_d( dffts_nl_d(igk_k_d(j)) + ioff )
+              ENDDO
+!
+           ENDIF
+           !
+        !write (6,*) 'v psi G ', idx+ibnd-1
+        !write (6,99) (tg_psic(i+ioff), i=1,400)
+
+           ioff = ioff + right_inc
+           !
+        ENDDO
+        !
+     ENDDO
+  ELSE
+     DO ibnd = 1, m
+        !
+        !!! == OPTIMIZE HERE == (setting to 0 and setting elements!)
+        psic_d(:) = (0.d0, 0.d0)
+!$cuf kernel do(1) <<<,>>>
+        DO j = 1, n
+          psic_d (dffts_nl_d (igk_k_d(j))) = psi_d(j, ibnd)
+        END DO
+!
+        !write (6,*) 'wfc G ', ibnd
+        !write (6,99) (psic(i), i=1,400)
+        !
+        CALL invfft ('Wave', psic_d, dffts)
+        !write (6,*) 'wfc R ' 
+        !write (6,99) (psic(i), i=1,400)
+        !
+!$cuf kernel do(1) <<<,>>>
+        DO j = 1, dffts%nnr
+           psic_d (j) = psic_d (j) * v_d(j)
+        ENDDO
+!
+        !write (6,*) 'v psi R ' 
+        !write (6,99) (psic(i), i=1,400)
+        !
+        CALL fwfft ('Wave', psic_d, dffts)
+        !
+        !   addition to the total product
+        !
+!$cuf kernel do(1) <<<,>>>
+        DO j = 1, n
+           hpsi_d (j, ibnd)   = hpsi_d (j, ibnd)   + psic_d (dffts_nl_d(igk_k_d(j)))
+        ENDDO
+!
+        !write (6,*) 'v psi G ', ibnd
+        !write (6,99) (psic(i), i=1,400)
+        !
+     ENDDO
+  ENDIF
+  !
+  IF( use_tg ) THEN
+     !
+     CALL buffer%release_buffer( tg_psic_d, ierr )
+     CALL buffer%release_buffer( tg_v_d, ierr )
+     !
+  ENDIF
+  CALL buffer%release_buffer( dffts_nl_d, ierr )
+  CALL buffer%release_buffer( igk_k_d, ierr )
+  CALL stop_clock ('vloc_psi')
+  !
+99 format ( 20 ('(',2f12.9,')') )
+
+  RETURN
+END SUBROUTINE vloc_psi_k_gpu
+!
+!@njs: vloc_psi_nc
+!-----------------------------------------------------------------------
+SUBROUTINE vloc_psi_nc_gpu (lda, n, m, psi_d, v_d, hpsi_d)
+  !-----------------------------------------------------------------------
+  !
+  ! Calculation of Vloc*psi using dual-space technique - noncolinear
+  !
+  USE parallel_include
+  USE kinds,   ONLY : DP
+  USE wvfct, ONLY : current_k
+  USE klist, ONLY : igk_k_host => igk_k
+  USE mp_bands,      ONLY : me_bgrp
+  USE fft_base,      ONLY : dffts, dfftp
+  USE fft_interfaces,ONLY : fwfft, invfft
+  USE lsda_mod,      ONLY : nspin
+  USE spin_orb,      ONLY : domag
+  USE noncollin_module,     ONLY: npol
+  USE wavefunctions_module,       ONLY: psic_nc_h => psic_nc
+  USE wavefunctions_module_gpum,  ONLY: psic_nc_d
+  USE fft_helper_subroutines
+  USE buffers_module,       ONLY : buffer
+  !
+  IMPLICIT NONE
+  !
+  INTEGER, INTENT(in) :: lda, n, m
+  REAL(DP), DEVICE, INTENT(in) :: v_d(dfftp%nnr,4) ! beware dimensions!
+  COMPLEX(DP), DEVICE, INTENT(in)   :: psi_d (lda*npol, m)
+  COMPLEX(DP), DEVICE, INTENT(inout):: hpsi_d (lda,npol,m)
+  INTEGER,     DEVICE, POINTER :: dffts_nl_d(:)
+  INTEGER,     DEVICE, POINTER :: igk_k_d(:)
+  !
+  INTEGER :: ibnd, j,ipol, incr, is
+  COMPLEX(DP) :: sup, sdwn
+  !
+  LOGICAL :: use_tg
+  ! Variables for task groups
+  REAL(DP),    DEVICE, ALLOCATABLE :: tg_v_d(:,:)
+  COMPLEX(DP), DEVICE, ALLOCATABLE :: tg_psic_d(:,:)
+  INTEGER :: v_siz, idx, ioff
+  INTEGER :: right_nnr, right_nr3, right_inc
+  INTEGER :: ierr
+  !
+  CALL start_clock ('vloc_psi')
+  !
+  IF (.not. allocated(psic_nc_d)) ALLOCATE(psic_nc_d, SOURCE=psic_nc_h)
+  incr = 1
+  !
+  use_tg = dffts%has_task_groups 
+  !
+  IF( use_tg ) THEN
+     CALL start_clock ('vloc_psi:tg_gather')
+     v_siz = dffts%nnr_tg
+     IF (domag) THEN
+        ALLOCATE( tg_v_d( v_siz, 4 ) )
+        DO is=1,nspin
+           CALL tg_gather( dffts, v_d(:,is), tg_v_d(:,is) )
+        ENDDO
+     ELSE
+        ALLOCATE( tg_v_d( v_siz, 1 ) )
+        CALL tg_gather( dffts, v_d(:,1), tg_v_d(:,1) )
+     ENDIF
+     ALLOCATE( tg_psic_d( v_siz, npol ) )
+     CALL stop_clock ('vloc_psi:tg_gather')
+
+     incr = fftx_ntgrp(dffts)
+  ENDIF
+  CALL buffer%lock_buffer( dffts_nl_d, size(dffts%nl), ierr )
+  CALL buffer%lock_buffer( igk_k_d, n, ierr )
+  dffts_nl_d = dffts%nl
+  igk_k_d(1:n) = igk_k_host(1:n, current_k)
+  !
+  ! the local potential V_Loc psi. First the psi in real space
+  !
+  DO ibnd = 1, m, incr
+
+     IF( use_tg ) THEN
+        !
+        CALL tg_get_nnr( dffts, right_nnr )
+        !
+        DO ipol = 1, npol
+           !
+           !  == OPTIMIZE HERE == setting data twice
+           tg_psic_d(:,ipol) = ( 0.D0, 0.D0 )
+           ioff   = 0
+           !
+           DO idx = 1, fftx_ntgrp(dffts)
+              !
+              IF( idx + ibnd - 1 <= m ) THEN
+                 !$cuf kernel do(1) <<<,>>>
+                 DO j = 1, n
+                    tg_psic_d( dffts_nl_d( igk_k_d(j) ) + ioff, ipol ) = &
+                       psi_d( j +(ipol-1)*lda, idx+ibnd-1 )
+                 ENDDO
+              ENDIF
+
+              ioff = ioff + right_nnr
+
+           ENDDO
+           !
+           CALL invfft ('tgWave', tg_psic_d(:,ipol), dffts )
+           !
+        ENDDO
+        !
+     ELSE
+        psic_nc_d = (0.d0,0.d0)
+        DO ipol=1,npol
+           !$cuf kernel do(1) <<<,>>>
+           DO j = 1, n
+              psic_nc_d(dffts_nl_d(igk_k_d(j)),ipol) = psi_d(j+(ipol-1)*lda,ibnd)
+           ENDDO
+           CALL invfft ('Wave', psic_nc_d(:,ipol), dffts)
+        ENDDO
+     ENDIF
+
+     !
+     !   product with the potential v = (vltot+vr) on the smooth grid
+     !
+     IF( use_tg ) THEN
+        CALL tg_get_group_nr3( dffts, right_nr3 )
+        IF (domag) THEN
+           !$cuf kernel do(1) <<<,>>>
+           DO j=1, dffts%nr1x*dffts%nr2x*right_nr3
+              sup = tg_psic_d(j,1) * (tg_v_d(j,1)+tg_v_d(j,4)) + &
+                    tg_psic_d(j,2) * (tg_v_d(j,2)-(0.d0,1.d0)*tg_v_d(j,3))
+              sdwn = tg_psic_d(j,2) * (tg_v_d(j,1)-tg_v_d(j,4)) + &
+                     tg_psic_d(j,1) * (tg_v_d(j,2)+(0.d0,1.d0)*tg_v_d(j,3))
+              tg_psic_d(j,1)=sup
+              tg_psic_d(j,2)=sdwn
+           ENDDO
+        ELSE
+           !$cuf kernel do(1) <<<,>>>
+           DO j=1, dffts%nr1x*dffts%nr2x*right_nr3
+              tg_psic_d(j,:) = tg_psic_d(j,:) * tg_v_d(j,1)
+           ENDDO
+        ENDIF
+     ELSE
+        IF (domag) THEN
+           !$cuf kernel do(1) <<<,>>>
+           DO j=1, dffts%nnr
+              sup = psic_nc_d(j,1) * (v_d(j,1)+v_d(j,4)) + &
+                    psic_nc_d(j,2) * (v_d(j,2)-(0.d0,1.d0)*v_d(j,3))
+              sdwn = psic_nc_d(j,2) * (v_d(j,1)-v_d(j,4)) + &
+                     psic_nc_d(j,1) * (v_d(j,2)+(0.d0,1.d0)*v_d(j,3))
+              psic_nc_d(j,1)=sup
+              psic_nc_d(j,2)=sdwn
+           ENDDO
+        ELSE
+           !$cuf kernel do(1) <<<,>>>
+           DO j=1, dffts%nnr
+              psic_nc_d(j,:) = psic_nc_d(j,:) * v_d(j,1)
+           ENDDO
+        ENDIF
+     ENDIF
+     !
+     !   back to reciprocal space
+     !
+     IF( use_tg ) THEN
+        !
+        DO ipol = 1, npol
+
+           CALL fwfft ('tgWave', tg_psic_d(:,ipol), dffts )
+           !
+           ioff   = 0
+           !
+           CALL tg_get_recip_inc( dffts, right_inc )
+           !
+           DO idx = 1, fftx_ntgrp(dffts)
+              !
+              IF( idx + ibnd - 1 <= m ) THEN
+                 !$cuf kernel do(1) <<<,>>>
+                 DO j = 1, n
+                    hpsi_d (j, ipol, ibnd+idx-1) = hpsi_d (j, ipol, ibnd+idx-1) + &
+                                 tg_psic_d( dffts_nl_d(igk_k_d(j)) + ioff, ipol )
+                 ENDDO
+              ENDIF
+              !
+              ioff = ioff + right_inc
+              !
+           ENDDO
+
+        ENDDO
+        !
+     ELSE
+
+        DO ipol=1,npol
+           CALL fwfft ('Wave', psic_nc_d(:,ipol), dffts)
+        ENDDO
+        !
+        !   addition to the total product
+        !
+        DO ipol=1,npol
+           !$cuf kernel do(1) <<<,>>>
+           DO j = 1, n
+              hpsi_d(j,ipol,ibnd) = hpsi_d(j,ipol,ibnd) + &
+                                  psic_nc_d(dffts_nl_d(igk_k_d(j)),ipol)
+           ENDDO
+        ENDDO
+
+     ENDIF
+
+  ENDDO
+
+  IF( use_tg ) THEN
+     !
+     DEALLOCATE(tg_v_d, tg_psic_d)
+     !
+  ENDIF
+  CALL buffer%release_buffer( dffts_nl_d, ierr )
+  CALL buffer%release_buffer( igk_k_d, ierr )
+  CALL stop_clock ('vloc_psi')
+  !
+  RETURN
+END SUBROUTINE vloc_psi_nc_gpu
+!@nje
+#endif