Merge branch 'more_cuda_cleanup' into 'develop'

More cuda cleanup See merge request QEF/q-e!2410
2024-08-13 07:01:00 +00:00 · 2024-08-13 07:01:00 +00:00 · a59e791975
parent 8310612faa 238776494a
commit a59e791975
4 changed files with 127 additions and 391 deletions
--- a/PW/src/force_us.f90
+++ b/PW/src/force_us.f90
@ -1,5 +1,5 @@
 !
-! Copyright (C) 2001-2015 Quantum ESPRESSO group
+! Copyright (C) 2001-2024 Quantum ESPRESSO group
 ! This file is distributed under the terms of the
 ! GNU General Public License. See the file `License'
 ! in the root directory of the present distribution,
@ -23,15 +23,15 @@ SUBROUTINE force_us( forcenl )
  USE lsda_mod,             ONLY : lsda, current_spin, isk, nspin
  USE symme,                ONLY : symvector
  USE wavefunctions,        ONLY : evc
-  USE noncollin_module,     ONLY : npol, noncolin
+  USE noncollin_module,     ONLY : npol, noncolin, lspinorb
  USE io_files,             ONLY : iunwfc, nwordwfc
  USE buffers,              ONLY : get_buffer
  USE becmod,               ONLY : calbec, becp, bec_type, &
                                   allocate_bec_type, deallocate_bec_type, &
                                   allocate_bec_type_acc, deallocate_bec_type_acc
  USE mp_pools,             ONLY : inter_pool_comm
-  USE mp_bands,             ONLY : intra_bgrp_comm
-  USE mp,                   ONLY : mp_sum, mp_get_comm_null
+  USE mp_bands,             ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp
+  USE mp,                   ONLY : mp_sum
  USE uspp_init,            ONLY : init_us_2
  !
  IMPLICIT NONE
@ -42,52 +42,20 @@ SUBROUTINE force_us( forcenl )
  ! ... local variables
  !
  COMPLEX(DP), ALLOCATABLE :: vkb1(:,:)   ! contains g*|beta>
-  !$acc declare device_resident(vkb1)
-  COMPLEX(DP), ALLOCATABLE :: deff_nc(:,:,:,:)
-  REAL(DP),    ALLOCATABLE :: deff(:,:,:)
-  TYPE(bec_type) :: dbecp                 ! contains <dbeta|psi>
-  INTEGER :: npw, ik, ipol, ig, jkb
-  INTEGER :: itot, nt, na
-  INTEGER, ALLOCATABLE :: nt_list(:), na_list(:)
-  LOGICAL, ALLOCATABLE :: ismulti_np(:)
-  COMPLEX(DP), ALLOCATABLE :: becpnc(:,:,:),  becpk(:,:), &
-                              dbecpnc(:,:,:), dbecpk(:,:)
-  !$acc declare device_resident(becpnc,becpk,dbecpnc,dbecpk)
-  !civn: these buffers are kept here instead of inside force_us_k to
-  !      save allocation/deallocation overhead inside the ik,ipol loops
+  TYPE(bec_type) :: becd                  ! contains <dbeta|psi>
+  COMPLEX(DP) :: deff_nc
+  REAL(DP) :: deff, fnl
+  INTEGER :: npw, ik, ipol, ig, na, na_s, na_e, mykey
+  INTEGER :: nt, ibnd, nhnt, ih, jh, ijkb0, ikb, jkb, is, js, ijs
  !
  forcenl(:,:) = 0.D0
  !
  CALL allocate_bec_type_acc( nkb, nbnd, becp, intra_bgrp_comm )
-  CALL allocate_bec_type_acc( nkb, nbnd, dbecp, intra_bgrp_comm )
-  !
+  CALL allocate_bec_type_acc( nkb, nbnd, becd, intra_bgrp_comm )
  ALLOCATE( vkb1(npwx,nkb) )
-  IF (noncolin) THEN
-    ALLOCATE( becpnc(nkb,npol,nbnd), dbecpnc(nkb,npol,nbnd) )
-    ALLOCATE( deff_nc(nhm,nhm,nat,nspin) )
-  ELSEIF (.NOT. gamma_only ) THEN
-    ALLOCATE( becpk(nkb,nbnd), dbecpk(nkb,nbnd) )
-    ALLOCATE( deff(nhm,nhm,nat) )
-  ENDIF
+  !$acc data create(vkb1)
  ! 
-  !$acc data create(deff,deff_nc) 
-  !
-  ALLOCATE( nt_list(nat), na_list(nat), ismulti_np(nat) )
-  !
-  itot = 0
-  DO nt = 1, ntyp
-     DO na = 1, nat
-        IF ( ityp(na)==nt ) THEN
-           itot = itot + 1
-           nt_list(itot) = nt
-           na_list(itot) = na
-           ismulti_np(itot) = upf(nt)%tvanp .OR. upf(nt)%is_multiproj
-        ENDIF
-     ENDDO
-  ENDDO
-  IF (itot /= nat) CALL errore( 'force_us', 'Something wrong in atoms counting', 1 )
-  !
-  ! ... the forces are a sum over the K points and over the bands
+  ! ... the forces are summed over K-points
  !
  DO ik = 1, nks
     !
@ -96,25 +64,18 @@ SUBROUTINE force_us( forcenl )
     !
     IF ( nks > 1 ) THEN
        CALL get_buffer( evc, nwordwfc, iunwfc, ik )
-        IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
+        !$acc update device( evc )
     ENDIF
     !
-     !$acc update device( evc )
+     IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. )
+     !$acc data present (evc, vkb, becp)
     CALL calbec( offload_type, npw, vkb, evc, becp )
-     IF (noncolin) THEN
-       !$acc kernels
-       becpnc = becp%nc
-       !$acc end kernels
-     ELSEIF (.NOT. gamma_only ) THEN
-       !$acc kernels
-       becpk = becp%k
-       !$acc end kernels
-     ENDIF
+     !$acc end data
     !
     DO ipol = 1, 3
        !
 #if defined(_OPENACC)
-        !$acc parallel loop collapse(2)
+        !$acc parallel loop collapse(2) present(vkb, g, igk_k) 
 #else
        !$omp parallel do collapse(2) private(ig)
 #endif
@ -123,50 +84,96 @@ SUBROUTINE force_us( forcenl )
              vkb1(ig,jkb) = vkb(ig,jkb) * (0.D0,-1.D0) * g(ipol,igk_k(ig,ik))
           ENDDO
        ENDDO
+        !$acc data present (evc, becd)
+        CALL calbec( offload_type, npw, vkb1, evc, becd )
+        !$acc end data
        !
-        CALL calbec( offload_type, npw, vkb1, evc, dbecp )
-        IF (noncolin) THEN
-          !$acc kernels
-          dbecpnc = dbecp%nc
-          !$acc end kernels
-        ELSEIF (.NOT. gamma_only ) THEN
-          !$acc kernels
-          dbecpk = dbecp%k
-          !$acc end kernels
-        ENDIF
+        ! becp = <beta|psi>, becd = <dbeta/dG_ipol|psi>
+        ! Now sum over bands and over projectors belonging to each atom
        !
-        !$acc data copyin(nt_list,na_list,ismulti_np)
-        IF ( gamma_only ) THEN
-           !
-           CALL force_us_gamma( forcenl )
-           !
-        ELSE
-           !
-           CALL force_us_k( forcenl )
-           !
-        ENDIF
+        ! ... NOTE: calls to calbec are parallelized over the bgrp group
+        ! ... The rest of the calculation is parallelized by subdividing 
+        ! ... the atoms over the bgrp group
+        !
+        CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey )
+        !
+        IF ( mykey /= 0 ) CYCLE
+        !
+        !$acc data present(becp, becd, deeq, qq_at, deeq_nc, qq_so, et) copyin(wg) 
+        DO na = na_s, na_e
+           fnl = 0.0_dp
+           nt = ityp(na)
+           nhnt = nh(nt)
+           ijkb0 = ofsbeta(na)
+           IF ( gamma_only ) THEN
+              !$acc parallel loop collapse(3) present(becp%r,becd%r) reduction(+:fnl)
+              DO ibnd = 1, nbnd
+                 DO ih = 1, nhnt
+                    DO jh = 1, nhnt
+                       ikb = ijkb0 + ih
+                       jkb = ijkb0 + jh
+                       deff = deeq(ih,jh,na,current_spin) - &
+                            et(ibnd,ik) * qq_at(ih,jh,na)
+                       fnl = fnl + wg(ibnd,ik) * deff *  &
+                            becd%r(ikb,ibnd) * becp%r(jkb,ibnd)
+                    END DO
+                 END DO
+              END DO
+           ELSE IF ( .NOT. noncolin ) THEN
+              !$acc parallel loop collapse(3) present(becp%k,becd%k) reduction(+:fnl)
+              DO ibnd = 1, nbnd
+                 DO ih = 1, nhnt
+                    DO jh = 1, nhnt
+                       ikb = ijkb0 + ih
+                       jkb = ijkb0 + jh
+                       deff = deeq(ih,jh,na,current_spin) - et(ibnd,ik) * qq_at(ih,jh,na)
+                       fnl = fnl + wg(ibnd,ik) * deff *  &
+                            DBLE(CONJG(becp%k(ikb,ibnd)) * becd%k(jkb,ibnd) )
+                    END DO
+                 END DO
+              END DO
+           ELSE
+              !$acc parallel loop collapse(3) present(becp%nc,becd%nc) reduction(+:fnl)
+              DO ibnd = 1, nbnd
+                 DO ih = 1, nhnt
+                    DO jh = 1, nhnt
+                       ikb = ijkb0 + ih
+                       jkb = ijkb0 + jh
+                       !$acc loop seq collapse(2)
+                       DO is = 1, npol
+                          DO js = 1, npol
+                             ijs = (is-1)*npol + js
+                             deff_nc = deeq_nc(ih,jh,na,ijs)
+                             IF ( lspinorb ) THEN
+                                deff_nc = deff_nc - et(ibnd,ik) * qq_so(ih,jh,ijs,nt)
+                             ELSE IF ( is == js ) THEN
+                                deff_nc = deff_nc - et(ibnd,ik) * qq_at(ih,jh,na)
+                             END IF
+                             fnl = fnl + wg(ibnd,ik) * DBLE ( &
+                                  deff_nc * CONJG(becp%nc(ikb,is,ibnd)) * &
+                                  becd%nc(jkb,js,ibnd) )
+                          END DO
+                       END DO
+                    END DO
+                 END DO
+              END DO
+           END IF
+           ! factor 2 from Ry a.u. (e^2=2)? tpiba from k+G, minus sign
+           forcenl(ipol,na) = forcenl(ipol,na) - 2.0_dp * tpiba* fnl
+        END DO
        !$acc end data
        !
     ENDDO
  ENDDO
  !
-  ! ... if sums over bands are parallelized over the band group
-  !
-  IF ( becp%comm /= mp_get_comm_null() ) CALL mp_sum( forcenl, becp%comm )
-  !
  !$acc end data
  DEALLOCATE( vkb1 )
-  IF ( noncolin ) THEN
-     DEALLOCATE( deff_nc )
-  ELSEIF ( .NOT. gamma_only ) THEN
-     DEALLOCATE( deff )
-  ENDIF
-  !
-  CALL deallocate_bec_type_acc( dbecp )
+  CALL deallocate_bec_type_acc( becd )
  CALL deallocate_bec_type_acc( becp )
  !
-  ! ... collect contributions across pools from all k-points
+  ! ... collect contributions across processors and pools from all k-points
  !
+  CALL mp_sum( forcenl, intra_bgrp_comm )
  CALL mp_sum( forcenl, inter_pool_comm )
  !
  ! ... The total D matrix depends on the ionic position via the
@ -180,270 +187,6 @@ SUBROUTINE force_us( forcenl )
  !
  CALL symvector( nat, forcenl )
  !
-  DEALLOCATE( nt_list, na_list, ismulti_np )
-  !   
-  IF ( noncolin ) THEN
-    DEALLOCATE( becpnc, dbecpnc )
-  ELSEIF (.NOT. gamma_only ) THEN
-    DEALLOCATE( becpk, dbecpk )
-  ENDIF
-  !
  RETURN
  !
-  CONTAINS
-     !
-     !-----------------------------------------------------------------------
-     SUBROUTINE force_us_gamma( forcenl )
-       !-----------------------------------------------------------------------
-       !! Nonlocal contribution to the force. Calculation at Gamma.
-       !
-       ! Important notice about parallelization over the band group of processors:
-       ! 1) internally, "calbec" parallelises on plane waves over the band group
-       ! 2) the results of "calbec" are distributed across processors of the band
-       !    group: the band index of becp, dbecp is distributed
-       ! 3) the band group is subsequently used to parallelize over bands
-       !
-       USE uspp,     ONLY : qq_at, deeq
-       !
-       IMPLICIT NONE
-       !
-       REAL(DP) :: forcenl(3,nat)
-       !! the nonlocal contribution
-       !
-       ! ... local variables
-       !
-       REAL(DP), ALLOCATABLE :: aux(:,:)
-       REAL(DP) :: forcenl_ipol
-       INTEGER :: nt, na, ibnd, ibnd_loc, ih, jh, ijkb0
-       INTEGER :: nh_nt, becp_ibnd_begin, becp_nbnd_loc, nbnd_siz
-       REAL(DP), ALLOCATABLE :: dbecprd(:,:), becprd(:,:)
-       !$acc declare device_resident(dbecprd, becprd)
-       !
-       nbnd_siz = nbnd / becp%nproc
-       ALLOCATE( becprd(nkb,nbnd_siz), dbecprd(nkb,nbnd_siz) )
-       !
-       !$acc kernels
-       dbecprd = dbecp%r
-       becprd  = becp%r
-       !$acc end kernels
-       becp_nbnd_loc = becp%nbnd_loc
-       becp_ibnd_begin = becp%ibnd_begin
-       !
-       !$acc data copyin( et, wg )
-       !
-       DO nt = 1, ntyp
-          !
-          IF ( nh(nt) == 0 ) CYCLE
-          !
-          ALLOCATE( aux(nh(nt), becp%nbnd_loc) )
-          !$acc data create(aux)
-          !
-          nh_nt = nh(nt)
-          !
-          DO na = 1, nat
-             IF ( ityp(na) == nt ) THEN
-                ijkb0 = ofsbeta(na)
-                ! ... this is \sum_j q_{ij} <beta_j|psi>
-                !
-                !$acc host_data use_device(aux, qq_at, becprd)
-                CALL MYDGEMM( 'N','N', nh(nt), becp_nbnd_loc, nh(nt),      &
-                              1.0_DP, qq_at(1,1,na), nhm, becprd(ijkb0+1,1), &
-                              nkb, 0.0_DP, aux, nh(nt) )
-                !$acc end host_data
-                !
-                ! ... multiply by -\epsilon_n
-                !
-#if defined(_OPENACC)
-                !$acc parallel loop collapse(2)
-#else
-                !$omp parallel do default(shared) private(ibnd_loc,ibnd,ih)
-#endif
-                DO ih = 1, nh_nt
-                   DO ibnd_loc = 1, becp_nbnd_loc
-                      ibnd = ibnd_loc + becp_ibnd_begin - 1
-                      aux(ih,ibnd_loc) = - et(ibnd,ik) * aux(ih,ibnd_loc)
-                   ENDDO
-                ENDDO
-#if !defined(_OPENACC)
-                !$omp end parallel do
-#endif
-                !
-                ! ... add  \sum_j d_{ij} <beta_j|psi>
-                !
-                !$acc host_data use_device(aux, deeq, becprd)
-                CALL MYDGEMM( 'N','N', nh(nt), becp_nbnd_loc, nh(nt), &
-                              1.0_DP, deeq(1,1,na,current_spin), nhm, &
-                              becprd(ijkb0+1,1), nkb, 1.0_DP, aux, nh(nt) )
-                !$acc end host_data
-                !
-                ! ... Auxiliary variable to perform the reduction with gpu kernels
-                forcenl_ipol = 0.0_DP
-#if defined(_OPENACC)
-                !$acc parallel loop collapse(2) reduction(+:forcenl_ipol)
-#else
-                !$omp parallel do default(shared) private(ibnd_loc,ibnd,ih) reduction(+:forcenl_ipol)
-#endif
-                DO ih = 1, nh_nt
-                   DO ibnd_loc = 1, becp_nbnd_loc
-                      ibnd = ibnd_loc + becp_ibnd_begin - 1
-                      forcenl_ipol = forcenl_ipol - 2.0_DP*tpiba * aux(ih,ibnd_loc) *&
-                      dbecprd(ijkb0+ih,ibnd_loc) * wg(ibnd,ik)
-                   ENDDO
-                ENDDO
-#if !defined(_OPENACC)
-                !$omp end parallel do
-#endif
-                !
-                forcenl(ipol,na) = forcenl(ipol,na) + forcenl_ipol
-                !
-             ENDIF
-          ENDDO
-          !
-          !$acc end data
-          DEALLOCATE( aux )
-          !
-       ENDDO
-       !
-       !$acc end data
-       !
-       DEALLOCATE( becprd, dbecprd )
-       !
-     END SUBROUTINE force_us_gamma
-     !
-     !
-     !-----------------------------------------------------------------------
-     SUBROUTINE force_us_k( forcenl )
-       !-----------------------------------------------------------------------
-       !! Nonlocal contribution to the force. Calculation for k-points.
-       !
-       IMPLICIT NONE
-       !
-       REAL(DP) :: forcenl(3,nat)
-       !! the nonlocal contribution
-       !
-       ! ... local variables
-       !
-       REAL(DP) :: fac
-       REAL(DP) :: forcenl_p1, forcenl_p2
-       INTEGER  :: ibnd, ih, jh, na, nt, ikb, jkb, ijkb0, is, js, ijs !counters
-       INTEGER  :: nh_nt, it
-       !
-       !$acc data copy(forcenl)
-       !
-       DO ibnd = 1, nbnd
-          !
-          IF (noncolin) THEN
-             CALL compute_deff_nc( deff_nc, et(ibnd,ik) )
-          ELSE
-             CALL compute_deff( deff, et(ibnd,ik) )
-          ENDIF
-          !
-          fac = wg(ibnd,ik)*tpiba
-          !
-#if defined(_OPENACC)
-          !$acc parallel loop gang reduction(+:forcenl_p2)
-#else
-          !$omp parallel do private(nt,na,ijkb0,nh_nt,forcenl_p1,forcenl_p2,ih,&
-          !$omp                     ikb,is,js,ijs,jkb)
-#endif
-          DO it = 1, nat
-             !
-             nt = nt_list(it)
-             na = na_list(it)
-             ijkb0 = ofsbeta(na)
-             nh_nt = nh(nt)
-             !
-             forcenl_p2 = 0.d0
-             !$acc loop vector reduction(+:forcenl_p2)
-             DO ih = 1, nh_nt
-                !
-                ikb = ijkb0 + ih
-                IF (noncolin) THEN
-                   forcenl_p1 = 0.d0
-                   !$acc loop seq collapse(2) reduction(+:forcenl_p1)
-                   DO is = 1, npol
-                     DO js = 1, npol
-                       ijs = (is-1)*npol+js
-                       forcenl_p1 = forcenl_p1 - &
-                                    deff_nc(ih,ih,na,ijs)*fac*(  &
-                                    CONJG(dbecpnc(ikb,is,ibnd))* &
-                                    becpnc(ikb,js,ibnd)+         &
-                                    CONJG(becpnc(ikb,is,ibnd))*  &
-                                    dbecpnc(ikb,js,ibnd) )
-                     ENDDO
-                   ENDDO
-                ELSE
-                   forcenl_p1 = -2.D0 * fac * deff(ih,ih,na) *    &
-                                DBLE( CONJG( dbecpk(ikb,ibnd) ) * &
-                                becpk(ikb,ibnd) )
-                ENDIF
-                !
-                forcenl_p2 = forcenl_p2 + forcenl_p1
-                !
-             ENDDO
-             !
-             forcenl(ipol,na) = forcenl(ipol,na) + forcenl_p2
-             !
-             IF ( ismulti_np(it) ) THEN
-                !
-                forcenl_p2 = 0.d0
-                !$acc loop vector reduction(+:forcenl_p2)
-                DO ih = 1, nh_nt
-                   ikb = ijkb0 + ih
-                   !
-                   ! ... in US case there is a contribution for jh<>ih. 
-                   ! ... We use here the symmetry in the interchange 
-                   ! ... of ih and jh
-                   !
-                   forcenl_p1 = 0.d0
-                   !$acc loop seq
-                   DO jh = ih+1, nh_nt
-                      jkb = ijkb0 + jh
-                      IF (noncolin) THEN
-                        !$acc loop seq collapse(2) reduction(+:forcenl_p1)
-                        DO is = 1, npol
-                          DO js = 1, npol
-                             ijs = (is-1)*npol+js
-                             forcenl_p1 = forcenl_p1 - &
-                                          deff_nc(ih,jh,na,ijs)*fac*(  &
-                                          CONJG(dbecpnc(ikb,is,ibnd))* &
-                                          becpnc(jkb,js,ibnd) +        &
-                                          CONJG(becpnc(ikb,is,ibnd))*  &
-                                          dbecpnc(jkb,js,ibnd)) -      &
-                                          deff_nc(jh,ih,na,ijs)*fac*(  &
-                                          CONJG(dbecpnc(jkb,is,ibnd))* &
-                                          becpnc(ikb,js,ibnd) +        &
-                                          CONJG(becpnc(jkb,is,ibnd))*  &
-                                          dbecpnc(ikb,js,ibnd) )
-                          ENDDO
-                        ENDDO
-                      ELSE
-                        forcenl_p1 = forcenl_p1 - &
-                                     2.D0 * fac * deff(ih,jh,na) *      &
-                                     DBLE( CONJG( dbecpk(ikb,ibnd) ) *  &
-                                     becpk(jkb,ibnd) + dbecpk(jkb,ibnd) &
-                                     * CONJG( becpk(ikb,ibnd) ) )
-                      ENDIF
-                   ENDDO !jh
-                   !
-                   forcenl_p2 = forcenl_p2 + forcenl_p1
-                   !
-                ENDDO !ih
-                !
-                forcenl(ipol,na) = forcenl(ipol,na) + forcenl_p2
-                !
-             ENDIF ! tvanp
-             !
-          ENDDO ! it=nt|na
-#if !defined(_OPENACC)
-          !$omp end parallel do
-#endif
-          !
-       ENDDO ! nbnd
-       !
-       !$acc end data
-       !
-     END SUBROUTINE force_us_k
-     !
-     !
 END SUBROUTINE force_us
--- a/PW/src/newd_gpu.f90
+++ b/PW/src/newd_gpu.f90
@ -222,7 +222,7 @@ SUBROUTINE newd_gpu( )
  USE kinds,                ONLY : DP
  USE ions_base,            ONLY : nat, ntyp => nsp, ityp
  USE lsda_mod,             ONLY : nspin
-  USE uspp,                 ONLY : okvan, deeq, deeq_nc, dvan_d, dvan_so_d
+  USE uspp,                 ONLY : okvan, deeq, deeq_nc, dvan, dvan_so
  USE uspp_param,           ONLY : upf, lmaxq, nh, nhm
  USE noncollin_module,     ONLY : noncolin, domag, nspin_mag, lspinorb
  USE uspp,                 ONLY : nhtol, nhtolm
@ -241,7 +241,6 @@ SUBROUTINE newd_gpu( )
  INTEGER :: ig, nt, ih, jh, na, is, nht, nb, mb, ierr
  ! counters on g vectors, atom type, beta functions x 2,
  !   atoms, spin, aux, aux, beta func x2 (again)
-  REAL(kind=dp), allocatable :: deeq_h( :,:,:,: )
  INTEGER, POINTER :: ityp_d(:)
 #if defined(__CUDA)
  attributes(DEVICE) :: ityp_d
@ -263,7 +262,7 @@ SUBROUTINE newd_gpu( )
              DO na = 1, nat
                 DO jh = 1, nht
                    DO ih = 1, nht
-                       IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt)
+                       IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so(ih,jh,is,nt)
                    END DO
                 END DO
              END DO
@ -276,10 +275,10 @@ SUBROUTINE newd_gpu( )
              DO jh = 1, nht
                 DO ih = 1, nht
                    IF ( ityp_d(na) == nt ) THEN
-                       deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt)
+                       deeq_nc(ih,jh,na,1) = dvan(ih,jh,nt)
                       deeq_nc(ih,jh,na,2) = ( 0.D0, 0.D0 )
                       deeq_nc(ih,jh,na,3) = ( 0.D0, 0.D0 )
-                       deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt)
+                       deeq_nc(ih,jh,na,4) = dvan(ih,jh,nt)
                    END IF
                 END DO
              END DO
@ -294,7 +293,7 @@ SUBROUTINE newd_gpu( )
                    DO jh = 1, nht
                       DO ih = 1, nht
                          !
-                          IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan_d(ih,jh,nt)
+                          IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan(ih,jh,nt)
                          !
                       END DO
                    END DO
@ -323,7 +322,6 @@ SUBROUTINE newd_gpu( )
  END IF
  !
  CALL start_clock_gpu( 'newd' )
-  allocate(deeq_h( nhm, nhm, nat, nspin ))
  !
  ! move atom type info to GPU
  CALL buffer%lock_buffer(ityp_d, nat, ierr)
@ -369,7 +367,7 @@ SUBROUTINE newd_gpu( )
              DO ih = 1, nht
                 DO jh = 1, nht
                    IF ( ityp_d(na) == nt ) THEN
-                       deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan_d(ih,jh,nt)
+                       deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan(ih,jh,nt)
                    END IF
                 END DO
              END DO
@ -437,7 +435,7 @@ SUBROUTINE newd_gpu( )
                        !
                        IF ( ityp_d(na) == nt ) THEN
                           !
-                           deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
+                           deeq_nc(ih,jh,na,ijs) = dvan_so(ih,jh,ijs,nt)
                           !
                           DO kh = 1, nhnt
                              !
@ -479,7 +477,7 @@ SUBROUTINE newd_gpu( )
                        !
                        IF ( ityp_d(na) == nt ) THEN
                           !
-                           deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt)
+                           deeq_nc(ih,jh,na,ijs) = dvan_so(ih,jh,ijs,nt)
                           !
                           DO kh = 1, nhnt
                              !
@ -533,17 +531,17 @@ SUBROUTINE newd_gpu( )
               IF ( ityp_d(na) == nt ) THEN
                  !
                  IF (lspinorb) THEN
-                     deeq_nc(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + &
+                     deeq_nc(ih,jh,na,1) = dvan_so(ih,jh,1,nt) + &
                                           deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
                     !
-                     deeq_nc(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + &
+                     deeq_nc(ih,jh,na,4) = dvan_so(ih,jh,4,nt) + &
                                           deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
                     !
                  ELSE
-                     deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + &
+                     deeq_nc(ih,jh,na,1) = dvan(ih,jh,nt) + &
                                           deeq(ih,jh,na,1) + deeq(ih,jh,na,4)
                     !
-                     deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) + &
+                     deeq_nc(ih,jh,na,4) = dvan(ih,jh,nt) + &
                                           deeq(ih,jh,na,1) - deeq(ih,jh,na,4)
                     !
                  END IF
--- a/upflib/init_us_1.f90
+++ b/upflib/init_us_1.f90
@ -28,7 +28,7 @@ subroutine init_us_1( nat, ityp, omega, qmax, intra_bgrp_comm )
  USE upf_const,    ONLY : fpi, sqrt2
  USE uspp,         ONLY : nhtol, nhtoj, nhtolm, ijtoh, dvan, qq_at, qq_nt, indv, &
                           ap, aainit, qq_so, dvan_so, okvan, ofsbeta, &
-                           qq_nt_d, dvan_d, dvan_so_d
+                           qq_nt_d
  USE uspp_param,   ONLY : upf, lmaxq, nh, nhm, lmaxkb, nsp
  USE upf_spinorb,  ONLY : is_spinorbit, rot_ylm, fcoef, lmaxx, &
                           transform_qq_so
@ -256,14 +256,14 @@ subroutine init_us_1( nat, ityp, omega, qmax, intra_bgrp_comm )
     !$acc update device(nhtol)
     !$acc update device(nhtoj)
     !$acc update device(ijtoh)
-     qq_nt_d=qq_nt
     !$acc update device(qq_at)
+     qq_nt_d=qq_nt
     if (is_spinorbit) then
-        dvan_so_d=dvan_so
-      !$acc update device(fcoef)
-      !$acc update device(qq_so)
-     else
-        dvan_d=dvan
+        !$acc update device(dvan_so)
+        !$acc update device(fcoef)
+        !$acc update device(qq_so)
+     else 
+        !$acc update device(dvan)
     endif
  endif
  !
--- a/upflib/uspp.f90
+++ b/upflib/uspp.f90
@ -35,10 +35,9 @@ MODULE uspp
  PUBLIC :: nlx, lpx, lpl, ap, aainit, indv, nhtol, nhtolm, ofsbeta, &
            nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, &
            becsum, ebecsum
-  PUBLIC :: dvan_d, qq_nt_d
+  PUBLIC :: qq_nt_d
  PUBLIC :: okvan, nlcc_any
  PUBLIC :: qq_so,   dvan_so,   deeq_nc,   fcoef 
-  PUBLIC :: dvan_so_d
  PUBLIC :: dbeta
  !
  PUBLIC :: allocate_uspp, deallocate_uspp
@ -92,11 +91,9 @@ MODULE uspp
  !
  ! GPU vars
  !
-  REAL(DP),    ALLOCATABLE :: dvan_d(:,:,:)
  REAL(DP),    ALLOCATABLE :: qq_nt_d(:,:,:)
-  COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:)
 #if defined(__CUDA)
-  attributes (DEVICE) :: dvan_d, qq_nt_d, dvan_so_d
+  attributes (DEVICE) :: qq_nt_d
 #endif

  !
@ -343,10 +340,12 @@ CONTAINS
       allocate( qq_so(nhm,nhm,4,nsp) )
       !$acc enter data create(qq_so)
       allocate( dvan_so(nhm,nhm,nspin,nsp) )
+       !$acc enter data create(dvan_so)
       allocate( fcoef(nhm,nhm,2,2,nsp) )
       !$acc enter data create(fcoef)
    else
       allocate( dvan(nhm,nhm,nsp) )
+       !$acc enter data create(dvan)
    endif
    allocate(becsum( nhm*(nhm+1)/2, nat, nspin))
    !$acc enter data create(becsum)
@ -360,15 +359,7 @@ CONTAINS
    !
    if (use_gpu) then
      !
-      if (nhm>0) then
-        allocate( qq_nt_d(nhm,nhm,nsp) )
-        if ( lspinorb ) then
-           allocate( dvan_so_d(nhm,nhm,nspin,nsp) )
-        else
-           allocate( dvan_d(nhm,nhm,nsp) )
-        endif
-        !
-      endif
+      if (nhm>0) allocate( qq_nt_d(nhm,nhm,nsp) )
      !
    endif
    !
@ -412,7 +403,10 @@ CONTAINS
      DEALLOCATE( qq_at )
    ENDIF
    IF( ALLOCATED( qq_nt ) )      DEALLOCATE( qq_nt )
-    IF( ALLOCATED( dvan ) )       DEALLOCATE( dvan )
+    IF( ALLOCATED( dvan ) ) THEN
+      !$acc exit data delete( dvan )
+      DEALLOCATE( dvan )
+    END IF
    IF( ALLOCATED( deeq ) ) THEN
      !$acc exit data delete( deeq )
      DEALLOCATE( deeq )
@ -421,7 +415,10 @@ CONTAINS
      !$acc exit data delete( qq_so )
      DEALLOCATE( qq_so )
    ENDIF
-    IF( ALLOCATED( dvan_so ) )    DEALLOCATE( dvan_so )
+    IF( ALLOCATED( dvan_so ) ) THEN
+      !$acc exit data delete( dvan_so )
+      DEALLOCATE( dvan_so )
+    END IF
    IF( ALLOCATED( deeq_nc ) ) THEN
      !$acc exit data delete( deeq_nc )
      DEALLOCATE( deeq_nc )
@ -434,9 +431,7 @@ CONTAINS
    IF( ALLOCATED( dbeta ) )      DEALLOCATE( dbeta )
    !
    ! GPU variables
-    IF( ALLOCATED( dvan_d ) )     DEALLOCATE( dvan_d )
    IF( ALLOCATED( qq_nt_d ) )    DEALLOCATE( qq_nt_d )
-    IF( ALLOCATED( dvan_so_d ) )  DEALLOCATE( dvan_so_d )
    !
  END SUBROUTINE deallocate_uspp
  !