more device variables, more helper subs, bug fix for OpenMP with intel compiler

This commit is contained in:
carcava 2020-03-10 01:55:17 +01:00
parent 8fe16a7bfe
commit 657a7d2e12
9 changed files with 245 additions and 67 deletions

View File

@ -6,9 +6,16 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
!-----------------------------------------------------------------------
SUBROUTINE rhoofr_cp &
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
( nfi, c_bgrp, c_d, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
enl, denl, ekin, dekin, tstress, ndwwf )
!-----------------------------------------------------------------------
!
! this routine computes:
@ -53,14 +60,14 @@
USE mp, ONLY: mp_sum
USE io_global, ONLY: stdout, ionode
USE mp_global, ONLY: intra_bgrp_comm, nbgrp, inter_bgrp_comm, &
me_bgrp, nproc_bgrp, root_bgrp
me_bgrp, nproc_bgrp, root_bgrp
USE funct, ONLY: dft_is_meta
USE cg_module, ONLY: tcg
USE cp_interfaces, ONLY: stress_kin, enkin
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dffts, dfftp
USE cp_interfaces, ONLY: checkrho, ennl, calrhovan, dennl
USE cp_main_variables, ONLY: iprint_stdout, idesc
USE cp_main_variables, ONLY: iprint_stdout, idesc, irb, eigrb
USE wannier_base, ONLY: iwf
USE exx_module, ONLY: rhopr
USE input_parameters, ONLY: tcpbo ! BS
@ -79,11 +86,10 @@
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
COMPLEX(DP) eigrb( :, : )
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
COMPLEX(DP) c_bgrp( :, : )
INTEGER irb( :, : )
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
@ -217,6 +223,9 @@
CALL errore( ' rhoofr ', ' c second dimension too small ', SIZE( c_bgrp, 2 ) )
!
c_bgrp( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
#if defined (__CUDA)
c_d( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
#endif
!
ENDIF
!
@ -474,7 +483,6 @@
ALLOCATE( psis( dffts%nnr * many_fft ) ) ! dffts%nnr * many_fft
ALLOCATE( ptmp( SIZE(c_bgrp,1), 2 ) )
ALLOCATE( rhos_d ( SIZE(rhos,1), SIZE(rhos,2) ) )
!
rhos_d = 0_DP
@ -488,19 +496,16 @@
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < nbsp_bgrp ) THEN
ptmp(:,1) = c_bgrp( :, ii )
ptmp(:,2) = c_bgrp( :, ii + 1 )
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psis( nlm_d( ig ) + ioff) = CONJG( ptmp( ig, 1 ) ) + ci * conjg( ptmp( ig, 2 ))
psis( nl_d( ig ) + ioff) = ptmp( ig, 1 ) + ci * ptmp( ig, 2 )
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) ) + ci * conjg( c_d( ig, ii+1 ))
psis( nl_d( ig ) + ioff) = c_d( ig, ii ) + ci * c_d( ig, ii+1 )
end do
ELSE IF( ii == nbsp_bgrp ) THEN
ptmp(:,1) = c_bgrp( :, ii )
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psis( nlm_d( ig ) + ioff) = CONJG( ptmp( ig, 1 ) )
psis( nl_d( ig ) + ioff) = ptmp( ig, 1 )
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) )
psis( nl_d( ig ) + ioff) = c_d( ig, ii+1 )
end do
END IF
! CALL c2psi_gamma( dffts, psis, c_bgrp(:,ii), c_bgrp(:,ii+1) )
@ -695,7 +700,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
!$omp parallel default(none) &
!$omp shared(nat, ityp, ngb, nh, eigrb, dfftb, irb, v, &
!$omp ci, i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
!$omp i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
!$omp private(mytid, ntids, is, ia, iv, jv, ijv, ig, iss, &
!$omp qv, fg1, fg2, itid, dqgbt, dsumt, asumt )
@ -949,7 +954,7 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
!$omp parallel default(none) &
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
!$omp stdout, ci, rhor, dfftp, upf, nsp, ityp, nat, nspin ) &
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin ) &
!$omp private(mytid, ntids, is, ia, iv, jv, ijv, sumrho, qgbt, ig, ca, &
!$omp qv, itid, ir )
@ -1152,3 +1157,35 @@ CONTAINS
ENDIF
END SUBROUTINE
END SUBROUTINE rhov
#if defined (__CUDA)
SUBROUTINE rhoofr_host &
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
USE cudafor
USE cp_interfaces
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) c_bgrp( :, : )
INTEGER irb( :, : )
COMPLEX(DP) eigrb( :, : )
REAL(DP) bec_bgrp(:,:)
REAL(DP) dbec(:,:,:,:)
REAL(DP) rhovan(:, :, : )
REAL(DP) rhor(:,:)
REAL(DP) drhor(:,:,:,:)
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
COMPLEX(DP), ALLOCATABLE, DEVICE :: c(:,:)
ALLOCATE( c, SOURCE=c_bgrp )
CALL rhoofr(nfi, c_bgrp, c, bec_bgrp, dbec, rhovan, rhor, &
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
DEALLOCATE( c )
END SUBROUTINE rhoofr_host
#endif

View File

@ -7,6 +7,12 @@
!
! written by Carlo Cavazzoni
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
!=----------------------------------------------------------------------------=!
MODULE cp_interfaces
!=----------------------------------------------------------------------------=!
@ -247,8 +253,30 @@
INTERFACE rhoofr
SUBROUTINE rhoofr_cp &
( nfi, c_bgrp, irb, eigrb, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
( nfi, c_bgrp, c_d, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
USE cudafor
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) :: c_bgrp( :, : )
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
REAL(DP) bec(:,:)
REAL(DP) dbec(:,:,:,:)
REAL(DP) rhovan(:, :, : )
REAL(DP) rhor(:,:)
REAL(DP) drhor(:,:,:,:)
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
END SUBROUTINE rhoofr_cp
#if defined (__CUDA)
SUBROUTINE rhoofr_host &
( nfi, c_bgrp, irb, eigrb, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) c_bgrp( :, : )
@ -266,7 +294,8 @@
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
END SUBROUTINE rhoofr_cp
END SUBROUTINE rhoofr_host
#endif
END INTERFACE
INTERFACE checkrho
@ -774,7 +803,7 @@
INTERFACE move_electrons
SUBROUTINE move_electrons_x( &
nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb, &
nfi, tfirst, tlast, b1, b2, b3, fion, enthal, enb, &
& enbi, fccc, ccc, dt2bye, stress,l_cprestart )
USE kinds, ONLY: DP
IMPLICIT NONE
@ -782,7 +811,6 @@
LOGICAL, INTENT(IN) :: tfirst, tlast
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
REAL(DP) :: fion(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
REAL(DP), INTENT(IN) :: dt2bye
REAL(DP) :: fccc, ccc
REAL(DP) :: enb, enbi
@ -898,6 +926,16 @@
REAL(DP), INTENT(IN) :: f( : )
REAL(DP) :: enkin_x
END FUNCTION enkin_x
#if defined (__CUDA)
FUNCTION enkin_gpu_x( c, f, n )
USE kinds, ONLY: dp
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
REAL(DP), INTENT(IN) :: f( : )
REAL(DP) :: enkin_gpu_x
END FUNCTION enkin_gpu_x
#endif
END INTERFACE
INTERFACE newinit

View File

@ -1640,30 +1640,82 @@ end subroutine dylmr2_
! local
INTEGER :: ig, i
REAL(DP) :: sk(n) ! automatic array
REAL(DP) :: sk, rsum
!
sk = 0.0d0
!$omp parallel do reduction(+:sk) default(none) &
!$omp shared(sk,c,g2kin,gstart,ngw,n,f) private(i,ig,rsum)
DO i=1,n
sk(i)=0.0d0
rsum = 0.0d0
DO ig=gstart,ngw
sk(i)=sk(i)+DBLE(CONJG(c(ig,i))*c(ig,i))*g2kin(ig)
rsum = rsum + DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin(ig)
END DO
sk = sk + f(i) * rsum
END DO
!$omp end parallel do
CALL mp_sum( sk(1:n), intra_bgrp_comm )
enkin_x=0.0d0
DO i=1,n
enkin_x=enkin_x+f(i)*sk(i)
END DO
CALL mp_sum( sk, intra_bgrp_comm )
! ... reciprocal-space vectors are in units of alat/(2 pi) so a
! ... multiplicative factor (2 pi/alat)**2 is required
enkin_x = enkin_x * tpiba2
enkin_x = tpiba2 * sk
!
RETURN
END FUNCTION enkin_x
#if defined (__CUDA)
!-----------------------------------------------------------------------
FUNCTION enkin_gpu_x( c, f, n )
!-----------------------------------------------------------------------
!
USE kinds, ONLY: DP
USE constants, ONLY: pi, fpi
USE gvecw, ONLY: ngw
USE gvect, ONLY: gstart
USE gvecw, ONLY: g2kin
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm
USE cell_base, ONLY: tpiba2
USE cudafor
IMPLICIT NONE
REAL(DP) :: enkin_gpu_x
INTEGER, INTENT(IN) :: n
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
REAL(DP), INTENT(IN) :: f( : )
!
! local
INTEGER :: ig, i
REAL(DP) :: sk
REAL(DP), ALLOCATABLE, DEVICE :: f_d(:)
REAL(DP), ALLOCATABLE, DEVICE :: g2(:)
REAL(DP) :: ddot
!
ALLOCATE( g2, SOURCE=g2kin )
ALLOCATE( f_d, SOURCE=f )
sk=0.0d0
!$cuf kernel do(2) <<<*,*>>>
DO i=1,n
DO ig=gstart,ngw
sk = sk + f_d(i) * DBLE(CONJG(c(ig,i))*c(ig,i)) * g2(ig)
END DO
END DO
DEALLOCATE( f_d )
DEALLOCATE( g2 )
CALL mp_sum( sk, intra_bgrp_comm )
enkin_gpu_x = tpiba2 * sk
!
RETURN
END FUNCTION enkin_gpu_x
#endif
!-------------------------------------------------------------------------
SUBROUTINE nlfl_bgrp_x( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
!-----------------------------------------------------------------------

View File

@ -80,7 +80,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
electrons_nosevel, electrons_noseupd
USE pres_ai_mod, ONLY : P_ext, P_in, P_fin, pvar, volclu, &
surfclu, Surf_t, abivol, abisur
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp, cm_d, phi_d
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp, cm_d, phi_d, c0_d
USE wannier_module, ONLY : allocate_wannier
USE cp_interfaces, ONLY : printout_new, move_electrons, newinit
USE cell_nose, ONLY : xnhh0, xnhhm, xnhhp, vnhh, temph, &
@ -118,6 +118,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
USE input_parameters, ONLY : tcpbo
USE funct, ONLY : dft_is_hybrid, start_exx, exx_is_active
USE funct, ONLY : dft_is_meta
USE device_helper
!
IMPLICIT NONE
!
@ -289,13 +290,14 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
END IF
!
! ... why this call ??? from Paolo Umari
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
IF ( tefield .or. tefield2 ) THEN
!
CALL calbec( 1, nsp, eigr, c0_bgrp, bec_bgrp ) ! ATTENZIONE
!
END IF
CALL sync_to_host( c0_bgrp, c0_d )
!
! Autopilot (Dynamic Rules) Implimentation
!
@ -323,13 +325,6 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
!=======================================================================
!
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
! ... fake electronic kinetic energy
!
IF ( .NOT. tcg ) THEN
@ -341,8 +336,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
END IF
!
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
fion, c0_bgrp, cm_bgrp, phi_bgrp, &
enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
!
IF (lda_plus_u) fion = fion + forceh
!
@ -555,8 +549,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
IF ( tortho ) THEN
!
#if defined (__CUDA)
cm_d = cm_bgrp
phi_d = phi_bgrp
CALL sync_to_host( cm_bgrp, cm_d )
CALL sync_to_host( phi_bgrp, phi_d )
CALL ortho( eigr, cm_d, phi_d, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#else
CALL ortho( eigr, cm_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
@ -577,8 +571,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
IF ( tortho ) THEN
#if defined (__CUDA)
CALL updatc( ccc, lambda, phi_d, bephi, becp_bgrp, bec_d, cm_d, idesc )
bec_bgrp = bec_d
cm_bgrp = cm_d
CALL sync_to_device( bec_bgrp, bec_d )
CALL sync_to_device( cm_bgrp, cm_d )
#else
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
#endif
@ -831,8 +825,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
lambdam = lambda
!
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb,&
enbi, fccc, ccc, dt2bye, stress,.true. )
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress,.true. )
!
END IF
!

View File

@ -48,7 +48,7 @@ SUBROUTINE from_scratch( )
USE printout_base, ONLY : printout_pos
USE orthogonalize_base, ONLY : updatc, calphi_bgrp
USE wave_base, ONLY : wave_steepest
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp, c0_d, phi_d
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp, c0_d, phi_d, cm_d
USE fft_base, ONLY : dfftp, dffts
USE time_step, ONLY : delt
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, &
@ -59,6 +59,7 @@ SUBROUTINE from_scratch( )
USE mp_world, ONLY : mpime
USE mp, ONLY : mp_sum
USE matrix_inversion
USE device_helper
!
IMPLICIT NONE
!
@ -144,6 +145,7 @@ SUBROUTINE from_scratch( )
!
if( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp )
!
CALL sync_to_host( cm_bgrp, cm_d )
!
! ... initialize bands
!
@ -194,7 +196,7 @@ SUBROUTINE from_scratch( )
!
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
!
CALL rhoofr( nfi, cm_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
CALL rhoofr( nfi, cm_bgrp, cm_d, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
!
edft%enl = enl
edft%ekin = ekin
@ -262,8 +264,8 @@ SUBROUTINE from_scratch( )
if( tortho ) then
#if defined (__CUDA)
c0_d = c0_bgrp
phi_d = phi_bgrp
CALL sync_to_host( c0_bgrp, c0_d )
CALL sync_to_host( phi_bgrp, phi_d )
CALL ortho( eigr, c0_d, phi_d, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#else
CALL ortho( eigr, c0_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
@ -285,8 +287,8 @@ SUBROUTINE from_scratch( )
IF ( tortho ) THEN
#if defined (__CUDA)
CALL updatc( ccc, lambda, phi_d, bephi, becp_bgrp, bec_d, c0_d, idesc )
bec_bgrp = bec_d
c0_bgrp = c0_d
CALL sync_to_device( c0_bgrp, c0_d )
CALL sync_to_device( bec_bgrp, bec_d )
#else
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
#endif

View File

@ -7,8 +7,8 @@
!
!
!----------------------------------------------------------------------------
SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
cm_bgrp, phi_bgrp, enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
!----------------------------------------------------------------------------
!
! ... this routine updates the electronic degrees of freedom
@ -43,7 +43,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
USE electrons_module, ONLY : distribute_c, collect_c, distribute_b
USE gvect, ONLY : eigts1, eigts2, eigts3
USE control_flags, ONLY : lwfpbe0nscf ! exx_wf related
USE wavefunctions, ONLY : cv0 ! Lingzhu Kong
USE wavefunctions, ONLY : cv0, c0_bgrp, cm_bgrp, phi_bgrp, c0_d, cm_d, phi_d
USE funct, ONLY : dft_is_hybrid, exx_is_active
!
IMPLICIT NONE
@ -52,7 +52,6 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
LOGICAL, INTENT(IN) :: tfirst, tlast
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
REAL(DP) :: fion(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
REAL(DP), INTENT(IN) :: dt2bye
REAL(DP) :: fccc, ccc
REAL(DP) :: enb, enbi
@ -80,7 +79,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
CALL get_wannier_center( tfirst, cm_bgrp, bec_bgrp, eigr, &
eigrb, taub, irb, ibrav, b1, b2, b3 )
!
CALL rhoofr( nfi, c0_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, &
CALL rhoofr( nfi, c0_bgrp, c0_d, bec_bgrp, dbec, becsum, rhor, &
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
!
!=================================================================

View File

@ -686,9 +686,7 @@
!
real(DP) :: sumt, sums(2), ennl_t
integer :: is, iv, jv, ijv, inl, jnl, ia, iss, i, indv
#if defined(_OPENMP)
INTEGER :: mytid, ntids, omp_get_thread_num, omp_get_num_threads
#endif
INTEGER :: omp_get_num_threads
!
ennl_t = 0.d0
!

View File

@ -11,11 +11,69 @@
! accelerator devices
!=----------------------------------------------------------------------------=!
MODULE device_helper
USE util_param, ONLY : DP
#if defined(__CUDA)
USE cudafor
#endif
IMPLICIT NONE
SAVE
PRIVATE
INTERFACE sync_to_device
MODULE PROCEDURE sync_to_device_c2d, sync_to_device_r2d
END INTERFACE
INTERFACE sync_to_host
MODULE PROCEDURE sync_to_host_c2d, sync_to_host_r2d
END INTERFACE
PUBLIC :: sync_to_device
PUBLIC :: sync_to_host
CONTAINS
SUBROUTINE sync_to_device_c2d( h, d )
COMPLEX(DP), INTENT(IN) :: d(:,:)
COMPLEX(DP), INTENT(OUT) :: h(:,:)
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: d
h = d
#endif
END SUBROUTINE
SUBROUTINE sync_to_host_c2d( h, d )
COMPLEX(DP), INTENT(OUT) :: d(:,:)
COMPLEX(DP), INTENT(IN) :: h(:,:)
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: d
d = h
#endif
END SUBROUTINE
SUBROUTINE sync_to_device_r2d( h, d )
REAL(DP), INTENT(IN) :: d(:,:)
REAL(DP), INTENT(OUT) :: h(:,:)
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: d
h = d
#endif
END SUBROUTINE
SUBROUTINE sync_to_host_r2d( h, d )
REAL(DP), INTENT(OUT) :: d(:,:)
REAL(DP), INTENT(IN) :: h(:,:)
#if defined(__CUDA)
ATTRIBUTES(DEVICE) :: d
d = h
#endif
END SUBROUTINE
END MODULE
!=----------------------------------------------------------------------------=!
SUBROUTINE qe_device_sync()
#if defined(__CUDA)
USE cudafor
USE cudafor
#endif
INTEGER :: info
#if defined (__CUDA)

View File

@ -2,6 +2,7 @@ clocks_handler.o : parallel_include.o
clocks_handler.o : util_param.o
cuda_util.o : util_param.o
data_buffer.o : util_param.o
device_helper.o : util_param.o
divide.o : mp.o
error_handler.o : mp.o
error_handler.o : util_param.o