Merge branch q-e:develop into afonari-develop-patch-31461

This commit is contained in:
afonari 2024-08-26 13:34:11 +00:00
commit 80060dba75
37 changed files with 749 additions and 924 deletions

View File

@ -470,13 +470,14 @@ subroutine solve_head
head(first_f+i-1,2)=epsilon_g(2,2,i)
head(first_f+i-1,3)=epsilon_g(3,3,i)
call ph_set_upert_e()
#if defined(__MPI)
call mp_sum ( pola_charge(:,:,:,i) , inter_pool_comm )
call psyme (pola_charge(:,:,:,i))
call psymdvscf (pola_charge(:,:,:,i))
#else
call syme (pola_charge(:,:,:,i))
call symdvscf (pola_charge(:,:,:,i))
#endif
call ph_deallocate_upert()
call create_scf_type ( wing, .true. )
do ipol=1,3
CALL fwfft ('Rho', pola_charge(1:dfftp%nnr,1,ipol,i), dfftp)

View File

@ -55,6 +55,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
USE qpoint_aux, ONLY : ikmks, ikmkmqs, becpt
USE lsda_mod, ONLY : nspin
USE lr_nc_mag, ONLY : lr_apply_time_reversal, deeq_nc_save, int3_nc_save
USE lr_symm_base, ONLY : lr_npert, upert, upert_mq
!
IMPLICIT NONE
!
@ -107,6 +108,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
ik, ikk, & ! counter on k points
ndim, &
is, & ! counter on spin polarizations
isym, & ! counter on symmetries
npw, & ! number of plane waves at k
nsolv, & ! number of linear systems
isolv, & ! counter on linear systems
@ -147,6 +149,18 @@ SUBROUTINE hp_solve_linear_system (na, iq)
!
ALLOCATE (dbecsum((nhm*(nhm+1))/2, nat, nspin_mag, 1))
!
! Set symmetry representation in lr_symm_base
!
lr_npert = 1
ALLOCATE(upert(lr_npert, lr_npert, nsymq))
DO isym = 1, nsymq
upert(1, 1, isym) = (1.d0, 0.d0)
ENDDO
IF (minus_q) THEN
ALLOCATE(upert_mq(lr_npert, lr_npert))
upert_mq(1, 1) = (1.d0, 0.d0)
ENDIF ! minus_q
!
nsolv=1
IF (noncolin.AND.domag) nsolv=2
!
@ -497,6 +511,8 @@ SUBROUTINE hp_solve_linear_system (na, iq)
DEALLOCATE (dvscfin)
DEALLOCATE (dvscfout)
DEALLOCATE (trace_dns_tot_old)
DEALLOCATE (upert)
IF (minus_q) DEALLOCATE(upert_mq)
!
IF (ALLOCATED(dbecsum_nc)) DEALLOCATE (dbecsum_nc)
IF (ALLOCATED(int3_nc)) DEALLOCATE(int3_nc)

View File

@ -8,38 +8,6 @@
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
FUNCTION KSDdot( n, A, incx, B, incy) result( res )
!
USE util_param, ONLY : DP
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: incx,incy,n
!
#if defined(__CUDA)
REAL(DP), DEVICE, INTENT(IN) :: A(n), B(n)
#else
REAL(DP), INTENT(IN) :: A(n), B(n)
REAL(DP), EXTERNAL :: ddot
#endif
!
REAL(DP) :: res
!
#if defined(__CUDA)
res = cublasDdot( n, A, incx, B, incy )
#else
res = ddot( n, A, incx, B, incy )
#endif
!
RETURN
!
END FUNCTION KSDdot
! define __VERBOSE to print a message after each eigenvalue is computed
!
!----------------------------------------------------------------------------
@ -102,7 +70,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
!
! ... external functions
!
REAL (DP), EXTERNAL :: ksDdot
REAL (DP), EXTERNAL :: MYDDOT
EXTERNAL hs_1psi_ptr, s_1psi_ptr
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! s_1psi( npwx, npw, psi, spsi )
@ -249,7 +217,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
!
!$acc host_data use_device(psi, hpsi)
e(m) = ksDdot( kdim2, psi(1,m), 1, hpsi, 1 )
e(m) = MYDDOT( kdim2, psi(1,m), 1, hpsi, 1 )
!$acc end host_data
!
CALL mp_sum( e(m), intra_bgrp_comm )
@ -271,8 +239,8 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
!
!$acc host_data use_device(spsi, g, ppsi)
es(1) = ksDdot( kdim2, spsi(1), 1, g(1), 1 )
es(2) = ksDdot( kdim2, spsi(1), 1, ppsi(1), 1 )
es(1) = MYDDOT( kdim2, spsi(1), 1, g(1), 1 )
es(2) = MYDDOT( kdim2, spsi(1), 1, ppsi(1), 1 )
!$acc end host_data
!
CALL mp_sum( es , intra_bgrp_comm )
@ -325,7 +293,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
!$acc host_data use_device(g)
gg1 = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
gg1 = MYDDOT( kdim2, g(1), 1, g0_d(1), 1 )
!$acc end host_data
!
CALL mp_sum( gg1, intra_bgrp_comm )
@ -341,7 +309,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
!$acc end kernels
!
!$acc host_data use_device(g)
gg = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
gg = MYDDOT( kdim2, g(1), 1, g0_d(1), 1 )
!$acc end host_data
!
CALL mp_sum( gg, intra_bgrp_comm )
@ -396,7 +364,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
!
!$acc host_data use_device(scg, cg)
cg0 = ksDdot( kdim2, cg(1), 1, scg(1), 1 )
cg0 = MYDDOT( kdim2, cg(1), 1, scg(1), 1 )
!$acc end host_data
!
CALL mp_sum( cg0 , intra_bgrp_comm )
@ -412,13 +380,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... <y(t)|P^2S|y(t)> = 1
!
!$acc host_data use_device(psi, ppsi)
a0 = 2.D0 * ksDdot( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0
a0 = 2.D0 * MYDDOT( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0
!$acc end host_data
!
CALL mp_sum( a0 , intra_bgrp_comm )
!
!$acc host_data use_device(cg, ppsi)
b0 = ksDdot( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2
b0 = MYDDOT( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2
!$acc end host_data
!
CALL mp_sum( b0 , intra_bgrp_comm )

View File

@ -5,26 +5,6 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
SUBROUTINE cgcudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
IMPLICIT NONE
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cgcudaDGEMV
! define __VERBOSE to print a message after each eigenvalue is computed
!----------------------------------------------------------------------------
SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
@ -87,7 +67,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
!
! ... external functions
!
REAL(DP), EXTERNAL :: ksDdot
REAL(DP), EXTERNAL :: MYDDOT
EXTERNAL hs_1psi_ptr, s_1psi_ptr
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! s_1psi( npwx, npw, psi, spsi )
@ -148,7 +128,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
lagrange = 0.d0
if(m_start.le.m_end) then
!$acc host_data use_device(psi, spsi)
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npwx2, spsi, 1, 0.D0, lagrange_d(m_start), 1 )
CALL MYDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npwx2, spsi, 1, 0.D0, lagrange_d(m_start), 1 )
!$acc end host_data
endif
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
@ -202,7 +182,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) )
!
!$acc host_data use_device(psi, hpsi)
e(m) = 2.D0 * ksDdot( npw2, psi(1,m), 1, hpsi, 1 )
e(m) = 2.D0 * MYDDOT( npw2, psi(1,m), 1, hpsi, 1 )
!$acc end host_data
!print *, 'e(m)', e(m)
IF ( gstart == 2 ) THEN
@ -232,8 +212,8 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
!
!$acc host_data use_device(spsi, g, ppsi)
es(1) = 2.D0 * ksDdot( npw2, spsi(1), 1, g(1), 1 )
es(2) = 2.D0 * ksDdot( npw2, spsi(1), 1, ppsi(1), 1 )
es(1) = 2.D0 * MYDDOT( npw2, spsi(1), 1, g(1), 1 )
es(2) = 2.D0 * MYDDOT( npw2, spsi(1), 1, ppsi(1), 1 )
!$acc end host_data
!
IF ( gstart == 2 ) THEN
@ -270,7 +250,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
if(m_start.le.m_end) then
!$acc host_data use_device(psi, scg)
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npw2, scg, 1, 0.D0, lagrange_d(m_start), 1 )
CALL MYDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npw2, scg, 1, 0.D0, lagrange_d(m_start), 1 )
!$acc end host_data
endif
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
@ -301,7 +281,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
!$acc host_data use_device(g)
gg1 = 2.D0 * ksDdot( npw2, g(1), 1, g0_d(1), 1 )
gg1 = 2.D0 * MYDDOT( npw2, g(1), 1, g0_d(1), 1 )
!$acc end host_data
IF ( gstart == 2 ) THEN
!$acc update self(g)
@ -328,7 +308,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
!$acc end kernels
!
!$acc host_data use_device(g)
gg = 2.D0 * ksDdot( npw2, g(1), 1, g0_d(1), 1 )
gg = 2.D0 * MYDDOT( npw2, g(1), 1, g0_d(1), 1 )
!$acc end host_data
IF ( gstart == 2 ) THEN
!$acc update self(g)
@ -390,7 +370,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
!
!$acc host_data use_device(cg, scg)
cg0 = 2.D0 * ksDdot( npw2, cg(1), 1, scg(1), 1 )
cg0 = 2.D0 * MYDDOT( npw2, cg(1), 1, scg(1), 1 )
!$acc end host_data
IF ( gstart == 2 ) THEN
!$acc update self(cg, scg)
@ -411,7 +391,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
! ... <y(t)|P^2S|y(t)> = 1
!
!$acc host_data use_device(psi, ppsi)
a0 = 4.D0 * ksDdot( npw2, psi(1,m), 1, ppsi(1), 1 )
a0 = 4.D0 * MYDDOT( npw2, psi(1,m), 1, ppsi(1), 1 )
!$acc end host_data
IF ( gstart == 2 ) THEN
!$acc update self(psi, ppsi)
@ -425,7 +405,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
CALL mp_sum( a0 , intra_bgrp_comm )
!
!$acc host_data use_device(cg, ppsi)
b0 = 2.D0 * ksDdot( npw2, cg(1), 1, ppsi(1), 1 )
b0 = 2.D0 * MYDDOT( npw2, cg(1), 1, ppsi(1), 1 )
!$acc end host_data
IF ( gstart == 2 ) THEN
!$acc update self(cg, ppsi)

View File

@ -127,7 +127,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
!
! ... Set initial : |phi_j> = |psi_j>
!
CALL DCOPY_gpu( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
CALL MYDCOPY( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
!
@ -141,7 +141,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
!
IF ( eigen_ ) THEN
!
CALL DCOPY_gpu( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
CALL MYDCOPY( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
@ -155,7 +155,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
!
IF ( uspp ) THEN
!
CALL DCOPY_gpu( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
CALL MYDCOPY( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
@ -217,14 +217,14 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
! ... Copy psi <- phi
!
!CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
CALL DCOPY_gpu( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
CALL MYDCOPY( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
!
IF ( eigen_ ) &
CALL DCOPY_gpu( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
CALL MYDCOPY( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
!CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
!
IF ( uspp ) &
CALL DCOPY_gpu( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
CALL MYDCOPY( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
!CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
!
! ... Calculate energy eigenvalues
@ -259,7 +259,7 @@ CONTAINS
INTEGER :: ibnd
REAL(DP) :: norm
REAL(DP) :: psi_ibnd
REAL(DP), EXTERNAL :: gpu_DDOT
REAL(DP), EXTERNAL :: MYDDOT
!
DO ibnd = ibnd_start, ibnd_end
!
@ -269,24 +269,24 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
CALL MYDGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
spsi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
!
IF ( gstart == 2 ) THEN
psi_ibnd = -spsi_d(1,ibnd)
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, &
CALL MYDAXPY( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, &
sr_d(1), 1 )
END IF
!
ELSE
!
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
CALL MYDGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
psi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
!
IF ( gstart == 2 ) THEN
psi_ibnd = -psi_d(1,ibnd)
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, &
CALL MYDAXPY( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, &
sr_d(1), 1 )
END IF
!
@ -296,7 +296,7 @@ CONTAINS
!
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
!
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, &
CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, &
sr_d(1), 1, 1._DP, phi_d(1,ibnd), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
@ -309,7 +309,7 @@ CONTAINS
!
IF ( eigen_ ) THEN
!
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, &
CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, &
sr_d(1), 1, 1._DP, hphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
@ -324,7 +324,7 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, &
CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, &
sr_d(1), 1, 1._DP, sphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
@ -343,7 +343,7 @@ CONTAINS
!
IF ( uspp ) THEN
!
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 )
norm = 2._DP * MYDDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 )
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
@ -354,7 +354,7 @@ CONTAINS
!
ELSE
!
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 )
norm = 2._DP * MYDDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 )
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
@ -372,7 +372,7 @@ CONTAINS
IF ( norm < eps16 ) &
CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 )
!
CALL DSCAL_gpu( npw2, 1._DP / norm, phi_d(1,ibnd), 1 )
CALL MYDSCAL( npw2, 1._DP / norm, phi_d(1,ibnd), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
@ -384,7 +384,7 @@ CONTAINS
!
IF ( eigen_ ) THEN
!
CALL DSCAL_gpu( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 )
CALL MYDSCAL( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
@ -398,7 +398,7 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL DSCAL_gpu( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 )
CALL MYDSCAL( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
@ -435,20 +435,20 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
CALL MYDGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
spsi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize )
!
IF ( gstart == 2 ) &
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
CALL MYDGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
spsi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize )
!
ELSE
!
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
CALL MYDGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
psi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize )
!
IF ( gstart == 2 ) &
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
CALL MYDGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
psi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize )
!
END IF
@ -457,7 +457,7 @@ CONTAINS
!
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
!
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, &
CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, &
sr2_d(1,1), nbsize, 1._DP, phi_d(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
@ -471,7 +471,7 @@ CONTAINS
!
IF ( eigen_ ) THEN
!
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, &
CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, &
sr2_d(1,1), nbsize, 1._DP, hphi_d(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
@ -487,7 +487,7 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, &
CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, &
sr2_d(1,1), nbsize, 1._DP, sphi_d(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
@ -513,7 +513,7 @@ CONTAINS
INTEGER :: ibnd, ibnd_start, ibnd_end
REAL(DP) :: e_ibnd
!
REAL(DP), EXTERNAL :: gpu_DDOT
REAL(DP), EXTERNAL :: MYDDOT
!
! ... <psi_i| H |psi_i>
!
@ -523,7 +523,7 @@ CONTAINS
!
DO ibnd = ibnd_start, ibnd_end
!
e(ibnd) = 2._DP * gpu_DDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 )
e(ibnd) = 2._DP * MYDDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 )
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
@ -563,13 +563,13 @@ CONTAINS
e(ibnd) = e(ibnd-1)
e(ibnd-1) = e0
!
CALL DSWAP_gpu( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
CALL MYDSWAP( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
!
IF ( eigen_ ) &
CALL DSWAP_gpu( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
CALL MYDSWAP( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
!
IF ( uspp ) &
CALL DSWAP_gpu( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
CALL MYDSWAP( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
!
END IF
!

View File

@ -142,13 +142,13 @@ SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e,
!
! ... Set initial : |phi_j> = |psi_j>
!
CALL ZCOPY_gpu( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
CALL MYZCOPY( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
!
IF ( eigen_ ) &
CALL ZCOPY_gpu( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
CALL MYZCOPY( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
!
IF ( uspp ) &
CALL ZCOPY_gpu( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
CALL MYZCOPY( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
!
!
! ... Allocate buffers
@ -204,13 +204,13 @@ SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e,
!
! ... Copy psi <- phi
!
CALL ZCOPY_gpu( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
CALL MYZCOPY( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
!
IF ( eigen_ ) &
CALL ZCOPY_gpu( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
CALL MYZCOPY( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
!
IF ( uspp ) &
CALL ZCOPY_gpu( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
CALL MYZCOPY( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
!
! ... Calculate energy eigenvalues
!
@ -240,7 +240,7 @@ CONTAINS
!
INTEGER :: ibnd
REAL(DP) :: norm
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
COMPLEX(DP), EXTERNAL :: MYZDOTC
!
!
DO ibnd = ibnd_start, ibnd_end
@ -251,12 +251,12 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
CALL MYZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
spsi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
!
ELSE
!
CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
CALL MYZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
psi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
!
END IF
@ -266,16 +266,16 @@ CONTAINS
!
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
!
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, &
CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, &
sc_d(1), 1, ONE, phi_d(1,ibnd), 1 )
!
!
IF ( eigen_ ) &
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, &
CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, &
sc_d(1), 1, ONE, hphi_d(1,ibnd), 1 )
!
IF ( uspp ) &
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, &
CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, &
sc_d(1), 1, ONE, sphi_d(1,ibnd), 1 )
!
END IF
@ -284,11 +284,11 @@ CONTAINS
!
IF ( uspp ) THEN
!
norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) )
norm = DBLE( MYZDOTC( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) )
!
ELSE
!
norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) )
norm = DBLE( MYZDOTC( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) )
!
END IF
!
@ -299,13 +299,13 @@ CONTAINS
IF ( norm < eps16 ) &
CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
!
CALL ZDSCAL_gpu( kdim, 1._DP / norm, phi_d(1,ibnd), 1 )
CALL MYZDSCAL( kdim, 1._DP / norm, phi_d(1,ibnd), 1 )
!
IF ( eigen_ ) &
CALL ZDSCAL_gpu( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 )
CALL MYZDSCAL( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 )
!
IF ( uspp ) &
CALL ZDSCAL_gpu( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 )
CALL MYZDSCAL( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 )
!
END DO
!
@ -332,12 +332,12 @@ CONTAINS
!
IF ( uspp ) THEN
!
CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
CALL MYZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
spsi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize )
!
ELSE
!
CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
CALL MYZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
psi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize )
!
END IF
@ -346,15 +346,15 @@ CONTAINS
!
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
!
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, &
CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, &
sc2_d(1,1), nbsize, ONE, phi_d(1,jbnd_start), kdmx )
!
IF ( eigen_ ) &
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, &
CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, &
sc2_d(1,1), nbsize, ONE, hphi_d(1,jbnd_start), kdmx )
!
IF ( uspp ) &
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, &
CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, &
sc2_d(1,1), nbsize, ONE, sphi_d(1,jbnd_start), kdmx )
!
RETURN
@ -368,7 +368,7 @@ CONTAINS
!
INTEGER :: ibnd, ibnd_start, ibnd_end
!
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
COMPLEX(DP), EXTERNAL :: MYZDOTC
!
! ... <psi_i| H |psi_i>
!
@ -378,7 +378,7 @@ CONTAINS
!
DO ibnd = ibnd_start, ibnd_end
!
e(ibnd) = DBLE( ZDOTC_gpu( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) )
e(ibnd) = DBLE( MYZDOTC( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) )
!
END DO
!
@ -410,13 +410,13 @@ CONTAINS
e(ibnd) = e(ibnd-1)
e(ibnd-1) = e0
!
CALL ZSWAP_gpu( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
CALL MYZSWAP( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
!
IF ( eigen_ ) &
CALL ZSWAP_gpu( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
CALL MYZSWAP( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
!
IF ( uspp ) &
CALL ZSWAP_gpu( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
CALL MYZSWAP( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
!
END IF
!

View File

@ -1,252 +1,3 @@
subroutine gpu_DGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 transa, transb
integer :: m, n, k, lda, ldb, ldc
DOUBLE PRECISION :: alpha, beta
DOUBLE PRECISION, dimension(lda, *) :: A
DOUBLE PRECISION, dimension(ldb, *) :: B
DOUBLE PRECISION, dimension(ldc, *) :: C
#if defined(__CUDA)
attributes(device) :: A, B, C
call cublasDGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
return
end subroutine gpu_DGEMM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_ZGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 transa, transb
integer :: m, n, k, lda, ldb, ldc
DOUBLE COMPLEX :: alpha, beta
DOUBLE COMPLEX, dimension(lda, *) :: A
DOUBLE COMPLEX, dimension(ldb, *) :: B
DOUBLE COMPLEX, dimension(ldc, *) :: C
#if defined(__CUDA)
attributes(device) :: A, B, C
call cublasZGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
return
end subroutine gpu_ZGEMM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_DGER (m, n, alpha, x, incx, y, incy, a, lda)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: m, n, lda, incx, incy
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: A
DOUBLE PRECISION, dimension(*) :: x, y
#if defined(__CUDA)
attributes(device) :: A, x, y
call cublasDGER(m, n, alpha, x, incx, y, incy, a, lda)
#endif
return
end subroutine gpu_DGER
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
function gpu_DDOT (n, dx, incx, dy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
DOUBLE PRECISION :: gpu_DDOT
integer :: n, incx, incy
DOUBLE PRECISION, dimension(*) :: dx, dy
#if defined(__CUDA)
attributes(device) :: dx, dy
gpu_DDOT=cublasDDOT(n, dx, incx, dy, incy)
#endif
return
end function gpu_DDOT
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_DTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 :: side, uplo, transa, diag
integer :: m, n, lda, ldb
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: a
DOUBLE PRECISION, dimension(ldb, *) :: b
#if defined(__CUDA)
attributes(device) :: a, b
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#endif
return
end subroutine gpu_DTRSM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
DOUBLE COMPLEX function ZDOTC_gpu(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
ZDOTC_gpu = cublasZDOTC(n, zx, incx, zy, incy)
#endif
return
end function ZDOTC_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZSWAP_gpu(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZSWAP(n, zx, incx, zy, incy)
#endif
return
END SUBROUTINE ZSWAP_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZCOPY_gpu(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZCOPY(n, zx, incx, zy, incy)
#endif
RETURN
END SUBROUTINE ZCOPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZAXPY_gpu(n, za, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE COMPLEX :: za
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZAXPY(n, za, zx, incx, zy, incy)
#endif
RETURN
END SUBROUTINE ZAXPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZGEMV_gpu(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
CHARACTER :: trans
INTEGER :: lda, m, n, incx, incy
DOUBLE COMPLEX :: alpha, beta
DOUBLE COMPLEX, dimension(lda, *) :: a
DOUBLE COMPLEX, dimension(*) :: x, y
#if defined(__CUDA)
attributes(device) :: a, x, y
CALL cublasZGEMV(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
#endif
RETURN
END SUBROUTINE ZGEMV_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZDSCAL_gpu(n, da, zx, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx
DOUBLE PRECISION :: da
DOUBLE COMPLEX, dimension(*) :: zx
#if defined(__CUDA)
attributes(device) :: zx
CALL cublasZDSCAL(n, da, zx, incx)
#endif
RETURN
END SUBROUTINE ZDSCAL_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZSCAL_gpu(n, za, zx, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx
DOUBLE COMPLEX :: za
DOUBLE COMPLEX, dimension(*) :: zx
#if defined(__CUDA)
attributes(device) :: zx
CALL cublasZSCAL(n, za, zx, incx)
#endif
RETURN
END SUBROUTINE ZSCAL_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DGEMV_gpu(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cublas
#endif
IMPLICIT NONE
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
call cublasDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#endif
RETURN
END SUBROUTINE DGEMV_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DCOPY_gpu(n, x, incx, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE PRECISION, INTENT(IN) :: x(*)
DOUBLE PRECISION, INTENT(OUT) :: y(*)
#if defined(__CUDA)
attributes(device) :: x, y
call cublasDCOPY(n, x, incx, y, incy)
#endif
RETURN
END SUBROUTINE DCOPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DAXPY_gpu(n, a, x, incx, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE PRECISION, INTENT(IN) :: a
DOUBLE PRECISION, INTENT(IN) :: x(*)
DOUBLE PRECISION, INTENT(OUT) :: y(*)
#if defined(__CUDA)
attributes(device) :: x, y
call cublasDAXPY( n, a, x, incx, y, incy)
#endif
RETURN
END SUBROUTINE DAXPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DSCAL_gpu(n, a, x, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
integer :: n, incx
DOUBLE PRECISION :: a
DOUBLE PRECISION, dimension(*) :: x
#if defined(__CUDA)
attributes(device) :: x
call cublasDSCAL(n, a, x, incx)
#endif
RETURN
END SUBROUTINE DSCAL_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_memset(array, val, length)
!
@ -384,18 +135,3 @@ SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2
END IF
!
END SUBROUTINE gpu_threaded_backassign
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DSWAP_gpu(n, dx, incx, dy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
REAL(8), dimension(*) :: dx, dy
#if defined(__CUDA)
attributes(device) :: dx, dy
CALL cublasDSWAP(n, dx, incx, dy, incy)
#endif
return
END SUBROUTINE DSWAP_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -191,8 +191,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', nbnd, my_n, npw2, 2.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
CALL MYDGEMM('T','N', nbnd, my_n, npw2, 2.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
IF ( gstart == 2 ) CALL MYDGER( nbnd, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
CALL mp_sum( G_d, inter_bgrp_comm )
CALL mp_sum( G_d, intra_bgrp_comm )
call stop_clock('ppcg:dgemm')
@ -203,10 +203,10 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (overlap) then
if (n_start .le. n_end) &
CALL gpu_DGEMM('N','N',npw2, nbnd, my_n, -ONE, spsi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
CALL MYDGEMM('N','N',npw2, nbnd, my_n, -ONE, spsi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
else
if (n_start .le. n_end) &
CALL gpu_DGEMM('N','N',npw2, nbnd, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
CALL MYDGEMM('N','N',npw2, nbnd, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
end if
CALL mp_sum( w_d, inter_bgrp_comm )
call stop_clock('ppcg:dgemm')
@ -260,12 +260,12 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (overlap) then
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'T','N', my_n, nact, npw2, 2.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
CALL MYDGEMM( 'T','N', my_n, nact, npw2, 2.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
else
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'T','N', my_n, nact, npw2, 2.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
CALL MYDGEMM( 'T','N', my_n, nact, npw2, 2.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
end if
G = G_d
CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm )
@ -277,7 +277,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer_d, w_d, npwx, nact, .true., act_idx_d, .true. )
if (n_start .le. n_end) &
CALL gpu_DGEMM('N','N', npw2, nact, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, nact, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
buffer = buffer_d
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
buffer_d = buffer
@ -343,8 +343,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer1_d, p_d, npwx, nact, .true., act_idx_d, .false. )
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', my_n, nact, npw2, 2.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, 0.D0, G_d(n_start,1), nbnd)
IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, G_d(n_start,1), nbnd )
CALL MYDGEMM('T','N', my_n, nact, npw2, 2.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, 0.D0, G_d(n_start,1), nbnd)
IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, G_d(n_start,1), nbnd )
G = G_d
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
CALL mp_sum( G(1:nact,1:nact), intra_bgrp_comm )
@ -356,7 +356,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer_d, p_d, npwx, nact, .true., act_idx_d, .true. )
call gpu_threaded_assign( buffer1_d, psi_d, npwx, nact, .true., act_idx_d, .false. )
if (n_start .le. n_end) & ! could be done differently
CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
buffer = buffer_d
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
buffer_d = buffer
@ -368,7 +368,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer_d, hp_d, npwx, nact, .true., act_idx_d, .true. )
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, nact, .true., act_idx_d, .false. )
if (n_start .le. n_end) &
CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
buffer = buffer_d
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
buffer_d = buffer
@ -381,7 +381,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer_d, sp_d, npwx, nact, .true., act_idx_d, .true. )
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, nact, .true., act_idx_d, .false. )
if (n_start .le. n_end) &
CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
buffer = buffer_d
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
buffer_d = buffer
@ -416,44 +416,44 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d, sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d, sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d, sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d, sbsize3 )
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false.)
else
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
end if
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d, sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d, sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d, sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d, sbsize3 )
!
! ---
call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, l+1), sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, l+1), sbsize3 )
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
end if
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, l+1 ), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, l+1), sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, l+1 ), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, l+1), sbsize3 )
!
! ---
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, l+1), sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, l+1), sbsize3 )
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
end if
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, l+1), sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, l+1), sbsize3 )
call stop_clock('ppcg:dgemm')
!
! ---
@ -463,30 +463,30 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer_d, p_d, npwx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(2*l + 1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(2*l + 1, 2*l+1 ), sbsize3 )
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(2*l + 1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(2*l + 1, 2*l+1 ), sbsize3 )
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
end if
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(2*l + 1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(2*l + 1, 2*l+1 ), sbsize3)
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(2*l + 1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(2*l + 1, 2*l+1 ), sbsize3)
!
! ---
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, 2*l+1), sbsize3)
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, 2*l+1), sbsize3)
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
end if
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, 2*l+1), sbsize3)
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, 2*l+1), sbsize3)
call stop_clock('ppcg:dgemm')
!
! ---
@ -494,16 +494,16 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, 2*l+1), sbsize3)
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, 2*l+1), sbsize3)
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
end if
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, 2*l+1), sbsize3)
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, 2*l+1), sbsize3)
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, 2*l+1), sbsize3)
call stop_clock('ppcg:dgemm')
!
END IF
@ -594,18 +594,18 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
! p(:,col_idx(1:l)) = buffer(:,1:l)
call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d )
call stop_clock('ppcg:dgemm')
!
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
! hp(:,col_idx(1:l)) = buffer(:,1:l)
call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d )
call stop_clock('ppcg:dgemm')
@ -613,9 +613,9 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
if (overlap) then
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
! sp(:,col_idx(1:l)) = buffer(:,1:l)
call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d )
call stop_clock('ppcg:dgemm')
@ -624,14 +624,14 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
! p(:,col_idx(1:l)) = buffer(:, 1:l)
call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d )
call stop_clock('ppcg:dgemm')
!
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
! hp(:,col_idx(1:l)) = buffer(:, 1:l)
call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d )
call stop_clock('ppcg:dgemm')
@ -639,7 +639,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
if (overlap) then
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
! sp(:,col_idx(1:l)) = buffer(:, 1:l)
call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d )
call stop_clock('ppcg:dgemm')
@ -649,14 +649,14 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
! Update the sub-blocks of psi and hpsi (and spsi)
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, psi_d, npwx, l, .true., col_idx_d )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
! psi(:,col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l))
call gpu_threaded_backassign( psi_d, col_idx_d, buffer_d, npwx, l, .true., p_d )
call stop_clock('ppcg:dgemm')
!
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
! hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l))
call gpu_threaded_backassign( hpsi_d, col_idx_d, buffer_d, npwx, l, .true., hp_d )
call stop_clock('ppcg:dgemm')
@ -664,7 +664,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
if (overlap) then
call start_clock('ppcg:dgemm')
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false. )
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
! spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l))
call gpu_threaded_backassign( spsi_d, col_idx_d, buffer_d, npwx, l, .true., sp_d )
call stop_clock('ppcg:dgemm')
@ -857,8 +857,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', nact, my_n, npw2, 2.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
IF ( gstart == 2 ) CALL gpu_DGER( nact, my_n, -1.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
CALL MYDGEMM('T','N', nact, my_n, npw2, 2.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
IF ( gstart == 2 ) CALL MYDGER( nact, my_n, -1.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
G = G_d
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
!
@ -875,7 +875,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
end if
call start_clock('ppcg:dgemm')
if (n_start .le. n_end) &
CALL gpu_DGEMM('N','N',npw2, nact, my_n, -ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
CALL MYDGEMM('N','N',npw2, nact, my_n, -ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
buffer = buffer_d
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
buffer_d = buffer
@ -1779,13 +1779,13 @@ CONTAINS
! this proc sends his block
!
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
CALL gpu_DGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gl, nx, gamm, Xtmp(1,ic), ld2 )
CALL MYDGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gl, nx, gamm, Xtmp(1,ic), ld2 )
ELSE
!
! all other procs receive
!
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
CALL gpu_DGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gltmp, nx, gamm, Xtmp(1,ic), ld2 )
CALL MYDGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gltmp, nx, gamm, Xtmp(1,ic), ld2 )
END IF
!
gamm = ONE

View File

@ -196,7 +196,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
G_d = C_ZERO
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi_d, kdimx, hpsi_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd)
CALL MYZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi_d, kdimx, hpsi_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd)
CALL mp_sum( G_d, inter_bgrp_comm )
!
CALL mp_sum( G_d, intra_bgrp_comm )
@ -208,10 +208,10 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (overlap) then
if (n_start .le. n_end) &
CALL gpu_ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
CALL MYZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
else
if (n_start .le. n_end) &
CALL gpu_ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
CALL MYZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
end if
CALL mp_sum( w_d, inter_bgrp_comm )
call stop_clock('ppcg:zgemm')
@ -267,11 +267,11 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (overlap) then
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, &
CALL MYZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, &
C_ZERO, G_d(n_start,1), nbnd )
else
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, &
CALL MYZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, &
C_ZERO, G_d(n_start,1), nbnd )
end if
G = G_d
@ -285,7 +285,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer_d, w_d, kdimx, nact, .true., act_idx_d, .true. )
if (n_start .le. n_end) &
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, &
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, &
buffer_d, kdimx)
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
!$cuf kernel DO(2)
@ -377,7 +377,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer1_d, p_d, kdimx, nact, .true., act_idx_d, .false. )
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, &
CALL MYZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, &
kdimx, C_ZERO, G_d(n_start,1), nbnd)
G = G_d
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
@ -391,7 +391,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer_d, p_d, kdimx, nact, .true., act_idx_d, .true. )
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, nact, .true., act_idx_d, .false. )
if (n_start .le. n_end) & ! could be done differently
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
nbnd, C_ONE, buffer_d, kdimx)
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
!$cuf kernel do(2)
@ -406,7 +406,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer_d, hp_d, kdimx, nact, .true., act_idx_d, .true. )
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, nact, .true., act_idx_d, .false. )
if (n_start .le. n_end) &
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
nbnd, C_ONE, buffer_d, kdimx)
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
!$cuf kernel do(2)
@ -422,7 +422,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call gpu_threaded_assign( buffer_d, sp_d, kdimx, nact, .true., act_idx_d, .true. )
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, nact, .true., act_idx_d, .false. )
if (n_start .le. n_end) &
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
nbnd, C_ONE, buffer_d, kdimx)
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
!$cuf kernel do(2)
@ -460,7 +460,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d, sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d, sbsize3)
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. )
@ -475,19 +475,19 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
else
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d, sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d, sbsize3)
!
! ---
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, l+1), sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, l+1), sbsize3)
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, l+1 ), sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, l+1 ), sbsize3)
!
! ---
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
@ -500,14 +500,14 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
END DO
END DO
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, l+1), sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, l+1), sbsize3)
!
if (overlap) then
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
else
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, l+1), sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, l+1), sbsize3)
call stop_clock('ppcg:zgemm')
!
! ---
@ -517,7 +517,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer_d, p_d, kdimx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
K_d(2*l + 1, 2*l+1), sbsize3)
!
if (overlap) then
@ -525,13 +525,13 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
else
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
M_d(2*l + 1, 2*l+1), sbsize3)
!
! ---
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
K_d(1, 2*l+1), sbsize3)
!
if (overlap) then
@ -539,7 +539,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
else
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, 2*l+1), sbsize3)
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, 2*l+1), sbsize3)
call stop_clock('ppcg:zgemm')
!
! ---
@ -547,7 +547,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
K_d(l+1, 2*l+1), sbsize3)
!
if (overlap) then
@ -555,7 +555,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
else
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
end if
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
M_d(l+1, 2*l+1), sbsize3)
call stop_clock('ppcg:zgemm')
!
@ -640,9 +640,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -653,9 +653,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -667,9 +667,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
if (overlap) then
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -682,7 +682,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -693,7 +693,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -705,7 +705,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
if (overlap) then
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, c_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, c_ZERO, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -719,7 +719,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
! Update the sub-blocks of psi and hpsi (and spsi)
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -730,7 +730,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
!
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -742,7 +742,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
if (overlap) then
call start_clock('ppcg:zgemm')
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. )
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
!$cuf kernel do(2)
DO ii = 1, kdimx
DO jj = 1, l
@ -968,7 +968,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
END DO
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, &
CALL MYZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, &
C_ZERO, G_d(1,n_start), nbnd)
G = G_d
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
@ -986,7 +986,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
end if
call start_clock('ppcg:zgemm')
if (n_start .le. n_end) &
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
nbnd, C_ONE, buffer_d, kdimx)
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
!$cuf kernel do(2)
@ -1792,13 +1792,13 @@ CONTAINS
! this proc sends his block
!
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
CALL gpu_ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld )
CALL MYZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld )
ELSE
!
! all other procs receive
!
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
CALL gpu_ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld )
CALL MYZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld )
END IF
!
gamm = C_ONE

View File

@ -285,7 +285,7 @@ CONTAINS
!
INTEGER :: ibnd
!
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
COMPLEX(DP), EXTERNAL :: MYZDOTC
!
! ... Operate the Hamiltonian : H |psi>
!
@ -312,7 +312,7 @@ CONTAINS
!$acc host_data use_device(psi, hpsi)
DO ibnd = ibnd_start, ibnd_end
!
hw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) )
hw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) )
!
END DO
!$acc end host_data
@ -326,11 +326,11 @@ CONTAINS
!
IF ( uspp ) THEN
!
sw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) )
sw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) )
!
ELSE
!
sw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, psi(1,ibnd), 1 ) )
sw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, psi(1,ibnd), 1 ) )
!
END IF
!
@ -394,10 +394,10 @@ CONTAINS
IF ( conv(ibnd) ) CYCLE
!
!$acc host_data use_device(psi, hpsi, spsi)
CALL ZCOPY_gpu( kdim, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
CALL ZCOPY_gpu( kdim, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
CALL MYZCOPY( kdim, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
CALL MYZCOPY( kdim, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
IF ( uspp ) &
CALL ZCOPY_gpu( kdim, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
CALL MYZCOPY( kdim, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
!$acc end host_data
!
php(ibnd,idiis) = hw(ibnd)
@ -419,15 +419,15 @@ CONTAINS
!
ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP )
!
CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYZCOPY( kdim, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
IF ( uspp ) THEN
!
CALL ZAXPY_gpu( kdim, -ec, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYZAXPY( kdim, -ec, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
ELSE
!
CALL ZAXPY_gpu( kdim, -ec, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYZAXPY( kdim, -ec, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
END IF
!
@ -435,19 +435,19 @@ CONTAINS
!
ec = CMPLX( php(ibnd,idiis), 0._DP, kind=DP )
!
CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYZCOPY( kdim, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
IF ( uspp ) THEN
!
CALL ZAXPY_gpu( kdim, -ec, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYZAXPY( kdim, -ec, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
ELSE
!
CALL ZAXPY_gpu( kdim, -ec, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYZAXPY( kdim, -ec, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
END IF
!
CALL ZGEMV_gpu( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
CALL MYZGEMV( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 )
!
END DO
@ -482,21 +482,21 @@ CONTAINS
!
DO kdiis = 1, idiis
!
CALL ZCOPY_gpu( kdim, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYZCOPY( kdim, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
END DO
!
IF ( uspp ) THEN
!
CALL ZCOPY_gpu( kdim, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYZCOPY( kdim, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
ELSE
!
CALL ZCOPY_gpu( kdim, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYZCOPY( kdim, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
END IF
!
CALL ZGEMV_gpu( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
CALL MYZGEMV( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 )
!
END DO
@ -550,29 +550,29 @@ CONTAINS
kvc = vc(kdiis)
!
!$acc host_data use_device(psi, hpsi, spsi)
CALL ZAXPY_gpu( kdim, kvc, phi_d (1,ibnd,kdiis), 1, psi(1,ibnd), 1 )
CALL ZAXPY_gpu( kdim, kvc, hphi_d (1,ibnd,kdiis), 1, hpsi (1,ibnd), 1 )
IF (uspp) CALL ZAXPY_gpu( kdim, kvc, sphi_d (1,ibnd,kdiis), 1, spsi (1,ibnd), 1 )
CALL MYZAXPY( kdim, kvc, phi_d (1,ibnd,kdiis), 1, psi(1,ibnd), 1 )
CALL MYZAXPY( kdim, kvc, hphi_d (1,ibnd,kdiis), 1, hpsi (1,ibnd), 1 )
IF (uspp) CALL MYZAXPY( kdim, kvc, sphi_d (1,ibnd,kdiis), 1, spsi (1,ibnd), 1 )
!$acc end host_data
!
! ... Residual vectors
!
ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP )
!
CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
CALL MYZCOPY( kdim, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
!
IF ( uspp ) THEN
!
CALL ZAXPY_gpu( kdim, -ec, sphi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
CALL MYZAXPY( kdim, -ec, sphi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
!
ELSE
!
CALL ZAXPY_gpu( kdim, -ec, phi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
CALL MYZAXPY( kdim, -ec, phi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
!
END IF
!
!$acc host_data use_device(kpsi)
CALL ZAXPY_gpu( kdim, kvc, vec1_d (1), 1, kpsi (1,kbnd), 1 )
CALL MYZAXPY( kdim, kvc, vec1_d (1), 1, kpsi (1,kbnd), 1 )
!$acc end host_data
!
END DO
@ -583,10 +583,10 @@ CONTAINS
!
norm = SQRT( sw(ibnd) )
!$acc host_data use_device(psi, hpsi, spsi)
CALL ZDSCAL_gpu( kdim, 1._DP / norm, psi (1,ibnd), 1 )
CALL ZDSCAL_gpu( kdim, 1._DP / norm, hpsi(1,ibnd), 1 )
CALL MYZDSCAL( kdim, 1._DP / norm, psi (1,ibnd), 1 )
CALL MYZDSCAL( kdim, 1._DP / norm, hpsi(1,ibnd), 1 )
IF ( uspp ) &
CALL ZDSCAL_gpu( kdim, 1._DP / norm, spsi(1,ibnd), 1 )
CALL MYZDSCAL( kdim, 1._DP / norm, spsi(1,ibnd), 1 )
!$acc end host_data
!
! ... Residual vectors
@ -594,15 +594,15 @@ CONTAINS
ec = CMPLX( hw(ibnd), 0._DP, kind=DP )
!
!$acc host_data use_device(psi, spsi, hpsi, kpsi)
CALL ZCOPY_gpu( kdim, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
CALL MYZCOPY( kdim, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
!
IF ( uspp ) THEN
!
CALL ZAXPY_gpu( kdim, -ec, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
CALL MYZAXPY( kdim, -ec, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
!
ELSE
!
CALL ZAXPY_gpu( kdim, -ec, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
CALL MYZAXPY( kdim, -ec, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
!
END IF
!$acc end host_data
@ -766,7 +766,7 @@ CONTAINS
COMPLEX(DP) :: z1, z2
REAL(DP), ALLOCATABLE :: coef(:,:)
!
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
COMPLEX(DP), EXTERNAL :: MYZDOTC
!
REAL(DP) :: ekinj
INTEGER :: idx
@ -901,21 +901,21 @@ CONTAINS
kbnd = ibnd_index(ibnd)
!
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
php = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) )
khp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) )
khk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) )
php = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) )
khp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) )
khk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) )
!
IF ( uspp ) THEN
!
psp = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) )
ksp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) )
ksk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) )
psp = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) )
ksp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) )
ksk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) )
!
ELSE
!
psp = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, psi (1,ibnd), 1 ) )
ksp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) )
ksk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) )
psp = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, psi (1,ibnd), 1 ) )
ksp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) )
ksk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) )
!
END IF
!$acc end host_data
@ -1029,16 +1029,16 @@ CONTAINS
z2 = CMPLX( coef(2,jbnd), 0._DP, kind=DP )
!
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
CALL ZSCAL_gpu( kdim, z1, psi (1,ibnd), 1 )
CALL ZAXPY_gpu( kdim, z2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
CALL MYZSCAL( kdim, z1, psi (1,ibnd), 1 )
CALL MYZAXPY( kdim, z2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
!
CALL ZSCAL_gpu( kdim, z1, hpsi (1,ibnd), 1 )
CALL ZAXPY_gpu( kdim, z2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
CALL MYZSCAL( kdim, z1, hpsi (1,ibnd), 1 )
CALL MYZAXPY( kdim, z2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
!
IF ( uspp ) THEN
!
CALL ZSCAL_gpu( kdim, z1, spsi (1,ibnd), 1 )
CALL ZAXPY_gpu( kdim, z2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
CALL MYZSCAL( kdim, z1, spsi (1,ibnd), 1 )
CALL MYZAXPY( kdim, z2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
!
END IF
!$acc end host_data

View File

@ -297,7 +297,7 @@ CONTAINS
!
INTEGER :: ibnd
!
REAL(DP), EXTERNAL :: gpu_DDOT
REAL(DP), EXTERNAL :: MYDDOT
!
! ... Operate the Hamiltonian : H |psi>
!
@ -324,10 +324,10 @@ CONTAINS
!$acc host_data use_device(psi, hpsi)
DO ibnd = ibnd_start, ibnd_end
!
hw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
hw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
!
IF ( gstart == 2 ) &
hw(ibnd)= hw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, hpsi(1,ibnd),1)
hw(ibnd)= hw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, hpsi(1,ibnd),1)
!
END DO
!$acc end host_data
@ -341,17 +341,17 @@ CONTAINS
!
IF ( uspp ) THEN
!
sw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, spsi(1,ibnd), 1 )
sw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, spsi(1,ibnd), 1 )
!
IF ( gstart == 2 ) &
sw(ibnd)= sw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, spsi(1,ibnd),1)
sw(ibnd)= sw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, spsi(1,ibnd),1)
!
ELSE
!
sw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, psi(1,ibnd), 1 )
sw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, psi(1,ibnd), 1 )
!
IF ( gstart == 2 ) &
sw(ibnd )= sw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, psi(1,ibnd),1)
sw(ibnd )= sw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, psi(1,ibnd),1)
!
END IF
!
@ -413,10 +413,10 @@ CONTAINS
IF ( conv(ibnd) ) CYCLE
!
!$acc host_data use_device(psi, hpsi, spsi)
CALL DCOPY_gpu( npw2, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
CALL DCOPY_gpu( npw2, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
CALL MYDCOPY( npw2, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
CALL MYDCOPY( npw2, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
IF ( uspp ) &
CALL DCOPY_gpu( npw2, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
CALL MYDCOPY( npw2, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
!$acc end host_data
!
php(ibnd,idiis) = hw(ibnd)
@ -438,15 +438,15 @@ CONTAINS
!
er = php(ibnd,kdiis)
!
CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYDCOPY( npw2, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
IF ( uspp ) THEN
!
CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
ELSE
!
CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
END IF
!
@ -454,19 +454,19 @@ CONTAINS
!
er = php(ibnd,idiis)
!
CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYDCOPY( npw2, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
IF ( uspp ) THEN
!
CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
ELSE
!
CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
END IF
!
CALL DGEMV_gpu( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
CALL MYDGEMV( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 )
!
IF ( gstart == 2 ) THEN
@ -507,21 +507,21 @@ CONTAINS
!
DO kdiis = 1, idiis
!
CALL DCOPY_gpu( npw2, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
CALL MYDCOPY( npw2, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
!
END DO
!
IF ( uspp ) THEN
!
CALL DCOPY_gpu( npw2, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYDCOPY( npw2, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
ELSE
!
CALL DCOPY_gpu( npw2, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
CALL MYDCOPY( npw2, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
!
END IF
!
CALL DGEMV_gpu( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
CALL MYDGEMV( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 )
!
IF ( gstart == 2 ) THEN
@ -580,30 +580,30 @@ CONTAINS
!
kvr = vr(kdiis)
!$acc host_data use_device(psi, hpsi, spsi)
CALL DAXPY_gpu( npw2, kvr, phi_d (1,ibnd,kdiis), 1, psi (1,ibnd), 1 )
CALL DAXPY_gpu( npw2, kvr, hphi_d(1,ibnd,kdiis), 1, hpsi(1,ibnd), 1 )
CALL MYDAXPY( npw2, kvr, phi_d (1,ibnd,kdiis), 1, psi (1,ibnd), 1 )
CALL MYDAXPY( npw2, kvr, hphi_d(1,ibnd,kdiis), 1, hpsi(1,ibnd), 1 )
IF ( uspp ) &
CALL DAXPY_gpu( npw2, kvr, sphi_d(1,ibnd,kdiis), 1, spsi(1,ibnd), 1 )
CALL MYDAXPY( npw2, kvr, sphi_d(1,ibnd,kdiis), 1, spsi(1,ibnd), 1 )
!$acc end host_data
!
! ... Residual vectors
!
er = php(ibnd,kdiis)
!
CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
CALL MYDCOPY( npw2, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
!
IF ( uspp ) THEN
!
CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
!
ELSE
!
CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
!
END IF
!
!$acc host_data use_device(kpsi)
CALL DAXPY_gpu( npw2, kvr, vec1_d(1), 1, kpsi(1,kbnd), 1 )
CALL MYDAXPY( npw2, kvr, vec1_d(1), 1, kpsi(1,kbnd), 1 )
!$acc end host_data
!
END DO
@ -614,10 +614,10 @@ CONTAINS
!
norm = SQRT( sw(ibnd) )
!$acc host_data use_device(psi, hpsi, spsi)
CALL DSCAL_gpu( npw2, 1._DP / norm, psi (1,ibnd), 1 )
CALL DSCAL_gpu( npw2, 1._DP / norm, hpsi(1,ibnd), 1 )
CALL MYDSCAL( npw2, 1._DP / norm, psi (1,ibnd), 1 )
CALL MYDSCAL( npw2, 1._DP / norm, hpsi(1,ibnd), 1 )
IF ( uspp ) &
CALL DSCAL_gpu( npw2, 1._DP / norm, spsi(1,ibnd), 1 )
CALL MYDSCAL( npw2, 1._DP / norm, spsi(1,ibnd), 1 )
!$acc end host_data
!
! ... Residual vectors
@ -625,21 +625,21 @@ CONTAINS
er = hw(ibnd)
!
!$acc host_data use_device(hpsi, kpsi)
CALL DCOPY_gpu( npw2, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
CALL MYDCOPY( npw2, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
!$acc end host_data
!
IF ( uspp ) THEN
!
!$acc host_data use_device(spsi, kpsi)
CALL DAXPY_gpu( npw2, -er, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
CALL MYDAXPY( npw2, -er, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
!$acc end host_data
!
ELSE
!
!civn: here changed spsi --> psi due to a possible typo in the original version
!CALL DAXPY_gpu( npw2, -er, spsi_d(1,ibnd), 1, kpsi_d(1,kbnd), 1 )
!CALL MYDAXPY( npw2, -er, spsi_d(1,ibnd), 1, kpsi_d(1,kbnd), 1 )
!$acc host_data use_device(psi, kpsi)
CALL DAXPY_gpu( npw2, -er, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
CALL MYDAXPY( npw2, -er, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
!$acc end host_data
!
END IF
@ -819,7 +819,7 @@ CONTAINS
REAL(DP) :: c1, c2
REAL(DP), ALLOCATABLE :: coef(:,:)
!
REAL(DP), EXTERNAL :: gpu_DDOT
REAL(DP), EXTERNAL :: MYDDOT
!civn
INTEGER :: idx
REAL(DP) :: ekinj
@ -989,9 +989,9 @@ CONTAINS
kbnd = ibnd_index(ibnd)
!
!$acc host_data use_device(psi, hpsi, kpsi, hkpsi)
php = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, hpsi (1,ibnd), 1 )
khp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 )
khk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 )
php = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, hpsi (1,ibnd), 1 )
khp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 )
khk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 )
!$acc end host_data
!
IF ( gstart == 2 ) THEN
@ -1020,9 +1020,9 @@ CONTAINS
IF ( uspp ) THEN
!
!$acc host_data use_device(psi, spsi, kpsi, skpsi)
psp = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, spsi (1,ibnd), 1 )
ksp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 )
ksk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 )
psp = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, spsi (1,ibnd), 1 )
ksp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 )
ksk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 )
!$acc end host_data
!
IF ( gstart == 2 ) THEN
@ -1051,9 +1051,9 @@ CONTAINS
ELSE
!
!$acc host_data use_device(psi, kpsi)
psp = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, psi (1,ibnd), 1 )
ksp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, psi (1,ibnd), 1 )
ksk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 )
psp = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, psi (1,ibnd), 1 )
ksp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, psi (1,ibnd), 1 )
ksk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 )
!$acc end host_data
!
IF ( gstart == 2 ) THEN
@ -1190,16 +1190,16 @@ CONTAINS
c2 = coef(2,jbnd)
!
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
CALL DSCAL_gpu( npw2, c1, psi (1,ibnd), 1 )
CALL DAXPY_gpu( npw2, c2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
CALL MYDSCAL( npw2, c1, psi (1,ibnd), 1 )
CALL MYDAXPY( npw2, c2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
!
CALL DSCAL_gpu( npw2, c1, hpsi (1,ibnd), 1 )
CALL DAXPY_gpu( npw2, c2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
CALL MYDSCAL( npw2, c1, hpsi (1,ibnd), 1 )
CALL MYDAXPY( npw2, c2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
!
IF ( uspp ) THEN
!
CALL DSCAL_gpu( npw2, c1, spsi (1,ibnd), 1 )
CALL DAXPY_gpu( npw2, c2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
CALL MYDSCAL( npw2, c1, spsi (1,ibnd), 1 )
CALL MYDAXPY( npw2, c2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
!
END IF
!$acc end host_data

View File

@ -257,9 +257,6 @@ CONTAINS
USE realus, ONLY : real_space, invfft_orbital_gamma, &
fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma
use gvect, only : gstart
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: m_start, m_end ,ntemp
@ -280,15 +277,10 @@ CONTAINS
CALL errore('ch_psi_all', 'non collin in gamma point not implemented',1)
ENDIF
#if defined(__CUDA)
!$acc host_data use_device(spsi, ps, evc)
CALL DGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd )
if(gstart==2) CALL gpu_DGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd )
CALL MYDGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd )
if(gstart==2) CALL MYDGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd )
!$acc end host_data
#else
CALL DGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd )
if(gstart==2) CALL DGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd )
#endif
!$acc kernels
ps (:,:) = ps(:,:) * alpha_pv
hpsi (:,:) = (0.d0, 0.d0)
@ -297,7 +289,7 @@ CONTAINS
CALL mp_sum ( ps, intra_bgrp_comm )
!$acc end host_data
!$acc host_data use_device(hpsi, ps, evc)
CALL DGEMM ('N', 'N', 2*n, m, ntemp , 1.d0 , evc, 2*npwx, ps, nbnd, 1.d0 , hpsi, 2*npwx)
CALL MYDGEMM ('N', 'N', 2*n, m, ntemp , 1.d0 , evc, 2*npwx, ps, nbnd, 1.d0 , hpsi, 2*npwx)
!$acc end host_data
!$acc kernels
spsi(:,:) = hpsi(:,:)

View File

@ -14,17 +14,16 @@ MODULE efermi_shift
!
! Define an abstract interface to use a callback
ABSTRACT INTERFACE
SUBROUTINE def_symmetrization(def, irr)
SUBROUTINE def_symmetrization(def)
USE kinds, ONLY : DP
INTEGER :: irr
COMPLEX(DP) :: def(3)
COMPLEX(DP), INTENT(inout) :: def(3)
END SUBROUTINE
END INTERFACE
!
CONTAINS
!
!-----------------------------------------------------------------------
SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_def)
SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, sym_def)
!-----------------------------------------------------------------------
!! This routine takes care of the effects of a shift of Ef, due to the
!! perturbation, that can take place in a metal at q=0
@ -61,8 +60,6 @@ SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_de
REAL(DP), INTENT(IN), OPTIONAL :: becsum1((nhm*(nhm+1))/2, nat, nspin_mag)
!! becsum1 = wdelta * <psi|beta> <beta|psi>
!! (where wdelta is a Dirac-delta-like function)
INTEGER, INTENT(IN), OPTIONAL :: irr
!! index of the current irr. rep. Used only in sym_def.
PROCEDURE(def_symmetrization), OPTIONAL :: sym_def
!! Symmetrization routine for the fermi energy change
!
@ -104,7 +101,7 @@ SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_de
!
! symmetrizes the Fermi energy shift
!
IF (present(sym_def)) CALL sym_def(def, irr)
IF (present(sym_def)) CALL sym_def(def)
WRITE( stdout, '(5x,"Pert. #",i3,": Fermi energy shift (Ry) =",2es15.4)')&
(ipert, def (ipert) , ipert = 1, npert )
!

View File

@ -23,7 +23,7 @@ MODULE qpoint
INTEGER :: nksq, npwq, nksqtot
! the real number of k points
! the number of plane waves for q
! the total number of q points
! the total number of q points
INTEGER, ALLOCATABLE :: ikks(:), ikqs(:)
! the index of k point in the list of k
! the index of k+q point in the list of k
@ -147,6 +147,19 @@ MODULE lr_symm_base
LOGICAL :: minus_q, & ! if .TRUE. there is the symmetry sending q<->-q
invsymq ! if .TRUE. the small group of q has inversion
!
! Symmetry representation of the perturbations
!
INTEGER :: lr_npert
!! Number of perturbations considered at the same time.
!! e.g., for phonons: dimension of the irreducible representation
!! e.g., for electric fields: 3
COMPLEX(DP), ALLOCATABLE :: upert(:, :, :)
!! Representation of the symmetry in the perturbation basis. Size (lr_npert, lr_npert, nsymq)
!! e.g., for phonons: transformation matrix of the patterns
!! e.g., for electric fields: transformation matrix of Cartesian vectors
COMPLEX(DP), ALLOCATABLE :: upert_mq(:, :)
!! Representation of the symmetry that transforms q to -q. Size (lr_npert, lr_npert)
!
END MODULE lr_symm_base
!
MODULE lrus

View File

@ -202,8 +202,7 @@ set(src_modules
w1gauss.f90
deviatoric.f90
# GPU
random_numbers_gpu.f90
cuda_subroutines.f90)
random_numbers_gpu.f90)
qe_enable_cuda_fortran("${src_modules}")
qe_add_library(qe_modules ${src_modules})

View File

@ -221,7 +221,6 @@ sockets.o
# GPU versions of modules
MODULES += \
cuda_subroutines.o \
random_numbers_gpu.o
TLDEPS= libfox libla libfft libutil libmbd librxc libupf

View File

@ -1,39 +0,0 @@
!----------------------------------------------
! ... this file contains a number of subroutines optionally interfaced
! ... to cublas
!----------------------------------------------
SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
implicit none
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cudaDGEMV
SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
END SUBROUTINE cudaDGER

View File

@ -89,6 +89,7 @@ set(src_ph
PH/openfilq.f90
PH/phcom.f90
PH/ph_restart.f90
PH/ph_set_upert.f90
PH/phescf.f90
PH/phq_init.f90
PH/phq_readin.f90
@ -102,7 +103,6 @@ set(src_ph
PH/prepare_sym_analysis.f90
PH/psidspsi.f90
PH/psymdvscf.f90
PH/psyme.f90
PH/psym_dmag.f90
PH/psym_dmage.f90
PH/punch_plot_e.f90
@ -136,7 +136,6 @@ set(src_ph
PH/symdvscf.f90
PH/symdyn_munu.f90
PH/symdynph_gq.f90
PH/syme.f90
PH/symm.f90
PH/symmorphic_or_nzb.f90
PH/swfc.f90

View File

@ -108,13 +108,13 @@ phq_recover.o \
phq_setup.o \
phq_summary.o \
phqscf.o \
ph_set_upert.o \
polariz.o \
print_clock_ph.o \
prepare_q.o \
prepare_sym_analysis.o \
psidspsi.o \
psymdvscf.o \
psyme.o \
psym_dmag.o \
psym_dmage.o \
punch_plot_e.o \
@ -147,7 +147,6 @@ sym_dmage.o \
symdvscf.o \
symdyn_munu.o \
symdynph_gq.o \
syme.o \
symm.o \
symmorphic_or_nzb.o \
swfc.o \

View File

@ -762,7 +762,7 @@ PROGRAM matdyn
OPEN (unit=2,file=flfrq ,status='unknown',form='formatted')
WRITE(2, '(" &plot nbnd=",i4,", nks=",i4," /")') 3*nat, nq
DO n=1, nq
WRITE(2, '(10x,3f10.6)') q(1,n), q(2,n), q(3,n)
WRITE(2, '(10x,4f10.6)') q(1,n), q(2,n), q(3,n), wq(n)
WRITE(2,'(6f10.4)') (freq(i,n), i=1,3*nat)
END DO
CLOSE(unit=2)

129
PHonon/PH/ph_set_upert.f90 Normal file
View File

@ -0,0 +1,129 @@
!
! Copyright (C) 2001-2018 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!------------------------------------------------------------------------------------------
SUBROUTINE ph_set_upert_phonon(irr)
!--------------------------------------------------------------------------------------
!! Set lr_npert, upert, and upert_mp for phonons in irreducible representation irr
!--------------------------------------------------------------------------------------
!
USE modes, ONLY : npert, t, tmq
USE control_ph, ONLY : lgamma_gamma
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: irr
!! irreducible representation
!
INTEGER :: ipert, jpert
!! Counter on perturbations
INTEGER :: isym
!! Counter on symmetries
!
! Set symmetry representation in lr_symm_base
!
lr_npert = npert(irr)
!
IF (lgamma_gamma) THEN
!
! If lgamma_gamma is true, symmetrization is not used.
! Set upert and upert_mq to 1.
!
IF (lr_npert /= 1) CALL errore('ph_set_upert_phonon', &
'lgamma_gamma is true, but lr_npert /= 1', 1)
!
ALLOCATE(upert(1, 1, 1))
upert(1, 1, 1) = (1.d0, 0.d0)
!
IF (minus_q) THEN
ALLOCATE(upert_mq(1, 1))
upert_mq(1, 1) = (1.d0, 0.d0)
ENDIF
!
ELSE
!
ALLOCATE(upert(lr_npert, lr_npert, nsymq))
!
DO isym = 1, nsymq
DO ipert = 1, lr_npert
DO jpert = 1, lr_npert
upert(jpert, ipert, isym) = t(jpert, ipert, isym, irr)
ENDDO
ENDDO
ENDDO
!
IF (minus_q) THEN
ALLOCATE(upert_mq(lr_npert, lr_npert))
DO ipert = 1, lr_npert
DO jpert = 1, lr_npert
upert_mq(jpert, ipert) = tmq(jpert, ipert, irr)
ENDDO
ENDDO
ENDIF ! minus_q
!
ENDIF
!
END SUBROUTINE ph_set_upert_phonon
!-----------------------------------------------------------------------------------------
!
!-----------------------------------------------------------------------------------------
SUBROUTINE ph_set_upert_e()
!--------------------------------------------------------------------------------------
!! Set lr_npert, upert, and upert_mp for electric field perturbation.
!--------------------------------------------------------------------------------------
!
USE symm_base, ONLY : s
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq
!
IMPLICIT NONE
!
INTEGER :: ipol, jpol
!! Counter on perturbations
INTEGER :: isym
!! Counter on symmetries
!
! Set symmetry representation in lr_symm_base
!
lr_npert = 3
!
ALLOCATE(upert(lr_npert, lr_npert, nsymq))
!
DO isym = 1, nsymq
DO ipol = 1, 3
DO jpol = 1, 3
upert(jpol, ipol, isym) = s(ipol, jpol, isym)
ENDDO
ENDDO
ENDDO
!
IF (minus_q) THEN
!
! upert_mq is the rotation matrix for symmetry S such that T * S * q = q + G.
! E field perturbation is applied only for q = 0, where T*q = q, i.e., S = identity.
! Thus, upert_mq is the 3*3 identity matrix.
!
ALLOCATE(upert_mq(lr_npert, lr_npert))
upert_mq = (0.d0, 0.d0)
DO ipol = 1, lr_npert
upert_mq(ipol, ipol) = (1.d0, 0.d0)
ENDDO
ENDIF ! minus_q
!
END SUBROUTINE ph_set_upert_e
!----------------------------------------------------------------------------------------
!
!----------------------------------------------------------------------------------------
SUBROUTINE ph_deallocate_upert()
!----------------------------------------------------------------------------------------
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq
IMPLICIT NONE
DEALLOCATE(upert)
IF (minus_q) DEALLOCATE(upert_mq)
!----------------------------------------------------------------------------------------
END SUBROUTINE ph_deallocate_upert
!----------------------------------------------------------------------------------------

View File

@ -51,6 +51,10 @@ SUBROUTINE phescf()
IF (noncolin) ALLOCATE(int3_nc( nhm, nhm, nat, nspin, 3))
ENDIF
!
! Set symmetry representation in lr_symm_base
!
CALL ph_set_upert_e()
!
! DFPT+U: dnsscf in the electric field calculation
! is the scf change of atomic occupations ns induced by the electric field.
! dnsscf_all_modes = dnsscf because nirr=1, number of perturbations = 3.
@ -134,6 +138,8 @@ SUBROUTINE phescf()
IF (noncolin) DEALLOCATE(int3_nc)
ENDIF
!
CALL ph_deallocate_upert()
!
! DFPT+U
!
IF (lda_plus_u) THEN

View File

@ -72,6 +72,8 @@ SUBROUTINE phqscf
DO irr = 1, nirr
IF ( (comp_irr (irr)) .AND. (.NOT.done_irr (irr)) ) THEN
npe=npert(irr)
CALL ph_set_upert_phonon(irr)
!
ALLOCATE (drhoscfs( dfftp%nnr , nspin_mag, npe))
imode0 = 0
DO irr1 = 1, irr - 1
@ -144,6 +146,8 @@ SUBROUTINE phqscf
IF (okpaw) DEALLOCATE (int3_paw)
IF (noncolin) DEALLOCATE(int3_nc)
ENDIF
CALL ph_deallocate_upert()
!
tcpu = get_clock ('PHONON')
!
DEALLOCATE (drhoscfs)

View File

@ -7,25 +7,23 @@
!
!
!-----------------------------------------------------------------------
SUBROUTINE psymdvscf (nper, irr, dvtosym)
SUBROUTINE psymdvscf (dvtosym)
!-----------------------------------------------------------------------
!! p-symmetrize the potential.
!!
!! The real space points of dv is distributed, but symmetry may map a point in one
!! core to a point in a different core. Hence, gather dvscf in real space, symmetrize,
!! and then scatter back.
!
USE kinds, ONLY : DP
USE kinds, ONLY : DP
USE fft_base, ONLY : dfftp
USE noncollin_module, ONLY : nspin_mag
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dfftp
USE scatter_mod, ONLY : cgather_sym
USE lr_symm_base, ONLY : nsymq, minus_q
USE scatter_mod, ONLY : cgather_sym
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert
!
IMPLICIT NONE
!
INTEGER :: nper
!! the number of perturbations
INTEGER :: irr
!! the representation under consideration
COMPLEX(DP) :: dvtosym(dfftp%nnr,nspin_mag,nper)
COMPLEX(DP) :: dvtosym(dfftp%nnr, nspin_mag, lr_npert)
!! the potential to symmetrize
!
! ... local variable
@ -33,27 +31,30 @@ SUBROUTINE psymdvscf (nper, irr, dvtosym)
#if defined (__MPI)
!
INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp
!
COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:)
! the potential to symm
IF (nsymq.EQ.1.AND. (.NOT.minus_q) ) RETURN
IF (nsymq == 1 .AND. (.NOT.minus_q) ) RETURN
CALL start_clock ('psymdvscf')
ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, nper))
DO iper = 1, nper
!
ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, lr_npert))
!
! Gather real-space points
!
DO iper = 1, lr_npert
DO is = 1, nspin_mag
CALL cgather_sym (dfftp, dvtosym (:, is, iper), ddvtosym (:, is, iper) )
ENDDO
ENDDO
CALL symdvscf (nper, irr, ddvtosym)
!
! Symmetrize
!
CALL symdvscf (ddvtosym)
!
! Scatter back the real-space points
!
nxyp = dfftp%nr1x * dfftp%my_nr2p
DO iper = 1, nper
DO iper = 1, lr_npert
DO is = 1, nspin_mag
DO ir3 = 1, dfftp%my_nr3p
ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1)
@ -63,14 +64,12 @@ SUBROUTINE psymdvscf (nper, irr, dvtosym)
ENDDO
ENDDO
DEALLOCATE (ddvtosym)
!
CALL stop_clock ('psymdvscf')
#else
CALL symdvscf (nper, irr, dvtosym)
!
CALL symdvscf(dvtosym)
!
#endif
RETURN
!
END SUBROUTINE psymdvscf

View File

@ -1,65 +0,0 @@
!
! Copyright (C) 2001-2008 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE psyme (dvtosym)
!-----------------------------------------------------------------------
!! p-symmetrize the charge density.
!
USE kinds, ONLY : DP
USE fft_base, ONLY : dfftp
USE noncollin_module, ONLY : nspin_mag
USE mp_bands, ONLY : me_bgrp
USE fft_base, ONLY : dfftp
USE scatter_mod, ONLY : cgather_sym
!
IMPLICIT NONE
!
COMPLEX(DP) :: dvtosym (dfftp%nnr, nspin_mag, 3)
!! the potential to symmetrize
!
! ... local variables
!
#if defined (__MPI)
!
INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp
COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:)
! the potential to symmet
!
!
ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, 3))
DO iper = 1, 3
DO is = 1, nspin_mag
CALL cgather_sym (dfftp,dvtosym (:, is, iper), ddvtosym (:, is, iper) )
ENDDO
ENDDO
CALL syme (ddvtosym)
nxyp = dfftp%nr1x * dfftp%my_nr2p
DO iper = 1, 3
DO is = 1, nspin_mag
DO ir3 = 1, dfftp%my_nr3p
ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1)
ioff_tg = dfftp%nr1x * dfftp%nr2x * (dfftp%my_i0r3p+ir3-1) + dfftp%nr1x * dfftp%my_i0r2p
CALL zcopy (nxyp, ddvtosym (ioff_tg+1, is, iper), 1, dvtosym (ioff+1, is, iper), 1)
END DO
ENDDO
ENDDO
DEALLOCATE (ddvtosym)
#else
CALL syme (dvtosym)
#endif
RETURN
END SUBROUTINE psyme

View File

@ -234,7 +234,7 @@ subroutine solve_e
call mp_sum ( dvscfout, inter_pool_comm )
IF (okpaw) call mp_sum ( dbecsum, inter_pool_comm )
if (.not.lgamma_gamma) then
call psyme (dvscfout)
call psymdvscf(dvscfout)
IF ( noncolin.and.domag ) CALL psym_dmage(dvscfout)
endif
!

View File

@ -315,7 +315,7 @@ subroutine solve_e_fpol( iw )
! for the three polarizations - symmetrize it
!
call mp_sum ( dvscfout, inter_pool_comm )
call psyme (dvscfout)
call psymdvscf(dvscfout)
!
! save the symmetrized linear charge response to file
! calculate the corresponding linear potential response

View File

@ -395,9 +395,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
!
IF (lmetq0) THEN
IF (okpaw) THEN
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, dbecsum, becsum1, irr, sym_def)
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, dbecsum, becsum1, sym_def)
ELSE
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, irr=irr, sym_def=sym_def)
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, sym_def=sym_def)
ENDIF
ENDIF
!
@ -406,7 +406,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
! Here we symmetrize them ...
!
IF (.not.lgamma_gamma) THEN
call psymdvscf (npe, irr, drhoscfh)
CALL psymdvscf(drhoscfh)
IF ( noncolin.and.domag ) CALL psym_dmag( npe, irr, drhoscfh)
IF (okpaw) THEN
IF (minus_q) CALL PAW_dumqsymmetrize(dbecsum,npe,irr, npertx,irotmq,rtau,xq,tmq)

View File

@ -8,7 +8,7 @@
!---------------------------------------------------------------------
MODULE sym_def_module
CONTAINS
subroutine sym_def (def, irr)
SUBROUTINE sym_def(def)
!---------------------------------------------------------------------
!! Symmetrizes the first order changes of the Fermi energies of an
!! irreducible representation. These objects are defined complex because
@ -16,60 +16,52 @@ subroutine sym_def (def, irr)
!
!! Used in the q=0 metallic case only.
!
USE kinds, only : DP
USE modes, ONLY : npert, t, tmq
USE control_ph, ONLY : lgamma_gamma
USE lr_symm_base, ONLY : minus_q, nsymq
implicit none
integer :: irr
!! input: the representation under consideration
complex(DP) :: def(3)
!! inp/out: the fermi energy changes.
USE kinds, ONLY : DP
USE control_ph, ONLY : lgamma_gamma
USE lr_symm_base, ONLY : minus_q, nsymq, lr_npert, upert, upert_mq
!
IMPLICIT NONE
!
COMPLEX(DP), INTENT(inout) :: def(3)
!! inp/out: the fermi energy changes.
!! NB: def(3) should be def(npertx), but it is used only at Gamma
!! where the dimension of irreps never exceeds 3.
!
! ... local variables
!
integer :: ipert, jpert, isym, irot
INTEGER :: ipert, jpert, isym
! counter on perturbations
! counter on perturbations
! counter on symmetries
! the rotation
complex(DP) :: w_def(3)
!
COMPLEX(DP) :: w_def(3)
! the fermi energy changes (work array)
!
IF (lgamma_gamma) RETURN
if (nsymq == 1 .and. (.not.minus_q) ) return
if (npert(irr) > 3) CALL errore("sym_def", "npert(irr) exceeds 3", 1)
if (lr_npert > 3) CALL errore("sym_def", "lr_npert cannot exceed 3 at q=0", 1)
!
! first the symmetrization S(irotmq)*q = -q + Gi if necessary
!
if (minus_q) then
w_def = (0.d0, 0.d0)
do ipert = 1, npert (irr)
do jpert = 1, npert (irr)
w_def (ipert) = w_def (ipert) + tmq (jpert, ipert, irr) &
* def (jpert)
do ipert = 1, lr_npert
do jpert = 1, lr_npert
w_def(ipert) = w_def(ipert) + upert_mq(jpert, ipert) * def(jpert)
enddo
enddo
do ipert = 1, npert (irr)
def (ipert) = 0.5d0 * (def (ipert) + CONJG(w_def (ipert) ) )
do ipert = 1, lr_npert
def(ipert) = 0.5d0 * (def(ipert) + CONJG(w_def(ipert)) )
enddo
endif
!
! Here we symmetrize with respect to the small group of q
!
w_def = (0.d0, 0.d0)
do ipert = 1, npert (irr)
do ipert = 1, lr_npert
do isym = 1, nsymq
irot = isym
do jpert = 1, npert (irr)
w_def (ipert) = w_def (ipert) + t (jpert, ipert, irot, irr) &
* def (jpert)
do jpert = 1, lr_npert
w_def(ipert) = w_def(ipert) + upert(jpert, ipert, isym) * def(jpert)
enddo
enddo
enddo
@ -77,7 +69,6 @@ subroutine sym_def (def, irr)
! normalize and exit
!
def = w_def / DBLE(nsymq)
return
end subroutine sym_def
!
END SUBROUTINE sym_def
END MODULE sym_def_module

View File

@ -55,6 +55,7 @@ SUBROUTINE sym_dns_wrapper (ldim, dns_cart, dns_pattern)
npe = npert(irr)
! allocate
ALLOCATE (dns_aux(ldim,ldim,nspin,nat,npe))
CALL ph_set_upert_phonon(irr)
! pack
dns_aux(:,:,:,:,1:npe) = dns_pattern(:,:,:,:,imode0:imode0-1+npe)
! symmetrize
@ -63,6 +64,7 @@ SUBROUTINE sym_dns_wrapper (ldim, dns_cart, dns_pattern)
dns_pattern(:,:,:,:,imode0:imode0-1+npe) = dns_aux(:,:,:,:,1:npe)
! deallocate
DEALLOCATE (dns_aux)
CALL ph_deallocate_upert()
! advance the counter
imode0 = imode0 + npe
ENDDO

View File

@ -7,7 +7,7 @@
!
!
!---------------------------------------------------------------------
subroutine symdvscf (nper, irr, dvtosym)
subroutine symdvscf (dvtosym)
!---------------------------------------------------------------------
!! Symmetrize the self-consistent potential of the perturbations
!! belonging to an irreducible representation.
@ -22,16 +22,12 @@ subroutine symdvscf (nper, irr, dvtosym)
USE cell_base, ONLY : at
USE symm_base, ONLY : s, ft, t_rev
USE noncollin_module, ONLY : nspin_lsda, nspin_mag
USE modes, ONLY : t, tmq
USE lr_symm_base, ONLY : minus_q, irotmq, nsymq, gi, gimq
USE control_lr, ONLY : lgamma
USE lr_symm_base, ONLY : minus_q, irotmq, nsymq, gi, gimq, lr_npert, upert, upert_mq
!
implicit none
!
integer :: nper
!! the number of perturbations
integer :: irr
!! the representation under conside
complex(DP) :: dvtosym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x,nspin_mag,nper)
complex(DP) :: dvtosym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x, nspin_mag, lr_npert)
!! the potential to be symmetrized
!
! ... local variables
@ -48,54 +44,74 @@ subroutine symdvscf (nper, irr, dvtosym)
! auxiliary space
! the multiplication factor
! the phase factor
if (nsymq == 1.and. (.not.minus_q) ) return
call start_clock ('symdvscf')
allocate (dvsym( dfftp%nr1x , dfftp%nr2x , dfftp%nr3x , nper))
allocate (add_dvsym(nper))
!
! if necessary we symmetrize with respect to S(irotmq)*q = -q + Gi
! If there is no symmetry other than identity, return.
IF (nsymq == 1 .AND. (.NOT.minus_q) ) RETURN
!
CALL start_clock ('symdvscf')
!
ALLOCATE(dvsym(dfftp%nr1x, dfftp%nr2x, dfftp%nr3x, lr_npert))
ALLOCATE(add_dvsym(lr_npert))
!
n(1) = tpi / DBLE (dfftp%nr1)
n(2) = tpi / DBLE (dfftp%nr2)
n(3) = tpi / DBLE (dfftp%nr3)
!
CALL scale_sym_ops( nsymq, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
s_scaled, ftau )
if (minus_q) then
gf(:) = gimq (1) * at (1, :) * n(:) + &
gimq (2) * at (2, :) * n(:) + &
gimq (3) * at (3, :) * n(:)
term (:, 1) = CMPLX(cos (gf (:) ), sin (gf (:) ) ,kind=DP)
do is = 1, nspin_lsda
phase (1) = (1.d0, 0.d0)
do k = 1, dfftp%nr3
do j = 1, dfftp%nr2
do i = 1, dfftp%nr1
CALL rotate_grid_point(s_scaled(1,1,irotmq), ftau(1,irotmq), &
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
do ipert = 1, nper
aux2 = (0.d0, 0.d0)
do jpert = 1, nper
aux2 = aux2 + tmq (jpert, ipert, irr) * &
dvtosym (ri, rj, rk, is, jpert) * phase (1)
!
! if necessary we symmetrize with respect to S(irotmq)*q = -q + Gi
! (time reversal + spatial symmetry S(irotmq))
!
IF (minus_q) THEN
IF (lgamma) THEN
!
! Special case: q = Gamma. S(irotmq) = I, Gi = 0.
!
DO is = 1, nspin_lsda
DO ipert = 1, lr_npert
dvtosym(:,:,:,is,ipert) = CMPLX(DBLE(dvtosym(:,:,:,is,ipert)), 0.d0, KIND=DP)
ENDDO
ENDDO
!
ELSE ! .NOT. lgamma
gf(:) = gimq (1) * at (1, :) * n(:) + &
gimq (2) * at (2, :) * n(:) + &
gimq (3) * at (3, :) * n(:)
term (:, 1) = CMPLX(cos (gf (:) ), sin (gf (:) ) ,kind=DP)
do is = 1, nspin_lsda
phase (1) = (1.d0, 0.d0)
do k = 1, dfftp%nr3
do j = 1, dfftp%nr2
do i = 1, dfftp%nr1
CALL rotate_grid_point(s_scaled(1,1,irotmq), ftau(1,irotmq), &
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
do ipert = 1, lr_npert
aux2 = (0.d0, 0.d0)
do jpert = 1, lr_npert
aux2 = aux2 + upert_mq(jpert, ipert) * &
dvtosym (ri, rj, rk, is, jpert) * phase (1)
enddo
dvsym (i, j, k, ipert) = (dvtosym (i, j, k, is, ipert) +&
CONJG(aux2) ) * 0.5d0
enddo
dvsym (i, j, k, ipert) = (dvtosym (i, j, k, is, ipert) +&
CONJG(aux2) ) * 0.5d0
phase (1) = phase (1) * term (1, 1)
enddo
phase (1) = phase (1) * term (1, 1)
phase (1) = phase (1) * term (2, 1)
enddo
phase (1) = phase (1) * term (2, 1)
phase (1) = phase (1) * term (3, 1)
enddo
do ipert = 1, lr_npert
dvtosym(:, :, :, is, ipert) = dvsym (:, :, :, ipert)
enddo
phase (1) = phase (1) * term (3, 1)
enddo
do ipert = 1, nper
dvtosym(:, :, :, is, ipert) = dvsym (:, :, :, ipert)
enddo
enddo
endif
ENDIF ! lgamma
ENDIF ! minus_q
!
! If there no spatial symmetry other than identity, return.
!
IF (nsymq == 1) RETURN
!
!
! Here we symmetrize with respect to the small group of q
!
@ -119,13 +135,10 @@ subroutine symdvscf (nper, irr, dvtosym)
CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), &
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
add_dvsym(:) = (0.d0, 0.d0)
do ipert = 1, nper
do jpert = 1, nper
add_dvsym(ipert) = add_dvsym(ipert) + t (jpert, ipert, irot, irr) * &
do ipert = 1, lr_npert
do jpert = 1, lr_npert
add_dvsym(ipert) = add_dvsym(ipert) + upert(jpert, ipert, irot) * &
dvtosym (ri, rj, rk, is, jpert) * phase (isym)
!dvsym (i, j, k, ipert) = dvsym (i, j, k, ipert) + &
! t (jpert, ipert, irot, irr) * &
! dvtosym (ri, rj, rk, is, jpert) * phase (isym)
enddo
enddo
if (t_rev(isym)==0) then
@ -147,7 +160,7 @@ subroutine symdvscf (nper, irr, dvtosym)
enddo
enddo
do ipert = 1, nper
do ipert = 1, lr_npert
dvtosym(:,:,:,is,ipert) = dvsym(:,:,:,ipert) / DBLE (nsymq)
enddo

View File

@ -1,78 +0,0 @@
!
! Copyright (C) 2001-2008 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!---------------------------------------------------------------------
subroutine syme (dvsym)
!---------------------------------------------------------------------
!! This routine symmetrize the change of the potential due to an
!! electric field perturbation. It is assumed that the perturbations
!! are on the basis of the crystal.
!
USE fft_base, only : dfftp
USE symm_base, only : nsym, s, ft
USE noncollin_module, only : nspin_lsda, nspin_mag
USE kinds, only : DP
implicit none
complex(DP) :: dvsym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x,nspin_mag,3)
!! the potential to symmetrize
! ... local variables
complex(DP), allocatable :: aux(:,:,:,:)
! auxiliary quantity
integer :: ftau(3,nsym), s_scaled(3,3,nsym)
integer :: is, ri, rj, rk, i, j, k, irot, ipol, jpol
! counter on spin polarization
! the rotated points
! the point
! counter on symmetries
! counter on polarizations
do is = 1, nspin_lsda
do ipol = 1, 3
dvsym(:,:,:,is,ipol) = CMPLX(DBLE(dvsym(:,:,:,is,ipol)),0.d0,kind=DP)
end do
end do
if (nsym == 1) return
CALL scale_sym_ops( nsym, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
s_scaled, ftau )
allocate (aux(dfftp%nr1x , dfftp%nr2x , dfftp%nr3x , 3))
do is = 1, nspin_lsda
do ipol = 1, 3
aux(:,:,:,ipol) = dvsym(:,:,:,is,ipol)
dvsym(:,:,:,is,ipol) = (0.d0, 0.d0)
enddo
!
! symmmetrize
!
do k = 1, dfftp%nr3
do j = 1, dfftp%nr2
do i = 1, dfftp%nr1
do irot = 1, nsym
CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), &
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
do ipol = 1, 3
do jpol = 1, 3
dvsym(i,j,k,is,ipol) = dvsym(i,j,k,is,ipol) + &
s(ipol,jpol,irot) * aux(ri,rj,rk,jpol)
enddo
enddo
enddo
enddo
enddo
enddo
do ipol = 1, 3
dvsym(:,:,:,is,ipol) = dvsym(:,:,:,is,ipol) / DBLE(nsym)
enddo
enddo
deallocate (aux)
return
end subroutine syme

View File

@ -149,7 +149,7 @@ subroutine zstar_eu_us
!
call dv_of_drho (dvscf (:, :, ipol), .false.)
enddo
call psyme (dvscf)
call psymdvscf(dvscf)
#ifdef TIMINIG_ZSTAR_US
call stop_clock('zstar_us_3')

View File

@ -19,6 +19,8 @@ input_description -distribution {Quantum ESPRESSO} -package PWscf -program d3hes
(3) run phonon
Please note that filhess in d3hess input and dftd3_hess in phonon input, if given, should match.
Please also note that second derivatives of the three-body term of d3 dispersion are not implemented,
and phonon calculations with d3 should be run with dftd3_threebody=.false. in the SCF.
@b {Structure of the input data:}
============================

View File

@ -19,10 +19,6 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
#if defined(__CUDA)
USE cudafor
USE cublas
#else
#define cublasZgemm Zgemm
#define cublasDGEMM Dgemm
#define cudaDGER Dger
#endif
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ntyp => nsp, ityp

View File

@ -222,3 +222,168 @@ USE cublas
return
end subroutine MYDDOTv3
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine MYDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 :: side, uplo, transa, diag
integer :: m, n, lda, ldb
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: a
DOUBLE PRECISION, dimension(ldb, *) :: b
#if defined(__CUDA)
attributes(device) :: a, b
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#endif
return
end subroutine MYDTRSM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
DOUBLE COMPLEX function MYZDOTC(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
MYZDOTC = cublasZDOTC(n, zx, incx, zy, incy)
#endif
return
end function MYZDOTC
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYZSWAP(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZSWAP(n, zx, incx, zy, incy)
#endif
return
END SUBROUTINE MYZSWAP
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYZCOPY(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZCOPY(n, zx, incx, zy, incy)
#endif
RETURN
END SUBROUTINE MYZCOPY
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYZAXPY(n, za, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE COMPLEX :: za
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZAXPY(n, za, zx, incx, zy, incy)
#endif
RETURN
END SUBROUTINE MYZAXPY
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYZDSCAL(n, da, zx, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx
DOUBLE PRECISION :: da
DOUBLE COMPLEX, dimension(*) :: zx
#if defined(__CUDA)
attributes(device) :: zx
CALL cublasZDSCAL(n, da, zx, incx)
#endif
RETURN
END SUBROUTINE MYZDSCAL
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYZSCAL(n, za, zx, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx
DOUBLE COMPLEX :: za
DOUBLE COMPLEX, dimension(*) :: zx
#if defined(__CUDA)
attributes(device) :: zx
CALL cublasZSCAL(n, za, zx, incx)
#endif
RETURN
END SUBROUTINE MYZSCAL
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYDCOPY(n, x, incx, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE PRECISION, INTENT(IN) :: x(*)
DOUBLE PRECISION, INTENT(OUT) :: y(*)
#if defined(__CUDA)
attributes(device) :: x, y
call cublasDCOPY(n, x, incx, y, incy)
#endif
RETURN
END SUBROUTINE MYDCOPY
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYDAXPY(n, a, x, incx, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE PRECISION, INTENT(IN) :: a
DOUBLE PRECISION, INTENT(IN) :: x(*)
DOUBLE PRECISION, INTENT(OUT) :: y(*)
#if defined(__CUDA)
attributes(device) :: x, y
call cublasDAXPY( n, a, x, incx, y, incy)
#endif
RETURN
END SUBROUTINE MYDAXPY
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYDSCAL(n, a, x, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
integer :: n, incx
DOUBLE PRECISION :: a
DOUBLE PRECISION, dimension(*) :: x
#if defined(__CUDA)
attributes(device) :: x
call cublasDSCAL(n, a, x, incx)
#endif
RETURN
END SUBROUTINE MYDSCAL
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE MYDSWAP(n, dx, incx, dy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
REAL(8), dimension(*) :: dx, dy
#if defined(__CUDA)
attributes(device) :: dx, dy
CALL cublasDSWAP(n, dx, incx, dy, incy)
#endif
return
END SUBROUTINE MYDSWAP
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -4028,7 +4028,8 @@ contains
real(wp) :: res1, res2, gnorm_supercell
if(num) Call errore('pbcgdisp', 'Atom displacement not implemented with numerical forces', 1)
if(.not.noabc) Call errore('pbcgdisp', 'Atom displacement not implemented with the threebody term', 1)
if(.not.noabc) Call errore('pbcgdisp', 'Atom displacement not implemented with the threebody term ' // &
' (set dftd3_threebody=.false. for phonon calculations)', 1)
ns = shape(g_supercell_)
g_supercell( -ns(1)/2:ns(1)/2, -ns(2)/2:ns(2)/2, -ns(3)/2:ns(3)/2, 1:ns(4), 1:ns(5) ) => g_supercell_