mirror of https://gitlab.com/QEF/q-e.git
Merge branch q-e:develop into afonari-develop-patch-31461
This commit is contained in:
commit
80060dba75
|
@ -470,13 +470,14 @@ subroutine solve_head
|
||||||
head(first_f+i-1,2)=epsilon_g(2,2,i)
|
head(first_f+i-1,2)=epsilon_g(2,2,i)
|
||||||
head(first_f+i-1,3)=epsilon_g(3,3,i)
|
head(first_f+i-1,3)=epsilon_g(3,3,i)
|
||||||
|
|
||||||
|
call ph_set_upert_e()
|
||||||
#if defined(__MPI)
|
#if defined(__MPI)
|
||||||
call mp_sum ( pola_charge(:,:,:,i) , inter_pool_comm )
|
call mp_sum ( pola_charge(:,:,:,i) , inter_pool_comm )
|
||||||
call psyme (pola_charge(:,:,:,i))
|
call psymdvscf (pola_charge(:,:,:,i))
|
||||||
#else
|
#else
|
||||||
call syme (pola_charge(:,:,:,i))
|
call symdvscf (pola_charge(:,:,:,i))
|
||||||
#endif
|
#endif
|
||||||
|
call ph_deallocate_upert()
|
||||||
call create_scf_type ( wing, .true. )
|
call create_scf_type ( wing, .true. )
|
||||||
do ipol=1,3
|
do ipol=1,3
|
||||||
CALL fwfft ('Rho', pola_charge(1:dfftp%nnr,1,ipol,i), dfftp)
|
CALL fwfft ('Rho', pola_charge(1:dfftp%nnr,1,ipol,i), dfftp)
|
||||||
|
|
|
@ -55,6 +55,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
||||||
USE qpoint_aux, ONLY : ikmks, ikmkmqs, becpt
|
USE qpoint_aux, ONLY : ikmks, ikmkmqs, becpt
|
||||||
USE lsda_mod, ONLY : nspin
|
USE lsda_mod, ONLY : nspin
|
||||||
USE lr_nc_mag, ONLY : lr_apply_time_reversal, deeq_nc_save, int3_nc_save
|
USE lr_nc_mag, ONLY : lr_apply_time_reversal, deeq_nc_save, int3_nc_save
|
||||||
|
USE lr_symm_base, ONLY : lr_npert, upert, upert_mq
|
||||||
!
|
!
|
||||||
IMPLICIT NONE
|
IMPLICIT NONE
|
||||||
!
|
!
|
||||||
|
@ -107,6 +108,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
||||||
ik, ikk, & ! counter on k points
|
ik, ikk, & ! counter on k points
|
||||||
ndim, &
|
ndim, &
|
||||||
is, & ! counter on spin polarizations
|
is, & ! counter on spin polarizations
|
||||||
|
isym, & ! counter on symmetries
|
||||||
npw, & ! number of plane waves at k
|
npw, & ! number of plane waves at k
|
||||||
nsolv, & ! number of linear systems
|
nsolv, & ! number of linear systems
|
||||||
isolv, & ! counter on linear systems
|
isolv, & ! counter on linear systems
|
||||||
|
@ -147,6 +149,18 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
||||||
!
|
!
|
||||||
ALLOCATE (dbecsum((nhm*(nhm+1))/2, nat, nspin_mag, 1))
|
ALLOCATE (dbecsum((nhm*(nhm+1))/2, nat, nspin_mag, 1))
|
||||||
!
|
!
|
||||||
|
! Set symmetry representation in lr_symm_base
|
||||||
|
!
|
||||||
|
lr_npert = 1
|
||||||
|
ALLOCATE(upert(lr_npert, lr_npert, nsymq))
|
||||||
|
DO isym = 1, nsymq
|
||||||
|
upert(1, 1, isym) = (1.d0, 0.d0)
|
||||||
|
ENDDO
|
||||||
|
IF (minus_q) THEN
|
||||||
|
ALLOCATE(upert_mq(lr_npert, lr_npert))
|
||||||
|
upert_mq(1, 1) = (1.d0, 0.d0)
|
||||||
|
ENDIF ! minus_q
|
||||||
|
!
|
||||||
nsolv=1
|
nsolv=1
|
||||||
IF (noncolin.AND.domag) nsolv=2
|
IF (noncolin.AND.domag) nsolv=2
|
||||||
!
|
!
|
||||||
|
@ -497,6 +511,8 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
||||||
DEALLOCATE (dvscfin)
|
DEALLOCATE (dvscfin)
|
||||||
DEALLOCATE (dvscfout)
|
DEALLOCATE (dvscfout)
|
||||||
DEALLOCATE (trace_dns_tot_old)
|
DEALLOCATE (trace_dns_tot_old)
|
||||||
|
DEALLOCATE (upert)
|
||||||
|
IF (minus_q) DEALLOCATE(upert_mq)
|
||||||
!
|
!
|
||||||
IF (ALLOCATED(dbecsum_nc)) DEALLOCATE (dbecsum_nc)
|
IF (ALLOCATED(dbecsum_nc)) DEALLOCATE (dbecsum_nc)
|
||||||
IF (ALLOCATED(int3_nc)) DEALLOCATE(int3_nc)
|
IF (ALLOCATED(int3_nc)) DEALLOCATE(int3_nc)
|
||||||
|
|
|
@ -8,38 +8,6 @@
|
||||||
!
|
!
|
||||||
#define ZERO ( 0.D0, 0.D0 )
|
#define ZERO ( 0.D0, 0.D0 )
|
||||||
#define ONE ( 1.D0, 0.D0 )
|
#define ONE ( 1.D0, 0.D0 )
|
||||||
|
|
||||||
FUNCTION KSDdot( n, A, incx, B, incy) result( res )
|
|
||||||
!
|
|
||||||
USE util_param, ONLY : DP
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cudafor
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
IMPLICIT NONE
|
|
||||||
!
|
|
||||||
INTEGER, INTENT(IN) :: incx,incy,n
|
|
||||||
!
|
|
||||||
#if defined(__CUDA)
|
|
||||||
REAL(DP), DEVICE, INTENT(IN) :: A(n), B(n)
|
|
||||||
#else
|
|
||||||
REAL(DP), INTENT(IN) :: A(n), B(n)
|
|
||||||
REAL(DP), EXTERNAL :: ddot
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
REAL(DP) :: res
|
|
||||||
!
|
|
||||||
#if defined(__CUDA)
|
|
||||||
res = cublasDdot( n, A, incx, B, incy )
|
|
||||||
#else
|
|
||||||
res = ddot( n, A, incx, B, incy )
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
RETURN
|
|
||||||
!
|
|
||||||
END FUNCTION KSDdot
|
|
||||||
|
|
||||||
! define __VERBOSE to print a message after each eigenvalue is computed
|
! define __VERBOSE to print a message after each eigenvalue is computed
|
||||||
!
|
!
|
||||||
!----------------------------------------------------------------------------
|
!----------------------------------------------------------------------------
|
||||||
|
@ -102,7 +70,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
!
|
!
|
||||||
! ... external functions
|
! ... external functions
|
||||||
!
|
!
|
||||||
REAL (DP), EXTERNAL :: ksDdot
|
REAL (DP), EXTERNAL :: MYDDOT
|
||||||
EXTERNAL hs_1psi_ptr, s_1psi_ptr
|
EXTERNAL hs_1psi_ptr, s_1psi_ptr
|
||||||
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
||||||
! s_1psi( npwx, npw, psi, spsi )
|
! s_1psi( npwx, npw, psi, spsi )
|
||||||
|
@ -249,7 +217,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
|
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi)
|
!$acc host_data use_device(psi, hpsi)
|
||||||
e(m) = ksDdot( kdim2, psi(1,m), 1, hpsi, 1 )
|
e(m) = MYDDOT( kdim2, psi(1,m), 1, hpsi, 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( e(m), intra_bgrp_comm )
|
CALL mp_sum( e(m), intra_bgrp_comm )
|
||||||
|
@ -271,8 +239,8 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(spsi, g, ppsi)
|
!$acc host_data use_device(spsi, g, ppsi)
|
||||||
es(1) = ksDdot( kdim2, spsi(1), 1, g(1), 1 )
|
es(1) = MYDDOT( kdim2, spsi(1), 1, g(1), 1 )
|
||||||
es(2) = ksDdot( kdim2, spsi(1), 1, ppsi(1), 1 )
|
es(2) = MYDDOT( kdim2, spsi(1), 1, ppsi(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( es , intra_bgrp_comm )
|
CALL mp_sum( es , intra_bgrp_comm )
|
||||||
|
@ -325,7 +293,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(g)
|
!$acc host_data use_device(g)
|
||||||
gg1 = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
|
gg1 = MYDDOT( kdim2, g(1), 1, g0_d(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( gg1, intra_bgrp_comm )
|
CALL mp_sum( gg1, intra_bgrp_comm )
|
||||||
|
@ -341,7 +309,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
!$acc end kernels
|
!$acc end kernels
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(g)
|
!$acc host_data use_device(g)
|
||||||
gg = ksDdot( kdim2, g(1), 1, g0_d(1), 1 )
|
gg = MYDDOT( kdim2, g(1), 1, g0_d(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( gg, intra_bgrp_comm )
|
CALL mp_sum( gg, intra_bgrp_comm )
|
||||||
|
@ -396,7 +364,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
|
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(scg, cg)
|
!$acc host_data use_device(scg, cg)
|
||||||
cg0 = ksDdot( kdim2, cg(1), 1, scg(1), 1 )
|
cg0 = MYDDOT( kdim2, cg(1), 1, scg(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( cg0 , intra_bgrp_comm )
|
CALL mp_sum( cg0 , intra_bgrp_comm )
|
||||||
|
@ -412,13 +380,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... <y(t)|P^2S|y(t)> = 1
|
! ... <y(t)|P^2S|y(t)> = 1
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, ppsi)
|
!$acc host_data use_device(psi, ppsi)
|
||||||
a0 = 2.D0 * ksDdot( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0
|
a0 = 2.D0 * MYDDOT( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( a0 , intra_bgrp_comm )
|
CALL mp_sum( a0 , intra_bgrp_comm )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(cg, ppsi)
|
!$acc host_data use_device(cg, ppsi)
|
||||||
b0 = ksDdot( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2
|
b0 = MYDDOT( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
CALL mp_sum( b0 , intra_bgrp_comm )
|
CALL mp_sum( b0 , intra_bgrp_comm )
|
||||||
|
|
|
@ -5,26 +5,6 @@
|
||||||
! in the root directory of the present distribution,
|
! in the root directory of the present distribution,
|
||||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||||
!
|
!
|
||||||
|
|
||||||
|
|
||||||
SUBROUTINE cgcudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
use cudafor
|
|
||||||
use cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
DOUBLE PRECISION :: ALPHA,BETA
|
|
||||||
INTEGER :: INCX,INCY,LDA,M,N
|
|
||||||
CHARACTER :: TRANS
|
|
||||||
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, X, Y
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
|
||||||
!
|
|
||||||
END SUBROUTINE cgcudaDGEMV
|
|
||||||
|
|
||||||
! define __VERBOSE to print a message after each eigenvalue is computed
|
! define __VERBOSE to print a message after each eigenvalue is computed
|
||||||
!----------------------------------------------------------------------------
|
!----------------------------------------------------------------------------
|
||||||
SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
|
@ -87,7 +67,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
!
|
!
|
||||||
! ... external functions
|
! ... external functions
|
||||||
!
|
!
|
||||||
REAL(DP), EXTERNAL :: ksDdot
|
REAL(DP), EXTERNAL :: MYDDOT
|
||||||
EXTERNAL hs_1psi_ptr, s_1psi_ptr
|
EXTERNAL hs_1psi_ptr, s_1psi_ptr
|
||||||
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
||||||
! s_1psi( npwx, npw, psi, spsi )
|
! s_1psi( npwx, npw, psi, spsi )
|
||||||
|
@ -148,7 +128,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
lagrange = 0.d0
|
lagrange = 0.d0
|
||||||
if(m_start.le.m_end) then
|
if(m_start.le.m_end) then
|
||||||
!$acc host_data use_device(psi, spsi)
|
!$acc host_data use_device(psi, spsi)
|
||||||
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npwx2, spsi, 1, 0.D0, lagrange_d(m_start), 1 )
|
CALL MYDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npwx2, spsi, 1, 0.D0, lagrange_d(m_start), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
endif
|
endif
|
||||||
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
|
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
|
||||||
|
@ -202,7 +182,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) )
|
! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi)
|
!$acc host_data use_device(psi, hpsi)
|
||||||
e(m) = 2.D0 * ksDdot( npw2, psi(1,m), 1, hpsi, 1 )
|
e(m) = 2.D0 * MYDDOT( npw2, psi(1,m), 1, hpsi, 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!print *, 'e(m)', e(m)
|
!print *, 'e(m)', e(m)
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -232,8 +212,8 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(spsi, g, ppsi)
|
!$acc host_data use_device(spsi, g, ppsi)
|
||||||
es(1) = 2.D0 * ksDdot( npw2, spsi(1), 1, g(1), 1 )
|
es(1) = 2.D0 * MYDDOT( npw2, spsi(1), 1, g(1), 1 )
|
||||||
es(2) = 2.D0 * ksDdot( npw2, spsi(1), 1, ppsi(1), 1 )
|
es(2) = 2.D0 * MYDDOT( npw2, spsi(1), 1, ppsi(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -270,7 +250,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
|
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
|
||||||
if(m_start.le.m_end) then
|
if(m_start.le.m_end) then
|
||||||
!$acc host_data use_device(psi, scg)
|
!$acc host_data use_device(psi, scg)
|
||||||
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npw2, scg, 1, 0.D0, lagrange_d(m_start), 1 )
|
CALL MYDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npw2, scg, 1, 0.D0, lagrange_d(m_start), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
endif
|
endif
|
||||||
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
|
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
|
||||||
|
@ -301,7 +281,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(g)
|
!$acc host_data use_device(g)
|
||||||
gg1 = 2.D0 * ksDdot( npw2, g(1), 1, g0_d(1), 1 )
|
gg1 = 2.D0 * MYDDOT( npw2, g(1), 1, g0_d(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$acc update self(g)
|
!$acc update self(g)
|
||||||
|
@ -328,7 +308,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
!$acc end kernels
|
!$acc end kernels
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(g)
|
!$acc host_data use_device(g)
|
||||||
gg = 2.D0 * ksDdot( npw2, g(1), 1, g0_d(1), 1 )
|
gg = 2.D0 * MYDDOT( npw2, g(1), 1, g0_d(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$acc update self(g)
|
!$acc update self(g)
|
||||||
|
@ -390,7 +370,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
|
CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(cg, scg)
|
!$acc host_data use_device(cg, scg)
|
||||||
cg0 = 2.D0 * ksDdot( npw2, cg(1), 1, scg(1), 1 )
|
cg0 = 2.D0 * MYDDOT( npw2, cg(1), 1, scg(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$acc update self(cg, scg)
|
!$acc update self(cg, scg)
|
||||||
|
@ -411,7 +391,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
! ... <y(t)|P^2S|y(t)> = 1
|
! ... <y(t)|P^2S|y(t)> = 1
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, ppsi)
|
!$acc host_data use_device(psi, ppsi)
|
||||||
a0 = 4.D0 * ksDdot( npw2, psi(1,m), 1, ppsi(1), 1 )
|
a0 = 4.D0 * MYDDOT( npw2, psi(1,m), 1, ppsi(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$acc update self(psi, ppsi)
|
!$acc update self(psi, ppsi)
|
||||||
|
@ -425,7 +405,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, &
|
||||||
CALL mp_sum( a0 , intra_bgrp_comm )
|
CALL mp_sum( a0 , intra_bgrp_comm )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(cg, ppsi)
|
!$acc host_data use_device(cg, ppsi)
|
||||||
b0 = 2.D0 * ksDdot( npw2, cg(1), 1, ppsi(1), 1 )
|
b0 = 2.D0 * MYDDOT( npw2, cg(1), 1, ppsi(1), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$acc update self(cg, ppsi)
|
!$acc update self(cg, ppsi)
|
||||||
|
|
|
@ -127,7 +127,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
|
||||||
!
|
!
|
||||||
! ... Set initial : |phi_j> = |psi_j>
|
! ... Set initial : |phi_j> = |psi_j>
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
|
CALL MYDCOPY( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||||
!
|
!
|
||||||
|
@ -141,7 +141,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) THEN
|
IF ( eigen_ ) THEN
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
|
CALL MYDCOPY( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -155,7 +155,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
|
CALL MYDCOPY( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -217,14 +217,14 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
|
||||||
! ... Copy psi <- phi
|
! ... Copy psi <- phi
|
||||||
!
|
!
|
||||||
!CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
|
!CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
|
||||||
CALL DCOPY_gpu( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
|
CALL MYDCOPY( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL DCOPY_gpu( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
|
CALL MYDCOPY( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
|
||||||
!CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
|
!CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL DCOPY_gpu( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
|
CALL MYDCOPY( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
|
||||||
!CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
|
!CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
|
||||||
!
|
!
|
||||||
! ... Calculate energy eigenvalues
|
! ... Calculate energy eigenvalues
|
||||||
|
@ -259,7 +259,7 @@ CONTAINS
|
||||||
INTEGER :: ibnd
|
INTEGER :: ibnd
|
||||||
REAL(DP) :: norm
|
REAL(DP) :: norm
|
||||||
REAL(DP) :: psi_ibnd
|
REAL(DP) :: psi_ibnd
|
||||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
REAL(DP), EXTERNAL :: MYDDOT
|
||||||
!
|
!
|
||||||
DO ibnd = ibnd_start, ibnd_end
|
DO ibnd = ibnd_start, ibnd_end
|
||||||
!
|
!
|
||||||
|
@ -269,24 +269,24 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||||
spsi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
|
spsi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
psi_ibnd = -spsi_d(1,ibnd)
|
psi_ibnd = -spsi_d(1,ibnd)
|
||||||
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, &
|
CALL MYDAXPY( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, &
|
||||||
sr_d(1), 1 )
|
sr_d(1), 1 )
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||||
psi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
|
psi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
|
||||||
psi_ibnd = -psi_d(1,ibnd)
|
psi_ibnd = -psi_d(1,ibnd)
|
||||||
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDAXPY( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, &
|
||||||
sr_d(1), 1 )
|
sr_d(1), 1 )
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
@ -296,7 +296,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
|
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, &
|
||||||
sr_d(1), 1, 1._DP, phi_d(1,ibnd), 1 )
|
sr_d(1), 1, 1._DP, phi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||||
|
@ -309,7 +309,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) THEN
|
IF ( eigen_ ) THEN
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, &
|
||||||
sr_d(1), 1, 1._DP, hphi_d(1,ibnd), 1 )
|
sr_d(1), 1, 1._DP, hphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||||
|
@ -324,7 +324,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, &
|
||||||
sr_d(1), 1, 1._DP, sphi_d(1,ibnd), 1 )
|
sr_d(1), 1, 1._DP, sphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||||
|
@ -343,7 +343,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 )
|
norm = 2._DP * MYDDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$cuf kernel do(1)
|
!$cuf kernel do(1)
|
||||||
|
@ -354,7 +354,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 )
|
norm = 2._DP * MYDDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$cuf kernel do(1)
|
!$cuf kernel do(1)
|
||||||
|
@ -372,7 +372,7 @@ CONTAINS
|
||||||
IF ( norm < eps16 ) &
|
IF ( norm < eps16 ) &
|
||||||
CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 )
|
CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 )
|
||||||
!
|
!
|
||||||
CALL DSCAL_gpu( npw2, 1._DP / norm, phi_d(1,ibnd), 1 )
|
CALL MYDSCAL( npw2, 1._DP / norm, phi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -384,7 +384,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) THEN
|
IF ( eigen_ ) THEN
|
||||||
!
|
!
|
||||||
CALL DSCAL_gpu( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 )
|
CALL MYDSCAL( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -398,7 +398,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DSCAL_gpu( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 )
|
CALL MYDSCAL( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -435,20 +435,20 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||||
spsi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize )
|
spsi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) &
|
IF ( gstart == 2 ) &
|
||||||
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
|
CALL MYDGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
|
||||||
spsi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize )
|
spsi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||||
psi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize )
|
psi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) &
|
IF ( gstart == 2 ) &
|
||||||
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
|
CALL MYDGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
|
||||||
psi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize )
|
psi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -457,7 +457,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
|
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
|
||||||
!
|
!
|
||||||
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, &
|
||||||
sr2_d(1,1), nbsize, 1._DP, phi_d(1,jbnd_start), npwx2 )
|
sr2_d(1,1), nbsize, 1._DP, phi_d(1,jbnd_start), npwx2 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||||
|
@ -471,7 +471,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) THEN
|
IF ( eigen_ ) THEN
|
||||||
!
|
!
|
||||||
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, &
|
||||||
sr2_d(1,1), nbsize, 1._DP, hphi_d(1,jbnd_start), npwx2 )
|
sr2_d(1,1), nbsize, 1._DP, hphi_d(1,jbnd_start), npwx2 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||||
|
@ -487,7 +487,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, &
|
CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, &
|
||||||
sr2_d(1,1), nbsize, 1._DP, sphi_d(1,jbnd_start), npwx2 )
|
sr2_d(1,1), nbsize, 1._DP, sphi_d(1,jbnd_start), npwx2 )
|
||||||
!
|
!
|
||||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||||
|
@ -513,7 +513,7 @@ CONTAINS
|
||||||
INTEGER :: ibnd, ibnd_start, ibnd_end
|
INTEGER :: ibnd, ibnd_start, ibnd_end
|
||||||
REAL(DP) :: e_ibnd
|
REAL(DP) :: e_ibnd
|
||||||
!
|
!
|
||||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
REAL(DP), EXTERNAL :: MYDDOT
|
||||||
!
|
!
|
||||||
! ... <psi_i| H |psi_i>
|
! ... <psi_i| H |psi_i>
|
||||||
!
|
!
|
||||||
|
@ -523,7 +523,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
DO ibnd = ibnd_start, ibnd_end
|
DO ibnd = ibnd_start, ibnd_end
|
||||||
!
|
!
|
||||||
e(ibnd) = 2._DP * gpu_DDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 )
|
e(ibnd) = 2._DP * MYDDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
!$cuf kernel do(1)
|
!$cuf kernel do(1)
|
||||||
|
@ -563,13 +563,13 @@ CONTAINS
|
||||||
e(ibnd) = e(ibnd-1)
|
e(ibnd) = e(ibnd-1)
|
||||||
e(ibnd-1) = e0
|
e(ibnd-1) = e0
|
||||||
!
|
!
|
||||||
CALL DSWAP_gpu( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
|
CALL MYDSWAP( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL DSWAP_gpu( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
|
CALL MYDSWAP( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL DSWAP_gpu( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
|
CALL MYDSWAP( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
|
|
@ -142,13 +142,13 @@ SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e,
|
||||||
!
|
!
|
||||||
! ... Set initial : |phi_j> = |psi_j>
|
! ... Set initial : |phi_j> = |psi_j>
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
|
CALL MYZCOPY( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL ZCOPY_gpu( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
|
CALL MYZCOPY( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZCOPY_gpu( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
|
CALL MYZCOPY( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
! ... Allocate buffers
|
! ... Allocate buffers
|
||||||
|
@ -204,13 +204,13 @@ SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e,
|
||||||
!
|
!
|
||||||
! ... Copy psi <- phi
|
! ... Copy psi <- phi
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
|
CALL MYZCOPY( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL ZCOPY_gpu( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
|
CALL MYZCOPY( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZCOPY_gpu( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
|
CALL MYZCOPY( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
|
||||||
!
|
!
|
||||||
! ... Calculate energy eigenvalues
|
! ... Calculate energy eigenvalues
|
||||||
!
|
!
|
||||||
|
@ -240,7 +240,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
INTEGER :: ibnd
|
INTEGER :: ibnd
|
||||||
REAL(DP) :: norm
|
REAL(DP) :: norm
|
||||||
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
|
COMPLEX(DP), EXTERNAL :: MYZDOTC
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
DO ibnd = ibnd_start, ibnd_end
|
DO ibnd = ibnd_start, ibnd_end
|
||||||
|
@ -251,12 +251,12 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
|
||||||
spsi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
|
spsi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
|
||||||
psi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
|
psi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -266,16 +266,16 @@ CONTAINS
|
||||||
!
|
!
|
||||||
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
|
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
|
||||||
!
|
!
|
||||||
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, &
|
||||||
sc_d(1), 1, ONE, phi_d(1,ibnd), 1 )
|
sc_d(1), 1, ONE, phi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, &
|
||||||
sc_d(1), 1, ONE, hphi_d(1,ibnd), 1 )
|
sc_d(1), 1, ONE, hphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, &
|
||||||
sc_d(1), 1, ONE, sphi_d(1,ibnd), 1 )
|
sc_d(1), 1, ONE, sphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -284,11 +284,11 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) )
|
norm = DBLE( MYZDOTC( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) )
|
norm = DBLE( MYZDOTC( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
@ -299,13 +299,13 @@ CONTAINS
|
||||||
IF ( norm < eps16 ) &
|
IF ( norm < eps16 ) &
|
||||||
CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
|
CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
|
||||||
!
|
!
|
||||||
CALL ZDSCAL_gpu( kdim, 1._DP / norm, phi_d(1,ibnd), 1 )
|
CALL MYZDSCAL( kdim, 1._DP / norm, phi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL ZDSCAL_gpu( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 )
|
CALL MYZDSCAL( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZDSCAL_gpu( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 )
|
CALL MYZDSCAL( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
!
|
!
|
||||||
|
@ -332,12 +332,12 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
|
||||||
spsi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize )
|
spsi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
|
||||||
psi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize )
|
psi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -346,15 +346,15 @@ CONTAINS
|
||||||
!
|
!
|
||||||
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
|
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
|
||||||
!
|
!
|
||||||
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, &
|
||||||
sc2_d(1,1), nbsize, ONE, phi_d(1,jbnd_start), kdmx )
|
sc2_d(1,1), nbsize, ONE, phi_d(1,jbnd_start), kdmx )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, &
|
||||||
sc2_d(1,1), nbsize, ONE, hphi_d(1,jbnd_start), kdmx )
|
sc2_d(1,1), nbsize, ONE, hphi_d(1,jbnd_start), kdmx )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, &
|
CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, &
|
||||||
sc2_d(1,1), nbsize, ONE, sphi_d(1,jbnd_start), kdmx )
|
sc2_d(1,1), nbsize, ONE, sphi_d(1,jbnd_start), kdmx )
|
||||||
!
|
!
|
||||||
RETURN
|
RETURN
|
||||||
|
@ -368,7 +368,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
INTEGER :: ibnd, ibnd_start, ibnd_end
|
INTEGER :: ibnd, ibnd_start, ibnd_end
|
||||||
!
|
!
|
||||||
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
|
COMPLEX(DP), EXTERNAL :: MYZDOTC
|
||||||
!
|
!
|
||||||
! ... <psi_i| H |psi_i>
|
! ... <psi_i| H |psi_i>
|
||||||
!
|
!
|
||||||
|
@ -378,7 +378,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
DO ibnd = ibnd_start, ibnd_end
|
DO ibnd = ibnd_start, ibnd_end
|
||||||
!
|
!
|
||||||
e(ibnd) = DBLE( ZDOTC_gpu( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) )
|
e(ibnd) = DBLE( MYZDOTC( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
!
|
!
|
||||||
|
@ -410,13 +410,13 @@ CONTAINS
|
||||||
e(ibnd) = e(ibnd-1)
|
e(ibnd) = e(ibnd-1)
|
||||||
e(ibnd-1) = e0
|
e(ibnd-1) = e0
|
||||||
!
|
!
|
||||||
CALL ZSWAP_gpu( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
|
CALL MYZSWAP( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
|
||||||
!
|
!
|
||||||
IF ( eigen_ ) &
|
IF ( eigen_ ) &
|
||||||
CALL ZSWAP_gpu( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
|
CALL MYZSWAP( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZSWAP_gpu( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
|
CALL MYZSWAP( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
|
|
@ -1,252 +1,3 @@
|
||||||
subroutine gpu_DGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
character*1 transa, transb
|
|
||||||
integer :: m, n, k, lda, ldb, ldc
|
|
||||||
DOUBLE PRECISION :: alpha, beta
|
|
||||||
DOUBLE PRECISION, dimension(lda, *) :: A
|
|
||||||
DOUBLE PRECISION, dimension(ldb, *) :: B
|
|
||||||
DOUBLE PRECISION, dimension(ldc, *) :: C
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, B, C
|
|
||||||
call cublasDGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end subroutine gpu_DGEMM
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
subroutine gpu_ZGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
character*1 transa, transb
|
|
||||||
integer :: m, n, k, lda, ldb, ldc
|
|
||||||
DOUBLE COMPLEX :: alpha, beta
|
|
||||||
DOUBLE COMPLEX, dimension(lda, *) :: A
|
|
||||||
DOUBLE COMPLEX, dimension(ldb, *) :: B
|
|
||||||
DOUBLE COMPLEX, dimension(ldc, *) :: C
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, B, C
|
|
||||||
call cublasZGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end subroutine gpu_ZGEMM
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
subroutine gpu_DGER (m, n, alpha, x, incx, y, incy, a, lda)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
integer :: m, n, lda, incx, incy
|
|
||||||
DOUBLE PRECISION :: alpha
|
|
||||||
DOUBLE PRECISION, dimension(lda, *) :: A
|
|
||||||
DOUBLE PRECISION, dimension(*) :: x, y
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, x, y
|
|
||||||
call cublasDGER(m, n, alpha, x, incx, y, incy, a, lda)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end subroutine gpu_DGER
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
function gpu_DDOT (n, dx, incx, dy, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
DOUBLE PRECISION :: gpu_DDOT
|
|
||||||
integer :: n, incx, incy
|
|
||||||
DOUBLE PRECISION, dimension(*) :: dx, dy
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: dx, dy
|
|
||||||
gpu_DDOT=cublasDDOT(n, dx, incx, dy, incy)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end function gpu_DDOT
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
subroutine gpu_DTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
character*1 :: side, uplo, transa, diag
|
|
||||||
integer :: m, n, lda, ldb
|
|
||||||
DOUBLE PRECISION :: alpha
|
|
||||||
DOUBLE PRECISION, dimension(lda, *) :: a
|
|
||||||
DOUBLE PRECISION, dimension(ldb, *) :: b
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: a, b
|
|
||||||
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end subroutine gpu_DTRSM
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
DOUBLE COMPLEX function ZDOTC_gpu(n, zx, incx, zy, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
integer :: n, incx, incy
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: zx, zy
|
|
||||||
ZDOTC_gpu = cublasZDOTC(n, zx, incx, zy, incy)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end function ZDOTC_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE ZSWAP_gpu(n, zx, incx, zy, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
integer :: n, incx, incy
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: zx, zy
|
|
||||||
CALL cublasZSWAP(n, zx, incx, zy, incy)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
END SUBROUTINE ZSWAP_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE ZCOPY_gpu(n, zx, incx, zy, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
INTEGER :: n, incx, incy
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: zx, zy
|
|
||||||
CALL cublasZCOPY(n, zx, incx, zy, incy)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE ZCOPY_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE ZAXPY_gpu(n, za, zx, incx, zy, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
INTEGER :: n, incx, incy
|
|
||||||
DOUBLE COMPLEX :: za
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: zx, zy
|
|
||||||
CALL cublasZAXPY(n, za, zx, incx, zy, incy)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE ZAXPY_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE ZGEMV_gpu(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
CHARACTER :: trans
|
|
||||||
INTEGER :: lda, m, n, incx, incy
|
|
||||||
DOUBLE COMPLEX :: alpha, beta
|
|
||||||
DOUBLE COMPLEX, dimension(lda, *) :: a
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: x, y
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: a, x, y
|
|
||||||
CALL cublasZGEMV(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE ZGEMV_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE ZDSCAL_gpu(n, da, zx, incx)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
INTEGER :: n, incx
|
|
||||||
DOUBLE PRECISION :: da
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: zx
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: zx
|
|
||||||
CALL cublasZDSCAL(n, da, zx, incx)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE ZDSCAL_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE ZSCAL_gpu(n, za, zx, incx)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
INTEGER :: n, incx
|
|
||||||
DOUBLE COMPLEX :: za
|
|
||||||
DOUBLE COMPLEX, dimension(*) :: zx
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: zx
|
|
||||||
CALL cublasZSCAL(n, za, zx, incx)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE ZSCAL_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE DGEMV_gpu(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
use cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
DOUBLE PRECISION :: ALPHA,BETA
|
|
||||||
INTEGER :: INCX,INCY,LDA,M,N
|
|
||||||
CHARACTER :: TRANS
|
|
||||||
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, X, Y
|
|
||||||
call cublasDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE DGEMV_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE DCOPY_gpu(n, x, incx, y, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
INTEGER :: n, incx, incy
|
|
||||||
DOUBLE PRECISION, INTENT(IN) :: x(*)
|
|
||||||
DOUBLE PRECISION, INTENT(OUT) :: y(*)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: x, y
|
|
||||||
call cublasDCOPY(n, x, incx, y, incy)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE DCOPY_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE DAXPY_gpu(n, a, x, incx, y, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
INTEGER :: n, incx, incy
|
|
||||||
DOUBLE PRECISION, INTENT(IN) :: a
|
|
||||||
DOUBLE PRECISION, INTENT(IN) :: x(*)
|
|
||||||
DOUBLE PRECISION, INTENT(OUT) :: y(*)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: x, y
|
|
||||||
call cublasDAXPY( n, a, x, incx, y, incy)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE DAXPY_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE DSCAL_gpu(n, a, x, incx)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
|
||||||
integer :: n, incx
|
|
||||||
DOUBLE PRECISION :: a
|
|
||||||
DOUBLE PRECISION, dimension(*) :: x
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: x
|
|
||||||
call cublasDSCAL(n, a, x, incx)
|
|
||||||
#endif
|
|
||||||
RETURN
|
|
||||||
END SUBROUTINE DSCAL_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
SUBROUTINE gpu_threaded_memset(array, val, length)
|
SUBROUTINE gpu_threaded_memset(array, val, length)
|
||||||
!
|
!
|
||||||
|
@ -384,18 +135,3 @@ SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
END SUBROUTINE gpu_threaded_backassign
|
END SUBROUTINE gpu_threaded_backassign
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
SUBROUTINE DSWAP_gpu(n, dx, incx, dy, incy)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
integer :: n, incx, incy
|
|
||||||
REAL(8), dimension(*) :: dx, dy
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: dx, dy
|
|
||||||
CALL cublasDSWAP(n, dx, incx, dy, incy)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
END SUBROUTINE DSWAP_gpu
|
|
||||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
|
|
|
@ -191,8 +191,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO
|
call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('T','N', nbnd, my_n, npw2, 2.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
|
CALL MYDGEMM('T','N', nbnd, my_n, npw2, 2.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
|
IF ( gstart == 2 ) CALL MYDGER( nbnd, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
|
||||||
CALL mp_sum( G_d, inter_bgrp_comm )
|
CALL mp_sum( G_d, inter_bgrp_comm )
|
||||||
CALL mp_sum( G_d, intra_bgrp_comm )
|
CALL mp_sum( G_d, intra_bgrp_comm )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -203,10 +203,10 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('N','N',npw2, nbnd, my_n, -ONE, spsi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, nbnd, my_n, -ONE, spsi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
|
||||||
else
|
else
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('N','N',npw2, nbnd, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, nbnd, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2)
|
||||||
end if
|
end if
|
||||||
CALL mp_sum( w_d, inter_bgrp_comm )
|
CALL mp_sum( w_d, inter_bgrp_comm )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -260,12 +260,12 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM( 'T','N', my_n, nact, npw2, 2.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
|
CALL MYDGEMM( 'T','N', my_n, nact, npw2, 2.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
|
IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
|
||||||
else
|
else
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM( 'T','N', my_n, nact, npw2, 2.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
|
CALL MYDGEMM( 'T','N', my_n, nact, npw2, 2.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd )
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
|
IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd )
|
||||||
end if
|
end if
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm )
|
CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm )
|
||||||
|
@ -277,7 +277,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer_d, w_d, npwx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, w_d, npwx, nact, .true., act_idx_d, .true. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('N','N', npw2, nact, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, nact, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
||||||
buffer = buffer_d
|
buffer = buffer_d
|
||||||
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
||||||
buffer_d = buffer
|
buffer_d = buffer
|
||||||
|
@ -343,8 +343,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, npwx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, npwx, nact, .true., act_idx_d, .false. )
|
||||||
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('T','N', my_n, nact, npw2, 2.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, 0.D0, G_d(n_start,1), nbnd)
|
CALL MYDGEMM('T','N', my_n, nact, npw2, 2.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, 0.D0, G_d(n_start,1), nbnd)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, G_d(n_start,1), nbnd )
|
IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, G_d(n_start,1), nbnd )
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
|
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
|
||||||
CALL mp_sum( G(1:nact,1:nact), intra_bgrp_comm )
|
CALL mp_sum( G(1:nact,1:nact), intra_bgrp_comm )
|
||||||
|
@ -356,7 +356,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, p_d, npwx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, p_d, npwx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, psi_d, npwx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, psi_d, npwx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) & ! could be done differently
|
if (n_start .le. n_end) & ! could be done differently
|
||||||
CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
||||||
buffer = buffer_d
|
buffer = buffer_d
|
||||||
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
||||||
buffer_d = buffer
|
buffer_d = buffer
|
||||||
|
@ -368,7 +368,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, hp_d, npwx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, hp_d, npwx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
||||||
buffer = buffer_d
|
buffer = buffer_d
|
||||||
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
||||||
buffer_d = buffer
|
buffer_d = buffer
|
||||||
|
@ -381,7 +381,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, sp_d, npwx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, sp_d, npwx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
||||||
buffer = buffer_d
|
buffer = buffer_d
|
||||||
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
||||||
buffer_d = buffer
|
buffer_d = buffer
|
||||||
|
@ -416,44 +416,44 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d, sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d, sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d, sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d, sbsize3 )
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false.)
|
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false.)
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d, sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d, sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d, sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d, sbsize3 )
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, l+1), sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, l+1), sbsize3 )
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, l+1 ), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, l+1 ), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, l+1), sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, l+1), sbsize3 )
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, l+1), sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, l+1), sbsize3 )
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, l+1), sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, l+1), sbsize3 )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
|
@ -463,30 +463,30 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(2*l + 1, 2*l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(2*l + 1, 2*l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(2*l + 1, 2*l+1 ), sbsize3 )
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(2*l + 1, 2*l+1 ), sbsize3 )
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(2*l + 1, 2*l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(2*l + 1, 2*l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(2*l + 1, 2*l+1 ), sbsize3)
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(2*l + 1, 2*l+1 ), sbsize3)
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, 2*l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, 2*l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, 2*l+1), sbsize3)
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, 2*l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, 2*l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, 2*l+1), sbsize3)
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, 2*l+1), sbsize3)
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
|
@ -494,16 +494,16 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, 2*l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, 2*l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, 2*l+1), sbsize3)
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, 2*l+1), sbsize3)
|
CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, 2*l+1), sbsize3)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, 2*l+1), sbsize3)
|
IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, 2*l+1), sbsize3)
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -594,18 +594,18 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
|
||||||
! p(:,col_idx(1:l)) = buffer(:,1:l)
|
! p(:,col_idx(1:l)) = buffer(:,1:l)
|
||||||
call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d )
|
call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
|
||||||
! hp(:,col_idx(1:l)) = buffer(:,1:l)
|
! hp(:,col_idx(1:l)) = buffer(:,1:l)
|
||||||
call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d )
|
call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -613,9 +613,9 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2)
|
||||||
! sp(:,col_idx(1:l)) = buffer(:,1:l)
|
! sp(:,col_idx(1:l)) = buffer(:,1:l)
|
||||||
call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d )
|
call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -624,14 +624,14 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
! p(:,col_idx(1:l)) = buffer(:, 1:l)
|
! p(:,col_idx(1:l)) = buffer(:, 1:l)
|
||||||
call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d )
|
call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
! hp(:,col_idx(1:l)) = buffer(:, 1:l)
|
! hp(:,col_idx(1:l)) = buffer(:, 1:l)
|
||||||
call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d )
|
call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -639,7 +639,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
! sp(:,col_idx(1:l)) = buffer(:, 1:l)
|
! sp(:,col_idx(1:l)) = buffer(:, 1:l)
|
||||||
call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d )
|
call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -649,14 +649,14 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
! Update the sub-blocks of psi and hpsi (and spsi)
|
! Update the sub-blocks of psi and hpsi (and spsi)
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, psi_d, npwx, l, .true., col_idx_d )
|
call gpu_threaded_assign( buffer1_d, psi_d, npwx, l, .true., col_idx_d )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
! psi(:,col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l))
|
! psi(:,col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l))
|
||||||
call gpu_threaded_backassign( psi_d, col_idx_d, buffer_d, npwx, l, .true., p_d )
|
call gpu_threaded_backassign( psi_d, col_idx_d, buffer_d, npwx, l, .true., p_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
! hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l))
|
! hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l))
|
||||||
call gpu_threaded_backassign( hpsi_d, col_idx_d, buffer_d, npwx, l, .true., hp_d )
|
call gpu_threaded_backassign( hpsi_d, col_idx_d, buffer_d, npwx, l, .true., hp_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -664,7 +664,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2)
|
||||||
! spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l))
|
! spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l))
|
||||||
call gpu_threaded_backassign( spsi_d, col_idx_d, buffer_d, npwx, l, .true., sp_d )
|
call gpu_threaded_backassign( spsi_d, col_idx_d, buffer_d, npwx, l, .true., sp_d )
|
||||||
call stop_clock('ppcg:dgemm')
|
call stop_clock('ppcg:dgemm')
|
||||||
|
@ -857,8 +857,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO
|
call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO
|
||||||
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('T','N', nact, my_n, npw2, 2.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
|
CALL MYDGEMM('T','N', nact, my_n, npw2, 2.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd)
|
||||||
IF ( gstart == 2 ) CALL gpu_DGER( nact, my_n, -1.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
|
IF ( gstart == 2 ) CALL MYDGER( nact, my_n, -1.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, G_d(1,n_start), nbnd )
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
|
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
|
||||||
!
|
!
|
||||||
|
@ -875,7 +875,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
end if
|
end if
|
||||||
call start_clock('ppcg:dgemm')
|
call start_clock('ppcg:dgemm')
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_DGEMM('N','N',npw2, nact, my_n, -ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
CALL MYDGEMM('N','N',npw2, nact, my_n, -ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2)
|
||||||
buffer = buffer_d
|
buffer = buffer_d
|
||||||
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
|
||||||
buffer_d = buffer
|
buffer_d = buffer
|
||||||
|
@ -1779,13 +1779,13 @@ CONTAINS
|
||||||
! this proc sends his block
|
! this proc sends his block
|
||||||
!
|
!
|
||||||
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
|
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
|
||||||
CALL gpu_DGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gl, nx, gamm, Xtmp(1,ic), ld2 )
|
CALL MYDGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gl, nx, gamm, Xtmp(1,ic), ld2 )
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
! all other procs receive
|
! all other procs receive
|
||||||
!
|
!
|
||||||
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
|
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
|
||||||
CALL gpu_DGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gltmp, nx, gamm, Xtmp(1,ic), ld2 )
|
CALL MYDGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gltmp, nx, gamm, Xtmp(1,ic), ld2 )
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
gamm = ONE
|
gamm = ONE
|
||||||
|
|
|
@ -196,7 +196,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
G_d = C_ZERO
|
G_d = C_ZERO
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi_d, kdimx, hpsi_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd)
|
CALL MYZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi_d, kdimx, hpsi_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd)
|
||||||
CALL mp_sum( G_d, inter_bgrp_comm )
|
CALL mp_sum( G_d, inter_bgrp_comm )
|
||||||
!
|
!
|
||||||
CALL mp_sum( G_d, intra_bgrp_comm )
|
CALL mp_sum( G_d, intra_bgrp_comm )
|
||||||
|
@ -208,10 +208,10 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
|
CALL MYZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
|
||||||
else
|
else
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
|
CALL MYZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx)
|
||||||
end if
|
end if
|
||||||
CALL mp_sum( w_d, inter_bgrp_comm )
|
CALL mp_sum( w_d, inter_bgrp_comm )
|
||||||
call stop_clock('ppcg:zgemm')
|
call stop_clock('ppcg:zgemm')
|
||||||
|
@ -267,11 +267,11 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, &
|
CALL MYZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, &
|
||||||
C_ZERO, G_d(n_start,1), nbnd )
|
C_ZERO, G_d(n_start,1), nbnd )
|
||||||
else
|
else
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, &
|
CALL MYZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, &
|
||||||
C_ZERO, G_d(n_start,1), nbnd )
|
C_ZERO, G_d(n_start,1), nbnd )
|
||||||
end if
|
end if
|
||||||
G = G_d
|
G = G_d
|
||||||
|
@ -285,7 +285,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, w_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, w_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, &
|
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, &
|
||||||
buffer_d, kdimx)
|
buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel DO(2)
|
!$cuf kernel DO(2)
|
||||||
|
@ -377,7 +377,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, &
|
CALL MYZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, &
|
||||||
kdimx, C_ZERO, G_d(n_start,1), nbnd)
|
kdimx, C_ZERO, G_d(n_start,1), nbnd)
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
|
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
|
||||||
|
@ -391,7 +391,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, p_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, p_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) & ! could be done differently
|
if (n_start .le. n_end) & ! could be done differently
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
nbnd, C_ONE, buffer_d, kdimx)
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
|
@ -406,7 +406,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, hp_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, hp_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
nbnd, C_ONE, buffer_d, kdimx)
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
|
@ -422,7 +422,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, sp_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, sp_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
nbnd, C_ONE, buffer_d, kdimx)
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
|
@ -460,7 +460,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d, sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d, sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
|
@ -475,19 +475,19 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d, sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d, sbsize3)
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, l+1), sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, l+1 ), sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, l+1 ), sbsize3)
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
|
@ -500,14 +500,14 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
END DO
|
END DO
|
||||||
END DO
|
END DO
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, l+1), sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, l+1), sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, l+1), sbsize3)
|
||||||
call stop_clock('ppcg:zgemm')
|
call stop_clock('ppcg:zgemm')
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
|
@ -517,7 +517,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
K_d(2*l + 1, 2*l+1), sbsize3)
|
K_d(2*l + 1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
|
@ -525,13 +525,13 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
M_d(2*l + 1, 2*l+1), sbsize3)
|
M_d(2*l + 1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
K_d(1, 2*l+1), sbsize3)
|
K_d(1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
|
@ -539,7 +539,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, 2*l+1), sbsize3)
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, 2*l+1), sbsize3)
|
||||||
call stop_clock('ppcg:zgemm')
|
call stop_clock('ppcg:zgemm')
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
|
@ -547,7 +547,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
K_d(l+1, 2*l+1), sbsize3)
|
K_d(l+1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
|
@ -555,7 +555,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
M_d(l+1, 2*l+1), sbsize3)
|
M_d(l+1, 2*l+1), sbsize3)
|
||||||
call stop_clock('ppcg:zgemm')
|
call stop_clock('ppcg:zgemm')
|
||||||
!
|
!
|
||||||
|
@ -640,9 +640,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -653,9 +653,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -667,9 +667,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -682,7 +682,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -693,7 +693,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -705,7 +705,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, c_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, c_ZERO, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -719,7 +719,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
! Update the sub-blocks of psi and hpsi (and spsi)
|
! Update the sub-blocks of psi and hpsi (and spsi)
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -730,7 +730,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -742,7 +742,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
|
CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx)
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
DO jj = 1, l
|
DO jj = 1, l
|
||||||
|
@ -968,7 +968,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
END DO
|
END DO
|
||||||
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, &
|
CALL MYZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, &
|
||||||
C_ZERO, G_d(1,n_start), nbnd)
|
C_ZERO, G_d(1,n_start), nbnd)
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
|
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
|
||||||
|
@ -986,7 +986,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, &
|
||||||
end if
|
end if
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
nbnd, C_ONE, buffer_d, kdimx)
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
|
@ -1792,13 +1792,13 @@ CONTAINS
|
||||||
! this proc sends his block
|
! this proc sends his block
|
||||||
!
|
!
|
||||||
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
|
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
|
||||||
CALL gpu_ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld )
|
CALL MYZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld )
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
! all other procs receive
|
! all other procs receive
|
||||||
!
|
!
|
||||||
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
|
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
|
||||||
CALL gpu_ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld )
|
CALL MYZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld )
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
gamm = C_ONE
|
gamm = C_ONE
|
||||||
|
|
|
@ -285,7 +285,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
INTEGER :: ibnd
|
INTEGER :: ibnd
|
||||||
!
|
!
|
||||||
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
|
COMPLEX(DP), EXTERNAL :: MYZDOTC
|
||||||
!
|
!
|
||||||
! ... Operate the Hamiltonian : H |psi>
|
! ... Operate the Hamiltonian : H |psi>
|
||||||
!
|
!
|
||||||
|
@ -312,7 +312,7 @@ CONTAINS
|
||||||
!$acc host_data use_device(psi, hpsi)
|
!$acc host_data use_device(psi, hpsi)
|
||||||
DO ibnd = ibnd_start, ibnd_end
|
DO ibnd = ibnd_start, ibnd_end
|
||||||
!
|
!
|
||||||
hw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) )
|
hw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
|
@ -326,11 +326,11 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
sw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) )
|
sw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
sw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, psi(1,ibnd), 1 ) )
|
sw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, psi(1,ibnd), 1 ) )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
@ -394,10 +394,10 @@ CONTAINS
|
||||||
IF ( conv(ibnd) ) CYCLE
|
IF ( conv(ibnd) ) CYCLE
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, spsi)
|
!$acc host_data use_device(psi, hpsi, spsi)
|
||||||
CALL ZCOPY_gpu( kdim, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
|
CALL MYZCOPY( kdim, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
|
||||||
CALL ZCOPY_gpu( kdim, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
|
CALL MYZCOPY( kdim, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZCOPY_gpu( kdim, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
|
CALL MYZCOPY( kdim, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
php(ibnd,idiis) = hw(ibnd)
|
php(ibnd,idiis) = hw(ibnd)
|
||||||
|
@ -419,15 +419,15 @@ CONTAINS
|
||||||
!
|
!
|
||||||
ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP )
|
ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP )
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYZCOPY( kdim, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYZAXPY( kdim, -ec, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYZAXPY( kdim, -ec, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
@ -435,19 +435,19 @@ CONTAINS
|
||||||
!
|
!
|
||||||
ec = CMPLX( php(ibnd,idiis), 0._DP, kind=DP )
|
ec = CMPLX( php(ibnd,idiis), 0._DP, kind=DP )
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYZCOPY( kdim, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYZAXPY( kdim, -ec, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYZAXPY( kdim, -ec, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
CALL ZGEMV_gpu( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
|
CALL MYZGEMV( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
|
||||||
vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 )
|
vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
|
@ -482,21 +482,21 @@ CONTAINS
|
||||||
!
|
!
|
||||||
DO kdiis = 1, idiis
|
DO kdiis = 1, idiis
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdim, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYZCOPY( kdim, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdim, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYZCOPY( kdim, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdim, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYZCOPY( kdim, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
CALL ZGEMV_gpu( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
|
CALL MYZGEMV( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, &
|
||||||
vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 )
|
vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
|
@ -550,29 +550,29 @@ CONTAINS
|
||||||
kvc = vc(kdiis)
|
kvc = vc(kdiis)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, spsi)
|
!$acc host_data use_device(psi, hpsi, spsi)
|
||||||
CALL ZAXPY_gpu( kdim, kvc, phi_d (1,ibnd,kdiis), 1, psi(1,ibnd), 1 )
|
CALL MYZAXPY( kdim, kvc, phi_d (1,ibnd,kdiis), 1, psi(1,ibnd), 1 )
|
||||||
CALL ZAXPY_gpu( kdim, kvc, hphi_d (1,ibnd,kdiis), 1, hpsi (1,ibnd), 1 )
|
CALL MYZAXPY( kdim, kvc, hphi_d (1,ibnd,kdiis), 1, hpsi (1,ibnd), 1 )
|
||||||
IF (uspp) CALL ZAXPY_gpu( kdim, kvc, sphi_d (1,ibnd,kdiis), 1, spsi (1,ibnd), 1 )
|
IF (uspp) CALL MYZAXPY( kdim, kvc, sphi_d (1,ibnd,kdiis), 1, spsi (1,ibnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
! ... Residual vectors
|
! ... Residual vectors
|
||||||
!
|
!
|
||||||
ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP )
|
ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP )
|
||||||
!
|
!
|
||||||
CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
CALL MYZCOPY( kdim, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, sphi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
|
CALL MYZAXPY( kdim, -ec, sphi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, phi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
|
CALL MYZAXPY( kdim, -ec, phi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(kpsi)
|
!$acc host_data use_device(kpsi)
|
||||||
CALL ZAXPY_gpu( kdim, kvc, vec1_d (1), 1, kpsi (1,kbnd), 1 )
|
CALL MYZAXPY( kdim, kvc, vec1_d (1), 1, kpsi (1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
|
@ -583,10 +583,10 @@ CONTAINS
|
||||||
!
|
!
|
||||||
norm = SQRT( sw(ibnd) )
|
norm = SQRT( sw(ibnd) )
|
||||||
!$acc host_data use_device(psi, hpsi, spsi)
|
!$acc host_data use_device(psi, hpsi, spsi)
|
||||||
CALL ZDSCAL_gpu( kdim, 1._DP / norm, psi (1,ibnd), 1 )
|
CALL MYZDSCAL( kdim, 1._DP / norm, psi (1,ibnd), 1 )
|
||||||
CALL ZDSCAL_gpu( kdim, 1._DP / norm, hpsi(1,ibnd), 1 )
|
CALL MYZDSCAL( kdim, 1._DP / norm, hpsi(1,ibnd), 1 )
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL ZDSCAL_gpu( kdim, 1._DP / norm, spsi(1,ibnd), 1 )
|
CALL MYZDSCAL( kdim, 1._DP / norm, spsi(1,ibnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
! ... Residual vectors
|
! ... Residual vectors
|
||||||
|
@ -594,15 +594,15 @@ CONTAINS
|
||||||
ec = CMPLX( hw(ibnd), 0._DP, kind=DP )
|
ec = CMPLX( hw(ibnd), 0._DP, kind=DP )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, spsi, hpsi, kpsi)
|
!$acc host_data use_device(psi, spsi, hpsi, kpsi)
|
||||||
CALL ZCOPY_gpu( kdim, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
CALL MYZCOPY( kdim, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
CALL MYZAXPY( kdim, -ec, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL ZAXPY_gpu( kdim, -ec, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
CALL MYZAXPY( kdim, -ec, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
|
@ -766,7 +766,7 @@ CONTAINS
|
||||||
COMPLEX(DP) :: z1, z2
|
COMPLEX(DP) :: z1, z2
|
||||||
REAL(DP), ALLOCATABLE :: coef(:,:)
|
REAL(DP), ALLOCATABLE :: coef(:,:)
|
||||||
!
|
!
|
||||||
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
|
COMPLEX(DP), EXTERNAL :: MYZDOTC
|
||||||
!
|
!
|
||||||
REAL(DP) :: ekinj
|
REAL(DP) :: ekinj
|
||||||
INTEGER :: idx
|
INTEGER :: idx
|
||||||
|
@ -901,21 +901,21 @@ CONTAINS
|
||||||
kbnd = ibnd_index(ibnd)
|
kbnd = ibnd_index(ibnd)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
|
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
|
||||||
php = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) )
|
php = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) )
|
||||||
khp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) )
|
khp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) )
|
||||||
khk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) )
|
khk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
psp = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) )
|
psp = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) )
|
||||||
ksp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) )
|
ksp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) )
|
||||||
ksk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) )
|
ksk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
psp = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, psi (1,ibnd), 1 ) )
|
psp = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, psi (1,ibnd), 1 ) )
|
||||||
ksp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) )
|
ksp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) )
|
||||||
ksk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) )
|
ksk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
|
@ -1029,16 +1029,16 @@ CONTAINS
|
||||||
z2 = CMPLX( coef(2,jbnd), 0._DP, kind=DP )
|
z2 = CMPLX( coef(2,jbnd), 0._DP, kind=DP )
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
|
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
|
||||||
CALL ZSCAL_gpu( kdim, z1, psi (1,ibnd), 1 )
|
CALL MYZSCAL( kdim, z1, psi (1,ibnd), 1 )
|
||||||
CALL ZAXPY_gpu( kdim, z2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
|
CALL MYZAXPY( kdim, z2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
CALL ZSCAL_gpu( kdim, z1, hpsi (1,ibnd), 1 )
|
CALL MYZSCAL( kdim, z1, hpsi (1,ibnd), 1 )
|
||||||
CALL ZAXPY_gpu( kdim, z2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
|
CALL MYZAXPY( kdim, z2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL ZSCAL_gpu( kdim, z1, spsi (1,ibnd), 1 )
|
CALL MYZSCAL( kdim, z1, spsi (1,ibnd), 1 )
|
||||||
CALL ZAXPY_gpu( kdim, z2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
|
CALL MYZAXPY( kdim, z2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
|
|
|
@ -297,7 +297,7 @@ CONTAINS
|
||||||
!
|
!
|
||||||
INTEGER :: ibnd
|
INTEGER :: ibnd
|
||||||
!
|
!
|
||||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
REAL(DP), EXTERNAL :: MYDDOT
|
||||||
!
|
!
|
||||||
! ... Operate the Hamiltonian : H |psi>
|
! ... Operate the Hamiltonian : H |psi>
|
||||||
!
|
!
|
||||||
|
@ -324,10 +324,10 @@ CONTAINS
|
||||||
!$acc host_data use_device(psi, hpsi)
|
!$acc host_data use_device(psi, hpsi)
|
||||||
DO ibnd = ibnd_start, ibnd_end
|
DO ibnd = ibnd_start, ibnd_end
|
||||||
!
|
!
|
||||||
hw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
|
hw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) &
|
IF ( gstart == 2 ) &
|
||||||
hw(ibnd)= hw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, hpsi(1,ibnd),1)
|
hw(ibnd)= hw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, hpsi(1,ibnd),1)
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
|
@ -341,17 +341,17 @@ CONTAINS
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
sw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, spsi(1,ibnd), 1 )
|
sw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, spsi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) &
|
IF ( gstart == 2 ) &
|
||||||
sw(ibnd)= sw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, spsi(1,ibnd),1)
|
sw(ibnd)= sw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, spsi(1,ibnd),1)
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
sw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, psi(1,ibnd), 1 )
|
sw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, psi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) &
|
IF ( gstart == 2 ) &
|
||||||
sw(ibnd )= sw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, psi(1,ibnd),1)
|
sw(ibnd )= sw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, psi(1,ibnd),1)
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
@ -413,10 +413,10 @@ CONTAINS
|
||||||
IF ( conv(ibnd) ) CYCLE
|
IF ( conv(ibnd) ) CYCLE
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, spsi)
|
!$acc host_data use_device(psi, hpsi, spsi)
|
||||||
CALL DCOPY_gpu( npw2, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
|
CALL MYDCOPY( npw2, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 )
|
||||||
CALL DCOPY_gpu( npw2, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
|
CALL MYDCOPY( npw2, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 )
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL DCOPY_gpu( npw2, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
|
CALL MYDCOPY( npw2, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
php(ibnd,idiis) = hw(ibnd)
|
php(ibnd,idiis) = hw(ibnd)
|
||||||
|
@ -438,15 +438,15 @@ CONTAINS
|
||||||
!
|
!
|
||||||
er = php(ibnd,kdiis)
|
er = php(ibnd,kdiis)
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYDCOPY( npw2, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
|
@ -454,19 +454,19 @@ CONTAINS
|
||||||
!
|
!
|
||||||
er = php(ibnd,idiis)
|
er = php(ibnd,idiis)
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYDCOPY( npw2, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
|
CALL MYDGEMV( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
|
||||||
vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 )
|
vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -507,21 +507,21 @@ CONTAINS
|
||||||
!
|
!
|
||||||
DO kdiis = 1, idiis
|
DO kdiis = 1, idiis
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npw2, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
CALL MYDCOPY( npw2, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 )
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npw2, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYDCOPY( npw2, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npw2, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
CALL MYDCOPY( npw2, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
CALL DGEMV_gpu( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
|
CALL MYDGEMV( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, &
|
||||||
vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 )
|
vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -580,30 +580,30 @@ CONTAINS
|
||||||
!
|
!
|
||||||
kvr = vr(kdiis)
|
kvr = vr(kdiis)
|
||||||
!$acc host_data use_device(psi, hpsi, spsi)
|
!$acc host_data use_device(psi, hpsi, spsi)
|
||||||
CALL DAXPY_gpu( npw2, kvr, phi_d (1,ibnd,kdiis), 1, psi (1,ibnd), 1 )
|
CALL MYDAXPY( npw2, kvr, phi_d (1,ibnd,kdiis), 1, psi (1,ibnd), 1 )
|
||||||
CALL DAXPY_gpu( npw2, kvr, hphi_d(1,ibnd,kdiis), 1, hpsi(1,ibnd), 1 )
|
CALL MYDAXPY( npw2, kvr, hphi_d(1,ibnd,kdiis), 1, hpsi(1,ibnd), 1 )
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL DAXPY_gpu( npw2, kvr, sphi_d(1,ibnd,kdiis), 1, spsi(1,ibnd), 1 )
|
CALL MYDAXPY( npw2, kvr, sphi_d(1,ibnd,kdiis), 1, spsi(1,ibnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
! ... Residual vectors
|
! ... Residual vectors
|
||||||
!
|
!
|
||||||
er = php(ibnd,kdiis)
|
er = php(ibnd,kdiis)
|
||||||
!
|
!
|
||||||
CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
CALL MYDCOPY( npw2, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(kpsi)
|
!$acc host_data use_device(kpsi)
|
||||||
CALL DAXPY_gpu( npw2, kvr, vec1_d(1), 1, kpsi(1,kbnd), 1 )
|
CALL MYDAXPY( npw2, kvr, vec1_d(1), 1, kpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
END DO
|
END DO
|
||||||
|
@ -614,10 +614,10 @@ CONTAINS
|
||||||
!
|
!
|
||||||
norm = SQRT( sw(ibnd) )
|
norm = SQRT( sw(ibnd) )
|
||||||
!$acc host_data use_device(psi, hpsi, spsi)
|
!$acc host_data use_device(psi, hpsi, spsi)
|
||||||
CALL DSCAL_gpu( npw2, 1._DP / norm, psi (1,ibnd), 1 )
|
CALL MYDSCAL( npw2, 1._DP / norm, psi (1,ibnd), 1 )
|
||||||
CALL DSCAL_gpu( npw2, 1._DP / norm, hpsi(1,ibnd), 1 )
|
CALL MYDSCAL( npw2, 1._DP / norm, hpsi(1,ibnd), 1 )
|
||||||
IF ( uspp ) &
|
IF ( uspp ) &
|
||||||
CALL DSCAL_gpu( npw2, 1._DP / norm, spsi(1,ibnd), 1 )
|
CALL MYDSCAL( npw2, 1._DP / norm, spsi(1,ibnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
! ... Residual vectors
|
! ... Residual vectors
|
||||||
|
@ -625,21 +625,21 @@ CONTAINS
|
||||||
er = hw(ibnd)
|
er = hw(ibnd)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(hpsi, kpsi)
|
!$acc host_data use_device(hpsi, kpsi)
|
||||||
CALL DCOPY_gpu( npw2, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
CALL MYDCOPY( npw2, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(spsi, kpsi)
|
!$acc host_data use_device(spsi, kpsi)
|
||||||
CALL DAXPY_gpu( npw2, -er, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
CALL MYDAXPY( npw2, -er, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
!civn: here changed spsi --> psi due to a possible typo in the original version
|
!civn: here changed spsi --> psi due to a possible typo in the original version
|
||||||
!CALL DAXPY_gpu( npw2, -er, spsi_d(1,ibnd), 1, kpsi_d(1,kbnd), 1 )
|
!CALL MYDAXPY( npw2, -er, spsi_d(1,ibnd), 1, kpsi_d(1,kbnd), 1 )
|
||||||
!$acc host_data use_device(psi, kpsi)
|
!$acc host_data use_device(psi, kpsi)
|
||||||
CALL DAXPY_gpu( npw2, -er, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
CALL MYDAXPY( npw2, -er, psi(1,ibnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -819,7 +819,7 @@ CONTAINS
|
||||||
REAL(DP) :: c1, c2
|
REAL(DP) :: c1, c2
|
||||||
REAL(DP), ALLOCATABLE :: coef(:,:)
|
REAL(DP), ALLOCATABLE :: coef(:,:)
|
||||||
!
|
!
|
||||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
REAL(DP), EXTERNAL :: MYDDOT
|
||||||
!civn
|
!civn
|
||||||
INTEGER :: idx
|
INTEGER :: idx
|
||||||
REAL(DP) :: ekinj
|
REAL(DP) :: ekinj
|
||||||
|
@ -989,9 +989,9 @@ CONTAINS
|
||||||
kbnd = ibnd_index(ibnd)
|
kbnd = ibnd_index(ibnd)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, kpsi, hkpsi)
|
!$acc host_data use_device(psi, hpsi, kpsi, hkpsi)
|
||||||
php = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, hpsi (1,ibnd), 1 )
|
php = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, hpsi (1,ibnd), 1 )
|
||||||
khp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 )
|
khp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 )
|
||||||
khk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 )
|
khk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -1020,9 +1020,9 @@ CONTAINS
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, spsi, kpsi, skpsi)
|
!$acc host_data use_device(psi, spsi, kpsi, skpsi)
|
||||||
psp = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, spsi (1,ibnd), 1 )
|
psp = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, spsi (1,ibnd), 1 )
|
||||||
ksp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 )
|
ksp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 )
|
||||||
ksk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 )
|
ksk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -1051,9 +1051,9 @@ CONTAINS
|
||||||
ELSE
|
ELSE
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, kpsi)
|
!$acc host_data use_device(psi, kpsi)
|
||||||
psp = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, psi (1,ibnd), 1 )
|
psp = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, psi (1,ibnd), 1 )
|
||||||
ksp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, psi (1,ibnd), 1 )
|
ksp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, psi (1,ibnd), 1 )
|
||||||
ksk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 )
|
ksk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!
|
!
|
||||||
IF ( gstart == 2 ) THEN
|
IF ( gstart == 2 ) THEN
|
||||||
|
@ -1190,16 +1190,16 @@ CONTAINS
|
||||||
c2 = coef(2,jbnd)
|
c2 = coef(2,jbnd)
|
||||||
!
|
!
|
||||||
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
|
!$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi)
|
||||||
CALL DSCAL_gpu( npw2, c1, psi (1,ibnd), 1 )
|
CALL MYDSCAL( npw2, c1, psi (1,ibnd), 1 )
|
||||||
CALL DAXPY_gpu( npw2, c2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
|
CALL MYDAXPY( npw2, c2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
CALL DSCAL_gpu( npw2, c1, hpsi (1,ibnd), 1 )
|
CALL MYDSCAL( npw2, c1, hpsi (1,ibnd), 1 )
|
||||||
CALL DAXPY_gpu( npw2, c2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
|
CALL MYDAXPY( npw2, c2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
IF ( uspp ) THEN
|
IF ( uspp ) THEN
|
||||||
!
|
!
|
||||||
CALL DSCAL_gpu( npw2, c1, spsi (1,ibnd), 1 )
|
CALL MYDSCAL( npw2, c1, spsi (1,ibnd), 1 )
|
||||||
CALL DAXPY_gpu( npw2, c2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
|
CALL MYDAXPY( npw2, c2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 )
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
|
|
|
@ -257,9 +257,6 @@ CONTAINS
|
||||||
USE realus, ONLY : real_space, invfft_orbital_gamma, &
|
USE realus, ONLY : real_space, invfft_orbital_gamma, &
|
||||||
fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma
|
fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma
|
||||||
use gvect, only : gstart
|
use gvect, only : gstart
|
||||||
#if defined(__CUDA)
|
|
||||||
USE cublas
|
|
||||||
#endif
|
|
||||||
|
|
||||||
IMPLICIT NONE
|
IMPLICIT NONE
|
||||||
INTEGER :: m_start, m_end ,ntemp
|
INTEGER :: m_start, m_end ,ntemp
|
||||||
|
@ -280,15 +277,10 @@ CONTAINS
|
||||||
CALL errore('ch_psi_all', 'non collin in gamma point not implemented',1)
|
CALL errore('ch_psi_all', 'non collin in gamma point not implemented',1)
|
||||||
ENDIF
|
ENDIF
|
||||||
|
|
||||||
#if defined(__CUDA)
|
|
||||||
!$acc host_data use_device(spsi, ps, evc)
|
!$acc host_data use_device(spsi, ps, evc)
|
||||||
CALL DGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd )
|
CALL MYDGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd )
|
||||||
if(gstart==2) CALL gpu_DGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd )
|
if(gstart==2) CALL MYDGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
#else
|
|
||||||
CALL DGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd )
|
|
||||||
if(gstart==2) CALL DGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd )
|
|
||||||
#endif
|
|
||||||
!$acc kernels
|
!$acc kernels
|
||||||
ps (:,:) = ps(:,:) * alpha_pv
|
ps (:,:) = ps(:,:) * alpha_pv
|
||||||
hpsi (:,:) = (0.d0, 0.d0)
|
hpsi (:,:) = (0.d0, 0.d0)
|
||||||
|
@ -297,7 +289,7 @@ CONTAINS
|
||||||
CALL mp_sum ( ps, intra_bgrp_comm )
|
CALL mp_sum ( ps, intra_bgrp_comm )
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!$acc host_data use_device(hpsi, ps, evc)
|
!$acc host_data use_device(hpsi, ps, evc)
|
||||||
CALL DGEMM ('N', 'N', 2*n, m, ntemp , 1.d0 , evc, 2*npwx, ps, nbnd, 1.d0 , hpsi, 2*npwx)
|
CALL MYDGEMM ('N', 'N', 2*n, m, ntemp , 1.d0 , evc, 2*npwx, ps, nbnd, 1.d0 , hpsi, 2*npwx)
|
||||||
!$acc end host_data
|
!$acc end host_data
|
||||||
!$acc kernels
|
!$acc kernels
|
||||||
spsi(:,:) = hpsi(:,:)
|
spsi(:,:) = hpsi(:,:)
|
||||||
|
|
|
@ -14,17 +14,16 @@ MODULE efermi_shift
|
||||||
!
|
!
|
||||||
! Define an abstract interface to use a callback
|
! Define an abstract interface to use a callback
|
||||||
ABSTRACT INTERFACE
|
ABSTRACT INTERFACE
|
||||||
SUBROUTINE def_symmetrization(def, irr)
|
SUBROUTINE def_symmetrization(def)
|
||||||
USE kinds, ONLY : DP
|
USE kinds, ONLY : DP
|
||||||
INTEGER :: irr
|
COMPLEX(DP), INTENT(inout) :: def(3)
|
||||||
COMPLEX(DP) :: def(3)
|
|
||||||
END SUBROUTINE
|
END SUBROUTINE
|
||||||
END INTERFACE
|
END INTERFACE
|
||||||
!
|
!
|
||||||
CONTAINS
|
CONTAINS
|
||||||
!
|
!
|
||||||
!-----------------------------------------------------------------------
|
!-----------------------------------------------------------------------
|
||||||
SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_def)
|
SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, sym_def)
|
||||||
!-----------------------------------------------------------------------
|
!-----------------------------------------------------------------------
|
||||||
!! This routine takes care of the effects of a shift of Ef, due to the
|
!! This routine takes care of the effects of a shift of Ef, due to the
|
||||||
!! perturbation, that can take place in a metal at q=0
|
!! perturbation, that can take place in a metal at q=0
|
||||||
|
@ -61,8 +60,6 @@ SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_de
|
||||||
REAL(DP), INTENT(IN), OPTIONAL :: becsum1((nhm*(nhm+1))/2, nat, nspin_mag)
|
REAL(DP), INTENT(IN), OPTIONAL :: becsum1((nhm*(nhm+1))/2, nat, nspin_mag)
|
||||||
!! becsum1 = wdelta * <psi|beta> <beta|psi>
|
!! becsum1 = wdelta * <psi|beta> <beta|psi>
|
||||||
!! (where wdelta is a Dirac-delta-like function)
|
!! (where wdelta is a Dirac-delta-like function)
|
||||||
INTEGER, INTENT(IN), OPTIONAL :: irr
|
|
||||||
!! index of the current irr. rep. Used only in sym_def.
|
|
||||||
PROCEDURE(def_symmetrization), OPTIONAL :: sym_def
|
PROCEDURE(def_symmetrization), OPTIONAL :: sym_def
|
||||||
!! Symmetrization routine for the fermi energy change
|
!! Symmetrization routine for the fermi energy change
|
||||||
!
|
!
|
||||||
|
@ -104,7 +101,7 @@ SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_de
|
||||||
!
|
!
|
||||||
! symmetrizes the Fermi energy shift
|
! symmetrizes the Fermi energy shift
|
||||||
!
|
!
|
||||||
IF (present(sym_def)) CALL sym_def(def, irr)
|
IF (present(sym_def)) CALL sym_def(def)
|
||||||
WRITE( stdout, '(5x,"Pert. #",i3,": Fermi energy shift (Ry) =",2es15.4)')&
|
WRITE( stdout, '(5x,"Pert. #",i3,": Fermi energy shift (Ry) =",2es15.4)')&
|
||||||
(ipert, def (ipert) , ipert = 1, npert )
|
(ipert, def (ipert) , ipert = 1, npert )
|
||||||
!
|
!
|
||||||
|
|
|
@ -147,6 +147,19 @@ MODULE lr_symm_base
|
||||||
LOGICAL :: minus_q, & ! if .TRUE. there is the symmetry sending q<->-q
|
LOGICAL :: minus_q, & ! if .TRUE. there is the symmetry sending q<->-q
|
||||||
invsymq ! if .TRUE. the small group of q has inversion
|
invsymq ! if .TRUE. the small group of q has inversion
|
||||||
!
|
!
|
||||||
|
! Symmetry representation of the perturbations
|
||||||
|
!
|
||||||
|
INTEGER :: lr_npert
|
||||||
|
!! Number of perturbations considered at the same time.
|
||||||
|
!! e.g., for phonons: dimension of the irreducible representation
|
||||||
|
!! e.g., for electric fields: 3
|
||||||
|
COMPLEX(DP), ALLOCATABLE :: upert(:, :, :)
|
||||||
|
!! Representation of the symmetry in the perturbation basis. Size (lr_npert, lr_npert, nsymq)
|
||||||
|
!! e.g., for phonons: transformation matrix of the patterns
|
||||||
|
!! e.g., for electric fields: transformation matrix of Cartesian vectors
|
||||||
|
COMPLEX(DP), ALLOCATABLE :: upert_mq(:, :)
|
||||||
|
!! Representation of the symmetry that transforms q to -q. Size (lr_npert, lr_npert)
|
||||||
|
!
|
||||||
END MODULE lr_symm_base
|
END MODULE lr_symm_base
|
||||||
!
|
!
|
||||||
MODULE lrus
|
MODULE lrus
|
||||||
|
|
|
@ -202,8 +202,7 @@ set(src_modules
|
||||||
w1gauss.f90
|
w1gauss.f90
|
||||||
deviatoric.f90
|
deviatoric.f90
|
||||||
# GPU
|
# GPU
|
||||||
random_numbers_gpu.f90
|
random_numbers_gpu.f90)
|
||||||
cuda_subroutines.f90)
|
|
||||||
qe_enable_cuda_fortran("${src_modules}")
|
qe_enable_cuda_fortran("${src_modules}")
|
||||||
qe_add_library(qe_modules ${src_modules})
|
qe_add_library(qe_modules ${src_modules})
|
||||||
|
|
||||||
|
|
|
@ -221,7 +221,6 @@ sockets.o
|
||||||
|
|
||||||
# GPU versions of modules
|
# GPU versions of modules
|
||||||
MODULES += \
|
MODULES += \
|
||||||
cuda_subroutines.o \
|
|
||||||
random_numbers_gpu.o
|
random_numbers_gpu.o
|
||||||
|
|
||||||
TLDEPS= libfox libla libfft libutil libmbd librxc libupf
|
TLDEPS= libfox libla libfft libutil libmbd librxc libupf
|
||||||
|
|
|
@ -1,39 +0,0 @@
|
||||||
!----------------------------------------------
|
|
||||||
! ... this file contains a number of subroutines optionally interfaced
|
|
||||||
! ... to cublas
|
|
||||||
!----------------------------------------------
|
|
||||||
|
|
||||||
SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
use cudafor
|
|
||||||
use cublas
|
|
||||||
#endif
|
|
||||||
implicit none
|
|
||||||
DOUBLE PRECISION :: ALPHA,BETA
|
|
||||||
INTEGER :: INCX,INCY,LDA,M,N
|
|
||||||
CHARACTER :: TRANS
|
|
||||||
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, X, Y
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
|
||||||
!
|
|
||||||
END SUBROUTINE cudaDGEMV
|
|
||||||
|
|
||||||
SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
|
||||||
#if defined(__CUDA)
|
|
||||||
use cudafor
|
|
||||||
use cublas
|
|
||||||
#endif
|
|
||||||
! .. Scalar Arguments ..
|
|
||||||
DOUBLE PRECISION :: ALPHA
|
|
||||||
INTEGER :: INCX, INCY, LDA, M, N
|
|
||||||
! .. Array Arguments ..
|
|
||||||
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
|
|
||||||
#if defined(__CUDA)
|
|
||||||
attributes(device) :: A, X, Y
|
|
||||||
#endif
|
|
||||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
|
||||||
|
|
||||||
END SUBROUTINE cudaDGER
|
|
|
@ -89,6 +89,7 @@ set(src_ph
|
||||||
PH/openfilq.f90
|
PH/openfilq.f90
|
||||||
PH/phcom.f90
|
PH/phcom.f90
|
||||||
PH/ph_restart.f90
|
PH/ph_restart.f90
|
||||||
|
PH/ph_set_upert.f90
|
||||||
PH/phescf.f90
|
PH/phescf.f90
|
||||||
PH/phq_init.f90
|
PH/phq_init.f90
|
||||||
PH/phq_readin.f90
|
PH/phq_readin.f90
|
||||||
|
@ -102,7 +103,6 @@ set(src_ph
|
||||||
PH/prepare_sym_analysis.f90
|
PH/prepare_sym_analysis.f90
|
||||||
PH/psidspsi.f90
|
PH/psidspsi.f90
|
||||||
PH/psymdvscf.f90
|
PH/psymdvscf.f90
|
||||||
PH/psyme.f90
|
|
||||||
PH/psym_dmag.f90
|
PH/psym_dmag.f90
|
||||||
PH/psym_dmage.f90
|
PH/psym_dmage.f90
|
||||||
PH/punch_plot_e.f90
|
PH/punch_plot_e.f90
|
||||||
|
@ -136,7 +136,6 @@ set(src_ph
|
||||||
PH/symdvscf.f90
|
PH/symdvscf.f90
|
||||||
PH/symdyn_munu.f90
|
PH/symdyn_munu.f90
|
||||||
PH/symdynph_gq.f90
|
PH/symdynph_gq.f90
|
||||||
PH/syme.f90
|
|
||||||
PH/symm.f90
|
PH/symm.f90
|
||||||
PH/symmorphic_or_nzb.f90
|
PH/symmorphic_or_nzb.f90
|
||||||
PH/swfc.f90
|
PH/swfc.f90
|
||||||
|
|
|
@ -108,13 +108,13 @@ phq_recover.o \
|
||||||
phq_setup.o \
|
phq_setup.o \
|
||||||
phq_summary.o \
|
phq_summary.o \
|
||||||
phqscf.o \
|
phqscf.o \
|
||||||
|
ph_set_upert.o \
|
||||||
polariz.o \
|
polariz.o \
|
||||||
print_clock_ph.o \
|
print_clock_ph.o \
|
||||||
prepare_q.o \
|
prepare_q.o \
|
||||||
prepare_sym_analysis.o \
|
prepare_sym_analysis.o \
|
||||||
psidspsi.o \
|
psidspsi.o \
|
||||||
psymdvscf.o \
|
psymdvscf.o \
|
||||||
psyme.o \
|
|
||||||
psym_dmag.o \
|
psym_dmag.o \
|
||||||
psym_dmage.o \
|
psym_dmage.o \
|
||||||
punch_plot_e.o \
|
punch_plot_e.o \
|
||||||
|
@ -147,7 +147,6 @@ sym_dmage.o \
|
||||||
symdvscf.o \
|
symdvscf.o \
|
||||||
symdyn_munu.o \
|
symdyn_munu.o \
|
||||||
symdynph_gq.o \
|
symdynph_gq.o \
|
||||||
syme.o \
|
|
||||||
symm.o \
|
symm.o \
|
||||||
symmorphic_or_nzb.o \
|
symmorphic_or_nzb.o \
|
||||||
swfc.o \
|
swfc.o \
|
||||||
|
|
|
@ -762,7 +762,7 @@ PROGRAM matdyn
|
||||||
OPEN (unit=2,file=flfrq ,status='unknown',form='formatted')
|
OPEN (unit=2,file=flfrq ,status='unknown',form='formatted')
|
||||||
WRITE(2, '(" &plot nbnd=",i4,", nks=",i4," /")') 3*nat, nq
|
WRITE(2, '(" &plot nbnd=",i4,", nks=",i4," /")') 3*nat, nq
|
||||||
DO n=1, nq
|
DO n=1, nq
|
||||||
WRITE(2, '(10x,3f10.6)') q(1,n), q(2,n), q(3,n)
|
WRITE(2, '(10x,4f10.6)') q(1,n), q(2,n), q(3,n), wq(n)
|
||||||
WRITE(2,'(6f10.4)') (freq(i,n), i=1,3*nat)
|
WRITE(2,'(6f10.4)') (freq(i,n), i=1,3*nat)
|
||||||
END DO
|
END DO
|
||||||
CLOSE(unit=2)
|
CLOSE(unit=2)
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
!
|
||||||
|
! Copyright (C) 2001-2018 Quantum ESPRESSO group
|
||||||
|
! This file is distributed under the terms of the
|
||||||
|
! GNU General Public License. See the file `License'
|
||||||
|
! in the root directory of the present distribution,
|
||||||
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||||
|
!
|
||||||
|
!------------------------------------------------------------------------------------------
|
||||||
|
SUBROUTINE ph_set_upert_phonon(irr)
|
||||||
|
!--------------------------------------------------------------------------------------
|
||||||
|
!! Set lr_npert, upert, and upert_mp for phonons in irreducible representation irr
|
||||||
|
!--------------------------------------------------------------------------------------
|
||||||
|
!
|
||||||
|
USE modes, ONLY : npert, t, tmq
|
||||||
|
USE control_ph, ONLY : lgamma_gamma
|
||||||
|
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq
|
||||||
|
!
|
||||||
|
IMPLICIT NONE
|
||||||
|
!
|
||||||
|
INTEGER, INTENT(IN) :: irr
|
||||||
|
!! irreducible representation
|
||||||
|
!
|
||||||
|
INTEGER :: ipert, jpert
|
||||||
|
!! Counter on perturbations
|
||||||
|
INTEGER :: isym
|
||||||
|
!! Counter on symmetries
|
||||||
|
!
|
||||||
|
! Set symmetry representation in lr_symm_base
|
||||||
|
!
|
||||||
|
lr_npert = npert(irr)
|
||||||
|
!
|
||||||
|
IF (lgamma_gamma) THEN
|
||||||
|
!
|
||||||
|
! If lgamma_gamma is true, symmetrization is not used.
|
||||||
|
! Set upert and upert_mq to 1.
|
||||||
|
!
|
||||||
|
IF (lr_npert /= 1) CALL errore('ph_set_upert_phonon', &
|
||||||
|
'lgamma_gamma is true, but lr_npert /= 1', 1)
|
||||||
|
!
|
||||||
|
ALLOCATE(upert(1, 1, 1))
|
||||||
|
upert(1, 1, 1) = (1.d0, 0.d0)
|
||||||
|
!
|
||||||
|
IF (minus_q) THEN
|
||||||
|
ALLOCATE(upert_mq(1, 1))
|
||||||
|
upert_mq(1, 1) = (1.d0, 0.d0)
|
||||||
|
ENDIF
|
||||||
|
!
|
||||||
|
ELSE
|
||||||
|
!
|
||||||
|
ALLOCATE(upert(lr_npert, lr_npert, nsymq))
|
||||||
|
!
|
||||||
|
DO isym = 1, nsymq
|
||||||
|
DO ipert = 1, lr_npert
|
||||||
|
DO jpert = 1, lr_npert
|
||||||
|
upert(jpert, ipert, isym) = t(jpert, ipert, isym, irr)
|
||||||
|
ENDDO
|
||||||
|
ENDDO
|
||||||
|
ENDDO
|
||||||
|
!
|
||||||
|
IF (minus_q) THEN
|
||||||
|
ALLOCATE(upert_mq(lr_npert, lr_npert))
|
||||||
|
DO ipert = 1, lr_npert
|
||||||
|
DO jpert = 1, lr_npert
|
||||||
|
upert_mq(jpert, ipert) = tmq(jpert, ipert, irr)
|
||||||
|
ENDDO
|
||||||
|
ENDDO
|
||||||
|
ENDIF ! minus_q
|
||||||
|
!
|
||||||
|
ENDIF
|
||||||
|
!
|
||||||
|
END SUBROUTINE ph_set_upert_phonon
|
||||||
|
!-----------------------------------------------------------------------------------------
|
||||||
|
!
|
||||||
|
!-----------------------------------------------------------------------------------------
|
||||||
|
SUBROUTINE ph_set_upert_e()
|
||||||
|
!--------------------------------------------------------------------------------------
|
||||||
|
!! Set lr_npert, upert, and upert_mp for electric field perturbation.
|
||||||
|
!--------------------------------------------------------------------------------------
|
||||||
|
!
|
||||||
|
USE symm_base, ONLY : s
|
||||||
|
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq
|
||||||
|
!
|
||||||
|
IMPLICIT NONE
|
||||||
|
!
|
||||||
|
INTEGER :: ipol, jpol
|
||||||
|
!! Counter on perturbations
|
||||||
|
INTEGER :: isym
|
||||||
|
!! Counter on symmetries
|
||||||
|
!
|
||||||
|
! Set symmetry representation in lr_symm_base
|
||||||
|
!
|
||||||
|
lr_npert = 3
|
||||||
|
!
|
||||||
|
ALLOCATE(upert(lr_npert, lr_npert, nsymq))
|
||||||
|
!
|
||||||
|
DO isym = 1, nsymq
|
||||||
|
DO ipol = 1, 3
|
||||||
|
DO jpol = 1, 3
|
||||||
|
upert(jpol, ipol, isym) = s(ipol, jpol, isym)
|
||||||
|
ENDDO
|
||||||
|
ENDDO
|
||||||
|
ENDDO
|
||||||
|
!
|
||||||
|
IF (minus_q) THEN
|
||||||
|
!
|
||||||
|
! upert_mq is the rotation matrix for symmetry S such that T * S * q = q + G.
|
||||||
|
! E field perturbation is applied only for q = 0, where T*q = q, i.e., S = identity.
|
||||||
|
! Thus, upert_mq is the 3*3 identity matrix.
|
||||||
|
!
|
||||||
|
ALLOCATE(upert_mq(lr_npert, lr_npert))
|
||||||
|
upert_mq = (0.d0, 0.d0)
|
||||||
|
DO ipol = 1, lr_npert
|
||||||
|
upert_mq(ipol, ipol) = (1.d0, 0.d0)
|
||||||
|
ENDDO
|
||||||
|
ENDIF ! minus_q
|
||||||
|
!
|
||||||
|
END SUBROUTINE ph_set_upert_e
|
||||||
|
!----------------------------------------------------------------------------------------
|
||||||
|
!
|
||||||
|
!----------------------------------------------------------------------------------------
|
||||||
|
SUBROUTINE ph_deallocate_upert()
|
||||||
|
!----------------------------------------------------------------------------------------
|
||||||
|
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq
|
||||||
|
IMPLICIT NONE
|
||||||
|
DEALLOCATE(upert)
|
||||||
|
IF (minus_q) DEALLOCATE(upert_mq)
|
||||||
|
!----------------------------------------------------------------------------------------
|
||||||
|
END SUBROUTINE ph_deallocate_upert
|
||||||
|
!----------------------------------------------------------------------------------------
|
|
@ -51,6 +51,10 @@ SUBROUTINE phescf()
|
||||||
IF (noncolin) ALLOCATE(int3_nc( nhm, nhm, nat, nspin, 3))
|
IF (noncolin) ALLOCATE(int3_nc( nhm, nhm, nat, nspin, 3))
|
||||||
ENDIF
|
ENDIF
|
||||||
!
|
!
|
||||||
|
! Set symmetry representation in lr_symm_base
|
||||||
|
!
|
||||||
|
CALL ph_set_upert_e()
|
||||||
|
!
|
||||||
! DFPT+U: dnsscf in the electric field calculation
|
! DFPT+U: dnsscf in the electric field calculation
|
||||||
! is the scf change of atomic occupations ns induced by the electric field.
|
! is the scf change of atomic occupations ns induced by the electric field.
|
||||||
! dnsscf_all_modes = dnsscf because nirr=1, number of perturbations = 3.
|
! dnsscf_all_modes = dnsscf because nirr=1, number of perturbations = 3.
|
||||||
|
@ -134,6 +138,8 @@ SUBROUTINE phescf()
|
||||||
IF (noncolin) DEALLOCATE(int3_nc)
|
IF (noncolin) DEALLOCATE(int3_nc)
|
||||||
ENDIF
|
ENDIF
|
||||||
!
|
!
|
||||||
|
CALL ph_deallocate_upert()
|
||||||
|
!
|
||||||
! DFPT+U
|
! DFPT+U
|
||||||
!
|
!
|
||||||
IF (lda_plus_u) THEN
|
IF (lda_plus_u) THEN
|
||||||
|
|
|
@ -72,6 +72,8 @@ SUBROUTINE phqscf
|
||||||
DO irr = 1, nirr
|
DO irr = 1, nirr
|
||||||
IF ( (comp_irr (irr)) .AND. (.NOT.done_irr (irr)) ) THEN
|
IF ( (comp_irr (irr)) .AND. (.NOT.done_irr (irr)) ) THEN
|
||||||
npe=npert(irr)
|
npe=npert(irr)
|
||||||
|
CALL ph_set_upert_phonon(irr)
|
||||||
|
!
|
||||||
ALLOCATE (drhoscfs( dfftp%nnr , nspin_mag, npe))
|
ALLOCATE (drhoscfs( dfftp%nnr , nspin_mag, npe))
|
||||||
imode0 = 0
|
imode0 = 0
|
||||||
DO irr1 = 1, irr - 1
|
DO irr1 = 1, irr - 1
|
||||||
|
@ -144,6 +146,8 @@ SUBROUTINE phqscf
|
||||||
IF (okpaw) DEALLOCATE (int3_paw)
|
IF (okpaw) DEALLOCATE (int3_paw)
|
||||||
IF (noncolin) DEALLOCATE(int3_nc)
|
IF (noncolin) DEALLOCATE(int3_nc)
|
||||||
ENDIF
|
ENDIF
|
||||||
|
CALL ph_deallocate_upert()
|
||||||
|
!
|
||||||
tcpu = get_clock ('PHONON')
|
tcpu = get_clock ('PHONON')
|
||||||
!
|
!
|
||||||
DEALLOCATE (drhoscfs)
|
DEALLOCATE (drhoscfs)
|
||||||
|
|
|
@ -7,25 +7,23 @@
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
!-----------------------------------------------------------------------
|
!-----------------------------------------------------------------------
|
||||||
SUBROUTINE psymdvscf (nper, irr, dvtosym)
|
SUBROUTINE psymdvscf (dvtosym)
|
||||||
!-----------------------------------------------------------------------
|
!-----------------------------------------------------------------------
|
||||||
!! p-symmetrize the potential.
|
!! p-symmetrize the potential.
|
||||||
|
!!
|
||||||
|
!! The real space points of dv is distributed, but symmetry may map a point in one
|
||||||
|
!! core to a point in a different core. Hence, gather dvscf in real space, symmetrize,
|
||||||
|
!! and then scatter back.
|
||||||
!
|
!
|
||||||
USE kinds, ONLY : DP
|
USE kinds, ONLY : DP
|
||||||
USE noncollin_module, ONLY : nspin_mag
|
|
||||||
USE mp_bands, ONLY : me_bgrp
|
|
||||||
USE fft_base, ONLY : dfftp
|
USE fft_base, ONLY : dfftp
|
||||||
|
USE noncollin_module, ONLY : nspin_mag
|
||||||
USE scatter_mod, ONLY : cgather_sym
|
USE scatter_mod, ONLY : cgather_sym
|
||||||
|
USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert
|
||||||
USE lr_symm_base, ONLY : nsymq, minus_q
|
|
||||||
!
|
!
|
||||||
IMPLICIT NONE
|
IMPLICIT NONE
|
||||||
!
|
!
|
||||||
INTEGER :: nper
|
COMPLEX(DP) :: dvtosym(dfftp%nnr, nspin_mag, lr_npert)
|
||||||
!! the number of perturbations
|
|
||||||
INTEGER :: irr
|
|
||||||
!! the representation under consideration
|
|
||||||
COMPLEX(DP) :: dvtosym(dfftp%nnr,nspin_mag,nper)
|
|
||||||
!! the potential to symmetrize
|
!! the potential to symmetrize
|
||||||
!
|
!
|
||||||
! ... local variable
|
! ... local variable
|
||||||
|
@ -33,27 +31,30 @@ SUBROUTINE psymdvscf (nper, irr, dvtosym)
|
||||||
#if defined (__MPI)
|
#if defined (__MPI)
|
||||||
!
|
!
|
||||||
INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp
|
INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp
|
||||||
|
!
|
||||||
COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:)
|
COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:)
|
||||||
! the potential to symm
|
! the potential to symm
|
||||||
|
IF (nsymq == 1 .AND. (.NOT.minus_q) ) RETURN
|
||||||
|
|
||||||
IF (nsymq.EQ.1.AND. (.NOT.minus_q) ) RETURN
|
|
||||||
CALL start_clock ('psymdvscf')
|
CALL start_clock ('psymdvscf')
|
||||||
|
!
|
||||||
ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, nper))
|
ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, lr_npert))
|
||||||
|
!
|
||||||
DO iper = 1, nper
|
! Gather real-space points
|
||||||
|
!
|
||||||
|
DO iper = 1, lr_npert
|
||||||
DO is = 1, nspin_mag
|
DO is = 1, nspin_mag
|
||||||
CALL cgather_sym (dfftp, dvtosym (:, is, iper), ddvtosym (:, is, iper) )
|
CALL cgather_sym (dfftp, dvtosym (:, is, iper), ddvtosym (:, is, iper) )
|
||||||
ENDDO
|
ENDDO
|
||||||
|
|
||||||
ENDDO
|
ENDDO
|
||||||
|
!
|
||||||
CALL symdvscf (nper, irr, ddvtosym)
|
! Symmetrize
|
||||||
|
!
|
||||||
|
CALL symdvscf (ddvtosym)
|
||||||
|
!
|
||||||
|
! Scatter back the real-space points
|
||||||
|
!
|
||||||
nxyp = dfftp%nr1x * dfftp%my_nr2p
|
nxyp = dfftp%nr1x * dfftp%my_nr2p
|
||||||
DO iper = 1, nper
|
DO iper = 1, lr_npert
|
||||||
DO is = 1, nspin_mag
|
DO is = 1, nspin_mag
|
||||||
DO ir3 = 1, dfftp%my_nr3p
|
DO ir3 = 1, dfftp%my_nr3p
|
||||||
ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1)
|
ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1)
|
||||||
|
@ -63,14 +64,12 @@ SUBROUTINE psymdvscf (nper, irr, dvtosym)
|
||||||
ENDDO
|
ENDDO
|
||||||
ENDDO
|
ENDDO
|
||||||
DEALLOCATE (ddvtosym)
|
DEALLOCATE (ddvtosym)
|
||||||
|
!
|
||||||
CALL stop_clock ('psymdvscf')
|
CALL stop_clock ('psymdvscf')
|
||||||
#else
|
#else
|
||||||
|
!
|
||||||
CALL symdvscf (nper, irr, dvtosym)
|
CALL symdvscf(dvtosym)
|
||||||
|
!
|
||||||
#endif
|
#endif
|
||||||
|
!
|
||||||
RETURN
|
|
||||||
|
|
||||||
END SUBROUTINE psymdvscf
|
END SUBROUTINE psymdvscf
|
||||||
|
|
|
@ -1,65 +0,0 @@
|
||||||
!
|
|
||||||
! Copyright (C) 2001-2008 Quantum ESPRESSO group
|
|
||||||
! This file is distributed under the terms of the
|
|
||||||
! GNU General Public License. See the file `License'
|
|
||||||
! in the root directory of the present distribution,
|
|
||||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
||||||
!
|
|
||||||
!
|
|
||||||
!-----------------------------------------------------------------------
|
|
||||||
SUBROUTINE psyme (dvtosym)
|
|
||||||
!-----------------------------------------------------------------------
|
|
||||||
!! p-symmetrize the charge density.
|
|
||||||
!
|
|
||||||
USE kinds, ONLY : DP
|
|
||||||
USE fft_base, ONLY : dfftp
|
|
||||||
USE noncollin_module, ONLY : nspin_mag
|
|
||||||
USE mp_bands, ONLY : me_bgrp
|
|
||||||
USE fft_base, ONLY : dfftp
|
|
||||||
USE scatter_mod, ONLY : cgather_sym
|
|
||||||
!
|
|
||||||
IMPLICIT NONE
|
|
||||||
!
|
|
||||||
COMPLEX(DP) :: dvtosym (dfftp%nnr, nspin_mag, 3)
|
|
||||||
!! the potential to symmetrize
|
|
||||||
!
|
|
||||||
! ... local variables
|
|
||||||
!
|
|
||||||
#if defined (__MPI)
|
|
||||||
!
|
|
||||||
INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp
|
|
||||||
COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:)
|
|
||||||
! the potential to symmet
|
|
||||||
!
|
|
||||||
!
|
|
||||||
ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, 3))
|
|
||||||
|
|
||||||
DO iper = 1, 3
|
|
||||||
DO is = 1, nspin_mag
|
|
||||||
CALL cgather_sym (dfftp,dvtosym (:, is, iper), ddvtosym (:, is, iper) )
|
|
||||||
ENDDO
|
|
||||||
|
|
||||||
ENDDO
|
|
||||||
|
|
||||||
CALL syme (ddvtosym)
|
|
||||||
|
|
||||||
nxyp = dfftp%nr1x * dfftp%my_nr2p
|
|
||||||
DO iper = 1, 3
|
|
||||||
DO is = 1, nspin_mag
|
|
||||||
DO ir3 = 1, dfftp%my_nr3p
|
|
||||||
ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1)
|
|
||||||
ioff_tg = dfftp%nr1x * dfftp%nr2x * (dfftp%my_i0r3p+ir3-1) + dfftp%nr1x * dfftp%my_i0r2p
|
|
||||||
CALL zcopy (nxyp, ddvtosym (ioff_tg+1, is, iper), 1, dvtosym (ioff+1, is, iper), 1)
|
|
||||||
END DO
|
|
||||||
ENDDO
|
|
||||||
ENDDO
|
|
||||||
|
|
||||||
DEALLOCATE (ddvtosym)
|
|
||||||
|
|
||||||
#else
|
|
||||||
CALL syme (dvtosym)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
RETURN
|
|
||||||
|
|
||||||
END SUBROUTINE psyme
|
|
|
@ -234,7 +234,7 @@ subroutine solve_e
|
||||||
call mp_sum ( dvscfout, inter_pool_comm )
|
call mp_sum ( dvscfout, inter_pool_comm )
|
||||||
IF (okpaw) call mp_sum ( dbecsum, inter_pool_comm )
|
IF (okpaw) call mp_sum ( dbecsum, inter_pool_comm )
|
||||||
if (.not.lgamma_gamma) then
|
if (.not.lgamma_gamma) then
|
||||||
call psyme (dvscfout)
|
call psymdvscf(dvscfout)
|
||||||
IF ( noncolin.and.domag ) CALL psym_dmage(dvscfout)
|
IF ( noncolin.and.domag ) CALL psym_dmage(dvscfout)
|
||||||
endif
|
endif
|
||||||
!
|
!
|
||||||
|
|
|
@ -315,7 +315,7 @@ subroutine solve_e_fpol( iw )
|
||||||
! for the three polarizations - symmetrize it
|
! for the three polarizations - symmetrize it
|
||||||
!
|
!
|
||||||
call mp_sum ( dvscfout, inter_pool_comm )
|
call mp_sum ( dvscfout, inter_pool_comm )
|
||||||
call psyme (dvscfout)
|
call psymdvscf(dvscfout)
|
||||||
!
|
!
|
||||||
! save the symmetrized linear charge response to file
|
! save the symmetrized linear charge response to file
|
||||||
! calculate the corresponding linear potential response
|
! calculate the corresponding linear potential response
|
||||||
|
|
|
@ -395,9 +395,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
||||||
!
|
!
|
||||||
IF (lmetq0) THEN
|
IF (lmetq0) THEN
|
||||||
IF (okpaw) THEN
|
IF (okpaw) THEN
|
||||||
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, dbecsum, becsum1, irr, sym_def)
|
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, dbecsum, becsum1, sym_def)
|
||||||
ELSE
|
ELSE
|
||||||
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, irr=irr, sym_def=sym_def)
|
CALL ef_shift(npe, dos_ef, ldos, drhoscfh, sym_def=sym_def)
|
||||||
ENDIF
|
ENDIF
|
||||||
ENDIF
|
ENDIF
|
||||||
!
|
!
|
||||||
|
@ -406,7 +406,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
||||||
! Here we symmetrize them ...
|
! Here we symmetrize them ...
|
||||||
!
|
!
|
||||||
IF (.not.lgamma_gamma) THEN
|
IF (.not.lgamma_gamma) THEN
|
||||||
call psymdvscf (npe, irr, drhoscfh)
|
CALL psymdvscf(drhoscfh)
|
||||||
IF ( noncolin.and.domag ) CALL psym_dmag( npe, irr, drhoscfh)
|
IF ( noncolin.and.domag ) CALL psym_dmag( npe, irr, drhoscfh)
|
||||||
IF (okpaw) THEN
|
IF (okpaw) THEN
|
||||||
IF (minus_q) CALL PAW_dumqsymmetrize(dbecsum,npe,irr, npertx,irotmq,rtau,xq,tmq)
|
IF (minus_q) CALL PAW_dumqsymmetrize(dbecsum,npe,irr, npertx,irotmq,rtau,xq,tmq)
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
!---------------------------------------------------------------------
|
!---------------------------------------------------------------------
|
||||||
MODULE sym_def_module
|
MODULE sym_def_module
|
||||||
CONTAINS
|
CONTAINS
|
||||||
subroutine sym_def (def, irr)
|
SUBROUTINE sym_def(def)
|
||||||
!---------------------------------------------------------------------
|
!---------------------------------------------------------------------
|
||||||
!! Symmetrizes the first order changes of the Fermi energies of an
|
!! Symmetrizes the first order changes of the Fermi energies of an
|
||||||
!! irreducible representation. These objects are defined complex because
|
!! irreducible representation. These objects are defined complex because
|
||||||
|
@ -16,47 +16,41 @@ subroutine sym_def (def, irr)
|
||||||
!
|
!
|
||||||
!! Used in the q=0 metallic case only.
|
!! Used in the q=0 metallic case only.
|
||||||
!
|
!
|
||||||
USE kinds, only : DP
|
USE kinds, ONLY : DP
|
||||||
USE modes, ONLY : npert, t, tmq
|
|
||||||
USE control_ph, ONLY : lgamma_gamma
|
USE control_ph, ONLY : lgamma_gamma
|
||||||
|
USE lr_symm_base, ONLY : minus_q, nsymq, lr_npert, upert, upert_mq
|
||||||
USE lr_symm_base, ONLY : minus_q, nsymq
|
!
|
||||||
|
IMPLICIT NONE
|
||||||
implicit none
|
!
|
||||||
|
COMPLEX(DP), INTENT(inout) :: def(3)
|
||||||
integer :: irr
|
|
||||||
!! input: the representation under consideration
|
|
||||||
complex(DP) :: def(3)
|
|
||||||
!! inp/out: the fermi energy changes.
|
!! inp/out: the fermi energy changes.
|
||||||
!! NB: def(3) should be def(npertx), but it is used only at Gamma
|
!! NB: def(3) should be def(npertx), but it is used only at Gamma
|
||||||
!! where the dimension of irreps never exceeds 3.
|
!! where the dimension of irreps never exceeds 3.
|
||||||
!
|
!
|
||||||
! ... local variables
|
! ... local variables
|
||||||
!
|
!
|
||||||
integer :: ipert, jpert, isym, irot
|
INTEGER :: ipert, jpert, isym
|
||||||
! counter on perturbations
|
! counter on perturbations
|
||||||
! counter on perturbations
|
! counter on perturbations
|
||||||
! counter on symmetries
|
! counter on symmetries
|
||||||
! the rotation
|
!
|
||||||
|
COMPLEX(DP) :: w_def(3)
|
||||||
complex(DP) :: w_def(3)
|
|
||||||
! the fermi energy changes (work array)
|
! the fermi energy changes (work array)
|
||||||
|
!
|
||||||
IF (lgamma_gamma) RETURN
|
IF (lgamma_gamma) RETURN
|
||||||
if (nsymq == 1 .and. (.not.minus_q) ) return
|
if (nsymq == 1 .and. (.not.minus_q) ) return
|
||||||
if (npert(irr) > 3) CALL errore("sym_def", "npert(irr) exceeds 3", 1)
|
if (lr_npert > 3) CALL errore("sym_def", "lr_npert cannot exceed 3 at q=0", 1)
|
||||||
!
|
!
|
||||||
! first the symmetrization S(irotmq)*q = -q + Gi if necessary
|
! first the symmetrization S(irotmq)*q = -q + Gi if necessary
|
||||||
!
|
!
|
||||||
if (minus_q) then
|
if (minus_q) then
|
||||||
w_def = (0.d0, 0.d0)
|
w_def = (0.d0, 0.d0)
|
||||||
do ipert = 1, npert (irr)
|
do ipert = 1, lr_npert
|
||||||
do jpert = 1, npert (irr)
|
do jpert = 1, lr_npert
|
||||||
w_def (ipert) = w_def (ipert) + tmq (jpert, ipert, irr) &
|
w_def(ipert) = w_def(ipert) + upert_mq(jpert, ipert) * def(jpert)
|
||||||
* def (jpert)
|
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
do ipert = 1, npert (irr)
|
do ipert = 1, lr_npert
|
||||||
def(ipert) = 0.5d0 * (def(ipert) + CONJG(w_def(ipert)) )
|
def(ipert) = 0.5d0 * (def(ipert) + CONJG(w_def(ipert)) )
|
||||||
enddo
|
enddo
|
||||||
endif
|
endif
|
||||||
|
@ -64,12 +58,10 @@ subroutine sym_def (def, irr)
|
||||||
! Here we symmetrize with respect to the small group of q
|
! Here we symmetrize with respect to the small group of q
|
||||||
!
|
!
|
||||||
w_def = (0.d0, 0.d0)
|
w_def = (0.d0, 0.d0)
|
||||||
do ipert = 1, npert (irr)
|
do ipert = 1, lr_npert
|
||||||
do isym = 1, nsymq
|
do isym = 1, nsymq
|
||||||
irot = isym
|
do jpert = 1, lr_npert
|
||||||
do jpert = 1, npert (irr)
|
w_def(ipert) = w_def(ipert) + upert(jpert, ipert, isym) * def(jpert)
|
||||||
w_def (ipert) = w_def (ipert) + t (jpert, ipert, irot, irr) &
|
|
||||||
* def (jpert)
|
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
|
@ -77,7 +69,6 @@ subroutine sym_def (def, irr)
|
||||||
! normalize and exit
|
! normalize and exit
|
||||||
!
|
!
|
||||||
def = w_def / DBLE(nsymq)
|
def = w_def / DBLE(nsymq)
|
||||||
|
!
|
||||||
return
|
END SUBROUTINE sym_def
|
||||||
end subroutine sym_def
|
|
||||||
END MODULE sym_def_module
|
END MODULE sym_def_module
|
||||||
|
|
|
@ -55,6 +55,7 @@ SUBROUTINE sym_dns_wrapper (ldim, dns_cart, dns_pattern)
|
||||||
npe = npert(irr)
|
npe = npert(irr)
|
||||||
! allocate
|
! allocate
|
||||||
ALLOCATE (dns_aux(ldim,ldim,nspin,nat,npe))
|
ALLOCATE (dns_aux(ldim,ldim,nspin,nat,npe))
|
||||||
|
CALL ph_set_upert_phonon(irr)
|
||||||
! pack
|
! pack
|
||||||
dns_aux(:,:,:,:,1:npe) = dns_pattern(:,:,:,:,imode0:imode0-1+npe)
|
dns_aux(:,:,:,:,1:npe) = dns_pattern(:,:,:,:,imode0:imode0-1+npe)
|
||||||
! symmetrize
|
! symmetrize
|
||||||
|
@ -63,6 +64,7 @@ SUBROUTINE sym_dns_wrapper (ldim, dns_cart, dns_pattern)
|
||||||
dns_pattern(:,:,:,:,imode0:imode0-1+npe) = dns_aux(:,:,:,:,1:npe)
|
dns_pattern(:,:,:,:,imode0:imode0-1+npe) = dns_aux(:,:,:,:,1:npe)
|
||||||
! deallocate
|
! deallocate
|
||||||
DEALLOCATE (dns_aux)
|
DEALLOCATE (dns_aux)
|
||||||
|
CALL ph_deallocate_upert()
|
||||||
! advance the counter
|
! advance the counter
|
||||||
imode0 = imode0 + npe
|
imode0 = imode0 + npe
|
||||||
ENDDO
|
ENDDO
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
!---------------------------------------------------------------------
|
!---------------------------------------------------------------------
|
||||||
subroutine symdvscf (nper, irr, dvtosym)
|
subroutine symdvscf (dvtosym)
|
||||||
!---------------------------------------------------------------------
|
!---------------------------------------------------------------------
|
||||||
!! Symmetrize the self-consistent potential of the perturbations
|
!! Symmetrize the self-consistent potential of the perturbations
|
||||||
!! belonging to an irreducible representation.
|
!! belonging to an irreducible representation.
|
||||||
|
@ -22,16 +22,12 @@ subroutine symdvscf (nper, irr, dvtosym)
|
||||||
USE cell_base, ONLY : at
|
USE cell_base, ONLY : at
|
||||||
USE symm_base, ONLY : s, ft, t_rev
|
USE symm_base, ONLY : s, ft, t_rev
|
||||||
USE noncollin_module, ONLY : nspin_lsda, nspin_mag
|
USE noncollin_module, ONLY : nspin_lsda, nspin_mag
|
||||||
USE modes, ONLY : t, tmq
|
USE control_lr, ONLY : lgamma
|
||||||
USE lr_symm_base, ONLY : minus_q, irotmq, nsymq, gi, gimq
|
USE lr_symm_base, ONLY : minus_q, irotmq, nsymq, gi, gimq, lr_npert, upert, upert_mq
|
||||||
!
|
!
|
||||||
implicit none
|
implicit none
|
||||||
!
|
!
|
||||||
integer :: nper
|
complex(DP) :: dvtosym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x, nspin_mag, lr_npert)
|
||||||
!! the number of perturbations
|
|
||||||
integer :: irr
|
|
||||||
!! the representation under conside
|
|
||||||
complex(DP) :: dvtosym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x,nspin_mag,nper)
|
|
||||||
!! the potential to be symmetrized
|
!! the potential to be symmetrized
|
||||||
!
|
!
|
||||||
! ... local variables
|
! ... local variables
|
||||||
|
@ -48,23 +44,37 @@ subroutine symdvscf (nper, irr, dvtosym)
|
||||||
! auxiliary space
|
! auxiliary space
|
||||||
! the multiplication factor
|
! the multiplication factor
|
||||||
! the phase factor
|
! the phase factor
|
||||||
|
|
||||||
if (nsymq == 1.and. (.not.minus_q) ) return
|
|
||||||
call start_clock ('symdvscf')
|
|
||||||
|
|
||||||
allocate (dvsym( dfftp%nr1x , dfftp%nr2x , dfftp%nr3x , nper))
|
|
||||||
allocate (add_dvsym(nper))
|
|
||||||
!
|
!
|
||||||
! if necessary we symmetrize with respect to S(irotmq)*q = -q + Gi
|
! If there is no symmetry other than identity, return.
|
||||||
|
IF (nsymq == 1 .AND. (.NOT.minus_q) ) RETURN
|
||||||
|
!
|
||||||
|
CALL start_clock ('symdvscf')
|
||||||
|
!
|
||||||
|
ALLOCATE(dvsym(dfftp%nr1x, dfftp%nr2x, dfftp%nr3x, lr_npert))
|
||||||
|
ALLOCATE(add_dvsym(lr_npert))
|
||||||
!
|
!
|
||||||
n(1) = tpi / DBLE (dfftp%nr1)
|
n(1) = tpi / DBLE (dfftp%nr1)
|
||||||
n(2) = tpi / DBLE (dfftp%nr2)
|
n(2) = tpi / DBLE (dfftp%nr2)
|
||||||
n(3) = tpi / DBLE (dfftp%nr3)
|
n(3) = tpi / DBLE (dfftp%nr3)
|
||||||
|
!
|
||||||
CALL scale_sym_ops( nsymq, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
CALL scale_sym_ops( nsymq, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
||||||
s_scaled, ftau )
|
s_scaled, ftau )
|
||||||
|
!
|
||||||
if (minus_q) then
|
! if necessary we symmetrize with respect to S(irotmq)*q = -q + Gi
|
||||||
|
! (time reversal + spatial symmetry S(irotmq))
|
||||||
|
!
|
||||||
|
IF (minus_q) THEN
|
||||||
|
IF (lgamma) THEN
|
||||||
|
!
|
||||||
|
! Special case: q = Gamma. S(irotmq) = I, Gi = 0.
|
||||||
|
!
|
||||||
|
DO is = 1, nspin_lsda
|
||||||
|
DO ipert = 1, lr_npert
|
||||||
|
dvtosym(:,:,:,is,ipert) = CMPLX(DBLE(dvtosym(:,:,:,is,ipert)), 0.d0, KIND=DP)
|
||||||
|
ENDDO
|
||||||
|
ENDDO
|
||||||
|
!
|
||||||
|
ELSE ! .NOT. lgamma
|
||||||
gf(:) = gimq (1) * at (1, :) * n(:) + &
|
gf(:) = gimq (1) * at (1, :) * n(:) + &
|
||||||
gimq (2) * at (2, :) * n(:) + &
|
gimq (2) * at (2, :) * n(:) + &
|
||||||
gimq (3) * at (3, :) * n(:)
|
gimq (3) * at (3, :) * n(:)
|
||||||
|
@ -76,10 +86,10 @@ subroutine symdvscf (nper, irr, dvtosym)
|
||||||
do i = 1, dfftp%nr1
|
do i = 1, dfftp%nr1
|
||||||
CALL rotate_grid_point(s_scaled(1,1,irotmq), ftau(1,irotmq), &
|
CALL rotate_grid_point(s_scaled(1,1,irotmq), ftau(1,irotmq), &
|
||||||
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
|
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
|
||||||
do ipert = 1, nper
|
do ipert = 1, lr_npert
|
||||||
aux2 = (0.d0, 0.d0)
|
aux2 = (0.d0, 0.d0)
|
||||||
do jpert = 1, nper
|
do jpert = 1, lr_npert
|
||||||
aux2 = aux2 + tmq (jpert, ipert, irr) * &
|
aux2 = aux2 + upert_mq(jpert, ipert) * &
|
||||||
dvtosym (ri, rj, rk, is, jpert) * phase (1)
|
dvtosym (ri, rj, rk, is, jpert) * phase (1)
|
||||||
enddo
|
enddo
|
||||||
dvsym (i, j, k, ipert) = (dvtosym (i, j, k, is, ipert) +&
|
dvsym (i, j, k, ipert) = (dvtosym (i, j, k, is, ipert) +&
|
||||||
|
@ -91,11 +101,17 @@ subroutine symdvscf (nper, irr, dvtosym)
|
||||||
enddo
|
enddo
|
||||||
phase (1) = phase (1) * term (3, 1)
|
phase (1) = phase (1) * term (3, 1)
|
||||||
enddo
|
enddo
|
||||||
do ipert = 1, nper
|
do ipert = 1, lr_npert
|
||||||
dvtosym(:, :, :, is, ipert) = dvsym (:, :, :, ipert)
|
dvtosym(:, :, :, is, ipert) = dvsym (:, :, :, ipert)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
endif
|
ENDIF ! lgamma
|
||||||
|
ENDIF ! minus_q
|
||||||
|
!
|
||||||
|
! If there no spatial symmetry other than identity, return.
|
||||||
|
!
|
||||||
|
IF (nsymq == 1) RETURN
|
||||||
|
!
|
||||||
!
|
!
|
||||||
! Here we symmetrize with respect to the small group of q
|
! Here we symmetrize with respect to the small group of q
|
||||||
!
|
!
|
||||||
|
@ -119,13 +135,10 @@ subroutine symdvscf (nper, irr, dvtosym)
|
||||||
CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), &
|
CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), &
|
||||||
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
|
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
|
||||||
add_dvsym(:) = (0.d0, 0.d0)
|
add_dvsym(:) = (0.d0, 0.d0)
|
||||||
do ipert = 1, nper
|
do ipert = 1, lr_npert
|
||||||
do jpert = 1, nper
|
do jpert = 1, lr_npert
|
||||||
add_dvsym(ipert) = add_dvsym(ipert) + t (jpert, ipert, irot, irr) * &
|
add_dvsym(ipert) = add_dvsym(ipert) + upert(jpert, ipert, irot) * &
|
||||||
dvtosym (ri, rj, rk, is, jpert) * phase (isym)
|
dvtosym (ri, rj, rk, is, jpert) * phase (isym)
|
||||||
!dvsym (i, j, k, ipert) = dvsym (i, j, k, ipert) + &
|
|
||||||
! t (jpert, ipert, irot, irr) * &
|
|
||||||
! dvtosym (ri, rj, rk, is, jpert) * phase (isym)
|
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
if (t_rev(isym)==0) then
|
if (t_rev(isym)==0) then
|
||||||
|
@ -147,7 +160,7 @@ subroutine symdvscf (nper, irr, dvtosym)
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
do ipert = 1, nper
|
do ipert = 1, lr_npert
|
||||||
dvtosym(:,:,:,is,ipert) = dvsym(:,:,:,ipert) / DBLE (nsymq)
|
dvtosym(:,:,:,is,ipert) = dvsym(:,:,:,ipert) / DBLE (nsymq)
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
|
|
|
@ -1,78 +0,0 @@
|
||||||
!
|
|
||||||
! Copyright (C) 2001-2008 Quantum ESPRESSO group
|
|
||||||
! This file is distributed under the terms of the
|
|
||||||
! GNU General Public License. See the file `License'
|
|
||||||
! in the root directory of the present distribution,
|
|
||||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
||||||
!
|
|
||||||
!
|
|
||||||
!---------------------------------------------------------------------
|
|
||||||
subroutine syme (dvsym)
|
|
||||||
!---------------------------------------------------------------------
|
|
||||||
!! This routine symmetrize the change of the potential due to an
|
|
||||||
!! electric field perturbation. It is assumed that the perturbations
|
|
||||||
!! are on the basis of the crystal.
|
|
||||||
!
|
|
||||||
USE fft_base, only : dfftp
|
|
||||||
USE symm_base, only : nsym, s, ft
|
|
||||||
USE noncollin_module, only : nspin_lsda, nspin_mag
|
|
||||||
USE kinds, only : DP
|
|
||||||
|
|
||||||
implicit none
|
|
||||||
|
|
||||||
complex(DP) :: dvsym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x,nspin_mag,3)
|
|
||||||
!! the potential to symmetrize
|
|
||||||
|
|
||||||
! ... local variables
|
|
||||||
|
|
||||||
complex(DP), allocatable :: aux(:,:,:,:)
|
|
||||||
! auxiliary quantity
|
|
||||||
|
|
||||||
integer :: ftau(3,nsym), s_scaled(3,3,nsym)
|
|
||||||
integer :: is, ri, rj, rk, i, j, k, irot, ipol, jpol
|
|
||||||
! counter on spin polarization
|
|
||||||
! the rotated points
|
|
||||||
! the point
|
|
||||||
! counter on symmetries
|
|
||||||
! counter on polarizations
|
|
||||||
|
|
||||||
do is = 1, nspin_lsda
|
|
||||||
do ipol = 1, 3
|
|
||||||
dvsym(:,:,:,is,ipol) = CMPLX(DBLE(dvsym(:,:,:,is,ipol)),0.d0,kind=DP)
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
if (nsym == 1) return
|
|
||||||
CALL scale_sym_ops( nsym, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, &
|
|
||||||
s_scaled, ftau )
|
|
||||||
allocate (aux(dfftp%nr1x , dfftp%nr2x , dfftp%nr3x , 3))
|
|
||||||
do is = 1, nspin_lsda
|
|
||||||
do ipol = 1, 3
|
|
||||||
aux(:,:,:,ipol) = dvsym(:,:,:,is,ipol)
|
|
||||||
dvsym(:,:,:,is,ipol) = (0.d0, 0.d0)
|
|
||||||
enddo
|
|
||||||
!
|
|
||||||
! symmmetrize
|
|
||||||
!
|
|
||||||
do k = 1, dfftp%nr3
|
|
||||||
do j = 1, dfftp%nr2
|
|
||||||
do i = 1, dfftp%nr1
|
|
||||||
do irot = 1, nsym
|
|
||||||
CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), &
|
|
||||||
i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk)
|
|
||||||
do ipol = 1, 3
|
|
||||||
do jpol = 1, 3
|
|
||||||
dvsym(i,j,k,is,ipol) = dvsym(i,j,k,is,ipol) + &
|
|
||||||
s(ipol,jpol,irot) * aux(ri,rj,rk,jpol)
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
do ipol = 1, 3
|
|
||||||
dvsym(:,:,:,is,ipol) = dvsym(:,:,:,is,ipol) / DBLE(nsym)
|
|
||||||
enddo
|
|
||||||
enddo
|
|
||||||
deallocate (aux)
|
|
||||||
return
|
|
||||||
end subroutine syme
|
|
|
@ -149,7 +149,7 @@ subroutine zstar_eu_us
|
||||||
!
|
!
|
||||||
call dv_of_drho (dvscf (:, :, ipol), .false.)
|
call dv_of_drho (dvscf (:, :, ipol), .false.)
|
||||||
enddo
|
enddo
|
||||||
call psyme (dvscf)
|
call psymdvscf(dvscf)
|
||||||
|
|
||||||
#ifdef TIMINIG_ZSTAR_US
|
#ifdef TIMINIG_ZSTAR_US
|
||||||
call stop_clock('zstar_us_3')
|
call stop_clock('zstar_us_3')
|
||||||
|
|
|
@ -19,6 +19,8 @@ input_description -distribution {Quantum ESPRESSO} -package PWscf -program d3hes
|
||||||
(3) run phonon
|
(3) run phonon
|
||||||
|
|
||||||
Please note that filhess in d3hess input and dftd3_hess in phonon input, if given, should match.
|
Please note that filhess in d3hess input and dftd3_hess in phonon input, if given, should match.
|
||||||
|
Please also note that second derivatives of the three-body term of d3 dispersion are not implemented,
|
||||||
|
and phonon calculations with d3 should be run with dftd3_threebody=.false. in the SCF.
|
||||||
|
|
||||||
@b {Structure of the input data:}
|
@b {Structure of the input data:}
|
||||||
============================
|
============================
|
||||||
|
|
|
@ -19,10 +19,6 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot)
|
||||||
#if defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
USE cudafor
|
USE cudafor
|
||||||
USE cublas
|
USE cublas
|
||||||
#else
|
|
||||||
#define cublasZgemm Zgemm
|
|
||||||
#define cublasDGEMM Dgemm
|
|
||||||
#define cudaDGER Dger
|
|
||||||
#endif
|
#endif
|
||||||
USE kinds, ONLY : DP
|
USE kinds, ONLY : DP
|
||||||
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
USE ions_base, ONLY : nat, ntyp => nsp, ityp
|
||||||
|
|
|
@ -222,3 +222,168 @@ USE cublas
|
||||||
return
|
return
|
||||||
end subroutine MYDDOTv3
|
end subroutine MYDDOTv3
|
||||||
|
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
subroutine MYDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
implicit none
|
||||||
|
character*1 :: side, uplo, transa, diag
|
||||||
|
integer :: m, n, lda, ldb
|
||||||
|
DOUBLE PRECISION :: alpha
|
||||||
|
DOUBLE PRECISION, dimension(lda, *) :: a
|
||||||
|
DOUBLE PRECISION, dimension(ldb, *) :: b
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: a, b
|
||||||
|
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
|
||||||
|
#endif
|
||||||
|
return
|
||||||
|
end subroutine MYDTRSM
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
DOUBLE COMPLEX function MYZDOTC(n, zx, incx, zy, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
implicit none
|
||||||
|
integer :: n, incx, incy
|
||||||
|
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: zx, zy
|
||||||
|
MYZDOTC = cublasZDOTC(n, zx, incx, zy, incy)
|
||||||
|
#endif
|
||||||
|
return
|
||||||
|
end function MYZDOTC
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYZSWAP(n, zx, incx, zy, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
implicit none
|
||||||
|
integer :: n, incx, incy
|
||||||
|
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: zx, zy
|
||||||
|
CALL cublasZSWAP(n, zx, incx, zy, incy)
|
||||||
|
#endif
|
||||||
|
return
|
||||||
|
END SUBROUTINE MYZSWAP
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYZCOPY(n, zx, incx, zy, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
INTEGER :: n, incx, incy
|
||||||
|
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: zx, zy
|
||||||
|
CALL cublasZCOPY(n, zx, incx, zy, incy)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYZCOPY
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYZAXPY(n, za, zx, incx, zy, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
INTEGER :: n, incx, incy
|
||||||
|
DOUBLE COMPLEX :: za
|
||||||
|
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: zx, zy
|
||||||
|
CALL cublasZAXPY(n, za, zx, incx, zy, incy)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYZAXPY
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYZDSCAL(n, da, zx, incx)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
INTEGER :: n, incx
|
||||||
|
DOUBLE PRECISION :: da
|
||||||
|
DOUBLE COMPLEX, dimension(*) :: zx
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: zx
|
||||||
|
CALL cublasZDSCAL(n, da, zx, incx)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYZDSCAL
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYZSCAL(n, za, zx, incx)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
INTEGER :: n, incx
|
||||||
|
DOUBLE COMPLEX :: za
|
||||||
|
DOUBLE COMPLEX, dimension(*) :: zx
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: zx
|
||||||
|
CALL cublasZSCAL(n, za, zx, incx)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYZSCAL
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYDCOPY(n, x, incx, y, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
INTEGER :: n, incx, incy
|
||||||
|
DOUBLE PRECISION, INTENT(IN) :: x(*)
|
||||||
|
DOUBLE PRECISION, INTENT(OUT) :: y(*)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: x, y
|
||||||
|
call cublasDCOPY(n, x, incx, y, incy)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYDCOPY
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYDAXPY(n, a, x, incx, y, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
INTEGER :: n, incx, incy
|
||||||
|
DOUBLE PRECISION, INTENT(IN) :: a
|
||||||
|
DOUBLE PRECISION, INTENT(IN) :: x(*)
|
||||||
|
DOUBLE PRECISION, INTENT(OUT) :: y(*)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: x, y
|
||||||
|
call cublasDAXPY( n, a, x, incx, y, incy)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYDAXPY
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYDSCAL(n, a, x, incx)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
IMPLICIT NONE
|
||||||
|
integer :: n, incx
|
||||||
|
DOUBLE PRECISION :: a
|
||||||
|
DOUBLE PRECISION, dimension(*) :: x
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: x
|
||||||
|
call cublasDSCAL(n, a, x, incx)
|
||||||
|
#endif
|
||||||
|
RETURN
|
||||||
|
END SUBROUTINE MYDSCAL
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
SUBROUTINE MYDSWAP(n, dx, incx, dy, incy)
|
||||||
|
#if defined(__CUDA)
|
||||||
|
USE cublas
|
||||||
|
#endif
|
||||||
|
implicit none
|
||||||
|
integer :: n, incx, incy
|
||||||
|
REAL(8), dimension(*) :: dx, dy
|
||||||
|
#if defined(__CUDA)
|
||||||
|
attributes(device) :: dx, dy
|
||||||
|
CALL cublasDSWAP(n, dx, incx, dy, incy)
|
||||||
|
#endif
|
||||||
|
return
|
||||||
|
END SUBROUTINE MYDSWAP
|
||||||
|
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
|
|
@ -4028,7 +4028,8 @@ contains
|
||||||
real(wp) :: res1, res2, gnorm_supercell
|
real(wp) :: res1, res2, gnorm_supercell
|
||||||
|
|
||||||
if(num) Call errore('pbcgdisp', 'Atom displacement not implemented with numerical forces', 1)
|
if(num) Call errore('pbcgdisp', 'Atom displacement not implemented with numerical forces', 1)
|
||||||
if(.not.noabc) Call errore('pbcgdisp', 'Atom displacement not implemented with the threebody term', 1)
|
if(.not.noabc) Call errore('pbcgdisp', 'Atom displacement not implemented with the threebody term ' // &
|
||||||
|
' (set dftd3_threebody=.false. for phonon calculations)', 1)
|
||||||
|
|
||||||
ns = shape(g_supercell_)
|
ns = shape(g_supercell_)
|
||||||
g_supercell( -ns(1)/2:ns(1)/2, -ns(2)/2:ns(2)/2, -ns(3)/2:ns(3)/2, 1:ns(4), 1:ns(5) ) => g_supercell_
|
g_supercell( -ns(1)/2:ns(1)/2, -ns(2)/2:ns(2)/2, -ns(3)/2:ns(3)/2, 1:ns(4), 1:ns(5) ) => g_supercell_
|
||||||
|
|
Loading…
Reference in New Issue